diff options
| author | Mohamed Bassem <me@mbassem.com> | 2024-12-31 13:17:56 +0200 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-12-31 13:17:56 +0200 |
| commit | cbaf9e6034aa09911fca967b7af6cad11f154b3e (patch) | |
| tree | 6995d9d60d9ae5181af78e6577f8d7b724d7a971 /packages/shared | |
| parent | f476fca758bb039f9605488b61ba35fc097d6cfc (diff) | |
| download | karakeep-cbaf9e6034aa09911fca967b7af6cad11f154b3e.tar.zst | |
feat: Introduce advanced search capabilities (#753)
* feat: Implement search filtering in the backend
* feat: Implement search language parser
* rename matcher name
* Add ability to interleave text
* More fixes
* be more tolerant of parsing errors
* Add a search query explainer widget
* Handle date parsing gracefully
* Fix the lockfile
* Encode query search param
* Fix table body error
* Fix error when writing quotes
Diffstat (limited to 'packages/shared')
| -rw-r--r-- | packages/shared/package.json | 7 | ||||
| -rw-r--r-- | packages/shared/searchQueryParser.test.ts | 275 | ||||
| -rw-r--r-- | packages/shared/searchQueryParser.ts | 351 | ||||
| -rw-r--r-- | packages/shared/types/search.ts | 72 | ||||
| -rw-r--r-- | packages/shared/vitest.config.ts | 14 |
5 files changed, 717 insertions, 2 deletions
diff --git a/packages/shared/package.json b/packages/shared/package.json index d741b70f..d412301a 100644 --- a/packages/shared/package.json +++ b/packages/shared/package.json @@ -10,18 +10,21 @@ "meilisearch": "^0.37.0", "ollama": "^0.5.9", "openai": "^4.67.1", + "typescript-parsec": "^0.3.4", "winston": "^3.11.0", "zod": "^3.22.4" }, "devDependencies": { "@hoarder/eslint-config": "workspace:^0.2.0", "@hoarder/prettier-config": "workspace:^0.1.0", - "@hoarder/tsconfig": "workspace:^0.1.0" + "@hoarder/tsconfig": "workspace:^0.1.0", + "vitest": "^1.3.1" }, "scripts": { "typecheck": "tsc --noEmit", "format": "prettier . --ignore-path ../../.prettierignore", - "lint": "eslint ." + "lint": "eslint .", + "test": "vitest" }, "main": "index.ts", "eslintConfig": { diff --git a/packages/shared/searchQueryParser.test.ts b/packages/shared/searchQueryParser.test.ts new file mode 100644 index 00000000..428d5929 --- /dev/null +++ b/packages/shared/searchQueryParser.test.ts @@ -0,0 +1,275 @@ +import { describe, expect, test } from "vitest"; + +import { parseSearchQuery } from "./searchQueryParser"; + +describe("Search Query Parser", () => { + test("simple is queries", () => { + expect(parseSearchQuery("is:archived")).toEqual({ + result: "full", + text: "", + matcher: { + type: "archived", + archived: true, + }, + }); + expect(parseSearchQuery("is:not_archived")).toEqual({ + result: "full", + text: "", + matcher: { + type: "archived", + archived: false, + }, + }); + expect(parseSearchQuery("is:fav")).toEqual({ + result: "full", + text: "", + matcher: { + type: "favourited", + favourited: true, + }, + }); + expect(parseSearchQuery("is:not_fav")).toEqual({ + result: "full", + text: "", + matcher: { + type: "favourited", + favourited: false, + }, + }); + }); + + test("simple string queries", () => { + expect(parseSearchQuery("url:https://example.com")).toEqual({ + result: "full", + text: "", + matcher: { + type: "url", + url: "https://example.com", + }, + }); + 
expect(parseSearchQuery('url:"https://example.com"')).toEqual({ + result: "full", + text: "", + matcher: { + type: "url", + url: "https://example.com", + }, + }); + expect(parseSearchQuery("#my-tag")).toEqual({ + result: "full", + text: "", + matcher: { + type: "tagName", + tagName: "my-tag", + }, + }); + expect(parseSearchQuery('#"my tag"')).toEqual({ + result: "full", + text: "", + matcher: { + type: "tagName", + tagName: "my tag", + }, + }); + expect(parseSearchQuery("list:my-list")).toEqual({ + result: "full", + text: "", + matcher: { + type: "listName", + listName: "my-list", + }, + }); + expect(parseSearchQuery('list:"my list"')).toEqual({ + result: "full", + text: "", + matcher: { + type: "listName", + listName: "my list", + }, + }); + }); + test("date queries", () => { + expect(parseSearchQuery("after:2023-10-12")).toEqual({ + result: "full", + text: "", + matcher: { + type: "dateAfter", + dateAfter: new Date("2023-10-12"), + }, + }); + expect(parseSearchQuery("before:2023-10-12")).toEqual({ + result: "full", + text: "", + matcher: { + type: "dateBefore", + dateBefore: new Date("2023-10-12"), + }, + }); + }); + + test("complex queries", () => { + expect(parseSearchQuery("is:fav is:archived")).toEqual({ + result: "full", + text: "", + matcher: { + type: "and", + matchers: [ + { + type: "favourited", + favourited: true, + }, + { + type: "archived", + archived: true, + }, + ], + }, + }); + + expect(parseSearchQuery("(is:fav is:archived) #my-tag")).toEqual({ + result: "full", + text: "", + matcher: { + type: "and", + matchers: [ + { + type: "favourited", + favourited: true, + }, + { + type: "archived", + archived: true, + }, + { + type: "tagName", + tagName: "my-tag", + }, + ], + }, + }); + + expect(parseSearchQuery("(is:fav is:archived) or (#my-tag)")).toEqual({ + result: "full", + text: "", + matcher: { + type: "or", + matchers: [ + { + type: "and", + matchers: [ + { + type: "favourited", + favourited: true, + }, + { + type: "archived", + archived: true, + }, 
+ ], + }, + { + type: "tagName", + tagName: "my-tag", + }, + ], + }, + }); + + expect(parseSearchQuery("(is:fav or is:archived) and #my-tag")).toEqual({ + result: "full", + text: "", + matcher: { + type: "and", + matchers: [ + { + type: "or", + matchers: [ + { + type: "favourited", + favourited: true, + }, + { + type: "archived", + archived: true, + }, + ], + }, + { + type: "tagName", + tagName: "my-tag", + }, + ], + }, + }); + }); + test("pure text", () => { + expect(parseSearchQuery("hello")).toEqual({ + result: "full", + text: "hello", + matcher: undefined, + }); + expect(parseSearchQuery("hello world")).toEqual({ + result: "full", + text: "hello world", + matcher: undefined, + }); + }); + + test("text interlived with matchers", () => { + expect( + parseSearchQuery( + "hello is:fav world is:archived mixed world #my-tag test", + ), + ).toEqual({ + result: "full", + text: "hello world mixed world test", + matcher: { + type: "and", + matchers: [ + { + type: "favourited", + favourited: true, + }, + { + type: "archived", + archived: true, + }, + { + type: "tagName", + tagName: "my-tag", + }, + ], + }, + }); + }); + + test("unknown qualifiers are emitted as pure text", () => { + expect(parseSearchQuery("is:fav is:helloworld")).toEqual({ + result: "full", + text: "is:helloworld", + matcher: { + type: "favourited", + favourited: true, + }, + }); + }); + + test("partial results", () => { + expect(parseSearchQuery("(is:archived) or ")).toEqual({ + result: "partial", + text: "or", + matcher: { + type: "archived", + archived: true, + }, + }); + expect(parseSearchQuery("is:fav is: ( random")).toEqual({ + result: "partial", + text: "is: ( random", + matcher: { + type: "favourited", + favourited: true, + }, + }); + }); +}); diff --git a/packages/shared/searchQueryParser.ts b/packages/shared/searchQueryParser.ts new file mode 100644 index 00000000..faf74d08 --- /dev/null +++ b/packages/shared/searchQueryParser.ts @@ -0,0 +1,351 @@ +import { + alt, + alt_sc, + apply, + kmid, + 
kright, + lrec_sc, + rule, + seq, + str, + tok, + Token, + TokenPosition, +} from "typescript-parsec"; +import { z } from "zod"; + +import { Matcher } from "./types/search"; + +enum TokenType { + And = "AND", + Or = "OR", + + Qualifier = "QUALIFIER", + Ident = "IDENT", + StringLiteral = "STRING_LITERAL", + + LParen = "LPAREN", + RParen = "RPAREN", + Space = "SPACE", + Hash = "HASH", +} + +// Rules are in order of priority +const lexerRules: [RegExp, TokenType][] = [ + [/^and/i, TokenType.And], + [/^or/i, TokenType.Or], + + [/^#/, TokenType.Hash], + [/^(is|url|list|after|before):/, TokenType.Qualifier], + + [/^"([^"]+)"/, TokenType.StringLiteral], + + [/^\(/, TokenType.LParen], + [/^\)/, TokenType.RParen], + [/^\s+/, TokenType.Space], + + // This needs to be last as it matches a lot of stuff + [/^[^ )(]+/, TokenType.Ident], +] as const; + +class LexerToken implements Token<TokenType> { + private constructor( + private readonly input: string, + public kind: TokenType, + public text: string, + public pos: TokenPosition, + ) {} + + public static from(input: string): Token<TokenType> | undefined { + const tok = new LexerToken( + input, + /* Doesn't matter */ TokenType.Ident, + "", + { + index: 0, + rowBegin: 1, + rowEnd: 1, + columnBegin: 0, + columnEnd: 0, + }, + ); + return tok.next; + } + + public get next(): Token<TokenType> | undefined { + if (!this.input.length) { + return undefined; + } + + for (const [regex, tokenType] of lexerRules) { + const matchRes = regex.exec(this.input); + if (!matchRes) { + continue; + } + const match = matchRes[0]; + return new LexerToken(this.input.slice(match.length), tokenType, match, { + index: this.pos.index + match.length, + columnBegin: this.pos.index + 1, + columnEnd: this.pos.index + 1 + match.length, + // Our strings are always only one line + rowBegin: 1, + rowEnd: 1, + }); + } + // No match + throw new Error( + `Failed to tokenize the token at position ${this.pos.index}: ${this.input[0]}`, + ); + } +} + +export interface 
TextAndMatcher { + text: string; + matcher?: Matcher; +} + +const MATCHER = rule<TokenType, TextAndMatcher>(); +const EXP = rule<TokenType, TextAndMatcher>(); + +MATCHER.setPattern( + alt_sc( + apply(kright(str("is:"), tok(TokenType.Ident)), (toks) => { + switch (toks.text) { + case "fav": + return { + text: "", + matcher: { type: "favourited", favourited: true }, + }; + case "not_fav": + return { + text: "", + matcher: { type: "favourited", favourited: false }, + }; + case "archived": + return { + text: "", + matcher: { type: "archived", archived: true }, + }; + case "not_archived": + return { + text: "", + matcher: { type: "archived", archived: false }, + }; + default: + // If the token is not known, emit it as pure text + return { + text: `is:${toks.text}`, + matcher: undefined, + }; + } + }), + apply( + seq( + alt(tok(TokenType.Qualifier), tok(TokenType.Hash)), + alt( + apply(tok(TokenType.Ident), (tok) => { + return tok.text; + }), + apply(tok(TokenType.StringLiteral), (tok) => { + return tok.text.slice(1, -1); + }), + ), + ), + (toks) => { + switch (toks[0].text) { + case "url:": + return { + text: "", + matcher: { type: "url", url: toks[1] }, + }; + case "#": + return { + text: "", + matcher: { type: "tagName", tagName: toks[1] }, + }; + case "list:": + return { + text: "", + matcher: { type: "listName", listName: toks[1] }, + }; + case "after:": + try { + return { + text: "", + matcher: { + type: "dateAfter", + dateAfter: z.coerce.date().parse(toks[1]), + }, + }; + } catch (e) { + return { + // If parsing the date fails, emit it as pure text + text: toks[0].text + toks[1], + matcher: undefined, + }; + } + case "before:": + try { + return { + text: "", + matcher: { + type: "dateBefore", + dateBefore: z.coerce.date().parse(toks[1]), + }, + }; + } catch (e) { + return { + // If parsing the date fails, emit it as pure text + text: toks[0].text + toks[1], + matcher: undefined, + }; + } + default: + // If the token is not known, emit it as pure text + return { + 
text: toks[0].text + toks[1], + matcher: undefined, + }; + } + }, + ), + // Ident or an incomlete qualifier + apply(alt(tok(TokenType.Ident), tok(TokenType.Qualifier)), (toks) => { + return { + text: toks.text, + matcher: undefined, + }; + }), + kmid(tok(TokenType.LParen), EXP, tok(TokenType.RParen)), + ), +); + +EXP.setPattern( + lrec_sc( + MATCHER, + seq( + alt( + tok(TokenType.Space), + kmid(tok(TokenType.Space), tok(TokenType.And), tok(TokenType.Space)), + kmid(tok(TokenType.Space), tok(TokenType.Or), tok(TokenType.Space)), + ), + MATCHER, + ), + (toks, next) => { + switch (next[0].kind) { + case TokenType.Space: + case TokenType.And: + return { + text: [toks.text, next[1].text].join(" ").trim(), + matcher: + !!toks.matcher || !!next[1].matcher + ? { + type: "and", + matchers: [toks.matcher, next[1].matcher].filter( + (a) => !!a, + ) as Matcher[], + } + : undefined, + }; + case TokenType.Or: + return { + text: [toks.text, next[1].text].join(" ").trim(), + matcher: + !!toks.matcher || !!next[1].matcher + ? 
{ + type: "or", + matchers: [toks.matcher, next[1].matcher].filter( + (a) => !!a, + ) as Matcher[], + } + : undefined, + }; + } + }, + ), +); + +function flattenAndsAndOrs(matcher: Matcher): Matcher { + switch (matcher.type) { + case "and": + case "or": { + if (matcher.matchers.length == 1) { + return flattenAndsAndOrs(matcher.matchers[0]); + } + const flattened: Matcher[] = []; + for (let m of matcher.matchers) { + // If inside the matcher is another matcher of the same type, flatten it + m = flattenAndsAndOrs(m); + if (m.type == matcher.type) { + flattened.push(...m.matchers); + } else { + flattened.push(m); + } + } + matcher.matchers = flattened; + return matcher; + } + default: + return matcher; + } +} + +export function _parseAndPrintTokens(query: string) { + console.log(`PARSING: ${query}`); + let tok = LexerToken.from(query); + do { + console.log(tok?.kind, tok?.text); + tok = tok?.next; + } while (tok); + console.log("DONE"); +} + +function consumeTokenStream(token: Token<TokenType>) { + let str = ""; + let tok: Token<TokenType> | undefined = token; + do { + str += tok.text; + tok = tok.next; + } while (tok); + return str; +} + +export function parseSearchQuery( + query: string, +): TextAndMatcher & { result: "full" | "partial" | "invalid" } { + // _parseAndPrintTokens(query); // Uncomment to debug tokenization + const parsed = EXP.parse(LexerToken.from(query.trim())); + if (!parsed.successful || parsed.candidates.length != 1) { + // If the query is not valid, return the whole query as pure text + return { + text: query, + result: "invalid", + }; + } + + const parseCandidate = parsed.candidates[0]; + if (parseCandidate.result.matcher) { + parseCandidate.result.matcher = flattenAndsAndOrs( + parseCandidate.result.matcher, + ); + } + if (parseCandidate.nextToken) { + // Parser failed to consume the whole query. This usually happen + // when the user is still typing the query. 
Return the partial + // result and the remaining query as pure text + return { + text: ( + parseCandidate.result.text + + consumeTokenStream(parseCandidate.nextToken) + ).trim(), + matcher: parseCandidate.result.matcher, + result: "partial", + }; + } + + return { + text: parseCandidate.result.text, + matcher: parseCandidate.result.matcher, + result: "full", + }; +} diff --git a/packages/shared/types/search.ts b/packages/shared/types/search.ts new file mode 100644 index 00000000..d430dad5 --- /dev/null +++ b/packages/shared/types/search.ts @@ -0,0 +1,72 @@ +import { z } from "zod"; + +const zTagNameMatcher = z.object({ + type: z.literal("tagName"), + tagName: z.string(), +}); + +const zListNameMatcher = z.object({ + type: z.literal("listName"), + listName: z.string(), +}); + +const zArchivedMatcher = z.object({ + type: z.literal("archived"), + archived: z.boolean(), +}); + +const urlMatcher = z.object({ + type: z.literal("url"), + url: z.string(), +}); + +const zFavouritedMatcher = z.object({ + type: z.literal("favourited"), + favourited: z.boolean(), +}); + +const zDateAfterMatcher = z.object({ + type: z.literal("dateAfter"), + dateAfter: z.date(), +}); + +const zDateBeforeMatcher = z.object({ + type: z.literal("dateBefore"), + dateBefore: z.date(), +}); + +const zNonRecursiveMatcher = z.union([ + zTagNameMatcher, + zListNameMatcher, + zArchivedMatcher, + urlMatcher, + zFavouritedMatcher, + zDateAfterMatcher, + zDateBeforeMatcher, +]); + +type NonRecursiveMatcher = z.infer<typeof zNonRecursiveMatcher>; +export type Matcher = + | NonRecursiveMatcher + | { type: "and"; matchers: Matcher[] } + | { type: "or"; matchers: Matcher[] }; + +export const zMatcherSchema: z.ZodType<Matcher> = z.lazy(() => { + return z.discriminatedUnion("type", [ + zTagNameMatcher, + zListNameMatcher, + zArchivedMatcher, + urlMatcher, + zFavouritedMatcher, + zDateAfterMatcher, + zDateBeforeMatcher, + z.object({ + type: z.literal("and"), + matchers: z.array(zMatcherSchema), + }), + z.object({ + 
type: z.literal("or"), + matchers: z.array(zMatcherSchema), + }), + ]); +}); diff --git a/packages/shared/vitest.config.ts b/packages/shared/vitest.config.ts new file mode 100644 index 00000000..41fd70c4 --- /dev/null +++ b/packages/shared/vitest.config.ts @@ -0,0 +1,14 @@ +/// <reference types="vitest" /> + +import tsconfigPaths from "vite-tsconfig-paths"; +import { defineConfig } from "vitest/config"; + +// https://vitejs.dev/config/ +export default defineConfig({ + plugins: [tsconfigPaths()], + test: { + alias: { + "@/*": "./*", + }, + }, +}); |
