diff options
| author | Mohamed Bassem <me@mbassem.com> | 2024-12-31 13:17:56 +0200 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-12-31 13:17:56 +0200 |
| commit | cbaf9e6034aa09911fca967b7af6cad11f154b3e (patch) | |
| tree | 6995d9d60d9ae5181af78e6577f8d7b724d7a971 /packages/shared | |
| parent | f476fca758bb039f9605488b61ba35fc097d6cfc (diff) | |
| download | karakeep-cbaf9e6034aa09911fca967b7af6cad11f154b3e.tar.zst | |
feat: Introduce advanced search capabilities (#753)
* feat: Implement search filtering in the backend
* feat: Implement search language parser
* rename matcher name
* Add ability to interleave text
* More fixes
* be more tolerant of parsing errors
* Add a search query explainer widget
* Handle date parsing gracefully
* Fix the lockfile
* Encode query search param
* Fix table body error
* Fix error when writing quotes
Diffstat (limited to 'packages/shared')
| -rw-r--r-- | packages/shared/package.json | 7 | ||||
| -rw-r--r-- | packages/shared/searchQueryParser.test.ts | 275 | ||||
| -rw-r--r-- | packages/shared/searchQueryParser.ts | 351 | ||||
| -rw-r--r-- | packages/shared/types/search.ts | 72 | ||||
| -rw-r--r-- | packages/shared/vitest.config.ts | 14 |
5 files changed, 717 insertions, 2 deletions
diff --git a/packages/shared/package.json b/packages/shared/package.json index d741b70f..d412301a 100644 --- a/packages/shared/package.json +++ b/packages/shared/package.json @@ -10,18 +10,21 @@ "meilisearch": "^0.37.0", "ollama": "^0.5.9", "openai": "^4.67.1", + "typescript-parsec": "^0.3.4", "winston": "^3.11.0", "zod": "^3.22.4" }, "devDependencies": { "@hoarder/eslint-config": "workspace:^0.2.0", "@hoarder/prettier-config": "workspace:^0.1.0", - "@hoarder/tsconfig": "workspace:^0.1.0" + "@hoarder/tsconfig": "workspace:^0.1.0", + "vitest": "^1.3.1" }, "scripts": { "typecheck": "tsc --noEmit", "format": "prettier . --ignore-path ../../.prettierignore", - "lint": "eslint ." + "lint": "eslint .", + "test": "vitest" }, "main": "index.ts", "eslintConfig": { diff --git a/packages/shared/searchQueryParser.test.ts b/packages/shared/searchQueryParser.test.ts new file mode 100644 index 00000000..428d5929 --- /dev/null +++ b/packages/shared/searchQueryParser.test.ts @@ -0,0 +1,275 @@ +import { describe, expect, test } from "vitest"; + +import { parseSearchQuery } from "./searchQueryParser"; + +describe("Search Query Parser", () => { + test("simple is queries", () => { + expect(parseSearchQuery("is:archived")).toEqual({ + result: "full", + text: "", + matcher: { + type: "archived", + archived: true, + }, + }); + expect(parseSearchQuery("is:not_archived")).toEqual({ + result: "full", + text: "", + matcher: { + type: "archived", + archived: false, + }, + }); + expect(parseSearchQuery("is:fav")).toEqual({ + result: "full", + text: "", + matcher: { + type: "favourited", + favourited: true, + }, + }); + expect(parseSearchQuery("is:not_fav")).toEqual({ + result: "full", + text: "", + matcher: { + type: "favourited", + favourited: false, + }, + }); + }); + + test("simple string queries", () => { + expect(parseSearchQuery("url:https://example.com")).toEqual({ + result: "full", + text: "", + matcher: { + type: "url", + url: "https://example.com", + }, + }); + 
expect(parseSearchQuery('url:"https://example.com"')).toEqual({ + result: "full", + text: "", + matcher: { + type: "url", + url: "https://example.com", + }, + }); + expect(parseSearchQuery("#my-tag")).toEqual({ + result: "full", + text: "", + matcher: { + type: "tagName", + tagName: "my-tag", + }, + }); + expect(parseSearchQuery('#"my tag"')).toEqual({ + result: "full", + text: "", + matcher: { + type: "tagName", + tagName: "my tag", + }, + }); + expect(parseSearchQuery("list:my-list")).toEqual({ + result: "full", + text: "", + matcher: { + type: "listName", + listName: "my-list", + }, + }); + expect(parseSearchQuery('list:"my list"')).toEqual({ + result: "full", + text: "", + matcher: { + type: "listName", + listName: "my list", + }, + }); + }); + test("date queries", () => { + expect(parseSearchQuery("after:2023-10-12")).toEqual({ + result: "full", + text: "", + matcher: { + type: "dateAfter", + dateAfter: new Date("2023-10-12"), + }, + }); + expect(parseSearchQuery("before:2023-10-12")).toEqual({ + result: "full", + text: "", + matcher: { + type: "dateBefore", + dateBefore: new Date("2023-10-12"), + }, + }); + }); + + test("complex queries", () => { + expect(parseSearchQuery("is:fav is:archived")).toEqual({ + result: "full", + text: "", + matcher: { + type: "and", + matchers: [ + { + type: "favourited", + favourited: true, + }, + { + type: "archived", + archived: true, + }, + ], + }, + }); + + expect(parseSearchQuery("(is:fav is:archived) #my-tag")).toEqual({ + result: "full", + text: "", + matcher: { + type: "and", + matchers: [ + { + type: "favourited", + favourited: true, + }, + { + type: "archived", + archived: true, + }, + { + type: "tagName", + tagName: "my-tag", + }, + ], + }, + }); + + expect(parseSearchQuery("(is:fav is:archived) or (#my-tag)")).toEqual({ + result: "full", + text: "", + matcher: { + type: "or", + matchers: [ + { + type: "and", + matchers: [ + { + type: "favourited", + favourited: true, + }, + { + type: "archived", + archived: true, + }, 
+ ], + }, + { + type: "tagName", + tagName: "my-tag", + }, + ], + }, + }); + + expect(parseSearchQuery("(is:fav or is:archived) and #my-tag")).toEqual({ + result: "full", + text: "", + matcher: { + type: "and", + matchers: [ + { + type: "or", + matchers: [ + { + type: "favourited", + favourited: true, + }, + { + type: "archived", + archived: true, + }, + ], + }, + { + type: "tagName", + tagName: "my-tag", + }, + ], + }, + }); + }); + test("pure text", () => { + expect(parseSearchQuery("hello")).toEqual({ + result: "full", + text: "hello", + matcher: undefined, + }); + expect(parseSearchQuery("hello world")).toEqual({ + result: "full", + text: "hello world", + matcher: undefined, + }); + }); + + test("text interlived with matchers", () => { + expect( + parseSearchQuery( + "hello is:fav world is:archived mixed world #my-tag test", + ), + ).toEqual({ + result: "full", + text: "hello world mixed world test", + matcher: { + type: "and", + matchers: [ + { + type: "favourited", + favourited: true, + }, + { + type: "archived", + archived: true, + }, + { + type: "tagName", + tagName: "my-tag", + }, + ], + }, + }); + }); + + test("unknown qualifiers are emitted as pure text", () => { + expect(parseSearchQuery("is:fav is:helloworld")).toEqual({ + result: "full", + text: "is:helloworld", + matcher: { + type: "favourited", + favourited: true, + }, + }); + }); + + test("partial results", () => { + expect(parseSearchQuery("(is:archived) or ")).toEqual({ + result: "partial", + text: "or", + matcher: { + type: "archived", + archived: true, + }, + }); + expect(parseSearchQuery("is:fav is: ( random")).toEqual({ + result: "partial", + text: "is: ( random", + matcher: { + type: "favourited", + favourited: true, + }, + }); + }); +}); diff --git a/packages/shared/searchQueryParser.ts b/packages/shared/searchQueryParser.ts new file mode 100644 index 00000000..faf74d08 --- /dev/null +++ b/packages/shared/searchQueryParser.ts @@ -0,0 +1,351 @@ +import { + alt, + alt_sc, + apply, + kmid, + 
kright, + lrec_sc, + rule, + seq, + str, + tok, + Token, + TokenPosition, +} from "typescript-parsec"; +import { z } from "zod"; + +import { Matcher } from "./types/search"; + +enum TokenType { + And = "AND", + Or = "OR", + + Qualifier = "QUALIFIER", + Ident = "IDENT", + StringLiteral = "STRING_LITERAL", + + LParen = "LPAREN", + RParen = "RPAREN", + Space = "SPACE", + Hash = "HASH", +} + +// Rules are in order of priority +const lexerRules: [RegExp, TokenType][] = [ + [/^and/i, TokenType.And], + [/^or/i, TokenType.Or], + + [/^#/, TokenType.Hash], + [/^(is|url|list|after|before):/, TokenType.Qualifier], + + [/^"([^"]+)"/, TokenType.StringLiteral], + + [/^\(/, TokenType.LParen], + [/^\)/, TokenType.RParen], + [/^\s+/, TokenType.Space], + + // This needs to be last as it matches a lot of stuff + [/^[^ )(]+/, TokenType.Ident], +] as const; + +class LexerToken implements Token<TokenType> { + private constructor( + private readonly input: string, + public kind: TokenType, + public text: string, + public pos: TokenPosition, + ) {} + + public static from(input: string): Token<TokenType> | undefined { + const tok = new LexerToken( + input, + /* Doesn't matter */ TokenType.Ident, + "", + { + index: 0, + rowBegin: 1, + rowEnd: 1, + columnBegin: 0, + columnEnd: 0, + }, + ); + return tok.next; + } + + public get next(): Token<TokenType> | undefined { + if (!this.input.length) { + return undefined; + } + + for (const [regex, tokenType] of lexerRules) { + const matchRes = regex.exec(this.input); + if (!matchRes) { + continue; + } + const match = matchRes[0]; + return new LexerToken(this.input.slice(match.length), tokenType, match, { + index: this.pos.index + match.length, + columnBegin: this.pos.index + 1, + columnEnd: this.pos.index + 1 + match.length, + // Our strings are always only one line + rowBegin: 1, + rowEnd: 1, + }); + } + // No match + throw new Error( + `Failed to tokenize the token at position ${this.pos.index}: ${this.input[0]}`, + ); + } +} + +export interface 
TextAndMatcher { + text: string; + matcher?: Matcher; +} + +const MATCHER = rule<TokenType, TextAndMatcher>(); +const EXP = rule<TokenType, TextAndMatcher>(); + +MATCHER.setPattern( + alt_sc( + apply(kright(str("is:"), tok(TokenType.Ident)), (toks) => { + switch (toks.text) { + case "fav": + return { + text: "", + matcher: { type: "favourited", favourited: true }, + }; + case "not_fav": + return { + text: "", + matcher: { type: "favourited", favourited: false }, + }; + case "archived": + return { + text: "", + matcher: { type: "archived", archived: true }, + }; + case "not_archived": + return { + text: "", + matcher: { type: "archived", archived: false }, + }; + default: + // If the token is not known, emit it as pure text + return { + text: `is:${toks.text}`, + matcher: undefined, + }; + } + }), + apply( + seq( + alt(tok(TokenType.Qualifier), tok(TokenType.Hash)), + alt( + apply(tok(TokenType.Ident), (tok) => { + return tok.text; + }), + apply(tok(TokenType.StringLiteral), (tok) => { + return tok.text.slice(1, -1); + }), + ), + ), + (toks) => { + switch (toks[0].text) { + case "url:": + return { + text: "", + matcher: { type: "url", url: toks[1] }, + }; + case "#": + return { + text: "", + matcher: { type: "tagName", tagName: toks[1] }, + }; + case "list:": + return { + text: "", + matcher: { type: "listName", listName: toks[1] }, + }; + case "after:": + try { + return { + text: "", + matcher: { + type: "dateAfter", + dateAfter: z.coerce.date().parse(toks[1]), + }, + }; + } catch (e) { + return { + // If parsing the date fails, emit it as pure text + text: toks[0].text + toks[1], + matcher: undefined, + }; + } + case "before:": + try { + return { + text: "", + matcher: { + type: "dateBefore", + dateBefore: z.coerce.date().parse(toks[1]), + }, + }; + } catch (e) { + return { + // If parsing the date fails, emit it as pure text + text: toks[0].text + toks[1], + matcher: undefined, + }; + } + default: + // If the token is not known, emit it as pure text + return { + 
text: toks[0].text + toks[1], + matcher: undefined, + }; + } + }, + ), + // Ident or an incomlete qualifier + apply(alt(tok(TokenType.Ident), tok(TokenType.Qualifier)), (toks) => { + return { + text: toks.text, + matcher: undefined, + }; + }), + kmid(tok(TokenType.LParen), EXP, tok(TokenType.RParen)), + ), +); + +EXP.setPattern( + lrec_sc( + MATCHER, + seq( + alt( + tok(TokenType.Space), + kmid(tok(TokenType.Space), tok(TokenType.And), tok(TokenType.Space)), + kmid(tok(TokenType.Space), tok(TokenType.Or), tok(TokenType.Space)), + ), + MATCHER, + ), + (toks, next) => { + switch (next[0].kind) { + case TokenType.Space: + case TokenType.And: + return { + text: [toks.text, next[1].text].join(" ").trim(), + matcher: + !!toks.matcher || !!next[1].matcher + ? { + type: "and", + matchers: [toks.matcher, next[1].matcher].filter( + (a) => !!a, + ) as Matcher[], + } + : undefined, + }; + case TokenType.Or: + return { + text: [toks.text, next[1].text].join(" ").trim(), + matcher: + !!toks.matcher || !!next[1].matcher + ? 
{ + type: "or", + matchers: [toks.matcher, next[1].matcher].filter( + (a) => !!a, + ) as Matcher[], + } + : undefined, + }; + } + }, + ), +); + +function flattenAndsAndOrs(matcher: Matcher): Matcher { + switch (matcher.type) { + case "and": + case "or": { + if (matcher.matchers.length == 1) { + return flattenAndsAndOrs(matcher.matchers[0]); + } + const flattened: Matcher[] = []; + for (let m of matcher.matchers) { + // If inside the matcher is another matcher of the same type, flatten it + m = flattenAndsAndOrs(m); + if (m.type == matcher.type) { + flattened.push(...m.matchers); + } else { + flattened.push(m); + } + } + matcher.matchers = flattened; + return matcher; + } + default: + return matcher; + } +} + +export function _parseAndPrintTokens(query: string) { + console.log(`PARSING: ${query}`); + let tok = LexerToken.from(query); + do { + console.log(tok?.kind, tok?.text); + tok = tok?.next; + } while (tok); + console.log("DONE"); +} + +function consumeTokenStream(token: Token<TokenType>) { + let str = ""; + let tok: Token<TokenType> | undefined = token; + do { + str += tok.text; + tok = tok.next; + } while (tok); + return str; +} + +export function parseSearchQuery( + query: string, +): TextAndMatcher & { result: "full" | "partial" | "invalid" } { + // _parseAndPrintTokens(query); // Uncomment to debug tokenization + const parsed = EXP.parse(LexerToken.from(query.trim())); + if (!parsed.successful || parsed.candidates.length != 1) { + // If the query is not valid, return the whole query as pure text + return { + text: query, + result: "invalid", + }; + } + + const parseCandidate = parsed.candidates[0]; + if (parseCandidate.result.matcher) { + parseCandidate.result.matcher = flattenAndsAndOrs( + parseCandidate.result.matcher, + ); + } + if (parseCandidate.nextToken) { + // Parser failed to consume the whole query. This usually happen + // when the user is still typing the query. 
Return the partial + // result and the remaining query as pure text + return { + text: ( + parseCandidate.result.text + + consumeTokenStream(parseCandidate.nextToken) + ).trim(), + matcher: parseCandidate.result.matcher, + result: "partial", + }; + } + + return { + text: parseCandidate.result.text, + matcher: parseCandidate.result.matcher, + result: "full", + }; +} diff --git a/packages/shared/types/search.ts b/packages/shared/types/search.ts new file mode 100644 index 00000000..d430dad5 --- /dev/null +++ b/packages/shared/types/search.ts @@ -0,0 +1,72 @@ +import { z } from "zod"; + +const zTagNameMatcher = z.object({ + type: z.literal("tagName"), + tagName: z.string(), +}); + +const zListNameMatcher = z.object({ + type: z.literal("listName"), + listName: z.string(), +}); + +const zArchivedMatcher = z.object({ + type: z.literal("archived"), + archived: z.boolean(), +}); + +const urlMatcher = z.object({ + type: z.literal("url"), + url: z.string(), +}); + +const zFavouritedMatcher = z.object({ + type: z.literal("favourited"), + favourited: z.boolean(), +}); + +const zDateAfterMatcher = z.object({ + type: z.literal("dateAfter"), + dateAfter: z.date(), +}); + +const zDateBeforeMatcher = z.object({ + type: z.literal("dateBefore"), + dateBefore: z.date(), +}); + +const zNonRecursiveMatcher = z.union([ + zTagNameMatcher, + zListNameMatcher, + zArchivedMatcher, + urlMatcher, + zFavouritedMatcher, + zDateAfterMatcher, + zDateBeforeMatcher, +]); + +type NonRecursiveMatcher = z.infer<typeof zNonRecursiveMatcher>; +export type Matcher = + | NonRecursiveMatcher + | { type: "and"; matchers: Matcher[] } + | { type: "or"; matchers: Matcher[] }; + +export const zMatcherSchema: z.ZodType<Matcher> = z.lazy(() => { + return z.discriminatedUnion("type", [ + zTagNameMatcher, + zListNameMatcher, + zArchivedMatcher, + urlMatcher, + zFavouritedMatcher, + zDateAfterMatcher, + zDateBeforeMatcher, + z.object({ + type: z.literal("and"), + matchers: z.array(zMatcherSchema), + }), + z.object({ + 
type: z.literal("or"), + matchers: z.array(zMatcherSchema), + }), + ]); +}); diff --git a/packages/shared/vitest.config.ts b/packages/shared/vitest.config.ts new file mode 100644 index 00000000..41fd70c4 --- /dev/null +++ b/packages/shared/vitest.config.ts @@ -0,0 +1,14 @@ +/// <reference types="vitest" /> + +import tsconfigPaths from "vite-tsconfig-paths"; +import { defineConfig } from "vitest/config"; + +// https://vitejs.dev/config/ +export default defineConfig({ + plugins: [tsconfigPaths()], + test: { + alias: { + "@/*": "./*", + }, + }, +}); |
