| author | Mohamed Bassem <me@mbassem.com> | 2024-12-31 13:17:56 +0200 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-12-31 13:17:56 +0200 |
| commit | cbaf9e6034aa09911fca967b7af6cad11f154b3e (patch) | |
| tree | 6995d9d60d9ae5181af78e6577f8d7b724d7a971 /packages/shared/searchQueryParser.ts | |
| parent | f476fca758bb039f9605488b61ba35fc097d6cfc (diff) | |
| download | karakeep-cbaf9e6034aa09911fca967b7af6cad11f154b3e.tar.zst | |
feat: Introduce advanced search capabilities (#753)
* feat: Implement search filtering in the backend
* feat: Implement search language parser (example queries after this list)
* Rename the matcher name
* Add ability to interleave text
* More fixes
* Be more tolerant of parsing errors
* Add a search query explainer widget
* Handle date parsing gracefully
* Fix the lockfile
* Encode query search param
* Fix table body error
* Fix error when writing quotes
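
For a sense of the search language this parser accepts (qualifiers, `#` tags, quoted strings, parentheses, and `and`/`or`), here are a few illustrative queries. They are inferred from the lexer rules in the diff below, not taken from the commit itself:

```text
is:fav
is:archived and #work
url:github.com or list:reading
#"multi word tag" before:2024-01-01
budget report is:not_archived
(is:fav or #idea) and is:not_archived
```

Note that `and` and `or` have equal precedence and combine left-associatively (the grammar is built with `lrec_sc`), so parentheses are the only way to control grouping.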
Diffstat (limited to 'packages/shared/searchQueryParser.ts')
| Mode | Path | Lines |
|---|---|---|
| -rw-r--r-- | packages/shared/searchQueryParser.ts | 351 |

1 file changed, 351 insertions, 0 deletions
```diff
diff --git a/packages/shared/searchQueryParser.ts b/packages/shared/searchQueryParser.ts
new file mode 100644
index 00000000..faf74d08
--- /dev/null
+++ b/packages/shared/searchQueryParser.ts
@@ -0,0 +1,351 @@
+import {
+  alt,
+  alt_sc,
+  apply,
+  kmid,
+  kright,
+  lrec_sc,
+  rule,
+  seq,
+  str,
+  tok,
+  Token,
+  TokenPosition,
+} from "typescript-parsec";
+import { z } from "zod";
+
+import { Matcher } from "./types/search";
+
+enum TokenType {
+  And = "AND",
+  Or = "OR",
+
+  Qualifier = "QUALIFIER",
+  Ident = "IDENT",
+  StringLiteral = "STRING_LITERAL",
+
+  LParen = "LPAREN",
+  RParen = "RPAREN",
+  Space = "SPACE",
+  Hash = "HASH",
+}
+
+// Rules are in order of priority
+const lexerRules: [RegExp, TokenType][] = [
+  [/^and/i, TokenType.And],
+  [/^or/i, TokenType.Or],
+
+  [/^#/, TokenType.Hash],
+  [/^(is|url|list|after|before):/, TokenType.Qualifier],
+
+  [/^"([^"]+)"/, TokenType.StringLiteral],
+
+  [/^\(/, TokenType.LParen],
+  [/^\)/, TokenType.RParen],
+  [/^\s+/, TokenType.Space],
+
+  // This needs to be last as it matches a lot of stuff
+  [/^[^ )(]+/, TokenType.Ident],
+] as const;
+
+class LexerToken implements Token<TokenType> {
+  private constructor(
+    private readonly input: string,
+    public kind: TokenType,
+    public text: string,
+    public pos: TokenPosition,
+  ) {}
+
+  public static from(input: string): Token<TokenType> | undefined {
+    const tok = new LexerToken(
+      input,
+      /* Doesn't matter */ TokenType.Ident,
+      "",
+      {
+        index: 0,
+        rowBegin: 1,
+        rowEnd: 1,
+        columnBegin: 0,
+        columnEnd: 0,
+      },
+    );
+    return tok.next;
+  }
+
+  public get next(): Token<TokenType> | undefined {
+    if (!this.input.length) {
+      return undefined;
+    }
+
+    for (const [regex, tokenType] of lexerRules) {
+      const matchRes = regex.exec(this.input);
+      if (!matchRes) {
+        continue;
+      }
+      const match = matchRes[0];
+      return new LexerToken(this.input.slice(match.length), tokenType, match, {
+        index: this.pos.index + match.length,
+        columnBegin: this.pos.index + 1,
+        columnEnd: this.pos.index + 1 + match.length,
+        // Our strings are always only one line
+        rowBegin: 1,
+        rowEnd: 1,
+      });
+    }
+    // No match
+    throw new Error(
+      `Failed to tokenize the token at position ${this.pos.index}: ${this.input[0]}`,
+    );
+  }
+}
+
+export interface TextAndMatcher {
+  text: string;
+  matcher?: Matcher;
+}
+
+const MATCHER = rule<TokenType, TextAndMatcher>();
+const EXP = rule<TokenType, TextAndMatcher>();
+
+MATCHER.setPattern(
+  alt_sc(
+    apply(kright(str("is:"), tok(TokenType.Ident)), (toks) => {
+      switch (toks.text) {
+        case "fav":
+          return {
+            text: "",
+            matcher: { type: "favourited", favourited: true },
+          };
+        case "not_fav":
+          return {
+            text: "",
+            matcher: { type: "favourited", favourited: false },
+          };
+        case "archived":
+          return {
+            text: "",
+            matcher: { type: "archived", archived: true },
+          };
+        case "not_archived":
+          return {
+            text: "",
+            matcher: { type: "archived", archived: false },
+          };
+        default:
+          // If the token is not known, emit it as pure text
+          return {
+            text: `is:${toks.text}`,
+            matcher: undefined,
+          };
+      }
+    }),
+    apply(
+      seq(
+        alt(tok(TokenType.Qualifier), tok(TokenType.Hash)),
+        alt(
+          apply(tok(TokenType.Ident), (tok) => {
+            return tok.text;
+          }),
+          apply(tok(TokenType.StringLiteral), (tok) => {
+            return tok.text.slice(1, -1);
+          }),
+        ),
+      ),
+      (toks) => {
+        switch (toks[0].text) {
+          case "url:":
+            return {
+              text: "",
+              matcher: { type: "url", url: toks[1] },
+            };
+          case "#":
+            return {
+              text: "",
+              matcher: { type: "tagName", tagName: toks[1] },
+            };
+          case "list:":
+            return {
+              text: "",
+              matcher: { type: "listName", listName: toks[1] },
+            };
+          case "after:":
+            try {
+              return {
+                text: "",
+                matcher: {
+                  type: "dateAfter",
+                  dateAfter: z.coerce.date().parse(toks[1]),
+                },
+              };
+            } catch (e) {
+              return {
+                // If parsing the date fails, emit it as pure text
+                text: toks[0].text + toks[1],
+                matcher: undefined,
+              };
+            }
+          case "before:":
+            try {
+              return {
+                text: "",
+                matcher: {
+                  type: "dateBefore",
+                  dateBefore: z.coerce.date().parse(toks[1]),
+                },
+              };
+            } catch (e) {
+              return {
+                // If parsing the date fails, emit it as pure text
+                text: toks[0].text + toks[1],
+                matcher: undefined,
+              };
+            }
+          default:
+            // If the token is not known, emit it as pure text
+            return {
+              text: toks[0].text + toks[1],
+              matcher: undefined,
+            };
+        }
+      },
+    ),
+    // Ident or an incomplete qualifier
+    apply(alt(tok(TokenType.Ident), tok(TokenType.Qualifier)), (toks) => {
+      return {
+        text: toks.text,
+        matcher: undefined,
+      };
+    }),
+    kmid(tok(TokenType.LParen), EXP, tok(TokenType.RParen)),
+  ),
+);
+
+EXP.setPattern(
+  lrec_sc(
+    MATCHER,
+    seq(
+      alt(
+        tok(TokenType.Space),
+        kmid(tok(TokenType.Space), tok(TokenType.And), tok(TokenType.Space)),
+        kmid(tok(TokenType.Space), tok(TokenType.Or), tok(TokenType.Space)),
+      ),
+      MATCHER,
+    ),
+    (toks, next) => {
+      switch (next[0].kind) {
+        case TokenType.Space:
+        case TokenType.And:
+          return {
+            text: [toks.text, next[1].text].join(" ").trim(),
+            matcher:
+              !!toks.matcher || !!next[1].matcher
+                ? {
+                    type: "and",
+                    matchers: [toks.matcher, next[1].matcher].filter(
+                      (a) => !!a,
+                    ) as Matcher[],
+                  }
+                : undefined,
+          };
+        case TokenType.Or:
+          return {
+            text: [toks.text, next[1].text].join(" ").trim(),
+            matcher:
+              !!toks.matcher || !!next[1].matcher
+                ? {
+                    type: "or",
+                    matchers: [toks.matcher, next[1].matcher].filter(
+                      (a) => !!a,
+                    ) as Matcher[],
+                  }
+                : undefined,
+          };
+      }
+    },
+  ),
+);
+
+function flattenAndsAndOrs(matcher: Matcher): Matcher {
+  switch (matcher.type) {
+    case "and":
+    case "or": {
+      if (matcher.matchers.length == 1) {
+        return flattenAndsAndOrs(matcher.matchers[0]);
+      }
+      const flattened: Matcher[] = [];
+      for (let m of matcher.matchers) {
+        // If a nested matcher has the same type as its parent, flatten it in
+        m = flattenAndsAndOrs(m);
+        if (m.type == matcher.type) {
+          flattened.push(...m.matchers);
+        } else {
+          flattened.push(m);
+        }
+      }
+      matcher.matchers = flattened;
+      return matcher;
+    }
+    default:
+      return matcher;
+  }
+}
+
+export function _parseAndPrintTokens(query: string) {
+  console.log(`PARSING: ${query}`);
+  let tok = LexerToken.from(query);
+  do {
+    console.log(tok?.kind, tok?.text);
+    tok = tok?.next;
+  } while (tok);
+  console.log("DONE");
+}
+
+function consumeTokenStream(token: Token<TokenType>) {
+  let str = "";
+  let tok: Token<TokenType> | undefined = token;
+  do {
+    str += tok.text;
+    tok = tok.next;
+  } while (tok);
+  return str;
+}
+
+export function parseSearchQuery(
+  query: string,
+): TextAndMatcher & { result: "full" | "partial" | "invalid" } {
+  // _parseAndPrintTokens(query); // Uncomment to debug tokenization
+  const parsed = EXP.parse(LexerToken.from(query.trim()));
+  if (!parsed.successful || parsed.candidates.length != 1) {
+    // If the query is not valid, return the whole query as pure text
+    return {
+      text: query,
+      result: "invalid",
+    };
+  }
+
+  const parseCandidate = parsed.candidates[0];
+  if (parseCandidate.result.matcher) {
+    parseCandidate.result.matcher = flattenAndsAndOrs(
+      parseCandidate.result.matcher,
+    );
+  }
+  if (parseCandidate.nextToken) {
+    // The parser failed to consume the whole query. This usually happens
+    // when the user is still typing the query. Return the partial
+    // result and the remaining query as pure text.
+    return {
+      text: (
+        parseCandidate.result.text +
+        consumeTokenStream(parseCandidate.nextToken)
+      ).trim(),
+      matcher: parseCandidate.result.matcher,
+      result: "partial",
+    };
+  }
+
+  return {
+    text: parseCandidate.result.text,
+    matcher: parseCandidate.result.matcher,
+    result: "full",
+  };
+}
```
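
To make the exported API concrete, a minimal usage sketch follows. This is my illustration rather than part of the commit, and the relative import path is assumed; the expected values are derived from the grammar above and from the single-child collapse performed by `flattenAndsAndOrs`:

```typescript
// Hypothetical consumer of parseSearchQuery; the import path is illustrative.
import { parseSearchQuery } from "./searchQueryParser";

// A fully-parsed query: every token contributes to the matcher tree,
// and nested "and" nodes are flattened into one.
const full = parseSearchQuery("is:fav and #work after:2024-01-01");
// full.result  === "full"
// full.text    === ""
// full.matcher === {
//   type: "and",
//   matchers: [
//     { type: "favourited", favourited: true },
//     { type: "tagName", tagName: "work" },
//     { type: "dateAfter", dateAfter: new Date("2024-01-01") },
//   ],
// }

// Free text interleaved with a qualifier stays in `text`; an "and" node
// with a single child collapses to that child.
const mixed = parseSearchQuery("budget report is:archived");
// mixed.result  === "full"
// mixed.text    === "budget report"
// mixed.matcher === { type: "archived", archived: true }

// A query the user is still typing: the parsed prefix becomes the matcher
// and the unconsumed tail is returned as plain text.
const partial = parseSearchQuery("is:fav and (url:");
// partial.result  === "partial"
// partial.text    === "and (url:"
// partial.matcher === { type: "favourited", favourited: true }
```

Since plain whitespace is treated as an implicit `and` (the Space and And token cases share a branch in `EXP`), `"#a #b"` and `"#a and #b"` produce the same matcher tree.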
