author    Mohamed Bassem <me@mbassem.com>  2024-12-31 13:17:56 +0200
committer GitHub <noreply@github.com>  2024-12-31 13:17:56 +0200
commit    cbaf9e6034aa09911fca967b7af6cad11f154b3e (patch)
tree      6995d9d60d9ae5181af78e6577f8d7b724d7a971 /packages/shared/searchQueryParser.ts
parent    f476fca758bb039f9605488b61ba35fc097d6cfc (diff)
download  karakeep-cbaf9e6034aa09911fca967b7af6cad11f154b3e.tar.zst
feat: Introduce advanced search capabilities (#753)
* feat: Implement search filtering in the backend
* feat: Implement search language parser
* Rename matcher name
* Add ability to interleave text
* More fixes
* Be more tolerant of parsing errors
* Add a search query explainer widget
* Handle date parsing gracefully
* Fix the lockfile
* Encode query search param
* Fix table body error
* Fix error when writing quotes
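A minimal usage sketch (the import specifier is hypothetical; it depends on how the monorepo aliases packages/shared):

    import { parseSearchQuery } from "@karakeep/shared/searchQueryParser";

    // "url:" is lifted into a structured matcher; bare words stay as full-text terms.
    const parsed = parseSearchQuery("url:example.com budget report");
    // => { text: "budget report",
    //      matcher: { type: "url", url: "example.com" },
    //      result: "full" }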
Diffstat (limited to 'packages/shared/searchQueryParser.ts')
-rw-r--r--  packages/shared/searchQueryParser.ts  351
1 file changed, 351 insertions, 0 deletions
diff --git a/packages/shared/searchQueryParser.ts b/packages/shared/searchQueryParser.ts
new file mode 100644
index 00000000..faf74d08
--- /dev/null
+++ b/packages/shared/searchQueryParser.ts
@@ -0,0 +1,351 @@
+import {
+ alt,
+ alt_sc,
+ apply,
+ kmid,
+ kright,
+ lrec_sc,
+ rule,
+ seq,
+ str,
+ tok,
+ Token,
+ TokenPosition,
+} from "typescript-parsec";
+import { z } from "zod";
+
+import { Matcher } from "./types/search";
+
+enum TokenType {
+ And = "AND",
+ Or = "OR",
+
+ Qualifier = "QUALIFIER",
+ Ident = "IDENT",
+ StringLiteral = "STRING_LITERAL",
+
+ LParen = "LPAREN",
+ RParen = "RPAREN",
+ Space = "SPACE",
+ Hash = "HASH",
+}
+
+// Rules are in order of priority
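+// e.g. 'list:reading "multi word"' lexes as QUALIFIER("list:"),
+// IDENT("reading"), SPACE, STRING_LITERAL('"multi word"').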
+const lexerRules: [RegExp, TokenType][] = [
+ // \b prevents e.g. "android" from lexing as AND followed by "roid"
+ [/^and\b/i, TokenType.And],
+ [/^or\b/i, TokenType.Or],
+
+ [/^#/, TokenType.Hash],
+ [/^(is|url|list|after|before):/, TokenType.Qualifier],
+
+ [/^"([^"]+)"/, TokenType.StringLiteral],
+
+ [/^\(/, TokenType.LParen],
+ [/^\)/, TokenType.RParen],
+ [/^\s+/, TokenType.Space],
+
+ // This must stay last: it matches almost any run of characters
+ [/^[^ )(]+/, TokenType.Ident],
+] as const;
+
+class LexerToken implements Token<TokenType> {
+ private constructor(
+ private readonly input: string,
+ public kind: TokenType,
+ public text: string,
+ public pos: TokenPosition,
+ ) {}
+
+ public static from(input: string): Token<TokenType> | undefined {
+ const tok = new LexerToken(
+ input,
+ /* kind is never read on this seed token */ TokenType.Ident,
+ "",
+ {
+ index: 0,
+ rowBegin: 1,
+ rowEnd: 1,
+ columnBegin: 0,
+ columnEnd: 0,
+ },
+ );
+ return tok.next;
+ }
+
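+ // Lazily tokenizes the rest of the input: typescript-parsec pulls tokens
+ // on demand by walking this `next` chain.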
+ public get next(): Token<TokenType> | undefined {
+ if (!this.input.length) {
+ return undefined;
+ }
+
+ for (const [regex, tokenType] of lexerRules) {
+ const matchRes = regex.exec(this.input);
+ if (!matchRes) {
+ continue;
+ }
+ const match = matchRes[0];
+ return new LexerToken(this.input.slice(match.length), tokenType, match, {
+ index: this.pos.index + match.length,
+ columnBegin: this.pos.index + 1,
+ columnEnd: this.pos.index + 1 + match.length,
+ // Search queries are always a single line
+ rowBegin: 1,
+ rowEnd: 1,
+ });
+ }
+ // No match
+ throw new Error(
+ `Failed to tokenize the input at position ${this.pos.index}: ${this.input[0]}`,
+ );
+ }
+}
+
+export interface TextAndMatcher {
+ text: string;
+ matcher?: Matcher;
+}
+
+const MATCHER = rule<TokenType, TextAndMatcher>();
+const EXP = rule<TokenType, TextAndMatcher>();
+
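+// A MATCHER is a single search term: an is:/url:/list:/after:/before:
+// qualifier, a #tag, a bare word, or a parenthesized sub-expression.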
+MATCHER.setPattern(
+ alt_sc(
+ apply(kright(str("is:"), tok(TokenType.Ident)), (toks) => {
+ switch (toks.text) {
+ case "fav":
+ return {
+ text: "",
+ matcher: { type: "favourited", favourited: true },
+ };
+ case "not_fav":
+ return {
+ text: "",
+ matcher: { type: "favourited", favourited: false },
+ };
+ case "archived":
+ return {
+ text: "",
+ matcher: { type: "archived", archived: true },
+ };
+ case "not_archived":
+ return {
+ text: "",
+ matcher: { type: "archived", archived: false },
+ };
+ default:
+ // If the token is not known, emit it as pure text
+ return {
+ text: `is:${toks.text}`,
+ matcher: undefined,
+ };
+ }
+ }),
+ apply(
+ seq(
+ alt(tok(TokenType.Qualifier), tok(TokenType.Hash)),
+ alt(
+ apply(tok(TokenType.Ident), (tok) => {
+ return tok.text;
+ }),
+ apply(tok(TokenType.StringLiteral), (tok) => {
+ return tok.text.slice(1, -1);
+ }),
+ ),
+ ),
+ (toks) => {
+ switch (toks[0].text) {
+ case "url:":
+ return {
+ text: "",
+ matcher: { type: "url", url: toks[1] },
+ };
+ case "#":
+ return {
+ text: "",
+ matcher: { type: "tagName", tagName: toks[1] },
+ };
+ case "list:":
+ return {
+ text: "",
+ matcher: { type: "listName", listName: toks[1] },
+ };
+ case "after:":
+ try {
+ return {
+ text: "",
+ matcher: {
+ type: "dateAfter",
+ dateAfter: z.coerce.date().parse(toks[1]),
+ },
+ };
+ } catch (e) {
+ return {
+ // If parsing the date fails, emit it as pure text
+ text: toks[0].text + toks[1],
+ matcher: undefined,
+ };
+ }
+ case "before:":
+ try {
+ return {
+ text: "",
+ matcher: {
+ type: "dateBefore",
+ dateBefore: z.coerce.date().parse(toks[1]),
+ },
+ };
+ } catch (e) {
+ return {
+ // If parsing the date fails, emit it as pure text
+ text: toks[0].text + toks[1],
+ matcher: undefined,
+ };
+ }
+ default:
+ // If the token is not known, emit it as pure text
+ return {
+ text: toks[0].text + toks[1],
+ matcher: undefined,
+ };
+ }
+ },
+ ),
+ // Ident, or an incomplete qualifier (e.g. "is:" while the user is still typing)
+ apply(alt(tok(TokenType.Ident), tok(TokenType.Qualifier)), (toks) => {
+ return {
+ text: toks.text,
+ matcher: undefined,
+ };
+ }),
+ kmid(tok(TokenType.LParen), EXP, tok(TokenType.RParen)),
+ ),
+);
+
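+// An EXP is one or more MATCHERs separated by a space (implicit AND) or an
+// explicit AND/OR. lrec_sc folds left-to-right with no operator precedence,
+// e.g. `is:fav or #a #b` groups as and(or(is:fav, #a), #b).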
+EXP.setPattern(
+ lrec_sc(
+ MATCHER,
+ seq(
+ alt(
+ tok(TokenType.Space),
+ kmid(tok(TokenType.Space), tok(TokenType.And), tok(TokenType.Space)),
+ kmid(tok(TokenType.Space), tok(TokenType.Or), tok(TokenType.Space)),
+ ),
+ MATCHER,
+ ),
+ (toks, next) => {
+ switch (next[0].kind) {
+ case TokenType.Space:
+ case TokenType.And:
+ return {
+ text: [toks.text, next[1].text].join(" ").trim(),
+ matcher:
+ !!toks.matcher || !!next[1].matcher
+ ? {
+ type: "and",
+ matchers: [toks.matcher, next[1].matcher].filter(
+ (a) => !!a,
+ ) as Matcher[],
+ }
+ : undefined,
+ };
+ case TokenType.Or:
+ return {
+ text: [toks.text, next[1].text].join(" ").trim(),
+ matcher:
+ !!toks.matcher || !!next[1].matcher
+ ? {
+ type: "or",
+ matchers: [toks.matcher, next[1].matcher].filter(
+ (a) => !!a,
+ ) as Matcher[],
+ }
+ : undefined,
+ };
+ }
+ },
+ ),
+);
+
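+// Collapses the nested boolean trees produced by the left-recursive parse:
+// and(and(a, b), c) becomes and(a, b, c), and a single-child and(x) becomes x.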
+function flattenAndsAndOrs(matcher: Matcher): Matcher {
+ switch (matcher.type) {
+ case "and":
+ case "or": {
+ if (matcher.matchers.length === 1) {
+ return flattenAndsAndOrs(matcher.matchers[0]);
+ }
+ const flattened: Matcher[] = [];
+ for (let m of matcher.matchers) {
+ // If inside the matcher is another matcher of the same type, flatten it
+ m = flattenAndsAndOrs(m);
+ if (m.type === matcher.type) {
+ flattened.push(...m.matchers);
+ } else {
+ flattened.push(m);
+ }
+ }
+ matcher.matchers = flattened;
+ return matcher;
+ }
+ default:
+ return matcher;
+ }
+}
+
+export function _parseAndPrintTokens(query: string) {
+ console.log(`PARSING: ${query}`);
+ let tok = LexerToken.from(query);
+ do {
+ console.log(tok?.kind, tok?.text);
+ tok = tok?.next;
+ } while (tok);
+ console.log("DONE");
+}
+
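+// Re-assembles an unconsumed token stream into its original text (space
+// tokens included) so the unparsed tail of a query can be returned verbatim.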
+function consumeTokenStream(token: Token<TokenType>) {
+ // Named `text` to avoid shadowing the `str` combinator imported above
+ let text = "";
+ let tok: Token<TokenType> | undefined = token;
+ do {
+ text += tok.text;
+ tok = tok.next;
+ } while (tok);
+ return text;
+}
+
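+// Splits a query into free-text terms plus a structured matcher tree,
+// e.g. parseSearchQuery("is:fav #work new york") returns text "new york",
+// result "full", and an "and" matcher combining favourited:true with
+// tagName:"work".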
+export function parseSearchQuery(
+ query: string,
+): TextAndMatcher & { result: "full" | "partial" | "invalid" } {
+ // _parseAndPrintTokens(query); // Uncomment to debug tokenization
+ const parsed = EXP.parse(LexerToken.from(query.trim()));
+ if (!parsed.successful || parsed.candidates.length !== 1) {
+ // If the query is not valid, return the whole query as pure text
+ return {
+ text: query,
+ result: "invalid",
+ };
+ }
+
+ const parseCandidate = parsed.candidates[0];
+ if (parseCandidate.result.matcher) {
+ parseCandidate.result.matcher = flattenAndsAndOrs(
+ parseCandidate.result.matcher,
+ );
+ }
+ if (parseCandidate.nextToken) {
+ // The parser failed to consume the whole query. This usually happens
+ // when the user is still typing. Return the partial result and the
+ // remaining query as pure text.
+ return {
+ text: (
+ parseCandidate.result.text +
+ consumeTokenStream(parseCandidate.nextToken)
+ ).trim(),
+ matcher: parseCandidate.result.matcher,
+ result: "partial",
+ };
+ }
+
+ return {
+ text: parseCandidate.result.text,
+ matcher: parseCandidate.result.matcher,
+ result: "full",
+ };
+}