| author | Mohamed Bassem <me@mbassem.com> | 2024-12-31 13:17:56 +0200 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-12-31 13:17:56 +0200 |
| commit | cbaf9e6034aa09911fca967b7af6cad11f154b3e | |
| tree | 6995d9d60d9ae5181af78e6577f8d7b724d7a971 /packages | |
| parent | f476fca758bb039f9605488b61ba35fc097d6cfc | |
| download | karakeep-cbaf9e6034aa09911fca967b7af6cad11f154b3e.tar.zst | |
feat: Introduce advanced search capabilities (#753)
* feat: Implement search filtering in the backend
* feat: Implement search language parser
* Rename matcher name
* Add ability to interleave text
* More fixes
* Be more tolerant of parsing errors
* Add a search query explainer widget
* Handle date parsing gracefully
* Fix the lockfile
* Encode query search param
* Fix table body error
* Fix error when writing quotes
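For a sense of the query language this commit introduces: qualifiers such as `is:fav`, `#tag`, `list:`, `url:`, and `after:`/`before:` combine with an implicit AND, explicit `and`/`or`, and parentheses, while free text is collected separately for full-text search. A minimal sketch of the parser's behavior, based on the tests added below (which import the module relatively):

```ts
import { parseSearchQuery } from "./searchQueryParser";

// Free text and qualifiers can be interleaved; qualifiers combine with
// an implicit AND, and the leftover words become the full-text query.
const parsed = parseSearchQuery("hello is:fav #my-tag");
// parsed => {
//   result: "full",
//   text: "hello",
//   matcher: {
//     type: "and",
//     matchers: [
//       { type: "favourited", favourited: true },
//       { type: "tagName", tagName: "my-tag" },
//     ],
//   },
// }
```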
Diffstat (limited to 'packages')
| -rw-r--r-- | packages/shared/package.json | 7 |
| -rw-r--r-- | packages/shared/searchQueryParser.test.ts | 275 |
| -rw-r--r-- | packages/shared/searchQueryParser.ts | 351 |
| -rw-r--r-- | packages/shared/types/search.ts | 72 |
| -rw-r--r-- | packages/shared/vitest.config.ts | 14 |
| -rw-r--r-- | packages/trpc/lib/search.ts | 182 |
| -rw-r--r-- | packages/trpc/routers/bookmarks.ts | 16 |
7 files changed, 914 insertions, 3 deletions
```diff
diff --git a/packages/shared/package.json b/packages/shared/package.json
index d741b70f..d412301a 100644
--- a/packages/shared/package.json
+++ b/packages/shared/package.json
@@ -10,18 +10,21 @@
     "meilisearch": "^0.37.0",
     "ollama": "^0.5.9",
     "openai": "^4.67.1",
+    "typescript-parsec": "^0.3.4",
     "winston": "^3.11.0",
     "zod": "^3.22.4"
   },
   "devDependencies": {
     "@hoarder/eslint-config": "workspace:^0.2.0",
     "@hoarder/prettier-config": "workspace:^0.1.0",
-    "@hoarder/tsconfig": "workspace:^0.1.0"
+    "@hoarder/tsconfig": "workspace:^0.1.0",
+    "vitest": "^1.3.1"
   },
   "scripts": {
     "typecheck": "tsc --noEmit",
     "format": "prettier . --ignore-path ../../.prettierignore",
-    "lint": "eslint ."
+    "lint": "eslint .",
+    "test": "vitest"
   },
   "main": "index.ts",
   "eslintConfig": {
```
"favourited", + favourited: true, + }, + { + type: "archived", + archived: true, + }, + { + type: "tagName", + tagName: "my-tag", + }, + ], + }, + }); + + expect(parseSearchQuery("(is:fav is:archived) or (#my-tag)")).toEqual({ + result: "full", + text: "", + matcher: { + type: "or", + matchers: [ + { + type: "and", + matchers: [ + { + type: "favourited", + favourited: true, + }, + { + type: "archived", + archived: true, + }, + ], + }, + { + type: "tagName", + tagName: "my-tag", + }, + ], + }, + }); + + expect(parseSearchQuery("(is:fav or is:archived) and #my-tag")).toEqual({ + result: "full", + text: "", + matcher: { + type: "and", + matchers: [ + { + type: "or", + matchers: [ + { + type: "favourited", + favourited: true, + }, + { + type: "archived", + archived: true, + }, + ], + }, + { + type: "tagName", + tagName: "my-tag", + }, + ], + }, + }); + }); + test("pure text", () => { + expect(parseSearchQuery("hello")).toEqual({ + result: "full", + text: "hello", + matcher: undefined, + }); + expect(parseSearchQuery("hello world")).toEqual({ + result: "full", + text: "hello world", + matcher: undefined, + }); + }); + + test("text interlived with matchers", () => { + expect( + parseSearchQuery( + "hello is:fav world is:archived mixed world #my-tag test", + ), + ).toEqual({ + result: "full", + text: "hello world mixed world test", + matcher: { + type: "and", + matchers: [ + { + type: "favourited", + favourited: true, + }, + { + type: "archived", + archived: true, + }, + { + type: "tagName", + tagName: "my-tag", + }, + ], + }, + }); + }); + + test("unknown qualifiers are emitted as pure text", () => { + expect(parseSearchQuery("is:fav is:helloworld")).toEqual({ + result: "full", + text: "is:helloworld", + matcher: { + type: "favourited", + favourited: true, + }, + }); + }); + + test("partial results", () => { + expect(parseSearchQuery("(is:archived) or ")).toEqual({ + result: "partial", + text: "or", + matcher: { + type: "archived", + archived: true, + }, + }); + expect(parseSearchQuery("is:fav is: ( random")).toEqual({ + result: "partial", + text: "is: ( random", + matcher: { + type: "favourited", + favourited: true, + }, + }); + }); +}); diff --git a/packages/shared/searchQueryParser.ts b/packages/shared/searchQueryParser.ts new file mode 100644 index 00000000..faf74d08 --- /dev/null +++ b/packages/shared/searchQueryParser.ts @@ -0,0 +1,351 @@ +import { + alt, + alt_sc, + apply, + kmid, + kright, + lrec_sc, + rule, + seq, + str, + tok, + Token, + TokenPosition, +} from "typescript-parsec"; +import { z } from "zod"; + +import { Matcher } from "./types/search"; + +enum TokenType { + And = "AND", + Or = "OR", + + Qualifier = "QUALIFIER", + Ident = "IDENT", + StringLiteral = "STRING_LITERAL", + + LParen = "LPAREN", + RParen = "RPAREN", + Space = "SPACE", + Hash = "HASH", +} + +// Rules are in order of priority +const lexerRules: [RegExp, TokenType][] = [ + [/^and/i, TokenType.And], + [/^or/i, TokenType.Or], + + [/^#/, TokenType.Hash], + [/^(is|url|list|after|before):/, TokenType.Qualifier], + + [/^"([^"]+)"/, TokenType.StringLiteral], + + [/^\(/, TokenType.LParen], + [/^\)/, TokenType.RParen], + [/^\s+/, TokenType.Space], + + // This needs to be last as it matches a lot of stuff + [/^[^ )(]+/, TokenType.Ident], +] as const; + +class LexerToken implements Token<TokenType> { + private constructor( + private readonly input: string, + public kind: TokenType, + public text: string, + public pos: TokenPosition, + ) {} + + public static from(input: string): Token<TokenType> | undefined { + const tok = new 
```diff
diff --git a/packages/shared/searchQueryParser.ts b/packages/shared/searchQueryParser.ts
new file mode 100644
index 00000000..faf74d08
--- /dev/null
+++ b/packages/shared/searchQueryParser.ts
@@ -0,0 +1,351 @@
+import {
+  alt,
+  alt_sc,
+  apply,
+  kmid,
+  kright,
+  lrec_sc,
+  rule,
+  seq,
+  str,
+  tok,
+  Token,
+  TokenPosition,
+} from "typescript-parsec";
+import { z } from "zod";
+
+import { Matcher } from "./types/search";
+
+enum TokenType {
+  And = "AND",
+  Or = "OR",
+
+  Qualifier = "QUALIFIER",
+  Ident = "IDENT",
+  StringLiteral = "STRING_LITERAL",
+
+  LParen = "LPAREN",
+  RParen = "RPAREN",
+  Space = "SPACE",
+  Hash = "HASH",
+}
+
+// Rules are in order of priority
+const lexerRules: [RegExp, TokenType][] = [
+  [/^and/i, TokenType.And],
+  [/^or/i, TokenType.Or],
+
+  [/^#/, TokenType.Hash],
+  [/^(is|url|list|after|before):/, TokenType.Qualifier],
+
+  [/^"([^"]+)"/, TokenType.StringLiteral],
+
+  [/^\(/, TokenType.LParen],
+  [/^\)/, TokenType.RParen],
+  [/^\s+/, TokenType.Space],
+
+  // This needs to be last as it matches a lot of stuff
+  [/^[^ )(]+/, TokenType.Ident],
+] as const;
+
+class LexerToken implements Token<TokenType> {
+  private constructor(
+    private readonly input: string,
+    public kind: TokenType,
+    public text: string,
+    public pos: TokenPosition,
+  ) {}
+
+  public static from(input: string): Token<TokenType> | undefined {
+    const tok = new LexerToken(
+      input,
+      /* Doesn't matter */ TokenType.Ident,
+      "",
+      {
+        index: 0,
+        rowBegin: 1,
+        rowEnd: 1,
+        columnBegin: 0,
+        columnEnd: 0,
+      },
+    );
+    return tok.next;
+  }
+
+  public get next(): Token<TokenType> | undefined {
+    if (!this.input.length) {
+      return undefined;
+    }
+
+    for (const [regex, tokenType] of lexerRules) {
+      const matchRes = regex.exec(this.input);
+      if (!matchRes) {
+        continue;
+      }
+      const match = matchRes[0];
+      return new LexerToken(this.input.slice(match.length), tokenType, match, {
+        index: this.pos.index + match.length,
+        columnBegin: this.pos.index + 1,
+        columnEnd: this.pos.index + 1 + match.length,
+        // Our strings are always only one line
+        rowBegin: 1,
+        rowEnd: 1,
+      });
+    }
+    // No match
+    throw new Error(
+      `Failed to tokenize the token at position ${this.pos.index}: ${this.input[0]}`,
+    );
+  }
+}
+
+export interface TextAndMatcher {
+  text: string;
+  matcher?: Matcher;
+}
+
+const MATCHER = rule<TokenType, TextAndMatcher>();
+const EXP = rule<TokenType, TextAndMatcher>();
+
+MATCHER.setPattern(
+  alt_sc(
+    apply(kright(str("is:"), tok(TokenType.Ident)), (toks) => {
+      switch (toks.text) {
+        case "fav":
+          return {
+            text: "",
+            matcher: { type: "favourited", favourited: true },
+          };
+        case "not_fav":
+          return {
+            text: "",
+            matcher: { type: "favourited", favourited: false },
+          };
+        case "archived":
+          return {
+            text: "",
+            matcher: { type: "archived", archived: true },
+          };
+        case "not_archived":
+          return {
+            text: "",
+            matcher: { type: "archived", archived: false },
+          };
+        default:
+          // If the token is not known, emit it as pure text
+          return {
+            text: `is:${toks.text}`,
+            matcher: undefined,
+          };
+      }
+    }),
+    apply(
+      seq(
+        alt(tok(TokenType.Qualifier), tok(TokenType.Hash)),
+        alt(
+          apply(tok(TokenType.Ident), (tok) => {
+            return tok.text;
+          }),
+          apply(tok(TokenType.StringLiteral), (tok) => {
+            return tok.text.slice(1, -1);
+          }),
+        ),
+      ),
+      (toks) => {
+        switch (toks[0].text) {
+          case "url:":
+            return {
+              text: "",
+              matcher: { type: "url", url: toks[1] },
+            };
+          case "#":
+            return {
+              text: "",
+              matcher: { type: "tagName", tagName: toks[1] },
+            };
+          case "list:":
+            return {
+              text: "",
+              matcher: { type: "listName", listName: toks[1] },
+            };
+          case "after:":
+            try {
+              return {
+                text: "",
+                matcher: {
+                  type: "dateAfter",
+                  dateAfter: z.coerce.date().parse(toks[1]),
+                },
+              };
+            } catch (e) {
+              return {
+                // If parsing the date fails, emit it as pure text
+                text: toks[0].text + toks[1],
+                matcher: undefined,
+              };
+            }
+          case "before:":
+            try {
+              return {
+                text: "",
+                matcher: {
+                  type: "dateBefore",
+                  dateBefore: z.coerce.date().parse(toks[1]),
+                },
+              };
+            } catch (e) {
+              return {
+                // If parsing the date fails, emit it as pure text
+                text: toks[0].text + toks[1],
+                matcher: undefined,
+              };
+            }
+          default:
+            // If the token is not known, emit it as pure text
+            return {
+              text: toks[0].text + toks[1],
+              matcher: undefined,
+            };
+        }
+      },
+    ),
+    // Ident or an incomplete qualifier
+    apply(alt(tok(TokenType.Ident), tok(TokenType.Qualifier)), (toks) => {
+      return {
+        text: toks.text,
+        matcher: undefined,
+      };
+    }),
+    kmid(tok(TokenType.LParen), EXP, tok(TokenType.RParen)),
+  ),
+);
+
+EXP.setPattern(
+  lrec_sc(
+    MATCHER,
+    seq(
+      alt(
+        tok(TokenType.Space),
+        kmid(tok(TokenType.Space), tok(TokenType.And), tok(TokenType.Space)),
+        kmid(tok(TokenType.Space), tok(TokenType.Or), tok(TokenType.Space)),
+      ),
+      MATCHER,
+    ),
+    (toks, next) => {
+      switch (next[0].kind) {
+        case TokenType.Space:
+        case TokenType.And:
+          return {
+            text: [toks.text, next[1].text].join(" ").trim(),
+            matcher:
+              !!toks.matcher || !!next[1].matcher
+                ? {
+                    type: "and",
+                    matchers: [toks.matcher, next[1].matcher].filter(
+                      (a) => !!a,
+                    ) as Matcher[],
+                  }
+                : undefined,
+          };
+        case TokenType.Or:
+          return {
+            text: [toks.text, next[1].text].join(" ").trim(),
+            matcher:
+              !!toks.matcher || !!next[1].matcher
+                ? {
+                    type: "or",
+                    matchers: [toks.matcher, next[1].matcher].filter(
+                      (a) => !!a,
+                    ) as Matcher[],
+                  }
+                : undefined,
+          };
+      }
+    },
+  ),
+);
+
+function flattenAndsAndOrs(matcher: Matcher): Matcher {
+  switch (matcher.type) {
+    case "and":
+    case "or": {
+      if (matcher.matchers.length == 1) {
+        return flattenAndsAndOrs(matcher.matchers[0]);
+      }
+      const flattened: Matcher[] = [];
+      for (let m of matcher.matchers) {
+        // If inside the matcher is another matcher of the same type, flatten it
+        m = flattenAndsAndOrs(m);
+        if (m.type == matcher.type) {
+          flattened.push(...m.matchers);
+        } else {
+          flattened.push(m);
+        }
+      }
+      matcher.matchers = flattened;
+      return matcher;
+    }
+    default:
+      return matcher;
+  }
+}
+
+export function _parseAndPrintTokens(query: string) {
+  console.log(`PARSING: ${query}`);
+  let tok = LexerToken.from(query);
+  do {
+    console.log(tok?.kind, tok?.text);
+    tok = tok?.next;
+  } while (tok);
+  console.log("DONE");
+}
+
+function consumeTokenStream(token: Token<TokenType>) {
+  let str = "";
+  let tok: Token<TokenType> | undefined = token;
+  do {
+    str += tok.text;
+    tok = tok.next;
+  } while (tok);
+  return str;
+}
+
+export function parseSearchQuery(
+  query: string,
+): TextAndMatcher & { result: "full" | "partial" | "invalid" } {
+  // _parseAndPrintTokens(query); // Uncomment to debug tokenization
+  const parsed = EXP.parse(LexerToken.from(query.trim()));
+  if (!parsed.successful || parsed.candidates.length != 1) {
+    // If the query is not valid, return the whole query as pure text
+    return {
+      text: query,
+      result: "invalid",
+    };
+  }
+
+  const parseCandidate = parsed.candidates[0];
+  if (parseCandidate.result.matcher) {
+    parseCandidate.result.matcher = flattenAndsAndOrs(
+      parseCandidate.result.matcher,
+    );
+  }
+  if (parseCandidate.nextToken) {
+    // Parser failed to consume the whole query. This usually happens
+    // when the user is still typing the query. Return the partial
+    // result and the remaining query as pure text
+    return {
+      text: (
+        parseCandidate.result.text +
+        consumeTokenStream(parseCandidate.nextToken)
+      ).trim(),
+      matcher: parseCandidate.result.matcher,
+      result: "partial",
+    };
+  }
+
+  return {
+    text: parseCandidate.result.text,
+    matcher: parseCandidate.result.matcher,
+    result: "full",
+  };
}
```
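The parser builds on a hand-rolled lexer; the exported `_parseAndPrintTokens` helper makes it easy to inspect how a query tokenizes. A small sketch (the token dump in the comment is inferred from the lexer rules above):

```ts
import { _parseAndPrintTokens } from "./searchQueryParser";

// Dumps one line per token, kind then text. For this query the lexer
// should emit: LPAREN "(", QUALIFIER "is:", IDENT "fav", SPACE " ",
// OR "or", SPACE " ", QUALIFIER "is:", IDENT "archived", RPAREN ")",
// SPACE " ", AND "and", SPACE " ", HASH "#", IDENT "my-tag"
_parseAndPrintTokens("(is:fav or is:archived) and #my-tag");
```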
```diff
diff --git a/packages/shared/types/search.ts b/packages/shared/types/search.ts
new file mode 100644
index 00000000..d430dad5
--- /dev/null
+++ b/packages/shared/types/search.ts
@@ -0,0 +1,72 @@
+import { z } from "zod";
+
+const zTagNameMatcher = z.object({
+  type: z.literal("tagName"),
+  tagName: z.string(),
+});
+
+const zListNameMatcher = z.object({
+  type: z.literal("listName"),
+  listName: z.string(),
+});
+
+const zArchivedMatcher = z.object({
+  type: z.literal("archived"),
+  archived: z.boolean(),
+});
+
+const urlMatcher = z.object({
+  type: z.literal("url"),
+  url: z.string(),
+});
+
+const zFavouritedMatcher = z.object({
+  type: z.literal("favourited"),
+  favourited: z.boolean(),
+});
+
+const zDateAfterMatcher = z.object({
+  type: z.literal("dateAfter"),
+  dateAfter: z.date(),
+});
+
+const zDateBeforeMatcher = z.object({
+  type: z.literal("dateBefore"),
+  dateBefore: z.date(),
+});
+
+const zNonRecursiveMatcher = z.union([
+  zTagNameMatcher,
+  zListNameMatcher,
+  zArchivedMatcher,
+  urlMatcher,
+  zFavouritedMatcher,
+  zDateAfterMatcher,
+  zDateBeforeMatcher,
+]);
+
+type NonRecursiveMatcher = z.infer<typeof zNonRecursiveMatcher>;
+export type Matcher =
+  | NonRecursiveMatcher
+  | { type: "and"; matchers: Matcher[] }
+  | { type: "or"; matchers: Matcher[] };
+
+export const zMatcherSchema: z.ZodType<Matcher> = z.lazy(() => {
+  return z.discriminatedUnion("type", [
+    zTagNameMatcher,
+    zListNameMatcher,
+    zArchivedMatcher,
+    urlMatcher,
+    zFavouritedMatcher,
+    zDateAfterMatcher,
+    zDateBeforeMatcher,
+    z.object({
+      type: z.literal("and"),
+      matchers: z.array(zMatcherSchema),
+    }),
+    z.object({
+      type: z.literal("or"),
+      matchers: z.array(zMatcherSchema),
+    }),
+  ]);
+});
```
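Because `Matcher` is recursive, the schema is wrapped in `z.lazy` so that `and`/`or` nodes can validate their children with the same schema. A quick sketch of validating an untrusted matcher coming over the wire, using the same import path the tRPC code below uses:

```ts
import { zMatcherSchema } from "@hoarder/shared/types/search";

// Throws if the shape is invalid; returns a typed Matcher otherwise.
const matcher = zMatcherSchema.parse({
  type: "and",
  matchers: [
    { type: "archived", archived: false },
    { type: "tagName", tagName: "my-tag" },
  ],
});
```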
```diff
diff --git a/packages/shared/vitest.config.ts b/packages/shared/vitest.config.ts
new file mode 100644
index 00000000..41fd70c4
--- /dev/null
+++ b/packages/shared/vitest.config.ts
@@ -0,0 +1,14 @@
+/// <reference types="vitest" />
+
+import tsconfigPaths from "vite-tsconfig-paths";
+import { defineConfig } from "vitest/config";
+
+// https://vitejs.dev/config/
+export default defineConfig({
+  plugins: [tsconfigPaths()],
+  test: {
+    alias: {
+      "@/*": "./*",
+    },
+  },
+});
```
```diff
diff --git a/packages/trpc/lib/search.ts b/packages/trpc/lib/search.ts
new file mode 100644
index 00000000..0ee9c76e
--- /dev/null
+++ b/packages/trpc/lib/search.ts
@@ -0,0 +1,182 @@
+import { and, eq, gte, like, lte, sql } from "drizzle-orm";
+
+import {
+  bookmarkLinks,
+  bookmarkLists,
+  bookmarks,
+  bookmarksInLists,
+  bookmarkTags,
+  tagsOnBookmarks,
+} from "@hoarder/db/schema";
+import { Matcher } from "@hoarder/shared/types/search";
+
+import { AuthedContext } from "..";
+
+interface BookmarkQueryReturnType {
+  id: string;
+}
+
+function intersect(
+  vals: BookmarkQueryReturnType[][],
+): BookmarkQueryReturnType[] {
+  if (!vals || vals.length === 0) {
+    return [];
+  }
+
+  if (vals.length === 1) {
+    return [...vals[0]];
+  }
+
+  const countMap = new Map<string, number>();
+  const map = new Map<string, BookmarkQueryReturnType>();
+
+  for (const arr of vals) {
+    for (const item of arr) {
+      countMap.set(item.id, (countMap.get(item.id) ?? 0) + 1);
+      map.set(item.id, item);
+    }
+  }
+
+  const result: BookmarkQueryReturnType[] = [];
+  for (const [id, count] of countMap) {
+    if (count === vals.length) {
+      result.push(map.get(id)!);
+    }
+  }
+
+  return result;
+}
+
+function union(vals: BookmarkQueryReturnType[][]): BookmarkQueryReturnType[] {
+  if (!vals || vals.length === 0) {
+    return [];
+  }
+
+  const uniqueIds = new Set<string>();
+  const map = new Map<string, BookmarkQueryReturnType>();
+  for (const arr of vals) {
+    for (const item of arr) {
+      uniqueIds.add(item.id);
+      map.set(item.id, item);
+    }
+  }
+
+  const result: BookmarkQueryReturnType[] = [];
+  for (const id of uniqueIds) {
+    result.push(map.get(id)!);
+  }
+
+  return result;
+}
+
+async function getIds(
+  db: AuthedContext["db"],
+  userId: string,
+  matcher: Matcher,
+): Promise<BookmarkQueryReturnType[]> {
+  switch (matcher.type) {
+    case "tagName": {
+      return db
+        .select({ id: sql<string>`${tagsOnBookmarks.bookmarkId}`.as("id") })
+        .from(tagsOnBookmarks)
+        .innerJoin(bookmarkTags, eq(tagsOnBookmarks.tagId, bookmarkTags.id))
+        .where(
+          and(
+            eq(bookmarkTags.userId, userId),
+            eq(bookmarkTags.name, matcher.tagName),
+          ),
+        );
+    }
+    case "listName": {
+      return db
+        .select({ id: sql<string>`${bookmarksInLists.bookmarkId}`.as("id") })
+        .from(bookmarksInLists)
+        .innerJoin(bookmarkLists, eq(bookmarksInLists.listId, bookmarkLists.id))
+        .where(
+          and(
+            eq(bookmarkLists.userId, userId),
+            eq(bookmarkLists.name, matcher.listName),
+          ),
+        );
+    }
+    case "archived": {
+      return db
+        .select({ id: bookmarks.id })
+        .from(bookmarks)
+        .where(
+          and(
+            eq(bookmarks.userId, userId),
+            eq(bookmarks.archived, matcher.archived),
+          ),
+        );
+    }
+    case "url": {
+      return db
+        .select({ id: bookmarkLinks.id })
+        .from(bookmarkLinks)
+        .leftJoin(bookmarks, eq(bookmarks.id, bookmarkLinks.id))
+        .where(
+          and(
+            eq(bookmarks.userId, userId),
+            like(bookmarkLinks.url, `%${matcher.url}%`),
+          ),
+        );
+    }
+    case "favourited": {
+      return db
+        .select({ id: bookmarks.id })
+        .from(bookmarks)
+        .where(
+          and(
+            eq(bookmarks.userId, userId),
+            eq(bookmarks.favourited, matcher.favourited),
+          ),
+        );
+    }
+    case "dateAfter": {
+      return db
+        .select({ id: bookmarks.id })
+        .from(bookmarks)
+        .where(
+          and(
+            eq(bookmarks.userId, userId),
+            gte(bookmarks.createdAt, matcher.dateAfter),
+          ),
+        );
+    }
+    case "dateBefore": {
+      return db
+        .select({ id: bookmarks.id })
+        .from(bookmarks)
+        .where(
+          and(
+            eq(bookmarks.userId, userId),
+            lte(bookmarks.createdAt, matcher.dateBefore),
+          ),
+        );
+    }
+    case "and": {
+      const vals = await Promise.all(
+        matcher.matchers.map((m) => getIds(db, userId, m)),
+      );
+      return intersect(vals);
+    }
+    case "or": {
+      const vals = await Promise.all(
+        matcher.matchers.map((m) => getIds(db, userId, m)),
+      );
+      return union(vals);
+    }
+    default: {
+      throw new Error("Unknown matcher type");
+    }
+  }
+}
+
+export async function getBookmarkIdsFromMatcher(
+  ctx: AuthedContext,
+  matcher: Matcher,
+): Promise<string[]> {
+  const results = await getIds(ctx.db, ctx.user.id, matcher);
+  return results.map((r) => r.id);
+}
```
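Each leaf matcher resolves to a list of bookmark ids through its own drizzle query; `and` nodes intersect the per-child id lists and `or` nodes union them. A minimal usage sketch, mirroring the relative imports the router below uses (the wrapper function is illustrative, not part of the commit):

```ts
// Relative to a module in the trpc package, as in routers/bookmarks.ts:
import type { AuthedContext } from "../index";

import { getBookmarkIdsFromMatcher } from "../lib/search";

// Ids of favourited bookmarks created on or after 2024-01-01:
// the "and" node intersects the two per-matcher id lists.
async function favouritedThisYear(ctx: AuthedContext): Promise<string[]> {
  return getBookmarkIdsFromMatcher(ctx, {
    type: "and",
    matchers: [
      { type: "favourited", favourited: true },
      { type: "dateAfter", dateAfter: new Date("2024-01-01") },
    ],
  });
}
```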
```diff
diff --git a/packages/trpc/routers/bookmarks.ts b/packages/trpc/routers/bookmarks.ts
index 254ac6c2..3320b3b9 100644
--- a/packages/trpc/routers/bookmarks.ts
+++ b/packages/trpc/routers/bookmarks.ts
@@ -45,6 +45,7 @@ import {
   zNewBookmarkRequestSchema,
   zUpdateBookmarksRequestSchema,
 } from "@hoarder/shared/types/bookmarks";
+import { zMatcherSchema } from "@hoarder/shared/types/search";
 
 import type { AuthedContext, Context } from "../index";
 import { authedProcedure, router } from "../index";
@@ -54,6 +55,7 @@ import {
   mapDBAssetTypeToUserType,
   mapSchemaAssetTypeToDB,
 } from "../lib/attachments";
+import { getBookmarkIdsFromMatcher } from "../lib/search";
 
 export const ensureBookmarkOwnership = experimental_trpcMiddleware<{
   ctx: Context;
@@ -521,6 +523,7 @@ export const bookmarksAppRouter = router({
     .input(
       z.object({
         text: z.string(),
+        matcher: zMatcherSchema.optional(),
         cursor: z
           .object({
             offset: z.number(),
@@ -548,8 +551,19 @@
           message: "Search functionality is not configured",
         });
       }
+
+      let filter: string[];
+      if (input.matcher) {
+        const bookmarkIds = await getBookmarkIdsFromMatcher(ctx, input.matcher);
+        filter = [
+          `userId = '${ctx.user.id}' AND id IN [${bookmarkIds.join(",")}]`,
+        ];
+      } else {
+        filter = [`userId = '${ctx.user.id}'`];
+      }
+
       const resp = await client.search(input.text, {
-        filter: [`userId = '${ctx.user.id}'`],
+        filter,
         showRankingScore: true,
         attributesToRetrieve: ["id"],
         sort: ["createdAt:desc"],
```
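End to end, a client can now parse the raw query string and forward both halves to the search procedure above: the residual `text` goes to Meilisearch full-text search, and the `matcher` pre-filters the candidate ids. A sketch of that flow; `api` is a hypothetical tRPC client handle, `searchBookmarks` an assumed procedure name (the hunk above does not show the procedure's identifier), and the deep import path is likewise an assumption:

```ts
import { parseSearchQuery } from "@hoarder/shared/searchQueryParser"; // assumed path

// Hypothetical caller: `api` stands in for a tRPC client for
// bookmarksAppRouter, `searchBookmarks` for the procedure whose
// input schema gained the optional `matcher` field in this commit.
async function search(api: any, raw: string) {
  const { result, text, matcher } = parseSearchQuery(raw);
  if (result === "invalid") {
    return []; // fall back however the UI prefers
  }
  return api.bookmarks.searchBookmarks.query({ text, matcher });
}
```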
