diff options
| author | Mohamed Bassem <me@mbassem.com> | 2026-01-03 10:39:45 +0000 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2026-01-03 10:39:45 +0000 |
| commit | 6fe20639702e3eb81bd262075094fb5d1f7033b9 (patch) | |
| tree | 9f6ee471afe159158184dfbad9d61bee236307b8 /packages | |
| parent | 1af9b9ddf69cc7215d10e2f0713123756b36077b (diff) | |
| download | karakeep-6fe20639702e3eb81bd262075094fb5d1f7033b9.tar.zst | |
fix: Eliminate the O(n^2) folder-path lookup in the Netscape bookmark import parser (#2338)
* fix: Eliminate the O(n^2) folder-path lookup in the Netscape bookmark import parser
* remove unneeded tests
Diffstat (limited to 'packages')
| -rw-r--r-- | packages/shared/import-export/parsers.test.ts | 301 | ||||
| -rw-r--r-- | packages/shared/import-export/parsers.ts | 84 |
2 files changed, 351 insertions, 34 deletions
diff --git a/packages/shared/import-export/parsers.test.ts b/packages/shared/import-export/parsers.test.ts new file mode 100644 index 00000000..18502305 --- /dev/null +++ b/packages/shared/import-export/parsers.test.ts @@ -0,0 +1,301 @@ +import { describe, expect, it } from "vitest"; + +import { parseImportFile } from "./parsers"; + +describe("parseNetscapeBookmarkFile", () => { + it("parses a simple bookmark file with single bookmark", () => { + const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1> +<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8"> +<TITLE>Bookmarks</TITLE> +<H1>Bookmarks</H1> +<DL><p> + <DT><A HREF="https://example.com" ADD_DATE="1234567890">Example Site</A> +</DL><p>`; + + const result = parseImportFile("html", html); + + expect(result).toHaveLength(1); + expect(result[0]).toMatchObject({ + title: "Example Site", + content: { + type: "link", + url: "https://example.com", + }, + tags: [], + addDate: 1234567890, + paths: [[]], + }); + }); + + it("parses bookmarks with tags", () => { + const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1> +<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8"> +<TITLE>Bookmarks</TITLE> +<H1>Bookmarks</H1> +<DL><p> + <DT><A HREF="https://example.com" ADD_DATE="1234567890" TAGS="tag1,tag2,tag3">Example Site</A> +</DL><p>`; + + const result = parseImportFile("html", html); + + expect(result).toHaveLength(1); + expect(result[0].tags).toEqual(["tag1", "tag2", "tag3"]); + }); + + it("parses bookmarks in nested folders", () => { + const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1> +<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8"> +<TITLE>Bookmarks</TITLE> +<H1>Bookmarks</H1> +<DL><p> + <DT><H3 ADD_DATE="1234567890" LAST_MODIFIED="1234567891">Folder1</H3> + <DL><p> + <DT><H3 ADD_DATE="1234567892" LAST_MODIFIED="1234567893">Folder2</H3> + <DL><p> + <DT><A HREF="https://example.com" ADD_DATE="1234567894">Nested Bookmark</A> + </DL><p> + </DL><p> +</DL><p>`; + + const result = 
parseImportFile("html", html); + + expect(result).toHaveLength(1); + expect(result[0]).toMatchObject({ + title: "Nested Bookmark", + content: { + type: "link", + url: "https://example.com", + }, + paths: [["Folder1", "Folder2"]], + }); + }); + + it("handles empty folder names by replacing with 'Unnamed'", () => { + const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1> +<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8"> +<TITLE>Bookmarks</TITLE> +<H1>Bookmarks</H1> +<DL><p> + <DT><H3 ADD_DATE="1234567890" LAST_MODIFIED="1234567891">Named Folder</H3> + <DL><p> + <DT><H3 ADD_DATE="1234567892" LAST_MODIFIED="0"></H3> + <DL><p> + <DT><A HREF="https://example.com" ADD_DATE="1234567894">Bookmark</A> + </DL><p> + </DL><p> +</DL><p>`; + + const result = parseImportFile("html", html); + + expect(result).toHaveLength(1); + expect(result[0].paths).toEqual([["Named Folder", "Unnamed"]]); + }); + + it("parses multiple bookmarks in different folders", () => { + const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1> +<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8"> +<TITLE>Bookmarks</TITLE> +<H1>Bookmarks</H1> +<DL><p> + <DT><H3 ADD_DATE="1234567890">Tech</H3> + <DL><p> + <DT><A HREF="https://github.com" ADD_DATE="1234567891">GitHub</A> + <DT><A HREF="https://stackoverflow.com" ADD_DATE="1234567892">Stack Overflow</A> + </DL><p> + <DT><H3 ADD_DATE="1234567893">News</H3> + <DL><p> + <DT><A HREF="https://news.ycombinator.com" ADD_DATE="1234567894">Hacker News</A> + </DL><p> +</DL><p>`; + + const result = parseImportFile("html", html); + + expect(result).toHaveLength(3); + + expect(result[0]).toMatchObject({ + title: "GitHub", + content: { type: "link", url: "https://github.com" }, + paths: [["Tech"]], + }); + + expect(result[1]).toMatchObject({ + title: "Stack Overflow", + content: { type: "link", url: "https://stackoverflow.com" }, + paths: [["Tech"]], + }); + + expect(result[2]).toMatchObject({ + title: "Hacker News", + content: { type: "link", url: 
"https://news.ycombinator.com" }, + paths: [["News"]], + }); + }); + + it("parses bookmarks at root level (no folders)", () => { + const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1> +<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8"> +<TITLE>Bookmarks</TITLE> +<H1>Bookmarks</H1> +<DL><p> + <DT><A HREF="https://example1.com" ADD_DATE="1234567890">Bookmark 1</A> + <DT><A HREF="https://example2.com" ADD_DATE="1234567891">Bookmark 2</A> +</DL><p>`; + + const result = parseImportFile("html", html); + + expect(result).toHaveLength(2); + expect(result[0].paths).toEqual([[]]); + expect(result[1].paths).toEqual([[]]); + }); + + it("handles deeply nested folder structures", () => { + const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1> +<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8"> +<TITLE>Bookmarks</TITLE> +<H1>Bookmarks</H1> +<DL><p> + <DT><H3>Level1</H3> + <DL><p> + <DT><H3>Level2</H3> + <DL><p> + <DT><H3>Level3</H3> + <DL><p> + <DT><H3>Level4</H3> + <DL><p> + <DT><A HREF="https://example.com" ADD_DATE="1234567890">Deep Bookmark</A> + </DL><p> + </DL><p> + </DL><p> + </DL><p> +</DL><p>`; + + const result = parseImportFile("html", html); + + expect(result).toHaveLength(1); + expect(result[0].paths).toEqual([["Level1", "Level2", "Level3", "Level4"]]); + }); + + it("deduplicates bookmarks with the same URL", () => { + const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1> +<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8"> +<TITLE>Bookmarks</TITLE> +<H1>Bookmarks</H1> +<DL><p> + <DT><H3>Folder1</H3> + <DL><p> + <DT><A HREF="https://example.com" ADD_DATE="1234567890" TAGS="tag1">First Instance</A> + </DL><p> + <DT><H3>Folder2</H3> + <DL><p> + <DT><A HREF="https://example.com" ADD_DATE="1234567891" TAGS="tag2">Second Instance</A> + </DL><p> +</DL><p>`; + + const result = parseImportFile("html", html); + + expect(result).toHaveLength(1); + expect(result[0]).toMatchObject({ + content: { type: "link", url: "https://example.com" }, + 
tags: ["tag1", "tag2"], + addDate: 1234567890, // Should keep the earlier date + }); + expect(result[0].paths).toHaveLength(2); + expect(result[0].paths).toContainEqual(["Folder1"]); + expect(result[0].paths).toContainEqual(["Folder2"]); + }); + + it("merges notes from duplicate bookmarks", () => { + const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1> +<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8"> +<TITLE>Bookmarks</TITLE> +<H1>Bookmarks</H1> +<DL><p> + <DT><A HREF="https://example.com" ADD_DATE="1234567890">Bookmark</A> + <DD>First note + <DT><A HREF="https://example.com" ADD_DATE="1234567891">Bookmark</A> + <DD>Second note +</DL><p>`; + + // Note: The current parser doesn't extract DD notes, but this test + // documents the expected behavior if/when DD parsing is added + const result = parseImportFile("html", html); + + expect(result).toHaveLength(1); + expect(result[0].content).toMatchObject({ + type: "link", + url: "https://example.com", + }); + }); + + it("handles bookmarks without ADD_DATE attribute", () => { + const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1> +<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8"> +<TITLE>Bookmarks</TITLE> +<H1>Bookmarks</H1> +<DL><p> + <DT><A HREF="https://example.com">No Date Bookmark</A> +</DL><p>`; + + const result = parseImportFile("html", html); + + expect(result).toHaveLength(1); + expect(result[0].addDate).toBeUndefined(); + }); + + it("handles bookmarks without HREF attribute", () => { + const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1> +<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8"> +<TITLE>Bookmarks</TITLE> +<H1>Bookmarks</H1> +<DL><p> + <DT><A ADD_DATE="1234567890">No URL Bookmark</A> +</DL><p>`; + + const result = parseImportFile("html", html); + + expect(result).toHaveLength(1); + expect(result[0].content).toBeUndefined(); + }); + + it("handles mixed structure with folders and root-level bookmarks", () => { + const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1> 
+<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8"> +<TITLE>Bookmarks</TITLE> +<H1>Bookmarks</H1> +<DL><p> + <DT><A HREF="https://root1.com" ADD_DATE="1234567890">Root Bookmark 1</A> + <DT><H3>Folder</H3> + <DL><p> + <DT><A HREF="https://folder1.com" ADD_DATE="1234567891">Folder Bookmark</A> + </DL><p> + <DT><A HREF="https://root2.com" ADD_DATE="1234567892">Root Bookmark 2</A> +</DL><p>`; + + const result = parseImportFile("html", html); + + expect(result).toHaveLength(3); + expect(result[0]).toMatchObject({ + title: "Root Bookmark 1", + paths: [[]], + }); + expect(result[1]).toMatchObject({ + title: "Folder Bookmark", + paths: [["Folder"]], + }); + expect(result[2]).toMatchObject({ + title: "Root Bookmark 2", + paths: [[]], + }); + }); + + it("throws error for non-Netscape bookmark files", () => { + const html = `<html> +<head><title>Not a bookmark file</title></head> +<body>Just a regular HTML file</body> +</html>`; + + expect(() => parseImportFile("html", html)).toThrow( + "The uploaded html file does not seem to be a bookmark file", + ); + }); +}); diff --git a/packages/shared/import-export/parsers.ts b/packages/shared/import-export/parsers.ts index a56cbb98..df3d2c45 100644 --- a/packages/shared/import-export/parsers.ts +++ b/packages/shared/import-export/parsers.ts @@ -1,5 +1,6 @@ // Copied from https://gist.github.com/devster31/4e8c6548fd16ffb75c02e6f24e27f9b9 +import type { AnyNode } from "domhandler"; import * as cheerio from "cheerio"; import { parse } from "csv-parse/sync"; import { z } from "zod"; @@ -35,43 +36,58 @@ function parseNetscapeBookmarkFile(textContent: string): ParsedBookmark[] { } const $ = cheerio.load(textContent); - - return $("a") - .map(function (_index, a) { - const $a = $(a); - const addDate = $a.attr("add_date"); - let tags: string[] = []; - - const tagsStr = $a.attr("tags"); - try { - tags = tagsStr && tagsStr.length > 0 ? 
tagsStr.split(",") : []; - } catch { - /* empty */ - } - const url = $a.attr("href"); - - // Build folder path by traversing up the hierarchy - const path: string[] = []; - let current = $a.parent(); - while (current && current.length > 0) { - const h3 = current.find("> h3").first(); - if (h3.length > 0) { - const folderName = h3.text().trim(); - // Use "Unnamed" for empty folder names - path.unshift(folderName || "Unnamed"); + const bookmarks: ParsedBookmark[] = []; + + // Recursively traverse the bookmark hierarchy top-down + function traverseFolder( + element: cheerio.Cheerio<AnyNode>, + currentPath: string[], + ) { + element.children().each((_index, child) => { + const $child = $(child); + + // Check if this is a folder (DT with H3) + const h3 = $child.children("h3").first(); + if (h3.length > 0) { + const folderName = h3.text().trim() || "Unnamed"; + const newPath = [...currentPath, folderName]; + + // Find the DL that follows this folder and recurse into it + const dl = $child.children("dl").first(); + if (dl.length > 0) { + traverseFolder(dl, newPath); + } + } else { + // Check if this is a bookmark (DT with A) + const anchor = $child.children("a").first(); + if (anchor.length > 0) { + const addDate = anchor.attr("add_date"); + const tagsStr = anchor.attr("tags"); + const tags = tagsStr && tagsStr.length > 0 ? tagsStr.split(",") : []; + const url = anchor.attr("href"); + + bookmarks.push({ + title: anchor.text(), + content: url + ? { type: BookmarkTypes.LINK as const, url } + : undefined, + tags, + addDate: + typeof addDate === "undefined" ? undefined : parseInt(addDate), + paths: [currentPath], + }); } - current = current.parent(); } + }); + } - return { - title: $a.text(), - content: url ? { type: BookmarkTypes.LINK as const, url } : undefined, - tags, - addDate: typeof addDate === "undefined" ? 
undefined : parseInt(addDate), - paths: [path], - }; - }) - .get(); + // Start traversal from the root DL element + const rootDl = $("dl").first(); + if (rootDl.length > 0) { + traverseFolder(rootDl, []); + } + + return bookmarks; } function parsePocketBookmarkFile(textContent: string): ParsedBookmark[] { |
