fix: Eliminate the O(n^2) folder-path traversal in the Netscape import parser (#2338)

* fix: Eliminate the O(n^2) folder-path traversal in the Netscape import parser
* remove unneeded tests
author: Mohamed Bassem <me@mbassem.com> 2026-01-03 10:39:45 +0000
committer: GitHub <noreply@github.com> 2026-01-03 10:39:45 +0000
commit: 6fe20639702e3eb81bd262075094fb5d1f7033b9 (patch)
tree: 9f6ee471afe159158184dfbad9d61bee236307b8 /packages
parent: 1af9b9ddf69cc7215d10e2f0713123756b36077b (diff)
download: karakeep-6fe20639702e3eb81bd262075094fb5d1f7033b9.tar.zst
2 files changed, 348 insertions, 31 deletions
diff --git a/packages/shared/import-export/parsers.test.ts b/packages/shared/import-export/parsers.test.ts
new file mode 100644
index 00000000..18502305
--- /dev/null
+++ b/packages/shared/import-export/parsers.test.ts
@@ -0,0 +1,301 @@
+import { describe, expect, it } from "vitest";
+
+import { parseImportFile } from "./parsers";
+
+describe("parseNetscapeBookmarkFile", () => {
+  it("parses a simple bookmark file with single bookmark", () => {
+    const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1>
+<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
+<TITLE>Bookmarks</TITLE>
+<H1>Bookmarks</H1>
+<DL><p>
+    <DT><A HREF="https://example.com" ADD_DATE="1234567890">Example Site</A>
+</DL><p>`;
+
+    const result = parseImportFile("html", html);
+
+    expect(result).toHaveLength(1);
+    expect(result[0]).toMatchObject({
+      title: "Example Site",
+      content: {
+        type: "link",
+        url: "https://example.com",
+      },
+      tags: [],
+      addDate: 1234567890,
+      paths: [[]],
+    });
+  });
+
+  it("parses bookmarks with tags", () => {
+    const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1>
+<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
+<TITLE>Bookmarks</TITLE>
+<H1>Bookmarks</H1>
+<DL><p>
+    <DT><A HREF="https://example.com" ADD_DATE="1234567890" TAGS="tag1,tag2,tag3">Example Site</A>
+</DL><p>`;
+
+    const result = parseImportFile("html", html);
+
+    expect(result).toHaveLength(1);
+    expect(result[0].tags).toEqual(["tag1", "tag2", "tag3"]);
+  });
+
+  it("parses bookmarks in nested folders", () => {
+    const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1>
+<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
+<TITLE>Bookmarks</TITLE>
+<H1>Bookmarks</H1>
+<DL><p>
+    <DT><H3 ADD_DATE="1234567890" LAST_MODIFIED="1234567891">Folder1</H3>
+    <DL><p>
+        <DT><H3 ADD_DATE="1234567892" LAST_MODIFIED="1234567893">Folder2</H3>
+        <DL><p>
+            <DT><A HREF="https://example.com" ADD_DATE="1234567894">Nested Bookmark</A>
+        </DL><p>
+    </DL><p>
+</DL><p>`;
+
+    const result = parseImportFile("html", html);
+
+    expect(result).toHaveLength(1);
+    expect(result[0]).toMatchObject({
+      title: "Nested Bookmark",
+      content: {
+        type: "link",
+        url: "https://example.com",
+      },
+      paths: [["Folder1", "Folder2"]],
+    });
+  });
+
+  it("handles empty folder names by replacing with 'Unnamed'", () => {
+    const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1>
+<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
+<TITLE>Bookmarks</TITLE>
+<H1>Bookmarks</H1>
+<DL><p>
+    <DT><H3 ADD_DATE="1234567890" LAST_MODIFIED="1234567891">Named Folder</H3>
+    <DL><p>
+        <DT><H3 ADD_DATE="1234567892" LAST_MODIFIED="0"></H3>
+        <DL><p>
+            <DT><A HREF="https://example.com" ADD_DATE="1234567894">Bookmark</A>
+        </DL><p>
+    </DL><p>
+</DL><p>`;
+
+    const result = parseImportFile("html", html);
+
+    expect(result).toHaveLength(1);
+    expect(result[0].paths).toEqual([["Named Folder", "Unnamed"]]);
+  });
+
+  it("parses multiple bookmarks in different folders", () => {
+    const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1>
+<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
+<TITLE>Bookmarks</TITLE>
+<H1>Bookmarks</H1>
+<DL><p>
+    <DT><H3 ADD_DATE="1234567890">Tech</H3>
+    <DL><p>
+        <DT><A HREF="https://github.com" ADD_DATE="1234567891">GitHub</A>
+        <DT><A HREF="https://stackoverflow.com" ADD_DATE="1234567892">Stack Overflow</A>
+    </DL><p>
+    <DT><H3 ADD_DATE="1234567893">News</H3>
+    <DL><p>
+        <DT><A HREF="https://news.ycombinator.com" ADD_DATE="1234567894">Hacker News</A>
+    </DL><p>
+</DL><p>`;
+
+    const result = parseImportFile("html", html);
+
+    expect(result).toHaveLength(3);
+
+    expect(result[0]).toMatchObject({
+      title: "GitHub",
+      content: { type: "link", url: "https://github.com" },
+      paths: [["Tech"]],
+    });
+
+    expect(result[1]).toMatchObject({
+      title: "Stack Overflow",
+      content: { type: "link", url: "https://stackoverflow.com" },
+      paths: [["Tech"]],
+    });
+
+    expect(result[2]).toMatchObject({
+      title: "Hacker News",
+      content: { type: "link", url: "https://news.ycombinator.com" },
+      paths: [["News"]],
+    });
+  });
+
+  it("parses bookmarks at root level (no folders)", () => {
+    const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1>
+<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
+<TITLE>Bookmarks</TITLE>
+<H1>Bookmarks</H1>
+<DL><p>
+    <DT><A HREF="https://example1.com" ADD_DATE="1234567890">Bookmark 1</A>
+    <DT><A HREF="https://example2.com" ADD_DATE="1234567891">Bookmark 2</A>
+</DL><p>`;
+
+    const result = parseImportFile("html", html);
+
+    expect(result).toHaveLength(2);
+    expect(result[0].paths).toEqual([[]]);
+    expect(result[1].paths).toEqual([[]]);
+  });
+
+  it("handles deeply nested folder structures", () => {
+    const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1>
+<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
+<TITLE>Bookmarks</TITLE>
+<H1>Bookmarks</H1>
+<DL><p>
+    <DT><H3>Level1</H3>
+    <DL><p>
+        <DT><H3>Level2</H3>
+        <DL><p>
+            <DT><H3>Level3</H3>
+            <DL><p>
+                <DT><H3>Level4</H3>
+                <DL><p>
+                    <DT><A HREF="https://example.com" ADD_DATE="1234567890">Deep Bookmark</A>
+                </DL><p>
+            </DL><p>
+        </DL><p>
+    </DL><p>
+</DL><p>`;
+
+    const result = parseImportFile("html", html);
+
+    expect(result).toHaveLength(1);
+    expect(result[0].paths).toEqual([["Level1", "Level2", "Level3", "Level4"]]);
+  });
+
+  it("deduplicates bookmarks with the same URL", () => {
+    const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1>
+<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
+<TITLE>Bookmarks</TITLE>
+<H1>Bookmarks</H1>
+<DL><p>
+    <DT><H3>Folder1</H3>
+    <DL><p>
+        <DT><A HREF="https://example.com" ADD_DATE="1234567890" TAGS="tag1">First Instance</A>
+    </DL><p>
+    <DT><H3>Folder2</H3>
+    <DL><p>
+        <DT><A HREF="https://example.com" ADD_DATE="1234567891" TAGS="tag2">Second Instance</A>
+    </DL><p>
+</DL><p>`;
+
+    const result = parseImportFile("html", html);
+
+    expect(result).toHaveLength(1);
+    expect(result[0]).toMatchObject({
+      content: { type: "link", url: "https://example.com" },
+      tags: ["tag1", "tag2"],
+      addDate: 1234567890, // Should keep the earlier date
+    });
+    expect(result[0].paths).toHaveLength(2);
+    expect(result[0].paths).toContainEqual(["Folder1"]);
+    expect(result[0].paths).toContainEqual(["Folder2"]);
+  });
+
+  it("merges notes from duplicate bookmarks", () => {
+    const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1>
+<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
+<TITLE>Bookmarks</TITLE>
+<H1>Bookmarks</H1>
+<DL><p>
+    <DT><A HREF="https://example.com" ADD_DATE="1234567890">Bookmark</A>
+    <DD>First note
+    <DT><A HREF="https://example.com" ADD_DATE="1234567891">Bookmark</A>
+    <DD>Second note
+</DL><p>`;
+
+    // Note: The current parser doesn't extract <DD> notes, so this test only
+    // verifies that the two entries with the same URL collapse into one bookmark.
+    const result = parseImportFile("html", html);
+
+    expect(result).toHaveLength(1);
+    expect(result[0].content).toMatchObject({
+      type: "link",
+      url: "https://example.com",
+    });
+  });
+
+  it("handles bookmarks without ADD_DATE attribute", () => {
+    const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1>
+<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
+<TITLE>Bookmarks</TITLE>
+<H1>Bookmarks</H1>
+<DL><p>
+    <DT><A HREF="https://example.com">No Date Bookmark</A>
+</DL><p>`;
+
+    const result = parseImportFile("html", html);
+
+    expect(result).toHaveLength(1);
+    expect(result[0].addDate).toBeUndefined();
+  });
+
+  it("handles bookmarks without HREF attribute", () => {
+    const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1>
+<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
+<TITLE>Bookmarks</TITLE>
+<H1>Bookmarks</H1>
+<DL><p>
+    <DT><A ADD_DATE="1234567890">No URL Bookmark</A>
+</DL><p>`;
+
+    const result = parseImportFile("html", html);
+
+    expect(result).toHaveLength(1);
+    expect(result[0].content).toBeUndefined();
+  });
+
+  it("handles mixed structure with folders and root-level bookmarks", () => {
+    const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1>
+<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
+<TITLE>Bookmarks</TITLE>
+<H1>Bookmarks</H1>
+<DL><p>
+    <DT><A HREF="https://root1.com" ADD_DATE="1234567890">Root Bookmark 1</A>
+    <DT><H3>Folder</H3>
+    <DL><p>
+        <DT><A HREF="https://folder1.com" ADD_DATE="1234567891">Folder Bookmark</A>
+    </DL><p>
+    <DT><A HREF="https://root2.com" ADD_DATE="1234567892">Root Bookmark 2</A>
+</DL><p>`;
+
+    const result = parseImportFile("html", html);
+
+    expect(result).toHaveLength(3);
+    expect(result[0]).toMatchObject({
+      title: "Root Bookmark 1",
+      paths: [[]],
+    });
+    expect(result[1]).toMatchObject({
+      title: "Folder Bookmark",
+      paths: [["Folder"]],
+    });
+    expect(result[2]).toMatchObject({
+      title: "Root Bookmark 2",
+      paths: [[]],
+    });
+  });
+
+  it("throws error for non-Netscape bookmark files", () => {
+    const html = `<html>
+<head><title>Not a bookmark file</title></head>
+<body>Just a regular HTML file</body>
+</html>`;
+
+    expect(() => parseImportFile("html", html)).toThrow(
+      "The uploaded html file does not seem to be a bookmark file",
+    );
+  });
+});
diff --git a/packages/shared/import-export/parsers.ts b/packages/shared/import-export/parsers.ts
index a56cbb98..df3d2c45 100644
--- a/packages/shared/import-export/parsers.ts
+++ b/packages/shared/import-export/parsers.ts
@@ -1,5 +1,6 @@
 // Copied from https://gist.github.com/devster31/4e8c6548fd16ffb75c02e6f24e27f9b9
 
+import type { AnyNode } from "domhandler";
 import * as cheerio from "cheerio";
 import { parse } from "csv-parse/sync";
 import { z } from "zod";
@@ -35,43 +36,58 @@ function parseNetscapeBookmarkFile(textContent: string): ParsedBookmark[] {
   }
 
   const $ = cheerio.load(textContent);
+  const bookmarks: ParsedBookmark[] = [];
 
-  return $("a")
-    .map(function (_index, a) {
-      const $a = $(a);
-      const addDate = $a.attr("add_date");
-      let tags: string[] = [];
+  // Recursively traverse the bookmark hierarchy top-down
+  function traverseFolder(
+    element: cheerio.Cheerio<AnyNode>,
+    currentPath: string[],
+  ) {
+    element.children().each((_index, child) => {
+      const $child = $(child);
 
-      const tagsStr = $a.attr("tags");
-      try {
-        tags = tagsStr && tagsStr.length > 0 ? tagsStr.split(",") : [];
-      } catch {
-        /* empty */
-      }
-      const url = $a.attr("href");
+      // Check if this is a folder (DT with H3)
+      const h3 = $child.children("h3").first();
+      if (h3.length > 0) {
+        const folderName = h3.text().trim() || "Unnamed";
+        const newPath = [...currentPath, folderName];
+
+        // Recurse into the folder's contents: the first DL child of this element
+        const dl = $child.children("dl").first();
+        if (dl.length > 0) {
+          traverseFolder(dl, newPath);
+        }
+      } else {
+        // Check if this is a bookmark (DT with A)
+        const anchor = $child.children("a").first();
+        if (anchor.length > 0) {
+          const addDate = anchor.attr("add_date");
+          const tagsStr = anchor.attr("tags");
+          const tags = tagsStr && tagsStr.length > 0 ? tagsStr.split(",") : [];
+          const url = anchor.attr("href");
 
-      // Build folder path by traversing up the hierarchy
-      const path: string[] = [];
-      let current = $a.parent();
-      while (current && current.length > 0) {
-        const h3 = current.find("> h3").first();
-        if (h3.length > 0) {
-          const folderName = h3.text().trim();
-          // Use "Unnamed" for empty folder names
-          path.unshift(folderName || "Unnamed");
+          bookmarks.push({
+            title: anchor.text(),
+            content: url
+              ? { type: BookmarkTypes.LINK as const, url }
+              : undefined,
+            tags,
+            addDate:
+              typeof addDate === "undefined" ? undefined : parseInt(addDate),
+            paths: [currentPath],
+          });
         }
-        current = current.parent();
       }
+    });
+  }
 
-      return {
-        title: $a.text(),
-        content: url ? { type: BookmarkTypes.LINK as const, url } : undefined,
-        tags,
-        addDate: typeof addDate === "undefined" ? undefined : parseInt(addDate),
-        paths: [path],
-      };
-    })
-    .get();
+  // Start traversal from the root DL element
+  const rootDl = $("dl").first();
+  if (rootDl.length > 0) {
+    traverseFolder(rootDl, []);
+  }
+
+  return bookmarks;
 }
 
 function parsePocketBookmarkFile(textContent: string): ParsedBookmark[] {
author	Mohamed Bassem <me@mbassem.com>	2026-01-03 10:39:45 +0000
committer	GitHub <noreply@github.com>	2026-01-03 10:39:45 +0000
commit	6fe20639702e3eb81bd262075094fb5d1f7033b9 (patch)
tree	9f6ee471afe159158184dfbad9d61bee236307b8 /packages
parent	1af9b9ddf69cc7215d10e2f0713123756b36077b (diff)
download	karakeep-6fe20639702e3eb81bd262075094fb5d1f7033b9.tar.zst