about summary refs log tree commit diff stats
path: root/packages
diff options
context:
space:
mode:
author Mohamed Bassem <me@mbassem.com> 2026-01-03 10:39:45 +0000
committer GitHub <noreply@github.com> 2026-01-03 10:39:45 +0000
commit 6fe20639702e3eb81bd262075094fb5d1f7033b9 (patch)
tree 9f6ee471afe159158184dfbad9d61bee236307b8 /packages
parent 1af9b9ddf69cc7215d10e2f0713123756b36077b (diff)
download karakeep-6fe20639702e3eb81bd262075094fb5d1f7033b9.tar.zst
fix: Eliminate the O(n2) parsing of the netscape import parsing (#2338)
* fix: Eliminate the O(n2) parsing of the netscape import parsing
* remove unneeded tests
Diffstat (limited to 'packages')
-rw-r--r-- packages/shared/import-export/parsers.test.ts 301
-rw-r--r-- packages/shared/import-export/parsers.ts 84
2 files changed, 351 insertions, 34 deletions
diff --git a/packages/shared/import-export/parsers.test.ts b/packages/shared/import-export/parsers.test.ts
new file mode 100644
index 00000000..18502305
--- /dev/null
+++ b/packages/shared/import-export/parsers.test.ts
@@ -0,0 +1,301 @@
+import { describe, expect, it } from "vitest";
+
+import { parseImportFile } from "./parsers";
+
+describe("parseNetscapeBookmarkFile", () => {
+ it("parses a simple bookmark file with single bookmark", () => {
+ const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1>
+<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
+<TITLE>Bookmarks</TITLE>
+<H1>Bookmarks</H1>
+<DL><p>
+ <DT><A HREF="https://example.com" ADD_DATE="1234567890">Example Site</A>
+</DL><p>`;
+
+ const result = parseImportFile("html", html);
+
+ expect(result).toHaveLength(1);
+ expect(result[0]).toMatchObject({
+ title: "Example Site",
+ content: {
+ type: "link",
+ url: "https://example.com",
+ },
+ tags: [],
+ addDate: 1234567890,
+ paths: [[]],
+ });
+ });
+
+ it("parses bookmarks with tags", () => {
+ const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1>
+<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
+<TITLE>Bookmarks</TITLE>
+<H1>Bookmarks</H1>
+<DL><p>
+ <DT><A HREF="https://example.com" ADD_DATE="1234567890" TAGS="tag1,tag2,tag3">Example Site</A>
+</DL><p>`;
+
+ const result = parseImportFile("html", html);
+
+ expect(result).toHaveLength(1);
+ expect(result[0].tags).toEqual(["tag1", "tag2", "tag3"]);
+ });
+
+ it("parses bookmarks in nested folders", () => {
+ const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1>
+<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
+<TITLE>Bookmarks</TITLE>
+<H1>Bookmarks</H1>
+<DL><p>
+ <DT><H3 ADD_DATE="1234567890" LAST_MODIFIED="1234567891">Folder1</H3>
+ <DL><p>
+ <DT><H3 ADD_DATE="1234567892" LAST_MODIFIED="1234567893">Folder2</H3>
+ <DL><p>
+ <DT><A HREF="https://example.com" ADD_DATE="1234567894">Nested Bookmark</A>
+ </DL><p>
+ </DL><p>
+</DL><p>`;
+
+ const result = parseImportFile("html", html);
+
+ expect(result).toHaveLength(1);
+ expect(result[0]).toMatchObject({
+ title: "Nested Bookmark",
+ content: {
+ type: "link",
+ url: "https://example.com",
+ },
+ paths: [["Folder1", "Folder2"]],
+ });
+ });
+
+ it("handles empty folder names by replacing with 'Unnamed'", () => {
+ const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1>
+<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
+<TITLE>Bookmarks</TITLE>
+<H1>Bookmarks</H1>
+<DL><p>
+ <DT><H3 ADD_DATE="1234567890" LAST_MODIFIED="1234567891">Named Folder</H3>
+ <DL><p>
+ <DT><H3 ADD_DATE="1234567892" LAST_MODIFIED="0"></H3>
+ <DL><p>
+ <DT><A HREF="https://example.com" ADD_DATE="1234567894">Bookmark</A>
+ </DL><p>
+ </DL><p>
+</DL><p>`;
+
+ const result = parseImportFile("html", html);
+
+ expect(result).toHaveLength(1);
+ expect(result[0].paths).toEqual([["Named Folder", "Unnamed"]]);
+ });
+
+ it("parses multiple bookmarks in different folders", () => {
+ const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1>
+<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
+<TITLE>Bookmarks</TITLE>
+<H1>Bookmarks</H1>
+<DL><p>
+ <DT><H3 ADD_DATE="1234567890">Tech</H3>
+ <DL><p>
+ <DT><A HREF="https://github.com" ADD_DATE="1234567891">GitHub</A>
+ <DT><A HREF="https://stackoverflow.com" ADD_DATE="1234567892">Stack Overflow</A>
+ </DL><p>
+ <DT><H3 ADD_DATE="1234567893">News</H3>
+ <DL><p>
+ <DT><A HREF="https://news.ycombinator.com" ADD_DATE="1234567894">Hacker News</A>
+ </DL><p>
+</DL><p>`;
+
+ const result = parseImportFile("html", html);
+
+ expect(result).toHaveLength(3);
+
+ expect(result[0]).toMatchObject({
+ title: "GitHub",
+ content: { type: "link", url: "https://github.com" },
+ paths: [["Tech"]],
+ });
+
+ expect(result[1]).toMatchObject({
+ title: "Stack Overflow",
+ content: { type: "link", url: "https://stackoverflow.com" },
+ paths: [["Tech"]],
+ });
+
+ expect(result[2]).toMatchObject({
+ title: "Hacker News",
+ content: { type: "link", url: "https://news.ycombinator.com" },
+ paths: [["News"]],
+ });
+ });
+
+ it("parses bookmarks at root level (no folders)", () => {
+ const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1>
+<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
+<TITLE>Bookmarks</TITLE>
+<H1>Bookmarks</H1>
+<DL><p>
+ <DT><A HREF="https://example1.com" ADD_DATE="1234567890">Bookmark 1</A>
+ <DT><A HREF="https://example2.com" ADD_DATE="1234567891">Bookmark 2</A>
+</DL><p>`;
+
+ const result = parseImportFile("html", html);
+
+ expect(result).toHaveLength(2);
+ expect(result[0].paths).toEqual([[]]);
+ expect(result[1].paths).toEqual([[]]);
+ });
+
+ it("handles deeply nested folder structures", () => {
+ const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1>
+<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
+<TITLE>Bookmarks</TITLE>
+<H1>Bookmarks</H1>
+<DL><p>
+ <DT><H3>Level1</H3>
+ <DL><p>
+ <DT><H3>Level2</H3>
+ <DL><p>
+ <DT><H3>Level3</H3>
+ <DL><p>
+ <DT><H3>Level4</H3>
+ <DL><p>
+ <DT><A HREF="https://example.com" ADD_DATE="1234567890">Deep Bookmark</A>
+ </DL><p>
+ </DL><p>
+ </DL><p>
+ </DL><p>
+</DL><p>`;
+
+ const result = parseImportFile("html", html);
+
+ expect(result).toHaveLength(1);
+ expect(result[0].paths).toEqual([["Level1", "Level2", "Level3", "Level4"]]);
+ });
+
+ it("deduplicates bookmarks with the same URL", () => {
+ const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1>
+<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
+<TITLE>Bookmarks</TITLE>
+<H1>Bookmarks</H1>
+<DL><p>
+ <DT><H3>Folder1</H3>
+ <DL><p>
+ <DT><A HREF="https://example.com" ADD_DATE="1234567890" TAGS="tag1">First Instance</A>
+ </DL><p>
+ <DT><H3>Folder2</H3>
+ <DL><p>
+ <DT><A HREF="https://example.com" ADD_DATE="1234567891" TAGS="tag2">Second Instance</A>
+ </DL><p>
+</DL><p>`;
+
+ const result = parseImportFile("html", html);
+
+ expect(result).toHaveLength(1);
+ expect(result[0]).toMatchObject({
+ content: { type: "link", url: "https://example.com" },
+ tags: ["tag1", "tag2"],
+ addDate: 1234567890, // Should keep the earlier date
+ });
+ expect(result[0].paths).toHaveLength(2);
+ expect(result[0].paths).toContainEqual(["Folder1"]);
+ expect(result[0].paths).toContainEqual(["Folder2"]);
+ });
+
+ it("merges notes from duplicate bookmarks", () => {
+ const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1>
+<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
+<TITLE>Bookmarks</TITLE>
+<H1>Bookmarks</H1>
+<DL><p>
+ <DT><A HREF="https://example.com" ADD_DATE="1234567890">Bookmark</A>
+ <DD>First note
+ <DT><A HREF="https://example.com" ADD_DATE="1234567891">Bookmark</A>
+ <DD>Second note
+</DL><p>`;
+
+ // Note: The current parser doesn't extract DD notes, but this test
+ // documents the expected behavior if/when DD parsing is added
+ const result = parseImportFile("html", html);
+
+ expect(result).toHaveLength(1);
+ expect(result[0].content).toMatchObject({
+ type: "link",
+ url: "https://example.com",
+ });
+ });
+
+ it("handles bookmarks without ADD_DATE attribute", () => {
+ const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1>
+<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
+<TITLE>Bookmarks</TITLE>
+<H1>Bookmarks</H1>
+<DL><p>
+ <DT><A HREF="https://example.com">No Date Bookmark</A>
+</DL><p>`;
+
+ const result = parseImportFile("html", html);
+
+ expect(result).toHaveLength(1);
+ expect(result[0].addDate).toBeUndefined();
+ });
+
+ it("handles bookmarks without HREF attribute", () => {
+ const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1>
+<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
+<TITLE>Bookmarks</TITLE>
+<H1>Bookmarks</H1>
+<DL><p>
+ <DT><A ADD_DATE="1234567890">No URL Bookmark</A>
+</DL><p>`;
+
+ const result = parseImportFile("html", html);
+
+ expect(result).toHaveLength(1);
+ expect(result[0].content).toBeUndefined();
+ });
+
+ it("handles mixed structure with folders and root-level bookmarks", () => {
+ const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1>
+<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
+<TITLE>Bookmarks</TITLE>
+<H1>Bookmarks</H1>
+<DL><p>
+ <DT><A HREF="https://root1.com" ADD_DATE="1234567890">Root Bookmark 1</A>
+ <DT><H3>Folder</H3>
+ <DL><p>
+ <DT><A HREF="https://folder1.com" ADD_DATE="1234567891">Folder Bookmark</A>
+ </DL><p>
+ <DT><A HREF="https://root2.com" ADD_DATE="1234567892">Root Bookmark 2</A>
+</DL><p>`;
+
+ const result = parseImportFile("html", html);
+
+ expect(result).toHaveLength(3);
+ expect(result[0]).toMatchObject({
+ title: "Root Bookmark 1",
+ paths: [[]],
+ });
+ expect(result[1]).toMatchObject({
+ title: "Folder Bookmark",
+ paths: [["Folder"]],
+ });
+ expect(result[2]).toMatchObject({
+ title: "Root Bookmark 2",
+ paths: [[]],
+ });
+ });
+
+ it("throws error for non-Netscape bookmark files", () => {
+ const html = `<html>
+<head><title>Not a bookmark file</title></head>
+<body>Just a regular HTML file</body>
+</html>`;
+
+ expect(() => parseImportFile("html", html)).toThrow(
+ "The uploaded html file does not seem to be a bookmark file",
+ );
+ });
+});
diff --git a/packages/shared/import-export/parsers.ts b/packages/shared/import-export/parsers.ts
index a56cbb98..df3d2c45 100644
--- a/packages/shared/import-export/parsers.ts
+++ b/packages/shared/import-export/parsers.ts
@@ -1,5 +1,6 @@
// Copied from https://gist.github.com/devster31/4e8c6548fd16ffb75c02e6f24e27f9b9
+import type { AnyNode } from "domhandler";
import * as cheerio from "cheerio";
import { parse } from "csv-parse/sync";
import { z } from "zod";
@@ -35,43 +36,58 @@ function parseNetscapeBookmarkFile(textContent: string): ParsedBookmark[] {
}
const $ = cheerio.load(textContent);
-
- return $("a")
- .map(function (_index, a) {
- const $a = $(a);
- const addDate = $a.attr("add_date");
- let tags: string[] = [];
-
- const tagsStr = $a.attr("tags");
- try {
- tags = tagsStr && tagsStr.length > 0 ? tagsStr.split(",") : [];
- } catch {
- /* empty */
- }
- const url = $a.attr("href");
-
- // Build folder path by traversing up the hierarchy
- const path: string[] = [];
- let current = $a.parent();
- while (current && current.length > 0) {
- const h3 = current.find("> h3").first();
- if (h3.length > 0) {
- const folderName = h3.text().trim();
- // Use "Unnamed" for empty folder names
- path.unshift(folderName || "Unnamed");
+ const bookmarks: ParsedBookmark[] = [];
+
+ // Recursively traverse the bookmark hierarchy top-down
+ function traverseFolder(
+ element: cheerio.Cheerio<AnyNode>,
+ currentPath: string[],
+ ) {
+ element.children().each((_index, child) => {
+ const $child = $(child);
+
+ // Check if this is a folder (DT with H3)
+ const h3 = $child.children("h3").first();
+ if (h3.length > 0) {
+ const folderName = h3.text().trim() || "Unnamed";
+ const newPath = [...currentPath, folderName];
+
+ // Find the DL that follows this folder and recurse into it
+ const dl = $child.children("dl").first();
+ if (dl.length > 0) {
+ traverseFolder(dl, newPath);
+ }
+ } else {
+ // Check if this is a bookmark (DT with A)
+ const anchor = $child.children("a").first();
+ if (anchor.length > 0) {
+ const addDate = anchor.attr("add_date");
+ const tagsStr = anchor.attr("tags");
+ const tags = tagsStr && tagsStr.length > 0 ? tagsStr.split(",") : [];
+ const url = anchor.attr("href");
+
+ bookmarks.push({
+ title: anchor.text(),
+ content: url
+ ? { type: BookmarkTypes.LINK as const, url }
+ : undefined,
+ tags,
+ addDate:
+ typeof addDate === "undefined" ? undefined : parseInt(addDate),
+ paths: [currentPath],
+ });
}
- current = current.parent();
}
+ });
+ }
- return {
- title: $a.text(),
- content: url ? { type: BookmarkTypes.LINK as const, url } : undefined,
- tags,
- addDate: typeof addDate === "undefined" ? undefined : parseInt(addDate),
- paths: [path],
- };
- })
- .get();
+ // Start traversal from the root DL element
+ const rootDl = $("dl").first();
+ if (rootDl.length > 0) {
+ traverseFolder(rootDl, []);
+ }
+
+ return bookmarks;
}
function parsePocketBookmarkFile(textContent: string): ParsedBookmark[] {