diff options
Diffstat (limited to 'packages/shared/import-export')
| -rw-r--r-- | packages/shared/import-export/importer.test.ts | 454 | ||||
| -rw-r--r-- | packages/shared/import-export/importer.ts | 123 | ||||
| -rw-r--r-- | packages/shared/import-export/parsers.test.ts | 301 | ||||
| -rw-r--r-- | packages/shared/import-export/parsers.ts | 190 |
4 files changed, 795 insertions, 273 deletions
diff --git a/packages/shared/import-export/importer.test.ts b/packages/shared/import-export/importer.test.ts index 48cd1204..f097f8d5 100644 --- a/packages/shared/import-export/importer.test.ts +++ b/packages/shared/import-export/importer.test.ts @@ -1,13 +1,14 @@ import { describe, expect, it, vi } from "vitest"; -import { importBookmarksFromFile, ParsedBookmark } from "."; +import type { StagedBookmark } from "."; +import { importBookmarksFromFile } from "."; const fakeFile = { text: vi.fn().mockResolvedValue("fake file content"), } as unknown as File; describe("importBookmarksFromFile", () => { - it("creates root list, folders and imports bookmarks with progress", async () => { + it("creates root list, folders and stages bookmarks with progress", async () => { const parsers = { pocket: vi.fn().mockReturnValue([ { @@ -61,32 +62,23 @@ describe("importBookmarksFromFile", () => { }, ); - const createdBookmarks: ParsedBookmark[] = []; - const addedToLists: { bookmarkId: string; listIds: string[] }[] = []; - const updatedTags: { bookmarkId: string; tags: string[] }[] = []; - - const createBookmark = vi.fn(async (bookmark: ParsedBookmark) => { - createdBookmarks.push(bookmark); - return { - id: `bookmark-${createdBookmarks.length}`, - alreadyExists: false, - }; - }); - - const addBookmarkToLists = vi.fn( - async (input: { bookmarkId: string; listIds: string[] }) => { - addedToLists.push(input); + const stagedBookmarks: StagedBookmark[] = []; + const stageImportedBookmarks = vi.fn( + async (input: { + importSessionId: string; + bookmarks: StagedBookmark[]; + }) => { + stagedBookmarks.push(...input.bookmarks); }, ); - const updateBookmarkTags = vi.fn( - async (input: { bookmarkId: string; tags: string[] }) => { - updatedTags.push(input); - }, + const finalizeImportStaging = vi.fn(); + const createImportSession = vi.fn( + async (_input: { name: string; rootListId: string }) => ({ + id: "session-1", + }), ); - const createImportSession = vi.fn(async () => ({ id: "session-1" })); - const progress: number[] = []; const res = await importBookmarksFromFile( { @@ -95,9 +87,8 @@ describe("importBookmarksFromFile", () => { rootListName: "Imported", deps: { createList, - createBookmark, - addBookmarkToLists, - updateBookmarkTags, + stageImportedBookmarks, + finalizeImportStaging, createImportSession, }, onProgress: (d, t) => progress.push(d / t), @@ -106,12 +97,14 @@ describe("importBookmarksFromFile", () => { ); expect(res.rootListId).toBe("Imported"); + expect(res.importSessionId).toBe("session-1"); expect(res.counts).toEqual({ - successes: 5, + successes: 0, failures: 0, alreadyExisted: 0, total: 5, // Using custom parser, no deduplication }); + // Root + all unique folders from paths expect(createdLists).toEqual([ { name: "Imported", icon: "ā¬ļø" }, @@ -122,38 +115,43 @@ describe("importBookmarksFromFile", () => { { name: "Tech", parentId: "Imported/Reading", icon: "š" }, { name: "Duplicates", parentId: "Imported/Development", icon: "š" }, ]); - // Verify we have 5 created bookmarks (no deduplication with custom parser) - expect(createdBookmarks).toHaveLength(5); - // Verify GitHub bookmark exists (will be two separate bookmarks since no deduplication) - const githubBookmarks = createdBookmarks.filter( - (bookmark) => - bookmark.content?.type === "link" && - bookmark.content.url === "https://github.com/example/repo", - ); - expect(githubBookmarks).toHaveLength(2); - // Verify text bookmark exists - const textBookmark = createdBookmarks.find( - (bookmark) => bookmark.content?.type === "text", + + // Verify 5 bookmarks were staged (in 1 batch since < 50) + expect(stagedBookmarks).toHaveLength(5); + expect(stageImportedBookmarks).toHaveBeenCalledTimes(1); + + // Verify GitHub link bookmark was staged correctly + const githubBookmark = stagedBookmarks.find( + (b) => b.url === "https://github.com/example/repo" && b.type === "link", ); + expect(githubBookmark).toBeDefined(); + if (!githubBookmark) { + throw new Error("Expected GitHub bookmark to be staged"); + } + expect(githubBookmark.title).toBe("GitHub Repository"); + expect(githubBookmark.tags).toEqual(["dev", "github"]); + expect(githubBookmark.listIds).toEqual(["Imported/Development/Projects"]); + + // Verify text bookmark was staged correctly + const textBookmark = stagedBookmarks.find((b) => b.type === "text"); expect(textBookmark).toBeDefined(); - expect(textBookmark!.archived).toBe(true); - expect(textBookmark!.notes).toBe("Additional context"); - // Verify bookmark with no path goes to root - const noCategoryBookmark = createdBookmarks.find( - (bookmark) => - bookmark.content?.type === "link" && - bookmark.content.url === "https://example.com/misc", + if (!textBookmark) { + throw new Error("Expected text bookmark to be staged"); + } + expect(textBookmark.content).toBe("Important notes about the project"); + expect(textBookmark.note).toBe("Additional context"); + expect(textBookmark.listIds).toEqual(["Imported/Personal"]); + + // Verify bookmark with empty paths gets root list ID + const noCategoryBookmark = stagedBookmarks.find( + (b) => b.url === "https://example.com/misc", ); expect(noCategoryBookmark).toBeDefined(); - // Find the corresponding list assignment for this bookmark - const noCategoryBookmarkId = `bookmark-${createdBookmarks.indexOf(noCategoryBookmark!) + 1}`; - const listAssignment = addedToLists.find( - (a) => a.bookmarkId === noCategoryBookmarkId, - ); - expect(listAssignment!.listIds).toEqual(["Imported"]); + expect(noCategoryBookmark!.listIds).toEqual(["Imported"]); + + // Verify finalizeImportStaging was called + expect(finalizeImportStaging).toHaveBeenCalledWith("session-1"); - // Verify that tags were updated for bookmarks that have tags - expect(updatedTags.length).toBeGreaterThan(0); expect(progress).toContain(0); expect(progress.at(-1)).toBe(1); }); @@ -167,9 +165,8 @@ describe("importBookmarksFromFile", () => { rootListName: "Imported", deps: { createList: vi.fn(), - createBookmark: vi.fn(), - addBookmarkToLists: vi.fn(), - updateBookmarkTags: vi.fn(), + stageImportedBookmarks: vi.fn(), + finalizeImportStaging: vi.fn(), createImportSession: vi.fn(async () => ({ id: "session-1" })), }, }, @@ -182,29 +179,29 @@ describe("importBookmarksFromFile", () => { }); }); - it("continues import when individual bookmarks fail", async () => { + it("stages all bookmarks successfully", async () => { const parsers = { pocket: vi.fn().mockReturnValue([ { - title: "Success Bookmark 1", - content: { type: "link", url: "https://example.com/success1" }, - tags: ["success"], + title: "Bookmark 1", + content: { type: "link", url: "https://example.com/1" }, + tags: ["tag1"], addDate: 100, - paths: [["Success"]], + paths: [["Category1"]], }, { - title: "Failure Bookmark", - content: { type: "link", url: "https://example.com/failure" }, - tags: ["failure"], + title: "Bookmark 2", + content: { type: "link", url: "https://example.com/2" }, + tags: ["tag2"], addDate: 200, - paths: [["Failure"]], + paths: [["Category2"]], }, { - title: "Success Bookmark 2", - content: { type: "link", url: "https://example.com/success2" }, - tags: ["success"], + title: "Bookmark 3", + content: { type: "link", url: "https://example.com/3" }, + tags: ["tag3"], addDate: 300, - paths: [["Success"]], + paths: [["Category1"]], }, ]), }; @@ -220,37 +217,23 @@ describe("importBookmarksFromFile", () => { }, ); - const createdBookmarks: ParsedBookmark[] = []; - const addedToLists: { bookmarkId: string; listIds: string[] }[] = []; - const updatedTags: { bookmarkId: string; tags: string[] }[] = []; - - const createBookmark = vi.fn(async (bookmark: ParsedBookmark) => { - // Simulate failure for the "Failure Bookmark" - if (bookmark.title === "Failure Bookmark") { - throw new Error("Simulated bookmark creation failure"); - } - - createdBookmarks.push(bookmark); - return { - id: `bookmark-${createdBookmarks.length}`, - alreadyExists: false, - }; - }); - - const addBookmarkToLists = vi.fn( - async (input: { bookmarkId: string; listIds: string[] }) => { - addedToLists.push(input); + const stagedBookmarks: StagedBookmark[] = []; + const stageImportedBookmarks = vi.fn( + async (input: { + importSessionId: string; + bookmarks: StagedBookmark[]; + }) => { + stagedBookmarks.push(...input.bookmarks); }, ); - const updateBookmarkTags = vi.fn( - async (input: { bookmarkId: string; tags: string[] }) => { - updatedTags.push(input); - }, + const finalizeImportStaging = vi.fn(); + const createImportSession = vi.fn( + async (_input: { name: string; rootListId: string }) => ({ + id: "session-1", + }), ); - const createImportSession = vi.fn(async () => ({ id: "session-1" })); - const progress: number[] = []; const res = await importBookmarksFromFile( { @@ -259,9 +242,8 @@ describe("importBookmarksFromFile", () => { rootListName: "Imported", deps: { createList, - createBookmark, - addBookmarkToLists, - updateBookmarkTags, + stageImportedBookmarks, + finalizeImportStaging, createImportSession, }, onProgress: (d, t) => progress.push(d / t), @@ -269,63 +251,57 @@ describe("importBookmarksFromFile", () => { { parsers }, ); - // Should still create the root list expect(res.rootListId).toBe("Imported"); - - // Should track both successes and failures + expect(res.importSessionId).toBe("session-1"); expect(res.counts).toEqual({ - successes: 2, // Two successful bookmarks - failures: 1, // One failed bookmark + successes: 0, + failures: 0, alreadyExisted: 0, total: 3, }); - // Should create folders for all bookmarks (including failed ones) + // Should create folders for all bookmarks expect(createdLists).toEqual([ { name: "Imported", icon: "ā¬ļø" }, - { name: "Success", parentId: "Imported", icon: "š" }, - { name: "Failure", parentId: "Imported", icon: "š" }, + { name: "Category1", parentId: "Imported", icon: "š" }, + { name: "Category2", parentId: "Imported", icon: "š" }, ]); - // Only successful bookmarks should be created - expect(createdBookmarks).toHaveLength(2); - expect(createdBookmarks.map((b) => b.title)).toEqual([ - "Success Bookmark 1", - "Success Bookmark 2", - ]); + // All bookmarks should be staged (in 1 batch since < 50) + expect(stagedBookmarks).toHaveLength(3); + expect(stageImportedBookmarks).toHaveBeenCalledTimes(1); - // Only successful bookmarks should be added to lists and have tags updated - expect(addedToLists).toHaveLength(2); - expect(updatedTags).toHaveLength(2); + // Verify finalizeImportStaging was called + expect(finalizeImportStaging).toHaveBeenCalledWith("session-1"); - // Progress should complete even with failures + // Progress should complete expect(progress).toContain(0); expect(progress.at(-1)).toBe(1); }); - it("handles failures in different stages of bookmark import", async () => { + it("stages bookmarks with different paths", async () => { const parsers = { pocket: vi.fn().mockReturnValue([ { - title: "Success Bookmark", - content: { type: "link", url: "https://example.com/success" }, - tags: ["success"], + title: "Bookmark 1", + content: { type: "link", url: "https://example.com/1" }, + tags: ["tag1"], addDate: 100, - paths: [["Success"]], + paths: [["Path1"]], }, { - title: "Fail at List Assignment", - content: { type: "link", url: "https://example.com/fail-list" }, - tags: ["fail"], + title: "Bookmark 2", + content: { type: "link", url: "https://example.com/2" }, + tags: ["tag2"], addDate: 200, - paths: [["Failure"]], + paths: [["Path2"]], }, { - title: "Fail at Tag Update", - content: { type: "link", url: "https://example.com/fail-tag" }, - tags: ["fail-tag"], + title: "Bookmark 3", + content: { type: "link", url: "https://example.com/3" }, + tags: ["tag3"], addDate: 300, - paths: [["Failure"]], + paths: [["Path2"]], }, ]), }; @@ -338,31 +314,23 @@ describe("importBookmarksFromFile", () => { }, ); - let bookmarkIdCounter = 1; - const createBookmark = vi.fn(async () => { - return { id: `bookmark-${bookmarkIdCounter++}`, alreadyExists: false }; - }); - - const addBookmarkToLists = vi.fn( - async (input: { bookmarkId: string; listIds: string[] }) => { - // Simulate failure for specific bookmark - if (input.bookmarkId === "bookmark-2") { - throw new Error("Failed to add bookmark to lists"); - } + const stagedBookmarks: StagedBookmark[] = []; + const stageImportedBookmarks = vi.fn( + async (input: { + importSessionId: string; + bookmarks: StagedBookmark[]; + }) => { + stagedBookmarks.push(...input.bookmarks); }, ); - const updateBookmarkTags = vi.fn( - async (input: { bookmarkId: string; tags: string[] }) => { - // Simulate failure for specific bookmark - if (input.bookmarkId === "bookmark-3") { - throw new Error("Failed to update bookmark tags"); - } - }, + const finalizeImportStaging = vi.fn(); + const createImportSession = vi.fn( + async (_input: { name: string; rootListId: string }) => ({ + id: "session-1", + }), ); - const createImportSession = vi.fn(async () => ({ id: "session-1" })); - const progress: number[] = []; const res = await importBookmarksFromFile( { @@ -371,9 +339,8 @@ describe("importBookmarksFromFile", () => { rootListName: "Imported", deps: { createList, - createBookmark, - addBookmarkToLists, - updateBookmarkTags, + stageImportedBookmarks, + finalizeImportStaging, createImportSession, }, onProgress: (d, t) => progress.push(d / t), @@ -383,23 +350,110 @@ describe("importBookmarksFromFile", () => { expect(res.rootListId).toBe("Imported"); expect(res.importSessionId).toBe("session-1"); - - // All bookmarks are created successfully, but 2 fail in post-processing expect(res.counts).toEqual({ - successes: 1, // Only one fully successful bookmark - failures: 2, // Two failed in post-processing steps + successes: 0, + failures: 0, alreadyExisted: 0, total: 3, }); - // All bookmarks should be created (failures happen after bookmark creation) - expect(createBookmark).toHaveBeenCalledTimes(3); + // All bookmarks should be staged (in 1 batch since < 50) + expect(stagedBookmarks).toHaveLength(3); + expect(stageImportedBookmarks).toHaveBeenCalledTimes(1); + + // Verify finalizeImportStaging was called + expect(finalizeImportStaging).toHaveBeenCalledWith("session-1"); + }); + + it("handles HTML bookmarks with empty folder names", async () => { + const htmlContent = `<!DOCTYPE NETSCAPE-Bookmark-file-1> +<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8"> +<TITLE>Bookmarks</TITLE> +<H1>Bookmarks</H1> +<DL><p> + <DT><H3 ADD_DATE="1765995928" LAST_MODIFIED="1765995928">Bluetooth Fernbedienung</H3> + <DL><p> + <DT><H3 ADD_DATE="1765995928" LAST_MODIFIED="0"></H3> + <DL><p> + <DT><A HREF="https://www.example.com/product.html" ADD_DATE="1593444456">Example Product</A> + </DL><p> + </DL><p> +</DL><p>`; + + const mockFile = { + text: vi.fn().mockResolvedValue(htmlContent), + } as unknown as File; + + const createdLists: { name: string; icon: string; parentId?: string }[] = + []; + const createList = vi.fn( + async (input: { name: string; icon: string; parentId?: string }) => { + createdLists.push(input); + return { + id: `${input.parentId ? input.parentId + "/" : ""}${input.name}`, + }; + }, + ); + + const stagedBookmarks: StagedBookmark[] = []; + const stageImportedBookmarks = vi.fn( + async (input: { + importSessionId: string; + bookmarks: StagedBookmark[]; + }) => { + stagedBookmarks.push(...input.bookmarks); + }, + ); + + const finalizeImportStaging = vi.fn(); + const createImportSession = vi.fn( + async (_input: { name: string; rootListId: string }) => ({ + id: "session-1", + }), + ); + + const res = await importBookmarksFromFile({ + file: mockFile, + source: "html", + rootListName: "HTML Import", + deps: { + createList, + stageImportedBookmarks, + finalizeImportStaging, + createImportSession, + }, + }); + + expect(res.counts).toEqual({ + successes: 0, + failures: 0, + alreadyExisted: 0, + total: 1, + }); + + // Verify that the empty folder name was replaced with "Unnamed" + expect(createdLists).toEqual([ + { name: "HTML Import", icon: "ā¬ļø" }, + { name: "Bluetooth Fernbedienung", parentId: "HTML Import", icon: "š" }, + { + name: "Unnamed", + parentId: "HTML Import/Bluetooth Fernbedienung", + icon: "š", + }, + ]); - // addBookmarkToLists should be called 3 times (but one fails) - expect(addBookmarkToLists).toHaveBeenCalledTimes(3); + // Verify the bookmark was staged with correct listIds + expect(stagedBookmarks).toHaveLength(1); + expect(stagedBookmarks[0]).toMatchObject({ + title: "Example Product", + url: "https://www.example.com/product.html", + type: "link", + tags: [], + listIds: ["HTML Import/Bluetooth Fernbedienung/Unnamed"], + }); - // updateBookmarkTags should be called 2 times (once fails at list assignment, one fails at tag update) - expect(updateBookmarkTags).toHaveBeenCalledTimes(2); + // Verify finalizeImportStaging was called + expect(finalizeImportStaging).toHaveBeenCalledWith("session-1"); }); it("parses mymind CSV export correctly", async () => { @@ -413,14 +467,22 @@ describe("importBookmarksFromFile", () => { text: vi.fn().mockResolvedValue(mymindCsv), } as unknown as File; - const createdBookmarks: ParsedBookmark[] = []; - const createBookmark = vi.fn(async (bookmark: ParsedBookmark) => { - createdBookmarks.push(bookmark); - return { - id: `bookmark-${createdBookmarks.length}`, - alreadyExists: false, - }; - }); + const stagedBookmarks: StagedBookmark[] = []; + const stageImportedBookmarks = vi.fn( + async (input: { + importSessionId: string; + bookmarks: StagedBookmark[]; + }) => { + stagedBookmarks.push(...input.bookmarks); + }, + ); + + const finalizeImportStaging = vi.fn(); + const createImportSession = vi.fn( + async (_input: { name: string; rootListId: string }) => ({ + id: "session-1", + }), + ); const res = await importBookmarksFromFile({ file: mockFile, @@ -432,52 +494,54 @@ describe("importBookmarksFromFile", () => { id: `${input.parentId ? input.parentId + "/" : ""}${input.name}`, }), ), - createBookmark, - addBookmarkToLists: vi.fn(), - updateBookmarkTags: vi.fn(), - createImportSession: vi.fn(async () => ({ id: "session-1" })), + stageImportedBookmarks, + finalizeImportStaging, + createImportSession, }, }); expect(res.counts).toEqual({ - successes: 3, + successes: 0, failures: 0, alreadyExisted: 0, total: 3, }); - // Verify first bookmark (WebPage with URL) - expect(createdBookmarks[0]).toMatchObject({ + // Verify 3 bookmarks were staged + expect(stagedBookmarks).toHaveLength(3); + + // Verify first bookmark (WebPage with URL) - mymind has no paths, so root list + expect(stagedBookmarks[0]).toMatchObject({ title: "mymind", - content: { - type: "link", - url: "https://access.mymind.com/everything", - }, + url: "https://access.mymind.com/everything", + type: "link", tags: ["Wellness", "Self-Improvement", "Psychology"], + listIds: ["mymind Import"], }); - expect(createdBookmarks[0].addDate).toBeCloseTo( - new Date("2024-12-04T23:02:10Z").getTime() / 1000, + expect(stagedBookmarks[0].sourceAddedAt).toEqual( + new Date("2024-12-04T23:02:10Z"), ); // Verify second bookmark (WebPage with note) - expect(createdBookmarks[1]).toMatchObject({ + expect(stagedBookmarks[1]).toMatchObject({ title: "Movies / TV / Anime", - content: { - type: "link", - url: "https://fmhy.pages.dev/videopiracyguide", - }, + url: "https://fmhy.pages.dev/videopiracyguide", + type: "link", tags: ["Tools", "media", "Entertainment"], - notes: "Free Media!", + note: "Free Media!", + listIds: ["mymind Import"], }); // Verify third bookmark (Note with text content) - expect(createdBookmarks[2]).toMatchObject({ + expect(stagedBookmarks[2]).toMatchObject({ title: "", - content: { - type: "text", - text: "⢠Critical Thinking\n⢠Empathy", - }, + content: "⢠Critical Thinking\n⢠Empathy", + type: "text", tags: [], + listIds: ["mymind Import"], }); + + // Verify finalizeImportStaging was called + expect(finalizeImportStaging).toHaveBeenCalledWith("session-1"); }); }); diff --git a/packages/shared/import-export/importer.ts b/packages/shared/import-export/importer.ts index b32c49c1..be24ca73 100644 --- a/packages/shared/import-export/importer.ts +++ b/packages/shared/import-export/importer.ts @@ -1,4 +1,3 @@ -import { limitConcurrency } from "../concurrency"; import { MAX_LIST_NAME_LENGTH } from "../types/lists"; import { ImportSource, ParsedBookmark, parseImportFile } from "./parsers"; @@ -9,28 +8,32 @@ export interface ImportCounts { total: number; } +export interface StagedBookmark { + type: "link" | "text" | "asset"; + url?: string; + title?: string; + content?: string; + note?: string; + tags: string[]; + listIds: string[]; + sourceAddedAt?: Date; +} + export interface ImportDeps { createList: (input: { name: string; icon: string; parentId?: string; }) => Promise<{ id: string }>; - createBookmark: ( - bookmark: ParsedBookmark, - sessionId: string, - ) => Promise<{ id: string; alreadyExists?: boolean }>; - addBookmarkToLists: (input: { - bookmarkId: string; - listIds: string[]; - }) => Promise<void>; - updateBookmarkTags: (input: { - bookmarkId: string; - tags: string[]; + stageImportedBookmarks: (input: { + importSessionId: string; + bookmarks: StagedBookmark[]; }) => Promise<void>; createImportSession: (input: { name: string; rootListId: string; }) => Promise<{ id: string }>; + finalizeImportStaging: (sessionId: string) => Promise<void>; } export interface ImportOptions { @@ -62,7 +65,7 @@ export async function importBookmarksFromFile( }, options: ImportOptions = {}, ): Promise<ImportResult> { - const { concurrencyLimit = 20, parsers } = options; + const { parsers } = options; const textContent = await file.text(); const parsedBookmarks = parsers?.[source] @@ -120,50 +123,74 @@ export async function importBookmarksFromFile( pathMap[pathKey] = folderList.id; } - let done = 0; - const importPromises = parsedBookmarks.map((bookmark) => async () => { - try { - const listIds = bookmark.paths.map( - (path) => pathMap[path.join(PATH_DELIMITER)] || rootList.id, - ); - if (listIds.length === 0) listIds.push(rootList.id); + // Prepare all bookmarks for staging + const bookmarksToStage: StagedBookmark[] = parsedBookmarks.map((bookmark) => { + // Convert paths to list IDs using pathMap + // If no paths, assign to root list + const listIds = + bookmark.paths.length === 0 + ? [rootList.id] + : bookmark.paths + .map((path) => { + if (path.length === 0) { + return rootList.id; + } + const pathKey = path.join(PATH_DELIMITER); + return pathMap[pathKey] || rootList.id; + }) + .filter((id, index, arr) => arr.indexOf(id) === index); // dedupe - const created = await deps.createBookmark(bookmark, session.id); - await deps.addBookmarkToLists({ bookmarkId: created.id, listIds }); - if (bookmark.tags && bookmark.tags.length > 0) { - await deps.updateBookmarkTags({ - bookmarkId: created.id, - tags: bookmark.tags, - }); - } + // Determine type and extract content appropriately + let type: "link" | "text" | "asset" = "link"; + let url: string | undefined; + let textContent: string | undefined; - return created; - } finally { - done += 1; - onProgress?.(done, parsedBookmarks.length); + if (bookmark.content) { + if (bookmark.content.type === "link") { + type = "link"; + url = bookmark.content.url; + } else if (bookmark.content.type === "text") { + type = "text"; + textContent = bookmark.content.text; + } } - }); - const resultsPromises = limitConcurrency(importPromises, concurrencyLimit); - const results = await Promise.allSettled(resultsPromises); + return { + type, + url, + title: bookmark.title, + content: textContent, + note: bookmark.notes, + tags: bookmark.tags ?? [], + listIds, + sourceAddedAt: bookmark.addDate + ? new Date(bookmark.addDate * 1000) + : undefined, + }; + }); - let successes = 0; - let failures = 0; - let alreadyExisted = 0; + // Stage bookmarks in batches of 50 + const BATCH_SIZE = 50; + let staged = 0; - for (const r of results) { - if (r.status === "fulfilled") { - if (r.value.alreadyExists) alreadyExisted++; - else successes++; - } else { - failures++; - } + for (let i = 0; i < bookmarksToStage.length; i += BATCH_SIZE) { + const batch = bookmarksToStage.slice(i, i + BATCH_SIZE); + await deps.stageImportedBookmarks({ + importSessionId: session.id, + bookmarks: batch, + }); + staged += batch.length; + onProgress?.(staged, parsedBookmarks.length); } + + // Finalize staging - marks session as "pending" for worker pickup + await deps.finalizeImportStaging(session.id); + return { counts: { - successes, - failures, - alreadyExisted, + successes: 0, + failures: 0, + alreadyExisted: 0, total: parsedBookmarks.length, }, rootListId: rootList.id, diff --git a/packages/shared/import-export/parsers.test.ts b/packages/shared/import-export/parsers.test.ts new file mode 100644 index 00000000..18502305 --- /dev/null +++ b/packages/shared/import-export/parsers.test.ts @@ -0,0 +1,301 @@ +import { describe, expect, it } from "vitest"; + +import { parseImportFile } from "./parsers"; + +describe("parseNetscapeBookmarkFile", () => { + it("parses a simple bookmark file with single bookmark", () => { + const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1> +<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8"> +<TITLE>Bookmarks</TITLE> +<H1>Bookmarks</H1> +<DL><p> + <DT><A HREF="https://example.com" ADD_DATE="1234567890">Example Site</A> +</DL><p>`; + + const result = parseImportFile("html", html); + + expect(result).toHaveLength(1); + expect(result[0]).toMatchObject({ + title: "Example Site", + content: { + type: "link", + url: "https://example.com", + }, + tags: [], + addDate: 1234567890, + paths: [[]], + }); + }); + + it("parses bookmarks with tags", () => { + const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1> +<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8"> +<TITLE>Bookmarks</TITLE> +<H1>Bookmarks</H1> +<DL><p> + <DT><A HREF="https://example.com" ADD_DATE="1234567890" TAGS="tag1,tag2,tag3">Example Site</A> +</DL><p>`; + + const result = parseImportFile("html", html); + + expect(result).toHaveLength(1); + expect(result[0].tags).toEqual(["tag1", "tag2", "tag3"]); + }); + + it("parses bookmarks in nested folders", () => { + const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1> +<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8"> +<TITLE>Bookmarks</TITLE> +<H1>Bookmarks</H1> +<DL><p> + <DT><H3 ADD_DATE="1234567890" LAST_MODIFIED="1234567891">Folder1</H3> + <DL><p> + <DT><H3 ADD_DATE="1234567892" LAST_MODIFIED="1234567893">Folder2</H3> + <DL><p> + <DT><A HREF="https://example.com" ADD_DATE="1234567894">Nested Bookmark</A> + </DL><p> + </DL><p> +</DL><p>`; + + const result = parseImportFile("html", html); + + expect(result).toHaveLength(1); + expect(result[0]).toMatchObject({ + title: "Nested Bookmark", + content: { + type: "link", + url: "https://example.com", + }, + paths: [["Folder1", "Folder2"]], + }); + }); + + it("handles empty folder names by replacing with 'Unnamed'", () => { + const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1> +<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8"> +<TITLE>Bookmarks</TITLE> +<H1>Bookmarks</H1> +<DL><p> + <DT><H3 ADD_DATE="1234567890" LAST_MODIFIED="1234567891">Named Folder</H3> + <DL><p> + <DT><H3 ADD_DATE="1234567892" LAST_MODIFIED="0"></H3> + <DL><p> + <DT><A HREF="https://example.com" ADD_DATE="1234567894">Bookmark</A> + </DL><p> + </DL><p> +</DL><p>`; + + const result = parseImportFile("html", html); + + expect(result).toHaveLength(1); + expect(result[0].paths).toEqual([["Named Folder", "Unnamed"]]); + }); + + it("parses multiple bookmarks in different folders", () => { + const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1> +<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8"> +<TITLE>Bookmarks</TITLE> +<H1>Bookmarks</H1> +<DL><p> + <DT><H3 ADD_DATE="1234567890">Tech</H3> + <DL><p> + <DT><A HREF="https://github.com" ADD_DATE="1234567891">GitHub</A> + <DT><A HREF="https://stackoverflow.com" ADD_DATE="1234567892">Stack Overflow</A> + </DL><p> + <DT><H3 ADD_DATE="1234567893">News</H3> + <DL><p> + <DT><A HREF="https://news.ycombinator.com" ADD_DATE="1234567894">Hacker News</A> + </DL><p> +</DL><p>`; + + const result = parseImportFile("html", html); + + expect(result).toHaveLength(3); + + expect(result[0]).toMatchObject({ + title: "GitHub", + content: { type: "link", url: "https://github.com" }, + paths: [["Tech"]], + }); + + expect(result[1]).toMatchObject({ + title: "Stack Overflow", + content: { type: "link", url: "https://stackoverflow.com" }, + paths: [["Tech"]], + }); + + expect(result[2]).toMatchObject({ + title: "Hacker News", + content: { type: "link", url: "https://news.ycombinator.com" }, + paths: [["News"]], + }); + }); + + it("parses bookmarks at root level (no folders)", () => { + const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1> +<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8"> +<TITLE>Bookmarks</TITLE> +<H1>Bookmarks</H1> +<DL><p> + <DT><A HREF="https://example1.com" ADD_DATE="1234567890">Bookmark 1</A> + <DT><A HREF="https://example2.com" ADD_DATE="1234567891">Bookmark 2</A> +</DL><p>`; + + const result = parseImportFile("html", html); + + expect(result).toHaveLength(2); + expect(result[0].paths).toEqual([[]]); + expect(result[1].paths).toEqual([[]]); + }); + + it("handles deeply nested folder structures", () => { + const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1> +<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8"> +<TITLE>Bookmarks</TITLE> +<H1>Bookmarks</H1> +<DL><p> + <DT><H3>Level1</H3> + <DL><p> + <DT><H3>Level2</H3> + <DL><p> + <DT><H3>Level3</H3> + <DL><p> + <DT><H3>Level4</H3> + <DL><p> + <DT><A HREF="https://example.com" ADD_DATE="1234567890">Deep Bookmark</A> + </DL><p> + </DL><p> + </DL><p> + </DL><p> +</DL><p>`; + + const result = parseImportFile("html", html); + + expect(result).toHaveLength(1); + expect(result[0].paths).toEqual([["Level1", "Level2", "Level3", "Level4"]]); + }); + + it("deduplicates bookmarks with the same URL", () => { + const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1> +<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8"> +<TITLE>Bookmarks</TITLE> +<H1>Bookmarks</H1> +<DL><p> + <DT><H3>Folder1</H3> + <DL><p> + <DT><A HREF="https://example.com" ADD_DATE="1234567890" TAGS="tag1">First Instance</A> + </DL><p> + <DT><H3>Folder2</H3> + <DL><p> + <DT><A HREF="https://example.com" ADD_DATE="1234567891" TAGS="tag2">Second Instance</A> + </DL><p> +</DL><p>`; + + const result = parseImportFile("html", html); + + expect(result).toHaveLength(1); + expect(result[0]).toMatchObject({ + content: { type: "link", url: "https://example.com" }, + tags: ["tag1", "tag2"], + addDate: 1234567890, // Should keep the earlier date + }); + expect(result[0].paths).toHaveLength(2); + expect(result[0].paths).toContainEqual(["Folder1"]); + expect(result[0].paths).toContainEqual(["Folder2"]); + }); + + it("merges notes from duplicate bookmarks", () => { + const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1> +<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8"> +<TITLE>Bookmarks</TITLE> +<H1>Bookmarks</H1> +<DL><p> + <DT><A HREF="https://example.com" ADD_DATE="1234567890">Bookmark</A> + <DD>First note + <DT><A HREF="https://example.com" ADD_DATE="1234567891">Bookmark</A> + <DD>Second note +</DL><p>`; + + // Note: The current parser doesn't extract DD notes, but this test + // documents the expected behavior if/when DD parsing is added + const result = parseImportFile("html", html); + + expect(result).toHaveLength(1); + expect(result[0].content).toMatchObject({ + type: "link", + url: "https://example.com", + }); + }); + + it("handles bookmarks without ADD_DATE attribute", () => { + const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1> +<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8"> +<TITLE>Bookmarks</TITLE> +<H1>Bookmarks</H1> +<DL><p> + <DT><A HREF="https://example.com">No Date Bookmark</A> +</DL><p>`; + + const result = parseImportFile("html", html); + + expect(result).toHaveLength(1); + expect(result[0].addDate).toBeUndefined(); + }); + + it("handles bookmarks without HREF attribute", () => { + const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1> +<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8"> +<TITLE>Bookmarks</TITLE> +<H1>Bookmarks</H1> +<DL><p> + <DT><A ADD_DATE="1234567890">No URL Bookmark</A> +</DL><p>`; + + const result = parseImportFile("html", html); + + expect(result).toHaveLength(1); + expect(result[0].content).toBeUndefined(); + }); + + it("handles mixed structure with folders and root-level bookmarks", () => { + const html = `<!DOCTYPE NETSCAPE-Bookmark-file-1> +<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8"> +<TITLE>Bookmarks</TITLE> +<H1>Bookmarks</H1> +<DL><p> + <DT><A HREF="https://root1.com" ADD_DATE="1234567890">Root Bookmark 1</A> + <DT><H3>Folder</H3> + <DL><p> + <DT><A HREF="https://folder1.com" ADD_DATE="1234567891">Folder Bookmark</A> + </DL><p> + <DT><A HREF="https://root2.com" ADD_DATE="1234567892">Root Bookmark 2</A> +</DL><p>`; + + const result = parseImportFile("html", html); + + expect(result).toHaveLength(3); + expect(result[0]).toMatchObject({ + title: "Root Bookmark 1", + paths: [[]], + }); + expect(result[1]).toMatchObject({ + title: "Folder Bookmark", + paths: [["Folder"]], + }); + expect(result[2]).toMatchObject({ + title: "Root Bookmark 2", + paths: [[]], + }); + }); + + it("throws error for non-Netscape bookmark files", () => { + const html = `<html> +<head><title>Not a bookmark file</title></head> +<body>Just a regular HTML file</body> +</html>`; + + expect(() => parseImportFile("html", html)).toThrow( + "The uploaded html file does not seem to be a bookmark file", + ); + }); +}); diff --git a/packages/shared/import-export/parsers.ts b/packages/shared/import-export/parsers.ts index f4d3f862..24d85c80 100644 --- a/packages/shared/import-export/parsers.ts +++ b/packages/shared/import-export/parsers.ts @@ -1,5 +1,6 @@ // Copied from https://gist.github.com/devster31/4e8c6548fd16ffb75c02e6f24e27f9b9 +import type { AnyNode } from "domhandler"; import * as cheerio from "cheerio"; import { parse } from "csv-parse/sync"; import { z } from "zod"; @@ -10,11 +11,13 @@ import { zExportSchema } from "./exporters"; export type ImportSource = | "html" | "pocket" + | "matter" | "omnivore" | "karakeep" | "linkwarden" | "tab-session-manager" - | "mymind"; + | "mymind" + | "instapaper"; export interface ParsedBookmark { title: string; @@ -34,41 +37,58 @@ function parseNetscapeBookmarkFile(textContent: string): ParsedBookmark[] { } const $ = cheerio.load(textContent); + const bookmarks: ParsedBookmark[] = []; - return $("a") - .map(function (_index, a) { - const $a = $(a); - const addDate = $a.attr("add_date"); - let tags: string[] = []; + // Recursively traverse the bookmark hierarchy top-down + function traverseFolder( + element: cheerio.Cheerio<AnyNode>, + currentPath: string[], + ) { + element.children().each((_index, child) => { + const $child = $(child); - const tagsStr = $a.attr("tags"); - try { - tags = tagsStr && tagsStr.length > 0 ? tagsStr.split(",") : []; - } catch { - /* empty */ - } - const url = $a.attr("href"); + // Check if this is a folder (DT with H3) + const h3 = $child.children("h3").first(); + if (h3.length > 0) { + const folderName = h3.text().trim() || "Unnamed"; + const newPath = [...currentPath, folderName]; + + // Find the DL that follows this folder and recurse into it + const dl = $child.children("dl").first(); + if (dl.length > 0) { + traverseFolder(dl, newPath); + } + } else { + // Check if this is a bookmark (DT with A) + const anchor = $child.children("a").first(); + if (anchor.length > 0) { + const addDate = anchor.attr("add_date"); + const tagsStr = anchor.attr("tags"); + const tags = tagsStr && tagsStr.length > 0 ? tagsStr.split(",") : []; + const url = anchor.attr("href"); - // Build folder path by traversing up the hierarchy - const path: string[] = []; - let current = $a.parent(); - while (current && current.length > 0) { - const h3 = current.find("> h3").first(); - if (h3.length > 0) { - path.unshift(h3.text()); + bookmarks.push({ + title: anchor.text(), + content: url + ? { type: BookmarkTypes.LINK as const, url } + : undefined, + tags, + addDate: + typeof addDate === "undefined" ? undefined : parseInt(addDate), + paths: [currentPath], + }); } - current = current.parent(); } + }); + } - return { - title: $a.text(), - content: url ? { type: BookmarkTypes.LINK as const, url } : undefined, - tags, - addDate: typeof addDate === "undefined" ? undefined : parseInt(addDate), - paths: [path], - }; - }) - .get(); + // Start traversal from the root DL element + const rootDl = $("dl").first(); + if (rootDl.length > 0) { + traverseFolder(rootDl, []); + } + + return bookmarks; } function parsePocketBookmarkFile(textContent: string): ParsedBookmark[] { @@ -95,6 +115,52 @@ function parsePocketBookmarkFile(textContent: string): ParsedBookmark[] { }); } +function parseMatterBookmarkFile(textContent: string): ParsedBookmark[] { + const zMatterRecordSchema = z.object({ + Title: z.string(), + Author: z.string(), + Publisher: z.string(), + URL: z.string(), + Tags: z + .string() + .transform((tags) => (tags.length > 0 ? tags.split(";") : [])), + "Word Count": z.string(), + "In Queue": z.string().transform((inQueue) => inQueue === "False"), + Favorited: z.string(), + Read: z.string(), + Highlight_Count: z.string(), + "Last Interaction Date": z + .string() + .transform((date) => Date.parse(date) / 1000), + "File Id": z.string(), + }); + + const zMatterExportSchema = z.array(zMatterRecordSchema); + + const records = parse(textContent, { + columns: true, + skip_empty_lines: true, + }); + + const parsed = zMatterExportSchema.safeParse(records); + if (!parsed.success) { + throw new Error( + `The uploaded CSV file contains an invalid Matter bookmark file: ${parsed.error.toString()}`, + ); + } + + return parsed.data.map((record) => { + return { + title: record.Title, + content: { type: BookmarkTypes.LINK as const, url: record.URL }, + tags: record.Tags, + addDate: record["Last Interaction Date"], + archived: record["In Queue"], + paths: [], // TODO + }; + }); +} + function parseKarakeepBookmarkFile(textContent: string): ParsedBookmark[] { const parsed = zExportSchema.safeParse(JSON.parse(textContent)); if (!parsed.success) { @@ -292,6 +358,64 @@ function parseMymindBookmarkFile(textContent: string): ParsedBookmark[] { }); } +function parseInstapaperBookmarkFile(textContent: string): ParsedBookmark[] { + const zInstapaperRecordScheme = z.object({ + URL: z.string(), + Title: z.string(), + Selection: z.string(), + Folder: z.string(), + Timestamp: z.string(), + Tags: z.string(), + }); + + const zInstapaperExportScheme = z.array(zInstapaperRecordScheme); + + const record = parse(textContent, { + columns: true, + skip_empty_lines: true, + }); + + const parsed = zInstapaperExportScheme.safeParse(record); + + if (!parsed.success) { + throw new Error( + `CSV file contains an invalid instapaper bookmark file: ${parsed.error.toString()}`, + ); + } + + return parsed.data.map((record) => { + let content: ParsedBookmark["content"]; + if (record.URL && record.URL.trim().length > 0) { + content = { type: BookmarkTypes.LINK as const, url: record.URL.trim() }; + } else if (record.Selection && record.Selection.trim().length > 0) { + content = { + type: BookmarkTypes.TEXT as const, + text: record.Selection.trim(), + }; + } + + const addDate = parseInt(record.Timestamp); + + let tags: string[] = []; + try { + const parsedTags = JSON.parse(record.Tags); + if (Array.isArray(parsedTags)) { + tags = parsedTags.map((tag) => tag.toString().trim()); + } + } catch { + tags = []; + } + + return { + title: record.Title || "", + content, + addDate, + tags, + paths: [], // TODO + }; + }); +} + function deduplicateBookmarks(bookmarks: ParsedBookmark[]): ParsedBookmark[] { const deduplicatedBookmarksMap = new Map<string, ParsedBookmark>(); const textBookmarks: ParsedBookmark[] = []; @@ -345,6 +469,9 @@ export function parseImportFile( case "pocket": result = parsePocketBookmarkFile(textContent); break; + case "matter": + result = parseMatterBookmarkFile(textContent); + break; case "karakeep": result = parseKarakeepBookmarkFile(textContent); break; @@ -360,6 +487,9 @@ export function parseImportFile( case "mymind": result = parseMymindBookmarkFile(textContent); break; + case "instapaper": + result = parseInstapaperBookmarkFile(textContent); + break; } return deduplicateBookmarks(result); } |
