[PM-31203] Change Phishing Url Check to use a Cursor Based Search (#18561)

* Initial changes to look at phishing indexeddb service and removal of obsolete compression code * Convert background update to rxjs format and trigger via subject. Update test cases * Added addUrls function to use instead of saveUrls so appending daily does not clear all urls * Added debug logs to phishing-indexeddb service * Added a fallback url when downloading phishing url list * Remove obsolete comments * Fix testUrl default, false scenario and test cases * Add default return on isPhishingWebAddress * Added log statement * Change hostname to href in hasUrl check * Save fallback response * Fix matching subpaths in links. Update test cases * Fix meta data updates storing last checked instead of last updated * Update QA phishing url to be normalized * Filter web addresses * Return previous meta to keep subscription alive * Change indexeddb lookup from loading all to cursor search * fix(phishing): improve performance and fix URL matching in phishing detection Problem: The cursor-based search takes ~25 seconds to scan the entire phishing database. For non-phishing URLs (99% of cases), this full scan runs to completion every time. Before these fixes, opening a new tab triggered this sequence: 1. chrome://newtab/ fires a phishing check 2. Sequential concatMap blocks while cursor scans all 500k+ URLs (~25 sec) 3. User pastes actual URL and hits enter 4. That URL's check waits in queue behind the chrome:// check 5. Total delay: ~50+ seconds for a simple "open tab, paste link" workflow Even for legitimate phishing checks, the cursor search could take up to 25 seconds per URL when the fast hasUrl lookup misses due to trailing slash mismatches. Changes: phishing-data.service.ts: - Add protocol filter to early-return for non-http(s) URLs, avoiding expensive IndexedDB operations for chrome://, about:, file:// URLs - Add trailing slash normalization for hasUrl lookup - browsers add trailing slashes but DB entries may not have them, causing O(1) lookups to miss and fall back to O(n) cursor search unnecessarily - Add debug logging for hasUrl checks and timing metrics for cursor-based search to aid performance debugging phishing-detection.service.ts: - Replace concatMap with mergeMap for parallel tab processing - each tab check now runs independently instead of sequentially - Add concurrency limit of 5 to prevent overwhelming IndexedDB while still allowing parallel execution Result: - New tabs are instant (no IndexedDB calls for non-web URLs) - One slow phishing check doesn't block other tabs - Common URL patterns hit the fast O(1) path instead of O(n) cursor scan * performance debug logs * disable custom match because too slow * spec fix --------- Co-authored-by: Alex <adewitt@bitwarden.com>
2026-02-08 12:40:26 +00:00 · 2026-01-26 17:05:42 -06:00
parent e2fa296b04
commit 60c28dd182
6 changed files with 259 additions and 51 deletions
--- a/apps/browser/src/dirt/phishing-detection/phishing-resources.ts
+++ b/apps/browser/src/dirt/phishing-detection/phishing-resources.ts
@@ -7,6 +7,8 @@ export type PhishingResource = {
  todayUrl: string;
  /** Matcher used to decide whether a given URL matches an entry from this resource */
  match: (url: URL, entry: string) => boolean;
+  /** Whether to use the custom matcher. If false, only exact hasUrl lookups are used. Default: true */
+  useCustomMatcher?: boolean;
 };

 export const PhishingResourceType = Object.freeze({
@@ -56,6 +58,8 @@ export const PHISHING_RESOURCES: Record<PhishingResourceType, PhishingResource[]
        "https://raw.githubusercontent.com/Phishing-Database/checksums/refs/heads/master/phishing-links-ACTIVE.txt.md5",
      todayUrl:
        "https://raw.githubusercontent.com/Phishing-Database/Phishing.Database/refs/heads/master/phishing-links-NEW-today.txt",
+      // Disabled for performance - cursor search takes 6+ minutes on large databases
+      useCustomMatcher: false,
      match: (url: URL, entry: string) => {
        if (!entry) {
          return false;
--- a/apps/browser/src/dirt/phishing-detection/services/phishing-data.service.spec.ts
+++ b/apps/browser/src/dirt/phishing-detection/services/phishing-data.service.spec.ts
@@ -40,6 +40,7 @@ describe("PhishingDataService", () => {
    // Set default mock behaviors
    mockIndexedDbService.hasUrl.mockResolvedValue(false);
    mockIndexedDbService.loadAllUrls.mockResolvedValue([]);
+    mockIndexedDbService.findMatchingUrl.mockResolvedValue(false);
    mockIndexedDbService.saveUrls.mockResolvedValue(undefined);
    mockIndexedDbService.addUrls.mockResolvedValue(undefined);
    mockIndexedDbService.saveUrlsFromStream.mockResolvedValue(undefined);
@@ -90,7 +91,7 @@ describe("PhishingDataService", () => {

    it("should NOT detect QA test addresses - different subpath", async () => {
      mockIndexedDbService.hasUrl.mockResolvedValue(false);
-      mockIndexedDbService.loadAllUrls.mockResolvedValue([]);
+      mockIndexedDbService.findMatchingUrl.mockResolvedValue(false);

      const url = new URL("https://phishing.testcategory.com/other");
      const result = await service.isPhishingWebAddress(url);
@@ -120,70 +121,65 @@ describe("PhishingDataService", () => {
      expect(result).toBe(true);
      expect(mockIndexedDbService.hasUrl).toHaveBeenCalledWith("http://phish.com/testing-param");
      // Should not fall back to custom matcher when hasUrl returns true
-      expect(mockIndexedDbService.loadAllUrls).not.toHaveBeenCalled();
+      expect(mockIndexedDbService.findMatchingUrl).not.toHaveBeenCalled();
    });

-    it("should fall back to custom matcher when hasUrl returns false", async () => {
+    it("should return false when hasUrl returns false (custom matcher disabled)", async () => {
      // Mock hasUrl to return false (no direct href match)
      mockIndexedDbService.hasUrl.mockResolvedValue(false);
-      // Mock loadAllUrls to return phishing URLs for custom matcher
-      mockIndexedDbService.loadAllUrls.mockResolvedValue(["http://phish.com/path"]);

      const url = new URL("http://phish.com/path");
      const result = await service.isPhishingWebAddress(url);

-      expect(result).toBe(true);
+      // Custom matcher is currently disabled (useCustomMatcher: false), so result is false
+      expect(result).toBe(false);
      expect(mockIndexedDbService.hasUrl).toHaveBeenCalledWith("http://phish.com/path");
-      expect(mockIndexedDbService.loadAllUrls).toHaveBeenCalled();
+      // Custom matcher should NOT be called since it's disabled
+      expect(mockIndexedDbService.findMatchingUrl).not.toHaveBeenCalled();
    });

    it("should not detect a safe web address", async () => {
      // Mock hasUrl to return false
      mockIndexedDbService.hasUrl.mockResolvedValue(false);
-      // Mock loadAllUrls to return phishing URLs that don't match
-      mockIndexedDbService.loadAllUrls.mockResolvedValue(["http://phish.com", "http://badguy.net"]);

      const url = new URL("http://safe.com");
      const result = await service.isPhishingWebAddress(url);

      expect(result).toBe(false);
      expect(mockIndexedDbService.hasUrl).toHaveBeenCalledWith("http://safe.com/");
-      expect(mockIndexedDbService.loadAllUrls).toHaveBeenCalled();
+      // Custom matcher is disabled, so findMatchingUrl should NOT be called
+      expect(mockIndexedDbService.findMatchingUrl).not.toHaveBeenCalled();
    });

-    it("should not match against root web address with subpaths using custom matcher", async () => {
+    it("should not match against root web address with subpaths (custom matcher disabled)", async () => {
      // Mock hasUrl to return false (no direct href match)
      mockIndexedDbService.hasUrl.mockResolvedValue(false);
-      // Mock loadAllUrls to return entry that matches with subpath
-      mockIndexedDbService.loadAllUrls.mockResolvedValue(["http://phish.com/login"]);

      const url = new URL("http://phish.com/login/page");
      const result = await service.isPhishingWebAddress(url);

      expect(result).toBe(false);
      expect(mockIndexedDbService.hasUrl).toHaveBeenCalledWith("http://phish.com/login/page");
-      expect(mockIndexedDbService.loadAllUrls).toHaveBeenCalled();
+      // Custom matcher is disabled, so findMatchingUrl should NOT be called
+      expect(mockIndexedDbService.findMatchingUrl).not.toHaveBeenCalled();
    });

-    it("should not match against root web address with different subpaths using custom matcher", async () => {
+    it("should not match against root web address with different subpaths (custom matcher disabled)", async () => {
      // Mock hasUrl to return false (no direct hostname match)
      mockIndexedDbService.hasUrl.mockResolvedValue(false);
-      // Mock loadAllUrls to return entry that matches with subpath
-      mockIndexedDbService.loadAllUrls.mockResolvedValue(["http://phish.com/login/page1"]);

      const url = new URL("http://phish.com/login/page2");
      const result = await service.isPhishingWebAddress(url);

      expect(result).toBe(false);
      expect(mockIndexedDbService.hasUrl).toHaveBeenCalledWith("http://phish.com/login/page2");
-      expect(mockIndexedDbService.loadAllUrls).toHaveBeenCalled();
+      // Custom matcher is disabled, so findMatchingUrl should NOT be called
+      expect(mockIndexedDbService.findMatchingUrl).not.toHaveBeenCalled();
    });

    it("should handle IndexedDB errors gracefully", async () => {
      // Mock hasUrl to throw error
      mockIndexedDbService.hasUrl.mockRejectedValue(new Error("hasUrl error"));
-      // Mock loadAllUrls to also throw error
-      mockIndexedDbService.loadAllUrls.mockRejectedValue(new Error("IndexedDB error"));

      const url = new URL("http://phish.com/about");
      const result = await service.isPhishingWebAddress(url);
@@ -193,10 +189,8 @@ describe("PhishingDataService", () => {
        "[PhishingDataService] IndexedDB lookup via hasUrl failed",
        expect.any(Error),
      );
-      expect(logService.error).toHaveBeenCalledWith(
-        "[PhishingDataService] Error running custom matcher",
-        expect.any(Error),
-      );
+      // Custom matcher is disabled, so no custom matcher error is expected
+      expect(mockIndexedDbService.findMatchingUrl).not.toHaveBeenCalled();
    });
  });

--- a/apps/browser/src/dirt/phishing-detection/services/phishing-data.service.ts
+++ b/apps/browser/src/dirt/phishing-detection/services/phishing-data.service.ts
@@ -153,8 +153,18 @@ export class PhishingDataService {
   * @returns True if the URL is a known phishing web address, false otherwise
   */
  async isPhishingWebAddress(url: URL): Promise<boolean> {
+    this.logService.debug("[PhishingDataService] isPhishingWebAddress called for: " + url.href);
+
+    // Skip non-http(s) protocols - phishing database only contains web URLs
+    // This prevents expensive fallback checks for chrome://, about:, file://, etc.
+    if (url.protocol !== "http:" && url.protocol !== "https:") {
+      this.logService.debug("[PhishingDataService] Skipping non-http(s) protocol: " + url.protocol);
+      return false;
+    }
+
    // Quick check for QA/dev test addresses
    if (this._testWebAddresses.includes(url.href)) {
+      this.logService.info("[PhishingDataService] Found test web address: " + url.href);
      return true;
    }

@@ -162,28 +172,73 @@ export class PhishingDataService {

    try {
      // Quick lookup: check direct presence of href in IndexedDB
-      const hasUrl = await this.indexedDbService.hasUrl(url.href);
+      // Also check without trailing slash since browsers add it but DB entries may not have it
+      const urlHref = url.href;
+      const urlWithoutTrailingSlash = urlHref.endsWith("/") ? urlHref.slice(0, -1) : null;
+
+      this.logService.debug("[PhishingDataService] Checking hasUrl on this string: " + urlHref);
+      let hasUrl = await this.indexedDbService.hasUrl(urlHref);
+
+      // If not found and URL has trailing slash, try without it
+      if (!hasUrl && urlWithoutTrailingSlash) {
+        this.logService.debug(
+          "[PhishingDataService] Checking hasUrl without trailing slash: " +
+            urlWithoutTrailingSlash,
+        );
+        hasUrl = await this.indexedDbService.hasUrl(urlWithoutTrailingSlash);
+      }
+
      if (hasUrl) {
+        this.logService.info(
+          "[PhishingDataService] Found phishing web address through direct lookup: " + urlHref,
+        );
        return true;
      }
    } catch (err) {
      this.logService.error("[PhishingDataService] IndexedDB lookup via hasUrl failed", err);
    }

-    // If a custom matcher is provided, iterate stored entries and apply the matcher.
-    if (resource && resource.match) {
+    // If a custom matcher is provided and enabled, use cursor-based search.
+    // This avoids loading all URLs into memory and allows early exit on first match.
+    // Can be disabled via useCustomMatcher: false for performance reasons.
+    if (resource && resource.match && resource.useCustomMatcher !== false) {
      try {
-        const entries = await this.indexedDbService.loadAllUrls();
-        for (const entry of entries) {
-          if (resource.match(url, entry)) {
-            return true;
-          }
+        this.logService.debug(
+          "[PhishingDataService] Starting cursor-based search for: " + url.href,
+        );
+        const startTime = performance.now();
+
+        const found = await this.indexedDbService.findMatchingUrl((entry) =>
+          resource.match(url, entry),
+        );
+
+        const endTime = performance.now();
+        const duration = (endTime - startTime).toFixed(2);
+        this.logService.debug(
+          `[PhishingDataService] Cursor-based search completed in ${duration}ms for: ${url.href} (found: ${found})`,
+        );
+
+        if (found) {
+          this.logService.info(
+            "[PhishingDataService] Found phishing web address through custom matcher: " + url.href,
+          );
+        } else {
+          this.logService.debug(
+            "[PhishingDataService] No match found, returning false for: " + url.href,
+          );
        }
+        return found;
      } catch (err) {
        this.logService.error("[PhishingDataService] Error running custom matcher", err);
+        this.logService.debug(
+          "[PhishingDataService] Returning false due to error for: " + url.href,
+        );
+        return false;
      }
-      return false;
    }
+    this.logService.debug(
+      "[PhishingDataService] No custom matcher, returning false for: " + url.href,
+    );
    return false;
  }

--- a/apps/browser/src/dirt/phishing-detection/services/phishing-detection.service.ts
+++ b/apps/browser/src/dirt/phishing-detection/services/phishing-detection.service.ts
@@ -1,10 +1,10 @@
 import {
-  concatMap,
  distinctUntilChanged,
  EMPTY,
  filter,
  map,
  merge,
+  mergeMap,
  Subject,
  switchMap,
  tap,
@@ -43,6 +43,7 @@ export class PhishingDetectionService {
  private static _tabUpdated$ = new Subject<PhishingDetectionNavigationEvent>();
  private static _ignoredHostnames = new Set<string>();
  private static _didInit = false;
+  private static _activeSearchCount = 0;

  static initialize(
    logService: LogService,
@@ -63,7 +64,7 @@ export class PhishingDetectionService {
      tap((message) =>
        logService.debug(`[PhishingDetectionService] user selected continue for ${message.url}`),
      ),
-      concatMap(async (message) => {
+      mergeMap(async (message) => {
        const url = new URL(message.url);
        this._ignoredHostnames.add(url.hostname);
        await BrowserApi.navigateTabToUrl(message.tabId, url);
@@ -88,23 +89,40 @@ export class PhishingDetectionService {
          prev.ignored === curr.ignored,
      ),
      tap((event) => logService.debug(`[PhishingDetectionService] processing event:`, event)),
-      concatMap(async ({ tabId, url, ignored }) => {
-        if (ignored) {
-          // The next time this host is visited, block again
-          this._ignoredHostnames.delete(url.hostname);
-          return;
-        }
-        const isPhishing = await phishingDataService.isPhishingWebAddress(url);
-        if (!isPhishing) {
-          return;
-        }
-
-        const phishingWarningPage = new URL(
-          BrowserApi.getRuntimeURL("popup/index.html#/security/phishing-warning") +
-            `?phishingUrl=${url.toString()}`,
+      // Use mergeMap for parallel processing - each tab check runs independently
+      // Concurrency limit of 5 prevents overwhelming IndexedDB
+      mergeMap(async ({ tabId, url, ignored }) => {
+        this._activeSearchCount++;
+        const searchId = `${tabId}-${Date.now()}`;
+        logService.debug(
+          `[PhishingDetectionService] Search STARTED [${searchId}] for ${url.href} (active: ${this._activeSearchCount}/5)`,
        );
-        await BrowserApi.navigateTabToUrl(tabId, phishingWarningPage);
-      }),
+        const startTime = performance.now();
+
+        try {
+          if (ignored) {
+            // The next time this host is visited, block again
+            this._ignoredHostnames.delete(url.hostname);
+            return;
+          }
+          const isPhishing = await phishingDataService.isPhishingWebAddress(url);
+          if (!isPhishing) {
+            return;
+          }
+
+          const phishingWarningPage = new URL(
+            BrowserApi.getRuntimeURL("popup/index.html#/security/phishing-warning") +
+              `?phishingUrl=${url.toString()}`,
+          );
+          await BrowserApi.navigateTabToUrl(tabId, phishingWarningPage);
+        } finally {
+          this._activeSearchCount--;
+          const duration = (performance.now() - startTime).toFixed(2);
+          logService.debug(
+            `[PhishingDetectionService] Search FINISHED [${searchId}] for ${url.href} in ${duration}ms (active: ${this._activeSearchCount}/5)`,
+          );
+        }
+      }, 5),
    );

    const onCancelCommand$ = messageListener
--- a/apps/browser/src/dirt/phishing-detection/services/phishing-indexeddb.service.spec.ts
+++ b/apps/browser/src/dirt/phishing-detection/services/phishing-indexeddb.service.spec.ts
@@ -435,6 +435,89 @@ describe("PhishingIndexedDbService", () => {
    });
  });

+  describe("findMatchingUrl", () => {
+    it("returns true when matcher finds a match", async () => {
+      mockStore.set("https://example.com", { url: "https://example.com" });
+      mockStore.set("https://phishing.net", { url: "https://phishing.net" });
+      mockStore.set("https://test.org", { url: "https://test.org" });
+
+      const matcher = (url: string) => url.includes("phishing");
+      const result = await service.findMatchingUrl(matcher);
+
+      expect(result).toBe(true);
+      expect(mockDb.transaction).toHaveBeenCalledWith("phishing-urls", "readonly");
+      expect(mockObjectStore.openCursor).toHaveBeenCalled();
+    });
+
+    it("returns false when no URLs match", async () => {
+      mockStore.set("https://example.com", { url: "https://example.com" });
+      mockStore.set("https://test.org", { url: "https://test.org" });
+
+      const matcher = (url: string) => url.includes("notfound");
+      const result = await service.findMatchingUrl(matcher);
+
+      expect(result).toBe(false);
+    });
+
+    it("returns false when store is empty", async () => {
+      const matcher = (url: string) => url.includes("anything");
+      const result = await service.findMatchingUrl(matcher);
+
+      expect(result).toBe(false);
+    });
+
+    it("exits early on first match without iterating all records", async () => {
+      mockStore.set("https://match1.com", { url: "https://match1.com" });
+      mockStore.set("https://match2.com", { url: "https://match2.com" });
+      mockStore.set("https://match3.com", { url: "https://match3.com" });
+
+      const matcherCallCount = jest
+        .fn()
+        .mockImplementation((url: string) => url.includes("match2"));
+      await service.findMatchingUrl(matcherCallCount);
+
+      // Matcher should be called for match1.com and match2.com, but NOT match3.com
+      // because it exits early on first match
+      expect(matcherCallCount).toHaveBeenCalledWith("https://match1.com");
+      expect(matcherCallCount).toHaveBeenCalledWith("https://match2.com");
+      expect(matcherCallCount).not.toHaveBeenCalledWith("https://match3.com");
+      expect(matcherCallCount).toHaveBeenCalledTimes(2);
+    });
+
+    it("supports complex matcher logic", async () => {
+      mockStore.set("https://example.com/path", { url: "https://example.com/path" });
+      mockStore.set("https://test.org", { url: "https://test.org" });
+      mockStore.set("https://phishing.net/login", { url: "https://phishing.net/login" });
+
+      const matcher = (url: string) => {
+        return url.includes("phishing") && url.includes("login");
+      };
+      const result = await service.findMatchingUrl(matcher);
+
+      expect(result).toBe(true);
+    });
+
+    it("returns false on error", async () => {
+      const error = new Error("IndexedDB error");
+      mockOpenRequest.error = error;
+      (global.indexedDB.open as jest.Mock).mockImplementation(() => {
+        setTimeout(() => {
+          mockOpenRequest.onerror?.();
+        }, 0);
+        return mockOpenRequest;
+      });
+
+      const matcher = (url: string) => url.includes("test");
+      const result = await service.findMatchingUrl(matcher);
+
+      expect(result).toBe(false);
+      expect(logService.error).toHaveBeenCalledWith(
+        "[PhishingIndexedDbService] Cursor search failed",
+        expect.any(Error),
+      );
+    });
+  });
+
  describe("database initialization", () => {
    it("creates object store with keyPath on upgrade", async () => {
      mockDb.objectStoreNames.contains.mockReturnValue(false);
--- a/apps/browser/src/dirt/phishing-detection/services/phishing-indexeddb.service.ts
+++ b/apps/browser/src/dirt/phishing-detection/services/phishing-indexeddb.service.ts
@@ -195,6 +195,60 @@ export class PhishingIndexedDbService {
    });
  }

+  /**
+   * Checks if any URL in the database matches the given matcher function.
+   * Uses a cursor to iterate through records without loading all into memory.
+   * Returns immediately on first match for optimal performance.
+   *
+   * @param matcher - Function that tests each URL and returns true if it matches
+   * @returns `true` if any URL matches, `false` if none match or on error
+   */
+  async findMatchingUrl(matcher: (url: string) => boolean): Promise<boolean> {
+    this.logService.debug("[PhishingIndexedDbService] Searching for matching URL with cursor...");
+
+    let db: IDBDatabase | null = null;
+    try {
+      db = await this.openDatabase();
+      return await this.cursorSearch(db, matcher);
+    } catch (error) {
+      this.logService.error("[PhishingIndexedDbService] Cursor search failed", error);
+      return false;
+    } finally {
+      db?.close();
+    }
+  }
+
+  /**
+   * Performs cursor-based search through all URLs.
+   * Tests each URL with the matcher without accumulating records in memory.
+   */
+  private cursorSearch(db: IDBDatabase, matcher: (url: string) => boolean): Promise<boolean> {
+    return new Promise((resolve, reject) => {
+      const req = db
+        .transaction(this.STORE_NAME, "readonly")
+        .objectStore(this.STORE_NAME)
+        .openCursor();
+      req.onerror = () => reject(req.error);
+      req.onsuccess = (e) => {
+        const cursor = (e.target as IDBRequest<IDBCursorWithValue | null>).result;
+        if (cursor) {
+          const url = (cursor.value as PhishingUrlRecord).url;
+          // Test the URL immediately without accumulating in memory
+          if (matcher(url)) {
+            // Found a match
+            resolve(true);
+            return;
+          }
+          // No match, continue to next record
+          cursor.continue();
+        } else {
+          // Reached end of records without finding a match
+          resolve(false);
+        }
+      };
+    });
+  }
+
  /**
   * Saves phishing URLs directly from a stream.
   * Processes data incrementally to minimize memory usage.