From ad4b01f315e2b53cf73d509a12cb137a86929d32 Mon Sep 17 00:00:00 2001
From: Leslie Tilton <23057410+Banrion@users.noreply.github.com>
Date: Tue, 30 Dec 2025 11:06:30 -0600
Subject: [PATCH] [PM-28548] Phishing Blocker support links (#18070)

* Change domain terminology to web addresses

* Added phishing resource file

* Finish renaming and adding runtime configuration for domains vs links setting

* Update reference

* Add matching functions per resource

* correct URL matching logic for links-based detection

Problem:
The phishing link matcher was failing to detect known phishing URLs due to
two issues:

1. Protocol mismatch: Entries in the phishing list use `http://` but users
   typically visit `https://` versions. The matcher was comparing full URLs
   including protocol, causing legitimate matches to fail.
   - List entry: `http://smartdapptradxx.pages.dev`
   - User visits: `https://smartdapptradxx.pages.dev/`
   - Result: No match (incorrect)

2. Hostname-only matching would have caused false positives: An earlier
   attempt to fix #1 included hostname-only comparison, which defeats the
   purpose of links-based detection. The goal of PM-28548 is precise URL
   matching to avoid blocking entire domains (like pages.dev, github.io)
   when only specific paths are malicious.

Solution:
- Always strip protocol (http:// or https://) from both entry and URL
  before comparison, treating them as equivalent
- Remove hostname-only matching to maintain precision
- Keep prefix matching for subpaths, query strings, and fragments

---------

Co-authored-by: Alex <adewitt@bitwarden.com>
(cherry picked from commit 800a21d8a3b65369528610ac79517e7a5792e48a)
---
 .../phishing-detection/phishing-resources.ts  |  98 ++++++++++++++++
 .../services/phishing-data.service.spec.ts    |  62 +++++-----
 .../services/phishing-data.service.ts         | 107 +++++++++---------
 .../services/phishing-detection.service.ts    |   2 +-
 4 files changed, 186 insertions(+), 83 deletions(-)
 create mode 100644 apps/browser/src/dirt/phishing-detection/phishing-resources.ts

diff --git a/apps/browser/src/dirt/phishing-detection/phishing-resources.ts b/apps/browser/src/dirt/phishing-detection/phishing-resources.ts
new file mode 100644
index 00000000000..262d6cf833b
--- /dev/null
+++ b/apps/browser/src/dirt/phishing-detection/phishing-resources.ts
@@ -0,0 +1,98 @@
+export type PhishingResource = {
+  name?: string;
+  remoteUrl: string;
+  checksumUrl: string;
+  todayUrl: string;
+  /** Matcher used to decide whether a given URL matches an entry from this resource */
+  match: (url: URL, entry: string) => boolean;
+};
+
+export const PhishingResourceType = Object.freeze({
+  Domains: "domains",
+  Links: "links",
+} as const);
+
+export type PhishingResourceType = (typeof PhishingResourceType)[keyof typeof PhishingResourceType];
+
+export const PHISHING_RESOURCES: Record<PhishingResourceType, PhishingResource[]> = {
+  [PhishingResourceType.Domains]: [
+    {
+      name: "Phishing.Database Domains",
+      remoteUrl:
+        "https://raw.githubusercontent.com/Phishing-Database/Phishing.Database/master/phishing-domains-ACTIVE.txt",
+      checksumUrl:
+        "https://raw.githubusercontent.com/Phishing-Database/checksums/refs/heads/master/phishing-domains-ACTIVE.txt.md5",
+      todayUrl:
+        "https://raw.githubusercontent.com/Phishing-Database/Phishing.Database/refs/heads/master/phishing-domains-NEW-today.txt",
+      match: (url: URL, entry: string) => {
+        if (!entry) {
+          return false;
+        }
+        const candidate = entry.trim().toLowerCase().replace(/\/$/, "");
+        // If entry contains a scheme, strip it for comparison
+        const e = candidate.replace(/^https?:\/\//, "");
+        // Compare against hostname or host+path
+        if (e === url.hostname.toLowerCase()) {
+          return true;
+        }
+        const urlNoProto = url.href
+          .toLowerCase()
+          .replace(/https?:\/\//, "")
+          .replace(/\/$/, "");
+        return urlNoProto === e || urlNoProto.startsWith(e + "/");
+      },
+    },
+  ],
+  [PhishingResourceType.Links]: [
+    {
+      name: "Phishing.Database Links",
+      remoteUrl:
+        "https://raw.githubusercontent.com/Phishing-Database/Phishing.Database/master/phishing-links-ACTIVE.txt",
+      checksumUrl:
+        "https://raw.githubusercontent.com/Phishing-Database/checksums/refs/heads/master/phishing-links-ACTIVE.txt.md5",
+      todayUrl:
+        "https://raw.githubusercontent.com/Phishing-Database/Phishing.Database/refs/heads/master/phishing-links-NEW-today.txt",
+      match: (url: URL, entry: string) => {
+        if (!entry) {
+          return false;
+        }
+        // Basic HTML entity decode for common cases (the lists sometimes contain &amp;)
+        const decodeHtml = (s: string) => s.replace(/&amp;/g, "&");
+
+        const normalizedEntry = decodeHtml(entry.trim()).toLowerCase().replace(/\/$/, "");
+
+        // Normalize URL for comparison - always strip protocol for consistent matching
+        const normalizedUrl = decodeHtml(url.href).toLowerCase().replace(/\/$/, "");
+        const urlNoProto = normalizedUrl.replace(/^https?:\/\//, "");
+
+        // Strip protocol from entry if present (http:// and https:// should be treated as equivalent)
+        const entryNoProto = normalizedEntry.replace(/^https?:\/\//, "");
+
+        // Compare full path (without protocol) - exact match
+        if (urlNoProto === entryNoProto) {
+          return true;
+        }
+
+        // Check if URL starts with entry (prefix match for subpaths/query/hash)
+        // e.g., entry "site.com/phish" matches "site.com/phish/subpage" or "site.com/phish?id=1"
+        if (
+          urlNoProto.startsWith(entryNoProto + "/") ||
+          urlNoProto.startsWith(entryNoProto + "?") ||
+          urlNoProto.startsWith(entryNoProto + "#")
+        ) {
+          return true;
+        }
+
+        return false;
+      },
+    },
+  ],
+};
+
+export function getPhishingResources(
+  type: PhishingResourceType,
+  index = 0,
+): PhishingResource | undefined {
+  const list = PHISHING_RESOURCES[type] ?? [];
+  return list[index];
+}
diff --git a/apps/browser/src/dirt/phishing-detection/services/phishing-data.service.spec.ts b/apps/browser/src/dirt/phishing-detection/services/phishing-data.service.spec.ts
index 94f3e99f8be..30aa947092d 100644
--- a/apps/browser/src/dirt/phishing-detection/services/phishing-data.service.spec.ts
+++ b/apps/browser/src/dirt/phishing-detection/services/phishing-data.service.spec.ts
@@ -25,7 +25,7 @@ describe("PhishingDataService", () => {
   };
 
   let fetchChecksumSpy: jest.SpyInstance;
-  let fetchDomainsSpy: jest.SpyInstance;
+  let fetchWebAddressesSpy: jest.SpyInstance;
 
   beforeEach(() => {
     jest.useFakeTimers();
@@ -45,113 +45,113 @@ describe("PhishingDataService", () => {
       platformUtilsService,
     );
 
-    fetchChecksumSpy = jest.spyOn(service as any, "fetchPhishingDomainsChecksum");
-    fetchDomainsSpy = jest.spyOn(service as any, "fetchPhishingDomains");
+    fetchChecksumSpy = jest.spyOn(service as any, "fetchPhishingChecksum");
+    fetchWebAddressesSpy = jest.spyOn(service as any, "fetchPhishingWebAddresses");
   });
 
-  describe("isPhishingDomains", () => {
-    it("should detect a phishing domain", async () => {
+  describe("isPhishingWebAddress", () => {
+    it("should detect a phishing web address", async () => {
       setMockState({
-        domains: ["phish.com", "badguy.net"],
+        webAddresses: ["phish.com", "badguy.net"],
         timestamp: Date.now(),
         checksum: "abc123",
         applicationVersion: "1.0.0",
       });
       const url = new URL("http://phish.com");
-      const result = await service.isPhishingDomain(url);
+      const result = await service.isPhishingWebAddress(url);
       expect(result).toBe(true);
     });
 
-    it("should not detect a safe domain", async () => {
+    it("should not detect a safe web address", async () => {
       setMockState({
-        domains: ["phish.com", "badguy.net"],
+        webAddresses: ["phish.com", "badguy.net"],
         timestamp: Date.now(),
         checksum: "abc123",
         applicationVersion: "1.0.0",
       });
       const url = new URL("http://safe.com");
-      const result = await service.isPhishingDomain(url);
+      const result = await service.isPhishingWebAddress(url);
       expect(result).toBe(false);
     });
 
-    it("should match against root domain", async () => {
+    it("should match against root web address", async () => {
       setMockState({
-        domains: ["phish.com", "badguy.net"],
+        webAddresses: ["phish.com", "badguy.net"],
         timestamp: Date.now(),
         checksum: "abc123",
         applicationVersion: "1.0.0",
       });
       const url = new URL("http://phish.com/about");
-      const result = await service.isPhishingDomain(url);
+      const result = await service.isPhishingWebAddress(url);
       expect(result).toBe(true);
     });
 
     it("should not error on empty state", async () => {
       setMockState(undefined as any);
       const url = new URL("http://phish.com/about");
-      const result = await service.isPhishingDomain(url);
+      const result = await service.isPhishingWebAddress(url);
       expect(result).toBe(false);
     });
   });
 
-  describe("getNextDomains", () => {
-    it("refetches all domains if applicationVersion has changed", async () => {
+  describe("getNextWebAddresses", () => {
+    it("refetches all web addresses if applicationVersion has changed", async () => {
       const prev: PhishingData = {
-        domains: ["a.com"],
+        webAddresses: ["a.com"],
         timestamp: Date.now() - 60000,
         checksum: "old",
         applicationVersion: "1.0.0",
       };
       fetchChecksumSpy.mockResolvedValue("new");
-      fetchDomainsSpy.mockResolvedValue(["d.com", "e.com"]);
+      fetchWebAddressesSpy.mockResolvedValue(["d.com", "e.com"]);
       platformUtilsService.getApplicationVersion.mockResolvedValue("2.0.0");
 
-      const result = await service.getNextDomains(prev);
+      const result = await service.getNextWebAddresses(prev);
 
-      expect(result!.domains).toEqual(["d.com", "e.com"]);
+      expect(result!.webAddresses).toEqual(["d.com", "e.com"]);
       expect(result!.checksum).toBe("new");
       expect(result!.applicationVersion).toBe("2.0.0");
     });
 
     it("only updates timestamp if checksum matches", async () => {
       const prev: PhishingData = {
-        domains: ["a.com"],
+        webAddresses: ["a.com"],
         timestamp: Date.now() - 60000,
         checksum: "abc",
         applicationVersion: "1.0.0",
       };
       fetchChecksumSpy.mockResolvedValue("abc");
-      const result = await service.getNextDomains(prev);
-      expect(result!.domains).toEqual(prev.domains);
+      const result = await service.getNextWebAddresses(prev);
+      expect(result!.webAddresses).toEqual(prev.webAddresses);
       expect(result!.checksum).toBe("abc");
       expect(result!.timestamp).not.toBe(prev.timestamp);
     });
 
     it("patches daily domains if cache is fresh", async () => {
       const prev: PhishingData = {
-        domains: ["a.com"],
+        webAddresses: ["a.com"],
         timestamp: Date.now() - 60000,
         checksum: "old",
         applicationVersion: "1.0.0",
       };
       fetchChecksumSpy.mockResolvedValue("new");
-      fetchDomainsSpy.mockResolvedValue(["b.com", "c.com"]);
-      const result = await service.getNextDomains(prev);
-      expect(result!.domains).toEqual(["a.com", "b.com", "c.com"]);
+      fetchWebAddressesSpy.mockResolvedValue(["b.com", "c.com"]);
+      const result = await service.getNextWebAddresses(prev);
+      expect(result!.webAddresses).toEqual(["a.com", "b.com", "c.com"]);
       expect(result!.checksum).toBe("new");
     });
 
     it("fetches all domains if cache is old", async () => {
       const prev: PhishingData = {
-        domains: ["a.com"],
+        webAddresses: ["a.com"],
         timestamp: Date.now() - 2 * 24 * 60 * 60 * 1000,
         checksum: "old",
         applicationVersion: "1.0.0",
       };
       fetchChecksumSpy.mockResolvedValue("new");
-      fetchDomainsSpy.mockResolvedValue(["d.com", "e.com"]);
-      const result = await service.getNextDomains(prev);
-      expect(result!.domains).toEqual(["d.com", "e.com"]);
+      fetchWebAddressesSpy.mockResolvedValue(["d.com", "e.com"]);
+      const result = await service.getNextWebAddresses(prev);
+      expect(result!.webAddresses).toEqual(["d.com", "e.com"]);
       expect(result!.checksum).toBe("new");
     });
   });
diff --git a/apps/browser/src/dirt/phishing-detection/services/phishing-data.service.ts b/apps/browser/src/dirt/phishing-detection/services/phishing-data.service.ts
index 6e1bf07c647..21fe74f1873 100644
--- a/apps/browser/src/dirt/phishing-detection/services/phishing-data.service.ts
+++ b/apps/browser/src/dirt/phishing-detection/services/phishing-data.service.ts
@@ -20,14 +20,16 @@ import { ScheduledTaskNames, TaskSchedulerService } from "@bitwarden/common/plat
 import { LogService } from "@bitwarden/logging";
 import { GlobalStateProvider, KeyDefinition, PHISHING_DETECTION_DISK } from "@bitwarden/state";
 
+import { getPhishingResources, PhishingResourceType } from "../phishing-resources";
+
 export type PhishingData = {
-  domains: string[];
+  webAddresses: string[];
   timestamp: number;
   checksum: string;
 
   /**
    * We store the application version to refetch the entire dataset on a new client release.
-   * This counteracts daily appends updates not removing inactive or false positive domains.
+   * This counteracts daily appends updates not removing inactive or false positive web addresses.
    */
   applicationVersion: string;
 };
@@ -37,34 +39,27 @@ export const PHISHING_DOMAINS_KEY = new KeyDefinition<PhishingData>(
   "phishingDomains",
   {
     deserializer: (value: PhishingData) =>
-      value ?? { domains: [], timestamp: 0, checksum: "", applicationVersion: "" },
+      value ?? { webAddresses: [], timestamp: 0, checksum: "", applicationVersion: "" },
   },
 );
 
-/** Coordinates fetching, caching, and patching of known phishing domains */
+/** Coordinates fetching, caching, and patching of known phishing web addresses */
 export class PhishingDataService {
-  private static readonly RemotePhishingDatabaseUrl =
-    "https://raw.githubusercontent.com/Phishing-Database/Phishing.Database/master/phishing-domains-ACTIVE.txt";
-  private static readonly RemotePhishingDatabaseChecksumUrl =
-    "https://raw.githubusercontent.com/Phishing-Database/checksums/refs/heads/master/phishing-domains-ACTIVE.txt.md5";
-  private static readonly RemotePhishingDatabaseTodayUrl =
-    "https://raw.githubusercontent.com/Phishing-Database/Phishing.Database/refs/heads/master/phishing-domains-NEW-today.txt";
-
-  private _testDomains = this.getTestDomains();
+  private _testWebAddresses = this.getTestWebAddresses();
   private _cachedState = this.globalStateProvider.get(PHISHING_DOMAINS_KEY);
-  private _domains$ = this._cachedState.state$.pipe(
+  private _webAddresses$ = this._cachedState.state$.pipe(
     map(
       (state) =>
         new Set(
-          (state?.domains?.filter((line) => line.trim().length > 0) ?? []).concat(
-            this._testDomains,
+          (state?.webAddresses?.filter((line) => line.trim().length > 0) ?? []).concat(
+            this._testWebAddresses,
             "phishing.testcategory.com", // Included for QA to test in prod
           ),
         ),
     ),
   );
 
-  // How often are new domains added to the remote?
+  // How often are new web addresses added to the remote?
   readonly UPDATE_INTERVAL_DURATION = 24 * 60 * 60 * 1000; // 24 hours
 
   private _triggerUpdate$ = new Subject<void>();
@@ -75,7 +70,7 @@ export class PhishingDataService {
       this._cachedState.state$.pipe(
         first(), // Only take the first value to avoid an infinite loop when updating the cache below
         switchMap(async (cachedState) => {
-          const next = await this.getNextDomains(cachedState);
+          const next = await this.getNextWebAddresses(cachedState);
           if (next) {
             await this._cachedState.update(() => next);
             this.logService.info(`[PhishingDataService] cache updated`);
@@ -85,7 +80,7 @@ export class PhishingDataService {
           count: 3,
           delay: (err, count) => {
             this.logService.error(
-              `[PhishingDataService] Unable to update domains. Attempt ${count}.`,
+              `[PhishingDataService] Unable to update web addresses. Attempt ${count}.`,
               err,
             );
             return timer(5 * 60 * 1000); // 5 minutes
@@ -97,7 +92,7 @@ export class PhishingDataService {
             err: unknown /** Eslint actually crashed if you remove this type: https://github.com/cartant/eslint-plugin-rxjs/issues/122 */,
           ) => {
             this.logService.error(
-              "[PhishingDataService] Retries unsuccessful. Unable to update domains.",
+              "[PhishingDataService] Retries unsuccessful. Unable to update web addresses.",
               err,
             );
             return EMPTY;
@@ -114,6 +109,7 @@ export class PhishingDataService {
     private globalStateProvider: GlobalStateProvider,
     private logService: LogService,
     private platformUtilsService: PlatformUtilsService,
+    private resourceType: PhishingResourceType = PhishingResourceType.Links,
   ) {
     this.taskSchedulerService.registerTaskHandler(ScheduledTaskNames.phishingDomainUpdate, () => {
       this._triggerUpdate$.next();
@@ -125,22 +121,31 @@ export class PhishingDataService {
   }
 
   /**
-   * Checks if the given URL is a known phishing domain
+   * Checks if the given URL is a known phishing web address
    *
    * @param url The URL to check
-   * @returns True if the URL is a known phishing domain, false otherwise
+   * @returns True if the URL is a known phishing web address, false otherwise
    */
-  async isPhishingDomain(url: URL): Promise<boolean> {
-    const domains = await firstValueFrom(this._domains$);
-    const result = domains.has(url.hostname);
-    if (result) {
-      return true;
+  async isPhishingWebAddress(url: URL): Promise<boolean> {
+    // Use domain (hostname) matching for domain resources, and link matching for links resources
+    const entries = await firstValueFrom(this._webAddresses$);
+
+    const resource = getPhishingResources(this.resourceType);
+    if (resource && resource.match) {
+      for (const entry of entries) {
+        if (resource.match(url, entry)) {
+          return true;
+        }
+      }
+      return false;
     }
-    return false;
+
+    // Default/domain behavior: exact hostname match as a fallback
+    return entries.has(url.hostname);
   }
 
-  async getNextDomains(prev: PhishingData | null): Promise<PhishingData | null> {
-    prev = prev ?? { domains: [], timestamp: 0, checksum: "", applicationVersion: "" };
+  async getNextWebAddresses(prev: PhishingData | null): Promise<PhishingData | null> {
+    prev = prev ?? { webAddresses: [], timestamp: 0, checksum: "", applicationVersion: "" };
     const timestamp = Date.now();
     const prevAge = timestamp - prev.timestamp;
     this.logService.info(`[PhishingDataService] Cache age: ${prevAge}`);
@@ -148,7 +153,7 @@ export class PhishingDataService {
     const applicationVersion = await this.platformUtilsService.getApplicationVersion();
 
     // If checksum matches, return existing data with new timestamp & version
-    const remoteChecksum = await this.fetchPhishingDomainsChecksum();
+    const remoteChecksum = await this.fetchPhishingChecksum(this.resourceType);
     if (remoteChecksum && prev.checksum === remoteChecksum) {
       this.logService.info(
         `[PhishingDataService] Remote checksum matches local checksum, updating timestamp only.`,
@@ -157,66 +162,66 @@ export class PhishingDataService {
     }
     // Checksum is different, data needs to be updated.
 
-    // Approach 1: Fetch only new domains and append
+    // Approach 1: Fetch only new web addresses and append
     const isOneDayOldMax = prevAge <= this.UPDATE_INTERVAL_DURATION;
     if (isOneDayOldMax && applicationVersion === prev.applicationVersion) {
-      const dailyDomains: string[] = await this.fetchPhishingDomains(
-        PhishingDataService.RemotePhishingDatabaseTodayUrl,
-      );
+      const webAddressesTodayUrl = getPhishingResources(this.resourceType)!.todayUrl;
+      const dailyWebAddresses: string[] =
+        await this.fetchPhishingWebAddresses(webAddressesTodayUrl);
       this.logService.info(
-        `[PhishingDataService] ${dailyDomains.length} new phishing domains added`,
+        `[PhishingDataService] ${dailyWebAddresses.length} new phishing web addresses added`,
       );
       return {
-        domains: prev.domains.concat(dailyDomains),
+        webAddresses: prev.webAddresses.concat(dailyWebAddresses),
         checksum: remoteChecksum,
         timestamp,
         applicationVersion,
       };
     }
 
-    // Approach 2: Fetch all domains
-    const domains = await this.fetchPhishingDomains(PhishingDataService.RemotePhishingDatabaseUrl);
+    // Approach 2: Fetch all web addresses
+    const remoteUrl = getPhishingResources(this.resourceType)!.remoteUrl;
+    const remoteWebAddresses = await this.fetchPhishingWebAddresses(remoteUrl);
     return {
-      domains,
+      webAddresses: remoteWebAddresses,
       timestamp,
       checksum: remoteChecksum,
       applicationVersion,
     };
   }
 
-  private async fetchPhishingDomainsChecksum() {
-    const response = await this.apiService.nativeFetch(
-      new Request(PhishingDataService.RemotePhishingDatabaseChecksumUrl),
-    );
+  private async fetchPhishingChecksum(type: PhishingResourceType = PhishingResourceType.Domains) {
+    const checksumUrl = getPhishingResources(type)!.checksumUrl;
+    const response = await this.apiService.nativeFetch(new Request(checksumUrl));
     if (!response.ok) {
       throw new Error(`[PhishingDataService] Failed to fetch checksum: ${response.status}`);
     }
     return response.text();
   }
 
-  private async fetchPhishingDomains(url: string) {
+  private async fetchPhishingWebAddresses(url: string) {
     const response = await this.apiService.nativeFetch(new Request(url));
 
     if (!response.ok) {
-      throw new Error(`[PhishingDataService] Failed to fetch domains: ${response.status}`);
+      throw new Error(`[PhishingDataService] Failed to fetch web addresses: ${response.status}`);
     }
 
     return response.text().then((text) => text.split("\n"));
   }
 
-  private getTestDomains() {
+  private getTestWebAddresses() {
     const flag = devFlagEnabled("testPhishingUrls");
     if (!flag) {
       return [];
     }
 
-    const domains = devFlagValue("testPhishingUrls") as unknown[];
-    if (domains && domains instanceof Array) {
+    const webAddresses = devFlagValue("testPhishingUrls") as unknown[];
+    if (webAddresses && webAddresses instanceof Array) {
       this.logService.debug(
-        "[PhishingDetectionService] Dev flag enabled for testing phishing detection. Adding test phishing domains:",
-        domains,
+        "[PhishingDetectionService] Dev flag enabled for testing phishing detection. Adding test phishing web addresses:",
+        webAddresses,
       );
-      return domains as string[];
+      return webAddresses as string[];
     }
     return [];
   }
diff --git a/apps/browser/src/dirt/phishing-detection/services/phishing-detection.service.ts b/apps/browser/src/dirt/phishing-detection/services/phishing-detection.service.ts
index e04d08559ab..d90e872eef8 100644
--- a/apps/browser/src/dirt/phishing-detection/services/phishing-detection.service.ts
+++ b/apps/browser/src/dirt/phishing-detection/services/phishing-detection.service.ts
@@ -94,7 +94,7 @@ export class PhishingDetectionService {
           this._ignoredHostnames.delete(url.hostname);
           return;
         }
-        const isPhishing = await phishingDataService.isPhishingDomain(url);
+        const isPhishing = await phishingDataService.isPhishingWebAddress(url);
         if (!isPhishing) {
           return;
         }