Commit 15bbc66
Normalize URLs during WARC import (#886)
It turns out we've been missing some records because redirect URLs (in a 3xx response's `Location` header) do not always exactly match the requested target URL as recorded in the WARC record. This commit solves the problem by normalizing URLs before trying to match them.

For example, `https://www.heat.gov/` recently stopped getting recorded because it started redirecting to `https://heat.gov`, which got recorded in the WARC as `https://heat.gov/`. A `/` path is exactly equivalent to an empty path, but because we were looking for a URL without a path, we didn't find a record, even though the correct matching record was there, just with `/` for its path. While working on this case, I found similar problems arising from variations on the same cause, such as redundant default ports (e.g. the `:443` in `https://whatever.com:443/` is redundant because `443` is the default port for the `https` scheme).
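To make the failure mode concrete, here is a minimal sketch using only the standard library (`normalize_url` refers to the helper this commit adds in `web_monitoring/utils.py`):

    from urllib.parse import urljoin, urlparse

    # What the crawler requested vs. what the 3xx Location header pointed to.
    requested = 'https://www.heat.gov/'
    location = 'https://heat.gov'    # no path at all
    recorded = 'https://heat.gov/'   # the WARC stores the '/' path

    # Resolving the redirect target the old way keeps the empty path...
    target = urljoin(requested, location)
    assert target == 'https://heat.gov'

    # ...so an exact string match against the recorded URL fails, even though
    # the two URLs are semantically identical (empty path == '/' path).
    assert target != recorded
    assert urlparse(target).path == '' and urlparse(recorded).path == '/'

Normalizing both sides with `normalize_url` maps each to `https://heat.gov/`, so the lookup succeeds.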
1 parent 6b41e7a commit 15bbc66

File tree: 3 files changed (+75 −16 lines)


web_monitoring/cli/warc_import.py

Lines changed: 6 additions & 15 deletions
@@ -10,7 +10,7 @@
 from pathlib import Path
 import sys
 from typing import Any, Generator
-from urllib.parse import urljoin, urlparse
+from urllib.parse import urljoin
 import sentry_sdk
 from tqdm.contrib.logging import tqdm_logging_redirect
 from warcio import ArchiveIterator
@@ -19,21 +19,12 @@
 from .. import db
 from .. import utils
 from ..media import HTML_MEDIA_TYPES, PDF_MEDIA_TYPES, find_media_type
-from ..utils import S3HashStore, detect_encoding
+from ..utils import S3HashStore, detect_encoding, normalize_url


 logger = logging.getLogger(__name__)


-def normalize_seed_url(url: str) -> str:
-    """
-    Ensure a URL is an actual, valid URL that could have been sent to a server
-    in an HTTP request and therefore recorded in a WARC record.
-    """
-    parsed = urlparse(url)
-    return parsed._replace(path=(parsed.path or '/'), fragment='').geturl()
-
-
 def read_browsertrix_pages_seeds(seeds_path: str) -> list[str]:
     with open(seeds_path, 'r') as file:
         try:
@@ -44,7 +35,7 @@ def read_browsertrix_pages_seeds(seeds_path: str) -> list[str]:
             raise ValueError('Seeds file is not a Browsertrix "json-pages-1.0" file.')

         pages = (json.loads(line) for line in file if line != '')
-        return [normalize_seed_url(page['url'])
+        return [normalize_url(page['url'])
                 for page in pages
                 if page['seed']]
@@ -54,7 +45,7 @@ def read_browsertrix_config_seeds(seeds_path: str) -> list[str]:
         data = yaml.safe_load(file)
         seeds = data.get('seeds')
         if isinstance(seeds, list):
-            return [normalize_seed_url(seed if isinstance(seed, str) else seed['url'])
+            return [normalize_url(seed if isinstance(seed, str) else seed['url'])
                     for seed in seeds]
         else:
             raise ValueError(f'Seeds file is missing `seeds` key that is an array of URL strings: "{seeds_path}"')
@@ -86,7 +77,7 @@ def redirect_target(self) -> str:
         status = self.response.http_headers.get_statuscode()
         location = self.response.http_headers.get_header('location')
         if status.startswith('3') and location:
-            return urljoin(self.url, location)
+            return normalize_url(urljoin(self.url, location))
         # Amazon WAF browser challenge works by reloading the same URL with a
         # cookie. Treat this like a redirect; we should have captured the
         # second request to the same URL.
@@ -225,7 +216,7 @@ def each_redirect_chain(warcs: list[str], seeds: set[str]) -> Generator[Redirect
             entry = RecordIndexEntry(
                 id=record.rec_headers.get('WARC-Record-ID'),
                 timestamp=dateutil.parser.parse(record.rec_headers.get('WARC-Date')).astimezone(timezone.utc),
-                uri=record.rec_headers.get('WARC-Target-URI'),
+                uri=normalize_url(record.rec_headers.get('WARC-Target-URI')),
                 type=record.rec_type,
                 file=warc,
                 offset=reader.get_record_offset(),
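With both the computed `redirect_target` and each indexed `WARC-Target-URI` passed through `normalize_url`, matching reduces to exact string equality on canonical forms. A rough sketch of the invariant this establishes (the dict-based index here is illustrative, not the module's actual data structure):

    from urllib.parse import urljoin
    from web_monitoring.utils import normalize_url

    # Hypothetical mini-index keyed by normalized URI, mimicking how
    # RecordIndexEntry.uri is now stored.
    index = {normalize_url('https://heat.gov'): 'record-123'}

    # Following a redirect whose Location header omits the trailing slash:
    target = normalize_url(urljoin('https://www.heat.gov/', 'https://heat.gov'))
    assert target == 'https://heat.gov/'
    assert index[target] == 'record-123'   # the lookup now succeeds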

web_monitoring/tests/test_utils.py

Lines changed: 31 additions & 1 deletion
@@ -4,7 +4,7 @@
 from support import get_fixture_bytes
 import threading
 from web_monitoring.utils import (extract_html_title, extract_pdf_title,
-                                  RateLimit, FiniteQueue)
+                                  normalize_url, RateLimit, FiniteQueue)


 def test_extract_html_title():
@@ -94,6 +94,36 @@ def test_extract_pdf_title_no_metadata():
     assert title is None


+class TestNormalizeUrl:
+    def test_normalizes_scheme(self):
+        assert normalize_url('hTTps://whatever.com/') == 'https://whatever.com/'
+
+    def test_normalizes_domain(self):
+        assert normalize_url('https://whatEVER.com/') == 'https://whatever.com/'
+
+    def test_removes_redundant_https_port(self):
+        assert normalize_url('https://whatever.com:443/') == 'https://whatever.com/'
+
+    def test_removes_redundant_http_port(self):
+        assert normalize_url('http://whatever.com:80/') == 'http://whatever.com/'
+
+    def test_leaves_credentials_alone(self):
+        assert normalize_url('https://aBc:DeF@whatEVER.com/') == 'https://aBc:DeF@whatever.com/'
+
+    def test_ensures_a_path(self):
+        assert normalize_url('https://whatever.com') == 'https://whatever.com/'
+
+    def test_removes_fragment(self):
+        assert normalize_url('https://whatever.com/x#y') == 'https://whatever.com/x'
+
+    def test_keeps_existing_path(self):
+        assert normalize_url('https://whatever.com/X/y') == 'https://whatever.com/X/y'
+
+    def test_keeps_www(self):
+        assert normalize_url('https://www.whatever.com/') == 'https://www.whatever.com/'
+        assert normalize_url('https://www3.whatever.com/') == 'https://www3.whatever.com/'
+
+
 class TestRateLimit:
     def test_rate_limit(self):
         limiter = RateLimit(per_second=2)

web_monitoring/utils.py

Lines changed: 38 additions & 0 deletions
@@ -17,6 +17,7 @@
 import threading
 import time
 from typing import Generator, Iterable, TypeVar
+from urllib.parse import ParseResult, urlparse

 try:
     from cchardet import detect as detect_charset
@@ -163,6 +164,43 @@ def hash_content(content_bytes):
     return hashlib.sha256(content_bytes).hexdigest()


+def normalize_netloc(url: ParseResult) -> str:
+    """
+    Get a parsed URL's netloc in a normalized form.
+    """
+    assert url.hostname
+
+    result = ''
+    if url.username:
+        result += url.username
+        if url.password:
+            result += ':' + url.password
+        result += '@'
+    result += url.hostname.lower()
+    if (
+        url.port
+        and not (url.scheme == 'https' and url.port == 443)
+        and not (url.scheme == 'http' and url.port == 80)
+    ):
+        result += f':{url.port}'
+
+    return result
+
+
+def normalize_url(url: str) -> str:
+    """
+    Normalize a URL into an unambiguous, standardized form. The output of this
+    should always be handled by a server or HTTP library exactly the same as
+    the input would have been.
+    """
+    parsed = urlparse(url)
+    return parsed._replace(
+        netloc=normalize_netloc(parsed),
+        path=(parsed.path or '/'),
+        fragment=''
+    ).geturl()
+
+
 class RateLimit:
     """
     RateLimit is a simple locking mechanism that can be used to enforce rate
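Taken together, `normalize_url` canonicalizes scheme case, host case, default ports, empty paths, and fragments in one pass, while leaving case-significant parts alone. A couple of illustrative calls, consistent with the implementation and tests above:

    from web_monitoring.utils import normalize_url

    # Scheme and host are case-insensitive, :443 is the https default, an
    # empty path means '/', and fragments never reach the server -- all of
    # these normalize away.
    assert normalize_url('hTTps://Heat.GOV:443#top') == 'https://heat.gov/'

    # Credentials, the path's case, and the query string are left untouched.
    assert normalize_url('https://aBc:DeF@example.com/X/y?q=1') == \
        'https://aBc:DeF@example.com/X/y?q=1'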
