Commit 15bbc66
Normalize URLs during WARC import (#886)
It turns out we've been missing some records because redirect URLs (in a 3xx response's `Location` header) do not always exactly match the requested target URL as recorded in the WARC record. This commit solves the problem by normalizing URLs before trying to match them.

For example, `https://www.heat.gov/` recently stopped getting recorded because it started redirecting to `https://heat.gov`, which got recorded in the WARC as `https://heat.gov/`. A `/` path is exactly equivalent to an empty path, but because we were looking for a URL without a path, we didn't find a record, even though the correct matching record was there, just with `/` for its path. While working on this case, I found similar problems arising from variations on the same cause, such as redundant default ports (e.g. the `:443` in `https://whatever.com:443/` is redundant because `443` is the default port for the `https` scheme).
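To make the failure mode concrete, here is a minimal sketch using only the standard library (`normalize_url` refers to the helper this commit adds in `web_monitoring/utils.py`):

    from urllib.parse import urljoin, urlparse

    # What the crawler requested vs. what the 3xx Location header pointed to.
    requested = 'https://www.heat.gov/'
    location = 'https://heat.gov'    # no path at all
    recorded = 'https://heat.gov/'   # the WARC stores the '/' path

    # Resolving the redirect target the old way keeps the empty path...
    target = urljoin(requested, location)
    assert target == 'https://heat.gov'

    # ...so an exact string match against the recorded URL fails, even though
    # the two URLs are semantically identical (empty path == '/' path).
    assert target != recorded
    assert urlparse(target).path == '' and urlparse(recorded).path == '/'

Normalizing both sides with `normalize_url` maps each to `https://heat.gov/`, so the lookup succeeds.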
1 parent 6b41e7a commit 15bbc66

File tree: 3 files changed (+75 −16 lines)


web_monitoring/cli/warc_import.py

Lines changed: 6 additions & 15 deletions
@@ -10,7 +10,7 @@
 from pathlib import Path
 import sys
 from typing import Any, Generator
-from urllib.parse import urljoin, urlparse
+from urllib.parse import urljoin
 import sentry_sdk
 from tqdm.contrib.logging import tqdm_logging_redirect
 from warcio import ArchiveIterator
@@ -19,21 +19,12 @@
 from .. import db
 from .. import utils
 from ..media import HTML_MEDIA_TYPES, PDF_MEDIA_TYPES, find_media_type
-from ..utils import S3HashStore, detect_encoding
+from ..utils import S3HashStore, detect_encoding, normalize_url


 logger = logging.getLogger(__name__)


-def normalize_seed_url(url: str) -> str:
-    """
-    Ensure a URL is an actual, valid URL that could have been sent to a server
-    in an HTTP request and therefore recorded in a WARC record.
-    """
-    parsed = urlparse(url)
-    return parsed._replace(path=(parsed.path or '/'), fragment='').geturl()
-
-
 def read_browsertrix_pages_seeds(seeds_path: str) -> list[str]:
     with open(seeds_path, 'r') as file:
         try:
@@ -44,7 +35,7 @@ def read_browsertrix_pages_seeds(seeds_path: str) -> list[str]:
             raise ValueError('Seeds file is not a Browsertrix "json-pages-1.0" file.')

         pages = (json.loads(line) for line in file if line != '')
-        return [normalize_seed_url(page['url'])
+        return [normalize_url(page['url'])
                 for page in pages
                 if page['seed']]
@@ -54,7 +45,7 @@ def read_browsertrix_config_seeds(seeds_path: str) -> list[str]:
         data = yaml.safe_load(file)
         seeds = data.get('seeds')
         if isinstance(seeds, list):
-            return [normalize_seed_url(seed if isinstance(seed, str) else seed['url'])
+            return [normalize_url(seed if isinstance(seed, str) else seed['url'])
                     for seed in seeds]
         else:
             raise ValueError(f'Seeds file is missing `seeds` key that is an array of URL strings: "{seeds_path}"')
@@ -86,7 +77,7 @@ def redirect_target(self) -> str:
         status = self.response.http_headers.get_statuscode()
         location = self.response.http_headers.get_header('location')
         if status.startswith('3') and location:
-            return urljoin(self.url, location)
+            return normalize_url(urljoin(self.url, location))
         # Amazon WAF browser challenge works by reloading the same URL with a
         # cookie. Treat this like a redirect; we should have captured the
         # second request to the same URL.
@@ -225,7 +216,7 @@ def each_redirect_chain(warcs: list[str], seeds: set[str]) -> Generator[Redirect
             entry = RecordIndexEntry(
                 id=record.rec_headers.get('WARC-Record-ID'),
                 timestamp=dateutil.parser.parse(record.rec_headers.get('WARC-Date')).astimezone(timezone.utc),
-                uri=record.rec_headers.get('WARC-Target-URI'),
+                uri=normalize_url(record.rec_headers.get('WARC-Target-URI')),
                 type=record.rec_type,
                 file=warc,
                 offset=reader.get_record_offset(),
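With both the computed `redirect_target` and each indexed `WARC-Target-URI` passed through `normalize_url`, matching reduces to exact string equality on canonical forms. A rough sketch of the invariant this establishes (the dict-based index here is illustrative, not the module's actual data structure):

    from urllib.parse import urljoin
    from web_monitoring.utils import normalize_url

    # Hypothetical mini-index keyed by normalized URI, mimicking how
    # RecordIndexEntry.uri is now stored.
    index = {normalize_url('https://heat.gov'): 'record-123'}

    # Following a redirect whose Location header omits the trailing slash:
    target = normalize_url(urljoin('https://www.heat.gov/', 'https://heat.gov'))
    assert target == 'https://heat.gov/'
    assert index[target] == 'record-123'   # the lookup now succeeds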

web_monitoring/tests/test_utils.py

Lines changed: 31 additions & 1 deletion
@@ -4,7 +4,7 @@
 from support import get_fixture_bytes
 import threading
 from web_monitoring.utils import (extract_html_title, extract_pdf_title,
-                                  RateLimit, FiniteQueue)
+                                  normalize_url, RateLimit, FiniteQueue)


 def test_extract_html_title():
@@ -94,6 +94,36 @@ def test_extract_pdf_title_no_metadata():
     assert title is None


+class TestNormalizeUrl:
+    def test_normalizes_scheme(self):
+        assert normalize_url('hTTps://whatever.com/') == 'https://whatever.com/'
+
+    def test_normalizes_domain(self):
+        assert normalize_url('https://whatEVER.com/') == 'https://whatever.com/'
+
+    def test_removes_redundant_https_port(self):
+        assert normalize_url('https://whatever.com:443/') == 'https://whatever.com/'
+
+    def test_removes_redundant_http_port(self):
+        assert normalize_url('http://whatever.com:80/') == 'http://whatever.com/'
+
+    def test_leaves_credentials_alone(self):
+        assert normalize_url('https://aBc:DeF@whatEVER.com/') == 'https://aBc:DeF@whatever.com/'
+
+    def test_ensures_a_path(self):
+        assert normalize_url('https://whatever.com') == 'https://whatever.com/'
+
+    def test_removes_fragment(self):
+        assert normalize_url('https://whatever.com/x#y') == 'https://whatever.com/x'
+
+    def test_keeps_existing_path(self):
+        assert normalize_url('https://whatever.com/X/y') == 'https://whatever.com/X/y'
+
+    def test_keeps_www(self):
+        assert normalize_url('https://www.whatever.com/') == 'https://www.whatever.com/'
+        assert normalize_url('https://www3.whatever.com/') == 'https://www3.whatever.com/'
+
+
 class TestRateLimit:
     def test_rate_limit(self):
         limiter = RateLimit(per_second=2)

web_monitoring/utils.py

Lines changed: 38 additions & 0 deletions
@@ -17,6 +17,7 @@
 import threading
 import time
 from typing import Generator, Iterable, TypeVar
+from urllib.parse import ParseResult, urlparse

 try:
     from cchardet import detect as detect_charset
@@ -163,6 +164,43 @@ def hash_content(content_bytes):
     return hashlib.sha256(content_bytes).hexdigest()


+def normalize_netloc(url: ParseResult) -> str:
+    """
+    Get a parsed URL's netloc in a normalized form.
+    """
+    assert url.hostname
+
+    result = ''
+    if url.username:
+        result += url.username
+        if url.password:
+            result += ':' + url.password
+        result += '@'
+    result += url.hostname.lower()
+    if (
+        url.port
+        and not (url.scheme == 'https' and url.port == 443)
+        and not (url.scheme == 'http' and url.port == 80)
+    ):
+        result += f':{url.port}'
+
+    return result
+
+
+def normalize_url(url: str) -> str:
+    """
+    Normalize a URL into an unambiguous, standardized form. The output of this
+    should always be handled by a server or HTTP library exactly the same as
+    the input would have been.
+    """
+    parsed = urlparse(url)
+    return parsed._replace(
+        netloc=normalize_netloc(parsed),
+        path=(parsed.path or '/'),
+        fragment=''
+    ).geturl()
+
+
 class RateLimit:
     """
     RateLimit is a simple locking mechanism that can be used to enforce rate
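Taken together, `normalize_url` canonicalizes scheme case, host case, default ports, empty paths, and fragments in one pass, while leaving case-significant parts alone. A couple of illustrative calls, consistent with the implementation and tests above:

    from web_monitoring.utils import normalize_url

    # Scheme and host are case-insensitive, :443 is the https default, an
    # empty path means '/', and fragments never reach the server -- all of
    # these normalize away.
    assert normalize_url('hTTps://Heat.GOV:443#top') == 'https://heat.gov/'

    # Credentials, the path's case, and the query string are left untouched.
    assert normalize_url('https://aBc:DeF@example.com/X/y?q=1') == \
        'https://aBc:DeF@example.com/X/y?q=1'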
