Replace urlparse with urlsplit (#887)

Mr0grog · web-flow · commit f8a607889395 · 2025-10-10T12:28:14.000-07:00
I learned recently that `urlsplit` is both faster and more in line with current URL standards than `urlparse` (it does not split the obsolete `;params` component out of the path). We should generally use it instead of `urlparse`.
diff --git a/web_monitoring/cli/cli.py b/web_monitoring/cli/cli.py
@@ -59,7 +59,7 @@
 import threading
 import time
 from tqdm import tqdm
-from urllib.parse import urlparse
+from urllib.parse import urlsplit
 from web_monitoring import db
 import wayback
 from wayback.exceptions import (WaybackException, WaybackRetryError,
@@ -931,14 +931,14 @@ def _is_page(version):
     aren't filtering down to a explicit list of URLs.
     """
     return (version.mime_type not in SUBRESOURCE_MIME_TYPES and
-            splitext(urlparse(version.url).path)[1] not in SUBRESOURCE_EXTENSIONS)
+            splitext(urlsplit(version.url).path)[1] not in SUBRESOURCE_EXTENSIONS)
 
 
 def _parse_path(path_string):
     if path_string is None:
         return None
 
-    parsed = urlparse(path_string)
+    parsed = urlsplit(path_string)
     if parsed.scheme == '':
         return Path(path_string)
     elif parsed.scheme == 'file':
@@ -955,9 +955,9 @@ def _is_valid(url):
     a URL is valid if it has a valid addressing scheme and network location.
     """
     try:
-        result = urlparse(url)
+        result = urlsplit(url)
         return all([result.scheme, result.netloc])
-    except:
+    except Exception:
         return False
 
 
diff --git a/web_monitoring/utils.py b/web_monitoring/utils.py
@@ -17,7 +17,7 @@
 import threading
 import time
 from typing import Generator, Iterable, TypeVar
-from urllib.parse import ParseResult, urlparse
+from urllib.parse import SplitResult, urlsplit
 
 try:
     from cchardet import detect as detect_charset
@@ -164,7 +164,7 @@ def hash_content(content_bytes):
     return hashlib.sha256(content_bytes).hexdigest()
 
 
-def normalize_netloc(url: ParseResult) -> str:
+def normalize_netloc(url: SplitResult) -> str:
     """
     Get a parsed URL's netloc in a normalized form.
     """
@@ -193,7 +193,7 @@ def normalize_url(url: str) -> str:
     should always be handled by a server or HTTP library exactly the same as
     the input would have been.
     """
-    parsed = urlparse(url)
+    parsed = urlsplit(url)
     return parsed._replace(
         netloc=normalize_netloc(parsed),
         path=(parsed.path or '/'),