Skip to content

Commit f8a6078

Browse files
authored
Replace urlparse with urlsplit (#887)
I learned recently that `urlsplit` is both faster and more correct/up-to-date, standards-wise. We should generally be using it instead of `urlparse`.
1 parent 074cbb7 commit f8a6078

File tree

2 files changed

+8
-8
lines changed

2 files changed

+8
-8
lines changed

web_monitoring/cli/cli.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@
5959
import threading
6060
import time
6161
from tqdm import tqdm
62-
from urllib.parse import urlparse
62+
from urllib.parse import urlsplit
6363
from web_monitoring import db
6464
import wayback
6565
from wayback.exceptions import (WaybackException, WaybackRetryError,
@@ -931,14 +931,14 @@ def _is_page(version):
931931
aren't filtering down to an explicit list of URLs.
932932
"""
933933
return (version.mime_type not in SUBRESOURCE_MIME_TYPES and
934-
splitext(urlparse(version.url).path)[1] not in SUBRESOURCE_EXTENSIONS)
934+
splitext(urlsplit(version.url).path)[1] not in SUBRESOURCE_EXTENSIONS)
935935

936936

937937
def _parse_path(path_string):
938938
if path_string is None:
939939
return None
940940

941-
parsed = urlparse(path_string)
941+
parsed = urlsplit(path_string)
942942
if parsed.scheme == '':
943943
return Path(path_string)
944944
elif parsed.scheme == 'file':
@@ -955,9 +955,9 @@ def _is_valid(url):
955955
a URL is valid if it has a valid addressing scheme and network location.
956956
"""
957957
try:
958-
result = urlparse(url)
958+
result = urlsplit(url)
959959
return all([result.scheme, result.netloc])
960-
except:
960+
except Exception:
961961
return False
962962

963963

web_monitoring/utils.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
import threading
1818
import time
1919
from typing import Generator, Iterable, TypeVar
20-
from urllib.parse import ParseResult, urlparse
20+
from urllib.parse import SplitResult, urlsplit
2121

2222
try:
2323
from cchardet import detect as detect_charset
@@ -164,7 +164,7 @@ def hash_content(content_bytes):
164164
return hashlib.sha256(content_bytes).hexdigest()
165165

166166

167-
def normalize_netloc(url: ParseResult) -> str:
167+
def normalize_netloc(url: SplitResult) -> str:
168168
"""
169169
Get a parsed URL's netloc in a normalized form.
170170
"""
@@ -193,7 +193,7 @@ def normalize_url(url: str) -> str:
193193
should always be handled by a server or HTTP library exactly the same as
194194
the input would have been.
195195
"""
196-
parsed = urlparse(url)
196+
parsed = urlsplit(url)
197197
return parsed._replace(
198198
netloc=normalize_netloc(parsed),
199199
path=(parsed.path or '/'),

0 commit comments

Comments
 (0)