File tree Expand file tree Collapse file tree 2 files changed +8
-8
lines changed
Expand file tree Collapse file tree 2 files changed +8
-8
lines changed Original file line number Diff line number Diff line change 5959import threading
6060import time
6161from tqdm import tqdm
62- from urllib .parse import urlparse
62+ from urllib .parse import urlsplit
6363from web_monitoring import db
6464import wayback
6565from wayback .exceptions import (WaybackException , WaybackRetryError ,
@@ -931,14 +931,14 @@ def _is_page(version):
931931 aren't filtering down to an explicit list of URLs.
932932 """
933933 return (version .mime_type not in SUBRESOURCE_MIME_TYPES and
934- splitext (urlparse (version .url ).path )[1 ] not in SUBRESOURCE_EXTENSIONS )
934+ splitext (urlsplit (version .url ).path )[1 ] not in SUBRESOURCE_EXTENSIONS )
935935
936936
937937def _parse_path (path_string ):
938938 if path_string is None :
939939 return None
940940
941- parsed = urlparse (path_string )
941+ parsed = urlsplit (path_string )
942942 if parsed .scheme == '' :
943943 return Path (path_string )
944944 elif parsed .scheme == 'file' :
@@ -955,9 +955,9 @@ def _is_valid(url):
955955 a URL is valid if it has a valid addressing scheme and network location.
956956 """
957957 try :
958- result = urlparse (url )
958+ result = urlsplit (url )
959959 return all ([result .scheme , result .netloc ])
960- except :
960+ except Exception :
961961 return False
962962
963963
Original file line number Diff line number Diff line change 1717import threading
1818import time
1919from typing import Generator , Iterable , TypeVar
20- from urllib .parse import ParseResult , urlparse
20+ from urllib .parse import SplitResult , urlsplit
2121
2222try :
2323 from cchardet import detect as detect_charset
@@ -164,7 +164,7 @@ def hash_content(content_bytes):
164164 return hashlib .sha256 (content_bytes ).hexdigest ()
165165
166166
167- def normalize_netloc (url : ParseResult ) -> str :
167+ def normalize_netloc (url : SplitResult ) -> str :
168168 """
169169 Get a parsed URL's netloc in a normalized form.
170170 """
@@ -193,7 +193,7 @@ def normalize_url(url: str) -> str:
193193 should always be handled by a server or HTTP library exactly the same as
194194 the input would have been.
195195 """
196- parsed = urlparse (url )
196+ parsed = urlsplit (url )
197197 return parsed ._replace (
198198 netloc = normalize_netloc (parsed ),
199199 path = (parsed .path or '/' ),
You can’t perform that action at this time.
0 commit comments