Skip to content

Commit 3c56191

Browse files
authored
Merge pull request #66 from Preetwinder/canonicalize_url
[MRG+1] Add canonicalize_url function #65
2 parents 9952180 + 4954e10 commit 3c56191

File tree

2 files changed

+380
-2
lines changed

2 files changed

+380
-2
lines changed

tests/test_url.py

Lines changed: 207 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,10 @@
44
import unittest
55
from w3lib.url import (is_url, safe_url_string, safe_download_url,
66
url_query_parameter, add_or_replace_parameter, url_query_cleaner,
7-
file_uri_to_path, path_to_file_uri, any_to_uri, urljoin_rfc)
7+
file_uri_to_path, path_to_file_uri, any_to_uri, urljoin_rfc,
8+
canonicalize_url, parse_url)
9+
from six.moves.urllib.parse import urlparse
10+
811

912
class UrlTests(unittest.TestCase):
1013

@@ -347,6 +350,209 @@ def test_urljoin_rfc_deprecated(self):
347350
self.assertEqual(jurl, b"http://www.example.com/test")
348351

349352

353+
class CanonicalizeUrlTest(unittest.TestCase):
354+
355+
def test_canonicalize_url(self):
    # Simplest case: an already-canonical URL comes back unchanged.
    url = "http://www.example.com/"
    self.assertEqual(canonicalize_url(url), "http://www.example.com/")
359+
360+
def test_return_str(self):
    # Both text and bytes input must produce the native ``str`` type.
    # unittest assertions are used instead of bare ``assert`` so the checks
    # are not stripped when Python runs with -O, and so failures report
    # the offending value like every other test in this class.
    self.assertIsInstance(canonicalize_url(u"http://www.example.com"), str)
    self.assertIsInstance(canonicalize_url(b"http://www.example.com"), str)
363+
364+
def test_append_missing_path(self):
    # A URL given without any path is expected to gain a root "/" path.
    canonical = canonicalize_url("http://www.example.com")
    self.assertEqual(canonical, "http://www.example.com/")
367+
368+
def test_typical_usage(self):
    # (input URL, expected canonical form): already-sorted arguments,
    # arguments needing a sort, and a query with a leading empty "&".
    cases = [
        ("http://www.example.com/do?a=1&b=2&c=3",
         "http://www.example.com/do?a=1&b=2&c=3"),
        ("http://www.example.com/do?c=1&b=2&a=3",
         "http://www.example.com/do?a=3&b=2&c=1"),
        ("http://www.example.com/do?&a=1",
         "http://www.example.com/do?a=1"),
    ]
    for url, expected in cases:
        self.assertEqual(canonicalize_url(url), expected)
375+
376+
def test_sorting(self):
    # Arguments are expected sorted by name, and repeated names by value.
    unsorted_url = "http://www.example.com/do?c=3&b=5&b=2&a=50"
    expected = "http://www.example.com/do?a=50&b=2&b=5&c=3"
    self.assertEqual(canonicalize_url(unsorted_url), expected)
379+
380+
def test_keep_blank_values(self):
    # Each case is (input URL, extra keyword arguments, expected result).
    # With keep_blank_values=False blank arguments are expected to be
    # dropped; with the default they are kept as "name=".
    drop_blanks = {"keep_blank_values": False}
    defaults = {}
    cases = [
        ("http://www.example.com/do?b=&a=2", drop_blanks,
         "http://www.example.com/do?a=2"),
        ("http://www.example.com/do?b=&a=2", defaults,
         "http://www.example.com/do?a=2&b="),
        ("http://www.example.com/do?b=&c&a=2", drop_blanks,
         "http://www.example.com/do?a=2"),
        ("http://www.example.com/do?b=&c&a=2", defaults,
         "http://www.example.com/do?a=2&b=&c="),
        # A bare token containing a comma is treated as a blank-valued name.
        (u'http://www.example.com/do?1750,4', defaults,
         'http://www.example.com/do?1750%2C4='),
    ]
    for url, kwargs, expected in cases:
        self.assertEqual(canonicalize_url(url, **kwargs), expected)
392+
393+
def test_spaces(self):
    # A literal space, "+" and "%20" in a query value are all expected
    # to canonicalize to the same "+" form.
    expected = "http://www.example.com/do?a=1&q=a+space"
    variants = [
        "http://www.example.com/do?q=a space&a=1",
        "http://www.example.com/do?q=a+space&a=1",
        "http://www.example.com/do?q=a%20space&a=1",
    ]
    for url in variants:
        self.assertEqual(canonicalize_url(url), expected)
400+
401+
def test_canonicalize_url_unicode_path(self):
    # Non-ASCII path characters are expected as UTF-8 percent-escapes.
    canonical = canonicalize_url(u"http://www.example.com/résumé")
    self.assertEqual(canonical, "http://www.example.com/r%C3%A9sum%C3%A9")
404+
405+
def test_canonicalize_url_unicode_query_string(self):
    # Each case is (input URL, extra keyword arguments, expected result).
    cases = [
        # default encoding for path and query is UTF-8
        (u"http://www.example.com/résumé?q=résumé", {},
         "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9"),
        # a passed encoding affects the query string only; the path
        # stays UTF-8 encoded
        (u"http://www.example.com/résumé?q=résumé", {"encoding": 'latin1'},
         "http://www.example.com/r%C3%A9sum%C3%A9?q=r%E9sum%E9"),
        (u"http://www.example.com/résumé?country=Россия", {"encoding": 'cp1251'},
         "http://www.example.com/r%C3%A9sum%C3%A9?country=%D0%EE%F1%F1%E8%FF"),
    ]
    for url, kwargs, expected in cases:
        self.assertEqual(canonicalize_url(url, **kwargs), expected)
416+
417+
def test_canonicalize_url_unicode_query_string_wrong_encoding(self):
    # The query contains characters that latin1 cannot represent, so the
    # expected output is the UTF-8 fallback encoding of the query string.
    cases = [
        (u"http://www.example.com/résumé?currency=€",
         "http://www.example.com/r%C3%A9sum%C3%A9?currency=%E2%82%AC"),
        (u"http://www.example.com/résumé?country=Россия",
         "http://www.example.com/r%C3%A9sum%C3%A9?country=%D0%A0%D0%BE%D1%81%D1%81%D0%B8%D1%8F"),
    ]
    for url, expected in cases:
        self.assertEqual(canonicalize_url(url, encoding='latin1'), expected)
425+
426+
def test_normalize_percent_encoding_in_paths(self):
    # (input URL, expected canonical form)
    cases = [
        # lower-case UTF-8 escapes are upper-cased
        ("http://www.example.com/r%c3%a9sum%c3%a9",
         "http://www.example.com/r%C3%A9sum%C3%A9"),
        # non-UTF8 encoded sequences: they should be kept untouched,
        # only upper-cased -- 'latin1'-encoded sequence in path
        ("http://www.example.com/a%a3do",
         "http://www.example.com/a%A3do"),
        # 'latin1'-encoded path, UTF-8 encoded query string
        ("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9",
         "http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9"),
        # 'latin1'-encoded path and query string
        ("http://www.example.com/a%a3do?q=r%e9sum%e9",
         "http://www.example.com/a%A3do?q=r%E9sum%E9"),
    ]
    for url, expected in cases:
        self.assertEqual(canonicalize_url(url), expected)
442+
443+
def test_normalize_percent_encoding_in_query_arguments(self):
    # Percent-escapes in query values are expected upper-cased, whether
    # or not they form a valid UTF-8 sequence.
    cases = [
        ("http://www.example.com/do?k=b%a3",
         "http://www.example.com/do?k=b%A3"),
        ("http://www.example.com/do?k=r%c3%a9sum%c3%a9",
         "http://www.example.com/do?k=r%C3%A9sum%C3%A9"),
    ]
    for url, expected in cases:
        self.assertEqual(canonicalize_url(url), expected)
449+
450+
def test_non_ascii_percent_encoding_in_paths(self):
    # Spaces in the path are expected as "%20"; an existing "%20" is kept.
    # (The stray trailing commas that used to follow the first two
    # assertions turned them into throwaway tuples and were removed.)
    self.assertEqual(canonicalize_url("http://www.example.com/a do?a=1"),
                     "http://www.example.com/a%20do?a=1")
    self.assertEqual(canonicalize_url("http://www.example.com/a %20do?a=1"),
                     "http://www.example.com/a%20%20do?a=1")
    # Non-ASCII path characters are expected as UTF-8 percent-escapes,
    # for both text input and UTF-8 encoded bytes input.
    self.assertEqual(canonicalize_url(u"http://www.example.com/a do£.html?a=1"),
                     "http://www.example.com/a%20do%C2%A3.html?a=1")
    self.assertEqual(canonicalize_url(b"http://www.example.com/a do\xc2\xa3.html?a=1"),
                     "http://www.example.com/a%20do%C2%A3.html?a=1")
459+
460+
def test_non_ascii_percent_encoding_in_query_arguments(self):
    # Non-ASCII characters in query names and values are expected as
    # UTF-8 percent-escapes, for text input and for UTF-8 bytes input.
    cases = [
        (u"http://www.example.com/do?price=£500&a=5&z=3",
         u"http://www.example.com/do?a=5&price=%C2%A3500&z=3"),
        (b"http://www.example.com/do?price=\xc2\xa3500&a=5&z=3",
         "http://www.example.com/do?a=5&price=%C2%A3500&z=3"),
        (b"http://www.example.com/do?price(\xc2\xa3)=500&a=1",
         "http://www.example.com/do?a=1&price%28%C2%A3%29=500"),
    ]
    for url, expected in cases:
        self.assertEqual(canonicalize_url(url), expected)
467+
468+
def test_urls_with_auth_and_ports(self):
    # Credentials and an explicit port are expected to be preserved.
    url = u"http://user:pass@www.example.com:81/do?now=1"
    expected = u"http://user:pass@www.example.com:81/do?now=1"
    self.assertEqual(canonicalize_url(url), expected)
471+
472+
def test_remove_fragments(self):
    # The fragment is dropped by default ...
    stripped = canonicalize_url(u"http://user:pass@www.example.com/do?a=1#frag")
    self.assertEqual(stripped, u"http://user:pass@www.example.com/do?a=1")

    # ... and kept when keep_fragments=True is passed.
    kept = canonicalize_url(u"http://user:pass@www.example.com/do?a=1#frag",
                            keep_fragments=True)
    self.assertEqual(kept, u"http://user:pass@www.example.com/do?a=1#frag")
477+
478+
def test_dont_convert_safe_characters(self):
    # Safe characters (here ":" and "-" in the path) must not be turned
    # into their percent-encoded representation.
    url = "http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html"
    expected = "http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html"
    self.assertEqual(canonicalize_url(url), expected)
483+
484+
def test_safe_characters_unicode(self):
    # urllib.quote keeps a mapping cache of encoded characters. Parsing
    # an already percent-encoded URL fails if that URL was not
    # percent-encoded as UTF-8, which is why canonicalize_url must always
    # convert URLs to string. This test asserts that behaviour with a
    # latin1-style "%E9" escape passed as a unicode URL.
    url = u'http://www.example.com/caf%E9-con-leche.htm'
    expected = 'http://www.example.com/caf%E9-con-leche.htm'
    self.assertEqual(canonicalize_url(url), expected)
492+
493+
def test_domains_are_case_insensitive(self):
    # The host name is expected to be lower-cased.
    canonical = canonicalize_url("http://www.EXAMPLE.com/")
    self.assertEqual(canonical, "http://www.example.com/")
496+
497+
def test_canonicalize_idns(self):
    # Internationalized host names are expected in their "xn--" form,
    # while the query string is percent-encoded as UTF-8.
    cases = [
        (u'http://www.bücher.de?q=bücher',
         'http://www.xn--bcher-kva.de/?q=b%C3%BCcher'),
        # Japanese (+ reordering query parameters)
        (u'http://はじめよう.みんな/?query=サ&maxResults=5',
         'http://xn--p8j9a0d9c9a.xn--q9jyb4c/?maxResults=5&query=%E3%82%B5'),
    ]
    for url, expected in cases:
        self.assertEqual(canonicalize_url(url), expected)
503+
504+
def test_quoted_slash_and_question_sign(self):
    # Escaped "/" and "?" in the path must stay escaped (only the hex
    # digits are upper-cased) so the path structure is not altered.
    cases = [
        ("http://foo.com/AC%2FDC+rocks%3f/?yeah=1",
         "http://foo.com/AC%2FDC+rocks%3F/?yeah=1"),
        ("http://foo.com/AC%2FDC/",
         "http://foo.com/AC%2FDC/"),
    ]
    for url, expected in cases:
        self.assertEqual(canonicalize_url(url), expected)
509+
510+
def test_canonicalize_urlparsed(self):
    # canonicalize_url() can be passed an already urlparse'd URL
    cases = [
        (u"http://www.example.com/résumé?q=résumé",
         "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9"),
        ('http://www.example.com/caf%e9-con-leche.htm',
         'http://www.example.com/caf%E9-con-leche.htm'),
        ("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9",
         "http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9"),
    ]
    for raw_url, expected in cases:
        parsed = urlparse(raw_url)
        self.assertEqual(canonicalize_url(parsed), expected)
518+
519+
def test_canonicalize_parse_url(self):
    # parse_url() wraps urlparse and is used in link extractors; its
    # result must be accepted by canonicalize_url() as well.
    cases = [
        (u"http://www.example.com/résumé?q=résumé",
         "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9"),
        ('http://www.example.com/caf%e9-con-leche.htm',
         'http://www.example.com/caf%E9-con-leche.htm'),
        ("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9",
         "http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9"),
    ]
    for raw_url, expected in cases:
        parsed = parse_url(raw_url)
        self.assertEqual(canonicalize_url(parsed), expected)
527+
528+
def test_canonicalize_url_idempotence(self):
    # (raw URL, encoding used for the first canonicalization)
    samples = [
        (u'http://www.bücher.de/résumé?q=résumé', 'utf8'),
        (u'http://www.example.com/résumé?q=résumé', 'latin1'),
        (u'http://www.example.com/résumé?country=Россия', 'cp1251'),
        (u'http://はじめよう.みんな/?query=サ&maxResults=5', 'iso2022jp'),
    ]
    for raw_url, encoding in samples:
        once = canonicalize_url(raw_url, encoding=encoding)

        # if we canonicalize again, we get the same result
        twice = canonicalize_url(once, encoding=encoding)
        self.assertEqual(twice, once)

        # without encoding, an already canonicalized URL is
        # canonicalized identically
        self.assertEqual(canonicalize_url(once), once)
540+
541+
def test_canonicalize_url_idna_exceptions(self):
    # In both cases the host is expected to come back unchanged while
    # path and query are still percent-encoded.

    # missing DNS label
    canonical = canonicalize_url(u"http://.example.com/résumé?q=résumé")
    self.assertEqual(
        canonical,
        "http://.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")

    # DNS label too long
    long_url = u"http://www.{label}.com/résumé?q=résumé".format(
        label=u"example"*11)
    expected = "http://www.{label}.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9".format(
        label=u"example"*11)
    self.assertEqual(canonicalize_url(long_url), expected)
554+
555+
350556
# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
352558

0 commit comments

Comments
 (0)