|
4 | 4 | import unittest |
5 | 5 | from w3lib.url import (is_url, safe_url_string, safe_download_url, |
6 | 6 | url_query_parameter, add_or_replace_parameter, url_query_cleaner, |
7 | | - file_uri_to_path, path_to_file_uri, any_to_uri, urljoin_rfc) |
| 7 | + file_uri_to_path, path_to_file_uri, any_to_uri, urljoin_rfc, |
| 8 | + canonicalize_url, parse_url) |
| 9 | +from six.moves.urllib.parse import urlparse |
| 10 | + |
8 | 11 |
|
9 | 12 | class UrlTests(unittest.TestCase): |
10 | 13 |
|
@@ -347,6 +350,209 @@ def test_urljoin_rfc_deprecated(self): |
347 | 350 | self.assertEqual(jurl, b"http://www.example.com/test") |
348 | 351 |
|
349 | 352 |
|
class CanonicalizeUrlTest(unittest.TestCase):
    """Expected-output tests for ``canonicalize_url``.

    Every test passes a URL (text, bytes, or an already parsed URL) to
    ``canonicalize_url`` and compares the result with the exact canonical
    string that is expected back.
    """

    def test_canonicalize_url(self):
        # simplest case: an already-canonical URL comes back unchanged
        self.assertEqual(
            canonicalize_url("http://www.example.com/"),
            "http://www.example.com/")

    def test_return_str(self):
        # Text and bytes input must both produce a native ``str``.
        # assertIsInstance is used instead of a bare ``assert`` so the check
        # still runs under ``python -O`` and reports a useful failure message.
        self.assertIsInstance(canonicalize_url(u"http://www.example.com"), str)
        self.assertIsInstance(canonicalize_url(b"http://www.example.com"), str)

    def test_append_missing_path(self):
        # a URL without a path is expected to gain a trailing "/"
        self.assertEqual(
            canonicalize_url("http://www.example.com"),
            "http://www.example.com/")

    def test_typical_usage(self):
        # already-sorted query arguments are left as they are
        self.assertEqual(
            canonicalize_url("http://www.example.com/do?a=1&b=2&c=3"),
            "http://www.example.com/do?a=1&b=2&c=3")
        # query arguments are expected sorted by name
        self.assertEqual(
            canonicalize_url("http://www.example.com/do?c=1&b=2&a=3"),
            "http://www.example.com/do?a=3&b=2&c=1")
        # a leading empty argument ("?&a=1") is expected to be dropped
        self.assertEqual(
            canonicalize_url("http://www.example.com/do?&a=1"),
            "http://www.example.com/do?a=1")

    def test_sorting(self):
        # repeated names ("b") are expected ordered by value as well
        self.assertEqual(
            canonicalize_url("http://www.example.com/do?c=3&b=5&b=2&a=50"),
            "http://www.example.com/do?a=50&b=2&b=5&c=3")

    def test_keep_blank_values(self):
        # keep_blank_values=False drops arguments that have an empty value
        self.assertEqual(
            canonicalize_url("http://www.example.com/do?b=&a=2",
                             keep_blank_values=False),
            "http://www.example.com/do?a=2")
        # by default blank values are kept
        self.assertEqual(
            canonicalize_url("http://www.example.com/do?b=&a=2"),
            "http://www.example.com/do?a=2&b=")
        # an argument without "=" ("c") is treated like a blank value
        self.assertEqual(
            canonicalize_url("http://www.example.com/do?b=&c&a=2",
                             keep_blank_values=False),
            "http://www.example.com/do?a=2")
        self.assertEqual(
            canonicalize_url("http://www.example.com/do?b=&c&a=2"),
            "http://www.example.com/do?a=2&b=&c=")

        # a query that is a single bare token becomes a name with a blank value
        self.assertEqual(
            canonicalize_url(u'http://www.example.com/do?1750,4'),
            'http://www.example.com/do?1750%2C4=')

    def test_spaces(self):
        # a literal space, "+" and "%20" in a query value all end up as "+"
        self.assertEqual(
            canonicalize_url("http://www.example.com/do?q=a space&a=1"),
            "http://www.example.com/do?a=1&q=a+space")
        self.assertEqual(
            canonicalize_url("http://www.example.com/do?q=a+space&a=1"),
            "http://www.example.com/do?a=1&q=a+space")
        self.assertEqual(
            canonicalize_url("http://www.example.com/do?q=a%20space&a=1"),
            "http://www.example.com/do?a=1&q=a+space")

    def test_canonicalize_url_unicode_path(self):
        # non-ASCII path characters are expected percent-encoded as UTF-8
        self.assertEqual(
            canonicalize_url(u"http://www.example.com/résumé"),
            "http://www.example.com/r%C3%A9sum%C3%A9")

    def test_canonicalize_url_unicode_query_string(self):
        # default encoding for path and query is UTF-8
        self.assertEqual(
            canonicalize_url(u"http://www.example.com/résumé?q=résumé"),
            "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")

        # passed encoding will affect query string (the path stays UTF-8)
        self.assertEqual(
            canonicalize_url(u"http://www.example.com/résumé?q=résumé",
                             encoding='latin1'),
            "http://www.example.com/r%C3%A9sum%C3%A9?q=r%E9sum%E9")

        self.assertEqual(
            canonicalize_url(u"http://www.example.com/résumé?country=Россия",
                             encoding='cp1251'),
            "http://www.example.com/r%C3%A9sum%C3%A9?country=%D0%EE%F1%F1%E8%FF")

    def test_canonicalize_url_unicode_query_string_wrong_encoding(self):
        # trying to encode with an encoding that cannot represent the
        # characters: expected to fall back to UTF-8
        self.assertEqual(
            canonicalize_url(u"http://www.example.com/résumé?currency=€",
                             encoding='latin1'),
            "http://www.example.com/r%C3%A9sum%C3%A9?currency=%E2%82%AC")

        self.assertEqual(
            canonicalize_url(u"http://www.example.com/résumé?country=Россия",
                             encoding='latin1'),
            "http://www.example.com/r%C3%A9sum%C3%A9?country=%D0%A0%D0%BE%D1%81%D1%81%D0%B8%D1%8F")

    def test_normalize_percent_encoding_in_paths(self):
        # lower-case percent escapes are expected upper-cased
        self.assertEqual(
            canonicalize_url("http://www.example.com/r%c3%a9sum%c3%a9"),
            "http://www.example.com/r%C3%A9sum%C3%A9")

        # non-UTF8 encoded sequences: they should be kept untouched,
        # only upper-cased
        # 'latin1'-encoded sequence in path
        self.assertEqual(
            canonicalize_url("http://www.example.com/a%a3do"),
            "http://www.example.com/a%A3do")

        # 'latin1'-encoded path, UTF-8 encoded query string
        self.assertEqual(
            canonicalize_url("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9"),
            "http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9")

        # 'latin1'-encoded path and query string
        self.assertEqual(
            canonicalize_url("http://www.example.com/a%a3do?q=r%e9sum%e9"),
            "http://www.example.com/a%A3do?q=r%E9sum%E9")

    def test_normalize_percent_encoding_in_query_arguments(self):
        # same upper-casing expectation for escapes in query values
        self.assertEqual(
            canonicalize_url("http://www.example.com/do?k=b%a3"),
            "http://www.example.com/do?k=b%A3")

        self.assertEqual(
            canonicalize_url("http://www.example.com/do?k=r%c3%a9sum%c3%a9"),
            "http://www.example.com/do?k=r%C3%A9sum%C3%A9")

    def test_non_ascii_percent_encoding_in_paths(self):
        # spaces in the path are expected as "%20" (not "+")
        self.assertEqual(
            canonicalize_url("http://www.example.com/a do?a=1"),
            "http://www.example.com/a%20do?a=1")
        # an existing "%20" next to a raw space is kept, giving two escapes
        self.assertEqual(
            canonicalize_url("http://www.example.com/a %20do?a=1"),
            "http://www.example.com/a%20%20do?a=1")
        # text input and its UTF-8 bytes form give the same result
        self.assertEqual(
            canonicalize_url(u"http://www.example.com/a do£.html?a=1"),
            "http://www.example.com/a%20do%C2%A3.html?a=1")
        self.assertEqual(
            canonicalize_url(b"http://www.example.com/a do\xc2\xa3.html?a=1"),
            "http://www.example.com/a%20do%C2%A3.html?a=1")

    def test_non_ascii_percent_encoding_in_query_arguments(self):
        # non-ASCII query values: text input and UTF-8 bytes input agree
        self.assertEqual(
            canonicalize_url(u"http://www.example.com/do?price=£500&a=5&z=3"),
            u"http://www.example.com/do?a=5&price=%C2%A3500&z=3")
        self.assertEqual(
            canonicalize_url(b"http://www.example.com/do?price=\xc2\xa3500&a=5&z=3"),
            "http://www.example.com/do?a=5&price=%C2%A3500&z=3")
        # argument names are expected percent-encoded too
        self.assertEqual(
            canonicalize_url(b"http://www.example.com/do?price(\xc2\xa3)=500&a=1"),
            "http://www.example.com/do?a=1&price%28%C2%A3%29=500")

    def test_urls_with_auth_and_ports(self):
        # credentials and an explicit port are expected to be preserved
        self.assertEqual(
            canonicalize_url(u"http://user:pass@www.example.com:81/do?now=1"),
            u"http://user:pass@www.example.com:81/do?now=1")

    def test_remove_fragments(self):
        # the fragment is dropped by default ...
        self.assertEqual(
            canonicalize_url(u"http://user:pass@www.example.com/do?a=1#frag"),
            u"http://user:pass@www.example.com/do?a=1")
        # ... and kept when keep_fragments=True
        self.assertEqual(
            canonicalize_url(u"http://user:pass@www.example.com/do?a=1#frag",
                             keep_fragments=True),
            u"http://user:pass@www.example.com/do?a=1#frag")

    def test_dont_convert_safe_characters(self):
        # don't convert safe characters to percent encoding representation
        self.assertEqual(
            canonicalize_url(
                "http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html"),
            "http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html")

    def test_safe_characters_unicode(self):
        # urllib.quote uses a mapping cache of encoded characters. When parsing
        # an already percent-encoded URL, it will fail if that URL was not
        # percent-encoded as UTF-8; that's why canonicalize_url must always
        # convert the URLs to string. The following test asserts that
        # functionality.
        self.assertEqual(
            canonicalize_url(u'http://www.example.com/caf%E9-con-leche.htm'),
            'http://www.example.com/caf%E9-con-leche.htm')

    def test_domains_are_case_insensitive(self):
        # the host name is expected lower-cased
        self.assertEqual(
            canonicalize_url("http://www.EXAMPLE.com/"),
            "http://www.example.com/")

    def test_canonicalize_idns(self):
        # internationalized host names are expected in their "xn--" form
        self.assertEqual(
            canonicalize_url(u'http://www.bücher.de?q=bücher'),
            'http://www.xn--bcher-kva.de/?q=b%C3%BCcher')
        # Japanese (+ reordering query parameters)
        self.assertEqual(
            canonicalize_url(u'http://はじめよう.みんな/?query=サ&maxResults=5'),
            'http://xn--p8j9a0d9c9a.xn--q9jyb4c/?maxResults=5&query=%E3%82%B5')

    def test_quoted_slash_and_question_sign(self):
        # escaped "/" and "?" inside the path stay escaped (only upper-cased)
        self.assertEqual(
            canonicalize_url("http://foo.com/AC%2FDC+rocks%3f/?yeah=1"),
            "http://foo.com/AC%2FDC+rocks%3F/?yeah=1")
        self.assertEqual(
            canonicalize_url("http://foo.com/AC%2FDC/"),
            "http://foo.com/AC%2FDC/")

    def test_canonicalize_urlparsed(self):
        # canonicalize_url() can be passed an already urlparse'd URL
        self.assertEqual(
            canonicalize_url(urlparse(u"http://www.example.com/résumé?q=résumé")),
            "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
        self.assertEqual(
            canonicalize_url(urlparse('http://www.example.com/caf%e9-con-leche.htm')),
            'http://www.example.com/caf%E9-con-leche.htm')
        self.assertEqual(
            canonicalize_url(urlparse("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9")),
            "http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9")

    def test_canonicalize_parse_url(self):
        # parse_url() wraps urlparse and is used in link extractors
        self.assertEqual(
            canonicalize_url(parse_url(u"http://www.example.com/résumé?q=résumé")),
            "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
        self.assertEqual(
            canonicalize_url(parse_url('http://www.example.com/caf%e9-con-leche.htm')),
            'http://www.example.com/caf%E9-con-leche.htm')
        self.assertEqual(
            canonicalize_url(parse_url("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9")),
            "http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9")

    def test_canonicalize_url_idempotence(self):
        for url, enc in [(u'http://www.bücher.de/résumé?q=résumé', 'utf8'),
                         (u'http://www.example.com/résumé?q=résumé', 'latin1'),
                         (u'http://www.example.com/résumé?country=Россия', 'cp1251'),
                         (u'http://はじめよう.みんな/?query=サ&maxResults=5', 'iso2022jp')]:
            canonicalized = canonicalize_url(url, encoding=enc)

            # if we canonicalize again, we get the same result
            self.assertEqual(canonicalize_url(canonicalized, encoding=enc),
                             canonicalized)

            # without encoding, already canonicalized URL is canonicalized
            # identically
            self.assertEqual(canonicalize_url(canonicalized), canonicalized)

    def test_canonicalize_url_idna_exceptions(self):
        # Host names that cannot be IDNA-encoded are expected to be passed
        # through unchanged while path and query are still canonicalized.

        # missing DNS label
        self.assertEqual(
            canonicalize_url(u"http://.example.com/résumé?q=résumé"),
            "http://.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")

        # DNS label too long
        self.assertEqual(
            canonicalize_url(
                u"http://www.{label}.com/résumé?q=résumé".format(
                    label=u"example"*11)),
            "http://www.{label}.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9".format(
                label=u"example"*11))
| 555 | + |
# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
352 | 558 |
|
0 commit comments