From b6c41436845a7a546936654cdf54c9795ce76c2a Mon Sep 17 00:00:00 2001 From: John Reynolds Date: Sat, 6 Jul 2019 14:29:36 -0700 Subject: [PATCH 1/2] Added a parsing step by comma for servers still sending back CSV for multiple cookies #1 --- ScrapySharp/Network/CookiesParser.cs | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/ScrapySharp/Network/CookiesParser.cs b/ScrapySharp/Network/CookiesParser.cs index b9da92d..790577b 100644 --- a/ScrapySharp/Network/CookiesParser.cs +++ b/ScrapySharp/Network/CookiesParser.cs @@ -19,19 +19,24 @@ public List> ParseValuePairs(string cookiesExpressi { var list = new List>(); - var match = splitCookiesRegex.Match(cookiesExpression); + var cookiesArr = cookiesExpression.Split(",".ToCharArray()); - while (match.Success) + foreach (var cookieString in cookiesArr) { - if (match.Groups["name"].Success && match.Groups["val"].Success) + var match = splitCookiesRegex.Match(cookieString); + + while (match.Success) { - try + if (match.Groups["name"].Success && match.Groups["val"].Success) { - list.Add(new KeyValuePair(match.Groups["name"].Value, match.Groups["val"].Value)); + try + { + list.Add(new KeyValuePair(match.Groups["name"].Value, match.Groups["val"].Value)); + } + catch (CookieException) { } } - catch (CookieException) { } + match = match.NextMatch(); } - match = match.NextMatch(); } return list; From ebe55e98114443b9e963f695db024ddad6acae7b Mon Sep 17 00:00:00 2001 From: John Reynolds Date: Mon, 8 Jun 2020 12:22:24 -0700 Subject: [PATCH 2/2] Added unit tests to cookie parser logic, added regex for comma separated parsing, PR #23 --- ScrapySharp.Tests/Network/Cookie.txt | 1 + ScrapySharp.Tests/ScrapySharp.Tests.csproj | 14 +++++ ScrapySharp.Tests/When_parse_cookies.cs | 56 +++++++++++++++++++ ...using_CssSelector_with_fsharp_tokenizer.cs | 3 +- ScrapySharp/Network/CookiesParser.cs | 30 +++++----- 5 files changed, 87 insertions(+), 17 deletions(-) create mode 100644 ScrapySharp.Tests/Network/Cookie.txt create mode 100644 ScrapySharp.Tests/When_parse_cookies.cs diff --git a/ScrapySharp.Tests/Network/Cookie.txt b/ScrapySharp.Tests/Network/Cookie.txt new file mode 100644 index 0000000..c4d0e30 --- /dev/null +++ b/ScrapySharp.Tests/Network/Cookie.txt @@ -0,0 +1 @@ +key1=204=eBL9WYALz-YPXedwGc-utNT3YXlx7moCeN-dvDDl-7xYtHIZtPEr0OZ2bKWtEJWInrKuEnlR_-JFXZ4mMYIVQrPcTxGCZCCHoeUpdz1kU3cMa38TGrn-uaB6gH7D7A_XKa5bJvjbkatI3mvnPNgjJfd4QrHeDu8hms-c9b6to04; key2=this is a test value; expires=Tue, 08-Dec-2020 01:44:28 GMT; path=/; domain=.localhost.fakedomain; HttpOnly \ No newline at end of file diff --git a/ScrapySharp.Tests/ScrapySharp.Tests.csproj b/ScrapySharp.Tests/ScrapySharp.Tests.csproj index c663c50..ce9f38d 100644 --- a/ScrapySharp.Tests/ScrapySharp.Tests.csproj +++ b/ScrapySharp.Tests/ScrapySharp.Tests.csproj @@ -2,6 +2,12 @@ netcoreapp2.0 + + + + + + @@ -41,5 +47,13 @@ Always + + Always + + + + + Always + \ No newline at end of file diff --git a/ScrapySharp.Tests/When_parse_cookies.cs b/ScrapySharp.Tests/When_parse_cookies.cs new file mode 100644 index 0000000..93769e8 --- /dev/null +++ b/ScrapySharp.Tests/When_parse_cookies.cs @@ -0,0 +1,56 @@ +// ReSharper disable InconsistentNaming + +using System.IO; +using NUnit.Framework; +using ScrapySharp.Html.Dom; +using ScrapySharp.Html.Parsing; +using System.Linq; +using ScrapySharp.Network; +using System.Net; + +namespace ScrapySharp.Tests +{ + [TestFixture] + public class When_parse_cookies + { + [Test] + public void When_parse_standard_cookie() + { + var cookie = GetCookie(); + + CookiesParser parser = new CookiesParser(".localhost.fakedomain"); + var cookieList = parser.ParseCookies(cookie); + + Assert.AreEqual(2, cookieList.Count); + } + + [Test] + public void When_parse_csv_cookie() + { + var csvCookie = GetCookie().Replace(";",","); + + CookiesParser parser = new CookiesParser(".localhost.fakedomain"); + var cookieList = parser.ParseCookies(csvCookie); + + Assert.AreEqual(2, cookieList.Count); + } + + [Test] + public void When_parse_csv_invalid_cookie() + { + string invalidCookie = GetCookie().Replace(";",";,"); + CookiesParser parser = new CookiesParser(".localhost.fakedomain"); + + Assert.Throws(()=> { parser.ParseCookies(invalidCookie); }); + } + + private static string GetCookie() + { + var cookie = File.ReadAllText("Network/Cookie.txt"); + + return cookie; + } + } +} + +// ReSharper restore InconsistentNaming \ No newline at end of file diff --git a/ScrapySharp.Tests/When_parses_using_CssSelector_with_fsharp_tokenizer.cs b/ScrapySharp.Tests/When_parses_using_CssSelector_with_fsharp_tokenizer.cs index d3da672..fe980f1 100644 --- a/ScrapySharp.Tests/When_parses_using_CssSelector_with_fsharp_tokenizer.cs +++ b/ScrapySharp.Tests/When_parses_using_CssSelector_with_fsharp_tokenizer.cs @@ -1,11 +1,10 @@ // ReSharper disable InconsistentNaming -using System.Linq; using HtmlAgilityPack; using NUnit.Framework; using ScrapySharp.Core; using ScrapySharp.Extensions; -using ScrapySharp.Core; +using System.Linq; namespace ScrapySharp.Tests { diff --git a/ScrapySharp/Network/CookiesParser.cs b/ScrapySharp/Network/CookiesParser.cs index 790577b..2bf997c 100644 --- a/ScrapySharp/Network/CookiesParser.cs +++ b/ScrapySharp/Network/CookiesParser.cs @@ -8,8 +8,9 @@ namespace ScrapySharp.Network public class CookiesParser { private readonly string defaultDomain; - private static readonly Regex splitCookiesRegex = new Regex(@"\s*(?[^=]+)=(?[^;]+)?[,;]+", RegexOptions.Compiled); - + private static readonly Regex splitCookiesRegex = new Regex(@"\s*(?[^=]+)=(?[^;]+)?[;]+", RegexOptions.Compiled); + private static readonly Regex splitCookiesCsvRegex = new Regex(@"\s*(?[^=]+)=(?.*?),(?=[^,]+?(?:=|$))+", RegexOptions.Compiled); + public CookiesParser(string defaultDomain) { this.defaultDomain = defaultDomain; @@ -19,24 +20,23 @@ public List> ParseValuePairs(string cookiesExpressi { var list = new List>(); - var cookiesArr = cookiesExpression.Split(",".ToCharArray()); + Match match; + if (cookiesExpression.Contains(";")) + match = splitCookiesRegex.Match(cookiesExpression); + else + match = splitCookiesCsvRegex.Match(cookiesExpression); - foreach (var cookieString in cookiesArr) + while (match.Success) { - var match = splitCookiesRegex.Match(cookieString); - - while (match.Success) + if (match.Groups["name"].Success && match.Groups["val"].Success) { - if (match.Groups["name"].Success && match.Groups["val"].Success) + try { - try - { - list.Add(new KeyValuePair(match.Groups["name"].Value, match.Groups["val"].Value)); - } - catch (CookieException) { } + list.Add(new KeyValuePair(match.Groups["name"].Value, match.Groups["val"].Value)); } - match = match.NextMatch(); + catch (CookieException) { } } + match = match.NextMatch(); } return list; @@ -50,7 +50,7 @@ public List ParseCookies(string cookiesExpression) for (int i = 0; i < keyValuePairs.Count; i++) { var pair = keyValuePairs[i]; - if (pair.Key.Equals("path", StringComparison.InvariantCultureIgnoreCase) + if (pair.Key.Equals("path", StringComparison.InvariantCultureIgnoreCase) || pair.Key.Equals("domain", StringComparison.InvariantCultureIgnoreCase) || pair.Key.Equals("expires", StringComparison.InvariantCultureIgnoreCase)) continue;