From 5795835b8e83556ae7ad1bc298905d1c96c44797 Mon Sep 17 00:00:00 2001 From: iavivai <18yukitaka@gmail.com> Date: Fri, 1 Nov 2019 23:59:58 +0900 Subject: [PATCH 1/3] Fix of sample code. --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index dddc3f8..cd15a19 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ class GithubSpider < Kimurai::Base } def parse(response, url:, data: {}) - response.xpath("//ul[@class='repo-list']/div//h3/a").each do |a| + response.xpath("//ul[@class='repo-list']//a[@class='v-align-middle']").each do |a| request_to :parse_repo_page, url: absolute_url(a[:href], base: url) end @@ -51,7 +51,7 @@ class GithubSpider < Kimurai::Base item[:repo_name] = response.xpath("//h1/strong[@itemprop='name']/a").text item[:repo_url] = url item[:description] = response.xpath("//span[@itemprop='about']").text.squish - item[:tags] = response.xpath("//div[@id='topics-list-container']/div/a").map { |a| a.text.squish } + item[:tags] = response.xpath("//div[starts-with(@class, 'list-topics-container')]/a").map { |a| a.text.squish } item[:watch_count] = response.xpath("//ul[@class='pagehead-actions']/li[contains(., 'Watch')]/a[2]").text.squish item[:star_count] = response.xpath("//ul[@class='pagehead-actions']/li[contains(., 'Star')]/a[2]").text.squish item[:fork_count] = response.xpath("//ul[@class='pagehead-actions']/li[contains(., 'Fork')]/a[2]").text.squish From 4aa720fca9f5b287f0be6186a702a18089088a94 Mon Sep 17 00:00:00 2001 From: Dusan Orlovic Date: Thu, 14 Nov 2019 12:33:54 +0100 Subject: [PATCH 2/3] Use config argument on parse! 
to set config --- README.md | 4 +++- lib/kimurai/base.rb | 8 +++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index cd15a19..082124a 100644 --- a/README.md +++ b/README.md @@ -1359,7 +1359,7 @@ end # => So what if you're don't care about stats and just want to process request to a particular spider method and get the returning value from this method? Use `.parse!` instead: -#### `.parse!(:method_name, url:)` method +#### `.parse!(:method_name, url:, config: {})` method `.parse!` (class method) creates a new spider instance and performs a request to given method with a given url. Value from the method will be returned back: @@ -1376,6 +1376,8 @@ end ExampleSpider.parse!(:parse, url: "https://example.com/") # => "Example Domain" +# Pass `config:` when you need to override the spider's config for this call +ExampleSpider.parse!(:parse, url: "https://example.com/", config: { before_request: { clear_and_set_cookies: true } } ) ``` Like `.crawl!`, `.parse!` method takes care of a browser instance and kills it (`browser.destroy_driver!`) before returning the value. Unlike `.crawl!`, `.parse!` method can be called from different threads at the same time: diff --git a/lib/kimurai/base.rb b/lib/kimurai/base.rb index f8a4b4f..1d2ae87 100644 --- a/lib/kimurai/base.rb +++ b/lib/kimurai/base.rb @@ -154,7 +154,13 @@ def self.crawl!(exception_on_fail: true) end def self.parse!(handler, *args, **request) - spider = self.new + if request.has_key? :config + config = request[:config] + request.delete :config + else + config = {} + end + spider = self.new config: config if args.present? 
spider.public_send(handler, *args) From 58c971bff02fe559717f8fd506ced82f5050ea60 Mon Sep 17 00:00:00 2001 From: John Phamvan Date: Wed, 13 May 2020 11:25:22 -0700 Subject: [PATCH 3/3] Switch to Addressable.URI.escape away from obsolete URI.escape; updated some development gems Fixed Ruby 2.7 warning for keyword arguments --- CHANGELOG.md | 5 +++++ kimurai.gemspec | 5 +++-- lib/kimurai/base.rb | 4 +++- lib/kimurai/base_helper.rb | 6 ++++-- lib/kimurai/version.rb | 2 +- 5 files changed, 16 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2eed38f..8f5fbdd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,9 @@ # CHANGELOG +## 1.4.1 +### New +* Updated for Ruby 2.7+ support +* Switched to Addressable.URI.escape from obsolete URI.escape + ## 1.4.0 ### New * Add `encoding` config option (see [All available config options](https://github.com/vifreefly/kimuraframework#all-available-config-options)) diff --git a/kimurai.gemspec b/kimurai.gemspec index 07c4d55..748add4 100644 --- a/kimurai.gemspec +++ b/kimurai.gemspec @@ -37,12 +37,13 @@ Gem::Specification.new do |spec| spec.add_dependency "headless" spec.add_dependency "pmap" + spec.add_dependency "addressable" spec.add_dependency "whenever" spec.add_dependency "rbcat", "~> 0.2" spec.add_dependency "pry" - spec.add_development_dependency "bundler", "~> 1.16" - spec.add_development_dependency "rake", "~> 10.0" + spec.add_development_dependency "bundler", "~> 2.1" + spec.add_development_dependency "rake", "~> 13.0" spec.add_development_dependency "minitest", "~> 5.0" end diff --git a/lib/kimurai/base.rb b/lib/kimurai/base.rb index 1d2ae87..6e09270 100644 --- a/lib/kimurai/base.rb +++ b/lib/kimurai/base.rb @@ -207,7 +207,9 @@ def request_to(handler, delay = nil, url:, data: {}, response_type: :html) visited = delay ? 
browser.visit(url, delay: delay) : browser.visit(url) return unless visited - public_send(handler, browser.current_response(response_type), { url: url, data: data }) + options = { url: url, data: data } + + public_send(handler, browser.current_response(response_type), **options) end def console(response = nil, url: nil, data: {}) diff --git a/lib/kimurai/base_helper.rb b/lib/kimurai/base_helper.rb index cff59d2..633208d 100644 --- a/lib/kimurai/base_helper.rb +++ b/lib/kimurai/base_helper.rb @@ -1,16 +1,18 @@ +require 'addressable/uri' + module Kimurai module BaseHelper private def absolute_url(url, base:) return unless url - URI.join(base, URI.escape(url)).to_s + URI.join(base, Addressable::URI.escape(url)).to_s end def escape_url(url) uri = URI.parse(url) rescue URI::InvalidURIError => e - URI.parse(URI.escape url).to_s rescue url + URI.parse(Addressable::URI.escape(url)).to_s rescue url else url end diff --git a/lib/kimurai/version.rb b/lib/kimurai/version.rb index ed8ce2a..52e0c33 100644 --- a/lib/kimurai/version.rb +++ b/lib/kimurai/version.rb @@ -1,3 +1,3 @@ module Kimurai - VERSION = "1.4.0" + VERSION = "1.4.1" end