diff --git a/CHANGELOG.md b/CHANGELOG.md index 2eed38f..8f5fbdd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,9 @@ # CHANGELOG +## 1.4.1 +### New +* Updated for Ruby 2.7+ support +* Switched from the obsolete `URI.escape` to `Addressable::URI.escape` + ## 1.4.0 ### New * Add `encoding` config option (see [All available config options](https://github.com/vifreefly/kimuraframework#all-available-config-options)) diff --git a/README.md b/README.md index dddc3f8..082124a 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ class GithubSpider < Kimurai::Base } def parse(response, url:, data: {}) - response.xpath("//ul[@class='repo-list']/div//h3/a").each do |a| + response.xpath("//ul[@class='repo-list']//a[@class='v-align-middle']").each do |a| request_to :parse_repo_page, url: absolute_url(a[:href], base: url) end @@ -51,7 +51,7 @@ class GithubSpider < Kimurai::Base item[:repo_name] = response.xpath("//h1/strong[@itemprop='name']/a").text item[:repo_url] = url item[:description] = response.xpath("//span[@itemprop='about']").text.squish - item[:tags] = response.xpath("//div[@id='topics-list-container']/div/a").map { |a| a.text.squish } + item[:tags] = response.xpath("//div[starts-with(@class, 'list-topics-container')]/a").map { |a| a.text.squish } item[:watch_count] = response.xpath("//ul[@class='pagehead-actions']/li[contains(., 'Watch')]/a[2]").text.squish item[:star_count] = response.xpath("//ul[@class='pagehead-actions']/li[contains(., 'Star')]/a[2]").text.squish item[:fork_count] = response.xpath("//ul[@class='pagehead-actions']/li[contains(., 'Fork')]/a[2]").text.squish @@ -1359,7 +1359,7 @@ end # => So what if you're don't care about stats and just want to process request to a particular spider method and get the returning value from this method? 
Use `.parse!` instead: -#### `.parse!(:method_name, url:)` method +#### `.parse!(:method_name, url:, config: {})` method `.parse!` (class method) creates a new spider instance and performs a request to given method with a given url. Value from the method will be returned back: @@ -1376,6 +1376,8 @@ end ExampleSpider.parse!(:parse, url: "https://example.com/") # => "Example Domain" +# An example of overriding the config when calling `.parse!` +ExampleSpider.parse!(:parse, url: "https://example.com/", config: { before_request: { clear_and_set_cookies: true } }) ``` Like `.crawl!`, `.parse!` method takes care of a browser instance and kills it (`browser.destroy_driver!`) before returning the value. Unlike `.crawl!`, `.parse!` method can be called from different threads at the same time: diff --git a/kimurai.gemspec b/kimurai.gemspec index 07c4d55..748add4 100644 --- a/kimurai.gemspec +++ b/kimurai.gemspec @@ -37,12 +37,13 @@ Gem::Specification.new do |spec| spec.add_dependency "headless" spec.add_dependency "pmap" + spec.add_dependency "addressable" spec.add_dependency "whenever" spec.add_dependency "rbcat", "~> 0.2" spec.add_dependency "pry" - spec.add_development_dependency "bundler", "~> 1.16" - spec.add_development_dependency "rake", "~> 10.0" + spec.add_development_dependency "bundler", "~> 2.1" + spec.add_development_dependency "rake", "~> 13.0" spec.add_development_dependency "minitest", "~> 5.0" end diff --git a/lib/kimurai/base.rb b/lib/kimurai/base.rb index f8a4b4f..6e09270 100644 --- a/lib/kimurai/base.rb +++ b/lib/kimurai/base.rb @@ -154,7 +154,13 @@ def self.crawl!(exception_on_fail: true) end def self.parse!(handler, *args, **request) - spider = self.new + if request.has_key? :config + config = request[:config] + request.delete :config + else + config = {} + end + spider = self.new config: config if args.present? spider.public_send(handler, *args) @@ -201,7 +207,9 @@ def request_to(handler, delay = nil, url:, data: {}, response_type: :html) visited = delay ? 
browser.visit(url, delay: delay) : browser.visit(url) return unless visited - public_send(handler, browser.current_response(response_type), { url: url, data: data }) + options = { url: url, data: data } + + public_send(handler, browser.current_response(response_type), **options) end def console(response = nil, url: nil, data: {}) diff --git a/lib/kimurai/base_helper.rb b/lib/kimurai/base_helper.rb index cff59d2..633208d 100644 --- a/lib/kimurai/base_helper.rb +++ b/lib/kimurai/base_helper.rb @@ -1,16 +1,18 @@ +require 'addressable/uri' + module Kimurai module BaseHelper private def absolute_url(url, base:) return unless url - URI.join(base, URI.escape(url)).to_s + URI.join(base, Addressable::URI.escape(url)).to_s end def escape_url(url) uri = URI.parse(url) rescue URI::InvalidURIError => e - URI.parse(URI.escape url).to_s rescue url + URI.parse(Addressable::URI.escape(url)).to_s rescue url else url end diff --git a/lib/kimurai/version.rb b/lib/kimurai/version.rb index ed8ce2a..52e0c33 100644 --- a/lib/kimurai/version.rb +++ b/lib/kimurai/version.rb @@ -1,3 +1,3 @@ module Kimurai - VERSION = "1.4.0" + VERSION = "1.4.1" end