From 28748b9e3a986f319307c5946764dee53be7a994 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Denilson=20S=C3=A1=20Maia?= Date: Sat, 18 Oct 2025 01:42:21 +0200 Subject: [PATCH 1/4] New scraper: graphviz (also known as DOT language) Closes: https://trello.com/c/Jaa1vC24 This is my second scraper. It's almost in good shape, but I left a TODO comment because I couldn't figure out how to make UrlScraper reliably scrape the website. I keep getting random errors. Otherwise, it's ready for review. --- assets/javascripts/vendor/prism.js | 79 +++++++++++++++++++++++- lib/docs/filters/graphviz/clean_html.rb | 44 +++++++++++++ lib/docs/filters/graphviz/entries.rb | 28 +++++++++ lib/docs/scrapers/graphviz.rb | 55 +++++++++++++++++ public/icons/docs/graphviz/16.png | Bin 0 -> 395 bytes public/icons/docs/graphviz/16@2x.png | Bin 0 -> 744 bytes public/icons/docs/graphviz/SOURCE | 1 + 7 files changed, 206 insertions(+), 1 deletion(-) create mode 100644 lib/docs/filters/graphviz/clean_html.rb create mode 100644 lib/docs/filters/graphviz/entries.rb create mode 100644 lib/docs/scrapers/graphviz.rb create mode 100644 public/icons/docs/graphviz/16.png create mode 100644 public/icons/docs/graphviz/16@2x.png create mode 100644 public/icons/docs/graphviz/SOURCE diff --git a/assets/javascripts/vendor/prism.js b/assets/javascripts/vendor/prism.js index 1f4a1b70bf..96519c4a8d 100644 --- a/assets/javascripts/vendor/prism.js +++ b/assets/javascripts/vendor/prism.js @@ -1,5 +1,5 @@ /* PrismJS 1.30.0 -https://prismjs.com/download.html#themes=prism&languages=markup+css+clike+javascript+bash+c+cpp+cmake+coffeescript+crystal+d+dart+diff+django+elixir+erlang+go+groovy+java+json+julia+kotlin+latex+lua+markdown+markup-templating+matlab+nginx+nim+nix+ocaml+perl+php+python+qml+r+jsx+ruby+rust+scss+scala+shell-session+sql+tcl+typescript+yaml+zig */ 
+https://prismjs.com/download.html#themes=prism&languages=markup+css+clike+javascript+bash+c+cpp+cmake+coffeescript+crystal+d+dart+diff+django+dot+elixir+erlang+go+groovy+java+json+julia+kotlin+latex+lua+markdown+markup-templating+matlab+nginx+nim+nix+ocaml+perl+php+python+qml+r+jsx+ruby+rust+scss+scala+shell-session+sql+tcl+typescript+yaml+zig */ /// <reference lib="WebWorker"/> var _self = (typeof window !== 'undefined') @@ -2929,6 +2929,83 @@ Prism.languages.insertBefore('d', 'function', { }(Prism)); +// https://www.graphviz.org/doc/info/lang.html + +(function (Prism) { + + var ID = '(?:' + [ + // an identifier + /[a-zA-Z_\x80-\uFFFF][\w\x80-\uFFFF]*/.source, + // a number + /-?(?:\.\d+|\d+(?:\.\d*)?)/.source, + // a double-quoted string + /"[^"\\]*(?:\\[\s\S][^"\\]*)*"/.source, + // HTML-like string + /<(?:[^<>]|(?!<!--)<(?:[^<>"']|"[^"]*"|'[^']*')+>|<!--(?:[^-]|-(?!->))*-->)*>/.source + ].join('|') + ')'; + + var IDInside = { + 'markup': { + pattern: /(^<)[\s\S]+(?=>$)/, + lookbehind: true, + alias: ['language-markup', 'language-html', 'language-xml'], + inside: Prism.languages.markup + } + }; + + /** + * @param {string} source + * @param {string} flags + * @returns {RegExp} + */ + function withID(source, flags) { + return RegExp(source.replace(/<ID>/g, function () { return ID; }), flags); + } + + Prism.languages.dot = { + 'comment': { + pattern: /\/\/.*|\/\*[\s\S]*?\*\/|^#.*/m, + greedy: true + }, + 'graph-name': { + pattern: withID(/(\b(?:digraph|graph|subgraph)[ \t\r\n]+)<ID>/.source, 'i'), + lookbehind: true, + greedy: true, + alias: 'class-name', + inside: IDInside + }, + 'attr-value': { + pattern: withID(/(=[ \t\r\n]*)<ID>/.source), + lookbehind: true, + greedy: true, + inside: IDInside + }, + 'attr-name': { + pattern: withID(/([\[;, \t\r\n])<ID>(?=[ \t\r\n]*=)/.source), + lookbehind: true, + greedy: true, + inside: IDInside + }, + 'keyword': /\b(?:digraph|edge|graph|node|strict|subgraph)\b/i, + 'compass-point': { + pattern: /(:[ \t\r\n]*)(?:[ewc_]|[ns][ew]?)(?![\w\x80-\uFFFF])/, + lookbehind: true, + alias: 'builtin' + }, + 'node': { + pattern: 
withID(/(^|[^-.\w\x80-\uFFFF\\])<ID>/.source), + lookbehind: true, + greedy: true, + inside: IDInside + }, + 'operator': /[=:]|-[->]/, + 'punctuation': /[\[\]{};,]/ + }; + + Prism.languages.gv = Prism.languages.dot; + +}(Prism)); + Prism.languages.elixir = { 'doc': { pattern: /@(?:doc|moduledoc)\s+(?:("""|''')[\s\S]*?\1|("|')(?:\\(?:\r\n|[\s\S])|(?!\2)[^\\\r\n])*\2)/, diff --git a/lib/docs/filters/graphviz/clean_html.rb b/lib/docs/filters/graphviz/clean_html.rb new file mode 100644 index 0000000000..123913aae1 --- /dev/null +++ b/lib/docs/filters/graphviz/clean_html.rb @@ -0,0 +1,44 @@ +module Docs + class Graphviz + class CleanHtmlFilter < Filter + def call + css('[tabindex]').remove_attribute('tabindex') + + content = at_css('.td-content') + @doc = content if content + + css('pre:has(code)').each do |node| + pre = Nokogiri::XML::Node.new('pre', @doc) + code = node.at_css('code') + + if code['data-lang'] + # Syntax highlighting is embedded into this HTML markup. + pre['data-language'] = code['data-lang'] + else + # Plain example source-code without highlighting. + # Let's guess the language. + sourcecode = code.content.strip + if sourcecode =~ /^\$/ + # Starts with '$'? Probably a shell session. + pre['data-language'] = 'shell-session' + elsif sourcecode =~ /^cmd / + # Command line example. No highlighting needed. + pre['data-language'] = '' + elsif sourcecode =~ /^void / + # C language. + pre['data-language'] = 'c' + else + # Nothing else? Let's guess DOT. 
+ pre['data-language'] = 'dot' + end + end + pre.content = code.content + + node.replace(pre) + end + + doc + end + end + end +end diff --git a/lib/docs/filters/graphviz/entries.rb b/lib/docs/filters/graphviz/entries.rb new file mode 100644 index 0000000000..3f2fdfc95c --- /dev/null +++ b/lib/docs/filters/graphviz/entries.rb @@ -0,0 +1,28 @@ +module Docs + class Graphviz + class EntriesFilter < Docs::EntriesFilter + + def get_name + name = at_css('h1').content.strip + end + + def get_type + breadcrumbs = css('nav ol.breadcrumb li.breadcrumb-item') + category = breadcrumbs[1]&.content&.strip + + # These categories have several sub-pages. + return category if [ + 'Attribute Types', + 'Attributes', + 'Command Line', + 'Layout Engines', + 'Output Formats', + ].include?(category) + + # Several categories have only one page each. Let's group them together. + return 'Documentation' + end + + end + end +end diff --git a/lib/docs/scrapers/graphviz.rb b/lib/docs/scrapers/graphviz.rb new file mode 100644 index 0000000000..6b78d349cc --- /dev/null +++ b/lib/docs/scrapers/graphviz.rb @@ -0,0 +1,55 @@ +module Docs + class Graphviz < UrlScraper + self.name = 'Graphviz' + self.slug = 'graphviz' + self.type = 'simple' + + self.links = { + home: 'https://www.graphviz.org/', + code: 'https://gitlab.com/graphviz/graphviz' + } + + options[:container] = 'main' + + # These images are too large: + # 980KB https://www.graphviz.org/doc/info/plugins.png + # 650KB https://www.graphviz.org/Gallery/twopi/twopi2.svg + # All other files are under 100KB + options[:max_image_size] = 100_000 + + # TODO: the UrlScraper is very unreliable on this website. + # I often get several errors: + # - SSL connect error + # - Failure when receiving data from the peer + # - was slow to process (30s) + # Setting a :rate_limit doesn't help. + # We have to figure out a more reliable solution. + #options[:rate_limit] = 100 + + options[:attribution] = <<-HTML + © 2025 The Graphviz Authors
+ Licensed under the Eclipse Public License 1.0. + HTML + + html_filters.push 'graphviz/entries', 'graphviz/clean_html' + + self.release = '14.01' + self.base_url = 'https://www.graphviz.org/' + self.root_path = 'documentation/' + options[:only_patterns] = [ + /^documentation\//, + /^doc\//, + /^docs\//, + ] + options[:replace_paths] = { + # Redirections: + 'docs/outputs/cmap/' => 'docs/outputs/imap/', + 'doc/info/output.html' => 'docs/outputs/', + } + + def get_latest_version(opts) + tags = get_gitlab_tags('gitlab.com', 'graphviz', 'graphviz', opts) + tags[0]['name'] + end + end +end diff --git a/public/icons/docs/graphviz/16.png b/public/icons/docs/graphviz/16.png new file mode 100644 index 0000000000000000000000000000000000000000..bc8936906f04e3d28ef357749b9e9111cf3c2b93 GIT binary patch literal 395 zcmV;60d)R}P) z1I$-l901_wv;Ae)lx^G1*|yCwPg!HO8`HdO+qUcfp3b&AX58($$3OVWUwDsC3y>$O z6aD=;0e+6=MfaoDq&_*o_dFTR(>+{l#4T+9*vD%SP%{ThbCWF_er5)O#Cv=Hh5^~h zP>cY-@>2n(XCVCWF#*c+a0_R~5WbJFrU`eq6M9A8zbC*Ayl6-2DPckM3oj7bk@k!y zOE~l^o8I=g41B}DeC+JRtwO+ZY{<&4&P=U?4ET=kunz&Q17ya3$c3NiivT(CCvqbL zGU6u!UO}k9>A|$F!^!My9mnyigrB+BjO8Pk_BV6m{M2CPP)!3g8wkg48(pwJOhY}fLI7BPB#a*0otdB_*kR8nc&IL|X6yDUBgL8rH6@@$0Iye^?6}r!X-`<7q z9)wF*g@+kT5dvtN2di zw4GXn255nLU8qREYEz*yuK{{5gKA{nZ(4s~EHIj0Q54Spmj%YsTZ+QT1M3qw4|>aY zw$Z;V(5Q-|vCS=MZOU%wpj!^dfxvF?_TOM6J-6@aFaKo#%0H0vzR5E(CknT!N%s8VQRvBYXJT%?vnaf)vDKxQ z6ooT{mey~I^jn^xGZlrGve($zI|^r!g`X)@)$yuRjSklD3RSd>$Yu{I(nEQMmd)JA z+Jm!KUG}aja^}+H#U*vJcG74fb!r=(qUEy*X&N1#rV*(wb5F_IPLW2_PC07{)iA=x a$oLb8^wik7FKHbB0000 Date: Sun, 19 Oct 2025 09:46:27 +0200 Subject: [PATCH 2/4] graphviz: clean html --- lib/docs/filters/graphviz/clean_html.rb | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/docs/filters/graphviz/clean_html.rb b/lib/docs/filters/graphviz/clean_html.rb index 123913aae1..86c4824024 100644 --- a/lib/docs/filters/graphviz/clean_html.rb +++ b/lib/docs/filters/graphviz/clean_html.rb @@ -7,6 +7,9 @@ def call content = at_css('.td-content') @doc = 
content if content + css('a:contains("Search the Graphviz codebase")').remove + css('.td-page-meta__lastmod').remove + css('pre:has(code)').each do |node| pre = Nokogiri::XML::Node.new('pre', @doc) code = node.at_css('code') From 8e070714947e20e13e18821dec765d52ab6f39fd Mon Sep 17 00:00:00 2001 From: Simon Legner Date: Sun, 19 Oct 2025 09:46:38 +0200 Subject: [PATCH 3/4] graphviz: retry on server error --- lib/docs/core/requester.rb | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lib/docs/core/requester.rb b/lib/docs/core/requester.rb index 28b7db2d57..8456f49c6b 100644 --- a/lib/docs/core/requester.rb +++ b/lib/docs/core/requester.rb @@ -54,6 +54,12 @@ def build_and_queue_request(url, options = {}, &block) end def handle_response(response) + if response.code.to_i == 0 || (response.code.to_i >= 500 && response.code.to_i < 600) + instrument 'handle_response.retry', url: response.url do + build_and_queue_request(response.url) + end + return + end instrument 'handle_response.requester', url: response.url do on_response.each do |callback| result = callback.call(response) From cbaedce080743beb80a8e750bc1297041c11f80f Mon Sep 17 00:00:00 2001 From: Simon Legner Date: Sun, 19 Oct 2025 09:59:30 +0200 Subject: [PATCH 4/4] graphviz: retry on server error --- lib/docs/core/requester.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/docs/core/requester.rb b/lib/docs/core/requester.rb index 8456f49c6b..7413a506db 100644 --- a/lib/docs/core/requester.rb +++ b/lib/docs/core/requester.rb @@ -54,7 +54,7 @@ def build_and_queue_request(url, options = {}, &block) end def handle_response(response) - if response.code.to_i == 0 || (response.code.to_i >= 500 && response.code.to_i < 600) + if ENV['RETRY'] == '1' && [0, 500, 501, 502, 503, 504].include?(response.code.to_i) instrument 'handle_response.retry', url: response.url do build_and_queue_request(response.url) end