From 3f88c4e12d5a4763589b422e03571580c6b11aab Mon Sep 17 00:00:00 2001 From: Alex Skrenchuk Date: Fri, 24 Oct 2025 21:52:00 -0700 Subject: [PATCH 1/2] refactor: replace insecure custom downloader with Down::NetHttp - drop ftp support - Harden repository copy path (OntologySubmission.copy_file_repository) - Normalize src (accept Tempfile or String path) - Sanitize destination filename (strip control/unsafe chars, trim/collapse spaces, remove leading dots, 255-char cap) --- Gemfile | 1 - Gemfile.lock | 2 +- .../models/ontology_submission.rb | 29 +++-- lib/ontologies_linked_data/utils/file.rb | 108 +++++------------- ontologies_linked_data.gemspec | 1 + test/models/test_ontology_submission.rb | 8 +- 6 files changed, 56 insertions(+), 93 deletions(-) diff --git a/Gemfile b/Gemfile index 95414ea2..5099d784 100644 --- a/Gemfile +++ b/Gemfile @@ -18,7 +18,6 @@ gem 'request_store' gem 'rest-client' gem 'rsolr' gem 'thin', '~> 1.0' # compatibility version pin. thin should be replaced with webmoc -gem "down", "~> 5.0" # Testing group :test do diff --git a/Gemfile.lock b/Gemfile.lock index 815b5cbf..00114348 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -30,6 +30,7 @@ PATH ontologies_linked_data (0.0.1) activesupport bcrypt + down (~> 5.0) goo json libxml-ruby @@ -240,7 +241,6 @@ DEPENDENCIES addressable (~> 2.8) bcrypt (~> 3.0) cube-ruby - down (~> 5.0) email_spec ffi goo! diff --git a/lib/ontologies_linked_data/models/ontology_submission.rb b/lib/ontologies_linked_data/models/ontology_submission.rb index 83810a2a..e457460f 100644 --- a/lib/ontologies_linked_data/models/ontology_submission.rb +++ b/lib/ontologies_linked_data/models/ontology_submission.rb @@ -282,37 +282,42 @@ def self.submission_id_generator(ss) end def self.copy_file_repository(acronym, submission_id, src, filename = nil) - path_to_repo = File.join( - LinkedData.settings.repository_folder, - acronym.to_s, - submission_id.to_s - ) + repo_root = LinkedData.settings.repository_folder + dst_dir = File.join(repo_root.to_s, acronym.to_s, submission_id.to_s) + + src_path = src.respond_to?(:path) ? src.path.to_s : src.to_s + raise ArgumentError, "Source file does not exist: #{src_path}" unless ::File.exist?(src_path) name = filename || File.basename(src) - dst = File.join(path_to_repo, name) + name = LinkedData::Utils::FileHelpers.sanitize_filename(name) + + dst_final = File.join(dst_dir, name) + dst_tmp = "#{dst_final}.tmp-#{Process.pid}-#{rand(1_000_000)}" begin - FileUtils.mkdir_p(path_to_repo) - FileUtils.chmod(REPOSITORY_DIR_MODE, path_to_repo) + FileUtils.mkdir_p(dst_dir) + FileUtils.chmod(REPOSITORY_DIR_MODE, dst_dir) - FileUtils.copy(src, dst) + FileUtils.copy(src, dst_tmp) # Uploaded files are initially written to a Tempfile in tmpdir with # permissions 0600 (owner read/write only) for security. To ensure # repository files are also accessible by the service group as intended, # we explicitly chmod the destination file to REPOSITORY_FILE_MODE. - FileUtils.chmod(REPOSITORY_FILE_MODE, dst) + FileUtils.chmod(REPOSITORY_FILE_MODE, dst_tmp) + FileUtils.mv(dst_tmp, dst_final) rescue StandardError => e + FileUtils.rm_f(dst_tmp) raise e.class, "Failed to copy #{src} to #{dst}: #{e.message}", e.backtrace end # Sanity check: ensure the file actually exists after copy and chmod # This guards against rare cases like silent file storage failures or # race conditions - unless File.exist?(dst) + unless File.exist?(dst_final) raise IOError, "Copy operation completed without error, but file '#{dst}' does not exist" end - dst + dst_final end def valid? diff --git a/lib/ontologies_linked_data/utils/file.rb b/lib/ontologies_linked_data/utils/file.rb index 5b6277f3..2632a42d 100644 --- a/lib/ontologies_linked_data/utils/file.rb +++ b/lib/ontologies_linked_data/utils/file.rb @@ -4,6 +4,7 @@ require 'zlib' require 'tmpdir' require 'fileutils' +require 'down/net_http' module LinkedData module Utils @@ -143,75 +144,28 @@ def self.exists_and_file(path) File.exist?(path) && !File.directory?(path) end - def self.download_file(uri, limit = 10) - raise ArgumentError, 'HTTP redirect too deep' if limit == 0 - - uri = URI(uri) unless uri.kind_of?(URI) - - if uri.kind_of?(URI::FTP) - file, filename = download_file_ftp(uri) - else - file = Tempfile.new('ont-rest-file') - file_size = 0 - filename = nil - http_session = Net::HTTP.new(uri.host, uri.port) - http_session.verify_mode = OpenSSL::SSL::VERIFY_NONE - http_session.use_ssl = (uri.scheme == 'https') - http_session.start do |http| - http.read_timeout = 1800 - http.request_get(uri.request_uri, {'Accept-Encoding' => 'gzip'}) do |res| - if res.kind_of?(Net::HTTPRedirection) - new_loc = res['location'] - if new_loc.match(/^(http:\/\/|https:\/\/)/) - uri = new_loc - else - uri.path = new_loc - end - return download_file(uri, limit - 1) - end - - raise Net::HTTPBadResponse.new("#{uri.request_uri}: #{res.code}") if res.code.to_i >= 400 - - file_size = res.read_header['content-length'].to_i - begin - content_disposition = res.read_header['content-disposition'] - filenames = content_disposition.match(/filename=\"(.*)\"/) || content_disposition.match(/filename=(.*)/) - filename = filenames[1] if filename.nil? - rescue - filename = LinkedData::Utils::Triples.last_iri_fragment(uri.request_uri) if filename.nil? - end - - file.write(res.body) - - if res.header['Content-Encoding'].eql?('gzip') - uncompressed_file = Tempfile.new('uncompressed-ont-rest-file') - file.rewind - sio = StringIO.new(file.read) - gz = Zlib::GzipReader.new(sio) - uncompressed_file.write(gz.read()) - file.close - file = uncompressed_file - gz.close() - end - end - end - file.close + def self.download_file(uri, limit: 10, open_timeout: 15, read_timeout: 1800, headers: {}, max_size: 512 * 1024 * 1024) + uri = URI(uri) unless uri.is_a?(URI) + unless %w[http https].include?(uri.scheme) + raise ArgumentError, "Unsupported URI scheme #{uri.scheme.inspect} (only http/https are supported)" end - return file, filename - end - - def self.download_file_ftp(url) - url = URI.parse(url) unless url.kind_of?(URI) - ftp = Net::FTP.new(url.host, url.user, url.password) - ftp.passive = true - ftp.login - filename = LinkedData::Utils::Triples.last_iri_fragment(url.path) - temp_dir = Dir.tmpdir - temp_file_path = File.join(temp_dir, filename) - ftp.getbinaryfile(url.path, temp_file_path) - file = File.new(temp_file_path) - return file, filename + tmpfile = Down::NetHttp.download( + uri.to_s, + max_redirects: limit, + open_timeout: open_timeout, + read_timeout: read_timeout, + max_size: max_size, + headers: { "User-Agent" => "OntoPortal" }.merge(headers) + ) + + filename = tmpfile.original_filename || + File.basename(uri.path.to_s) || + LinkedData::Utils::Triples.last_iri_fragment(uri.request_uri) + filename = sanitize_filename(filename) + tmpfile.rewind + + [tmpfile, filename] end # --- Utility guards / filters --- @@ -229,6 +183,15 @@ def self.macos_metadata?(entry_name) entry_name.start_with?('__MACOSX/') || base == '.DS_Store' || base.start_with?('._') end + def self.sanitize_filename(name) + base = File.basename(name.to_s) + base = base.gsub(/[\x00-\x1F\/\\:\*\?\"<>\|]/, "") # control + unsafe chars + base = base.sub(/\A\.+/, "") # no leading dots + base = base.strip.gsub(/\s+/, " ") # trim + collapse spaces + base = base[0, 255] + base.empty? ? "unnamed" : base + end + # Resolve the output filename for a .gz: # - Prefer header's orig_name when present # - Otherwise use the source filename without its .gz @@ -238,16 +201,7 @@ def self.resolve_gzip_name(file_path, gzip_reader) name = gzip_reader.orig_name name = File.basename(file_path, File.extname(file_path)) if name.nil? || name.empty? name = File.basename(name.to_s) - - # Strip NULs and other control chars to avoid filesystem weirdness - name = name.gsub(/[\x00-\x1F]/, '') - # Ensure non-empty - if name.empty? - base = File.basename(file_path, File.extname(file_path)) - name = "#{base}_unnamed" - end - - name[0, 255] + sanitize_filename(name) end end diff --git a/ontologies_linked_data.gemspec b/ontologies_linked_data.gemspec index bbc3dc99..c93e6fa6 100644 --- a/ontologies_linked_data.gemspec +++ b/ontologies_linked_data.gemspec @@ -32,6 +32,7 @@ Gem::Specification.new do |gem| gem.add_dependency("rack-test") gem.add_dependency("rsolr") gem.add_dependency("rubyzip", "~> 3.0") + gem.add_dependency("down", "~> 5.0") gem.add_development_dependency("email_spec") diff --git a/test/models/test_ontology_submission.rb b/test/models/test_ontology_submission.rb index 1a6a6037..007b5645 100644 --- a/test/models/test_ontology_submission.rb +++ b/test/models/test_ontology_submission.rb @@ -1,6 +1,7 @@ require_relative "./test_ontology_common" require "logger" require "rack" +require "tempfile" class TestOntologySubmission < LinkedData::TestOntologyCommon @@ -679,11 +680,14 @@ def test_download_ontology_file sub.pullLocation = RDF::IRI.new(server_url) file, filename = sub.download_ontology_file sleep 2 - assert filename.nil?, "Test filename is not nil: #{filename}" - assert file.is_a?(Tempfile), "Test file is not a Tempfile" + p "filename is" + p filename + assert_equal "unnamed", filename, "Expected fallback filename to be 'unnamed'" + assert_kind_of Tempfile, file file.open assert file.read.eql?("test file"), "Test file content error: #{file.read}" ensure + file.close! if file && file.respond_to?(:close!) LinkedData::TestCase.backend_4s_delete Thread.kill(server_thread) # this will shutdown Rack::Server also sleep 3 From 10c4e51213df38dd9c54aad5b8fa5dd1e9a2615e Mon Sep 17 00:00:00 2001 From: Alex Skrenchuk Date: Mon, 17 Nov 2025 10:57:58 -0800 Subject: [PATCH 2/2] Remove FTP support, refactor remote_file_exists? to FileHelpers, replace Thin in tests - Remove all FTP download logic (check_ftp_file, URI::FTP branch, net-ftp dependency) - Move remote_file_exists? to LinkedData::Utils::FileHelpers and reimplement using Down::NetHttp - Update download_file to use max_redirects keyword for consistency - Remove Thin dependency and add Webrick for test environment --- Gemfile | 2 +- Gemfile.lock | 81 ++++++++----------- .../models/ontology_submission.rb | 52 +----------- lib/ontologies_linked_data/utils/file.rb | 25 +++++- ontologies_linked_data.gemspec | 3 +- test/models/test_ontology_submission.rb | 2 - 6 files changed, 61 insertions(+), 104 deletions(-) diff --git a/Gemfile b/Gemfile index 5099d784..41fd45d3 100644 --- a/Gemfile +++ b/Gemfile @@ -17,7 +17,6 @@ gem 'rake', '~> 10.0' gem 'request_store' gem 'rest-client' gem 'rsolr' -gem 'thin', '~> 1.0' # compatibility version pin. thin should be replaced with webmoc # Testing group :test do @@ -30,6 +29,7 @@ group :test do gem 'rack-test', '~> 0.6' gem 'simplecov' gem 'simplecov-cobertura' # for codecov.io + gem 'webrick' end group :development do diff --git a/Gemfile.lock b/Gemfile.lock index 00114348..ba5d3b8c 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,6 +1,6 @@ GIT remote: https://github.com/ncbo/goo.git - revision: ca5f9d858eef89923903236fe6f76c78271e538d + revision: 1d6b8ace1b06f7cb80fd990baf1067b88caaa5a0 branch: develop specs: goo (0.0.2) @@ -35,12 +35,10 @@ PATH json libxml-ruby multi_json - net-ftp oj omni_logger pony rack - rack-test rsolr rubyzip (~> 3.0) @@ -58,16 +56,15 @@ GEM ansi (1.5.0) ast (2.4.3) bcrypt (3.1.20) - bigdecimal (3.2.2) + bigdecimal (3.3.1) builder (3.3.0) childprocess (5.1.0) logger (~> 1.5) coderay (1.1.3) concurrent-ruby (1.3.5) - connection_pool (2.5.3) + connection_pool (2.5.4) cube-ruby (0.0.3) - daemons (1.4.1) - date (3.4.1) + date (3.5.0) docile (1.4.1) domain_name (0.6.20240107) down (5.4.2) @@ -76,24 +73,23 @@ GEM htmlentities (~> 4.3.3) launchy (>= 2.1, < 4.0) mail (~> 2.7) - eventmachine (1.2.7) - faraday (2.13.4) + faraday (2.14.0) faraday-net_http (>= 2.0, < 3.5) json logger - faraday-net_http (3.4.1) - net-http (>= 0.5.0) + faraday-net_http (3.4.2) + net-http (~> 0.5) ffi (1.17.2-aarch64-linux-gnu) ffi (1.17.2-arm64-darwin) ffi (1.17.2-x86_64-linux-gnu) hashie (5.0.0) htmlentities (4.3.4) http-accept (1.7.0) - http-cookie (1.0.8) + http-cookie (1.1.0) domain_name (~> 0.5) i18n (0.9.5) concurrent-ruby (~> 1.0) - json (2.13.2) + json (2.16.0) json_pure (2.8.1) language_server-protocol (3.17.0.5) launchy (3.1.1) @@ -105,7 +101,8 @@ GEM logger (1.7.0) macaddr (1.7.2) systemu (~> 2.6.5) - mail (2.8.1) + mail (2.9.0) + logger mini_mime (>= 0.1.1) net-imap net-pop @@ -114,7 +111,7 @@ GEM mime-types (3.7.0) logger mime-types-data (~> 3.2025, >= 3.2025.0507) - mime-types-data (3.2025.0826) + mime-types-data (3.2025.0924) mini_mime (1.1.5) minitest (4.7.5) minitest-reporters (0.14.24) @@ -122,16 +119,13 @@ GEM builder minitest (>= 2.12, < 5.0) powerbar - mocha (2.7.1) + mocha (2.8.0) ruby2_keywords (>= 0.0.5) - mock_redis (0.51.0) + mock_redis (0.53.0) redis (~> 5) multi_json (1.17.0) - net-ftp (0.3.8) - net-protocol - time - net-http (0.6.0) - uri + net-http (0.8.0) + uri (>= 0.11.1) net-http-persistent (2.9.4) net-imap (0.4.22) date @@ -143,27 +137,27 @@ GEM net-smtp (0.5.1) net-protocol netrc (0.11.0) - oj (3.16.11) + oj (3.16.12) bigdecimal (>= 3.0) ostruct (>= 0.2) omni_logger (0.1.4) logger ostruct (0.6.3) parallel (1.27.0) - parser (3.3.9.0) + parser (3.3.10.0) ast (~> 2.4.1) racc pony (1.13.1) mail (>= 2.0) powerbar (2.0.1) hashie (>= 1.1.0) - prism (1.4.0) + prism (1.6.0) pry (0.15.2) coderay (~> 1.1) method_source (~> 1.0) public_suffix (5.1.1) racc (1.8.1) - rack (2.2.17) + rack (2.2.21) rack-test (0.8.3) rack (>= 1.0, < 3) rainbow (3.1.1) @@ -172,9 +166,9 @@ GEM addressable (>= 2.2) redis (5.4.1) redis-client (>= 0.22.0) - redis-client (0.25.2) + redis-client (0.26.1) connection_pool - regexp_parser (2.11.2) + regexp_parser (2.11.3) request_store (1.7.0) rack (>= 1.4) rest-client (2.1.0) @@ -182,11 +176,11 @@ GEM http-cookie (>= 1.0.2, < 2.0) mime-types (>= 1.16, < 4.0) netrc (~> 0.8) - rexml (3.4.1) + rexml (3.4.4) rsolr (2.6.0) builder (>= 2.1.2) faraday (>= 0.9, < 3, != 2.0.0) - rubocop (1.79.2) + rubocop (1.81.7) json (~> 2.3) language_server-protocol (~> 3.17.0.2) lint_roller (~> 1.1.0) @@ -194,40 +188,35 @@ GEM parser (>= 3.3.0.2) rainbow (>= 2.2.2, < 4.0) regexp_parser (>= 2.9.3, < 3.0) - rubocop-ast (>= 1.46.0, < 2.0) + rubocop-ast (>= 1.47.1, < 2.0) ruby-progressbar (~> 1.7) unicode-display_width (>= 2.4.0, < 4.0) - rubocop-ast (1.46.0) + rubocop-ast (1.48.0) parser (>= 3.3.7.2) prism (~> 1.4) ruby-progressbar (1.13.0) ruby2_keywords (0.0.5) - rubyzip (3.0.2) + rubyzip (3.2.2) simplecov (0.22.0) docile (~> 1.1) simplecov-html (~> 0.11) simplecov_json_formatter (~> 0.1) - simplecov-cobertura (3.0.0) + simplecov-cobertura (3.1.0) rexml simplecov (~> 0.19) simplecov-html (0.13.2) simplecov_json_formatter (0.1.4) systemu (2.6.5) - thin (1.8.2) - daemons (~> 1.0, >= 1.0.9) - eventmachine (~> 1.0, >= 1.0.4) - rack (>= 1, < 3) thread_safe (0.3.6) - time (0.4.1) - date - timeout (0.4.3) + timeout (0.4.4) tzinfo (0.3.62) - unicode-display_width (3.1.5) - unicode-emoji (~> 4.0, >= 4.0.4) - unicode-emoji (4.0.4) - uri (1.0.3) + unicode-display_width (3.2.0) + unicode-emoji (~> 4.1) + unicode-emoji (4.1.0) + uri (1.1.1) uuid (2.3.9) macaddr (~> 1.0) + webrick (1.9.1) PLATFORMS aarch64-linux @@ -267,7 +256,7 @@ DEPENDENCIES simplecov simplecov-cobertura sparql-client! - thin (~> 1.0) + webrick BUNDLED WITH 2.6.3 diff --git a/lib/ontologies_linked_data/models/ontology_submission.rb b/lib/ontologies_linked_data/models/ontology_submission.rb index e457460f..009088df 100644 --- a/lib/ontologies_linked_data/models/ontology_submission.rb +++ b/lib/ontologies_linked_data/models/ontology_submission.rb @@ -1,4 +1,3 @@ -require 'net/ftp' require 'net/http' require 'uri' require 'open-uri' @@ -813,18 +812,6 @@ def ontology_uri RDF::URI.new(self.uri) end - - - - - - - - - - - - def roots_sorted(extra_include=nil) classes = roots(extra_include) LinkedData::Models::Class.sort_classes(classes) @@ -838,17 +825,7 @@ def download_and_store_ontology_file end def remote_file_exists?(url) - begin - url = URI.parse(url) - if url.kind_of?(URI::FTP) - check = check_ftp_file(url) - else - check = check_http_file(url) - end - rescue Exception - check = false - end - check + LinkedData::Utils::FileHelpers.remote_file_exists?(url) end def download_ontology_file @@ -912,33 +889,6 @@ def owlapi_parser_input File.expand_path(path) end - def check_http_file(url) - session = Net::HTTP.new(url.host, url.port) - session.use_ssl = true if url.port == 443 - session.start do |http| - response_valid = http.head(url.request_uri).code.to_i < 400 - return response_valid - end - end - - def check_ftp_file(uri) - ftp = Net::FTP.new(uri.host, uri.user, uri.password) - ftp.login - begin - file_exists = ftp.size(uri.path) > 0 - rescue Exception => e - # Check using another method - path = uri.path.split("/") - filename = path.pop - path = path.join("/") - ftp.chdir(path) - files = ftp.dir - # Dumb check, just see if the filename is somewhere in the list - files.each { |file| return true if file.include?(filename) } - end - file_exists - end - def self.loom_transform_literal(lit) res = [] lit.each_char do |c| diff --git a/lib/ontologies_linked_data/utils/file.rb b/lib/ontologies_linked_data/utils/file.rb index 2632a42d..608aa5ca 100644 --- a/lib/ontologies_linked_data/utils/file.rb +++ b/lib/ontologies_linked_data/utils/file.rb @@ -144,7 +144,7 @@ def self.exists_and_file(path) File.exist?(path) && !File.directory?(path) end - def self.download_file(uri, limit: 10, open_timeout: 15, read_timeout: 1800, headers: {}, max_size: 512 * 1024 * 1024) + def self.download_file(uri, max_redirects: 10, open_timeout: 15, read_timeout: 1800, headers: {}, max_size: 512 * 1024 * 1024) uri = URI(uri) unless uri.is_a?(URI) unless %w[http https].include?(uri.scheme) raise ArgumentError, "Unsupported URI scheme #{uri.scheme.inspect} (only http/https are supported)" @@ -152,7 +152,7 @@ def self.download_file(uri, limit: 10, open_timeout: 15, read_timeout: 1800, hea tmpfile = Down::NetHttp.download( uri.to_s, - max_redirects: limit, + max_redirects: max_redirects, open_timeout: open_timeout, read_timeout: read_timeout, max_size: max_size, @@ -168,6 +168,27 @@ def self.download_file(uri, limit: 10, open_timeout: 15, read_timeout: 1800, hea [tmpfile, filename] end + def self.remote_file_exists?(url, max_redirects: 10, open_timeout: 15, read_timeout: 10, headers: {}) + uri = url.is_a?(URI) ? url : URI.parse(url.to_s) + + # only http/https are supported + return false unless %w[http https].include?(uri.scheme) + + Down::NetHttp.open( + uri.to_s, + max_redirects: max_redirects, + open_timeout: open_timeout, + read_timeout: read_timeout, + headers: { "User-Agent" => "OntoPortal" }.merge(headers) + ) do |io| + io.close + end + + true + rescue Down::Error, URI::InvalidURIError + false + end + # --- Utility guards / filters --- def self.safe_join(base, *paths) target = File.expand_path(File.join(base, *paths)) diff --git a/ontologies_linked_data.gemspec b/ontologies_linked_data.gemspec index c93e6fa6..fdc0f2dd 100644 --- a/ontologies_linked_data.gemspec +++ b/ontologies_linked_data.gemspec @@ -24,17 +24,16 @@ Gem::Specification.new do |gem| gem.add_dependency("json") gem.add_dependency("libxml-ruby") gem.add_dependency("multi_json") - gem.add_dependency("net-ftp") gem.add_dependency("oj") gem.add_dependency("omni_logger") gem.add_dependency("pony") gem.add_dependency("rack") - gem.add_dependency("rack-test") gem.add_dependency("rsolr") gem.add_dependency("rubyzip", "~> 3.0") gem.add_dependency("down", "~> 5.0") gem.add_development_dependency("email_spec") + gem.add_development_dependency("rack-test") # gem.executables = %w() end diff --git a/test/models/test_ontology_submission.rb b/test/models/test_ontology_submission.rb index 007b5645..3de725d7 100644 --- a/test/models/test_ontology_submission.rb +++ b/test/models/test_ontology_submission.rb @@ -680,8 +680,6 @@ def test_download_ontology_file sub.pullLocation = RDF::IRI.new(server_url) file, filename = sub.download_ontology_file sleep 2 - p "filename is" - p filename assert_equal "unnamed", filename, "Expected fallback filename to be 'unnamed'" assert_kind_of Tempfile, file file.open