diff --git a/.github/workflows/indexing_top_gems.yml b/.github/workflows/indexing_top_gems.yml index 1bf80e34..b1233cf2 100644 --- a/.github/workflows/indexing_top_gems.yml +++ b/.github/workflows/indexing_top_gems.yml @@ -31,6 +31,8 @@ jobs: key: ${{ runner.os }}-rust - name: Run Ruby tests + env: + RUST_BACKTRACE: full run: bundle exec rake index_top_gems - name: Save Rust compile cache diff --git a/ext/rubydex/graph.c b/ext/rubydex/graph.c index 905ee1d1..452fd3a3 100644 --- a/ext/rubydex/graph.c +++ b/ext/rubydex/graph.c @@ -9,6 +9,7 @@ #include "utils.h" static VALUE cGraph; +static VALUE mRubydex; // Free function for the custom Graph allocator. We always have to call into Rust to free data allocated by it static void graph_free(void *ptr) { @@ -449,6 +450,32 @@ static VALUE rdxr_graph_require_paths(VALUE self, VALUE load_path) { return array; } +// Graph#check_integrity: () -> Array[Rubydex::IntegrityFailure] +// Returns an array of IntegrityFailure objects, empty if no issues found +static VALUE rdxr_graph_check_integrity(VALUE self) { + void *graph; + TypedData_Get_Struct(self, void *, &graph_type, graph); + + size_t error_count = 0; + const char *const *errors = rdx_check_integrity(graph, &error_count); + + if (errors == NULL) { + return rb_ary_new(); + } + + VALUE cIntegrityError = rb_const_get(mRubydex, rb_intern("IntegrityFailure")); + VALUE array = rb_ary_new_capa((long)error_count); + + for (size_t i = 0; i < error_count; i++) { + VALUE argv[] = {rb_utf8_str_new_cstr(errors[i])}; + VALUE error = rb_class_new_instance(1, argv, cIntegrityError); + rb_ary_push(array, error); + } + + free_c_string_array(errors, error_count); + return array; +} + // Graph#diagnostics -> Array[Rubydex::Diagnostic] static VALUE rdxr_graph_diagnostics(VALUE self) { void *graph; @@ -482,7 +509,8 @@ static VALUE rdxr_graph_diagnostics(VALUE self) { return diagnostics; } -void rdxi_initialize_graph(VALUE mRubydex) { +void rdxi_initialize_graph(VALUE moduleRubydex) { + mRubydex = moduleRubydex; cGraph = rb_define_class_under(mRubydex, "Graph", rb_cObject); rb_define_alloc_func(cGraph, rdxr_graph_alloc); rb_define_method(cGraph, "index_all", rdxr_graph_index_all, 1); @@ -495,6 +523,7 @@ void rdxi_initialize_graph(VALUE mRubydex) { rb_define_method(cGraph, "constant_references", rdxr_graph_constant_references, 0); rb_define_method(cGraph, "method_references", rdxr_graph_method_references, 0); rb_define_method(cGraph, "diagnostics", rdxr_graph_diagnostics, 0); + rb_define_method(cGraph, "check_integrity", rdxr_graph_check_integrity, 0); rb_define_method(cGraph, "[]", rdxr_graph_aref, 1); rb_define_method(cGraph, "search", rdxr_graph_search, 1); rb_define_method(cGraph, "encoding=", rdxr_graph_set_encoding, 1); diff --git a/lib/rubydex.rb b/lib/rubydex.rb index 51d4b729..17d8e9f0 100644 --- a/lib/rubydex.rb +++ b/lib/rubydex.rb @@ -2,11 +2,6 @@ require "bundler" require "uri" - -module Rubydex - class Error < StandardError; end -end - require "rubydex/version" begin @@ -18,6 +13,7 @@ class Error < StandardError; end require "rubydex/rubydex" end +require "rubydex/failures" require "rubydex/location" require "rubydex/comment" require "rubydex/diagnostic" diff --git a/lib/rubydex/failures.rb b/lib/rubydex/failures.rb new file mode 100644 index 00000000..7149b554 --- /dev/null +++ b/lib/rubydex/failures.rb @@ -0,0 +1,15 @@ +# frozen_string_literal: true + +module Rubydex + class Failure + #: String + attr_reader :message + + #: (String) -> void + def initialize(message) + @message = message + end + end + + class IntegrityFailure < Failure; end +end diff --git a/lib/rubydex/location.rb b/lib/rubydex/location.rb index 4d8c39ce..1f1ea0d8 100644 --- a/lib/rubydex/location.rb +++ b/lib/rubydex/location.rb @@ -4,6 +4,8 @@ module Rubydex # A zero based internal location. Intended to be used for tool-to-tool communication, such as a language server # communicating with an editor. class Location + class NotFileUriError < StandardError; end + include Comparable #: String @@ -22,9 +24,9 @@ def initialize(uri:, start_line:, end_line:, start_column:, end_column:) end #: () -> String - def path + def to_file_path uri = URI(@uri) - raise Rubydex::Error, "URI is not a file:// URI: #{@uri}" unless uri.scheme == "file" + raise NotFileUriError, "URI is not a file:// URI: #{@uri}" unless uri.scheme == "file" path = uri.path # TODO: This has to go away once we have a proper URI abstraction @@ -59,7 +61,7 @@ def to_display #: -> String def to_s - "#{path}:#{@start_line + 1}:#{@start_column + 1}-#{@end_line + 1}:#{@end_column + 1}" + "#{to_file_path}:#{@start_line + 1}:#{@start_column + 1}-#{@end_line + 1}:#{@end_column + 1}" end end @@ -82,7 +84,7 @@ def comparable_values #: -> String def to_s - "#{path}:#{@start_line}:#{@start_column}-#{@end_line}:#{@end_column}" + "#{to_file_path}:#{@start_line}:#{@start_column}-#{@end_line}:#{@end_column}" end end end diff --git a/rakelib/index_top_gems.rake b/rakelib/index_top_gems.rake index 163aa7aa..b593acd8 100644 --- a/rakelib/index_top_gems.rake +++ b/rakelib/index_top_gems.rake @@ -1,7 +1,7 @@ # frozen_string_literal: true desc "Index the top 100 gems from rubygems.org" -task index_top_gems: :compile_release do +task index_top_gems: :compile do $LOAD_PATH.unshift(File.expand_path("../lib", __dir__)) require "net/http" require "rubygems/package" @@ -51,7 +51,10 @@ task index_top_gems: :compile_release do # Index the gem's files and yield errors back to the main Ractor graph = Rubydex::Graph.new - errors = graph.index_all(Dir.glob("#{gem_dir}/**/*.rb")) + indexing_errors = graph.index_all(Dir.glob("#{gem_dir}/**/*.rb")) + graph.resolve + + errors = indexing_errors + graph.check_integrity.map(&:message) next if errors.empty? @mutex.synchronize { @errors << "#{gem} => #{errors.join(", ")}" } diff --git a/rbi/rubydex.rbi b/rbi/rubydex.rbi index fe7ebae8..aa1df4f2 100644 --- a/rbi/rubydex.rbi +++ b/rbi/rubydex.rbi @@ -204,6 +204,9 @@ module Rubydex sig { returns(T::Array[String]) } def workspace_paths; end + sig { returns(T::Array[Failure]) } + def check_integrity; end + private # Gathers the paths we have to index for all workspace dependencies @@ -235,7 +238,7 @@ module Rubydex def end_line; end sig { returns(String) } - def path; end + def to_file_path; end sig { returns(Integer) } def start_column; end diff --git a/rust/rubydex-sys/src/graph_api.rs b/rust/rubydex-sys/src/graph_api.rs index d390ae5f..3a7766c2 100644 --- a/rust/rubydex-sys/src/graph_api.rs +++ b/rust/rubydex-sys/src/graph_api.rs @@ -10,7 +10,7 @@ use rubydex::model::encoding::Encoding; use rubydex::model::graph::Graph; use rubydex::model::ids::DeclarationId; use rubydex::resolution::Resolver; -use rubydex::{indexing, listing, query}; +use rubydex::{indexing, integrity, listing, query}; use std::ffi::CString; use std::path::PathBuf; use std::{mem, ptr}; @@ -196,6 +196,42 @@ pub extern "C" fn rdx_graph_resolve(pointer: GraphPointer) { }); } +/// Checks the integrity of the graph and returns an array of error message strings. Returns NULL if there are no +/// errors. Caller must free with `free_c_string_array`. +/// +/// # Safety +/// +/// - `pointer` must be a valid `GraphPointer` previously returned by this crate. +/// - `out_error_count` must be a valid, writable pointer. +#[unsafe(no_mangle)] +pub unsafe extern "C" fn rdx_check_integrity( + pointer: GraphPointer, + out_error_count: *mut usize, +) -> *const *const c_char { + with_graph(pointer, |graph| { + let errors = integrity::check_integrity(graph); + + if errors.is_empty() { + unsafe { *out_error_count = 0 }; + return ptr::null(); + } + + let c_strings: Vec<*const c_char> = errors + .into_iter() + .filter_map(|error| { + CString::new(error.to_string()) + .ok() + .map(|c_string| c_string.into_raw().cast_const()) + }) + .collect(); + + unsafe { *out_error_count = c_strings.len() }; + + let boxed = c_strings.into_boxed_slice(); + Box::into_raw(boxed).cast::<*const c_char>() + }) +} + /// # Safety /// /// Expects both the graph pointer and encoding string pointer to be valid diff --git a/rust/rubydex/src/integrity.rs b/rust/rubydex/src/integrity.rs new file mode 100644 index 00000000..2037dd13 --- /dev/null +++ b/rust/rubydex/src/integrity.rs @@ -0,0 +1,278 @@ +use std::fmt; + +use crate::model::{ + declaration::{Declaration, Namespace}, + graph::{BASIC_OBJECT_ID, Graph, OBJECT_ID}, + ids::DeclarationId, +}; + +#[derive(Debug, PartialEq, Eq)] +pub enum IntegrityErrorKind { + /// A declaration's owner is not a namespace (module, class, or singleton class) + OwnerIsNotNamespace, + /// A declaration's owner does not exist in the graph + OwnerDoesNotExist, + /// A singleton class chain never resolves to a non-singleton namespace + SingletonClassChainDoesNotTerminate, + /// A non-root declaration unexpectedly owns itself + UnexpectedSelfOwnership, +} + +/// An integrity error found during graph validation +#[derive(Debug, PartialEq, Eq)] +pub struct IntegrityError { + kind: IntegrityErrorKind, + declaration_name: String, + uris: Vec, +} + +impl fmt::Display for IntegrityError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let message = match self.kind { + IntegrityErrorKind::OwnerIsNotNamespace => { + format!("Declaration `{}` is owned by a non-namespace", self.declaration_name) + } + IntegrityErrorKind::OwnerDoesNotExist => { + format!( + "Declaration `{}` has an owner that does not exist in the graph", + self.declaration_name + ) + } + IntegrityErrorKind::SingletonClassChainDoesNotTerminate => { + format!( + "Singleton class `{}` does not eventually attach to a non-singleton namespace", + self.declaration_name + ) + } + IntegrityErrorKind::UnexpectedSelfOwnership => { + format!("Declaration `{}` unexpectedly owns itself", self.declaration_name) + } + }; + + write!(f, "{message}. Defined in: {}", self.uris.join(", ")) + } +} + +impl std::error::Error for IntegrityError {} + +/// Checks the integrity of the graph data +#[must_use] +pub fn check_integrity(graph: &Graph) -> Vec { + let mut errors = Vec::new(); + let self_owners = [*OBJECT_ID, *BASIC_OBJECT_ID]; + + for (id, declaration) in graph.declarations() { + let owner_id = declaration.owner_id(); + + // Check for constants that own themselves. Only `Object` and `BasicObject` own themselves and no other constant + if *id == *owner_id { + if self_owners.contains(id) { + continue; + } + errors.push(IntegrityError { + kind: IntegrityErrorKind::UnexpectedSelfOwnership, + declaration_name: declaration.name().to_string(), + uris: collect_uris(graph, declaration), + }); + continue; + } + + // Check that the owner exists + let Some(owner) = graph.declarations().get(owner_id) else { + errors.push(IntegrityError { + kind: IntegrityErrorKind::OwnerDoesNotExist, + declaration_name: declaration.name().to_string(), + uris: collect_uris(graph, declaration), + }); + continue; + }; + + // Check that the owner is a namespace + if owner.as_namespace().is_none() { + errors.push(IntegrityError { + kind: IntegrityErrorKind::OwnerIsNotNamespace, + declaration_name: declaration.name().to_string(), + uris: collect_uris(graph, declaration), + }); + continue; + } + + // Check singleton class chain termination + if let Declaration::Namespace(Namespace::SingletonClass(_)) = declaration + && !singleton_chain_terminates(graph, *owner_id) + { + errors.push(IntegrityError { + kind: IntegrityErrorKind::SingletonClassChainDoesNotTerminate, + declaration_name: declaration.name().to_string(), + uris: collect_uris(graph, declaration), + }); + } + } + + errors +} + +/// Collects the URIs where a declaration is defined, sorted and deduplicated +fn collect_uris(graph: &Graph, declaration: &Declaration) -> Vec { + declaration + .definitions() + .iter() + .map(|def_id| { + let definition = graph.definitions().get(def_id).unwrap(); + let document = graph.documents().get(definition.uri_id()).unwrap(); + document.uri().to_string() + }) + .collect() +} + +/// Walks the singleton class chain to verify that it eventually finds a module or class as its attached object +fn singleton_chain_terminates(graph: &Graph, start_owner_id: DeclarationId) -> bool { + const MAX_SINGLETON_DEPTH: usize = 128; + let mut current_id = start_owner_id; + + for _ in 0..MAX_SINGLETON_DEPTH { + let Some(current) = graph.declarations().get(¤t_id) else { + return false; + }; + + match current { + Declaration::Namespace(Namespace::SingletonClass(_)) => { + current_id = *current.owner_id(); + } + Declaration::Namespace(_) => return true, + _ => return false, + } + } + + false +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::model::declaration::{ClassDeclaration, MethodDeclaration, SingletonClassDeclaration}; + + #[test] + fn test_unexpected_self_ownership() { + let mut graph = Graph::new(); + + // Object and BasicObject are exempt from self-ownership + graph.declarations_mut().insert( + *OBJECT_ID, + Declaration::Namespace(Namespace::Class(Box::new(ClassDeclaration::new( + "Object".to_string(), + *OBJECT_ID, + )))), + ); + graph.declarations_mut().insert( + *BASIC_OBJECT_ID, + Declaration::Namespace(Namespace::Class(Box::new(ClassDeclaration::new( + "BasicObject".to_string(), + *BASIC_OBJECT_ID, + )))), + ); + + // Foo owns itself — should be an error + let foo_id = DeclarationId::from("Foo"); + graph.declarations_mut().insert( + foo_id, + Declaration::Namespace(Namespace::Class(Box::new(ClassDeclaration::new( + "Foo".to_string(), + foo_id, + )))), + ); + + let errors = check_integrity(&graph); + assert_eq!(errors.len(), 1); + assert_eq!(errors[0].kind, IntegrityErrorKind::UnexpectedSelfOwnership); + assert_eq!(errors[0].declaration_name, "Foo"); + } + + #[test] + fn test_owner_does_not_exist() { + let mut graph = Graph::new(); + let foo_id = DeclarationId::from("Foo"); + let bogus_owner = DeclarationId::from("NonExistent"); + + graph.declarations_mut().insert( + foo_id, + Declaration::Namespace(Namespace::Class(Box::new(ClassDeclaration::new( + "Foo".to_string(), + bogus_owner, + )))), + ); + + let errors = check_integrity(&graph); + assert_eq!(errors.len(), 1); + assert_eq!(errors[0].kind, IntegrityErrorKind::OwnerDoesNotExist); + assert_eq!(errors[0].declaration_name, "Foo"); + } + + #[test] + fn test_owner_is_not_namespace() { + let mut graph = Graph::new(); + + // Object (self-owned, exempt) + graph.declarations_mut().insert( + *OBJECT_ID, + Declaration::Namespace(Namespace::Class(Box::new(ClassDeclaration::new( + "Object".to_string(), + *OBJECT_ID, + )))), + ); + + // A method owned by Object (valid) + let method_id = DeclarationId::from("Object#foo"); + graph.declarations_mut().insert( + method_id, + Declaration::Method(Box::new(MethodDeclaration::new("Object#foo".to_string(), *OBJECT_ID))), + ); + + // A class owned by the method (invalid — owner is not a namespace) + let bar_id = DeclarationId::from("Bar"); + graph.declarations_mut().insert( + bar_id, + Declaration::Namespace(Namespace::Class(Box::new(ClassDeclaration::new( + "Bar".to_string(), + method_id, + )))), + ); + + let errors = check_integrity(&graph); + assert_eq!(errors.len(), 1); + assert_eq!(errors[0].kind, IntegrityErrorKind::OwnerIsNotNamespace); + assert_eq!(errors[0].declaration_name, "Bar"); + } + + #[test] + fn test_singleton_class_chain_does_not_terminate() { + let mut graph = Graph::new(); + + // Two singleton classes that own each other, forming a cycle + let s1_id = DeclarationId::from(""); + let s2_id = DeclarationId::from(""); + + graph.declarations_mut().insert( + s1_id, + Declaration::Namespace(Namespace::SingletonClass(Box::new(SingletonClassDeclaration::new( + "".to_string(), + s2_id, + )))), + ); + graph.declarations_mut().insert( + s2_id, + Declaration::Namespace(Namespace::SingletonClass(Box::new(SingletonClassDeclaration::new( + "".to_string(), + s1_id, + )))), + ); + + let errors = check_integrity(&graph); + assert_eq!(errors.len(), 2); + assert!( + errors + .iter() + .all(|e| e.kind == IntegrityErrorKind::SingletonClassChainDoesNotTerminate) + ); + } +} diff --git a/rust/rubydex/src/lib.rs b/rust/rubydex/src/lib.rs index ead014aa..3a468ef5 100644 --- a/rust/rubydex/src/lib.rs +++ b/rust/rubydex/src/lib.rs @@ -2,6 +2,7 @@ pub mod compile_assertions; pub mod diagnostic; pub mod errors; pub mod indexing; +pub mod integrity; pub mod job_queue; pub mod listing; pub mod model; diff --git a/rust/rubydex/src/main.rs b/rust/rubydex/src/main.rs index ffe4098c..d6258e76 100644 --- a/rust/rubydex/src/main.rs +++ b/rust/rubydex/src/main.rs @@ -2,7 +2,7 @@ use clap::{Parser, ValueEnum}; use std::mem; use rubydex::{ - indexing, listing, + indexing, integrity, listing, model::graph::Graph, resolution::Resolver, stats::{ @@ -28,6 +28,9 @@ struct Args { #[arg(long = "stats", help = "Show detailed performance statistics")] stats: bool, + #[arg(long = "check-integrity", help = "Check the integrity of the graph after resolution")] + check_integrity: bool, + #[arg( long = "report-orphans", value_name = "PATH", @@ -98,6 +101,23 @@ fn main() { return exit(args.stats); } + // Integrity check + if args.check_integrity { + let errors = time_it!(integrity_check, { integrity::check_integrity(&graph) }); + + if errors.is_empty() { + println!("Integrity check passed: no issues found"); + } else { + eprintln!("Integrity check found {} issue(s):", errors.len()); + + for error in &errors { + eprintln!(" - {error}"); + } + + std::process::exit(1); + } + } + // Querying if args.stats { diff --git a/rust/rubydex/src/model/graph.rs b/rust/rubydex/src/model/graph.rs index e591f1cc..c19713e2 100644 --- a/rust/rubydex/src/model/graph.rs +++ b/rust/rubydex/src/model/graph.rs @@ -14,6 +14,7 @@ use crate::model::references::{ConstantReference, MethodRef}; use crate::model::string_ref::StringRef; use crate::stats; +pub static BASIC_OBJECT_ID: LazyLock = LazyLock::new(|| DeclarationId::from("BasicObject")); pub static OBJECT_ID: LazyLock = LazyLock::new(|| DeclarationId::from("Object")); pub static MODULE_ID: LazyLock = LazyLock::new(|| DeclarationId::from("Module")); pub static CLASS_ID: LazyLock = LazyLock::new(|| DeclarationId::from("Class")); diff --git a/rust/rubydex/src/stats/timer.rs b/rust/rubydex/src/stats/timer.rs index 6ceb8b59..01097f74 100644 --- a/rust/rubydex/src/stats/timer.rs +++ b/rust/rubydex/src/stats/timer.rs @@ -122,5 +122,6 @@ make_timer! { listing, "Listing"; indexing, "Indexing"; resolution, "Resolution"; + integrity_check, "Integrity check"; querying, "Querying"; } diff --git a/test/definition_test.rb b/test/definition_test.rb index ca13dde7..dc729c03 100644 --- a/test/definition_test.rb +++ b/test/definition_test.rb @@ -87,7 +87,7 @@ def foo; end location = def_a.location.to_display refute_nil(location) assert_equal(context.uri_to("file1.rb"), location.uri) - assert_equal(context.absolute_path_to("file1.rb"), location.path) + assert_equal(context.absolute_path_to("file1.rb"), location.to_file_path) assert_equal(1, location.start_line) assert_equal(1, location.start_column) assert_equal(3, location.end_line) @@ -98,7 +98,7 @@ def foo; end location = def_foo.location.to_display refute_nil(location) assert_equal(context.uri_to("file1.rb"), location.uri) - assert_equal(context.absolute_path_to("file1.rb"), location.path) + assert_equal(context.absolute_path_to("file1.rb"), location.to_file_path) assert_equal(2, location.start_line) assert_equal(3, location.start_column) assert_equal(2, location.end_line) diff --git a/test/graph_test.rb b/test/graph_test.rb index 9570e7c2..ccd54b3c 100644 --- a/test/graph_test.rb +++ b/test/graph_test.rb @@ -566,7 +566,7 @@ def assert_diagnostics(expected, actual) assert_equal( expected, actual.sort_by { |d| [d.location, d.message] } - .map { |d| { rule: d.rule, path: File.basename(d.location.path), message: d.message } }, + .map { |d| { rule: d.rule, path: File.basename(d.location.to_file_path), message: d.message } }, ) end end diff --git a/test/location_test.rb b/test/location_test.rb index 8564b2e6..f762f626 100644 --- a/test/location_test.rb +++ b/test/location_test.rb @@ -64,7 +64,7 @@ def test_location_to_display def test_location_to_s loc = Rubydex::Location.new(uri: "file:///foo.rb", start_line: 0, end_line: 5, start_column: 3, end_column: 10) - assert_equal("#{loc.path}:1:4-6:11", loc.to_s) + assert_equal("#{loc.to_file_path}:1:4-6:11", loc.to_s) end def test_display_location_equality @@ -84,7 +84,7 @@ def test_display_location_to_display_returns_self def test_display_location_to_s display = Rubydex::DisplayLocation.new(uri: "file:///foo.rb", start_line: 1, end_line: 6, start_column: 4, end_column: 11) - assert_equal("#{display.path}:1:4-6:11", display.to_s) + assert_equal("#{display.to_file_path}:1:4-6:11", display.to_s) end def test_location_and_its_display_location_are_equal