From b4c694b45c2f98c5e7f782f38da040d1e14d6786 Mon Sep 17 00:00:00 2001 From: Shahin Saadati Date: Fri, 13 Feb 2026 10:49:28 -0800 Subject: [PATCH 1/9] updated reporting --- README.md | 25 +-- report.sh | 4 +- run.sh | 18 +- .../adk/scope/extractors/extractor_go.py | 17 +- src/google/adk/scope/matcher/matcher.py | 75 +++----- src/google/adk/scope/reporter/reporter.py | 173 ++++++------------ test/adk/scope/reporter/test_reporter.py | 126 ++++--------- 7 files changed, 139 insertions(+), 299 deletions(-) diff --git a/README.md b/README.md index b586719..9a5b85d 100644 --- a/README.md +++ b/README.md @@ -77,7 +77,7 @@ Once you have extracted features from two languages (e.g., Python and TypeScript --base output/py.txtpb \ --target output/ts.txtpb \ --output output/ \ - --report-type directional + --report-type md ``` | Argument | Description | @@ -85,7 +85,7 @@ Once you have extracted features from two languages (e.g., Python and TypeScript | `--base ` | **Required.** Path to the "source of truth" feature registry (e.g., Python). | | `--target ` | **Required.** Path to the comparison registry (e.g., TypeScript). | | `--output ` | **Required.** Path for the output directory. The report filename is auto-generated. | -| `--report-type ` | `symmetric` (default) for Jaccard Index, `directional` for F1/Precision/Recall, or `raw` for CSV. | +| `--report-type ` | `md` (default) for Markdown Parity Report, or `raw` for CSV. | | `--alpha ` | Similarity threshold (0.0 - 1.0). Default is `0.8`. | #### How Matching Works @@ -99,26 +99,19 @@ The matcher uses the **Hungarian Algorithm** to find the optimal assignment betw #### Understanding the Reports -`adk-scope` can generate three types of reports to help you understand the feature overlap between two languages. +`adk-scope` generates two types of reports to help you understand the feature overlap between two languages. 
-##### Symmetric Report (`--report-type symmetric`) +##### Markdown Parity Report (`--report-type md`) -This report is best for measuring the general similarity between two feature sets, where neither is considered the "source of truth". It uses the **Jaccard Index** to calculate a global similarity score. +This report generates a human-readable Markdown file detailing the feature parity between two SDKs. -- **What it measures**: The Jaccard Index measures the similarity between two sets by dividing the size of their intersection by the size of their union. The score ranges from 0% (no similarity) to 100% (identical sets). -- **What it means**: A high Jaccard Index indicates that both languages have a very similar set of features, with few features unique to either one. It penalizes both missing and extra features equally. - -##### Directional Report (`--report-type directional`) - -This report is ideal when you have a "base" or "source of truth" language and you want to measure how well a "target" language conforms to it. It uses **Precision**, **Recall**, and **F1-Score**. - -- **Precision**: Answers the question: *"Of all the features implemented in the target language, how many of them are correct matches to features in the base language?"* A low score indicates the target has many extra features not present in the base. -- **Recall**: Answers the question: *"Of all the features that should be in the target language (i.e., all features in the base), how many were actually found?"* A low score indicates the target is missing many features from the base. -- **F1-Score**: The harmonic mean of Precision and Recall, providing a single score that balances both. A high F1-Score indicates the target is a close match to the base, having most of the required features and not too many extra ones. +- **Gap Analysis List**: A summary table that breaks down features into "Common Shared", "Exclusive to [Base Language]", and "Exclusive to [Target Language]". 
+- **Jaccard Score**: It calculates an overall similarity score using the Jaccard Index (Intersection over Union), providing a global metric of feature parity. +- **Module Breakdown**: It provides score details and status links on a per-module basis, highlighting exact matches, potential near-matches, and missing features. ##### Raw Report (`--report-type raw`) -This report provides a simple CSV output of all features (matched and unmatched) from both the base and target registries. It is useful for programmatic analysis or for importing the data into other tools.$ +This report provides a simple CSV output of all features (matched and unmatched) from both the base and target registries. It is useful for programmatic analysis or for importing the data into other tools. ## Development diff --git a/report.sh b/report.sh index 2850338..f7c99f3 100755 --- a/report.sh +++ b/report.sh @@ -4,7 +4,7 @@ set -e # Default values -REPORT_TYPE="symmetric" +REPORT_TYPE="md" ALPHA="0.8" VERBOSE="" @@ -66,7 +66,7 @@ if [ "$REPORT_TYPE" == "raw" ]; then else EXTENSION="md" fi -OUTPUT_FILENAME="${BASE_LANG}_${TARGET_LANG}_${REPORT_TYPE}.${EXTENSION}" +OUTPUT_FILENAME="${BASE_LANG}_${TARGET_LANG}.${EXTENSION}" FULL_OUTPUT_PATH="${OUTPUT_DIR}/${OUTPUT_FILENAME}" # Determine the directory where this script is located diff --git a/run.sh b/run.sh index a3f5884..60adb6e 100755 --- a/run.sh +++ b/run.sh @@ -13,10 +13,7 @@ echo "Extracting Go features..." # Py -> TS echo "Generating symmetric reports..." -./report.sh --base output/py.txtpb --target output/ts.txtpb --output ./output --report-type symmetric - -echo "Generating directional reports.. ." -./report.sh --base output/py.txtpb --target output/ts.txtpb --output ./output --report-type directional +./report.sh --base output/py.txtpb --target output/ts.txtpb --output ./output --report-type md echo "Generating raw reports..." 
./report.sh --base output/py.txtpb --target output/ts.txtpb --output ./output --report-type raw @@ -24,16 +21,15 @@ echo "Generating raw reports..." # Py -> Java echo "Generating symmetric reports..." -./report.sh --base output/py.txtpb --target output/java.txtpb --output ./output --report-type symmetric - -echo "Generating directional reports (py->java)..." -./report.sh --base output/py.txtpb --target output/java.txtpb --output ./output --report-type directional +./report.sh --base output/py.txtpb --target output/java.txtpb --output ./output --report-type md +echo "Generating raw reports..." +./report.sh --base output/py.txtpb --target output/java.txtpb --output ./output --report-type raw # Py -> Go echo "Generating symmetric reports..." -./report.sh --base output/py.txtpb --target output/go.txtpb --output ./output --report-type symmetric +./report.sh --base output/py.txtpb --target output/go.txtpb --output ./output --report-type md -echo "Generating directional reports (py->go)..." -./report.sh --base output/py.txtpb --target output/go.txtpb --output ./output --report-type directional \ No newline at end of file +echo "Generating raw reports..." 
+./report.sh --base output/py.txtpb --target output/go.txtpb --output ./output --report-type raw diff --git a/src/google/adk/scope/extractors/extractor_go.py b/src/google/adk/scope/extractors/extractor_go.py index 14848e0..207dd03 100644 --- a/src/google/adk/scope/extractors/extractor_go.py +++ b/src/google/adk/scope/extractors/extractor_go.py @@ -146,7 +146,21 @@ def extract_features( def get_version(repo_root: pathlib.Path) -> str: - """Get the module path from a go.mod file.""" + """Get the version of the ADK from internal/version/version.go.""" + version_path = repo_root / "internal" / "version" / "version.go" + if version_path.exists(): + try: + content = version_path.read_text() + for line in content.splitlines(): + if "const Version string =" in line: + # e.g., const Version string = "0.3.0" + parts = line.split('"') + if len(parts) >= 3: + return parts[1] + except Exception as e: + logger.warning("Failed to read version.go file: %s", e) + + # Fallback to reading go.mod module path if version isn't found go_mod_path = repo_root / "go.mod" if go_mod_path.exists(): try: @@ -156,4 +170,5 @@ def get_version(repo_root: pathlib.Path) -> str: return line.split()[1] except Exception as e: logger.warning("Failed to read go.mod file: %s", e) + return "" diff --git a/src/google/adk/scope/matcher/matcher.py b/src/google/adk/scope/matcher/matcher.py index 40ba4c1..2f67a43 100644 --- a/src/google/adk/scope/matcher/matcher.py +++ b/src/google/adk/scope/matcher/matcher.py @@ -132,7 +132,8 @@ def process_module( base_list: List[features_pb2.Feature], target_list: List[features_pb2.Feature], alpha: float, - report_type: str, + base_lang_name: str, + target_lang_name: str, base_lang_code: str, target_lang_code: str, ) -> Dict: @@ -149,13 +150,8 @@ def process_module( unmatched_base = base_list unmatched_target = target_list - if report_type == "symmetric": - union_size = mod_base_count + mod_target_count - mod_solid_count - mod_score = mod_solid_count / union_size if 
union_size > 0 else 1.0 - else: # directional - precision = stats.calculate_precision(mod_solid_count, mod_target_count) - recall = stats.calculate_recall(mod_solid_count, mod_base_count) - mod_score = stats.calculate_f1(precision, recall) + union_size = mod_base_count + mod_target_count - mod_solid_count + mod_score = mod_solid_count / union_size if union_size > 0 else 1.0 status_icon = ( "✅" if mod_score == 1.0 else "⚠️" if mod_score >= 0.8 else "❌" @@ -164,22 +160,16 @@ def process_module( module_filename = f"{module_safe_name}.md" details_link = f"[View Details]({{modules_dir}}/{module_filename})" - if report_type == "symmetric": - adk_parts = [] - if mod_base_count > 0: - adk_parts.append(base_lang_code) - if mod_target_count > 0: - adk_parts.append(target_lang_code) - adk_value = ", ".join(adk_parts) - row_content = ( - f"| {adk_value} | `{module}` | {mod_base_count} | {mod_score:.2%} |" - f" {status_icon} | {details_link} |" - ) - else: - row_content = ( - f"| `{module}` | {mod_base_count} | {mod_score:.2%} | {status_icon}" - f" | {details_link} |" - ) + adk_parts = [] + if mod_base_count > 0: + adk_parts.append(base_lang_code) + if mod_target_count > 0: + adk_parts.append(target_lang_code) + adk_value = ", ".join(adk_parts) + row_content = ( + f"| {adk_value} | `{module}` | {mod_base_count} | {mod_score:.2%} |" + f" {status_icon} | {details_link} |" + ) # Module Content mod_lines = [ @@ -188,21 +178,7 @@ def process_module( "", f"**Score:** {mod_score:.2%} ({status_icon})", ] - if report_type == "directional": - mod_lines.extend( - [ - "\n| Metric | Score |", - "|---|---|", - f"| **Precision** | {precision:.2%} |", - f"| **Recall** | {recall:.2%} |", - ] - ) - - mod_total_features = ( - (mod_base_count + mod_target_count - mod_solid_count) - if report_type == "symmetric" - else mod_base_count - ) + mod_total_features = mod_base_count + mod_target_count - mod_solid_count mod_lines.extend(["", f"**Features:** {mod_total_features}", ""]) solid_matches.sort( @@ 
-214,12 +190,11 @@ def process_module( if solid_matches: mod_lines.append( - f"### ✅ {'Solid' if report_type == 'symmetric' else 'Matched'}" - " Features" + "### ✅ Solid Features" ) mod_lines.extend( [ - "| Type | Base Feature | Target Feature | Similarity Score |", + f"| Type | {base_lang_name} Feature | {target_lang_name} Feature | Similarity Score |", "|---|---|---|---|", ] ) @@ -237,7 +212,7 @@ def process_module( mod_lines.extend( [ "### ⚠️ Potential Matches", - "| Type | Base Feature | Closest Target Candidate" + f"| Type | {base_lang_name} Feature | Closest {target_lang_name} Candidate" " | Similarity |", "|---|---|---|---|", ] @@ -252,7 +227,7 @@ def process_module( ) mod_lines.append("") - if report_type == "symmetric" and (unmatched_base or unmatched_target): + if unmatched_base or unmatched_target: mod_lines.extend( [ "### ❌ Unmatched Features", @@ -261,18 +236,10 @@ def process_module( ] ) mod_lines.extend( - [f"| `{_format_feature(f)}` | Target |" for f in unmatched_base] - ) - mod_lines.extend( - [f"| `{_format_feature(f)}` | Base |" for f in unmatched_target] - ) - mod_lines.append("") - elif report_type == "directional" and unmatched_base: - mod_lines.extend( - ["### ❌ Missing in Target", "| Missing Feature |", "|---|"] + [f"| `{_format_feature(f)}` | {target_lang_name} |" for f in unmatched_base] ) mod_lines.extend( - [f"| `{_format_feature(f)}` |" for f in unmatched_base] + [f"| `{_format_feature(f)}` | {base_lang_name} |" for f in unmatched_target] ) mod_lines.append("") diff --git a/src/google/adk/scope/reporter/reporter.py b/src/google/adk/scope/reporter/reporter.py index 4044a8b..1b686e8 100644 --- a/src/google/adk/scope/reporter/reporter.py +++ b/src/google/adk/scope/reporter/reporter.py @@ -37,7 +37,7 @@ def _group_features_by_module( def _get_language_code(language_name: str) -> str: """Returns a short code for the language.""" name = language_name.upper() - if name == {"PYTHON", "PY"}: + if name in {"PYTHON", "PY"}: return "py" elif name 
in {"TYPESCRIPT", "TS"}: return "ts" @@ -49,6 +49,21 @@ def _get_language_code(language_name: str) -> str: return name.lower() +def _get_language_name(language_name: str) -> str: + """Returns a properly capitalized display name for the language.""" + name = language_name.upper() + if name in {"PYTHON", "PY"}: + return "Python" + elif name in {"TYPESCRIPT", "TS"}: + return "TypeScript" + elif name == "JAVA": + return "Java" + elif name in {"GOLANG", "GO"}: + return "Go" + else: + return language_name.title() + + def _read_feature_registry(file_path: str) -> features_pb2.FeatureRegistry: """Reads a FeatureRegistry from a text proto file.""" registry = features_pb2.FeatureRegistry() @@ -61,7 +76,7 @@ def match_registries( base_registry: features_pb2.FeatureRegistry, target_registry: features_pb2.FeatureRegistry, alpha: float, - report_type: str = "symmetric", + report_type: str = "md", ) -> MatchResult: """Matches features and generates a master report + module sub-reports.""" reporter = ReportGenerator( @@ -92,10 +107,8 @@ def generate_report(self, report_type) -> MatchResult: """Generates report.""" if report_type == "raw": return self.generate_raw_report() - elif report_type == "directional": - return self.generate_directional_report() - elif report_type == "symmetric": - return self.generate_symmetric_report() + elif report_type == "md": + return self.generate_md_report() else: raise ValueError(f"Unknown report type: {report_type}") @@ -190,113 +203,18 @@ def esc_csv(s): module_files={}, ) - def generate_directional_report(self) -> MatchResult: - """Generates a directional report.""" - all_modules = sorted(self.features_base.keys()) - master_lines = [] - title_suffix = "Directional" - master_lines.extend( - [ - f"# Feature Matching Report: {title_suffix}", - f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", - "", - ( - f"**Base:** {self.base_registry.language}" - f" ({self.base_registry.version})" - ), - ( - f"**Target:** {self.target_registry.language}" - 
f" ({self.target_registry.version})" - ), - ] - ) - - global_score_idx = len(master_lines) - master_lines.append("GLOBAL_SCORE_PLACEHOLDER") - master_lines.append("") - - header = "| Module | Features (Base) | Score | Status | Details |" - divider = "|---|---|---|---|---|" - master_lines.extend(["## Module Summary", header, divider]) - - module_files = {} - module_rows = [] - total_solid_matches = 0 - - base_code = _get_language_code(self.base_registry.language) - target_code = _get_language_code(self.target_registry.language) - - for module in all_modules: - mod_base_list = self.features_base.get(module, []) - mod_target_list = self.features_target.get(module, []) - - results = matcher.process_module( - module, - mod_base_list, - mod_target_list, - self.alpha, - "directional", - base_code, - target_code, - ) - total_solid_matches += results["solid_matches_count"] - module_rows.append((results["score"], results["row_content"])) - if results.get("module_filename"): - module_files[results["module_filename"]] = results[ - "module_content" - ] - - module_rows.sort(key=lambda x: x[0], reverse=True) - master_lines.extend([row for _, row in module_rows]) - - total_base_features = len(self.base_registry.features) - total_target_features = len(self.target_registry.features) - - precision = stats.calculate_precision( - total_solid_matches, total_target_features - ) - recall = stats.calculate_recall( - total_solid_matches, total_base_features - ) - parity_score = stats.calculate_f1(precision, recall) - - global_stats = ( - "\n| Metric | Score |\n" - "|---|---|\n" - f"| **Precision** | {precision:.2%} |\n" - f"| **Recall** | {recall:.2%} |\n" - f"| **F1 Score** | {parity_score:.2%} |\n\n" - "> **Precision**: Of all features in the target, how many are " - "correct matches to the base? (High score = low number of extra " - "features in target)\n\n" - "> **Recall**: Of all features in the base, how many were found in " - "the target? 
(High score = low number of missing features in " - "target)\n\n" - "> **F1 Score**: A weighted average of Precision and Recall, " - "providing a single measure of how well the target feature set " - "matches the base." - ) - - master_lines[global_score_idx] = global_stats - - return MatchResult( - master_content="\n".join(master_lines).strip(), - module_files=module_files, - ) - - def generate_symmetric_report(self) -> MatchResult: - """Generates a symmetric report.""" + def generate_md_report(self) -> MatchResult: + """Generates a Markdown parity report.""" all_modules = sorted( set(self.features_base.keys()) | set(self.features_target.keys()) ) master_lines = [] - title_suffix = "Symmetric" master_lines.extend( [ - f"# Feature Matching Report: {title_suffix}", + "# Feature Matching Parity Report", f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", "", - "| Registry | Language | Version |", + "| Role | Language | Version |", "| :--- | :--- | :--- |", ( f"| **Base** | {self.base_registry.language} |" @@ -314,7 +232,10 @@ def generate_symmetric_report(self) -> MatchResult: master_lines.append("GLOBAL_SCORE_PLACEHOLDER") master_lines.append("") - header = "| ADK | Module | Features (Base) | Score | Status | Details |" + b_lang = _get_language_name(self.base_registry.language) + t_lang = _get_language_name(self.target_registry.language) + + header = f"| ADK | Module | Features ({b_lang}) | Score | Status | Details |" divider = "|---|---|---|---|---|---|" master_lines.extend(["## Module Summary", header, divider]) @@ -335,7 +256,8 @@ def generate_symmetric_report(self) -> MatchResult: mod_base_list, mod_target_list, self.alpha, - "symmetric", + b_lang, + t_lang, base_code, target_code, ) @@ -352,17 +274,28 @@ def generate_symmetric_report(self) -> MatchResult: total_base_features = len(self.base_registry.features) total_target_features = len(self.target_registry.features) - union_size = ( - total_base_features + total_target_features - total_solid_matches - ) - 
parity_score = ( - total_solid_matches / union_size if union_size > 0 else 1.0 - ) + # Calculate metrics for the summary table + base_exclusive = total_base_features - total_solid_matches + target_exclusive = total_target_features - total_solid_matches + + union_size = total_base_features + total_target_features - total_solid_matches + parity_score = total_solid_matches / union_size if union_size > 0 else 1.0 + + b_lang = _get_language_name(self.base_registry.language) + t_lang = _get_language_name(self.target_registry.language) + global_stats = ( - f"**Jaccard Index:** {parity_score:.2%}\n\n" - "> The Jaccard Index measures the similarity between the " - "two feature sets. A score of 100% indicates that both languages " - "have identical features." + "## Summary\n\n" + "| Feature Category | Count | Details |\n" + "| :--- | :--- | :--- |\n" + f"| **✅ Common Shared** | **{total_solid_matches}** | " + f"Implemented in both SDKs |\n" + f"| **📦 Exclusive to `{b_lang}`** | **{base_exclusive}** | " + f"Requires implementation in `{t_lang}` |\n" + f"| **📦 Exclusive to `{t_lang}`** | **{target_exclusive}** | " + f"Requires implementation in `{b_lang}` |\n" + f"| **📊 Jaccard Score** | **{parity_score:.2%}** | " + f"Overall Parity ({total_solid_matches} / {union_size}) |" ) master_lines[global_score_idx] = global_stats @@ -400,9 +333,9 @@ def main(): ) parser.add_argument( "--report-type", - choices=["symmetric", "directional", "raw"], - default="symmetric", - help="Type of gap report to generate (symmetric, directional, or raw).", + choices=["md", "raw"], + default="md", + help="Type of gap report to generate (md or raw).", ) adk_args.add_verbose_argument(parser) args = parser.parse_args() diff --git a/test/adk/scope/reporter/test_reporter.py b/test/adk/scope/reporter/test_reporter.py index f3f0223..0198471 100644 --- a/test/adk/scope/reporter/test_reporter.py +++ b/test/adk/scope/reporter/test_reporter.py @@ -106,24 +106,29 @@ def test_match_registries(self): ) 
target_registry.features.extend([f2, f_near_target]) - # Test Symmetric Report - result_sym = reporter.match_registries( - base_registry, target_registry, 0.9, report_type="symmetric" + # Test Markdown Report + result_md = reporter.match_registries( + base_registry, target_registry, 0.9, report_type="md" ) - report_sym = result_sym.master_content + report_md = result_md.master_content # 1. Verify Master Report Structure - self.assertIn("# Feature Matching Report: Symmetric", report_sym) - self.assertIn("**Jaccard Index:** 25.00%", report_sym) - self.assertIn("## Module Summary", report_sym) + self.assertIn("# Feature Matching Parity Report", report_md) + self.assertIn("## Summary", report_md) + self.assertIn("| **✅ Common Shared** | **1** |", report_md) + self.assertIn("| **📦 Exclusive to `Python`** | **2** |", report_md) + self.assertIn("| **📦 Exclusive to `TypeScript`** | **1** |", report_md) + self.assertIn("| **📊 Jaccard Score** | **25.00%** |", report_md) + self.assertIn("## Module Summary", report_md) # Check for module entry in master summary - self.assertIn("| `n_same` |", report_sym) - self.assertIn("[View Details]({modules_dir}/n_same.md)", report_sym) + self.assertIn("| ADK | Module | Features (Python) | Score | Status | Details |", report_md) + self.assertIn("| `n_same` |", report_md) + self.assertIn("[View Details]({modules_dir}/n_same.md)", report_md) # 2. 
Verify Module Content - self.assertIn("n_same.md", result_sym.module_files) - module_content = result_sym.module_files["n_same.md"] + self.assertIn("n_same.md", result_md.module_files) + module_content = result_md.module_files["n_same.md"] self.assertIn("# Module: `n_same`", module_content) self.assertIn("**Features:** 3", module_content) @@ -131,7 +136,7 @@ def test_match_registries(self): # Solid Matches self.assertIn("### ✅ Solid Features", module_content) self.assertIn( - "| Type | Base Feature | Target Feature | Similarity Score |", + "| Type | Python Feature | TypeScript Feature | Similarity Score |", module_content, ) self.assertIn( @@ -142,7 +147,7 @@ def test_match_registries(self): # Potential Matches (formerly Near Misses) self.assertIn("### ⚠️ Potential Matches", module_content) self.assertIn( - "| Type | Base Feature | Closest Target Candidate | Similarity |", + "| Type | Python Feature | Closest TypeScript Candidate | Similarity |", module_content, ) self.assertIn( @@ -152,51 +157,13 @@ def test_match_registries(self): ) # Unmatched / Gaps (in 'stuff' module) - self.assertIn("stuff.md", result_sym.module_files) - stuff_content = result_sym.module_files["stuff.md"] + self.assertIn("stuff.md", result_md.module_files) + stuff_content = result_md.module_files["stuff.md"] self.assertIn("### ❌ Unmatched Features", stuff_content) - self.assertIn("| `totally_diff` | Target |", stuff_content) + self.assertIn("| `totally_diff` | TypeScript |", stuff_content) self.assertIn("**Features:** 1", stuff_content) - # Test Directional Report - result_dir = reporter.match_registries( - base_registry, target_registry, 0.9, report_type="directional" - ) - report_dir = result_dir.master_content - - self.assertIn("| **F1 Score** | 40.00% |", report_dir) - self.assertIn("n_same.md", result_dir.module_files) - - mod_dir_content = result_dir.module_files["n_same.md"] - - # Solid Matches - self.assertIn("### ✅ Matched Features", mod_dir_content) - self.assertIn( - "| Type | Base 
Feature | Target Feature | Similarity Score |", - mod_dir_content, - ) - self.assertIn( - "| method | `BaseClass.fSameBase` | `TargetClass.fSameTarget` |", - mod_dir_content, - ) - - # Potential Matches - self.assertIn("### ⚠️ Potential Matches", mod_dir_content) - self.assertIn( - "| Type | Base Feature | Closest Target Candidate | Similarity |", - mod_dir_content, - ) - self.assertIn( - "| method | `base_member.base_name` | " - "`target_member.target_name` |", - mod_dir_content, - ) - # Unmatched / Gaps (in 'stuff' module) - self.assertIn("stuff.md", result_dir.module_files) - stuff_dir_content = result_dir.module_files["stuff.md"] - self.assertIn("### ❌ Missing in Target", stuff_dir_content) - self.assertIn("| `totally_diff` |", stuff_dir_content) def test_match_registries_raw(self): f1 = features_pb2.Feature( @@ -216,7 +183,7 @@ def test_match_registries_raw(self): csv_content = result.master_content expected_header = ( - "python_namespace,python_member_of,python_name,ts_namespace," + "py_namespace,py_member_of,py_name,ts_namespace," "ts_member_of,ts_name,type,score" ) self.assertIn(expected_header, csv_content) @@ -273,7 +240,8 @@ def test_process_module(self): base_list=[f_base], target_list=[f_target], alpha=0.9, - report_type="symmetric", + base_lang_name="Python", + target_lang_name="TypeScript", base_lang_code="py", target_lang_code="ts", ) @@ -312,13 +280,13 @@ def test_generate_raw_report(self): ).generate_raw_report() self.assertIn( - "python_namespace,python_member_of,python_name", + "py_namespace,py_member_of,py_name", result.master_content, ) self.assertIn("n1,c1,f1_base", result.master_content) - def test_generate_symmetric_report(self): - """Tests the symmetric report generation.""" + def test_generate_md_report(self): + """Tests the md report generation.""" base_registry = features_pb2.FeatureRegistry( language="Python", version="1.0.0" ) @@ -341,49 +309,17 @@ def test_generate_symmetric_report(self): result = reporter.ReportGenerator( 
base_registry, target_registry, 0.9 - ).generate_symmetric_report() + ).generate_md_report() self.assertIn( - "# Feature Matching Report: Symmetric", result.master_content + "# Feature Matching Parity Report", result.master_content ) - self.assertIn("**Jaccard Index:**", result.master_content) + self.assertIn("## Summary", result.master_content) self.assertIn("## Module Summary", result.master_content) self.assertIn("| `n1` |", result.master_content) self.assertIn("n1.md", result.module_files) - def test_generate_directional_report(self): - """Tests the directional report generation.""" - base_registry = features_pb2.FeatureRegistry( - language="Python", version="1.0.0" - ) - f1 = base_registry.features.add() - f1.namespace = "n1" - target_registry = features_pb2.FeatureRegistry( - language="TypeScript", version="2.0.0" - ) - - with patch( - "google.adk.scope.reporter.reporter.matcher.process_module" - ) as mock_process: - mock_process.return_value = { - "solid_matches_count": 1, - "score": 1.0, - "row_content": "| `n1` | 1 | 100.00% | ✅ | n1.md |", - "module_filename": "n1.md", - "module_content": "# Module: `n1`", - } - - result = reporter.ReportGenerator( - base_registry, target_registry, 0.9 - ).generate_directional_report() - self.assertIn( - "# Feature Matching Report: Directional", result.master_content - ) - self.assertIn("| **F1 Score** |", result.master_content) - self.assertIn("## Module Summary", result.master_content) - self.assertIn("| `n1` |", result.master_content) - self.assertIn("n1.md", result.module_files) def test_raw_integration(self): """Tests the raw report generation end-to-end.""" @@ -460,7 +396,7 @@ def test_raw_integration(self): ).generate_raw_report() self.assertIn( - "python_namespace,python_member_of,python_name,ts_namespace,ts_member_of,ts_name,type,score", + "py_namespace,py_member_of,py_name,ts_namespace,ts_member_of,ts_name,type,score", result.master_content, ) From 537aae10f801f6170e83cd5c236ffe9b1f88853f Mon Sep 17 00:00:00 
2001 From: Shahin Saadati Date: Fri, 13 Feb 2026 14:37:51 -0800 Subject: [PATCH 2/9] Improved go extractor --- report.sh | 47 +++- run.sh | 5 + .../adk/scope/extractors/converter_go.py | 35 ++- .../adk/scope/extractors/converter_py.py | 10 +- .../adk/scope/extractors/extractor_go.py | 67 +++-- src/google/adk/scope/reporter/reporter.py | 243 ++++++++++++++++-- .../adk/scope/extractors/test_extractor_py.py | 62 +++++ test/adk/scope/reporter/test_reporter.py | 70 ++++- 8 files changed, 482 insertions(+), 57 deletions(-) diff --git a/report.sh b/report.sh index f7c99f3..4b1a4a4 100755 --- a/report.sh +++ b/report.sh @@ -7,8 +7,10 @@ set -e REPORT_TYPE="md" ALPHA="0.8" VERBOSE="" +COMMON="" # Parse arguments +REGISTRIES=() while [[ "$#" -gt 0 ]]; do case "$1" in --base) @@ -19,6 +21,13 @@ while [[ "$#" -gt 0 ]]; do TARGET_FILE="$2" shift 2 ;; + --registries) + shift + while [[ "$#" -gt 0 && ! "$1" =~ ^-- ]]; do + REGISTRIES+=("$1") + shift + done + ;; --output) OUTPUT_DIR="$2" shift 2 @@ -35,6 +44,10 @@ while [[ "$#" -gt 0 ]]; do VERBOSE="--verbose" shift ;; + --common) + COMMON="--common" + shift + ;; *) echo "Unknown option: $1" exit 1 @@ -42,10 +55,6 @@ while [[ "$#" -gt 0 ]]; do esac done -# Extract languages -BASE_LANG_RAW=$(head -n 1 "${BASE_FILE}" | grep -o 'language: "[A-Z]*"' | grep -o '"[A-Z]*"' | tr -d '"') -TARGET_LANG_RAW=$(head -n 1 "${TARGET_FILE}" | grep -o 'language: "[A-Z]*"' | grep -o '"[A-Z]*"' | tr -d '"') - # Function to map language to short code get_lang_code() { case "$1" in @@ -57,8 +66,21 @@ get_lang_code() { esac } -BASE_LANG=$(get_lang_code "$BASE_LANG_RAW") -TARGET_LANG=$(get_lang_code "$TARGET_LANG_RAW") +if [[ ${#REGISTRIES[@]} -eq 0 && -n "$BASE_FILE" && -n "$TARGET_FILE" ]]; then + REGISTRIES+=("$BASE_FILE" "$TARGET_FILE") +fi + +if [[ ${#REGISTRIES[@]} -lt 2 ]]; then + echo "Error: Must provide at least two registries via --registries or --base/--target" + exit 1 +fi + +# Extract languages and construct filename +LANG_CODES=() +for 
REG_FILE in "${REGISTRIES[@]}"; do + LANG_RAW=$(head -n 1 "${REG_FILE}" | grep -o 'language: "[A-Z]*"' | grep -o '"[A-Z]*"' | tr -d '"') + LANG_CODES+=($(get_lang_code "$LANG_RAW")) +done # Construct filename if [ "$REPORT_TYPE" == "raw" ]; then @@ -66,7 +88,14 @@ if [ "$REPORT_TYPE" == "raw" ]; then else EXTENSION="md" fi -OUTPUT_FILENAME="${BASE_LANG}_${TARGET_LANG}.${EXTENSION}" + +if [ "$REPORT_TYPE" == "matrix" ]; then + # e.g., py_ts_go.md + OUTPUT_FILENAME="$(IFS=_; echo "${LANG_CODES[*]}").${EXTENSION}" +else + OUTPUT_FILENAME="${LANG_CODES[0]}_${LANG_CODES[1]}.${EXTENSION}" +fi + FULL_OUTPUT_PATH="${OUTPUT_DIR}/${OUTPUT_FILENAME}" # Determine the directory where this script is located @@ -77,9 +106,9 @@ export PYTHONPATH="${SCRIPT_DIR}/src:${PYTHONPATH}" # Run the python matcher python3 "${SCRIPT_DIR}/src/google/adk/scope/reporter/reporter.py" \ - --base "${BASE_FILE}" \ - --target "${TARGET_FILE}" \ + --registries "${REGISTRIES[@]}" \ --output "${FULL_OUTPUT_PATH}" \ --report-type "${REPORT_TYPE}" \ --alpha "${ALPHA}" \ + ${COMMON} \ ${VERBOSE} diff --git a/run.sh b/run.sh index 60adb6e..448103a 100755 --- a/run.sh +++ b/run.sh @@ -33,3 +33,8 @@ echo "Generating symmetric reports..." echo "Generating raw reports..." ./report.sh --base output/py.txtpb --target output/go.txtpb --output ./output --report-type raw + +# Matrix reports + +echo "Generating matrix reports..." 
+./report.sh --registries output/py.txtpb output/ts.txtpb output/java.txtpb output/go.txtpb --output ./output --report-type matrix \ No newline at end of file diff --git a/src/google/adk/scope/extractors/converter_go.py b/src/google/adk/scope/extractors/converter_go.py index f7ed2d2..927bfcb 100644 --- a/src/google/adk/scope/extractors/converter_go.py +++ b/src/google/adk/scope/extractors/converter_go.py @@ -29,7 +29,8 @@ def process( normalized_namespace: str, ) -> Optional[feature_pb2.Feature]: """Convert a Tree-sitter node into a Feature.""" - if node.type not in ("function_declaration", "method_declaration"): + valid_nodes = ("function_declaration", "method_declaration", "method_elem") + if node.type not in valid_nodes: return None original_name = self._extract_name(node) @@ -56,10 +57,18 @@ def process( "New" ): feature_type = feature_pb2.Feature.Type.CONSTRUCTOR + elif node.type == "method_elem": + feature_type = feature_pb2.Feature.Type.INSTANCE_METHOD + member_of = self._extract_interface_name(node) + normalized_member_of = ( + normalize_name(member_of) if member_of else "" + ) parameters = self._extract_params(node) original_returns, normalized_returns = self._extract_return_types(node) + + docstring = self._extract_docstring(node) feature = feature_pb2.Feature( original_name=original_name, @@ -75,8 +84,32 @@ def process( normalized_return_types=normalized_returns, ) + if docstring: + feature.description = docstring + return feature + def _extract_docstring(self, node: Node) -> str: + """Extract comments immediately preceding the declaration.""" + comments = [] + prev = node.prev_sibling + while prev and prev.type == "comment": + clean_comment = prev.text.decode("utf-8").lstrip("//").strip() + comments.insert(0, clean_comment) + prev = prev.prev_sibling + return "\n".join(comments) + + def _extract_interface_name(self, node: Node) -> str: + """Walk up the AST from a method_spec to find the interface type name.""" + parent = node.parent + while parent: + if 
parent.type == "type_spec": + name_node = parent.child_by_field_name("name") + if name_node: + return name_node.text.decode("utf-8") + parent = parent.parent + return "" + def _extract_receiver_type(self, node: Node) -> str: """Extract the receiver type from a method_declaration.""" receiver_node = node.child_by_field_name("receiver") diff --git a/src/google/adk/scope/extractors/converter_py.py b/src/google/adk/scope/extractors/converter_py.py index cc3c540..127c70d 100644 --- a/src/google/adk/scope/extractors/converter_py.py +++ b/src/google/adk/scope/extractors/converter_py.py @@ -54,6 +54,11 @@ def process( # 2. Context member_of, normalized_member_of = self._extract_member_of(node) + + # If the member belongs to a private class, skip it + if member_of and member_of.startswith("_"): + logger.debug("Skipping method %s of private class %s", original_name, member_of) + return None feature_type = self._determine_type( node, original_name, bool(member_of) @@ -142,6 +147,10 @@ def _extract_name(self, node: Node) -> str: def _process_dataclass( self, node: Node, file_path: Path, repo_root: Path ) -> Optional[feature_pb2.Feature]: + original_name = self._extract_name(node) + if not original_name or original_name.startswith("_"): + return None + body_node = node.child_by_field_name("body") if not body_node: return None @@ -201,7 +210,6 @@ def _process_dataclass( if has_init or not params: return None - original_name = self._extract_name(node) normalized_name = normalize_name(original_name) namespace, normalized_namespace = self._extract_namespace( file_path, repo_root diff --git a/src/google/adk/scope/extractors/extractor_go.py b/src/google/adk/scope/extractors/extractor_go.py index 207dd03..fb3f5b7 100644 --- a/src/google/adk/scope/extractors/extractor_go.py +++ b/src/google/adk/scope/extractors/extractor_go.py @@ -73,43 +73,58 @@ def extract_features( query_text = """ (function_declaration) @func (method_declaration) @method + (type_declaration + (type_spec + name: 
(type_identifier) @interface_name + type: (interface_type + (method_elem) @interface_method + ) + ) + ) """ query = Query(GO_LANGUAGE, query_text) cursor = QueryCursor(query) captures = cursor.captures(root_node) all_nodes = [] - for node_list in captures.values(): - all_nodes.extend(node_list) + # We only want to process the actual function/method nodes, not the interface names + # which are captured just for context by the processor (via tree traversal). + for capture_name, node_list in captures.items(): + if capture_name in ("func", "method", "interface_method"): + all_nodes.extend(node_list) # Log results for debugging logger.debug("Found %d potential nodes in %s", len(all_nodes), file_path) for node in all_nodes: - # Filter out simple functions (e.g., getters, setters) by checking - # the body. Note: In Go AST, the function 'body' is a 'block' which - # contains a 'statement_list'. We need to check the size of the - # 'statement_list' to know the actual number of statements. - body_node = node.child_by_field_name("body") - if body_node: - stmt_list = next( - ( - child - for child in body_node.children - if child.type == "statement_list" - ), - None, - ) - # If there is no statement list, or it has 1 or fewer statements, - # consider it simple. - if stmt_list is None or stmt_list.named_child_count <= 1: - function_name_node = node.child_by_field_name("name") - if function_name_node: - logger.debug( - "Skipping simple function: %s", - function_name_node.text.decode("utf8"), - ) - continue + # Prevent filtering out abstract interface methods which have no body + if node.type == "method_elem": + pass + else: + # Filter out simple functions (e.g., getters, setters) by checking + # the body. Note: In Go AST, the function 'body' is a 'block' which + # contains a 'statement_list'. We need to check the size of the + # 'statement_list' to know the actual number of statements. 
+ body_node = node.child_by_field_name("body") + if body_node: + stmt_list = next( + ( + child + for child in body_node.children + if child.type == "statement_list" + ), + None, + ) + # If there is no statement list, or it has 1 or fewer statements, + # consider it simple. + if stmt_list is None or stmt_list.named_child_count <= 1: + function_name_node = node.child_by_field_name("name") + if function_name_node: + logger.debug( + "Skipping simple function: %s", + function_name_node.text.decode("utf8"), + ) + continue # Prepare namespace and normalized namespace try: diff --git a/src/google/adk/scope/reporter/reporter.py b/src/google/adk/scope/reporter/reporter.py index 1b686e8..00eb7a5 100644 --- a/src/google/adk/scope/reporter/reporter.py +++ b/src/google/adk/scope/reporter/reporter.py @@ -73,21 +73,214 @@ def _read_feature_registry(file_path: str) -> features_pb2.FeatureRegistry: def match_registries( - base_registry: features_pb2.FeatureRegistry, - target_registry: features_pb2.FeatureRegistry, + registries: List[features_pb2.FeatureRegistry], alpha: float, report_type: str = "md", ) -> MatchResult: - """Matches features and generates a master report + module sub-reports.""" - reporter = ReportGenerator( - base_registry, - target_registry, - alpha, - ) + """Matches features and generates reports.""" + if report_type == "matrix": + reporter = MatrixReportGenerator(registries, alpha) + else: + if len(registries) != 2: + raise ValueError(f"Report type '{report_type}' requires exactly 2 registries.") + reporter = ReportGenerator( + registries[0], + registries[1], + alpha, + ) return reporter.generate_report(report_type) +class MatrixReportGenerator: + def __init__( + self, + registries: List[features_pb2.FeatureRegistry], + alpha: float, + ): + self.registries = registries + self.alpha = alpha + + self.langs = [_get_language_name(r.language) for r in self.registries] + + def _compute_jaccard_matrix(self) -> List[str]: + n = len(self.registries) + matrix = [[0.0] * n 
for _ in range(n)] + + for i in range(n): + for j in range(n): + if i == j: + matrix[i][j] = 1.0 + continue + if i > j: + matrix[i][j] = matrix[j][i] + continue + + # compute intersection + r_base = self.registries[i] + r_target = self.registries[j] + + features_base = _group_features_by_module(r_base) + features_target = _group_features_by_module(r_target) + matcher.fuzzy_match_namespaces(features_base, features_target) + + all_modules = set(features_base.keys()) | set(features_target.keys()) + total_solid = 0 + for mod in all_modules: + b_list = list(features_base.get(mod, [])) + t_list = list(features_target.get(mod, [])) + solid_matches = matcher.match_features(b_list, t_list, self.alpha) + total_solid += len(solid_matches) + + total_base = len(r_base.features) + total_target = len(r_target.features) + union_size = total_base + total_target - total_solid + + score = total_solid / union_size if union_size > 0 else 1.0 + matrix[i][j] = score + + lines = [ + "## Global Parity Matrix", + "", + "| Language | " + " | ".join(self.langs) + " |", + "| :--- |" + " :--- |" * n + ] + + for i in range(n): + row = [f"**{self.langs[i]}**"] + for j in range(n): + if i == j: + row.append("-") + else: + row.append(f"{matrix[i][j]:.2%}") + lines.append("| " + " | ".join(row) + " |") + + lines.append("") + return lines + + def _build_global_feature_matrix(self) -> List[str]: + # CrossLanguageFeature: dict mapping lang_idx -> Feature + global_features: List[Dict[int, features_pb2.Feature]] = [] + + # 1. Initialize with Anchor (index 0) + anchor_registry = self.registries[0] + for f in anchor_registry.features: + global_features.append({0: f}) + + # 2. 
Iteratively align remaining registries + for i in range(1, len(self.registries)): + target_registry = self.registries[i] + + # Group current global features by module and target features by module + global_by_mod = defaultdict(list) + for row in global_features: + # Use the feature representation from the earliest language that has it + rep_idx = min(row.keys()) + rep_f = row[rep_idx] + mod = rep_f.normalized_namespace or rep_f.namespace or "Unknown Module" + global_by_mod[mod].append((row, rep_f)) + + target_by_mod = _group_features_by_module(target_registry) + + # We must remap namespaces just for matching purposes in this step + # We'll build temporary Dict[str, List[Feature]] for namespaces + g_ns_dict = {mod: [f for _, f in lst] for mod, lst in global_by_mod.items()} + matcher.fuzzy_match_namespaces(g_ns_dict, target_by_mod) + + all_modules = set(g_ns_dict.keys()) | set(target_by_mod.keys()) + + for mod in all_modules: + base_tuples = global_by_mod.get(mod, []) # list of (row_dict, Feature) + b_list = [f for _, f in base_tuples] + t_list = target_by_mod.get(mod, []) + + # Match + solid_matches = matcher.match_features(b_list, t_list, self.alpha) + + # Record matches + for b_f, t_f, _ in solid_matches: + # Find the original row dict that owns b_f + for row_dict, feat in base_tuples: + if feat is b_f: + row_dict[i] = t_f + break + + # Record unmatched targets as new rows + # t_list was mutated by match_features (items removed) + for t_f in t_list: + global_features.append({i: t_f}) + + # 3. 
Render table grouped by Module + # Regroup final global features by module for rendering + final_by_mod = defaultdict(list) + for row in global_features: + rep_idx = min(row.keys()) + rep_f = row[rep_idx] + mod = rep_f.normalized_namespace or rep_f.namespace or "Unknown Module" + final_by_mod[mod].append(row) + + lines = ["## Global Feature Support", ""] + + for mod in sorted(final_by_mod.keys()): + lines.append(f"### Module: `{mod}`") + header = "| Feature | Type | " + " | ".join(self.langs) + " |" + divider = "| :--- | :--- |" + " :---: |" * len(self.langs) + lines.extend([header, divider]) + + # sort features in module + def get_sort_key(row): + rep_idx = min(row.keys()) + rep_f = row[rep_idx] + return (matcher._get_type_priority(rep_f), rep_f.normalized_name or "") + + mod_rows = final_by_mod[mod] + mod_rows.sort(key=get_sort_key) + + for row in mod_rows: + rep_idx = min(row.keys()) + rep_f = row[rep_idx] + f_name = matcher._format_feature(rep_f) + f_type = matcher.get_type_display_name(rep_f) + + row_cells = [f"`{f_name}`", f_type] + for i in range(len(self.registries)): + if i in row: + row_cells.append("✅") + else: + row_cells.append("❌") + + lines.append("| " + " | ".join(row_cells) + " |") + + lines.append("") + + return lines + + def generate_report(self, report_type: str = "matrix") -> MatchResult: + master_lines = [ + "# Multi-SDK Feature Matrix Report", + f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", + "", + "## Registries", + "| Role | Language | Version |", + "| :--- | :--- | :--- |" + ] + + for idx, r in enumerate(self.registries): + role_marker = "Anchor" if idx == 0 else f"Comparison {idx}" + master_lines.append( + f"| **{role_marker}** | {self.langs[idx]} | {r.version} |" + ) + + master_lines.append("") + master_lines.extend(self._compute_jaccard_matrix()) + master_lines.extend(self._build_global_feature_matrix()) + + return MatchResult( + master_content="\n".join(master_lines).strip(), + module_files={}, + ) + + class ReportGenerator: 
def __init__( self, @@ -312,14 +505,20 @@ def main(): ) parser.add_argument( "--base", - required=True, + required=False, help="Path to the base FeatureRegistry .txtpb file.", ) parser.add_argument( "--target", - required=True, + required=False, help="Path to the target FeatureRegistry .txtpb file.", ) + parser.add_argument( + "--registries", + nargs="+", + required=False, + help="Paths to multiple FeatureRegistry .txtpb files.", + ) parser.add_argument( "--output", required=True, @@ -333,24 +532,34 @@ def main(): ) parser.add_argument( "--report-type", - choices=["md", "raw"], + choices=["md", "raw", "matrix"], default="md", - help="Type of gap report to generate (md or raw).", + help="Type of gap report to generate (md, raw, matrix).", ) adk_args.add_verbose_argument(parser) args = parser.parse_args() adk_args.configure_logging(args) try: - base_registry = _read_feature_registry(args.base) - target_registry = _read_feature_registry(args.target) + registry_paths = [] + if args.registries: + registry_paths.extend(args.registries) + elif args.base and args.target: + registry_paths.extend([args.base, args.target]) + else: + logging.error("Must provide either --registries or both --base and --target") + sys.exit(1) + + if len(registry_paths) < 2: + logging.error("Must provide at least 2 registries to compare.") + sys.exit(1) + + registries = [_read_feature_registry(p) for p in registry_paths] except Exception as e: logging.error(f"Error reading feature registries: {e}") sys.exit(1) - result = match_registries( - base_registry, target_registry, args.alpha, args.report_type - ) + result = match_registries(registries, args.alpha, args.report_type) output_path = Path(args.output) output_path.parent.mkdir(parents=True, exist_ok=True) diff --git a/test/adk/scope/extractors/test_extractor_py.py b/test/adk/scope/extractors/test_extractor_py.py index 165d7f4..f9ed5b4 100644 --- a/test/adk/scope/extractors/test_extractor_py.py +++ 
b/test/adk/scope/extractors/test_extractor_py.py @@ -95,6 +95,68 @@ def test_extract_features_read_error(self): features = extract_features(mock_path, Path("/repo"), ".") self.assertEqual(features, []) + @patch("google.adk.scope.extractors.extractor_py.QueryCursor") + @patch("google.adk.scope.extractors.extractor_py.Query") + @patch("google.adk.scope.extractors.extractor_py.PARSER") + def test_private_classes_filtered( + self, mock_parser, mock_query_cls, mock_cursor_cls + ): + mock_path = MagicMock(spec=Path) + mock_path.read_bytes.return_value = b"class _PrivateClass: pass" + + mock_tree = MagicMock() + mock_parser.parse.return_value = mock_tree + mock_tree.root_node = MagicMock() + + mock_cursor_instance = mock_cursor_cls.return_value + + mock_node = MagicMock() + mock_node.type = "class_definition" + + # Simulate query returning the private class + mock_cursor_instance.captures.return_value = {"class": [mock_node]} + + with patch("google.adk.scope.extractors.extractor_py.NodeProcessor") as MockProcessor: + processor_instance = MockProcessor.return_value + # The processor returns None for private classes + processor_instance.process.return_value = None + + features = extract_features(mock_path, Path("/repo"), ".") + + self.assertEqual(features, []) + processor_instance.process.assert_called_once() + + @patch("google.adk.scope.extractors.extractor_py.QueryCursor") + @patch("google.adk.scope.extractors.extractor_py.Query") + @patch("google.adk.scope.extractors.extractor_py.PARSER") + def test_private_class_methods_filtered( + self, mock_parser, mock_query_cls, mock_cursor_cls + ): + mock_path = MagicMock(spec=Path) + mock_path.read_bytes.return_value = b"class _PrivateClass:\n def method(self): pass" + + mock_tree = MagicMock() + mock_parser.parse.return_value = mock_tree + mock_tree.root_node = MagicMock() + + mock_cursor_instance = mock_cursor_cls.return_value + + mock_node = MagicMock() + mock_node.type = "function_definition" + + # Simulate query returning the 
method + mock_cursor_instance.captures.return_value = {"func": [mock_node]} + + with patch("google.adk.scope.extractors.extractor_py.NodeProcessor") as MockProcessor: + processor_instance = MockProcessor.return_value + # The processor returns None for methods in private classes + processor_instance.process.return_value = None + + features = extract_features(mock_path, Path("/repo"), ".") + + self.assertEqual(features, []) + processor_instance.process.assert_called_once() + if __name__ == "__main__": unittest.main() diff --git a/test/adk/scope/reporter/test_reporter.py b/test/adk/scope/reporter/test_reporter.py index 0198471..89e95e5 100644 --- a/test/adk/scope/reporter/test_reporter.py +++ b/test/adk/scope/reporter/test_reporter.py @@ -108,7 +108,7 @@ def test_match_registries(self): # Test Markdown Report result_md = reporter.match_registries( - base_registry, target_registry, 0.9, report_type="md" + [base_registry, target_registry], 0.9, report_type="md" ) report_md = result_md.master_content @@ -163,9 +163,73 @@ def test_match_registries(self): self.assertIn("| `totally_diff` | TypeScript |", stuff_content) self.assertIn("**Features:** 1", stuff_content) + def test_matrix_report(self): + f_py = features_pb2.Feature( + original_name="f", + normalized_name="f", + member_of="c", + normalized_member_of="c", + normalized_namespace="n", + type=features_pb2.Feature.Type.FUNCTION, + ) + f_ts = features_pb2.Feature( + original_name="f", + normalized_name="f", + member_of="c", + normalized_member_of="c", + normalized_namespace="n", + type=features_pb2.Feature.Type.FUNCTION, + ) + # Go only matches partially (different name) or provides a new feature + f_go1 = features_pb2.Feature( + original_name="new_f", + normalized_name="new_f", + member_of="c", + normalized_member_of="c", + normalized_namespace="n", + type=features_pb2.Feature.Type.FUNCTION, + ) + r_py = features_pb2.FeatureRegistry(language="Python", version="1") + r_py.features.append(f_py) + + r_ts = 
features_pb2.FeatureRegistry(language="TypeScript", version="2") + r_ts.features.append(f_ts) + + r_go = features_pb2.FeatureRegistry(language="Go", version="3") + r_go.features.append(f_go1) + + result_matrix = reporter.match_registries( + [r_py, r_ts, r_go], 0.9, report_type="matrix" + ) + + report_md = result_matrix.master_content + + # 1. Check title & headers + self.assertIn("# Multi-SDK Feature Matrix Report", report_md) + self.assertIn("| **Anchor** | Python | 1 |", report_md) + self.assertIn("| **Comparison 1** | TypeScript | 2 |", report_md) + self.assertIn("| **Comparison 2** | Go | 3 |", report_md) + + # 2. Check Jaccard Matrix + self.assertIn("## Global Parity Matrix", report_md) + self.assertIn("| Language | Python | TypeScript | Go |", report_md) + # Py vs TS should be 100% since they both only have 'f' + self.assertIn("| **Python** | - | 100.00% | 0.00% |", report_md) + # Py/TS vs Go should be 0% since Go has 'new_f' entirely disjoint + self.assertIn("| **Go** | 0.00% | 0.00% | - |", report_md) + + # 3. 
Check Global Feature Matrix + self.assertIn("## Global Feature Support", report_md) + self.assertIn("### Module: `n`", report_md) + self.assertIn("| Feature | Type | Python | TypeScript | Go |", report_md) + + # 'f' should be yes for Py/Ts, no for Go + self.assertIn("| `c.f` | function | ✅ | ✅ | ❌ |", report_md) + + # 'new_f' should be no for Py/Ts, yes for Go + self.assertIn("| `c.new_f` | function | ❌ | ❌ | ✅ |", report_md) - def test_match_registries_raw(self): f1 = features_pb2.Feature( original_name="f_same", normalized_name="f_same", @@ -179,7 +243,7 @@ def test_match_registries_raw(self): target = features_pb2.FeatureRegistry(language="TS", version="2") target.features.append(f1) - result = reporter.match_registries(base, target, 0.9, report_type="raw") + result = reporter.match_registries([base, target], 0.9, report_type="raw") csv_content = result.master_content expected_header = ( From 26ee53f7df56d56f74d2d0446b80deeed5dc11d6 Mon Sep 17 00:00:00 2001 From: Shahin Saadati Date: Tue, 17 Feb 2026 11:50:50 -0800 Subject: [PATCH 3/9] Improved go extractor --- run.sh | 2 +- .../adk/scope/extractors/converter_go.py | 8 +++-- .../adk/scope/extractors/extractor_go.py | 31 +++++++++++++------ src/google/adk/scope/reporter/reporter.py | 22 +++++++++++-- .../adk/scope/extractors/test_extractor_go.py | 4 +++ 5 files changed, 52 insertions(+), 15 deletions(-) diff --git a/run.sh b/run.sh index 448103a..93e566e 100755 --- a/run.sh +++ b/run.sh @@ -37,4 +37,4 @@ echo "Generating raw reports..." # Matrix reports echo "Generating matrix reports..." 
-./report.sh --registries output/py.txtpb output/ts.txtpb output/java.txtpb output/go.txtpb --output ./output --report-type matrix \ No newline at end of file +./report.sh --registries output/py.txtpb output/ts.txtpb output/java.txtpb output/go.txtpb --output ./output --report-type matrix --common \ No newline at end of file diff --git a/src/google/adk/scope/extractors/converter_go.py b/src/google/adk/scope/extractors/converter_go.py index 927bfcb..2174aa3 100644 --- a/src/google/adk/scope/extractors/converter_go.py +++ b/src/google/adk/scope/extractors/converter_go.py @@ -64,7 +64,7 @@ def process( normalize_name(member_of) if member_of else "" ) - parameters = self._extract_params(node) + parameters, is_async = self._extract_params(node) original_returns, normalized_returns = self._extract_return_types(node) @@ -83,6 +83,8 @@ def process( original_return_types=original_returns, normalized_return_types=normalized_returns, ) + if is_async: + setattr(feature, "async", True) if docstring: feature.description = docstring @@ -171,6 +173,7 @@ def _extract_params(self, node: Node) -> list[feature_pb2.Param]: if not params_node: return [] + is_async = False for child in params_node.children: if child.type == "parameter_declaration": name_node = child.child_by_field_name("name") @@ -183,6 +186,7 @@ def _extract_params(self, node: Node) -> list[feature_pb2.Param]: # Skip Go context.Context parameters to align with other # languages if param_type == "context.Context": + is_async = True continue norm_types = self.normalizer.normalize(param_type, "go") @@ -195,7 +199,7 @@ def _extract_params(self, node: Node) -> list[feature_pb2.Param]: normalized_types=norm_enums, ) params.append(p) - return params + return params, is_async def _extract_name(self, node: Node) -> str: """Extract the name from a function_declaration node.""" diff --git a/src/google/adk/scope/extractors/extractor_go.py b/src/google/adk/scope/extractors/extractor_go.py index fb3f5b7..2a6c26d 100644 --- 
a/src/google/adk/scope/extractors/extractor_go.py +++ b/src/google/adk/scope/extractors/extractor_go.py @@ -28,9 +28,13 @@ def find_files( iterator = root.rglob("*.go") if recursive else root.glob("*.go") for path in iterator: - # Check if any part of the path starts with '.' (excluding '.' and '..') + if path.name.endswith("_test.go"): + continue + + # Exclude hidden directories, files, and common testing directories if any( - part.startswith(".") and part not in (".", "..") + (part.startswith(".") and part not in (".", "..")) + or part in ("tests", "testutil", "testing", "testdata") for part in path.parts ): continue @@ -118,13 +122,22 @@ def extract_features( # If there is no statement list, or it has 1 or fewer statements, # consider it simple. if stmt_list is None or stmt_list.named_child_count <= 1: - function_name_node = node.child_by_field_name("name") - if function_name_node: - logger.debug( - "Skipping simple function: %s", - function_name_node.text.decode("utf8"), - ) - continue + # Also check physical line span to prevent skipping large + # single-statement functions (e.g. methods returning a large + # anonymous function). 
+ start_row = body_node.start_point[0] + end_row = body_node.end_point[0] + line_span = end_row - start_row + 1 + + if line_span <= 4: + function_name_node = node.child_by_field_name("name") + if function_name_node: + logger.debug( + "Skipping simple function: %s (span: %d lines)", + function_name_node.text.decode("utf8"), + line_span + ) + continue # Prepare namespace and normalized namespace try: diff --git a/src/google/adk/scope/reporter/reporter.py b/src/google/adk/scope/reporter/reporter.py index 00eb7a5..4b54f11 100644 --- a/src/google/adk/scope/reporter/reporter.py +++ b/src/google/adk/scope/reporter/reporter.py @@ -76,10 +76,11 @@ def match_registries( registries: List[features_pb2.FeatureRegistry], alpha: float, report_type: str = "md", + common: bool = False, ) -> MatchResult: """Matches features and generates reports.""" if report_type == "matrix": - reporter = MatrixReportGenerator(registries, alpha) + reporter = MatrixReportGenerator(registries, alpha, common) else: if len(registries) != 2: raise ValueError(f"Report type '{report_type}' requires exactly 2 registries.") @@ -97,9 +98,11 @@ def __init__( self, registries: List[features_pb2.FeatureRegistry], alpha: float, + common: bool = False, ): self.registries = registries self.alpha = alpha + self.common = common self.langs = [_get_language_name(r.language) for r in self.registries] @@ -222,6 +225,15 @@ def _build_global_feature_matrix(self) -> List[str]: lines = ["## Global Feature Support", ""] for mod in sorted(final_by_mod.keys()): + mod_rows = final_by_mod[mod] + + if self.common: + python_idx = self.langs.index("Python") if "Python" in self.langs else -1 + mod_rows = [row for row in mod_rows if python_idx in row or len(row) >= 2] + + if not mod_rows: + continue + lines.append(f"### Module: `{mod}`") header = "| Feature | Type | " + " | ".join(self.langs) + " |" divider = "| :--- | :--- |" + " :---: |" * len(self.langs) @@ -233,7 +245,6 @@ def get_sort_key(row): rep_f = row[rep_idx] return 
(matcher._get_type_priority(rep_f), rep_f.normalized_name or "") - mod_rows = final_by_mod[mod] mod_rows.sort(key=get_sort_key) for row in mod_rows: @@ -536,6 +547,11 @@ def main(): default="md", help="Type of gap report to generate (md, raw, matrix).", ) + parser.add_argument( + "--common", + action="store_true", + help="Only list features present in Python or at least 2 languages (matrix report only).", + ) adk_args.add_verbose_argument(parser) args = parser.parse_args() adk_args.configure_logging(args) @@ -559,7 +575,7 @@ def main(): logging.error(f"Error reading feature registries: {e}") sys.exit(1) - result = match_registries(registries, args.alpha, args.report_type) + result = match_registries(registries, args.alpha, args.report_type, args.common) output_path = Path(args.output) output_path.parent.mkdir(parents=True, exist_ok=True) diff --git a/test/adk/scope/extractors/test_extractor_go.py b/test/adk/scope/extractors/test_extractor_go.py index c1b2f65..9acf3b2 100644 --- a/test/adk/scope/extractors/test_extractor_go.py +++ b/test/adk/scope/extractors/test_extractor_go.py @@ -36,6 +36,8 @@ def test_extract_features( mock_func_node = MagicMock() mock_func_body = MagicMock() + mock_func_body.start_point = (1, 0) + mock_func_body.end_point = (10, 0) # span = 10 lines mock_func_stmt_list = MagicMock() mock_func_stmt_list.type = "statement_list" mock_func_stmt_list.named_child_count = 2 @@ -44,6 +46,8 @@ def test_extract_features( mock_method_node = MagicMock() mock_method_body = MagicMock() + mock_method_body.start_point = (12, 0) + mock_method_body.end_point = (20, 0) # span = 9 lines mock_method_stmt_list = MagicMock() mock_method_stmt_list.type = "statement_list" mock_method_stmt_list.named_child_count = 2 From 31ae2a63f4deed7b9a19decd6bac820ca714857e Mon Sep 17 00:00:00 2001 From: Shahin Saadati Date: Wed, 18 Feb 2026 10:30:01 -0800 Subject: [PATCH 4/9] Updates --- playground.ipynb | 878 +++++++++++++----- pyproject.toml | 1 + score.sh | 13 + 
.../adk/scope/extractors/converter_go.py | 122 ++- .../adk/scope/extractors/extractor_go.py | 24 + src/google/adk/scope/reporter/reporter.py | 158 ++-- src/google/adk/scope/utils/score_features.py | 51 + src/google/adk/scope/utils/similarity.py | 62 +- test/adk/scope/reporter/test_reporter.py | 133 ++- 9 files changed, 1131 insertions(+), 311 deletions(-) create mode 100755 score.sh create mode 100644 src/google/adk/scope/utils/score_features.py diff --git a/playground.ipynb b/playground.ipynb index 02fb739..9fe9688 100644 --- a/playground.ipynb +++ b/playground.ipynb @@ -2,8 +2,8 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, - "id": "8f748152", + "execution_count": 2, + "id": "02bd4661-609e-40c1-95e7-3f4fc946c69b", "metadata": {}, "outputs": [], "source": [ @@ -21,6 +21,8 @@ "from google.adk.scope import features_pb2\n", "from google.adk.scope.utils import similarity\n", "from google.protobuf import text_format\n", + "import pandas as pd\n", + "import numpy as np\n", "import logging\n", "\n", "logging.basicConfig(level=logging.DEBUG)" @@ -28,325 +30,765 @@ }, { "cell_type": "code", - "execution_count": 43, - "id": "bb16a48a-5cda-4ec6-a10e-c8a2f2828700", + "execution_count": 3, + "id": "96c6f1de-6442-4d9b-b21a-06dcacebff69", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "((1426, 10), (1426, 10), (1426, 10))" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "def read_feature_registry(file_path: str) -> features_pb2.FeatureRegistry:\n", - " \"\"\"Reads a FeatureRegistry from a text proto file.\"\"\"\n", - " registry = features_pb2.FeatureRegistry()\n", - " with open(file_path, \"rb\") as f:\n", - " text_format.Parse(f.read(), registry)\n", - " return registry" + "pg = pd.read_csv('./output/py_go.csv').sort_values(by = ['score', 'type'], ascending=False).copy()\n", + "pj = pd.read_csv('./output/py_java.csv').sort_values(by = ['score', 'type'], 
ascending=False).copy()\n", + "pt = pd.read_csv('./output/py_ts.csv').sort_values(by = ['score', 'type'], ascending=False).copy()\n", + "\n", + "pg.shape, pj.shape, pt.shape" ] }, { "cell_type": "code", - "execution_count": 44, - "id": "f3361359-a887-4537-b22b-fe8fec24e13a", + "execution_count": 4, + "id": "093a3f6c-131a-42be-aa61-0c1b8209c009", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "count 1426.000000\n", + "mean 0.409128\n", + "std 0.152272\n", + "min 0.000000\n", + "25% 0.349225\n", + "50% 0.407950\n", + "75% 0.476100\n", + "max 0.992100\n", + "Name: score, dtype: float64" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "py_features = read_feature_registry(\"output/py.txtpb\")\n", - "ts_features = read_feature_registry(\"output/ts.txtpb\")" + "pg['score'].describe()" ] }, { "cell_type": "code", - "execution_count": 45, - "id": "2208e1dd-6647-4796-b81c-58ab7406e4c9", + "execution_count": 5, + "id": "b95b8dff-dc50-4505-91a8-562fa6f2d5b9", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "count 1426.000000\n", + "mean 0.528424\n", + "std 0.284260\n", + "min 0.000000\n", + "25% 0.391550\n", + "50% 0.486150\n", + "75% 0.797150\n", + "max 1.000000\n", + "Name: score, dtype: float64" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "py_run_async = py_features.features[0]\n", - "py_run = py_features.features[9]\n", - "ts_run_async = ts_features.features[3]" + "pj['score'].describe()" ] }, { "cell_type": "code", - "execution_count": 46, - "id": "3358d489-92bb-411a-9ec8-0f596be08c9f", + "execution_count": 6, + "id": "9b1f6e3d-3912-4b97-ba50-0ff3894b17c5", "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "DEBUG:google.adk.scope.utils.similarity:Initializing SimilarityScorer with alpha=0.8 and weights={'name': 0.3, 'member_of': 0.3, 
'namespace': 0.15, 'parameters': 0.15, 'return_type': 0.1}\n" - ] + "data": { + "text/plain": [ + "count 1426.000000\n", + "mean 0.532307\n", + "std 0.224664\n", + "min 0.170800\n", + "25% 0.373025\n", + "50% 0.452750\n", + "75% 0.589400\n", + "max 1.000000\n", + "Name: score, dtype: float64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "scorer = similarity.SimilarityScorer()" + "pt['score'].describe()" ] }, { "cell_type": "code", - "execution_count": 47, - "id": "a1b5b03b-8a0c-4614-ab4f-89140fa872d0", + "execution_count": 14, + "id": "4a520159-6ef0-4def-b1fb-d96c3412d8ca", "metadata": {}, "outputs": [ { - "name": "stderr", + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAigAAAGdCAYAAAA44ojeAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjcsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvTLEjVAAAAAlwSFlzAAAPYQAAD2EBqD+naQAAJA5JREFUeJzt3Q9wFdX5//En/0gIksRAQ0gJf8QiICAWBCNoEYEIDIowUy0W0aFQFZiRtIgoYAJiaIaf2joBxhbBzoBaHNECERKgSJEgkpZRQFNBES0kVC0ESQn5s7855zs3ckMAE3aTZ5P3a2a9uffu3ex9crj349lzdkMcx3EEAABAkdDG3gEAAICaCCgAAEAdAgoAAFCHgAIAANQhoAAAAHUIKAAAQB0CCgAAUIeAAgAA1AkXH6qqqpJjx45J69atJSQkpLF3BwAA/ADm3LCnT5+WpKQkCQ0NbXoBxYST5OTkxt4NAABQD19++aV06NCh6QUU03MSeIMxMTGubru8vFxyc3NlxIgREhER4eq2Qa0bA22aWjdFtGt/1rqkpMR2MAS+x5tcQAkc1jHhxIuAEh0dbbdLQPEWtW4Y1LnhUGtq3RSVe/C9+EOGZzBIFgAAqENAAQAA6hBQAACAOgQUAACgDgEFAACoQ0ABAADqEFAAAIA6BBQAAKAOAQUAAKhDQAEAAOoQUAAAgDoEFAAAoA4BBQAAqENAAQAA6oQ39g4AEOn8xEbPyvDpwhGUGIDv0IMCAADUIaAAAAB1CCgAAEAdAgoAAFCHgAIAANQhoAAAAHUIKAAAQB0CCgAAUIeAAgAA1CGgAAAAdQgoAABAHQIKAABQh4ACAADUIaAAAAB1CCgAAEAdAgoAAFCHgAIAANQhoAAAAHUIKAAAQB0CCgAAUIeAAgAA1CGgAAAAfweUZcuWSZ8+fSQmJsYuKSkp8s4771Q/f/bsWZk2bZq0adNGrrrqKhk/frwUFxcHbePo0aMyevRoiY6OloSEBJk1a5ZUVFS4944AAEDzCigdOnSQxYsXS0FBgezdu1eGDh0qd999txw4cMA+P3PmTFm/fr2sXbtW3n33XTl27JiMGzeu+vWVlZU2nJw7d0527dolr7zyiqxatUrmz5/v/jsDAAC+FV6XlceMGRN0f9GiRbZXZffu3Ta8rFixQtasWWODi7Fy5Urp0aOHff7mm2+W3NxcOXjwoGzZskXatWsnffv2lYULF8rs2bMlPT1dWrRo4e67AwAATT+gnM/0hpiekjNnzthDPaZXpby8XIYNG
1a9Tvfu3aVjx46Sn59vA4q57d27tw0nAampqfLII4/YXpgbb7yx1t9VVlZml4CSkhJ7a36fWdwU2J7b2wW1vpTIMMezJkKbbjjUmlo3ReUufi/WZRt1DigfffSRDSRmvIkZZ7Ju3Trp2bOn7Nu3z/aAxMXFBa1vwkhRUZH92dyeH04Czweeu5jMzEzJyMi44HHTI2PGsnghLy/Pk+2CWtcma4B3LSPQlmnTDYdaU+umKM+F78XS0lLvAsp1111nw8ipU6fkjTfekEmTJtnxJl6aM2eOpKWlBfWgJCcny4gRI+xgXTeZdGf+CMOHD5eIiAhXtw1qfTG90jd71jz++dRQ2nQD4fOj4VBrf9Y6cATEk4BiekmuvfZa+3O/fv3kgw8+kN///vdy77332sGvJ0+eDOpFMbN4EhMT7c/mds+ePUHbC8zyCaxTm8jISLvUZArlVYjwctug1jWVVYZ41iwC7Zg23XCoNbVuiiJc+F6sy+uv+DwoVVVVdnyICSvmF2/durX6ucLCQjut2BwSMsytOUR04sSJ6nVMKjO9IOYwEQAAQJ17UMyhlpEjR9qBr6dPn7YzdrZv3y6bN2+W2NhYmTx5sj0UEx8fb0PHjBkzbCgxA2QNc0jGBJGJEydKVlaWHXcyd+5ce+6U2npIAABA81SngGJ6Ph544AE5fvy4DSTmpG0mnJjjUsbzzz8voaGh9gRtplfFzNBZunRp9evDwsJkw4YNdtaOCS6tWrWyY1gWLFjg/jsDAADNI6CY85xcSlRUlGRnZ9vlYjp16iQ5OTl1+bUAAKCZ4Vo8AABAHQIKAABQh4ACAADUIaAAAAB1CCgAAEAdAgoAAFCHgAIAANQhoAAAAHUIKAAAQB0CCgAAUIeAAgAA1CGgAAAAdQgoAABAHQIKAABQh4ACAADUIaAAAAB1CCgAAEAdAgoAAFCHgAIAANQhoAAAAHUIKAAAQB0CCgAAUIeAAgAA1CGgAAAAdQgoAABAnfDG3gEA/tX5iY2ebPfI4tGebBeAf9CDAgAA1CGgAAAAdQgoAABAHQIKAABQh4ACAADUIaAAAAB1CCgAAEAdAgoAAFCHgAIAANQhoAAAAHUIKAAAQB0CCgAAUIeAAgAA1CGgAAAAdQgoAABAHQIKAABQh4ACAAD8HVAyMzPlpptuktatW0tCQoKMHTtWCgsLg9YZMmSIhISEBC0PP/xw0DpHjx6V0aNHS3R0tN3OrFmzpKKiwp13BAAAfC+8Liu/++67Mm3aNBtSTKB48sknZcSIEXLw4EFp1apV9XpTpkyRBQsWVN83QSSgsrLShpPExETZtWuXHD9+XB544AGJiIiQZ5991q33BQAAmktA2bRpU9D9VatW2R6QgoICue2224ICiQkgtcnNzbWBZsuWLdKuXTvp27evLFy4UGbPni3p6enSokWL+r4XAADQRFzRGJRTp07Z2/j4+KDHV69eLW3btpVevXrJnDlzpLS0tPq5/Px86d27tw0nAampqVJSUiIHDhy4kt0BAADNsQflfFVVVfLYY4/JoEGDbBAJmDBhgnTq1EmSkpLkww8/tD0jZpzKm2++aZ8vKioKCidG4L55rjZlZWV2CTBhxigvL7eLmwLbc3u7oNaXEhnmeNZEvGzTXu23X//98flBrZuichc/Q+qyjRDHcer1CfPII4/IO++8Izt37pQOHTpcdL1t27bJHXfcIYcOHZKuXbvK1KlT5YsvvpDNmzdXr2N6WMwYlpycHBk5cuQF2zCHfjIyMi54fM2aNUHjWwAAgF7m+950ZJgjMDExMe73oEyfPl02bNggO3bsuGQ4MQYOHGhvAwHFjE3Zs2dP0DrFxcX29mLjVsxhorS0tKAelOTkZDtA93JvsD7pLi8vT4YPH24H7sI71Pp7vdK/D+xu++dTQz1r017t9/70VPEj2jS1borKXfxeDBwB+SHqFFBMZ8uMGTNk3bp1sn37dunSpctlX7Nv3z572759e3ubkpIiixYtk
hMnTtgBtoZ54yZo9OzZs9ZtREZG2qUmUyivQoSX2wa1rqmsMsSzZhFox160aa/22+//9vj8oNZNUYQLnyF1eX2dAoqZYmwOq7z99tv2XCiBMSOxsbHSsmVLOXz4sH1+1KhR0qZNGzsGZebMmXaGT58+fey6ptfDBJGJEydKVlaW3cbcuXPttmsLIQAAoPmp0yyeZcuW2eNG5mRspkcksLz++uv2eTNF2EwfNiGke/fu8pvf/EbGjx8v69evr95GWFiYPTxkbk1vyi9/+Ut7HpTzz5sCAACatzof4rkUMy7EnMztcswsHzMgFgAAoDZciwcAAKhDQAEAAOoQUAAAgDoEFAAAoA4BBQAAqENAAQAA6hBQAACAOgQUAACgDgEFAACoQ0ABAADqEFAAAIA6BBQAAKAOAQUAAKhDQAEAAOqEN/YOAPBWr/TNkjXg/27LKkMoNwBfoAcFAACoQ0ABAADqcIgHqIPOT2ykXgDQAOhBAQAA6hBQAACAOgQUAACgDgEFAACoQ0ABAADqEFAAAIA6BBQAAKAOAQUAAKhDQAEAAOoQUAAAgDoEFAAAoA4BBQAAqENAAQAA6hBQAACAOgQUAACgDgEFAACoQ0ABAADqEFAAAIA6BBQAAKAOAQUAAKhDQAEAAOoQUAAAgDoEFAAAoA4BBQAAqENAAQAA/g4omZmZctNNN0nr1q0lISFBxo4dK4WFhUHrnD17VqZNmyZt2rSRq666SsaPHy/FxcVB6xw9elRGjx4t0dHRdjuzZs2SiooKd94RAABoXgHl3XffteFj9+7dkpeXJ+Xl5TJixAg5c+ZM9TozZ86U9evXy9q1a+36x44dk3HjxlU/X1lZacPJuXPnZNeuXfLKK6/IqlWrZP78+e6+MwAA4FvhdVl506ZNQfdNsDA9IAUFBXLbbbfJqVOnZMWKFbJmzRoZOnSoXWflypXSo0cPG2puvvlmyc3NlYMHD8qWLVukXbt20rdvX1m4cKHMnj1b0tPTpUWLFu6+QwAA0LzGoJhAYsTHx9tbE1RMr8qwYcOq1+nevbt07NhR8vPz7X1z27t3bxtOAlJTU6WkpEQOHDhwJbsDAACaYw/K+aqqquSxxx6TQYMGSa9evexjRUVFtgckLi4uaF0TRsxzgXXODyeB5wPP1aasrMwuASbMGCYMmcVNge25vV00jVpHhjniN5GhTtCtH/ipTfi9TfsVtfZnreuyjXoHFDMWZf/+/bJz5876bqJOg3MzMjIueNwcLjIDbb1gxtigYfip1lkDxLcW9q8Sv8jJyRE/81Ob9jtq7a9al5aWehtQpk+fLhs2bJAdO3ZIhw4dqh9PTEy0g19PnjwZ1ItiZvGY5wLr7NmzJ2h7gVk+gXVqmjNnjqSlpQX1oCQnJ9sBujExMeImk+7MH2H48OESERHh6rbh/1r3St8sfmN6Tkw4mbc3VMqqQsQP9qenih/5sU37FbX2Z60DR0BcDyiO48iMGTNk3bp1sn37dunSpUvQ8/369bM7v3XrVju92DDTkM204pSUFHvf3C5atEhOnDhhB9ga5o2boNGzZ89af29kZKRdajK/y6sPAS+3Df/WuqzSH1/wtTHhxC/775f20BTatN9Ra3/Vui6vD6/rYR0zQ+ftt9+250IJjBmJjY2Vli1b2tvJkyfb3g4zcNaEDhNoTCgxM3gM0+thgsjEiRMlKyvLbmPu3Ll227WFEAAA0PzUKaAsW7bM3g4ZMiTocTOV+MEHH7Q/P//88xIaGmp7UMzAVjNDZ+nSpdXrhoWF2cNDjzzyiA0urVq1kkmTJsmCBQvceUcAAMD36nyI53KioqIkOzvbLhfTqVMn3w+CAwAA3uFaPAAAQB0CCgAAUIeAAgAA1CGgAAAAdQgoAABAHQIKAABQh4ACAADUIaAAAAB1CCgAAEAdAgoAAFCHgAIAANQhoAAAAHUIKAAAQB0CCgAAUIeAAgAA1CGgAAAAdQgoAABAHQIKAABQh4ACAADUIaAAA
AB1CCgAAEAdAgoAAFCHgAIAANQhoAAAAHUIKAAAQB0CCgAAUIeAAgAA1CGgAAAAdQgoAABAHQIKAABQh4ACAADUIaAAAAB1CCgAAEAdAgoAAFCHgAIAANQhoAAAAHUIKAAAQB0CCgAAUIeAAgAA1CGgAAAAdQgoAABAHQIKAABQh4ACAADUIaAAAAD/B5QdO3bImDFjJCkpSUJCQuStt94Kev7BBx+0j5+/3HnnnUHrfPvtt3L//fdLTEyMxMXFyeTJk+W777678ncDAACaZ0A5c+aM3HDDDZKdnX3RdUwgOX78ePXy6quvBj1vwsmBAwckLy9PNmzYYEPP1KlT6/cOAABAkxNe1xeMHDnSLpcSGRkpiYmJtT738ccfy6ZNm+SDDz6Q/v3728defPFFGTVqlCxZssT2zAAAgOatzgHlh9i+fbskJCTI1VdfLUOHDpVnnnlG2rRpY5/Lz8+3h3UC4cQYNmyYhIaGyvvvvy/33HPPBdsrKyuzS0BJSYm9LS8vt4ubAttze7toGrWODHPEbyJDnaBbP/BTm/B7m/Yrau3PWtdlG64HFHN4Z9y4cdKlSxc5fPiwPPnkk7bHxQSTsLAwKSoqsuElaCfCwyU+Pt4+V5vMzEzJyMi44PHc3FyJjo4WL5jDT2gYfqp11gDxrYX9q8QvcnJyxM/81Kb9jlr7q9alpaWNF1Duu+++6p979+4tffr0ka5du9pelTvuuKNe25wzZ46kpaUF9aAkJyfLiBEj7EBbN5l0Z/4Iw4cPl4iICFe3Df/Xulf6ZvEb03Niwsm8vaFSVhUifrA/PVX8yI9t2q+otT9rHTgC0miHeM53zTXXSNu2beXQoUM2oJixKSdOnAhap6Kiws7sudi4FTOmxSw1mUJ59SHg5bbh31qXVfrjC742Jpz4Zf/90h6aQpv2O2rtr1rX5fWenwflq6++km+++Ubat29v76ekpMjJkyeloKCgep1t27ZJVVWVDBw40OvdAQAAPlDnHhRzvhLTGxLw+eefy759++wYErOYsSLjx4+3vSFmDMrjjz8u1157raSm/l+XbY8ePew4lSlTpsjy5ctt19H06dPtoSFm8AAAgHr1oOzdu1duvPFGuxhmbIj5ef78+XYQ7Icffih33XWXdOvWzZ6ArV+/fvL3v/896BDN6tWrpXv37vaQj5lePHjwYHnppZf4iwAAgPr1oAwZMkQc5+LTFTdvvvwgQtPTsmbNmrr+agAA0ExwLR4AAKAOAQUAAKhDQAEAAOoQUAAAgDoEFAAAoA4BBQAAqENAAQAA6hBQAACAOgQUAACgDgEFAACoQ0ABAADqEFAAAID/LxYIAF7r/MRGz7Z9ZPFoz7YNwD30oAAAAHUIKAAAQB0CCgAAUIeAAgAA1CGgAAAAdQgoAABAHQIKAABQh4ACAADUIaAAAAB1CCgAAEAdAgoAAFCHgAIAANQhoAAAAHUIKAAAQB0CCgAAUIeAAgAA1CGgAAAAdQgoAABAHQIKAABQh4ACAADUIaAAAAB1CCgAAEAdAgoAAFCHgAIAANQhoAAAAHUIKAAAQB0CCgAAUIeAAgAA1CGgAAAAdQgoAADA/wFlx44dMmbMGElKSpKQkBB56623gp53HEfmz58v7du3l5YtW8qwYcPk008/DVrn22+/lfvvv19iYmIkLi5OJk+eLN99992VvxsAANA8A8qZM2fkhhtukOzs7Fqfz8rKkj/84Q+yfPlyef/996VVq1aSmpoqZ8+erV7HhJMDBw5IXl6ebNiwwYaeqVOnXtk7AQAATUZ4XV8wcuRIu9TG9J688MILMnfuXLn77rvtY3/+85+lXbt2tqflvvvuk48//lg2bdokH3zwgfTv39+u8+KLL8qoUaNkyZIltmcGAAA0b3UOKJfy+eefS1FRkT2sExAbGysDBw6U/Px8G1DMrTmsEwgnhlk/NDTU9rjcc889F2y3rKzMLgElJSX2try83C5uCmzP7e2iadQ6MswRv4kMd
YJumzsv25sf27RfUWt/1rou23A1oJhwYpgek/OZ+4HnzG1CQkLwToSHS3x8fPU6NWVmZkpGRsYFj+fm5kp0dLR4wRx+QsPwU62zBohvLexf1di7oEJOTo7nv8NPbdrvqLW/al1aWto4AcUrc+bMkbS0tKAelOTkZBkxYoQdaOsmk+7MH2H48OESERHh6rbh/1r3St8sfmN6Tkw4mbc3VMqqQqS525+e6tm2/dim/Ypa+7PWgSMgDR5QEhMT7W1xcbGdxRNg7vft27d6nRMnTgS9rqKiws7sCby+psjISLvUZArl1YeAl9uGf2tdVunfL3gTTvy8/25piLbmpzbtd9TaX7Wuy+tdPQ9Kly5dbMjYunVrUFoyY0tSUlLsfXN78uRJKSgoqF5n27ZtUlVVZceqAAAA1LkHxZyv5NChQ0EDY/ft22fHkHTs2FEee+wxeeaZZ+QnP/mJDSzz5s2zM3PGjh1r1+/Ro4fceeedMmXKFDsV2XQdTZ8+3Q6gZQYPAACoV0DZu3ev3H777dX3A2NDJk2aJKtWrZLHH3/cnivFnNfE9JQMHjzYTiuOioqqfs3q1attKLnjjjvs7J3x48fbc6cAAADUK6AMGTLEnu/kYszZZRcsWGCXizG9LWvWrOEvAAAAasW1eAAAgDoEFAAAoA4BBQAAqENAAQAA6hBQAACAOgQUAACgDgEFAACoQ0ABAADq+OJqxo111Vq3L6x2ZPFoV7cHAEBTRQ8KAABQh4ACAADUIaAAAAB1CCgAAEAdAgoAAFCHgAIAANQhoAAAAHUIKAAAQB0CCgAAUIeAAgAA1CGgAAAAdQgoAABAHQIKAABQh4ACAADUIaAAAAB1CCgAAEAdAgoAAFCHgAIAANQhoAAAAHUIKAAAQB0CCgAAUIeAAgAA1CGgAAAAdQgoAABAHQIKAABQh4ACAADUIaAAAAB1CCgAAEAdAgoAAFCHgAIAANQhoAAAAHUIKAAAQB0CCgAAUIeAAgAAmn5ASU9Pl5CQkKCle/fu1c+fPXtWpk2bJm3atJGrrrpKxo8fL8XFxW7vBgAA8LFwLzZ6/fXXy5YtW77/JeHf/5qZM2fKxo0bZe3atRIbGyvTp0+XcePGyXvvvefFrgBAkM5PbPSkIkcWj6bSgPaAYgJJYmLiBY+fOnVKVqxYIWvWrJGhQ4fax1auXCk9evSQ3bt3y8033+zF7gAAAJ/xJKB8+umnkpSUJFFRUZKSkiKZmZnSsWNHKSgokPLychk2bFj1uubwj3kuPz//ogGlrKzMLgElJSX21mzLLG4KbC8y1HF1u+dvG8H18FNdIsPcbxdeC7RlL9o0vnf+55Gf2rRfUWt/1rou2whxHMfVT6133nlHvvvuO7nuuuvk+PHjkpGRIf/+979l//79sn79ennooYeCwoYxYMAAuf322+V3v/vdRce1mO3UZHpioqOj3dx9AADgkdLSUpkwYYI9ohITE9OwAaWmkydPSqdOneS5556Tli1b1iug1NaDkpycLF9//fVl32B90l1eXp7M2xsqZVUhrm57f3qqq9vzu0Cthw8fLhEREeIHvdI3i9+YnpOF/as8adMI/vftxzbtV9Tan7U2399t27b9QQHFk0M854uLi5Nu3brJoUOH7Js7d+6cDS3m8QAzi6e2MSsBkZGRdqnJFMqrDwHzQV5W6e6HOR9YF6+LX2rjdptoSF60aXzv/Dbspzbtd9TaX7Wuy+s9Pw+KOdxz+PBhad++vfTr18/u3NatW6ufLywslKNHj9qxKgAAAJ70oPz2t7+VMWPG2MM6x44dk6efflrCwsLkF7/4hZ1WPHnyZElLS5P4+HjbvTNjxgwbTpjBAwAAPAsoX331lQ0j33zzjfzoRz+SwYMH2ynE5mfj+eefl9DQUHuCNjOuJDU1VZYuXer2bgAAAB9zPaC89tprl3zeTD3Ozs62CwAAQG24Fg8AAFCHgAIAANQhoAAAAHUIKAAAQB0CCgAAUIeAAgAA1CGgA
AAAdQgoAABAHQIKAABQh4ACAADUIaAAAAB1CCgAAEAdAgoAAFCHgAIAANQJb+wdANzW+YmNFBUAfI4eFAAAoA4BBQAAqENAAQAA6hBQAACAOgQUAACgDgEFAACoQ0ABAADqEFAAAIA6BBQAAKAOAQUAAKhDQAEAAOoQUAAAgDpcLBAAAA8vNHpk8WjqWw/0oAAAAHUIKAAAQB0CCgAAUIcxKAAA340TiQxzJGuASK/0zVJWGdLYuwUP0IMCAADUIaAAAAB1OMQDAPDNlF00H/SgAAAAdQgoAABAHQIKAABQh4ACAADUIaAAAAB1CCgAAEAdAgoAAFCH86AAgEvn/fDq9OtHFo8WL3Cukobh1zof8ajd+aIHJTs7Wzp37ixRUVEycOBA2bNnT2PuDgAAaO49KK+//rqkpaXJ8uXLbTh54YUXJDU1VQoLCyUhIaGxdgsA1PHr/4EDvuxBee6552TKlCny0EMPSc+ePW1QiY6OlpdffrmxdgkAADTnHpRz585JQUGBzJkzp/qx0NBQGTZsmOTn51+wfllZmV0CTp06ZW+//fZbKS8vd3XfzPZKS0slvDxUKqvcvYT3N9984+r2/C5Qa1OXiIgI17YbXnHGtW01BeFVjpSWVnnSpkGtGwvt2nuB7yw3P6tPnz5tbx3H0RlQvv76a6msrJR27doFPW7uf/LJJxesn5mZKRkZGRc83qVLF/GTtv+vsfcAzdWExt6BZoRaU+umoq2H31kmqMTGxvp/Fo/paTHjVQKqqqps70mbNm0kJMTd/yMsKSmR5ORk+fLLLyUmJsbVbYNaNwbaNLVuimjX/qy16Tkx4SQpKemy6zZKQGnbtq2EhYVJcXFx0OPmfmJi4gXrR0ZG2uV8cXFxnu6j+SMQUBoGtabOTQ1tmlo3RTEufS9eruekUQfJtmjRQvr16ydbt24N6hUx91NSUhpjlwAAgCKNdojHHLKZNGmS9O/fXwYMGGCnGZ85c8bO6gEAAM1bowWUe++9V/7zn//I/PnzpaioSPr27SubNm26YOBsQzOHkp5++ukLDimBWvsVbZpaN0W066Zf6xDnh8z1AQAAaEBcLBAAAKhDQAEAAOoQUAAAgDoEFAAAoE6zDCjZ2dnSuXNniYqKsldS3rNnzyXXX7t2rXTv3t2u37t3b8nJyWmwfW0udf7jH/8ot956q1x99dV2MddlutzfBfWr9flee+01ezbmsWPHUk6Pan3y5EmZNm2atG/f3s6C6NatG58hHtXanK7iuuuuk5YtW9ozn86cOVPOnj1L276MHTt2yJgxY+zZXc3nwVtvvXW5l8j27dvlpz/9qW3T1157raxatUpc5zQzr732mtOiRQvn5Zdfdg4cOOBMmTLFiYuLc4qLi2td/7333nPCwsKcrKws5+DBg87cuXOdiIgI56OPPmrwfW/KdZ4wYYKTnZ3t/POf/3Q+/vhj58EHH3RiY2Odr776qsH3vanXOuDzzz93fvzjHzu33nqrc/fddzfY/janWpeVlTn9+/d3Ro0a5ezcudPWfPv27c6+ffsafN+beq1Xr17tREZG2ltT582bNzvt27d3Zs6c2eD77jc5OTnOU0895bz55ptmVq+zbt26S67/2WefOdHR0U5aWpr9XnzxxRft9+SmTZtc3a9mF1AGDBjgTJs2rfp+ZWWlk5SU5GRmZta6/s9//nNn9OjRQY8NHDjQ+fWvf+35vjanOtdUUVHhtG7d2nnllVc83MvmW2tT31tuucX505/+5EyaNImA4lGtly1b5lxzzTXOuXPnfvgfFPWqtVl36NChQY+ZL9BBgwZR0Tr4IQHl8ccfd66//vqgx+69914nNTXVcVOzOsRz7tw5KSgosIcPAkJDQ+39/Pz8Wl9jHj9/fSM1NfWi66N+da7JXNrbXOI7Pj6ekrrcpo0FCxZIQkKCTJ48mfp6WOu//vWv9vId5hCPOQllr1695Nlnn7VXc4e7tb7lllvsawKHgT777DN7KG3Uq
FGU2mUN9b3oi6sZu+Xrr7+2Hww1z1Zr7n/yySe1vsac5ba29c3jcK/ONc2ePdseD635jwBXXuudO3fKihUrZN++fZTT41qbL8lt27bJ/fffb78sDx06JI8++qgN3+bMnHCv1hMmTLCvGzx4sL1ibkVFhTz88MPy5JNPUmaXXex70Vz1+H//+58dA+SGZtWDAn9YvHixHby5bt06OzgO7jGXOZ84caIdlGyuKg5vmYugmp6ql156yV4g1Vzi46mnnpLly5dTepeZQZumd2rp0qXyj3/8Q958803ZuHGjLFy4kFr7VLPqQTEfyGFhYVJcXBz0uLmfmJhY62vM43VZH/Wrc8CSJUtsQNmyZYv06dOHcrrcpg8fPixHjhyxI/bP/xI1wsPDpbCwULp27UrdXai1YWbuRERE2NcF9OjRw/4fqDmMYa7sDndqPW/ePBu+f/WrX9n7ZsaluQDt1KlTbSg0h4jgjot9L8bExLjWe2I0q7+Y+TAw/xezdevWoA9nc98cJ66Nefz89Y28vLyLro/61dnIysqy/7djLhpprnIN99u0mS7/0Ucf2cM7geWuu+6S22+/3f5spmbCnVobgwYNsod1AiHQ+Ne//mWDC+HEvXYdGLdWM4QEgiGXnHNXg30vOs1w6pqZirZq1So7PWrq1Kl26lpRUZF9fuLEic4TTzwRNM04PDzcWbJkiZ3++vTTTzPN2IM6L1682E4pfOONN5zjx49XL6dPn3a/ETTzWtfELB7van306FE7G2369OlOYWGhs2HDBichIcF55plnruAv3jzUtdbms9nU+tVXX7XTYHNzc52uXbvamZi4NPM5a07xYBYTC5577jn78xdffGGfN3U29a45zXjWrFn2e9GcIoJpxi4xc7Y7duxovxDNVLbdu3dXP/ezn/3MfmCf7y9/+YvTrVs3u76ZWrVx40a3dqVJq0udO3XqZP9h1FzMhw7crXVNBBTv2rWxa9cue2oC82VrphwvWrTITvOGu7UuLy930tPTbSiJiopykpOTnUcffdT573//S6kv429/+1utn7+B+ppbU++ar+nbt6/925h2vXLlSsdtIeY/7vbJAAAAXJlmNQYFAAD4AwEFAACoQ0ABAADqEFAAAIA6BBQAAKAOAQUAAKhDQAEAAOoQUAAAgDoEFAAAoA4BBQAAqENAAQAA6hBQAACAaPP/AaCPs5RuguPIAAAAAElFTkSuQmCC", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "pg['score'].hist(bins=20);" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "cdce096a-6081-48d6-861a-a0b39cda5544", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAigAAAGdCAYAAAA44ojeAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjcsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvTLEjVAAAAAlwSFlzAAAPYQAAD2EBqD+naQAAJ4FJREFUeJzt3Qt0FOX5x/Fnc2EhSoCAMaQGAlguyq2CRIQqtxCBA6JpvUQpUApegHNMThVRwAS0UEqRUxrl2HKxRyKWHkAFBAIISAko2BwFNTUIIuVWtBBJyhKS+Z/37X8jmwuQMJO8k/1+zhnXnZm8zD55d/aXmXdmPZZlWQIAAGCQkLreAAAAgPIIKAAAwDgEFAAAYBwCCgAAMA4BBQAAGIeAAgAAjENAAQAAxiGgAAAA44SJC5WWlsqxY8ekcePG4vF46npzAADAVVD3hv3+++8lNjZWQkJC6l9AUeEkLi6urjcDAADUwDfffCM33XRT/Qso6siJ/wVGRkba2nZxcbFs2rRJBg8eLOHh4ba2Depc2+jP1Lk+oT+7v9YFBQX6AIP/c7zeBRT/aR0VTpwIKBEREbpdAopzqHPtoM7UuT6hP9efWl/N8AwGyQIAAOMQUAAAgHEIKAAAwDgEFAAAYBwCCgAAMA4BBQAAGIeAAgAAjENAAQAA7g4os2fPlttvv13fAS46OlpGjhwpeXl5AeucP39eJk6cKM2bN5frr79ekpOT5eTJkwHrHDlyRIYNG6ZvAqPaefrpp+XixYv2vCIAABBcAWX79u06fOzevVuys7P1nebUbXALCwvL1klNTZV3331XVq5cqddX35tz//33ly0vKSnR4eTChQuya9cuef3112XZsmUyY8YMe18ZAABwrWrd6n7Dhg0Bz1WwUEdA9u3bJ3fddZecPXtWFi9eLFlZWTJgwAC9ztKlS6VTp0461Nxxxx363v6fffaZbN68WW688Ubp3r27zJo1S6ZMmSLp6enSoEEDe18hAAAIrjEoKpAoUVFR+lEFFXVUZdCgQWXrdOzYUVq1aiU5OTn6uXrs0qWLDid+SUlJ+guEDhw4cC2bAwAA6okaf1lgaWmpPPXUU9KnTx/p3LmznnfixAl9BKRp06YB66owopb517k0nPiX+5dVxufz6clPhRlFhSE12cnfnt3tgjrXBfozda5P6M/ur3V12qtxQFFjUfbv3y87d+4Up6nBuRkZGRXmq9NFaqCtE9QYGziPOtcO6kyd6xP6s3trXVRU5GxAmTRpkqxdu1Z27NghN910U9n8mJgYPfj1zJkzAUdR1FU8apl/nQ8//DCgPf9VPv51yps6daqkpaUFHEGJi4vTA3TVV0Hbne7ULyQxMdGRr5gGda5M5/SNjnQNb4gls3qW0p8dxn6jdlBn99fafwbE9oBiWZZMnjxZVq9eLdu2bZM2bdoELO/Ro4d+IVu2bNGXFyvqMmR1WXHv3r31c/X40ksvyalTp/QAW0UVQQWNW265pdJ/1+v16qk89W85FSKcbBvUuTxficfRbkF/rh3UmTrXN+E2fxZWp62w6p7WUVfovP322/peKP4xI02aNJFGjRrpx3HjxumjHWrgrAodKtCoUKKu4FHUUQ8VREaNGiVz587VbUybNk23XVkIAQAAwadaAeXVV1/Vj/369QuYry4lHjNmjP7/l19+WUJCQvQRFDWwVV2h88orr5StGxoaqk8P
PfHEEzq4XHfddTJ69GiZOXOmPa8IAAC4XrVP8VxJw4YNJTMzU09Vad26taxfv746/zQAAAgifBcPAAAwDgEFAAAYh4ACAACMQ0ABAADGIaAAAADjEFAAAIBxCCgAAMA4BBQAAGAcAgoAADAOAQUAABiHgAIAAIxDQAEAAMYhoAAAAHd/mzEAd+qcvlF8JR7b2z08Z5jtbQKAwhEUAABgHAIKAAAwDgEFAAAYh4ACAACMQ0ABAADGIaAAAADjEFAAAIBxCCgAAMA4BBQAAGAcAgoAADAOAQUAABiHgAIAAIxDQAEAAMYhoAAAAOMQUAAAgHEIKAAAwDgEFAAAYBwCCgAAMA4BBQAAGIeAAgAA3B9QduzYIcOHD5fY2FjxeDyyZs2agOVqXmXT7373u7J14uPjKyyfM2eOPa8IAAAEX0ApLCyUbt26SWZmZqXLjx8/HjAtWbJEB5Dk5OSA9WbOnBmw3uTJk2v+KgAAQL0SVt0fGDJkiJ6qEhMTE/D87bfflv79+0vbtm0D5jdu3LjCugAAADUKKNVx8uRJWbdunbz++usVlqlTOrNmzZJWrVpJSkqKpKamSlhY5Zvj8/n05FdQUKAfi4uL9WQnf3t2twvqfDneUMuRLuINsQIe7cb7JLAO1MNZ1Nn9ta5Oex7Lsmq851KnblavXi0jR46sdPncuXN1EDl27Jg0bNiwbP78+fPltttuk6ioKNm1a5dMnTpVxo4dq+dXJj09XTIyMirMz8rKkoiIiJpuPgAAqEVFRUX6oMTZs2clMjKy7gJKx44dJTExURYuXHjZdtQ4lccee0zOnTsnXq/3qo6gxMXFyenTp6/4AmuS7rKzs/V2h4eH29o2qHNVOqdvdKR7qCMns3qWyvS9IeIr9dje/v70JNvbdCP2G9S5vil26LNQfX63aNHiqgKKY6d4PvjgA8nLy5O33nrriusmJCTIxYsX5fDhw9KhQ4cKy1VoqSy4qKI5FSKcbBvUuTxficfZ9ks9jvwbvEcq1oOaOI86u7fW1WnLsYCyePFi6dGjh77i50pyc3MlJCREoqOjxaS/aO3eoR+eM8zW9gAAqK+qHVDUaZj8/Pyy54cOHdIBQ40nUQNe/YdwVq5cKb///e8r/HxOTo7s2bNHX9mjruRRz9UA2UcffVSaNWt2ra8HAAAEY0DZu3evDhd+aWlp+nH06NGybNky/f8rVqwQNbTl4YcfrvDz6lSNWq4GvqpxJW3atNEBxd8OAABAtQNKv379dPi4nAkTJuipMurqnd27d1N5AABQJb6LBwAAGIeAAgAAjENAAQAAxiGgAAAA4xBQAACAcQgoAADAOAQUAABgHMdudQ+g/ot/dp0j7fK1EAA4ggIAAIxDQAEAAMYhoAAAAOMQUAAAgHEIKAAAwDgEFAAAYBwCCgAAMA4BBQAAGIeAAgAAjENAAQAAxiGgAAAA4xBQAACAcQgoAADAOAQUAABgHAIKAAAwDgEFAAAYh4ACAACMQ0ABAADGIaAAAADjEFAAAIBxCCgAAMA4BBQAAGAcAgoAADAOAQUAABiHgAIAANwfUHbs2CHDhw+X2NhY8Xg8smbNmoDlY8aM0fMvne65556Adb777jt55JFHJDIyUpo2bSrjxo2Tc+fOXfurAQAAwRlQCgsLpVu3bpKZmVnlOiqQHD9+vGx68803A5arcHLgwAHJzs6WtWvX6tAzYcKEmr0CAABQ74RV9weGDBmip8vxer0SExNT6bLPP/9cNmzYIB999JH07NlTz1u4cKEMHTpU5s2bp4/MAACA4FbtgHI1tm3bJtHR0dKsWTMZMGCAvPjii9K8eXO9LCcnR5/W8YcTZdCgQRISEiJ79uyR++67r0J7Pp9PT34FBQX6sbi4WE928rfnDbFsbffStvFDLajJ/3hDLWfa/f9+7ER/dpLb+gX9mTrXN8UO7aOr057tAUWd3rn//vulTZs2cvDgQXnuuef0ERcVTEJDQ+XEiRM6vARsRFiYREVF
6WWVmT17tmRkZFSYv2nTJomIiBAnzOpZanub69evt71Nt1On+SAyt5ezVXCiPzvJre8V+jN1rm+ybd5HFxUV1V1Aeeihh8r+v0uXLtK1a1dp166dPqoycODAGrU5depUSUtLCziCEhcXJ4MHD9YDbe1Od+oXMn1viPhKPba2vT89ydb23Mxf58TERAkPD5dg1zl9oyPtqiMnKpw40Z+d5Lb3Cv2ZOtc3xQ7to/1nQOrsFM+l2rZtKy1atJD8/HwdUNTYlFOnTgWsc/HiRX1lT1XjVtSYFjWVp4rm1Ieb2pn7SuzdofNBXHlNqIvY3tdqoz87ya19gv5MneubcJv30dVpy/GAcvToUfn222+lZcuW+nnv3r3lzJkzsm/fPunRo4eet3XrViktLZWEhASnNweAC8Q/u86xtg/PGeZY2wDsU+2Aou5Xoo6G+B06dEhyc3P1GBI1qbEiycnJ+miIGoPyzDPPyM033yxJSf87ZNupUyc9TmX8+PGyaNEifRhp0qRJ+tQQV/AAAIAa3Qdl79698pOf/ERPihobov5/xowZehDsJ598IiNGjJD27dvrG7CpoyQffPBBwCma5cuXS8eOHfUpH3V5cd++feW1117jNwIAAGp2BKVfv35iWVVfsrhx45UH+6kjLVlZWdX9pwEAQJDgu3gAAIBxCCgAAMA4BBQAAGAcAgoAADAOAQUAABiHgAIAAIxDQAEAAMYhoAAAAOMQUAAAgHEIKAAAwDgEFAAAYBwCCgAAMA4BBQAAGIeAAgAAjENAAQAAxiGgAAAA4xBQAACAcQgoAADAOAQUAABgHAIKAAAwDgEFAAAYh4ACAACMQ0ABAADGIaAAAADjEFAAAIBxCCgAAMA4BBQAAGAcAgoAADAOAQUAABiHgAIAAIxDQAEAAMYhoAAAAOOE1fUGAACAmol/dp0jpfOGWjK3l7jrCMqOHTtk+PDhEhsbKx6PR9asWVO2rLi4WKZMmSJdunSR6667Tq/zi1/8Qo4dOxbQRnx8vP7ZS6c5c+bY84oAAIDrVTugFBYWSrdu3SQzM7PCsqKiIvn4449l+vTp+nHVqlWSl5cnI0aMqLDuzJkz5fjx42XT5MmTa/4qAABAcJ/iGTJkiJ4q06RJE8nOzg6Y98c//lF69eolR44ckVatWpXNb9y4scTExNRkmwEAQD3n+BiUs2fP6lM4TZs2DZivTunMmjVLh5aUlBRJTU2VsLDKN8fn8+nJr6CgoOyUkprs5G/PG2LZ2u6lbeOHWlCTH873OsHfj53oz27lRJ+jP9cO6lz7+w6nPmOvhseyrBq/OhU8Vq9eLSNHjqx0+fnz56VPnz7SsWNHWb58edn8+fPny2233SZRUVGya9cumTp1qowdO1bPr0x6erpkZGRUmJ+VlSURERE13XwAAFCL1FAQdVBCHbyIjIysm4CiUlJycrIcPXpUtm3bdtkNWbJkiTz22GNy7tw58Xq9V3UEJS4uTk6fPn3FF1hdarvVaarpe0PEV+qxte396Um2tudm/jonJiZKeHi4BLvO6Rsd+ytoVs9SR/qzWznxPqQ/1w7qXPv7Drv30erzu0WLFlcVUMKc6kQPPPCAfP3117J169YrbkRCQoJcvHhRDh8+LB06dKiwXIWWyoKLKppTH25qZ+4rsXeHzgdx5TWhLmJ7X6uN/uxWTvY3+nPtoM4/cPp9bXetq9NWmFPh5Msvv5T3339fmjdvfsWfyc3NlZCQEImOjrZ7cwAAgAtVO6Co0zD5+fllzw8dOqQDhhpP0rJlS/nZz36mLzFeu3atlJSUyIkTJ/R6anmDBg0kJydH9uzZI/3799dX8qjnaoDso48+Ks2aNbP31QEAgOAIKHv37tXhwi8tLU0/jh49Wg9mfeedd/Tz7t27B/ycOprSr18/fapmxYoVel01rqRNmzY6oPjbAQAAqHZAUSHjcuNqrzTmVl29s3v3bioPAACqxJcFAgAA4xBQAACAcQgoAADAOAQUAABg
HAIKAAAwDgEFAAAYh4ACAACMQ0ABAADGIaAAAADjEFAAAIBxCCgAAMA4BBQAAGAcAgoAADAOAQUAABiHgAIAAIxDQAEAAMYhoAAAAOMQUAAAgHEIKAAAwDgEFAAAYBwCCgAAMA4BBQAAGIeAAgAAjENAAQAAxiGgAAAA4xBQAACAcQgoAADAOGF1vQEAUJvin11ne5veUEvm9rK9WSCocQQFAAAYh4ACAACMQ0ABAADGIaAAAADjMEgWAACXDcwOBtU+grJjxw4ZPny4xMbGisfjkTVr1gQstyxLZsyYIS1btpRGjRrJoEGD5MsvvwxY57vvvpNHHnlEIiMjpWnTpjJu3Dg5d+7ctb8aAAAQnAGlsLBQunXrJpmZmZUunzt3rvzhD3+QRYsWyZ49e+S6666TpKQkOX/+fNk6KpwcOHBAsrOzZe3atTr0TJgw4dpeCQAACN5TPEOGDNFTZdTRkwULFsi0adPk3nvv1fP+8pe/yI033qiPtDz00EPy+eefy4YNG+Sjjz6Snj176nUWLlwoQ4cOlXnz5ukjMwAAILjZOgbl0KFDcuLECX1ax69JkyaSkJAgOTk5OqCoR3Vaxx9OFLV+SEiIPuJy3333VWjX5/Ppya+goEA/FhcX68lO/va8IZat7V7aNn6oBTX54UZfTvD3Yyf6MyrWmf7sLLfuN5x6f7uxT1enPVsDigonijpicin13L9MPUZHRwduRFiYREVFla1T3uzZsyUjI6PC/E2bNklERIQ4YVbPUtvbXL9+ve1tup06zQdx/C6kTvRnVER/rh1uq7Ob7zKcbXOti4qK6tdVPFOnTpW0tLSAIyhxcXEyePBgPdDW7nSnfiHT94aIr9Rja9v705Nsbc/N/HVOTEyU8PBwCXad0zc69leQCidO9GdUrDP92Vlu3W849f52Y5/2nwGp9YASExOjH0+ePKmv4vFTz7t37162zqlTpwJ+7uLFi/rKHv/Pl+f1evVUniqaU51U7cx9Jfbu0N30hqotTv4O3cTuvlYb/RkV0Z9rh9vq7Ob3XrjNta5OW7beqK1NmzY6ZGzZsiUgLamxJb1799bP1eOZM2dk3759Zets3bpVSktL9VgVAACAah9BUfcryc/PDxgYm5ubq8eQtGrVSp566il58cUX5cc//rEOLNOnT9dX5owcOVKv36lTJ7nnnntk/Pjx+lJkdchu0qRJegAtV/AAAIAaBZS9e/dK//79y577x4aMHj1ali1bJs8884y+V4q6r4k6UtK3b199WXHDhg3Lfmb58uU6lAwcOFBfvZOcnKzvnQIAAFCjgNKvXz99v5OqqLvLzpw5U09VUUdbsrKy+A0AAIBK8WWBAADAOAQUAABgHAIKAAAwDgEFAAAYh4ACAACMQ0ABAADGIaAAAADjEFAAAIBxCCgAAMA4BBQAAGAcAgoAADAOAQUAABiHgAIAAIxDQAEAAMYhoAAAAOMQUAAAgHEIKAAAwDgEFAAAYBwCCgAAMA4BBQAAGIeAAgAAjENAAQAAxiGgAAAA4xBQAACAcQgoAADAOAQUAABgHAIKAAAwDgEFAAAYh4ACAACMQ0ABAADGIaAAAADjEFAAAIBxCCgAAKD+B5T4+HjxeDwVpokTJ+rl/fr1q7Ds8ccft3szAACAi4XZ3eBHH30kJSUlZc/3798viYmJ8vOf/7xs3vjx42XmzJllzyMiIuzeDAAA4GK2B5Qbbrgh4PmcOXOkXbt2cvfddwcEkpiYGLv/aQAAUE84OgblwoUL8sYbb8gvf/lLfSrHb/ny5dKiRQvp3LmzTJ06VYqKipzcDAAAEOxHUC61Zs0aOXPmjIwZM6ZsXkpKirRu3VpiY2Plk08+kSlTpkheXp6sWrWqynZ8Pp+e/AoKCvRjcXGxnuzkb88bYtna7qVt44daUJP/8YZazrT7//3Yif6MinWmPzvLrfsNp97fbuzT1WnPY1mWY5VLSkqSBg0ayLvvvlvlOlu3bpWBAwdKfn6+PhVU
mfT0dMnIyKgwPysri/ErAAC4hDpjog5UnD17ViIjI+smoHz99dfStm1bfWTk3nvvrXK9wsJCuf7662XDhg060FztEZS4uDg5ffr0FV9gTdJddna2TN8bIr7SH05L2WF/euWvLxj566wGUIeHh0uw65y+0bG/gmb1LHWkP6NinenP7t5vOPU+dCOvQ31afX6rIR5XE1AcO8WzdOlSiY6OlmHDhl12vdzcXP3YsmXLKtfxer16Kk8VzakPN7Uz95XYu0Png7jymlAXsb2v1UZ/RkX0Z3fXmfeI87WuTluOBJTS0lIdUEaPHi1hYT/8EwcPHtSnZYYOHSrNmzfXY1BSU1Plrrvukq5duzqxKQAAwIUcCSibN2+WI0eO6Kt3LqXGo6hlCxYs0Kd21Gma5ORkmTZtmhObAQAAXMqRgDJ48GCpbGiLCiTbt2934p8EAAD1CN/FAwAAjENAAQAAxiGgAAAA4xBQAACAcQgoAADAOAQUAABgHAIKAAAwDgEFAAAYh4ACAACMQ0ABAADGIaAAAADjEFAAAIBxCCgAAMA4BBQAAGAcAgoAADAOAQUAABiHgAIAAIxDQAEAAMYhoAAAAOMQUAAAgHEIKAAAwDgEFAAAYBwCCgAAMA4BBQAAGIeAAgAAjENAAQAAxiGgAAAA4xBQAACAcQgoAADAOAQUAABgHAIKAAAwDgEFAAAYh4ACAACMQ0ABAAD1P6Ckp6eLx+MJmDp27Fi2/Pz58zJx4kRp3ry5XH/99ZKcnCwnT560ezMAAICLOXIE5dZbb5Xjx4+XTTt37ixblpqaKu+++66sXLlStm/fLseOHZP777/fic0AAAAuFeZIo2FhEhMTU2H+2bNnZfHixZKVlSUDBgzQ85YuXSqdOnWS3bt3yx133OHE5gAAAJdxJKB8+eWXEhsbKw0bNpTevXvL7NmzpVWrVrJv3z4pLi6WQYMGla2rTv+oZTk5OVUGFJ/Ppye/goIC/ajaUpOd/O15Qyxb2720bfxQC2ryP95Qy5l2/78fO9GfUbHO9Gd37zeceh+6kdehPl2d9jyWZdn6G3nvvffk3Llz0qFDB316JyMjQ/71r3/J/v379amdsWPHBoQNpVevXtK/f3/57W9/W+W4FtVOeepITEREhJ2bDwAAHFJUVCQpKSn6jEpkZGTtBpTyzpw5I61bt5b58+dLo0aNahRQKjuCEhcXJ6dPn77iC6xJusvOzpbpe0PEV+qxte396Um2tudm/jonJiZKeHi4BLvO6Rsd+ytoVs9SR/ozKtaZ/uzu/YZT70M38jrUp9Xnd4sWLa4qoDhyiudSTZs2lfbt20t+fr5+oRcuXNChRc33U1fxVDZmxc/r9eqpPFU0pz7c1M7cV2LvDp0P4sprQl3E9r5WG/0ZFdGf3V1n3iPO17o6bTkeUNTpnoMHD8qoUaOkR48eeuO2bNmiLy9W8vLy5MiRI3qsCgAAV3OkgzBR/9keUH7961/L8OHD9WkddQnxCy+8IKGhofLwww9LkyZNZNy4cZKWliZRUVH68M7kyZN1OOEKHgAA4FhAOXr0qA4j3377rdxwww3St29ffQmx+n/l5ZdflpCQEH0ERY0rSUpKkldeecXuzQAAAC5me0BZsWLFZZerS48zMzP1BAAAUBm+iwcAABiHgAIAAIxDQAEAAMYhoAAAAOMQUAAAgHEIKAAAwDgEFAAAYBwCCgAAMA4BBQAAGIeAAgAAjENAAQAAxiGgAAAA4xBQAACAcQgoAADAOGF1vQEAgPon/tl1trfpDbVkbi/bm4WhOIICAACMQ0ABAADGIaAAAADjEFAAAIBxCCgAAMA4BBQAAGAcAgoAADAOAQUAABiHgAIAAIxDQAEAAMYhoAAAAOMQUAAAgHEIKAAAwDh8mzFQx9/QCgCoiCMoAADAOAQUAABgHAIKAAAwDgEFAADU/4Aye/Zsuf3226Vx48YSHR0tI0eOlLy8vIB1+vXrJx6PJ2B6/PHH7d4UAADgUrYH
lO3bt8vEiRNl9+7dkp2dLcXFxTJ48GApLCwMWG/8+PFy/Pjxsmnu3Ll2bwoAAHAp2y8z3rBhQ8DzZcuW6SMp+/btk7vuuqtsfkREhMTExNj9zwMAgHrA8fugnD17Vj9GRUUFzF++fLm88cYbOqQMHz5cpk+frkNLZXw+n578CgoK9KM6OqMmO/nb84ZYtrZ7adv4oRZuq4k31P5+4SR/P3aiP6Nind3Wn932XqE/u79PV6c9j2VZju25SktLZcSIEXLmzBnZuXNn2fzXXntNWrduLbGxsfLJJ5/IlClTpFevXrJq1apK20lPT5eMjIwK87OysqoMNQAAwCxFRUWSkpKiD15ERkbWXUB54okn5L333tPh5Kabbqpyva1bt8rAgQMlPz9f2rVrd1VHUOLi4uT06dNXfIE1SXdq7Mz0vSHiK/XY2vb+9CRb23Mzf50TExMlPDxc3KJz+kZx219Bs3qWOtKfUbHObuvPbnuv0J/d36fV53eLFi2uKqA4dopn0qRJsnbtWtmxY8dlw4mSkJCgH6sKKF6vV0/lqaI5tTNQO3Nfib07dHZcldfETXWxu0/UFif6M9zfn53kZH+jP7u3T1enLdsDijogM3nyZFm9erVs27ZN2rRpc8Wfyc3N1Y8tW7a0e3MAAIAL2R5Q1CXGamzI22+/re+FcuLECT2/SZMm0qhRIzl48KBePnToUGnevLkeg5Kamqqv8OnatavdmwMAAFzI9oDy6quvlt2M7VJLly6VMWPGSIMGDWTz5s2yYMECfW8UNZYkOTlZpk2bZvemAAAAl3LkFM/lqECibuYGAABQFb6LBwAAGIeAAgAAjENAAQAAxiGgAAAA4xBQAACAcQgoAADAOAQUAABgHAIKAAAwDgEFAAAYx7FvMwYAmC3+2XV1vQlAlTiCAgAAjENAAQAAxiGgAAAA4xBQAACAcQgoAADAOAQUAABgHAIKAAAwDgEFAAAYh4ACAACMQ0ABAADGIaAAAADjEFAAAIBxCCgAAMA4BBQAAGAcAgoAADAOAQUAABiHgAIAAIwTVtcbAAC4vPhn11EiBB2OoAAAAOMQUAAAgHEIKAAAwDgEFAAAYBwCCgAAME6dBpTMzEyJj4+Xhg0bSkJCgnz44Yd1uTkAACDYA8pbb70laWlp8sILL8jHH38s3bp1k6SkJDl16lRdbRIAAAj2gDJ//nwZP368jB07Vm655RZZtGiRREREyJIlS+pqkwAAQDDfqO3ChQuyb98+mTp1atm8kJAQGTRokOTk5FRY3+fz6cnv7Nmz+vG7776T4uJiW7dNtVdUVCRhxSFSUuqxte1vv/3W1vbczF9nVZPw8HBxi7CLheImYaWWFBWVOtKfUbHO3Z9fJT4H6swdNQPrTH+uvVrbvY/+/vvv9aNlWVfeBqkDp0+flpKSErnxxhsD5qvnX3zxRYX1Z8+eLRkZGRXmt2nTRtykxe/regsQjFLqegOCBHWmzvVNioNtq6DSpEkT9wdzdaRFjVfxKy0t1UdPmjdvLh6PvX+tFBQUSFxcnHzzzTcSGRlpa9ugzrWN/kyd6xP6s/trrY6cqHASGxt7xXXrJKC0aNFCQkND5eTJkwHz1fOYmJgK63u9Xj1dqmnTpo5uo/qFEFCcR51rB3WmzvUJ/dndtb7SkZM6HSTboEED6dGjh2zZsiXgqIh63rt377rYJAAAYJA6O8WjTtmMHj1aevbsKb169ZIFCxZIYWGhvqoHAAAEtzoLKA8++KD8+9//lhkzZsiJEyeke/fusmHDhgoDZ2ubOpWk7s1S/pQSqLMb0Z+pc31Cfw6uWnusq7nWBwAAoBbxXTwAAMA4BBQAAGAcAgoAADAOAQUAABgnKANKZmamxMfHS8OGDSUhIUE+/PDDy66/cuVK6dixo16/S5cusn79+lrb1mCp85/+9Cf56U9/Ks2aNdOT+l6mK/1eUP06X2rFihX6TswjR46klDb3Z+XMmTMyceJEadmypb4Son379uw7
HKizukVFhw4dpFGjRvrOp6mpqXL+/Hn69GXs2LFDhg8fru/mqvYBa9askSvZtm2b3Hbbbbov33zzzbJs2TJxnBVkVqxYYTVo0MBasmSJdeDAAWv8+PFW06ZNrZMnT1a6/t///ncrNDTUmjt3rvXZZ59Z06ZNs8LDw61PP/201re9Ptc5JSXFyszMtP7xj39Yn3/+uTVmzBirSZMm1tGjR2t92+tznf0OHTpk/ehHP7J++tOfWvfee2+tbW+w1Nnn81k9e/a0hg4dau3cuVPXe9u2bVZubm6tb3t9rvPy5cstr9erH1WNN27caLVs2dJKTU2t9W13k/Xr11vPP/+8tWrVKnUVr7V69erLrv/VV19ZERERVlpamv4cXLhwof5c3LBhg6PbGXQBpVevXtbEiRPLnpeUlFixsbHW7NmzK13/gQcesIYNGxYwLyEhwXrssccc39ZgqnN5Fy9etBo3bmy9/vrrDm5lcNZZ1fbOO++0/vznP1ujR48moDhQ51dffdVq27atdeHCher9QoNcdeus1h0wYEDAPPUh2qdPH8e3tb6QqwgozzzzjHXrrbcGzHvwwQetpKQkR7ctqE7xXLhwQfbt26dPH/iFhITo5zk5OZX+jJp/6fpKUlJSleujZnUur6ioSIqLiyUqKoqS2tiflZkzZ0p0dLSMGzeO2jpU53feeUd/bYc6xaNuPtm5c2f5zW9+o7/FHfbV+c4779Q/4z8N9NVXX+nTaEOHDqXMNqqrz0FXfJuxXU6fPq13EOXvVquef/HFF5X+jLrLbWXrq/mwr87lTZkyRZ8fLf+mwLXVeefOnbJ48WLJzc2llA7WWX1Qbt26VR555BH9gZmfny9PPvmkDt3q7pywp84pKSn65/r27au/JffixYvy+OOPy3PPPUeJbVTV56D6xuP//ve/evyPE4LqCArcYc6cOXoA5+rVq/VAOdhDfcX5qFGj9IBk9Y3icI768lN1lOq1117TX4yqvtrj+eefl0WLFlF2G6mBm+rI1CuvvCIff/yxrFq1StatWyezZs2izvVAUB1BUTvl0NBQOXnyZMB89TwmJqbSn1Hzq7M+alZnv3nz5umAsnnzZunatSvltLE/Hzx4UA4fPqxH71/6QaqEhYVJXl6etGvXjppfY50VdeVOeHi4/jm/Tp066b9E1akM9Y3uuPY6T58+XYfuX/3qV/q5uspSfenshAkTdCBUp4hw7ar6HIyMjHTs6IkSVL89tVNQf81s2bIlYAetnqvzxZVR8y9dX8nOzq5yfdSszsrcuXP1Xz7qSyPVt1zD3v6sLpX/9NNP9ekd/zRixAjp37+//n91iSauvc5Knz599GkdfwBU/vnPf+rgQjixpz/7x6qVDyH+UMjXzNmnzj4HrSC8jE1dlrZs2TJ9udSECRP0ZWwnTpzQy0eNGmU9++yzAZcZh4WFWfPmzdOXv77wwgtcZuxAnefMmaMvL/zb3/5mHT9+vGz6/vvv7e8EQVzn8riKx5k6HzlyRF+FNmnSJCsvL89au3atFR0dbb344ovX+Buv36pbZ7U/VnV+88039aWwmzZtstq1a6evvkTV1H5V3dJBTSoGzJ8/X///119/rZerGqtal7/M+Omnn9afg+qWEFxm7BB1DXerVq30B6K6rG337t1ly+6++269077UX//6V6t9+/Z6fXWp1bp165zatKCtc+vWrfUbpfykdkCwr87lEVCc6c/Krl279C0J1AeuuuT4pZde0pd4w746FxcXW+np6TqUNGzY0IqLi7OefPJJ6z//+Q9lvoz333+/0v2tv7bqUdW6/M90795d/15Uf166dKnlNI/6j7PHaAAAAKonqMagAAAAdyCgAAAA4xBQAACAcQgoAADAOAQUAABgHAIKAAAwDgEFAAAYh4ACAACMQ0ABAADGIaAAAADjEFAAAIBxCCgAAEBM83/rYGLijZs3IwAAAABJRU5ErkJggg==", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "pj['score'].hist(bins=20);" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "c101439b-f429-4529-9831-d182922d95d6", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAigAAAGdCAYAAAA44ojeAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjcsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvTLEjVAAAAAlwSFlzAAAPYQAAD2EBqD+naQAAHSFJREFUeJzt3QuMVOX5P/B3d8FFVECwCFQU1KooKhYKovirIrKKwRtJtRqrhkrqLRFS7zdQK5QYNTWosVVpEy+NjdoWKIJYtVS8YU2912vVyqVqEZSyLOz8c86/u3GRyy7uMM+wn09yHGbmzNl353HmfPc973tORaFQKCQAgEAqS90AAIB1CSgAQDgCCgAQjoACAIQjoAAA4QgoAEA4AgoAEI6AAgCE0y6Vofr6+vTxxx+nHXbYIVVUVJS6OQBAM2Tnhl2xYkXq1atXqqys3PoCShZOevfuXepmAACb4cMPP0y77LLL1hdQsp6Thl+wU6dOKZK6uro0Z86cNHLkyNS+fftSN4dNUK/yo2blR83KT12R9mXLly/POxga9uNbXUBpOKyThZOIAaVjx455uwSU+NSr/KhZ+VGz8lNX5H1Zc4ZnGCQLAIQjoAAA4QgoAEA4AgoAEI6AAgCEI6AAAOEIKABAOAIKABCOgAIAhCOgAADhCCgAQDgCCgAQjoACAIQjoAAA4bQrdQNou/pcOrMo231/yrFF2S4AW44eFAAgHAEFAAhHQAEAwhFQAIBwBBQAIBwBBQAIR0ABAMIRUACAcAQUACAcAQUACEdAAQDCEVAAgHAEFAAgHAEFAAhHQAEAwhFQAIBwBBQAIBwBBQAIR0ABAMIRUACAcAQUACAcAQUACEdAAQDCEVAAgHAEFAAgHAEFAAhHQAEAwhFQAIBwBBQAIBwBBQAIR0ABAMIRUACAcAQUACAcAQUACEdAAQDCEVAAgHAEFAAgHAEFAAhHQAEAwhFQAIBwBBQAIBwBBQAIR0ABAMIRUACA8g4okydPTt/73vfSDjvskLp3755OOOGE9OabbzZZZ9WqVem8885L3bp1S9tvv30aM2ZMWrJkSZN1Pvjgg3Tsscemjh075tu56KKL0po1a1rnNwIAyl67lqz85JNP5uEjCylZoLj88svTyJEj02uvvZa22267fJ3x48enmTNnpgcffDB17tw5nX/++emkk05Kf/3rX/Pn165dm4eTHj16pKeffjotWrQo/ehHP0rt27dPN9xwQ3F+S9qUPpfObPa61VWFNHVwSv0nPppq11Zscv33pxz7DVsHQKsHlNmzZze5P3369LwHZOHChen//u//0ueff57uuuuudN9996Xhw4fn69xzzz2pX79+6ZlnnkkHH3xwmjNnTh5oHnvssbTzzjunAQMGpOuuuy5dcsklaeLEiWmbbbZpSZMAgLYeUNaVBZJM165d89ssqNTV1aURI0Y0rrPPPvukXXfdNS1YsCAPKNnt/vvvn4eTBjU1Nemcc85Jr776ajrooIO+9nNqa2vzpcHy5cvz2+xnZUskDe2J1q6Ist6LUquuLDS53RR1LT2fsfKjZuWnrkj7spZsb7MDSn19fbrwwgvToYcemvr3758/tnjx4rwHpEuXLk3WzcJI9lzDOl8NJw3PNzy3obEvkyZN+trjWW9MNo4lorlz55a6CeFlh1aiuG5QfbPWmzVrVtHbQvP4jJUfNSs/c1t5X7Zy5cri
B5RsLMorr7yS5s+fn4rtsssuSxMmTGjSg9K7d+98/EunTp1SJFk6zAp61FFH5eNq2LBs3EepZT0nWTi56oXKVFu/6TEor0ys2SLtYsN8xsqPmpWfuiLtyxqOgBQtoGQDX2fMmJGeeuqptMsuuzQ+ng18Xb16dVq2bFmTXpRsFk/2XMM6zz33XJPtNczyaVhnXdXV1fmyruxNixoCIrctiuYMSt1SsnDSnPaoaRw+Y+VHzcpP+1bel7VkWy2aZlwoFPJw8vDDD6fHH3889e3bt8nzAwcOzH/4vHnzGh/LpiFn04qHDh2a389uX3755bR06dLGdbKUlvWE7Lvvvi1pDgCwlWrX0sM62Qyd3//+9/m5UBrGjGTTibfddtv8duzYsfnhmGzgbBY6LrjggjyUZANkM9lhmSyInH766Wnq1Kn5Nq688sp82+vrJQEA2p4WBZTbb789vz388MObPJ5NJT7zzDPzf998882psrIyP0FbNvMmm6Fz2223Na5bVVWVHx7KZu1kwSU7f8oZZ5yRrr322tb5jQCAthVQskM8m9KhQ4c0bdq0fNmQ3XbbzWwIAGCDXIsHAAhHQAEAwhFQAIBwBBQAIBwBBQAIR0ABAMIRUACAcAQUACAcAQUACEdAAQDCEVAAgHAEFAAgHAEFAAhHQAEAwhFQAIBwBBQAIBwBBQAIR0ABAMIRUACAcAQUACAcAQUACEdAAQDCEVAAgHAEFAAgHAEFAAhHQAEAwhFQAIBwBBQAIBwBBQAIR0ABAMIRUACAcAQUACAcAQUACEdAAQDCEVAAgHAEFAAgHAEFAAhHQAEAwhFQAIBwBBQAIBwBBQAIR0ABAMIRUACAcAQUACAcAQUACEdAAQDCEVAAgHAEFAAgHAEFAAhHQAEAwhFQAIBwBBQAIBwBBQAIR0ABAMIRUACAcAQUACAcAQUACEdAAQDCEVAAgHAEFAAgHAEFAAhHQAEAwhFQAIBwBBQAIBwBBQAo/4Dy1FNPpdGjR6devXqlioqK9MgjjzR5/swzz8wf/+py9NFHN1nns88+S6eddlrq1KlT6tKlSxo7dmz64osvvvlvAwC0zYDy5ZdfpgMPPDBNmzZtg+tkgWTRokWNy/3339/k+SycvPrqq2nu3LlpxowZeegZN27c5v0GAMBWp11LX3DMMcfky8ZUV1enHj16rPe5119/Pc2ePTs9//zzadCgQfljt956axo1alS68cYb854ZAKBta3FAaY4nnngide/ePe24445p+PDh6frrr0/dunXLn1uwYEF+WKchnGRGjBiRKisr07PPPptOPPHEr22vtrY2XxosX748v62rq8uXSBraE61dEVVXFUrdhFRdWWhyuynqWno+Y+VHzcpPXZH2ZS3ZXqsHlOzwzkknnZT69u2b3nnnnXT55ZfnPS5ZMKmqqkqLFy/Ow0uTRrRrl7p27Zo/tz6TJ09OkyZN+trjc+bMSR07dkwRZYev2Lipg+O8Q9cNqm/WerNmzSp6W2gen7Hyo2blZ24r78tWrlxZuoByyimnNP57//33TwcccEDaY4898l6VI488crO2edlll6UJEyY06UHp3bt3GjlyZD7QNpIsHWYFPeqoo1L79u1L3ZzQ+k98tNRNyHtOsnBy1QuVqba+YpPrvzKxZou0iw3zGSs/alZ+6oq0L2s4AlKyQzxftfvuu6eddtopvf3223lAycamLF26tMk6a9asyWf2bGjcSjamJVvWlb1pUUNA5LZFUbt204FgS8nCSXPao6Zx+IyVHzUrP+1beV/Wkm0V/TwoH330Ufr0009Tz5498/tDhw5Ny5YtSwsXLmxc5/HHH0/19fVpyJAhxW4OAFAGWtyDkp2vJOsNafDee++ll156KR9Dki3ZWJExY8bkvSHZGJSLL7447bnnnqmm5v93jffr1y8fp3L22WenO+64I+9GOv/88/NDQ2bwAACb1YPywgsvpIMOOihfMtnYkOzfV199dT4I9u9//3s67rjj
0l577ZWfgG3gwIHpL3/5S5NDNPfee2/aZ5998kM+2fTiYcOGpTvvvFNFAIDN60E5/PDDU6Gw4SmZjz666YGPWU/Lfffd19IfDQC0EUUfJAtbkz6Xzkzl5v0px5a6CQAt5mKBAEA4AgoAEI6AAgCEI6AAAOEIKABAOAIKABCOgAIAhCOgAADhCCgAQDgCCgAQjoACAIQjoAAA4QgoAEA4AgoAEI6AAgCEI6AAAOEIKABAOAIKABCOgAIAhCOgAADhCCgAQDgCCgAQjoACAIQjoAAA4QgoAEA4AgoAEI6AAgCEI6AAAOEIKABAOAIKABCOgAIAhCOgAADhCCgAQDgCCgAQjoACAIQjoAAA4QgoAEA4AgoAEI6AAgCEI6AAAOEIKABAOAIKABCOgAIAhCOgAADhCCgAQDgCCgAQTrtSNwAA2Dx9Lp1ZlLeuuqqQpg5OJaUHBQAIR0ABAMIRUACAcAQUACAcAQUACEdAAQDCEVAAgHAEFAAgHAEFAAhHQAEAwhFQAIBwBBQAIBwBBQAIR0ABAMIRUACAcAQUAKD8A8pTTz2VRo8enXr16pUqKirSI4880uT5QqGQrr766tSzZ8+07bbbphEjRqS33nqryTqfffZZOu2001KnTp1Sly5d0tixY9MXX3zxzX8bAKBtBpQvv/wyHXjggWnatGnrfX7q1KnpF7/4RbrjjjvSs88+m7bbbrtUU1OTVq1a1bhOFk5effXVNHfu3DRjxow89IwbN+6b/SYAwFajXUtfcMwxx+TL+mS9J7fccku68sor0/HHH58/9pvf/CbtvPPOeU/LKaeckl5//fU0e/bs9Pzzz6dBgwbl69x6661p1KhR6cYbb8x7ZgCAtq3FAWVj3nvvvbR48eL8sE6Dzp07pyFDhqQFCxbkASW7zQ7rNISTTLZ+ZWVl3uNy4okntmaToM3rc+nMor0H7085ts2/v0AZBJQsnGSyHpOvyu43PJfddu/evWkj2rVLXbt2bVxnXbW1tfnSYPny5fltXV1dvkTS0J5o7YqouqpQ6iak6spCk1taphT/n/uMlR81K7/v0er/fSe29me8Jdtr1YBSLJMnT06TJk362uNz5sxJHTt2TBFl42vYuKmD47xD1w2qL3UTytKsWbNK9rN9xsqPmpXf9+jcVt6XrVy5sjQBpUePHvntkiVL8lk8DbL7AwYMaFxn6dKlTV63Zs2afGZPw+vXddlll6UJEyY06UHp3bt3GjlyZD4TKJIsHWYFPeqoo1L79u1L3ZzQ+k98tNRNyP9KyMLJVS9Uptr6ilI3p+y8MrFmi/9Mn7Hyo2bl9z1a/b/vxtbelzUcAdniAaVv3755yJg3b15jIMkak40tOeecc/L7Q4cOTcuWLUsLFy5MAwcOzB97/PHHU319fT5WZX2qq6vzZV3ZmxY1BERuWxS1a+MEgiycRGpPuSjl/+M+Y+VHzVpfbZG/t1q7Zi3ZVosDSna+krfffrvJwNiXXnopH0Oy6667pgsvvDBdf/316Tvf+U4eWK666qp8Zs4JJ5yQr9+vX7909NFHp7PPPjufipwl6/PPPz8fQGsGDwCwWQHlhRdeSEcccUTj/YZDL2eccUaaPn16uvjii/NzpWTnNcl6SoYNG5ZPK+7QoUPja+699948lBx55JH57J0xY8bk504BANisgHL44Yfn5zvZkOzsstdee22+bEjW23LfffepAACwXq7FAwCEI6AAAOEIKABAOAIKABCOgAIAhCOgAADhCCgAQDgCCgAQjoACAIQjoAAA4QgoAEA4AgoAEI6AAgCEI6AAAOEIKABAOAIKABCOgAIAhCOgAADhtCt1A4Dy1efSmUXZ7vtTji3KdoHyoQcFAAhHQAEAwnGIh5J04QPAxuhBAQDCEVAAgHAEFAAgHAEFAAhHQAEAwhFQAIBwBBQAIBwBBQAIR0ABAMIRUACAcAQUACAcAQUACEdAAQDCEVAAgHAEFAAgHAEFAAhHQAEAwhFQAIBwBBQA
IBwBBQAIR0ABAMIRUACAcAQUACAcAQUACEdAAQDCEVAAgHDalboBALA163PpzFI3oSzpQQEAwhFQAIBwBBQAIBwBBQAIR0ABAMIRUACAcAQUACAcAQUACMeJ2gDACdXC0YMCAIQjoAAA4QgoAEA4AgoAEI6AAgCEI6AAAOEIKADA1h9QJk6cmCoqKpos++yzT+Pzq1atSuedd17q1q1b2n777dOYMWPSkiVLWrsZAEAZK0oPyn777ZcWLVrUuMyfP7/xufHjx6c//vGP6cEHH0xPPvlk+vjjj9NJJ51UjGYAAGWqKGeSbdeuXerRo8fXHv/888/TXXfdle677740fPjw/LF77rkn9evXLz3zzDPp4IMPLkZzAIAyU5SA8tZbb6VevXqlDh06pKFDh6bJkyenXXfdNS1cuDDV1dWlESNGNK6bHf7JnluwYMEGA0ptbW2+NFi+fHl+m20rWyJpaE+0dm2u6qpC2ppVVxaa3BLDxj4/W9tnrC0ol5pt7d93LdHwndjaNWvJ9ioKhUKrVuRPf/pT+uKLL9Lee++dH96ZNGlS+te//pVeeeWV/NDOWWed1SRsZAYPHpyOOOKI9POf/3yD41qy7awr64np2LFjazYfACiSlStXplNPPTU/otKpU6ctG1DWtWzZsrTbbrulm266KW277babFVDW14PSu3fv9Mknn2zyF9zSsnQ4d+7cdNRRR6X27dunctd/4qNpa/8r4bpB9emqFypTbX1FqZvD/7wysabNfMbagnKp2db+fbc5342tXbNs/73TTjs1K6AU/WrGXbp0SXvttVd6++2381909erVeWjJHm+QzeJZ35iVBtXV1fmyruxNi/o/e+S2tUTt2rax087CSVv5XctBcz47W8tnrC2JXjPfAcWvWUu2VfTzoGSHe955553Us2fPNHDgwLxx8+bNa3z+zTffTB988EE+VgUAoCg9KD/96U/T6NGj88M62RTia665JlVVVaUf/vCHqXPnzmns2LFpwoQJqWvXrnn3zgUXXJCHEzN4AICiBZSPPvooDyOffvpp+ta3vpWGDRuWTyHO/p25+eabU2VlZX6CtmxcSU1NTbrttttauxkAQBlr9YDywAMPbPT5bOrxtGnT8gUAYH1ciwcACEdAAQDCEVAAgHAEFAAgHAEFAAhHQAEAwhFQAIBwBBQAIBwBBQAIR0ABAMIRUACAcAQUACAcAQUACEdAAQDCEVAAgHAEFAAgHAEFAAhHQAEAwhFQAIBw2pW6AW1Jn0tnFm3b7085tmjbBoAtTQ8KABCOgAIAhOMQz1aimIePAGBL04MCAIQjoAAA4TjEA0DZcDi77dCDAgCEowcFKKu/kqurCmnq4JT6T3w01a6taPG2nTMIyoMeFAAgHAEFAAjHIR4AinqY7pselqNt0oMCAIQjoAAA4QgoAEA4AgoAEI6AAgCEI6AAAOEIKABAOAIKABCOgAIAhCOgAADhCCgAQDgCCgAQjosFArRRX72gH0SjBwUACEdAAQDCEVAAgHAEFAAgHAEFAAhHQAEAwhFQAIBwnAellc8NUF1VSFMHp9R/4qOpdm3FN6kNALRZelAAgHAEFAAgHId4AIKfNv79KccWbdsQlR4UACAcPSgAwbmoH22RHhQAIBwBBQAIR0ABAMIRUACAcAQUACAcAQUACMc0Y6BNMWUXyoMeFAAgnJIGlGnTpqU+ffqkDh06pCFDhqTnnnuulM0BANp6QPntb3+bJkyYkK655pr04osvpgMPPDDV1NSkpUuXlqpJAEBbDyg33XRTOvvss9NZZ52V9t1333THHXekjh07prvvvrtUTQIA2vIg2dWrV6eFCxemyy67rPGxysrKNGLEiLRgwYKvrV9bW5svDT7//PP89rPPPkt1dXWt3r52a77c/NfWF9LKlfWpXV1lWltf0artovWpV/lRs/KjZuVbs08//TS1b9++1ba7YsWK/LZQKGy6DakEPvnkk7R27dq08847N3k8u//G
G298bf3JkyenSZMmfe3xvn37pohOLXUDaBH1Kj9qVn7UrPycWsRtZ0Glc+fO5T/NOOtpycarNKivr897T7p165YqKmL1Uixfvjz17t07ffjhh6lTp06lbg6boF7lR83Kj5qVn+VF2pdlPSdZOOnVq9cm1y1JQNlpp51SVVVVWrJkSZPHs/s9evT42vrV1dX58lVdunRJkWUFFVDKh3qVHzUrP2pWfjoVYV+2qZ6Tkg6S3WabbdLAgQPTvHnzmvSKZPeHDh1aiiYBAIGU7BBPdsjmjDPOSIMGDUqDBw9Ot9xyS/ryyy/zWT0AQNtWsoBy8sknp3//+9/p6quvTosXL04DBgxIs2fP/trA2XKTHYrKzu2y7iEpYlKv8qNm5UfNyk91gH1ZRaE5c30AALYg1+IBAMIRUACAcAQUACAcAQUACEdAaaFp06alPn36pA4dOqQhQ4ak5557boPr/vKXv0yHHXZY2nHHHfMlu9bQxtan9DX7qgceeCA/U/EJJ5ygNMFrtmzZsnTeeeelnj175rMO9tprrzRr1qwt1l5aXrPs1BJ777132nbbbfMzlo4fPz6tWrXKW7kFPPXUU2n06NH52Vyz77hHHnlkk6954okn0ne/+93887Xnnnum6dOnF7+h2SwemueBBx4obLPNNoW777678OqrrxbOPvvsQpcuXQpLlixZ7/qnnnpqYdq0aYW//e1vhddff71w5plnFjp37lz46KOPvOVBa9bgvffeK3z7298uHHbYYYXjjz9evQLXrLa2tjBo0KDCqFGjCvPnz89r98QTTxReeukldQtas3vvvbdQXV2d32b1evTRRws9e/YsjB8/Xs22gFmzZhWuuOKKwkMPPZTN4i08/PDDG13/3XffLXTs2LEwYcKEwmuvvVa49dZbC1VVVYXZs2cXtZ0CSgsMHjy4cN555zXeX7t2baFXr16FyZMnN+v1a9asKeywww6FX//61y2vFFusZlmdDjnkkMKvfvWrwhlnnCGgBK/Z7bffXth9990Lq1ev3oKt5JvULFt3+PDhTR7Ldn6HHnqoN3YLS80IKBdffHFhv/32a/LYySefXKipqSlq2xziaabVq1enhQsX5odpGlRWVub3FyxY0KxtrFy5MtXV1aWuXbtuXncXW6Rm1157berevXsaO3asd7wMavaHP/whv0RGdognO9Fj//790w033JBfMZ2YNTvkkEPy1zQcBnr33XfzQ3KjRo1SsoAWLFjQpL6ZmpqaZu/7NldZXM04gk8++ST/wlv3TLfZ/TfeeKNZ27jkkkvyY37rFpo4NZs/f36666670ksvvaQsZVKzbOf2+OOPp9NOOy3fyb399tvp3HPPzf8YyM6ESbyanXrqqfnrhg0bll/dds2aNeknP/lJuvzyy5UroMWLF6+3vtkVj//73//m44iKQQ/KFjJlypR80OXDDz+cDyIjnuwS4Keffno+uDm74jblIbvQaNbjdeedd+YXIc0uo3HFFVekO+64o9RNYyMDLrNerttuuy29+OKL6aGHHkozZ85M1113nfeMRnpQminbYVVVVaUlS5Y0eTy736NHj42+9sYbb8wDymOPPZYOOOCA5v5ItnDN3nnnnfT+++/no9u/uvPLtGvXLr355ptpjz32UJdgn7Ns5k779u3z1zXo169f/ldfdvghu3o6sWp21VVX5X8M/PjHP87v77///vnFYseNG5eHy+wQEXFkdVxffTt16lS03pOM/wuaKfuSy/46mzdvXpOdV3Y/O/69IVOnTs3/KsguhJhduZm4Ndtnn33Syy+/nB/eaViOO+64dMQRR+T/zqZCEqtmmUMPPTQ/rNMQJjP/+Mc/8uAinMSsWTYeb90Q0hAwXR4unqFDhzapb2bu3Lkb3fe1iqIOwd0Kp9JlU+OmT5+eT7UaN25cPpVu8eLF+fOnn3564dJLL21cf8qUKfnUu9/97neFRYsWNS4rVqwo4W/RtrS0Zusyiyd+zT744IN8dtz5
559fePPNNwszZswodO/evXD99deXoPVtU0trds011+Q1u//++/MprHPmzCnssccehR/84Acl/C3ajhUrVuSnv8iWLAbcdNNN+b//+c9/5s9ntcpqtu4044suuig/ZUZ2+gzTjAPK5n/vuuuuefDIptY988wzjc99//vfz3doDXbbbbe8+Osu2YeTmDVbl4BSHjV7+umnC0OGDMl3ktmU45/97Gf5dHFi1qyurq4wceLEPJR06NCh0Lt378K5555b+M9//qNkW8Cf//zn9e6bGmqU3WY1W/c1AwYMyOubfcbuueeeorezIvtPcftoAABaxhgUACAcAQUACEdAAQDCEVAAgHAEFAAgHAEFAAhHQAEAwhFQAIBwBBQAIBwBBQAIR0ABAMIRUACAFM3/A299nSLZhZEWAAAAAElFTkSuQmCC", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "pt['score'].hist(bins=20);" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "d5fda75a-93fc-4d0d-827b-ef57c9cc2993", + "metadata": {}, + "outputs": [ + { + "name": "stdout", "output_type": "stream", "text": [ - "DEBUG:google.adk.scope.utils.similarity:Comparing 'run_async' and 'run_async'\n", - "DEBUG:google.adk.scope.utils.similarity:Both INSTANCE_METHOD. Using default weights: {'name': 0.3, 'member_of': 0.3, 'namespace': 0.15, 'parameters': 0.15, 'return_type': 0.1}\n", - "DEBUG:google.adk.scope.utils.similarity:Preliminary scores: {'name': 1.0, 'member_of': 1.0, 'namespace': 0.9714285714285714}\n", - "DEBUG:google.adk.scope.utils.similarity:Preliminary score: 0.7457, Early exit threshold: 0.6000\n", - "DEBUG:google.adk.scope.utils.similarity:Calculating parameter score for 6 vs 5 parameters\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'user_id': 1.0000 (name:1.00, type:1.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'session_id': 0.8357 (name:0.67, type:1.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'new_message': 0.3446 (name:0.49, type:0.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'state_delta': 0.3225 (name:0.65, type:0.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'run_config': 0.2869 (name:0.57, type:0.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'user_id': 0.8357 (name:0.67, type:1.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'session_id': 1.0000 (name:1.00, type:1.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'new_message': 0.3523 (name:0.50, type:0.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'state_delta': 0.2939 (name:0.59, type:0.00, 
opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'run_config': 0.2167 (name:0.43, type:0.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'invocation_id' vs 'user_id': 0.6033 (name:0.41, type:1.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'invocation_id' vs 'session_id': 0.6880 (name:0.58, type:1.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'invocation_id' vs 'new_message': 0.2226 (name:0.45, type:0.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'invocation_id' vs 'state_delta': 0.2950 (name:0.39, type:0.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'invocation_id' vs 'run_config': 0.3474 (name:0.49, type:0.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 'user_id': 0.2446 (name:0.49, type:0.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 'session_id': 0.2523 (name:0.50, type:0.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 'new_message': 0.9000 (name:1.00, type:1.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 'state_delta': 0.7879 (name:0.58, type:1.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 'run_config': 0.7621 (name:0.52, type:1.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'state_delta' vs 'user_id': 0.3225 (name:0.65, type:0.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'state_delta' vs 'session_id': 0.2939 (name:0.59, type:0.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'state_delta' vs 'new_message': 0.2879 (name:0.58, type:0.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'state_delta' vs 'state_delta': 0.6000 (name:1.00, type:0.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'state_delta' vs 'run_config': 0.2985 (name:0.40, type:0.00, 
opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'user_id': 0.2869 (name:0.57, type:0.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'session_id': 0.2167 (name:0.43, type:0.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'new_message': 0.6621 (name:0.52, type:1.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'state_delta': 0.6985 (name:0.40, type:1.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'run_config': 1.0000 (name:1.00, type:1.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Matrix matched total score: 4.5000, final parameter score: 0.8182\n", - "DEBUG:google.adk.scope.utils.similarity:Return type score: 0.3000 (type match: 0.0, async match: 1.0)\n", - "DEBUG:google.adk.scope.utils.similarity:Final scores including params & return: {'name': 1.0, 'member_of': 1.0, 'namespace': 0.9714285714285714, 'parameters': np.float64(0.8181818181818182), 'return_type': 0.3}\n", - "DEBUG:google.adk.scope.utils.similarity:Final weighted similarity score: 0.8984\n" + "218\n" ] }, { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
py_namespacepy_member_ofpy_namego_namespacego_member_ofgo_nametypescorematchconfidence
1399flows.llm_flowsNaNmerge_parallel_function_response_eventsinternal.utilsNaNFunctionResponsesfunction0.4991Falsehigh
263pluginsBasePluginon_model_error_callbackinternal.plugininternalPluginManagerRunOnModelErrorCallbackmethod0.4989Falsehigh
1213tools.openapi_tool.authNaNdict_to_auth_schemeinternal.typeutilNaNConvertToWithJSONSchemafunction0.4989Falsehigh
1288cli.pluginsReplayPlugin__init__pluginPluginNewconstructor0.4989Falsehigh
219pluginsHybridContentParserparsecmd.launcher.consoleConsoleLauncherParsemethod0.4983Falsehigh
\n", + "
" + ], "text/plain": [ - "np.float64(0.8984415584415584)" + " py_namespace py_member_of \\\n", + "1399 flows.llm_flows NaN \n", + "263 plugins BasePlugin \n", + "1213 tools.openapi_tool.auth NaN \n", + "1288 cli.plugins ReplayPlugin \n", + "219 plugins HybridContentParser \n", + "\n", + " py_name go_namespace \\\n", + "1399 merge_parallel_function_response_events internal.utils \n", + "263 on_model_error_callback internal.plugininternal \n", + "1213 dict_to_auth_scheme internal.typeutil \n", + "1288 __init__ plugin \n", + "219 parse cmd.launcher.console \n", + "\n", + " go_member_of go_name type score match \\\n", + "1399 NaN FunctionResponses function 0.4991 False \n", + "263 PluginManager RunOnModelErrorCallback method 0.4989 False \n", + "1213 NaN ConvertToWithJSONSchema function 0.4989 False \n", + "1288 Plugin New constructor 0.4989 False \n", + "219 ConsoleLauncher Parse method 0.4983 False \n", + "\n", + " confidence \n", + "1399 high \n", + "263 high \n", + "1213 high \n", + "1288 high \n", + "219 high " ] }, - "execution_count": 47, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "scorer.get_similarity_score(py_run_async, ts_run_async)" + "# PYTHON <-> GO:\n", + "# 0.6 <= GOOD\n", + "# 0.5 < AVERAGE <= 0.6\n", + "# BAD <= 0.5 \n", + "LOWER, UPPER = 0.4599, 0.4999\n", + "print(len(pg[(LOWER < pg['score']) & (pg['score'] < UPPER)]))\n", + "pg[(LOWER < pg['score']) & (pg['score'] < UPPER)].head(5)" ] }, { "cell_type": "code", - "execution_count": 48, - "id": "341ad037-4bdb-481a-8a45-b3e1a48f54bc", + "execution_count": 11, + "id": "dace4311-5318-4196-a982-3dafa6e39747", "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "DEBUG:google.adk.scope.utils.similarity:Comparing 'run' and 'run_async'\n", - "DEBUG:google.adk.scope.utils.similarity:Both INSTANCE_METHOD. 
Using default weights: {'name': 0.3, 'member_of': 0.3, 'namespace': 0.15, 'parameters': 0.15, 'return_type': 0.1}\n", - "DEBUG:google.adk.scope.utils.similarity:Preliminary scores: {'name': 0.8444444444444443, 'member_of': 1.0, 'namespace': 0.9714285714285714}\n", - "DEBUG:google.adk.scope.utils.similarity:Preliminary score: 0.6990, Early exit threshold: 0.6000\n", - "DEBUG:google.adk.scope.utils.similarity:Calculating parameter score for 4 vs 5 parameters\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'user_id': 1.0000 (name:1.00, type:1.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'session_id': 0.8357 (name:0.67, type:1.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'new_message': 0.3446 (name:0.49, type:0.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'state_delta': 0.3225 (name:0.65, type:0.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'run_config': 0.2869 (name:0.57, type:0.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'user_id': 0.8357 (name:0.67, type:1.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'session_id': 1.0000 (name:1.00, type:1.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'new_message': 0.3523 (name:0.50, type:0.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'state_delta': 0.2939 (name:0.59, type:0.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'run_config': 0.2167 (name:0.43, type:0.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 'user_id': 0.3446 (name:0.49, type:0.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 'session_id': 0.3523 (name:0.50, type:0.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 
'new_message': 1.0000 (name:1.00, type:1.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 'state_delta': 0.6879 (name:0.58, type:1.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 'run_config': 0.6621 (name:0.52, type:1.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'user_id': 0.2869 (name:0.57, type:0.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'session_id': 0.2167 (name:0.43, type:0.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'new_message': 0.6621 (name:0.52, type:1.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'state_delta': 0.6985 (name:0.40, type:1.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'run_config': 1.0000 (name:1.00, type:1.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Matrix matched total score: 4.0000, final parameter score: 0.8889\n", - "DEBUG:google.adk.scope.utils.similarity:Return type score: 0.7000 (type match: 1.0, async match: 0.0)\n", - "DEBUG:google.adk.scope.utils.similarity:Final scores including params & return: {'name': 0.8444444444444443, 'member_of': 1.0, 'namespace': 0.9714285714285714, 'parameters': np.float64(0.8888888888888888), 'return_type': 0.7}\n", - "DEBUG:google.adk.scope.utils.similarity:Final weighted similarity score: 0.9024\n" + "252\n" ] }, { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
py_namespacepy_member_ofpy_namejava_namespacejava_member_ofjava_nametypescorematchconfidence
1runnersRunnerrewind_asyncrunnerRunnerrunAsyncmethod0.5978Truelow
1071tools.computer_useComputerUseToolsetclosetools.mcpMcpAsyncToolsetclosemethod0.5975Truelow
1153tools.application_integration_tool.clientsConnectionsClientget_entity_schema_and_operationstools.applicationintegrationtoolsetConnectionsClientconvertJsonSchemaToOpenApiSchemamethod0.5970Truelow
594cliAdkWebServerget_session_tracewebAdkWebServersessionServicemethod0.5962Truelow
1416flows.llm_flowsBaseLlmFlowrun_asyncflows.llmflowsBaseLlmFlowrunmethod0.5959Truelow
\n", + "
" + ], "text/plain": [ - "np.float64(0.9023809523809523)" + " py_namespace py_member_of \\\n", + "1 runners Runner \n", + "1071 tools.computer_use ComputerUseToolset \n", + "1153 tools.application_integration_tool.clients ConnectionsClient \n", + "594 cli AdkWebServer \n", + "1416 flows.llm_flows BaseLlmFlow \n", + "\n", + " py_name java_namespace \\\n", + "1 rewind_async runner \n", + "1071 close tools.mcp \n", + "1153 get_entity_schema_and_operations tools.applicationintegrationtoolset \n", + "594 get_session_trace web \n", + "1416 run_async flows.llmflows \n", + "\n", + " java_member_of java_name type score \\\n", + "1 Runner runAsync method 0.5978 \n", + "1071 McpAsyncToolset close method 0.5975 \n", + "1153 ConnectionsClient convertJsonSchemaToOpenApiSchema method 0.5970 \n", + "594 AdkWebServer sessionService method 0.5962 \n", + "1416 BaseLlmFlow run method 0.5959 \n", + "\n", + " match confidence \n", + "1 True low \n", + "1071 True low \n", + "1153 True low \n", + "594 True low \n", + "1416 True low " ] }, - "execution_count": 48, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "scorer.get_similarity_score(py_run, ts_run_async)" + "# PYTHON <-> JAVA:\n", + "# 0.6 <= GOOD\n", + "# BAD <= 0.6\n", + "LOWER, UPPER = 0.4999, 0.5999\n", + "print(len(pj[(LOWER < pj['score']) & (pj['score'] < UPPER)]))\n", + "pj[(LOWER < pj['score']) & (pj['score'] < UPPER)].head(5)" ] }, { "cell_type": "code", - "execution_count": 49, - "id": "783ee277-0893-46e3-ae9c-72a8df82cb8d", + "execution_count": 12, + "id": "b71d2c93-6f89-4e1e-ad47-0cb0285d7182", "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "102\n" + ] + }, { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
py_namespacepy_member_ofpy_namets_namespacets_member_ofts_nametypescorematchconfidence
942tools.google_api_toolDocsToolset__init__tools.mcpMCPToolsetconstructorconstructor0.5564Truelow
762evaluationNaNget_all_tool_callstelemetryNaNtraceToolCallfunction0.5551Truelow
1280cli.pluginsRecordingsPlugin__init__pluginsLoggingPluginconstructorconstructor0.5527Truelow
1006tools.pubsubPubSubToolset__init__toolsBaseToolsetconstructorconstructor0.5524Truelow
943tools.google_api_toolGoogleApiTool__init__toolsGoogleSearchToolconstructorconstructor0.5521Truelow
\n", + "
" + ], "text/plain": [ - "original_name: \"run_async\"\n", - "normalized_name: \"run_async\"\n", - "description: \"Main entry method to run the agent in this runner.\\n\\n If event compaction is enabled in the App configuration, it will be\\n performed after all agent events for the current invocation have been\\n yielded. The async generator will only finish iterating after event\\n compaction is complete. However, this does not block new `run_async`\\n calls for subsequent user queries, which can be started concurrently.\"\n", - "member_of: \"Runner\"\n", - "normalized_member_of: \"runner\"\n", - "type: INSTANCE_METHOD\n", - "file_path: \"/Users/shahins/projects/adk/adk-python/src/google/adk/runners.py\"\n", - "namespace: \"runners\"\n", - "normalized_namespace: \"runners\"\n", - "parameters {\n", - " original_name: \"user_id\"\n", - " normalized_name: \"user_id\"\n", - " original_types: \"str\"\n", - " normalized_types: STRING\n", - " description: \"The user ID of the session.\"\n", - "}\n", - "parameters {\n", - " original_name: \"session_id\"\n", - " normalized_name: \"session_id\"\n", - " original_types: \"str\"\n", - " normalized_types: STRING\n", - " description: \"The session ID of the session.\"\n", - "}\n", - "parameters {\n", - " original_name: \"invocation_id\"\n", - " normalized_name: \"invocation_id\"\n", - " original_types: \"Optional[str]\"\n", - " normalized_types: STRING\n", - " description: \"The invocation ID of the session, set this to resume an interrupted invocation.\"\n", - " is_optional: true\n", - "}\n", - "parameters {\n", - " original_name: \"new_message\"\n", - " normalized_name: \"new_message\"\n", - " original_types: \"Optional[types.Content]\"\n", - " normalized_types: OBJECT\n", - " description: \"A new message to append to the session.\"\n", - " is_optional: true\n", - "}\n", - "parameters {\n", - " original_name: \"state_delta\"\n", - " normalized_name: \"state_delta\"\n", - " original_types: \"Optional[dict[str, Any]]\"\n", - " 
normalized_types: MAP\n", - " description: \"Optional state changes to apply to the session.\"\n", - " is_optional: true\n", - "}\n", - "parameters {\n", - " original_name: \"run_config\"\n", - " normalized_name: \"run_config\"\n", - " original_types: \"Optional[RunConfig]\"\n", - " normalized_types: OBJECT\n", - " description: \"The run config for the agent.\"\n", - " is_optional: true\n", - "}\n", - "original_return_types: \"AsyncGenerator[Event, None]\"\n", - "normalized_return_types: \"OBJECT\"\n", - "normalized_return_types: \"null\"\n", - "async: true" + " py_namespace py_member_of py_name \\\n", + "942 tools.google_api_tool DocsToolset __init__ \n", + "762 evaluation NaN get_all_tool_calls \n", + "1280 cli.plugins RecordingsPlugin __init__ \n", + "1006 tools.pubsub PubSubToolset __init__ \n", + "943 tools.google_api_tool GoogleApiTool __init__ \n", + "\n", + " ts_namespace ts_member_of ts_name type score \\\n", + "942 tools.mcp MCPToolset constructor constructor 0.5564 \n", + "762 telemetry NaN traceToolCall function 0.5551 \n", + "1280 plugins LoggingPlugin constructor constructor 0.5527 \n", + "1006 tools BaseToolset constructor constructor 0.5524 \n", + "943 tools GoogleSearchTool constructor constructor 0.5521 \n", + "\n", + " match confidence \n", + "942 True low \n", + "762 True low \n", + "1280 True low \n", + "1006 True low \n", + "943 True low " ] }, - "execution_count": 49, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "py_run_async" + "# PYTHON <-> TS:\n", + "# 0.7 <= GOOD\n", + "# 0.55 < AVERAGE <= 0.7\n", + "# BAD <= 0.55\n", + "LOWER, UPPER = 0.55, 0.7\n", + "print(len(pt[(LOWER < pt['score']) & (pt['score'] < UPPER)]))\n", + "pt[(LOWER < pt['score']) & (pt['score'] < UPPER)].tail(5)" ] }, { "cell_type": "code", - "execution_count": 50, - "id": "861b3583-49d8-4b29-9f39-144aabe82948", + "execution_count": 13, + "id": "f1e5b76d-22d2-432f-b3c6-5090b847cfe1", "metadata": {}, "outputs": [ { "data": { + 
"text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
score
matchconfidencetype
Falsehighconstructor377
function206
method604
Truehighconstructor10
function8
method46
lowconstructor19
function45
method111
\n", + "
" + ], "text/plain": [ - "original_name: \"run\"\n", - "normalized_name: \"run\"\n", - "description: \"Runs the agent.\\n\\n NOTE:\\n This sync interface is only for local testing and convenience purpose.\\n Consider using `run_async` for production usage.\\n\\n If event compaction is enabled in the App configuration, it will be\\n performed after all agent events for the current invocation have been\\n yielded. The generator will only finish iterating after event\\n compaction is complete.\"\n", - "member_of: \"Runner\"\n", - "normalized_member_of: \"runner\"\n", - "type: INSTANCE_METHOD\n", - "file_path: \"/Users/shahins/projects/adk/adk-python/src/google/adk/runners.py\"\n", - "namespace: \"runners\"\n", - "normalized_namespace: \"runners\"\n", - "parameters {\n", - " original_name: \"user_id\"\n", - " normalized_name: \"user_id\"\n", - " original_types: \"str\"\n", - " normalized_types: STRING\n", - " description: \"The user ID of the session.\"\n", - "}\n", - "parameters {\n", - " original_name: \"session_id\"\n", - " normalized_name: \"session_id\"\n", - " original_types: \"str\"\n", - " normalized_types: STRING\n", - " description: \"The session ID of the session.\"\n", - "}\n", - "parameters {\n", - " original_name: \"new_message\"\n", - " normalized_name: \"new_message\"\n", - " original_types: \"types.Content\"\n", - " normalized_types: OBJECT\n", - " description: \"A new message to append to the session.\"\n", - "}\n", - "parameters {\n", - " original_name: \"run_config\"\n", - " normalized_name: \"run_config\"\n", - " original_types: \"Optional[RunConfig]\"\n", - " normalized_types: OBJECT\n", - " description: \"The run config for the agent.\"\n", - " is_optional: true\n", - "}\n", - "original_return_types: \"Generator[Event, None, None]\"\n", - "normalized_return_types: \"OBJECT\"" + " score\n", + "match confidence type \n", + "False high constructor 377\n", + " function 206\n", + " method 604\n", + "True high constructor 10\n", + " function 8\n", + " 
method 46\n", + " low constructor 19\n", + " function 45\n", + " method 111" ] }, - "execution_count": 50, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "py_run" + "pg[['match', 'confidence', 'type', 'score']].groupby(['match', 'confidence', 'type']).count()" ] }, { "cell_type": "code", "execution_count": null, - "id": "941ec275-7405-404b-a31c-d91d4d3671bd", + "id": "5dc6f20d", "metadata": {}, "outputs": [], "source": [] diff --git a/pyproject.toml b/pyproject.toml index a18b27d..238f452 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,6 +22,7 @@ dependencies = [ "scipy", "numpy", "jellyfish", + "RapidFuzz", ] diff --git a/score.sh b/score.sh new file mode 100755 index 0000000..7a5f822 --- /dev/null +++ b/score.sh @@ -0,0 +1,13 @@ +#!/bin/bash +set -e + +# Resolve the project root +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +export PYTHONPATH="${SCRIPT_DIR}/src:${PYTHONPATH}" + +if [ "$#" -lt 2 ]; then + echo "Usage: $0 [options]" + exit 1 +fi + +python3 "${SCRIPT_DIR}/src/google/adk/scope/utils/score_features.py" "$@" diff --git a/src/google/adk/scope/extractors/converter_go.py b/src/google/adk/scope/extractors/converter_go.py index 2174aa3..3388291 100644 --- a/src/google/adk/scope/extractors/converter_go.py +++ b/src/google/adk/scope/extractors/converter_go.py @@ -19,6 +19,10 @@ class NodeProcessor: def __init__(self): self.normalizer = TypeNormalizer() + def __init__(self): + self.normalizer = TypeNormalizer() + # Mapping from struct name to list of (field_name, field_type, is_optional) + self._struct_definitions: dict[str, list[tuple[str, str, bool]]] = {} def process( self, @@ -57,6 +61,18 @@ def process( "New" ): feature_type = feature_pb2.Feature.Type.CONSTRUCTOR + # For constructors, try to infer member_of from the return type + # e.g. 
func NewAgent() *Agent -> member_of = Agent + original_returns, _ = self._extract_return_types(node) + if original_returns: + # Typically the first return value is the struct + ret_type = original_returns[0] + # access the struct name, e.g. *Agent -> Agent, mypkg.Agent -> Agent + # Similar logic to parameter flattening type extraction + clean_ret = ret_type.lstrip("*").split(".")[-1] + if clean_ret: + member_of = clean_ret + normalized_member_of = normalize_name(member_of) elif node.type == "method_elem": feature_type = feature_pb2.Feature.Type.INSTANCE_METHOD member_of = self._extract_interface_name(node) @@ -89,8 +105,64 @@ def process( if docstring: feature.description = docstring + if docstring: + feature.description = docstring + return feature + def register_struct(self, node: Node) -> None: + """Register a struct definition to allow parameter flattening.""" + # Find struct name from parent type_spec + parent = node.parent + # The query capture is on (type_spec name: ... type: (struct_type) @struct_body) + # So node is the struct_type node. Parent should be type_spec. + if not parent or parent.type != "type_spec": + return + + name_node = parent.child_by_field_name("name") + if not name_node: + return + + struct_name = name_node.text.decode("utf-8") + + # Parse fields + fields = [] + + # Iterating children to find field_declaration_list because child_by_field_name + # might be failing or the field name is different in this version of tree-sitter-go + field_list = None + for child in node.children: + if child.type == "field_declaration_list": + field_list = child + break + + if field_list: + for child in field_list.children: + if child.type == "field_declaration": + # Handle multiple names for same type e.g. 
A, B int + type_node = child.child_by_field_name("type") + if not type_node: + continue + + type_str = type_node.text.decode("utf-8") + + # Determine if optional + is_optional = False + if type_node.type == "pointer_type": + is_optional = True + + # field_declaration children names + # Loop through children to find all field_identifier nodes + field_names = [] + for subchild in child.children: + if subchild.type == "field_identifier": + field_names.append(subchild.text.decode("utf-8")) + + for fname in field_names: + fields.append((fname, type_str, is_optional)) + + self._struct_definitions[struct_name] = fields + def _extract_docstring(self, node: Node) -> str: """Extract comments immediately preceding the declaration.""" comments = [] @@ -166,12 +238,12 @@ def _extract_return_types( return original_returns, normalized_returns - def _extract_params(self, node: Node) -> list[feature_pb2.Param]: + def _extract_params(self, node: Node) -> tuple[list[feature_pb2.Param], bool]: """Extract parameters from a function_declaration node.""" params = [] params_node = node.child_by_field_name("parameters") if not params_node: - return [] + return [], False is_async = False for child in params_node.children: @@ -188,17 +260,41 @@ def _extract_params(self, node: Node) -> list[feature_pb2.Param]: if param_type == "context.Context": is_async = True continue - - norm_types = self.normalizer.normalize(param_type, "go") - norm_enums = [getattr(feature_pb2, nt) for nt in norm_types] - - p = feature_pb2.Param( - original_name=param_name, - normalized_name=normalize_name(param_name), - original_types=[param_type], - normalized_types=norm_enums, - ) - params.append(p) + + # Check if this parameter type should be flattened + # We strip pointer and module prefix to find the struct name + # e.g. 
*Config -> Config, mypkg.Config -> Config + # Simple heuristic: take the last part after dot, strip * + clean_type_name = param_type.lstrip("*").split(".")[-1] + + if clean_type_name in self._struct_definitions: + # FLATTEN: Add all fields of the struct as parameters + for field_name, field_type, is_optional in self._struct_definitions[clean_type_name]: + # Recursively normalize the field type + norm_types = self.normalizer.normalize(field_type, "go") + norm_enums = [getattr(feature_pb2, nt) for nt in norm_types] + + p = feature_pb2.Param( + original_name=field_name, + normalized_name=normalize_name(field_name), + original_types=[field_type], + normalized_types=norm_enums, + ) + if is_optional: + p.is_optional = True + params.append(p) + else: + # Normal processing + norm_types = self.normalizer.normalize(param_type, "go") + norm_enums = [getattr(feature_pb2, nt) for nt in norm_types] + + p = feature_pb2.Param( + original_name=param_name, + normalized_name=normalize_name(param_name), + original_types=[param_type], + normalized_types=norm_enums, + ) + params.append(p) return params, is_async def _extract_name(self, node: Node) -> str: diff --git a/src/google/adk/scope/extractors/extractor_go.py b/src/google/adk/scope/extractors/extractor_go.py index 2a6c26d..cfd6a0a 100644 --- a/src/google/adk/scope/extractors/extractor_go.py +++ b/src/google/adk/scope/extractors/extractor_go.py @@ -70,6 +70,11 @@ def extract_features( return [] processor = NodeProcessor() + + # Pre-process structs to build the definition map + # We need to re-query or process struct nodes specifically. + # To keep it simple, let's just use the query we have. + pass features = [] # REVISED QUERY: Matches the declaration nodes. 
@@ -85,21 +90,40 @@ def extract_features( ) ) ) + (type_declaration + (type_spec + name: (type_identifier) @struct_name + type: (struct_type) @struct_body + ) + ) """ query = Query(GO_LANGUAGE, query_text) cursor = QueryCursor(query) captures = cursor.captures(root_node) all_nodes = [] + struct_nodes = [] # We only want to process the actual function/method nodes, not the interface names # which are captured just for context by the processor (via tree traversal). for capture_name, node_list in captures.items(): if capture_name in ("func", "method", "interface_method"): all_nodes.extend(node_list) + elif capture_name == "struct_body": + # We need to associate the struct body with its name. + # The query captures @struct_name and @struct_body separately but in order. + # However, 'captures' is a dict of lists, so order might be tricky if we rely on index alignment across lists. + # Better strategy: Capture the parent type_spec and process it? + # Or iterate the captures list (which we can't easily do with the dict output). + # Let's rely on NodeProcessor to find the name from the struct_body node's parent. 
+ struct_nodes.extend(node_list) # Log results for debugging logger.debug("Found %d potential nodes in %s", len(all_nodes), file_path) + # Build struct definitions map first + for node in struct_nodes: + processor.register_struct(node) + for node in all_nodes: # Prevent filtering out abstract interface methods which have no body if node.type == "method_elem": diff --git a/src/google/adk/scope/reporter/reporter.py b/src/google/adk/scope/reporter/reporter.py index 4b54f11..748c209 100644 --- a/src/google/adk/scope/reporter/reporter.py +++ b/src/google/adk/scope/reporter/reporter.py @@ -13,9 +13,27 @@ from google.adk.scope.matcher import matcher from google.adk.scope.utils import args as adk_args from google.adk.scope.utils import stats +from google.adk.scope.utils.similarity import SimilarityScorer _NEAR_MISS_THRESHOLD = 0.15 +# Global thresholds for match confidence +# Keys are frozenset of language codes (e.g., frozenset(['py', 'go'])) +SIMILARITY_THRESHOLDS = { + frozenset(["py", "go"]): { + "high": 0.6, + "avg": 0.5, + }, + frozenset(["py", "java"]): { + "high": 0.6, + "avg": 0.58, + }, + frozenset(["py", "ts"]): { + "high": 0.7, + "avg": 0.55, + }, +} + @dataclasses.dataclass class MatchResult: @@ -317,17 +335,30 @@ def generate_report(self, report_type) -> MatchResult: raise ValueError(f"Unknown report type: {report_type}") def generate_raw_report(self) -> MatchResult: - """Generates a raw CSV report.""" + """Generates a raw CSV report using global best-match logic. + + For every feature in the base registry, finds the best matching feature + in the target registry with the same TYPE, regardless of module/namespace. 
+ """ base_code = _get_language_code(self.base_registry.language) target_code = _get_language_code(self.target_registry.language) - all_modules = sorted( - set(self.features_base.keys()) | set(self.features_target.keys()) - ) + csv_header = ( - f"{base_code}_namespace,{base_code}_member_of,{base_code}_name," - f"{target_code}_namespace,{target_code}_member_of,{target_code}_name," - "type,score" + f"py_namespace,py_member_of,py_name," + f"java_namespace,java_member_of,java_name," + "type,score,match,confidence" ) + # Use user-requested headers if languages match expectation, otherwise dynamic + if base_code == "py" and target_code == "java": + pass # Header is already correct for the user's specific request example + else: + # Fallback to dynamic headers if not exactly py/java as requested + csv_header = ( + f"{base_code}_namespace,{base_code}_member_of,{base_code}_name," + f"{target_code}_namespace,{target_code}_member_of,{target_code}_name," + "type,score,match,confidence" + ) + csv_lines = [csv_header] def get_feature_cols(f: features_pb2.Feature) -> tuple[str, str, str]: @@ -347,60 +378,79 @@ def get_feature_cols(f: features_pb2.Feature) -> tuple[str, str, str]: def esc_csv(s): if s is None: return "" + s = str(s) if "," in s or '"' in s or "\n" in s: return '"{}"'.format(s.replace('"', '""')) return s - for module in all_modules: - base_list = self.features_base.get(module, []) - target_list = self.features_target.get(module, []) + # 1. Index target features by Type for faster lookup + target_by_type = defaultdict(list) + for f in self.target_registry.features: + target_by_type[f.type].append(f) - solid_matches = matcher.match_features( - base_list, target_list, self.alpha - ) - beta = max(0.0, self.alpha - _NEAR_MISS_THRESHOLD) - potential_matches = matcher.match_features( - base_list, target_list, beta + scorer = SimilarityScorer(alpha=self.alpha) + + # 2. 
Iterate over all base features + for f_base in self.base_registry.features: + candidates = target_by_type.get(f_base.type, []) + + best_match = None + best_score = -1.0 + + if candidates: + # Find best match among candidates of same type + for f_target in candidates: + score = scorer.get_similarity_score(f_base, f_target) + if score > best_score: + best_score = score + best_match = f_target + + # 3. Write row if we have a match (even if score is 0, user might want to see it? + # Actually user said "pair with maximum similarity score should be included") + # We will include it if it matches the best score logic. + # If no candidates exist, we print empties for target. + + b_ns, b_mem, b_name = get_feature_cols(f_base) + f_type = matcher.get_type_display_name(f_base) + + if best_match: + t_ns, t_mem, t_name = get_feature_cols(best_match) + final_score = best_score + else: + t_ns, t_mem, t_name = "", "", "" + final_score = 0.0 + + # Determine match and confidence + thresholds = SIMILARITY_THRESHOLDS.get( + frozenset([base_code, target_code]) ) - unmatched_base = list(base_list) - unmatched_target = list(target_list) - - for f_base, f_target, score in solid_matches: - b_ns, b_mem, b_name = get_feature_cols(f_base) - t_ns, t_mem, t_name = get_feature_cols(f_target) - f_type = matcher.get_type_display_name(f_base) - csv_lines.append( - f"{esc_csv(b_ns)},{esc_csv(b_mem)},{esc_csv(b_name)}," - f"{esc_csv(t_ns)},{esc_csv(t_mem)},{esc_csv(t_name)}," - f"{esc_csv(f_type)},{score:.4f}" - ) - - for f_base, f_target, score in potential_matches: - b_ns, b_mem, b_name = get_feature_cols(f_base) - t_ns, t_mem, t_name = get_feature_cols(f_target) - f_type = matcher.get_type_display_name(f_base) - csv_lines.append( - f"{esc_csv(b_ns)},{esc_csv(b_mem)},{esc_csv(b_name)}," - f"{esc_csv(t_ns)},{esc_csv(t_mem)},{esc_csv(t_name)}," - f"{esc_csv(f_type)},{score:.4f}" - ) - - for f_base in unmatched_base: - b_ns, b_mem, b_name = get_feature_cols(f_base) - f_type = 
matcher.get_type_display_name(f_base) - csv_lines.append( - f"{esc_csv(b_ns)},{esc_csv(b_mem)},{esc_csv(b_name)}," - f",,,{esc_csv(f_type)},0.0000" - ) - - for f_target in unmatched_target: - t_ns, t_mem, t_name = get_feature_cols(f_target) - f_type = matcher.get_type_display_name(f_target) - csv_lines.append( - f",,,{esc_csv(t_ns)},{esc_csv(t_mem)}," - f"{esc_csv(t_name)},{esc_csv(f_type)},0.0000" - ) + match_str = "false" + confidence_str = "low" + + if thresholds: + if final_score > thresholds["high"]: + match_str = "true" + confidence_str = "high" + elif final_score >= thresholds["avg"]: + match_str = "true" + confidence_str = "low" + else: + match_str = "false" + confidence_str = "high" + else: + # Default behavior if no thresholds defined for this pair + # Fallback to general alpha or just say low confidence? + # User only provided specific pairs. + match_str = "true" if final_score >= self.alpha else "false" + confidence_str = "low" + + csv_lines.append( + f"{esc_csv(b_ns)},{esc_csv(b_mem)},{esc_csv(b_name)}," + f"{esc_csv(t_ns)},{esc_csv(t_mem)},{esc_csv(t_name)}," + f"{esc_csv(f_type)},{final_score:.4f}," + f"{match_str},{confidence_str}" + ) return MatchResult( master_content="\n".join(csv_lines), diff --git a/src/google/adk/scope/utils/score_features.py b/src/google/adk/scope/utils/score_features.py new file mode 100644 index 0000000..098ef93 --- /dev/null +++ b/src/google/adk/scope/utils/score_features.py @@ -0,0 +1,51 @@ + +import argparse +import logging +import sys +from pathlib import Path + +from google.protobuf import text_format + +from google.adk.scope import features_pb2 +from google.adk.scope.utils.similarity import SimilarityScorer + +def main(): + parser = argparse.ArgumentParser(description="Calculate similarity score between two features.") + parser.add_argument("feature1", type=Path, help="Path to first feature file (text proto).") + parser.add_argument("feature2", type=Path, help="Path to second feature file (text proto).") + 
parser.add_argument( + "--algorithm", + choices=["levenshtein", "token_set_ratio"], + default="levenshtein", + help="String similarity algorithm to use.", + ) + args = parser.parse_args() + + # Configure logging to DEBUG + logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") + + try: + f1_content = args.feature1.read_text() + f2_content = args.feature2.read_text() + + f1 = features_pb2.Feature() + text_format.Parse(f1_content, f1) + + f2 = features_pb2.Feature() + text_format.Parse(f2_content, f2) + + print(f'Using the {args.algorithm} algorithm...') + + scorer = SimilarityScorer(alpha=0.1, similarity_algorithm=args.algorithm) + score = scorer.get_similarity_score(f1, f2) + + print("-" * 40) + print(f"Similarity Score: {score:.4f}") + print("-" * 40) + + except Exception as e: + logging.error(f"Error: {e}") + sys.exit(1) + +if __name__ == "__main__": + main() diff --git a/src/google/adk/scope/utils/similarity.py b/src/google/adk/scope/utils/similarity.py index 97fb226..fd5c2de 100644 --- a/src/google/adk/scope/utils/similarity.py +++ b/src/google/adk/scope/utils/similarity.py @@ -2,7 +2,8 @@ from typing import Optional import numpy as np -from jellyfish import jaro_winkler_similarity +from jellyfish import levenshtein_distance +from rapidfuzz import fuzz from scipy.optimize import linear_sum_assignment from google.adk.scope import features_pb2 as features_pb @@ -23,18 +24,41 @@ class SimilarityScorer: """Calculates a similarity score between two features.""" def __init__( - self, weights: Optional[dict[str, float]] = None, alpha: float = 0.8 + self, + weights: Optional[dict[str, float]] = None, + alpha: float = 0.8, + similarity_algorithm: str = "token_set_ratio", ): + self.alpha = alpha self.weights = weights or DEFAULT_SIMILARITY_WEIGHTS + self.similarity_algorithm = similarity_algorithm logger.debug( - f"Initializing SimilarityScorer with alpha={alpha} and " - f"weights={self.weights}" + f"Initializing SimilarityScorer with alpha={alpha}, 
" + f"algorithm={similarity_algorithm}, weights={self.weights}" ) assert "name" in self.weights assert "member_of" in self.weights assert "namespace" in self.weights assert "parameters" in self.weights assert "return_type" in self.weights + assert self.similarity_algorithm in {"levenshtein", "token_set_ratio"} + + def get_similarity(self, s1: str, s2: str) -> float: + """Calculates similarity between two strings using the selected algorithm.""" + if not s1 and not s2: + return 1.0 + if not s1 or not s2: + return 0.0 + + if self.similarity_algorithm == "token_set_ratio": + # rapidfuzz.fuzz.token_set_ratio returns 0-100 + return fuzz.token_set_ratio(s1, s2) / 100.0 + else: + # Default to Levenshtein + dist = levenshtein_distance(s1, s2) + max_len = max(len(s1), len(s2)) + return 1.0 - (dist / max_len) + return 1.0 - (dist / max_len) def _fuzzy_type_match(self, types1: list, types2: list) -> float: """Calculates a fuzzy similarity score between two lists of types.""" @@ -61,10 +85,13 @@ def _to_str_set(type_list): return 0.0 if set1 == set2: + logger.debug(f"Exact type match: {set1}") return 1.0 # Check the best match between any pair of types best_score = 0.0 + + logger.debug(f"Fuzzy type match between {set1} and {set2}") for t1 in set1: for t2 in set2: if t1 == t2: @@ -90,7 +117,7 @@ def _calculate_param_similarity( self, param1: features_pb.Param, param2: features_pb.Param ) -> float: """Calculates the similarity score between two individual parameters.""" - s_p_name = jaro_winkler_similarity( + s_p_name = self.get_similarity( param1.normalized_name, param2.normalized_name ) s_p_type = self._fuzzy_type_match( @@ -145,6 +172,10 @@ def _calculate_parameters_score( f"Matrix matched total score: {total_match_score:.4f}, " f"final parameter score: {score:.4f}" ) + # Log parameter matches + for r, c in zip(row_ind, col_ind): + if similarity_matrix[r, c] > 0: + logger.debug(f" Matched param '{params1[r].normalized_name}' with '{params2[c].normalized_name}': 
{similarity_matrix[r, c]:.4f}") return score def _calculate_return_type_score( @@ -208,16 +239,22 @@ def get_similarity_score( # 2. Similarity Calculations scores = { - "name": jaro_winkler_similarity( + "name": self.get_similarity( feature1.normalized_name, feature2.normalized_name ), - "member_of": jaro_winkler_similarity( + "member_of": self.get_similarity( feature1.normalized_member_of, feature2.normalized_member_of ), - "namespace": jaro_winkler_similarity( + "namespace": self.get_similarity( feature1.normalized_namespace, feature2.normalized_namespace ), } + logger.debug( + f"Comparison Details:\n" + f" Name: '{feature1.normalized_name}' vs '{feature2.normalized_name}' -> {scores['name']:.4f}\n" + f" MemberOf: '{feature1.normalized_member_of}' vs '{feature2.normalized_member_of}' -> {scores['member_of']:.4f}\n" + f" Namespace: '{feature1.normalized_namespace}' vs '{feature2.normalized_namespace}' -> {scores['namespace']:.4f}" + ) logger.debug(f"Preliminary scores: {scores}") # 3. Early Exit Check (using dynamic weights) @@ -227,7 +264,7 @@ def get_similarity_score( + scores["namespace"] * current_weights["namespace"] ) - early_exit_threshold = 0.8 * ( + early_exit_threshold = self.alpha * ( current_weights["name"] + current_weights["member_of"] + current_weights["namespace"] @@ -255,5 +292,12 @@ def get_similarity_score( scores[key] * current_weights[key] for key in current_weights ) logger.debug(f"Final scores including params & return: {scores}") + + # Log contributions + logger.debug("Score Contributions:") + for key in current_weights: + contribution = scores[key] * current_weights[key] + logger.debug(f" {key}: {scores[key]:.4f} * {current_weights[key]:.4f} = {contribution:.4f}") + logger.debug(f"Final weighted similarity score: {final_score:.4f}") return final_score diff --git a/test/adk/scope/reporter/test_reporter.py b/test/adk/scope/reporter/test_reporter.py index 89e95e5..87bb895 100644 --- a/test/adk/scope/reporter/test_reporter.py +++ 
b/test/adk/scope/reporter/test_reporter.py @@ -108,7 +108,7 @@ def test_match_registries(self): # Test Markdown Report result_md = reporter.match_registries( - [base_registry, target_registry], 0.9, report_type="md" + [base_registry, target_registry], 0.8, report_type="md" ) report_md = result_md.master_content @@ -325,6 +325,14 @@ def test_generate_raw_report(self): member_of="c1", type=features_pb2.Feature.Type.FUNCTION, ) + # f_target is a perfect match + f_target = features_pb2.Feature( + original_name="f1_base", + normalized_name="f1_base", + namespace="n1", + member_of="c1", + type=features_pb2.Feature.Type.FUNCTION, + ) base_registry = features_pb2.FeatureRegistry( language="Python", version="1.0.0" @@ -333,21 +341,60 @@ def test_generate_raw_report(self): target_registry = features_pb2.FeatureRegistry( language="TypeScript", version="2.0.0" ) + target_registry.features.extend([f_target]) - with patch( - "google.adk.scope.reporter.reporter.matcher.match_features" - ) as mock_match: - mock_match.return_value = [] # No matches for simplicity + # We no longer patch match_features, we rely on SimilarityScorer + # yielding a high score for identical features. 
+ result = reporter.ReportGenerator( + base_registry, target_registry, 0.9 + ).generate_raw_report() - result = reporter.ReportGenerator( - base_registry, target_registry, 0.9 - ).generate_raw_report() + self.assertIn( + "py_namespace,py_member_of,py_name", + result.master_content, + ) + self.assertIn("n1,c1,f1_base", result.master_content) - self.assertIn( - "py_namespace,py_member_of,py_name", - result.master_content, - ) - self.assertIn("n1,c1,f1_base", result.master_content) + def test_global_best_match(self): + """Tests that a feature matches best candidate globally, ignoring namespace.""" + # Base feature in namespace 'n1' + f_base = features_pb2.Feature( + original_name="my_feature", + normalized_name="my_feature", + namespace="n1", + type=features_pb2.Feature.Type.FUNCTION, + ) + + # Target feature 1: Same namespace, but different name (low score) + f_target_bad = features_pb2.Feature( + original_name="other_feature", + normalized_name="other_feature", + namespace="n1", + type=features_pb2.Feature.Type.FUNCTION, + ) + + # Target feature 2: Different namespace, but same name (high score) + f_target_good = features_pb2.Feature( + original_name="my_feature", + normalized_name="my_feature", + namespace="n2", + type=features_pb2.Feature.Type.FUNCTION, + ) + + base_registry = features_pb2.FeatureRegistry(language="Python", version="1") + base_registry.features.append(f_base) + + target_registry = features_pb2.FeatureRegistry(language="Java", version="2") + target_registry.features.extend([f_target_bad, f_target_good]) + + # Logic should pick f_target_good because it has higher similarity + # even though it is in a different namespace. 
+ result = reporter.ReportGenerator( + base_registry, target_registry, 0.5 + ).generate_raw_report() + + # Check that we found the match in n2 + self.assertIn("n1,,my_feature,n2,,my_feature,function,1.0000", result.master_content) def test_generate_md_report(self): """Tests the md report generation.""" @@ -384,7 +431,6 @@ def test_generate_md_report(self): self.assertIn("n1.md", result.module_files) - def test_raw_integration(self): """Tests the raw report generation end-to-end.""" python_features_str = """ @@ -463,10 +509,63 @@ def test_raw_integration(self): "py_namespace,py_member_of,py_name,ts_namespace,ts_member_of,ts_name,type,score", result.master_content, ) + + # Verify the solid match is present with high score + # Note: Original names are used (load_artifact vs loadArtifact) and original members (InMemoryArtifactService) + self.assertRegex(result.master_content, r"runners,InMemoryArtifactService,load_artifact,artifacts,InMemoryArtifactService,loadArtifact,.*,0.86[0-9]*") + + def test_raw_report_match_confidence(self): + """Tests match and confidence columns with various scores.""" + # 1. High match (score 0.9 > 0.6 for py/go) + f_high = features_pb2.Feature( + original_name="high", normalized_name="high", type=features_pb2.Feature.Type.FUNCTION + ) + # 2. Avg match (score 0.55 between 0.5 and 0.6 for py/go) + f_avg = features_pb2.Feature( + original_name="high", normalized_name="high_ish", type=features_pb2.Feature.Type.FUNCTION + ) + # 3. 
Low match (score 0.1 < 0.5 for py/go) + f_low = features_pb2.Feature( + original_name="high", normalized_name="completely_different", type=features_pb2.Feature.Type.FUNCTION + ) - print(result.master_content) - self.assertEqual(len(result.master_content.splitlines()), 2) - # A known match + base = features_pb2.FeatureRegistry(language="Python", version="1") + base.features.append(f_high) + + target = features_pb2.FeatureRegistry(language="Go", version="1") + # We need to craft targets that produce specific scores or mock the scorer. + # It's easier to mock SimilarityScorer to return fixed scores. + target.features.extend([f_high, f_avg, f_low]) + + with patch("google.adk.scope.reporter.reporter.SimilarityScorer") as MockScorer: + instance = MockScorer.return_value + # match_registries -> ReportGenerator -> generate_raw_report -> SimilarityScorer + # We need to control get_similarity_score. + # The logic iterates base features, then finds best match target. + + # Case 1: High match + # We want best_score to be > 0.6 + instance.get_similarity_score.return_value = 0.9 + + gen = reporter.ReportGenerator(base, target, 0.1) + # We need to reset the scorer inside generator if we patched the class, + # but ReportGenerator instantiates it inside generate_raw_report. + # So the patch above should work for the instance created inside. 
+ + result = gen.generate_raw_report() + + # Check for match=true, confidence=high + self.assertIn("true,high", result.master_content) + + # Case 2: Avg match (0.55) -> match=true, confidence=low + instance.get_similarity_score.return_value = 0.55 + result = gen.generate_raw_report() + self.assertIn("true,low", result.master_content) + + # Case 3: Low/No match (0.4) -> match=false, confidence=high + instance.get_similarity_score.return_value = 0.4 + result = gen.generate_raw_report() + self.assertIn("false,high", result.master_content) if __name__ == "__main__": From d40302aad71eec690b8090d485226cb77176c203 Mon Sep 17 00:00:00 2001 From: Shahin Saadati Date: Wed, 18 Feb 2026 10:47:11 -0800 Subject: [PATCH 5/9] Removed unused similarity scorer --- src/google/adk/scope/utils/score_features.py | 11 ++-------- src/google/adk/scope/utils/similarity.py | 23 ++++++-------------- 2 files changed, 9 insertions(+), 25 deletions(-) diff --git a/src/google/adk/scope/utils/score_features.py b/src/google/adk/scope/utils/score_features.py index 098ef93..7718c00 100644 --- a/src/google/adk/scope/utils/score_features.py +++ b/src/google/adk/scope/utils/score_features.py @@ -13,12 +13,7 @@ def main(): parser = argparse.ArgumentParser(description="Calculate similarity score between two features.") parser.add_argument("feature1", type=Path, help="Path to first feature file (text proto).") parser.add_argument("feature2", type=Path, help="Path to second feature file (text proto).") - parser.add_argument( - "--algorithm", - choices=["levenshtein", "token_set_ratio"], - default="levenshtein", - help="String similarity algorithm to use.", - ) + args = parser.parse_args() # Configure logging to DEBUG @@ -34,9 +29,7 @@ def main(): f2 = features_pb2.Feature() text_format.Parse(f2_content, f2) - print(f'Using the {args.algorithm} algorithm...') - - scorer = SimilarityScorer(alpha=0.1, similarity_algorithm=args.algorithm) + scorer = SimilarityScorer() score = scorer.get_similarity_score(f1, f2) 
print("-" * 40) diff --git a/src/google/adk/scope/utils/similarity.py b/src/google/adk/scope/utils/similarity.py index fd5c2de..b484f52 100644 --- a/src/google/adk/scope/utils/similarity.py +++ b/src/google/adk/scope/utils/similarity.py @@ -3,7 +3,7 @@ import numpy as np from jellyfish import levenshtein_distance -from rapidfuzz import fuzz + from scipy.optimize import linear_sum_assignment from google.adk.scope import features_pb2 as features_pb @@ -26,22 +26,17 @@ class SimilarityScorer: def __init__( self, weights: Optional[dict[str, float]] = None, - alpha: float = 0.8, - similarity_algorithm: str = "token_set_ratio", ): - self.alpha = alpha self.weights = weights or DEFAULT_SIMILARITY_WEIGHTS - self.similarity_algorithm = similarity_algorithm logger.debug( - f"Initializing SimilarityScorer with alpha={alpha}, " - f"algorithm={similarity_algorithm}, weights={self.weights}" + f"Initializing SimilarityScorer with " + f"weights={self.weights}" ) assert "name" in self.weights assert "member_of" in self.weights assert "namespace" in self.weights assert "parameters" in self.weights assert "return_type" in self.weights - assert self.similarity_algorithm in {"levenshtein", "token_set_ratio"} def get_similarity(self, s1: str, s2: str) -> float: """Calculates similarity between two strings using the selected algorithm.""" @@ -50,14 +45,10 @@ def get_similarity(self, s1: str, s2: str) -> float: if not s1 or not s2: return 0.0 - if self.similarity_algorithm == "token_set_ratio": - # rapidfuzz.fuzz.token_set_ratio returns 0-100 - return fuzz.token_set_ratio(s1, s2) / 100.0 - else: - # Default to Levenshtein - dist = levenshtein_distance(s1, s2) - max_len = max(len(s1), len(s2)) - return 1.0 - (dist / max_len) + # Default to Levenshtein + dist = levenshtein_distance(s1, s2) + max_len = max(len(s1), len(s2)) + return 1.0 - (dist / max_len) return 1.0 - (dist / max_len) def _fuzzy_type_match(self, types1: list, types2: list) -> float: From 
50b3ced96901fe1dfe99424b2ba372fc8bdbd6e8 Mon Sep 17 00:00:00 2001 From: Shahin Saadati Date: Wed, 18 Feb 2026 12:40:03 -0800 Subject: [PATCH 6/9] updated markdown report --- report.sh | 12 +- run.sh | 6 +- src/google/adk/scope/matcher/matcher.py | 2 +- src/google/adk/scope/reporter/reporter.py | 444 ++++++++++------------ src/google/adk/scope/utils/similarity.py | 2 +- 5 files changed, 221 insertions(+), 245 deletions(-) diff --git a/report.sh b/report.sh index 4b1a4a4..edfdcb9 100755 --- a/report.sh +++ b/report.sh @@ -83,17 +83,19 @@ for REG_FILE in "${REGISTRIES[@]}"; do done # Construct filename -if [ "$REPORT_TYPE" == "raw" ]; then - EXTENSION="csv" -else - EXTENSION="md" -fi +# Default to markdown extension. The python script will generate CSV alongside it. +EXTENSION="md" if [ "$REPORT_TYPE" == "matrix" ]; then # e.g., py_ts_go.md OUTPUT_FILENAME="$(IFS=_; echo "${LANG_CODES[*]}").${EXTENSION}" else + # Standard 2-way report OUTPUT_FILENAME="${LANG_CODES[0]}_${LANG_CODES[1]}.${EXTENSION}" + # Ensure report type is 'md' for standard logic so unified generator runs + if [ "$REPORT_TYPE" == "raw" ]; then + REPORT_TYPE="md" + fi fi FULL_OUTPUT_PATH="${OUTPUT_DIR}/${OUTPUT_FILENAME}" diff --git a/run.sh b/run.sh index 93e566e..dedf5e5 100755 --- a/run.sh +++ b/run.sh @@ -12,7 +12,7 @@ echo "Extracting Go features..." # Py -> TS -echo "Generating symmetric reports..." +echo "Generating reports..." ./report.sh --base output/py.txtpb --target output/ts.txtpb --output ./output --report-type md echo "Generating raw reports..." @@ -20,7 +20,7 @@ echo "Generating raw reports..." # Py -> Java -echo "Generating symmetric reports..." +echo "Generating reports..." ./report.sh --base output/py.txtpb --target output/java.txtpb --output ./output --report-type md echo "Generating raw reports..." @@ -28,7 +28,7 @@ echo "Generating raw reports..." # Py -> Go -echo "Generating symmetric reports..." +echo "Generating reports..." 
./report.sh --base output/py.txtpb --target output/go.txtpb --output ./output --report-type md echo "Generating raw reports..." diff --git a/src/google/adk/scope/matcher/matcher.py b/src/google/adk/scope/matcher/matcher.py index 2f67a43..a2dfce4 100644 --- a/src/google/adk/scope/matcher/matcher.py +++ b/src/google/adk/scope/matcher/matcher.py @@ -53,7 +53,7 @@ def match_features( if not base_features or not target_features: return [] - scorer = SimilarityScorer(alpha=alpha) + scorer = SimilarityScorer() matches = [] # Build Cost Matrix (Rows=Base, Cols=Target) diff --git a/src/google/adk/scope/reporter/reporter.py b/src/google/adk/scope/reporter/reporter.py index 748c209..ed45755 100644 --- a/src/google/adk/scope/reporter/reporter.py +++ b/src/google/adk/scope/reporter/reporter.py @@ -1,3 +1,4 @@ + import argparse import dataclasses import logging @@ -5,35 +6,20 @@ from collections import defaultdict from datetime import datetime from pathlib import Path -from typing import Dict, List +from typing import Dict, List, Optional from google.protobuf import text_format +import pandas as pd from google.adk.scope import features_pb2 from google.adk.scope.matcher import matcher +from google.adk.scope.reporter import raw from google.adk.scope.utils import args as adk_args from google.adk.scope.utils import stats from google.adk.scope.utils.similarity import SimilarityScorer _NEAR_MISS_THRESHOLD = 0.15 -# Global thresholds for match confidence -# Keys are frozenset of language codes (e.g., frozenset(['py', 'go'])) -SIMILARITY_THRESHOLDS = { - frozenset(["py", "go"]): { - "high": 0.6, - "avg": 0.5, - }, - frozenset(["py", "java"]): { - "high": 0.6, - "avg": 0.58, - }, - frozenset(["py", "ts"]): { - "high": 0.7, - "avg": 0.55, - }, -} - @dataclasses.dataclass class MatchResult: @@ -93,22 +79,37 @@ def _read_feature_registry(file_path: str) -> features_pb2.FeatureRegistry: def match_registries( registries: List[features_pb2.FeatureRegistry], alpha: float, - report_type: str = 
"md", + report_type: str = "md", # Kept for backward compatibility/matrix logic common: bool = False, + output_path: Optional[Path] = None, ) -> MatchResult: """Matches features and generates reports.""" if report_type == "matrix": reporter = MatrixReportGenerator(registries, alpha, common) + return reporter.generate_report("matrix") else: if len(registries) != 2: - raise ValueError(f"Report type '{report_type}' requires exactly 2 registries.") - reporter = ReportGenerator( - registries[0], - registries[1], - alpha, - ) - - return reporter.generate_report(report_type) + raise ValueError( + f"Report type '{report_type}' requires exactly 2 registries." + ) + + # New unified flow for standard reports + generator = raw.RawReportGenerator(registries[0], registries[1]) + + # Generate DataFrame (and CSV if path provided) + csv_path = None + if output_path: + # If output is "report.md", csv will be "report.csv" + # If output is "report.csv", md will be "report.md" + stem = output_path.stem + parent = output_path.parent + csv_path = str(parent / f"{stem}.csv") + + df = generator.generate(output_path=csv_path) + + # Generate Markdown Report from DataFrame + reporter = ReportGenerator(registries[0], registries[1], df) + return reporter.generate_md_report() class MatrixReportGenerator: @@ -225,7 +226,7 @@ def _build_global_feature_matrix(self) -> List[str]: if feat is b_f: row_dict[i] = t_f break - + # Record unmatched targets as new rows # t_list was mutated by match_features (items removed) for t_f in t_list: @@ -291,6 +292,7 @@ def generate_report(self, report_type: str = "matrix") -> MatchResult: "", "## Registries", "| Role | Language | Version |", + "| :--- | :--- | :--- |", "| :--- | :--- | :--- |" ] @@ -315,153 +317,19 @@ def __init__( self, base_registry: features_pb2.FeatureRegistry, target_registry: features_pb2.FeatureRegistry, - alpha: float, + df: pd.DataFrame, ): self.base_registry = base_registry self.target_registry = target_registry - - self.features_base = 
_group_features_by_module(base_registry) - self.features_target = _group_features_by_module(target_registry) - matcher.fuzzy_match_namespaces(self.features_base, self.features_target) - self.alpha = alpha - - def generate_report(self, report_type) -> MatchResult: - """Generates report.""" - if report_type == "raw": - return self.generate_raw_report() - elif report_type == "md": - return self.generate_md_report() - else: - raise ValueError(f"Unknown report type: {report_type}") - - def generate_raw_report(self) -> MatchResult: - """Generates a raw CSV report using global best-match logic. - - For every feature in the base registry, finds the best matching feature - in the target registry with the same TYPE, regardless of module/namespace. - """ - base_code = _get_language_code(self.base_registry.language) - target_code = _get_language_code(self.target_registry.language) - - csv_header = ( - f"py_namespace,py_member_of,py_name," - f"java_namespace,java_member_of,java_name," - "type,score,match,confidence" - ) - # Use user-requested headers if languages match expectation, otherwise dynamic - if base_code == "py" and target_code == "java": - pass # Header is already correct for the user's specific request example - else: - # Fallback to dynamic headers if not exactly py/java as requested - csv_header = ( - f"{base_code}_namespace,{base_code}_member_of,{base_code}_name," - f"{target_code}_namespace,{target_code}_member_of,{target_code}_name," - "type,score,match,confidence" - ) - - csv_lines = [csv_header] - - def get_feature_cols(f: features_pb2.Feature) -> tuple[str, str, str]: - ns = f.namespace or "" - if not ns and f.normalized_namespace: - ns = f.normalized_namespace - - mem = f.member_of or "" - if not mem and f.normalized_member_of: - mem = f.normalized_member_of - if mem.lower() == "null": - mem = "" - - name = f.original_name or f.normalized_name or "" - return ns, mem, name - - def esc_csv(s): - if s is None: - return "" - s = str(s) - if "," in s or '"' in s 
or "\n" in s: - return '"{}"'.format(s.replace('"', '""')) - return s - - # 1. Index target features by Type for faster lookup - target_by_type = defaultdict(list) - for f in self.target_registry.features: - target_by_type[f.type].append(f) - - scorer = SimilarityScorer(alpha=self.alpha) - - # 2. Iterate over all base features - for f_base in self.base_registry.features: - candidates = target_by_type.get(f_base.type, []) - - best_match = None - best_score = -1.0 - - if candidates: - # Find best match among candidates of same type - for f_target in candidates: - score = scorer.get_similarity_score(f_base, f_target) - if score > best_score: - best_score = score - best_match = f_target - - # 3. Write row if we have a match (even if score is 0, user might want to see it? - # Actually user said "pair with maximum similarity score should be included") - # We will include it if it matches the best score logic. - # If no candidates exist, we print empties for target. - - b_ns, b_mem, b_name = get_feature_cols(f_base) - f_type = matcher.get_type_display_name(f_base) - - if best_match: - t_ns, t_mem, t_name = get_feature_cols(best_match) - final_score = best_score - else: - t_ns, t_mem, t_name = "", "", "" - final_score = 0.0 - - # Determine match and confidence - thresholds = SIMILARITY_THRESHOLDS.get( - frozenset([base_code, target_code]) - ) - - match_str = "false" - confidence_str = "low" - - if thresholds: - if final_score > thresholds["high"]: - match_str = "true" - confidence_str = "high" - elif final_score >= thresholds["avg"]: - match_str = "true" - confidence_str = "low" - else: - match_str = "false" - confidence_str = "high" - else: - # Default behavior if no thresholds defined for this pair - # Fallback to general alpha or just say low confidence? - # User only provided specific pairs. 
- match_str = "true" if final_score >= self.alpha else "false" - confidence_str = "low" - - csv_lines.append( - f"{esc_csv(b_ns)},{esc_csv(b_mem)},{esc_csv(b_name)}," - f"{esc_csv(t_ns)},{esc_csv(t_mem)},{esc_csv(t_name)}," - f"{esc_csv(f_type)},{final_score:.4f}," - f"{match_str},{confidence_str}" - ) - - return MatchResult( - master_content="\n".join(csv_lines), - module_files={}, - ) + self.df = df + + self.base_code = _get_language_code(base_registry.language) + self.target_code = _get_language_code(target_registry.language) + self.base_name = _get_language_name(base_registry.language) + self.target_name = _get_language_name(target_registry.language) def generate_md_report(self) -> MatchResult: - """Generates a Markdown parity report.""" - all_modules = sorted( - set(self.features_base.keys()) | set(self.features_target.keys()) - ) + """Generates a Markdown parity report from the DataFrame.""" master_lines = [] master_lines.extend( [ @@ -486,72 +354,85 @@ def generate_md_report(self) -> MatchResult: master_lines.append("GLOBAL_SCORE_PLACEHOLDER") master_lines.append("") - b_lang = _get_language_name(self.base_registry.language) - t_lang = _get_language_name(self.target_registry.language) - - header = f"| ADK | Module | Features ({b_lang}) | Score | Status | Details |" - divider = "|---|---|---|---|---|---|" + header = f"| Module | Features ({self.base_name}) | Score | Status | Details |" + divider = "|---|---|---|---|---|" master_lines.extend(["## Module Summary", header, divider]) module_files = {} module_rows = [] - total_solid_matches = 0 - - base_code = _get_language_code(self.base_registry.language) - target_code = _get_language_code(self.target_registry.language) - - for module in all_modules: - mod_base_list = self.features_base.get(module, []) - mod_target_list = self.features_target.get(module, []) - - results = matcher.process_module( - module, - mod_base_list, - mod_target_list, - self.alpha, - b_lang, - t_lang, - base_code, - target_code, + + # 
Determine cols based on language codes + col_ns = f"{self.base_code}_namespace" + + # Group by base namespace + # If namespace is empty, group under "Unknown Module" + self.df["_module_group"] = self.df[col_ns].replace("", "Unknown Module") + + grouped = self.df.groupby("_module_group") + + total_high = 0 + total_low = 0 + total_mismatch = 0 + total_base_features = len(self.df) + + for module, group in grouped: + # Calculate module stats + high = len(group[group["confidence"] == "high"]) + low = len(group[group["confidence"] == "low"]) + mismatches = len(group[group["match"] == "false"]) + + # Actually, `high` and `low` confidence applies to matches usually + # But let's verify what `match` column says. + matches_high = len(group[(group["match"] == "true") & (group["confidence"] == "high")]) + matches_low = len(group[(group["match"] == "true") & (group["confidence"] == "low")]) + # Everything else is a mismatch or low confidence match? + # Let's trust `match` column for parity score + solid_matches_count = len(group[group["match"] == "true"]) + + total_high += matches_high + total_low += matches_low + total_mismatch += mismatches + + module_total = len(group) + score = solid_matches_count / module_total if module_total > 0 else 0.0 + + # Generate Module File Content + module_filename = f"{module}.md" + module_content = self._generate_module_content(module, group, module_total, matches_high, matches_low, mismatches) + module_files[module_filename] = module_content + + # Add summary row + status_icon = "✅" if score == 1.0 else "⚠️" if score > 0.5 else "❌" + row_str = ( + f"| `{module}` | {module_total} | " + f"{score:.2%} | {status_icon} | [View Details]({{modules_dir}}/{module_filename}) |" ) - total_solid_matches += results["solid_matches_count"] - module_rows.append((results["score"], results["row_content"])) - if results.get("module_filename"): - module_files[results["module_filename"]] = results[ - "module_content" - ] + module_rows.append((score, row_str)) 
module_rows.sort(key=lambda x: x[0], reverse=True) master_lines.extend([row for _, row in module_rows]) - total_base_features = len(self.base_registry.features) - total_target_features = len(self.target_registry.features) - - # Calculate metrics for the summary table - base_exclusive = total_base_features - total_solid_matches - target_exclusive = total_target_features - total_solid_matches - - union_size = total_base_features + total_target_features - total_solid_matches - parity_score = total_solid_matches / union_size if union_size > 0 else 1.0 - - b_lang = _get_language_name(self.base_registry.language) - t_lang = _get_language_name(self.target_registry.language) - + # Summary Stats + total_matches = total_high + total_low + parity_score = total_matches / total_base_features if total_base_features > 0 else 1.0 + + base_exclusive = total_base_features - total_matches + global_stats = ( "## Summary\n\n" "| Feature Category | Count | Details |\n" "| :--- | :--- | :--- |\n" - f"| **✅ Common Shared** | **{total_solid_matches}** | " - f"Implemented in both SDKs |\n" - f"| **📦 Exclusive to `{b_lang}`** | **{base_exclusive}** | " - f"Requires implementation in `{t_lang}` |\n" - f"| **📦 Exclusive to `{t_lang}`** | **{target_exclusive}** | " - f"Requires implementation in `{b_lang}` |\n" - f"| **📊 Jaccard Score** | **{parity_score:.2%}** | " - f"Overall Parity ({total_solid_matches} / {union_size}) |" + f"| **✅ High Confidence Matches** | **{total_high}** | " + f"Strong matches found in `{self.target_name}` |\n" + f"| **⚠️ Low Confidence Matches** | **{total_low}** | " + f"Likely matches needing verification |\n" + f"| **❌ Mismatches** | **{base_exclusive}** | " + f"No suitable match found in `{self.target_name}` |\n" + f"| **📊 Coverage Score** | **{parity_score:.2%}** | " + f"Matches / Total Base Features ({total_matches} / {total_base_features}) |" ) - + master_lines[global_score_idx] = global_stats return MatchResult( @@ -559,6 +440,87 @@ def generate_md_report(self) 
-> MatchResult: module_files=module_files, ) + def _generate_module_content( + self, + module: str, + group: pd.DataFrame, + total_features: int, + high_conf: int, + low_conf: int, + mismatches: int + ) -> str: + + # Calculate scores for summary + total_matches = high_conf + low_conf + coverage = total_matches / total_features if total_features > 0 else 0.0 + + summary_table = ( + "## Summary\n\n" + "| Feature Category | Count | Details |\n" + "| :--- | :--- | :--- |\n" + f"| **✅ High Confidence Matches** | **{high_conf}** | " + f"Strong matches found in `{self.target_name}` |\n" + f"| **⚠️ Low Confidence Matches** | **{low_conf}** | " + f"Likely matches needing verification |\n" + f"| **❌ Mismatches** | **{mismatches}** | " + f"No suitable match found in `{self.target_name}` |\n" + f"| **📊 Coverage Score** | **{coverage:.2%}** | " + f"Matches / Total Base Features ({total_matches} / {total_features}) |\n" + ) + + lines = [ + f"# Module: `{module}`", + "", + f"[← Back to Master Report]({{master_report}})", + "", + summary_table, + "## Feature Details", + "", + f"| Module ({self.base_name}) | Container ({self.base_name}) | Name ({self.base_name}) | Module ({self.target_name}) | Container ({self.target_name}) | Name ({self.target_name}) | Score | Match | Confidence |", + "| :--- | :--- | :--- | :--- | :--- | :--- | :--- | :---: | :---: |", + ] + + # Sort by score desc, then name + group_sorted = group.sort_values(by=["score", f"{self.base_code}_name"], ascending=[False, True]) + + for _, row in group_sorted.iterrows(): + # Base logic + b_ns = row[f'{self.base_code}_namespace'] + b_mem = row[f'{self.base_code}_member_of'] + b_name = row[f'{self.base_code}_name'] + + # Target logic + t_ns = row[f'{self.target_code}_namespace'] + t_mem = row[f'{self.target_code}_member_of'] + t_name = row[f'{self.target_code}_name'] + + if t_name == "" and t_mem == "" and t_ns == "": + t_name = "*(None)*" + + score = row['score'] + match_val = row['match'] + conf_val = row['confidence'] 
+ + if match_val == "true": + if conf_val == "high": + match_icon = "✅" + else: + match_icon = "⚠️" + else: + match_icon = "❌" + + conf_display = conf_val.title() + if conf_display == "High": + conf_display = "**High**" + + lines.append( + f"| `{b_ns}` | `{b_mem}` | `{b_name}` | " + f"`{t_ns}` | `{t_mem}` | `{t_name}` | " + f"{score:.4f} | {match_icon} | {conf_display} |" + ) + + return "\n".join(lines) + def main(): parser = argparse.ArgumentParser( @@ -583,7 +545,7 @@ def main(): parser.add_argument( "--output", required=True, - help="Path to save the Markdown report.", + help="Path to save the Markdown report. Corresponding CSV will be saved with same stem.", ) parser.add_argument( "--alpha", @@ -595,7 +557,7 @@ def main(): "--report-type", choices=["md", "raw", "matrix"], default="md", - help="Type of gap report to generate (md, raw, matrix).", + help="Type of gap report. 'md' or 'raw' now produce both. 'matrix' is separate.", ) parser.add_argument( "--common", @@ -625,24 +587,30 @@ def main(): logging.error(f"Error reading feature registries: {e}") sys.exit(1) - result = match_registries(registries, args.alpha, args.report_type, args.common) - output_path = Path(args.output) output_path.parent.mkdir(parents=True, exist_ok=True) - if args.report_type == "raw": - # Raw report is a single file, no modules directory needed + result = match_registries( + registries, + args.alpha, + args.report_type, + args.common, + output_path=output_path + ) + + if args.report_type == "matrix": + # Matrix only writes one file try: - output_path.parent.mkdir(parents=True, exist_ok=True) output_path.write_text(result.master_content) - logging.info( - f"Successfully wrote raw match report to {output_path}" - ) + logging.info(f"Successfully wrote matrix report to {output_path}") except Exception as e: - logging.error(f"Error writing raw report to {output_path}: {e}") + logging.error(f"Error writing matrix report: {e}") sys.exit(1) return + # For standard report, we already generated 
CSV inside match_registries. + # Now write the Markdown and Modules. + # Create module directory if result.module_files: modules_dir_name = f"{output_path.stem}_modules" @@ -653,8 +621,8 @@ def main(): for filename, content in result.module_files.items(): # Replace placeholder for master report link # The link is relative from module dir to master report - # So name is enough. - final_content = content.replace("{master_report}", output_path.name) + # We are in {stem}_modules/, so we need to go up one level. + final_content = content.replace("{master_report}", f"../{output_path.name}") (modules_dir / filename).write_text(final_content) # Replace placeholder in Master Report @@ -667,9 +635,15 @@ def main(): master_report = result.master_content.replace("{modules_dir}", ".") try: - output_path.parent.mkdir(parents=True, exist_ok=True) output_path.write_text(master_report) logging.info(f"Successfully wrote match report to {output_path}") + # Note: CSV writing is logged inside RawReportGenerator or we should log it here + # Actually RawReportGenerator doesn't log, so we might want to Add a log here if we knew it matched + stem = output_path.stem + csv_path = output_path.parent / f"{stem}.csv" + if csv_path.exists(): + logging.info(f"Successfully wrote raw match report to {csv_path}") + except Exception as e: logging.error(f"Error writing report to {output_path}: {e}") sys.exit(1) diff --git a/src/google/adk/scope/utils/similarity.py b/src/google/adk/scope/utils/similarity.py index b484f52..3fdb8c6 100644 --- a/src/google/adk/scope/utils/similarity.py +++ b/src/google/adk/scope/utils/similarity.py @@ -255,7 +255,7 @@ def get_similarity_score( + scores["namespace"] * current_weights["namespace"] ) - early_exit_threshold = self.alpha * ( + early_exit_threshold = 0.8 * ( current_weights["name"] + current_weights["member_of"] + current_weights["namespace"] From 6bfca8b23af0bc4e3cab12297c4d4fa541051c87 Mon Sep 17 00:00:00 2001 From: Shahin Saadati Date: Wed, 18 Feb 2026 
14:22:41 -0800 Subject: [PATCH 7/9] updated reporter. --- README.md | 8 +- report.sh | 23 +- run.sh | 19 +- src/google/adk/scope/matcher/__init__.py | 0 src/google/adk/scope/matcher/matcher.py | 252 --------- src/google/adk/scope/reporter/markdown.py | 249 +++++++++ src/google/adk/scope/reporter/raw.py | 187 +++++++ src/google/adk/scope/reporter/reporter.py | 609 ++-------------------- test/adk/scope/matcher/test_matcher.py | 84 --- test/adk/scope/reporter/test_reporter.py | 387 ++++---------- 10 files changed, 609 insertions(+), 1209 deletions(-) delete mode 100644 src/google/adk/scope/matcher/__init__.py delete mode 100644 src/google/adk/scope/matcher/matcher.py create mode 100644 src/google/adk/scope/reporter/markdown.py create mode 100644 src/google/adk/scope/reporter/raw.py delete mode 100644 test/adk/scope/matcher/test_matcher.py diff --git a/README.md b/README.md index 9a5b85d..462f927 100644 --- a/README.md +++ b/README.md @@ -86,16 +86,10 @@ Once you have extracted features from two languages (e.g., Python and TypeScript | `--target ` | **Required.** Path to the comparison registry (e.g., TypeScript). | | `--output ` | **Required.** Path for the output directory. The report filename is auto-generated. | | `--report-type ` | `md` (default) for Markdown Parity Report, or `raw` for CSV. | -| `--alpha ` | Similarity threshold (0.0 - 1.0). Default is `0.8`. | #### How Matching Works -The matcher uses the **Hungarian Algorithm** to find the optimal assignment between features in the Base and Target registries. -- **Cost Function**: Based on a similarity score derived from: - - Feature Name (normalized) - - Namespace / Module - - Feature Type (Function, Method, Class, etc.) -- **Thresholding**: Pairs with a similarity score below `--alpha` are discarded. 
+TODO: This needs updating #### Understanding the Reports diff --git a/report.sh b/report.sh index edfdcb9..9d2192a 100755 --- a/report.sh +++ b/report.sh @@ -5,7 +5,6 @@ set -e # Default values REPORT_TYPE="md" -ALPHA="0.8" VERBOSE="" COMMON="" @@ -36,10 +35,6 @@ while [[ "$#" -gt 0 ]]; do REPORT_TYPE="$2" shift 2 ;; - --alpha) - ALPHA="$2" - shift 2 - ;; -v|--verbose) VERBOSE="--verbose" shift @@ -86,16 +81,11 @@ done # Default to markdown extension. The python script will generate CSV alongside it. EXTENSION="md" -if [ "$REPORT_TYPE" == "matrix" ]; then - # e.g., py_ts_go.md - OUTPUT_FILENAME="$(IFS=_; echo "${LANG_CODES[*]}").${EXTENSION}" -else - # Standard 2-way report - OUTPUT_FILENAME="${LANG_CODES[0]}_${LANG_CODES[1]}.${EXTENSION}" - # Ensure report type is 'md' for standard logic so unified generator runs - if [ "$REPORT_TYPE" == "raw" ]; then - REPORT_TYPE="md" - fi +# Standard 2-way report +OUTPUT_FILENAME="${LANG_CODES[0]}_${LANG_CODES[1]}.${EXTENSION}" +# Ensure report type is 'md' for standard logic so unified generator runs +if [ "$REPORT_TYPE" == "raw" ]; then + REPORT_TYPE="md" fi FULL_OUTPUT_PATH="${OUTPUT_DIR}/${OUTPUT_FILENAME}" @@ -106,11 +96,10 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" # Add 'src' to PYTHONPATH so the python script can find modules export PYTHONPATH="${SCRIPT_DIR}/src:${PYTHONPATH}" -# Run the python matcher +# Run the python reporter python3 "${SCRIPT_DIR}/src/google/adk/scope/reporter/reporter.py" \ --registries "${REGISTRIES[@]}" \ --output "${FULL_OUTPUT_PATH}" \ --report-type "${REPORT_TYPE}" \ - --alpha "${ALPHA}" \ ${COMMON} \ ${VERBOSE} diff --git a/run.sh b/run.sh index dedf5e5..4ca47f4 100755 --- a/run.sh +++ b/run.sh @@ -12,29 +12,20 @@ echo "Extracting Go features..." # Py -> TS -echo "Generating reports..." +echo "Generating raw and markdown reports..." ./report.sh --base output/py.txtpb --target output/ts.txtpb --output ./output --report-type md -echo "Generating raw reports..." 
-./report.sh --base output/py.txtpb --target output/ts.txtpb --output ./output --report-type raw - # Py -> Java -echo "Generating reports..." +echo "Generating raw and markdown reports..." ./report.sh --base output/py.txtpb --target output/java.txtpb --output ./output --report-type md -echo "Generating raw reports..." -./report.sh --base output/py.txtpb --target output/java.txtpb --output ./output --report-type raw - # Py -> Go -echo "Generating reports..." +echo "Generating raw and markdown reports..." ./report.sh --base output/py.txtpb --target output/go.txtpb --output ./output --report-type md -echo "Generating raw reports..." -./report.sh --base output/py.txtpb --target output/go.txtpb --output ./output --report-type raw - # Matrix reports -echo "Generating matrix reports..." -./report.sh --registries output/py.txtpb output/ts.txtpb output/java.txtpb output/go.txtpb --output ./output --report-type matrix --common \ No newline at end of file +#echo "Generating matrix reports..." +#./report.sh --registries output/py.txtpb output/ts.txtpb output/java.txtpb output/go.txtpb --output ./output --report-type matrix --common \ No newline at end of file diff --git a/src/google/adk/scope/matcher/__init__.py b/src/google/adk/scope/matcher/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/google/adk/scope/matcher/matcher.py b/src/google/adk/scope/matcher/matcher.py deleted file mode 100644 index a2dfce4..0000000 --- a/src/google/adk/scope/matcher/matcher.py +++ /dev/null @@ -1,252 +0,0 @@ -from collections import defaultdict -from typing import Dict, List, Tuple - -import numpy as np -from jellyfish import jaro_winkler_similarity -from scipy.optimize import linear_sum_assignment - -from google.adk.scope import features_pb2 -from google.adk.scope.utils import stats -from google.adk.scope.utils.similarity import SimilarityScorer - -_NEAR_MISS_THRESHOLD = 0.15 - - -def _format_feature(f: features_pb2.Feature) -> str: - name = f.original_name or 
f.normalized_name - member = f.member_of - if member and member.lower() != "null": - return f"{member}.{name}" - return name - - -def get_type_display_name(f: features_pb2.Feature) -> str: - FeatureType = features_pb2.Feature.Type - if f.type == FeatureType.CONSTRUCTOR: - return "constructor" - elif f.type in (FeatureType.FUNCTION, FeatureType.CLASS_METHOD): - return "function" - elif f.type == features_pb2.Feature.Type.INSTANCE_METHOD: - return "method" - else: - return "unknown" - - -def _get_type_priority(f: features_pb2.Feature) -> int: - """Returns priority: constructor < function < method < unknown.""" - type_name = get_type_display_name(f) - priorities = { - "constructor": 0, - "function": 1, - "method": 2, - "unknown": 3, - } - return priorities.get(type_name, 99) - - -def match_features( - base_features: List[features_pb2.Feature], - target_features: List[features_pb2.Feature], - alpha: float, -) -> List[Tuple[features_pb2.Feature, features_pb2.Feature, float]]: - """Matches features between two lists using Hungarian algorithm.""" - if not base_features or not target_features: - return [] - - scorer = SimilarityScorer() - matches = [] - - # Build Cost Matrix (Rows=Base, Cols=Target) - n_base = len(base_features) - n_target = len(target_features) - similarity_matrix = np.zeros((n_base, n_target)) - - for i, f1 in enumerate(base_features): - for j, f2 in enumerate(target_features): - similarity_matrix[i, j] = scorer.get_similarity_score(f1, f2) - - # Run Hungarian Algorithm (Global Optimization) - row_ind, col_ind = linear_sum_assignment(similarity_matrix, maximize=True) - - matched_base_indices = set() - matched_target_indices = set() - - # Filter Optimal Assignments by Alpha Threshold - for r, c in zip(row_ind, col_ind): - score = similarity_matrix[r, c] - if score > alpha: - matches.append((base_features[r], target_features[c], score)) - matched_base_indices.add(r) - matched_target_indices.add(c) - - # Update the input lists in-place (Remove matched 
items) - base_features[:] = [ - f for i, f in enumerate(base_features) if i not in matched_base_indices - ] - target_features[:] = [ - f - for i, f in enumerate(target_features) - if i not in matched_target_indices - ] - - return matches - - -def fuzzy_match_namespaces( - features_base: Dict[str, List[features_pb2.Feature]], - features_target: Dict[str, List[features_pb2.Feature]], -) -> None: - """Remaps target namespaces to base namespaces using fuzzy matching.""" - - base_namespaces = sorted(list(features_base.keys())) - remapped_features = defaultdict(list, {k: [] for k in features_base}) - - for t_ns, features in features_target.items(): - if t_ns in base_namespaces: - remapped_features[t_ns].extend(features) - continue - - if not base_namespaces: - # No base to match against, so keep original target namespace - remapped_features[t_ns].extend(features) - continue - - best_match, best_score = max( - ( - (b_ns, jaro_winkler_similarity(t_ns, b_ns)) - for b_ns in base_namespaces - ), - key=lambda item: item[1], - default=(None, 0.0), - ) - - if best_score > 0.8 and best_match: - remapped_features[best_match].extend(features) - - features_target.clear() - features_target.update(remapped_features) - - -def process_module( - module: str, - base_list: List[features_pb2.Feature], - target_list: List[features_pb2.Feature], - alpha: float, - base_lang_name: str, - target_lang_name: str, - base_lang_code: str, - target_lang_code: str, -) -> Dict: - """Analyzes a single module and generates its report content.""" - mod_base_count = len(base_list) - mod_target_count = len(target_list) - - solid_matches = match_features(base_list, target_list, alpha) - mod_solid_count = len(solid_matches) - - beta = max(0.0, alpha - _NEAR_MISS_THRESHOLD) - potential_matches = match_features(base_list, target_list, beta) - - unmatched_base = base_list - unmatched_target = target_list - - union_size = mod_base_count + mod_target_count - mod_solid_count - mod_score = mod_solid_count / 
union_size if union_size > 0 else 1.0 - - status_icon = ( - "✅" if mod_score == 1.0 else "⚠️" if mod_score >= 0.8 else "❌" - ) - module_safe_name = module.replace(".", "_") - module_filename = f"{module_safe_name}.md" - - details_link = f"[View Details]({{modules_dir}}/{module_filename})" - adk_parts = [] - if mod_base_count > 0: - adk_parts.append(base_lang_code) - if mod_target_count > 0: - adk_parts.append(target_lang_code) - adk_value = ", ".join(adk_parts) - row_content = ( - f"| {adk_value} | `{module}` | {mod_base_count} | {mod_score:.2%} |" - f" {status_icon} | {details_link} |" - ) - - # Module Content - mod_lines = [ - f"# Module: `{module}`", - "[⬅️ Back to Master Report](../{master_report})", - "", - f"**Score:** {mod_score:.2%} ({status_icon})", - ] - mod_total_features = mod_base_count + mod_target_count - mod_solid_count - mod_lines.extend(["", f"**Features:** {mod_total_features}", ""]) - - solid_matches.sort( - key=lambda x: (_get_type_priority(x[0]), x[0].normalized_name) - ) - potential_matches.sort( - key=lambda x: (_get_type_priority(x[0]), x[0].normalized_name) - ) - - if solid_matches: - mod_lines.append( - "### ✅ Solid Features" - ) - mod_lines.extend( - [ - f"| Type | {base_lang_name} Feature | {target_lang_name} Feature | Similarity Score |", - "|---|---|---|---|", - ] - ) - mod_lines.extend( - [ - f"| {get_type_display_name(f_base)} |" - f" `{_format_feature(f_base)}`" - f" | `{_format_feature(f_target)}` | {score:.2f} |" - for f_base, f_target, score in solid_matches - ] - ) - mod_lines.append("") - - if potential_matches: - mod_lines.extend( - [ - "### ⚠️ Potential Matches", - f"| Type | {base_lang_name} Feature | Closest {target_lang_name} Candidate" - " | Similarity |", - "|---|---|---|---|", - ] - ) - mod_lines.extend( - [ - f"| {get_type_display_name(f_base)} |" - f" `{_format_feature(f_base)}`" - f" | `{_format_feature(f_target)}` | {score:.2f} |" - for f_base, f_target, score in potential_matches - ] - ) - mod_lines.append("") - - 
if unmatched_base or unmatched_target: - mod_lines.extend( - [ - "### ❌ Unmatched Features", - "\n| Missing Feature | Missing In |", - "|---|---|", - ] - ) - mod_lines.extend( - [f"| `{_format_feature(f)}` | {target_lang_name} |" for f in unmatched_base] - ) - mod_lines.extend( - [f"| `{_format_feature(f)}` | {base_lang_name} |" for f in unmatched_target] - ) - mod_lines.append("") - - return { - "solid_matches_count": mod_solid_count, - "score": mod_score, - "row_content": row_content, - "module_filename": module_filename, - "module_content": "\n".join(mod_lines).strip(), - } diff --git a/src/google/adk/scope/reporter/markdown.py b/src/google/adk/scope/reporter/markdown.py new file mode 100644 index 0000000..bb8347f --- /dev/null +++ b/src/google/adk/scope/reporter/markdown.py @@ -0,0 +1,249 @@ + +import dataclasses +from datetime import datetime +from typing import Dict, List, Optional +import pandas as pd +from google.adk.scope import features_pb2 + +@dataclasses.dataclass +class MarkdownReport: + main_report_content: str + module_reports: Dict[str, str] # filename -> content + +def _get_language_code(language_name: str) -> str: + """Returns a short code for the language.""" + name = language_name.upper() + if name in {"PYTHON", "PY"}: + return "py" + elif name in {"TYPESCRIPT", "TS"}: + return "ts" + elif name == "JAVA": + return "java" + elif name in {"GOLANG", "GO"}: + return "go" + else: + return name.lower() + +def _get_language_name(language_name: str) -> str: + """Returns a properly capitalized display name for the language.""" + name = language_name.upper() + if name in {"PYTHON", "PY"}: + return "Python" + elif name in {"TYPESCRIPT", "TS"}: + return "TypeScript" + elif name == "JAVA": + return "Java" + elif name in {"GOLANG", "GO"}: + return "Go" + else: + return language_name.title() + + +class MarkdownReportGenerator: + def __init__( + self, + base_registry: features_pb2.FeatureRegistry, + target_registry: features_pb2.FeatureRegistry, + df: 
pd.DataFrame, + ): + self.base_registry = base_registry + self.target_registry = target_registry + self.df = df + + self.base_code = _get_language_code(base_registry.language) + self.target_code = _get_language_code(target_registry.language) + self.base_name = _get_language_name(base_registry.language) + self.target_name = _get_language_name(target_registry.language) + + def generate(self) -> MarkdownReport: + """Generates a Markdown parity report from the DataFrame.""" + master_lines = [] + master_lines.extend( + [ + "# Feature Matching Parity Report", + f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", + "", + "| Role | Language | Version |", + "| :--- | :--- | :--- |", + ( + f"| **Base** | {self.base_registry.language} |" + f" {self.base_registry.version} |" + ), + ( + f"| **Target** | {self.target_registry.language} |" + f" {self.target_registry.version} |" + ), + "", + ] + ) + + global_score_idx = len(master_lines) + master_lines.append("GLOBAL_SCORE_PLACEHOLDER") + master_lines.append("") + + header = f"| Module | Features ({self.base_name}) | Score | Status | Details |" + divider = "|---|---|---|---|---|" + + master_lines.extend(["## Module Summary", header, divider]) + + module_reports = {} + module_rows = [] + + # Determine cols based on language codes + col_ns = f"{self.base_code}_namespace" + + # Group by base namespace + # If namespace is empty, group under "Unknown Module" + self.df["_module_group"] = self.df[col_ns].replace("", "Unknown Module") + + grouped = self.df.groupby("_module_group") + + total_high = 0 + total_low = 0 + total_mismatch = 0 + total_base_features = len(self.df) + + for module, group in grouped: + # Calculate module stats + high = len(group[group["confidence"] == "high"]) + low = len(group[group["confidence"] == "low"]) + mismatches = len(group[group["match"] == "false"]) + + # Actually, `high` and `low` confidence applies to matches usually + # But let's verify what `match` column says. 
+ matches_high = len(group[(group["match"] == "true") & (group["confidence"] == "high")]) + matches_low = len(group[(group["match"] == "true") & (group["confidence"] == "low")]) + # Everything else is a mismatch or low confidence match? + # Let's trust `match` column for parity score + solid_matches_count = len(group[group["match"] == "true"]) + + total_high += matches_high + total_low += matches_low + total_mismatch += mismatches + + module_total = len(group) + score = solid_matches_count / module_total if module_total > 0 else 0.0 + + # Generate Module File Content + module_filename = f"{module}.md" + module_content = self._generate_module_content(module, group, module_total, matches_high, matches_low, mismatches) + module_reports[module_filename] = module_content + + # Add summary row + status_icon = "✅" if score == 1.0 else "⚠️" if score > 0.5 else "❌" + row_str = ( + f"| `{module}` | {module_total} | " + f"{score:.2%} | {status_icon} | [View Details]({{modules_dir}}/{module_filename}) |" + ) + module_rows.append((score, row_str)) + + module_rows.sort(key=lambda x: x[0], reverse=True) + master_lines.extend([row for _, row in module_rows]) + + # Summary Stats + total_matches = total_high + total_low + parity_score = total_matches / total_base_features if total_base_features > 0 else 1.0 + + base_exclusive = total_base_features - total_matches + + global_stats = ( + "## Summary\n\n" + "| Feature Category | Count | Details |\n" + "| :--- | :--- | :--- |\n" + f"| **✅ High Confidence Matches** | **{total_high}** | " + f"Strong matches found in `{self.target_name}` |\n" + f"| **⚠️ Low Confidence Matches** | **{total_low}** | " + f"Likely matches needing verification |\n" + f"| **❌ Mismatches** | **{base_exclusive}** | " + f"No suitable match found in `{self.target_name}` |\n" + f"| **📊 Coverage Score** | **{parity_score:.2%}** | " + f"Matches / Total Base Features ({total_matches} / {total_base_features}) |" + ) + + master_lines[global_score_idx] = global_stats + + 
return MarkdownReport( + main_report_content="\n".join(master_lines).strip(), + module_reports=module_reports, + ) + + def _generate_module_content( + self, + module: str, + group: pd.DataFrame, + total_features: int, + high_conf: int, + low_conf: int, + mismatches: int + ) -> str: + + # Calculate scores for summary + total_matches = high_conf + low_conf + coverage = total_matches / total_features if total_features > 0 else 0.0 + + summary_table = ( + "## Summary\n\n" + "| Feature Category | Count | Details |\n" + "| :--- | :--- | :--- |\n" + f"| **✅ High Confidence Matches** | **{high_conf}** | " + f"Strong matches found in `{self.target_name}` |\n" + f"| **⚠️ Low Confidence Matches** | **{low_conf}** | " + f"Likely matches needing verification |\n" + f"| **❌ Mismatches** | **{mismatches}** | " + f"No suitable match found in `{self.target_name}` |\n" + f"| **📊 Coverage Score** | **{coverage:.2%}** | " + f"Matches / Total Base Features ({total_matches} / {total_features}) |\n" + ) + + lines = [ + f"# Module: `{module}`", + "", + f"[← Back to Master Report]({{master_report}})", + "", + summary_table, + "## Feature Details", + "", + f"| Module ({self.base_name}) | Container ({self.base_name}) | Name ({self.base_name}) | Module ({self.target_name}) | Container ({self.target_name}) | Name ({self.target_name}) | Score | Match | Confidence |", + "| :--- | :--- | :--- | :--- | :--- | :--- | :--- | :---: | :---: |", + ] + + # Sort by score desc, then name + group_sorted = group.sort_values(by=["score", f"{self.base_code}_name"], ascending=[False, True]) + + for _, row in group_sorted.iterrows(): + # Base logic + b_ns = row[f'{self.base_code}_namespace'] + b_mem = row[f'{self.base_code}_member_of'] + b_name = row[f'{self.base_code}_name'] + + # Target logic + t_ns = row[f'{self.target_code}_namespace'] + t_mem = row[f'{self.target_code}_member_of'] + t_name = row[f'{self.target_code}_name'] + + if t_name == "" and t_mem == "" and t_ns == "": + t_name = "*(None)*" + + score 
= row['score'] + match_val = row['match'] + conf_val = row['confidence'] + + if match_val == "true": + if conf_val == "high": + match_icon = "✅" + else: + match_icon = "⚠️" + else: + match_icon = "❌" + + conf_display = conf_val.title() + if conf_display == "High": + conf_display = "**High**" + + lines.append( + f"| `{b_ns}` | `{b_mem}` | `{b_name}` | " + f"`{t_ns}` | `{t_mem}` | `{t_name}` | " + f"{score:.4f} | {match_icon} | {conf_display} |" + ) + + return "\n".join(lines) diff --git a/src/google/adk/scope/reporter/raw.py b/src/google/adk/scope/reporter/raw.py new file mode 100644 index 0000000..c49988d --- /dev/null +++ b/src/google/adk/scope/reporter/raw.py @@ -0,0 +1,187 @@ + +import pandas as pd +from collections import defaultdict +from typing import Optional, Dict, Any, List, Tuple +from google.adk.scope import features_pb2 +from google.adk.scope.utils.similarity import SimilarityScorer + +# Global thresholds for match confidence +SIMILARITY_THRESHOLDS = { + frozenset(["py", "go"]): {"high": 0.6, "avg": 0.5}, + frozenset(["py", "java"]): {"high": 0.6, "avg": 0.58}, + frozenset(["py", "ts"]): {"high": 0.7, "avg": 0.55}, +} + +# Fallback thresholds if language pair not explicitly defined +DEFAULT_THRESHOLDS = {"high": 0.8, "avg": 0.6} + + +def get_type_display_name(f: features_pb2.Feature) -> str: + FeatureType = features_pb2.Feature.Type + if f.type == FeatureType.CONSTRUCTOR: + return "constructor" + elif f.type in (FeatureType.FUNCTION, FeatureType.CLASS_METHOD): + return "function" + elif f.type == features_pb2.Feature.Type.INSTANCE_METHOD: + return "method" + else: + return "unknown" + + +def _get_lang_code(language: str) -> str: + """Returns a short code for the language (e.g. 
class RawReportGenerator:
    """Builds the raw, row-per-base-feature comparison of two registries.

    Every base feature is paired with the best-scoring target feature of
    the same type, and the pair is labelled with a match flag and a
    confidence level taken from the language-pair thresholds.
    """

    def __init__(
        self,
        base_registry: features_pb2.FeatureRegistry,
        target_registry: features_pb2.FeatureRegistry,
    ):
        self.base_registry = base_registry
        self.target_registry = target_registry
        self.scorer = SimilarityScorer()

        # Short language codes double as column prefixes in the report.
        self.base_code = _get_lang_code(self.base_registry.language)
        self.target_code = _get_lang_code(self.target_registry.language)

        # Thresholds are keyed by the unordered language pair.
        pair_key = frozenset([self.base_code, self.target_code])
        self.thresholds = SIMILARITY_THRESHOLDS.get(
            pair_key, DEFAULT_THRESHOLDS
        )

        # Bucket target features by type so lookups only scan candidates
        # of the same type as the base feature.
        self.target_by_type = defaultdict(list)
        for feature in self.target_registry.features:
            self.target_by_type[feature.type].append(feature)

    def generate(self, output_path: Optional[str] = None) -> pd.DataFrame:
        """Return the raw report; also write it as CSV if a path is given."""
        rows = [
            self._create_row_data(f_base, *self._find_best_match(f_base))
            for f_base in self.base_registry.features
        ]
        df = self._create_dataframe(rows)

        if output_path:
            self._save_csv(df, output_path)

        return df

    def _find_best_match(
        self, f_base: features_pb2.Feature
    ) -> Tuple[Optional[features_pb2.Feature], float]:
        """Return the highest-scoring same-type target feature and its score.

        Yields ``(None, 0.0)`` when the target has no feature of that type.
        On equal scores the earliest candidate is kept.
        """
        candidates = self.target_by_type.get(f_base.type, [])
        if not candidates:
            return None, 0.0

        winner, top_score = None, -1.0
        for candidate in candidates:
            current = self.scorer.get_similarity_score(f_base, candidate)
            if current > top_score:
                winner, top_score = candidate, current

        return winner, top_score

    def _create_row_data(
        self,
        f_base: features_pb2.Feature,
        f_target: Optional[features_pb2.Feature],
        score: float,
    ) -> Dict[str, Any]:
        """Assemble the dictionary for one report row."""
        row: Dict[str, Any] = {}

        # Base columns first, then target columns (blank without a target).
        self._fill_feature_cols(row, f_base, self.base_code)
        if not f_target:
            self._fill_empty_cols(row, self.target_code)
        else:
            self._fill_feature_cols(row, f_target, self.target_code)

        row["type"] = get_type_display_name(f_base)
        row["score"] = score
        row["match"], row["confidence"] = self._determine_match_status(score)

        return row

    def _fill_feature_cols(
        self, row: Dict[str, Any], f: features_pb2.Feature, prefix: str
    ):
        """Write the namespace, member_of and name columns of a feature.

        Original values win over normalized ones; a container spelled
        "null" (any casing) is stored as an empty string.
        """
        namespace = f.namespace or f.normalized_namespace or ""
        container = f.member_of or f.normalized_member_of or ""
        if str(container).lower() == "null":
            container = ""
        name = f.original_name or f.normalized_name or ""

        row.update(
            {
                f"{prefix}_namespace": namespace,
                f"{prefix}_member_of": container,
                f"{prefix}_name": name,
            }
        )

    def _fill_empty_cols(self, row: Dict[str, Any], prefix: str):
        """Blank out the three feature columns for the given prefix."""
        for field in ("namespace", "member_of", "name"):
            row[f"{prefix}_{field}"] = ""

    def _determine_match_status(self, score: float) -> Tuple[str, str]:
        """Map a score to ``(match, confidence)`` string labels.

        Strictly above "high" is a high-confidence match, at or above "avg"
        a low-confidence match, anything lower a high-confidence mismatch.
        """
        if score > self.thresholds["high"]:
            return "true", "high"
        if score >= self.thresholds["avg"]:
            return "true", "low"
        return "false", "high"

    def _create_dataframe(self, rows: List[Dict[str, Any]]) -> pd.DataFrame:
        """Turn the row dictionaries into a DataFrame with fixed columns."""
        feature_cols = [
            f"{code}_{field}"
            for code in (self.base_code, self.target_code)
            for field in ("namespace", "member_of", "name")
        ]
        cols_order = feature_cols + ["type", "score", "match", "confidence"]

        if not rows:
            return pd.DataFrame(columns=cols_order)

        # reindex fixes the column order and fills absent columns with "".
        return pd.DataFrame(rows).reindex(columns=cols_order, fill_value="")

    def _save_csv(self, df: pd.DataFrame, output_path: str):
        """Write the DataFrame as CSV, creating parent directories first."""
        from pathlib import Path

        destination = Path(output_path)
        destination.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(output_path, index=False)
def generate_markdown_raw_reports(
    registries: List[features_pb2.FeatureRegistry],
    report_type: str = "md",  # Kept for backward compatibility
    output_path: Optional[Path] = None,
) -> markdown.MarkdownReport:
    """Match two registries and produce the raw and Markdown reports.

    The first registry is the base and the second is the target. When
    ``output_path`` is given, three artefacts are written next to it: the
    CSV (``<stem>.csv``), the master Markdown report, and one file per
    module under ``<stem>_modules/``.

    Args:
        registries: Feature registries; index 0 is the base, index 1 the
            target.
        report_type: Unused; accepted so existing callers keep working.
        output_path: Destination of the master Markdown report. ``None``
            skips all file output and only returns the report object.

    Returns:
        The generated report with its ``{modules_dir}`` and
        ``{master_report}`` placeholders still unresolved; they are only
        substituted in the copies written to disk.

    Raises:
        SystemExit: With status 1 if writing a report file fails.
    """
    generator = raw.RawReportGenerator(registries[0], registries[1])

    csv_path = None
    md_path = None
    if output_path is not None:
        csv_path = output_path.parent / f"{output_path.stem}.csv"
        # If the caller passed "report.csv", writing the Markdown to that
        # same path would overwrite the CSV; use "report.md" instead.
        if output_path.suffix.lower() == ".csv":
            md_path = output_path.parent / f"{output_path.stem}.md"
        else:
            md_path = output_path

    df = generator.generate(
        output_path=str(csv_path) if csv_path is not None else None
    )

    reporter = markdown.MarkdownReportGenerator(
        registries[0], registries[1], df
    )
    result = reporter.generate()

    if md_path is None:
        # In-memory use (e.g. tests): nothing to write.
        return result

    try:
        if result.module_reports:
            modules_dir_name = f"{md_path.stem}_modules"
            modules_dir = md_path.parent / modules_dir_name
            modules_dir.mkdir(parents=True, exist_ok=True)

            # Module pages live one level below the master report.
            back_link = f"../{md_path.name}"
            for filename, content in result.module_reports.items():
                (modules_dir / filename).write_text(
                    content.replace("{master_report}", back_link),
                    encoding="utf-8",
                )
        else:
            modules_dir_name = "."

        master_report = result.main_report_content.replace(
            "{modules_dir}", modules_dir_name
        )
        # The reports contain emoji and arrows; an explicit encoding keeps
        # the write independent of the platform's default code page.
        md_path.write_text(master_report, encoding="utf-8")
    except OSError as e:
        logging.error(f"Error writing report to {md_path}: {e}")
        sys.exit(1)

    logging.info(f"Successfully wrote match report to {md_path}")
    # RawReportGenerator does not log, so report the CSV here.
    if csv_path.exists():
        logging.info(f"Successfully wrote raw match report to {csv_path}")

    return result
row[f'{self.base_code}_namespace'] - b_mem = row[f'{self.base_code}_member_of'] - b_name = row[f'{self.base_code}_name'] - - # Target logic - t_ns = row[f'{self.target_code}_namespace'] - t_mem = row[f'{self.target_code}_member_of'] - t_name = row[f'{self.target_code}_name'] - - if t_name == "" and t_mem == "" and t_ns == "": - t_name = "*(None)*" - - score = row['score'] - match_val = row['match'] - conf_val = row['confidence'] - - if match_val == "true": - if conf_val == "high": - match_icon = "✅" - else: - match_icon = "⚠️" - else: - match_icon = "❌" + try: + output_path.write_text(master_report) + logging.info(f"Successfully wrote match report to {output_path}") + # Note: CSV writing is logged inside RawReportGenerator or we should log it here + # Actually RawReportGenerator doesn't log, so we might want to Add a log here if we knew it matched + stem = output_path.stem + csv_path = output_path.parent / f"{stem}.csv" + if csv_path.exists(): + logging.info(f"Successfully wrote raw match report to {csv_path}") - conf_display = conf_val.title() - if conf_display == "High": - conf_display = "**High**" - - lines.append( - f"| `{b_ns}` | `{b_mem}` | `{b_name}` | " - f"`{t_ns}` | `{t_mem}` | `{t_name}` | " - f"{score:.4f} | {match_icon} | {conf_display} |" - ) - - return "\n".join(lines) + except Exception as e: + logging.error(f"Error writing report to {output_path}: {e}") + sys.exit(1) def main(): @@ -547,22 +109,11 @@ def main(): required=True, help="Path to save the Markdown report. Corresponding CSV will be saved with same stem.", ) - parser.add_argument( - "--alpha", - type=float, - default=0.8, - help="Similarity threshold (0.0 to 1.0) defaults to 0.8.", - ) parser.add_argument( "--report-type", - choices=["md", "raw", "matrix"], + choices=["md", "matrix"], default="md", - help="Type of gap report. 'md' or 'raw' now produce both. 
'matrix' is separate.", - ) - parser.add_argument( - "--common", - action="store_true", - help="Only list features present in Python or at least 2 languages (matrix report only).", + help="Type of gap report. 'md' or 'matrix' now produce both.", ) adk_args.add_verbose_argument(parser) args = parser.parse_args() @@ -590,64 +141,12 @@ def main(): output_path = Path(args.output) output_path.parent.mkdir(parents=True, exist_ok=True) - result = match_registries( + generate_markdown_raw_reports( registries, - args.alpha, args.report_type, - args.common, output_path=output_path ) - if args.report_type == "matrix": - # Matrix only writes one file - try: - output_path.write_text(result.master_content) - logging.info(f"Successfully wrote matrix report to {output_path}") - except Exception as e: - logging.error(f"Error writing matrix report: {e}") - sys.exit(1) - return - - # For standard report, we already generated CSV inside match_registries. - # Now write the Markdown and Modules. - - # Create module directory - if result.module_files: - modules_dir_name = f"{output_path.stem}_modules" - modules_dir = output_path.parent / modules_dir_name - modules_dir.mkdir(parents=True, exist_ok=True) - - # Write module files - for filename, content in result.module_files.items(): - # Replace placeholder for master report link - # The link is relative from module dir to master report - # We are in {stem}_modules/, so we need to go up one level. 
- final_content = content.replace("{master_report}", f"../{output_path.name}") - (modules_dir / filename).write_text(final_content) - - # Replace placeholder in Master Report - # We assume master report is in parent of modules_dir - # modules_dir relative to master report is just the dir name - master_report = result.master_content.replace( - "{modules_dir}", modules_dir_name - ) - else: - master_report = result.master_content.replace("{modules_dir}", ".") - - try: - output_path.write_text(master_report) - logging.info(f"Successfully wrote match report to {output_path}") - # Note: CSV writing is logged inside RawReportGenerator or we should log it here - # Actually RawReportGenerator doesn't log, so we might want to Add a log here if we knew it matched - stem = output_path.stem - csv_path = output_path.parent / f"{stem}.csv" - if csv_path.exists(): - logging.info(f"Successfully wrote raw match report to {csv_path}") - - except Exception as e: - logging.error(f"Error writing report to {output_path}: {e}") - sys.exit(1) - if __name__ == "__main__": main() diff --git a/test/adk/scope/matcher/test_matcher.py b/test/adk/scope/matcher/test_matcher.py deleted file mode 100644 index bfa5cb6..0000000 --- a/test/adk/scope/matcher/test_matcher.py +++ /dev/null @@ -1,84 +0,0 @@ -import unittest - -from google.adk.scope import features_pb2 -from google.adk.scope.matcher import matcher - - -class TestMatcher(unittest.TestCase): - - def test_match_features(self): - f1 = features_pb2.Feature( - normalized_name="f_same", - normalized_member_of="c_same", - normalized_namespace="n_same", - type=features_pb2.Feature.Type.INSTANCE_METHOD, - ) - f2 = features_pb2.Feature( - normalized_name="f_same", - normalized_member_of="c_same", - normalized_namespace="n_same", - type=features_pb2.Feature.Type.INSTANCE_METHOD, - ) - f3 = features_pb2.Feature( - normalized_name="totally", - normalized_member_of="different", - normalized_namespace="stuff", - 
type=features_pb2.Feature.Type.INSTANCE_METHOD, - ) - f4 = features_pb2.Feature( - normalized_name="entirely", - normalized_member_of="unrelated", - normalized_namespace="things", - type=features_pb2.Feature.Type.INSTANCE_METHOD, - ) - - base_features = [f1, f3] - target_features = [f4, f2] - - matches = matcher.match_features(base_features, target_features, 0.8) - - self.assertEqual(len(matches), 1) - - m_f1, m_f2, score = matches[0] - self.assertEqual(m_f1.normalized_name, "f_same") - self.assertEqual(m_f2.normalized_name, "f_same") - self.assertGreater(score, 0.8) - - # Assert lists were mutated and matched elements removed - self.assertEqual(len(base_features), 1) - self.assertEqual(base_features[0].normalized_name, "totally") - - self.assertEqual(len(target_features), 1) - self.assertEqual(target_features[0].normalized_name, "entirely") - - def test_fuzzy_match_namespaces(self): - features_base = {"module.one": [], "module.two": []} - features_target = { - "module.one": [features_pb2.Feature(original_name="f1_target")], - "module.ones": [features_pb2.Feature(original_name="f4")], - "module.three": [features_pb2.Feature(original_name="f5")], - } - - matcher.fuzzy_match_namespaces(features_base, features_target) - - self.assertIn("module.one", features_target) - self.assertIn("module.two", features_target) - self.assertNotIn("module.ones", features_target) - self.assertNotIn("module.three", features_target) - self.assertEqual(len(features_target["module.one"]), 3) - self.assertEqual(len(features_target["module.two"]), 0) - - def test_fuzzy_match_namespaces_empty_base(self): - features_base = {} - features_target = { - "module.one": [features_pb2.Feature(original_name="f1")] - } - - matcher.fuzzy_match_namespaces(features_base, features_target) - - self.assertIn("module.one", features_target) - self.assertEqual(len(features_target["module.one"]), 1) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/adk/scope/reporter/test_reporter.py 
b/test/adk/scope/reporter/test_reporter.py index 87bb895..c7a9405 100644 --- a/test/adk/scope/reporter/test_reporter.py +++ b/test/adk/scope/reporter/test_reporter.py @@ -6,7 +6,7 @@ from google.protobuf import text_format from google.adk.scope import features_pb2 -from google.adk.scope.matcher import matcher + from google.adk.scope.reporter import reporter @@ -90,7 +90,7 @@ def test_match_registries(self): original_name="totally_diff", normalized_name="totally", member_of="null", - namespace="google.adk.events", + namespace="stuff", normalized_member_of="different", normalized_namespace="stuff", type=features_pb2.Feature.Type.INSTANCE_METHOD, @@ -108,216 +108,53 @@ def test_match_registries(self): # Test Markdown Report result_md = reporter.match_registries( - [base_registry, target_registry], 0.8, report_type="md" + [base_registry, target_registry], report_type="md" ) - report_md = result_md.master_content + report_md = result_md.main_report_content # 1. Verify Master Report Structure self.assertIn("# Feature Matching Parity Report", report_md) self.assertIn("## Summary", report_md) - self.assertIn("| **✅ Common Shared** | **1** |", report_md) - self.assertIn("| **📦 Exclusive to `Python`** | **2** |", report_md) - self.assertIn("| **📦 Exclusive to `TypeScript`** | **1** |", report_md) - self.assertIn("| **📊 Jaccard Score** | **25.00%** |", report_md) + # Check for High/Low confidence summaries + self.assertIn("| **✅ High Confidence Matches** | **1** |", report_md) + self.assertIn("| **⚠️ Low Confidence Matches** | **1** |", report_md) + self.assertIn("| **❌ Mismatches** | **1** |", report_md) self.assertIn("## Module Summary", report_md) # Check for module entry in master summary - self.assertIn("| ADK | Module | Features (Python) | Score | Status | Details |", report_md) - self.assertIn("| `n_same` |", report_md) - self.assertIn("[View Details]({modules_dir}/n_same.md)", report_md) + self.assertIn("| Module | Features (Python) | Score | Status | Details |", 
report_md) + self.assertIn("| `google.adk.events` |", report_md) + self.assertIn("[View Details]({modules_dir}/google.adk.events.md)", report_md) # 2. Verify Module Content - self.assertIn("n_same.md", result_md.module_files) - module_content = result_md.module_files["n_same.md"] - - self.assertIn("# Module: `n_same`", module_content) - self.assertIn("**Features:** 3", module_content) - - # Solid Matches - self.assertIn("### ✅ Solid Features", module_content) - self.assertIn( - "| Type | Python Feature | TypeScript Feature | Similarity Score |", - module_content, - ) - self.assertIn( - "| method | `BaseClass.fSameBase` | `TargetClass.fSameTarget` |", - module_content, - ) - - # Potential Matches (formerly Near Misses) - self.assertIn("### ⚠️ Potential Matches", module_content) - self.assertIn( - "| Type | Python Feature | Closest TypeScript Candidate | Similarity |", - module_content, - ) - self.assertIn( - "| method | `base_member.base_name` | " - "`target_member.target_name` |", - module_content, - ) + self.assertIn("google.adk.events.md", result_md.module_reports) + module_content = result_md.module_reports["google.adk.events.md"] + + self.assertIn("# Module: `google.adk.events`", module_content) + # New summary table in module + self.assertIn("## Summary", module_content) + self.assertIn("## Feature Details", module_content) + + # Solid Matches (High Confidence) + self.assertIn("✅", module_content) + self.assertIn("**High**", module_content) + self.assertIn("`fSameBase`", module_content) + self.assertIn("`fSameTarget`", module_content) + + # Potential Matches (Low Confidence) + self.assertIn("⚠️", module_content) + self.assertIn("Low", module_content) + self.assertIn("`base_name`", module_content) + self.assertIn("`target_name`", module_content) # Unmatched / Gaps (in 'stuff' module) - self.assertIn("stuff.md", result_md.module_files) - stuff_content = result_md.module_files["stuff.md"] - self.assertIn("### ❌ Unmatched Features", stuff_content) - 
self.assertIn("| `totally_diff` | TypeScript |", stuff_content) - self.assertIn("**Features:** 1", stuff_content) - - def test_matrix_report(self): - f_py = features_pb2.Feature( - original_name="f", - normalized_name="f", - member_of="c", - normalized_member_of="c", - normalized_namespace="n", - type=features_pb2.Feature.Type.FUNCTION, - ) - f_ts = features_pb2.Feature( - original_name="f", - normalized_name="f", - member_of="c", - normalized_member_of="c", - normalized_namespace="n", - type=features_pb2.Feature.Type.FUNCTION, - ) - # Go only matches partially (different name) or provides a new feature - f_go1 = features_pb2.Feature( - original_name="new_f", - normalized_name="new_f", - member_of="c", - normalized_member_of="c", - normalized_namespace="n", - type=features_pb2.Feature.Type.FUNCTION, - ) - - r_py = features_pb2.FeatureRegistry(language="Python", version="1") - r_py.features.append(f_py) - - r_ts = features_pb2.FeatureRegistry(language="TypeScript", version="2") - r_ts.features.append(f_ts) - - r_go = features_pb2.FeatureRegistry(language="Go", version="3") - r_go.features.append(f_go1) - - result_matrix = reporter.match_registries( - [r_py, r_ts, r_go], 0.9, report_type="matrix" - ) - - report_md = result_matrix.master_content - - # 1. Check title & headers - self.assertIn("# Multi-SDK Feature Matrix Report", report_md) - self.assertIn("| **Anchor** | Python | 1 |", report_md) - self.assertIn("| **Comparison 1** | TypeScript | 2 |", report_md) - self.assertIn("| **Comparison 2** | Go | 3 |", report_md) - - # 2. Check Jaccard Matrix - self.assertIn("## Global Parity Matrix", report_md) - self.assertIn("| Language | Python | TypeScript | Go |", report_md) - # Py vs TS should be 100% since they both only have 'f' - self.assertIn("| **Python** | - | 100.00% | 0.00% |", report_md) - # Py/TS vs Go should be 0% since Go has 'new_f' entirely disjoint - self.assertIn("| **Go** | 0.00% | 0.00% | - |", report_md) - - # 3. 
Check Global Feature Matrix - self.assertIn("## Global Feature Support", report_md) - self.assertIn("### Module: `n`", report_md) - self.assertIn("| Feature | Type | Python | TypeScript | Go |", report_md) - - # 'f' should be yes for Py/Ts, no for Go - self.assertIn("| `c.f` | function | ✅ | ✅ | ❌ |", report_md) - - # 'new_f' should be no for Py/Ts, yes for Go - self.assertIn("| `c.new_f` | function | ❌ | ❌ | ✅ |", report_md) - - f1 = features_pb2.Feature( - original_name="f_same", - normalized_name="f_same", - normalized_namespace="pkg", - member_of="MyClass", - normalized_member_of="myclass", - type=features_pb2.Feature.Type.FUNCTION, - ) - base = features_pb2.FeatureRegistry(language="Python", version="1") - base.features.append(f1) - target = features_pb2.FeatureRegistry(language="TS", version="2") - target.features.append(f1) - - result = reporter.match_registries([base, target], 0.9, report_type="raw") - csv_content = result.master_content - - expected_header = ( - "py_namespace,py_member_of,py_name,ts_namespace," - "ts_member_of,ts_name,type,score" - ) - self.assertIn(expected_header, csv_content) - - # Check for solid match line - # f1 has: ns=pkg, mem=MyClass, name=f_same - # Match should have same values for base and target - expected_line = "pkg,MyClass,f_same,pkg,MyClass,f_same,function,1.0000" - self.assertIn(expected_line, csv_content) - self.assertFalse(result.module_files) - - def test_group_features_by_module(self): - registry = features_pb2.FeatureRegistry() - f1 = registry.features.add() - f1.namespace = "module.one" - f2 = registry.features.add() - f2.namespace = "module.two" - f3 = registry.features.add() - f3.namespace = "module.one" - - result = reporter._group_features_by_module(registry) - - self.assertIn("module.one", result) - self.assertIn("module.two", result) - self.assertEqual(len(result["module.one"]), 2) - self.assertEqual(len(result["module.two"]), 1) - - def test_process_module(self): - """Tests the end-to-end processing of a 
single module.""" - f_base = features_pb2.Feature( - original_name="f1_base", - normalized_name="f1_base", - normalized_namespace="n1", - type=features_pb2.Feature.Type.FUNCTION, - ) - f_target = features_pb2.Feature( - original_name="f1_target", - normalized_name="f1_target", - normalized_namespace="n1", - type=features_pb2.Feature.Type.FUNCTION, - ) - - with patch( - "google.adk.scope.reporter.reporter.matcher.match_features" - ) as mock_match: - # Let's assume one solid match and no potential matches - mock_match.side_effect = [ - [(f_base, f_target, 0.95)], # Solid matches - [], # Potential matches - ] - - result = matcher.process_module( - module="n1", - base_list=[f_base], - target_list=[f_target], - alpha=0.9, - base_lang_name="Python", - target_lang_name="TypeScript", - base_lang_code="py", - target_lang_code="ts", - ) - - self.assertEqual(result["solid_matches_count"], 1) - self.assertEqual(result["score"], 1.0) - self.assertIn("| py, ts |", result["row_content"]) - self.assertIn("# Module: `n1`", result["module_content"]) - self.assertIn("### ✅ Solid Features", result["module_content"]) + self.assertIn("stuff.md", result_md.module_reports) + stuff_content = result_md.module_reports["stuff.md"] + self.assertIn("❌", stuff_content) + self.assertIn("`totally_diff`", stuff_content) def test_generate_raw_report(self): - """Tests the raw CSV report generation.""" + """Tests the raw CSV report generation via RawReportGenerator.""" f_base = features_pb2.Feature( original_name="f1_base", normalized_name="f1_base", @@ -343,17 +180,20 @@ def test_generate_raw_report(self): ) target_registry.features.extend([f_target]) - # We no longer patch match_features, we rely on SimilarityScorer - # yielding a high score for identical features. 
- result = reporter.ReportGenerator( - base_registry, target_registry, 0.9 - ).generate_raw_report() - - self.assertIn( - "py_namespace,py_member_of,py_name", - result.master_content, + # Use RawReportGenerator directly + generator = reporter.raw.RawReportGenerator( + base_registry, target_registry ) - self.assertIn("n1,c1,f1_base", result.master_content) + df = generator.generate() + + # Check columns + self.assertIn("py_namespace", df.columns) + self.assertIn("score", df.columns) + + # Check content + row = df.iloc[0] + self.assertEqual(row["py_name"], "f1_base") + self.assertEqual(row["score"], 1.0) def test_global_best_match(self): """Tests that a feature matches best candidate globally, ignoring namespace.""" @@ -387,49 +227,16 @@ def test_global_best_match(self): target_registry = features_pb2.FeatureRegistry(language="Java", version="2") target_registry.features.extend([f_target_bad, f_target_good]) - # Logic should pick f_target_good because it has higher similarity - # even though it is in a different namespace. 
- result = reporter.ReportGenerator( - base_registry, target_registry, 0.5 - ).generate_raw_report() - - # Check that we found the match in n2 - self.assertIn("n1,,my_feature,n2,,my_feature,function,1.0000", result.master_content) - - def test_generate_md_report(self): - """Tests the md report generation.""" - base_registry = features_pb2.FeatureRegistry( - language="Python", version="1.0.0" + # RawReportGenerator logic + generator = reporter.raw.RawReportGenerator( + base_registry, target_registry ) - f1 = base_registry.features.add() - f1.namespace = "n1" - target_registry = features_pb2.FeatureRegistry( - language="TypeScript", version="2.0.0" - ) - - with patch( - "google.adk.scope.reporter.reporter.matcher.process_module" - ) as mock_process: - mock_process.return_value = { - "solid_matches_count": 1, - "score": 1.0, - "row_content": "| py, ts | `n1` | 1 | 100.00% | ✅ | n1.md |", - "module_filename": "n1.md", - "module_content": "# Module: `n1`", - } - - result = reporter.ReportGenerator( - base_registry, target_registry, 0.9 - ).generate_md_report() - - self.assertIn( - "# Feature Matching Parity Report", result.master_content - ) - self.assertIn("## Summary", result.master_content) - self.assertIn("## Module Summary", result.master_content) - self.assertIn("| `n1` |", result.master_content) - self.assertIn("n1.md", result.module_files) + df = generator.generate() + # Check that we found the match in n2 + row = df.iloc[0] + self.assertEqual(row["java_namespace"], "n2") + self.assertEqual(row["score"], 1.0) def test_raw_integration(self): """Tests the raw report generation end-to-end.""" @@ -501,18 +308,14 @@ def test_raw_integration(self): typescript_features_str, features_pb2.FeatureRegistry() ) - result = reporter.ReportGenerator( - py_registry, ts_registry, 0.8 - ).generate_raw_report() - - self.assertIn( - "py_namespace,py_member_of,py_name,ts_namespace,ts_member_of,ts_name,type,score", - result.master_content, - ) + generator = 
reporter.raw.RawReportGenerator(py_registry, ts_registry) + df = generator.generate() - # Verify the solid match is present with high score - # Note: Original names are used (load_artifact vs loadArtifact) and original members (InMemoryArtifactService) - self.assertRegex(result.master_content, r"runners,InMemoryArtifactService,load_artifact,artifacts,InMemoryArtifactService,loadArtifact,.*,0.86[0-9]*") + # Verify solid match (high score) + row = df.iloc[0] + self.assertEqual(row["py_name"], "load_artifact") + self.assertEqual(row["ts_name"], "loadArtifact") + self.assertGreater(row["score"], 0.8) def test_raw_report_match_confidence(self): """Tests match and confidence columns with various scores.""" @@ -537,35 +340,59 @@ def test_raw_report_match_confidence(self): # It's easier to mock SimilarityScorer to return fixed scores. target.features.extend([f_high, f_avg, f_low]) - with patch("google.adk.scope.reporter.reporter.SimilarityScorer") as MockScorer: + with patch("google.adk.scope.reporter.raw.SimilarityScorer") as MockScorer: instance = MockScorer.return_value - # match_registries -> ReportGenerator -> generate_raw_report -> SimilarityScorer - # We need to control get_similarity_score. - # The logic iterates base features, then finds best match target. # Case 1: High match - # We want best_score to be > 0.6 instance.get_similarity_score.return_value = 0.9 + gen = reporter.raw.RawReportGenerator(base, target) + df = gen.generate() + # Since generate iterates through base features, and we have 1 base feature, + # it will run once. We need to test behavior for different scores. + # But generate() does all at once. - gen = reporter.ReportGenerator(base, target, 0.1) - # We need to reset the scorer inside generator if we patched the class, - # but ReportGenerator instantiates it inside generate_raw_report. - # So the patch above should work for the instance created inside. + # Actually, `generate` iterates through base features. 
+ # If we want to test different outcomes, we should perhaps just test + # the _get_confidence_level method or ensure our mock returns different values + # for different calls if possible, or just run 3 separate gens. - result = gen.generate_raw_report() + # Test High + self.assertEqual(df.iloc[0]["match"], "true") + self.assertEqual(df.iloc[0]["confidence"], "high") - # Check for match=true, confidence=high - self.assertIn("true,high", result.master_content) - - # Case 2: Avg match (0.55) -> match=true, confidence=low + # Test Avg (Low Confidence) instance.get_similarity_score.return_value = 0.55 - result = gen.generate_raw_report() - self.assertIn("true,low", result.master_content) - - # Case 3: Low/No match (0.4) -> match=false, confidence=high + gen = reporter.raw.RawReportGenerator(base, target) + df = gen.generate() + self.assertEqual(df.iloc[0]["match"], "true") + self.assertEqual(df.iloc[0]["confidence"], "low") + + # Test Low (Mismatch) instance.get_similarity_score.return_value = 0.4 - result = gen.generate_raw_report() - self.assertIn("false,high", result.master_content) + gen = reporter.raw.RawReportGenerator(base, target) + df = gen.generate() + self.assertEqual(df.iloc[0]["match"], "false") + self.assertEqual(df.iloc[0]["confidence"], "high") # Mismatches are high confidence if very low score? + # Wait, raw.py logic: + # if score > high_thresh: true, high + # elif score > avg_thresh: true, low + # else: match=false + # if match=false, confidence depends on score? + # Actually raw.py says: + # if match: ... + # else: row["match"] = "false" + # And confidence is set to "high" by default for mismatches in raw.py? + # Let's check raw.py. + # "confidence": "high" is default init. + # If match found, it might be updated to "low". + # If no match found (score < avg), it remains "high" (High confidence that it is NOT a match). 
+ + self.assertEqual(df.iloc[0]["match"], "false") + self.assertEqual(df.iloc[0]["confidence"], "high") + + +if __name__ == "__main__": + unittest.main() if __name__ == "__main__": From 47cee98c1f206354c540f8620d7fd8e7c00463ce Mon Sep 17 00:00:00 2001 From: Shahin Saadati Date: Wed, 18 Feb 2026 14:48:05 -0800 Subject: [PATCH 8/9] Ruff stuff --- .../adk/scope/extractors/converter_go.py | 75 ++++--- .../adk/scope/extractors/converter_py.py | 8 +- .../adk/scope/extractors/extractor_go.py | 29 +-- src/google/adk/scope/reporter/markdown.py | 141 ++++++++----- src/google/adk/scope/reporter/raw.py | 7 +- src/google/adk/scope/reporter/reporter.py | 47 +++-- src/google/adk/scope/utils/score_features.py | 17 +- src/google/adk/scope/utils/similarity.py | 36 ++-- src/google/adk/scope/utils/stats.py | 19 -- .../adk/scope/extractors/test_extractor_go.py | 4 +- .../adk/scope/extractors/test_extractor_py.py | 40 ++-- .../adk/scope/extractors/test_extractor_ts.py | 4 +- test/adk/scope/reporter/test_reporter.py | 193 ++++++++++-------- test/adk/scope/utils/test_stats.py | 26 --- 14 files changed, 358 insertions(+), 288 deletions(-) delete mode 100644 src/google/adk/scope/utils/stats.py delete mode 100644 test/adk/scope/utils/test_stats.py diff --git a/src/google/adk/scope/extractors/converter_go.py b/src/google/adk/scope/extractors/converter_go.py index 3388291..4b2cd8c 100644 --- a/src/google/adk/scope/extractors/converter_go.py +++ b/src/google/adk/scope/extractors/converter_go.py @@ -19,9 +19,8 @@ class NodeProcessor: def __init__(self): self.normalizer = TypeNormalizer() - def __init__(self): - self.normalizer = TypeNormalizer() - # Mapping from struct name to list of (field_name, field_type, is_optional) + # Mapping from struct name to list of + # (field_name, field_type, is_optional) self._struct_definitions: dict[str, list[tuple[str, str, bool]]] = {} def process( @@ -33,7 +32,11 @@ def process( normalized_namespace: str, ) -> Optional[feature_pb2.Feature]: """Convert a 
Tree-sitter node into a Feature.""" - valid_nodes = ("function_declaration", "method_declaration", "method_elem") + valid_nodes = ( + "function_declaration", + "method_declaration", + "method_elem", + ) if node.type not in valid_nodes: return None @@ -67,7 +70,8 @@ def process( if original_returns: # Typically the first return value is the struct ret_type = original_returns[0] - # access the struct name, e.g. *Agent -> Agent, mypkg.Agent -> Agent + # access the struct name, e.g. *Agent -> Agent, + # mypkg.Agent -> Agent # Similar logic to parameter flattening type extraction clean_ret = ret_type.lstrip("*").split(".")[-1] if clean_ret: @@ -83,7 +87,7 @@ def process( parameters, is_async = self._extract_params(node) original_returns, normalized_returns = self._extract_return_types(node) - + docstring = self._extract_docstring(node) feature = feature_pb2.Feature( @@ -114,28 +118,30 @@ def register_struct(self, node: Node) -> None: """Register a struct definition to allow parameter flattening.""" # Find struct name from parent type_spec parent = node.parent - # The query capture is on (type_spec name: ... type: (struct_type) @struct_body) + # The query capture is on: + # (type_spec name: ... type: (struct_type) @struct_body) # So node is the struct_type node. Parent should be type_spec. 
if not parent or parent.type != "type_spec": return - + name_node = parent.child_by_field_name("name") if not name_node: return - + struct_name = name_node.text.decode("utf-8") - + # Parse fields fields = [] - - # Iterating children to find field_declaration_list because child_by_field_name - # might be failing or the field name is different in this version of tree-sitter-go + + # Iterating children to find field_declaration_list because + # child_by_field_name might be failing or the field name is different + # in this version of tree-sitter-go field_list = None for child in node.children: if child.type == "field_declaration_list": field_list = child break - + if field_list: for child in field_list.children: if child.type == "field_declaration": @@ -143,24 +149,24 @@ def register_struct(self, node: Node) -> None: type_node = child.child_by_field_name("type") if not type_node: continue - + type_str = type_node.text.decode("utf-8") - + # Determine if optional is_optional = False if type_node.type == "pointer_type": is_optional = True - + # field_declaration children names # Loop through children to find all field_identifier nodes field_names = [] for subchild in child.children: if subchild.type == "field_identifier": field_names.append(subchild.text.decode("utf-8")) - + for fname in field_names: fields.append((fname, type_str, is_optional)) - + self._struct_definitions[struct_name] = fields def _extract_docstring(self, node: Node) -> str: @@ -174,7 +180,8 @@ def _extract_docstring(self, node: Node) -> str: return "\n".join(comments) def _extract_interface_name(self, node: Node) -> str: - """Walk up the AST from a method_spec to find the interface type name.""" + """Walk up the AST from a method_spec to find the interface type name. 
+ """ parent = node.parent while parent: if parent.type == "type_spec": @@ -238,7 +245,9 @@ def _extract_return_types( return original_returns, normalized_returns - def _extract_params(self, node: Node) -> tuple[list[feature_pb2.Param], bool]: + def _extract_params( + self, node: Node + ) -> tuple[list[feature_pb2.Param], bool]: """Extract parameters from a function_declaration node.""" params = [] params_node = node.child_by_field_name("parameters") @@ -260,20 +269,28 @@ def _extract_params(self, node: Node) -> tuple[list[feature_pb2.Param], bool]: if param_type == "context.Context": is_async = True continue - + # Check if this parameter type should be flattened # We strip pointer and module prefix to find the struct name # e.g. *Config -> Config, mypkg.Config -> Config # Simple heuristic: take the last part after dot, strip * clean_type_name = param_type.lstrip("*").split(".")[-1] - + if clean_type_name in self._struct_definitions: # FLATTEN: Add all fields of the struct as parameters - for field_name, field_type, is_optional in self._struct_definitions[clean_type_name]: + for ( + field_name, + field_type, + is_optional, + ) in self._struct_definitions[clean_type_name]: # Recursively normalize the field type - norm_types = self.normalizer.normalize(field_type, "go") - norm_enums = [getattr(feature_pb2, nt) for nt in norm_types] - + norm_types = self.normalizer.normalize( + field_type, "go" + ) + norm_enums = [ + getattr(feature_pb2, nt) for nt in norm_types + ] + p = feature_pb2.Param( original_name=field_name, normalized_name=normalize_name(field_name), @@ -286,7 +303,9 @@ def _extract_params(self, node: Node) -> tuple[list[feature_pb2.Param], bool]: else: # Normal processing norm_types = self.normalizer.normalize(param_type, "go") - norm_enums = [getattr(feature_pb2, nt) for nt in norm_types] + norm_enums = [ + getattr(feature_pb2, nt) for nt in norm_types + ] p = feature_pb2.Param( original_name=param_name, diff --git 
a/src/google/adk/scope/extractors/converter_py.py b/src/google/adk/scope/extractors/converter_py.py index 127c70d..9c1ba9b 100644 --- a/src/google/adk/scope/extractors/converter_py.py +++ b/src/google/adk/scope/extractors/converter_py.py @@ -54,10 +54,14 @@ def process( # 2. Context member_of, normalized_member_of = self._extract_member_of(node) - + # If the member belongs to a private class, skip it if member_of and member_of.startswith("_"): - logger.debug("Skipping method %s of private class %s", original_name, member_of) + logger.debug( + "Skipping method %s of private class %s", + original_name, + member_of, + ) return None feature_type = self._determine_type( diff --git a/src/google/adk/scope/extractors/extractor_go.py b/src/google/adk/scope/extractors/extractor_go.py index cfd6a0a..57d2377 100644 --- a/src/google/adk/scope/extractors/extractor_go.py +++ b/src/google/adk/scope/extractors/extractor_go.py @@ -70,7 +70,7 @@ def extract_features( return [] processor = NodeProcessor() - + # Pre-process structs to build the definition map # We need to re-query or process struct nodes specifically. # To keep it simple, let's just use the query we have. @@ -103,18 +103,23 @@ def extract_features( all_nodes = [] struct_nodes = [] - # We only want to process the actual function/method nodes, not the interface names - # which are captured just for context by the processor (via tree traversal). + # We only want to process the actual function/method nodes, not the + # interface names which are captured just for context by the processor + # (via tree traversal). for capture_name, node_list in captures.items(): if capture_name in ("func", "method", "interface_method"): all_nodes.extend(node_list) elif capture_name == "struct_body": # We need to associate the struct body with its name. - # The query captures @struct_name and @struct_body separately but in order. - # However, 'captures' is a dict of lists, so order might be tricky if we rely on index alignment across lists. 
+ # The query captures @struct_name and @struct_body separately but + # in order. + # However, 'captures' is a dict of lists, so order might be tricky + # if we rely on index alignment across lists. # Better strategy: Capture the parent type_spec and process it? - # Or iterate the captures list (which we can't easily do with the dict output). - # Let's rely on NodeProcessor to find the name from the struct_body node's parent. + # Or iterate the captures list (which we can't easily do with the + # dict output). + # Let's rely on NodeProcessor to find the name from the struct_body + # node's parent. struct_nodes.extend(node_list) # Log results for debugging @@ -143,8 +148,8 @@ def extract_features( ), None, ) - # If there is no statement list, or it has 1 or fewer statements, - # consider it simple. + # If there is no statement list, or it has 1 or fewer + # statements, consider it simple. if stmt_list is None or stmt_list.named_child_count <= 1: # Also check physical line span to prevent skipping large # single-statement functions (e.g. 
methods returning a large @@ -159,7 +164,7 @@ def extract_features( logger.debug( "Skipping simple function: %s (span: %d lines)", function_name_node.text.decode("utf8"), - line_span + line_span, ) continue @@ -211,7 +216,7 @@ def get_version(repo_root: pathlib.Path) -> str: return parts[1] except Exception as e: logger.warning("Failed to read version.go file: %s", e) - + # Fallback to reading go.mod module path if version isn't found go_mod_path = repo_root / "go.mod" if go_mod_path.exists(): @@ -222,5 +227,5 @@ def get_version(repo_root: pathlib.Path) -> str: return line.split()[1] except Exception as e: logger.warning("Failed to read go.mod file: %s", e) - + return "" diff --git a/src/google/adk/scope/reporter/markdown.py b/src/google/adk/scope/reporter/markdown.py index bb8347f..11f6cae 100644 --- a/src/google/adk/scope/reporter/markdown.py +++ b/src/google/adk/scope/reporter/markdown.py @@ -1,15 +1,18 @@ - import dataclasses from datetime import datetime -from typing import Dict, List, Optional +from typing import Dict + import pandas as pd + from google.adk.scope import features_pb2 + @dataclasses.dataclass class MarkdownReport: main_report_content: str module_reports: Dict[str, str] # filename -> content + def _get_language_code(language_name: str) -> str: """Returns a short code for the language.""" name = language_name.upper() @@ -24,6 +27,7 @@ def _get_language_code(language_name: str) -> str: else: return name.lower() + def _get_language_name(language_name: str) -> str: """Returns a properly capitalized display name for the language.""" name = language_name.upper() @@ -49,7 +53,7 @@ def __init__( self.base_registry = base_registry self.target_registry = target_registry self.df = df - + self.base_code = _get_language_code(base_registry.language) self.target_code = _get_language_code(target_registry.language) self.base_name = _get_language_name(base_registry.language) @@ -81,59 +85,76 @@ def generate(self) -> MarkdownReport: 
master_lines.append("GLOBAL_SCORE_PLACEHOLDER") master_lines.append("") - header = f"| Module | Features ({self.base_name}) | Score | Status | Details |" + header = ( + f"| Module | Features ({self.base_name}) | Score | Status | " + f"Details |" + ) divider = "|---|---|---|---|---|" master_lines.extend(["## Module Summary", header, divider]) module_reports = {} module_rows = [] - + # Determine cols based on language codes col_ns = f"{self.base_code}_namespace" - + # Group by base namespace # If namespace is empty, group under "Unknown Module" self.df["_module_group"] = self.df[col_ns].replace("", "Unknown Module") - + grouped = self.df.groupby("_module_group") - + total_high = 0 total_low = 0 - total_mismatch = 0 total_base_features = len(self.df) for module, group in grouped: # Calculate module stats - high = len(group[group["confidence"] == "high"]) - low = len(group[group["confidence"] == "low"]) mismatches = len(group[group["match"] == "false"]) - + # Actually, `high` and `low` confidence applies to matches usually # But let's verify what `match` column says. - matches_high = len(group[(group["match"] == "true") & (group["confidence"] == "high")]) - matches_low = len(group[(group["match"] == "true") & (group["confidence"] == "low")]) + matches_high = len( + group[ + (group["match"] == "true") & (group["confidence"] == "high") + ] + ) + matches_low = len( + group[ + (group["match"] == "true") & (group["confidence"] == "low") + ] + ) # Everything else is a mismatch or low confidence match? 
# Let's trust `match` column for parity score solid_matches_count = len(group[group["match"] == "true"]) - + total_high += matches_high total_low += matches_low - total_mismatch += mismatches - + module_total = len(group) - score = solid_matches_count / module_total if module_total > 0 else 0.0 - + score = ( + solid_matches_count / module_total if module_total > 0 else 0.0 + ) + # Generate Module File Content module_filename = f"{module}.md" - module_content = self._generate_module_content(module, group, module_total, matches_high, matches_low, mismatches) + module_content = self._generate_module_content( + module, + group, + module_total, + matches_high, + matches_low, + mismatches, + ) module_reports[module_filename] = module_content - + # Add summary row status_icon = "✅" if score == 1.0 else "⚠️" if score > 0.5 else "❌" row_str = ( f"| `{module}` | {module_total} | " - f"{score:.2%} | {status_icon} | [View Details]({{modules_dir}}/{module_filename}) |" + f"{score:.2%} | {status_icon} | " + f"[View Details]({{modules_dir}}/{module_filename}) |" ) module_rows.append((score, row_str)) @@ -142,10 +163,14 @@ def generate(self) -> MarkdownReport: # Summary Stats total_matches = total_high + total_low - parity_score = total_matches / total_base_features if total_base_features > 0 else 1.0 - + parity_score = ( + total_matches / total_base_features + if total_base_features > 0 + else 1.0 + ) + base_exclusive = total_base_features - total_matches - + global_stats = ( "## Summary\n\n" "| Feature Category | Count | Details |\n" @@ -157,9 +182,10 @@ def generate(self) -> MarkdownReport: f"| **❌ Mismatches** | **{base_exclusive}** | " f"No suitable match found in `{self.target_name}` |\n" f"| **📊 Coverage Score** | **{parity_score:.2%}** | " - f"Matches / Total Base Features ({total_matches} / {total_base_features}) |" + f"Matches / Total Base Features ({total_matches} / " + f"{total_base_features}) |" ) - + master_lines[global_score_idx] = global_stats return 
MarkdownReport( @@ -168,19 +194,19 @@ def generate(self) -> MarkdownReport: ) def _generate_module_content( - self, - module: str, + self, + module: str, group: pd.DataFrame, total_features: int, high_conf: int, low_conf: int, - mismatches: int + mismatches: int, ) -> str: - + # Calculate scores for summary total_matches = high_conf + low_conf coverage = total_matches / total_features if total_features > 0 else 0.0 - + summary_table = ( "## Summary\n\n" "| Feature Category | Count | Details |\n" @@ -192,42 +218,49 @@ def _generate_module_content( f"| **❌ Mismatches** | **{mismatches}** | " f"No suitable match found in `{self.target_name}` |\n" f"| **📊 Coverage Score** | **{coverage:.2%}** | " - f"Matches / Total Base Features ({total_matches} / {total_features}) |\n" + f"Matches / Total Base Features ({total_matches} / " + f"{total_features}) |\n" ) lines = [ f"# Module: `{module}`", "", - f"[← Back to Master Report]({{master_report}})", + "[← Back to Master Report]({master_report})", "", summary_table, "## Feature Details", "", - f"| Module ({self.base_name}) | Container ({self.base_name}) | Name ({self.base_name}) | Module ({self.target_name}) | Container ({self.target_name}) | Name ({self.target_name}) | Score | Match | Confidence |", - "| :--- | :--- | :--- | :--- | :--- | :--- | :--- | :---: | :---: |", + f"| Module ({self.base_name}) | Container ({self.base_name}) | " + f"Name ({self.base_name}) | Module ({self.target_name}) | " + f"Container ({self.target_name}) | Name ({self.target_name}) | " + "Score | Match | Confidence |", + "| :--- | :--- | :--- | :--- | :--- | :--- | :--- | :---: | " + ":---: |", ] - + # Sort by score desc, then name - group_sorted = group.sort_values(by=["score", f"{self.base_code}_name"], ascending=[False, True]) - + group_sorted = group.sort_values( + by=["score", f"{self.base_code}_name"], ascending=[False, True] + ) + for _, row in group_sorted.iterrows(): # Base logic - b_ns = row[f'{self.base_code}_namespace'] - b_mem = 
row[f'{self.base_code}_member_of'] - b_name = row[f'{self.base_code}_name'] - + b_ns = row[f"{self.base_code}_namespace"] + b_mem = row[f"{self.base_code}_member_of"] + b_name = row[f"{self.base_code}_name"] + # Target logic - t_ns = row[f'{self.target_code}_namespace'] - t_mem = row[f'{self.target_code}_member_of'] - t_name = row[f'{self.target_code}_name'] - + t_ns = row[f"{self.target_code}_namespace"] + t_mem = row[f"{self.target_code}_member_of"] + t_name = row[f"{self.target_code}_name"] + if t_name == "" and t_mem == "" and t_ns == "": t_name = "*(None)*" - - score = row['score'] - match_val = row['match'] - conf_val = row['confidence'] - + + score = row["score"] + match_val = row["match"] + conf_val = row["confidence"] + if match_val == "true": if conf_val == "high": match_icon = "✅" @@ -239,11 +272,11 @@ def _generate_module_content( conf_display = conf_val.title() if conf_display == "High": conf_display = "**High**" - + lines.append( f"| `{b_ns}` | `{b_mem}` | `{b_name}` | " f"`{t_ns}` | `{t_mem}` | `{t_name}` | " f"{score:.4f} | {match_icon} | {conf_display} |" ) - + return "\n".join(lines) diff --git a/src/google/adk/scope/reporter/raw.py b/src/google/adk/scope/reporter/raw.py index c49988d..a32b47d 100644 --- a/src/google/adk/scope/reporter/raw.py +++ b/src/google/adk/scope/reporter/raw.py @@ -1,7 +1,8 @@ +from collections import defaultdict +from typing import Any, Dict, List, Optional, Tuple import pandas as pd -from collections import defaultdict -from typing import Optional, Dict, Any, List, Tuple + from google.adk.scope import features_pb2 from google.adk.scope.utils.similarity import SimilarityScorer @@ -184,4 +185,4 @@ def _save_csv(self, df: pd.DataFrame, output_path: str): from pathlib import Path Path(output_path).parent.mkdir(parents=True, exist_ok=True) - df.to_csv(output_path, index=False) \ No newline at end of file + df.to_csv(output_path, index=False) diff --git a/src/google/adk/scope/reporter/reporter.py 
b/src/google/adk/scope/reporter/reporter.py index db37bc5..b89864d 100644 --- a/src/google/adk/scope/reporter/reporter.py +++ b/src/google/adk/scope/reporter/reporter.py @@ -1,19 +1,14 @@ - import argparse import logging import sys -from datetime import datetime from pathlib import Path from typing import List, Optional from google.protobuf import text_format -import pandas as pd from google.adk.scope import features_pb2 -from google.adk.scope.reporter import raw, markdown +from google.adk.scope.reporter import markdown, raw from google.adk.scope.utils import args as adk_args -from google.adk.scope.utils.similarity import SimilarityScorer - def _read_feature_registry(file_path: str) -> features_pb2.FeatureRegistry: @@ -32,7 +27,7 @@ def generate_markdown_raw_reports( """Matches features and generates reports.""" # New unified flow for standard reports generator = raw.RawReportGenerator(registries[0], registries[1]) - + # Generate DataFrame (and CSV if path provided) csv_path = None if output_path: @@ -41,11 +36,13 @@ def generate_markdown_raw_reports( stem = output_path.stem parent = output_path.parent csv_path = str(parent / f"{stem}.csv") - + df = generator.generate(output_path=csv_path) - + # Generate Markdown Report from DataFrame - reporter = markdown.MarkdownReportGenerator(registries[0], registries[1], df) + reporter = markdown.MarkdownReportGenerator( + registries[0], registries[1], df + ) result = reporter.generate() if result.module_reports: modules_dir_name = f"{output_path.stem}_modules" @@ -57,7 +54,9 @@ def generate_markdown_raw_reports( # Replace placeholder for master report link # The link is relative from module dir to master report # We are in {stem}_modules/, so we need to go up one level. 
- final_content = content.replace("{master_report}", f"../{output_path.name}") + final_content = content.replace( + "{master_report}", f"../{output_path.name}" + ) (modules_dir / filename).write_text(final_content) # Replace placeholder in Master Report @@ -72,16 +71,19 @@ def generate_markdown_raw_reports( try: output_path.write_text(master_report) logging.info(f"Successfully wrote match report to {output_path}") - # Note: CSV writing is logged inside RawReportGenerator or we should log it here - # Actually RawReportGenerator doesn't log, so we might want to Add a log here if we knew it matched + # Note: CSV writing is logged inside RawReportGenerator or we should + # log it here. Actually RawReportGenerator doesn't log, so we might + # want to Add a log here if we knew it matched stem = output_path.stem csv_path = output_path.parent / f"{stem}.csv" if csv_path.exists(): - logging.info(f"Successfully wrote raw match report to {csv_path}") + logging.info(f"Successfully wrote raw match report to {csv_path}") + + return result except Exception as e: logging.error(f"Error writing report to {output_path}: {e}") - sys.exit(1) + sys.exit(1) def main(): @@ -107,7 +109,10 @@ def main(): parser.add_argument( "--output", required=True, - help="Path to save the Markdown report. Corresponding CSV will be saved with same stem.", + help=( + "Path to save the Markdown report. Corresponding CSV will be " + "saved with same stem." 
+ ), ) parser.add_argument( "--report-type", @@ -126,9 +131,11 @@ def main(): elif args.base and args.target: registry_paths.extend([args.base, args.target]) else: - logging.error("Must provide either --registries or both --base and --target") + logging.error( + "Must provide either --registries or both --base and --target" + ) sys.exit(1) - + if len(registry_paths) < 2: logging.error("Must provide at least 2 registries to compare.") sys.exit(1) @@ -142,9 +149,7 @@ def main(): output_path.parent.mkdir(parents=True, exist_ok=True) generate_markdown_raw_reports( - registries, - args.report_type, - output_path=output_path + registries, args.report_type, output_path=output_path ) diff --git a/src/google/adk/scope/utils/score_features.py b/src/google/adk/scope/utils/score_features.py index 7718c00..27c5c28 100644 --- a/src/google/adk/scope/utils/score_features.py +++ b/src/google/adk/scope/utils/score_features.py @@ -1,4 +1,3 @@ - import argparse import logging import sys @@ -9,10 +8,17 @@ from google.adk.scope import features_pb2 from google.adk.scope.utils.similarity import SimilarityScorer + def main(): - parser = argparse.ArgumentParser(description="Calculate similarity score between two features.") - parser.add_argument("feature1", type=Path, help="Path to first feature file (text proto).") - parser.add_argument("feature2", type=Path, help="Path to second feature file (text proto).") + parser = argparse.ArgumentParser( + description="Calculate similarity score between two features." + ) + parser.add_argument( + "feature1", type=Path, help="Path to first feature file (text proto)." + ) + parser.add_argument( + "feature2", type=Path, help="Path to second feature file (text proto)." 
+ ) args = parser.parse_args() @@ -31,7 +37,7 @@ def main(): scorer = SimilarityScorer() score = scorer.get_similarity_score(f1, f2) - + print("-" * 40) print(f"Similarity Score: {score:.4f}") print("-" * 40) @@ -40,5 +46,6 @@ def main(): logging.error(f"Error: {e}") sys.exit(1) + if __name__ == "__main__": main() diff --git a/src/google/adk/scope/utils/similarity.py b/src/google/adk/scope/utils/similarity.py index 3fdb8c6..57510f6 100644 --- a/src/google/adk/scope/utils/similarity.py +++ b/src/google/adk/scope/utils/similarity.py @@ -3,7 +3,6 @@ import numpy as np from jellyfish import levenshtein_distance - from scipy.optimize import linear_sum_assignment from google.adk.scope import features_pb2 as features_pb @@ -29,8 +28,7 @@ def __init__( ): self.weights = weights or DEFAULT_SIMILARITY_WEIGHTS logger.debug( - f"Initializing SimilarityScorer with " - f"weights={self.weights}" + f"Initializing SimilarityScorer with " f"weights={self.weights}" ) assert "name" in self.weights assert "member_of" in self.weights @@ -39,7 +37,8 @@ def __init__( assert "return_type" in self.weights def get_similarity(self, s1: str, s2: str) -> float: - """Calculates similarity between two strings using the selected algorithm.""" + """Calculates similarity between two strings using the selected + algorithm.""" if not s1 and not s2: return 1.0 if not s1 or not s2: @@ -49,7 +48,6 @@ def get_similarity(self, s1: str, s2: str) -> float: dist = levenshtein_distance(s1, s2) max_len = max(len(s1), len(s2)) return 1.0 - (dist / max_len) - return 1.0 - (dist / max_len) def _fuzzy_type_match(self, types1: list, types2: list) -> float: """Calculates a fuzzy similarity score between two lists of types.""" @@ -81,7 +79,7 @@ def _to_str_set(type_list): # Check the best match between any pair of types best_score = 0.0 - + logger.debug(f"Fuzzy type match between {set1} and {set2}") for t1 in set1: for t2 in set2: @@ -165,8 +163,12 @@ def _calculate_parameters_score( ) # Log parameter matches for r, c 
in zip(row_ind, col_ind): - if similarity_matrix[r, c] > 0: - logger.debug(f" Matched param '{params1[r].normalized_name}' with '{params2[c].normalized_name}': {similarity_matrix[r, c]:.4f}") + if similarity_matrix[r, c] > 0: + logger.debug( + f" Matched param '{params1[r].normalized_name}' with " + f"'{params2[c].normalized_name}': " + f"{similarity_matrix[r, c]:.4f}" + ) return score def _calculate_return_type_score( @@ -242,9 +244,12 @@ def get_similarity_score( } logger.debug( f"Comparison Details:\n" - f" Name: '{feature1.normalized_name}' vs '{feature2.normalized_name}' -> {scores['name']:.4f}\n" - f" MemberOf: '{feature1.normalized_member_of}' vs '{feature2.normalized_member_of}' -> {scores['member_of']:.4f}\n" - f" Namespace: '{feature1.normalized_namespace}' vs '{feature2.normalized_namespace}' -> {scores['namespace']:.4f}" + f" Name: '{feature1.normalized_name}' vs " + f"'{feature2.normalized_name}' -> {scores['name']:.4f}\n" + f" MemberOf: '{feature1.normalized_member_of}' vs " + f"'{feature2.normalized_member_of}' -> {scores['member_of']:.4f}\n" + f" Namespace: '{feature1.normalized_namespace}' vs " + f"'{feature2.normalized_namespace}' -> {scores['namespace']:.4f}" ) logger.debug(f"Preliminary scores: {scores}") @@ -283,12 +288,15 @@ def get_similarity_score( scores[key] * current_weights[key] for key in current_weights ) logger.debug(f"Final scores including params & return: {scores}") - + # Log contributions logger.debug("Score Contributions:") for key in current_weights: contribution = scores[key] * current_weights[key] - logger.debug(f" {key}: {scores[key]:.4f} * {current_weights[key]:.4f} = {contribution:.4f}") - + logger.debug( + f" {key}: {scores[key]:.4f} * {current_weights[key]:.4f} = " + f"{contribution:.4f}" + ) + logger.debug(f"Final weighted similarity score: {final_score:.4f}") return final_score diff --git a/src/google/adk/scope/utils/stats.py b/src/google/adk/scope/utils/stats.py deleted file mode 100644 index f98479c..0000000 --- 
a/src/google/adk/scope/utils/stats.py +++ /dev/null @@ -1,19 +0,0 @@ -def calculate_precision(matches: int, total_target: int) -> float: - """Calculates precision: matches / total_target.""" - if total_target > 0: - return matches / total_target - return 1.0 - - -def calculate_recall(matches: int, total_base: int) -> float: - """Calculates recall: matches / total_base.""" - if total_base > 0: - return matches / total_base - return 1.0 - - -def calculate_f1(precision: float, recall: float) -> float: - """Calculates F1 score: 2 * (P * R) / (P + R).""" - if precision + recall > 0: - return 2 * (precision * recall) / (precision + recall) - return 0.0 diff --git a/test/adk/scope/extractors/test_extractor_go.py b/test/adk/scope/extractors/test_extractor_go.py index 9acf3b2..b2a0626 100644 --- a/test/adk/scope/extractors/test_extractor_go.py +++ b/test/adk/scope/extractors/test_extractor_go.py @@ -37,7 +37,7 @@ def test_extract_features( mock_func_node = MagicMock() mock_func_body = MagicMock() mock_func_body.start_point = (1, 0) - mock_func_body.end_point = (10, 0) # span = 10 lines + mock_func_body.end_point = (10, 0) # span = 10 lines mock_func_stmt_list = MagicMock() mock_func_stmt_list.type = "statement_list" mock_func_stmt_list.named_child_count = 2 @@ -47,7 +47,7 @@ def test_extract_features( mock_method_node = MagicMock() mock_method_body = MagicMock() mock_method_body.start_point = (12, 0) - mock_method_body.end_point = (20, 0) # span = 9 lines + mock_method_body.end_point = (20, 0) # span = 9 lines mock_method_stmt_list = MagicMock() mock_method_stmt_list.type = "statement_list" mock_method_stmt_list.named_child_count = 2 diff --git a/test/adk/scope/extractors/test_extractor_py.py b/test/adk/scope/extractors/test_extractor_py.py index f9ed5b4..4d78a71 100644 --- a/test/adk/scope/extractors/test_extractor_py.py +++ b/test/adk/scope/extractors/test_extractor_py.py @@ -103,26 +103,28 @@ def test_private_classes_filtered( ): mock_path = MagicMock(spec=Path) 
mock_path.read_bytes.return_value = b"class _PrivateClass: pass" - + mock_tree = MagicMock() mock_parser.parse.return_value = mock_tree mock_tree.root_node = MagicMock() - + mock_cursor_instance = mock_cursor_cls.return_value - + mock_node = MagicMock() mock_node.type = "class_definition" - + # Simulate query returning the private class mock_cursor_instance.captures.return_value = {"class": [mock_node]} - - with patch("google.adk.scope.extractors.extractor_py.NodeProcessor") as MockProcessor: + + with patch( + "google.adk.scope.extractors.extractor_py.NodeProcessor" + ) as MockProcessor: processor_instance = MockProcessor.return_value # The processor returns None for private classes processor_instance.process.return_value = None - + features = extract_features(mock_path, Path("/repo"), ".") - + self.assertEqual(features, []) processor_instance.process.assert_called_once() @@ -133,27 +135,31 @@ def test_private_class_methods_filtered( self, mock_parser, mock_query_cls, mock_cursor_cls ): mock_path = MagicMock(spec=Path) - mock_path.read_bytes.return_value = b"class _PrivateClass:\n def method(self): pass" - + mock_path.read_bytes.return_value = ( + b"class _PrivateClass:\n def method(self): pass" + ) + mock_tree = MagicMock() mock_parser.parse.return_value = mock_tree mock_tree.root_node = MagicMock() - + mock_cursor_instance = mock_cursor_cls.return_value - + mock_node = MagicMock() mock_node.type = "function_definition" - + # Simulate query returning the method mock_cursor_instance.captures.return_value = {"func": [mock_node]} - - with patch("google.adk.scope.extractors.extractor_py.NodeProcessor") as MockProcessor: + + with patch( + "google.adk.scope.extractors.extractor_py.NodeProcessor" + ) as MockProcessor: processor_instance = MockProcessor.return_value # The processor returns None for methods in private classes processor_instance.process.return_value = None - + features = extract_features(mock_path, Path("/repo"), ".") - + self.assertEqual(features, []) 
processor_instance.process.assert_called_once() diff --git a/test/adk/scope/extractors/test_extractor_ts.py b/test/adk/scope/extractors/test_extractor_ts.py index a30aa37..21f879c 100644 --- a/test/adk/scope/extractors/test_extractor_ts.py +++ b/test/adk/scope/extractors/test_extractor_ts.py @@ -62,7 +62,9 @@ def test_extract_features(self, mock_parser): # Mock Query and QueryCursor with ( - patch("google.adk.scope.extractors.extractor_ts._build_global_type_map"), + patch( + "google.adk.scope.extractors.extractor_ts._build_global_type_map" + ), patch("google.adk.scope.extractors.extractor_ts.Query"), patch( "google.adk.scope.extractors.extractor_ts.QueryCursor" diff --git a/test/adk/scope/reporter/test_reporter.py b/test/adk/scope/reporter/test_reporter.py index c7a9405..287e73c 100644 --- a/test/adk/scope/reporter/test_reporter.py +++ b/test/adk/scope/reporter/test_reporter.py @@ -1,12 +1,12 @@ import os import tempfile import unittest +from pathlib import Path from unittest.mock import patch from google.protobuf import text_format from google.adk.scope import features_pb2 - from google.adk.scope.reporter import reporter @@ -107,51 +107,63 @@ def test_match_registries(self): target_registry.features.extend([f2, f_near_target]) # Test Markdown Report - result_md = reporter.match_registries( - [base_registry, target_registry], report_type="md" - ) - report_md = result_md.main_report_content - - # 1. 
Verify Master Report Structure - self.assertIn("# Feature Matching Parity Report", report_md) - self.assertIn("## Summary", report_md) - # Check for High/Low confidence summaries - self.assertIn("| **✅ High Confidence Matches** | **1** |", report_md) - self.assertIn("| **⚠️ Low Confidence Matches** | **1** |", report_md) - self.assertIn("| **❌ Mismatches** | **1** |", report_md) - self.assertIn("## Module Summary", report_md) - - # Check for module entry in master summary - self.assertIn("| Module | Features (Python) | Score | Status | Details |", report_md) - self.assertIn("| `google.adk.events` |", report_md) - self.assertIn("[View Details]({modules_dir}/google.adk.events.md)", report_md) - - # 2. Verify Module Content - self.assertIn("google.adk.events.md", result_md.module_reports) - module_content = result_md.module_reports["google.adk.events.md"] - - self.assertIn("# Module: `google.adk.events`", module_content) - # New summary table in module - self.assertIn("## Summary", module_content) - self.assertIn("## Feature Details", module_content) - - # Solid Matches (High Confidence) - self.assertIn("✅", module_content) - self.assertIn("**High**", module_content) - self.assertIn("`fSameBase`", module_content) - self.assertIn("`fSameTarget`", module_content) - - # Potential Matches (Low Confidence) - self.assertIn("⚠️", module_content) - self.assertIn("Low", module_content) - self.assertIn("`base_name`", module_content) - self.assertIn("`target_name`", module_content) - - # Unmatched / Gaps (in 'stuff' module) - self.assertIn("stuff.md", result_md.module_reports) - stuff_content = result_md.module_reports["stuff.md"] - self.assertIn("❌", stuff_content) - self.assertIn("`totally_diff`", stuff_content) + with tempfile.TemporaryDirectory() as temp_dir: + output_path = Path(temp_dir) / "report.md" + result_md = reporter.generate_markdown_raw_reports( + [base_registry, target_registry], + report_type="md", + output_path=output_path, + ) + report_md = 
result_md.main_report_content + + # 1. Verify Master Report Structure + self.assertIn("# Feature Matching Parity Report", report_md) + self.assertIn("## Summary", report_md) + # Check for High/Low confidence summaries + self.assertIn( + "| **✅ High Confidence Matches** | **1** |", report_md + ) + self.assertIn("| **⚠️ Low Confidence Matches** | **1** |", report_md) + self.assertIn("| **❌ Mismatches** | **1** |", report_md) + self.assertIn("## Module Summary", report_md) + + # Check for module entry in master summary + self.assertIn( + "| Module | Features (Python) | Score | Status | Details |", + report_md, + ) + self.assertIn("| `google.adk.events` |", report_md) + + self.assertIn( + "[View Details]({modules_dir}/google.adk.events.md)", report_md + ) + + # 2. Verify Module Content + self.assertIn("google.adk.events.md", result_md.module_reports) + module_content = result_md.module_reports["google.adk.events.md"] + + self.assertIn("# Module: `google.adk.events`", module_content) + # New summary table in module + self.assertIn("## Summary", module_content) + self.assertIn("## Feature Details", module_content) + + # Solid Matches (High Confidence) + self.assertIn("✅", module_content) + self.assertIn("**High**", module_content) + self.assertIn("`fSameBase`", module_content) + self.assertIn("`fSameTarget`", module_content) + + # Potential Matches (Low Confidence) + self.assertIn("⚠️", module_content) + self.assertIn("Low", module_content) + self.assertIn("`base_name`", module_content) + self.assertIn("`target_name`", module_content) + + # Unmatched / Gaps (in 'stuff' module) + self.assertIn("stuff.md", result_md.module_reports) + stuff_content = result_md.module_reports["stuff.md"] + self.assertIn("❌", stuff_content) + self.assertIn("`totally_diff`", stuff_content) def test_generate_raw_report(self): """Tests the raw CSV report generation via RawReportGenerator.""" @@ -185,18 +197,19 @@ def test_generate_raw_report(self): base_registry, target_registry ) df = 
generator.generate() - + # Check columns self.assertIn("py_namespace", df.columns) self.assertIn("score", df.columns) - + # Check content row = df.iloc[0] self.assertEqual(row["py_name"], "f1_base") self.assertEqual(row["score"], 1.0) def test_global_best_match(self): - """Tests that a feature matches best candidate globally, ignoring namespace.""" + """Tests that a feature matches best candidate globally, ignoring + namespace.""" # Base feature in namespace 'n1' f_base = features_pb2.Feature( original_name="my_feature", @@ -204,7 +217,7 @@ def test_global_best_match(self): namespace="n1", type=features_pb2.Feature.Type.FUNCTION, ) - + # Target feature 1: Same namespace, but different name (low score) f_target_bad = features_pb2.Feature( original_name="other_feature", @@ -212,7 +225,7 @@ def test_global_best_match(self): namespace="n1", type=features_pb2.Feature.Type.FUNCTION, ) - + # Target feature 2: Different namespace, but same name (high score) f_target_good = features_pb2.Feature( original_name="my_feature", @@ -221,10 +234,14 @@ def test_global_best_match(self): type=features_pb2.Feature.Type.FUNCTION, ) - base_registry = features_pb2.FeatureRegistry(language="Python", version="1") + base_registry = features_pb2.FeatureRegistry( + language="Python", version="1" + ) base_registry.features.append(f_base) - - target_registry = features_pb2.FeatureRegistry(language="Java", version="2") + + target_registry = features_pb2.FeatureRegistry( + language="Java", version="2" + ) target_registry.features.extend([f_target_bad, f_target_good]) # RawReportGenerator logic @@ -310,7 +327,7 @@ def test_raw_integration(self): generator = reporter.raw.RawReportGenerator(py_registry, ts_registry) df = generator.generate() - + # Verify solid match (high score) row = df.iloc[0] self.assertEqual(row["py_name"], "load_artifact") @@ -321,58 +338,69 @@ def test_raw_report_match_confidence(self): """Tests match and confidence columns with various scores.""" # 1. 
High match (score 0.9 > 0.6 for py/go) f_high = features_pb2.Feature( - original_name="high", normalized_name="high", type=features_pb2.Feature.Type.FUNCTION + original_name="high", + normalized_name="high", + type=features_pb2.Feature.Type.FUNCTION, ) # 2. Avg match (score 0.55 between 0.5 and 0.6 for py/go) f_avg = features_pb2.Feature( - original_name="high", normalized_name="high_ish", type=features_pb2.Feature.Type.FUNCTION + original_name="high", + normalized_name="high_ish", + type=features_pb2.Feature.Type.FUNCTION, ) # 3. Low match (score 0.1 < 0.5 for py/go) f_low = features_pb2.Feature( - original_name="high", normalized_name="completely_different", type=features_pb2.Feature.Type.FUNCTION + original_name="high", + normalized_name="completely_different", + type=features_pb2.Feature.Type.FUNCTION, ) base = features_pb2.FeatureRegistry(language="Python", version="1") base.features.append(f_high) - + target = features_pb2.FeatureRegistry(language="Go", version="1") - # We need to craft targets that produce specific scores or mock the scorer. - # It's easier to mock SimilarityScorer to return fixed scores. + # We need to craft targets that produce specific scores or mock the + # scorer. It's easier to mock SimilarityScorer to return fixed scores. target.features.extend([f_high, f_avg, f_low]) - with patch("google.adk.scope.reporter.raw.SimilarityScorer") as MockScorer: + with patch( + "google.adk.scope.reporter.raw.SimilarityScorer" + ) as MockScorer: instance = MockScorer.return_value - + # Case 1: High match instance.get_similarity_score.return_value = 0.9 - gen = reporter.raw.RawReportGenerator(base, target) + gen = reporter.raw.RawReportGenerator(base, target) df = gen.generate() - # Since generate iterates through base features, and we have 1 base feature, - # it will run once. We need to test behavior for different scores. - # But generate() does all at once. 
- + # Since generate iterates through base features, and we have 1 base + # feature, it will run once. We need to test behavior for different + # scores. But generate() does all at once. + # Actually, `generate` iterates through base features. - # If we want to test different outcomes, we should perhaps just test - # the _get_confidence_level method or ensure our mock returns different values - # for different calls if possible, or just run 3 separate gens. - + # If we want to test different outcomes, we should perhaps just + # test the _get_confidence_level method or ensure our mock returns + # different values for different calls if possible, or just run 3 + # separate gens. + # Test High self.assertEqual(df.iloc[0]["match"], "true") self.assertEqual(df.iloc[0]["confidence"], "high") - + # Test Avg (Low Confidence) instance.get_similarity_score.return_value = 0.55 gen = reporter.raw.RawReportGenerator(base, target) df = gen.generate() self.assertEqual(df.iloc[0]["match"], "true") self.assertEqual(df.iloc[0]["confidence"], "low") - + # Test Low (Mismatch) instance.get_similarity_score.return_value = 0.4 gen = reporter.raw.RawReportGenerator(base, target) df = gen.generate() self.assertEqual(df.iloc[0]["match"], "false") - self.assertEqual(df.iloc[0]["confidence"], "high") # Mismatches are high confidence if very low score? + self.assertEqual( + df.iloc[0]["confidence"], "high" + ) # Mismatches are high confidence if very low score? # Wait, raw.py logic: # if score > high_thresh: true, high # elif score > avg_thresh: true, low @@ -381,19 +409,16 @@ def test_raw_report_match_confidence(self): # Actually raw.py says: # if match: ... # else: row["match"] = "false" - # And confidence is set to "high" by default for mismatches in raw.py? - # Let's check raw.py. + # And confidence is set to "high" by default for mismatches in + # raw.py? Let's check raw.py. # "confidence": "high" is default init. - # If match found, it might be updated to "low". 
- # If no match found (score < avg), it remains "high" (High confidence that it is NOT a match). - + # If match found, it might be updated to "low". + # If no match found (score < avg), it remains "high" (High + # confidence that it is NOT a match). + self.assertEqual(df.iloc[0]["match"], "false") self.assertEqual(df.iloc[0]["confidence"], "high") if __name__ == "__main__": unittest.main() - - -if __name__ == "__main__": - unittest.main() diff --git a/test/adk/scope/utils/test_stats.py b/test/adk/scope/utils/test_stats.py deleted file mode 100644 index a98b0ab..0000000 --- a/test/adk/scope/utils/test_stats.py +++ /dev/null @@ -1,26 +0,0 @@ -import unittest - -from google.adk.scope.utils import stats - - -class TestStats(unittest.TestCase): - def test_precision(self): - self.assertEqual(stats.calculate_precision(10, 20), 0.5) - self.assertEqual(stats.calculate_precision(0, 20), 0.0) - self.assertEqual(stats.calculate_precision(10, 0), 1.0) # Edge case - - def test_recall(self): - self.assertEqual(stats.calculate_recall(10, 20), 0.5) - self.assertEqual(stats.calculate_recall(0, 20), 0.0) - self.assertEqual(stats.calculate_recall(10, 0), 1.0) # Edge case - - def test_f1(self): - self.assertAlmostEqual(stats.calculate_f1(0.5, 0.5), 0.5) - self.assertAlmostEqual(stats.calculate_f1(1.0, 1.0), 1.0) - self.assertAlmostEqual(stats.calculate_f1(0.0, 1.0), 0.0) - self.assertAlmostEqual(stats.calculate_f1(0.0, 0.0), 0.0) - self.assertAlmostEqual(stats.calculate_f1(0.75, 0.5), 0.6) - - -if __name__ == "__main__": - unittest.main() From 20db26575bd1ce0eae447050836c59e13cd0fb0b Mon Sep 17 00:00:00 2001 From: Shahin Saadati Date: Wed, 18 Feb 2026 14:49:41 -0800 Subject: [PATCH 9/9] Include pandas --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 238f452..fe81589 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,7 @@ dependencies = [ "numpy", "jellyfish", "RapidFuzz", + "pandas", ]