From 6e076be9fe646c7dffc856d10ebe2c3fe565c386 Mon Sep 17 00:00:00 2001 From: simone-dbx-labs Date: Wed, 10 Sep 2025 19:48:44 -0400 Subject: [PATCH 1/9] Adjusted scripts to support the new --generate-json switch for analyzer to generate an additional json file --- labs.yml | 2 ++ .../lakebridge/analyzer/lakebridge_analyzer.py | 14 ++++++++++---- src/databricks/labs/lakebridge/cli.py | 3 ++- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/labs.yml b/labs.yml index a7442695ff..ec0b3e70fd 100644 --- a/labs.yml +++ b/labs.yml @@ -17,6 +17,8 @@ commands: description: (Optional) Name of report file to write - name: source-tech description: (Optional) Name of the Source System Technology you want to analyze + - name: generate-json + description: (Optional) Flag to indicate if a json file should be produced alongside the Excel file - name: transpile description: Transpile SQL script to Databricks SQL flags: diff --git a/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py b/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py index db5d8cb58d..aa90ff97c8 100644 --- a/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py +++ b/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py @@ -67,7 +67,9 @@ def _run_prompt_analyzer(self): logger.info(f"Successfully Analyzed files in ${source_dir} for ${technology} and saved report to {results_dir}") - def _run_arg_analyzer(self, source_dir: str | None, results_dir: str | None, technology: str | None): + def _run_arg_analyzer( + self, source_dir: str | None, results_dir: str | None, technology: str | None, generate_json: bool | False + ): """Run the analyzer: arg guided""" if source_dir is None or results_dir is None or technology is None: logger.error("All arguments (--source-directory, --report-file, --source-tech) must be provided") @@ -76,7 +78,7 @@ def _run_arg_analyzer(self, source_dir: str | None, results_dir: str | None, tec if check_path(source_dir) and 
check_path(results_dir): tmp_dir = self._temp_xlsx_path(results_dir) technology = self._get_source_tech(technology) - self._run_binary(Path(source_dir), tmp_dir, technology, self._is_debug) + self._run_binary(Path(source_dir), tmp_dir, technology, self._is_debug, generate_json) move_tmp_file(tmp_dir, Path(results_dir)) @@ -85,11 +87,15 @@ def _run_arg_analyzer(self, source_dir: str | None, results_dir: str | None, tec ) def run_analyzer( - self, source_dir: str | None = None, results_dir: str | None = None, technology: str | None = None + self, + source_dir: str | None = None, + results_dir: str | None = None, + technology: str | None = None, + generate_json: bool = False, ): """Run the analyzer.""" if not any([source_dir, results_dir, technology]): self._run_prompt_analyzer() return - self._run_arg_analyzer(source_dir, results_dir, technology) + self._run_arg_analyzer(source_dir, results_dir, technology, generate_json) diff --git a/src/databricks/labs/lakebridge/cli.py b/src/databricks/labs/lakebridge/cli.py index 6c626f1692..504df879b0 100644 --- a/src/databricks/labs/lakebridge/cli.py +++ b/src/databricks/labs/lakebridge/cli.py @@ -651,13 +651,14 @@ def analyze( source_directory: str | None = None, report_file: str | None = None, source_tech: str | None = None, + generate_json: bool = False, ): """Run the Analyzer""" ctx = ApplicationContext(w) ctx.add_user_agent_extra("cmd", "analyze") logger.debug(f"User: {ctx.current_user}") - ctx.analyzer.run_analyzer(source_directory, report_file, source_tech) + ctx.analyzer.run_analyzer(source_directory, report_file, source_tech, generate_json) if __name__ == "__main__": From aeed62925e7cf512a24c904b744952a98acafd63 Mon Sep 17 00:00:00 2001 From: simone-dbx-labs Date: Wed, 10 Sep 2025 20:08:14 -0400 Subject: [PATCH 2/9] Fix generate_json annotation in _run_arg_analyzer: bool | False -> bool = False --- src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py 
b/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py index aa90ff97c8..ddb8b5f22c 100644 --- a/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py +++ b/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py @@ -68,7 +68,7 @@ def _run_prompt_analyzer(self): logger.info(f"Successfully Analyzed files in ${source_dir} for ${technology} and saved report to {results_dir}") def _run_arg_analyzer( - self, source_dir: str | None, results_dir: str | None, technology: str | None, generate_json: bool | False + self, source_dir: str | None, results_dir: str | None, technology: str | None, generate_json: bool = False ): """Run the analyzer: arg guided""" if source_dir is None or results_dir is None or technology is None: From 60b494aac4f7a614887d6a970b3e8a9d888ee64e Mon Sep 17 00:00:00 2001 From: simone-dbx-labs Date: Wed, 10 Sep 2025 20:15:12 -0400 Subject: [PATCH 3/9] Pass generate_json to _run_binary as a keyword argument --- .../labs/lakebridge/analyzer/lakebridge_analyzer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py b/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py index ddb8b5f22c..fd49a3382b 100644 --- a/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py +++ b/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py @@ -78,7 +78,7 @@ def _run_arg_analyzer( if check_path(source_dir) and check_path(results_dir): tmp_dir = self._temp_xlsx_path(results_dir) technology = self._get_source_tech(technology) - self._run_binary(Path(source_dir), tmp_dir, technology, self._is_debug, generate_json) + self._run_binary(Path(source_dir), tmp_dir, technology, self._is_debug, generate_json=generate_json) move_tmp_file(tmp_dir, Path(results_dir)) @@ -98,4 +98,4 @@ def run_analyzer( self._run_prompt_analyzer() return - self._run_arg_analyzer(source_dir, results_dir, technology, generate_json) + self._run_arg_analyzer(source_dir, results_dir, technology, generate_json) \ No newline 
at end of file From 71efde4397821f6f16224b1f82b65429e77ffe56 Mon Sep 17 00:00:00 2001 From: simone-dbx-labs Date: Wed, 10 Sep 2025 20:31:29 -0400 Subject: [PATCH 4/9] Pass generate_json to _run_binary positionally again --- src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py b/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py index fd49a3382b..250efbac86 100644 --- a/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py +++ b/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py @@ -78,7 +78,7 @@ def _run_arg_analyzer( if check_path(source_dir) and check_path(results_dir): tmp_dir = self._temp_xlsx_path(results_dir) technology = self._get_source_tech(technology) - self._run_binary(Path(source_dir), tmp_dir, technology, self._is_debug, generate_json=generate_json) + self._run_binary(Path(source_dir), tmp_dir, technology, self._is_debug, generate_json) move_tmp_file(tmp_dir, Path(results_dir)) From 59ff80db0c12e4af9401ae37863421f81664aaca Mon Sep 17 00:00:00 2001 From: simone-dbx-labs Date: Thu, 20 Nov 2025 09:27:37 -0500 Subject: [PATCH 5/9] added generate_json arg --- .../labs/lakebridge/analyzer/lakebridge_analyzer.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py b/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py index e09a7b75d7..ad8f11e98e 100644 --- a/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py +++ b/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py @@ -73,7 +73,11 @@ def run(self, source_dir: Path, results_dir: Path, platform: str) -> AnalyzerRes raise ValueError(f"Invalid path(s) provided: source_dir={source_dir}, results_dir={results_dir}") def _run_arg_analyzer( - self, source_dir: str | None, results_dir: str | None, technology: str | None, + self, + source_dir: str | None, + results_dir: 
str | None, + technology: str | None, + generate_json: bool = False, ): """Run the analyzer: arg guided""" if source_dir is None or results_dir is None or technology is None: @@ -85,6 +89,7 @@ def _run_arg_analyzer( technology = self._get_source_tech(technology) self._run_binary(Path(source_dir), tmp_dir, technology, self._is_debug, generate_json) + class LakebridgeAnalyzer: def __init__(self, prompts: AnalyzerPrompts, runner: AnalyzerRunner): From 2673e8f6562a32644d65051ec0a66a03138a54ab Mon Sep 17 00:00:00 2001 From: simone-dbx-labs Date: Thu, 20 Nov 2025 10:04:25 -0500 Subject: [PATCH 6/9] Revert old generate-json changes. Lots of things have changed since the last commit for this feature; the next commit will add my changes. --- .../analyzer/lakebridge_analyzer.py | 54 +++++++++---------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py b/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py index ad8f11e98e..322d5c7ed3 100644 --- a/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py +++ b/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py @@ -72,22 +72,15 @@ def run(self, source_dir: Path, results_dir: Path, platform: str) -> AnalyzerRes if not check_path(source_dir) or not check_path(results_dir): raise ValueError(f"Invalid path(s) provided: source_dir={source_dir}, results_dir={results_dir}") - def _run_arg_analyzer( - self, - source_dir: str | None, - results_dir: str | None, - technology: str | None, - generate_json: bool = False, - ): - """Run the analyzer: arg guided""" - if source_dir is None or results_dir is None or technology is None: - logger.error("All arguments (--source-directory, --report-file, --source-tech) must be provided") - return + tmp_dir = self._temp_xlsx_path(results_dir) + self._runnable(source_dir, tmp_dir, platform, self._is_debug) + self._move_file(tmp_dir, Path(results_dir)) + logger.info(f"Successfully Analyzed files 
in ${source_dir} for ${platform} and saved report to {results_dir}") + return AnalyzerResult(Path(source_dir), Path(results_dir), platform) - if check_path(source_dir) and check_path(results_dir): - tmp_dir = self._temp_xlsx_path(results_dir) - technology = self._get_source_tech(technology) - self._run_binary(Path(source_dir), tmp_dir, technology, self._is_debug, generate_json) + @staticmethod + def _temp_xlsx_path(results_dir: Path | str) -> Path: + return (Path(tempfile.mkdtemp()) / Path(results_dir).name).with_suffix(".xlsx") class LakebridgeAnalyzer: @@ -97,15 +90,22 @@ def __init__(self, prompts: AnalyzerPrompts, runner: AnalyzerRunner): self._runner = runner def run_analyzer( - self, - source_dir: str | None = None, - results_dir: str | None = None, - technology: str | None = None, - generate_json: bool = False, - ): - """Run the analyzer.""" - if not any([source_dir, results_dir, technology]): - self._run_prompt_analyzer() - return - - self._run_arg_analyzer(source_dir, results_dir, technology, generate_json) + self, source: str | None = None, results: str | None = None, platform: str | None = None + ) -> AnalyzerResult: + if not source: + source_dir = self._prompts.get_source_directory() + elif not isinstance(source, Path): + source_dir = Path(source) + else: + source_dir = source + + if not results: + results_dir = self._prompts.get_result_file_path(source_dir) + elif not isinstance(results, Path): + results_dir = Path(results) + else: + results_dir = results + + platform = self._prompts.get_source_system(platform) + + return self._runner.run(source_dir, results_dir, platform) \ No newline at end of file From 53b76a5ac4b23bb69bb54c63cfd5665f6e2a3fe0 Mon Sep 17 00:00:00 2001 From: simone-dbx-labs Date: Mon, 24 Nov 2025 13:34:17 -0500 Subject: [PATCH 7/9] adding generate_json flag --- .../labs/lakebridge/analyzer/lakebridge_analyzer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git 
a/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py b/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py index 322d5c7ed3..56afc163bc 100644 --- a/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py +++ b/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py @@ -66,14 +66,14 @@ def __init__( def create(cls, is_debug: bool = False) -> "AnalyzerRunner": return cls(Analyzer.analyze, move_tmp_file, is_debug) - def run(self, source_dir: Path, results_dir: Path, platform: str) -> AnalyzerResult: + def run(self, source_dir: Path, results_dir: Path, platform: str, generate_json: bool = False) -> AnalyzerResult: logger.debug(f"Starting analyzer execution in ${source_dir} for ${platform}") if not check_path(source_dir) or not check_path(results_dir): raise ValueError(f"Invalid path(s) provided: source_dir={source_dir}, results_dir={results_dir}") tmp_dir = self._temp_xlsx_path(results_dir) - self._runnable(source_dir, tmp_dir, platform, self._is_debug) + self._runnable(source_dir, tmp_dir, platform, generate_json, self._is_debug) self._move_file(tmp_dir, Path(results_dir)) logger.info(f"Successfully Analyzed files in ${source_dir} for ${platform} and saved report to {results_dir}") return AnalyzerResult(Path(source_dir), Path(results_dir), platform) @@ -90,7 +90,7 @@ def __init__(self, prompts: AnalyzerPrompts, runner: AnalyzerRunner): self._runner = runner def run_analyzer( - self, source: str | None = None, results: str | None = None, platform: str | None = None + self, source: str | None = None, results: str | None = None, platform: str | None = None, generate_json: bool = False ) -> AnalyzerResult: if not source: source_dir = self._prompts.get_source_directory() @@ -108,4 +108,4 @@ def run_analyzer( platform = self._prompts.get_source_system(platform) - return self._runner.run(source_dir, results_dir, platform) \ No newline at end of file + return self._runner.run(source_dir, results_dir, platform, generate_json) \ No newline 
at end of file From 597a815d19f4696313fbccea0d5a6a3276636bee Mon Sep 17 00:00:00 2001 From: simone-dbx-labs Date: Tue, 25 Nov 2025 08:49:07 -0500 Subject: [PATCH 8/9] fix: correct analyzer type signature and argument order for generate_json flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Updated Callable type signature to accept 5 parameters: (Path, Path, str, bool, bool) - Fixed argument order in analyzer call to match bladespector signature: (source_dir, tmp_dir, platform, is_debug, generate_json) - Ensures generate_json flag properly flows through to analyzer binary -j flag 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../lakebridge/analyzer/lakebridge_analyzer.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py b/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py index 56afc163bc..bde4bd1da2 100644 --- a/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py +++ b/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py @@ -56,7 +56,10 @@ def get_source_system(self, platform: str | None = None) -> str: class AnalyzerRunner: def __init__( - self, runnable: Callable[[Path, Path, str, bool], None], move_file: Callable[[Path, Path], None], is_debug: bool + self, + runnable: Callable[[Path, Path, str, bool, bool], None], + move_file: Callable[[Path, Path], None], + is_debug: bool, ): self._runnable = runnable self._move_file = move_file @@ -73,7 +76,7 @@ def run(self, source_dir: Path, results_dir: Path, platform: str, generate_json: raise ValueError(f"Invalid path(s) provided: source_dir={source_dir}, results_dir={results_dir}") tmp_dir = self._temp_xlsx_path(results_dir) - self._runnable(source_dir, tmp_dir, platform, generate_json, self._is_debug) + self._runnable(source_dir, tmp_dir, platform, self._is_debug, generate_json) self._move_file(tmp_dir, 
Path(results_dir)) logger.info(f"Successfully Analyzed files in ${source_dir} for ${platform} and saved report to {results_dir}") return AnalyzerResult(Path(source_dir), Path(results_dir), platform) @@ -90,7 +93,11 @@ def __init__(self, prompts: AnalyzerPrompts, runner: AnalyzerRunner): self._runner = runner def run_analyzer( - self, source: str | None = None, results: str | None = None, platform: str | None = None, generate_json: bool = False + self, + source: str | None = None, + results: str | None = None, + platform: str | None = None, + generate_json: bool = False, ) -> AnalyzerResult: if not source: source_dir = self._prompts.get_source_directory() @@ -108,4 +115,4 @@ def run_analyzer( platform = self._prompts.get_source_system(platform) - return self._runner.run(source_dir, results_dir, platform, generate_json) \ No newline at end of file + return self._runner.run(source_dir, results_dir, platform, generate_json) From 78d0cb58351edaaca72226e64723f88c60b888f7 Mon Sep 17 00:00:00 2001 From: simone-dbx-labs Date: Tue, 25 Nov 2025 08:54:51 -0500 Subject: [PATCH 9/9] fix: add explicit type cast for Analyzer.analyze to satisfy mypy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Added typing.cast to explicitly declare Analyzer.analyze signature - Fixes CI mypy error where external package type information wasn't available - Cast ensures Callable[[Path, Path, str, bool, bool], None] signature is recognized 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../labs/lakebridge/analyzer/lakebridge_analyzer.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py b/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py index bde4bd1da2..51c8684555 100644 --- a/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py +++ b/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py @@ -2,6 
+2,7 @@ import tempfile from pathlib import Path from collections.abc import Callable +from typing import cast from databricks.labs.blueprint.entrypoint import get_logger from databricks.labs.blueprint.tui import Prompts @@ -67,7 +68,11 @@ def __init__( @classmethod def create(cls, is_debug: bool = False) -> "AnalyzerRunner": - return cls(Analyzer.analyze, move_tmp_file, is_debug) + return cls( + cast(Callable[[Path, Path, str, bool, bool], None], Analyzer.analyze), + move_tmp_file, + is_debug, + ) def run(self, source_dir: Path, results_dir: Path, platform: str, generate_json: bool = False) -> AnalyzerResult: logger.debug(f"Starting analyzer execution in ${source_dir} for ${platform}")