From ceea803c5588fef5909047d39412d85ae942dea1 Mon Sep 17 00:00:00 2001 From: "Jaiwant.Jonathan" Date: Sat, 28 Feb 2026 19:20:25 -0500 Subject: [PATCH] Add databricks-powerbi-migration skill New skill for migrating Power BI semantic models to Databricks metric views. Covers schema assessment, DAX-to-SQL translation, ERD/domain generation, intermediate mapping layers, query optimization, KPI definitions, data discovery, and deployment checklists with a 16-step guided workflow. --- databricks-skills/README.md | 1 + .../databricks-powerbi-migration/EXAMPLES.md | 545 ++++++++++ .../databricks-powerbi-migration/REFERENCE.md | 943 ++++++++++++++++++ .../databricks-powerbi-migration/SKILL.md | 416 ++++++++ .../databricks-powerbi-migration/approach.md | 313 ++++++ .../scripts/compare_schemas.py | 431 ++++++++ .../scripts/extract_dbx_schema.py | 163 +++ .../scripts/generate_erd.py | 355 +++++++ .../scripts/init_project.sh | 146 +++ .../scripts/parse_pbi_model.py | 459 +++++++++ .../scripts/scan_inputs.py | 399 ++++++++ databricks-skills/install_skills.sh | 4 +- 12 files changed, 4174 insertions(+), 1 deletion(-) create mode 100644 databricks-skills/databricks-powerbi-migration/EXAMPLES.md create mode 100644 databricks-skills/databricks-powerbi-migration/REFERENCE.md create mode 100644 databricks-skills/databricks-powerbi-migration/SKILL.md create mode 100644 databricks-skills/databricks-powerbi-migration/approach.md create mode 100644 databricks-skills/databricks-powerbi-migration/scripts/compare_schemas.py create mode 100644 databricks-skills/databricks-powerbi-migration/scripts/extract_dbx_schema.py create mode 100644 databricks-skills/databricks-powerbi-migration/scripts/generate_erd.py create mode 100755 databricks-skills/databricks-powerbi-migration/scripts/init_project.sh create mode 100644 databricks-skills/databricks-powerbi-migration/scripts/parse_pbi_model.py create mode 100644 databricks-skills/databricks-powerbi-migration/scripts/scan_inputs.py diff --git 
a/databricks-skills/README.md b/databricks-skills/README.md index ddc5b08..f65a5ed 100644 --- a/databricks-skills/README.md +++ b/databricks-skills/README.md @@ -51,6 +51,7 @@ cp -r ai-dev-kit/databricks-skills/databricks-agent-bricks .claude/skills/ ### 📊 Analytics & Dashboards - **databricks-aibi-dashboards** - Databricks AI/BI dashboards (with SQL validation workflow) +- **databricks-powerbi-migration** - Power BI to Databricks migration (metric views, DAX-to-SQL, ERD generation, schema mapping) - **databricks-unity-catalog** - System tables for lineage, audit, billing ### 🔧 Data Engineering diff --git a/databricks-skills/databricks-powerbi-migration/EXAMPLES.md b/databricks-skills/databricks-powerbi-migration/EXAMPLES.md new file mode 100644 index 0000000..e9f38a5 --- /dev/null +++ b/databricks-skills/databricks-powerbi-migration/EXAMPLES.md @@ -0,0 +1,545 @@ +# Examples: Input/Output Patterns + +Concrete examples for key skill workflows. Referenced from [SKILL.md](SKILL.md) and [REFERENCE.md](REFERENCE.md). 
+ +--- + +## EDW-to-CDM Intermediate Mapping (Gap 1) + +### Input: Power Query M Expression (from PBI model) + +```m +let + Source = Sql.Database("edw-server", "sales_db"), + dbo_Transactions = Source{[Schema="dbo",Item="Transactions"]}[Data], + Renamed = Table.RenameColumns(dbo_Transactions, { + {"TransactionID", "SaleID"}, + {"AmountUSD", "TotalAmount"}, + {"CreatedAt", "OrderDate"} + }), + Selected = Table.SelectColumns(Renamed, {"SaleID", "TotalAmount", "OrderDate", "CustomerID"}) +in + Selected +``` + +### Output: Two-Layer Mapping (default) + +```json +{ + "mappings": [ + { + "pbi_table": "SalesFact", + "dbx_table": "catalog.gold.sales_transactions", + "columns": [ + {"pbi_column": "SaleID", "dbx_column": "transaction_id"}, + {"pbi_column": "TotalAmount", "dbx_column": "amount_usd"}, + {"pbi_column": "OrderDate", "dbx_column": "created_at"}, + {"pbi_column": "CustomerID", "dbx_column": "customer_id"} + ] + } + ] +} +``` + +### Output: Three-Layer Mapping (when M renames are relevant) + +```json +{ + "mappings": [ + { + "pbi_table": "SalesFact", + "dbx_table": "catalog.gold.sales_transactions", + "columns": [ + {"pbi_column": "SaleID", "m_query_column": "TransactionID", "dbx_column": "transaction_id"}, + {"pbi_column": "TotalAmount", "m_query_column": "AmountUSD", "dbx_column": "amount_usd"}, + {"pbi_column": "OrderDate", "m_query_column": "CreatedAt", "dbx_column": "created_at"}, + {"pbi_column": "CustomerID", "m_query_column": "CustomerID", "dbx_column": "customer_id"} + ] + } + ] +} +``` + +--- + +## KPI Definition Template (Gap 7) + +### Input: Power BI DAX Measures + +```dax +Total Sales = SUM(SalesFact[TotalAmount]) +Avg Order Value = DIVIDE([Total Sales], COUNTROWS(SalesFact)) +Sales YoY Growth = DIVIDE( + [Total Sales] - CALCULATE([Total Sales], SAMEPERIODLASTYEAR(DateDim[Date])), + CALCULATE([Total Sales], SAMEPERIODLASTYEAR(DateDim[Date])) +) +Customer Count = DISTINCTCOUNT(SalesFact[CustomerID]) +``` + +### Output: kpi/kpi_definitions.md + 
+```markdown +# KPI Definitions + +## Domain: Sales + +### KPI: Total Sales +- **Business Context**: Total revenue from all completed sales transactions +- **DAX Formula**: `SUM(SalesFact[TotalAmount])` +- **SQL Equivalent**: `SUM(total_amount)` +- **Source Table**: catalog.gold.sales_fact +- **Format**: Currency, 2 decimals +- **Data Gaps**: None identified +- **Domain**: Sales + +### KPI: Avg Order Value +- **Business Context**: Average revenue per sales transaction +- **DAX Formula**: `DIVIDE([Total Sales], COUNTROWS(SalesFact))` +- **SQL Equivalent**: `SUM(total_amount) / NULLIF(COUNT(1), 0)` +- **Source Table**: catalog.gold.sales_fact +- **Format**: Currency, 2 decimals +- **Data Gaps**: None identified +- **Domain**: Sales + +### KPI: Sales YoY Growth +- **Business Context**: Year-over-year percentage change in total sales +- **DAX Formula**: `DIVIDE([Total Sales] - CALCULATE([Total Sales], SAMEPERIODLASTYEAR(...)), ...)` +- **SQL Equivalent**: Window function with LAG over year partition (see metric view) +- **Source Table**: catalog.gold.sales_fact +- **Format**: Percentage, 1 decimal +- **Data Gaps**: Requires at least 2 years of data for meaningful comparison +- **Domain**: Sales + +### KPI: Customer Count +- **Business Context**: Number of unique customers with at least one transaction +- **DAX Formula**: `DISTINCTCOUNT(SalesFact[CustomerID])` +- **SQL Equivalent**: `COUNT(DISTINCT customer_id)` +- **Source Table**: catalog.gold.sales_fact +- **Format**: Integer +- **Data Gaps**: None identified +- **Domain**: Sales +``` + +--- + +## Data Discovery Queries (Gap 4) + +### Input: Column Gap Analysis identifies discriminator columns + +``` +sales_fact.result_type (flagged as discriminator) +sales_fact.order_status (flagged as discriminator) +sales_fact.order_date (date column) +customer_dim.customer_status (flagged as discriminator) +``` + +### Output: reference/data_discovery_queries.sql + +```sql +-- 
============================================================= +-- Data Discovery Queries +-- Generated from column gap analysis +-- ============================================================= + +-- 1. Discriminator: sales_fact.result_type +SELECT DISTINCT result_type +FROM catalog.gold.sales_fact +ORDER BY result_type; + +SELECT result_type, COUNT(*) AS cnt +FROM catalog.gold.sales_fact +GROUP BY result_type +ORDER BY cnt DESC +LIMIT 50; + +-- 2. Discriminator: sales_fact.order_status +SELECT DISTINCT order_status +FROM catalog.gold.sales_fact +ORDER BY order_status; + +SELECT order_status, COUNT(*) AS cnt +FROM catalog.gold.sales_fact +GROUP BY order_status +ORDER BY cnt DESC +LIMIT 50; + +-- 3. Date range: sales_fact.order_date +SELECT + MIN(order_date) AS min_date, + MAX(order_date) AS max_date +FROM catalog.gold.sales_fact; + +-- 4. Discriminator: customer_dim.customer_status +SELECT DISTINCT customer_status +FROM catalog.gold.customer_dim +ORDER BY customer_status; + +SELECT customer_status, COUNT(*) AS cnt +FROM catalog.gold.customer_dim +GROUP BY customer_status +ORDER BY cnt DESC +LIMIT 50; + +-- 5. 
Null rate analysis for all gap columns +SELECT + COUNT(*) AS total_rows, + COUNT(*) - COUNT(result_type) AS result_type_nulls, + COUNT(*) - COUNT(order_status) AS order_status_nulls +FROM catalog.gold.sales_fact; +``` + +--- + +## Deployment Checklist (Gap 11) + +### Input: Completed project with metric views and Path A chosen + +### Output: reference/deployment_checklist.md + +```markdown +## Deployment Checklist: Sales Analytics Migration + +**Project**: Sales PBI to Databricks +**Date**: 2026-02-26 +**Path**: A (PBI Reconnection) + +### Pre-Deployment +- [ ] Validate catalog access: + ```sql + SELECT 1 FROM analytics_catalog.gold.sales_fact LIMIT 1; + SELECT 1 FROM analytics_catalog.gold.customer_dim LIMIT 1; + SELECT 1 FROM analytics_catalog.gold.date_dim LIMIT 1; + ``` +- [ ] Verify SQL warehouse `analytics-wh` is running +- [ ] Confirm user has SELECT on `analytics_catalog.gold` + +### Metric View Deployment +- [ ] Run `models/metric_views/sales_metrics.sql` +- [ ] Run `models/metric_views/customer_metrics.sql` +- [ ] Verify: + ```sql + SELECT MEASURE(`Total Sales`) FROM analytics_catalog.gold.sales_metrics LIMIT 10; + SELECT MEASURE(`Customer Count`) FROM analytics_catalog.gold.customer_metrics LIMIT 10; + ``` +- [ ] Grant SELECT to `analysts` group: + ```sql + GRANT SELECT ON VIEW analytics_catalog.gold.sales_metrics TO `analysts`; + ``` + +### Power BI Reconnection +- [ ] Create parameters: `ServerHostName`, `HTTPPath`, `CatalogName` +- [ ] Update M queries to use `Databricks.Catalogs()` connector +- [ ] Set SalesFact to DirectQuery, CustomerDim/DateDim to Dual +- [ ] Enable "Assume Referential Integrity" on all relationships +- [ ] Test: verify Total Sales matches original report value +- [ ] Publish to Power BI Service +- [ ] Update stored credentials in Power BI Service + +### Post-Deployment +- [ ] Compare 5 key KPI values between old and new reports +- [ ] Monitor query performance for 1 week +- [ ] Document any discrepancies in 
reference/validation_notes.md +- [ ] Share deployment summary with stakeholders +``` + +--- + +## CSV Schema Dump (Gap 2) + +### Input: CSV file with INFORMATION_SCHEMA-style headers + +```csv +table_name,column_name,data_type,is_nullable,comment +sales_fact,sale_id,BIGINT,NO,Primary key +sales_fact,customer_id,BIGINT,NO,FK to customer_dim +sales_fact,order_date,DATE,NO,Date of order +sales_fact,total_amount,DECIMAL(18,2),YES,Order total in USD +customer_dim,customer_id,BIGINT,NO,Primary key +customer_dim,customer_name,STRING,YES,Full name +customer_dim,customer_status,STRING,YES,Active/Inactive +``` + +### Scanner Output + +```json +{ + "path": "input/schema_export.csv", + "name": "schema_export.csv", + "type": "csv_schema_dump", + "details": "Schema dump: 2 tables, 7 columns" +} +``` + +The agent should parse this CSV and construct a schema representation equivalent to `extract_dbx_schema.py` output for use in schema comparison. + +--- + +## Catalog Resolution (Gap 5) + +### Input: User provides catalog name "analytics" + +### Agent probes + +```sql +SELECT catalog_name FROM system.information_schema.catalogs ORDER BY catalog_name; +-- Result: analytics, fc_analytics, hive_metastore, system +``` + +### Output: reference/catalog_resolution.md + +```markdown +## Catalog Resolution + +- **Primary catalog**: `analytics` +- **Fallback catalog**: `fc_analytics` +- **Target schema**: `gold` + +### Verification +| Table | Found In | Schema | +|-------|----------|--------| +| sales_fact | analytics | gold | +| customer_dim | analytics | gold | +| date_dim | analytics | gold | +| product_dim | fc_analytics | gold | + +**Note**: `product_dim` found only in `fc_analytics`. Verify if this is the correct source. +``` + +--- + +## Parallel Catalog Probing with Shell Subagents (Gap 13) + +### Input: Catalog list from `system.information_schema.catalogs` + +``` +analytics, fc_analytics, hive_metastore, system +``` + +Target schema: `gold`. 
PBI model references tables: `sales_fact`, `customer_dim`, `date_dim`, `product_dim`. + +### Agent launches 3 parallel shell subagents + +``` +Task(subagent_type="shell", description="Probe analytics catalog", + prompt='Probe catalog "analytics" for tables in schema "gold" using the Databricks MCP server. + Call CallMcpTool with server="user-databricks", toolName="execute_sql", + arguments={"sql_query": "SELECT table_name FROM analytics.information_schema.tables WHERE table_schema = \'gold\' ORDER BY table_name"}. + Return the list of table names found.') + +Task(subagent_type="shell", description="Probe fc_analytics catalog", + prompt='Probe catalog "fc_analytics" for tables in schema "gold" using the Databricks MCP server. + Call CallMcpTool with server="user-databricks", toolName="execute_sql", + arguments={"sql_query": "SELECT table_name FROM fc_analytics.information_schema.tables WHERE table_schema = \'gold\' ORDER BY table_name"}. + Return the list of table names found.') + +Task(subagent_type="shell", description="Probe hive_metastore catalog", + prompt='Probe catalog "hive_metastore" for tables in schema "gold" using the Databricks MCP server. + Call CallMcpTool with server="user-databricks", toolName="execute_sql", + arguments={"sql_query": "SELECT table_name FROM hive_metastore.information_schema.tables WHERE table_schema = \'gold\' ORDER BY table_name"}. 
+ Return the list of table names found.') +``` + +### Merged output: reference/catalog_resolution.md + +```markdown +## Catalog Resolution + +- **Primary catalog**: `analytics` +- **Fallback catalog**: `fc_analytics` +- **Target schema**: `gold` + +### Table Inventory +| Table | Catalog | Schema | +|-------|---------|--------| +| sales_fact | analytics | gold | +| customer_dim | analytics | gold | +| date_dim | analytics | gold | +| product_dim | fc_analytics | gold | +``` + +--- + +## Batch Data Discovery with execute_sql_multi (Gap 13) + +### Input: Data discovery queries from Step 9 + +All queries target the same catalog (`analytics.gold`). + +### Agent calls execute_sql_multi + +``` +CallMcpTool: + server: "user-databricks" + toolName: "execute_sql_multi" + arguments: + sql_content: | + -- Discriminator: sales_fact.result_type + SELECT DISTINCT result_type FROM analytics.gold.sales_fact ORDER BY result_type; + + -- Distribution: sales_fact.result_type + SELECT result_type, COUNT(*) AS cnt FROM analytics.gold.sales_fact GROUP BY result_type ORDER BY cnt DESC LIMIT 50; + + -- Date range: sales_fact.order_date + SELECT MIN(order_date) AS min_date, MAX(order_date) AS max_date FROM analytics.gold.sales_fact; + + -- Discriminator: customer_dim.customer_status + SELECT DISTINCT customer_status FROM analytics.gold.customer_dim ORDER BY customer_status; + + -- Null rate analysis + SELECT COUNT(*) AS total_rows, COUNT(*) - COUNT(result_type) AS result_type_nulls, COUNT(*) - COUNT(order_status) AS order_status_nulls FROM analytics.gold.sales_fact; + catalog: "analytics" + schema: "gold" + max_workers: 4 +``` + +### Output: Execution summary + +The tool returns results per statement, with an execution summary showing which queries ran in parallel and their individual timings. Results are ingested back into the analysis for column gap resolution and KPI data gap documentation. 
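Conceptually, a batch runner like `execute_sql_multi` splits the `sql_content` into individual statements and fans them out to a worker pool. A minimal sketch of that behavior, assuming a hypothetical `run_sql` stand-in for the MCP `execute_sql` call (not the actual tool implementation; the naive semicolon split would also break on semicolons inside string literals):

```python
from concurrent.futures import ThreadPoolExecutor

def run_sql(sql: str) -> str:
    """Hypothetical stand-in for the MCP execute_sql tool."""
    return f"ok: {sql.splitlines()[0]}"

def split_statements(sql_content: str) -> list[str]:
    # Split on semicolons; drop comment-only and empty fragments.
    statements = []
    for fragment in sql_content.split(";"):
        lines = [l for l in fragment.splitlines()
                 if l.strip() and not l.strip().startswith("--")]
        if lines:
            statements.append("\n".join(lines).strip())
    return statements

def execute_sql_multi(sql_content: str, max_workers: int = 4) -> list[str]:
    stmts = split_statements(sql_content)
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        return list(pool.map(run_sql, stmts))

batch = """
-- Discriminator: sales_fact.result_type
SELECT DISTINCT result_type FROM analytics.gold.sales_fact;
-- Date range: sales_fact.order_date
SELECT MIN(order_date), MAX(order_date) FROM analytics.gold.sales_fact;
"""
print(len(execute_sql_multi(batch)))  # 2 statements executed
```

The per-statement results come back in submission order, which is what lets the tool report individual timings in its execution summary.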
+ +--- + +## Existing Metric View Detection (Gap 15) + +### Input: KPIs defined in Step 9 + +```markdown +## Domain: Sales +- Total Sales: SUM(total_amount) +- Avg Order Value: SUM(total_amount) / NULLIF(COUNT(1), 0) +- Sales YoY Growth: (window function with LAG) +- Customer Count: COUNT(DISTINCT customer_id) + +## Domain: Finance +- Gross Margin: (SUM(revenue) - SUM(cost)) / NULLIF(SUM(revenue), 0) +``` + +### Step 1: Discover existing metric views + +```sql +SELECT table_name, view_definition +FROM analytics.information_schema.views +WHERE table_schema = 'gold' + AND view_definition LIKE '%WITH METRICS%'; +``` + +Result: + +| table_name | view_definition | +|-----------|-----------------| +| sales_metrics | CREATE VIEW ... WITH METRICS LANGUAGE YAML AS $$ ... $$ | +| customer_metrics | CREATE VIEW ... WITH METRICS LANGUAGE YAML AS $$ ... $$ | + +### Step 2: Inspect each metric view + +``` +CallMcpTool: + server: "user-databricks" + toolName: "manage_metric_views" + arguments: + action: "describe" + full_name: "analytics.gold.sales_metrics" +``` + +Result shows measures: +- `Total Sales`: `SUM(total_amount)` +- `Order Count`: `COUNT(1)` + +``` +CallMcpTool: + server: "user-databricks" + toolName: "manage_metric_views" + arguments: + action: "describe" + full_name: "analytics.gold.customer_metrics" +``` + +Result shows measures: +- `Customer Count`: `COUNT(DISTINCT customer_id)` +- `Repeat Customer Rate`: `COUNT(DISTINCT CASE WHEN order_count > 1 THEN customer_id END) / NULLIF(COUNT(DISTINCT customer_id), 0)` + +### Step 3: Compare and classify + +| KPI Name | Domain | Classification | Existing View | Notes | +|----------|--------|---------------|---------------|-------| +| Total Sales | Sales | exists | sales_metrics | `SUM(total_amount)` matches | +| Avg Order Value | Sales | new | — | Not found in any view | +| Sales YoY Growth | Sales | new | — | Not found in any view | +| Customer Count | Customer | exists | customer_metrics | `COUNT(DISTINCT customer_id)` 
matches | +| Gross Margin | Finance | new | — | No finance_metrics view found | + +### Output: reference/existing_metric_views.md + +```markdown +## Existing Metric View Analysis + +### Discovery +Found 2 metric views in `analytics.gold`: +- `sales_metrics` (2 measures: Total Sales, Order Count) +- `customer_metrics` (2 measures: Customer Count, Repeat Customer Rate) + +### KPI Classification + +| KPI Name | Domain | Classification | Existing View | Notes | +|----------|--------|---------------|---------------|-------| +| Total Sales | Sales | exists | sales_metrics | Expression matches | +| Avg Order Value | Sales | new | — | Not in any existing view | +| Sales YoY Growth | Sales | new | — | Not in any existing view | +| Customer Count | Customer | exists | customer_metrics | Expression matches | +| Gross Margin | Finance | new | — | No finance view exists | + +### Views to Modify +- **sales_metrics**: ALTER to add `Avg Order Value` and `Sales YoY Growth` + (same source table as existing `Total Sales`, so extend the existing view) + +### New Metric Views to Create +- **finance_metrics**: CREATE new view with `Gross Margin` + +### Skipped (Already Deployed) +- Total Sales (in sales_metrics) +- Customer Count (in customer_metrics) +``` + +### Step 13 actions based on classification + +```sql +-- Extend existing sales_metrics with new measures +ALTER VIEW analytics.gold.sales_metrics +WITH METRICS LANGUAGE YAML AS $$ + version: 1.1 + source: analytics.gold.sales_fact + dimensions: + - name: Order Month + expr: date_trunc('month', order_date) + - name: Region + expr: region + measures: + - name: Total Sales + expr: SUM(total_amount) + comment: "DAX: SUM(SalesFact[TotalAmount])" + - name: Avg Order Value + expr: SUM(total_amount) / NULLIF(COUNT(1), 0) + comment: "DAX: DIVIDE([Total Sales], COUNTROWS(SalesFact))" + - name: Sales YoY Growth + expr: (SUM(total_amount) - LAG(SUM(total_amount)) OVER (ORDER BY date_trunc('month',
order_date))) / NULLIF(LAG(SUM(total_amount)) OVER (ORDER BY date_trunc('month', order_date)), 0) + comment: "DAX: DIVIDE([Total Sales] - CALCULATE([Total Sales], SAMEPERIODLASTYEAR(...)), ...)" +$$; + +-- Create new finance_metrics view +CREATE OR REPLACE VIEW analytics.gold.finance_metrics +WITH METRICS LANGUAGE YAML AS $$ + version: 1.1 + source: analytics.gold.revenue_fact + dimensions: + - name: Period + expr: date_trunc('month', revenue_date) + - name: Business Unit + expr: business_unit + measures: + - name: Gross Margin + expr: (SUM(revenue) - SUM(cost)) / NULLIF(SUM(revenue), 0) + comment: "DAX: DIVIDE(SUM(Revenue) - SUM(Cost), SUM(Revenue))" +$$; + +-- Customer Count: SKIP (already deployed in customer_metrics with matching expression) +``` diff --git a/databricks-skills/databricks-powerbi-migration/REFERENCE.md b/databricks-skills/databricks-powerbi-migration/REFERENCE.md new file mode 100644 index 0000000..2a28ddd --- /dev/null +++ b/databricks-skills/databricks-powerbi-migration/REFERENCE.md @@ -0,0 +1,943 @@ +# Reference: Detailed Patterns + +This document provides detailed patterns for each gap identified during real-world testing of the PowerBI-to-Databricks skill. Each section is referenced from [SKILL.md](SKILL.md) workflow steps. + +--- + +## Gap 1: Intermediate Mapping Layer (Scenario D) + +When Power Query M expressions rename columns between the PBI semantic layer and the physical database, a direct name comparison fails. Scenario D handles this by extracting the renames and building a mapping. + +### Detection + +Look in the PBI model's `partitions[].source.expression` for M code containing: + +- `Table.RenameColumns` -- explicit column renames +- `Table.SelectColumns` -- column selection (implies name preservation) +- Schema parameter patterns -- `type table [ColName = type text, ...]` + +### Mapping Construction + +Build the **common two-layer mapping** (`pbi_column -> dbx_column`) by default. 
Use a **three-layer mapping** only where Power Query M introduces an intermediate rename: + +``` +Two-layer (default): + pbi_column -> dbx_column + +Three-layer (only when M renames are present): + pbi_column -> m_query_column -> dbx_column +``` + +### How to Extract M Renames + +1. Parse the PBI model JSON and locate `partitions` on each table. +2. For each partition with `source.type == "m"`, read `source.expression`. +3. Search for `Table.RenameColumns(...)` calls -- the argument is a list of `{old, new}` pairs. +4. Search for the `type table [...]` schema definition to find the final column names exposed to the PBI layer. +5. Map backward: PBI column name -> M expression column -> physical DB column. + +### Usage with compare_schemas.py + +Pass the intermediate mapping file via the `--mapping` flag: + +```bash +python scripts/compare_schemas.py \ + reference/pbi_model.json reference/dbx_schema.json \ + -o reference/schema_comparison.md --json \ + --mapping reference/intermediate_mapping.json +``` + +The mapping JSON format: + +```json +{ + "mappings": [ + { + "pbi_table": "SalesFact", + "dbx_table": "catalog.schema.sales_transactions", + "columns": [ + { + "pbi_column": "SaleID", + "m_query_column": "transaction_id", + "dbx_column": "transaction_id" + }, + { + "pbi_column": "TotalAmount", + "dbx_column": "amount_usd" + } + ] + } + ] +} +``` + +When `m_query_column` is absent, the mapping is treated as two-layer. + +--- + +## Gap 2: CSV Schema Dump Detection + +CSV files exported from `INFORMATION_SCHEMA` queries or database documentation tools often contain schema metadata. The scanner should detect these and treat them as equivalent to schema query output. 
+ +### Detection Criteria + +A CSV file is classified as `csv_schema_dump` when its header row contains columns matching these patterns (case-insensitive, allowing underscores, spaces, or camelCase): + +- `table_name` / `tableName` / `TABLE_NAME` +- `column_name` / `columnName` / `COLUMN_NAME` +- `data_type` / `dataType` / `DATA_TYPE` + +At least `table_name` and `column_name` must be present. `data_type` is strongly expected but not strictly required. + +### Agent Behavior + +When a `csv_schema_dump` is detected: + +1. Parse the CSV to extract table names, column names, and data types. +2. Build a schema representation equivalent to `extract_dbx_schema.py` output. +3. Use this schema for comparison in Step 6. + +--- + +## Gap 3: Databricks-Only Column Gap Detection + +After schema comparison, DBX-only columns (columns present in Databricks but not referenced in the Power BI model) may be important for: + +- Filters and partitions in reports built outside PBI +- Discriminator columns that determine row subsets +- Audit/metadata columns needed for data governance + +### Column Gap Analysis Output + +The `compare_schemas.py` script produces `reference/column_gap_analysis.md` with: + +1. Every DBX-only column grouped by table +2. Discriminator flagging for columns that appear to be low-cardinality (naming heuristics: `status`, `type`, `category`, `result_type`, `is_*`, `flag_*`, `*_code`) +3. Suggested actions for each flagged column + +### Discriminator Heuristics + +A column is flagged as a potential discriminator if its name matches any of: + +- Contains `status`, `type`, `category`, `code`, `flag`, `class`, `kind`, `tier`, `level`, `group` +- Starts with `is_`, `has_`, `can_` +- Ends with `_type`, `_status`, `_code`, `_flag`, `_category`, `_class` + +### Output Format + +```markdown +## Column Gap Analysis + +### Table: catalog.schema.sales_fact +| Column | Data Type | Discriminator? 
| Suggested Action | +|--------|-----------|----------------|------------------| +| result_type | STRING | Yes | May filter report subsets -- run data discovery | +| etl_load_date | TIMESTAMP | No | Audit column -- likely not needed in reports | + +### Table: catalog.schema.customer_dim +| Column | Data Type | Discriminator? | Suggested Action | +|--------|-----------|----------------|------------------| +| customer_status | STRING | Yes | May be essential for active/inactive filtering | +``` + +--- + +## Gap 4: Data Discovery Query Generation + +After schema comparison and column gap analysis, auto-generate SQL queries to understand data values, distributions, and ranges. + +### Query Templates + +For **low-cardinality / discriminator columns**: + +```sql +SELECT DISTINCT <column> FROM <catalog>.<schema>.<table> ORDER BY <column>; +``` + +For **value distribution**: + +```sql +SELECT <column>, COUNT(*) AS cnt +FROM <catalog>.<schema>.<table>
+GROUP BY <column> +ORDER BY cnt DESC +LIMIT 50; +``` + +For **date columns**: + +```sql +SELECT MIN(<date_column>) AS min_date, MAX(<date_column>) AS max_date +FROM <catalog>.<schema>.<table>
; +``` + +For **null rate analysis**: + +```sql +SELECT + COUNT(*) AS total_rows, + COUNT(*) - COUNT(<column>) AS null_count, + ROUND((COUNT(*) - COUNT(<column>)) * 100.0 / COUNT(*), 2) AS null_pct +FROM <catalog>.<schema>.<table>
; +``` + +### Output + +Save all generated queries to `reference/data_discovery_queries.sql`. The agent can: + +1. Run queries via MCP `execute_sql` if available +2. Present queries to the user to run manually +3. Ingest results back into the analysis + +--- + +## Gap 5: Catalog Resolution Strategy + +In multi-catalog environments, the agent must determine which catalog and schema contain the target tables. + +### Resolution Steps + +1. **Probe available catalogs**: + ```sql + SELECT catalog_name FROM system.information_schema.catalogs ORDER BY catalog_name; + ``` + +2. **Probe schemas within the target catalog**: + ```sql + SELECT schema_name FROM <catalog>.information_schema.schemata; + ``` + +3. **Verify table existence**: + ```sql + SELECT table_name + FROM <catalog>.information_schema.tables + WHERE table_schema = '<schema>'; + ``` + +### Handling fc_ Prefix + +Some environments prefix catalog names with `fc_`. The agent should: + +1. Try the catalog name as provided +2. If not found, try with `fc_` prefix +3. If not found, try without `fc_` prefix +4. Document both primary and fallback catalog in `reference/catalog_resolution.md` + +### Parallel Probing with Subagents + +When the catalog list contains multiple candidates, probe them concurrently using parallel `shell` subagents (see Gap 13 Pattern A). Each subagent calls `execute_sql` via the `user-databricks` MCP server to check table existence in its assigned catalog. This reduces catalog resolution from serial (N sequential queries) to parallel (all catalogs probed at once, max 4 concurrent). + +### Output + +Produce `reference/catalog_resolution.md`: + +```markdown +## Catalog Resolution + +- **Primary catalog**: `my_catalog` +- **Fallback catalog**: `fc_my_catalog` (if applicable) +- **Target schema**: `gold` +- **Tables found**: 15 (listed below) +- **Tables missing**: 2 (listed below) + +### Table Inventory +| Table | Catalog | Schema | Row Count (est.)
| +|-------|---------|--------|------------------| +| sales_fact | my_catalog | gold | ~10M | +``` + +--- + +## Gap 6: Report Replication Workflow (Path B) + +When the goal is to replace Power BI reports with Databricks-native reports rather than reconnecting PBI: + +### Workflow + +1. Read the `databricks-aibi-dashboards` skill for dashboard creation patterns +2. Build a report specification from the PBI model's visual layout (pages, visuals, filters) +3. Generate `planreport/report_spec.md` with: + - Summary tables (KPI scorecards) + - Trend charts (time series by dimension) + - Narrative text blocks (dynamic text with metric values) + - Disclaimers and footnotes +4. Generate `planreport/email_template.md` for distribution specs +5. Use the `databricks-jobs` skill to schedule delivery + +### Report Specification Template + +```markdown +## Report: <Report Name> + +### Page 1: Executive Summary +- **Visual 1**: KPI scorecard (Total Sales, Avg Order Value, Customer Count) +- **Visual 2**: Monthly trend line (Total Sales by Month) +- **Visual 3**: Top 10 table (Products by Revenue) +- **Filters**: Date range, Region, Product Category + +### Page 2: Detail View +- **Visual 1**: Table with drill-through (Order details) +- **Filters**: All from Page 1 + Customer Segment +``` + +--- + +## Gap 7: KPI Definitions as First-Class Artifact + +KPI definitions should be structured, not informal. The agent produces `kpi/kpi_definitions.md` with a standardized template per KPI. + +### Template + +```markdown +### KPI: <KPI Name> +- **Business Context**: <what this KPI measures and why it matters> +- **DAX Formula**: `<original DAX expression>` +- **SQL Equivalent**: `<translated SQL expression>` +- **Source Table**: <catalog.schema.table> +- **Format**: <display format, e.g. Currency, 2 decimals> +- **Data Gaps**: <known gaps, or "None identified"> +- **Domain**: <business domain> +``` + +### Organizing KPIs + +Group KPIs by domain. Within each domain, order by importance (primary KPIs first, derived KPIs after). + +```markdown +# KPI Definitions + +## Domain: Sales +### KPI: Total Sales +... +### KPI: Avg Order Value +... + +## Domain: Finance +### KPI: Gross Margin +...
+``` + +--- + +## Gap 8: Sample Report / Document Analysis + +When `input/` contains sample reports or documents (`.docx`, `.pdf`, `.png`, `.jpg`, `.xlsx`, `.pptx`), the agent should reverse-engineer the report's structure. + +### What to Extract + +- **KPI names and values** visible in the report +- **Column formatting** (currency symbols, decimal places, date formats) +- **Chart types** (bar, line, pie, table, scorecard) +- **Narrative templates** (dynamic text patterns like "Sales increased by X% compared to...") +- **Disclaimers and footnotes** +- **Branding** (colors, logos, headers/footers) +- **Filter/slicer positions** and default values + +### Output + +Produce `reference/report_analysis.md`: + +```markdown +## Report Analysis: <Report Name> + +### KPIs Identified +| KPI | Value (as shown) | Likely Measure | Format | +|-----|-------------------|----------------|--------| +| Total Revenue | $1.2M | SUM(revenue) | Currency, 1 decimal | + +### Visuals +| # | Type | Title | Dimensions | Measures | +|---|------|-------|------------|----------| +| 1 | Scorecard | Key Metrics | - | Total Revenue, Order Count | +| 2 | Line Chart | Monthly Trend | Month | Total Revenue | + +### Narrative Templates +- "Revenue for {period} was {Total Revenue}, a {YoY Change}% change from the prior year." + +### Disclaimers +- "Data as of {last_refresh_date}. Excludes returns processed after close." +``` + +--- + +## Gap 9: Cross-Schema and INFORMATION_SCHEMA Probing + +In multi-schema and multi-catalog environments, extend schema queries to cover all relevant schemas.
+ +### Cross-Schema Column Comparison + +```sql +SELECT table_schema, table_name, column_name, data_type +FROM <catalog>.information_schema.columns +WHERE table_schema IN ('<schema1>', '<schema2>') +ORDER BY table_schema, table_name, ordinal_position; +``` + +### Discover All Schemas in a Catalog + +```sql +SELECT schema_name FROM <catalog>.information_schema.schemata; +``` + +### Discover All Catalogs + +```sql +SELECT catalog_name FROM system.information_schema.catalogs; +``` + +### When to Use Cross-Schema Probing + +- PBI model references tables from multiple schemas +- Table names exist in multiple schemas (need to disambiguate) +- Migration involves consolidating schemas + +--- + +## Gap 10: Report Distribution Artifacts + +The `planreport/` folder contains all artifacts needed for Databricks-native report delivery. + +### Folder Structure + +``` +planreport/ +├── report_spec.md # Visual layout, chart specs, narrative blocks +├── email_template.md # Recipients, schedule, subject, body template +└── deployment_config.yml # Job schedule, warehouse, notification targets +``` + +### Email Template + +```markdown +## Email Distribution + +- **Recipients**: [list of email addresses or groups] +- **Schedule**: Weekly, Monday 8:00 AM UTC +- **Subject**: "{Report Name} - Week of {date}" +- **Body**: See narrative template from report_spec.md +- **Attachments**: PDF export of dashboard +- **Format**: HTML with inline charts +``` + +### Deployment Configuration + +```yaml +report_name: "Sales Weekly Report" +warehouse_id: "<warehouse_id>" +schedule: + quartz_cron: "0 0 8 ? * MON" + timezone: "UTC" +notifications: + on_success: + - email: "team@company.com" + on_failure: + - email: "admin@company.com" +``` + +--- + +## Gap 11: Deployment Checklist Generation + +The final artifact is an ordered checklist of steps to go from local artifacts to a running, scheduled report. + +### Checklist Template + +```markdown +## Deployment Checklist + +### Pre-Deployment +- [ ] Validate catalog access: `SELECT 1 FROM <catalog>.<schema>.<table>
LIMIT 1` +- [ ] Verify all source tables exist and are accessible +- [ ] Confirm SQL warehouse is running and sized appropriately + +### Metric View Deployment +- [ ] Deploy metric views: run each SQL file in models/metric_views/ +- [ ] Verify metric views: `SELECT MEASURE(...) FROM LIMIT 10` +- [ ] Grant SELECT to required users/groups + +### Report Deployment (choose path) +#### Path A: Power BI Reconnection +- [ ] Update Power Query M formulas to use Databricks connector +- [ ] Parameterize ServerHostName and HTTPPath +- [ ] Set DirectQuery for facts, Dual for dimensions +- [ ] Set "Assume Referential Integrity" on relationships +- [ ] Test all report pages for data accuracy +- [ ] Publish to Power BI Service +- [ ] Update stored credentials in Power BI Service + +#### Path B: Databricks-Native Report +- [ ] Create AI/BI dashboard from report_spec.md +- [ ] Configure job schedule from deployment_config.yml +- [ ] Set up email distribution from email_template.md +- [ ] Test dashboard rendering and data accuracy + +### Post-Deployment +- [ ] Run validation queries against metric views +- [ ] Compare output values with original PBI report +- [ ] Share deployment summary with stakeholders +- [ ] Document any known gaps or deferred items +``` + +--- + +## Gap 12: Query Access Optimization + +Before constructing metric views, assess query complexity, table size, grain, and access patterns to choose the optimal serving strategy. This prevents building metric views that are too slow for interactive use or unnecessarily expensive. + +### Assessment Criteria + +For each KPI or domain, evaluate: + +| Factor | How to Assess | Threshold | +|--------|---------------|-----------| +| **Table size** | `DESCRIBE DETAIL
` -- check `sizeInBytes` and `numFiles` | < 100 GB = small, 100 GB - 1 TB = medium, > 1 TB = large | +| **Row count** | `SELECT COUNT(*) FROM
` or estimate from DESCRIBE DETAIL | < 100M = small, 100M - 1B = medium, > 1B = large | +| **Join count** | Count the number of tables joined per KPI query | 0-2 = simple, 3-5 = moderate, 6+ = complex | +| **Aggregation complexity** | Window functions, CASE expressions, nested aggregations | Simple SUM/COUNT = low, window/CASE = medium, nested = high | +| **Grain mismatch** | Compare fact table grain to report grain | Same grain = no issue, different grain = pre-aggregation needed | +| **Filter selectivity** | Typical filter narrows result to what % of table? | > 50% = low selectivity, < 10% = high selectivity | +| **Refresh frequency** | How often does the source data change? | Real-time, hourly, daily, weekly | + +### Query Complexity Scoring + +Assign a score to determine the serving strategy: + +``` +Score = table_size_score + join_score + aggregation_score + grain_score + +table_size_score: small=0, medium=2, large=4 +join_score: 0-2 joins=0, 3-5=1, 6+=2 +aggregation_score: simple=0, medium=1, high=2 +grain_score: same=0, different=2 +``` + +| Total Score | Serving Strategy | +|-------------|-----------------| +| 0-2 | Standard metric view (direct on source tables) | +| 3-5 | Materialized view with scheduled refresh | +| 6+ | Gold-layer aggregate table + metric view on top | + +### Serving Strategies + +#### Strategy 1: Standard Metric View + +For simple KPIs on small/medium tables with few joins. The metric view queries source tables directly at runtime. + +```sql +CREATE OR REPLACE VIEW ..sales_metrics +WITH METRICS LANGUAGE YAML AS $$ + version: 1.1 + source: ..sales_fact + measures: + - name: Total Sales + expr: SUM(total_amount) +$$; +``` + +#### Strategy 2: Materialized View with Incremental Refresh + +For complex KPIs or medium/large tables where pre-computing aggregations significantly reduces query time. Databricks automatically manages incremental refresh. 

```sql
CREATE OR REPLACE MATERIALIZED VIEW <catalog>.<schema>.monthly_sales_agg
AS
SELECT
  date_trunc('month', order_date) AS order_month,
  region,
  product_category,
  SUM(total_amount) AS total_sales,
  COUNT(1) AS order_count,
  COUNT(DISTINCT customer_id) AS unique_customers
FROM <catalog>.<schema>.sales_fact
GROUP BY ALL;

-- Quartz cron: refresh daily at 2:00 AM UTC
ALTER MATERIALIZED VIEW <catalog>.<schema>.monthly_sales_agg
  ADD SCHEDULE CRON '0 0 2 * * ?' AT TIME ZONE 'UTC';
```

Then point the metric view at the materialized view:

```sql
CREATE OR REPLACE VIEW <catalog>.<schema>.sales_metrics
WITH METRICS LANGUAGE YAML AS $$
version: 1.1
source: <catalog>.<schema>.monthly_sales_agg
dimensions:
  - name: Order Month
    expr: order_month
  - name: Region
    expr: region
measures:
  - name: Total Sales
    expr: SUM(total_sales)
  - name: Avg Order Value
    expr: SUM(total_sales) / NULLIF(SUM(order_count), 0)
$$;
```

#### Strategy 3: Gold-Layer Aggregate Table

For very large tables (> 1 TB) or when the grain mismatch is severe (e.g., transaction-level fact table but report needs monthly aggregates). Build a dedicated gold-layer table with a pipeline for incremental loads.

```sql
CREATE TABLE <catalog>.<schema>.gold_monthly_sales (
  order_month DATE,
  region STRING,
  product_category STRING,
  total_sales DECIMAL(18,2),
  order_count BIGINT,
  unique_customers BIGINT,
  _etl_updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP()
)
USING DELTA
CLUSTER BY (order_month, region)
-- DEFAULT column values require this Delta table feature
TBLPROPERTIES ('delta.feature.allowColumnDefaults' = 'supported');
```

Use `spark-declarative-pipelines` to maintain incremental refresh, then build metric views on top of the gold table.
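The Gap 12 scoring rubric and strategy cutoffs can be sketched as a small helper. This is a minimal Python sketch; the function name, strategy labels, and numeric thresholds are illustrative restatements of the rubric, not part of this skill's bundled scripts:

```python
def serving_strategy(size_gb: float, join_count: int,
                     aggregation: str, same_grain: bool) -> tuple[int, str]:
    """Score a KPI per the Gap 12 rubric and map it to a serving strategy.

    size_gb: approximate source table size (sizeInBytes from DESCRIBE DETAIL)
    aggregation: "simple", "medium", or "high"
    same_grain: True when fact table grain matches report grain
    """
    # Rubric component scores
    size_score = 0 if size_gb < 100 else (2 if size_gb <= 1000 else 4)
    join_score = 0 if join_count <= 2 else (1 if join_count <= 5 else 2)
    agg_score = {"simple": 0, "medium": 1, "high": 2}[aggregation]
    grain_score = 0 if same_grain else 2

    score = size_score + join_score + agg_score + grain_score

    # Map total score to a serving strategy per the cutoff table
    if score <= 2:
        strategy = "standard metric view"
    elif score <= 5:
        strategy = "materialized view with scheduled refresh"
    else:
        strategy = "gold-layer aggregate table + metric view"
    return score, strategy
```

For example, a 2 TB fact table behind a 6-way join with window functions and a grain mismatch scores 4 + 2 + 2 + 2 = 10 and lands on the gold-layer strategy.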

### Grain Analysis

When the fact table grain is finer than the report grain, pre-aggregation is essential:

| Fact Table Grain | Report Grain | Action |
|------------------|--------------|--------|
| Transaction-level | Daily | Materialized view with `date_trunc('day', ...)` |
| Transaction-level | Monthly | Gold-layer aggregate or materialized view |
| Daily | Monthly | Materialized view with `date_trunc('month', ...)` |
| Same | Same | Standard metric view (no pre-aggregation) |

### Output

Produce `reference/query_optimization_plan.md`:

```markdown
## Query Optimization Plan

### Domain: Sales
| KPI | Score | Strategy | Rationale |
|-----|-------|----------|-----------|
| Total Sales | 2 | Standard metric view | Simple SUM, table < 50 GB |
| Sales YoY Growth | 5 | Materialized view | Window function over 2 years, 200 GB table |
| Customer Lifetime Value | 7 | Gold-layer aggregate | 5-table join, 1.2 TB fact table, transaction-to-monthly grain |

### Materialized Views to Create
1. `monthly_sales_agg` -- monthly pre-aggregation for time-series KPIs
   - Schedule: daily at 2:00 AM UTC
   - Source: sales_fact (200 GB)
   - Estimated refresh time: ~15 min

### Gold-Layer Tables to Create
1. `gold_customer_ltv` -- customer lifetime value aggregate
   - Pipeline: notebooks/customer_ltv_pipeline.py
   - Refresh: daily incremental via SDP
```

---

## Gap 13: Subagent Parallelization Patterns

Use `Task` subagents (`subagent_type="shell"`) to parallelize MCP tool calls when work spans multiple catalogs, schemas, or tables. Each subagent runs independently and returns its results. Run at most 4 subagents concurrently.

### Decision Guide: Subagents vs. `execute_sql_multi`

| Scenario | Best Tool | Why |
|----------|-----------|-----|
| N queries, same catalog | `execute_sql_multi` | Single MCP call, built-in parallelism (up to 4 workers), simpler |
| N queries, different catalogs | Parallel `shell` subagents | Each needs a different catalog context; `execute_sql_multi` takes only one |
| 1 query per catalog for probing | Parallel `shell` subagents | Each subagent probes one catalog independently |
| Sizing N tables in same catalog | `execute_sql_multi` | Combine N `DESCRIBE DETAIL` into one call |
| Sizing tables across catalogs | Parallel `shell` subagents | Different catalog contexts |

### Pattern A: Parallel Catalog Probing

When Step 3 identifies multiple candidate catalogs, launch one subagent per catalog to verify which contains the target tables.

**Subagent prompt template** (one per catalog):

```
Probe catalog "<catalog>" for tables in schema "<schema>" using the Databricks MCP server.

1. Call CallMcpTool with:
   - server: "user-databricks"
   - toolName: "execute_sql"
   - arguments: {"sql_query": "SELECT table_name FROM <catalog>.information_schema.tables WHERE table_schema = '<schema>' ORDER BY table_name"}

2. Return the list of table names found, or state that no tables were found.
```

Launch up to 4 of these concurrently:

```
Task(subagent_type="shell", prompt="