diff --git a/SANITY_CHECKS_MIGRATION.md b/SANITY_CHECKS_MIGRATION.md new file mode 100644 index 000000000..4972bd453 --- /dev/null +++ b/SANITY_CHECKS_MIGRATION.md @@ -0,0 +1,593 @@ +# Sanity Checks Migration Guide + +This guide explains how to migrate sanity check functions from `sanity_checks.py` to inline validation rules that integrate with the egon-validation framework. + +## Overview + +**Before:** Sanity checks were standalone functions called manually +**After:** Sanity checks are validation rules declared inline in Dataset definitions + +## Benefits + +- ✅ Structured validation results with pass/fail tracking +- ✅ Automatic execution as part of dataset tasks +- ✅ Results collected in validation reports +- ✅ Better error reporting with observed vs expected values +- ✅ Parallel execution support +- ✅ Consistent with formal validation rules + +--- + +## Example Migration + +### Before: Old Sanity Check Function + +```python +# In sanity_checks.py +def cts_electricity_demand_share(rtol=0.005): + """Check CTS electricity demand share sums to 1.""" + df_demand_share = pd.read_sql(...) 
+ + np.testing.assert_allclose( + actual=df_demand_share.groupby(["bus_id", "scenario"])["profile_share"].sum(), + desired=1, + rtol=rtol, + verbose=False, + ) + + logger.info("CTS electricity demand shares sum correctly") +``` + +### After: New Validation Rule + +```python +# In egon/data/validation/rules/custom/sanity/cts_demand.py +from egon_validation.rules.base import DataFrameRule, RuleResult, Severity +import numpy as np + +class CtsElectricityDemandShare(DataFrameRule): + """Validate CTS electricity demand shares sum to 1 for each substation.""" + + def __init__(self, table: str, rule_id: str, rtol: float = 0.005, **kwargs): + super().__init__(rule_id=rule_id, table=table, rtol=rtol, **kwargs) + self.kind = "sanity" + + def get_query(self, ctx): + return """ + SELECT bus_id, scenario, SUM(profile_share) as total_share + FROM demand.egon_cts_electricity_demand_building_share + GROUP BY bus_id, scenario + """ + + def evaluate_df(self, df, ctx): + rtol = self.params.get("rtol", 0.005) + + try: + np.testing.assert_allclose( + actual=df["total_share"], + desired=1.0, + rtol=rtol, + verbose=False, + ) + + max_diff = (df["total_share"] - 1.0).abs().max() + + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=True, + observed=float(max_diff), + expected=rtol, + message=f"CTS electricity demand shares sum to 1 (max deviation: {max_diff:.6f})", + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + except AssertionError: + max_diff = (df["total_share"] - 1.0).abs().max() + violations = df[~np.isclose(df["total_share"], 1.0, rtol=rtol)] + + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + observed=float(max_diff), + expected=rtol, + message=f"Demand share mismatch: {len(violations)} violations", + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + 
rule_class=self.__class__.__name__ + ) +``` + +--- + +## Using Inline Validations in Datasets + +### Option 1: Dataset-Specific Inline Validation + +For validations tied to a specific dataset (e.g., CTS demand validations), add them inline to that dataset: + +```python +from egon.data.datasets import Dataset +from egon.data.validation.rules.custom.sanity import ( + CtsElectricityDemandShare, + CtsHeatDemandShare, +) + +class CtsElectricityDemand(Dataset): + def __init__(self, dependencies): + super().__init__( + name="CtsElectricityDemand", + version="1.0.0", + dependencies=dependencies, + tasks=( + download_data, + process_demand, + distribute_to_buildings, + ), + validation={ + "data_quality": [ + CtsElectricityDemandShare( + table="demand.egon_cts_electricity_demand_building_share", + rule_id="SANITY_CTS_ELECTRICITY_DEMAND_SHARE", + rtol=0.005 + ), + CtsHeatDemandShare( + table="demand.egon_cts_heat_demand_building_share", + rule_id="SANITY_CTS_HEAT_DEMAND_SHARE", + rtol=0.005 + ), + ] + }, + on_validation_failure="continue" # or "fail" to stop pipeline + ) +``` + +### Option 2: Cross-Cutting Validations in FinalValidations + +For validations that check data consistency **across multiple datasets** (e.g., gas store capacity checks), add them to the `FinalValidations` dataset: + +```python +# In: src/egon/data/datasets/final_validations.py + +from egon.data.validation.rules.custom.sanity import ( + CH4StoresCapacity, + H2SaltcavernStoresCapacity, + # Import your new validation rule here +) + +class FinalValidations(Dataset): + def __init__(self, dependencies): + super().__init__( + # ... 
+ validation={ + "gas_stores": [ + CH4StoresCapacity(...), + H2SaltcavernStoresCapacity(...), + # Add your new rule here + ], + # Add new category if needed + "your_category": [ + YourNewValidationRule(...), + ], + }, + ) +``` + +Then update `pipeline.py` to include your dataset in `FinalValidations` dependencies: + +```python +final_validations = FinalValidations( + dependencies=[ + insert_data_ch4_storages, + insert_H2_storage, + storage_etrago, + your_new_dataset, # Add dataset providing data for your validation + ] +) +``` + +**When to use FinalValidations:** +- ✅ Validation checks data from multiple datasets +- ✅ Validation should run at the end of the pipeline +- ✅ Validation is cross-cutting (gas network, timeseries consistency, etc.) +- ❌ Don't use for dataset-specific checks (use inline validation instead) + +### How It Works + +1. **Validation tasks are created automatically** from the `validation` dict +2. **Tasks are named:** `{dataset_name}.validate.{validation_key}` + - Example: `CtsElectricityDemand.validate.data_quality` +3. **Tasks run after the main dataset tasks** complete +4. **Results are written** to `validation_runs/{run_id}/tasks/{task_name}/{rule_id}/results.jsonl` +5. 
**Validation report collects** all results at the end of the pipeline + +--- + +## Migration Patterns + +### Pattern 1: Simple DataFrame Assertion + +**Sanity Check:** +```python +def check_something(rtol=0.01): + df = db.select_dataframe("SELECT * FROM table") + np.testing.assert_allclose(df["actual"], df["expected"], rtol=rtol) + logger.info("Check passed") +``` + +**Validation Rule:** +```python +class CheckSomething(DataFrameRule): + def __init__(self, table, rule_id, rtol=0.01, **kwargs): + super().__init__(rule_id=rule_id, table=table, rtol=rtol, **kwargs) + self.kind = "sanity" + + def get_query(self, ctx): + return "SELECT * FROM table" + + def evaluate_df(self, df, ctx): + rtol = self.params.get("rtol") + try: + np.testing.assert_allclose(df["actual"], df["expected"], rtol=rtol) + return RuleResult(success=True, ...) + except AssertionError: + return RuleResult(success=False, ...) +``` + +### Pattern 2: Multi-Table Comparison + +**Sanity Check:** +```python +def compare_tables(): + df1 = db.select_dataframe("SELECT key, SUM(value) AS value FROM table1 GROUP BY key") + df2 = db.select_dataframe("SELECT key, SUM(value) AS value FROM table2 GROUP BY key") + merged = df1.merge(df2, on="key") + assert (merged["value_x"] == merged["value_y"]).all() +``` + +**Validation Rule:** +```python +class CompareTablesCheck(DataFrameRule): + def get_query(self, ctx): + return """ + SELECT + t1.key, + t1.total as table1_total, + t2.total as table2_total + FROM (SELECT key, SUM(value) as total FROM table1 GROUP BY key) t1 + JOIN (SELECT key, SUM(value) as total FROM table2 GROUP BY key) t2 + ON t1.key = t2.key + """ + + def evaluate_df(self, df, ctx): + matches = (df["table1_total"] == df["table2_total"]).all() + return RuleResult(success=matches, ...) 
+``` + +### Pattern 3: Complex Checks with Loops + +For complex sanity checks with loops (e.g., `etrago_timeseries_length()`), you have two options: + +**Option A: Create one rule per component** (Recommended) +```python +validation = { + "timeseries_length": [ + TimeseriesLengthCheck( + table="grid.egon_etrago_generator_timeseries", + rule_id="SANITY_GENERATOR_TIMESERIES_LENGTH", + component="generator" + ), + TimeseriesLengthCheck( + table="grid.egon_etrago_load_timeseries", + rule_id="SANITY_LOAD_TIMESERIES_LENGTH", + component="load" + ), + # ... more components + ] +} +``` + +**Option B: Handle all components in one rule** +```python +class TimeseriesLengthCheck(DataFrameRule): + def evaluate_df(self, df, ctx): + # Check all components in a loop + # Return aggregated result +``` + +--- + +## Completed Migrations + +The following sanity checks have been migrated to validation rules: + +### ✅ Residential Electricity +- `residential_electricity_annual_sum()` → `ResidentialElectricityAnnualSum` +- `residential_electricity_hh_refinement()` → `ResidentialElectricityHhRefinement` + +### ✅ CTS Demand +- `cts_electricity_demand_share()` → `CtsElectricityDemandShare` +- `cts_heat_demand_share()` → `CtsHeatDemandShare` + +### ✅ Home Batteries +- `sanitycheck_home_batteries()` → `HomeBatteriesAggregation` + +### ✅ Gas Stores +- `sanity_check_CH4_stores()` → `CH4StoresCapacity` +- `sanity_check_H2_saltcavern_stores()` → `H2SaltcavernStoresCapacity` + +### ✅ Gas Grid +- `sanity_check_gas_buses()` → `GasBusesIsolated` + `GasBusesCount` +- `sanity_check_gas_one_port()` → `GasOnePortConnections` +- `sanity_check_CH4_grid()` → `CH4GridCapacity` +- `sanity_check_gas_links()` → `GasLinksConnections` + +### ✅ Gas Loads and Generators +- `etrago_eGon2035_gas_DE()` → `GasLoadsCapacity` + `GasGeneratorsCapacity` (wrapper function - components already migrated) + +### ✅ Electricity Capacity +- `etrago_eGon2035_electricity()` → `ElectricityCapacityComparison` (9 generator carriers + 1 
storage carrier) + - Validates: wind_onshore, wind_offshore, solar, solar_rooftop, biomass, run_of_river, reservoir, oil, others, pumped_hydro + +### ✅ Heat Supply Capacity +- `etrago_eGon2035_heat()` → `ElectricityCapacityComparison` (5 heat supply carriers - reused for heat!) + - Links: central_heat_pump, rural_heat_pump, central_resistive_heater + - Generators: solar_thermal_collector, geo_thermal + - **Note:** The heat demand check from this function is migrated separately as `HeatDemandValidation` (see "Heat Demand" below) + +### ✅ Timeseries Length +- `etrago_timeseries_length()` → `ArrayCardinalityValidation` (reused from egon-validation formal rules!) + - Validates ALL 24 array columns across 5 component types (generator, load, link, store, storage) + - **Generator timeseries (5):** p_set, q_set, p_min_pu, p_max_pu, marginal_cost + - **Load timeseries (2):** p_set, q_set + - **Link timeseries (5):** p_set, p_min_pu, p_max_pu, efficiency, marginal_cost + - **Storage timeseries (7):** p_set, q_set, p_min_pu, p_max_pu, state_of_charge_set, inflow, marginal_cost + - **Store timeseries (5):** p_set, q_set, e_min_pu, e_max_pu, marginal_cost + - Leverages existing formal validation rule from egon-validation library + - **Updated:** Now matches original dynamic column discovery behavior (sanity_checks.py:2465-2494) + +### ✅ eGon100RE Capacity Validations +- `generators_links_storages_stores_100RE()` → `ElectricityCapacityComparison` (reused for eGon100RE!) 
+ - **Generators (13):** wind_onshore, wind_offshore, solar, solar_rooftop, run_of_river, oil, lignite, coal, solar_thermal_collector, geo_thermal, rural_solar_thermal, urban_central_gas_CHP, urban_central_solid_biomass_CHP + - **Links (9):** central_gas_boiler, central_heat_pump, central_resistive_heater, OCGT, rural_biomass_boiler, rural_gas_boiler, rural_heat_pump, rural_oil_boiler, rural_resistive_heater + - **Storage (1):** pumped_hydro + - **Note:** Stores validation deferred (original function only prints, no validation logic) + +### ✅ Electrical Load Demand +- `electrical_load_100RE()` → `ElectricalLoadAggregationValidation` + `ElectricalLoadSectorBreakdown` + - **Total load validation:** `ElectricalLoadAggregationValidation` validates annual load sum (TWh) for all scenarios + - Also checks max/min load (GW) - more comprehensive than original + - Leverages existing custom validation rule from egon-validation library + - **Sector breakdown validation:** `ElectricalLoadSectorBreakdown` validates eGon100RE by sector (new class!) + - Residential: 90.4 TWh expected (from household_curves table) + - Commercial: 146.7 TWh expected (from cts_curves table) + - Industrial: 382.9 TWh expected (from osm_curves + sites_curves tables) + - Total: 620.0 TWh expected (from etrago AC loads) + - Validates each sector independently with 1% tolerance + - Queries source tables directly matching original implementation + - **Updated:** Now provides full sector granularity as in original (sanity_checks.py:2676-2784) + +### ✅ Heat Demand +- Heat demand validation (from `etrago_eGon2035_heat()`) → `HeatDemandValidation` (new class!) 
+ - Validates annual heat demand (rural_heat + central_heat) against peta_heat reference + - Compares timeseries sum vs expected demand + - eGon2035 scenario + +--- + +## Migration Status Summary + +### ✅ All Core Validations Migrated + +All core sanity checks have been successfully migrated to the new validation framework, including: +- Residential electricity (annual sum, household refinement) +- CTS demand (electricity and heat shares) +- Home batteries aggregation +- Gas infrastructure (stores, buses, grid, links, loads, generators) +- Electricity capacity (eGon2035 and eGon100RE generators, storage) +- Heat capacity (heat pumps, resistive heaters, solar thermal, geothermal) +- Timeseries length validation +- Electrical load aggregation +- Heat demand validation + +### Deferred Validations (Require Dataset-Inline Implementation) + +The following sanity checks require dataset-inline validation due to their complexity and cannot be easily migrated to standalone validation rules: + +**Reason for Deferral: Complex with External Dependencies** +1. **`sanitycheck_pv_rooftop_buildings()`** + - Creates matplotlib/seaborn visualizations + - Loads external building data via `load_building_data()` + - Has dataset-boundary-specific logic (Schleswig-Holstein special cases) + - Reads from Excel files for certain scenarios + - **Migration approach**: Implement as dataset-inline validation in the PV rooftop dataset + +2. **`sanitycheck_emobility_mit()`** + - Multiple sub-checks (EV allocation, trip data, model components) + - Uses ORM queries with session scopes + - Depends on SimBEV metadata files + - Has testmode conditional logic + - **Migration approach**: Implement as dataset-inline validation in the e-mobility dataset + +3. 
**`heat_gas_load_egon100RE()`** + - Only prints comparison table (no assertions/validations) + - Reads from pypsa_eur network data + - No actual validation logic to migrate + - **Migration approach**: Keep as reporting function or convert to validation with assertions + +**Reason for Deferral: Uses External Calculation Functions** +4. **`etrago_eGon2035_gas_abroad()`** + - Uses external calculation functions from gas_neighbours module + - Requires dataset-specific context + - **Migration approach**: Implement as dataset-inline validation in the gas grid dataset + +5. **`sanitycheck_dsm()`** + - Complex aggregation logic with multiple steps + - Dataset-specific calculations + - **Migration approach**: Implement as dataset-inline validation in the DSM dataset + +--- + +## Directory Structure + +``` +egon-data/src/egon/data/ +├── datasets/ +│ ├── sanity_checks.py # ⚠️ Old sanity checks (kept for deferred validations) +│ ├── final_validations.py # ✅ Cross-cutting validations +│ └── ... +└── validation/ + └── rules/ + └── custom/ + └── sanity/ + ├── __init__.py # ✅ Exports all sanity validation classes + ├── residential_electricity.py # ✅ Migrated (2 rules) + ├── cts_demand.py # ✅ Migrated (2 rules) + ├── home_batteries.py # ✅ Migrated (1 rule) + ├── gas_stores.py # ✅ Migrated (2 rules: CH4, H2 saltcavern) + ├── gas_grid.py # ✅ Migrated (5 rules: buses, one-port, CH4 grid, links) + ├── gas_loads_generators.py # ✅ Migrated (2 rules: loads, generators) + ├── electricity_capacity.py # ✅ Migrated (reusable class for capacity comparison) + ├── electrical_load_sectors.py # ✅ Migrated (1 rule: sector breakdown) + └── heat_demand.py # ✅ Migrated (1 rule) + +egon-validation/egon_validation/rules/ +├── formal/ +│ └── array_cardinality_check.py # ✅ Reused for timeseries length validation +└── custom/ + └── numeric_aggregation_check.py # ✅ Reused for electrical load aggregation +``` + +--- + +## Migration Statistics + +**Total sanity checks in original `sanity_checks.py`**: 21 
functions + +**Successfully migrated**: 16 functions (76%) +- Converted to **65 individual validation rules** across multiple categories +- Organized into **9 custom validation modules** +- Reused **2 existing validation classes** from egon-validation + +**Deferred (require dataset-inline implementation)**: 5 functions (24%) +- 3 complex validations with external dependencies +- 2 validations requiring external calculation functions + +**Validation rules by category**: +- Electricity capacity: 10 rules (eGon2035) +- Heat capacity: 5 rules (eGon2035) +- eGon100RE capacity: 23 rules (13 generators, 9 links, 1 storage) +- Gas infrastructure: 11 rules +- Demand validation: 4 rules +- Timeseries: 24 rules (all array columns across 5 component types) +- Home batteries: 1 rule +- Electrical load: 2 rules (total aggregation + sector breakdown) +- Heat demand: 1 rule + +**Recent Updates (2025-12-30)**: +- ✅ **Timeseries validation coverage expanded**: 8 → 24 array columns (now matches original dynamic discovery) +- ✅ **Electrical load sector breakdown implemented**: Added granular validation by sector (residential, commercial, industrial) + +--- + +## Testing Your Migration + +1. **Add validation to a dataset:** +```python +validation={ + "data_quality": [ + YourNewRule( + table="schema.table", + rule_id="SANITY_YOUR_CHECK", + param1=value1 + ) + ] +} +``` + +2. **Run the dataset:** +```bash +airflow tasks test your_dag your_dataset_task execution_date +``` + +3. **Check validation results:** +```bash +ls validation_runs/{run_id}/tasks/{dataset}.validate.data_quality/{rule_id}/ +cat validation_runs/{run_id}/tasks/{dataset}.validate.data_quality/{rule_id}/results.jsonl +``` + +4. **View the validation report:** +```bash +open validation_runs/{run_id}/final/report.html +``` + +--- + +## Best Practices + +1. **One rule class per check** - Keep rules focused and reusable +2. **Use descriptive rule_ids** - Follow pattern `SANITY_{CATEGORY}_{CHECK_NAME}` +3. 
**Set appropriate tolerances** - Document why you chose specific `rtol` values +4. **Provide clear messages** - Include context in success/failure messages +5. **Return observed/expected values** - Helps with debugging failures +6. **Override `kind = "sanity"`** - Ensures rules are categorized correctly + +--- + +## Getting Help + +- See implemented examples in `egon/data/validation/rules/custom/sanity/` +- Check egon-validation documentation for `DataFrameRule` API +- Ask in the team channel for migration assistance + +--- + +## Summary and Next Steps + +### ✅ Completed Work + +The sanity checks migration is **76% complete** with all core validations successfully migrated to the new framework: + +1. **9 custom validation modules** created in `egon/data/validation/rules/custom/sanity/` +2. **65 individual validation rules** implemented across all major categories +3. **Reused 2 existing validation classes** from egon-validation library (code reuse > new code) +4. **Fixed 4 RuleResult 'details' parameter errors** by moving violation data to message field +5. **Integrated validations** into `FinalValidations` dataset for cross-cutting checks +6. **Full timeseries coverage** - All 24 array columns validated (matches original dynamic discovery) +7. **Sector breakdown validation** - Electrical load validated by sector (residential, commercial, industrial) + +### 🔄 Remaining Work + +5 sanity check functions (24%) are deferred for dataset-inline implementation: + +**High Priority** (complex with external dependencies): +1. `sanitycheck_pv_rooftop_buildings()` - Implement in PV rooftop dataset +2. `sanitycheck_emobility_mit()` - Implement in e-mobility dataset +3. `heat_gas_load_egon100RE()` - Add assertions or keep as reporting function + +**Medium Priority** (use external calculation functions): +4. `etrago_eGon2035_gas_abroad()` - Implement in gas grid dataset +5. 
`sanitycheck_dsm()` - Implement in DSM dataset + +### 🎯 Recommended Approach for Deferred Validations + +For each deferred validation: +1. Add inline `validation={}` dict to the relevant Dataset class +2. Create custom validation rules that can access dataset-specific functions +3. Use the same pattern as migrated validations (SqlRule or DataFrameRule) +4. Ensure validations run after dataset tasks complete + +### 📊 Impact + +- **Better error reporting**: Structured validation results with observed/expected values +- **Consistent framework**: All validations follow the same pattern +- **Parallel execution**: Validations can run concurrently +- **Automated reports**: HTML reports generated from all validation results +- **Code reuse**: Leveraged existing validation classes where possible diff --git a/pyproject.toml b/pyproject.toml index d36887230..6a98602b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,6 +47,7 @@ dependencies = [ "cdsapi", "click<8.1", "disaggregator @ git+https://github.com/openego/disaggregator.git@features/update-cache-directory#egg=disaggregator", + "egon-validation @ git+https://github.com/sagemaso/eGon-validation.git@v1.2.1", "entsoe-py>=0.6.2", "fiona==1.9.6", "Flask-Session<0.6.0", diff --git a/src/egon/data/airflow/dags/pipeline.py b/src/egon/data/airflow/dags/pipeline.py index e9b87ea94..0b2a55bb0 100755 --- a/src/egon/data/airflow/dags/pipeline.py +++ b/src/egon/data/airflow/dags/pipeline.py @@ -102,6 +102,9 @@ from egon.data.datasets.zensus_vg250 import ZensusVg250 from egon.data.metadata import Json_Metadata +from egon.data.datasets.validation_report import ValidationReport +from egon.data.datasets.final_validations import FinalValidations + # Set number of threads used by numpy and pandas set_numexpr_threads() @@ -730,6 +733,32 @@ ] ) + with TaskGroup(group_id="final_validations") as final_validations_group: + # Cross-cutting validations that check data consistency across datasets + # These run after all data generation but 
before the validation report + final_validations = FinalValidations( + dependencies=[ + insert_data_ch4_storages, # CH4Storages - for CH4 store validation + insert_H2_storage, # HydrogenStoreEtrago - for H2 saltcavern validation + storage_etrago, # StorageEtrago - general storage validation + hts_etrago_table, + fill_etrago_generators, + household_electricity_demand_annual, + cts_demand_buildings, + emobility_mit, + low_flex_scenario, + ] + ) + + with TaskGroup(group_id="validation_report") as validation_report_group: + # Generate validation report from all validation tasks + # Runs after all validations (including final_validations) are complete + validation_report = ValidationReport( + dependencies=[ + final_validations, # Wait for final validations + ] + ) + with TaskGroup(group_id="sanity_checks") as sanity_checks_group: # ########## Keep this dataset at the end # Sanity Checks diff --git a/src/egon/data/datasets/DSM_cts_ind.py b/src/egon/data/datasets/DSM_cts_ind.py index a3025968a..9b2e86bdf 100644 --- a/src/egon/data/datasets/DSM_cts_ind.py +++ b/src/egon/data/datasets/DSM_cts_ind.py @@ -32,6 +32,10 @@ sources, ) +from egon_validation import ( + ArrayCardinalityValidation +) + # CONSTANTS # TODO: move to datasets.yml CON = db.engine() @@ -142,6 +146,35 @@ def __init__(self, dependencies): version=self.version, dependencies=dependencies, tasks=(dsm_cts_ind_processing,), + validation={ + "data-quality": [ + ArrayCardinalityValidation( + table="demand.egon_demandregio_sites_ind_electricity_dsm_timeseries", + rule_id="ARRAY_VALIDATION.egon_demandregio_sites_ind_electricity_dsm_timeseries", + array_column="p_set", + expected_length=8760, + ), + ArrayCardinalityValidation( + table="demand.egon_etrago_electricity_cts_dsm_timeseries", + rule_id="ARRAY_VALIDATION.egon_etrago_electricity_cts_dsm_timeseries", + array_column="p_set", + expected_length=8760, + ), + ArrayCardinalityValidation( + table="demand.egon_osm_ind_load_curves_individual_dsm_timeseries", + 
rule_id="ARRAY_VALIDATION.egon_osm_ind_load_curves_individual_dsm_timeseries", + array_column="p_set", + expected_length=8760, + ), + ArrayCardinalityValidation( + table="demand.egon_sites_ind_load_curves_individual_dsm_timeseries", + rule_id="ARRAY_VALIDATION.egon_sites_ind_load_curves_individual_dsm_timeseries", + array_column="p_set", + expected_length=8760, + ), + ] + }, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/__init__.py b/src/egon/data/datasets/__init__.py index d65339d01..d64573060 100644 --- a/src/egon/data/datasets/__init__.py +++ b/src/egon/data/datasets/__init__.py @@ -3,7 +3,7 @@ from __future__ import annotations from collections import abc -from dataclasses import dataclass +from dataclasses import dataclass, field from functools import partial, reduce, update_wrapper from typing import Callable, Iterable, Set, Tuple, Union import re @@ -12,9 +12,17 @@ from airflow.operators.python import PythonOperator from sqlalchemy import Column, ForeignKey, Integer, String, Table, orm, tuple_ from sqlalchemy.ext.declarative import declarative_base +from typing import Dict, List +from egon.data.validation import create_validation_tasks from egon.data import config, db, logger +try: + from egon_validation.rules.base import Rule +except ImportError: + Rule = None # Type hint only + + Base = declarative_base() SCHEMA = "metadata" @@ -197,6 +205,8 @@ class Dataset: #: The tasks of this :class:`Dataset`. A :class:`TaskGraph` will #: automatically be converted to :class:`Tasks_`. 
tasks: Tasks = () + validation: Dict[str, List] = field(default_factory=dict) + on_validation_failure: str = "continue" def check_version(self, after_execution=()): scenario_names = config.settings()["egon-data"]["--scenarios"] @@ -264,6 +274,27 @@ def __post_init__(self): self.dependencies = list(self.dependencies) if not isinstance(self.tasks, Tasks_): self.tasks = Tasks_(self.tasks) + # Process validation configuration + if self.validation: + validation_tasks = create_validation_tasks( + validation_dict=self.validation, + dataset_name=self.name, + on_failure=self.on_validation_failure + ) + + # Append validation tasks to existing tasks + if validation_tasks: + if hasattr(self.tasks, 'graph'): + graph = self.tasks.graph + else: + graph = self.tasks + if isinstance(graph, (tuple, set, list)): + task_list = list(graph) + else: + task_list = [graph] + task_list.extend(validation_tasks) + self.tasks = Tasks_(tuple(task_list)) + if len(self.tasks.last) > 1: # Explicitly create single final task, because we can't know # which of the multiple tasks finishes last. 
@@ -302,3 +333,33 @@ def __post_init__(self): for p in predecessors: for first in self.tasks.first: p.set_downstream(first) + + # Link validation tasks to run after data tasks + if self.validation and validation_tasks: + # Get last non-validation tasks + non_validation_task_ids = [ + task.task_id for task in self.tasks.values() + if not any( + task.task_id.endswith(f".validate.{name}") + for name in self.validation.keys() + ) + ] + + last_data_tasks = [ + task for task in self.tasks.values() + if task.task_id in non_validation_task_ids + and task in self.tasks.last + ] + + if not last_data_tasks: + # Fallback to last non-validation task + last_data_tasks = [ + task for task in self.tasks.values() + if task.task_id in non_validation_task_ids + ][-1:] + + # Link each validation task downstream of last data tasks + for validation_task in validation_tasks: + for last_task in last_data_tasks: + last_task.set_downstream(validation_task) + diff --git a/src/egon/data/datasets/chp/__init__.py b/src/egon/data/datasets/chp/__init__.py index ac51ff881..066a6d99a 100644 --- a/src/egon/data/datasets/chp/__init__.py +++ b/src/egon/data/datasets/chp/__init__.py @@ -47,6 +47,15 @@ sources, ) +from egon_validation import ( + RowCountValidation, + DataTypeValidation, + NotNullAndNotNaNValidation, + WholeTableNotNullAndNotNaNValidation, + ValueSetValidation, + SRIDUniqueNonZero +) + Base = declarative_base() @@ -853,4 +862,82 @@ def __init__(self, dependencies): version=self.version, dependencies=dependencies, tasks=tasks, + validation={ + "data-quality": [ + RowCountValidation( + table="supply.egon_chp_plants", + rule_id="ROW_COUNT.egon_chp_plants", + expected_count={ + "Schleswig-Holstein": 1720, + "Everything": 40197 + } + ), + DataTypeValidation( + table="supply.egon_chp_plants", + rule_id="DATA_TYPES.egon_chp_plants", + column_types={ + "id": "integer", + "sources": "jsonb", + "source_id": "jsonb", + "carrier": "character varying", + "district_heating": "boolean", + 
"el_capacity": "double precision", + "th_capacity": "double precision", + "electrical_bus_id": "integer", + "district_heating_area_id": "integer", + "ch4_bus_id": "integer", + "voltage_level": "integer", + "scenario": "character varying", + "geom": "geometry" + } + ), + NotNullAndNotNaNValidation( + table="supply.egon_chp_plants", + rule_id="NOT_NAN.egon_chp_plants", + columns=[ + "id", + "sources", + "source_id", + "carrier", + "district_heating", + "el_capacity", + "th_capacity", + "electrical_bus_id", + "district_heating_area_id", + "ch4_bus_id", + "voltage_level", + "scenario", + "geom" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="supply.egon_chp_plants", + rule_id="TABLE_NOT_NAN.egon_chp_plants" + ), + ValueSetValidation( + table="supply.egon_chp_plants", + rule_id="VALUE_SET_VALIDATION_CARRIER.egon_chp_plants", + column="carrier", + expected_values=[ + "oil", + "others", + "gas", + "gas extended", + "biomass" + ] + ), + ValueSetValidation( + table="supply.egon_chp_plants", + rule_id="VALUE_SET_VALIDATION_SCENARIO.egon_chp_plants", + column="scenario", + expected_values=["eGon2035", "eGon100RE"] + ), + SRIDUniqueNonZero( + table="supply.egon_chp_plants", + rule_id="SRIDUniqueNonZero.egon_chp_plants", + column="geom" + ) + ] + }, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/demandregio/__init__.py b/src/egon/data/datasets/demandregio/__init__.py index 479492ceb..efffd571f 100644 --- a/src/egon/data/datasets/demandregio/__init__.py +++ b/src/egon/data/datasets/demandregio/__init__.py @@ -20,6 +20,13 @@ ) import egon.data.config import egon.data.datasets.scenario_parameters.parameters as scenario_parameters +from egon_validation import ( + RowCountValidation, + DataTypeValidation, + WholeTableNotNullAndNotNaNValidation, + ValueSetValidation, + ArrayCardinalityValidation +) try: from disaggregator import config, data, spatial, temporal @@ -87,6 +94,68 @@ def __init__(self, dependencies): insert_cts_ind_demands, }, ), + 
validation={ + "data_quality": [ + RowCountValidation( + table=" demand.egon_demandregio_hh", + rule_id="ROW_COUNT.egon_demandregio_hh", + expected_count={ + "Schleswig-Holstein": 180, + "everything": 7218 + } + ), + DataTypeValidation( + table="demand.egon_demandregio_hh", + rule_id="DATA_MULTIPLE_TYPES.egon_demandregio_hh", + column_types={"nuts3": "character varying", + "hh_size": "integer", + "scenario": "character varying", + "year": "integer", + "demand": "double precision" + } + ), + WholeTableNotNullAndNotNaNValidation( + table="demand.egon_demandregio_hh", + rule_id="WHOLE_TABLE_NOT_NAN.egon_demandregio_hh" + ), + ValueSetValidation( + table="demand.egon_demandregio_hh", + rule_id="VALUE_SET_VALIDATION_SCENARIO.egon_demandregio_hh", + column="scenario", + expected_values=["eGon2035", "eGon100RE", "eGon2021"] + ), + RowCountValidation( + table=" demand.egon_demandregio_wz", + rule_id="ROW_COUNT.egon_demandregio_wz", + expected_count=87 + ), + DataTypeValidation( + table="demand.egon_demandregio_wz", + rule_id="DATA_MULTIPLE_TYPES.egon_demandregio_wz", + column_types={"wz": "integer", + "sector": "character varying", + "definition": "character varying" + } + ), + WholeTableNotNullAndNotNaNValidation( + table="demand.egon_demandregio_wz", + rule_id="WHOLE_TABLE_NOT_NAN.egon_demandregio_wz" + ), + ValueSetValidation( + table="demand.egon_demandregio_wz", + rule_id="VALUE_SET_VALIDATION_SECTOR.egon_demandregio_wz", + column="sector", + expected_values=["industry", "CTS"] + ), + ArrayCardinalityValidation( + table="demand.egon_demandregio_sites_ind_electricity_dsm_timeseries", + rule_id="ARRAY_VALIDATION.egon_demandregio_sites_ind_electricity_dsm_timeseries", + array_column="load_curve", + expected_length=8760, + ) + ] + }, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/district_heating_areas/__init__.py b/src/egon/data/datasets/district_heating_areas/__init__.py index df347bdbb..5f8ca856a 100644 --- 
a/src/egon/data/datasets/district_heating_areas/__init__.py +++ b/src/egon/data/datasets/district_heating_areas/__init__.py @@ -40,6 +40,14 @@ ) from egon.data.metadata import context, license_ccby, meta_metadata, sources +from egon_validation import ( + RowCountValidation, + DataTypeValidation, + WholeTableNotNullAndNotNaNValidation, + ValueSetValidation, + SRIDUniqueNonZero +) + # import time @@ -82,6 +90,45 @@ def __init__(self, dependencies): version=self.version, # maybe rethink the naming dependencies=dependencies, tasks=(create_tables, demarcation), + validation={ + "data_quality": [ + RowCountValidation( + table=" demand.egon_district_heating_areas", + rule_id="ROW_COUNT.egon_district_heating_areas", + expected_count={ + "Schleswig-Holstein": 100, + "Everything": 6335 + } + ), + DataTypeValidation( + table="demand.egon_district_heating_areas", + rule_id="DATA_MULTIPLE_TYPES.egon_district_heating_areas", + column_types={ + "id": "integer", + "area_id": "integer", + "scenario": "character varying", + "geom_polygon": "geometry", + "residential_and_service_demand": "double precision" + } + ), + WholeTableNotNullAndNotNaNValidation( + table="demand.egon_district_heating_areas", + rule_id="WHOLE_TABLE_NOT_NAN.egon_district_heating_areas" + ), + ValueSetValidation( + table="demand.egon_district_heating_areas", + rule_id="VALUE_SET_VALIDATION_SCENARIO.egon_district_heating_areas", + column="scenario", + expected_values=["eGon2035", "eGon100RE"] + ), + SRIDUniqueNonZero( + table="demand.egon_district_heating_areas", + rule_id="SRIDUniqueNonZero.egon_district_heating_areas", + column="geom_polygon" + ), + ] + }, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/electricity_demand/__init__.py b/src/egon/data/datasets/electricity_demand/__init__.py index f6ef464d5..f9a630f39 100644 --- a/src/egon/data/datasets/electricity_demand/__init__.py +++ b/src/egon/data/datasets/electricity_demand/__init__.py @@ -10,6 +10,17 @@ from egon.data import db 
from egon.data.datasets import Dataset from egon.data.datasets.electricity_demand.temporal import insert_cts_load +from egon.data.validation.rules.custom.sanity import ( + ResidentialElectricityAnnualSum, + ResidentialElectricityHhRefinement, +) +from egon_validation import ( + RowCountValidation, + DataTypeValidation, + WholeTableNotNullAndNotNaNValidation, + ValueSetValidation +) + from egon.data.datasets.electricity_demand_timeseries.hh_buildings import ( HouseholdElectricityProfilesOfBuildings, get_iee_hh_demand_profiles_raw, @@ -53,6 +64,55 @@ def __init__(self, dependencies): version=self.version, dependencies=dependencies, tasks=(create_tables, get_annual_household_el_demand_cells), + validation={ + "data_quality": [ + ResidentialElectricityAnnualSum( + table="demand.egon_demandregio_zensus_electricity", + rule_id="SANITY_RESIDENTIAL_ELECTRICITY_ANNUAL_SUM", + rtol=0.005 + ), + ResidentialElectricityHhRefinement( + table="society.egon_destatis_zensus_household_per_ha_refined", + rule_id="SANITY_RESIDENTIAL_HH_REFINEMENT", + rtol=1e-5 + ), + RowCountValidation( + table=" demand.egon_demandregio_zensus_electricity", + rule_id="ROW_COUNT.egon_demandregio_zensus_electricity", + expected_count={ + "Schleswig-Holstein": 154527, + "Everything": 7355160 + } + ), + DataTypeValidation( + table="demand.egon_demandregio_zensus_electricity", + rule_id="DATA_MULTIPLE_TYPES.egon_demandregio_zensus_electricity", + column_types={ + "zensus_population_id": "integer", + "scenario": "character varying", + "sector": "character varying", + "demand": "double precision" + } + ), + WholeTableNotNullAndNotNaNValidation( + table="demand.egon_demandregio_zensus_electricity", + rule_id="WHOLE_TABLE_NOT_NAN.egon_demandregio_zensus_electricity" + ), + ValueSetValidation( + table="demand.egon_demandregio_zensus_electricity", + rule_id="VALUE_SET_VALIDATION_SCENARIO.egon_demandregio_zensus_electricity", + column="scenario", + expected_values=["eGon2035", "eGon100RE"] + ), + 
ValueSetValidation( + table="demand.egon_demandregio_zensus_electricity", + rule_id="VALUE_SET_VALIDATION_SECTOR.egon_demandregio_zensus_electricity", + column="sector", + expected_values=["residential", "service"] + ), + ] + }, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/electricity_demand_timeseries/hh_buildings.py b/src/egon/data/datasets/electricity_demand_timeseries/hh_buildings.py index 6de5a5b74..d8cc2621f 100755 --- a/src/egon/data/datasets/electricity_demand_timeseries/hh_buildings.py +++ b/src/egon/data/datasets/electricity_demand_timeseries/hh_buildings.py @@ -23,6 +23,12 @@ random_point_in_square, ) import egon.data.config +from egon_validation import ( + RowCountValidation, + DataTypeValidation, + WholeTableNotNullAndNotNaNValidation, + ValueSetValidation +) engine = db.engine() Base = declarative_base() @@ -1232,4 +1238,99 @@ def __init__(self, dependencies): version=self.version, dependencies=dependencies, tasks=self.tasks, + validation={ + "data_quality": [ + RowCountValidation( + table=" demand.egon_building_electricity_peak_loads", + rule_id="ROW_COUNT.egon_building_electricity_peak_loads", + expected_count={ + "Schleswig-Holstein": 3054820, + "Everything": 44683620 + } + ), + DataTypeValidation( + table="demand.egon_building_electricity_peak_loads", + rule_id="DATA_MULTIPLE_TYPES.egon_building_electricity_peak_loads", + column_types={ + "building_id": "integer", + "scenario": "character varying", + "sector": "character varying", + "peak_load_in_w": "real", + "voltage_level": "integer" + } + ), + WholeTableNotNullAndNotNaNValidation( + table="demand.egon_building_electricity_peak_loads", + rule_id="WHOLE_TABLE_NOT_NAN.egon_building_electricity_peak_loads" + ), + ValueSetValidation( + table="demand.egon_building_electricity_peak_loads", + rule_id="VALUE_SET_VALIDATION_SCENARIO.egon_building_electricity_peak_loads", + column="scenario", + expected_values=["eGon2035", "eGon100RE"] + ), + ValueSetValidation( + 
table="demand.egon_building_electricity_peak_loads", + rule_id="VALUE_SET_VALIDATION_SECTOR.egon_building_electricity_peak_loads", + column="sector", + expected_values=["cts", "residential"] + ), + RowCountValidation( + table=" demand.egon_building_heat_peak_loads", + rule_id="ROW_COUNT.egon_building_heat_peak_loads", + expected_count={ + "Schleswig-Holstein": 732905, + "Everything": 42128819 + } + ), + DataTypeValidation( + table="demand.egon_building_heat_peak_loads", + rule_id="DATA_MULTIPLE_TYPES.egon_building_heat_peak_loads", + column_types={ + "building_id": "integer", + "scenario": "character varying", + "sector": "character varying", + "peak_load_in_w": "real" + } + ), + WholeTableNotNullAndNotNaNValidation( + table="demand.egon_building_heat_peak_loads", + rule_id="WHOLE_TABLE_NOT_NAN.egon_building_heat_peak_loads" + ), + ValueSetValidation( + table="demand.egon_building_heat_peak_loads", + rule_id="VALUE_SET_VALIDATION_SCENARIO.egon_building_heat_peak_loads", + column="scenario", + expected_values=["eGon2035", "eGon100RE"] + ), + ValueSetValidation( + table="demand.egon_building_heat_peak_loads", + rule_id="VALUE_SET_VALIDATION_SECTOR.egon_building_heat_peak_loads", + column="sector", + expected_values=["residential+cts"] + ), + RowCountValidation( + table=" demand.egon_household_electricity_profile_of_buildings", + rule_id="ROW_COUNT.egon_household_electricity_profile_of_buildings", + expected_count={ + "Schleswig-Holstein": 1371592, + "Everything": 38605221 + } + ), + DataTypeValidation( + table="demand.egon_household_electricity_profile_of_buildings", + rule_id="DATA_MULTIPLE_TYPES.egon_household_electricity_profile_of_buildings", + column_types={ + "id": "integer", + "building_id": "integer", + "cell_id": "integer", + "profile_id": "character varying"} + ), + WholeTableNotNullAndNotNaNValidation( + table="demand.egon_household_electricity_profile_of_buildings", + rule_id="WHOLE_TABLE_NOT_NAN.egon_household_electricity_profile_of_buildings" + ), + ] + 
}, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/electricity_demand_timeseries/hh_profiles.py b/src/egon/data/datasets/electricity_demand_timeseries/hh_profiles.py index 7d613be6c..7bb5ecb84 100644 --- a/src/egon/data/datasets/electricity_demand_timeseries/hh_profiles.py +++ b/src/egon/data/datasets/electricity_demand_timeseries/hh_profiles.py @@ -27,6 +27,13 @@ from egon.data.datasets.zensus_mv_grid_districts import MapZensusGridDistricts import egon.data.config +from egon_validation import ( + RowCountValidation, + DataTypeValidation, + WholeTableNotNullAndNotNaNValidation, + ValueSetValidation +) + Base = declarative_base() engine = db.engine() @@ -300,6 +307,70 @@ def __init__(self, dependencies): version=self.version, dependencies=dependencies, tasks=tasks, + validation={ + "data_quality": [ + RowCountValidation( + table=" demand.egon_household_electricity_profile_in_census_cell", + rule_id="ROW_COUNT.egon_household_electricity_profile_in_census_cell", + expected_count={ + "Schleswig-Holstein": 143521, + "Everything": 3177723 + } + ), + DataTypeValidation( + table="demand.egon_household_electricity_profile_in_census_cell", + rule_id="DATA_MULTIPLE_TYPES.egon_household_electricity_profile_in_census_cell", + column_types={ + "Schleswig-Holstein":{ + "cell_id": "integer", + "grid_id": "character varying", + "cell_profile_ids": "character varying", + "nuts3": "character varying", + "nuts1": "character varying", + "factor_2019": "double precision", + "factor_2023": "double precision", + "factor_2035": "double precision", + "factor_2050": "double precision" + }, + "Everything":{ + "cell_id": "integer", + "grid_id": "character varying", + "cell_profile_ids": "character varying", + "nuts3": "character varying", + "nuts1": "character varying", + "factor_2035": "double precision", + "factor_2050": "double precision" + } + } + ), + WholeTableNotNullAndNotNaNValidation( + table="demand.egon_household_electricity_profile_in_census_cell", + 
rule_id="WHOLE_TABLE_NOT_NAN.egon_household_electricity_profile_in_census_cell" + ), + RowCountValidation( + table="demand.iee_household_load_profiles", + rule_id="ROW_COUNT.iee_household_load_profiles", + expected_count={ + "Schleswig-Holstein": 2511, + "Everything": 1000000 + } + ), + DataTypeValidation( + table="demand.iee_household_load_profiles", + rule_id="DATA_MULTIPLE_TYPES.iee_household_load_profiles", + column_types={ + "id": "integer", + "type": "character", + "load_in_wh": "real[]" + } + ), + WholeTableNotNullAndNotNaNValidation( + table="demand.iee_household_load_profiles", + rule_id="WHOLE_TABLE_NOT_NAN.iee_household_load_profiles" + ) + ] + }, + on_validation_failure="continue" ) @@ -1583,7 +1654,7 @@ def houseprofiles_in_census_cells(): """ Allocate household electricity demand profiles for each census cell. - Creates table `emand.egon_household_electricity_profile_in_census_cell` that maps + Creates table `demand.egon_household_electricity_profile_in_census_cell` that maps household electricity demand profiles to census cells. Each row represents one cell and contains a list of profile IDs. 
This table is fundamental for creating subsequent data like demand profiles on MV grid level or for diff --git a/src/egon/data/datasets/emobility/motorized_individual_travel/__init__.py b/src/egon/data/datasets/emobility/motorized_individual_travel/__init__.py index 072a3e342..d772617d4 100644 --- a/src/egon/data/datasets/emobility/motorized_individual_travel/__init__.py +++ b/src/egon/data/datasets/emobility/motorized_individual_travel/__init__.py @@ -56,6 +56,13 @@ read_simbev_metadata_file, ) +from egon_validation import ( + RowCountValidation, + DataTypeValidation, + WholeTableNotNullAndNotNaNValidation, + ValueSetValidation +) + # ========== Register np datatypes with SQLA ========== def adapt_numpy_float64(numpy_float64): @@ -490,4 +497,288 @@ def generate_model_data_tasks(scenario_name): version=self.version, dependencies=dependencies, tasks=tasks, + validation={ + "data_quality": [ + RowCountValidation( + table=" demand.egon_ev_count_municipality", + rule_id="ROW_COUNT.egon_ev_count_municipality", + expected_count={ + "Schleswig-Holstein": 1108, + "Everything": 44012 + } + ), + DataTypeValidation( + table="demand.egon_ev_count_municipality", + rule_id="DATA_MULTIPLE_TYPES.egon_ev_count_municipality", + column_types={ + "scenario": "character varying", + "scenario_variation": "character varying", + "ags": "integer", + "bev_mini": "integer", + "bev_medium": "integer", + "bev_luxury": "integer", + "phev_mini": "integer", + "phev_medium": "integer", + "phev_luxury": "integer", + "rs7_id": "smallint" + } + ), + WholeTableNotNullAndNotNaNValidation( + table="demand.egon_ev_count_municipality", + rule_id="WHOLE_TABLE_NOT_NAN.egon_ev_count_municipality" + ), + ValueSetValidation( + table="demand.egon_ev_count_municipality", + rule_id="VALUE_SET_VALIDATION_SCENARIO.egon_ev_count_municipality", + column="scenario", + expected_values=["eGon2035", "eGon100RE"] + ), + ValueSetValidation( + table="demand.egon_ev_count_municipality", + 
rule_id="VALUE_SET_VALIDATION_SCENARIO_VARIATION.egon_ev_count_municipality", + column="scenario_variation", + expected_values=[ + "Mobility Transition 2050", + "NEP C 2035", + "Electrification 2050", + "Reference 2050" + ] + ), + RowCountValidation( + table=" demand.egon_ev_count_mv_grid_district", + rule_id="ROW_COUNT.egon_ev_count_mv_grid_district", + expected_count={ + "Schleswig-Holstein": 199, + "Everything": 15348 + } + ), + DataTypeValidation( + table="demand.egon_ev_count_mv_grid_district", + rule_id="DATA_MULTIPLE_TYPES.egon_ev_count_mv_grid_district", + column_types={ + "scenario": "character varying", + "scenario_variation": "character varying", + "bus_id": "integer", + "bev_mini": "integer", + "bev_medium": "integer", + "bev_luxury": "integer", + "phev_mini": "integer", + "phev_medium": "integer", + "phev_luxury": "integer", + "rs7_id": "smallint" + } + ), + WholeTableNotNullAndNotNaNValidation( + table="demand.egon_ev_count_mv_grid_district", + rule_id="WHOLE_TABLE_NOT_NAN.egon_ev_count_mv_grid_district" + ), + ValueSetValidation( + table="demand.egon_ev_count_mv_grid_district", + rule_id="VALUE_SET_VALIDATION_SCENARIO.egon_ev_count_mv_grid_district", + column="scenario", + expected_values=["eGon2035", "eGon100RE"] + ), + ValueSetValidation( + table="demand.egon_ev_count_mv_grid_district", + rule_id="VALUE_SET_VALIDATION_SCENARIO_VARIATION.egon_ev_count_mv_grid_district", + column="scenario_variation", + expected_values=[ + "Mobility Transition 2050", + "NEP C 2035", + "Electrification 2050", + "Reference 2050" + ] + ), + RowCountValidation( + table=" demand.egon_ev_count_registration_district", + rule_id="ROW_COUNT.egon_ev_count_registration_district", + expected_count={ + "Schleswig-Holstein": 400, + "Everything": 1600 + } + ), + DataTypeValidation( + table="demand.egon_ev_count_registration_district", + rule_id="DATA_MULTIPLE_TYPES.egon_ev_count_registration_district", + column_types={ + "scenario": "character varying", + "scenario_variation": 
"character varying", + "ags_reg_district": "integer", + "reg_district": "character varying", + "bev_mini": "integer", + "bev_medium": "integer", + "bev_luxury": "integer", + "phev_mini": "integer", + "phev_medium": "integer", + "phev_luxury": "integer" + } + ), + WholeTableNotNullAndNotNaNValidation( + table="demand.egon_ev_count_registration_district", + rule_id="WHOLE_TABLE_NOT_NAN.egon_ev_count_registration_district" + ), + ValueSetValidation( + table="demand.egon_ev_count_registration_district", + rule_id="VALUE_SET_VALIDATION_SCENARIO.egon_ev_count_registration_district", + column="scenario", + expected_values=["eGon2035", "eGon100RE"] + ), + ValueSetValidation( + table="demand.egon_ev_count_registration_district", + rule_id="VALUE_SET_VALIDATION_SCENARIO_VARIATION.egon_ev_count_registration_district", + column="scenario_variation", + expected_values=[ + "Mobility Transition 2050", + "NEP C 2035", + "Electrification 2050", + "Reference 2050" + ] + ), + RowCountValidation( + table=" demand.egon_ev_mv_grid_district", + rule_id="ROW_COUNT.egon_ev_mv_grid_district", + expected_count={ + "Schleswig-Holstein": 534899, + "Everything": 125609556 + } + ), + DataTypeValidation( + table="demand.egon_ev_mv_grid_district", + rule_id="DATA_MULTIPLE_TYPES.egon_ev_mv_grid_district", + column_types={ + "scenario": "character varying", + "scenario_variation": "character varying", + "bus_id": "integer", + "reg_district": "character varying", + "bev_mini": "integer", + "bev_medium": "integer", + "bev_luxury": "integer", + "phev_mini": "integer", + "phev_medium": "integer", + "phev_luxury": "integer", + "rs7_id": "smallint" + } + ), + WholeTableNotNullAndNotNaNValidation( + table="demand.egon_ev_mv_grid_district", + rule_id="WHOLE_TABLE_NOT_NAN.egon_ev_mv_grid_district" + ), + ValueSetValidation( + table="demand.egon_ev_mv_grid_district", + rule_id="VALUE_SET_VALIDATION_SCENARIO.egon_ev_mv_grid_district", + column="scenario", + expected_values=["eGon2035", "eGon100RE"] + ), + 
ValueSetValidation( + table="demand.egon_ev_mv_grid_district", + rule_id="VALUE_SET_VALIDATION_SCENARIO_VARIATION.egon_ev_mv_grid_district", + column="scenario_variation", + expected_values=[ + "Mobility Transition 2050", + "NEP C 2035", + "Electrification 2050", + "Reference 2050" + ] + ), + RowCountValidation( + table=" demand.egon_ev_pool", + rule_id="ROW_COUNT.egon_ev_pool", + expected_count={ + "Schleswig-Holstein": 7000, + "Everything": 65376 + } + ), + DataTypeValidation( + table="demand.egon_ev_pool", + rule_id="DATA_MULTIPLE_TYPES.egon_ev_pool", + column_types={ + "scenario": "character varying", + "ev_id": "integer", + "rs7_id": "smallint", + "type": "character varying", + "simbev_ev_id": "integer" + } + ), + WholeTableNotNullAndNotNaNValidation( + table="demand.egon_ev_pool", + rule_id="WHOLE_TABLE_NOT_NAN.egon_ev_pool" + ), + ValueSetValidation( + table="demand.egon_ev_pool", + rule_id="VALUE_SET_VALIDATION_SCENARIO.egon_ev_pool", + column="scenario", + expected_values=["eGon2035", "eGon100RE"] + ), + ValueSetValidation( + table="demand.egon_ev_pool", + rule_id="VALUE_SET_VALIDATION_TYPE.egon_ev_pool", + column="type", + expected_values=[ + "bev_mini", + "bev_medium", + "bev_luxury", + "phev_mini", + "phev_medium", + "phev_luxury" + ] + ), + RowCountValidation( + table=" demand.egon_ev_trip", + rule_id="ROW_COUNT.egon_ev_trip", + expected_count={ + "Schleswig-Holstein":11642066, + "Everything": 108342188 + } + ), + DataTypeValidation( + table="demand.egon_ev_trip", + rule_id="DATA_MULTIPLE_TYPES.egon_ev_trip", + column_types={ + "scenario": "character varying", + "event_id": "integer", + "egon_ev_pool_ev_id": "integer", + "simbev_event_id": "integer", + "location": "character varying", + "use_case": "character varying", + "charging_capacity_nominal": "real", + "charging_capacity_grid": "real", + "charging_capacity_battery": "real", + "soc_start": "real", "soc_end": "real", + "charging_demand": "real", + "park_start": "integer", + "park_end": "integer", 
+ "drive_start": "integer", + "drive_end": "integer", + "consumption": "real" + } + ), + WholeTableNotNullAndNotNaNValidation( + table="demand.egon_ev_trip", + rule_id="WHOLE_TABLE_NOT_NAN.egon_ev_trip" + ), + ValueSetValidation( + table="demand.egon_ev_trip", + rule_id="VALUE_SET_VALIDATION_SCENARIO.egon_ev_trip", + column="scenario", + expected_values=["eGon2035", "eGon100RE"] + ), + ValueSetValidation( + table="demand.egon_ev_trip", + rule_id="VALUE_SET_LOCATION.egon_ev_trip", + column="location", + expected_values=[ + "0_work", + "1_business", + "2_school", + "3_shopping", + "4_private/ridesharing", + "5_leisure", + "6_home", + "7_charging_hub", + "driving" + ] + ) + ] + }, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/era5.py b/src/egon/data/datasets/era5.py index baaf3ed0c..1f9e74da9 100644 --- a/src/egon/data/datasets/era5.py +++ b/src/egon/data/datasets/era5.py @@ -16,6 +16,15 @@ from egon.data.datasets.scenario_parameters import get_sector_parameters import egon.data.config +from egon_validation import ( + RowCountValidation, + DataTypeValidation, + NotNullAndNotNaNValidation, + WholeTableNotNullAndNotNaNValidation, + ValueSetValidation, + SRIDUniqueNonZero +) + # will be later imported from another file ### Base = declarative_base() @@ -56,6 +65,44 @@ def __init__(self, dependencies): }, insert_weather_cells, ), # download_era5 should be included once issue #1250 is solved + validation={ + "data_quality": [ + RowCountValidation( + table="supply.egon_era5_weather_cells", + rule_id="ROW_COUNT.egon_era5_weather_cells", + expected_count=29673 + ), + DataTypeValidation( + table="supply.egon_era5_weather_cells", + rule_id="DATA_TYPES.egon_era5_weather_cells", + column_types={ + "w_id": "integer", + "geom": "geometry", + "geom_point": "geometry" + } + ), + NotNullAndNotNaNValidation( + table="supply.egon_era5_weather_cells", + rule_id="NOT_NAN.egon_era5_weather_cells", + columns=["w_id", "geom", "geom_point"] + ), + 
WholeTableNotNullAndNotNaNValidation( + table="supply.egon_era5_weather_cells", + rule_id="TABLE_NOT_NAN.egon_era5_weather_cells" + ), + SRIDUniqueNonZero( + table="supply.egon_era5_weather_cells", + rule_id="SRIDUniqueNonZero.egon_era5_weather_cells", + column="geom" + ), + SRIDUniqueNonZero( + table="supply.egon_era5_weather_cells", + rule_id="SRIDUniqueNonZero_GEOM_POINT.egon_era5_weather_cells", + column="geom_point" + ) + ] + }, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/final_validations.py b/src/egon/data/datasets/final_validations.py new file mode 100644 index 000000000..36fefac83 --- /dev/null +++ b/src/egon/data/datasets/final_validations.py @@ -0,0 +1,2021 @@ +""" +Dataset for cross-cutting validations that run at the end of the pipeline. + +This module provides the FinalValidations dataset which contains validation rules +that check data consistency across multiple datasets. These validations should run +after all data generation is complete, but before the final validation report. +""" + +from egon.data.datasets import Dataset +from egon.data.validation.rules.custom.sanity import ( + CH4StoresCapacity, + H2SaltcavernStoresCapacity, + GasBusesIsolated, + GasBusesCount, + GasOnePortConnections, + CH4GridCapacity, + GasLinksConnections, + GasLoadsCapacity, + GasGeneratorsCapacity, + ElectricityCapacityComparison, + HeatDemandValidation, + ElectricalLoadSectorBreakdown, +) +from egon_validation import ( + ArrayCardinalityValidation, + ElectricalLoadAggregationValidation, + RowCountValidation, + DataTypeValidation, + NotNullAndNotNaNValidation, + WholeTableNotNullAndNotNaNValidation, + ValueSetValidation, + SRIDUniqueNonZero +) + + +def notasks(): + """ + Placeholder task function. + + This dataset has no data generation tasks - it only runs validation rules + defined in the validation dict. The validation framework automatically creates + validation tasks from the rules. 
+ + Returns + ------- + None + """ + return None + + +class FinalValidations(Dataset): + """ + Cross-cutting validations that run at the end of the pipeline. + + This dataset contains validation rules that check data consistency across + multiple datasets and should run after all data generation is complete. + + The validations are organized by category and run automatically as part of + the dataset's validation tasks. Results are collected by ValidationReport. + + *Dependencies* + Should depend on all datasets whose data is validated by the rules + defined here. At minimum: + * CH4Storages - for CH4 store capacity validation + * HydrogenStoreEtrago - for H2 saltcavern store validation + * Add more as you add validation rules + + *Validation Results* + Results are written to validation_runs/{run_id}/tasks/FinalValidations.validate.*/ + and collected by the ValidationReport dataset + + *Adding New Validations* + To add new cross-cutting validations: + 1. Create the validation rule class in validation/rules/custom/sanity/ + 2. Import it at the top of this file + 3. Add instances to the appropriate category in the validation dict below + 4. Update dependencies to include datasets that provide the data being validated + + Example + ------- + To add a new gas grid validation: + + ```python + from egon.data.validation.rules.custom.sanity import CH4GridCapacity + + # In the validation dict: + "gas_stores": [ + # ... existing rules ... 
+ CH4GridCapacity( + table="grid.egon_etrago_link", + rule_id="SANITY_CH4_GRID_CAPACITY", + scenario="eGon2035" + ), + ] + ``` + """ + + #: + name: str = "FinalValidations" + #: + version: str = "0.0.1" + + def __init__(self, dependencies): + super().__init__( + name=self.name, + version=self.version, + dependencies=dependencies, + tasks=(notasks,), # No data tasks - only validation tasks + validation={ + # Gas store capacity validations + # These check that CH4 and H2 store capacities match expected values + "gas_stores": [ + # CH4 stores - eGon2035 + CH4StoresCapacity( + table="grid.egon_etrago_store", + rule_id="SANITY_CH4_STORES_CAPACITY_EGON2035", + scenario="eGon2035", + rtol=0.02 + ), + # CH4 stores - eGon100RE + CH4StoresCapacity( + table="grid.egon_etrago_store", + rule_id="SANITY_CH4_STORES_CAPACITY_EGON100RE", + scenario="eGon100RE", + rtol=0.02 + ), + # H2 saltcavern stores - eGon2035 + H2SaltcavernStoresCapacity( + table="grid.egon_etrago_store", + rule_id="SANITY_H2_SALTCAVERN_STORES_CAPACITY_EGON2035", + scenario="eGon2035", + rtol=0.02 + ), + # H2 saltcavern stores - eGon100RE + H2SaltcavernStoresCapacity( + table="grid.egon_etrago_store", + rule_id="SANITY_H2_SALTCAVERN_STORES_CAPACITY_EGON100RE", + scenario="eGon100RE", + rtol=0.02 + ), + ], + + # Gas grid bus validations + # These check that gas buses are properly connected and counts match expectations + "gas_grid": [ + # Check for isolated CH4 buses - eGon2035 + GasBusesIsolated( + table="grid.egon_etrago_bus", + rule_id="SANITY_GAS_BUSES_ISOLATED_CH4_EGON2035", + scenario="eGon2035", + carrier="CH4" + ), + # Check for isolated H2_grid buses - eGon2035 + GasBusesIsolated( + table="grid.egon_etrago_bus", + rule_id="SANITY_GAS_BUSES_ISOLATED_H2_GRID_EGON2035", + scenario="eGon2035", + carrier="H2_grid" + ), + # Check for isolated H2_saltcavern buses - eGon2035 + GasBusesIsolated( + table="grid.egon_etrago_bus", + rule_id="SANITY_GAS_BUSES_ISOLATED_H2_SALTCAVERN_EGON2035", + scenario="eGon2035", + 
carrier="H2_saltcavern" + ), + # NOTE: eGon100RE gas bus isolated checks are commented out + # because they are also commented out in the original sanity_checks.py + # (lines 1435-1439). Uncomment when eGon100RE gas bus data is ready. + # # Check for isolated CH4 buses - eGon100RE + # GasBusesIsolated( + # table="grid.egon_etrago_bus", + # rule_id="SANITY_GAS_BUSES_ISOLATED_CH4_EGON100RE", + # scenario="eGon100RE", + # carrier="CH4" + # ), + # # Check for isolated H2_grid buses - eGon100RE + # GasBusesIsolated( + # table="grid.egon_etrago_bus", + # rule_id="SANITY_GAS_BUSES_ISOLATED_H2_GRID_EGON100RE", + # scenario="eGon100RE", + # carrier="H2_grid" + # ), + # # Check for isolated H2_saltcavern buses - eGon100RE + # GasBusesIsolated( + # table="grid.egon_etrago_bus", + # rule_id="SANITY_GAS_BUSES_ISOLATED_H2_SALTCAVERN_EGON100RE", + # scenario="eGon100RE", + # carrier="H2_saltcavern" + # ), + # Check CH4 bus count - eGon2035 + GasBusesCount( + table="grid.egon_etrago_bus", + rule_id="SANITY_GAS_BUSES_COUNT_CH4_EGON2035", + scenario="eGon2035", + carrier="CH4", + rtol=0.10 + ), + # Check H2_grid bus count - eGon2035 + GasBusesCount( + table="grid.egon_etrago_bus", + rule_id="SANITY_GAS_BUSES_COUNT_H2_GRID_EGON2035", + scenario="eGon2035", + carrier="H2_grid", + rtol=0.10 + ), + # NOTE: eGon100RE gas bus count checks are commented out + # because sanity_check_gas_buses() is only called for eGon2035 (line 1943) + # # Check CH4 bus count - eGon100RE + # GasBusesCount( + # table="grid.egon_etrago_bus", + # rule_id="SANITY_GAS_BUSES_COUNT_CH4_EGON100RE", + # scenario="eGon100RE", + # carrier="CH4", + # rtol=0.10 + # ), + # # Check H2_grid bus count - eGon100RE + # GasBusesCount( + # table="grid.egon_etrago_bus", + # rule_id="SANITY_GAS_BUSES_COUNT_H2_GRID_EGON100RE", + # scenario="eGon100RE", + # carrier="H2_grid", + # rtol=0.10 + # ), + # Check CH4 grid capacity - eGon2035 + CH4GridCapacity( + table="grid.egon_etrago_link", + rule_id="SANITY_CH4_GRID_CAPACITY_EGON2035", 
+ scenario="eGon2035", + rtol=0.10 + ), + # Check CH4 grid capacity - eGon100RE + CH4GridCapacity( + table="grid.egon_etrago_link", + rule_id="SANITY_CH4_GRID_CAPACITY_EGON100RE", + scenario="eGon100RE", + rtol=0.10 + ), + ], + + # Gas one-port component connection validations + # These check that loads, generators, and stores are connected to valid buses + "gas_one_port": [ + # LOADS - eGon2035 + # CH4_for_industry loads in Germany must connect to CH4 buses + GasOnePortConnections( + table="grid.egon_etrago_load", + rule_id="SANITY_GAS_ONE_PORT_LOAD_CH4_FOR_INDUSTRY_DE_EGON2035", + scenario="eGon2035", + component_type="load", + component_carrier="CH4_for_industry", + bus_conditions=[("CH4", "= 'DE'")] + ), + # CH4 loads abroad must connect to CH4 buses outside Germany + GasOnePortConnections( + table="grid.egon_etrago_load", + rule_id="SANITY_GAS_ONE_PORT_LOAD_CH4_ABROAD_EGON2035", + scenario="eGon2035", + component_type="load", + component_carrier="CH4", + bus_conditions=[("CH4", "!= 'DE'")] + ), + # H2_for_industry loads must connect to H2_grid in DE or AC abroad + GasOnePortConnections( + table="grid.egon_etrago_load", + rule_id="SANITY_GAS_ONE_PORT_LOAD_H2_FOR_INDUSTRY_EGON2035", + scenario="eGon2035", + component_type="load", + component_carrier="H2_for_industry", + bus_conditions=[ + ("H2_grid", "= 'DE'"), + ("AC", "!= 'DE'") + ] + ), + + # GENERATORS - eGon2035 + # CH4 generators must connect to CH4 buses (any country) + GasOnePortConnections( + table="grid.egon_etrago_generator", + rule_id="SANITY_GAS_ONE_PORT_GENERATOR_CH4_EGON2035", + scenario="eGon2035", + component_type="generator", + component_carrier="CH4", + bus_conditions=[("CH4", "")] # Any CH4 bus, no country filter + ), + + # STORES - eGon2035 + # CH4 stores must connect to CH4 buses (any country) + GasOnePortConnections( + table="grid.egon_etrago_store", + rule_id="SANITY_GAS_ONE_PORT_STORE_CH4_EGON2035", + scenario="eGon2035", + component_type="store", + component_carrier="CH4", + 
bus_conditions=[("CH4", "")] # Any CH4 bus, no country filter + ), + # H2_underground stores must connect to H2_saltcavern buses (any country) + GasOnePortConnections( + table="grid.egon_etrago_store", + rule_id="SANITY_GAS_ONE_PORT_STORE_H2_UNDERGROUND_EGON2035", + scenario="eGon2035", + component_type="store", + component_carrier="H2_underground", + bus_conditions=[("H2_saltcavern", "")] # Any H2_saltcavern bus, no country filter + ), + # H2_overground stores must connect to H2_saltcavern or H2_grid in DE + GasOnePortConnections( + table="grid.egon_etrago_store", + rule_id="SANITY_GAS_ONE_PORT_STORE_H2_OVERGROUND_EGON2035", + scenario="eGon2035", + component_type="store", + component_carrier="H2_overground", + bus_conditions=[ + ("H2_saltcavern", "= 'DE'"), + ("H2_grid", "= 'DE'") + ] + ), + ], + + # Gas link connection validations + # These check that gas links have both bus0 and bus1 connected to existing buses + "gas_links": [ + # CH4 links - eGon2035 + GasLinksConnections( + table="grid.egon_etrago_link", + rule_id="SANITY_GAS_LINKS_CH4_EGON2035", + scenario="eGon2035", + carrier="CH4" + ), + # H2_feedin links - eGon2035 + GasLinksConnections( + table="grid.egon_etrago_link", + rule_id="SANITY_GAS_LINKS_H2_FEEDIN_EGON2035", + scenario="eGon2035", + carrier="H2_feedin" + ), + # H2_to_CH4 links - eGon2035 + GasLinksConnections( + table="grid.egon_etrago_link", + rule_id="SANITY_GAS_LINKS_H2_TO_CH4_EGON2035", + scenario="eGon2035", + carrier="H2_to_CH4" + ), + # CH4_to_H2 links - eGon2035 + GasLinksConnections( + table="grid.egon_etrago_link", + rule_id="SANITY_GAS_LINKS_CH4_TO_H2_EGON2035", + scenario="eGon2035", + carrier="CH4_to_H2" + ), + # H2_to_power links - eGon2035 + GasLinksConnections( + table="grid.egon_etrago_link", + rule_id="SANITY_GAS_LINKS_H2_TO_POWER_EGON2035", + scenario="eGon2035", + carrier="H2_to_power" + ), + # power_to_H2 links - eGon2035 + GasLinksConnections( + table="grid.egon_etrago_link", + 
rule_id="SANITY_GAS_LINKS_POWER_TO_H2_EGON2035", + scenario="eGon2035", + carrier="power_to_H2" + ), + # OCGT links - eGon2035 + GasLinksConnections( + table="grid.egon_etrago_link", + rule_id="SANITY_GAS_LINKS_OCGT_EGON2035", + scenario="eGon2035", + carrier="OCGT" + ), + # central_gas_boiler links - eGon2035 + GasLinksConnections( + table="grid.egon_etrago_link", + rule_id="SANITY_GAS_LINKS_CENTRAL_GAS_BOILER_EGON2035", + scenario="eGon2035", + carrier="central_gas_boiler" + ), + # central_gas_CHP links - eGon2035 + GasLinksConnections( + table="grid.egon_etrago_link", + rule_id="SANITY_GAS_LINKS_CENTRAL_GAS_CHP_EGON2035", + scenario="eGon2035", + carrier="central_gas_CHP" + ), + # central_gas_CHP_heat links - eGon2035 + GasLinksConnections( + table="grid.egon_etrago_link", + rule_id="SANITY_GAS_LINKS_CENTRAL_GAS_CHP_HEAT_EGON2035", + scenario="eGon2035", + carrier="central_gas_CHP_heat" + ), + # industrial_gas_CHP links - eGon2035 + GasLinksConnections( + table="grid.egon_etrago_link", + rule_id="SANITY_GAS_LINKS_INDUSTRIAL_GAS_CHP_EGON2035", + scenario="eGon2035", + carrier="industrial_gas_CHP" + ), + ], + + # Gas loads and generators capacity validations + # These check that gas demand and generation capacity match reference data + "gas_loads_generators": [ + # CH4_for_industry loads - eGon2035 + GasLoadsCapacity( + table="grid.egon_etrago_load", + rule_id="SANITY_GAS_LOADS_CH4_FOR_INDUSTRY_EGON2035", + scenario="eGon2035", + carrier="CH4_for_industry", + rtol=0.10 + ), + # H2_for_industry loads - eGon2035 + GasLoadsCapacity( + table="grid.egon_etrago_load", + rule_id="SANITY_GAS_LOADS_H2_FOR_INDUSTRY_EGON2035", + scenario="eGon2035", + carrier="H2_for_industry", + rtol=0.10 + ), + # CH4 generators - eGon2035 + GasGeneratorsCapacity( + table="grid.egon_etrago_generator", + rule_id="SANITY_GAS_GENERATORS_CH4_EGON2035", + scenario="eGon2035", + carrier="CH4", + rtol=0.10 + ), + ], + + # Electricity capacity validations + # These check that distributed generator 
and storage capacities match input capacities + "electricity_capacity": [ + # GENERATORS - eGon2035 + # Wind onshore + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_WIND_ONSHORE_EGON2035", + scenario="eGon2035", + carrier="wind_onshore", + component_type="generator", + rtol=0.10 + ), + # Wind offshore + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_WIND_OFFSHORE_EGON2035", + scenario="eGon2035", + carrier="wind_offshore", + component_type="generator", + rtol=0.10 + ), + # Solar + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_SOLAR_EGON2035", + scenario="eGon2035", + carrier="solar", + component_type="generator", + rtol=0.10 + ), + # Solar rooftop + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_SOLAR_ROOFTOP_EGON2035", + scenario="eGon2035", + carrier="solar_rooftop", + component_type="generator", + rtol=0.10 + ), + # Biomass (maps to multiple output carriers: biomass, industrial_biomass_CHP, central_biomass_CHP) + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_BIOMASS_EGON2035", + scenario="eGon2035", + carrier="biomass", + component_type="generator", + output_carriers=[ + "biomass", + "industrial_biomass_CHP", + "central_biomass_CHP" + ], + rtol=0.10 + ), + # Run of river + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_RUN_OF_RIVER_EGON2035", + scenario="eGon2035", + carrier="run_of_river", + component_type="generator", + rtol=0.10 + ), + # Reservoir + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_RESERVOIR_EGON2035", + scenario="eGon2035", + carrier="reservoir", + component_type="generator", + rtol=0.10 + ), + # Oil + 
ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_OIL_EGON2035", + scenario="eGon2035", + carrier="oil", + component_type="generator", + rtol=0.10 + ), + # Others + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_OTHERS_EGON2035", + scenario="eGon2035", + carrier="others", + component_type="generator", + rtol=0.10 + ), + + # STORAGE - eGon2035 + # Pumped hydro + ElectricityCapacityComparison( + table="grid.egon_etrago_storage", + rule_id="SANITY_ELECTRICITY_STORAGE_PUMPED_HYDRO_EGON2035", + scenario="eGon2035", + carrier="pumped_hydro", + component_type="storage", + rtol=0.10 + ), + + # GENERATORS - eGon100RE + # Wind onshore + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_WIND_ONSHORE_EGON100RE", + scenario="eGon100RE", + carrier="wind_onshore", + component_type="generator", + rtol=0.10 + ), + # Wind offshore + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_WIND_OFFSHORE_EGON100RE", + scenario="eGon100RE", + carrier="wind_offshore", + component_type="generator", + rtol=0.10 + ), + # Solar + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_SOLAR_EGON100RE", + scenario="eGon100RE", + carrier="solar", + component_type="generator", + rtol=0.10 + ), + # Solar rooftop + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_SOLAR_ROOFTOP_EGON100RE", + scenario="eGon100RE", + carrier="solar_rooftop", + component_type="generator", + rtol=0.10 + ), + # Run of river + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_RUN_OF_RIVER_EGON100RE", + scenario="eGon100RE", + carrier="run_of_river", + component_type="generator", + rtol=0.10 + ), + # Oil + 
ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_OIL_EGON100RE", + scenario="eGon100RE", + carrier="oil", + component_type="generator", + rtol=0.10 + ), + # Lignite + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_LIGNITE_EGON100RE", + scenario="eGon100RE", + carrier="lignite", + component_type="generator", + rtol=0.10 + ), + # Coal + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_COAL_EGON100RE", + scenario="eGon100RE", + carrier="coal", + component_type="generator", + rtol=0.10 + ), + # Solar thermal collector + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_SOLAR_THERMAL_EGON100RE", + scenario="eGon100RE", + carrier="urban_central_solar_thermal_collector", + component_type="generator", + output_carriers=["solar_thermal_collector"], + rtol=0.10 + ), + # Geothermal + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_GEO_THERMAL_EGON100RE", + scenario="eGon100RE", + carrier="urban_central_geo_thermal", + component_type="generator", + output_carriers=["geo_thermal"], + rtol=0.10 + ), + # Rural solar thermal + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_RURAL_SOLAR_THERMAL_EGON100RE", + scenario="eGon100RE", + carrier="rural_solar_thermal", + component_type="generator", + rtol=0.10 + ), + # Urban central gas CHP + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_URBAN_GAS_CHP_EGON100RE", + scenario="eGon100RE", + carrier="urban_central_gas_CHP", + component_type="generator", + rtol=0.10 + ), + # Urban central solid biomass CHP + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + 
rule_id="SANITY_ELECTRICITY_GENERATOR_BIOMASS_CHP_EGON100RE", + scenario="eGon100RE", + carrier="urban_central_solid_biomass_CHP", + component_type="generator", + rtol=0.10 + ), + + # LINKS - eGon100RE + # Central gas boiler + ElectricityCapacityComparison( + table="grid.egon_etrago_link", + rule_id="SANITY_ELECTRICITY_LINK_CENTRAL_GAS_BOILER_EGON100RE", + scenario="eGon100RE", + carrier="urban_central_gas_boiler", + component_type="link", + output_carriers=["central_gas_boiler"], + rtol=0.10 + ), + # Central heat pump + ElectricityCapacityComparison( + table="grid.egon_etrago_link", + rule_id="SANITY_ELECTRICITY_LINK_CENTRAL_HEAT_PUMP_EGON100RE", + scenario="eGon100RE", + carrier="urban_central_heat_pump", + component_type="link", + output_carriers=["central_heat_pump"], + rtol=0.10 + ), + # Central resistive heater + ElectricityCapacityComparison( + table="grid.egon_etrago_link", + rule_id="SANITY_ELECTRICITY_LINK_CENTRAL_RESISTIVE_HEATER_EGON100RE", + scenario="eGon100RE", + carrier="urban_central_resistive_heater", + component_type="link", + output_carriers=["central_resistive_heater"], + rtol=0.10 + ), + # OCGT (gas) + ElectricityCapacityComparison( + table="grid.egon_etrago_link", + rule_id="SANITY_ELECTRICITY_LINK_OCGT_EGON100RE", + scenario="eGon100RE", + carrier="gas", + component_type="link", + output_carriers=["OCGT"], + rtol=0.10 + ), + # Rural biomass boiler + ElectricityCapacityComparison( + table="grid.egon_etrago_link", + rule_id="SANITY_ELECTRICITY_LINK_RURAL_BIOMASS_BOILER_EGON100RE", + scenario="eGon100RE", + carrier="rural_biomass_boiler", + component_type="link", + rtol=0.10 + ), + # Rural gas boiler + ElectricityCapacityComparison( + table="grid.egon_etrago_link", + rule_id="SANITY_ELECTRICITY_LINK_RURAL_GAS_BOILER_EGON100RE", + scenario="eGon100RE", + carrier="rural_gas_boiler", + component_type="link", + rtol=0.10 + ), + # Rural heat pump + ElectricityCapacityComparison( + table="grid.egon_etrago_link", + 
rule_id="SANITY_ELECTRICITY_LINK_RURAL_HEAT_PUMP_EGON100RE", + scenario="eGon100RE", + carrier="rural_heat_pump", + component_type="link", + rtol=0.10 + ), + # Rural oil boiler + ElectricityCapacityComparison( + table="grid.egon_etrago_link", + rule_id="SANITY_ELECTRICITY_LINK_RURAL_OIL_BOILER_EGON100RE", + scenario="eGon100RE", + carrier="rural_oil_boiler", + component_type="link", + rtol=0.10 + ), + # Rural resistive heater + ElectricityCapacityComparison( + table="grid.egon_etrago_link", + rule_id="SANITY_ELECTRICITY_LINK_RURAL_RESISTIVE_HEATER_EGON100RE", + scenario="eGon100RE", + carrier="rural_resistive_heater", + component_type="link", + rtol=0.10 + ), + + # STORAGE - eGon100RE + # Pumped hydro + ElectricityCapacityComparison( + table="grid.egon_etrago_storage", + rule_id="SANITY_ELECTRICITY_STORAGE_PUMPED_HYDRO_EGON100RE", + scenario="eGon100RE", + carrier="pumped_hydro", + component_type="storage", + rtol=0.10 + ), + ], + + # Heat capacity validations + # These check that distributed heat supply capacities match input capacities + "heat_capacity": [ + # LINKS - eGon2035 + # Central heat pump + ElectricityCapacityComparison( + table="grid.egon_etrago_link", + rule_id="SANITY_HEAT_LINK_CENTRAL_HEAT_PUMP_EGON2035", + scenario="eGon2035", + carrier="urban_central_heat_pump", + component_type="link", + output_carriers=["central_heat_pump"], + rtol=0.10 + ), + # Rural heat pump + ElectricityCapacityComparison( + table="grid.egon_etrago_link", + rule_id="SANITY_HEAT_LINK_RURAL_HEAT_PUMP_EGON2035", + scenario="eGon2035", + carrier="residential_rural_heat_pump", + component_type="link", + output_carriers=["rural_heat_pump"], + rtol=0.10 + ), + # Central resistive heater + ElectricityCapacityComparison( + table="grid.egon_etrago_link", + rule_id="SANITY_HEAT_LINK_CENTRAL_RESISTIVE_HEATER_EGON2035", + scenario="eGon2035", + carrier="urban_central_resistive_heater", + component_type="link", + output_carriers=["central_resistive_heater"], + rtol=0.10 + ), + + # 
GENERATORS - eGon2035 + # Solar thermal collector + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_HEAT_GENERATOR_SOLAR_THERMAL_EGON2035", + scenario="eGon2035", + carrier="urban_central_solar_thermal_collector", + component_type="generator", + output_carriers=["solar_thermal_collector"], + rtol=0.10 + ), + # Geothermal + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_HEAT_GENERATOR_GEO_THERMAL_EGON2035", + scenario="eGon2035", + carrier="urban_central_geo_thermal", + component_type="generator", + output_carriers=["geo_thermal"], + rtol=0.10 + ), + ], + + # Timeseries length validations + # These check that all timeseries arrays have the expected length (8760 hours) + # NOTE: All array columns are validated to match original sanity_checks.py + # which dynamically discovers all array columns (lines 2465-2494) + "timeseries_length": [ + # Generator timeseries - all array columns + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_GENERATOR_P_SET", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_generator_timeseries", + array_column="p_set", + expected_length=8760 + ), + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_GENERATOR_Q_SET", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_generator_timeseries", + array_column="q_set", + expected_length=8760 + ), + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_GENERATOR_P_MIN_PU", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_generator_timeseries", + array_column="p_min_pu", + expected_length=8760 + ), + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_GENERATOR_P_MAX_PU", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_generator_timeseries", + array_column="p_max_pu", + expected_length=8760 + ), + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_GENERATOR_MARGINAL_COST", + 
task="FinalValidations.timeseries_length", + table="grid.egon_etrago_generator_timeseries", + array_column="marginal_cost", + expected_length=8760 + ), + + # Load timeseries - all array columns + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_LOAD_P_SET", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_load_timeseries", + array_column="p_set", + expected_length=8760 + ), + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_LOAD_Q_SET", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_load_timeseries", + array_column="q_set", + expected_length=8760 + ), + + # Link timeseries - all array columns + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_LINK_P_SET", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_link_timeseries", + array_column="p_set", + expected_length=8760 + ), + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_LINK_P_MIN_PU", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_link_timeseries", + array_column="p_min_pu", + expected_length=8760 + ), + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_LINK_P_MAX_PU", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_link_timeseries", + array_column="p_max_pu", + expected_length=8760 + ), + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_LINK_EFFICIENCY", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_link_timeseries", + array_column="efficiency", + expected_length=8760 + ), + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_LINK_MARGINAL_COST", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_link_timeseries", + array_column="marginal_cost", + expected_length=8760 + ), + + # Storage timeseries - all array columns + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_STORAGE_P_SET", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_storage_timeseries", + 
array_column="p_set", + expected_length=8760 + ), + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_STORAGE_Q_SET", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_storage_timeseries", + array_column="q_set", + expected_length=8760 + ), + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_STORAGE_P_MIN_PU", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_storage_timeseries", + array_column="p_min_pu", + expected_length=8760 + ), + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_STORAGE_P_MAX_PU", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_storage_timeseries", + array_column="p_max_pu", + expected_length=8760 + ), + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_STORAGE_STATE_OF_CHARGE_SET", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_storage_timeseries", + array_column="state_of_charge_set", + expected_length=8760 + ), + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_STORAGE_INFLOW", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_storage_timeseries", + array_column="inflow", + expected_length=8760 + ), + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_STORAGE_MARGINAL_COST", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_storage_timeseries", + array_column="marginal_cost", + expected_length=8760 + ), + + # Store timeseries - all array columns + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_STORE_P_SET", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_store_timeseries", + array_column="p_set", + expected_length=8760 + ), + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_STORE_Q_SET", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_store_timeseries", + array_column="q_set", + expected_length=8760 + ), + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_STORE_E_MIN_PU", + 
task="FinalValidations.timeseries_length", + table="grid.egon_etrago_store_timeseries", + array_column="e_min_pu", + expected_length=8760 + ), + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_STORE_E_MAX_PU", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_store_timeseries", + array_column="e_max_pu", + expected_length=8760 + ), + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_STORE_MARGINAL_COST", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_store_timeseries", + array_column="marginal_cost", + expected_length=8760 + ), + ], + + # Electrical load demand validations + # Validates annual electrical load sums against expected values + "electrical_load": [ + # Total AC load aggregation for all scenarios (eGon2035, eGon100RE, etc.) + ElectricalLoadAggregationValidation( + rule_id="SANITY_ELECTRICAL_LOAD_AGGREGATION", + task="FinalValidations.electrical_load", + table="grid.egon_etrago_load", + tolerance=0.05 # 5% tolerance + ), + # Sector breakdown validation for eGon100RE + # Validates residential (90.4 TWh), commercial (146.7 TWh), + # industrial (382.9 TWh), and total (620.0 TWh) loads + ElectricalLoadSectorBreakdown( + rule_id="SANITY_ELECTRICAL_LOAD_SECTOR_BREAKDOWN_EGON100RE", + task="FinalValidations.electrical_load", + table="grid.egon_etrago_load", + scenario="eGon100RE", + rtol=0.01 # 1% tolerance as in original + ), + ], + + # Heat demand validations + # Validates annual heat demand against peta_heat reference values + "heat_demand": [ + # Heat demand - eGon2035 + HeatDemandValidation( + table="grid.egon_etrago_load", + rule_id="SANITY_HEAT_DEMAND_EGON2035", + scenario="eGon2035", + rtol=0.02 # 2% tolerance + ), + ], + "data-quality": [ + #grid validation + RowCountValidation( + table="grid.egon_etrago_bus", + rule_id="ROW_COUNT.egon_etrago_bus", + expected_count={"Schleswig-Holstein": 2729, "Everything": 85710} + ), + DataTypeValidation( + table="grid.egon_etrago_bus", + 
rule_id="DATA_TYPES.egon_etrago_bus", + column_types={ + "scen_name": "character varying", + "bus_id": "bigint", + "v_nom": "double precision", + "type": "text", + "carrier": "text", + "v_mag_pu_set": "double precision", + "v_mag_pu_min": "double precision", + "v_mag_pu_max": "double precision", + "x": "double precision", + "y": "double precision", + "geometry": "geometry", + "country": "text" + }, + ), + NotNullAndNotNaNValidation( + table="grid.egon_etrago_bus", + rule_id="NOT_NAN.egon_etrago_bus", + columns=[ + "scn_name", + "bus_id", + "v_nom", + "carrier", + "v_mag_pu_min", + "v_mag_pu_max", + "x", + "y", + "geom" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="grid.egon_etrago_bus", + rule_id="TABLE_NOT_NAN.egon_etrago_bus" + ), + ValueSetValidation( + table="grid.egon_etrago_bus", + rule_id="VALUE_SET_SCENARIO.egon_etrago_bus", + column="scn_name", + expected_values=[ + "eGon2035", + "eGon2035_lowflex", + "eGon100RE" + ] + ), + ValueSetValidation( + table="grid.egon_etrago_bus", + rule_id="VALUE_SET_CARRIER.egon_etrago_bus", + column="carrier", + expected_values=[ + "rural_heat", + "urban_central_water_tanks", + "low_voltage", + "CH4", + "H2_saltcavern", + "services_rural_heat", + "services_rural_water_tanks", + "central_heat_store", + "AC", + "Li_ion", + "H2_grid", + "dsm", + "urban_central_heat", + "residential_rural_heat", + "central_heat", + "rural_heat_store", + "residential_rural_water_tanks" + ] + ), + SRIDUniqueNonZero( + table="grid.egon_etrago_bus", + rule_id="SRIDUniqueNonZero.egon_etrago_bus", + column="geometry" + ), + RowCountValidation( + table="grid.egon_etrago_generator", + rule_id="ROW_COUNT.egon_etrago_generator", + expected_count={ + "Schleswig-Holstein": 2863, + "Everything": 40577 + } + ), + DataTypeValidation( + table="grid.egon_etrago_generator", + rule_id="DATA_TYPES.egon_etrago_generator", + column_types={ + "scen_name": "character varying", + "generator_id": "bigint", + "control": "text", + "type": "text", + "carrier": 
"text", + "p_nom": "double precision", + "p_nom_extendable": "boolean", + "p_nom_min": "double precision", + "p_nom_max": "double precision", + "p_min_pu": "double precision", + "p_max_pu": "double precision", + "p_set": "double precision", + "q_set": "double precision", + "sign": "double precision", + "marginal_cost": "double precision", + "build_year": "bigint", + "lifetime": "double precision", + "capital_cost": "double precision", + "efficiency": "double precision", + "commitable": "boolean", + "start_up_cost": "double precision", + "shut_down_cost": "double precision", + "min_up_time": "bigint", + "min_down_time": "bigint", + "up_time_before": "bigint", + "down_time_before": "bigint", + "ramp_limit_up": "double precision", + "ramp_limit_down": "double precision", + "ramp_limit_start_up": "double precision", + "ramp_limit_shut_down": "double precision", + "e_nom_max": "double precision" + }, + ), + NotNullAndNotNaNValidation( + table="grid.egon_etrago_generator", + rule_id="NOT_NAN.egon_etrago_generator", + columns=[ + "scn_name", + "generator_id", + "bus", + "control", + "type", + "carrier", + "p_nom", + "p_nom_extendable", + "p_nom_min", + "p_nom_max", + "p_min_pu", + "p_max_pu", + "sign", + "marginal_cost", + "build_year", + "lifetime", + "capital_cost", + "efficiency", + "committable", + "start_up_cost", + "shut_down_cost", + "min_up_time", + "min_down_time", + "up_time_before", + "down_time_before", + "ramp_limit_start_up", + "ramp_limit_shut_down", + "e_nom_max" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="grid.egon_etrago_generator", + rule_id="TABLE_NOT_NAN.egon_etrago_generator" + ), + ValueSetValidation( + table="grid.egon_etrago_generator", + rule_id="VALUE_SET_SCENARIO.egon_etrago_generator", + column="scn_name", + expected_values=[ + "eGon2035", + "eGon2035_lowflex", + "eGon100RE" + ] + ), + ValueSetValidation( + table="grid.egon_etrago_generator", + rule_id="VALUE_SET_CARRIER.egon_etrago_generator", + column="carrier", + 
expected_values=[ + "CH4", + "others", + "central_biomass_CHP", + "wind_onshore", + "lignite", + "geo_thermal", + "solar", + "reservoir", + "services_rural_solar_thermal_collector", + "residential_rural_solar_thermal_collector", + "industrial_biomass_CHP", + "biomass", + "urban_central_solar_thermal_collector", + "run_of_river", + "oil", + "central_biomass_CHP_heat", + "nuclear", + "coal", + "solar_thermal_collector", + "solar_rooftop", + "wind_offshore" + ] + ), + RowCountValidation( + table="grid.egon_etrago_generator_timeseries", + rule_id="ROW_COUNT.egon_etrago_generator_timeseries", + expected_count={ + "Schleswig-Holstein": 1929, + "Everything": 28651 + } + ), + DataTypeValidation( + table="grid.egon_etrago_generator_timeseries", + rule_id="DATA_TYPES.egon_etrago_generator_timeseries", + column_types={ + "scn_name": "character varying", + "generator_id": "integer", + "temp_id": "integer", + "p_set": "double precision[]", + "q_set": "double precision[]", + "p_min_pu": "double precision[]", + "p_max_pu": "double precision[]", + "marginal_cost": "double precision[]" + }, + ), + NotNullAndNotNaNValidation( + table="grid.egon_etrago_generator_timeseries", + rule_id="NOT_NAN.egon_etrago_generator_timeseries", + columns=[ + "scn_name", + "generator_id", + "temp_id", + "p_max_pu" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="grid.egon_etrago_generator_timeseries", + rule_id="TABLE_NOT_NAN.egon_etrago_generator_timeseries" + ), + ValueSetValidation( + table="grid.egon_etrago_generator_timeseries", + rule_id="VALUE_SET_SCENARIO.egon_etrago_generator_timeseries", + column="scn_name", + expected_values=[ + "eGon2035", + "eGon2035_lowflex", + "eGon100RE" + ] + ), + RowCountValidation( + table="grid.egon_etrago_line", + rule_id="ROW_COUNT.egon_etrago_line", + expected_count={ + "Schleswig-Holstein": 1197, + "Everything": 69901 + } + ), + DataTypeValidation( + table="grid.egon_etrago_line", + rule_id="DATA_TYPES.egon_etrago_line", + column_types={ + "scn_name": 
"character varying", + "line_id": "bigint", + "bus0": "bigint", + "bus1": "bigint", + "type": "text", + "carrier": "text", + "x": "numeric", + "r": "numeric", + "g": "numeric", + "b": "numeric", + "s_nom": "numeric", + "s_nom_extendable": "boolean", + "s_nom_min": "double precision", + "s_nom_max": "double precision", + "s_max_pu": "double precision", + "build_year": "bigint", + "lifetime": "double precision", + "capital_cost": "double precision", + "length": "double precision", + "cables": "integer", + "terrain_factor": "double precision", + "num_parallel": "double precision", + "v_ang_min": "double precision", + "v_ang_max": "double precision", + "v_nom": "double precision", + "geom": "geometry", + "topo": "geometry" + }, + ), + NotNullAndNotNaNValidation( + table="grid.egon_etrago_line", + rule_id="NOT_NAN.egon_etrago_line", + columns=[ + "scn_name", + "line_id", + "bus0", + "bus1", + "carrier", + "x", + "r", + "g", + "b", + "s_nom", + "s_nom_extendable", + "s_nom_min", + "s_nom_max", + "s_max_pu", + "build_year", + "lifetime", + "capital_cost", + "length", + "cables", + "terrain_factor", + "num_parallel", + "v_ang_min", + "v_ang_max", + "v_nom", + "geom", + "topo" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="grid.egon_etrago_line", + rule_id="TABLE_NOT_NAN.egon_etrago_line" + ), + ValueSetValidation( + table="grid.egon_etrago_line", + rule_id="VALUE_SET_SCENARIO.egon_etrago_line", + column="scn_name", + expected_values=[ + "eGon2035", + "eGon2035_lowflex", + "eGon100RE" + ] + ), + ValueSetValidation( + table="grid.egon_etrago_line", + rule_id="VALUE_SET_CARRIER.egon_etrago_line", + column="carrier", + expected_values=["AC"] + ), + SRIDUniqueNonZero( + table="grid.egon_etrago_line", + rule_id="SRIDUniqueNonZero.egon_etrago_line.geom", + column="geom" + ), + SRIDUniqueNonZero( + table="grid.egon_etrago_line", + rule_id="SRIDUniqueNonZero.egon_etrago_line.topo", + column="topo" + ), + #Row Count does not equal egon_etrago_line, because buses are 
located outside Germany + RowCountValidation( + table="grid.egon_etrago_line_timeseries", + rule_id="ROW_COUNT.egon_etrago_line_timeseries", + expected_count={ + "Schleswig-Holstein": 1197, + "Everything": 69714 + } + ), + DataTypeValidation( + table="grid.egon_etrago_line_timeseries", + rule_id="DATA_TYPES.egon_etrago_line_timeseries", + column_types={ + "scn_name": "character varying", + "line_id": "bigint", + "bus0": "bigint", + "bus1": "bigint", + "type": "text", + "carrier": "text", + "x": "numeric", + "r": "numeric", + "g": "numeric", + "b": "numeric", + "s_nom": "numeric", + "s_nom_extendable": "boolean", + "s_nom_min": "double precision", + "s_nom_max": "double precision", + "s_max_pu": "double precision", + "build_year": "bigint", + "lifetime": "double precision", + "capital_cost": "double precision", + "length": "double precision", + "cables": "integer", + "terrain_factor": "double precision", + "num_parallel": "double precision", + "v_ang_min": "double precision", + "v_ang_max": "double precision", + "v_nom": "double precision", + "geom": "geometry", + "topo": "geometry" + }, + ), + NotNullAndNotNaNValidation( + table="grid.egon_etrago_line_timeseries", + rule_id="NOT_NAN.egon_etrago_line_timeseries", + columns=[ + "scn_name", + "line_id", + "bus0", + "bus1", + "carrier", + "x", + "r", + "g", + "b", + "s_nom", + "s_nom_extendable", + "s_nom_min", + "s_nom_max", + "s_max_pu", + "build_year", + "lifetime", + "capital_cost", + "length", + "cables", + "terrain_factor", + "num_parallel", + "v_ang_min", + "v_ang_max", + "v_nom", + "geom", + "topo", + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="grid.egon_etrago_line_timeseries", + rule_id="TABLE_NOT_NAN.egon_etrago_line_timeseries" + ), + ValueSetValidation( + table="grid.egon_etrago_line_timeseries", + rule_id="VALUE_SET_SCENARIO.egon_etrago_line_timeseries", + column="scn_name", + expected_values=[ + "eGon2035", + "eGon2035_lowflex", + "eGon100RE" + ] + ), + ValueSetValidation( + 
table="grid.egon_etrago_line_timeseries", + rule_id="VALUE_SET_CARRIER.egon_etrago_line_timeseries", + column="carrier", + expected_values=["AC"] + ), + SRIDUniqueNonZero( + table="grid.egon_etrago_line_timeseries", + rule_id="SRIDUniqueNonZero.egon_etrago_line_timeseries.geom", + column="geom" + ), + SRIDUniqueNonZero( + table="grid.egon_etrago_line_timeseries", + rule_id="SRIDUniqueNonZero.egon_etrago_line_timeseries.topo", + column="topo" + ), + RowCountValidation( + table="grid.egon_etrago_link", + rule_id="ROW_COUNT.egon_etrago_link", + expected_count={ + "Schleswig-Holstein": 15496, + "Everything": 83980 + } + ), + DataTypeValidation( + table="grid.egon_etrago_link", + rule_id="DATA_TYPES.egon_etrago_link", + column_types={ + "scn_name": "character varying", + "link_id": "bigint", + "bus0": "bigint", + "bus1": "bigint", + "type": "text", + "carrier": "text", + "efficiency": "double precision", + "build_year": "bigint", + "lifetime": "double precision", + "p_nom": "numeric", + "p_nom_extendable": "boolean", + "p_nom_min": "double precision", + "p_nom_max": "double precision", + "p_min_pu": "double precision", + "p_max_pu": "double precision", + "p_set": "double precision", + "capital_cost": "double precision", + "marginal_cost": "double precision", + "length": "double precision", + "terrain_factor": "double precision", + "geom": "geometry", + "topo": "geometry", + }, + ), + NotNullAndNotNaNValidation( + table="grid.egon_etrago_link", + rule_id="NOT_NAN.egon_etrago_link", + columns=[ + "scn_name", "link_id", "bus0", "bus1", "carrier", "efficiency", "build_year", "p_nom", + "p_nom_extendable", "p_nom_min", "p_nom_max", "p_min_pu", "p_max_pu", "p_set", + "capital_cost", "marginal_cost", "length", "terrain_factor", "geom", "topo" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="grid.egon_etrago_link", + rule_id="TABLE_NOT_NAN.egon_etrago_link" + ), + ValueSetValidation( + table="grid.egon_etrago_link", + rule_id="VALUE_SET_SCENARIO.egon_etrago_link", + 
column="scn_name", + expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] + ), + ValueSetValidation( + table="grid.egon_etrago_link", + rule_id="VALUE_SET_CARRIER.egon_etrago_link", + column="carrier", + expected_values=[ + "industrial_gas_CHP", "residential_rural_water_tanks_discharger", "BEV_charger", "CH4", + "power_to_H2", "urban_central_gas_CHP", "rural_heat_store_discharger", "H2_gridextension", + "urban_central_gas_CHP_CC", "dsm", "services_rural_water_tanks_charger", "H2_to_CH4", + "rural_heat_store_charger", "DC", "central_gas_boiler", "H2_feedin", "H2_retrofit", "OCGT", + "central_gas_CHP_heat", "residential_rural_water_tanks_charger", "central_heat_pump", + "services_rural_ground_heat_pump", "rural_heat_pump", "CH4_to_H2", "central_resistive_heater", + "urban_central_air_heat_pump", "urban_central_water_tanks_discharger", + "urban_central_water_tanks_charger", "services_rural_water_tanks_discharger", + "electricity_distribution_grid", "central_heat_store_discharger", "H2_to_power", + "central_heat_store_charger", "central_gas_CHP", "residential_rural_ground_heat_pump"] + ), + SRIDUniqueNonZero( + table="grid.egon_etrago_link", + rule_id="SRIDUniqueNonZero.egon_etrago_link.geom", + column="geom" + ), + SRIDUniqueNonZero( + table="grid.egon_etrago_link", + rule_id="SRIDUniqueNonZero.egon_etrago_link.topo", + column="topo" + ), + RowCountValidation( + table="grid.egon_etrago_link_timeseries", + rule_id="ROW_COUNT.egon_etrago_link_timeseries", + expected_count={"Schleswig-Holstein": 947, "Everything": 25729} + ), + DataTypeValidation( + table="grid.egon_etrago_link_timeseries", + rule_id="DATA_TYPES.egon_etrago_link_timeseries", + column_types={ + "scn_name": "character varying", + "link_id": "bigint", + "temp_id": "integer", + "p_set": "double precision[]", + "p_min_pu": "double precision[]", + "p_max_pu": "double precision[]", + "efficiency": "double precision[]", + "marginal_cost": "double precision[]" + }, + ), + NotNullAndNotNaNValidation( + 
table="grid.egon_etrago_link_timeseries", + rule_id="NOT_NAN.egon_etrago_link_timeseries", + columns=[ + "scn_name", "link_id", "temp_id", "p_set", "p_min_pu", "p_max_pu", "efficiency", + "marginal_cost" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="grid.egon_etrago_link_timeseries", + rule_id="TABLE_NOT_NAN.egon_etrago_link_timeseries" + ), + ValueSetValidation( + table="grid.egon_etrago_link_timeseries", + rule_id="VALUE_SET_SCENARIO.egon_etrago_link_timeseries", + column="scn_name", + expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] + ), + RowCountValidation( + table="grid.egon_etrago_load", + rule_id="ROW_COUNT.egon_etrago_load", + expected_count={"Schleswig-Holstein": 3202, "Everything": 44019} + ), + DataTypeValidation( + table="grid.egon_etrago_load", + rule_id="DATA_TYPES.egon_etrago_load", + column_types={ + "scn_name": "character varying", + "load_id": "bigint", + "bus": "bigint", + "type": "text", + "carrier": "text", + "p_set": "double precision", + "q_set": "double precision", + "sign": "double precision" + }, + ), + NotNullAndNotNaNValidation( + table="grid.egon_etrago_load", + rule_id="NOT_NAN.egon_etrago_load", + columns=[ + "scn_name", "load_id", "bus", "type", "carrier", "p_set", "q_set", "sign" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="grid.egon_etrago_load", + rule_id="TABLE_NOT_NAN.egon_etrago_load" + ), + ValueSetValidation( + table="grid.egon_etrago_load", + rule_id="VALUE_SET_SCENARIO.egon_etrago_load", + column="scn_name", + expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] + ), + ValueSetValidation( + table="grid.egon_etrago_load", + rule_id="VALUE_SET_CARRIER.egon_etrago_load", + column="carrier", + expected_values=[ + "CH4", "H2_for_industry", "services_rural_heat", "H2_system_boundary", "AC", + "urban_central_heat", "residential_rural_heat", "low-temperature_heat_for_industry", + "CH4_for_industry", "central_heat", "CH4_system_boundary", "land_transport_EV", + "H2_hgv_load", 
"rural_gas_boiler", "rural_heat" + ] + ), + RowCountValidation( + table="grid.egon_etrago_load_timeseries", + rule_id="ROW_COUNT.egon_etrago_load_timeseries", + expected_count={"Schleswig-Holstein": 3176, "Everything": 44013} + ), + DataTypeValidation( + table="grid.egon_etrago_load_timeseries", + rule_id="DATA_TYPES.egon_etrago_load_timeseries", + column_types={ + "scn_name": "character varying", + "load_id": "bigint", + "temp_id": "integer", + "p_set": "double precision[]", + "q_set": "double precision[]" + }, + ), + NotNullAndNotNaNValidation( + table="grid.egon_etrago_load_timeseries", + rule_id="NOT_NAN.egon_etrago_load_timeseries", + columns=[ + "scn_name", "load_id", "temp_id", "p_set", "q_set" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="grid.egon_etrago_load_timeseries", + rule_id="TABLE_NOT_NAN.egon_etrago_load_timeseries" + ), + ValueSetValidation( + table="grid.egon_etrago_load_timeseries", + rule_id="VALUE_SET_SCENARIO.egon_etrago_load_timeseries", + column="scn_name", + expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] + ), + RowCountValidation( + table="grid.egon_etrago_storage", + rule_id="ROW_COUNT.egon_etrago_storage", + expected_count={"Schleswig-Holstein": 418, "Everything": 13044} + ), + DataTypeValidation( + table="grid.egon_etrago_storage", + rule_id="DATA_TYPES.egon_etrago_storage", + column_types={ + "scn_name": "character varying", + "storage_id": "bigint", + "bus": "bigint", + "control": "text", + "type": "text", + "carrier": "text", + "p_nom": "double precision", + "p_nom_extendable": "boolean", + "p_nom_min": "double precision", + "p_nom_max": "double precision", + "p_min_pu": "double precision", + "p_max_pu": "double precision", + "p_set": "double precision", + "q_set": "double precision", + "sign": "double precision", + "marginal_cost": "double precision", + "capital_cost": "double precision", + "build_year": "bigint", + "lifetime": "double precision", + "state_of_charge_initial": "double precision", + 
"cyclic_state_of_charge": "boolean", + "state_of_charge_set": "double precision", + "max_hours": "double precision", + "efficiency_store": "double precision", + "efficiency_dispatch": "double precision", + "standing_loss": "double precision", + "inflow": "double precision" + }, + ), + NotNullAndNotNaNValidation( + table="grid.egon_etrago_storage", + rule_id="NOT_NAN.egon_etrago_storage", + columns=[ + "scn_name", "storage_id", "bus", "control", "type", "carrier", "p_nom", + "p_nom_extendable", "p_nom_min", "p_nom_max", "p_min_pu", "p_max_pu", "p_set", + "q_set", "sign", "marginal_cost", "capital_cost", "build_year", "lifetime", + "state_of_charge_initial", "cyclic_state_of_charge", "state_of_charge_set", + "max_hours", "efficiency_store", "efficiency_dispatch", "standing_loss", "inflow" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="grid.egon_etrago_storage", + rule_id="TABLE_NOT_NAN.egon_etrago_storage" + ), + ValueSetValidation( + table="grid.egon_etrago_storage", + rule_id="VALUE_SET_SCENARIO.egon_etrago_storage", + column="scn_name", + expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] + ), + ValueSetValidation( + table="grid.egon_etrago_storage", + rule_id="VALUE_SET_CARRIER.egon_etrago_storage", + column="carrier", + expected_values=["battery", "home_battery", "pumped_hydro", "reservoir"] + ), + RowCountValidation( + table="grid.egon_etrago_storage_timeseries", + rule_id="ROW_COUNT.egon_etrago_storage_timeseries", + expected_count={"Schleswig-Holstein": 0, "Everything": 9} + ), + DataTypeValidation( + table="grid.egon_etrago_storage_timeseries", + rule_id="DATA_MULTIPLE_TYPES.egon_etrago_storage_timeseries", + column_types={ + "scn_name": "character varying", + "storage_id": "bigint", + "temp_id": "integer", + "p_set": "double precision[]", + "q_set": "double precision[]", + "p_min_pu": "double precision[]", + "p_max_pu": "double precision[]", + "state_of_charge_set": "double precision[]", + "inflow": "double precision[]", + 
"marginal_cost": "double precision[]" + }, + ), + NotNullAndNotNaNValidation( + table="grid.egon_etrago_storage_timeseries", + rule_id="NOT_NAN.egon_etrago_storage_timeseries", + columns=[ + "scn_name", "storage_id", "temp_id", "inflow", "marginal_cost" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="grid.egon_etrago_storage_timeseries", + rule_id="TABLE_NOT_NAN.egon_etrago_storage_timeseries" + ), + ValueSetValidation( + table="grid.egon_etrago_storage_timeseries", + rule_id="VALUE_SET_SCENARIO.egon_etrago_storage_timeseries", + column="scn_name", + expected_values=["eGon100RE"] + ), + RowCountValidation( + table="grid.egon_etrago_store", + rule_id="ROW_COUNT.egon_etrago_store", + expected_count={"Schleswig-Holstein": 2913, "Everything": 26520} + ), + DataTypeValidation( + table="grid.egon_etrago_store", + rule_id="DATA_TYPES.egon_etrago_store", + column_types={ + "scn_name": "character varying", + "store_id": "bigint", + "bus": "bigint", + "type": "text", + "carrier": "text", + "e_nom": "double precision", + "e_nom_extendable": "boolean", + "e_nom_min": "double precision", + "e_nom_max": "double precision", + "e_min_pu": "double precision", + "e_max_pu": "double precision", + "p_set": "double precision", + "q_set": "double precision", + "e_initial": "double precision", + "e_cyclic": "boolean", + "sign": "double precision", + "marginal_cost": "double precision", + "capital_cost": "double precision", + "standing_loss": "double precision", + "build_year": "bigint", + "lifetime": "double precision" + }, + ), + NotNullAndNotNaNValidation( + table="grid.egon_etrago_store", + rule_id="NOT_NAN.egon_etrago_store", + columns=[ + "scn_name", "store_id", "bus", "type", "carrier", "e_nom", "e_nom_extendable", + "e_nom_min", "e_nom_max", "e_min_pu", "e_max_pu", "p_set", "q_set", "e_initial", + "e_cyclic", "sign", "marginal_cost", "capital_cost", "standing_loss", "build_year", + "lifetime" + ] + ), + WholeTableNotNullAndNotNaNValidation( + 
table="grid.egon_etrago_store", + rule_id="TABLE_NOT_NAN.egon_etrago_store" + ), + ValueSetValidation( + table="grid.egon_etrago_store", + rule_id="VALUE_SET_SCENARIO.egon_etrago_store", + column="scn_name", + expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] + ), + RowCountValidation( + table="grid.egon_etrago_store_timeseries", + rule_id="ROW_COUNT.egon_etrago_store_timeseries", + expected_count={"Schleswig-Holstein": 392, "Everything": 15281} + ), + DataTypeValidation( + table="grid.egon_etrago_store_timeseries", + rule_id="DATA_TYPES.egon_etrago_store_timeseries", + column_types={ + "scn_name": "character varying", + "store_id": "bigint", + "temp_id": "integer", + "p_set": "double precision[]", + "q_set": "double precision[]", + "e_min_pu": "double precision[]", + "e_max_pu": "double precision[]", + "marginal_cost": "double precision[]" + }, + ), + NotNullAndNotNaNValidation( + table="grid.egon_etrago_store_timeseries", + rule_id="NOT_NAN.egon_etrago_store_timeseries", + columns=[ + "scn_name", "store_id", "temp_id", "p_set", "q_set", "e_min_pu", "e_max_pu", + "marginal_cost" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="grid.egon_etrago_store_timeseries", + rule_id="TABLE_NOT_NAN.egon_etrago_store_timeseries" + ), + ValueSetValidation( + table="grid.egon_etrago_store_timeseries", + rule_id="VALUE_SET_SCENARIO.egon_etrago_store_timeseries", + column="scn_name", + expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] + ), + RowCountValidation( + table="grid.egon_etrago_temp_resolution", + rule_id="ROW_COUNT.egon_etrago_temp_resolution", + expected_count=1 + ), + DataTypeValidation( + table="grid.egon_etrago_temp_resolution", + rule_id="DATA_TYPES.egon_etrago_temp_resolution", + column_types={ + "temp_id": "bigint", + "timesteps": "bigint", + "resolution": "text", + "start_time": "timestamp without time zone" + }, + ), + WholeTableNotNullAndNotNaNValidation( + table="grid.egon_etrago_temp_resolution", + 
rule_id="TABLE_NOT_NAN.egon_etrago_temp_resolution" + ), + RowCountValidation( + table="grid.egon_etrago_transformer", + rule_id="ROW_COUNT.egon_etrago_transformer", + expected_count={"Schleswig-Holstein": 31, "Everything": 1545} + ), + DataTypeValidation( + table="grid.egon_etrago_transformer", + rule_id="DATA_TYPES.egon_etrago_transformer", + column_types={ + "scn_name": "character varying", + "store_id": "bigint", + "bus": "bigint", + "type": "text", + "carrier": "text", + "e_nom": "double precision", + "e_nom_extendable": "boolean", + "e_nom_min": "double precision", + "e_nom_max": "double precision", + "e_min_pu": "double precision", + "e_max_pu": "double precision", + "p_set": "double precision", + "q_set": "double precision", + "e_initial": "double precision", + "e_cyclic": "boolean", + "sign": "double precision", + "marginal_cost": "double precision", + "capital_cost": "double precision", + "standing_loss": "double precision", + "build_year": "bigint", + "lifetime": "double precision" + }, + ), + NotNullAndNotNaNValidation( + table="grid.egon_etrago_transformer", + rule_id="NOT_NAN.egon_etrago_transformer", + columns=[ + "scn_name", "store_id", "bus", "type", "carrier", "e_nom", "e_nom_extendable", + "e_nom_min", "e_nom_max", "e_min_pu", "e_max_pu", "p_set", "q_set", "e_initial", + "e_cyclic", "sign", "marginal_cost", "capital_cost", "standing_loss", "build_year", + "lifetime" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="grid.egon_etrago_transformer", + rule_id="TABLE_NOT_NAN.egon_etrago_transformer" + ), + ValueSetValidation( + table="grid.egon_etrago_transformer", + rule_id="VALUE_SET_SCENARIO.egon_etrago_transformer", + column="scn_name", + expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] + ), + RowCountValidation( + table="grid.egon_hvmv_substation", + rule_id="ROW_COUNT.hvmv_substation", + expected_count={"Schleswig-Holstein": 200, "Everything": 3854} + ), + DataTypeValidation( + table="grid.egon_hvmv_substation", + 
rule_id="DATA_TYPES.egon_hvmv_substation", + column_types={ + "bus_id": "integer", + "lon": "double precision", + "lat": "double precision", + "point": "geometry", + "polygon": "geometry", + "voltage": "text", + "power_type": "text", + "substation": "text", + "osm_id": "text", + "osm_www": "text", + "frequency": "text", + "subst_name": "text", + "ref": "text", + "operator": "text", + "dbahn": "text", + "status": "integer" + }, + ), + NotNullAndNotNaNValidation( + table="grid.egon_hvmv_substation", + rule_id="NOT_NAN.egon_hvmv_substation", + columns=[ + "bus_id", "lon", "lat", "point", "polygon", "voltage", "power_type", "substation", + "osm_id", "osm_www", "frequency", "subst_name", "ref", "operator", "dbahn", "status" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="grid.egon_hvmv_substation", + rule_id="TABLE_NOT_NAN.egon_hvmv_substation" + ), + SRIDUniqueNonZero( + table="grid.egon_hvmv_substation", + rule_id="SRIDUniqueNonZero.egon_hvmv_substation.point", + column="point" + ), + SRIDUniqueNonZero( + table="grid.egon_hvmv_substation", + rule_id="SRIDUniqueNonZero.egon_hvmv_substation.polygon", + column="polygon" + ), + RowCountValidation( + table="grid.egon_mv_grid_district", + rule_id="ROW_COUNT.egon_mv_grid_district", + expected_count={"Schleswig-Holstein": 200, "Everything": 3854} + ), + DataTypeValidation( + table="grid.egon_mv_grid_district", + rule_id="DATA_TYPES.egon_mv_grid_district", + column_types={ + "bus_id": "integer", + "geom": "geometry", + "area": "double precision" + }, + ), + WholeTableNotNullAndNotNaNValidation( + table="grid.egon_mv_grid_district", + rule_id="TABLE_NOT_NAN.egon_mv_grid_district" + ), + SRIDUniqueNonZero( + table="grid.egon_mv_grid_district", + rule_id="SRIDUniqueNonZero.egon_mv_grid_district.geom", + column="geom" + ), + ] + }, + on_validation_failure="continue" # Continue pipeline even if validations fail + ) diff --git a/src/egon/data/datasets/heat_demand/__init__.py b/src/egon/data/datasets/heat_demand/__init__.py 
index c0f9ce682..fbfb01bee 100644 --- a/src/egon/data/datasets/heat_demand/__init__.py +++ b/src/egon/data/datasets/heat_demand/__init__.py @@ -39,6 +39,13 @@ from egon.data.metadata import context, license_ccby, meta_metadata, sources import egon.data.config +from egon_validation import ( + RowCountValidation, + DataTypeValidation, + WholeTableNotNullAndNotNaNValidation, + ValueSetValidation +) + class HeatDemandImport(Dataset): """ @@ -74,6 +81,41 @@ def __init__(self, dependencies): version=self.version, # maybe rethink the naming dependencies=dependencies, tasks=(scenario_data_import), + validation={ + "data_quality": [ + RowCountValidation( + table=" demand.egon_peta_heat", + rule_id="ROW_COUNT.egon_peta_heat", + expected_count={"Schleswig-Holstein": 139250, "Everything": 6836426} + ), + DataTypeValidation( + table="demand.egon_peta_heat", + rule_id="DATA_MULTIPLE_TYPES.egon_peta_heat", + column_types={"id": "integer", "demand": "double precision", "sector": "character varying", + "scenario": "character varying", "zensus_pupulation_id": "integer"} + ), + WholeTableNotNullAndNotNaNValidation( + table="demand.egon_peta_heat", + rule_id="WHOLE_TABLE_NOT_NAN.egon_peta_heat" + ), + ValueSetValidation( + table="demand.egon_peta_heat", + rule_id="VALUE_SET_VALIDATION_SCENARIO.egon_peta_heat", + column="scenario", + expected_values={ + "Schleswig-Holstein":["eGon2035"], + "Everything":["eGon2035", "eGon100RE"] + } + ), + ValueSetValidation( + table="demand.egon_peta_heat", + rule_id="VALUE_SET_VALIDATION_SECTOR.egon_peta_heat", + column="sector", + expected_values=["residential", "service"] + ), + ] + }, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/heat_demand_timeseries/__init__.py b/src/egon/data/datasets/heat_demand_timeseries/__init__.py index 972166780..d4712db34 100644 --- a/src/egon/data/datasets/heat_demand_timeseries/__init__.py +++ b/src/egon/data/datasets/heat_demand_timeseries/__init__.py @@ -37,6 +37,14 @@ sources, ) +from 
egon_validation import ( + RowCountValidation, + DataTypeValidation, + WholeTableNotNullAndNotNaNValidation, + ValueSetValidation, + ArrayCardinalityValidation +) + Base = declarative_base() @@ -1263,4 +1271,46 @@ def __init__(self, dependencies): metadata, store_national_profiles, ), + validation={ + "data_quality": [ + RowCountValidation( + table=" demand.egon_heat_idp_pool", + rule_id="ROW_COUNT.egon_heat_idp_pool", + expected_count=459535 + ), + DataTypeValidation( + table="demand.egon_heat_idp_pool", + rule_id="DATA_MULTIPLE_TYPES.egon_heat_idp_pool", + column_types={"index": "bigint", "idp": "double precision[]"} + ), + WholeTableNotNullAndNotNaNValidation( + table="demand.egon_heat_idp_pool", + rule_id="WHOLE_TABLE_NOT_NAN.egon_heat_idp_pool" + ), + RowCountValidation( + table="demand.egon_heat_timeseries_selected_profiles", + rule_id="ROW_COUNT.egon_heat_timeseries_selected_profiles", + expected_count={"Schleswig-Holstein": 719960, "Everything": 20606259} + ), + DataTypeValidation( + table="demand.egon_heat_timeseries_selected_profiles", + rule_id="DATA_MULTIPLE_TYPES.egon_heat_timeseries_selected_profiles", + column_types={"zensus_population_id": "integer", "bulding_id": "integer", + "selected_idp_profiles": "integer[]"} + ), + ArrayCardinalityValidation( + table="demand.egon_heat_timeseries_selected_profiles", + rule_id="ARRAY.egon_heat_timeseries_selected_profiles", + array_column="selected_idp_profiles", + expected_length=365, + ), + ArrayCardinalityValidation( + table="demand.egon_timeseries_district_heating", + rule_id="ARRAY.egon_timeseries_district_heating", + array_column="dist_aggregated_mw", + expected_length=8760, + ), + ] + }, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/heat_supply/__init__.py b/src/egon/data/datasets/heat_supply/__init__.py index 2c3a619b5..8d3d8ba8b 100644 --- a/src/egon/data/datasets/heat_supply/__init__.py +++ b/src/egon/data/datasets/heat_supply/__init__.py @@ -32,6 +32,15 @@ sources, ) +from 
egon_validation import( + RowCountValidation, + DataTypeValidation, + NotNullAndNotNaNValidation, + WholeTableNotNullAndNotNaNValidation, + ValueSetValidation, + SRIDUniqueNonZero +) + # Will later be imported from another file. Base = declarative_base() @@ -404,6 +413,115 @@ def __init__(self, dependencies): }, metadata, ), + validation={ + "data-quality":[ + RowCountValidation( + table="supply.egon_district_heating", + rule_id="ROW_COUNT.egon_district_heating", + expected_count={"Schleswig-Holstein": 402, "Everything": 9090} + ), + DataTypeValidation( + table="supply.egon_district_heating", + rule_id="DATA_TYPES.egon_district_heating", + column_types={ + "index": "integer", + "district_heating_id": "integer", + "carrier": "character varying", + "category": "character varying", + "capacity": "double precision", + "geometry": "geometry", + "scenario": "character varying" + } + ), + NotNullAndNotNaNValidation( + table="supply.egon_district_heating", + rule_id="NOT_NAN.egon_district_heating", + columns=[ + "index", + "district_heating_id", + "carrier", + "category", + "capacity", + "geometry", + "scenario" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="supply.egon_district_heating", + rule_id="TABLE_NOT_NAN.egon_district_heating" + ), + SRIDUniqueNonZero( + table="supply.egon_district_heating", + rule_id="SRIDUniqueNonZero.egon_district_heating.geometry", + column="geometry" + ), + ValueSetValidation( + table="supply.egon_district_heating", + rule_id="VALUE_SET_VALIDATION_CARRIER.egon_district_heating", + column="carrier", + expected_values=["geo_thermal", "CHP", "gas_boiler", "resistive_heater", "heat_pump", "solar_thermal_collector"] + ), + ValueSetValidation( + table="supply.egon_district_heating", + rule_id="VALUE_SET_VALIDATION_SCENARIO.egon_district_heating", + column="scenario", + expected_values=["eGon2035"] + ), + RowCountValidation( + table="supply.egon_individual_heating", + rule_id="ROW_COUNT.egon_individual_heating", + 
expected_count={"Schleswig-Holstein": 400, "Everything": 7692} + ), + DataTypeValidation( + table="supply.egon_individual_heating", + rule_id="DATA_TYPES.egon_individual_heating", + column_types={ + "index": "integer", + "mv_grid_id": "integer", + "carrier": "character varying", + "category": "character varying", + "capacity": "double precision", + "geometry": "geometry", + "scenario": "character varying" + } + ), + NotNullAndNotNaNValidation( + table="supply.egon_individual_heating", + rule_id="NOT_NAN.egon_individual_heating", + columns=[ + "index", + "mv_grid_id", + "carrier", + "category", + "capacity", + "geometry", + "scenario" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="supply.egon_individual_heating", + rule_id="TABLE_NOT_NAN.egon_individual_heating" + ), + SRIDUniqueNonZero( + table="supply.egon_individual_heating", + rule_id="SRIDUniqueNonZero.egon_individual_heating.geometry", + column="geometry" + ), + ValueSetValidation( + table="supply.egon_individual_heating", + rule_id="VALUE_SET_VALIDATION_CARRIER.egon_individual_heating", + column="carrier", + expected_values=["gas_boiler", "heat_pump"] + ), + ValueSetValidation( + table="supply.egon_individual_heating", + rule_id="VALUE_SET_VALIDATION_SCENARIO.egon_individual_heating", + column="scenario", + expected_values=["eGon2035"] + ), + ] + }, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/heat_supply/individual_heating.py b/src/egon/data/datasets/heat_supply/individual_heating.py index 0b9b6f552..ab13d715f 100644 --- a/src/egon/data/datasets/heat_supply/individual_heating.py +++ b/src/egon/data/datasets/heat_supply/individual_heating.py @@ -50,6 +50,8 @@ # get zensus cells with district heating from egon.data.datasets.zensus_mv_grid_districts import MapZensusGridDistricts +from egon_validation import ArrayCardinalityValidation + engine = db.engine() Base = declarative_base() @@ -219,6 +221,17 @@ def dyn_parallel_tasks_pypsa_eur(): version=self.version, 
dependencies=dependencies, tasks=tasks_HeatPumpsPypsaEur, + validation={ + "data-quality": [ + ArrayCardinalityValidation( + table="demand.egon_etrago_timeseries_individual_heating", + rule_id="ARRAY_HEAT_PUMPS_PYPSA.egon_etrago_timeseries_individual_heating", + array_column="dist_aggregated_mv", + expected_length=8760, + ), + ] + }, + on_validation_failure="continue" ) @@ -458,6 +471,17 @@ def dyn_parallel_tasks_2035(): version="0.0.3", dependencies=dependencies, tasks=tasks_HeatPumps2035, + validation={ + "data-quality":[ + ArrayCardinalityValidation( + table="demand.egon_etrago_timeseries_individual_heating", + rule_id="ARRAY_HEAT_PUMPS.egon_etrago_timeseries_individual_heating", + array_column="dist_aggregated_mv", + expected_length=8760, + ), + ] + }, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/low_flex_scenario/__init__.py b/src/egon/data/datasets/low_flex_scenario/__init__.py index 9e528ad58..2b1d24dbe 100644 --- a/src/egon/data/datasets/low_flex_scenario/__init__.py +++ b/src/egon/data/datasets/low_flex_scenario/__init__.py @@ -8,6 +8,8 @@ from egon.data.datasets import Dataset +from egon_validation import ArrayCardinalityValidation + Base = declarative_base() @@ -29,4 +31,15 @@ def __init__(self, dependencies): ), }, ), + validation={ + "data-quality":[ + ArrayCardinalityValidation( + table="grid.egon_etrago_bus_timeseries", + rule_id="ARRAY.egon_etrago_bus_timeseries", + array_column="v_mag_pu_set", + expected_length=8760, + ), + ] + }, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/osm_buildings_streets/__init__.py b/src/egon/data/datasets/osm_buildings_streets/__init__.py index 5677cf224..a3ad8541c 100644 --- a/src/egon/data/datasets/osm_buildings_streets/__init__.py +++ b/src/egon/data/datasets/osm_buildings_streets/__init__.py @@ -7,6 +7,12 @@ from egon.data import db from egon.data.datasets import Dataset +from egon_validation import ( + RowCountValidation, + DataTypeValidation, + 
WholeTableNotNullAndNotNaNValidation, + SRIDUniqueNonZero +) def execute_sql_script(script): @@ -211,4 +217,289 @@ def __init__(self, dependencies): drop_temp_tables, add_metadata, ), + validation={ + "data_quality": [ + RowCountValidation( + table="boundaries.egon_map_zensus_buildings_filtered", + rule_id="ROW_COUNT.egon_map_zensus_buildings_filtered", + expected_count={"Schleswig-Holstein":1010387, + "Everything":28070301} + ), + DataTypeValidation( + table="boundaries.egon_map_zensus_buildings_filtered", + rule_id="DATA_TYPES.egon_map_zensus_buildings_filtered", + column_types={"id": "integer", "grid_id": "character varying", "cell_id": "integer"} + ), + WholeTableNotNullAndNotNaNValidation( + table="boundaries.egon_map_zensus_buildings_filtered", + rule_id="TABLE_NOT_NAN.egon_map_zensus_buildings_filtered" + ), + RowCountValidation( + table="boundaries.egon_map_zensus_buildings_residential", + rule_id="ROW_COUNT.egon_map_zensus_buildings_residential", + expected_count={"Schleswig-Holstein":989967, + "Everything":27477467} + ), + DataTypeValidation( + table="boundaries.egon_map_zensus_buildings_residential", + rule_id="DATA_TYPES.egon_map_zensus_buildings_residential", + column_types={"id": "integer", "grid_id": "character varying", "cell_id": "integer"} + ), + WholeTableNotNullAndNotNaNValidation( + table="boundaries.egon_map_zensus_buildings_residential", + rule_id="TABLE_NOT_NAN.egon_map_zensus_buildings_residential" + ), + RowCountValidation( + table="openstreetmap.osm_amenities_not_in_buildings", + rule_id="ROW_COUNT.osm_amenities_not_in_buildings", + expected_count={"Schleswig-Holstein": 3142, + "Everything": 79928} + ), + DataTypeValidation( + table="openstreetmap.osm_amenities_not_in_buildings", + rule_id="DATA_TYPES.osm_amenities_not_in_buildings", + column_types={ + "osm_id": "bigint", "amenity": "text", "name": "text", "geom_amenity": "geometry", + "tags": "hstore", "egon_amenity_id": "integer" } + ), + WholeTableNotNullAndNotNaNValidation( + 
table="openstreetmap.osm_amenities_not_in_buildings", + rule_id="TABLE_NOT_NAN.osm_amenities_not_in_buildings" + ), + SRIDUniqueNonZero( + table="openstreetmap.osm_amenities_not_in_buildings", + rule_id="SRIDUniqueNonZero.osm_amenities_not_in_buildings.geom_amenity", + column="geom_amenity" + ), + RowCountValidation( + table="openstreetmap.osm_amenities_shops_filtered", + rule_id="ROW_COUNT.osm_amenities_shops_filtered", + expected_count={"Schleswig-Holstein": 27438, "Everything": 700315} + ), + DataTypeValidation( + table="openstreetmap.osm_amenities_shops_filtered", + rule_id="DATA_TYPES.osm_amenities_shops_filtered", + column_types={ + "osm_id": "bigint", "amenity": "text", "name": "text", "geom_amenity": "geometry", + "tags": "hstore", "egon_amenity_id": "integer"} + ), + WholeTableNotNullAndNotNaNValidation( + table="openstreetmap.osm_amenities_shops_filtered", + rule_id="TABLE_NOT_NAN.osm_amenities_shops_filtered" + ), + SRIDUniqueNonZero( + table="openstreetmap.osm_amenities_shops_filtered", + rule_id="SRIDUniqueNonZero.osm_amenities_shops_filtered.geom_amenity", + column="geom_amenity" + ), + RowCountValidation( + table="openstreetmap.osm_buildings", + rule_id="ROW_COUNT.osm_buildings", + expected_count={"Schleswig-Holstein": 1298230, "Everything": 34328483} + ), + DataTypeValidation( + table="openstreetmap.osm_buildings", + rule_id="DATA_TYPES.osm_buildings", + column_types={ + "osm_id": "bigint", "amenity": "text", "building": "text", "name": "text", + "geom_building": "geometry", "area": "double precision", "geom_point": "geometry", + "tags": "hstore", "id": "integer"} + ), + WholeTableNotNullAndNotNaNValidation( + table="openstreetmap.osm_buildings", + rule_id="TABLE_NOT_NAN.osm_buildings" + ), + SRIDUniqueNonZero( + table="openstreetmap.osm_buildings", + rule_id="SRIDUniqueNonZero.osm_buildings.geom_building", + column="geom_building" + ), + SRIDUniqueNonZero( + table="openstreetmap.osm_buildings", + 
rule_id="SRIDUniqueNonZero.osm_buildings.geom_point", + column="geom_point" + ), + RowCountValidation( + table="openstreetmap.osm_buildings_filtered", + rule_id="ROW_COUNT.osm_buildings_filtered", + expected_count={"Schleswig-Holstein": 1169881, "Everything": 31619905} + ), + DataTypeValidation( + table="openstreetmap.osm_buildings_filtered", + rule_id="DATA_TYPES.osm_buildings_filtered", + column_types={ + "osm_id": "bigint", "amenity": "text", "building": "text", "name": "text", + "geom_building": "geometry", "area": "double precision", "geom_point": "geometry", + "tags": "hstore", "id": "integer"} + ), + WholeTableNotNullAndNotNaNValidation( + table="openstreetmap.osm_buildings_filtered", + rule_id="TABLE_NOT_NAN.osm_buildings_filtered" + ), + SRIDUniqueNonZero( + table="openstreetmap.osm_buildings_filtered", + rule_id="SRIDUniqueNonZero.osm_buildings_filtered.geom_building", + column="geom_building" + ), + SRIDUniqueNonZero( + table="openstreetmap.osm_buildings_filtered", + rule_id="SRIDUniqueNonZero.osm_buildings_filtered.geom_point", + column="geom_point" + ), + RowCountValidation( + table="openstreetmap.osm_buildings_residential", + rule_id="ROW_COUNT.osm_buildings_residential", + expected_count={"Schleswig-Holstein": 1130929, "Everything": 30713011} + ), + DataTypeValidation( + table="openstreetmap.osm_buildings_residential", + rule_id="DATA_TYPES.osm_buildings_residential", + column_types={ + "osm_id": "bigint", "amenity": "text", "building": "text", "name": "text", + "geom_building": "geometry", "area": "double precision", "geom_point": "geometry", + "tags": "hstore", "id": "integer"} + ), + WholeTableNotNullAndNotNaNValidation( + table="openstreetmap.osm_buildings_residential", + rule_id="TABLE_NOT_NAN.osm_buildings_residential" + ), + SRIDUniqueNonZero( + table="openstreetmap.osm_buildings_residential", + rule_id="SRIDUniqueNonZero.osm_buildings_residential.geom_building", + column="geom_building" + ), + SRIDUniqueNonZero( + 
table="openstreetmap.osm_buildings_residential", + rule_id="SRIDUniqueNonZero.osm_buildings_residential.geom_point", + column="geom_point" + ), + RowCountValidation( + table="openstreetmap.osm_buildings_synthetic", + rule_id="ROW_COUNT.osm_buildings_synthetic", + expected_count={"Schleswig-Holstein": 9498, "Everything": 706911} + ), + DataTypeValidation( + table="openstreetmap.osm_buildings_synthetic", + rule_id="DATA_TYPES.osm_buildings_synthetic", + column_types={ + "id": "character varying", "cell_id": "character varying", "geom_building": "geometry", + "geom_point": "geometry", "n_amenities_inside": "integer", "building": "character varying", + "area": "real"} + ), + WholeTableNotNullAndNotNaNValidation( + table="openstreetmap.osm_buildings_synthetic", + rule_id="TABLE_NOT_NAN.osm_buildings_synthetic" + ), + SRIDUniqueNonZero( + table="openstreetmap.osm_buildings_synthetic", + rule_id="SRIDUniqueNonZero.osm_buildings_synthetic.geom_building", + column="geom_building" + ), + SRIDUniqueNonZero( + table="openstreetmap.osm_buildings_synthetic", + rule_id="SRIDUniqueNonZero.osm_buildings_synthetic.geom_point", + column="geom_point" + ), + RowCountValidation( + table="openstreetmap.osm_buildings_with_amenities", + rule_id="ROW_COUNT.osm_buildings_with_amenities", + expected_count={"Schleswig-Holstein": 24314, "Everything": 621385} + ), + DataTypeValidation( + table="openstreetmap.osm_buildings_with_amenities", + rule_id="DATA_TYPES.osm_buildings_with_amenities", + column_types={ + "osm_id_amenity": "bigint", + "osm_id_building": "bigint", + "id": "integer", + "building": "text", + "area": "double precision", + "geom_building": "geometry", + "geom_amenity": "geometry", + "geom_point": "geometry", + "name": "text", + "tags_building": "hstore", + "tags_amenity": "hstore", + "n_amenities_inside": "bigint", + "apartment_count": "numeric"} + ), + WholeTableNotNullAndNotNaNValidation( + table="openstreetmap.osm_buildings_with_amenities", + 
rule_id="TABLE_NOT_NAN.osm_buildings_with_amenities" + ), + SRIDUniqueNonZero( + table="openstreetmap.osm_buildings_with_amenities", + rule_id="SRIDUniqueNonZero.osm_buildings_with_amenities.geom_building", + column="geom_building" + ), + SRIDUniqueNonZero( + table="openstreetmap.osm_buildings_with_amenities", + rule_id="SRIDUniqueNonZero.osm_buildings_with_amenities.geom_amenity", + column="geom_amenity" + ), + SRIDUniqueNonZero( + table="openstreetmap.osm_buildings_with_amenities", + rule_id="SRIDUniqueNonZero.osm_buildings_with_amenities.geom_point", + column="geom_point" + ), + RowCountValidation( + table="openstreetmap.osm_buildings_without_amenities", + rule_id="ROW_COUNT.osm_buildings_without_amenities", + expected_count={"Schleswig-Holstein": 1152146, "Everything": 31151277} + ), + DataTypeValidation( + table="openstreetmap.osm_buildings_without_amenities", + rule_id="DATA_TYPES.osm_buildings_without_amenities", + column_types={ + "osm_id": "bigint", + "id": "integer", + "building": "text", + "area": "double precision", + "geom_building": "geometry", + "geom_point": "geometry", + "name": "text", + "tags": "hstore", + "apartment_count": "numeric"} + ), + WholeTableNotNullAndNotNaNValidation( + table="openstreetmap.osm_buildings_without_amenities", + rule_id="TABLE_NOT_NAN.osm_buildings_without_amenities" + ), + SRIDUniqueNonZero( + table="openstreetmap.osm_buildings_without_amenities", + rule_id="SRIDUniqueNonZero.osm_buildings_without_amenities.geom_building", + column="geom_building" + ), + SRIDUniqueNonZero( + table="openstreetmap.osm_buildings_without_amenities", + rule_id="SRIDUniqueNonZero.osm_buildings_without_amenities.geom_point", + column="geom_point" + ), + RowCountValidation( + table="openstreetmap.osm_ways_with_segments", + rule_id="ROW_COUNT.osm_ways_with_segments", + expected_count={"Schleswig-Holstein": 263427, "Everything": 6716196} + ), + DataTypeValidation( + table="openstreetmap.osm_ways_with_segments", + 
rule_id="DATA_TYPES.osm_ways_with_segments", + column_types={ + "osm_id": "bigint", + "nodes": "bigint[]", + "highway": "text", + "geom": "geometry", + "length_segments": "double precision[]"} + ), + WholeTableNotNullAndNotNaNValidation( + table="openstreetmap.osm_ways_with_segments", + rule_id="TABLE_NOT_NAN.osm_ways_with_segments" + ), + SRIDUniqueNonZero( + table="openstreetmap.osm_ways_with_segments", + rule_id="SRIDUniqueNonZero.osm_ways_with_segments.geom", + column="geom" + ), + ] + }, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/power_plants/__init__.py b/src/egon/data/datasets/power_plants/__init__.py index 3ea65fba0..2fe95ede8 100755 --- a/src/egon/data/datasets/power_plants/__init__.py +++ b/src/egon/data/datasets/power_plants/__init__.py @@ -44,6 +44,15 @@ import egon.data.datasets.power_plants.wind_farms as wind_onshore import egon.data.datasets.power_plants.wind_offshore as wind_offshore +from egon_validation import( + RowCountValidation, + DataTypeValidation, + NotNullAndNotNaNValidation, + WholeTableNotNullAndNotNaNValidation, + ValueSetValidation, + SRIDUniqueNonZero +) + Base = declarative_base() @@ -1624,4 +1633,73 @@ def __init__(self, dependencies): version=self.version, dependencies=dependencies, tasks=tasks, + validation={ + "data-quality": [ + RowCountValidation( + table="supply.egon_power_plants", + rule_id="ROW_COUNT.egon_power_plants", + expected_count={"Schleswig-Holstein":34828, "Everything": 1103} # NOTE(review): the regional count exceeds the "Everything" count, unlike the other row-count rules; values may be swapped, confirm against a reference run + ), + DataTypeValidation( + table="supply.egon_power_plants", + rule_id="DATA_TYPES.egon_power_plants", + column_types={ + "id": "bigint", + "sources": "jsonb", + "source_id": "jsonb", + "carrier": "character varying", + "el_capacity": "double precision", + "bus_id": "integer", + "voltage_level": "integer", + "weather_cell_id": "integer", + "scenario": "character varying", + "geom": "geometry" + } + ), + NotNullAndNotNaNValidation( + table="supply.egon_power_plants", + rule_id="NOT_NAN.egon_power_plants", +
columns=["id", + "sources", + "source_id", + "carrier", + "el_capacity", + "bus_id", + "voltage_level", + "weather_cell_id", + "scenario", + "geom"] + ), + WholeTableNotNullAndNotNaNValidation( + table="supply.egon_power_plants", + rule_id="TABLE_NOT_NAN.egon_power_plants" + ), + ValueSetValidation( + table="supply.egon_power_plants", + rule_id="VALUE_SET_VALIDATION_CARRIER.egon_power_plants", + column="carrier", + expected_values=["others", + "gas", + "biomass", + "run_of_river", + "wind_onshore", + "oil", + "wind_offshore", + "solar", + "reservoir"] + ), + ValueSetValidation( + table="supply.egon_power_plants", + rule_id="VALUE_SET_VALIDATION_SCENARIO.egon_power_plants", + column="scenario", + expected_values=["eGon2035", "eGon100RE"] + ), + SRIDUniqueNonZero( + table="supply.egon_power_plants", + rule_id="SRIDUniqueNonZero.egon_power_plants.geom", + column="geom" + ), + ] + }, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/re_potential_areas/__init__.py b/src/egon/data/datasets/re_potential_areas/__init__.py index bcb34af86..5edb489bb 100644 --- a/src/egon/data/datasets/re_potential_areas/__init__.py +++ b/src/egon/data/datasets/re_potential_areas/__init__.py @@ -13,6 +13,14 @@ from egon.data.datasets import Dataset import egon.data.config +from egon_validation import( + RowCountValidation, + DataTypeValidation, + NotNullAndNotNaNValidation, + WholeTableNotNullAndNotNaNValidation, + SRIDUniqueNonZero +) + Base = declarative_base() @@ -152,4 +160,93 @@ def __init__(self, dependencies): version=self.version, dependencies=dependencies, tasks=self.tasks, + validation={ + "data-quality": [ + RowCountValidation( + table="supply.egon_re_potential_area_pv_agricultur", + rule_id="ROW_COUNT.egon_re_potential_area_pv_agricultur", + expected_count={"Schleswig-Holstein": 388, "Everything": 8259} + ), + DataTypeValidation( + table="supply.egon_re_potential_area_pv_agricultur", + rule_id="DATA_TYPES.egon_re_potential_area_pv_agricultur", + 
column_types={ + "id": "integer", + "geom": "geometry" + } + ), + NotNullAndNotNaNValidation( + table="supply.egon_re_potential_area_pv_agricultur", + rule_id="NOT_NAN.egon_re_potential_area_pv_agricultur", + columns=["id", + "geom"] + ), + WholeTableNotNullAndNotNaNValidation( + table="supply.egon_re_potential_area_pv_agricultur", + rule_id="WHOLE_TABLE_NOT_NAN.egon_re_potential_area_pv_agricultur" + ), + SRIDUniqueNonZero( + table="supply.egon_re_potential_area_pv_agricultur", + rule_id="SRIDUniqueNonZero.egon_re_potential_area_pv_agricultur.geom", + column="geom" + ), + RowCountValidation( + table="supply.egon_re_potential_area_pv_road_railway", + rule_id="ROW_COUNT.egon_re_potential_area_pv_road_railway", + expected_count={"Schleswig-Holstein": 479, "Everything": 5159} + ), + DataTypeValidation( + table="supply.egon_re_potential_area_pv_road_railway", + rule_id="DATA_TYPES.egon_re_potential_area_pv_road_railway", + column_types={ + "id": "integer", + "geom": "geometry" + } + ), + NotNullAndNotNaNValidation( + table="supply.egon_re_potential_area_pv_road_railway", + rule_id="NOT_NAN.egon_re_potential_area_pv_road_railway", + columns=["id", + "geom"] + ), + WholeTableNotNullAndNotNaNValidation( + table="supply.egon_re_potential_area_pv_road_railway", + rule_id="TABLE_NOT_NAN.egon_re_potential_area_pv_road_railway" + ), + SRIDUniqueNonZero( + table="supply.egon_re_potential_area_pv_road_railway", + rule_id="SRIDUniqueNonZero.egon_re_potential_area_pv_road_railway.geom", + column="geom" + ), + RowCountValidation( + table="supply.egon_re_potential_area_wind", + rule_id="ROW_COUNT.egon_re_potential_area_wind", + expected_count={"Schleswig-Holstein": 6306, "Everything": 120268} + ), + DataTypeValidation( + table="supply.egon_re_potential_area_wind", + rule_id="DATA_TYPES.egon_re_potential_area_wind", + column_types={ + "id": "integer", + "geom": "geometry" + } + ), + NotNullAndNotNaNValidation( + table="supply.egon_re_potential_area_wind", + 
rule_id="NOT_NAN.egon_re_potential_area_wind", + columns=["id", + "geom"] + ), + WholeTableNotNullAndNotNaNValidation( + table="supply.egon_re_potential_area_wind", + rule_id="TABLE_NOT_NAN.egon_re_potential_area_wind" + ), + SRIDUniqueNonZero( + table="supply.egon_re_potential_area_wind", + rule_id="SRIDUniqueNonZero.egon_re_potential_area_wind.geom", + column="geom" + ), + ] + }, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/renewable_feedin.py b/src/egon/data/datasets/renewable_feedin.py index 549c7e073..78a6b7ff7 100644 --- a/src/egon/data/datasets/renewable_feedin.py +++ b/src/egon/data/datasets/renewable_feedin.py @@ -24,6 +24,13 @@ from egon.data.metadata import context, license_ccby, meta_metadata, sources import egon.data.config +from egon_validation import( + RowCountValidation, + DataTypeValidation, + NotNullAndNotNaNValidation, + WholeTableNotNullAndNotNaNValidation, + ValueSetValidation +) class RenewableFeedin(Dataset): """ @@ -64,6 +71,42 @@ def __init__(self, dependencies): wind_offshore, mapping_zensus_weather, }, + validation = { + "data-quality": [ + RowCountValidation( + table="supply.egon_era5_renewable_feedin", + rule_id="ROW_COUNT.egon_era5_renewable_feedin", + expected_count=6102 + ), + DataTypeValidation( + table="supply.egon_era5_renewable_feedin", + rule_id="DATA_MULTIPLE_TYPES.egon_era5_renewable_feedin", + column_types={ + "w_id": "integer", + "weather_year": "integer", + "carrier": "character varying", + "feedin": "double precision[]" + } + ), + NotNullAndNotNaNValidation( + table="supply.egon_era5_renewable_feedin", + rule_id="NOT_NAN.egon_era5_renewable_feedin", + columns=["w_id", "weather_year", "carrier", "feedin"] + ), + WholeTableNotNullAndNotNaNValidation( + table="supply.egon_era5_renewable_feedin", + rule_id="TABLE_NOT_NAN.egon_era5_renewable_feedin" + ), + ValueSetValidation( + table="supply.egon_era5_renewable_feedin", + rule_id="VALUE_SET_VALIDATION_CARRIER.egon_era5_renewable_feedin", + column="carrier", + 
expected_values=["wind_onshore", "solar_thermal", "heat_pump_cop", "wind_offshore", "pv"] + ), + + ] + }, + on_validation_failure = "continue" ) diff --git a/src/egon/data/datasets/scenario_capacities.py b/src/egon/data/datasets/scenario_capacities.py index c810fc2ab..612c002d9 100755 --- a/src/egon/data/datasets/scenario_capacities.py +++ b/src/egon/data/datasets/scenario_capacities.py @@ -24,6 +24,14 @@ sources, ) +from egon_validation import( + RowCountValidation, + DataTypeValidation, + NotNullAndNotNaNValidation, + WholeTableNotNullAndNotNaNValidation, + ValueSetValidation +) + Base = declarative_base() @@ -1051,4 +1059,146 @@ def __init__(self, dependencies): version=self.version, dependencies=dependencies, tasks=tasks, + validation={ + "data-quality": [ + RowCountValidation( + table="supply.egon_nep_2021_conventional_powerplants", + rule_id="ROW_COUNT.egon_nep_2021_conventional_powerplants", + expected_count={"Schleswig-Holstein": 40, "Everything": 737} + ), + DataTypeValidation( + table="supply.egon_nep_2021_conventional_powerplants", + rule_id="DATA_TYPES.egon_nep_2021_conventional_powerplants", + column_types={ + "index": "bigint", + "bnetza_id": "text", + "name": "text", + "name_unit": "text", + "carrier_nep": "text", + "chp": "text", + "postcode": "text", + "city": "text", + "federal_state": "text", + "commissioned": "double precision", + "status": "text", + "capacity": "double precision", + "a2035_chp": "text", + "a2035_capacity": "double precision", + "b2035_chp": "text", + "b2035_capacity": "double precision", + "c2035_chp": "text", + "c2035_capacity": "double precision", + "b2040_chp": "text", + "b2040_capacity": "double precision", + "carrier": "text" + } + ), + NotNullAndNotNaNValidation( + table="supply.egon_nep_2021_conventional_powerplants", + rule_id="NOT_NAN.egon_nep_2021_conventional_powerplants", + columns=[ + "index", + "bnetza_id", + "name", + "name_unit", + "carrier_nep", + "chp", + "postcode", + "city", + "federal_state", + 
"commissioned", + "status", + "capacity", + "a2035_chp", + "a2035_capacity", + "b2035_chp", + "b2035_capacity", + "c2035_chp", + "c2035_capacity", + "b2040_chp", + "b2040_capacity", + "carrier" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="supply.egon_nep_2021_conventional_powerplants", + rule_id="TABLE_NOT_NAN.egon_nep_2021_conventional_powerplants" + ), + RowCountValidation( + table="supply.egon_scenario_capacities", + rule_id="ROW_COUNT.egon_scenario_capacities", + expected_count={"Schleswig-Holstein": 17, "Everything": 236} + ), + DataTypeValidation( + table="supply.egon_scenario_capacities", + rule_id="DATA_TYPES.egon_scenario_capacities", + column_types={ + "index": "integer", + "component": "character varying", + "carrier": "character varying", + "capacity": "double precision", + "nuts": "character varying", + "scenario_name": "character varying" + } + ), + NotNullAndNotNaNValidation( + table="supply.egon_scenario_capacities", + rule_id="NOT_NAN.egon_scenario_capacities", + columns=[ + "index", + "component", + "carrier", + "capacity", + "nuts", + "scenario_name" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="supply.egon_scenario_capacities", + rule_id="TABLE_NOT_NAN.egon_scenario_capacities" + ), + ValueSetValidation( + table="supply.egon_scenario_capacities", + rule_id="VALUE_SET_VALIDATION_CARRIER.egon_scenario_capacities", + column="carrier", + expected_values=["pumped_hydro", + "gas_for_industry", + "gas_for_industry_CC", + "biogas_to_gas", + "Sabatier", + "urban_central_gas_CHP", + "solar", + "reservoir", + "biogas", + "residential_rural_heat_pump", + "urban_central_solar_thermal_collector", + "oil", + "urban_central_resistive_heater", + "wind_offshore", + "battery", + "others", + "gas", + "wind_onshore", + "small_chp", + "Li_ion", + "urban_central_heat_pump", + "urban_central_geo_thermal", + "SMR", + "biomass", + "hydro", + "run_of_river", + "rural_solar_thermal", + "solar_rooftop", + "BEV_charger"] + ), + ValueSetValidation( + 
table="supply.egon_scenario_capacities", + rule_id="VALUE_SET_VALIDATION_SCENARIO_NAME.egon_scenario_capacities", + column="scenario_name", + expected_values=["eGon2035", "eGon100RE"] + ), + + ] + }, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/scenario_parameters/__init__.py b/src/egon/data/datasets/scenario_parameters/__init__.py index ceef011ff..6956dda9e 100755 --- a/src/egon/data/datasets/scenario_parameters/__init__.py +++ b/src/egon/data/datasets/scenario_parameters/__init__.py @@ -17,6 +17,12 @@ import egon.data.config import egon.data.datasets.scenario_parameters.parameters as parameters +from egon_validation import ( + RowCountValidation, + DataTypeValidation, + WholeTableNotNullAndNotNaNValidation +) + Base = declarative_base() @@ -314,4 +320,26 @@ def __init__(self, dependencies): download_pypsa_technology_data, insert_scenarios, ), + validation={ + "data-quality": [ + RowCountValidation( + table="scenario.egon_scenario_parameters", + rule_id="ROW_COUNT.egon_scenario_parameters", + expected_count={"Schleswig-Holstein": 5, "Everything": 3} + ), + DataTypeValidation( + table="scenario.egon_scenario_parameters", + rule_id="DATA_MULTIPLE_TYPES.egon_scenario_parameters", + column_types={ + "name": "character varying", "global_parameters": "jsonb", "electricity_parameters": "jsonb", + "gas_parameters": "jsonb", "heat_parameters": "jsonb", "mobility_parameters": "jsonb", + "description": "character varying"} + ), + WholeTableNotNullAndNotNaNValidation( + table="scenario.egon_scenario_parameters", + rule_id="WHOLE_TABLE_NOT_NAN.egon_scenario_parameters" + ) + ] + }, + on_validation_failure = "continue" ) diff --git a/src/egon/data/datasets/society_prognosis.py b/src/egon/data/datasets/society_prognosis.py index b0a42e96f..d916aa1cf 100755 --- a/src/egon/data/datasets/society_prognosis.py +++ b/src/egon/data/datasets/society_prognosis.py @@ -11,6 +11,13 @@ from egon.data.datasets import Dataset import egon.data.config +from 
egon_validation import( + RowCountValidation, + DataTypeValidation, + NotNullAndNotNaNValidation, + WholeTableNotNullAndNotNaNValidation +) + # will be later imported from another file ### Base = declarative_base() @@ -22,6 +29,50 @@ def __init__(self, dependencies): version="0.0.1", dependencies=dependencies, tasks=(create_tables, {zensus_population, zensus_household}), + validation={ + "data-quality":[ + RowCountValidation( + table="society.egon_household_prognosis", + rule_id="ROW_COUNT.egon_household_prognosis", + expected_count={"Everything": 5319490} + ), + DataTypeValidation( + table="society.egon_household_prognosis", + rule_id="DATA_TYPES.egon_household_prognosis", + column_types={"zensus_population_id": "integer", "year": "integer", "households": "double precision"} + ), + NotNullAndNotNaNValidation( + table="society.egon_household_prognosis", + rule_id="NOT_NAN.egon_household_prognosis", + columns=["zensus_population_id", "year", "households"] + ), + WholeTableNotNullAndNotNaNValidation( + table="society.egon_household_prognosis", + rule_id="TABLE_NOT_NAN.egon_household_prognosis" + ), + RowCountValidation( + table="society.egon_population_prognosis", + rule_id="ROW_COUNT.egon_population_prognosis", + expected_count={"Everything": 6355446} + ), + DataTypeValidation( + table="society.egon_population_prognosis", + rule_id="DATA_TYPES.egon_population_prognosis", + column_types={"zensus_population_id": "integer", "year": "integer", + "population": "double precision"} + ), + NotNullAndNotNaNValidation( + table="society.egon_population_prognosis", + rule_id="NOT_NAN.egon_population_prognosis", + columns=["zensus_population_id", "year", "population"] + ), + WholeTableNotNullAndNotNaNValidation( + table="society.egon_population_prognosis", + rule_id="TABLE_NOT_NAN.egon_population_prognosis" + ), + ] + }, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/storages/__init__.py b/src/egon/data/datasets/storages/__init__.py index 
6ecda8b2c..e3b426779 100755 --- a/src/egon/data/datasets/storages/__init__.py +++ b/src/egon/data/datasets/storages/__init__.py @@ -33,6 +33,16 @@ ) from egon.data.db import session_scope +from egon_validation import( + RowCountValidation, + DataTypeValidation, + NotNullAndNotNaNValidation, + WholeTableNotNullAndNotNaNValidation, + ValueSetValidation, + SRIDUniqueNonZero +) +from egon.data.validation.rules.custom.sanity.home_batteries import HomeBatteriesAggregation + Base = declarative_base() @@ -99,6 +109,77 @@ def __init__(self, dependencies): allocate_pv_home_batteries_to_grids, allocate_home_batteries_to_buildings, ), + validation={ + "sanity_home_batteries_aggregation": [ + HomeBatteriesAggregation( + table="supply.egon_home_batteries", + rule_id="SANITY_HOME_BATTERIES_AGGREGATION_EGON2035", + scenario="eGon2035" + ), + HomeBatteriesAggregation( + table="supply.egon_home_batteries", + rule_id="SANITY_HOME_BATTERIES_AGGREGATION_EGON100RE", + scenario="eGon100RE" + ), + RowCountValidation( + table="supply.egon_storages", + rule_id="ROW_COUNT.egon_storages", + expected_count={"Schleswig-Holstein": 290, "Everything": 7748} + ), + DataTypeValidation( + table="supply.egon_storages", + rule_id="DATA_TYPES.egon_storages", + column_types={ + "id": "bigint", + "sources": "jsonb", + "source_id": "jsonb", + "carrier": "character varying", + "el_capacity": "double precision", + "bus_id": "integer", + "voltage_level": "integer", + "scenario": "character varying", + "geom": "geometry" + } + ), + NotNullAndNotNaNValidation( + table="supply.egon_storages", + rule_id="NOT_NAN.egon_storages", + columns=[ + "id", + "sources", + "source_id", + "carrier", + "el_capacity", + "bus_id", + "voltage_level", + "scenario", + "geom" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="supply.egon_storages", + rule_id="TABLE_NOT_NAN.egon_storages" + ), + ValueSetValidation( + table="supply.egon_storages", + rule_id="VALUE_SET_VALIDATION_SCENARIO.egon_storages", + column="scenario", + 
expected_values=["eGon2035", "eGon100RE"] + ), + ValueSetValidation( + table="supply.egon_storages", + rule_id="VALUE_SET_VALIDATION_CARRIER.egon_storages", + column="carrier", + expected_values=["home_battery", "pumped_hydro"] + ), + SRIDUniqueNonZero( + table="supply.egon_storages", + rule_id="SRIDUniqueNonZero.egon_storages.geom", + column="geom" + ), + ] + }, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/substation/__init__.py b/src/egon/data/datasets/substation/__init__.py index 7e792eee7..3144ff174 100644 --- a/src/egon/data/datasets/substation/__init__.py +++ b/src/egon/data/datasets/substation/__init__.py @@ -12,6 +12,16 @@ from egon.data.datasets import Dataset import egon.data.config +# Uncomment to add validation rules: +# from egon_validation import ( +# RowCountValidation, +# DataTypeValidation, +# NotNullAndNotNaNValidation, +# WholeTableNotNullAndNotNaNValidation, +# ValueSetValidation, +# SRIDUniqueNonZero, +# ) + Base = declarative_base() @@ -86,6 +96,18 @@ def __init__(self, dependencies): }, transfer_busses, ), + # Validation placeholder - add rules here. See vg250/__init__.py + # for examples of RowCountValidation, DataTypeValidation, etc. + validation={ + # "": [ + # RowCountValidation( + # table=".", + # rule_id="ROW_COUNT.", + # expected_count={"Schleswig-Holstein": X, "Everything": Y} + # ), + # ] + }, + on_validation_failure="continue", ) diff --git a/src/egon/data/datasets/validation_report.py b/src/egon/data/datasets/validation_report.py new file mode 100644 index 000000000..5efe19b54 --- /dev/null +++ b/src/egon/data/datasets/validation_report.py @@ -0,0 +1,141 @@ +""" +Dataset for generating validation reports during pipeline execution. + +This module provides the ValidationReport dataset which generates +comprehensive validation reports by aggregating all validation results +from individual dataset validation tasks executed during the pipeline run. 
+""" + +import os + import time + + from egon.data import logger, db as egon_db + from egon.data.datasets import Dataset + from egon_validation import RunContext + from egon_validation.runner.aggregate import ( + collect, build_coverage, write_outputs + ) + from egon_validation.report.generate import generate + from egon_validation.runner.coverage_analysis import discover_total_tables + from egon_validation.config import ENV_DB_URL + import os as _os + + + def generate_validation_report(**kwargs): + """ + Generate validation report aggregating all validation results. + + This function collects all validation results from individual dataset + validation tasks that were executed during the pipeline run and generates + a comprehensive HTML report including: + - All validation results from individual dataset tasks + - Coverage analysis showing which tables were validated + - Summary statistics and pass/fail counts + + Parameters + ---------- + **kwargs + Airflow task context. ``run_id``, ``ti`` and ``dag_run`` are + consulted as fallbacks when resolving the validation run id. + """ + # Use same run_id as other validation tasks in the pipeline + # This ensures all tasks read/write to the same directory + dag_run = kwargs.get('dag_run') + ti = kwargs.get('ti') + run_id = ( + os.environ.get('AIRFLOW_CTX_DAG_RUN_ID') or + kwargs.get('run_id') or + (ti and hasattr(ti, 'dag_run') and ti.dag_run.run_id) or + (dag_run and dag_run.run_id) or + f"pipeline_validation_report_{int(time.time())}" + ) + + # Determine output directory at runtime (not import time) + # Priority: EGON_VALIDATION_DIR env var > current working directory + out_dir = os.path.join( + os.environ.get('EGON_VALIDATION_DIR', os.getcwd()), + "validation_runs" + ) + + try: + ctx = RunContext(run_id=run_id, source="airflow", out_dir=out_dir) + logger.info("Starting pipeline validation report generation", extra={ + "run_id": run_id, + "output_dir": out_dir + }) + + # Make database connection available for table counting + # Set the database URL from egon.data configuration + try: + # Get the database URL from egon.data. Newer SQLAlchemy releases + # mask the password in str(URL), so render it explicitly. + url = egon_db.engine().url + render = getattr(url, "render_as_string", None) + db_url = render(hide_password=False) if render else str(url) + # Set env var so 
discover_total_tables can use it + _os.environ[ENV_DB_URL] = db_url + logger.info("Database connection available for table counting") + except Exception as e: + logger.warning( + f"Could not set database URL for table counting: {e}" + ) + + # Collect all validation results from existing validation runs + collected = collect(ctx) + coverage = build_coverage(ctx, collected) + final_out_dir = write_outputs(ctx, collected, coverage) + generate(ctx) + + report_path = os.path.join(final_out_dir, 'report.html') + logger.info( + "Pipeline validation report generated successfully", + extra={ + "report_path": report_path, + "run_id": run_id, + "total_results": len(collected.get("items", [])) + } + ) + + except FileNotFoundError as e: + logger.warning( + f"No validation results found for pipeline validation report | " + f"run_id={run_id} | out_dir={out_dir} | error={e} | " + f"suggestion=This may be expected if no validation tasks ran" + ) + + # Don't raise - this is acceptable if no validations were run + except Exception as e: + logger.error("Pipeline validation report generation failed", extra={ + "run_id": run_id, + "error": str(e), + "error_type": type(e).__name__ + }) + raise + + + # Define the task +tasks = (generate_validation_report,) + + + class ValidationReport(Dataset): + """ + Dataset for generating validation reports. + + This dataset generates a comprehensive HTML validation report by + aggregating all validation results from individual dataset validation + tasks that were executed during the pipeline run. It should be placed + before sanity_checks in the DAG to ensure validation results are + collected before final checks. 
+ """ + #: + name: str = "ValidationReport" + #: + version: str = "0.0.2.dev" + + def __init__(self, dependencies): + super().__init__( + name=self.name, + version=self.version, + dependencies=dependencies, + tasks=tasks, + ) diff --git a/src/egon/data/datasets/vg250/__init__.py b/src/egon/data/datasets/vg250/__init__.py index 378f86895..a58ce5b1e 100644 --- a/src/egon/data/datasets/vg250/__init__.py +++ b/src/egon/data/datasets/vg250/__init__.py @@ -29,6 +29,15 @@ meta_metadata, ) import egon.data.config +from egon.data.validation import TableValidation, resolve_boundary_dependence +from egon_validation import ( + RowCountValidation, + DataTypeValidation, + NotNullAndNotNaNValidation, + WholeTableNotNullAndNotNaNValidation, + ValueSetValidation, + SRIDUniqueNonZero +) def download_files(): @@ -529,4 +538,164 @@ def __init__(self, dependencies): add_metadata, cleaning_and_preperation, ), + validation={ + "data_quality": [ + TableValidation( + table_name="boundaries.vg250_krs", + row_count=resolve_boundary_dependence({"Schleswig-Holstein": 27, "Everything": 537}), + geometry_columns=["geometry"], + data_type_columns={"Schleswig-Holstein":{"id":"bigint","ade":"integer", "gf":"integer", "bsg":"integer","ars":"text", + "ags":"text", "sdv_ars":"text", "gen":"text", "bez":"text","ibz":"integer", + "bem":"text", "nbd":"text", "sn_l":"text", "sn_r":"text", "sn_k":"text", + "sn_v1":"text", "sn_v2":"text", "sn_g":"text", "fk_s3":"text", "nuts":"text", + "ars_0":"text", "ags_0":"text", "wsk":"timestamp without time zone", "debkg_id":"text", "rs":"text", + "sdv_rs":"text", "rs_0":"text", "geometry":"geometry"}, + "Everything":{"id":"bigint","ade":"bigint", "gf":"bigint", "bsg":"bigint","ars":"text", + "ags":"text", "sdv_ars":"text", "gen":"text", "bez":"text","ibz":"bigint", + "bem":"text", "nbd":"text", "sn_l":"text", "sn_r":"text", "sn_k":"text", + "sn_v1":"text", "sn_v2":"text", "sn_g":"text", "fk_s3":"text", "nuts":"text", + "ars_0":"text", 
"ags_0":"text", "wsk":"text", "debkg_id":"text", "rs":"text", + "sdv_rs":"text", "rs_0":"text", "geometry":"geometry"} + }, + not_null_columns=["gf", "bsg"], + value_set_columns={"nbd": ["ja", "nein"]}, + ), + RowCountValidation( + table="boundaries.vg250_krs", + rule_id="ROW_COUNT.vg250_krs", + expected_count=resolve_boundary_dependence({"Schleswig-Holstein":27, "Everything":431}) + ), + DataTypeValidation( + table="boundaries.vg250_krs", + rule_id="DATA_TYPES.vg250_krs", + column_types={ + "Schleswig-Holstein": { + "id": "bigint", + "ade": "integer", + "gf": "integer", + "bsg": "integer", + "ars": "text", + "ags": "text", + "sdv_ars": "text", + "gen": "text", + "bez": "text", + "ibz": "integer", + "bem": "text", + "nbd": "text", + "sn_l": "text", + "sn_r": "text", + "sn_k": "text", + "sn_v1": "text", + "sn_v2": "text", + "sn_g": "text", + "fk_s3": "text", + "nuts": "text", + "ars_0": "text", + "ags_0": "text", + "wsk": "timestamp without time zone", + "debkg_id": "text", + "rs": "text", + "sdv_rs": "text", + "rs_0": "text", + "geometry": "geometry" + }, + "Everything": { + "id": "bigint", + "ade": "bigint", + "gf": "bigint", + "bsg": "bigint", + "ars": "text", + "ags": "text", + "sdv_ars": "text", + "gen": "text", + "bez": "text", + "ibz": "bigint", + "bem": "text", + "nbd": "text", + "sn_l": "text", + "sn_r": "text", + "sn_k": "text", + "sn_v1": "text", + "sn_v2": "text", + "sn_g": "text", + "fk_s3": "text", + "nuts": "text", + "ars_0": "text", + "ags_0": "text", + "wsk": "text", + "debkg_id": "text", + "rs": "text", + "sdv_rs": "text", + "rs_0": "text", + "geometry": "geometry" + } + } + ), + NotNullAndNotNaNValidation( + table="boundaries.vg250_krs", + rule_id="NOT_NAN.vg250_krs", + columns=["gf", "bsg"] + ), + WholeTableNotNullAndNotNaNValidation( + table="boundaries.vg250_krs", + rule_id="TABLE_NOT_NAN.vg250_krs" + ), + SRIDUniqueNonZero( + table="boundaries.vg250_krs", + rule_id="SRIDUniqueNonZero.vg250_krs.geometry", + column="geometry" + ), + 
ValueSetValidation( + table="boundaries.vg250_krs", + rule_id="VALUE_SET_NBD.vg250_krs", + column="nbd", + expected_values=["ja", "nein"] + ), + RowCountValidation( + table="society.destatis_zensus_population_per_ha_inside_germany", + rule_id="ROW_COUNT.destatis_zensus_population_per_ha_inside_germany", + expected_count={ + "Schleswig-Holstein": 143521, + "Everything": 3177723 + } + ), + DataTypeValidation( + table="society.destatis_zensus_population_per_ha_inside_germany", + rule_id="DATA_TYPES.destatis_zensus_population_per_ha_inside_germany", + column_types={ + "id": "integer", + "grid_id": "character varying (254)", + "population": "smallint", + "geom_point": "geometry", + "geom": "geometry" + } + ), + NotNullAndNotNaNValidation( + table="society.destatis_zensus_population_per_ha_inside_germany", + rule_id="NOT_NAN.destatis_zensus_population_per_ha_inside_germany", + columns=[ + "id", + "grid_id", + "population", + "geom_point", + "geom" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="society.destatis_zensus_population_per_ha_inside_germany", + rule_id="TABLE_NOT_NAN.destatis_zensus_population_per_ha_inside_germany" + ), + SRIDUniqueNonZero( + table="society.destatis_zensus_population_per_ha_inside_germany", + rule_id="SRIDUniqueNonZero.destatis_zensus_population_per_ha_inside_germany.geom_point", + column="geom_point" + ), + SRIDUniqueNonZero( + table="society.destatis_zensus_population_per_ha_inside_germany", + rule_id="SRIDUniqueNonZero.destatis_zensus_population_per_ha_inside_germany.geom", + column="geom" + ), + ] + }, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/zensus/__init__.py b/src/egon/data/datasets/zensus/__init__.py index 3d498a12b..97147d95c 100755 --- a/src/egon/data/datasets/zensus/__init__.py +++ b/src/egon/data/datasets/zensus/__init__.py @@ -17,6 +17,14 @@ from egon.data.datasets import Dataset import egon.data.config +from egon_validation import ( + RowCountValidation, + DataTypeValidation, + 
NotNullAndNotNaNValidation, + WholeTableNotNullAndNotNaNValidation, + SRIDUniqueNonZero +) + class ZensusPopulation(Dataset): def __init__(self, dependencies): @@ -28,6 +36,43 @@ def __init__(self, dependencies): create_zensus_pop_table, population_to_postgres, ), + validation={ + "data-quality": [ + RowCountValidation( + table="society.egon_destatis_zensus_apartment_building_population_per_ha", + rule_id="ROW_COUNT.egon_destatis_zensus_apartment_building_population_per_ha", + expected_count={"Schleswig-Holstein": 145634, "Everything": 3206490} + ), + DataTypeValidation( + table="society.egon_destatis_zensus_apartment_building_population_per_ha", + rule_id="DATA_TYPES.egon_destatis_zensus_apartment_building_population_per_ha", + column_types={ + "grid_id": "character varying", "zensus_population_id": "integer", "building_count": "smallint", + "apartment_count": "smallint", "geom": "geometry", "geom_point": "geometry" + } + ), + NotNullAndNotNaNValidation( + table="society.egon_destatis_zensus_apartment_building_population_per_ha", + rule_id="NOT_NAN.egon_destatis_zensus_apartment_building_population_per_ha", + columns=["grid_id", "zensus_population_id", "building_count", "apartment_count", "geom", "geom_point"] + ), + WholeTableNotNullAndNotNaNValidation( + table="society.egon_destatis_zensus_apartment_building_population_per_ha", + rule_id="TABLE_NOT_NAN.egon_destatis_zensus_apartment_building_population_per_ha" + ), + SRIDUniqueNonZero( + table="society.egon_destatis_zensus_apartment_building_population_per_ha", + rule_id="SRIDUniqueNonZero.egon_destatis_zensus_apartment_building_population_per_ha.geom", + column="geom" + ), + SRIDUniqueNonZero( + table="society.egon_destatis_zensus_apartment_building_population_per_ha", + rule_id="SRIDUniqueNonZero.egon_destatis_zensus_apartment_building_population_per_ha.geom_point", + column="geom_point" + ), + ] + }, + on_validation_failure="continue" ) @@ -41,6 +86,156 @@ def __init__(self, dependencies): 
create_zensus_misc_tables, zensus_misc_to_postgres, ), + validation={ + "data-quality": [ + + RowCountValidation( + table="society.egon_destatis_zensus_apartment_per_ha", + rule_id="ROW_COUNT.egon_destatis_zensus_apartment_per_ha", + expected_count={"Schleswig-Holstein": 1946300, "Everything": 51095280} + ), + DataTypeValidation( + table="society.egon_destatis_zensus_apartment_per_ha", + rule_id="DATA_TYPES.egon_destatis_zensus_apartment_per_ha", + column_types={ + "id": "integer", "grid_id": "character varying", "grid_id_new": "character varying", + "attribute": "character varying", "characteristics_code": "smallint", + "characteristics_text": "text", "quantity": "smallint", "quantity_q": "smallint", + "zensus_population_id": "integer" + } + ), + NotNullAndNotNaNValidation( + table="society.egon_destatis_zensus_apartment_per_ha", + rule_id="NOT_NAN.egon_destatis_zensus_apartment_per_ha", + columns=[ + "id", "grid_id", "grid_id_new", "attribute", "characteristics_code", "characteristics_text", + "quantity", "quantity_q", "zensus_population_id" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="society.egon_destatis_zensus_apartment_per_ha", + rule_id="TABLE_NOT_NAN.egon_destatis_zensus_apartment_per_ha" + ), + RowCountValidation( + table="society.egon_destatis_zensus_building_per_ha", + rule_id="ROW_COUNT.egon_destatis_zensus_building_per_ha", + expected_count={"Schleswig-Holstein": 978493, "Everything": 24297136} + ), + DataTypeValidation( + table="society.egon_destatis_zensus_building_per_ha", + rule_id="DATA_TYPES.egon_destatis_zensus_building_per_ha", + column_types={ + "id": "integer", + "grid_id": "character varying", + "grid_id_new": "character varying", + "attribute": "character varying", + "characteristics_code": "smallint", + "characteristics_text": "text", + "quantity": "smallint", + "quantity_q": "smallint", + "zensus_population_id": "integer" + } + ), + NotNullAndNotNaNValidation( + table="society.egon_destatis_zensus_building_per_ha", + 
rule_id="NOT_NAN.egon_destatis_zensus_building_per_ha", + columns=[ + "id", + "grid_id", + "grid_id_new", + "attribute", + "characteristics_code", + "characteristics_text", + "quantity", + "quantity_q", + "zensus_population_id" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="society.egon_destatis_zensus_building_per_ha", + rule_id="TABLE_NOT_NAN.egon_destatis_zensus_building_per_ha" + ), + RowCountValidation( + table="society.egon_destatis_zensus_household_per_ha", + rule_id="ROW_COUNT.egon_destatis_zensus_household_per_ha", + expected_count={"Schleswig-Holstein": 724970, "Everything": 18788917} + ), + DataTypeValidation( + table="society.egon_destatis_zensus_household_per_ha", + rule_id="DATA_TYPES.egon_destatis_zensus_household_per_ha", + column_types={ + "id": "integer", + "grid_id": "character varying", + "grid_id_new": "character varying", + "attribute": "character varying", + "characteristics_code": "smallint", + "characteristics_text": "text", + "quantity": "smallint", + "quantity_q": "smallint", + "zensus_population_id": "integer" + } + ), + NotNullAndNotNaNValidation( + table="society.egon_destatis_zensus_household_per_ha", + rule_id="NOT_NAN.egon_destatis_zensus_household_per_ha", + columns=[ + "id", + "grid_id", + "grid_id_new", + "attribute", + "characteristics_code", + "characteristics_text", + "quantity", + "quantity_q", + "zensus_population_id" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="society.egon_destatis_zensus_household_per_ha", + rule_id="TABLE_NOT_NAN.egon_destatis_zensus_household_per_ha" + ), + RowCountValidation( + table="society.egon_destatis_zensus_household_per_ha_refined", + rule_id="ROW_COUNT.egon_destatis_zensus_household_per_ha_refined", + expected_count={"Schleswig-Holstein": 551678, "Everything": 13304814} + ), + DataTypeValidation( + table="society.egon_destatis_zensus_household_per_ha_refined", + rule_id="DATA_TYPES.egon_destatis_zensus_household_per_ha_refined", + column_types={ + "id": "integer", + 
"cell_id": "integer", + "grid_id": "character varying", + "nuts3": "character varying", + "nuts1": "character varying", + "characteristics_code": "integer", + "hh_5types": "integer", + "hh_type": "character", + "hh_10types": "integer" + } + ), + NotNullAndNotNaNValidation( + table="society.egon_destatis_zensus_household_per_ha_refined", + rule_id="NOT_NAN.egon_destatis_zensus_household_per_ha_refined", + columns=[ + "id", + "cell_id", + "grid_id", + "nuts3", + "nuts1", + "characteristics_code", + "hh_5types", + "hh_type", + "hh_10types" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="society.egon_destatis_zensus_household_per_ha_refined", + rule_id="TABLE_NOT_NAN.egon_destatis_zensus_household_per_ha_refined" + ), + ] + }, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/zensus_mv_grid_districts.py b/src/egon/data/datasets/zensus_mv_grid_districts.py index ad2b36673..fe64bce60 100644 --- a/src/egon/data/datasets/zensus_mv_grid_districts.py +++ b/src/egon/data/datasets/zensus_mv_grid_districts.py @@ -11,6 +11,11 @@ from egon.data.datasets.mv_grid_districts import MvGridDistricts from egon.data.datasets.zensus_vg250 import DestatisZensusPopulationPerHa import egon.data.config +from egon_validation import ( + RowCountValidation, + DataTypeValidation, + WholeTableNotNullAndNotNaNValidation +) class ZensusMvGridDistricts(Dataset): @@ -38,6 +43,25 @@ def __init__(self, dependencies): version=self.version, dependencies=dependencies, tasks=(mapping), + validation={ + "data_quality": [ + RowCountValidation( + table="boundaries.egon_map_zensus_grid_districts", + rule_id="ROW_COUNT.egon_map_zensus_grid_districts", + expected_count={"Schleswig-Holstein": 7519, "Everything": 35718586} + ), + DataTypeValidation( + table="boundaries.egon_map_zensus_grid_districts", + rule_id="DATA_MULTIPLE_TYPES.egon_map_zensus_grid_districts", + column_types={"index": "bigint", "zensus_population_id": "bigint", "bus_id": "bigint"} + ), + 
WholeTableNotNullAndNotNaNValidation( + table="boundaries.egon_map_zensus_grid_districts", + rule_id="WHOLE_TABLE_NOT_NAN.egon_map_zensus_grid_districts" + ), + ] + }, + on_validation_failure="continue" ) diff --git a/src/egon/data/validation/__init__.py b/src/egon/data/validation/__init__.py new file mode 100644 index 000000000..9c6c482ac --- /dev/null +++ b/src/egon/data/validation/__init__.py @@ -0,0 +1,46 @@ +""" +Validation framework for egon-data. + +Supports two configuration styles (can be mixed): + +1) "rule-first": + validation_dict = {"task_name": [Rule(...), Rule(...)]} + +2) "table-first": + validation_dict = {"task_name": [TableValidation(...), ...]} +""" + +from .resolver import ( + BoundaryDependent, + resolve_boundary_dependence, + resolve_value, +) +from .specs import ( + TableValidation, + ValidationSpec, + clone_rule, + expand_specs, + prepare_rules, + resolve_rule_params, +) +from .airflow import ( + create_validation_tasks, + run_validation_task, +) + +__all__ = [ + # resolver + "BoundaryDependent", + "resolve_boundary_dependence", + "resolve_value", + # specs + "TableValidation", + "ValidationSpec", + "clone_rule", + "expand_specs", + "prepare_rules", + "resolve_rule_params", + # airflow + "create_validation_tasks", + "run_validation_task", +] diff --git a/src/egon/data/validation/airflow.py b/src/egon/data/validation/airflow.py new file mode 100644 index 000000000..cca86ad99 --- /dev/null +++ b/src/egon/data/validation/airflow.py @@ -0,0 +1,149 @@ +"""Airflow integration for validation tasks.""" + +from __future__ import annotations + +import logging +from functools import partial +import re +from typing import Any, Dict, List, Sequence + +from airflow.operators.python import PythonOperator +from egon_validation import RunContext, run_validations + +from .specs import ValidationSpec, prepare_rules + +logger = logging.getLogger(__name__) + + +def run_validation_task( + *, + specs: Sequence[ValidationSpec], + task_name: str, + dataset_name: 
str, + on_failure: str, + **context: Any, +) -> Dict[str, int]: + """ + This is the function Airflow actually calls. + + It's top-level (not nested), so: + - easier to test + - easier stack traces + - fewer closure surprises + """ + import os + import time + from datetime import datetime + from egon.data import db as egon_db + from egon.data.config import settings + + # Consistent run_id across tasks so reports can correlate results + run_id = ( + os.environ.get("AIRFLOW_CTX_DAG_RUN_ID") + or context.get("run_id") + or ( + context.get("ti") + and hasattr(context["ti"], "dag_run") + and context["ti"].dag_run.run_id + ) + or (context.get("dag_run") and context["dag_run"].run_id) + or f"airflow-{dataset_name}-{task_name}-{int(time.time())}" + ) + + out_dir = os.path.join( + os.environ.get("EGON_VALIDATION_DIR", os.getcwd()), + "validation_runs", + ) + + execution_date = context.get("execution_date") or datetime.now() + timestamp = execution_date.strftime("%Y%m%dT%H%M%S") + full_task_name = f"{dataset_name}.{task_name}.{timestamp}" + + logger.info("Validation: %s (run_id: %s)", full_task_name, run_id) + + engine = egon_db.engine() + + config = settings()["egon-data"] + boundary = config["--dataset-boundary"] + logger.info("Resolving validation parameters for boundary='%s'", boundary) + + rules = prepare_rules( + specs=specs, + boundary=boundary, + dataset_name=dataset_name, + task_name=task_name, + ) + + ctx = RunContext(run_id=run_id, source="airflow", out_dir=out_dir) + results = run_validations(engine, ctx, rules, full_task_name) + + total = len(results) + failed = sum(1 for r in results if not r.success) + + logger.info("Complete: %s/%s passed", total - failed, total) + + if failed > 0 and on_failure == "fail": + raise Exception(f"{failed}/{total} validations failed") + + return {"total": total, "passed": total - failed, "failed": failed} + + +def create_validation_tasks( + validation_dict: Dict[str, Sequence[ValidationSpec]], + dataset_name: str, + on_failure: str 
= "continue", +) -> List[PythonOperator]: + """ + Creates one PythonOperator per task_name in validation_dict. + + - values can still be List[Rule] + - values can be List[TableValidation] + + Mixed lists also work. + """ + if not validation_dict: + return [] + + tasks: List[PythonOperator] = [] + + safe_dataset = sanitize_airflow_key(dataset_name) + + for task_name, specs in validation_dict.items(): + callable_for_airflow = partial( + run_validation_task, + specs=specs, + task_name=task_name, + dataset_name=dataset_name, + on_failure=on_failure, + ) + + tasks.append( + PythonOperator( + task_id=f"{safe_dataset}.validate.{task_name}", + python_callable=callable_for_airflow, + provide_context=True, + ) + ) + + return tasks + + +def sanitize_airflow_key(value: str) -> str: + """ + Airflow task_id/key must match: [A-Za-z0-9_.-]+ + Replace everything else with underscores. + """ + # 1) strip outer whitespace + v = value.strip() + + # 2) replace any run of invalid characters (including spaces) with "_" + v = re.sub(r"[^A-Za-z0-9_.-]+", "_", v) + + # 3) collapse multiple underscores + v = re.sub(r"_+", "_", v) + + # 4) avoid leading/trailing separators that can look ugly / confusing + v = v.strip("._-") + + # 5) don't return empty + return v or "unnamed" diff --git a/src/egon/data/validation/resolver.py b/src/egon/data/validation/resolver.py new file mode 100644 index 000000000..327b5ee61 --- /dev/null +++ b/src/egon/data/validation/resolver.py @@ -0,0 +1,60 @@ +"""Boundary resolution helpers for validation parameters.""" + +from __future__ import annotations + +import logging +from dataclasses import dataclass +from typing import Any, Dict + +logger = logging.getLogger(__name__) + + +@dataclass(frozen=True, slots=True) +class BoundaryDependent: + """ + Wrapper for values that vary by boundary. + + E.g. Schleswig-Holstein vs Everything. At validation runtime, the + appropriate value is selected based on the current boundary setting. 
+ """ + values: Dict[str, Any] + + def resolve(self, boundary: str) -> Any: + """Return value for given boundary, or the whole dict if not found.""" + if boundary in self.values: + logger.debug( + "Resolved boundary-dependent value: %s -> %s", + boundary, self.values[boundary] + ) + return self.values[boundary] + return self.values + + +def resolve_boundary_dependence( + boundary_dict: Dict[str, Any] +) -> BoundaryDependent: + """ + Wrap a boundary-dependent dict for deferred resolution. + + At validation runtime, the appropriate value is selected based on the + current boundary setting. + + Example: + expected_count=resolve_boundary_dependence( + {"Schleswig-Holstein": 27, "Everything": 431} + ) + """ + return BoundaryDependent(boundary_dict) + + +def resolve_value(value: Any, boundary: str) -> Any: + """ + Resolve boundary-dependent values. + + If value is a BoundaryDependent, resolve it using the current boundary. + Otherwise return value unchanged. + """ + if isinstance(value, BoundaryDependent): + return value.resolve(boundary) + + return value diff --git a/src/egon/data/validation/rules/custom/__init__.py b/src/egon/data/validation/rules/custom/__init__.py new file mode 100644 index 000000000..4f07cd008 --- /dev/null +++ b/src/egon/data/validation/rules/custom/__init__.py @@ -0,0 +1,15 @@ +"""Custom validation rules for eGon data.""" + +from .sanity import ( + ResidentialElectricityAnnualSum, + ResidentialElectricityHhRefinement, + CtsElectricityDemandShare, + CtsHeatDemandShare, +) + +__all__ = [ + "ResidentialElectricityAnnualSum", + "ResidentialElectricityHhRefinement", + "CtsElectricityDemandShare", + "CtsHeatDemandShare", +] diff --git a/src/egon/data/validation/rules/custom/sanity/__init__.py b/src/egon/data/validation/rules/custom/sanity/__init__.py new file mode 100644 index 000000000..27cf5f960 --- /dev/null +++ b/src/egon/data/validation/rules/custom/sanity/__init__.py @@ -0,0 +1,57 @@ +"""Sanity check validation rules for eGon data quality.""" + 
+from .residential_electricity import ( + ResidentialElectricityAnnualSum, + ResidentialElectricityHhRefinement, +) +from .cts_demand import ( + CtsElectricityDemandShare, + CtsHeatDemandShare, +) +from .home_batteries import ( + HomeBatteriesAggregation, +) +from .gas_stores import ( + CH4StoresCapacity, + H2SaltcavernStoresCapacity, +) +from .gas_grid import ( + GasBusesIsolated, + GasBusesCount, + GasOnePortConnections, + CH4GridCapacity, + GasLinksConnections, +) +from .gas_loads_generators import ( + GasLoadsCapacity, + GasGeneratorsCapacity, +) +from .electricity_capacity import ( + ElectricityCapacityComparison, +) +from .heat_demand import ( + HeatDemandValidation, +) +from .electrical_load_sectors import ( + ElectricalLoadSectorBreakdown, +) + +__all__ = [ + "ResidentialElectricityAnnualSum", + "ResidentialElectricityHhRefinement", + "CtsElectricityDemandShare", + "CtsHeatDemandShare", + "HomeBatteriesAggregation", + "CH4StoresCapacity", + "H2SaltcavernStoresCapacity", + "GasBusesIsolated", + "GasBusesCount", + "GasOnePortConnections", + "CH4GridCapacity", + "GasLinksConnections", + "GasLoadsCapacity", + "GasGeneratorsCapacity", + "ElectricityCapacityComparison", + "HeatDemandValidation", + "ElectricalLoadSectorBreakdown", +] diff --git a/src/egon/data/validation/rules/custom/sanity/cts_demand.py b/src/egon/data/validation/rules/custom/sanity/cts_demand.py new file mode 100644 index 000000000..5dbf16526 --- /dev/null +++ b/src/egon/data/validation/rules/custom/sanity/cts_demand.py @@ -0,0 +1,170 @@ +"""CTS (Commercial, Trade, Services) demand sanity check validation rules.""" + +from egon_validation.rules.base import DataFrameRule, RuleResult, Severity +import numpy as np + + +class CtsElectricityDemandShare(DataFrameRule): + """Validate CTS electricity demand shares sum to 1 for each substation. 
+ + Checks that the sum of aggregated CTS electricity demand share equals 1 + for every substation, as the substation profile is linearly disaggregated + to all buildings. + + Args: + table: Primary table being validated (demand.egon_cts_electricity_demand_building_share) + rule_id: Unique identifier for this validation rule + rtol: Relative tolerance for comparison (default: 0.005 = 0.5%) + + Example: + >>> validation = { + ... "data_quality": [ + ... CtsElectricityDemandShare( + ... table="demand.egon_cts_electricity_demand_building_share", + ... rule_id="SANITY_CTS_ELECTRICITY_DEMAND_SHARE", + ... rtol=0.005 + ... ) + ... ] + ... } + """ + + def __init__(self, table: str, rule_id: str, rtol: float = 0.005, **kwargs): + super().__init__(rule_id=rule_id, table=table, rtol=rtol, **kwargs) + self.kind = "sanity" + + def get_query(self, ctx): + return """ + SELECT bus_id, scenario, SUM(profile_share) as total_share + FROM demand.egon_cts_electricity_demand_building_share + GROUP BY bus_id, scenario + """ + + def evaluate_df(self, df, ctx): + rtol = self.params.get("rtol", 0.005) + + try: + # Check that all shares sum to 1 (within tolerance) + np.testing.assert_allclose( + actual=df["total_share"], + desired=1.0, + rtol=rtol, + verbose=False, + ) + + # Calculate actual max deviation for reporting + max_diff = (df["total_share"] - 1.0).abs().max() + + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=True, + observed=float(max_diff), + expected=rtol, + message=f"CTS electricity demand shares sum to 1 for all {len(df)} bus/scenario combinations (max deviation: {max_diff:.6f}, tolerance: {rtol:.6f})", + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + except AssertionError: + max_diff = (df["total_share"] - 1.0).abs().max() + violations = df[~np.isclose(df["total_share"], 1.0, rtol=rtol)] + + return RuleResult( + rule_id=self.rule_id, + task=self.task, + 
table=self.table, + kind=self.kind, + success=False, + observed=float(max_diff), + expected=rtol, + message=f"CTS electricity demand share mismatch: max deviation {max_diff:.6f} exceeds tolerance {rtol:.6f}. {len(violations)} bus/scenario combinations have shares != 1.", + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + +class CtsHeatDemandShare(DataFrameRule): + """Validate CTS heat demand shares sum to 1 for each substation. + + Checks that the sum of aggregated CTS heat demand share equals 1 + for every substation, as the substation profile is linearly disaggregated + to all buildings. + + Args: + table: Primary table being validated (demand.egon_cts_heat_demand_building_share) + rule_id: Unique identifier for this validation rule + rtol: Relative tolerance for comparison (default: 0.005 = 0.5%) + + Example: + >>> validation = { + ... "data_quality": [ + ... CtsHeatDemandShare( + ... table="demand.egon_cts_heat_demand_building_share", + ... rule_id="SANITY_CTS_HEAT_DEMAND_SHARE", + ... rtol=0.005 + ... ) + ... ] + ... 
} + """ + + def __init__(self, table: str, rule_id: str, rtol: float = 0.005, **kwargs): + super().__init__(rule_id=rule_id, table=table, rtol=rtol, **kwargs) + self.kind = "sanity" + + def get_query(self, ctx): + return """ + SELECT bus_id, scenario, SUM(profile_share) as total_share + FROM demand.egon_cts_heat_demand_building_share + GROUP BY bus_id, scenario + """ + + def evaluate_df(self, df, ctx): + rtol = self.params.get("rtol", 0.005) + + try: + # Check that all shares sum to 1 (within tolerance) + np.testing.assert_allclose( + actual=df["total_share"], + desired=1.0, + rtol=rtol, + verbose=False, + ) + + # Calculate actual max deviation for reporting + max_diff = (df["total_share"] - 1.0).abs().max() + + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=True, + observed=float(max_diff), + expected=rtol, + message=f"CTS heat demand shares sum to 1 for all {len(df)} bus/scenario combinations (max deviation: {max_diff:.6f}, tolerance: {rtol:.6f})", + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + except AssertionError: + max_diff = (df["total_share"] - 1.0).abs().max() + violations = df[~np.isclose(df["total_share"], 1.0, rtol=rtol)] + + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + observed=float(max_diff), + expected=rtol, + message=f"CTS heat demand share mismatch: max deviation {max_diff:.6f} exceeds tolerance {rtol:.6f}. 
{len(violations)} bus/scenario combinations have shares != 1.", + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) diff --git a/src/egon/data/validation/rules/custom/sanity/electrical_load_sectors.py b/src/egon/data/validation/rules/custom/sanity/electrical_load_sectors.py new file mode 100644 index 000000000..007c7d273 --- /dev/null +++ b/src/egon/data/validation/rules/custom/sanity/electrical_load_sectors.py @@ -0,0 +1,275 @@ +""" +Sanity check validation rules for electrical load sector breakdown. + +Validates that electrical loads are correctly disaggregated into sectors +(residential, commercial, industrial) and that each sector matches expected values. +""" + +from egon_validation.rules.base import DataFrameRule, RuleResult, Severity +from egon.data import config, db +import pandas as pd + + +class ElectricalLoadSectorBreakdown(DataFrameRule): + """ + Validate electrical load breakdown by sector (residential, commercial, industrial). + + This rule checks that the electrical load for each sector matches expected values: + - Residential: 90.4 TWh (from household_curves) + - Commercial: 146.7 TWh (from cts_curves) + - Industrial: 382.9 TWh (from osm_curves + sites_curves) + - Total: 620.0 TWh (from etrago AC loads) + + Matches the original electrical_load_100RE() function from sanity_checks.py. 
+ """ + + def __init__(self, table: str, rule_id: str, scenario: str = "eGon100RE", + rtol: float = 0.01, **kwargs): + """ + Parameters + ---------- + table : str + Target table (grid.egon_etrago_load) + rule_id : str + Unique identifier for this validation rule + scenario : str + Scenario name (default: "eGon100RE") + rtol : float + Relative tolerance for load deviation (default: 0.01 = 1%) + """ + super().__init__(rule_id=rule_id, table=table, scenario=scenario, + rtol=rtol, **kwargs) + self.kind = "sanity" + self.scenario = scenario + self.rtol = rtol + + def get_query(self, ctx): + """ + Query to get total AC electrical load for Germany. + + Returns total load in TWh from etrago tables. + """ + return f""" + SELECT SUM((SELECT SUM(p) FROM UNNEST(b.p_set) p))/1000000::numeric as load_twh + FROM grid.egon_etrago_load a + JOIN grid.egon_etrago_load_timeseries b + ON (a.load_id = b.load_id) + JOIN grid.egon_etrago_bus c + ON (a.bus = c.bus_id) + WHERE a.scn_name = '{self.scenario}' + AND b.scn_name = '{self.scenario}' + AND c.scn_name = '{self.scenario}' + AND a.carrier = 'AC' + AND c.country = 'DE' + """ + + def _get_sector_loads(self): + """ + Get electrical loads by sector from source tables. + + Returns + ------- + dict + Dictionary with sector loads in TWh: + - residential: TWh from household_curves + - commercial: TWh from cts_curves + - industrial: TWh from osm_curves + sites_curves + """ + sources = config.datasets()["etrago_electricity"]["sources"] + + # Commercial load from CTS curves + cts_curves = db.select_dataframe( + f"""SELECT bus_id AS bus, p_set FROM + {sources['cts_curves']['schema']}. + {sources['cts_curves']['table']} + WHERE scn_name = '{self.scenario}'""", + warning=False + ) + commercial_twh = ( + cts_curves.apply(lambda x: sum(x["p_set"]), axis=1).sum() / 1000000 + ) + + # Industrial load from OSM landuse areas + ind_curves_osm = db.select_dataframe( + f"""SELECT bus, p_set FROM + {sources['osm_curves']['schema']}. 
+ {sources['osm_curves']['table']} + WHERE scn_name = '{self.scenario}'""", + warning=False + ) + industrial_osm_twh = ( + ind_curves_osm.apply(lambda x: sum(x["p_set"]), axis=1).sum() / 1000000 + ) + + # Industrial load from industrial sites + ind_curves_sites = db.select_dataframe( + f"""SELECT bus, p_set FROM + {sources['sites_curves']['schema']}. + {sources['sites_curves']['table']} + WHERE scn_name = '{self.scenario}'""", + warning=False + ) + industrial_sites_twh = ( + ind_curves_sites.apply(lambda x: sum(x["p_set"]), axis=1).sum() / 1000000 + ) + + # Total industrial + industrial_twh = industrial_osm_twh + industrial_sites_twh + + # Residential load from household curves + hh_curves = db.select_dataframe( + f"""SELECT bus_id AS bus, p_set FROM + {sources['household_curves']['schema']}. + {sources['household_curves']['table']} + WHERE scn_name = '{self.scenario}'""", + warning=False + ) + residential_twh = ( + hh_curves.apply(lambda x: sum(x["p_set"]), axis=1).sum() / 1000000 + ) + + return { + "residential": residential_twh, + "commercial": commercial_twh, + "industrial": industrial_twh + } + + def evaluate_df(self, df, ctx): + """ + Evaluate electrical load sector breakdown. 
+ + Parameters + ---------- + df : pd.DataFrame + DataFrame with total load_twh column + ctx : dict + Context information + + Returns + ------- + RuleResult + Validation result with success/failure status + """ + if df.empty or df["load_twh"].isna().all(): + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + message=f"No electrical load data found for scenario {self.scenario}", + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + # Get total AC load + total_load_twh = float(df["load_twh"].values[0]) + + # Get sector loads + try: + sector_loads = self._get_sector_loads() + except Exception as e: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + message=f"Error reading sector load data: {str(e)}", + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + # Expected values (from original sanity_checks.py lines 2689-2694) + # References: + # https://github.com/openego/powerd-data/blob/56b8215928a8dc4fe953d266c563ce0ed98e93f9/src/egon/data/datasets/demandregio/__init__.py#L480 + # https://github.com/openego/powerd-data/blob/56b8215928a8dc4fe953d266c563ce0ed98e93f9/src/egon/data/datasets/demandregio/__init__.py#L775 + expected_values = { + "residential": 90.4, + "commercial": 146.7, + "industrial": 382.9, + "total": 620.0 + } + + # Build load summary dataframe + load_summary = pd.DataFrame({ + "sector": ["residential", "commercial", "industrial", "total"], + "expected": [ + expected_values["residential"], + expected_values["commercial"], + expected_values["industrial"], + expected_values["total"] + ], + "observed": [ + sector_loads["residential"], + sector_loads["commercial"], + sector_loads["industrial"], + total_load_twh + ] + }) + + load_summary["diff"] = load_summary["observed"] - 
load_summary["expected"] + load_summary["diff_pct"] = ( + load_summary["diff"] / load_summary["observed"] * 100 + ) + + # Check if all deviations are within tolerance (< 1% as in original) + violations = load_summary[load_summary["diff_pct"].abs() >= (self.rtol * 100)] + + if not violations.empty: + # Format violation details + violation_details = [] + for _, row in violations.iterrows(): + violation_details.append( + f"{row['sector']}: {row['observed']:.2f} TWh " + f"(expected {row['expected']:.2f} TWh, " + f"deviation {row['diff_pct']:+.2f}%)" + ) + + max_deviation = load_summary["diff_pct"].abs().max() + + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + observed=float(max_deviation), + expected=self.rtol * 100, + message=( + f"Electrical load sector breakdown deviations exceed tolerance for {self.scenario}: " + f"{'; '.join(violation_details)}" + ), + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + # All sectors within tolerance + sector_summary = "; ".join([ + f"{row['sector']}: {row['observed']:.2f} TWh " + f"(expected {row['expected']:.2f} TWh, " + f"deviation {row['diff_pct']:+.2f}%)" + for _, row in load_summary.iterrows() + ]) + + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=True, + observed=0.0, + expected=0.0, + message=( + f"Electrical load sector breakdown valid for {self.scenario}: {sector_summary}" + ), + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) diff --git a/src/egon/data/validation/rules/custom/sanity/electricity_capacity.py b/src/egon/data/validation/rules/custom/sanity/electricity_capacity.py new file mode 100644 index 000000000..1e1319231 --- /dev/null +++ b/src/egon/data/validation/rules/custom/sanity/electricity_capacity.py @@ -0,0 +1,273 @@ +""" +Sanity check validation rules for 
electricity capacity comparison. + +Validates that distributed capacities in etrago tables match input capacities +from scenario_capacities table. +""" + +from egon_validation.rules.base import DataFrameRule, RuleResult, Severity +from typing import Optional, List + + +class ElectricityCapacityComparison(DataFrameRule): + """ + Compare distributed capacity with input capacity for electricity components. + + Compares the total capacity in etrago tables (grid.egon_etrago_generator, + grid.egon_etrago_storage) against the input capacity from the scenario + capacities table (supply.egon_scenario_capacities). + + This validation ensures that capacity distribution is correct and no + capacity is lost or incorrectly added during the distribution process. + """ + + def __init__( + self, + table: str, + rule_id: str, + scenario: str = "eGon2035", + carrier: str = "wind_onshore", + component_type: str = "generator", + output_carriers: Optional[List[str]] = None, + rtol: float = 0.10, + **kwargs + ): + """ + Parameters + ---------- + table : str + Target table (grid.egon_etrago_generator or grid.egon_etrago_storage) + rule_id : str + Unique identifier for this validation rule + scenario : str + Scenario name ("eGon2035" or "eGon100RE") + carrier : str + Carrier type for the input table (supply.egon_scenario_capacities) + component_type : str + Type of component ("generator", "storage", or "link") + output_carriers : List[str], optional + List of carrier names in output table. If None, uses carrier parameter. + Useful for biomass which maps to multiple output carriers. 
+ rtol : float + Relative tolerance for capacity deviation (default: 0.10 = 10%) + """ + super().__init__( + rule_id=rule_id, + table=table, + scenario=scenario, + carrier=carrier, + component_type=component_type, + output_carriers=output_carriers, + rtol=rtol, + **kwargs + ) + self.kind = "sanity" + self.scenario = scenario + self.carrier = carrier + self.component_type = component_type + self.output_carriers = output_carriers or [carrier] + self.rtol = rtol + + def get_query(self, ctx): + """ + Query to compare input and output capacities. + + Returns a query that: + 1. Sums output capacity from etrago table for German buses + 2. Sums input capacity from scenario_capacities table + 3. Returns both values for comparison + """ + # Build carrier filter for output table + if len(self.output_carriers) == 1: + carrier_filter = f"carrier = '{self.output_carriers[0]}'" + else: + carriers_str = "', '".join(self.output_carriers) + carrier_filter = f"carrier IN ('{carriers_str}')" + + # Build bus filter based on component type + # Links have bus0 and bus1, generators/storage have bus + if self.component_type == "link": + bus_filter = f""" + AND (bus0 IN ( + SELECT bus_id + FROM grid.egon_etrago_bus + WHERE scn_name = '{self.scenario}' + AND country = 'DE' + ) OR bus1 IN ( + SELECT bus_id + FROM grid.egon_etrago_bus + WHERE scn_name = '{self.scenario}' + AND country = 'DE' + )) + """ + else: + bus_filter = f""" + AND bus IN ( + SELECT bus_id + FROM grid.egon_etrago_bus + WHERE scn_name = '{self.scenario}' + AND country = 'DE' + ) + """ + + return f""" + WITH output_capacity AS ( + SELECT + COALESCE(SUM(p_nom::numeric), 0) as output_capacity_mw + FROM {self.table} + WHERE scn_name = '{self.scenario}' + AND {carrier_filter} + {bus_filter} + ), + input_capacity AS ( + SELECT + COALESCE(SUM(capacity::numeric), 0) as input_capacity_mw + FROM supply.egon_scenario_capacities + WHERE carrier = '{self.carrier}' + AND scenario_name = '{self.scenario}' + ) + SELECT + 
o.output_capacity_mw, + i.input_capacity_mw + FROM output_capacity o + CROSS JOIN input_capacity i + """ + + def evaluate_df(self, df, ctx): + """ + Evaluate capacity comparison. + + Parameters + ---------- + df : pd.DataFrame + DataFrame with output_capacity_mw and input_capacity_mw columns + ctx : dict + Context information + + Returns + ------- + RuleResult + Validation result with success/failure status + """ + if df.empty: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + message=f"No data found for {self.carrier} capacity comparison", + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + output_capacity = float(df["output_capacity_mw"].values[0]) + input_capacity = float(df["input_capacity_mw"].values[0]) + + # Case 1: Both zero - OK, no capacity needed + if output_capacity == 0 and input_capacity == 0: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=True, + observed=0.0, + expected=0.0, + message=( + f"No {self.carrier} {self.component_type} capacity needed " + f"for {self.scenario} (both input and output are zero)" + ), + severity=Severity.INFO, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + # Case 2: Input > 0 but output = 0 - ERROR + if input_capacity > 0 and output_capacity == 0: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + observed=0.0, + expected=input_capacity, + message=( + f"{self.carrier} {self.component_type} capacity was not distributed at all! 
" + f"Input: {input_capacity:.2f} MW, Output: 0 MW for {self.scenario}" + ), + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + # Case 3: Output > 0 but input = 0 - ERROR + if output_capacity > 0 and input_capacity == 0: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + observed=output_capacity, + expected=0.0, + message=( + f"{self.carrier} {self.component_type} capacity was distributed " + f"even though no input was provided! " + f"Output: {output_capacity:.2f} MW, Input: 0 MW for {self.scenario}" + ), + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + # Case 4: Both > 0 - Check deviation + deviation = abs(output_capacity - input_capacity) / input_capacity + error_pct = ((output_capacity - input_capacity) / input_capacity) * 100 + + success = deviation <= self.rtol + + if success: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=True, + observed=output_capacity, + expected=input_capacity, + message=( + f"{self.carrier} {self.component_type} capacity valid for {self.scenario}: " + f"Output: {output_capacity:.2f} MW, Input: {input_capacity:.2f} MW, " + f"Deviation: {error_pct:+.2f}% (tolerance: ±{self.rtol*100:.2f}%)" + ), + severity=Severity.INFO, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + else: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + observed=output_capacity, + expected=input_capacity, + message=( + f"{self.carrier} {self.component_type} capacity deviation too large for {self.scenario}: " + f"Output: {output_capacity:.2f} MW, Input: {input_capacity:.2f} MW, " + f"Deviation: {error_pct:+.2f}% (tolerance: ±{self.rtol*100:.2f}%)" + ), + severity=Severity.ERROR, + 
class GasBusesIsolated(DataFrameRule):
    """
    Sanity rule that flags German gas buses without any attached link.

    For the configured scenario and bus carrier (CH4, H2_grid or
    H2_saltcavern) the query selects buses in Germany whose id appears
    neither as ``bus0`` nor as ``bus1`` of a link with the matching link
    carrier. Every row returned by the query counts as one isolated bus.
    """

    def __init__(self, table: str, rule_id: str, scenario: str = "eGon2035",
                 carrier: str = "CH4", **kwargs):
        """
        Parameters
        ----------
        table : str
            Target table (grid.egon_etrago_bus)
        rule_id : str
            Unique identifier for this validation rule
        scenario : str
            Scenario name ("eGon2035" or "eGon100RE")
        carrier : str
            Bus carrier type ("CH4", "H2_grid", or "H2_saltcavern")
        """
        super().__init__(rule_id=rule_id, table=table, scenario=scenario,
                         carrier=carrier, **kwargs)
        self.kind = "sanity"
        self.scenario = scenario
        self.carrier = carrier

        # scenario -> {bus carrier -> carrier of the links that must touch
        # buses of that carrier}
        self.carrier_mapping = {
            "eGon2035": dict(
                CH4="CH4",
                H2_grid="H2_feedin",
                H2_saltcavern="power_to_H2",
            ),
            "eGon100RE": dict(
                CH4="CH4",
                H2_grid="H2_retrofit",
                H2_saltcavern="H2_extension",
            ),
        }

    def get_query(self, ctx):
        """
        Build the SQL that lists isolated buses.

        An empty result set (``LIMIT 0``) is requested when the scenario or
        the bus carrier has no entry in ``carrier_mapping``; otherwise the
        query returns ``bus_id``, ``carrier`` and ``country`` of every
        German bus that is absent from both link endpoints.
        """
        link_carrier = self.carrier_mapping.get(self.scenario, {}).get(self.carrier)
        if not link_carrier:
            # Unsupported scenario or carrier: nothing to check.
            return "SELECT NULL as bus_id, NULL as carrier, NULL as country LIMIT 0"

        # One NOT IN clause per link endpoint column.
        endpoint_filters = "".join(
            f"""
            AND bus_id NOT IN (
                SELECT {endpoint}
                FROM grid.egon_etrago_link
                WHERE scn_name = '{self.scenario}'
                AND carrier = '{link_carrier}'
            )"""
            for endpoint in ("bus0", "bus1")
        )

        return f"""
            SELECT bus_id, carrier, country
            FROM grid.egon_etrago_bus
            WHERE scn_name = '{self.scenario}'
            AND carrier = '{self.carrier}'
            AND country = 'DE'{endpoint_filters}
        """

    def evaluate_df(self, df, ctx):
        """
        Turn the list of isolated buses into a rule result.

        Parameters
        ----------
        df : pd.DataFrame
            DataFrame with isolated buses (bus_id, carrier, country)
        ctx : dict
            Context information

        Returns
        -------
        RuleResult
            Success when no isolated bus remains after dropping rows that
            contain NULLs, otherwise an ERROR result carrying the count and
            up to five sample rows.
        """
        isolated = df.dropna()
        n_isolated = len(isolated)

        # Fields shared by both outcomes.
        shared = dict(
            rule_id=self.rule_id,
            task=self.task,
            table=self.table,
            kind=self.kind,
            expected=0,
            schema=self.schema,
            table_name=self.table_name,
            rule_class=self.__class__.__name__,
        )

        if n_isolated:
            sample_buses = isolated.head(5).to_dict(orient='records')
            return RuleResult(
                success=False,
                observed=n_isolated,
                message=(
                    f"Found {n_isolated} isolated {self.carrier} buses for {self.scenario}. "
                    f"Sample (first 5): {sample_buses}"
                ),
                severity=Severity.ERROR,
                **shared,
            )

        return RuleResult(
            success=True,
            observed=0,
            message=(
                f"No isolated {self.carrier} buses found for {self.scenario} "
                f"(all buses connected to grid)"
            ),
            severity=Severity.INFO,
            **shared,
        )
class GasBusesCount(DataFrameRule):
    """
    Validate gas grid bus count against SciGRID_gas data.

    Counts the buses of one carrier (CH4 or H2_grid) located in Germany for
    one scenario and compares that number with the number of German nodes
    in the SciGRID_gas node file. The rule passes when the relative
    deviation does not exceed ``rtol``.
    """

    def __init__(self, table: str, rule_id: str, scenario: str = "eGon2035",
                 carrier: str = "CH4", rtol: float = 0.10, **kwargs):
        """
        Parameters
        ----------
        table : str
            Target table (grid.egon_etrago_bus)
        rule_id : str
            Unique identifier for this validation rule
        scenario : str
            Scenario name ("eGon2035" or "eGon100RE")
        carrier : str
            Bus carrier type ("CH4" or "H2_grid")
        rtol : float
            Relative tolerance for bus count deviation (default: 0.10 = 10%)
        """
        super().__init__(rule_id=rule_id, table=table, scenario=scenario,
                         carrier=carrier, rtol=rtol, **kwargs)
        self.kind = "sanity"
        self.scenario = scenario
        self.carrier = carrier

    def get_query(self, ctx):
        """
        Query to count gas grid buses in Germany.

        Returns a query that counts buses of the specified carrier
        in Germany for the specified scenario.
        """
        return f"""
            SELECT COUNT(*) as bus_count
            FROM grid.egon_etrago_bus
            WHERE scn_name = '{self.scenario}'
            AND country = 'DE'
            AND carrier = '{self.carrier}'
        """

    def evaluate_df(self, df, ctx):
        """
        Evaluate bus count against SciGRID_gas reference data.

        Parameters
        ----------
        df : pd.DataFrame
            DataFrame with bus_count column
        ctx : dict
            Context information

        Returns
        -------
        RuleResult
            WARNING failure when the query returned no count, ERROR failure
            when the reference file cannot be read, contains no German
            nodes, or the deviation exceeds the tolerance; INFO success
            otherwise.
        """
        # Fields identical for every outcome of this rule.
        base = dict(
            rule_id=self.rule_id,
            task=self.task,
            table=self.table,
            kind=self.kind,
            schema=self.schema,
            table_name=self.table_name,
            rule_class=self.__class__.__name__,
        )

        if df.empty or df["bus_count"].isna().all():
            return RuleResult(
                success=False,
                message=f"No {self.carrier} buses found for scenario {self.scenario}",
                severity=Severity.WARNING,
                **base,
            )

        observed_count = int(df["bus_count"].values[0])

        # Reference: German nodes in the SciGRID_gas node list. The path is
        # resolved relative to the current working directory.
        try:
            target_file = Path(".") / "datasets" / "gas_data" / "data" / "IGGIELGN_Nodes.csv"
            grid_buses_df = pd.read_csv(
                target_file,
                delimiter=";",
                decimal=".",
                usecols=["country_code"],
            )
            grid_buses_df = grid_buses_df[
                grid_buses_df["country_code"].str.match("DE")
            ]
            expected_count = len(grid_buses_df.index)
        except Exception as e:
            # Any read/parse problem is reported as a failed check rather
            # than aborting the validation run.
            return RuleResult(
                success=False,
                message=f"Error reading SciGRID_gas reference data: {str(e)}",
                severity=Severity.ERROR,
                **base,
            )

        if expected_count == 0:
            # A relative deviation is undefined against a zero reference;
            # report it instead of raising ZeroDivisionError below.
            return RuleResult(
                success=False,
                observed=float(observed_count),
                expected=0.0,
                message=(
                    f"SciGRID_gas reference data contains no nodes for Germany; "
                    f"cannot validate {self.carrier} bus count for {self.scenario} "
                    f"({observed_count} buses observed)"
                ),
                severity=Severity.ERROR,
                **base,
            )

        rtol = self.params.get("rtol", 0.10)
        deviation = abs(observed_count - expected_count) / expected_count
        deviation_pct = deviation * 100

        if deviation <= rtol:
            return RuleResult(
                success=True,
                observed=float(observed_count),
                expected=float(expected_count),
                message=(
                    f"{self.carrier} bus count valid for {self.scenario}: "
                    f"{observed_count} buses (deviation: {deviation_pct:.2f}%, "
                    f"tolerance: {rtol*100:.2f}%)"
                ),
                severity=Severity.INFO,
                **base,
            )

        return RuleResult(
            success=False,
            observed=float(observed_count),
            expected=float(expected_count),
            message=(
                f"{self.carrier} bus count deviation too large for {self.scenario}: "
                f"{observed_count} vs {expected_count} expected "
                f"(deviation: {deviation_pct:.2f}%, tolerance: {rtol*100:.2f}%)"
            ),
            severity=Severity.ERROR,
            **base,
        )
class GasOnePortConnections(DataFrameRule):
    """
    Sanity rule: gas one-port components must sit on an accepted bus.

    A component (load, generator or store) of the configured carrier is
    reported when its ``bus`` is found in none of the bus sets described by
    ``bus_conditions``. Each bus set is "buses of a given carrier whose
    country satisfies a given SQL condition" within the same scenario.
    """

    def __init__(self, table: str, rule_id: str, scenario: str = "eGon2035",
                 component_type: str = "load", component_carrier: str = "CH4_for_industry",
                 bus_conditions: List[Tuple[str, str]] = None, **kwargs):
        """
        Parameters
        ----------
        table : str
            Target table (grid.egon_etrago_load, grid.egon_etrago_generator,
            or grid.egon_etrago_store)
        rule_id : str
            Unique identifier for this validation rule
        scenario : str
            Scenario name ("eGon2035" or "eGon100RE")
        component_type : str
            Type of component ("load", "generator", or "store")
        component_carrier : str
            Carrier of the component to check
        bus_conditions : List[Tuple[str, str]]
            List of (bus_carrier, country_condition) tuples that define valid buses
            Examples:
            - [("CH4", "= 'DE'")] - CH4 buses in Germany
            - [("CH4", "!= 'DE'")] - CH4 buses outside Germany
            - [("H2_grid", "= 'DE'"), ("AC", "!= 'DE'")] - H2_grid in DE OR AC abroad
        """
        super().__init__(rule_id=rule_id, table=table, scenario=scenario,
                         component_type=component_type,
                         component_carrier=component_carrier,
                         bus_conditions=bus_conditions or [], **kwargs)
        self.kind = "sanity"
        self.scenario = scenario
        self.component_type = component_type
        self.component_carrier = component_carrier
        self.bus_conditions = bus_conditions or []

        # Name of the id column for each supported component type.
        self.id_column_map = dict(
            load="load_id",
            generator="generator_id",
            store="store_id",
        )

    def get_query(self, ctx):
        """
        Build the SQL that lists components on non-accepted buses.

        With no ``bus_conditions`` an empty result set is requested, which
        effectively skips the check. Otherwise one ``bus NOT IN (...)``
        clause is generated per condition and all clauses are ANDed, so a
        component is returned only when it matches none of the bus sets.
        """
        if not self.bus_conditions:
            return "SELECT NULL as component_id, NULL as bus, NULL as carrier LIMIT 0"

        # Unknown component types fall back to a column called "id".
        id_column = self.id_column_map.get(self.component_type, "id")

        combined_condition = " AND ".join(
            f"""bus NOT IN 
                (SELECT bus_id
                FROM grid.egon_etrago_bus
                WHERE scn_name = '{self.scenario}'
                AND carrier = '{bus_carrier}'
                AND country {country_cond})
            """
            for bus_carrier, country_cond in self.bus_conditions
        )

        return f"""
            SELECT {id_column} as component_id, bus, carrier, scn_name
            FROM {self.table}
            WHERE scn_name = '{self.scenario}'
            AND carrier = '{self.component_carrier}'
            AND {combined_condition}
        """

    def evaluate_df(self, df, ctx):
        """
        Turn the list of badly connected components into a rule result.

        Parameters
        ----------
        df : pd.DataFrame
            DataFrame with disconnected components (component_id, bus, carrier)
        ctx : dict
            Context information

        Returns
        -------
        RuleResult
            Success when no row remains after dropping rows that contain
            NULLs, otherwise an ERROR result with the count and up to five
            sample rows.
        """
        offenders = df.dropna()
        n_offenders = len(offenders)

        # Fields shared by both outcomes.
        shared = dict(
            rule_id=self.rule_id,
            task=self.task,
            table=self.table,
            kind=self.kind,
            expected=0,
            schema=self.schema,
            table_name=self.table_name,
            rule_class=self.__class__.__name__,
        )

        if n_offenders:
            sample_components = offenders.head(5).to_dict(orient='records')
            return RuleResult(
                success=False,
                observed=n_offenders,
                message=(
                    f"Found {n_offenders} disconnected {self.component_carrier} "
                    f"{self.component_type}s for {self.scenario}. "
                    f"Sample (first 5): {sample_components}"
                ),
                severity=Severity.ERROR,
                **shared,
            )

        return RuleResult(
            success=True,
            observed=0,
            message=(
                f"All {self.component_carrier} {self.component_type}s connected "
                f"to valid buses for {self.scenario}"
            ),
            severity=Severity.INFO,
            **shared,
        )
class CH4GridCapacity(DataFrameRule):
    """
    Validate CH4 grid capacity against SciGRID_gas reference data.

    Sums ``p_nom`` of all CH4 links whose two end buses are German CH4
    buses and compares the total with a reference computed from the
    SciGRID_gas pipeline segment file and the pipeline classification
    table. For eGon100RE the reference is reduced by the share of CH4
    pipelines retrofitted to H2, read from the gas sector parameters.
    """

    def __init__(self, table: str, rule_id: str, scenario: str = "eGon2035",
                 rtol: float = 0.10, **kwargs):
        """
        Parameters
        ----------
        table : str
            Target table (grid.egon_etrago_link)
        rule_id : str
            Unique identifier for this validation rule
        scenario : str
            Scenario name ("eGon2035" or "eGon100RE")
        rtol : float
            Relative tolerance for capacity deviation (default: 0.10 = 10%)
        """
        super().__init__(rule_id=rule_id, table=table, scenario=scenario,
                         rtol=rtol, **kwargs)
        self.kind = "sanity"
        self.scenario = scenario

    def get_query(self, ctx):
        """
        Query to get total CH4 pipeline capacity in Germany.

        Returns a query that sums the p_nom of all CH4 links where both
        bus0 and bus1 are in Germany.
        """
        return f"""
            SELECT SUM(p_nom::numeric) as total_p_nom
            FROM grid.egon_etrago_link
            WHERE scn_name = '{self.scenario}'
            AND carrier = 'CH4'
            AND bus0 IN (
                SELECT bus_id
                FROM grid.egon_etrago_bus
                WHERE scn_name = '{self.scenario}'
                AND country = 'DE'
                AND carrier = 'CH4'
            )
            AND bus1 IN (
                SELECT bus_id
                FROM grid.egon_etrago_bus
                WHERE scn_name = '{self.scenario}'
                AND country = 'DE'
                AND carrier = 'CH4'
            )
        """

    def _get_reference_capacity(self):
        """
        Calculate reference capacity from SciGRID_gas pipeline data.

        Both input files are resolved relative to the current working
        directory.

        Returns
        -------
        float
            Expected total pipeline capacity for the scenario

        Raises
        ------
        ValueError
            If reading or processing the reference files fails; the
            original exception is chained as the cause.
        """
        try:
            target_file = (
                Path(".")
                / "datasets"
                / "gas_data"
                / "data"
                / "IGGIELGN_PipeSegments.csv"
            )

            pipelines = pd.read_csv(
                target_file,
                delimiter=";",
                decimal=".",
                usecols=["id", "node_id", "country_code", "param"],
            )

            # NOTE(review): assumes country_code is a plain comma-separated
            # pair such as "DE,DE". If the file stores a list literal
            # instead, the comparison below never matches - verify against
            # the CSV.
            country_0 = pipelines["country_code"].apply(lambda x: x.split(",")[0])
            country_1 = pipelines["country_code"].apply(lambda x: x.split(",")[1])

            # Keep only pipelines that start and end in Germany.
            germany_pipelines = pipelines[(country_0 == "DE") & (country_1 == "DE")]

            classification_file = (
                Path(".")
                / "data_bundle_egon_data"
                / "pipeline_classification_gas"
                / "pipeline_classification.csv"
            )

            classification = pd.read_csv(
                classification_file,
                delimiter=",",
                usecols=["classification", "max_transport_capacity_Gwh/d"],
            )

            param_to_capacity = dict(
                zip(classification["classification"],
                    classification["max_transport_capacity_Gwh/d"])
            )

            # NOTE(review): presumes "param" holds a classification key;
            # values without a match map to NaN and are ignored by sum().
            # Computed as a separate Series so no column is written onto the
            # filtered slice (avoids pandas' SettingWithCopyWarning).
            p_nom = germany_pipelines["param"].map(param_to_capacity)
            total_p_nom = p_nom.sum()

            # Adjust for eGon100RE (H2 retrofit share)
            if self.scenario == "eGon100RE":
                scn_params = get_sector_parameters("gas", "eGon100RE")
                h2_retrofit_share = scn_params["retrofitted_CH4pipeline-to-H2pipeline_share"]
                total_p_nom = total_p_nom * (1 - h2_retrofit_share)

            return float(total_p_nom)

        except Exception as e:
            raise ValueError(f"Error reading SciGRID_gas reference data: {str(e)}") from e

    def evaluate_df(self, df, ctx):
        """
        Evaluate CH4 grid capacity against reference data.

        Parameters
        ----------
        df : pd.DataFrame
            DataFrame with total_p_nom column
        ctx : dict
            Context information

        Returns
        -------
        RuleResult
            WARNING failure when the query returned no total, ERROR failure
            when the reference cannot be computed, is zero, or the deviation
            exceeds the tolerance; INFO success otherwise.
        """
        # Fields identical for every outcome of this rule.
        base = dict(
            rule_id=self.rule_id,
            task=self.task,
            table=self.table,
            kind=self.kind,
            schema=self.schema,
            table_name=self.table_name,
            rule_class=self.__class__.__name__,
        )

        if df.empty or df["total_p_nom"].isna().all():
            return RuleResult(
                success=False,
                message=f"No CH4 links found for scenario {self.scenario}",
                severity=Severity.WARNING,
                **base,
            )

        observed_capacity = float(df["total_p_nom"].values[0])

        try:
            expected_capacity = self._get_reference_capacity()
        except Exception as e:
            return RuleResult(
                success=False,
                message=str(e),
                severity=Severity.ERROR,
                **base,
            )

        if expected_capacity == 0:
            # Happens e.g. when no pipeline could be mapped to a capacity;
            # a relative deviation would divide by zero.
            return RuleResult(
                success=False,
                observed=observed_capacity,
                expected=expected_capacity,
                message=(
                    f"SciGRID_gas reference capacity is zero for {self.scenario}; "
                    f"cannot validate CH4 grid capacity "
                    f"({observed_capacity:.2f} observed)"
                ),
                severity=Severity.ERROR,
                **base,
            )

        rtol = self.params.get("rtol", 0.10)
        deviation = abs(observed_capacity - expected_capacity) / expected_capacity
        deviation_pct = deviation * 100

        if deviation <= rtol:
            return RuleResult(
                success=True,
                observed=observed_capacity,
                expected=expected_capacity,
                message=(
                    f"CH4 grid capacity valid for {self.scenario}: "
                    f"{observed_capacity:.2f} GWh/d (deviation: {deviation_pct:.2f}%, "
                    f"tolerance: {rtol*100:.2f}%)"
                ),
                severity=Severity.INFO,
                **base,
            )

        return RuleResult(
            success=False,
            observed=observed_capacity,
            expected=expected_capacity,
            message=(
                f"CH4 grid capacity deviation too large for {self.scenario}: "
                f"{observed_capacity:.2f} vs {expected_capacity:.2f} GWh/d expected "
                f"(deviation: {deviation_pct:.2f}%, tolerance: {rtol*100:.2f}%)"
            ),
            severity=Severity.ERROR,
            **base,
        )
class GasLinksConnections(DataFrameRule):
    """
    Sanity rule: both ends of a gas link must reference an existing bus.

    For one scenario and one link carrier the query returns every link
    whose ``bus0`` or ``bus1`` is not among the bus ids stored for the same
    scenario. Each returned row counts as one disconnected link.
    """

    def __init__(self, table: str, rule_id: str, scenario: str = "eGon2035",
                 carrier: str = "CH4", **kwargs):
        """
        Parameters
        ----------
        table : str
            Target table (grid.egon_etrago_link)
        rule_id : str
            Unique identifier for this validation rule
        scenario : str
            Scenario name ("eGon2035" or "eGon100RE")
        carrier : str
            Link carrier type to check (e.g., "CH4", "H2_feedin", "power_to_H2")
        """
        super().__init__(rule_id=rule_id, table=table, scenario=scenario,
                         carrier=carrier, **kwargs)
        self.kind = "sanity"
        self.scenario = scenario
        self.carrier = carrier

    def get_query(self, ctx):
        """
        Build the SQL that lists links with a missing end bus.

        The same "bus ids of this scenario" subquery is applied to both
        endpoint columns and the two tests are combined with OR.
        """
        scenario_buses = f"""(
                    SELECT bus_id
                    FROM grid.egon_etrago_bus
                    WHERE scn_name = '{self.scenario}'
                )"""
        missing_endpoint = " OR ".join(
            f"{endpoint} NOT IN {scenario_buses}" for endpoint in ("bus0", "bus1")
        )

        return f"""
            SELECT link_id, bus0, bus1, carrier, scn_name
            FROM grid.egon_etrago_link
            WHERE scn_name = '{self.scenario}'
            AND carrier = '{self.carrier}'
            AND (
                {missing_endpoint}
            )
        """

    def evaluate_df(self, df, ctx):
        """
        Turn the list of links with missing buses into a rule result.

        Parameters
        ----------
        df : pd.DataFrame
            DataFrame with links that have missing buses
        ctx : dict
            Context information

        Returns
        -------
        RuleResult
            Success when the frame is empty, otherwise an ERROR result with
            the row count and up to five sample rows.
        """
        n_disconnected = len(df)

        # Fields shared by both outcomes.
        shared = dict(
            rule_id=self.rule_id,
            task=self.task,
            table=self.table,
            kind=self.kind,
            expected=0,
            schema=self.schema,
            table_name=self.table_name,
            rule_class=self.__class__.__name__,
        )

        if n_disconnected:
            sample_links = df.head(5).to_dict(orient='records')
            return RuleResult(
                success=False,
                observed=n_disconnected,
                message=(
                    f"Found {n_disconnected} disconnected {self.carrier} links "
                    f"for {self.scenario}. "
                    f"Sample (first 5): {sample_links}"
                ),
                severity=Severity.ERROR,
                **shared,
            )

        return RuleResult(
            success=True,
            observed=0,
            message=(
                f"All {self.carrier} links connected to valid buses for {self.scenario}"
            ),
            severity=Severity.INFO,
            **shared,
        )
class GasLoadsCapacity(DataFrameRule):
    """
    Validate gas loads capacity against reference data.

    Sums the load time series of all loads of one carrier attached to
    German buses, scaled by 1/1,000,000, and compares it with the German
    total read from the demand JSON files under ``datasets/gas_data/demand``
    using the same scaling. The rule passes when the relative deviation
    does not exceed ``rtol``.
    """

    def __init__(self, table: str, rule_id: str, scenario: str = "eGon2035",
                 carrier: str = "CH4_for_industry", rtol: float = 0.10, **kwargs):
        """
        Parameters
        ----------
        table : str
            Target table (grid.egon_etrago_load)
        rule_id : str
            Unique identifier for this validation rule
        scenario : str
            Scenario name ("eGon2035" or "eGon100RE")
        carrier : str
            Load carrier type ("CH4_for_industry" or "H2_for_industry")
        rtol : float
            Relative tolerance for capacity deviation (default: 0.10 = 10%)
        """
        super().__init__(rule_id=rule_id, table=table, scenario=scenario,
                         carrier=carrier, rtol=rtol, **kwargs)
        self.kind = "sanity"
        self.scenario = scenario
        self.carrier = carrier

    def get_query(self, ctx):
        """
        Query to get total annual load for gas loads in Germany.

        Returns a query that sums the annual load from timeseries data
        for the specified carrier in Germany, converting to TWh.
        """
        return f"""
            SELECT (SUM(
                (SELECT SUM(p)
                FROM UNNEST(b.p_set) p))/1000000)::numeric as load_twh
            FROM grid.egon_etrago_load a
            JOIN grid.egon_etrago_load_timeseries b
            ON (a.load_id = b.load_id)
            JOIN grid.egon_etrago_bus c
            ON (a.bus=c.bus_id)
            WHERE b.scn_name = '{self.scenario}'
            AND a.scn_name = '{self.scenario}'
            AND c.scn_name = '{self.scenario}'
            AND c.country = 'DE'
            AND a.carrier = '{self.carrier}'
        """

    def _get_reference_capacity(self):
        """
        Calculate reference load capacity from opendata.ffe data.

        Input files are resolved relative to the current working directory.

        Returns
        -------
        float
            Expected total annual load in TWh

        Raises
        ------
        ValueError
            If reading or processing the reference files fails; the
            original exception is chained as the cause.
        """
        try:
            path = Path(".") / "datasets" / "gas_data" / "demand"

            # Region id -> short region name.
            df_corr = pd.read_json(path / "region_corr.json")
            df_corr = df_corr.loc[:, ["id_region", "name_short"]].set_index("id_region")

            # Demand per region for this carrier and scenario.
            input_gas_demand = pd.read_json(
                path / (self.carrier + f"_{self.scenario}.json")
            )
            input_gas_demand = input_gas_demand.loc[
                :, ["id_region", "value"]
            ].set_index("id_region")

            # Keep regions present in both tables.
            input_gas_demand = pd.concat(
                [input_gas_demand, df_corr], axis=1, join="inner"
            )

            # The first two characters of the short name are used as the
            # country code; only German regions are kept.
            nuts0 = input_gas_demand["name_short"].str[0:2]
            input_gas_demand = input_gas_demand[nuts0.str.match("DE")]

            # Sum and convert to TWh
            total_demand = input_gas_demand["value"].sum() / 1000000

            return float(total_demand)

        except Exception as e:
            raise ValueError(f"Error reading reference load data: {str(e)}") from e

    def evaluate_df(self, df, ctx):
        """
        Evaluate gas loads capacity against reference data.

        Parameters
        ----------
        df : pd.DataFrame
            DataFrame with load_twh column
        ctx : dict
            Context information

        Returns
        -------
        RuleResult
            WARNING failure when the query returned no load, ERROR failure
            when the reference cannot be computed, is zero, or the deviation
            exceeds the tolerance; INFO success otherwise.
        """
        # Fields identical for every outcome of this rule.
        base = dict(
            rule_id=self.rule_id,
            task=self.task,
            table=self.table,
            kind=self.kind,
            schema=self.schema,
            table_name=self.table_name,
            rule_class=self.__class__.__name__,
        )

        if df.empty or df["load_twh"].isna().all():
            return RuleResult(
                success=False,
                message=f"No {self.carrier} loads found for scenario {self.scenario}",
                severity=Severity.WARNING,
                **base,
            )

        observed_load = float(df["load_twh"].values[0])

        try:
            expected_load = self._get_reference_capacity()
        except Exception as e:
            return RuleResult(
                success=False,
                message=str(e),
                severity=Severity.ERROR,
                **base,
            )

        if expected_load == 0:
            # A relative deviation is undefined against a zero reference;
            # report it instead of raising ZeroDivisionError below.
            return RuleResult(
                success=False,
                observed=observed_load,
                expected=expected_load,
                message=(
                    f"Reference {self.carrier} load is zero for {self.scenario}; "
                    f"cannot compute relative deviation "
                    f"({observed_load:.2f} TWh observed)"
                ),
                severity=Severity.ERROR,
                **base,
            )

        rtol = self.params.get("rtol", 0.10)
        deviation = abs(observed_load - expected_load) / expected_load
        deviation_pct = deviation * 100

        if deviation <= rtol:
            return RuleResult(
                success=True,
                observed=observed_load,
                expected=expected_load,
                message=(
                    f"{self.carrier} load valid for {self.scenario}: "
                    f"{observed_load:.2f} TWh (deviation: {deviation_pct:.2f}%, "
                    f"tolerance: {rtol*100:.2f}%)"
                ),
                severity=Severity.INFO,
                **base,
            )

        return RuleResult(
            success=False,
            observed=observed_load,
            expected=expected_load,
            message=(
                f"{self.carrier} load deviation too large for {self.scenario}: "
                f"{observed_load:.2f} vs {expected_load:.2f} TWh expected "
                f"(deviation: {deviation_pct:.2f}%, tolerance: {rtol*100:.2f}%)"
            ),
            severity=Severity.ERROR,
            **base,
        )
class GasGeneratorsCapacity(DataFrameRule):
    """
    Validate gas generators capacity against reference data.

    Sums ``p_nom`` of all generators of one carrier attached to German
    buses of that carrier and compares the total with a reference built
    from the SciGRID_gas production file plus the Biogaspartner
    Einspeiseatlas spreadsheet. The rule passes when the relative deviation
    does not exceed ``rtol``.
    """

    def __init__(self, table: str, rule_id: str, scenario: str = "eGon2035",
                 carrier: str = "CH4", rtol: float = 0.10, **kwargs):
        """
        Parameters
        ----------
        table : str
            Target table (grid.egon_etrago_generator)
        rule_id : str
            Unique identifier for this validation rule
        scenario : str
            Scenario name ("eGon2035" or "eGon100RE")
        carrier : str
            Generator carrier type (default: "CH4")
        rtol : float
            Relative tolerance for capacity deviation (default: 0.10 = 10%)
        """
        super().__init__(rule_id=rule_id, table=table, scenario=scenario,
                         carrier=carrier, rtol=rtol, **kwargs)
        self.kind = "sanity"
        self.scenario = scenario
        self.carrier = carrier

    def get_query(self, ctx):
        """
        Query to get total generator capacity in Germany.

        Returns a query that sums the p_nom of all gas generators
        in Germany for the specified carrier.
        """
        return f"""
            SELECT SUM(p_nom::numeric) as p_nom_germany
            FROM grid.egon_etrago_generator
            WHERE scn_name = '{self.scenario}'
            AND carrier = '{self.carrier}'
            AND bus IN (
                SELECT bus_id
                FROM grid.egon_etrago_bus
                WHERE scn_name = '{self.scenario}'
                AND country = 'DE'
                AND carrier = '{self.carrier}'
            )
        """

    def _get_reference_capacity(self):
        """
        Calculate reference generation capacity from SciGRID_gas + biogas data.

        Input files are resolved relative to the current working directory.

        Returns
        -------
        float
            Expected total generation capacity in MW

        Raises
        ------
        ValueError
            If reading or processing the reference files fails; the
            original exception is chained as the cause.
        """
        try:
            # Natural gas productions from SciGRID_gas
            target_file = (
                Path(".")
                / "datasets"
                / "gas_data"
                / "data"
                / "IGGIELGN_Productions.csv"
            )

            ng_generators = pd.read_csv(
                target_file,
                delimiter=";",
                decimal=".",
                usecols=["country_code", "param"],
            )

            ng_generators = ng_generators[
                ng_generators["country_code"].str.match("DE")
            ]

            # Each "param" cell is a Python literal (parsed safely with
            # ast.literal_eval) that provides "max_supply_M_m3_per_d".
            p_ng = sum(
                ast.literal_eval(param)["max_supply_M_m3_per_d"]
                for param in ng_generators["param"]
            )

            conversion_factor = 437.5  # MCM/day to MWh/h
            p_ng = p_ng * conversion_factor

            # Biogas feed-in from the Biogaspartner Einspeiseatlas
            basename = "Biogaspartner_Einspeiseatlas_Deutschland_2021.xlsx"
            target_file = (
                Path(".") / "data_bundle_egon_data" / "gas_data" / basename
            )

            conversion_factor_b = 0.01083  # m^3/h to MWh/h
            feed_in_column = "Einspeisung Biomethan [(N*m^3)/h)]"
            p_biogas = (
                pd.read_excel(target_file, usecols=[feed_in_column])[feed_in_column].sum()
                * conversion_factor_b
            )

            total_generation = p_ng + p_biogas

            return float(total_generation)

        except Exception as e:
            raise ValueError(f"Error reading reference generation data: {str(e)}") from e

    def evaluate_df(self, df, ctx):
        """
        Evaluate gas generators capacity against reference data.

        Parameters
        ----------
        df : pd.DataFrame
            DataFrame with p_nom_germany column
        ctx : dict
            Context information

        Returns
        -------
        RuleResult
            WARNING failure when the query returned no capacity, ERROR
            failure when the reference cannot be computed, is zero, or the
            deviation exceeds the tolerance; INFO success otherwise.
        """
        # Fields identical for every outcome of this rule.
        base = dict(
            rule_id=self.rule_id,
            task=self.task,
            table=self.table,
            kind=self.kind,
            schema=self.schema,
            table_name=self.table_name,
            rule_class=self.__class__.__name__,
        )

        if df.empty or df["p_nom_germany"].isna().all():
            return RuleResult(
                success=False,
                message=f"No {self.carrier} generators found for scenario {self.scenario}",
                severity=Severity.WARNING,
                **base,
            )

        observed_capacity = float(df["p_nom_germany"].values[0])

        try:
            expected_capacity = self._get_reference_capacity()
        except Exception as e:
            return RuleResult(
                success=False,
                message=str(e),
                severity=Severity.ERROR,
                **base,
            )

        if expected_capacity == 0:
            # A relative deviation is undefined against a zero reference;
            # report it instead of raising ZeroDivisionError below.
            return RuleResult(
                success=False,
                observed=observed_capacity,
                expected=expected_capacity,
                message=(
                    f"Reference {self.carrier} generation capacity is zero for "
                    f"{self.scenario}; cannot compute relative deviation "
                    f"({observed_capacity:.2f} MW observed)"
                ),
                severity=Severity.ERROR,
                **base,
            )

        rtol = self.params.get("rtol", 0.10)
        deviation = abs(observed_capacity - expected_capacity) / expected_capacity
        deviation_pct = deviation * 100

        if deviation <= rtol:
            return RuleResult(
                success=True,
                observed=observed_capacity,
                expected=expected_capacity,
                message=(
                    f"{self.carrier} generator capacity valid for {self.scenario}: "
                    f"{observed_capacity:.2f} MW (deviation: {deviation_pct:.2f}%, "
                    f"tolerance: {rtol*100:.2f}%)"
                ),
                severity=Severity.INFO,
                **base,
            )

        return RuleResult(
            success=False,
            observed=observed_capacity,
            expected=expected_capacity,
            message=(
                f"{self.carrier} generator capacity deviation too large for {self.scenario}: "
                f"{observed_capacity:.2f} vs {expected_capacity:.2f} MW expected "
                f"(deviation: {deviation_pct:.2f}%, tolerance: {rtol*100:.2f}%)"
            ),
            severity=Severity.ERROR,
            **base,
        )
for {self.scenario}: " + f"{observed_capacity:.2f} vs {expected_capacity:.2f} MW expected " + f"(deviation: {deviation_pct:.2f}%, tolerance: {rtol*100:.2f}%)" + ), + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) diff --git a/src/egon/data/validation/rules/custom/sanity/gas_stores.py b/src/egon/data/validation/rules/custom/sanity/gas_stores.py new file mode 100644 index 000000000..c4eda057f --- /dev/null +++ b/src/egon/data/validation/rules/custom/sanity/gas_stores.py @@ -0,0 +1,322 @@ +""" +Sanity check validation rules for gas storage components. + +Validates CH4 and H2 storage capacities against expected values from +grid capacities and external data sources. +""" + +from egon_validation.rules.base import DataFrameRule, RuleResult, Severity + +from egon.data.datasets.hydrogen_etrago.storage import ( + calculate_and_map_saltcavern_storage_potential +) + + +class CH4StoresCapacity(DataFrameRule): + """ + Validate CH4 store capacity in Germany. + + Compares the sum of CH4 store capacities in the database against the + expected capacity calculated from: + - CH4 grid capacity allocation + - Total CH4 store capacity in Germany (source: GIE) + + The check allows for small deviations between observed and expected values. + """ + + def __init__(self, table: str, rule_id: str, scenario: str = "eGon2035", + rtol: float = 0.02, **kwargs): + """ + Parameters + ---------- + table : str + Target table (grid.egon_etrago_store) + rule_id : str + Unique identifier for this validation rule + scenario : str + Scenario name ("eGon2035" or "eGon100RE") + rtol : float + Relative tolerance for capacity deviation (default: 0.02 = 2%) + """ + super().__init__(rule_id=rule_id, table=table, scenario=scenario, + rtol=rtol, **kwargs) + self.kind = "sanity" + self.scenario = scenario + + def get_query(self, ctx): + """ + Query to get total CH4 store capacity in Germany. 
+ + Returns a query that sums all CH4 store capacities for German buses + in the specified scenario. + """ + return f""" + SELECT SUM(e_nom::numeric) as e_nom_germany + FROM grid.egon_etrago_store + WHERE scn_name = '{self.scenario}' + AND carrier = 'CH4' + AND bus IN ( + SELECT bus_id + FROM grid.egon_etrago_bus + WHERE scn_name = '{self.scenario}' + AND country = 'DE' + AND carrier = 'CH4' + ) + """ + + def evaluate_df(self, df, ctx): + """ + Evaluate CH4 store capacity against expected values. + + Parameters + ---------- + df : pd.DataFrame + DataFrame with e_nom_germany column + ctx : dict + Context information + + Returns + ------- + RuleResult + Validation result with success/failure status + """ + if df.empty or df["e_nom_germany"].isna().all(): + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + message=f"No CH4 store data found for scenario {self.scenario}", + severity=Severity.WARNING, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + observed_capacity = float(df["e_nom_germany"].values[0]) + + # Calculate expected capacity based on scenario + if self.scenario == "eGon2035": + grid_cap = 130000 # MWh + elif self.scenario == "eGon100RE": + # Get retrofitted share from config + from egon.data.datasets.scenario_parameters import get_sector_parameters + retrofitted_share = get_sector_parameters("gas", "eGon100RE")[ + "retrofitted_CH4pipeline-to-H2pipeline_share" + ] + grid_cap = 13000 * (1 - retrofitted_share) # MWh + else: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + message=f"Unknown scenario: {self.scenario}", + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + # GIE capacity: https://www.gie.eu/transparency/databases/storage-database/ + stores_cap_germany = 266424202 # MWh + + expected_capacity = 
stores_cap_germany + grid_cap + + # Calculate relative deviation + rtol = self.params.get("rtol", 0.02) + deviation = abs(observed_capacity - expected_capacity) / expected_capacity + + success = deviation <= rtol + + deviation_pct = deviation * 100 + + if success: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=True, + observed=observed_capacity, + expected=expected_capacity, + message=( + f"CH4 stores capacity valid for {self.scenario}: " + f"deviation {deviation_pct:.2f}% (tolerance: {rtol*100:.2f}%)" + ), + severity=Severity.INFO, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + else: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + observed=observed_capacity, + expected=expected_capacity, + message=( + f"CH4 stores capacity deviation too large for {self.scenario}: " + f"{deviation_pct:.2f}% (tolerance: {rtol*100:.2f}%)" + ), + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + +class H2SaltcavernStoresCapacity(DataFrameRule): + """ + Validate H2 saltcavern store potential capacity in Germany. + + Compares the sum of H2 saltcavern potential storage capacities (e_nom_max) + in the database against the expected capacity calculated from: + - Area fractions around substations in federal states + - Estimated total hydrogen storage potential per federal state (InSpEE-DS) + + The check allows for small deviations between observed and expected values. 
+ """ + + def __init__(self, table: str, rule_id: str, scenario: str = "eGon2035", + rtol: float = 0.02, **kwargs): + """ + Parameters + ---------- + table : str + Target table (grid.egon_etrago_store) + rule_id : str + Unique identifier for this validation rule + scenario : str + Scenario name ("eGon2035" or "eGon100RE") + rtol : float + Relative tolerance for capacity deviation (default: 0.02 = 2%) + """ + super().__init__(rule_id=rule_id, table=table, scenario=scenario, + rtol=rtol, **kwargs) + self.kind = "sanity" + self.scenario = scenario + + def get_query(self, ctx): + """ + Query to get total H2 saltcavern potential storage capacity in Germany. + + Returns a query that sums all H2_underground store e_nom_max capacities + for German H2_saltcavern buses in the specified scenario. + """ + return f""" + SELECT SUM(e_nom_max::numeric) as e_nom_max_germany + FROM grid.egon_etrago_store + WHERE scn_name = '{self.scenario}' + AND carrier = 'H2_underground' + AND bus IN ( + SELECT bus_id + FROM grid.egon_etrago_bus + WHERE scn_name = '{self.scenario}' + AND country = 'DE' + AND carrier = 'H2_saltcavern' + ) + """ + + def evaluate_df(self, df, ctx): + """ + Evaluate H2 saltcavern storage capacity against expected values. 
+ + Parameters + ---------- + df : pd.DataFrame + DataFrame with e_nom_max_germany column + ctx : dict + Context information + + Returns + ------- + RuleResult + Validation result with success/failure status + """ + if df.empty or df["e_nom_max_germany"].isna().all(): + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + message=f"No H2 saltcavern store data found for scenario {self.scenario}", + severity=Severity.WARNING, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + observed_capacity = float(df["e_nom_max_germany"].values[0]) + + # Calculate expected capacity from saltcavern potential + try: + storage_potentials = calculate_and_map_saltcavern_storage_potential() + storage_potentials["storage_potential"] = ( + storage_potentials["area_fraction"] * storage_potentials["potential"] + ) + expected_capacity = sum(storage_potentials["storage_potential"].to_list()) + except Exception as e: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + message=f"Error calculating expected H2 saltcavern capacity: {str(e)}", + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + # Calculate relative deviation + rtol = self.params.get("rtol", 0.02) + deviation = abs(observed_capacity - expected_capacity) / expected_capacity + + success = deviation <= rtol + + deviation_pct = deviation * 100 + + if success: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=True, + observed=observed_capacity, + expected=expected_capacity, + message=( + f"H2 saltcavern stores capacity valid for {self.scenario}: " + f"deviation {deviation_pct:.2f}% (tolerance: {rtol*100:.2f}%)" + ), + severity=Severity.INFO, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + 
) + else: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + observed=observed_capacity, + expected=expected_capacity, + message=( + f"H2 saltcavern stores capacity deviation too large for {self.scenario}: " + f"{deviation_pct:.2f}% (tolerance: {rtol*100:.2f}%)" + ), + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) diff --git a/src/egon/data/validation/rules/custom/sanity/heat_demand.py b/src/egon/data/validation/rules/custom/sanity/heat_demand.py new file mode 100644 index 000000000..1f0da0935 --- /dev/null +++ b/src/egon/data/validation/rules/custom/sanity/heat_demand.py @@ -0,0 +1,163 @@ +""" +Sanity check validation rules for heat demand. + +Validates that heat demand timeseries match expected values from peta_heat. +""" + +from egon_validation.rules.base import DataFrameRule, RuleResult, Severity + + +class HeatDemandValidation(DataFrameRule): + """ + Validate annual heat demand against peta_heat reference values. + + Compares the sum of rural_heat and central_heat load timeseries + against the demand from egon_peta_heat table to ensure demand is + correctly distributed. + """ + + def __init__( + self, + table: str, + rule_id: str, + scenario: str = "eGon2035", + rtol: float = 0.02, + **kwargs + ): + """ + Parameters + ---------- + table : str + Target table (grid.egon_etrago_load) + rule_id : str + Unique identifier for this validation rule + scenario : str + Scenario name ("eGon2035" or "eGon100RE") + rtol : float + Relative tolerance for deviation (default: 0.02 = 2%) + """ + super().__init__( + rule_id=rule_id, + table=table, + scenario=scenario, + rtol=rtol, + **kwargs + ) + self.kind = "sanity" + self.scenario = scenario + self.rtol = rtol + + def get_query(self, ctx): + """ + Query to compare heat demand output vs input. + + Returns a query that: + 1. Sums rural_heat + central_heat timeseries from etrago_load + 2. 
Sums demand from egon_peta_heat + 3. Returns both values for comparison + """ + return f""" + WITH output_demand AS ( + SELECT + SUM((SELECT SUM(p) FROM UNNEST(b.p_set) p)) / 1000000 as demand_twh + FROM grid.egon_etrago_load a + JOIN grid.egon_etrago_load_timeseries b ON (a.load_id = b.load_id) + JOIN grid.egon_etrago_bus c ON (a.bus = c.bus_id) + WHERE b.scn_name = '{self.scenario}' + AND a.scn_name = '{self.scenario}' + AND c.scn_name = '{self.scenario}' + AND c.country = 'DE' + AND a.carrier IN ('rural_heat', 'central_heat') + ), + input_demand AS ( + SELECT + SUM(demand / 1000000) as demand_twh + FROM demand.egon_peta_heat + WHERE scenario = '{self.scenario}' + ) + SELECT + o.demand_twh as output_demand_twh, + i.demand_twh as input_demand_twh + FROM output_demand o + CROSS JOIN input_demand i + """ + + def evaluate_df(self, df, ctx): + """ + Evaluate heat demand comparison. + + Parameters + ---------- + df : pd.DataFrame + DataFrame with output_demand_twh and input_demand_twh columns + ctx : dict + Context information + + Returns + ------- + RuleResult + Validation result with success/failure status + """ + if df.empty or df["output_demand_twh"].isna().all() or df["input_demand_twh"].isna().all(): + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + message=f"No heat demand data found for {self.scenario}", + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + output_twh = float(df["output_demand_twh"].values[0]) + input_twh = float(df["input_demand_twh"].values[0]) + + # Calculate deviation + deviation = abs(output_twh - input_twh) / input_twh + deviation_pct = deviation * 100 + diff_twh = output_twh - input_twh + + success = deviation <= self.rtol + + if success: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=True, + observed=output_twh, + expected=input_twh, + 
message=( + f"Heat demand valid for {self.scenario}: " + f"{output_twh:.2f} TWh vs {input_twh:.2f} TWh expected " + f"(deviation: {deviation_pct:.2f}%, tolerance: {self.rtol*100:.2f}%)" + ), + severity=Severity.INFO, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + else: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + observed=output_twh, + expected=input_twh, + message=( + f"Heat demand deviation too large for {self.scenario}: " + f"{output_twh:.2f} TWh vs {input_twh:.2f} TWh expected " + f"(diff: {diff_twh:+.2f} TWh, deviation: {deviation_pct:.2f}%, " + f"tolerance: {self.rtol*100:.2f}%)" + ), + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) diff --git a/src/egon/data/validation/rules/custom/sanity/home_batteries.py b/src/egon/data/validation/rules/custom/sanity/home_batteries.py new file mode 100644 index 000000000..828250230 --- /dev/null +++ b/src/egon/data/validation/rules/custom/sanity/home_batteries.py @@ -0,0 +1,240 @@ +""" +Sanity check validation rules for home batteries + +Validates that home battery capacities are correctly aggregated +from building-level to bus-level in the storages table. +""" + +import pandas as pd +from egon_validation.rules.base import DataFrameRule, RuleResult, Severity + +from egon.data import config +from egon.data.validation.rules.custom.sanity.utils import get_cbat_pbat_ratio + + +class HomeBatteriesAggregation(DataFrameRule): + """ + Validate home battery capacity aggregation from buildings to buses. + + This rule checks that the sum of home battery capacities allocated to + buildings matches the aggregated capacity per bus in the storage table. + + The check compares: + 1. p_nom (power rating in MW) per bus + 2. capacity (energy capacity in MWh) per bus + + Both values are rounded to 6 decimal places for comparison. 
+ """ + + def __init__( + self, table: str, rule_id: str, scenario: str = "eGon2035", **kwargs + ): + super().__init__( + rule_id=rule_id, table=table, scenario=scenario, **kwargs + ) + self.kind = "sanity" + self.scenario = scenario + + def evaluate(self, engine, ctx) -> RuleResult: + """Override evaluate to catch errors from get_cbat_pbat_ratio().""" + try: + return super().evaluate(engine, ctx) + except IndexError as e: + # get_cbat_pbat_ratio() failed - no home_battery data exists + if "index 0 is out of bounds" in str(e): + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + message=( + f"NO DATA FOUND: No home_battery carrier found in " + f"etrago_storage table for scenario {self.scenario}" + ), + severity=Severity.WARNING, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + raise + + def get_query(self, ctx): + """ + Query to compare storage and building-level home battery data. + + Returns a joined query that compares aggregated building-level data + with the storage table data per bus. 
+ """ + # Get table names from config + sources = config.datasets()["home_batteries"]["sources"] + targets = config.datasets()["home_batteries"]["targets"] + + # Get cbat_pbat_ratio for capacity calculation + cbat_pbat_ratio = get_cbat_pbat_ratio() + + storage_schema = sources["storage"]["schema"] + storage_table = sources["storage"]["table"] + hb_schema = targets["home_batteries"]["schema"] + hb_table = targets["home_batteries"]["table"] + + return f""" + WITH storage_data AS ( + SELECT + bus_id, + el_capacity as storage_p_nom, + el_capacity * {cbat_pbat_ratio} as storage_capacity + FROM {storage_schema}.{storage_table} + WHERE carrier = 'home_battery' + AND scenario = '{self.scenario}' + ), + building_data AS ( + SELECT + bus_id, + SUM(p_nom) as building_p_nom, + SUM(capacity) as building_capacity + FROM {hb_schema}.{hb_table} + WHERE scenario = '{self.scenario}' + GROUP BY bus_id + ) + SELECT + COALESCE(s.bus_id, b.bus_id) as bus_id, + ROUND(s.storage_p_nom::numeric, 6) as storage_p_nom, + ROUND(s.storage_capacity::numeric, 6) as storage_capacity, + ROUND(b.building_p_nom::numeric, 6) as building_p_nom, + ROUND(b.building_capacity::numeric, 6) as building_capacity + FROM storage_data s + FULL OUTER JOIN building_data b ON s.bus_id = b.bus_id + ORDER BY bus_id + """ + + def evaluate_df(self, df, ctx): + """ + Evaluate the comparison between storage and building data. 
+ + Parameters + ---------- + df : pd.DataFrame + DataFrame with storage and building data per bus + ctx : dict + Context information + + Returns + ------- + RuleResult + Validation result with success/failure status + """ + if df.empty: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + message=( + f"No home battery data found for scenario {self.scenario}" + ), + severity=Severity.WARNING, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + # Check for buses that exist in only one source + missing_in_storage = df[df["storage_p_nom"].isna()] + missing_in_buildings = df[df["building_p_nom"].isna()] + + if not missing_in_storage.empty or not missing_in_buildings.empty: + violations = [] + if not missing_in_storage.empty: + bus_list = missing_in_storage['bus_id'].tolist()[:5] + violations.append( + f"{len(missing_in_storage)} bus(es) in buildings " + f"but not in storage: {bus_list}" + ) + if not missing_in_buildings.empty: + bus_list = missing_in_buildings['bus_id'].tolist()[:5] + violations.append( + f"{len(missing_in_buildings)} bus(es) in storage " + f"but not in buildings: {bus_list}" + ) + + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + observed=len(missing_in_storage) + len(missing_in_buildings), + expected=0, + message=f"Bus mismatch: {'; '.join(violations)}", + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + # Check if p_nom values match + p_nom_mismatch = df[df["storage_p_nom"] != df["building_p_nom"]] + + # Check if capacity values match + cap_mismatch = df[df["storage_capacity"] != df["building_capacity"]] + + # Combine mismatches + mismatches = pd.concat( + [p_nom_mismatch, cap_mismatch] + ).drop_duplicates(subset=["bus_id"]) + + if not mismatches.empty: + # Calculate maximum differences + p_nom_diff 
= df["storage_p_nom"] - df["building_p_nom"] + cap_diff = df["storage_capacity"] - df["building_capacity"] + max_p_nom_diff = p_nom_diff.abs().max() + max_capacity_diff = cap_diff.abs().max() + + # Get all violations + cols = [ + "bus_id", "storage_p_nom", "building_p_nom", + "storage_capacity", "building_capacity" + ] + all_violations = mismatches[cols].to_dict(orient="records") + + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + observed=float(max(max_p_nom_diff, max_capacity_diff)), + expected=0.0, + message=( + f"Home battery aggregation mismatch for " + f"{len(mismatches)} bus(es): " + f"max p_nom diff={max_p_nom_diff:.6f}, " + f"max capacity diff={max_capacity_diff:.6f}. " + f"violations: {all_violations}" + ), + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + # All checks passed + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=True, + observed=0.0, + expected=0.0, + message=( + f"Home battery capacities correctly aggregated for all " + f"{len(df)} buses in scenario {self.scenario}" + ), + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + diff --git a/src/egon/data/validation/rules/custom/sanity/residential_electricity.py b/src/egon/data/validation/rules/custom/sanity/residential_electricity.py new file mode 100644 index 000000000..b53ac4bcc --- /dev/null +++ b/src/egon/data/validation/rules/custom/sanity/residential_electricity.py @@ -0,0 +1,191 @@ +"""Residential electricity demand sanity check validation rules.""" + +from egon_validation.rules.base import DataFrameRule, RuleResult, Severity +import numpy as np + + +class ResidentialElectricityAnnualSum(DataFrameRule): + """Validate aggregated annual residential electricity demand matches DemandRegio at NUTS-3. 
+ + Aggregates the annual demand of all census cells at NUTS3 to compare + with initial scaling parameters from DemandRegio. + + Args: + table: Primary table being validated (demand.egon_demandregio_zensus_electricity) + rule_id: Unique identifier for this validation rule + rtol: Relative tolerance for comparison (default: 0.005 = 0.5%) + + Example: + >>> validation = { + ... "data_quality": [ + ... ResidentialElectricityAnnualSum( + ... table="demand.egon_demandregio_zensus_electricity", + ... rule_id="SANITY_RESIDENTIAL_ELECTRICITY_ANNUAL_SUM", + ... rtol=0.005 + ... ) + ... ] + ... } + """ + + def __init__(self, table: str, rule_id: str, rtol: float = 0.005, **kwargs): + super().__init__(rule_id=rule_id, table=table, rtol=rtol, **kwargs) + self.kind = "sanity" # Override inferred kind + + def get_query(self, ctx): + return """ + SELECT dr.nuts3, dr.scenario, dr.demand_regio_sum, profiles.profile_sum + FROM ( + SELECT scenario, SUM(demand) AS profile_sum, vg250_nuts3 + FROM demand.egon_demandregio_zensus_electricity AS egon, + boundaries.egon_map_zensus_vg250 AS boundaries + WHERE egon.zensus_population_id = boundaries.zensus_population_id + AND sector = 'residential' + GROUP BY vg250_nuts3, scenario + ) AS profiles + JOIN ( + SELECT nuts3, scenario, sum(demand) AS demand_regio_sum + FROM demand.egon_demandregio_hh + GROUP BY year, scenario, nuts3 + ) AS dr + ON profiles.vg250_nuts3 = dr.nuts3 AND profiles.scenario = dr.scenario + """ + + def evaluate_df(self, df, ctx): + rtol = self.params.get("rtol", 0.005) + + try: + np.testing.assert_allclose( + actual=df["profile_sum"], + desired=df["demand_regio_sum"], + rtol=rtol, + verbose=False, + ) + + # Calculate actual max deviation for reporting + max_diff = ((df["profile_sum"] - df["demand_regio_sum"]) / df["demand_regio_sum"]).abs().max() + + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=True, + observed=float(max_diff), + expected=rtol, + 
message=f"Aggregated annual residential electricity demand matches with DemandRegio at NUTS-3 (max deviation: {max_diff:.4%}, tolerance: {rtol:.4%})", + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + except AssertionError: + max_diff = ((df["profile_sum"] - df["demand_regio_sum"]) / df["demand_regio_sum"]).abs().max() + violations = df[~np.isclose(df["profile_sum"], df["demand_regio_sum"], rtol=rtol)] + + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + observed=float(max_diff), + expected=rtol, + message=f"Demand mismatch: max deviation {max_diff:.4%} exceeds tolerance {rtol:.4%}. {len(violations)} NUTS-3 regions have mismatches.", + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + +class ResidentialElectricityHhRefinement(DataFrameRule): + """Validate aggregated household types after refinement match original census values. + + Checks sum of aggregated household types after refinement method + was applied and compares it to the original census values. + + Args: + table: Primary table being validated (society.egon_destatis_zensus_household_per_ha_refined) + rule_id: Unique identifier for this validation rule + rtol: Relative tolerance for comparison (default: 1e-5 = 0.001%) + + Example: + >>> validation = { + ... "data_quality": [ + ... ResidentialElectricityHhRefinement( + ... table="society.egon_destatis_zensus_household_per_ha_refined", + ... rule_id="SANITY_RESIDENTIAL_HH_REFINEMENT", + ... rtol=1e-5 + ... ) + ... ] + ... 
} + """ + + def __init__(self, table: str, rule_id: str, rtol: float = 1e-5, **kwargs): + super().__init__(rule_id=rule_id, table=table, rtol=rtol, **kwargs) + self.kind = "sanity" + + def get_query(self, ctx): + return """ + SELECT refined.nuts3, refined.characteristics_code, + refined.sum_refined::int, census.sum_census::int + FROM( + SELECT nuts3, characteristics_code, SUM(hh_10types) as sum_refined + FROM society.egon_destatis_zensus_household_per_ha_refined + GROUP BY nuts3, characteristics_code) + AS refined + JOIN( + SELECT t.nuts3, t.characteristics_code, sum(orig) as sum_census + FROM( + SELECT nuts3, cell_id, characteristics_code, + sum(DISTINCT(hh_5types))as orig + FROM society.egon_destatis_zensus_household_per_ha_refined + GROUP BY cell_id, characteristics_code, nuts3) AS t + GROUP BY t.nuts3, t.characteristics_code ) AS census + ON refined.nuts3 = census.nuts3 + AND refined.characteristics_code = census.characteristics_code + """ + + def evaluate_df(self, df, ctx): + rtol = self.params.get("rtol", 1e-5) + + try: + np.testing.assert_allclose( + actual=df["sum_refined"], + desired=df["sum_census"], + rtol=rtol, + verbose=False, + ) + + max_diff = ((df["sum_refined"] - df["sum_census"]) / df["sum_census"]).abs().max() + + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=True, + observed=float(max_diff), + expected=rtol, + message=f"All aggregated household types match at NUTS-3 (max deviation: {max_diff:.6%}, tolerance: {rtol:.6%})", + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + except AssertionError: + max_diff = ((df["sum_refined"] - df["sum_census"]) / df["sum_census"]).abs().max() + violations = df[~np.isclose(df["sum_refined"], df["sum_census"], rtol=rtol)] + + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + observed=float(max_diff), + expected=rtol, + message=f"Household 
refinement mismatch: max deviation {max_diff:.6%} exceeds tolerance {rtol:.6%}. {len(violations)} NUTS-3/characteristic combinations have mismatches.", + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) diff --git a/src/egon/data/validation/rules/custom/sanity/utils.py b/src/egon/data/validation/rules/custom/sanity/utils.py new file mode 100644 index 000000000..239fa7eea --- /dev/null +++ b/src/egon/data/validation/rules/custom/sanity/utils.py @@ -0,0 +1,26 @@ +"""Utility functions for sanity check validation rules.""" + +from egon.data import config, db + + +def get_cbat_pbat_ratio(): + """ + Mean ratio between the storage capacity and the power of the pv rooftop + system + + Returns + ------- + int + Mean ratio between the storage capacity and the power of the pv + rooftop system + """ + sources = config.datasets()["home_batteries"]["sources"] + + sql = f""" + SELECT max_hours + FROM {sources["etrago_storage"]["schema"]} + .{sources["etrago_storage"]["table"]} + WHERE carrier = 'home_battery' + """ + + return int(db.select_dataframe(sql).iat[0, 0]) diff --git a/src/egon/data/validation/specs.py b/src/egon/data/validation/specs.py new file mode 100644 index 000000000..f2d2138fb --- /dev/null +++ b/src/egon/data/validation/specs.py @@ -0,0 +1,202 @@ +"""Validation specifications and expansion logic.""" + +from __future__ import annotations + +import copy +import logging +from dataclasses import dataclass +from typing import Any, List, Mapping, Optional, Sequence, Union + +from egon_validation.rules.base import Rule +from egon_validation import ( + RowCountValidation, + DataTypeValidation, + NotNullAndNotNaNValidation, + ValueSetValidation, + SRIDUniqueNonZero, + WholeTableNotNullAndNotNaNValidation, +) + +from .resolver import resolve_value + +logger = logging.getLogger(__name__) + + +@dataclass(frozen=True, slots=True) +class TableValidation: + """ + A compact, table-first spec that expands into 
Rule objects at runtime. + + Properties: + - table_name + - row_count + - geometry_columns + - data_type_columns + - not_null_columns + - value_set_columns + + Behavior: + - Adds WholeTableNotNullAndNotNaNValidation automatically. + - Generates rule_id strings matching your prior manual convention. + """ + + table_name: str + row_count: Optional[Any] = None + geometry_columns: Optional[Sequence[str]] = None + data_type_columns: Optional[Mapping[str, Any]] = None + not_null_columns: Optional[Sequence[str]] = None + value_set_columns: Optional[Mapping[str, Any]] = None + + def to_rules(self) -> List[Rule]: + rules: List[Rule] = [] + table_suffix = self.table_name.split(".")[-1] + + if self.row_count is not None: + rules.append( + RowCountValidation( + table=self.table_name, + rule_id=f"ROW_COUNT.{table_suffix}", + expected_count=self.row_count, + ) + ) + + if self.data_type_columns is not None: + rules.append( + DataTypeValidation( + table=self.table_name, + rule_id=f"DATA_TYPES.{table_suffix}", + column_types=dict(self.data_type_columns), + ) + ) + + if self.not_null_columns: + rules.append( + NotNullAndNotNaNValidation( + table=self.table_name, + rule_id=f"NOT_NAN.{table_suffix}", + columns=list(self.not_null_columns), + ) + ) + + if self.geometry_columns: + for geom_col in self.geometry_columns: + rules.append( + SRIDUniqueNonZero( + table=self.table_name, + rule_id=f"SRIDUniqueNonZero.{table_suffix}.{geom_col}", + column=geom_col, + ) + ) + + if self.value_set_columns: + for col_name, expected_values in self.value_set_columns.items(): + rules.append( + ValueSetValidation( + table=self.table_name, + rule_id=f"VALUE_SET_{str(col_name).upper()}" + f".{table_suffix}", + column=str(col_name), + expected_values=expected_values, + ) + ) + + # Always add the whole-table rule automatically + rules.append( + WholeTableNotNullAndNotNaNValidation( + table=self.table_name, + rule_id=f"TABLE_NOT_NAN.{table_suffix}", + ) + ) + + return rules + + +ValidationSpec = Union[Rule, 
TableValidation] + + +def clone_rule(rule: Rule) -> Rule: + """ + Creates a per-run copy of a rule to avoid mutating DAG-parse-time objects. + + We avoid deepcopy as first choice (can break on complex objects). + Strategy: + 1) Shallow copy the object + 2) Deep copy ONLY rule.params (the part we mutate) + 3) Fallback to deepcopy(rule) if shallow copy fails + """ + try: + # shallow copy: new object, same inner references + cloned = copy.copy(rule) + except Exception: + # Last resort: full deepcopy + return copy.deepcopy(rule) + + # Make params safe to mutate + params = getattr(cloned, "params", None) + if hasattr(cloned, "params") and isinstance(params, dict): + cloned.params = copy.deepcopy(cloned.params) + + return cloned + + +def expand_specs(specs: Sequence[ValidationSpec]) -> List[Rule]: + """ + Turn a mixed list of Rule/TableValidation into a plain list of Rules. + + TableValidation produces fresh rule instances. + Rule instances are cloned to avoid cross-run mutation. + """ + rules: List[Rule] = [] + + for spec in specs: + if isinstance(spec, TableValidation): + rules.extend(spec.to_rules()) + else: + rules.append(clone_rule(spec)) + + return rules + + +def resolve_rule_params(rule: Rule, boundary: str) -> None: + """ + Mutates rule.params on THIS rule instance only. + We ensure these rule instances are runtime clones/fresh instances. 
+ """ + params = getattr(rule, "params", None) + if not isinstance(params, dict): + return + + for name, val in list(params.items()): + resolved = resolve_value(val, boundary) + if resolved is not val: + rule_id = getattr(rule, "rule_id", "") + logger.info( + "Rule %s: Resolved %s for boundary='%s'", + rule_id, name, boundary + ) + params[name] = resolved + + +def prepare_rules( + specs: Sequence[ValidationSpec], + boundary: str, + dataset_name: str, + task_name: str, +) -> List[Rule]: + """ + Build rules for this run: + - expand specs + - inject dataset/task if missing + - resolve boundary-dependent params + """ + rules = expand_specs(specs) + + for rule in rules: + if getattr(rule, "task", None) is None: + rule.task = task_name + if getattr(rule, "dataset", None) is None: + rule.dataset = dataset_name + + resolve_rule_params(rule, boundary) + + return rules