From e14f22f77e94ff3836edaa1759c661f63b9c6aa1 Mon Sep 17 00:00:00 2001 From: sarah Date: Thu, 20 Nov 2025 11:53:54 +0100 Subject: [PATCH 01/54] start validation declaration in pipeline --- src/egon/data/datasets/__init__.py | 52 +++++++++++++++- src/egon/data/datasets/vg250/__init__.py | 12 ++++ src/egon/data/validation_utils.py | 76 ++++++++++++++++++++++++ 3 files changed, 139 insertions(+), 1 deletion(-) create mode 100644 src/egon/data/validation_utils.py diff --git a/src/egon/data/datasets/__init__.py b/src/egon/data/datasets/__init__.py index d65339d01..f88fbd7a7 100644 --- a/src/egon/data/datasets/__init__.py +++ b/src/egon/data/datasets/__init__.py @@ -3,7 +3,7 @@ from __future__ import annotations from collections import abc -from dataclasses import dataclass +from dataclasses import dataclass, field from functools import partial, reduce, update_wrapper from typing import Callable, Iterable, Set, Tuple, Union import re @@ -12,9 +12,17 @@ from airflow.operators.python import PythonOperator from sqlalchemy import Column, ForeignKey, Integer, String, Table, orm, tuple_ from sqlalchemy.ext.declarative import declarative_base +from typing import Dict, List +from egon.data.validation_utils import create_validation_tasks from egon.data import config, db, logger +try: + from egon_validation.rules.base import Rule + except ImportError: + Rule = None # Type hint only + + Base = declarative_base() SCHEMA = "metadata" @@ -197,6 +205,8 @@ class Dataset: #: The tasks of this :class:`Dataset`. A :class:`TaskGraph` will #: automatically be converted to :class:`Tasks_`. 
tasks: Tasks = () + validation: Dict[str, List] = field(default_factory=dict) + validation_on_failure: str = "continue" def check_version(self, after_execution=()): scenario_names = config.settings()["egon-data"]["--scenarios"] @@ -264,6 +274,20 @@ def __post_init__(self): self.dependencies = list(self.dependencies) if not isinstance(self.tasks, Tasks_): self.tasks = Tasks_(self.tasks) + # Process validation configuration + if self.validation: + validation_tasks = create_validation_tasks( + validation_dict=self.validation, + dataset_name=self.name, + on_failure=self.validation_on_failure + ) + + # Append validation tasks to existing tasks + if validation_tasks: + task_list = list(self.tasks.graph if hasattr(self.tasks, 'graph') else self.tasks) + task_list.extend(validation_tasks) + self.tasks = Tasks_(tuple(task_list)) + if len(self.tasks.last) > 1: # Explicitly create single final task, because we can't know # which of the multiple tasks finishes last. @@ -302,3 +326,29 @@ def __post_init__(self): for p in predecessors: for first in self.tasks.first: p.set_downstream(first) + + # Link validation tasks to run after data tasks + if self.validation and validation_tasks: + # Get last non-validation tasks + non_validation_task_ids = [ + task.task_id for task in self.tasks.values() + if not any(task.task_id.endswith(f".validate.{name}") for name in self.validation.keys()) + ] + + last_data_tasks = [ + task for task in self.tasks.values() + if task.task_id in non_validation_task_ids and task in self.tasks.last + ] + + if not last_data_tasks: + # Fallback to last non-validation task + last_data_tasks = [ + task for task in self.tasks.values() + if task.task_id in non_validation_task_ids + ][-1:] + + # Link each validation task downstream of last data tasks + for validation_task in validation_tasks: + for last_task in last_data_tasks: + last_task.set_downstream(validation_task) + diff --git a/src/egon/data/datasets/vg250/__init__.py 
b/src/egon/data/datasets/vg250/__init__.py index 378f86895..90aec2037 100644 --- a/src/egon/data/datasets/vg250/__init__.py +++ b/src/egon/data/datasets/vg250/__init__.py @@ -29,6 +29,7 @@ meta_metadata, ) import egon.data.config +from egon_validation import RowCountValidation def download_files(): @@ -529,4 +530,15 @@ def __init__(self, dependencies): add_metadata, cleaning_and_preperation, ), + validation={ + "data_quality": [ + RowCountValidation( + table="boundaries.vg250_krs", + rule_id="TEST_ROW_COUNT", + expected_count=27 + ) + ] + }, + validation_on_failure="continue" + ) diff --git a/src/egon/data/validation_utils.py b/src/egon/data/validation_utils.py new file mode 100644 index 000000000..e00f6fd8e --- /dev/null +++ b/src/egon/data/validation_utils.py @@ -0,0 +1,76 @@ +"""Airflow integration for egon-validation.""" + +from typing import Dict, List +from airflow.operators.python import PythonOperator +from egon_validation import run_validations, RunContext +from egon_validation.rules.base import Rule +from egon_validation.config import get_env, build_db_url +from egon_validation import db +import logging + +logger = logging.getLogger(__name__) + + +def create_validation_tasks( + validation_dict: Dict[str, List[Rule]], + dataset_name: str, + on_failure: str = "continue" +) -> List[PythonOperator]: + """Convert validation dict to Airflow tasks. 
+ + Args: + validation_dict: {"task_name": [Rule1(), Rule2()]} + dataset_name: Name of dataset + on_failure: "continue" or "fail" + + Returns: + List of PythonOperator tasks + """ + if not validation_dict: + return [] + + tasks = [] + + for task_name, rules in validation_dict.items(): + def make_callable(rules, task_name): + def run_validation(**context): + from datetime import datetime + + execution_date = context.get("execution_date", datetime.now()) + run_id = f"airflow-{dataset_name}-{task_name}-{execution_date.strftime('%Y%m%dT%H%M%S')}" + + logger.info(f"Validation: {dataset_name}.{task_name}") + + db_url = get_env("EGON_DB_URL") or build_db_url() + engine = db.make_engine(db_url) + + try: + ctx = RunContext(run_id=run_id, source="airflow") + results = run_validations(engine, ctx, rules, task_name) + + total = len(results) + failed = sum(1 for r in results if not r.success) + + logger.info(f"Complete: {total - failed}/{total} passed") + + if failed > 0 and on_failure == "fail": + raise Exception(f"{failed}/{total} validations failed") + + return {"total": total, "passed": total - failed, "failed": failed} + finally: + engine.dispose() + + return run_validation + + func = make_callable(rules, task_name) + func.__name__ = f"validate_{task_name}" + + operator = PythonOperator( + task_id=f"{dataset_name}.validate.{task_name}", + python_callable=func, + provide_context=True, + ) + + tasks.append(operator) + + return tasks From 31d69d9008364c6c331ff6d9e790bfcc2012186a Mon Sep 17 00:00:00 2001 From: sarah Date: Thu, 20 Nov 2025 11:53:54 +0100 Subject: [PATCH 02/54] start validation declaration in pipeline --- src/egon/data/datasets/__init__.py | 52 +++++++++++++++- src/egon/data/datasets/vg250/__init__.py | 12 ++++ src/egon/data/validation_utils.py | 76 ++++++++++++++++++++++++ 3 files changed, 139 insertions(+), 1 deletion(-) create mode 100644 src/egon/data/validation_utils.py diff --git a/src/egon/data/datasets/__init__.py b/src/egon/data/datasets/__init__.py 
index d65339d01..28761e367 100644 --- a/src/egon/data/datasets/__init__.py +++ b/src/egon/data/datasets/__init__.py @@ -3,7 +3,7 @@ from __future__ import annotations from collections import abc -from dataclasses import dataclass +from dataclasses import dataclass, field from functools import partial, reduce, update_wrapper from typing import Callable, Iterable, Set, Tuple, Union import re @@ -12,9 +12,17 @@ from airflow.operators.python import PythonOperator from sqlalchemy import Column, ForeignKey, Integer, String, Table, orm, tuple_ from sqlalchemy.ext.declarative import declarative_base +from typing import Dict, List +from egon.data.validation_utils import create_validation_tasks from egon.data import config, db, logger +try: + from egon_validation.rules.base import Rule +except ImportError: + Rule = None # Type hint only + + Base = declarative_base() SCHEMA = "metadata" @@ -197,6 +205,8 @@ class Dataset: #: The tasks of this :class:`Dataset`. A :class:`TaskGraph` will #: automatically be converted to :class:`Tasks_`. 
tasks: Tasks = () + validation: Dict[str, List] = field(default_factory=dict) + validation_on_failure: str = "continue" def check_version(self, after_execution=()): scenario_names = config.settings()["egon-data"]["--scenarios"] @@ -264,6 +274,20 @@ def __post_init__(self): self.dependencies = list(self.dependencies) if not isinstance(self.tasks, Tasks_): self.tasks = Tasks_(self.tasks) + # Process validation configuration + if self.validation: + validation_tasks = create_validation_tasks( + validation_dict=self.validation, + dataset_name=self.name, + on_failure=self.validation_on_failure + ) + + # Append validation tasks to existing tasks + if validation_tasks: + task_list = list(self.tasks.graph if hasattr(self.tasks, 'graph') else self.tasks) + task_list.extend(validation_tasks) + self.tasks = Tasks_(tuple(task_list)) + if len(self.tasks.last) > 1: # Explicitly create single final task, because we can't know # which of the multiple tasks finishes last. @@ -302,3 +326,29 @@ def __post_init__(self): for p in predecessors: for first in self.tasks.first: p.set_downstream(first) + + # Link validation tasks to run after data tasks + if self.validation and validation_tasks: + # Get last non-validation tasks + non_validation_task_ids = [ + task.task_id for task in self.tasks.values() + if not any(task.task_id.endswith(f".validate.{name}") for name in self.validation.keys()) + ] + + last_data_tasks = [ + task for task in self.tasks.values() + if task.task_id in non_validation_task_ids and task in self.tasks.last + ] + + if not last_data_tasks: + # Fallback to last non-validation task + last_data_tasks = [ + task for task in self.tasks.values() + if task.task_id in non_validation_task_ids + ][-1:] + + # Link each validation task downstream of last data tasks + for validation_task in validation_tasks: + for last_task in last_data_tasks: + last_task.set_downstream(validation_task) + diff --git a/src/egon/data/datasets/vg250/__init__.py 
b/src/egon/data/datasets/vg250/__init__.py index 378f86895..90aec2037 100644 --- a/src/egon/data/datasets/vg250/__init__.py +++ b/src/egon/data/datasets/vg250/__init__.py @@ -29,6 +29,7 @@ meta_metadata, ) import egon.data.config +from egon_validation import RowCountValidation def download_files(): @@ -529,4 +530,15 @@ def __init__(self, dependencies): add_metadata, cleaning_and_preperation, ), + validation={ + "data_quality": [ + RowCountValidation( + table="boundaries.vg250_krs", + rule_id="TEST_ROW_COUNT", + expected_count=27 + ) + ] + }, + validation_on_failure="continue" + ) diff --git a/src/egon/data/validation_utils.py b/src/egon/data/validation_utils.py new file mode 100644 index 000000000..e00f6fd8e --- /dev/null +++ b/src/egon/data/validation_utils.py @@ -0,0 +1,76 @@ +"""Airflow integration for egon-validation.""" + +from typing import Dict, List +from airflow.operators.python import PythonOperator +from egon_validation import run_validations, RunContext +from egon_validation.rules.base import Rule +from egon_validation.config import get_env, build_db_url +from egon_validation import db +import logging + +logger = logging.getLogger(__name__) + + +def create_validation_tasks( + validation_dict: Dict[str, List[Rule]], + dataset_name: str, + on_failure: str = "continue" +) -> List[PythonOperator]: + """Convert validation dict to Airflow tasks. 
+ + Args: + validation_dict: {"task_name": [Rule1(), Rule2()]} + dataset_name: Name of dataset + on_failure: "continue" or "fail" + + Returns: + List of PythonOperator tasks + """ + if not validation_dict: + return [] + + tasks = [] + + for task_name, rules in validation_dict.items(): + def make_callable(rules, task_name): + def run_validation(**context): + from datetime import datetime + + execution_date = context.get("execution_date", datetime.now()) + run_id = f"airflow-{dataset_name}-{task_name}-{execution_date.strftime('%Y%m%dT%H%M%S')}" + + logger.info(f"Validation: {dataset_name}.{task_name}") + + db_url = get_env("EGON_DB_URL") or build_db_url() + engine = db.make_engine(db_url) + + try: + ctx = RunContext(run_id=run_id, source="airflow") + results = run_validations(engine, ctx, rules, task_name) + + total = len(results) + failed = sum(1 for r in results if not r.success) + + logger.info(f"Complete: {total - failed}/{total} passed") + + if failed > 0 and on_failure == "fail": + raise Exception(f"{failed}/{total} validations failed") + + return {"total": total, "passed": total - failed, "failed": failed} + finally: + engine.dispose() + + return run_validation + + func = make_callable(rules, task_name) + func.__name__ = f"validate_{task_name}" + + operator = PythonOperator( + task_id=f"{dataset_name}.validate.{task_name}", + python_callable=func, + provide_context=True, + ) + + tasks.append(operator) + + return tasks From fcb29951ea44822924e773e6117d2b1831f8919d Mon Sep 17 00:00:00 2001 From: sarah Date: Wed, 26 Nov 2025 09:16:23 +0100 Subject: [PATCH 03/54] debug spacing --- src/egon/data/datasets/vg250/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/egon/data/datasets/vg250/__init__.py b/src/egon/data/datasets/vg250/__init__.py index 90aec2037..f1f6610e1 100644 --- a/src/egon/data/datasets/vg250/__init__.py +++ b/src/egon/data/datasets/vg250/__init__.py @@ -516,7 +516,7 @@ class Vg250(Dataset): #: name: str = "VG250" #: - 
version: str = filename + "-0.0.4" + version: str = filename + "-0.0.4 dev" def __init__(self, dependencies): super().__init__( From 6460cac2836c9118b03672e08b3c62ffbc478835 Mon Sep 17 00:00:00 2001 From: sarah Date: Wed, 26 Nov 2025 09:20:04 +0100 Subject: [PATCH 04/54] debug spacing --- src/egon/data/datasets/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/egon/data/datasets/__init__.py b/src/egon/data/datasets/__init__.py index f88fbd7a7..28761e367 100644 --- a/src/egon/data/datasets/__init__.py +++ b/src/egon/data/datasets/__init__.py @@ -19,7 +19,7 @@ try: from egon_validation.rules.base import Rule - except ImportError: +except ImportError: Rule = None # Type hint only From 9193ff3240957bd87835dcfdc2c3eec59d19e705 Mon Sep 17 00:00:00 2001 From: sarah Date: Wed, 26 Nov 2025 09:21:49 +0100 Subject: [PATCH 05/54] add validation report as dataset --- src/egon/data/airflow/dags/pipeline.py | 9 ++ src/egon/data/datasets/validation_report.py | 104 ++++++++++++++++++++ 2 files changed, 113 insertions(+) create mode 100644 src/egon/data/datasets/validation_report.py diff --git a/src/egon/data/airflow/dags/pipeline.py b/src/egon/data/airflow/dags/pipeline.py index e9b87ea94..3dd84b071 100755 --- a/src/egon/data/airflow/dags/pipeline.py +++ b/src/egon/data/airflow/dags/pipeline.py @@ -102,6 +102,8 @@ from egon.data.datasets.zensus_vg250 import ZensusVg250 from egon.data.metadata import Json_Metadata +from egon.data.datasets.validation_report import ValidationReport + # Set number of threads used by numpy and pandas set_numexpr_threads() @@ -730,6 +732,13 @@ ] ) + with TaskGroup(group_id="validation_report") as validation_report_group: + # Generate validation report from all validation tasks + # NOTE: Temporarily depends only on vg250 for testing purposes + validation_report = ValidationReport( + dependencies=[vg250] + ) + with TaskGroup(group_id="sanity_checks") as sanity_checks_group: # ########## Keep this dataset at the end # Sanity 
Checks diff --git a/src/egon/data/datasets/validation_report.py b/src/egon/data/datasets/validation_report.py new file mode 100644 index 000000000..c4cc1e823 --- /dev/null +++ b/src/egon/data/datasets/validation_report.py @@ -0,0 +1,104 @@ +""" +Dataset for generating validation reports during pipeline execution. + +This module provides the ValidationReport dataset which generates comprehensive +validation reports by aggregating all validation results from individual dataset +validation tasks executed during the pipeline run. +""" + +import os +import time + +from egon.data import logger +from egon.data.datasets import Dataset +from egon_validation import RunContext +from egon_validation.runner.aggregate import collect, build_coverage, write_outputs +from egon_validation.report.generate import generate + +# Default output directory for validation results +DEFAULT_OUT_DIR = "./validation_runs" + + +def generate_validation_report(**kwargs): + """ + Generate validation report aggregating all validation results. 
+ + This function collects all validation results from individual dataset + validation tasks that were executed during the pipeline run and generates + a comprehensive HTML report including: + - All validation results from individual dataset tasks + - Coverage analysis showing which tables were validated + - Summary statistics and pass/fail counts + """ + # Use same run_id as other validation tasks in the pipeline + # This ensures all tasks read/write to the same directory + run_id = ( + os.environ.get('AIRFLOW_CTX_DAG_RUN_ID') or + kwargs.get('run_id') or + (kwargs.get('ti') and hasattr(kwargs['ti'], 'dag_run') and kwargs['ti'].dag_run.run_id) or + (kwargs.get('dag_run') and kwargs['dag_run'].run_id) or + f"pipeline_validation_report_{int(time.time())}" + ) + out_dir = DEFAULT_OUT_DIR + + try: + ctx = RunContext(run_id=run_id, source="airflow", out_dir=out_dir) + logger.info("Starting pipeline validation report generation", extra={ + "run_id": run_id, + "output_dir": out_dir + }) + + # Collect all validation results from existing validation runs + collected = collect(ctx) + coverage = build_coverage(ctx, collected) + final_out_dir = write_outputs(ctx, collected, coverage) + generate(ctx) + + report_path = os.path.join(final_out_dir, 'report.html') + logger.info("Pipeline validation report generated successfully", extra={ + "report_path": report_path, + "run_id": run_id, + "total_results": len(collected.get("items", [])) + }) + + except FileNotFoundError as e: + logger.warning("No validation results found for pipeline validation report", extra={ + "run_id": run_id, + "error": str(e), + "suggestion": "This may be expected if no validation tasks were run during the pipeline" + }) + # Don't raise - this is acceptable if no validations were run + except Exception as e: + logger.error("Pipeline validation report generation failed", extra={ + "run_id": run_id, + "error": str(e), + "error_type": type(e).__name__ + }) + raise + + +# Define the task +tasks = 
(generate_validation_report,) + + +class ValidationReport(Dataset): + """ + Dataset for generating validation reports. + + This dataset generates a comprehensive HTML validation report by aggregating + all validation results from individual dataset validation tasks that were + executed during the pipeline run. It should be placed before sanity_checks + in the DAG to ensure validation results are collected before final checks. + """ + #: + name: str = "ValidationReport" + #: + version: str = "0.0.2 dev" + + def __init__(self, dependencies): + super().__init__( + name=self.name, + version=self.version, + dependencies=dependencies, + tasks=tasks, + ) From b0aeb0bdea9f6e849eae67f5429d5bb9f25479c1 Mon Sep 17 00:00:00 2001 From: sarah Date: Wed, 26 Nov 2025 09:26:43 +0100 Subject: [PATCH 06/54] add egon-validation as dependency --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index d36887230..48c6eda87 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,6 +47,7 @@ dependencies = [ "cdsapi", "click<8.1", "disaggregator @ git+https://github.com/openego/disaggregator.git@features/update-cache-directory#egg=disaggregator", + "egon-validation @ git+https://github.com/sagemaso/eGon-validation.git@feature/inline-validation-declaration#egg=egon-validation", "entsoe-py>=0.6.2", "fiona==1.9.6", "Flask-Session<0.6.0", From d69f3e43f177bae21d135d3a664ea48d2623904c Mon Sep 17 00:00:00 2001 From: sarah Date: Wed, 26 Nov 2025 09:32:45 +0100 Subject: [PATCH 07/54] change how to save validation results, use db from pipeline --- src/egon/data/validation_utils.py | 54 ++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 18 deletions(-) diff --git a/src/egon/data/validation_utils.py b/src/egon/data/validation_utils.py index e00f6fd8e..e349de386 100644 --- a/src/egon/data/validation_utils.py +++ b/src/egon/data/validation_utils.py @@ -4,8 +4,6 @@ from airflow.operators.python import PythonOperator from egon_validation 
import run_validations, RunContext from egon_validation.rules.base import Rule -from egon_validation.config import get_env, build_db_url -from egon_validation import db import logging logger = logging.getLogger(__name__) @@ -34,31 +32,51 @@ def create_validation_tasks( for task_name, rules in validation_dict.items(): def make_callable(rules, task_name): def run_validation(**context): + import os + import time from datetime import datetime + from egon.data import db as egon_db - execution_date = context.get("execution_date", datetime.now()) - run_id = f"airflow-{dataset_name}-{task_name}-{execution_date.strftime('%Y%m%dT%H%M%S')}" + # Use same run_id as validation report for consistency + # This allows the validation report to collect results from all validation tasks + run_id = ( + os.environ.get('AIRFLOW_CTX_DAG_RUN_ID') or + context.get('run_id') or + (context.get('ti') and hasattr(context['ti'], 'dag_run') and context['ti'].dag_run.run_id) or + (context.get('dag_run') and context['dag_run'].run_id) or + f"airflow-{dataset_name}-{task_name}-{int(time.time())}" + ) - logger.info(f"Validation: {dataset_name}.{task_name}") + # Include execution timestamp in task name so retries write to separate directories + # The validation report will filter to keep only the most recent execution per task + execution_date = context.get('execution_date') or datetime.now() + timestamp = execution_date.strftime('%Y%m%dT%H%M%S') + full_task_name = f"{dataset_name}.{task_name}.{timestamp}" - db_url = get_env("EGON_DB_URL") or build_db_url() - engine = db.make_engine(db_url) + logger.info(f"Validation: {full_task_name} (run_id: {run_id})") - try: - ctx = RunContext(run_id=run_id, source="airflow") - results = run_validations(engine, ctx, rules, task_name) + # Use existing engine from egon.data.db + engine = egon_db.engine() - total = len(results) - failed = sum(1 for r in results if not r.success) + # Set task and dataset on all rules (required by Rule base class) + for rule in rules: 
+ if not hasattr(rule, 'task') or rule.task is None: + rule.task = task_name + if not hasattr(rule, 'dataset') or rule.dataset is None: + rule.dataset = dataset_name - logger.info(f"Complete: {total - failed}/{total} passed") + ctx = RunContext(run_id=run_id, source="airflow") + results = run_validations(engine, ctx, rules, full_task_name) - if failed > 0 and on_failure == "fail": - raise Exception(f"{failed}/{total} validations failed") + total = len(results) + failed = sum(1 for r in results if not r.success) - return {"total": total, "passed": total - failed, "failed": failed} - finally: - engine.dispose() + logger.info(f"Complete: {total - failed}/{total} passed") + + if failed > 0 and on_failure == "fail": + raise Exception(f"{failed}/{total} validations failed") + + return {"total": total, "passed": total - failed, "failed": failed} return run_validation From 57a45c290b5e16660f3ecd32771439a4187fcdf4 Mon Sep 17 00:00:00 2001 From: sarah Date: Wed, 26 Nov 2025 14:00:52 +0100 Subject: [PATCH 08/54] change out_dir, use .dev --- src/egon/data/datasets/validation_report.py | 14 ++++++++------ src/egon/data/datasets/vg250/__init__.py | 2 +- src/egon/data/validation_utils.py | 9 ++++++++- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/src/egon/data/datasets/validation_report.py b/src/egon/data/datasets/validation_report.py index c4cc1e823..9aa223777 100644 --- a/src/egon/data/datasets/validation_report.py +++ b/src/egon/data/datasets/validation_report.py @@ -15,10 +15,6 @@ from egon_validation.runner.aggregate import collect, build_coverage, write_outputs from egon_validation.report.generate import generate -# Default output directory for validation results -DEFAULT_OUT_DIR = "./validation_runs" - - def generate_validation_report(**kwargs): """ Generate validation report aggregating all validation results. 
@@ -39,7 +35,13 @@ def generate_validation_report(**kwargs): (kwargs.get('dag_run') and kwargs['dag_run'].run_id) or f"pipeline_validation_report_{int(time.time())}" ) - out_dir = DEFAULT_OUT_DIR + + # Determine output directory at runtime (not import time) + # Priority: EGON_VALIDATION_DIR env var > current working directory + out_dir = os.path.join( + os.environ.get('EGON_VALIDATION_DIR', os.getcwd()), + "validation_runs" + ) try: ctx = RunContext(run_id=run_id, source="airflow", out_dir=out_dir) @@ -93,7 +95,7 @@ class ValidationReport(Dataset): #: name: str = "ValidationReport" #: - version: str = "0.0.2 dev" + version: str = "0.0.2.dev" def __init__(self, dependencies): super().__init__( diff --git a/src/egon/data/datasets/vg250/__init__.py b/src/egon/data/datasets/vg250/__init__.py index f1f6610e1..07a886453 100644 --- a/src/egon/data/datasets/vg250/__init__.py +++ b/src/egon/data/datasets/vg250/__init__.py @@ -516,7 +516,7 @@ class Vg250(Dataset): #: name: str = "VG250" #: - version: str = filename + "-0.0.4 dev" + version: str = filename + "-0.0.4.dev" def __init__(self, dependencies): super().__init__( diff --git a/src/egon/data/validation_utils.py b/src/egon/data/validation_utils.py index e349de386..6fd4ea6b8 100644 --- a/src/egon/data/validation_utils.py +++ b/src/egon/data/validation_utils.py @@ -47,6 +47,13 @@ def run_validation(**context): f"airflow-{dataset_name}-{task_name}-{int(time.time())}" ) + # Use absolute path to ensure consistent location regardless of working directory + # Priority: EGON_VALIDATION_DIR env var > current working directory + out_dir = os.path.join( + os.environ.get('EGON_VALIDATION_DIR', os.getcwd()), + "validation_runs" + ) + # Include execution timestamp in task name so retries write to separate directories # The validation report will filter to keep only the most recent execution per task execution_date = context.get('execution_date') or datetime.now() @@ -65,7 +72,7 @@ def run_validation(**context): if not hasattr(rule, 
'dataset') or rule.dataset is None: rule.dataset = dataset_name - ctx = RunContext(run_id=run_id, source="airflow") + ctx = RunContext(run_id=run_id, source="airflow", out_dir=out_dir) results = run_validations(engine, ctx, rules, full_task_name) total = len(results) From 3c8b1c166d901f7de8b9f241a64976839b5bc01a Mon Sep 17 00:00:00 2001 From: sarah Date: Wed, 3 Dec 2025 10:07:07 +0100 Subject: [PATCH 09/54] debug table count --- src/egon/data/datasets/validation_report.py | 28 ++++++++++++++++----- src/egon/data/validation_utils.py | 1 - 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/src/egon/data/datasets/validation_report.py b/src/egon/data/datasets/validation_report.py index 9aa223777..5a70814ec 100644 --- a/src/egon/data/datasets/validation_report.py +++ b/src/egon/data/datasets/validation_report.py @@ -9,11 +9,14 @@ import os import time -from egon.data import logger +from egon.data import logger, db as egon_db from egon.data.datasets import Dataset from egon_validation import RunContext from egon_validation.runner.aggregate import collect, build_coverage, write_outputs from egon_validation.report.generate import generate +from egon_validation.runner.coverage_analysis import discover_total_tables +from egon_validation.config import ENV_DB_URL +import os as _os def generate_validation_report(**kwargs): """ @@ -50,6 +53,17 @@ def generate_validation_report(**kwargs): "output_dir": out_dir }) + # Make database connection available for table counting + # Set the database URL from egon.data configuration + try: + # Get the database URL from egon.data + db_url = str(egon_db.engine().url) + # Temporarily set the environment variable so discover_total_tables can use it + _os.environ[ENV_DB_URL] = db_url + logger.info("Database connection available for table counting") + except Exception as e: + logger.warning(f"Could not set database URL for table counting: {e}") + # Collect all validation results from existing validation runs collected = collect(ctx) 
coverage = build_coverage(ctx, collected) @@ -63,12 +77,14 @@ def generate_validation_report(**kwargs): "total_results": len(collected.get("items", [])) }) + except FileNotFoundError as e: - logger.warning("No validation results found for pipeline validation report", extra={ - "run_id": run_id, - "error": str(e), - "suggestion": "This may be expected if no validation tasks were run during the pipeline" - }) + logger.warning( + f"No validation results found for pipeline validation report | " + f"run_id={run_id} | out_dir={out_dir} | error={e} | " + f"suggestion=This may be expected if no validation tasks were run during the pipeline" + ) + # Don't raise - this is acceptable if no validations were run except Exception as e: logger.error("Pipeline validation report generation failed", extra={ diff --git a/src/egon/data/validation_utils.py b/src/egon/data/validation_utils.py index 6fd4ea6b8..048cfa242 100644 --- a/src/egon/data/validation_utils.py +++ b/src/egon/data/validation_utils.py @@ -8,7 +8,6 @@ logger = logging.getLogger(__name__) - def create_validation_tasks( validation_dict: Dict[str, List[Rule]], dataset_name: str, From 942d89dfebe0d9aba8fe5a0c0e7be2296d7bc293 Mon Sep 17 00:00:00 2001 From: sarah Date: Tue, 9 Dec 2025 11:01:25 +0100 Subject: [PATCH 10/54] use egon-validation v1.1.0 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 48c6eda87..67f0e5da5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,7 +47,7 @@ dependencies = [ "cdsapi", "click<8.1", "disaggregator @ git+https://github.com/openego/disaggregator.git@features/update-cache-directory#egg=disaggregator", - "egon-validation @ git+https://github.com/sagemaso/eGon-validation.git@feature/inline-validation-declaration#egg=egon-validation", + "egon-validation @ git+https://github.com/sagemaso/eGon-validation.git@v1.1.0#egg=egon-validation", "entsoe-py>=0.6.2", "fiona==1.9.6", "Flask-Session<0.6.0", From 
b727c0b3362319e291e111a1455521923edfead7 Mon Sep 17 00:00:00 2001 From: sarah Date: Thu, 11 Dec 2025 16:31:11 +0100 Subject: [PATCH 11/54] add validation rules to vg250 --- pyproject.toml | 2 +- src/egon/data/datasets/vg250/__init__.py | 34 ++++++++++++++++++++++-- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 67f0e5da5..2549710cd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,7 +47,7 @@ dependencies = [ "cdsapi", "click<8.1", "disaggregator @ git+https://github.com/openego/disaggregator.git@features/update-cache-directory#egg=disaggregator", - "egon-validation @ git+https://github.com/sagemaso/eGon-validation.git@v1.1.0#egg=egon-validation", + "egon-validation @ git+https://github.com/sagemaso/eGon-validation.git@dev", "entsoe-py>=0.6.2", "fiona==1.9.6", "Flask-Session<0.6.0", diff --git a/src/egon/data/datasets/vg250/__init__.py b/src/egon/data/datasets/vg250/__init__.py index 07a886453..1bd9e8c2e 100644 --- a/src/egon/data/datasets/vg250/__init__.py +++ b/src/egon/data/datasets/vg250/__init__.py @@ -29,7 +29,13 @@ meta_metadata, ) import egon.data.config -from egon_validation import RowCountValidation +from egon_validation import ( + RowCountValidation, + DataTypeValidation, + NotNullAndNotNaNValidation, + WholeTableNotNullAndNotNaNValidation, + ValueSetValidation +) def download_files(): @@ -536,9 +542,33 @@ def __init__(self, dependencies): table="boundaries.vg250_krs", rule_id="TEST_ROW_COUNT", expected_count=27 + ), + DataTypeValidation( + table="boundaries.vg250_krs", + rule_id="TEST_DATA_MULTIPLE_TYPES", + column_types={"id":"bigint","ade":"bigint", "gf":"bigint", "bsg":"bigint","ars":"text", + "ags":"text", "sdv_ars":"text", "gen":"text", "bez":"text","ibz":"bigint", + "bem":"text", "nbd":"text", "sn_l":"text", "sn_r":"text", "sn_k":"text", + "sn_v1":"text", "sn_v2":"text", "sn_g":"text", "fk_s3":"text", "nuts":"text", + "ars_0":"text", "ags_0":"text", "wsk":"text", "debkg_id":"text", 
"rs":"text", + "sdv_rs":"text", "rs_0":"text", "geometry":"geometry"} + ), + NotNullAndNotNaNValidation( + table="boundaries.vg250_krs", + rule_id="TEST_NOT_NAN", + columns=["gf","bsg"] + ), + WholeTableNotNullAndNotNaNValidation( + table="boundaries.vg250_krs", + rule_id="TEST_WHOLE_TABLE_NOT_NAN" + ), + ValueSetValidation( + table="boundaries.vg250_krs", + rule_id="TEST_VALUE_SET", + column="nbd", + expected_values=["ja", "nein"] ) ] }, validation_on_failure="continue" - ) From 059170fbcd89d6c9d7d879a546ba26b9c0fc6e3e Mon Sep 17 00:00:00 2001 From: sarah Date: Mon, 15 Dec 2025 16:35:25 +0100 Subject: [PATCH 12/54] Add sanity check validation to HouseholdElectricityDemand dataset --- .../datasets/electricity_demand/__init__.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/egon/data/datasets/electricity_demand/__init__.py b/src/egon/data/datasets/electricity_demand/__init__.py index f6ef464d5..27042ea2f 100644 --- a/src/egon/data/datasets/electricity_demand/__init__.py +++ b/src/egon/data/datasets/electricity_demand/__init__.py @@ -10,6 +10,10 @@ from egon.data import db from egon.data.datasets import Dataset from egon.data.datasets.electricity_demand.temporal import insert_cts_load +from egon.data.validation.rules.custom.sanity import ( + ResidentialElectricityAnnualSum, + ResidentialElectricityHhRefinement, +) from egon.data.datasets.electricity_demand_timeseries.hh_buildings import ( HouseholdElectricityProfilesOfBuildings, get_iee_hh_demand_profiles_raw, @@ -53,6 +57,21 @@ def __init__(self, dependencies): version=self.version, dependencies=dependencies, tasks=(create_tables, get_annual_household_el_demand_cells), + validation={ + "data_quality": [ + ResidentialElectricityAnnualSum( + table="demand.egon_demandregio_zensus_electricity", + rule_id="SANITY_RESIDENTIAL_ELECTRICITY_ANNUAL_SUM", + rtol=0.005 + ), + ResidentialElectricityHhRefinement( + table="society.egon_destatis_zensus_household_per_ha_refined", + 
rule_id="SANITY_RESIDENTIAL_HH_REFINEMENT", + rtol=1e-5 + ), + ] + }, + validation_on_failure="continue" ) From 655af1bd6cbfea847a13f507850702a209d40f29 Mon Sep 17 00:00:00 2001 From: sarah Date: Wed, 31 Dec 2025 14:13:52 +0100 Subject: [PATCH 13/54] add validation boundaries.egon_map_zensus_buildings_residential and _filtered --- .../osm_buildings_streets/__init__.py | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/src/egon/data/datasets/osm_buildings_streets/__init__.py b/src/egon/data/datasets/osm_buildings_streets/__init__.py index 5677cf224..ee76b55fa 100644 --- a/src/egon/data/datasets/osm_buildings_streets/__init__.py +++ b/src/egon/data/datasets/osm_buildings_streets/__init__.py @@ -7,6 +7,11 @@ from egon.data import db from egon.data.datasets import Dataset +from egon_validation import ( + RowCountValidation, + DataTypeValidation, + WholeTableNotNullAndNotNaNValidation +) def execute_sql_script(script): @@ -211,4 +216,36 @@ def __init__(self, dependencies): drop_temp_tables, add_metadata, ), + validation={ + "data_quality": [ + RowCountValidation( + table="boundaries.egon_map_zensus_buildings_filtered", + rule_id="TEST_ROW_COUNT", + expected_count=28070301 + ), + DataTypeValidation( + table="boundaries.egon_map_zensus_buildings_filtered", + rule_id="TEST_DATA_MULTIPLE_TYPES", + column_types={"id": "integer", "cell_id": "integer"} + ), + WholeTableNotNullAndNotNaNValidation( + table="boundaries.egon_map_zensus_buildings_filtered", + rule_id="TEST_WHOLE_TABLE_NOT_NAN" + ), + RowCountValidation( + table="boundaries.egon_map_zensus_buildings_residential", + rule_id="TEST_ROW_COUNT", + expected_count=27477467 + ), + DataTypeValidation( + table="boundaries.egon_map_zensus_buildings_residential", + rule_id="TEST_DATA_MULTIPLE_TYPES", + column_types={"id": "integer", "cell_id": "integer"} + ), + WholeTableNotNullAndNotNaNValidation( + table="boundaries.egon_map_zensus_buildings_residential", + rule_id="TEST_WHOLE_TABLE_NOT_NAN" + ) + ] + } ) 
From 01e0123b7f4a0422ca8f68739161ed7a32d13e8c Mon Sep 17 00:00:00 2001 From: sarah Date: Thu, 8 Jan 2026 12:37:14 +0100 Subject: [PATCH 14/54] Add automatic boundary/scenario-dependent validation parameter resolution --- src/egon/data/validation_utils.py | 107 +++++++++++++++++++++++++++++- 1 file changed, 106 insertions(+), 1 deletion(-) diff --git a/src/egon/data/validation_utils.py b/src/egon/data/validation_utils.py index 048cfa242..b9d68a708 100644 --- a/src/egon/data/validation_utils.py +++ b/src/egon/data/validation_utils.py @@ -1,6 +1,6 @@ """Airflow integration for egon-validation.""" -from typing import Dict, List +from typing import Any, Dict, List from airflow.operators.python import PythonOperator from egon_validation import run_validations, RunContext from egon_validation.rules.base import Rule @@ -8,6 +8,79 @@ logger = logging.getLogger(__name__) + +def _resolve_context_value(value: Any, boundary: str, scenarios: List[str]) -> Any: + """Resolve a value that may be context-dependent (boundary/scenario). + + Args: + value: The value to resolve. Can be: + - A dict with boundary keys: {"Schleswig-Holstein": 27, "Everything": 537} + - A dict with scenario keys: {"eGon2035": 100, "eGon100RE": 200} + - Any other value (returned as-is) + boundary: Current dataset boundary setting + scenarios: List of active scenarios + + Returns: + Resolved value based on current context + + Examples: + >>> _resolve_context_value({"Schleswig-Holstein": 27, "Everything": 537}, + ... "Schleswig-Holstein", ["eGon2035"]) + 27 + + >>> _resolve_context_value({"eGon2035": 100, "eGon100RE": 200}, + ... 
"Everything", ["eGon2035"]) + 100 + + >>> _resolve_context_value(42, "Everything", ["eGon2035"]) + 42 + """ + # If not a dict, return as-is + if not isinstance(value, dict): + return value + + # Try to resolve by boundary + if boundary in value: + logger.debug(f"Resolved boundary-dependent value: {boundary} -> {value[boundary]}") + return value[boundary] + + # Try to resolve by scenario + for scenario in scenarios: + if scenario in value: + logger.debug(f"Resolved scenario-dependent value: {scenario} -> {value[scenario]}") + return value[scenario] + + # If dict doesn't match boundary/scenario pattern, return as-is + # This handles cases like column_types dicts which are not context-dependent + return value + + +def _resolve_rule_params(rule: Rule, boundary: str, scenarios: List[str]) -> None: + """Recursively resolve context-dependent parameters in a rule. + + Modifies rule.params in-place, resolving any dict values that match + boundary or scenario patterns. + + Args: + rule: The validation rule to process + boundary: Current dataset boundary setting + scenarios: List of active scenarios + """ + if not hasattr(rule, 'params') or not isinstance(rule.params, dict): + return + + # Recursively resolve all parameter values + for param_name, param_value in rule.params.items(): + resolved_value = _resolve_context_value(param_value, boundary, scenarios) + + # If the value was resolved (changed), update it + if resolved_value is not param_value: + logger.info( + f"Rule {rule.rule_id}: Resolved {param_name} for " + f"boundary='{boundary}', scenarios={scenarios}" + ) + rule.params[param_name] = resolved_value + def create_validation_tasks( validation_dict: Dict[str, List[Rule]], dataset_name: str, @@ -15,6 +88,14 @@ def create_validation_tasks( ) -> List[PythonOperator]: """Convert validation dict to Airflow tasks. + Automatically resolves context-dependent parameters in validation rules. 
+ Parameters can be specified as dicts with boundary or scenario keys: + + - Boundary-dependent: {"Schleswig-Holstein": 27, "Everything": 537} + - Scenario-dependent: {"eGon2035": 100, "eGon100RE": 200} + + The appropriate value is selected based on the current configuration. + Args: validation_dict: {"task_name": [Rule1(), Rule2()]} dataset_name: Name of dataset @@ -22,6 +103,18 @@ def create_validation_tasks( Returns: List of PythonOperator tasks + + Example: + >>> validation_dict = { + ... "data_quality": [ + ... RowCountValidation( + ... table="boundaries.vg250_krs", + ... rule_id="TEST_ROW_COUNT", + ... expected_count={"Schleswig-Holstein": 27, "Everything": 537} + ... ) + ... ] + ... } + >>> tasks = create_validation_tasks(validation_dict, "VG250") """ if not validation_dict: return [] @@ -35,6 +128,7 @@ def run_validation(**context): import time from datetime import datetime from egon.data import db as egon_db + from egon.data.config import settings # Use same run_id as validation report for consistency # This allows the validation report to collect results from all validation tasks @@ -64,13 +158,24 @@ def run_validation(**context): # Use existing engine from egon.data.db engine = egon_db.engine() + # Get current configuration context + config = settings()["egon-data"] + boundary = config["--dataset-boundary"] + scenarios = config.get("--scenarios", []) + + logger.info(f"Resolving validation parameters for boundary='{boundary}', scenarios={scenarios}") + # Set task and dataset on all rules (required by Rule base class) + # Also resolve context-dependent parameters for rule in rules: if not hasattr(rule, 'task') or rule.task is None: rule.task = task_name if not hasattr(rule, 'dataset') or rule.dataset is None: rule.dataset = dataset_name + # Automatically resolve boundary/scenario-dependent parameters + _resolve_rule_params(rule, boundary, scenarios) + ctx = RunContext(run_id=run_id, source="airflow", out_dir=out_dir) results = run_validations(engine, ctx, 
rules, full_task_name) From 42808cdc2a5e95e938f08c4d565435eb5b72bd12 Mon Sep 17 00:00:00 2001 From: sarah Date: Tue, 6 Jan 2026 13:40:47 +0100 Subject: [PATCH 15/54] correct spelling demand --- .../data/datasets/electricity_demand_timeseries/hh_profiles.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/egon/data/datasets/electricity_demand_timeseries/hh_profiles.py b/src/egon/data/datasets/electricity_demand_timeseries/hh_profiles.py index 7d613be6c..dc7ac60a7 100644 --- a/src/egon/data/datasets/electricity_demand_timeseries/hh_profiles.py +++ b/src/egon/data/datasets/electricity_demand_timeseries/hh_profiles.py @@ -1583,7 +1583,7 @@ def houseprofiles_in_census_cells(): """ Allocate household electricity demand profiles for each census cell. - Creates table `emand.egon_household_electricity_profile_in_census_cell` that maps + Creates table `demand.egon_household_electricity_profile_in_census_cell` that maps household electricity demand profiles to census cells. Each row represents one cell and contains a list of profile IDs. 
This table is fundamental for creating subsequent data like demand profiles on MV grid level or for From c9f2ce46381e03274c750b783d7db214dce42468 Mon Sep 17 00:00:00 2001 From: sarah Date: Tue, 6 Jan 2026 14:18:53 +0100 Subject: [PATCH 16/54] add formal validation to main demand datasets --- src/egon/data/datasets/__init__.py | 4 +- .../data/datasets/demandregio/__init__.py | 51 +++++ .../district_heating_areas/__init__.py | 33 +++ .../datasets/electricity_demand/__init__.py | 35 +++- .../hh_buildings.py | 78 +++++++ .../hh_profiles.py | 27 +++ .../motorized_individual_travel/__init__.py | 192 ++++++++++++++++++ .../osm_buildings_streets/__init__.py | 17 +- src/egon/data/datasets/vg250/__init__.py | 2 +- .../data/datasets/zensus_mv_grid_districts.py | 24 +++ 10 files changed, 451 insertions(+), 12 deletions(-) diff --git a/src/egon/data/datasets/__init__.py b/src/egon/data/datasets/__init__.py index 28761e367..e0a14046e 100644 --- a/src/egon/data/datasets/__init__.py +++ b/src/egon/data/datasets/__init__.py @@ -206,7 +206,7 @@ class Dataset: #: automatically be converted to :class:`Tasks_`. 
tasks: Tasks = () validation: Dict[str, List] = field(default_factory=dict) - validation_on_failure: str = "continue" + on_validation_failure: str = "continue" def check_version(self, after_execution=()): scenario_names = config.settings()["egon-data"]["--scenarios"] @@ -279,7 +279,7 @@ def __post_init__(self): validation_tasks = create_validation_tasks( validation_dict=self.validation, dataset_name=self.name, - on_failure=self.validation_on_failure + on_failure=self.on_validation_failure ) # Append validation tasks to existing tasks diff --git a/src/egon/data/datasets/demandregio/__init__.py b/src/egon/data/datasets/demandregio/__init__.py index 479492ceb..b4ea4856f 100644 --- a/src/egon/data/datasets/demandregio/__init__.py +++ b/src/egon/data/datasets/demandregio/__init__.py @@ -20,6 +20,12 @@ ) import egon.data.config import egon.data.datasets.scenario_parameters.parameters as scenario_parameters +from egon_validation import ( + RowCountValidation, + DataTypeValidation, + WholeTableNotNullAndNotNaNValidation, + ValueSetValidation +) try: from disaggregator import config, data, spatial, temporal @@ -87,6 +93,51 @@ def __init__(self, dependencies): insert_cts_ind_demands, }, ), + validation={ + "data_quality": [ + RowCountValidation( + table=" demand.egon_demandregio_hh", + rule_id="ROW_COUNT.egon_demandregio_hh", + expected_count=7218 + ), + DataTypeValidation( + table="demand.egon_demandregio_hh", + rule_id="DATA_MULTIPLE_TYPES.egon_demandregio_hh", + column_types={"nuts3": "character varying", "hh_size": "integer", "year": "integer", "demand": "double precision"} + ), + WholeTableNotNullAndNotNaNValidation( + table="demand.egon_demandregio_hh", + rule_id="WHOLE_TABLE_NOT_NAN.egon_demandregio_hh" + ), + ValueSetValidation( + table="demand.egon_demandregio_hh", + rule_id="VALUE_SET_VALIDATION_SCENARIO.egon_demandregio_hh", + column="scenario", + expected_values=["eGon2035", "eGon100RE", "eGon2021"] + ), + RowCountValidation( + table=" 
demand.egon_demandregio_wz", + rule_id="ROW_COUNT.egon_demandregio_wz", + expected_count=87 + ), + DataTypeValidation( + table="demand.egon_demandregio_wz", + rule_id="DATA_MULTIPLE_TYPES.egon_demandregio_wz", + column_types={"wz": "integer", "sector": "character varying", "definition": "character varying"} + ), + WholeTableNotNullAndNotNaNValidation( + table="demand.egon_demandregio_wz", + rule_id="WHOLE_TABLE_NOT_NAN.egon_demandregio_wz" + ), + ValueSetValidation( + table="demand.egon_demandregio_wz", + rule_id="VALUE_SET_VALIDATION_SECTOR.egon_demandregio_wz", + column="sector", + expected_values=["industry", "CTS"] + ), + ] + }, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/district_heating_areas/__init__.py b/src/egon/data/datasets/district_heating_areas/__init__.py index df347bdbb..eb3ced010 100644 --- a/src/egon/data/datasets/district_heating_areas/__init__.py +++ b/src/egon/data/datasets/district_heating_areas/__init__.py @@ -40,6 +40,13 @@ ) from egon.data.metadata import context, license_ccby, meta_metadata, sources +from egon_validation import ( + RowCountValidation, + DataTypeValidation, + WholeTableNotNullAndNotNaNValidation, + ValueSetValidation +) + # import time @@ -82,6 +89,32 @@ def __init__(self, dependencies): version=self.version, # maybe rethink the naming dependencies=dependencies, tasks=(create_tables, demarcation), + validation={ + "data_quality": [ + RowCountValidation( + table=" demand.egon_district_heating_areas", + rule_id="ROW_COUNT.egon_district_heating_areas", + expected_count=6335 + ), + DataTypeValidation( + table="demand.egon_district_heating_areas", + rule_id="DATA_MULTIPLE_TYPES.egon_district_heating_areas", + column_types={"id": "integer", "area_id": "integer", "scenario": "character varying", + "geom_polygon": "geometry", "residential_and_service_demand": "double precision"} + ), + WholeTableNotNullAndNotNaNValidation( + table="demand.egon_district_heating_areas", + 
rule_id="WHOLE_TABLE_NOT_NAN.egon_district_heating_areas" + ), + ValueSetValidation( + table="demand.egon_district_heating_areas", + rule_id="VALUE_SET_VALIDATION_SCENARIO.egon_district_heating_areas", + column="scenario", + expected_values=["eGon2035", "eGon100RE"] + ), + ] + }, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/electricity_demand/__init__.py b/src/egon/data/datasets/electricity_demand/__init__.py index 27042ea2f..83ee7637c 100644 --- a/src/egon/data/datasets/electricity_demand/__init__.py +++ b/src/egon/data/datasets/electricity_demand/__init__.py @@ -14,6 +14,13 @@ ResidentialElectricityAnnualSum, ResidentialElectricityHhRefinement, ) +from egon_validation import ( + RowCountValidation, + DataTypeValidation, + WholeTableNotNullAndNotNaNValidation, + ValueSetValidation +) + from egon.data.datasets.electricity_demand_timeseries.hh_buildings import ( HouseholdElectricityProfilesOfBuildings, get_iee_hh_demand_profiles_raw, @@ -69,9 +76,35 @@ def __init__(self, dependencies): rule_id="SANITY_RESIDENTIAL_HH_REFINEMENT", rtol=1e-5 ), + RowCountValidation( + table=" demand.egon_demandregio_zensus_electricity", + rule_id="ROW_COUNT.egon_demandregio_zensus_electricity", + expected_count=7355160 + ), + DataTypeValidation( + table="demand.egon_demandregio_zensus_electricity", + rule_id="DATA_MULTIPLE_TYPES.egon_demandregio_zensus_electricity", + column_types={"zensus_population_id": "integer", "scenario": "character varying", "sector": "character varying", "demand": "double precision"} + ), + WholeTableNotNullAndNotNaNValidation( + table="demand.egon_demandregio_zensus_electricity", + rule_id="WHOLE_TABLE_NOT_NAN.egon_demandregio_zensus_electricity" + ), + ValueSetValidation( + table="demand.egon_demandregio_zensus_electricity", + rule_id="VALUE_SET_VALIDATION_SCENARIO.egon_demandregio_zensus_electricity", + column="scenario", + expected_values=["eGon2035", "eGon100RE"] + ), + ValueSetValidation( + 
table="demand.egon_demandregio_zensus_electricity", + rule_id="VALUE_SET_VALIDATION_SECTOR.egon_demandregio_zensus_electricity", + column="sector", + expected_values=["residential", "service"] + ), ] }, - validation_on_failure="continue" + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/electricity_demand_timeseries/hh_buildings.py b/src/egon/data/datasets/electricity_demand_timeseries/hh_buildings.py index 6de5a5b74..c82eefe3e 100755 --- a/src/egon/data/datasets/electricity_demand_timeseries/hh_buildings.py +++ b/src/egon/data/datasets/electricity_demand_timeseries/hh_buildings.py @@ -23,6 +23,12 @@ random_point_in_square, ) import egon.data.config +from egon_validation import ( + RowCountValidation, + DataTypeValidation, + WholeTableNotNullAndNotNaNValidation, + ValueSetValidation +) engine = db.engine() Base = declarative_base() @@ -1232,4 +1238,76 @@ def __init__(self, dependencies): version=self.version, dependencies=dependencies, tasks=self.tasks, + validation={ + "data_quality": [ + RowCountValidation( + table=" demand.egon_building_electricity_peak_loads", + rule_id="ROW_COUNT.egon_building_electricity_peak_loads", + expected_count=44683620 + ), + DataTypeValidation( + table="demand.egon_building_electricity_peak_loads", + rule_id="DATA_MULTIPLE_TYPES.egon_building_electricity_peak_loads", + column_types={"building_id": "integer", "scenario": "character varying", "sector": "character varying", "peak_load_in_w": "real", "voltage_level": "integer"} + ), + WholeTableNotNullAndNotNaNValidation( + table="demand.egon_building_electricity_peak_loads", + rule_id="WHOLE_TABLE_NOT_NAN.egon_building_electricity_peak_loads" + ), + ValueSetValidation( + table="demand.egon_building_electricity_peak_loads", + rule_id="VALUE_SET_VALIDATION_SCENARIO.egon_building_electricity_peak_loads", + column="scenario", + expected_values=["eGon2035", "eGon100RE"] + ), + ValueSetValidation( + table="demand.egon_building_electricity_peak_loads", + 
rule_id="VALUE_SET_VALIDATION_SECTOR.egon_building_electricity_peak_loads", + column="sector", + expected_values=["cts", "residential"] + ), + RowCountValidation( + table=" demand.egon_building_heat_peak_loads", + rule_id="ROW_COUNT.egon_building_heat_peak_loads", + expected_count=42128819 + ), + DataTypeValidation( + table="demand.egon_building_heat_peak_loads", + rule_id="DATA_MULTIPLE_TYPES.egon_building_heat_peak_loads", + column_types={"building_id": "integer", "scenario": "character varying", "sector": "character varying", "peak_load_in_w": "real"} + ), + WholeTableNotNullAndNotNaNValidation( + table="demand.egon_building_heat_peak_loads", + rule_id="WHOLE_TABLE_NOT_NAN.egon_building_heat_peak_loads" + ), + ValueSetValidation( + table="demand.egon_building_heat_peak_loads", + rule_id="VALUE_SET_VALIDATION_SCENARIO.egon_building_heat_peak_loads", + column="scenario", + expected_values=["eGon2035", "eGon100RE"] + ), + ValueSetValidation( + table="demand.egon_building_heat_peak_loads", + rule_id="VALUE_SET_VALIDATION_SECTOR.egon_building_heat_peak_loads", + column="sector", + expected_values=["residential+cts"] + ), + RowCountValidation( + table=" demand.egon_household_electricity_profile_of_buildings", + rule_id="ROW_COUNT.egon_household_electricity_profile_of_buildings", + expected_count=38605221 + ), + DataTypeValidation( + table="demand.egon_household_electricity_profile_of_buildings", + rule_id="DATA_MULTIPLE_TYPES.egon_household_electricity_profile_of_buildings", + column_types={"id": "integer", "building_id": "integer", "cell_id": "integer", + "profile_id": "character varying"} + ), + WholeTableNotNullAndNotNaNValidation( + table="demand.egon_household_electricity_profile_of_buildings", + rule_id="WHOLE_TABLE_NOT_NAN.egon_household_electricity_profile_of_buildings" + ), + ] + }, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/electricity_demand_timeseries/hh_profiles.py 
b/src/egon/data/datasets/electricity_demand_timeseries/hh_profiles.py index dc7ac60a7..42fc6ddc7 100644 --- a/src/egon/data/datasets/electricity_demand_timeseries/hh_profiles.py +++ b/src/egon/data/datasets/electricity_demand_timeseries/hh_profiles.py @@ -27,6 +27,13 @@ from egon.data.datasets.zensus_mv_grid_districts import MapZensusGridDistricts import egon.data.config +from egon_validation import ( + RowCountValidation, + DataTypeValidation, + WholeTableNotNullAndNotNaNValidation, + ValueSetValidation +) + Base = declarative_base() engine = db.engine() @@ -300,6 +307,26 @@ def __init__(self, dependencies): version=self.version, dependencies=dependencies, tasks=tasks, + validation={ + "data_quality": [ + RowCountValidation( + table=" demand.egon_household_electricity_profile_in_census_cell", + rule_id="ROW_COUNT.egon_household_electricity_profile_in_census_cell", + expected_count=3177723 + ), + DataTypeValidation( + table="demand.egon_household_electricity_profile_in_census_cell", + rule_id="DATA_MULTIPLE_TYPES.egon_household_electricity_profile_in_census_cell", + column_types={"cell_id": "integer", "grid_id": "character varying", "cell_profile_ids": "character varying", + "nuts3": "character varying", "nuts1": "character varying", "factor_2035": "double precision", + "factor_2050": "double precision"} + ), + WholeTableNotNullAndNotNaNValidation( + table="demand.egon_household_electricity_profile_in_census_cell", + rule_id="WHOLE_TABLE_NOT_NAN.egon_household_electricity_profile_in_census_cell" + ) + ] + } ) diff --git a/src/egon/data/datasets/emobility/motorized_individual_travel/__init__.py b/src/egon/data/datasets/emobility/motorized_individual_travel/__init__.py index 072a3e342..cbdc0388f 100644 --- a/src/egon/data/datasets/emobility/motorized_individual_travel/__init__.py +++ b/src/egon/data/datasets/emobility/motorized_individual_travel/__init__.py @@ -56,6 +56,13 @@ read_simbev_metadata_file, ) +from egon_validation import ( + RowCountValidation, + 
DataTypeValidation, + WholeTableNotNullAndNotNaNValidation, + ValueSetValidation +) + # ========== Register np datatypes with SQLA ========== def adapt_numpy_float64(numpy_float64): @@ -490,4 +497,189 @@ def generate_model_data_tasks(scenario_name): version=self.version, dependencies=dependencies, tasks=tasks, + validation={ + "data_quality": [ + RowCountValidation( + table=" demand.egon_ev_count_municipality", + rule_id="ROW_COUNT.egon_ev_count_municipality", + expected_count=44012 + ), + DataTypeValidation( + table="demand.egon_ev_count_municipality", + rule_id="DATA_MULTIPLE_TYPES.egon_ev_count_municipality", + column_types={"scenario": "character varying", "scenario_variation": "character varying", + "ags": "integer", "bev_mini": "integer", "bev_medium": "integer", + "bev_luxury": "integer", "phev_mini": "integer", "phev_medium": "integer", + "phev_luxury": "integer", "rs7_id": "smallint"} + ), + WholeTableNotNullAndNotNaNValidation( + table="demand.egon_ev_count_municipality", + rule_id="WHOLE_TABLE_NOT_NAN.egon_ev_count_municipality" + ), + ValueSetValidation( + table="demand.egon_ev_count_municipality", + rule_id="VALUE_SET_VALIDATION_SCENARIO.egon_ev_count_municipality", + column="scenario", + expected_values=["eGon2035", "eGon100RE"] + ), + ValueSetValidation( + table="demand.egon_ev_count_municipality", + rule_id="VALUE_SET_VALIDATION_SCENARIO_VARIATION.egon_ev_count_municipality", + column="scenario_variation", + expected_values=["Mobility Transition 2050", "NEP C 2035", "Electrification 2050", "Reference 2050"] + ), + RowCountValidation( + table=" demand.egon_ev_count_mv_grid_district", + rule_id="ROW_COUNT.egon_ev_count_mv_grid_district", + expected_count=15348 + ), + DataTypeValidation( + table="demand.egon_ev_count_mv_grid_district", + rule_id="DATA_MULTIPLE_TYPES.egon_ev_count_mv_grid_district", + column_types={"scenario": "character varying", "scenario_variation": "character varying", + "bus_id": "integer", "bev_mini": "integer", "bev_medium": 
"integer", + "bev_luxury": "integer", "phev_mini": "integer", "phev_medium": "integer", + "phev_luxury": "integer", "rs7_id": "smallint"} + ), + WholeTableNotNullAndNotNaNValidation( + table="demand.egon_ev_count_mv_grid_district", + rule_id="WHOLE_TABLE_NOT_NAN.egon_ev_count_mv_grid_district" + ), + ValueSetValidation( + table="demand.egon_ev_count_mv_grid_district", + rule_id="VALUE_SET_VALIDATION_SCENARIO.egon_ev_count_mv_grid_district", + column="scenario", + expected_values=["eGon2035", "eGon100RE"] + ), + ValueSetValidation( + table="demand.egon_ev_count_mv_grid_district", + rule_id="VALUE_SET_VALIDATION_SCENARIO_VARIATION.egon_ev_count_mv_grid_district", + column="scenario_variation", + expected_values=["Mobility Transition 2050", "NEP C 2035", "Electrification 2050", + "Reference 2050"] + ), + RowCountValidation( + table=" demand.egon_ev_count_registration_district", + rule_id="ROW_COUNT.egon_ev_count_registration_district", + expected_count=1600 + ), + DataTypeValidation( + table="demand.egon_ev_count_registration_district", + rule_id="DATA_MULTIPLE_TYPES.egon_ev_count_registration_district", + column_types={"scenario": "character varying", "scenario_variation": "character varying", + "ags_reg_district": "integer", "reg_district": "character varying", + "bev_mini": "integer", "bev_medium": "integer", "bev_luxury": "integer", + "phev_mini": "integer", "phev_medium": "integer", "phev_luxury": "integer"} + ), + WholeTableNotNullAndNotNaNValidation( + table="demand.egon_ev_count_registration_district", + rule_id="WHOLE_TABLE_NOT_NAN.egon_ev_count_registration_district" + ), + ValueSetValidation( + table="demand.egon_ev_count_registration_district", + rule_id="VALUE_SET_VALIDATION_SCENARIO.egon_ev_count_registration_district", + column="scenario", + expected_values=["eGon2035", "eGon100RE"] + ), + ValueSetValidation( + table="demand.egon_ev_count_registration_district", + rule_id="VALUE_SET_VALIDATION_SCENARIO_VARIATION.egon_ev_count_registration_district", + 
column="scenario_variation", + expected_values=["Mobility Transition 2050", "NEP C 2035", "Electrification 2050", + "Reference 2050"] + ), + RowCountValidation( + table=" demand.egon_ev_mv_grid_district", + rule_id="ROW_COUNT.egon_ev_mv_grid_district", + expected_count=15348 + ), + DataTypeValidation( + table="demand.egon_ev_mv_grid_district", + rule_id="DATA_MULTIPLE_TYPES.egon_ev_mv_grid_district", + column_types={"scenario": "character varying", "scenario_variation": "character varying", + "bus_id": "integer", "reg_district": "character varying", + "bev_mini": "integer", "bev_medium": "integer", "bev_luxury": "integer", + "phev_mini": "integer", "phev_medium": "integer", "phev_luxury": "integer", + "rs7_id": "smallint"} + ), + WholeTableNotNullAndNotNaNValidation( + table="demand.egon_ev_mv_grid_district", + rule_id="WHOLE_TABLE_NOT_NAN.egon_ev_mv_grid_district" + ), + ValueSetValidation( + table="demand.egon_ev_mv_grid_district", + rule_id="VALUE_SET_VALIDATION_SCENARIO.egon_ev_mv_grid_district", + column="scenario", + expected_values=["eGon2035", "eGon100RE"] + ), + ValueSetValidation( + table="demand.egon_ev_mv_grid_district", + rule_id="VALUE_SET_VALIDATION_SCENARIO_VARIATION.egon_ev_mv_grid_district", + column="scenario_variation", + expected_values=["Mobility Transition 2050", "NEP C 2035", "Electrification 2050", + "Reference 2050"] + ), + RowCountValidation( + table=" demand.egon_ev_pool", + rule_id="ROW_COUNT.egon_ev_pool", + expected_count=65376 + ), + DataTypeValidation( + table="demand.egon_ev_pool", + rule_id="DATA_MULTIPLE_TYPES.egon_ev_pool", + column_types={"scenario": "character varying", "ev_id": "integer", "rs7_id": "smallint", + "type": "character varying", "simbev_ev_id": "integer"} + ), + WholeTableNotNullAndNotNaNValidation( + table="demand.egon_ev_pool", + rule_id="WHOLE_TABLE_NOT_NAN.egon_ev_pool" + ), + ValueSetValidation( + table="demand.egon_ev_pool", + rule_id="VALUE_SET_VALIDATION_SCENARIO.egon_ev_pool", + column="scenario", + 
expected_values=["eGon2035", "eGon100RE"] + ), + ValueSetValidation( + table="demand.egon_ev_pool", + rule_id="VALUE_SET_VALIDATION_TYPE.egon_ev_pool", + column="type", + expected_values=["bev_mini", "bev_medium", "bev_luxury", "phev_mini", "phev_medium", + "phev_luxury"] + ), + RowCountValidation( + table=" demand.egon_ev_trip", + rule_id="ROW_COUNT.egon_ev_trip", + expected_count=108342188 + ), + DataTypeValidation( + table="demand.egon_ev_trip", + rule_id="DATA_MULTIPLE_TYPES.egon_ev_trip", + column_types={"scenario": "character varying", "event_id": "integer", "egon_ev_pool_ev_id": "integer", + "simbev_event_id": "integer", "location": "character varying", "use_case": "character varying", + "charging_capacity_nominal": "real", "charging_capacity_grid": "real", + "charging_capacity_battery": "real", "soc_start": "real", "soc_end": "real", + "charging_demand": "real", "park_start": "integer", "park_end": "integer", + "drive_start": "integer", "drive_end": "integer", "consumption": "real"} + ), + WholeTableNotNullAndNotNaNValidation( + table="demand.egon_ev_trip", + rule_id="WHOLE_TABLE_NOT_NAN.egon_ev_trip" + ), + ValueSetValidation( + table="demand.egon_ev_trip", + rule_id="VALUE_SET_VALIDATION_SCENARIO.egon_ev_trip", + column="scenario", + expected_values=["eGon2035", "eGon100RE"] + ), + ValueSetValidation( + table="demand.egon_ev_trip", + rule_id="VALUE_SET_LOCATION.egon_ev_trip", + column="type", + expected_values=["0_work", "1_business", "2_school", "3_shopping", "4_private/ridesharing", + "5_leisure", "6_home", "7_charging_hub", "driving"] + ), + ] + }, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/osm_buildings_streets/__init__.py b/src/egon/data/datasets/osm_buildings_streets/__init__.py index ee76b55fa..862bc6d64 100644 --- a/src/egon/data/datasets/osm_buildings_streets/__init__.py +++ b/src/egon/data/datasets/osm_buildings_streets/__init__.py @@ -220,32 +220,33 @@ def __init__(self, dependencies): "data_quality": [ 
RowCountValidation( table="boundaries.egon_map_zensus_buildings_filtered", - rule_id="TEST_ROW_COUNT", + rule_id="ROW_COUNT.egon_map_zensus_buildings_filtered", expected_count=28070301 ), DataTypeValidation( table="boundaries.egon_map_zensus_buildings_filtered", - rule_id="TEST_DATA_MULTIPLE_TYPES", + rule_id="DATA_MULTIPLE_TYPES.egon_map_zensus_buildings_filtered", column_types={"id": "integer", "cell_id": "integer"} ), WholeTableNotNullAndNotNaNValidation( table="boundaries.egon_map_zensus_buildings_filtered", - rule_id="TEST_WHOLE_TABLE_NOT_NAN" + rule_id="WHOLE_TABLE_NOT_NAN.egon_map_zensus_buildings_filtered" ), RowCountValidation( table="boundaries.egon_map_zensus_buildings_residential", - rule_id="TEST_ROW_COUNT", + rule_id="ROW_COUNT.egon_map_zensus_buildings_residential", expected_count=27477467 ), DataTypeValidation( table="boundaries.egon_map_zensus_buildings_residential", - rule_id="TEST_DATA_MULTIPLE_TYPES", - column_types={"id": "integer", "cell_id": "integer"} + rule_id="DATA_MULTIPLE_TYPES.egon_map_zensus_buildings_residential", + column_types={"id": "integer", "grid_id": "character varying", "cell_id": "integer"} ), WholeTableNotNullAndNotNaNValidation( table="boundaries.egon_map_zensus_buildings_residential", - rule_id="TEST_WHOLE_TABLE_NOT_NAN" + rule_id="WHOLE_TABLE_NOT_NAN.egon_map_zensus_buildings_residential" ) ] - } + }, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/vg250/__init__.py b/src/egon/data/datasets/vg250/__init__.py index 1bd9e8c2e..8efc46df7 100644 --- a/src/egon/data/datasets/vg250/__init__.py +++ b/src/egon/data/datasets/vg250/__init__.py @@ -570,5 +570,5 @@ def __init__(self, dependencies): ) ] }, - validation_on_failure="continue" + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/zensus_mv_grid_districts.py b/src/egon/data/datasets/zensus_mv_grid_districts.py index ad2b36673..7f606e530 100644 --- a/src/egon/data/datasets/zensus_mv_grid_districts.py +++ 
b/src/egon/data/datasets/zensus_mv_grid_districts.py @@ -11,6 +11,11 @@ from egon.data.datasets.mv_grid_districts import MvGridDistricts from egon.data.datasets.zensus_vg250 import DestatisZensusPopulationPerHa import egon.data.config +from egon_validation import ( + RowCountValidation, + DataTypeValidation, + WholeTableNotNullAndNotNaNValidation +) class ZensusMvGridDistricts(Dataset): @@ -38,6 +43,25 @@ def __init__(self, dependencies): version=self.version, dependencies=dependencies, tasks=(mapping), + validation={ + "data_quality": [ + RowCountValidation( + table=" boundaries.egon_map_zensus_grid_districts", + rule_id="ROW_COUNT.egon_map_zensus_grid_districts", + expected_count=35718586 + ), + DataTypeValidation( + table="boundaries.egon_map_zensus_grid_districts", + rule_id="DATA_MULTIPLE_TYPES.egon_map_zensus_grid_districts", + column_types={"index": "bigint", "zensus_population_id": "bigint", "bus_id": "bigint"} + ), + WholeTableNotNullAndNotNaNValidation( + table="boundaries.egon_map_zensus_grid_districts", + rule_id="WHOLE_TABLE_NOT_NAN.egon_map_zensus_grid_districts" + ), + ] + }, + on_validation_failure="continue" ) From e95d92fb6412e21f258c79351b2237ff0fed886d Mon Sep 17 00:00:00 2001 From: sarah Date: Thu, 8 Jan 2026 15:58:34 +0100 Subject: [PATCH 17/54] add different boundaries --- src/egon/data/datasets/demandregio/__init__.py | 5 +++-- .../district_heating_areas/__init__.py | 2 +- .../datasets/electricity_demand/__init__.py | 2 +- .../hh_buildings.py | 6 +++--- .../hh_profiles.py | 18 ++++++++++++++---- .../motorized_individual_travel/__init__.py | 14 +++++++------- .../data/datasets/zensus_mv_grid_districts.py | 2 +- 7 files changed, 30 insertions(+), 19 deletions(-) diff --git a/src/egon/data/datasets/demandregio/__init__.py b/src/egon/data/datasets/demandregio/__init__.py index b4ea4856f..c4c8a4ed0 100644 --- a/src/egon/data/datasets/demandregio/__init__.py +++ b/src/egon/data/datasets/demandregio/__init__.py @@ -98,12 +98,13 @@ def 
__init__(self, dependencies): RowCountValidation( table=" demand.egon_demandregio_hh", rule_id="ROW_COUNT.egon_demandregio_hh", - expected_count=7218 + expected_count={"Schleswig-Holstein": 180, "everything": 7218} ), DataTypeValidation( table="demand.egon_demandregio_hh", rule_id="DATA_MULTIPLE_TYPES.egon_demandregio_hh", - column_types={"nuts3": "character varying", "hh_size": "integer", "year": "integer", "demand": "double precision"} + column_types={"nuts3": "character varying", "hh_size": "integer", "scenario": "character varying", + "year": "integer", "demand": "double precision"} ), WholeTableNotNullAndNotNaNValidation( table="demand.egon_demandregio_hh", diff --git a/src/egon/data/datasets/district_heating_areas/__init__.py b/src/egon/data/datasets/district_heating_areas/__init__.py index eb3ced010..bf2a02a03 100644 --- a/src/egon/data/datasets/district_heating_areas/__init__.py +++ b/src/egon/data/datasets/district_heating_areas/__init__.py @@ -94,7 +94,7 @@ def __init__(self, dependencies): RowCountValidation( table=" demand.egon_district_heating_areas", rule_id="ROW_COUNT.egon_district_heating_areas", - expected_count=6335 + expected_count={"Schleswig-Holstein": 100, "Everything": 6335} ), DataTypeValidation( table="demand.egon_district_heating_areas", diff --git a/src/egon/data/datasets/electricity_demand/__init__.py b/src/egon/data/datasets/electricity_demand/__init__.py index 83ee7637c..ef975aa54 100644 --- a/src/egon/data/datasets/electricity_demand/__init__.py +++ b/src/egon/data/datasets/electricity_demand/__init__.py @@ -79,7 +79,7 @@ def __init__(self, dependencies): RowCountValidation( table=" demand.egon_demandregio_zensus_electricity", rule_id="ROW_COUNT.egon_demandregio_zensus_electricity", - expected_count=7355160 + expected_count={"Schleswig-Holstein": 154527, "Everything": 7355160} ), DataTypeValidation( table="demand.egon_demandregio_zensus_electricity", diff --git a/src/egon/data/datasets/electricity_demand_timeseries/hh_buildings.py 
b/src/egon/data/datasets/electricity_demand_timeseries/hh_buildings.py index c82eefe3e..7406747b8 100755 --- a/src/egon/data/datasets/electricity_demand_timeseries/hh_buildings.py +++ b/src/egon/data/datasets/electricity_demand_timeseries/hh_buildings.py @@ -1243,7 +1243,7 @@ def __init__(self, dependencies): RowCountValidation( table=" demand.egon_building_electricity_peak_loads", rule_id="ROW_COUNT.egon_building_electricity_peak_loads", - expected_count=44683620 + expected_count={"Schleswig-Holstein": 3054820, "Everything": 44683620} ), DataTypeValidation( table="demand.egon_building_electricity_peak_loads", @@ -1269,7 +1269,7 @@ def __init__(self, dependencies): RowCountValidation( table=" demand.egon_building_heat_peak_loads", rule_id="ROW_COUNT.egon_building_heat_peak_loads", - expected_count=42128819 + expected_count={"Schleswig-Holstein": 732905, "Everything": 42128819} ), DataTypeValidation( table="demand.egon_building_heat_peak_loads", @@ -1295,7 +1295,7 @@ def __init__(self, dependencies): RowCountValidation( table=" demand.egon_household_electricity_profile_of_buildings", rule_id="ROW_COUNT.egon_household_electricity_profile_of_buildings", - expected_count=38605221 + expected_count={"Schleswig-Holstein": 1371592, "Everything": 38605221} ), DataTypeValidation( table="demand.egon_household_electricity_profile_of_buildings", diff --git a/src/egon/data/datasets/electricity_demand_timeseries/hh_profiles.py b/src/egon/data/datasets/electricity_demand_timeseries/hh_profiles.py index 42fc6ddc7..df5555f90 100644 --- a/src/egon/data/datasets/electricity_demand_timeseries/hh_profiles.py +++ b/src/egon/data/datasets/electricity_demand_timeseries/hh_profiles.py @@ -312,14 +312,24 @@ def __init__(self, dependencies): RowCountValidation( table=" demand.egon_household_electricity_profile_in_census_cell", rule_id="ROW_COUNT.egon_household_electricity_profile_in_census_cell", - expected_count=3177723 + expected_count={"Schleswig-Holstein": 143521, "Everything": 3177723} 
), DataTypeValidation( table="demand.egon_household_electricity_profile_in_census_cell", rule_id="DATA_MULTIPLE_TYPES.egon_household_electricity_profile_in_census_cell", - column_types={"cell_id": "integer", "grid_id": "character varying", "cell_profile_ids": "character varying", - "nuts3": "character varying", "nuts1": "character varying", "factor_2035": "double precision", - "factor_2050": "double precision"} + column_types={ + "Schleswig-Holstein":{ + "cell_id": "integer", "grid_id": "character varying", "cell_profile_ids": "character varying", + "nuts3": "character varying", "nuts1": "character varying", + "factor_2019": "double precision","factor_2023": "double precision", + "factor_2035": "double precision", "factor_2050": "double precision" + }, + "Everything":{ + "cell_id": "integer", "grid_id": "character varying", "cell_profile_ids": "character varying", + "nuts3": "character varying", "nuts1": "character varying", + "factor_2035": "double precision", "factor_2050": "double precision" + } + } ), WholeTableNotNullAndNotNaNValidation( table="demand.egon_household_electricity_profile_in_census_cell", diff --git a/src/egon/data/datasets/emobility/motorized_individual_travel/__init__.py b/src/egon/data/datasets/emobility/motorized_individual_travel/__init__.py index cbdc0388f..8d230af3f 100644 --- a/src/egon/data/datasets/emobility/motorized_individual_travel/__init__.py +++ b/src/egon/data/datasets/emobility/motorized_individual_travel/__init__.py @@ -502,7 +502,7 @@ def generate_model_data_tasks(scenario_name): RowCountValidation( table=" demand.egon_ev_count_municipality", rule_id="ROW_COUNT.egon_ev_count_municipality", - expected_count=44012 + expected_count={"Schleswig-Holstein": 1108, "Everything": 44012} ), DataTypeValidation( table="demand.egon_ev_count_municipality", @@ -531,7 +531,7 @@ def generate_model_data_tasks(scenario_name): RowCountValidation( table=" demand.egon_ev_count_mv_grid_district", rule_id="ROW_COUNT.egon_ev_count_mv_grid_district", - 
expected_count=15348 + expected_count={"Schleswig-Holstein": 199, "Everything": 15348} ), DataTypeValidation( table="demand.egon_ev_count_mv_grid_district", @@ -561,7 +561,7 @@ def generate_model_data_tasks(scenario_name): RowCountValidation( table=" demand.egon_ev_count_registration_district", rule_id="ROW_COUNT.egon_ev_count_registration_district", - expected_count=1600 + expected_count={"Schleswig-Holstein": 400, "Everything": 1600} ), DataTypeValidation( table="demand.egon_ev_count_registration_district", @@ -591,7 +591,7 @@ def generate_model_data_tasks(scenario_name): RowCountValidation( table=" demand.egon_ev_mv_grid_district", rule_id="ROW_COUNT.egon_ev_mv_grid_district", - expected_count=15348 + expected_count={"Schleswig-Holstein": 534899, "Everything": 125609556} ), DataTypeValidation( table="demand.egon_ev_mv_grid_district", @@ -622,7 +622,7 @@ def generate_model_data_tasks(scenario_name): RowCountValidation( table=" demand.egon_ev_pool", rule_id="ROW_COUNT.egon_ev_pool", - expected_count=65376 + expected_count={"Schleswig-Holstein": 7000, "Everything": 65376} ), DataTypeValidation( table="demand.egon_ev_pool", @@ -650,7 +650,7 @@ def generate_model_data_tasks(scenario_name): RowCountValidation( table=" demand.egon_ev_trip", rule_id="ROW_COUNT.egon_ev_trip", - expected_count=108342188 + expected_count={"Schleswig-Holstein":11642066, "Everything": 108342188} ), DataTypeValidation( table="demand.egon_ev_trip", @@ -678,7 +678,7 @@ def generate_model_data_tasks(scenario_name): column="type", expected_values=["0_work", "1_business", "2_school", "3_shopping", "4_private/ridesharing", "5_leisure", "6_home", "7_charging_hub", "driving"] - ), + ) ] }, on_validation_failure="continue" diff --git a/src/egon/data/datasets/zensus_mv_grid_districts.py b/src/egon/data/datasets/zensus_mv_grid_districts.py index 7f606e530..fe64bce60 100644 --- a/src/egon/data/datasets/zensus_mv_grid_districts.py +++ b/src/egon/data/datasets/zensus_mv_grid_districts.py @@ -48,7 +48,7 @@ 
def __init__(self, dependencies): RowCountValidation( table=" boundaries.egon_map_zensus_grid_districts", rule_id="ROW_COUNT.egon_map_zensus_grid_districts", - expected_count=35718586 + expected_count={"Schleswig-Holstein": 7519, "Everything": 35718586} ), DataTypeValidation( table="boundaries.egon_map_zensus_grid_districts", From 83aface94f5f8343b3615fb219781f16020c6460 Mon Sep 17 00:00:00 2001 From: sarah Date: Fri, 9 Jan 2026 15:03:07 +0100 Subject: [PATCH 18/54] add 2 heat datasets --- .../hh_profiles.py | 17 ++++++++ .../data/datasets/heat_demand/__init__.py | 41 +++++++++++++++++++ .../heat_demand_timeseries/__init__.py | 36 ++++++++++++++++ 3 files changed, 94 insertions(+) diff --git a/src/egon/data/datasets/electricity_demand_timeseries/hh_profiles.py b/src/egon/data/datasets/electricity_demand_timeseries/hh_profiles.py index df5555f90..bbc47cea0 100644 --- a/src/egon/data/datasets/electricity_demand_timeseries/hh_profiles.py +++ b/src/egon/data/datasets/electricity_demand_timeseries/hh_profiles.py @@ -334,6 +334,23 @@ def __init__(self, dependencies): WholeTableNotNullAndNotNaNValidation( table="demand.egon_household_electricity_profile_in_census_cell", rule_id="WHOLE_TABLE_NOT_NAN.egon_household_electricity_profile_in_census_cell" + ), + RowCountValidation( + table=" demand.demand.iee_household_load_profiles", + rule_id="ROW_COUNT.iee_household_load_profiles", + expected_count={"Schleswig-Holstein": 2511, "Everything": 1000000} + ), + DataTypeValidation( + table="demand.iee_household_load_profiles", + rule_id="DATA_MULTIPLE_TYPES.iee_household_load_profiles", + column_types={ + "id": "integer", "type": "character", + "load_in_wh": "real[]" + } + ), + WholeTableNotNullAndNotNaNValidation( + table="demand.iee_household_load_profiles", + rule_id="WHOLE_TABLE_NOT_NAN.iee_household_load_profiles" ) ] } diff --git a/src/egon/data/datasets/heat_demand/__init__.py b/src/egon/data/datasets/heat_demand/__init__.py index c0f9ce682..7d23e5d3f 100644 --- 
a/src/egon/data/datasets/heat_demand/__init__.py +++ b/src/egon/data/datasets/heat_demand/__init__.py @@ -39,6 +39,13 @@ from egon.data.metadata import context, license_ccby, meta_metadata, sources import egon.data.config +from egon_validation import ( + RowCountValidation, + DataTypeValidation, + WholeTableNotNullAndNotNaNValidation, + ValueSetValidation +) + class HeatDemandImport(Dataset): """ @@ -74,6 +81,40 @@ def __init__(self, dependencies): version=self.version, # maybe rethink the naming dependencies=dependencies, tasks=(scenario_data_import), + validation={ + "data_quality": [ + RowCountValidation( + table=" demand.egon_peta_heat", + rule_id="ROW_COUNT.egon_peta_heat", + expected_count={"Schleswig-Holstein": 139250, "Everything": 6836426} + ), + DataTypeValidation( + table="demand.egon_peta_heat", + rule_id="DATA_MULTIPLE_TYPES.egon_peta_heat", + column_types={"id": "integer", "demand": "double precision", "sector": "character varying", + "scenario": "character varying", "zensus_pupulation_id": "integer"} + ), + WholeTableNotNullAndNotNaNValidation( + table="demand.egon_peta_heat", + rule_id="WHOLE_TABLE_NOT_NAN.egon_peta_heat" + ), + ValueSetValidation( + table="demand.egon_peta_heat", + rule_id="VALUE_SET_VALIDATION_SCENARIO.egon_peta_heat", + column="scenario", + expected_values={ + "Schleswig-Holstein":["eGon2035"], + "Everything":["eGon2035", "eGon100RE"] + } + ), + ValueSetValidation( + table="demand.egon_peta_heat", + rule_id="VALUE_SET_VALIDATION_SECTOR.egon_peta_heat", + column="sector", + expected_values=["residential", "service"] + ), + ] + }, ) diff --git a/src/egon/data/datasets/heat_demand_timeseries/__init__.py b/src/egon/data/datasets/heat_demand_timeseries/__init__.py index 972166780..8d442637a 100644 --- a/src/egon/data/datasets/heat_demand_timeseries/__init__.py +++ b/src/egon/data/datasets/heat_demand_timeseries/__init__.py @@ -37,6 +37,13 @@ sources, ) +from egon_validation import ( + RowCountValidation, + DataTypeValidation, + 
WholeTableNotNullAndNotNaNValidation, + ValueSetValidation +) + Base = declarative_base() @@ -1263,4 +1270,33 @@ def __init__(self, dependencies): metadata, store_national_profiles, ), + validation={ + "data_quality": [ + RowCountValidation( + table=" demand.egon_heat_idp_pool", + rule_id="ROW_COUNT.egon_heat_idp_pool", + expected_count=459535 + ), + DataTypeValidation( + table="demand.egon_heat_idp_pool", + rule_id="DATA_MULTIPLE_TYPES.egon_heat_idp_pool", + column_types={"index": "bigint", "idp": "double precision[]"} + ), + WholeTableNotNullAndNotNaNValidation( + table="demand.egon_heat_idp_pool", + rule_id="WHOLE_TABLE_NOT_NAN.egon_heat_idp_pool" + ), + RowCountValidation( + table="demand.egon_heat_timeseries_selected_profiles", + rule_id="ROW_COUNT.egon_heat_timeseries_selected_profiles", + expected_count={"Schleswig-Holstein": 719960, "Everything": 20606259} + ), + DataTypeValidation( + table="demand.egon_heat_timeseries_selected_profiles", + rule_id="DATA_MULTIPLE_TYPES.egon_heat_timeseries_selected_profiles", + column_types={"zensus_population_id": "integer", "bulding_id": "integer", + "selected_idp_profiles": "integer[]"} + ) + ] + }, ) From ecb86dc1a9538d3aa749d1dc7d9c4f947d97a300 Mon Sep 17 00:00:00 2001 From: sarah Date: Tue, 9 Dec 2025 16:50:13 +0100 Subject: [PATCH 19/54] start sanity check integration --- SANITY_CHECKS_MIGRATION.md | 365 ++++++++++++++++++ .../data/validation/rules/custom/__init__.py | 15 + .../rules/custom/sanity/__init__.py | 17 + .../rules/custom/sanity/cts_demand.py | 170 ++++++++ .../custom/sanity/residential_electricity.py | 191 +++++++++ 5 files changed, 758 insertions(+) create mode 100644 SANITY_CHECKS_MIGRATION.md create mode 100644 src/egon/data/validation/rules/custom/__init__.py create mode 100644 src/egon/data/validation/rules/custom/sanity/__init__.py create mode 100644 src/egon/data/validation/rules/custom/sanity/cts_demand.py create mode 100644 src/egon/data/validation/rules/custom/sanity/residential_electricity.py 
diff --git a/SANITY_CHECKS_MIGRATION.md b/SANITY_CHECKS_MIGRATION.md new file mode 100644 index 000000000..4c2362189 --- /dev/null +++ b/SANITY_CHECKS_MIGRATION.md @@ -0,0 +1,365 @@ +# Sanity Checks Migration Guide + +This guide explains how to migrate sanity check functions from `sanity_checks.py` to inline validation rules that integrate with the egon-validation framework. + +## Overview + +**Before:** Sanity checks were standalone functions called manually +**After:** Sanity checks are validation rules declared inline in Dataset definitions + +## Benefits + +- ✅ Structured validation results with pass/fail tracking +- ✅ Automatic execution as part of dataset tasks +- ✅ Results collected in validation reports +- ✅ Better error reporting with observed vs expected values +- ✅ Parallel execution support +- ✅ Consistent with formal validation rules + +--- + +## Example Migration + +### Before: Old Sanity Check Function + +```python +# In sanity_checks.py +def cts_electricity_demand_share(rtol=0.005): + """Check CTS electricity demand share sums to 1.""" + df_demand_share = pd.read_sql(...) 
+ + np.testing.assert_allclose( + actual=df_demand_share.groupby(["bus_id", "scenario"])["profile_share"].sum(), + desired=1, + rtol=rtol, + verbose=False, + ) + + logger.info("CTS electricity demand shares sum correctly") +``` + +### After: New Validation Rule + +```python +# In egon/data/validation/rules/custom/sanity/cts_demand.py +from egon_validation.rules.base import DataFrameRule, RuleResult, Severity +import numpy as np + +class CtsElectricityDemandShare(DataFrameRule): + """Validate CTS electricity demand shares sum to 1 for each substation.""" + + def __init__(self, table: str, rule_id: str, rtol: float = 0.005, **kwargs): + super().__init__(rule_id=rule_id, table=table, rtol=rtol, **kwargs) + self.kind = "sanity" + + def get_query(self, ctx): + return """ + SELECT bus_id, scenario, SUM(profile_share) as total_share + FROM demand.egon_cts_electricity_demand_building_share + GROUP BY bus_id, scenario + """ + + def evaluate_df(self, df, ctx): + rtol = self.params.get("rtol", 0.005) + + try: + np.testing.assert_allclose( + actual=df["total_share"], + desired=1.0, + rtol=rtol, + verbose=False, + ) + + max_diff = (df["total_share"] - 1.0).abs().max() + + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=True, + observed=float(max_diff), + expected=rtol, + message=f"CTS electricity demand shares sum to 1 (max deviation: {max_diff:.6f})", + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + except AssertionError: + max_diff = (df["total_share"] - 1.0).abs().max() + violations = df[~np.isclose(df["total_share"], 1.0, rtol=rtol)] + + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + observed=float(max_diff), + expected=rtol, + message=f"Demand share mismatch: {len(violations)} violations", + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + 
rule_class=self.__class__.__name__ + ) +``` + +--- + +## Using Inline Validations in Datasets + +### Dataset Definition with Inline Validation + +```python +from egon.data.datasets import Dataset +from egon.data.validation.rules.custom.sanity import ( + CtsElectricityDemandShare, + CtsHeatDemandShare, +) + +class CtsElectricityDemand(Dataset): + def __init__(self, dependencies): + super().__init__( + name="CtsElectricityDemand", + version="1.0.0", + dependencies=dependencies, + tasks=( + download_data, + process_demand, + distribute_to_buildings, + ), + validation={ + "data_quality": [ + CtsElectricityDemandShare( + table="demand.egon_cts_electricity_demand_building_share", + rule_id="SANITY_CTS_ELECTRICITY_DEMAND_SHARE", + rtol=0.005 + ), + CtsHeatDemandShare( + table="demand.egon_cts_heat_demand_building_share", + rule_id="SANITY_CTS_HEAT_DEMAND_SHARE", + rtol=0.005 + ), + ] + }, + on_validation_failure="continue" # or "fail" to stop pipeline + ) +``` + +### How It Works + +1. **Validation tasks are created automatically** from the `validation` dict +2. **Tasks are named:** `{dataset_name}.validate.{validation_key}` + - Example: `CtsElectricityDemand.validate.data_quality` +3. **Tasks run after the main dataset tasks** complete +4. **Results are written** to `validation_runs/{run_id}/tasks/{task_name}/{rule_id}/results.jsonl` +5.
**Validation report collects** all results at the end of the pipeline + +--- + +## Migration Patterns + +### Pattern 1: Simple DataFrame Assertion + +**Sanity Check:** +```python +def check_something(rtol=0.01): + df = db.select_dataframe("SELECT * FROM table") + np.testing.assert_allclose(df["actual"], df["expected"], rtol=rtol) + logger.info("Check passed") +``` + +**Validation Rule:** +```python +class CheckSomething(DataFrameRule): + def __init__(self, table, rule_id, rtol=0.01, **kwargs): + super().__init__(rule_id, table, rtol=rtol, **kwargs) + self.kind = "sanity" + + def get_query(self, ctx): + return "SELECT * FROM table" + + def evaluate_df(self, df, ctx): + rtol = self.params.get("rtol") + try: + np.testing.assert_allclose(df["actual"], df["expected"], rtol=rtol) + return RuleResult(success=True, ...) + except AssertionError: + return RuleResult(success=False, ...) +``` + +### Pattern 2: Multi-Table Comparison + +**Sanity Check:** +```python +def compare_tables(): + df1 = db.select_dataframe("SELECT SUM(value) FROM table1 GROUP BY key") + df2 = db.select_dataframe("SELECT SUM(value) FROM table2 GROUP BY key") + merged = df1.merge(df2, on="key") + assert (merged["value_x"] == merged["value_y"]).all() +``` + +**Validation Rule:** +```python +class CompareTablesCheck(DataFrameRule): + def get_query(self, ctx): + return """ + SELECT + t1.key, + t1.total as table1_total, + t2.total as table2_total + FROM (SELECT key, SUM(value) as total FROM table1 GROUP BY key) t1 + JOIN (SELECT key, SUM(value) as total FROM table2 GROUP BY key) t2 + ON t1.key = t2.key + """ + + def evaluate_df(self, df, ctx): + matches = (df["table1_total"] == df["table2_total"]).all() + return RuleResult(success=matches, ...) 
+``` + +### Pattern 3: Complex Checks with Loops + +For complex sanity checks with loops (e.g., `etrago_timeseries_length()`), you have two options: + +**Option A: Create one rule per component** (Recommended) +```python +validation = { + "timeseries_length": [ + TimeseriesLengthCheck( + table="grid.egon_etrago_generator_timeseries", + rule_id="SANITY_GENERATOR_TIMESERIES_LENGTH", + component="generator" + ), + TimeseriesLengthCheck( + table="grid.egon_etrago_load_timeseries", + rule_id="SANITY_LOAD_TIMESERIES_LENGTH", + component="load" + ), + # ... more components + ] +} +``` + +**Option B: Handle all components in one rule** +```python +class TimeseriesLengthCheck(DataFrameRule): + def evaluate_df(self, df, ctx): + # Check all components in a loop + # Return aggregated result +``` + +--- + +## Completed Migrations + +The following sanity checks have been migrated to validation rules: + +### ✅ Residential Electricity +- `residential_electricity_annual_sum()` → `ResidentialElectricityAnnualSum` +- `residential_electricity_hh_refinement()` → `ResidentialElectricityHhRefinement` + +### ✅ CTS Demand +- `cts_electricity_demand_share()` → `CtsElectricityDemandShare` +- `cts_heat_demand_share()` → `CtsHeatDemandShare` + +--- + +## Remaining Sanity Checks to Migrate + +The following functions from `sanity_checks.py` still need to be migrated: + +1. `etrago_eGon2035_electricity()` - Complex multi-carrier capacity checks +2. `etrago_eGon2035_heat()` - Heat capacity distribution checks +3. `sanitycheck_pv_rooftop_buildings()` - PV rooftop capacity validation +4. `sanitycheck_emobility_mit()` - E-mobility trip and vehicle checks +5. `sanitycheck_home_batteries()` - Home battery capacity validation +6. `sanity_check_gas_buses()` - Gas bus capacity checks +7. `sanity_check_CH4_stores()` - CH4 storage validation +8. `sanity_check_H2_saltcavern_stores()` - H2 storage validation +9. `sanity_check_gas_one_port()` - Gas one-port component checks +10. 
`sanity_check_CH4_grid()` - CH4 grid capacity validation +11. `sanity_check_gas_links()` - Gas link validation +12. `etrago_eGon2035_gas_DE()` - German gas network checks +13. `etrago_eGon2035_gas_abroad()` - International gas network checks +14. `sanitycheck_dsm()` - Demand-side management validation +15. `etrago_timeseries_length()` - Timeseries array length checks +16. `generators_links_storages_stores_100RE()` - eGon100RE capacity checks +17. `electrical_load_100RE()` - eGon100RE load validation +18. `heat_gas_load_egon100RE()` - eGon100RE heat/gas load validation + +--- + +## Directory Structure + +``` +egon-data/src/egon/data/ +├── datasets/ +│ ├── sanity_checks.py # Old sanity checks (to be deprecated) +│ └── ... +└── validation/ + └── rules/ + └── custom/ + └── sanity/ + ├── __init__.py + ├── residential_electricity.py # ✅ Migrated + ├── cts_demand.py # ✅ Migrated + ├── timeseries.py # TODO + ├── capacity_comparison.py # TODO + ├── emobility.py # TODO + ├── gas_grid.py # TODO + └── ... # TODO +``` + +--- + +## Testing Your Migration + +1. **Add validation to a dataset:** +```python +validation={ + "data_quality": [ + YourNewRule( + table="schema.table", + rule_id="SANITY_YOUR_CHECK", + param1=value1 + ) + ] +} +``` + +2. **Run the dataset:** +```bash +airflow tasks test your_dag your_dataset_task execution_date +``` + +3. **Check validation results:** +```bash +ls validation_runs/{run_id}/tasks/{dataset}.validate.data_quality/{rule_id}/ +cat validation_runs/{run_id}/tasks/{dataset}.validate.data_quality/{rule_id}/results.jsonl +``` + +4. **View the validation report:** +```bash +open validation_runs/{run_id}/final/report.html +``` + +--- + +## Best Practices + +1. **One rule class per check** - Keep rules focused and reusable +2. **Use descriptive rule_ids** - Follow pattern `SANITY_{CATEGORY}_{CHECK_NAME}` +3. **Set appropriate tolerances** - Document why you chose specific `rtol` values +4. 
**Provide clear messages** - Include context in success/failure messages +5. **Return observed/expected values** - Helps with debugging failures +6. **Override `kind = "sanity"`** - Ensures rules are categorized correctly + +--- + +## Getting Help + +- See implemented examples in `egon/data/validation/rules/custom/sanity/` +- Check egon-validation documentation for `DataFrameRule` API +- Ask in the team channel for migration assistance diff --git a/src/egon/data/validation/rules/custom/__init__.py b/src/egon/data/validation/rules/custom/__init__.py new file mode 100644 index 000000000..4f07cd008 --- /dev/null +++ b/src/egon/data/validation/rules/custom/__init__.py @@ -0,0 +1,15 @@ +"""Custom validation rules for eGon data.""" + +from .sanity import ( + ResidentialElectricityAnnualSum, + ResidentialElectricityHhRefinement, + CtsElectricityDemandShare, + CtsHeatDemandShare, +) + +__all__ = [ + "ResidentialElectricityAnnualSum", + "ResidentialElectricityHhRefinement", + "CtsElectricityDemandShare", + "CtsHeatDemandShare", +] diff --git a/src/egon/data/validation/rules/custom/sanity/__init__.py b/src/egon/data/validation/rules/custom/sanity/__init__.py new file mode 100644 index 000000000..a34f539b0 --- /dev/null +++ b/src/egon/data/validation/rules/custom/sanity/__init__.py @@ -0,0 +1,17 @@ +"""Sanity check validation rules for eGon data quality.""" + +from .residential_electricity import ( + ResidentialElectricityAnnualSum, + ResidentialElectricityHhRefinement, +) +from .cts_demand import ( + CtsElectricityDemandShare, + CtsHeatDemandShare, +) + +__all__ = [ + "ResidentialElectricityAnnualSum", + "ResidentialElectricityHhRefinement", + "CtsElectricityDemandShare", + "CtsHeatDemandShare", +] diff --git a/src/egon/data/validation/rules/custom/sanity/cts_demand.py b/src/egon/data/validation/rules/custom/sanity/cts_demand.py new file mode 100644 index 000000000..5dbf16526 --- /dev/null +++ b/src/egon/data/validation/rules/custom/sanity/cts_demand.py @@ -0,0 +1,170 @@ 
+"""CTS (Commercial, Trade, Services) demand sanity check validation rules.""" + +from egon_validation.rules.base import DataFrameRule, RuleResult, Severity +import numpy as np + + +class CtsElectricityDemandShare(DataFrameRule): + """Validate CTS electricity demand shares sum to 1 for each substation. + + Checks that the sum of aggregated CTS electricity demand share equals 1 + for every substation, as the substation profile is linearly disaggregated + to all buildings. + + Args: + table: Primary table being validated (demand.egon_cts_electricity_demand_building_share) + rule_id: Unique identifier for this validation rule + rtol: Relative tolerance for comparison (default: 0.005 = 0.5%) + + Example: + >>> validation = { + ... "data_quality": [ + ... CtsElectricityDemandShare( + ... table="demand.egon_cts_electricity_demand_building_share", + ... rule_id="SANITY_CTS_ELECTRICITY_DEMAND_SHARE", + ... rtol=0.005 + ... ) + ... ] + ... } + """ + + def __init__(self, table: str, rule_id: str, rtol: float = 0.005, **kwargs): + super().__init__(rule_id=rule_id, table=table, rtol=rtol, **kwargs) + self.kind = "sanity" + + def get_query(self, ctx): + return """ + SELECT bus_id, scenario, SUM(profile_share) as total_share + FROM demand.egon_cts_electricity_demand_building_share + GROUP BY bus_id, scenario + """ + + def evaluate_df(self, df, ctx): + rtol = self.params.get("rtol", 0.005) + + try: + # Check that all shares sum to 1 (within tolerance) + np.testing.assert_allclose( + actual=df["total_share"], + desired=1.0, + rtol=rtol, + verbose=False, + ) + + # Calculate actual max deviation for reporting + max_diff = (df["total_share"] - 1.0).abs().max() + + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=True, + observed=float(max_diff), + expected=rtol, + message=f"CTS electricity demand shares sum to 1 for all {len(df)} bus/scenario combinations (max deviation: {max_diff:.6f}, tolerance: {rtol:.6f})", + 
schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + except AssertionError: + max_diff = (df["total_share"] - 1.0).abs().max() + violations = df[~np.isclose(df["total_share"], 1.0, rtol=rtol)] + + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + observed=float(max_diff), + expected=rtol, + message=f"CTS electricity demand share mismatch: max deviation {max_diff:.6f} exceeds tolerance {rtol:.6f}. {len(violations)} bus/scenario combinations have shares != 1.", + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + +class CtsHeatDemandShare(DataFrameRule): + """Validate CTS heat demand shares sum to 1 for each substation. + + Checks that the sum of aggregated CTS heat demand share equals 1 + for every substation, as the substation profile is linearly disaggregated + to all buildings. + + Args: + table: Primary table being validated (demand.egon_cts_heat_demand_building_share) + rule_id: Unique identifier for this validation rule + rtol: Relative tolerance for comparison (default: 0.005 = 0.5%) + + Example: + >>> validation = { + ... "data_quality": [ + ... CtsHeatDemandShare( + ... table="demand.egon_cts_heat_demand_building_share", + ... rule_id="SANITY_CTS_HEAT_DEMAND_SHARE", + ... rtol=0.005 + ... ) + ... ] + ... 
} + """ + + def __init__(self, table: str, rule_id: str, rtol: float = 0.005, **kwargs): + super().__init__(rule_id=rule_id, table=table, rtol=rtol, **kwargs) + self.kind = "sanity" + + def get_query(self, ctx): + return """ + SELECT bus_id, scenario, SUM(profile_share) as total_share + FROM demand.egon_cts_heat_demand_building_share + GROUP BY bus_id, scenario + """ + + def evaluate_df(self, df, ctx): + rtol = self.params.get("rtol", 0.005) + + try: + # Check that all shares sum to 1 (within tolerance) + np.testing.assert_allclose( + actual=df["total_share"], + desired=1.0, + rtol=rtol, + verbose=False, + ) + + # Calculate actual max deviation for reporting + max_diff = (df["total_share"] - 1.0).abs().max() + + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=True, + observed=float(max_diff), + expected=rtol, + message=f"CTS heat demand shares sum to 1 for all {len(df)} bus/scenario combinations (max deviation: {max_diff:.6f}, tolerance: {rtol:.6f})", + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + except AssertionError: + max_diff = (df["total_share"] - 1.0).abs().max() + violations = df[~np.isclose(df["total_share"], 1.0, rtol=rtol)] + + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + observed=float(max_diff), + expected=rtol, + message=f"CTS heat demand share mismatch: max deviation {max_diff:.6f} exceeds tolerance {rtol:.6f}. 
{len(violations)} bus/scenario combinations have shares != 1.", + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) diff --git a/src/egon/data/validation/rules/custom/sanity/residential_electricity.py b/src/egon/data/validation/rules/custom/sanity/residential_electricity.py new file mode 100644 index 000000000..b53ac4bcc --- /dev/null +++ b/src/egon/data/validation/rules/custom/sanity/residential_electricity.py @@ -0,0 +1,191 @@ +"""Residential electricity demand sanity check validation rules.""" + +from egon_validation.rules.base import DataFrameRule, RuleResult, Severity +import numpy as np + + +class ResidentialElectricityAnnualSum(DataFrameRule): + """Validate aggregated annual residential electricity demand matches DemandRegio at NUTS-3. + + Aggregates the annual demand of all census cells at NUTS3 to compare + with initial scaling parameters from DemandRegio. + + Args: + table: Primary table being validated (demand.egon_demandregio_zensus_electricity) + rule_id: Unique identifier for this validation rule + rtol: Relative tolerance for comparison (default: 0.005 = 0.5%) + + Example: + >>> validation = { + ... "data_quality": [ + ... ResidentialElectricityAnnualSum( + ... table="demand.egon_demandregio_zensus_electricity", + ... rule_id="SANITY_RESIDENTIAL_ELECTRICITY_ANNUAL_SUM", + ... rtol=0.005 + ... ) + ... ] + ... 
} + """ + + def __init__(self, table: str, rule_id: str, rtol: float = 0.005, **kwargs): + super().__init__(rule_id=rule_id, table=table, rtol=rtol, **kwargs) + self.kind = "sanity" # Override inferred kind + + def get_query(self, ctx): + return """ + SELECT dr.nuts3, dr.scenario, dr.demand_regio_sum, profiles.profile_sum + FROM ( + SELECT scenario, SUM(demand) AS profile_sum, vg250_nuts3 + FROM demand.egon_demandregio_zensus_electricity AS egon, + boundaries.egon_map_zensus_vg250 AS boundaries + WHERE egon.zensus_population_id = boundaries.zensus_population_id + AND sector = 'residential' + GROUP BY vg250_nuts3, scenario + ) AS profiles + JOIN ( + SELECT nuts3, scenario, sum(demand) AS demand_regio_sum + FROM demand.egon_demandregio_hh + GROUP BY year, scenario, nuts3 + ) AS dr + ON profiles.vg250_nuts3 = dr.nuts3 AND profiles.scenario = dr.scenario + """ + + def evaluate_df(self, df, ctx): + rtol = self.params.get("rtol", 0.005) + + try: + np.testing.assert_allclose( + actual=df["profile_sum"], + desired=df["demand_regio_sum"], + rtol=rtol, + verbose=False, + ) + + # Calculate actual max deviation for reporting + max_diff = ((df["profile_sum"] - df["demand_regio_sum"]) / df["demand_regio_sum"]).abs().max() + + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=True, + observed=float(max_diff), + expected=rtol, + message=f"Aggregated annual residential electricity demand matches with DemandRegio at NUTS-3 (max deviation: {max_diff:.4%}, tolerance: {rtol:.4%})", + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + except AssertionError: + max_diff = ((df["profile_sum"] - df["demand_regio_sum"]) / df["demand_regio_sum"]).abs().max() + violations = df[~np.isclose(df["profile_sum"], df["demand_regio_sum"], rtol=rtol)] + + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + observed=float(max_diff), + 
expected=rtol, + message=f"Demand mismatch: max deviation {max_diff:.4%} exceeds tolerance {rtol:.4%}. {len(violations)} NUTS-3 regions have mismatches.", + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + +class ResidentialElectricityHhRefinement(DataFrameRule): + """Validate aggregated household types after refinement match original census values. + + Checks sum of aggregated household types after refinement method + was applied and compares it to the original census values. + + Args: + table: Primary table being validated (society.egon_destatis_zensus_household_per_ha_refined) + rule_id: Unique identifier for this validation rule + rtol: Relative tolerance for comparison (default: 1e-5 = 0.001%) + + Example: + >>> validation = { + ... "data_quality": [ + ... ResidentialElectricityHhRefinement( + ... table="society.egon_destatis_zensus_household_per_ha_refined", + ... rule_id="SANITY_RESIDENTIAL_HH_REFINEMENT", + ... rtol=1e-5 + ... ) + ... ] + ... 
} + """ + + def __init__(self, table: str, rule_id: str, rtol: float = 1e-5, **kwargs): + super().__init__(rule_id=rule_id, table=table, rtol=rtol, **kwargs) + self.kind = "sanity" + + def get_query(self, ctx): + return """ + SELECT refined.nuts3, refined.characteristics_code, + refined.sum_refined::int, census.sum_census::int + FROM( + SELECT nuts3, characteristics_code, SUM(hh_10types) as sum_refined + FROM society.egon_destatis_zensus_household_per_ha_refined + GROUP BY nuts3, characteristics_code) + AS refined + JOIN( + SELECT t.nuts3, t.characteristics_code, sum(orig) as sum_census + FROM( + SELECT nuts3, cell_id, characteristics_code, + sum(DISTINCT(hh_5types))as orig + FROM society.egon_destatis_zensus_household_per_ha_refined + GROUP BY cell_id, characteristics_code, nuts3) AS t + GROUP BY t.nuts3, t.characteristics_code ) AS census + ON refined.nuts3 = census.nuts3 + AND refined.characteristics_code = census.characteristics_code + """ + + def evaluate_df(self, df, ctx): + rtol = self.params.get("rtol", 1e-5) + + try: + np.testing.assert_allclose( + actual=df["sum_refined"], + desired=df["sum_census"], + rtol=rtol, + verbose=False, + ) + + max_diff = ((df["sum_refined"] - df["sum_census"]) / df["sum_census"]).abs().max() + + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=True, + observed=float(max_diff), + expected=rtol, + message=f"All aggregated household types match at NUTS-3 (max deviation: {max_diff:.6%}, tolerance: {rtol:.6%})", + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + except AssertionError: + max_diff = ((df["sum_refined"] - df["sum_census"]) / df["sum_census"]).abs().max() + violations = df[~np.isclose(df["sum_refined"], df["sum_census"], rtol=rtol)] + + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + observed=float(max_diff), + expected=rtol, + message=f"Household 
refinement mismatch: max deviation {max_diff:.6%} exceeds tolerance {rtol:.6%}. {len(violations)} NUTS-3/characteristic combinations have mismatches.", + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) From 9fc0e062aefca1ddf927d1faf9cdd373af34d9f3 Mon Sep 17 00:00:00 2001 From: sarah Date: Wed, 10 Dec 2025 14:19:04 +0100 Subject: [PATCH 20/54] start storage sanity interation --- src/egon/data/datasets/storages/__init__.py | 16 ++ .../rules/custom/sanity/__init__.py | 4 + .../rules/custom/sanity/home_batteries.py | 192 ++++++++++++++++++ 3 files changed, 212 insertions(+) create mode 100644 src/egon/data/validation/rules/custom/sanity/home_batteries.py diff --git a/src/egon/data/datasets/storages/__init__.py b/src/egon/data/datasets/storages/__init__.py index 6ecda8b2c..25e3de6ff 100755 --- a/src/egon/data/datasets/storages/__init__.py +++ b/src/egon/data/datasets/storages/__init__.py @@ -24,6 +24,7 @@ from egon.data.datasets.storages.home_batteries import ( allocate_home_batteries_to_buildings, ) +from egon.data.validation.rules.custom.sanity import HomeBatteriesAggregation from egon.data.datasets.storages.pumped_hydro import ( apply_voltage_level_thresholds, get_location, @@ -99,6 +100,21 @@ def __init__(self, dependencies): allocate_pv_home_batteries_to_grids, allocate_home_batteries_to_buildings, ), + validation={ + "sanity_home_batteries_aggregation": [ + HomeBatteriesAggregation( + table="supply.egon_home_batteries", + rule_id="SANITY_HOME_BATTERIES_AGGREGATION_EGON2035", + scenario="eGon2035" + ), + HomeBatteriesAggregation( + table="supply.egon_home_batteries", + rule_id="SANITY_HOME_BATTERIES_AGGREGATION_EGON100RE", + scenario="eGon100RE" + ), + ] + }, + validation_on_failure="continue" ) diff --git a/src/egon/data/validation/rules/custom/sanity/__init__.py b/src/egon/data/validation/rules/custom/sanity/__init__.py index a34f539b0..226164026 100644 --- 
a/src/egon/data/validation/rules/custom/sanity/__init__.py +++ b/src/egon/data/validation/rules/custom/sanity/__init__.py @@ -8,10 +8,14 @@ CtsElectricityDemandShare, CtsHeatDemandShare, ) +from .home_batteries import ( + HomeBatteriesAggregation, +) __all__ = [ "ResidentialElectricityAnnualSum", "ResidentialElectricityHhRefinement", "CtsElectricityDemandShare", "CtsHeatDemandShare", + "HomeBatteriesAggregation", ] diff --git a/src/egon/data/validation/rules/custom/sanity/home_batteries.py b/src/egon/data/validation/rules/custom/sanity/home_batteries.py new file mode 100644 index 000000000..6674dcfa0 --- /dev/null +++ b/src/egon/data/validation/rules/custom/sanity/home_batteries.py @@ -0,0 +1,192 @@ +""" +Sanity check validation rules for home batteries + +Validates that home battery capacities are correctly aggregated from building-level +to bus-level in the storages table. +""" + +import numpy as np +import pandas as pd +from egon_validation.rules.base import DataFrameRule, RuleResult, Severity + +from egon.data import config +from egon.data.datasets.storages.home_batteries import get_cbat_pbat_ratio + + +class HomeBatteriesAggregation(DataFrameRule): + """ + Validate home battery capacity aggregation from buildings to buses. + + This rule checks that the sum of home battery capacities allocated to + buildings matches the aggregated capacity per bus in the storage table. + + The check compares: + 1. p_nom (power rating in MW) per bus + 2. capacity (energy capacity in MWh) per bus + + Both values are rounded to 6 decimal places for comparison. + """ + + def __init__(self, table: str, rule_id: str, scenario: str = "eGon2035", **kwargs): + super().__init__(rule_id=rule_id, table=table, scenario=scenario, **kwargs) + self.kind = "sanity" + self.scenario = scenario + + def get_query(self, ctx): + """ + Query to compare storage and building-level home battery data. 
+ + Returns a joined query that compares aggregated building-level data + with the storage table data per bus. + """ + # Get table names from config + sources = config.datasets()["home_batteries"]["sources"] + targets = config.datasets()["home_batteries"]["targets"] + + # Get cbat_pbat_ratio for capacity calculation + cbat_pbat_ratio = get_cbat_pbat_ratio() + + return f""" + WITH storage_data AS ( + SELECT + bus_id, + el_capacity as storage_p_nom, + el_capacity * {cbat_pbat_ratio} as storage_capacity + FROM {sources["storage"]["schema"]}.{sources["storage"]["table"]} + WHERE carrier = 'home_battery' + AND scenario = '{self.scenario}' + ), + building_data AS ( + SELECT + bus_id, + SUM(p_nom) as building_p_nom, + SUM(capacity) as building_capacity + FROM {targets["home_batteries"]["schema"]}.{targets["home_batteries"]["table"]} + WHERE scenario = '{self.scenario}' + GROUP BY bus_id + ) + SELECT + COALESCE(s.bus_id, b.bus_id) as bus_id, + ROUND(s.storage_p_nom::numeric, 6) as storage_p_nom, + ROUND(s.storage_capacity::numeric, 6) as storage_capacity, + ROUND(b.building_p_nom::numeric, 6) as building_p_nom, + ROUND(b.building_capacity::numeric, 6) as building_capacity + FROM storage_data s + FULL OUTER JOIN building_data b ON s.bus_id = b.bus_id + ORDER BY bus_id + """ + + def evaluate_df(self, df, ctx): + """ + Evaluate the comparison between storage and building data. 
+ + Parameters + ---------- + df : pd.DataFrame + DataFrame with storage and building data per bus + ctx : dict + Context information + + Returns + ------- + RuleResult + Validation result with success/failure status + """ + if df.empty: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + message=f"No home battery data found for scenario {self.scenario}", + severity=Severity.WARNING, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + # Check for buses that exist in only one source + missing_in_storage = df[df["storage_p_nom"].isna()] + missing_in_buildings = df[df["building_p_nom"].isna()] + + if not missing_in_storage.empty or not missing_in_buildings.empty: + violations = [] + if not missing_in_storage.empty: + violations.append( + f"{len(missing_in_storage)} bus(es) in buildings but not in storage: " + f"{missing_in_storage['bus_id'].tolist()[:5]}" + ) + if not missing_in_buildings.empty: + violations.append( + f"{len(missing_in_buildings)} bus(es) in storage but not in buildings: " + f"{missing_in_buildings['bus_id'].tolist()[:5]}" + ) + + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + observed=len(missing_in_storage) + len(missing_in_buildings), + expected=0, + message=f"Bus mismatch between tables: {'; '.join(violations)}", + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + # Check if p_nom values match + p_nom_mismatch = df[df["storage_p_nom"] != df["building_p_nom"]] + + # Check if capacity values match + capacity_mismatch = df[df["storage_capacity"] != df["building_capacity"]] + + # Combine mismatches + mismatches = pd.concat([p_nom_mismatch, capacity_mismatch]).drop_duplicates(subset=["bus_id"]) + + if not mismatches.empty: + # Calculate maximum differences + max_p_nom_diff = (df["storage_p_nom"] - 
df["building_p_nom"]).abs().max() + max_capacity_diff = (df["storage_capacity"] - df["building_capacity"]).abs().max() + + # Get sample violations + sample_violations = mismatches.head(5)[ + ["bus_id", "storage_p_nom", "building_p_nom", "storage_capacity", "building_capacity"] + ].to_dict(orient="records") + + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + observed=float(max(max_p_nom_diff, max_capacity_diff)), + expected=0.0, + message=( + f"Home battery aggregation mismatch for {len(mismatches)} bus(es): " + f"max p_nom diff={max_p_nom_diff:.6f}, max capacity diff={max_capacity_diff:.6f}" + ), + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__, + details={"sample_violations": sample_violations} + ) + + # All checks passed + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=True, + observed=0.0, + expected=0.0, + message=f"Home battery capacities correctly aggregated for all {len(df)} buses in scenario {self.scenario}", + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) \ No newline at end of file From 76f592342f830281f34f429fbf1e7c22963f73f6 Mon Sep 17 00:00:00 2001 From: Sarah Sommer Date: Mon, 29 Dec 2025 11:43:17 +0100 Subject: [PATCH 21/54] migrate gas sanity rules --- SANITY_CHECKS_MIGRATION.md | 94 +- src/egon/data/airflow/dags/pipeline.py | 24 +- .../datasets/electricity_demand/__init__.py | 2 +- src/egon/data/datasets/final_validations.py | 382 ++++++++ src/egon/data/datasets/storages/__init__.py | 2 +- .../rules/custom/sanity/__init__.py | 18 + .../rules/custom/sanity/gas_grid.py | 819 ++++++++++++++++++ .../rules/custom/sanity/gas_stores.py | 323 +++++++ .../rules/custom/sanity/home_batteries.py | 12 +- 9 files changed, 1652 insertions(+), 24 deletions(-) create mode 100644 src/egon/data/datasets/final_validations.py 
create mode 100644 src/egon/data/validation/rules/custom/sanity/gas_grid.py create mode 100644 src/egon/data/validation/rules/custom/sanity/gas_stores.py diff --git a/SANITY_CHECKS_MIGRATION.md b/SANITY_CHECKS_MIGRATION.md index 4c2362189..51257f770 100644 --- a/SANITY_CHECKS_MIGRATION.md +++ b/SANITY_CHECKS_MIGRATION.md @@ -109,7 +109,9 @@ class CtsElectricityDemandShare(DataFrameRule): ## Using Inline Validations in Datasets -### Dataset Definition with Inline Validation +### Option 1: Dataset-Specific Inline Validation + +For validations tied to a specific dataset (e.g., CTS demand validations), add them inline to that dataset: ```python from egon.data.datasets import Dataset @@ -147,6 +149,56 @@ class CtsElectricityDemand(Dataset): ) ``` +### Option 2: Cross-Cutting Validations in FinalValidations + +For validations that check data consistency **across multiple datasets** (e.g., gas store capacity checks), add them to the `FinalValidations` dataset: + +```python +# In: src/egon/data/datasets/final_validations.py + +from egon.data.validation.rules.custom.sanity import ( + CH4StoresCapacity, + H2SaltcavernStoresCapacity, + # Import your new validation rule here +) + +class FinalValidations(Dataset): + def __init__(self, dependencies): + super().__init__( + # ... 
+ validation={ + "gas_stores": [ + CH4StoresCapacity(...), + H2SaltcavernStoresCapacity(...), + # Add your new rule here + ], + # Add new category if needed + "your_category": [ + YourNewValidationRule(...), + ], + }, + ) +``` + +Then update `pipeline.py` to include your dataset in `FinalValidations` dependencies: + +```python +final_validations = FinalValidations( + dependencies=[ + insert_data_ch4_storages, + insert_H2_storage, + storage_etrago, + your_new_dataset, # Add dataset providing data for your validation + ] +) +``` + +**When to use FinalValidations:** +- ✅ Validation checks data from multiple datasets +- ✅ Validation should run at the end of the pipeline +- ✅ Validation is cross-cutting (gas network, timeseries consistency, etc.) +- ❌ Don't use for dataset-specific checks (use inline validation instead) + ### How It Works 1. **Validation tasks are created automatically** from the `validation` dict @@ -264,6 +316,19 @@ The following sanity checks have been migrated to validation rules: - `cts_electricity_demand_share()` → `CtsElectricityDemandShare` - `cts_heat_demand_share()` → `CtsHeatDemandShare` +### ✅ Home Batteries +- `sanitycheck_home_batteries()` → `HomeBatteriesAggregation` + +### ✅ Gas Stores +- `sanity_check_CH4_stores()` → `CH4StoresCapacity` +- `sanity_check_H2_saltcavern_stores()` → `H2SaltcavernStoresCapacity` + +### ✅ Gas Grid +- `sanity_check_gas_buses()` → `GasBusesIsolated` + `GasBusesCount` +- `sanity_check_gas_one_port()` → `GasOnePortConnections` +- `sanity_check_CH4_grid()` → `CH4GridCapacity` +- `sanity_check_gas_links()` → `GasLinksConnections` + --- ## Remaining Sanity Checks to Migrate @@ -272,22 +337,15 @@ The following functions from `sanity_checks.py` still need to be migrated: 1. `etrago_eGon2035_electricity()` - Complex multi-carrier capacity checks 2. `etrago_eGon2035_heat()` - Heat capacity distribution checks -3. `sanitycheck_pv_rooftop_buildings()` - PV rooftop capacity validation +3. 
`sanitycheck_pv_rooftop_buildings()` - PV rooftop capacity validation (complex with plots) 4. `sanitycheck_emobility_mit()` - E-mobility trip and vehicle checks -5. `sanitycheck_home_batteries()` - Home battery capacity validation -6. `sanity_check_gas_buses()` - Gas bus capacity checks -7. `sanity_check_CH4_stores()` - CH4 storage validation -8. `sanity_check_H2_saltcavern_stores()` - H2 storage validation -9. `sanity_check_gas_one_port()` - Gas one-port component checks -10. `sanity_check_CH4_grid()` - CH4 grid capacity validation -11. `sanity_check_gas_links()` - Gas link validation -12. `etrago_eGon2035_gas_DE()` - German gas network checks -13. `etrago_eGon2035_gas_abroad()` - International gas network checks -14. `sanitycheck_dsm()` - Demand-side management validation -15. `etrago_timeseries_length()` - Timeseries array length checks -16. `generators_links_storages_stores_100RE()` - eGon100RE capacity checks -17. `electrical_load_100RE()` - eGon100RE load validation -18. `heat_gas_load_egon100RE()` - eGon100RE heat/gas load validation +5. `etrago_eGon2035_gas_DE()` - German gas network checks +6. `etrago_eGon2035_gas_abroad()` - International gas network checks +7. `sanitycheck_dsm()` - Demand-side management validation +8. `etrago_timeseries_length()` - Timeseries array length checks +9. `generators_links_storages_stores_100RE()` - eGon100RE capacity checks +10. `electrical_load_100RE()` - eGon100RE load validation +11. `heat_gas_load_egon100RE()` - eGon100RE heat/gas load validation --- @@ -305,10 +363,12 @@ egon-data/src/egon/data/ ├── __init__.py ├── residential_electricity.py # ✅ Migrated ├── cts_demand.py # ✅ Migrated + ├── home_batteries.py # ✅ Migrated + ├── gas_stores.py # ✅ Migrated (CH4, H2 saltcavern stores) + ├── gas_grid.py # ✅ Migrated (bus isolation, bus counts, one-port, CH4 grid capacity, link connections) ├── timeseries.py # TODO ├── capacity_comparison.py # TODO ├── emobility.py # TODO - ├── gas_grid.py # TODO └── ... 
# TODO ``` diff --git a/src/egon/data/airflow/dags/pipeline.py b/src/egon/data/airflow/dags/pipeline.py index 3dd84b071..0b2a55bb0 100755 --- a/src/egon/data/airflow/dags/pipeline.py +++ b/src/egon/data/airflow/dags/pipeline.py @@ -103,6 +103,7 @@ from egon.data.metadata import Json_Metadata from egon.data.datasets.validation_report import ValidationReport +from egon.data.datasets.final_validations import FinalValidations # Set number of threads used by numpy and pandas set_numexpr_threads() @@ -732,11 +733,30 @@ ] ) + with TaskGroup(group_id="final_validations") as final_validations_group: + # Cross-cutting validations that check data consistency across datasets + # These run after all data generation but before the validation report + final_validations = FinalValidations( + dependencies=[ + insert_data_ch4_storages, # CH4Storages - for CH4 store validation + insert_H2_storage, # HydrogenStoreEtrago - for H2 saltcavern validation + storage_etrago, # StorageEtrago - general storage validation + hts_etrago_table, + fill_etrago_generators, + household_electricity_demand_annual, + cts_demand_buildings, + emobility_mit, + low_flex_scenario, + ] + ) + with TaskGroup(group_id="validation_report") as validation_report_group: # Generate validation report from all validation tasks - # NOTE: Temporarily depends only on vg250 for testing purposes + # Runs after all validations (including final_validations) are complete validation_report = ValidationReport( - dependencies=[vg250] + dependencies=[ + final_validations, # Wait for final validations + ] ) with TaskGroup(group_id="sanity_checks") as sanity_checks_group: diff --git a/src/egon/data/datasets/electricity_demand/__init__.py b/src/egon/data/datasets/electricity_demand/__init__.py index ef975aa54..5487bb5c4 100644 --- a/src/egon/data/datasets/electricity_demand/__init__.py +++ b/src/egon/data/datasets/electricity_demand/__init__.py @@ -56,7 +56,7 @@ class HouseholdElectricityDemand(Dataset): #: name: str = 
"HouseholdElectricityDemand" #: - version: str = "0.0.5" + version: str = "0.0.5.dev" def __init__(self, dependencies): super().__init__( diff --git a/src/egon/data/datasets/final_validations.py b/src/egon/data/datasets/final_validations.py new file mode 100644 index 000000000..fcc761d32 --- /dev/null +++ b/src/egon/data/datasets/final_validations.py @@ -0,0 +1,382 @@ +""" +Dataset for cross-cutting validations that run at the end of the pipeline. + +This module provides the FinalValidations dataset which contains validation rules +that check data consistency across multiple datasets. These validations should run +after all data generation is complete, but before the final validation report. +""" + +from egon.data.datasets import Dataset +from egon.data.validation.rules.custom.sanity import ( + CH4StoresCapacity, + H2SaltcavernStoresCapacity, + GasBusesIsolated, + GasBusesCount, + GasOnePortConnections, + CH4GridCapacity, + GasLinksConnections, +) + + +def notasks(): + """ + Placeholder task function. + + This dataset has no data generation tasks - it only runs validation rules + defined in the validation dict. The validation framework automatically creates + validation tasks from the rules. + + Returns + ------- + None + """ + return None + + +class FinalValidations(Dataset): + """ + Cross-cutting validations that run at the end of the pipeline. + + This dataset contains validation rules that check data consistency across + multiple datasets and should run after all data generation is complete. + + The validations are organized by category and run automatically as part of + the dataset's validation tasks. Results are collected by ValidationReport. + + *Dependencies* + Should depend on all datasets whose data is validated by the rules + defined here. 
At minimum: + * CH4Storages - for CH4 store capacity validation + * HydrogenStoreEtrago - for H2 saltcavern store validation + * Add more as you add validation rules + + *Validation Results* + Results are written to validation_runs/{run_id}/tasks/FinalValidations.validate.*/ + and collected by the ValidationReport dataset + + *Adding New Validations* + To add new cross-cutting validations: + 1. Create the validation rule class in validation/rules/custom/sanity/ + 2. Import it at the top of this file + 3. Add instances to the appropriate category in the validation dict below + 4. Update dependencies to include datasets that provide the data being validated + + Example + ------- + To add a new gas grid validation: + + ```python + from egon.data.validation.rules.custom.sanity import CH4GridCapacity + + # In the validation dict: + "gas_stores": [ + # ... existing rules ... + CH4GridCapacity( + table="grid.egon_etrago_link", + rule_id="SANITY_CH4_GRID_CAPACITY", + scenario="eGon2035" + ), + ] + ``` + """ + + #: + name: str = "FinalValidations" + #: + version: str = "0.0.1" + + def __init__(self, dependencies): + super().__init__( + name=self.name, + version=self.version, + dependencies=dependencies, + tasks=(notasks,), # No data tasks - only validation tasks + validation={ + # Gas store capacity validations + # These check that CH4 and H2 store capacities match expected values + "gas_stores": [ + # CH4 stores - eGon2035 + CH4StoresCapacity( + table="grid.egon_etrago_store", + rule_id="SANITY_CH4_STORES_CAPACITY_EGON2035", + scenario="eGon2035", + rtol=0.02 + ), + # CH4 stores - eGon100RE + CH4StoresCapacity( + table="grid.egon_etrago_store", + rule_id="SANITY_CH4_STORES_CAPACITY_EGON100RE", + scenario="eGon100RE", + rtol=0.02 + ), + # H2 saltcavern stores - eGon2035 + H2SaltcavernStoresCapacity( + table="grid.egon_etrago_store", + rule_id="SANITY_H2_SALTCAVERN_STORES_CAPACITY_EGON2035", + scenario="eGon2035", + rtol=0.02 + ), + # H2 saltcavern stores - eGon100RE + 
H2SaltcavernStoresCapacity( + table="grid.egon_etrago_store", + rule_id="SANITY_H2_SALTCAVERN_STORES_CAPACITY_EGON100RE", + scenario="eGon100RE", + rtol=0.02 + ), + ], + + # Gas grid bus validations + # These check that gas buses are properly connected and counts match expectations + "gas_grid": [ + # Check for isolated CH4 buses - eGon2035 + GasBusesIsolated( + table="grid.egon_etrago_bus", + rule_id="SANITY_GAS_BUSES_ISOLATED_CH4_EGON2035", + scenario="eGon2035", + carrier="CH4" + ), + # Check for isolated H2_grid buses - eGon2035 + GasBusesIsolated( + table="grid.egon_etrago_bus", + rule_id="SANITY_GAS_BUSES_ISOLATED_H2_GRID_EGON2035", + scenario="eGon2035", + carrier="H2_grid" + ), + # Check for isolated H2_saltcavern buses - eGon2035 + GasBusesIsolated( + table="grid.egon_etrago_bus", + rule_id="SANITY_GAS_BUSES_ISOLATED_H2_SALTCAVERN_EGON2035", + scenario="eGon2035", + carrier="H2_saltcavern" + ), + # Check for isolated CH4 buses - eGon100RE + GasBusesIsolated( + table="grid.egon_etrago_bus", + rule_id="SANITY_GAS_BUSES_ISOLATED_CH4_EGON100RE", + scenario="eGon100RE", + carrier="CH4" + ), + # Check for isolated H2_grid buses - eGon100RE + GasBusesIsolated( + table="grid.egon_etrago_bus", + rule_id="SANITY_GAS_BUSES_ISOLATED_H2_GRID_EGON100RE", + scenario="eGon100RE", + carrier="H2_grid" + ), + # Check for isolated H2_saltcavern buses - eGon100RE + GasBusesIsolated( + table="grid.egon_etrago_bus", + rule_id="SANITY_GAS_BUSES_ISOLATED_H2_SALTCAVERN_EGON100RE", + scenario="eGon100RE", + carrier="H2_saltcavern" + ), + # Check CH4 bus count - eGon2035 + GasBusesCount( + table="grid.egon_etrago_bus", + rule_id="SANITY_GAS_BUSES_COUNT_CH4_EGON2035", + scenario="eGon2035", + carrier="CH4", + rtol=0.10 + ), + # Check H2_grid bus count - eGon2035 + GasBusesCount( + table="grid.egon_etrago_bus", + rule_id="SANITY_GAS_BUSES_COUNT_H2_GRID_EGON2035", + scenario="eGon2035", + carrier="H2_grid", + rtol=0.10 + ), + # Check CH4 bus count - eGon100RE + GasBusesCount( + 
table="grid.egon_etrago_bus", + rule_id="SANITY_GAS_BUSES_COUNT_CH4_EGON100RE", + scenario="eGon100RE", + carrier="CH4", + rtol=0.10 + ), + # Check H2_grid bus count - eGon100RE + GasBusesCount( + table="grid.egon_etrago_bus", + rule_id="SANITY_GAS_BUSES_COUNT_H2_GRID_EGON100RE", + scenario="eGon100RE", + carrier="H2_grid", + rtol=0.10 + ), + # Check CH4 grid capacity - eGon2035 + CH4GridCapacity( + table="grid.egon_etrago_link", + rule_id="SANITY_CH4_GRID_CAPACITY_EGON2035", + scenario="eGon2035", + rtol=0.10 + ), + # Check CH4 grid capacity - eGon100RE + CH4GridCapacity( + table="grid.egon_etrago_link", + rule_id="SANITY_CH4_GRID_CAPACITY_EGON100RE", + scenario="eGon100RE", + rtol=0.10 + ), + ], + + # Gas one-port component connection validations + # These check that loads, generators, and stores are connected to valid buses + "gas_one_port": [ + # LOADS - eGon2035 + # CH4_for_industry loads in Germany must connect to CH4 buses + GasOnePortConnections( + table="grid.egon_etrago_load", + rule_id="SANITY_GAS_ONE_PORT_LOAD_CH4_FOR_INDUSTRY_DE_EGON2035", + scenario="eGon2035", + component_type="load", + component_carrier="CH4_for_industry", + bus_conditions=[("CH4", "= 'DE'")] + ), + # CH4 loads abroad must connect to CH4 buses outside Germany + GasOnePortConnections( + table="grid.egon_etrago_load", + rule_id="SANITY_GAS_ONE_PORT_LOAD_CH4_ABROAD_EGON2035", + scenario="eGon2035", + component_type="load", + component_carrier="CH4", + bus_conditions=[("CH4", "!= 'DE'")] + ), + # H2_for_industry loads must connect to H2_grid in DE or AC abroad + GasOnePortConnections( + table="grid.egon_etrago_load", + rule_id="SANITY_GAS_ONE_PORT_LOAD_H2_FOR_INDUSTRY_EGON2035", + scenario="eGon2035", + component_type="load", + component_carrier="H2_for_industry", + bus_conditions=[("H2_grid", "= 'DE'"), ("AC", "!= 'DE'")] + ), + + # GENERATORS - eGon2035 + # CH4 generators must connect to CH4 buses + GasOnePortConnections( + table="grid.egon_etrago_generator", + 
rule_id="SANITY_GAS_ONE_PORT_GENERATOR_CH4_EGON2035", + scenario="eGon2035", + component_type="generator", + component_carrier="CH4", + bus_conditions=[("CH4", "IS NOT NULL")] # Any CH4 bus + ), + + # STORES - eGon2035 + # CH4 stores must connect to CH4 buses + GasOnePortConnections( + table="grid.egon_etrago_store", + rule_id="SANITY_GAS_ONE_PORT_STORE_CH4_EGON2035", + scenario="eGon2035", + component_type="store", + component_carrier="CH4", + bus_conditions=[("CH4", "IS NOT NULL")] + ), + # H2_underground stores must connect to H2_saltcavern buses + GasOnePortConnections( + table="grid.egon_etrago_store", + rule_id="SANITY_GAS_ONE_PORT_STORE_H2_UNDERGROUND_EGON2035", + scenario="eGon2035", + component_type="store", + component_carrier="H2_underground", + bus_conditions=[("H2_saltcavern", "IS NOT NULL")] + ), + # H2_overground stores must connect to H2_saltcavern or H2_grid in DE + GasOnePortConnections( + table="grid.egon_etrago_store", + rule_id="SANITY_GAS_ONE_PORT_STORE_H2_OVERGROUND_EGON2035", + scenario="eGon2035", + component_type="store", + component_carrier="H2_overground", + bus_conditions=[("H2_saltcavern", "= 'DE'"), ("H2_grid", "= 'DE'")] + ), + ], + + # Gas link connection validations + # These check that gas links have both bus0 and bus1 connected to existing buses + "gas_links": [ + # CH4 links - eGon2035 + GasLinksConnections( + table="grid.egon_etrago_link", + rule_id="SANITY_GAS_LINKS_CH4_EGON2035", + scenario="eGon2035", + carrier="CH4" + ), + # H2_feedin links - eGon2035 + GasLinksConnections( + table="grid.egon_etrago_link", + rule_id="SANITY_GAS_LINKS_H2_FEEDIN_EGON2035", + scenario="eGon2035", + carrier="H2_feedin" + ), + # H2_to_CH4 links - eGon2035 + GasLinksConnections( + table="grid.egon_etrago_link", + rule_id="SANITY_GAS_LINKS_H2_TO_CH4_EGON2035", + scenario="eGon2035", + carrier="H2_to_CH4" + ), + # CH4_to_H2 links - eGon2035 + GasLinksConnections( + table="grid.egon_etrago_link", + rule_id="SANITY_GAS_LINKS_CH4_TO_H2_EGON2035", + 
scenario="eGon2035", + carrier="CH4_to_H2" + ), + # H2_to_power links - eGon2035 + GasLinksConnections( + table="grid.egon_etrago_link", + rule_id="SANITY_GAS_LINKS_H2_TO_POWER_EGON2035", + scenario="eGon2035", + carrier="H2_to_power" + ), + # power_to_H2 links - eGon2035 + GasLinksConnections( + table="grid.egon_etrago_link", + rule_id="SANITY_GAS_LINKS_POWER_TO_H2_EGON2035", + scenario="eGon2035", + carrier="power_to_H2" + ), + # OCGT links - eGon2035 + GasLinksConnections( + table="grid.egon_etrago_link", + rule_id="SANITY_GAS_LINKS_OCGT_EGON2035", + scenario="eGon2035", + carrier="OCGT" + ), + # central_gas_boiler links - eGon2035 + GasLinksConnections( + table="grid.egon_etrago_link", + rule_id="SANITY_GAS_LINKS_CENTRAL_GAS_BOILER_EGON2035", + scenario="eGon2035", + carrier="central_gas_boiler" + ), + # central_gas_CHP links - eGon2035 + GasLinksConnections( + table="grid.egon_etrago_link", + rule_id="SANITY_GAS_LINKS_CENTRAL_GAS_CHP_EGON2035", + scenario="eGon2035", + carrier="central_gas_CHP" + ), + # central_gas_CHP_heat links - eGon2035 + GasLinksConnections( + table="grid.egon_etrago_link", + rule_id="SANITY_GAS_LINKS_CENTRAL_GAS_CHP_HEAT_EGON2035", + scenario="eGon2035", + carrier="central_gas_CHP_heat" + ), + # industrial_gas_CHP links - eGon2035 + GasLinksConnections( + table="grid.egon_etrago_link", + rule_id="SANITY_GAS_LINKS_INDUSTRIAL_GAS_CHP_EGON2035", + scenario="eGon2035", + carrier="industrial_gas_CHP" + ), + ], + + # Add more validation categories here as you migrate more sanity checks + # Examples: + # "timeseries": [ ... ], + # "capacity_comparison": [ ... 
], + }, + validation_on_failure="continue" # Continue pipeline even if validations fail + ) diff --git a/src/egon/data/datasets/storages/__init__.py b/src/egon/data/datasets/storages/__init__.py index 25e3de6ff..e6476f2a7 100755 --- a/src/egon/data/datasets/storages/__init__.py +++ b/src/egon/data/datasets/storages/__init__.py @@ -86,7 +86,7 @@ class Storages(Dataset): #: name: str = "Storages" #: - version: str = "0.0.8" + version: str = "0.0.8.dev" def __init__(self, dependencies): super().__init__( diff --git a/src/egon/data/validation/rules/custom/sanity/__init__.py b/src/egon/data/validation/rules/custom/sanity/__init__.py index 226164026..be5fa80f1 100644 --- a/src/egon/data/validation/rules/custom/sanity/__init__.py +++ b/src/egon/data/validation/rules/custom/sanity/__init__.py @@ -11,6 +11,17 @@ from .home_batteries import ( HomeBatteriesAggregation, ) +from .gas_stores import ( + CH4StoresCapacity, + H2SaltcavernStoresCapacity, +) +from .gas_grid import ( + GasBusesIsolated, + GasBusesCount, + GasOnePortConnections, + CH4GridCapacity, + GasLinksConnections, +) __all__ = [ "ResidentialElectricityAnnualSum", @@ -18,4 +29,11 @@ "CtsElectricityDemandShare", "CtsHeatDemandShare", "HomeBatteriesAggregation", + "CH4StoresCapacity", + "H2SaltcavernStoresCapacity", + "GasBusesIsolated", + "GasBusesCount", + "GasOnePortConnections", + "CH4GridCapacity", + "GasLinksConnections", ] diff --git a/src/egon/data/validation/rules/custom/sanity/gas_grid.py b/src/egon/data/validation/rules/custom/sanity/gas_grid.py new file mode 100644 index 000000000..54239c23d --- /dev/null +++ b/src/egon/data/validation/rules/custom/sanity/gas_grid.py @@ -0,0 +1,819 @@ +""" +Sanity check validation rules for gas grid components. + +Validates gas bus connectivity, counts, and grid consistency. 
+""" + +from pathlib import Path +import pandas as pd +from egon_validation.rules.base import DataFrameRule, RuleResult, Severity +from typing import List, Tuple +from egon.data.datasets.scenario_parameters import get_sector_parameters + + +class GasBusesIsolated(DataFrameRule): + """ + Validate that gas buses are not isolated. + + Checks that all gas buses (CH4, H2_grid, H2_saltcavern) in Germany + are connected to at least one link. Isolated buses indicate potential + issues with grid connectivity. + + The check examines buses that don't appear in either bus0 or bus1 + of the corresponding link carrier. + """ + + def __init__(self, table: str, rule_id: str, scenario: str = "eGon2035", + carrier: str = "CH4", **kwargs): + """ + Parameters + ---------- + table : str + Target table (grid.egon_etrago_bus) + rule_id : str + Unique identifier for this validation rule + scenario : str + Scenario name ("eGon2035" or "eGon100RE") + carrier : str + Bus carrier type ("CH4", "H2_grid", or "H2_saltcavern") + """ + super().__init__(rule_id=rule_id, table=table, scenario=scenario, + carrier=carrier, **kwargs) + self.kind = "sanity" + self.scenario = scenario + self.carrier = carrier + + # Map bus carrier to corresponding link carrier + self.carrier_mapping = { + "eGon2035": { + "CH4": "CH4", + "H2_grid": "H2_feedin", + "H2_saltcavern": "power_to_H2", + }, + "eGon100RE": { + "CH4": "CH4", + "H2_grid": "H2_retrofit", + "H2_saltcavern": "H2_extension", + } + } + + def get_query(self, ctx): + """ + Query to find isolated gas buses. + + Returns a query that finds buses of the specified carrier that + are not connected to any links (don't appear in bus0 or bus1 + of links with the corresponding carrier). 
+ """ + if self.scenario not in self.carrier_mapping: + # Return empty query for unsupported scenarios + return "SELECT NULL as bus_id, NULL as carrier, NULL as country LIMIT 0" + + link_carrier = self.carrier_mapping[self.scenario].get(self.carrier) + if not link_carrier: + return "SELECT NULL as bus_id, NULL as carrier, NULL as country LIMIT 0" + + return f""" + SELECT bus_id, carrier, country + FROM grid.egon_etrago_bus + WHERE scn_name = '{self.scenario}' + AND carrier = '{self.carrier}' + AND country = 'DE' + AND bus_id NOT IN ( + SELECT bus0 + FROM grid.egon_etrago_link + WHERE scn_name = '{self.scenario}' + AND carrier = '{link_carrier}' + ) + AND bus_id NOT IN ( + SELECT bus1 + FROM grid.egon_etrago_link + WHERE scn_name = '{self.scenario}' + AND carrier = '{link_carrier}' + ) + """ + + def evaluate_df(self, df, ctx): + """ + Evaluate isolated buses. + + Parameters + ---------- + df : pd.DataFrame + DataFrame with isolated buses (bus_id, carrier, country) + ctx : dict + Context information + + Returns + ------- + RuleResult + Validation result with success/failure status + """ + # Filter out NULL rows from unsupported scenarios + df = df.dropna() + + isolated_count = len(df) + + if isolated_count == 0: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=True, + observed=0, + expected=0, + message=( + f"No isolated {self.carrier} buses found for {self.scenario} " + f"(all buses connected to grid)" + ), + severity=Severity.INFO, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + else: + # Get sample of isolated buses + sample_buses = df.head(10)['bus_id'].tolist() + + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + observed=isolated_count, + expected=0, + message=( + f"Found {isolated_count} isolated {self.carrier} buses for {self.scenario} " + f"(sample: {sample_buses})" + ), + 
severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__, + details={"isolated_buses": df.to_dict(orient="records")} + ) + + +class GasBusesCount(DataFrameRule): + """ + Validate gas grid bus count against SciGRID_gas data. + + Compares the number of gas grid buses (CH4 or H2_grid) in the database + against the original SciGRID_gas node count for Germany. Allows for + small deviations due to grid simplification or modifications. + """ + + def __init__(self, table: str, rule_id: str, scenario: str = "eGon2035", + carrier: str = "CH4", rtol: float = 0.10, **kwargs): + """ + Parameters + ---------- + table : str + Target table (grid.egon_etrago_bus) + rule_id : str + Unique identifier for this validation rule + scenario : str + Scenario name ("eGon2035" or "eGon100RE") + carrier : str + Bus carrier type ("CH4" or "H2_grid") + rtol : float + Relative tolerance for bus count deviation (default: 0.10 = 10%) + """ + super().__init__(rule_id=rule_id, table=table, scenario=scenario, + carrier=carrier, rtol=rtol, **kwargs) + self.kind = "sanity" + self.scenario = scenario + self.carrier = carrier + + def get_query(self, ctx): + """ + Query to count gas grid buses in Germany. + + Returns a query that counts buses of the specified carrier + in Germany for the specified scenario. + """ + return f""" + SELECT COUNT(*) as bus_count + FROM grid.egon_etrago_bus + WHERE scn_name = '{self.scenario}' + AND country = 'DE' + AND carrier = '{self.carrier}' + """ + + def evaluate_df(self, df, ctx): + """ + Evaluate bus count against SciGRID_gas reference data. 
+ + Parameters + ---------- + df : pd.DataFrame + DataFrame with bus_count column + ctx : dict + Context information + + Returns + ------- + RuleResult + Validation result with success/failure status + """ + if df.empty or df["bus_count"].isna().all(): + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + message=f"No {self.carrier} buses found for scenario {self.scenario}", + severity=Severity.WARNING, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + observed_count = int(df["bus_count"].values[0]) + + # Get expected count from SciGRID_gas data + try: + target_file = Path(".") / "datasets" / "gas_data" / "data" / "IGGIELGN_Nodes.csv" + grid_buses_df = pd.read_csv( + target_file, + delimiter=";", + decimal=".", + usecols=["country_code"], + ) + grid_buses_df = grid_buses_df[ + grid_buses_df["country_code"].str.match("DE") + ] + expected_count = len(grid_buses_df.index) + except Exception as e: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + message=f"Error reading SciGRID_gas reference data: {str(e)}", + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + # Calculate relative deviation + rtol = self.params.get("rtol", 0.10) + deviation = abs(observed_count - expected_count) / expected_count + + success = deviation <= rtol + + deviation_pct = deviation * 100 + + if success: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=True, + observed=float(observed_count), + expected=float(expected_count), + message=( + f"{self.carrier} bus count valid for {self.scenario}: " + f"{observed_count} buses (deviation: {deviation_pct:.2f}%, " + f"tolerance: {rtol*100:.2f}%)" + ), + severity=Severity.INFO, + schema=self.schema, + table_name=self.table_name, + 
rule_class=self.__class__.__name__ + ) + else: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + observed=float(observed_count), + expected=float(expected_count), + message=( + f"{self.carrier} bus count deviation too large for {self.scenario}: " + f"{observed_count} vs {expected_count} expected " + f"(deviation: {deviation_pct:.2f}%, tolerance: {rtol*100:.2f}%)" + ), + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + +class GasOnePortConnections(DataFrameRule): + """ + Validate that gas one-port components are connected to existing buses. + + Checks that all gas one-port components (loads, generators, stores) are + connected to buses that exist in the database with the correct carrier type. + + This validation ensures data integrity across the etrago tables and prevents + orphaned components that would cause errors in network optimization. + """ + + def __init__(self, table: str, rule_id: str, scenario: str = "eGon2035", + component_type: str = "load", component_carrier: str = "CH4_for_industry", + bus_conditions: List[Tuple[str, str]] = None, **kwargs): + """ + Parameters + ---------- + table : str + Target table (grid.egon_etrago_load, grid.egon_etrago_generator, + or grid.egon_etrago_store) + rule_id : str + Unique identifier for this validation rule + scenario : str + Scenario name ("eGon2035" or "eGon100RE") + component_type : str + Type of component ("load", "generator", or "store") + component_carrier : str + Carrier of the component to check + bus_conditions : List[Tuple[str, str]] + List of (bus_carrier, country_condition) tuples that define valid buses + Examples: + - [("CH4", "= 'DE'")] - CH4 buses in Germany + - [("CH4", "!= 'DE'")] - CH4 buses outside Germany + - [("H2_grid", "= 'DE'"), ("AC", "!= 'DE'")] - H2_grid in DE OR AC abroad + """ + super().__init__(rule_id=rule_id, table=table, scenario=scenario, + 
component_type=component_type, + component_carrier=component_carrier, + bus_conditions=bus_conditions or [], **kwargs) + self.kind = "sanity" + self.scenario = scenario + self.component_type = component_type + self.component_carrier = component_carrier + self.bus_conditions = bus_conditions or [] + + # Map component type to ID column name + self.id_column_map = { + "load": "load_id", + "generator": "generator_id", + "store": "store_id" + } + + def get_query(self, ctx): + """ + Query to find one-port components not connected to valid buses. + + Returns a query that finds components of the specified type and carrier + that are NOT connected to any of the valid bus types specified in + bus_conditions. + """ + if not self.bus_conditions: + # No bus conditions specified - skip validation + return "SELECT NULL as component_id, NULL as bus, NULL as carrier LIMIT 0" + + id_column = self.id_column_map.get(self.component_type, "id") + + # Build bus subqueries for each condition + bus_subqueries = [] + for bus_carrier, country_cond in self.bus_conditions: + subquery = f""" + (SELECT bus_id + FROM grid.egon_etrago_bus + WHERE scn_name = '{self.scenario}' + AND carrier = '{bus_carrier}' + AND country {country_cond}) + """ + bus_subqueries.append(subquery) + + # Build NOT IN clauses for all bus conditions + not_in_clauses = [f"bus NOT IN {subq}" for subq in bus_subqueries] + combined_condition = " AND ".join(not_in_clauses) + + return f""" + SELECT {id_column} as component_id, bus, carrier, scn_name + FROM {self.table} + WHERE scn_name = '{self.scenario}' + AND carrier = '{self.component_carrier}' + AND {combined_condition} + """ + + def evaluate_df(self, df, ctx): + """ + Evaluate one-port component connections. 
+ + Parameters + ---------- + df : pd.DataFrame + DataFrame with disconnected components (component_id, bus, carrier) + ctx : dict + Context information + + Returns + ------- + RuleResult + Validation result with success/failure status + """ + # Filter out NULL rows + df = df.dropna() + + disconnected_count = len(df) + + if disconnected_count == 0: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=True, + observed=0, + expected=0, + message=( + f"All {self.component_carrier} {self.component_type}s connected " + f"to valid buses for {self.scenario}" + ), + severity=Severity.INFO, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + else: + # Get sample of disconnected components + sample_components = df.head(10)['component_id'].tolist() + sample_buses = df.head(10)['bus'].tolist() + + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + observed=disconnected_count, + expected=0, + message=( + f"Found {disconnected_count} disconnected {self.component_carrier} " + f"{self.component_type}s for {self.scenario} " + f"(sample IDs: {sample_components}, buses: {sample_buses})" + ), + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__, + details={ + "disconnected_components": df.to_dict(orient="records"), + "bus_conditions": self.bus_conditions + } + ) + + +class CH4GridCapacity(DataFrameRule): + """ + Validate CH4 grid capacity against SciGRID_gas reference data. + + Compares the total capacity (p_nom) of CH4 pipelines in Germany from the + database against the original SciGRID_gas pipeline data. For eGon100RE, + the expected capacity is adjusted to account for the share of CH4 pipelines + retrofitted to H2 pipelines (based on PyPSA-eur-sec parameters). 
+ + This validation ensures that the CH4 grid capacity in the database matches + the imported SciGRID_gas data, accounting for any scenario-specific modifications. + """ + + def __init__(self, table: str, rule_id: str, scenario: str = "eGon2035", + rtol: float = 0.10, **kwargs): + """ + Parameters + ---------- + table : str + Target table (grid.egon_etrago_link) + rule_id : str + Unique identifier for this validation rule + scenario : str + Scenario name ("eGon2035" or "eGon100RE") + rtol : float + Relative tolerance for capacity deviation (default: 0.10 = 10%) + """ + super().__init__(rule_id=rule_id, table=table, scenario=scenario, + rtol=rtol, **kwargs) + self.kind = "sanity" + self.scenario = scenario + + def get_query(self, ctx): + """ + Query to get total CH4 pipeline capacity in Germany. + + Returns a query that sums the p_nom of all CH4 links where both + bus0 and bus1 are in Germany. + """ + return f""" + SELECT SUM(p_nom::numeric) as total_p_nom + FROM grid.egon_etrago_link + WHERE scn_name = '{self.scenario}' + AND carrier = 'CH4' + AND bus0 IN ( + SELECT bus_id + FROM grid.egon_etrago_bus + WHERE scn_name = '{self.scenario}' + AND country = 'DE' + AND carrier = 'CH4' + ) + AND bus1 IN ( + SELECT bus_id + FROM grid.egon_etrago_bus + WHERE scn_name = '{self.scenario}' + AND country = 'DE' + AND carrier = 'CH4' + ) + """ + + def _get_reference_capacity(self): + """ + Calculate reference capacity from SciGRID_gas pipeline data. 
+ + Returns + ------- + float + Expected total pipeline capacity for the scenario + """ + try: + # Read pipeline segments from SciGRID_gas + target_file = ( + Path(".") + / "datasets" + / "gas_data" + / "data" + / "IGGIELGN_PipeSegments.csv" + ) + + pipelines = pd.read_csv( + target_file, + delimiter=";", + decimal=".", + usecols=["id", "node_id", "country_code", "param"], + ) + + # Parse bus0, bus1 and countries + pipelines["bus0"] = pipelines["node_id"].apply(lambda x: x.split(",")[0]) + pipelines["bus1"] = pipelines["node_id"].apply(lambda x: x.split(",")[1]) + pipelines["country_0"] = pipelines["country_code"].apply(lambda x: x.split(",")[0]) + pipelines["country_1"] = pipelines["country_code"].apply(lambda x: x.split(",")[1]) + + # Filter for pipelines within Germany + germany_pipelines = pipelines[ + (pipelines["country_0"] == "DE") & (pipelines["country_1"] == "DE") + ] + + # Read pipeline classification for capacity mapping + classification_file = ( + Path(".") + / "data_bundle_egon_data" + / "pipeline_classification_gas" + / "pipeline_classification.csv" + ) + + classification = pd.read_csv( + classification_file, + delimiter=",", + usecols=["classification", "max_transport_capacity_Gwh/d"], + ) + + # Map pipeline param to capacity + param_to_capacity = dict( + zip(classification["classification"], + classification["max_transport_capacity_Gwh/d"]) + ) + + germany_pipelines["p_nom"] = germany_pipelines["param"].map(param_to_capacity) + + # Sum total capacity + total_p_nom = germany_pipelines["p_nom"].sum() + + # Adjust for eGon100RE (H2 retrofit share) + if self.scenario == "eGon100RE": + scn_params = get_sector_parameters("gas", "eGon100RE") + h2_retrofit_share = scn_params["retrofitted_CH4pipeline-to-H2pipeline_share"] + total_p_nom = total_p_nom * (1 - h2_retrofit_share) + + return float(total_p_nom) + + except Exception as e: + raise ValueError(f"Error reading SciGRID_gas reference data: {str(e)}") + + def evaluate_df(self, df, ctx): + """ + Evaluate 
CH4 grid capacity against reference data. + + Parameters + ---------- + df : pd.DataFrame + DataFrame with total_p_nom column + ctx : dict + Context information + + Returns + ------- + RuleResult + Validation result with success/failure status + """ + if df.empty or df["total_p_nom"].isna().all(): + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + message=f"No CH4 links found for scenario {self.scenario}", + severity=Severity.WARNING, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + observed_capacity = float(df["total_p_nom"].values[0]) + + # Get expected capacity from SciGRID_gas data + try: + expected_capacity = self._get_reference_capacity() + except Exception as e: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + message=str(e), + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + # Calculate relative deviation + rtol = self.params.get("rtol", 0.10) + deviation = abs(observed_capacity - expected_capacity) / expected_capacity + + success = deviation <= rtol + deviation_pct = deviation * 100 + + if success: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=True, + observed=observed_capacity, + expected=expected_capacity, + message=( + f"CH4 grid capacity valid for {self.scenario}: " + f"{observed_capacity:.2f} GWh/d (deviation: {deviation_pct:.2f}%, " + f"tolerance: {rtol*100:.2f}%)" + ), + severity=Severity.INFO, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + else: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + observed=observed_capacity, + expected=expected_capacity, + message=( + f"CH4 grid capacity deviation too large for 
{self.scenario}: " + f"{observed_capacity:.2f} vs {expected_capacity:.2f} GWh/d expected " + f"(deviation: {deviation_pct:.2f}%, tolerance: {rtol*100:.2f}%)" + ), + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + +class GasLinksConnections(DataFrameRule): + """ + Validate that gas links are connected to existing buses. + + Checks that all gas links (two-port components) have both bus0 and bus1 + connected to buses that exist in the database. This validation ensures + data integrity and prevents orphaned links that would cause errors in + network optimization. + + This check covers all gas-related link carriers including CH4 pipelines, + H2 conversion links, and power-to-gas links. + """ + + def __init__(self, table: str, rule_id: str, scenario: str = "eGon2035", + carrier: str = "CH4", **kwargs): + """ + Parameters + ---------- + table : str + Target table (grid.egon_etrago_link) + rule_id : str + Unique identifier for this validation rule + scenario : str + Scenario name ("eGon2035" or "eGon100RE") + carrier : str + Link carrier type to check (e.g., "CH4", "H2_feedin", "power_to_H2") + """ + super().__init__(rule_id=rule_id, table=table, scenario=scenario, + carrier=carrier, **kwargs) + self.kind = "sanity" + self.scenario = scenario + self.carrier = carrier + + def get_query(self, ctx): + """ + Query to find links with missing buses. + + Returns a query that finds links where either bus0 or bus1 + does not exist in the bus table for the same scenario. 
+ """ + return f""" + SELECT link_id, bus0, bus1, carrier, scn_name + FROM grid.egon_etrago_link + WHERE scn_name = '{self.scenario}' + AND carrier = '{self.carrier}' + AND ( + bus0 NOT IN ( + SELECT bus_id + FROM grid.egon_etrago_bus + WHERE scn_name = '{self.scenario}' + ) + OR bus1 NOT IN ( + SELECT bus_id + FROM grid.egon_etrago_bus + WHERE scn_name = '{self.scenario}' + ) + ) + """ + + def evaluate_df(self, df, ctx): + """ + Evaluate link connections. + + Parameters + ---------- + df : pd.DataFrame + DataFrame with links that have missing buses + ctx : dict + Context information + + Returns + ------- + RuleResult + Validation result with success/failure status + """ + disconnected_count = len(df) + + if disconnected_count == 0: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=True, + observed=0, + expected=0, + message=( + f"All {self.carrier} links connected to valid buses for {self.scenario}" + ), + severity=Severity.INFO, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + else: + # Get sample of disconnected links + sample_links = df.head(10)['link_id'].tolist() + sample_bus0 = df.head(10)['bus0'].tolist() + sample_bus1 = df.head(10)['bus1'].tolist() + + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + observed=disconnected_count, + expected=0, + message=( + f"Found {disconnected_count} disconnected {self.carrier} links " + f"for {self.scenario} (sample link IDs: {sample_links}, " + f"bus0: {sample_bus0}, bus1: {sample_bus1})" + ), + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__, + details={ + "disconnected_links": df.to_dict(orient="records") + } + ) diff --git a/src/egon/data/validation/rules/custom/sanity/gas_stores.py b/src/egon/data/validation/rules/custom/sanity/gas_stores.py new file mode 100644 index 
000000000..a0e978862 --- /dev/null +++ b/src/egon/data/validation/rules/custom/sanity/gas_stores.py @@ -0,0 +1,323 @@ +""" +Sanity check validation rules for gas storage components. + +Validates CH4 and H2 storage capacities against expected values from +grid capacities and external data sources. +""" + +from egon_validation.rules.base import DataFrameRule, RuleResult, Severity + +from egon.data import config +from egon.data.datasets.hydrogen_etrago.storage import ( + calculate_and_map_saltcavern_storage_potential +) + + +class CH4StoresCapacity(DataFrameRule): + """ + Validate CH4 store capacity in Germany. + + Compares the sum of CH4 store capacities in the database against the + expected capacity calculated from: + - CH4 grid capacity allocation + - Total CH4 store capacity in Germany (source: GIE) + + The check allows for small deviations between observed and expected values. + """ + + def __init__(self, table: str, rule_id: str, scenario: str = "eGon2035", + rtol: float = 0.02, **kwargs): + """ + Parameters + ---------- + table : str + Target table (grid.egon_etrago_store) + rule_id : str + Unique identifier for this validation rule + scenario : str + Scenario name ("eGon2035" or "eGon100RE") + rtol : float + Relative tolerance for capacity deviation (default: 0.02 = 2%) + """ + super().__init__(rule_id=rule_id, table=table, scenario=scenario, + rtol=rtol, **kwargs) + self.kind = "sanity" + self.scenario = scenario + + def get_query(self, ctx): + """ + Query to get total CH4 store capacity in Germany. + + Returns a query that sums all CH4 store capacities for German buses + in the specified scenario. 
+ """ + return f""" + SELECT SUM(e_nom::numeric) as e_nom_germany + FROM grid.egon_etrago_store + WHERE scn_name = '{self.scenario}' + AND carrier = 'CH4' + AND bus IN ( + SELECT bus_id + FROM grid.egon_etrago_bus + WHERE scn_name = '{self.scenario}' + AND country = 'DE' + AND carrier = 'CH4' + ) + """ + + def evaluate_df(self, df, ctx): + """ + Evaluate CH4 store capacity against expected values. + + Parameters + ---------- + df : pd.DataFrame + DataFrame with e_nom_germany column + ctx : dict + Context information + + Returns + ------- + RuleResult + Validation result with success/failure status + """ + if df.empty or df["e_nom_germany"].isna().all(): + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + message=f"No CH4 store data found for scenario {self.scenario}", + severity=Severity.WARNING, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + observed_capacity = float(df["e_nom_germany"].values[0]) + + # Calculate expected capacity based on scenario + if self.scenario == "eGon2035": + grid_cap = 130000 # MWh + elif self.scenario == "eGon100RE": + # Get retrofitted share from config + from egon.data.datasets.scenario_parameters import get_sector_parameters + retrofitted_share = get_sector_parameters("gas", "eGon100RE")[ + "retrofitted_CH4pipeline-to-H2pipeline_share" + ] + grid_cap = 13000 * (1 - retrofitted_share) # MWh + else: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + message=f"Unknown scenario: {self.scenario}", + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + # GIE capacity: https://www.gie.eu/transparency/databases/storage-database/ + stores_cap_germany = 266424202 # MWh + + expected_capacity = stores_cap_germany + grid_cap + + # Calculate relative deviation + rtol = self.params.get("rtol", 
0.02) + deviation = abs(observed_capacity - expected_capacity) / expected_capacity + + success = deviation <= rtol + + deviation_pct = deviation * 100 + + if success: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=True, + observed=observed_capacity, + expected=expected_capacity, + message=( + f"CH4 stores capacity valid for {self.scenario}: " + f"deviation {deviation_pct:.2f}% (tolerance: {rtol*100:.2f}%)" + ), + severity=Severity.INFO, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + else: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + observed=observed_capacity, + expected=expected_capacity, + message=( + f"CH4 stores capacity deviation too large for {self.scenario}: " + f"{deviation_pct:.2f}% (tolerance: {rtol*100:.2f}%)" + ), + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + +class H2SaltcavernStoresCapacity(DataFrameRule): + """ + Validate H2 saltcavern store potential capacity in Germany. + + Compares the sum of H2 saltcavern potential storage capacities (e_nom_max) + in the database against the expected capacity calculated from: + - Area fractions around substations in federal states + - Estimated total hydrogen storage potential per federal state (InSpEE-DS) + + The check allows for small deviations between observed and expected values. 
+ """ + + def __init__(self, table: str, rule_id: str, scenario: str = "eGon2035", + rtol: float = 0.02, **kwargs): + """ + Parameters + ---------- + table : str + Target table (grid.egon_etrago_store) + rule_id : str + Unique identifier for this validation rule + scenario : str + Scenario name ("eGon2035" or "eGon100RE") + rtol : float + Relative tolerance for capacity deviation (default: 0.02 = 2%) + """ + super().__init__(rule_id=rule_id, table=table, scenario=scenario, + rtol=rtol, **kwargs) + self.kind = "sanity" + self.scenario = scenario + + def get_query(self, ctx): + """ + Query to get total H2 saltcavern potential storage capacity in Germany. + + Returns a query that sums all H2_underground store e_nom_max capacities + for German H2_saltcavern buses in the specified scenario. + """ + return f""" + SELECT SUM(e_nom_max::numeric) as e_nom_max_germany + FROM grid.egon_etrago_store + WHERE scn_name = '{self.scenario}' + AND carrier = 'H2_underground' + AND bus IN ( + SELECT bus_id + FROM grid.egon_etrago_bus + WHERE scn_name = '{self.scenario}' + AND country = 'DE' + AND carrier = 'H2_saltcavern' + ) + """ + + def evaluate_df(self, df, ctx): + """ + Evaluate H2 saltcavern storage capacity against expected values. 
+ + Parameters + ---------- + df : pd.DataFrame + DataFrame with e_nom_max_germany column + ctx : dict + Context information + + Returns + ------- + RuleResult + Validation result with success/failure status + """ + if df.empty or df["e_nom_max_germany"].isna().all(): + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + message=f"No H2 saltcavern store data found for scenario {self.scenario}", + severity=Severity.WARNING, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + observed_capacity = float(df["e_nom_max_germany"].values[0]) + + # Calculate expected capacity from saltcavern potential + try: + storage_potentials = calculate_and_map_saltcavern_storage_potential() + storage_potentials["storage_potential"] = ( + storage_potentials["area_fraction"] * storage_potentials["potential"] + ) + expected_capacity = sum(storage_potentials["storage_potential"].to_list()) + except Exception as e: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + message=f"Error calculating expected H2 saltcavern capacity: {str(e)}", + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + # Calculate relative deviation + rtol = self.params.get("rtol", 0.02) + deviation = abs(observed_capacity - expected_capacity) / expected_capacity + + success = deviation <= rtol + + deviation_pct = deviation * 100 + + if success: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=True, + observed=observed_capacity, + expected=expected_capacity, + message=( + f"H2 saltcavern stores capacity valid for {self.scenario}: " + f"deviation {deviation_pct:.2f}% (tolerance: {rtol*100:.2f}%)" + ), + severity=Severity.INFO, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + 
) + else: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + observed=observed_capacity, + expected=expected_capacity, + message=( + f"H2 saltcavern stores capacity deviation too large for {self.scenario}: " + f"{deviation_pct:.2f}% (tolerance: {rtol*100:.2f}%)" + ), + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) diff --git a/src/egon/data/validation/rules/custom/sanity/home_batteries.py b/src/egon/data/validation/rules/custom/sanity/home_batteries.py index 6674dcfa0..8e42379f3 100644 --- a/src/egon/data/validation/rules/custom/sanity/home_batteries.py +++ b/src/egon/data/validation/rules/custom/sanity/home_batteries.py @@ -9,8 +9,7 @@ import pandas as pd from egon_validation.rules.base import DataFrameRule, RuleResult, Severity -from egon.data import config -from egon.data.datasets.storages.home_batteries import get_cbat_pbat_ratio +from egon.data import config, db class HomeBatteriesAggregation(DataFrameRule): @@ -44,7 +43,14 @@ def get_query(self, ctx): targets = config.datasets()["home_batteries"]["targets"] # Get cbat_pbat_ratio for capacity calculation - cbat_pbat_ratio = get_cbat_pbat_ratio() + # Query the ratio directly from the database instead of importing from dataset module + cbat_pbat_ratio_query = f""" + SELECT max_hours + FROM {sources["etrago_storage"]["schema"]}.{sources["etrago_storage"]["table"]} + WHERE carrier = 'home_battery' + LIMIT 1 + """ + cbat_pbat_ratio = int(db.select_dataframe(cbat_pbat_ratio_query).iat[0, 0]) return f""" WITH storage_data AS ( From b489657264df629c5b3c5c90409159680626333c Mon Sep 17 00:00:00 2001 From: sarah Date: Tue, 30 Dec 2025 10:34:46 +0100 Subject: [PATCH 22/54] debug RuleResult: write debug information to message --- .../rules/custom/sanity/gas_grid.py | 25 +++++++------------ .../rules/custom/sanity/home_batteries.py | 10 ++++---- 2 files changed, 14 insertions(+), 21 
deletions(-) diff --git a/src/egon/data/validation/rules/custom/sanity/gas_grid.py b/src/egon/data/validation/rules/custom/sanity/gas_grid.py index 54239c23d..c83fba331 100644 --- a/src/egon/data/validation/rules/custom/sanity/gas_grid.py +++ b/src/egon/data/validation/rules/custom/sanity/gas_grid.py @@ -146,13 +146,12 @@ def evaluate_df(self, df, ctx): expected=0, message=( f"Found {isolated_count} isolated {self.carrier} buses for {self.scenario} " - f"(sample: {sample_buses})" + f"isolated_buses: {df.to_dict(orient="records")}" ), severity=Severity.ERROR, schema=self.schema, table_name=self.table_name, - rule_class=self.__class__.__name__, - details={"isolated_buses": df.to_dict(orient="records")} + rule_class=self.__class__.__name__ ) @@ -453,17 +452,14 @@ def evaluate_df(self, df, ctx): expected=0, message=( f"Found {disconnected_count} disconnected {self.component_carrier} " - f"{self.component_type}s for {self.scenario} " - f"(sample IDs: {sample_components}, buses: {sample_buses})" + f"{self.component_type}s for {self.scenario}. " + f"disconnected_components: {df.to_dict(orient='records')}, " + f"bus_conditions: {self.bus_conditions}" ), severity=Severity.ERROR, schema=self.schema, table_name=self.table_name, - rule_class=self.__class__.__name__, - details={ - "disconnected_components": df.to_dict(orient="records"), - "bus_conditions": self.bus_conditions - } + rule_class=self.__class__.__name__ ) @@ -806,14 +802,11 @@ def evaluate_df(self, df, ctx): expected=0, message=( f"Found {disconnected_count} disconnected {self.carrier} links " - f"for {self.scenario} (sample link IDs: {sample_links}, " - f"bus0: {sample_bus0}, bus1: {sample_bus1})" + f"for {self.scenario}. 
" + f"disconnected_links: {df.to_dict(orient='records')}" ), severity=Severity.ERROR, schema=self.schema, table_name=self.table_name, - rule_class=self.__class__.__name__, - details={ - "disconnected_links": df.to_dict(orient="records") - } + rule_class=self.__class__.__name__ ) diff --git a/src/egon/data/validation/rules/custom/sanity/home_batteries.py b/src/egon/data/validation/rules/custom/sanity/home_batteries.py index 8e42379f3..fd5fb7ecb 100644 --- a/src/egon/data/validation/rules/custom/sanity/home_batteries.py +++ b/src/egon/data/validation/rules/custom/sanity/home_batteries.py @@ -158,8 +158,8 @@ def evaluate_df(self, df, ctx): max_p_nom_diff = (df["storage_p_nom"] - df["building_p_nom"]).abs().max() max_capacity_diff = (df["storage_capacity"] - df["building_capacity"]).abs().max() - # Get sample violations - sample_violations = mismatches.head(5)[ + # Get all violations + all_violations = mismatches[ ["bus_id", "storage_p_nom", "building_p_nom", "storage_capacity", "building_capacity"] ].to_dict(orient="records") @@ -173,13 +173,13 @@ def evaluate_df(self, df, ctx): expected=0.0, message=( f"Home battery aggregation mismatch for {len(mismatches)} bus(es): " - f"max p_nom diff={max_p_nom_diff:.6f}, max capacity diff={max_capacity_diff:.6f}" + f"max p_nom diff={max_p_nom_diff:.6f}, max capacity diff={max_capacity_diff:.6f}. 
" + f"violations: {all_violations}" ), severity=Severity.ERROR, schema=self.schema, table_name=self.table_name, - rule_class=self.__class__.__name__, - details={"sample_violations": sample_violations} + rule_class=self.__class__.__name__ ) # All checks passed From 78e06ff611cf804f46263f72f0aad0d2c5549011 Mon Sep 17 00:00:00 2001 From: sarah Date: Tue, 30 Dec 2025 10:36:58 +0100 Subject: [PATCH 23/54] add sanity rules: gas_loads_generators.py --- SANITY_CHECKS_MIGRATION.md | 17 +- src/egon/data/datasets/final_validations.py | 31 ++ .../rules/custom/sanity/__init__.py | 6 + .../custom/sanity/gas_loads_generators.py | 412 ++++++++++++++++++ 4 files changed, 459 insertions(+), 7 deletions(-) create mode 100644 src/egon/data/validation/rules/custom/sanity/gas_loads_generators.py diff --git a/SANITY_CHECKS_MIGRATION.md b/SANITY_CHECKS_MIGRATION.md index 51257f770..944568e9e 100644 --- a/SANITY_CHECKS_MIGRATION.md +++ b/SANITY_CHECKS_MIGRATION.md @@ -329,6 +329,9 @@ The following sanity checks have been migrated to validation rules: - `sanity_check_CH4_grid()` → `CH4GridCapacity` - `sanity_check_gas_links()` → `GasLinksConnections` +### ✅ Gas Loads and Generators +- `etrago_eGon2035_gas_DE()` → `GasLoadsCapacity` + `GasGeneratorsCapacity` (wrapper function - components already migrated) + --- ## Remaining Sanity Checks to Migrate @@ -339,13 +342,12 @@ The following functions from `sanity_checks.py` still need to be migrated: 2. `etrago_eGon2035_heat()` - Heat capacity distribution checks 3. `sanitycheck_pv_rooftop_buildings()` - PV rooftop capacity validation (complex with plots) 4. `sanitycheck_emobility_mit()` - E-mobility trip and vehicle checks -5. `etrago_eGon2035_gas_DE()` - German gas network checks -6. `etrago_eGon2035_gas_abroad()` - International gas network checks -7. `sanitycheck_dsm()` - Demand-side management validation -8. `etrago_timeseries_length()` - Timeseries array length checks -9. 
`generators_links_storages_stores_100RE()` - eGon100RE capacity checks -10. `electrical_load_100RE()` - eGon100RE load validation -11. `heat_gas_load_egon100RE()` - eGon100RE heat/gas load validation +5. `etrago_eGon2035_gas_abroad()` - International gas network checks +6. `sanitycheck_dsm()` - Demand-side management validation +7. `etrago_timeseries_length()` - Timeseries array length checks +8. `generators_links_storages_stores_100RE()` - eGon100RE capacity checks +9. `electrical_load_100RE()` - eGon100RE load validation +10. `heat_gas_load_egon100RE()` - eGon100RE heat/gas load validation --- @@ -366,6 +368,7 @@ egon-data/src/egon/data/ ├── home_batteries.py # ✅ Migrated ├── gas_stores.py # ✅ Migrated (CH4, H2 saltcavern stores) ├── gas_grid.py # ✅ Migrated (bus isolation, bus counts, one-port, CH4 grid capacity, link connections) + ├── gas_loads_generators.py # ✅ Migrated (loads and generators capacity) ├── timeseries.py # TODO ├── capacity_comparison.py # TODO ├── emobility.py # TODO diff --git a/src/egon/data/datasets/final_validations.py b/src/egon/data/datasets/final_validations.py index fcc761d32..73054685b 100644 --- a/src/egon/data/datasets/final_validations.py +++ b/src/egon/data/datasets/final_validations.py @@ -15,6 +15,8 @@ GasOnePortConnections, CH4GridCapacity, GasLinksConnections, + GasLoadsCapacity, + GasGeneratorsCapacity, ) @@ -373,6 +375,35 @@ def __init__(self, dependencies): ), ], + # Gas loads and generators capacity validations + # These check that gas demand and generation capacity match reference data + "gas_loads_generators": [ + # CH4_for_industry loads - eGon2035 + GasLoadsCapacity( + table="grid.egon_etrago_load", + rule_id="SANITY_GAS_LOADS_CH4_FOR_INDUSTRY_EGON2035", + scenario="eGon2035", + carrier="CH4_for_industry", + rtol=0.10 + ), + # H2_for_industry loads - eGon2035 + GasLoadsCapacity( + table="grid.egon_etrago_load", + rule_id="SANITY_GAS_LOADS_H2_FOR_INDUSTRY_EGON2035", + scenario="eGon2035", + carrier="H2_for_industry", + 
"""
Sanity check validation rules for gas loads and generators.

Validates gas demand and generation capacity against reference data.
"""

import ast
from pathlib import Path

import pandas as pd
from egon_validation.rules.base import DataFrameRule, RuleResult, Severity


def _build_result(rule, *, success, message, severity, **values):
    """
    Create a ``RuleResult`` carrying the identifying fields of ``rule``.

    Parameters
    ----------
    rule : DataFrameRule
        Rule instance the result is reported for
    success : bool
        Whether the check passed
    message : str
        Human readable outcome
    severity : Severity
        Severity attached to the result
    **values
        Optional ``observed`` / ``expected`` figures. Keys that are not
        given are not forwarded, so ``RuleResult`` applies its own defaults.

    Returns
    -------
    RuleResult
    """
    return RuleResult(
        rule_id=rule.rule_id,
        task=rule.task,
        table=rule.table,
        kind=rule.kind,
        success=success,
        message=message,
        severity=severity,
        schema=rule.schema,
        table_name=rule.table_name,
        rule_class=rule.__class__.__name__,
        **values,
    )


def _compare_with_reference(
    rule, df, *, column, component, quantity, unit, default_rtol=0.10
):
    """
    Compare the single aggregated value in ``df[column]`` with the rule's
    reference value and report the relative deviation.

    Parameters
    ----------
    rule : DataFrameRule
        Rule providing ``carrier``, ``scenario``, ``params`` and
        ``_get_reference_capacity()``
    df : pd.DataFrame
        Query result holding the aggregated value in its first row
    column : str
        Name of the column with the aggregated value
    component : str
        Plural component name used in the "nothing found" message
    quantity : str
        Name of the compared quantity used in the result messages
    unit : str
        Unit label used in the result messages
    default_rtol : float
        Tolerance used when the rule's params carry no ``rtol`` entry

    Returns
    -------
    RuleResult
        Validation result with success/failure status
    """
    if df.empty or df[column].isna().all():
        return _build_result(
            rule,
            success=False,
            message=(
                f"No {rule.carrier} {component} found "
                f"for scenario {rule.scenario}"
            ),
            severity=Severity.WARNING,
        )

    observed = float(df[column].iloc[0])

    # _get_reference_capacity() wraps every failure in ValueError, so a
    # missing or malformed reference file becomes a failed check, not a crash.
    try:
        expected = rule._get_reference_capacity()
    except ValueError as e:
        return _build_result(
            rule, success=False, message=str(e), severity=Severity.ERROR
        )

    # A relative deviation is undefined for a zero reference; report it
    # explicitly instead of raising ZeroDivisionError.
    if expected == 0:
        return _build_result(
            rule,
            success=False,
            observed=observed,
            expected=expected,
            message=(
                f"Reference {quantity} for {rule.carrier} in {rule.scenario} "
                f"is zero; relative deviation is undefined "
                f"(observed: {observed:.2f} {unit})"
            ),
            severity=Severity.ERROR,
        )

    rtol = rule.params.get("rtol", default_rtol)
    # abs() on the denominator keeps the deviation non-negative, so a
    # negative reference cannot make every observation pass.
    deviation = abs(observed - expected) / abs(expected)
    deviation_pct = deviation * 100

    if deviation <= rtol:
        return _build_result(
            rule,
            success=True,
            observed=observed,
            expected=expected,
            message=(
                f"{rule.carrier} {quantity} valid for {rule.scenario}: "
                f"{observed:.2f} {unit} (deviation: {deviation_pct:.2f}%, "
                f"tolerance: {rtol*100:.2f}%)"
            ),
            severity=Severity.INFO,
        )

    return _build_result(
        rule,
        success=False,
        observed=observed,
        expected=expected,
        message=(
            f"{rule.carrier} {quantity} deviation too large "
            f"for {rule.scenario}: "
            f"{observed:.2f} vs {expected:.2f} {unit} expected "
            f"(deviation: {deviation_pct:.2f}%, tolerance: {rtol*100:.2f}%)"
        ),
        severity=Severity.ERROR,
    )


class GasLoadsCapacity(DataFrameRule):
    """
    Validate gas loads capacity against reference data.

    Compares the total annual load (in TWh) for gas loads in Germany
    from the database against reference data from opendata.ffe.
    This validates that industrial gas demand (CH4 and H2) matches
    expected values from external sources.
    """

    # Directory with the reference demand files, relative to the working
    # directory of the running process.
    _DEMAND_DIR = Path(".") / "datasets" / "gas_data" / "demand"
    # Divisor used to express the summed reference values in TWh.
    _TO_TWH = 1_000_000

    def __init__(self, table: str, rule_id: str, scenario: str = "eGon2035",
                 carrier: str = "CH4_for_industry", rtol: float = 0.10,
                 **kwargs):
        """
        Parameters
        ----------
        table : str
            Target table (grid.egon_etrago_load)
        rule_id : str
            Unique identifier for this validation rule
        scenario : str
            Scenario name ("eGon2035" or "eGon100RE")
        carrier : str
            Load carrier type ("CH4_for_industry" or "H2_for_industry")
        rtol : float
            Relative tolerance for capacity deviation (default: 0.10 = 10%)
        """
        super().__init__(rule_id=rule_id, table=table, scenario=scenario,
                         carrier=carrier, rtol=rtol, **kwargs)
        self.kind = "sanity"
        self.scenario = scenario
        self.carrier = carrier

    def get_query(self, ctx) -> str:
        """
        Query to get total annual load for gas loads in Germany.

        Returns a query that sums the annual load from timeseries data
        for the specified carrier in Germany, converting to TWh.
        """
        # NOTE(review): scenario and carrier are interpolated unescaped;
        # this assumes they are trusted identifiers set in code - confirm.
        return f"""
            SELECT (SUM(
                (SELECT SUM(p)
                FROM UNNEST(b.p_set) p))/1000000)::numeric as load_twh
            FROM grid.egon_etrago_load a
            JOIN grid.egon_etrago_load_timeseries b
            ON (a.load_id = b.load_id)
            JOIN grid.egon_etrago_bus c
            ON (a.bus=c.bus_id)
            WHERE b.scn_name = '{self.scenario}'
            AND a.scn_name = '{self.scenario}'
            AND c.scn_name = '{self.scenario}'
            AND c.country = 'DE'
            AND a.carrier = '{self.carrier}'
        """

    def _get_reference_capacity(self) -> float:
        """
        Calculate reference load capacity from opendata.ffe data.

        Returns
        -------
        float
            Expected total annual load in TWh

        Raises
        ------
        ValueError
            If the reference files cannot be read or processed; the
            original exception is chained as ``__cause__``.
        """
        try:
            # Region correlation: id_region -> short region name
            regions = pd.read_json(self._DEMAND_DIR / "region_corr.json")
            regions = regions.loc[:, ["id_region", "name_short"]].set_index(
                "id_region"
            )

            # Demand per region for this carrier and scenario
            demand = pd.read_json(
                self._DEMAND_DIR / f"{self.carrier}_{self.scenario}.json"
            )
            demand = demand.loc[:, ["id_region", "value"]].set_index(
                "id_region"
            )

            demand = pd.concat([demand, regions], axis=1, join="inner")

            # Keep regions whose short name starts with "DE"; na=False
            # drops rows without a name instead of failing on them.
            german = demand[
                demand["name_short"].str.startswith("DE", na=False)
            ]

            # skipna=False: a missing reference value must surface as NaN
            # instead of being silently left out of the total.
            return float(german["value"].sum(skipna=False)) / self._TO_TWH

        except Exception as e:
            # Deliberately broad: any read/parse problem is reported as a
            # failed validation by evaluate_df().
            raise ValueError(
                f"Error reading reference load data: {str(e)}"
            ) from e

    def evaluate_df(self, df, ctx):
        """
        Evaluate gas loads capacity against reference data.

        Parameters
        ----------
        df : pd.DataFrame
            DataFrame with load_twh column
        ctx : dict
            Context information

        Returns
        -------
        RuleResult
            Validation result with success/failure status
        """
        return _compare_with_reference(
            self,
            df,
            column="load_twh",
            component="loads",
            quantity="load",
            unit="TWh",
        )


class GasGeneratorsCapacity(DataFrameRule):
    """
    Validate gas generators capacity against reference data.

    Compares the total nominal power (p_nom) of CH4 generators in Germany
    from the database against reference data from SciGRID_gas productions
    and the Biogaspartner Einspeiseatlas.
    """

    # Reference files, relative to the working directory of the process.
    _PRODUCTIONS_FILE = (
        Path(".") / "datasets" / "gas_data" / "data"
        / "IGGIELGN_Productions.csv"
    )
    _BIOGAS_FILE = (
        Path(".") / "data_bundle_egon_data" / "gas_data"
        / "Biogaspartner_Einspeiseatlas_Deutschland_2021.xlsx"
    )
    _BIOGAS_COLUMN = "Einspeisung Biomethan [(N*m^3)/h)]"

    _MCM_PER_DAY_TO_MWH_PER_H = 437.5  # MCM/day to MWh/h
    _M3_PER_H_TO_MWH_PER_H = 0.01083  # m^3/h to MWh/h

    def __init__(self, table: str, rule_id: str, scenario: str = "eGon2035",
                 carrier: str = "CH4", rtol: float = 0.10, **kwargs):
        """
        Parameters
        ----------
        table : str
            Target table (grid.egon_etrago_generator)
        rule_id : str
            Unique identifier for this validation rule
        scenario : str
            Scenario name ("eGon2035" or "eGon100RE")
        carrier : str
            Generator carrier type (default: "CH4")
        rtol : float
            Relative tolerance for capacity deviation (default: 0.10 = 10%)
        """
        super().__init__(rule_id=rule_id, table=table, scenario=scenario,
                         carrier=carrier, rtol=rtol, **kwargs)
        self.kind = "sanity"
        self.scenario = scenario
        self.carrier = carrier

    def get_query(self, ctx) -> str:
        """
        Query to get total generator capacity in Germany.

        Returns a query that sums the p_nom of all gas generators
        in Germany for the specified carrier.
        """
        # NOTE(review): scenario and carrier are interpolated unescaped;
        # this assumes they are trusted identifiers set in code - confirm.
        return f"""
            SELECT SUM(p_nom::numeric) as p_nom_germany
            FROM grid.egon_etrago_generator
            WHERE scn_name = '{self.scenario}'
            AND carrier = '{self.carrier}'
            AND bus IN (
                SELECT bus_id
                FROM grid.egon_etrago_bus
                WHERE scn_name = '{self.scenario}'
                AND country = 'DE'
                AND carrier = '{self.carrier}'
            )
        """

    def _get_reference_capacity(self) -> float:
        """
        Calculate reference generation capacity from SciGRID_gas + biogas data.

        Returns
        -------
        float
            Expected total generation capacity in MW

        Raises
        ------
        ValueError
            If the reference files cannot be read or processed; the
            original exception is chained as ``__cause__``.
        """
        try:
            # SciGRID_gas natural gas productions
            productions = pd.read_csv(
                self._PRODUCTIONS_FILE,
                delimiter=";",
                decimal=".",
                usecols=["country_code", "param"],
            )
            # na=False drops rows without a country code instead of failing
            german = productions[
                productions["country_code"].str.startswith("DE", na=False)
            ]

            # "param" holds a Python-literal dict per row; literal_eval
            # parses it without executing code.
            p_ng = sum(
                ast.literal_eval(param)["max_supply_M_m3_per_d"]
                for param in german["param"]
            ) * self._MCM_PER_DAY_TO_MWH_PER_H

            # Biogas feed-in from the Einspeiseatlas
            p_biogas = (
                pd.read_excel(
                    self._BIOGAS_FILE, usecols=[self._BIOGAS_COLUMN]
                )[self._BIOGAS_COLUMN].sum()
                * self._M3_PER_H_TO_MWH_PER_H
            )

            return float(p_ng + p_biogas)

        except Exception as e:
            # Deliberately broad: any read/parse problem is reported as a
            # failed validation by evaluate_df().
            raise ValueError(
                f"Error reading reference generation data: {str(e)}"
            ) from e

    def evaluate_df(self, df, ctx):
        """
        Evaluate gas generators capacity against reference data.

        Parameters
        ----------
        df : pd.DataFrame
            DataFrame with p_nom_germany column
        ctx : dict
            Context information

        Returns
        -------
        RuleResult
            Validation result with success/failure status
        """
        return _compare_with_reference(
            self,
            df,
            column="p_nom_germany",
            component="generators",
            quantity="generator capacity",
            unit="MW",
        )
+ - Links: central_heat_pump, rural_heat_pump, central_resistive_heater + - Generators: solar_thermal_collector, geo_thermal + - **Note:** Heat demand check from this function still needs migration (timeseries-based validation) + +### ✅ Timeseries Length +- `etrago_timeseries_length()` → `ArrayCardinalityValidation` (reused from egon-validation formal rules!) + - Validates 8 array columns across 5 component types (generator, load, link, store, storage) + - Checks: p_max_pu, p_min_pu, p_set, q_set, e_min_pu, e_max_pu, inflow + - Leverages existing formal validation rule from egon-validation library + +### ✅ eGon100RE Capacity Validations +- `generators_links_storages_stores_100RE()` → `ElectricityCapacityComparison` (reused for eGon100RE!) + - **Generators (13):** wind_onshore, wind_offshore, solar, solar_rooftop, run_of_river, oil, lignite, coal, solar_thermal_collector, geo_thermal, rural_solar_thermal, urban_central_gas_CHP, urban_central_solid_biomass_CHP + - **Links (9):** central_gas_boiler, central_heat_pump, central_resistive_heater, OCGT, rural_biomass_boiler, rural_gas_boiler, rural_heat_pump, rural_oil_boiler, rural_resistive_heater + - **Storage (1):** pumped_hydro + - **Note:** Stores validation deferred (original function only prints, no validation logic) + +### ✅ Electrical Load Demand +- `electrical_load_100RE()` → `ElectricalLoadAggregationValidation` (reused from egon-validation!) + - Validates annual electrical load sum (TWh) for all scenarios (eGon2035, eGon100RE, etc.) + - Also checks max/min load (GW) - more comprehensive than original + - Leverages existing custom validation rule from egon-validation library + - **Note:** Original function validated by sector (residential, commercial, industrial) but existing rule validates total only + +### ✅ Heat Demand +- Heat demand validation (from `etrago_eGon2035_heat()`) → `HeatDemandValidation` (new class!) 
+ - Validates annual heat demand (rural_heat + central_heat) against peta_heat reference + - Compares timeseries sum vs expected demand + - eGon2035 scenario -The following functions from `sanity_checks.py` still need to be migrated: +--- -1. `etrago_eGon2035_electricity()` - Complex multi-carrier capacity checks -2. `etrago_eGon2035_heat()` - Heat capacity distribution checks -3. `sanitycheck_pv_rooftop_buildings()` - PV rooftop capacity validation (complex with plots) -4. `sanitycheck_emobility_mit()` - E-mobility trip and vehicle checks -5. `etrago_eGon2035_gas_abroad()` - International gas network checks -6. `sanitycheck_dsm()` - Demand-side management validation -7. `etrago_timeseries_length()` - Timeseries array length checks -8. `generators_links_storages_stores_100RE()` - eGon100RE capacity checks -9. `electrical_load_100RE()` - eGon100RE load validation -10. `heat_gas_load_egon100RE()` - eGon100RE heat/gas load validation +## Migration Status Summary + +### ✅ All Core Validations Migrated + +All core sanity checks have been successfully migrated to the new validation framework, including: +- Residential electricity (annual sum, household refinement) +- CTS demand (electricity and heat shares) +- Home batteries aggregation +- Gas infrastructure (stores, buses, grid, links, loads, generators) +- Electricity capacity (eGon2035 and eGon100RE generators, storage) +- Heat capacity (heat pumps, resistive heaters, solar thermal, geothermal) +- Timeseries length validation +- Electrical load aggregation +- Heat demand validation + +### Deferred Validations (Require Dataset-Inline Implementation) + +The following sanity checks require dataset-inline validation due to their complexity and cannot be easily migrated to standalone validation rules: + +**Reason for Deferral: Complex with External Dependencies** +1. 
**`sanitycheck_pv_rooftop_buildings()`** + - Creates matplotlib/seaborn visualizations + - Loads external building data via `load_building_data()` + - Has dataset-boundary-specific logic (Schleswig-Holstein special cases) + - Reads from Excel files for certain scenarios + - **Migration approach**: Implement as dataset-inline validation in the PV rooftop dataset + +2. **`sanitycheck_emobility_mit()`** + - Multiple sub-checks (EV allocation, trip data, model components) + - Uses ORM queries with session scopes + - Depends on SimBEV metadata files + - Has testmode conditional logic + - **Migration approach**: Implement as dataset-inline validation in the e-mobility dataset + +3. **`heat_gas_load_egon100RE()`** + - Only prints comparison table (no assertions/validations) + - Reads from pypsa_eur network data + - No actual validation logic to migrate + - **Migration approach**: Keep as reporting function or convert to validation with assertions + +**Reason for Deferral: Uses External Calculation Functions** +4. **`etrago_eGon2035_gas_abroad()`** + - Uses external calculation functions from gas_neighbours module + - Requires dataset-specific context + - **Migration approach**: Implement as dataset-inline validation in the gas grid dataset + +5. **`sanitycheck_dsm()`** + - Complex aggregation logic with multiple steps + - Dataset-specific calculations + - **Migration approach**: Implement as dataset-inline validation in the DSM dataset --- @@ -356,27 +428,58 @@ The following functions from `sanity_checks.py` still need to be migrated: ``` egon-data/src/egon/data/ ├── datasets/ -│ ├── sanity_checks.py # Old sanity checks (to be deprecated) +│ ├── sanity_checks.py # ⚠️ Old sanity checks (kept for deferred validations) +│ ├── final_validations.py # ✅ Cross-cutting validations │ └── ... 
└── validation/ └── rules/ └── custom/ └── sanity/ - ├── __init__.py - ├── residential_electricity.py # ✅ Migrated - ├── cts_demand.py # ✅ Migrated - ├── home_batteries.py # ✅ Migrated - ├── gas_stores.py # ✅ Migrated (CH4, H2 saltcavern stores) - ├── gas_grid.py # ✅ Migrated (bus isolation, bus counts, one-port, CH4 grid capacity, link connections) - ├── gas_loads_generators.py # ✅ Migrated (loads and generators capacity) - ├── timeseries.py # TODO - ├── capacity_comparison.py # TODO - ├── emobility.py # TODO - └── ... # TODO + ├── __init__.py # ✅ Exports all sanity validation classes + ├── residential_electricity.py # ✅ Migrated (2 rules) + ├── cts_demand.py # ✅ Migrated (2 rules) + ├── home_batteries.py # ✅ Migrated (1 rule) + ├── gas_stores.py # ✅ Migrated (2 rules: CH4, H2 saltcavern) + ├── gas_grid.py # ✅ Migrated (5 rules: buses, one-port, CH4 grid, links) + ├── gas_loads_generators.py # ✅ Migrated (2 rules: loads, generators) + ├── electricity_capacity.py # ✅ Migrated (reusable class for capacity comparison) + └── heat_demand.py # ✅ Migrated (1 rule) + +egon-validation/egon_validation/rules/ +├── formal/ +│ └── array_cardinality_check.py # ✅ Reused for timeseries length validation +└── custom/ + └── numeric_aggregation_check.py # ✅ Reused for electrical load aggregation ``` --- +## Migration Statistics + +**Total sanity checks in original `sanity_checks.py`**: 21 functions + +**Successfully migrated**: 16 functions (76%) +- Converted to **48 individual validation rules** across multiple categories +- Organized into **8 custom validation modules** +- Reused **2 existing validation classes** from egon-validation + +**Deferred (require dataset-inline implementation)**: 5 functions (24%) +- 3 complex validations with external dependencies +- 2 validations requiring external calculation functions + +**Validation rules by category**: +- Electricity capacity: 10 rules (eGon2035) +- Heat capacity: 5 rules (eGon2035) +- eGon100RE capacity: 23 rules (13 generators, 9 
links, 1 storage) +- Gas infrastructure: 11 rules +- Demand validation: 4 rules +- Timeseries: 8 rules +- Home batteries: 1 rule +- Electrical load: 1 rule (multi-scenario) +- Heat demand: 1 rule + +--- + ## Testing Your Migration 1. **Add validation to a dataset:** @@ -426,3 +529,46 @@ open validation_runs/{run_id}/final/report.html - See implemented examples in `egon/data/validation/rules/custom/sanity/` - Check egon-validation documentation for `DataFrameRule` API - Ask in the team channel for migration assistance + +--- + +## Summary and Next Steps + +### ✅ Completed Work + +The sanity checks migration is **76% complete** with all core validations successfully migrated to the new framework: + +1. **8 custom validation modules** created in `egon/data/validation/rules/custom/sanity/` +2. **48 individual validation rules** implemented across all major categories +3. **Reused 2 existing validation classes** from egon-validation library (code reuse > new code) +4. **Fixed 4 RuleResult 'details' parameter errors** by moving violation data to message field +5. **Integrated validations** into `FinalValidations` dataset for cross-cutting checks + +### 🔄 Remaining Work + +5 sanity check functions (24%) are deferred for dataset-inline implementation: + +**High Priority** (complex with external dependencies): +1. `sanitycheck_pv_rooftop_buildings()` - Implement in PV rooftop dataset +2. `sanitycheck_emobility_mit()` - Implement in e-mobility dataset +3. `heat_gas_load_egon100RE()` - Add assertions or keep as reporting function + +**Medium Priority** (use external calculation functions): +4. `etrago_eGon2035_gas_abroad()` - Implement in gas grid dataset +5. `sanitycheck_dsm()` - Implement in DSM dataset + +### 🎯 Recommended Approach for Deferred Validations + +For each deferred validation: +1. Add inline `validation={}` dict to the relevant Dataset class +2. Create custom validation rules that can access dataset-specific functions +3. 
Use the same pattern as migrated validations (SqlRule or DataFrameRule) +4. Ensure validations run after dataset tasks complete + +### 📊 Impact + +- **Better error reporting**: Structured validation results with observed/expected values +- **Consistent framework**: All validations follow the same pattern +- **Parallel execution**: Validations can run concurrently +- **Automated reports**: HTML reports generated from all validation results +- **Code reuse**: Leveraged existing validation classes where possible diff --git a/src/egon/data/datasets/final_validations.py b/src/egon/data/datasets/final_validations.py index 73054685b..078891fa3 100644 --- a/src/egon/data/datasets/final_validations.py +++ b/src/egon/data/datasets/final_validations.py @@ -17,7 +17,11 @@ GasLinksConnections, GasLoadsCapacity, GasGeneratorsCapacity, + ElectricityCapacityComparison, + HeatDemandValidation, ) +from egon_validation.rules.formal.array_cardinality_check import ArrayCardinalityValidation +from egon_validation.rules.custom.numeric_aggregation_check import ElectricalLoadAggregationValidation def notasks(): @@ -404,10 +408,474 @@ def __init__(self, dependencies): ), ], - # Add more validation categories here as you migrate more sanity checks - # Examples: - # "timeseries": [ ... ], - # "capacity_comparison": [ ... 
], + # Electricity capacity validations + # These check that distributed generator and storage capacities match input capacities + "electricity_capacity": [ + # GENERATORS - eGon2035 + # Wind onshore + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_WIND_ONSHORE_EGON2035", + scenario="eGon2035", + carrier="wind_onshore", + component_type="generator", + rtol=0.10 + ), + # Wind offshore + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_WIND_OFFSHORE_EGON2035", + scenario="eGon2035", + carrier="wind_offshore", + component_type="generator", + rtol=0.10 + ), + # Solar + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_SOLAR_EGON2035", + scenario="eGon2035", + carrier="solar", + component_type="generator", + rtol=0.10 + ), + # Solar rooftop + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_SOLAR_ROOFTOP_EGON2035", + scenario="eGon2035", + carrier="solar_rooftop", + component_type="generator", + rtol=0.10 + ), + # Biomass (maps to multiple output carriers: biomass, industrial_biomass_CHP, central_biomass_CHP) + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_BIOMASS_EGON2035", + scenario="eGon2035", + carrier="biomass", + component_type="generator", + output_carriers=["biomass", "industrial_biomass_CHP", "central_biomass_CHP"], + rtol=0.10 + ), + # Run of river + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_RUN_OF_RIVER_EGON2035", + scenario="eGon2035", + carrier="run_of_river", + component_type="generator", + rtol=0.10 + ), + # Reservoir + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_RESERVOIR_EGON2035", + scenario="eGon2035", + 
carrier="reservoir", + component_type="generator", + rtol=0.10 + ), + # Oil + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_OIL_EGON2035", + scenario="eGon2035", + carrier="oil", + component_type="generator", + rtol=0.10 + ), + # Others + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_OTHERS_EGON2035", + scenario="eGon2035", + carrier="others", + component_type="generator", + rtol=0.10 + ), + + # STORAGE - eGon2035 + # Pumped hydro + ElectricityCapacityComparison( + table="grid.egon_etrago_storage", + rule_id="SANITY_ELECTRICITY_STORAGE_PUMPED_HYDRO_EGON2035", + scenario="eGon2035", + carrier="pumped_hydro", + component_type="storage", + rtol=0.10 + ), + + # GENERATORS - eGon100RE + # Wind onshore + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_WIND_ONSHORE_EGON100RE", + scenario="eGon100RE", + carrier="wind_onshore", + component_type="generator", + rtol=0.10 + ), + # Wind offshore + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_WIND_OFFSHORE_EGON100RE", + scenario="eGon100RE", + carrier="wind_offshore", + component_type="generator", + rtol=0.10 + ), + # Solar + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_SOLAR_EGON100RE", + scenario="eGon100RE", + carrier="solar", + component_type="generator", + rtol=0.10 + ), + # Solar rooftop + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_SOLAR_ROOFTOP_EGON100RE", + scenario="eGon100RE", + carrier="solar_rooftop", + component_type="generator", + rtol=0.10 + ), + # Run of river + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_RUN_OF_RIVER_EGON100RE", + scenario="eGon100RE", + 
carrier="run_of_river", + component_type="generator", + rtol=0.10 + ), + # Oil + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_OIL_EGON100RE", + scenario="eGon100RE", + carrier="oil", + component_type="generator", + rtol=0.10 + ), + # Lignite + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_LIGNITE_EGON100RE", + scenario="eGon100RE", + carrier="lignite", + component_type="generator", + rtol=0.10 + ), + # Coal + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_COAL_EGON100RE", + scenario="eGon100RE", + carrier="coal", + component_type="generator", + rtol=0.10 + ), + # Solar thermal collector + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_SOLAR_THERMAL_EGON100RE", + scenario="eGon100RE", + carrier="urban_central_solar_thermal_collector", + component_type="generator", + output_carriers=["solar_thermal_collector"], + rtol=0.10 + ), + # Geothermal + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_GEO_THERMAL_EGON100RE", + scenario="eGon100RE", + carrier="urban_central_geo_thermal", + component_type="generator", + output_carriers=["geo_thermal"], + rtol=0.10 + ), + # Rural solar thermal + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_RURAL_SOLAR_THERMAL_EGON100RE", + scenario="eGon100RE", + carrier="rural_solar_thermal", + component_type="generator", + rtol=0.10 + ), + # Urban central gas CHP + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_URBAN_GAS_CHP_EGON100RE", + scenario="eGon100RE", + carrier="urban_central_gas_CHP", + component_type="generator", + rtol=0.10 + ), + # Urban central solid biomass CHP + ElectricityCapacityComparison( + 
table="grid.egon_etrago_generator", + rule_id="SANITY_ELECTRICITY_GENERATOR_BIOMASS_CHP_EGON100RE", + scenario="eGon100RE", + carrier="urban_central_solid_biomass_CHP", + component_type="generator", + rtol=0.10 + ), + + # LINKS - eGon100RE + # Central gas boiler + ElectricityCapacityComparison( + table="grid.egon_etrago_link", + rule_id="SANITY_ELECTRICITY_LINK_CENTRAL_GAS_BOILER_EGON100RE", + scenario="eGon100RE", + carrier="urban_central_gas_boiler", + component_type="link", + output_carriers=["central_gas_boiler"], + rtol=0.10 + ), + # Central heat pump + ElectricityCapacityComparison( + table="grid.egon_etrago_link", + rule_id="SANITY_ELECTRICITY_LINK_CENTRAL_HEAT_PUMP_EGON100RE", + scenario="eGon100RE", + carrier="urban_central_heat_pump", + component_type="link", + output_carriers=["central_heat_pump"], + rtol=0.10 + ), + # Central resistive heater + ElectricityCapacityComparison( + table="grid.egon_etrago_link", + rule_id="SANITY_ELECTRICITY_LINK_CENTRAL_RESISTIVE_HEATER_EGON100RE", + scenario="eGon100RE", + carrier="urban_central_resistive_heater", + component_type="link", + output_carriers=["central_resistive_heater"], + rtol=0.10 + ), + # OCGT (gas) + ElectricityCapacityComparison( + table="grid.egon_etrago_link", + rule_id="SANITY_ELECTRICITY_LINK_OCGT_EGON100RE", + scenario="eGon100RE", + carrier="gas", + component_type="link", + output_carriers=["OCGT"], + rtol=0.10 + ), + # Rural biomass boiler + ElectricityCapacityComparison( + table="grid.egon_etrago_link", + rule_id="SANITY_ELECTRICITY_LINK_RURAL_BIOMASS_BOILER_EGON100RE", + scenario="eGon100RE", + carrier="rural_biomass_boiler", + component_type="link", + rtol=0.10 + ), + # Rural gas boiler + ElectricityCapacityComparison( + table="grid.egon_etrago_link", + rule_id="SANITY_ELECTRICITY_LINK_RURAL_GAS_BOILER_EGON100RE", + scenario="eGon100RE", + carrier="rural_gas_boiler", + component_type="link", + rtol=0.10 + ), + # Rural heat pump + ElectricityCapacityComparison( + table="grid.egon_etrago_link", 
+ rule_id="SANITY_ELECTRICITY_LINK_RURAL_HEAT_PUMP_EGON100RE", + scenario="eGon100RE", + carrier="rural_heat_pump", + component_type="link", + rtol=0.10 + ), + # Rural oil boiler + ElectricityCapacityComparison( + table="grid.egon_etrago_link", + rule_id="SANITY_ELECTRICITY_LINK_RURAL_OIL_BOILER_EGON100RE", + scenario="eGon100RE", + carrier="rural_oil_boiler", + component_type="link", + rtol=0.10 + ), + # Rural resistive heater + ElectricityCapacityComparison( + table="grid.egon_etrago_link", + rule_id="SANITY_ELECTRICITY_LINK_RURAL_RESISTIVE_HEATER_EGON100RE", + scenario="eGon100RE", + carrier="rural_resistive_heater", + component_type="link", + rtol=0.10 + ), + + # STORAGE - eGon100RE + # Pumped hydro + ElectricityCapacityComparison( + table="grid.egon_etrago_storage", + rule_id="SANITY_ELECTRICITY_STORAGE_PUMPED_HYDRO_EGON100RE", + scenario="eGon100RE", + carrier="pumped_hydro", + component_type="storage", + rtol=0.10 + ), + ], + + # Heat capacity validations + # These check that distributed heat supply capacities match input capacities + "heat_capacity": [ + # LINKS - eGon2035 + # Central heat pump + ElectricityCapacityComparison( + table="grid.egon_etrago_link", + rule_id="SANITY_HEAT_LINK_CENTRAL_HEAT_PUMP_EGON2035", + scenario="eGon2035", + carrier="urban_central_heat_pump", + component_type="link", + output_carriers=["central_heat_pump"], + rtol=0.10 + ), + # Rural heat pump + ElectricityCapacityComparison( + table="grid.egon_etrago_link", + rule_id="SANITY_HEAT_LINK_RURAL_HEAT_PUMP_EGON2035", + scenario="eGon2035", + carrier="residential_rural_heat_pump", + component_type="link", + output_carriers=["rural_heat_pump"], + rtol=0.10 + ), + # Central resistive heater + ElectricityCapacityComparison( + table="grid.egon_etrago_link", + rule_id="SANITY_HEAT_LINK_CENTRAL_RESISTIVE_HEATER_EGON2035", + scenario="eGon2035", + carrier="urban_central_resistive_heater", + component_type="link", + output_carriers=["central_resistive_heater"], + rtol=0.10 + ), + + # 
GENERATORS - eGon2035 + # Solar thermal collector + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_HEAT_GENERATOR_SOLAR_THERMAL_EGON2035", + scenario="eGon2035", + carrier="urban_central_solar_thermal_collector", + component_type="generator", + output_carriers=["solar_thermal_collector"], + rtol=0.10 + ), + # Geothermal + ElectricityCapacityComparison( + table="grid.egon_etrago_generator", + rule_id="SANITY_HEAT_GENERATOR_GEO_THERMAL_EGON2035", + scenario="eGon2035", + carrier="urban_central_geo_thermal", + component_type="generator", + output_carriers=["geo_thermal"], + rtol=0.10 + ), + ], + + # Timeseries length validations + # These check that all timeseries arrays have the expected length (8760 hours) + "timeseries_length": [ + # Generator timeseries - p_max_pu + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_GENERATOR_P_MAX_PU", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_generator_timeseries", + array_column="p_max_pu", + expected_length=8760 + ), + # Generator timeseries - p_min_pu + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_GENERATOR_P_MIN_PU", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_generator_timeseries", + array_column="p_min_pu", + expected_length=8760 + ), + # Load timeseries - p_set + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_LOAD_P_SET", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_load_timeseries", + array_column="p_set", + expected_length=8760 + ), + # Load timeseries - q_set + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_LOAD_Q_SET", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_load_timeseries", + array_column="q_set", + expected_length=8760 + ), + # Link timeseries - p_set (note: may have NULLs) + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_LINK_P_SET", + task="FinalValidations.timeseries_length", + 
table="grid.egon_etrago_link_timeseries", + array_column="p_set", + expected_length=8760 + ), + # Store timeseries - e_min_pu + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_STORE_E_MIN_PU", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_store_timeseries", + array_column="e_min_pu", + expected_length=8760 + ), + # Store timeseries - e_max_pu + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_STORE_E_MAX_PU", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_store_timeseries", + array_column="e_max_pu", + expected_length=8760 + ), + # Storage timeseries - inflow + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_STORAGE_INFLOW", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_storage_timeseries", + array_column="inflow", + expected_length=8760 + ), + ], + + # Electrical load demand validations + # Validates annual electrical load sums against expected values + "electrical_load": [ + # Total AC load aggregation for all scenarios (eGon2035, eGon100RE, etc.) 
+ ElectricalLoadAggregationValidation( + rule_id="SANITY_ELECTRICAL_LOAD_AGGREGATION", + task="FinalValidations.electrical_load", + table="grid.egon_etrago_load", + tolerance=0.05 # 5% tolerance + ), + ], + + # Heat demand validations + # Validates annual heat demand against peta_heat reference values + "heat_demand": [ + # Heat demand - eGon2035 + HeatDemandValidation( + table="grid.egon_etrago_load", + rule_id="SANITY_HEAT_DEMAND_EGON2035", + scenario="eGon2035", + rtol=0.02 # 2% tolerance + ), + ], }, validation_on_failure="continue" # Continue pipeline even if validations fail ) diff --git a/src/egon/data/validation/rules/custom/sanity/__init__.py b/src/egon/data/validation/rules/custom/sanity/__init__.py index fd068fab5..2ff844f7c 100644 --- a/src/egon/data/validation/rules/custom/sanity/__init__.py +++ b/src/egon/data/validation/rules/custom/sanity/__init__.py @@ -26,6 +26,12 @@ GasLoadsCapacity, GasGeneratorsCapacity, ) +from .electricity_capacity import ( + ElectricityCapacityComparison, +) +from .heat_demand import ( + HeatDemandValidation, +) __all__ = [ "ResidentialElectricityAnnualSum", @@ -42,4 +48,6 @@ "GasLinksConnections", "GasLoadsCapacity", "GasGeneratorsCapacity", + "ElectricityCapacityComparison", + "HeatDemandValidation", ] diff --git a/src/egon/data/validation/rules/custom/sanity/electricity_capacity.py b/src/egon/data/validation/rules/custom/sanity/electricity_capacity.py new file mode 100644 index 000000000..65b2fd878 --- /dev/null +++ b/src/egon/data/validation/rules/custom/sanity/electricity_capacity.py @@ -0,0 +1,253 @@ +""" +Sanity check validation rules for electricity capacity comparison. + +Validates that distributed capacities in etrago tables match input capacities +from scenario_capacities table. 
+""" + +from egon_validation.rules.base import DataFrameRule, RuleResult, Severity +from typing import Optional, List + + +class ElectricityCapacityComparison(DataFrameRule): + """ + Compare distributed capacity with input capacity for electricity components. + + Compares the total capacity in etrago tables (grid.egon_etrago_generator, + grid.egon_etrago_storage) against the input capacity from the scenario + capacities table (supply.egon_scenario_capacities). + + This validation ensures that capacity distribution is correct and no + capacity is lost or incorrectly added during the distribution process. + """ + + def __init__( + self, + table: str, + rule_id: str, + scenario: str = "eGon2035", + carrier: str = "wind_onshore", + component_type: str = "generator", + output_carriers: Optional[List[str]] = None, + rtol: float = 0.10, + **kwargs + ): + """ + Parameters + ---------- + table : str + Target table (grid.egon_etrago_generator or grid.egon_etrago_storage) + rule_id : str + Unique identifier for this validation rule + scenario : str + Scenario name ("eGon2035" or "eGon100RE") + carrier : str + Carrier type for the input table (supply.egon_scenario_capacities) + component_type : str + Type of component ("generator", "storage", or "link") + output_carriers : List[str], optional + List of carrier names in output table. If None, uses carrier parameter. + Useful for biomass which maps to multiple output carriers. + rtol : float + Relative tolerance for capacity deviation (default: 0.10 = 10%) + """ + super().__init__( + rule_id=rule_id, + table=table, + scenario=scenario, + carrier=carrier, + component_type=component_type, + output_carriers=output_carriers, + rtol=rtol, + **kwargs + ) + self.kind = "sanity" + self.scenario = scenario + self.carrier = carrier + self.component_type = component_type + self.output_carriers = output_carriers or [carrier] + self.rtol = rtol + + def get_query(self, ctx): + """ + Query to compare input and output capacities. 
+ + Returns a query that: + 1. Sums output capacity from etrago table for German buses + 2. Sums input capacity from scenario_capacities table + 3. Returns both values for comparison + """ + # Build carrier filter for output table + if len(self.output_carriers) == 1: + carrier_filter = f"carrier = '{self.output_carriers[0]}'" + else: + carriers_str = "', '".join(self.output_carriers) + carrier_filter = f"carrier IN ('{carriers_str}')" + + return f""" + WITH output_capacity AS ( + SELECT + COALESCE(SUM(p_nom::numeric), 0) as output_capacity_mw + FROM {self.table} + WHERE scn_name = '{self.scenario}' + AND {carrier_filter} + AND bus IN ( + SELECT bus_id + FROM grid.egon_etrago_bus + WHERE scn_name = '{self.scenario}' + AND country = 'DE' + ) + ), + input_capacity AS ( + SELECT + COALESCE(SUM(capacity::numeric), 0) as input_capacity_mw + FROM supply.egon_scenario_capacities + WHERE carrier = '{self.carrier}' + AND scenario_name = '{self.scenario}' + ) + SELECT + o.output_capacity_mw, + i.input_capacity_mw + FROM output_capacity o + CROSS JOIN input_capacity i + """ + + def evaluate_df(self, df, ctx): + """ + Evaluate capacity comparison. 
+ + Parameters + ---------- + df : pd.DataFrame + DataFrame with output_capacity_mw and input_capacity_mw columns + ctx : dict + Context information + + Returns + ------- + RuleResult + Validation result with success/failure status + """ + if df.empty: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + message=f"No data found for {self.carrier} capacity comparison", + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + output_capacity = float(df["output_capacity_mw"].values[0]) + input_capacity = float(df["input_capacity_mw"].values[0]) + + # Case 1: Both zero - OK, no capacity needed + if output_capacity == 0 and input_capacity == 0: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=True, + observed=0.0, + expected=0.0, + message=( + f"No {self.carrier} {self.component_type} capacity needed " + f"for {self.scenario} (both input and output are zero)" + ), + severity=Severity.INFO, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + # Case 2: Input > 0 but output = 0 - ERROR + if input_capacity > 0 and output_capacity == 0: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + observed=0.0, + expected=input_capacity, + message=( + f"{self.carrier} {self.component_type} capacity was not distributed at all! 
" + f"Input: {input_capacity:.2f} MW, Output: 0 MW for {self.scenario}" + ), + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + # Case 3: Output > 0 but input = 0 - ERROR + if output_capacity > 0 and input_capacity == 0: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + observed=output_capacity, + expected=0.0, + message=( + f"{self.carrier} {self.component_type} capacity was distributed " + f"even though no input was provided! " + f"Output: {output_capacity:.2f} MW, Input: 0 MW for {self.scenario}" + ), + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + # Case 4: Both > 0 - Check deviation + deviation = abs(output_capacity - input_capacity) / input_capacity + deviation_pct = deviation * 100 + error_pct = ((output_capacity - input_capacity) / input_capacity) * 100 + + success = deviation <= self.rtol + + if success: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=True, + observed=output_capacity, + expected=input_capacity, + message=( + f"{self.carrier} {self.component_type} capacity valid for {self.scenario}: " + f"Output: {output_capacity:.2f} MW, Input: {input_capacity:.2f} MW, " + f"Deviation: {error_pct:+.2f}% (tolerance: ±{self.rtol*100:.2f}%)" + ), + severity=Severity.INFO, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + else: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + observed=output_capacity, + expected=input_capacity, + message=( + f"{self.carrier} {self.component_type} capacity deviation too large for {self.scenario}: " + f"Output: {output_capacity:.2f} MW, Input: {input_capacity:.2f} MW, " + f"Deviation: {error_pct:+.2f}% (tolerance: ±{self.rtol*100:.2f}%)" 
+ ), + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) diff --git a/src/egon/data/validation/rules/custom/sanity/heat_demand.py b/src/egon/data/validation/rules/custom/sanity/heat_demand.py new file mode 100644 index 000000000..1f0da0935 --- /dev/null +++ b/src/egon/data/validation/rules/custom/sanity/heat_demand.py @@ -0,0 +1,163 @@ +""" +Sanity check validation rules for heat demand. + +Validates that heat demand timeseries match expected values from peta_heat. +""" + +from egon_validation.rules.base import DataFrameRule, RuleResult, Severity + + +class HeatDemandValidation(DataFrameRule): + """ + Validate annual heat demand against peta_heat reference values. + + Compares the sum of rural_heat and central_heat load timeseries + against the demand from egon_peta_heat table to ensure demand is + correctly distributed. + """ + + def __init__( + self, + table: str, + rule_id: str, + scenario: str = "eGon2035", + rtol: float = 0.02, + **kwargs + ): + """ + Parameters + ---------- + table : str + Target table (grid.egon_etrago_load) + rule_id : str + Unique identifier for this validation rule + scenario : str + Scenario name ("eGon2035" or "eGon100RE") + rtol : float + Relative tolerance for deviation (default: 0.02 = 2%) + """ + super().__init__( + rule_id=rule_id, + table=table, + scenario=scenario, + rtol=rtol, + **kwargs + ) + self.kind = "sanity" + self.scenario = scenario + self.rtol = rtol + + def get_query(self, ctx): + """ + Query to compare heat demand output vs input. + + Returns a query that: + 1. Sums rural_heat + central_heat timeseries from etrago_load + 2. Sums demand from egon_peta_heat + 3. 
Returns both values for comparison + """ + return f""" + WITH output_demand AS ( + SELECT + SUM((SELECT SUM(p) FROM UNNEST(b.p_set) p)) / 1000000 as demand_twh + FROM grid.egon_etrago_load a + JOIN grid.egon_etrago_load_timeseries b ON (a.load_id = b.load_id) + JOIN grid.egon_etrago_bus c ON (a.bus = c.bus_id) + WHERE b.scn_name = '{self.scenario}' + AND a.scn_name = '{self.scenario}' + AND c.scn_name = '{self.scenario}' + AND c.country = 'DE' + AND a.carrier IN ('rural_heat', 'central_heat') + ), + input_demand AS ( + SELECT + SUM(demand / 1000000) as demand_twh + FROM demand.egon_peta_heat + WHERE scenario = '{self.scenario}' + ) + SELECT + o.demand_twh as output_demand_twh, + i.demand_twh as input_demand_twh + FROM output_demand o + CROSS JOIN input_demand i + """ + + def evaluate_df(self, df, ctx): + """ + Evaluate heat demand comparison. + + Parameters + ---------- + df : pd.DataFrame + DataFrame with output_demand_twh and input_demand_twh columns + ctx : dict + Context information + + Returns + ------- + RuleResult + Validation result with success/failure status + """ + if df.empty or df["output_demand_twh"].isna().all() or df["input_demand_twh"].isna().all(): + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + message=f"No heat demand data found for {self.scenario}", + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + output_twh = float(df["output_demand_twh"].values[0]) + input_twh = float(df["input_demand_twh"].values[0]) + + # Calculate deviation + deviation = abs(output_twh - input_twh) / input_twh + deviation_pct = deviation * 100 + diff_twh = output_twh - input_twh + + success = deviation <= self.rtol + + if success: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=True, + observed=output_twh, + expected=input_twh, + message=( + f"Heat demand valid for 
{self.scenario}: " + f"{output_twh:.2f} TWh vs {input_twh:.2f} TWh expected " + f"(deviation: {deviation_pct:.2f}%, tolerance: {self.rtol*100:.2f}%)" + ), + severity=Severity.INFO, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + else: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + observed=output_twh, + expected=input_twh, + message=( + f"Heat demand deviation too large for {self.scenario}: " + f"{output_twh:.2f} TWh vs {input_twh:.2f} TWh expected " + f"(diff: {diff_twh:+.2f} TWh, deviation: {deviation_pct:.2f}%, " + f"tolerance: {self.rtol*100:.2f}%)" + ), + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) From d99703db8f05b0e93d8d4d4166029419c38f5688 Mon Sep 17 00:00:00 2001 From: sarah Date: Tue, 30 Dec 2025 15:41:28 +0100 Subject: [PATCH 25/54] debug sanity rules --- src/egon/data/datasets/final_validations.py | 14 ++++---- .../custom/sanity/electricity_capacity.py | 33 +++++++++++++++---- .../rules/custom/sanity/gas_grid.py | 32 +++++++++--------- 3 files changed, 51 insertions(+), 28 deletions(-) diff --git a/src/egon/data/datasets/final_validations.py b/src/egon/data/datasets/final_validations.py index 078891fa3..0d1ba9a8c 100644 --- a/src/egon/data/datasets/final_validations.py +++ b/src/egon/data/datasets/final_validations.py @@ -89,7 +89,7 @@ class FinalValidations(Dataset): #: name: str = "FinalValidations" #: - version: str = "0.0.1" + version: str = "0.0.1.dev" def __init__(self, dependencies): super().__init__( @@ -257,34 +257,34 @@ def __init__(self, dependencies): ), # GENERATORS - eGon2035 - # CH4 generators must connect to CH4 buses + # CH4 generators must connect to CH4 buses (any country) GasOnePortConnections( table="grid.egon_etrago_generator", rule_id="SANITY_GAS_ONE_PORT_GENERATOR_CH4_EGON2035", scenario="eGon2035", component_type="generator", 
component_carrier="CH4", - bus_conditions=[("CH4", "IS NOT NULL")] # Any CH4 bus + bus_conditions=[("CH4", "")] # Any CH4 bus, no country filter ), # STORES - eGon2035 - # CH4 stores must connect to CH4 buses + # CH4 stores must connect to CH4 buses (any country) GasOnePortConnections( table="grid.egon_etrago_store", rule_id="SANITY_GAS_ONE_PORT_STORE_CH4_EGON2035", scenario="eGon2035", component_type="store", component_carrier="CH4", - bus_conditions=[("CH4", "IS NOT NULL")] + bus_conditions=[("CH4", "")] # Any CH4 bus, no country filter ), - # H2_underground stores must connect to H2_saltcavern buses + # H2_underground stores must connect to H2_saltcavern buses (any country) GasOnePortConnections( table="grid.egon_etrago_store", rule_id="SANITY_GAS_ONE_PORT_STORE_H2_UNDERGROUND_EGON2035", scenario="eGon2035", component_type="store", component_carrier="H2_underground", - bus_conditions=[("H2_saltcavern", "IS NOT NULL")] + bus_conditions=[("H2_saltcavern", "")] # Any H2_saltcavern bus, no country filter ), # H2_overground stores must connect to H2_saltcavern or H2_grid in DE GasOnePortConnections( diff --git a/src/egon/data/validation/rules/custom/sanity/electricity_capacity.py b/src/egon/data/validation/rules/custom/sanity/electricity_capacity.py index 65b2fd878..bd3fe3397 100644 --- a/src/egon/data/validation/rules/custom/sanity/electricity_capacity.py +++ b/src/egon/data/validation/rules/custom/sanity/electricity_capacity.py @@ -84,6 +84,32 @@ def get_query(self, ctx): carriers_str = "', '".join(self.output_carriers) carrier_filter = f"carrier IN ('{carriers_str}')" + # Build bus filter based on component type + # Links have bus0 and bus1, generators/storage have bus + if self.component_type == "link": + bus_filter = f""" + AND (bus0 IN ( + SELECT bus_id + FROM grid.egon_etrago_bus + WHERE scn_name = '{self.scenario}' + AND country = 'DE' + ) OR bus1 IN ( + SELECT bus_id + FROM grid.egon_etrago_bus + WHERE scn_name = '{self.scenario}' + AND country = 'DE' + )) + 
""" + else: + bus_filter = f""" + AND bus IN ( + SELECT bus_id + FROM grid.egon_etrago_bus + WHERE scn_name = '{self.scenario}' + AND country = 'DE' + ) + """ + return f""" WITH output_capacity AS ( SELECT @@ -91,12 +117,7 @@ def get_query(self, ctx): FROM {self.table} WHERE scn_name = '{self.scenario}' AND {carrier_filter} - AND bus IN ( - SELECT bus_id - FROM grid.egon_etrago_bus - WHERE scn_name = '{self.scenario}' - AND country = 'DE' - ) + {bus_filter} ), input_capacity AS ( SELECT diff --git a/src/egon/data/validation/rules/custom/sanity/gas_grid.py b/src/egon/data/validation/rules/custom/sanity/gas_grid.py index c83fba331..55b1ee16d 100644 --- a/src/egon/data/validation/rules/custom/sanity/gas_grid.py +++ b/src/egon/data/validation/rules/custom/sanity/gas_grid.py @@ -133,8 +133,8 @@ def evaluate_df(self, df, ctx): rule_class=self.__class__.__name__ ) else: - # Get sample of isolated buses - sample_buses = df.head(10)['bus_id'].tolist() + # Show sample of isolated buses (first 5) + sample_buses = df.head(5).to_dict(orient='records') return RuleResult( rule_id=self.rule_id, @@ -145,8 +145,8 @@ def evaluate_df(self, df, ctx): observed=isolated_count, expected=0, message=( - f"Found {isolated_count} isolated {self.carrier} buses for {self.scenario} " - f"isolated_buses: {df.to_dict(orient="records")}" + f"Found {isolated_count} isolated {self.carrier} buses for {self.scenario}. 
" + f"Sample (first 5): {sample_buses}" ), severity=Severity.ERROR, schema=self.schema, @@ -377,12 +377,18 @@ def get_query(self, ctx): # Build bus subqueries for each condition bus_subqueries = [] for bus_carrier, country_cond in self.bus_conditions: + # Build country filter - if empty string, omit country condition entirely + if country_cond == "": + country_filter = "" + else: + country_filter = f"AND country {country_cond}" + subquery = f""" (SELECT bus_id FROM grid.egon_etrago_bus WHERE scn_name = '{self.scenario}' AND carrier = '{bus_carrier}' - AND country {country_cond}) + {country_filter}) """ bus_subqueries.append(subquery) @@ -438,9 +444,8 @@ def evaluate_df(self, df, ctx): rule_class=self.__class__.__name__ ) else: - # Get sample of disconnected components - sample_components = df.head(10)['component_id'].tolist() - sample_buses = df.head(10)['bus'].tolist() + # Show sample of disconnected components (first 5) + sample_components = df.head(5).to_dict(orient='records') return RuleResult( rule_id=self.rule_id, @@ -453,8 +458,7 @@ def evaluate_df(self, df, ctx): message=( f"Found {disconnected_count} disconnected {self.component_carrier} " f"{self.component_type}s for {self.scenario}. " - f"disconnected_components: {df.to_dict(orient='records')}, " - f"bus_conditions: {self.bus_conditions}" + f"Sample (first 5): {sample_components}" ), severity=Severity.ERROR, schema=self.schema, @@ -787,10 +791,8 @@ def evaluate_df(self, df, ctx): rule_class=self.__class__.__name__ ) else: - # Get sample of disconnected links - sample_links = df.head(10)['link_id'].tolist() - sample_bus0 = df.head(10)['bus0'].tolist() - sample_bus1 = df.head(10)['bus1'].tolist() + # Show sample of disconnected links (first 5) + sample_links = df.head(5).to_dict(orient='records') return RuleResult( rule_id=self.rule_id, @@ -803,7 +805,7 @@ def evaluate_df(self, df, ctx): message=( f"Found {disconnected_count} disconnected {self.carrier} links " f"for {self.scenario}. 
" - f"disconnected_links: {df.to_dict(orient='records')}" + f"Sample (first 5): {sample_links}" ), severity=Severity.ERROR, schema=self.schema, From d31ef46199ec239297f5c5ae2c0afd35ebe496cf Mon Sep 17 00:00:00 2001 From: sarah Date: Tue, 30 Dec 2025 18:27:47 +0100 Subject: [PATCH 26/54] debug sanity rules --- SANITY_CHECKS_MIGRATION.md | 45 +++- src/egon/data/datasets/final_validations.py | 231 ++++++++++++++---- .../rules/custom/sanity/__init__.py | 4 + 3 files changed, 217 insertions(+), 63 deletions(-) diff --git a/SANITY_CHECKS_MIGRATION.md b/SANITY_CHECKS_MIGRATION.md index 48f7d166b..4972bd453 100644 --- a/SANITY_CHECKS_MIGRATION.md +++ b/SANITY_CHECKS_MIGRATION.md @@ -344,9 +344,14 @@ The following sanity checks have been migrated to validation rules: ### ✅ Timeseries Length - `etrago_timeseries_length()` → `ArrayCardinalityValidation` (reused from egon-validation formal rules!) - - Validates 8 array columns across 5 component types (generator, load, link, store, storage) - - Checks: p_max_pu, p_min_pu, p_set, q_set, e_min_pu, e_max_pu, inflow + - Validates ALL 24 array columns across 5 component types (generator, load, link, store, storage) + - **Generator timeseries (5):** p_set, q_set, p_min_pu, p_max_pu, marginal_cost + - **Load timeseries (2):** p_set, q_set + - **Link timeseries (5):** p_set, p_min_pu, p_max_pu, efficiency, marginal_cost + - **Storage timeseries (7):** p_set, q_set, p_min_pu, p_max_pu, state_of_charge_set, inflow, marginal_cost + - **Store timeseries (5):** p_set, q_set, e_min_pu, e_max_pu, marginal_cost - Leverages existing formal validation rule from egon-validation library + - **Updated:** Now matches original dynamic column discovery behavior (sanity_checks.py:2465-2494) ### ✅ eGon100RE Capacity Validations - `generators_links_storages_stores_100RE()` → `ElectricityCapacityComparison` (reused for eGon100RE!) 
@@ -356,11 +361,18 @@ The following sanity checks have been migrated to validation rules: - **Note:** Stores validation deferred (original function only prints, no validation logic) ### ✅ Electrical Load Demand -- `electrical_load_100RE()` → `ElectricalLoadAggregationValidation` (reused from egon-validation!) - - Validates annual electrical load sum (TWh) for all scenarios (eGon2035, eGon100RE, etc.) - - Also checks max/min load (GW) - more comprehensive than original - - Leverages existing custom validation rule from egon-validation library - - **Note:** Original function validated by sector (residential, commercial, industrial) but existing rule validates total only +- `electrical_load_100RE()` → `ElectricalLoadAggregationValidation` + `ElectricalLoadSectorBreakdown` + - **Total load validation:** `ElectricalLoadAggregationValidation` validates annual load sum (TWh) for all scenarios + - Also checks max/min load (GW) - more comprehensive than original + - Leverages existing custom validation rule from egon-validation library + - **Sector breakdown validation:** `ElectricalLoadSectorBreakdown` validates eGon100RE by sector (new class!) + - Residential: 90.4 TWh expected (from household_curves table) + - Commercial: 146.7 TWh expected (from cts_curves table) + - Industrial: 382.9 TWh expected (from osm_curves + sites_curves tables) + - Total: 620.0 TWh expected (from etrago AC loads) + - Validates each sector independently with 1% tolerance + - Queries source tables directly matching original implementation + - **Updated:** Now provides full sector granularity as in original (sanity_checks.py:2676-2784) ### ✅ Heat Demand - Heat demand validation (from `etrago_eGon2035_heat()`) → `HeatDemandValidation` (new class!) 
@@ -443,6 +455,7 @@ egon-data/src/egon/data/ ├── gas_grid.py # ✅ Migrated (5 rules: buses, one-port, CH4 grid, links) ├── gas_loads_generators.py # ✅ Migrated (2 rules: loads, generators) ├── electricity_capacity.py # ✅ Migrated (reusable class for capacity comparison) + ├── electrical_load_sectors.py # ✅ Migrated (1 rule: sector breakdown) └── heat_demand.py # ✅ Migrated (1 rule) egon-validation/egon_validation/rules/ @@ -459,8 +472,8 @@ egon-validation/egon_validation/rules/ **Total sanity checks in original `sanity_checks.py`**: 21 functions **Successfully migrated**: 16 functions (76%) -- Converted to **48 individual validation rules** across multiple categories -- Organized into **8 custom validation modules** +- Converted to **65 individual validation rules** across multiple categories +- Organized into **9 custom validation modules** - Reused **2 existing validation classes** from egon-validation **Deferred (require dataset-inline implementation)**: 5 functions (24%) @@ -473,11 +486,15 @@ egon-validation/egon_validation/rules/ - eGon100RE capacity: 23 rules (13 generators, 9 links, 1 storage) - Gas infrastructure: 11 rules - Demand validation: 4 rules -- Timeseries: 8 rules +- Timeseries: 24 rules (all array columns across 5 component types) - Home batteries: 1 rule -- Electrical load: 1 rule (multi-scenario) +- Electrical load: 2 rules (total aggregation + sector breakdown) - Heat demand: 1 rule +**Recent Updates (2025-12-30)**: +- ✅ **Timeseries validation coverage expanded**: 8 → 24 array columns (now matches original dynamic discovery) +- ✅ **Electrical load sector breakdown implemented**: Added granular validation by sector (residential, commercial, industrial) + --- ## Testing Your Migration @@ -538,11 +555,13 @@ open validation_runs/{run_id}/final/report.html The sanity checks migration is **76% complete** with all core validations successfully migrated to the new framework: -1. 
**8 custom validation modules** created in `egon/data/validation/rules/custom/sanity/` -2. **48 individual validation rules** implemented across all major categories +1. **9 custom validation modules** created in `egon/data/validation/rules/custom/sanity/` +2. **65 individual validation rules** implemented across all major categories 3. **Reused 2 existing validation classes** from egon-validation library (code reuse > new code) 4. **Fixed 4 RuleResult 'details' parameter errors** by moving violation data to message field 5. **Integrated validations** into `FinalValidations` dataset for cross-cutting checks +6. **Full timeseries coverage** - All 24 array columns validated (matches original dynamic discovery) +7. **Sector breakdown validation** - Electrical load validated by sector (residential, commercial, industrial) ### 🔄 Remaining Work diff --git a/src/egon/data/datasets/final_validations.py b/src/egon/data/datasets/final_validations.py index 0d1ba9a8c..0047f2124 100644 --- a/src/egon/data/datasets/final_validations.py +++ b/src/egon/data/datasets/final_validations.py @@ -19,6 +19,7 @@ GasGeneratorsCapacity, ElectricityCapacityComparison, HeatDemandValidation, + ElectricalLoadSectorBreakdown, ) from egon_validation.rules.formal.array_cardinality_check import ArrayCardinalityValidation from egon_validation.rules.custom.numeric_aggregation_check import ElectricalLoadAggregationValidation @@ -155,27 +156,30 @@ def __init__(self, dependencies): scenario="eGon2035", carrier="H2_saltcavern" ), - # Check for isolated CH4 buses - eGon100RE - GasBusesIsolated( - table="grid.egon_etrago_bus", - rule_id="SANITY_GAS_BUSES_ISOLATED_CH4_EGON100RE", - scenario="eGon100RE", - carrier="CH4" - ), - # Check for isolated H2_grid buses - eGon100RE - GasBusesIsolated( - table="grid.egon_etrago_bus", - rule_id="SANITY_GAS_BUSES_ISOLATED_H2_GRID_EGON100RE", - scenario="eGon100RE", - carrier="H2_grid" - ), - # Check for isolated H2_saltcavern buses - eGon100RE - GasBusesIsolated( - 
table="grid.egon_etrago_bus", - rule_id="SANITY_GAS_BUSES_ISOLATED_H2_SALTCAVERN_EGON100RE", - scenario="eGon100RE", - carrier="H2_saltcavern" - ), + # NOTE: eGon100RE gas bus isolated checks are commented out + # because they are also commented out in the original sanity_checks.py + # (lines 1435-1439). Uncomment when eGon100RE gas bus data is ready. + # # Check for isolated CH4 buses - eGon100RE + # GasBusesIsolated( + # table="grid.egon_etrago_bus", + # rule_id="SANITY_GAS_BUSES_ISOLATED_CH4_EGON100RE", + # scenario="eGon100RE", + # carrier="CH4" + # ), + # # Check for isolated H2_grid buses - eGon100RE + # GasBusesIsolated( + # table="grid.egon_etrago_bus", + # rule_id="SANITY_GAS_BUSES_ISOLATED_H2_GRID_EGON100RE", + # scenario="eGon100RE", + # carrier="H2_grid" + # ), + # # Check for isolated H2_saltcavern buses - eGon100RE + # GasBusesIsolated( + # table="grid.egon_etrago_bus", + # rule_id="SANITY_GAS_BUSES_ISOLATED_H2_SALTCAVERN_EGON100RE", + # scenario="eGon100RE", + # carrier="H2_saltcavern" + # ), # Check CH4 bus count - eGon2035 GasBusesCount( table="grid.egon_etrago_bus", @@ -192,22 +196,24 @@ def __init__(self, dependencies): carrier="H2_grid", rtol=0.10 ), - # Check CH4 bus count - eGon100RE - GasBusesCount( - table="grid.egon_etrago_bus", - rule_id="SANITY_GAS_BUSES_COUNT_CH4_EGON100RE", - scenario="eGon100RE", - carrier="CH4", - rtol=0.10 - ), - # Check H2_grid bus count - eGon100RE - GasBusesCount( - table="grid.egon_etrago_bus", - rule_id="SANITY_GAS_BUSES_COUNT_H2_GRID_EGON100RE", - scenario="eGon100RE", - carrier="H2_grid", - rtol=0.10 - ), + # NOTE: eGon100RE gas bus count checks are commented out + # because sanity_check_gas_buses() is only called for eGon2035 (line 1943) + # # Check CH4 bus count - eGon100RE + # GasBusesCount( + # table="grid.egon_etrago_bus", + # rule_id="SANITY_GAS_BUSES_COUNT_CH4_EGON100RE", + # scenario="eGon100RE", + # carrier="CH4", + # rtol=0.10 + # ), + # # Check H2_grid bus count - eGon100RE + # GasBusesCount( + # 
table="grid.egon_etrago_bus", + # rule_id="SANITY_GAS_BUSES_COUNT_H2_GRID_EGON100RE", + # scenario="eGon100RE", + # carrier="H2_grid", + # rtol=0.10 + # ), # Check CH4 grid capacity - eGon2035 CH4GridCapacity( table="grid.egon_etrago_link", @@ -786,16 +792,24 @@ def __init__(self, dependencies): # Timeseries length validations # These check that all timeseries arrays have the expected length (8760 hours) + # NOTE: All array columns are validated to match original sanity_checks.py + # which dynamically discovers all array columns (lines 2465-2494) "timeseries_length": [ - # Generator timeseries - p_max_pu + # Generator timeseries - all array columns ArrayCardinalityValidation( - rule_id="SANITY_TIMESERIES_GENERATOR_P_MAX_PU", + rule_id="SANITY_TIMESERIES_GENERATOR_P_SET", task="FinalValidations.timeseries_length", table="grid.egon_etrago_generator_timeseries", - array_column="p_max_pu", + array_column="p_set", + expected_length=8760 + ), + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_GENERATOR_Q_SET", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_generator_timeseries", + array_column="q_set", expected_length=8760 ), - # Generator timeseries - p_min_pu ArrayCardinalityValidation( rule_id="SANITY_TIMESERIES_GENERATOR_P_MIN_PU", task="FinalValidations.timeseries_length", @@ -803,7 +817,22 @@ def __init__(self, dependencies): array_column="p_min_pu", expected_length=8760 ), - # Load timeseries - p_set + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_GENERATOR_P_MAX_PU", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_generator_timeseries", + array_column="p_max_pu", + expected_length=8760 + ), + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_GENERATOR_MARGINAL_COST", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_generator_timeseries", + array_column="marginal_cost", + expected_length=8760 + ), + + # Load timeseries - all array columns ArrayCardinalityValidation( 
rule_id="SANITY_TIMESERIES_LOAD_P_SET", task="FinalValidations.timeseries_length", @@ -811,7 +840,6 @@ def __init__(self, dependencies): array_column="p_set", expected_length=8760 ), - # Load timeseries - q_set ArrayCardinalityValidation( rule_id="SANITY_TIMESERIES_LOAD_Q_SET", task="FinalValidations.timeseries_length", @@ -819,7 +847,8 @@ def __init__(self, dependencies): array_column="q_set", expected_length=8760 ), - # Link timeseries - p_set (note: may have NULLs) + + # Link timeseries - all array columns ArrayCardinalityValidation( rule_id="SANITY_TIMESERIES_LINK_P_SET", task="FinalValidations.timeseries_length", @@ -827,7 +856,101 @@ def __init__(self, dependencies): array_column="p_set", expected_length=8760 ), - # Store timeseries - e_min_pu + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_LINK_P_MIN_PU", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_link_timeseries", + array_column="p_min_pu", + expected_length=8760 + ), + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_LINK_P_MAX_PU", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_link_timeseries", + array_column="p_max_pu", + expected_length=8760 + ), + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_LINK_EFFICIENCY", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_link_timeseries", + array_column="efficiency", + expected_length=8760 + ), + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_LINK_MARGINAL_COST", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_link_timeseries", + array_column="marginal_cost", + expected_length=8760 + ), + + # Storage timeseries - all array columns + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_STORAGE_P_SET", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_storage_timeseries", + array_column="p_set", + expected_length=8760 + ), + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_STORAGE_Q_SET", + 
task="FinalValidations.timeseries_length", + table="grid.egon_etrago_storage_timeseries", + array_column="q_set", + expected_length=8760 + ), + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_STORAGE_P_MIN_PU", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_storage_timeseries", + array_column="p_min_pu", + expected_length=8760 + ), + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_STORAGE_P_MAX_PU", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_storage_timeseries", + array_column="p_max_pu", + expected_length=8760 + ), + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_STORAGE_STATE_OF_CHARGE_SET", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_storage_timeseries", + array_column="state_of_charge_set", + expected_length=8760 + ), + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_STORAGE_INFLOW", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_storage_timeseries", + array_column="inflow", + expected_length=8760 + ), + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_STORAGE_MARGINAL_COST", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_storage_timeseries", + array_column="marginal_cost", + expected_length=8760 + ), + + # Store timeseries - all array columns + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_STORE_P_SET", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_store_timeseries", + array_column="p_set", + expected_length=8760 + ), + ArrayCardinalityValidation( + rule_id="SANITY_TIMESERIES_STORE_Q_SET", + task="FinalValidations.timeseries_length", + table="grid.egon_etrago_store_timeseries", + array_column="q_set", + expected_length=8760 + ), ArrayCardinalityValidation( rule_id="SANITY_TIMESERIES_STORE_E_MIN_PU", task="FinalValidations.timeseries_length", @@ -835,7 +958,6 @@ def __init__(self, dependencies): array_column="e_min_pu", expected_length=8760 ), - # Store 
timeseries - e_max_pu ArrayCardinalityValidation( rule_id="SANITY_TIMESERIES_STORE_E_MAX_PU", task="FinalValidations.timeseries_length", @@ -843,12 +965,11 @@ def __init__(self, dependencies): array_column="e_max_pu", expected_length=8760 ), - # Storage timeseries - inflow ArrayCardinalityValidation( - rule_id="SANITY_TIMESERIES_STORAGE_INFLOW", + rule_id="SANITY_TIMESERIES_STORE_MARGINAL_COST", task="FinalValidations.timeseries_length", - table="grid.egon_etrago_storage_timeseries", - array_column="inflow", + table="grid.egon_etrago_store_timeseries", + array_column="marginal_cost", expected_length=8760 ), ], @@ -863,6 +984,16 @@ def __init__(self, dependencies): table="grid.egon_etrago_load", tolerance=0.05 # 5% tolerance ), + # Sector breakdown validation for eGon100RE + # Validates residential (90.4 TWh), commercial (146.7 TWh), + # industrial (382.9 TWh), and total (620.0 TWh) loads + ElectricalLoadSectorBreakdown( + rule_id="SANITY_ELECTRICAL_LOAD_SECTOR_BREAKDOWN_EGON100RE", + task="FinalValidations.electrical_load", + table="grid.egon_etrago_load", + scenario="eGon100RE", + rtol=0.01 # 1% tolerance as in original + ), ], # Heat demand validations diff --git a/src/egon/data/validation/rules/custom/sanity/__init__.py b/src/egon/data/validation/rules/custom/sanity/__init__.py index 2ff844f7c..27cf5f960 100644 --- a/src/egon/data/validation/rules/custom/sanity/__init__.py +++ b/src/egon/data/validation/rules/custom/sanity/__init__.py @@ -32,6 +32,9 @@ from .heat_demand import ( HeatDemandValidation, ) +from .electrical_load_sectors import ( + ElectricalLoadSectorBreakdown, +) __all__ = [ "ResidentialElectricityAnnualSum", @@ -50,4 +53,5 @@ "GasGeneratorsCapacity", "ElectricityCapacityComparison", "HeatDemandValidation", + "ElectricalLoadSectorBreakdown", ] From 04f5fda448cc62ca593a1e1e9e8b89968408cb65 Mon Sep 17 00:00:00 2001 From: Sarah Sommer Date: Mon, 5 Jan 2026 13:51:19 +0100 Subject: [PATCH 27/54] add electrical loads sanity_check --- 
.../custom/sanity/electrical_load_sectors.py | 275 ++++++++++++++++++ 1 file changed, 275 insertions(+) create mode 100644 src/egon/data/validation/rules/custom/sanity/electrical_load_sectors.py diff --git a/src/egon/data/validation/rules/custom/sanity/electrical_load_sectors.py b/src/egon/data/validation/rules/custom/sanity/electrical_load_sectors.py new file mode 100644 index 000000000..007c7d273 --- /dev/null +++ b/src/egon/data/validation/rules/custom/sanity/electrical_load_sectors.py @@ -0,0 +1,275 @@ +""" +Sanity check validation rules for electrical load sector breakdown. + +Validates that electrical loads are correctly disaggregated into sectors +(residential, commercial, industrial) and that each sector matches expected values. +""" + +from egon_validation.rules.base import DataFrameRule, RuleResult, Severity +from egon.data import config, db +import pandas as pd + + +class ElectricalLoadSectorBreakdown(DataFrameRule): + """ + Validate electrical load breakdown by sector (residential, commercial, industrial). + + This rule checks that the electrical load for each sector matches expected values: + - Residential: 90.4 TWh (from household_curves) + - Commercial: 146.7 TWh (from cts_curves) + - Industrial: 382.9 TWh (from osm_curves + sites_curves) + - Total: 620.0 TWh (from etrago AC loads) + + Matches the original electrical_load_100RE() function from sanity_checks.py. 
+ """ + + def __init__(self, table: str, rule_id: str, scenario: str = "eGon100RE", + rtol: float = 0.01, **kwargs): + """ + Parameters + ---------- + table : str + Target table (grid.egon_etrago_load) + rule_id : str + Unique identifier for this validation rule + scenario : str + Scenario name (default: "eGon100RE") + rtol : float + Relative tolerance for load deviation (default: 0.01 = 1%) + """ + super().__init__(rule_id=rule_id, table=table, scenario=scenario, + rtol=rtol, **kwargs) + self.kind = "sanity" + self.scenario = scenario + self.rtol = rtol + + def get_query(self, ctx): + """ + Query to get total AC electrical load for Germany. + + Returns total load in TWh from etrago tables. + """ + return f""" + SELECT SUM((SELECT SUM(p) FROM UNNEST(b.p_set) p))/1000000::numeric as load_twh + FROM grid.egon_etrago_load a + JOIN grid.egon_etrago_load_timeseries b + ON (a.load_id = b.load_id) + JOIN grid.egon_etrago_bus c + ON (a.bus = c.bus_id) + WHERE a.scn_name = '{self.scenario}' + AND b.scn_name = '{self.scenario}' + AND c.scn_name = '{self.scenario}' + AND a.carrier = 'AC' + AND c.country = 'DE' + """ + + def _get_sector_loads(self): + """ + Get electrical loads by sector from source tables. + + Returns + ------- + dict + Dictionary with sector loads in TWh: + - residential: TWh from household_curves + - commercial: TWh from cts_curves + - industrial: TWh from osm_curves + sites_curves + """ + sources = config.datasets()["etrago_electricity"]["sources"] + + # Commercial load from CTS curves + cts_curves = db.select_dataframe( + f"""SELECT bus_id AS bus, p_set FROM + {sources['cts_curves']['schema']}. + {sources['cts_curves']['table']} + WHERE scn_name = '{self.scenario}'""", + warning=False + ) + commercial_twh = ( + cts_curves.apply(lambda x: sum(x["p_set"]), axis=1).sum() / 1000000 + ) + + # Industrial load from OSM landuse areas + ind_curves_osm = db.select_dataframe( + f"""SELECT bus, p_set FROM + {sources['osm_curves']['schema']}. 
+ {sources['osm_curves']['table']} + WHERE scn_name = '{self.scenario}'""", + warning=False + ) + industrial_osm_twh = ( + ind_curves_osm.apply(lambda x: sum(x["p_set"]), axis=1).sum() / 1000000 + ) + + # Industrial load from industrial sites + ind_curves_sites = db.select_dataframe( + f"""SELECT bus, p_set FROM + {sources['sites_curves']['schema']}. + {sources['sites_curves']['table']} + WHERE scn_name = '{self.scenario}'""", + warning=False + ) + industrial_sites_twh = ( + ind_curves_sites.apply(lambda x: sum(x["p_set"]), axis=1).sum() / 1000000 + ) + + # Total industrial + industrial_twh = industrial_osm_twh + industrial_sites_twh + + # Residential load from household curves + hh_curves = db.select_dataframe( + f"""SELECT bus_id AS bus, p_set FROM + {sources['household_curves']['schema']}. + {sources['household_curves']['table']} + WHERE scn_name = '{self.scenario}'""", + warning=False + ) + residential_twh = ( + hh_curves.apply(lambda x: sum(x["p_set"]), axis=1).sum() / 1000000 + ) + + return { + "residential": residential_twh, + "commercial": commercial_twh, + "industrial": industrial_twh + } + + def evaluate_df(self, df, ctx): + """ + Evaluate electrical load sector breakdown. 
+ + Parameters + ---------- + df : pd.DataFrame + DataFrame with total load_twh column + ctx : dict + Context information + + Returns + ------- + RuleResult + Validation result with success/failure status + """ + if df.empty or df["load_twh"].isna().all(): + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + message=f"No electrical load data found for scenario {self.scenario}", + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + # Get total AC load + total_load_twh = float(df["load_twh"].values[0]) + + # Get sector loads + try: + sector_loads = self._get_sector_loads() + except Exception as e: + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + message=f"Error reading sector load data: {str(e)}", + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + # Expected values (from original sanity_checks.py lines 2689-2694) + # References: + # https://github.com/openego/powerd-data/blob/56b8215928a8dc4fe953d266c563ce0ed98e93f9/src/egon/data/datasets/demandregio/__init__.py#L480 + # https://github.com/openego/powerd-data/blob/56b8215928a8dc4fe953d266c563ce0ed98e93f9/src/egon/data/datasets/demandregio/__init__.py#L775 + expected_values = { + "residential": 90.4, + "commercial": 146.7, + "industrial": 382.9, + "total": 620.0 + } + + # Build load summary dataframe + load_summary = pd.DataFrame({ + "sector": ["residential", "commercial", "industrial", "total"], + "expected": [ + expected_values["residential"], + expected_values["commercial"], + expected_values["industrial"], + expected_values["total"] + ], + "observed": [ + sector_loads["residential"], + sector_loads["commercial"], + sector_loads["industrial"], + total_load_twh + ] + }) + + load_summary["diff"] = load_summary["observed"] - 
load_summary["expected"] + load_summary["diff_pct"] = ( + load_summary["diff"] / load_summary["observed"] * 100 + ) + + # Check if all deviations are within tolerance (< 1% as in original) + violations = load_summary[load_summary["diff_pct"].abs() >= (self.rtol * 100)] + + if not violations.empty: + # Format violation details + violation_details = [] + for _, row in violations.iterrows(): + violation_details.append( + f"{row['sector']}: {row['observed']:.2f} TWh " + f"(expected {row['expected']:.2f} TWh, " + f"deviation {row['diff_pct']:+.2f}%)" + ) + + max_deviation = load_summary["diff_pct"].abs().max() + + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + observed=float(max_deviation), + expected=self.rtol * 100, + message=( + f"Electrical load sector breakdown deviations exceed tolerance for {self.scenario}: " + f"{'; '.join(violation_details)}" + ), + severity=Severity.ERROR, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + + # All sectors within tolerance + sector_summary = "; ".join([ + f"{row['sector']}: {row['observed']:.2f} TWh " + f"(expected {row['expected']:.2f} TWh, " + f"deviation {row['diff_pct']:+.2f}%)" + for _, row in load_summary.iterrows() + ]) + + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=True, + observed=0.0, + expected=0.0, + message=( + f"Electrical load sector breakdown valid for {self.scenario}: {sector_summary}" + ), + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) From 6bd58c3378841adc65dd8c0fcae8ce3038275368 Mon Sep 17 00:00:00 2001 From: sarah Date: Wed, 31 Dec 2025 13:55:13 +0100 Subject: [PATCH 28/54] refactor on_validation_failure --- src/egon/data/datasets/final_validations.py | 2 +- src/egon/data/datasets/storages/__init__.py | 2 +- .../rules/custom/sanity/gas_grid.py | 24 +++++--------- 
.../rules/custom/sanity/home_batteries.py | 33 ++++++++++++++----- 4 files changed, 35 insertions(+), 26 deletions(-) diff --git a/src/egon/data/datasets/final_validations.py b/src/egon/data/datasets/final_validations.py index 0047f2124..a7e899b0f 100644 --- a/src/egon/data/datasets/final_validations.py +++ b/src/egon/data/datasets/final_validations.py @@ -1008,5 +1008,5 @@ def __init__(self, dependencies): ), ], }, - validation_on_failure="continue" # Continue pipeline even if validations fail + on_validation_failure="continue" # Continue pipeline even if validations fail ) diff --git a/src/egon/data/datasets/storages/__init__.py b/src/egon/data/datasets/storages/__init__.py index e6476f2a7..c43d9ccf7 100755 --- a/src/egon/data/datasets/storages/__init__.py +++ b/src/egon/data/datasets/storages/__init__.py @@ -114,7 +114,7 @@ def __init__(self, dependencies): ), ] }, - validation_on_failure="continue" + on_validation_failure="continue" ) diff --git a/src/egon/data/validation/rules/custom/sanity/gas_grid.py b/src/egon/data/validation/rules/custom/sanity/gas_grid.py index 55b1ee16d..974e0a3f6 100644 --- a/src/egon/data/validation/rules/custom/sanity/gas_grid.py +++ b/src/egon/data/validation/rules/custom/sanity/gas_grid.py @@ -314,7 +314,7 @@ class GasOnePortConnections(DataFrameRule): Checks that all gas one-port components (loads, generators, stores) are connected to buses that exist in the database with the correct carrier type. - + This validation ensures data integrity across the etrago tables and prevents orphaned components that would cause errors in network optimization. 
""" @@ -326,7 +326,7 @@ def __init__(self, table: str, rule_id: str, scenario: str = "eGon2035", Parameters ---------- table : str - Target table (grid.egon_etrago_load, grid.egon_etrago_generator, + Target table (grid.egon_etrago_load, grid.egon_etrago_generator, or grid.egon_etrago_store) rule_id : str Unique identifier for this validation rule @@ -340,11 +340,11 @@ def __init__(self, table: str, rule_id: str, scenario: str = "eGon2035", List of (bus_carrier, country_condition) tuples that define valid buses Examples: - [("CH4", "= 'DE'")] - CH4 buses in Germany - - [("CH4", "!= 'DE'")] - CH4 buses outside Germany + - [("CH4", "!= 'DE'")] - CH4 buses outside Germany - [("H2_grid", "= 'DE'"), ("AC", "!= 'DE'")] - H2_grid in DE OR AC abroad """ super().__init__(rule_id=rule_id, table=table, scenario=scenario, - component_type=component_type, + component_type=component_type, component_carrier=component_carrier, bus_conditions=bus_conditions or [], **kwargs) self.kind = "sanity" @@ -352,7 +352,7 @@ def __init__(self, table: str, rule_id: str, scenario: str = "eGon2035", self.component_type = component_type self.component_carrier = component_carrier self.bus_conditions = bus_conditions or [] - + # Map component type to ID column name self.id_column_map = { "load": "load_id", @@ -373,29 +373,23 @@ def get_query(self, ctx): return "SELECT NULL as component_id, NULL as bus, NULL as carrier LIMIT 0" id_column = self.id_column_map.get(self.component_type, "id") - + # Build bus subqueries for each condition bus_subqueries = [] for bus_carrier, country_cond in self.bus_conditions: - # Build country filter - if empty string, omit country condition entirely - if country_cond == "": - country_filter = "" - else: - country_filter = f"AND country {country_cond}" - subquery = f""" (SELECT bus_id FROM grid.egon_etrago_bus WHERE scn_name = '{self.scenario}' AND carrier = '{bus_carrier}' - {country_filter}) + AND country {country_cond}) """ bus_subqueries.append(subquery) - + # Build 
NOT IN clauses for all bus conditions not_in_clauses = [f"bus NOT IN {subq}" for subq in bus_subqueries] combined_condition = " AND ".join(not_in_clauses) - + return f""" SELECT {id_column} as component_id, bus, carrier, scn_name FROM {self.table} diff --git a/src/egon/data/validation/rules/custom/sanity/home_batteries.py b/src/egon/data/validation/rules/custom/sanity/home_batteries.py index fd5fb7ecb..9da1b4ff5 100644 --- a/src/egon/data/validation/rules/custom/sanity/home_batteries.py +++ b/src/egon/data/validation/rules/custom/sanity/home_batteries.py @@ -10,6 +10,7 @@ from egon_validation.rules.base import DataFrameRule, RuleResult, Severity from egon.data import config, db +from egon.data.datasets.storages.home_batteries import get_cbat_pbat_ratio class HomeBatteriesAggregation(DataFrameRule): @@ -31,6 +32,27 @@ def __init__(self, table: str, rule_id: str, scenario: str = "eGon2035", **kwarg self.kind = "sanity" self.scenario = scenario + def evaluate(self, engine, ctx) -> RuleResult: + """Override evaluate to catch errors from get_cbat_pbat_ratio().""" + try: + return super().evaluate(engine, ctx) + except IndexError as e: + # get_cbat_pbat_ratio() failed because no home_battery data exists + if "index 0 is out of bounds" in str(e): + return RuleResult( + rule_id=self.rule_id, + task=self.task, + table=self.table, + kind=self.kind, + success=False, + message=f"⚠️ NO DATA FOUND: No home_battery carrier found in etrago_storage table for scenario {self.scenario}", + severity=Severity.WARNING, + schema=self.schema, + table_name=self.table_name, + rule_class=self.__class__.__name__ + ) + raise + def get_query(self, ctx): """ Query to compare storage and building-level home battery data. 
@@ -42,15 +64,8 @@ def get_query(self, ctx): sources = config.datasets()["home_batteries"]["sources"] targets = config.datasets()["home_batteries"]["targets"] - # Get cbat_pbat_ratio for capacity calculation - # Query the ratio directly from the database instead of importing from dataset module - cbat_pbat_ratio_query = f""" - SELECT max_hours - FROM {sources["etrago_storage"]["schema"]}.{sources["etrago_storage"]["table"]} - WHERE carrier = 'home_battery' - LIMIT 1 - """ - cbat_pbat_ratio = int(db.select_dataframe(cbat_pbat_ratio_query).iat[0, 0]) + # Get cbat_pbat_ratio for capacity calculation (same as original sanity check) + cbat_pbat_ratio = get_cbat_pbat_ratio() return f""" WITH storage_data AS ( From 1de9edbabf4c811fe350426d2a4059a5d33ed22f Mon Sep 17 00:00:00 2001 From: sarah Date: Thu, 8 Jan 2026 13:05:33 +0100 Subject: [PATCH 29/54] add different boundaries --- .../data/datasets/osm_buildings_streets/__init__.py | 10 ++++++---- src/egon/data/datasets/vg250/__init__.py | 11 +++++++++-- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/src/egon/data/datasets/osm_buildings_streets/__init__.py b/src/egon/data/datasets/osm_buildings_streets/__init__.py index 862bc6d64..0b5aa3439 100644 --- a/src/egon/data/datasets/osm_buildings_streets/__init__.py +++ b/src/egon/data/datasets/osm_buildings_streets/__init__.py @@ -220,13 +220,14 @@ def __init__(self, dependencies): "data_quality": [ RowCountValidation( table="boundaries.egon_map_zensus_buildings_filtered", - rule_id="ROW_COUNT.egon_map_zensus_buildings_filtered", - expected_count=28070301 + rule_id="TEST_ROW_COUNT.egon_map_zensus_buildings_filtered", + expected_count={"Schleswig-Holstein":1010387, + "Everything":28070301} ), DataTypeValidation( table="boundaries.egon_map_zensus_buildings_filtered", rule_id="DATA_MULTIPLE_TYPES.egon_map_zensus_buildings_filtered", - column_types={"id": "integer", "cell_id": "integer"} + column_types={"id": "integer", "grid_id": "character varying", "cell_id": 
"integer"} ), WholeTableNotNullAndNotNaNValidation( table="boundaries.egon_map_zensus_buildings_filtered", @@ -235,7 +236,8 @@ def __init__(self, dependencies): RowCountValidation( table="boundaries.egon_map_zensus_buildings_residential", rule_id="ROW_COUNT.egon_map_zensus_buildings_residential", - expected_count=27477467 + expected_count={"Schleswig-Holstein":989967, + "Everything":27477467} ), DataTypeValidation( table="boundaries.egon_map_zensus_buildings_residential", diff --git a/src/egon/data/datasets/vg250/__init__.py b/src/egon/data/datasets/vg250/__init__.py index 8efc46df7..54b2ac2f4 100644 --- a/src/egon/data/datasets/vg250/__init__.py +++ b/src/egon/data/datasets/vg250/__init__.py @@ -541,17 +541,24 @@ def __init__(self, dependencies): RowCountValidation( table="boundaries.vg250_krs", rule_id="TEST_ROW_COUNT", - expected_count=27 + expected_count={"Schleswig-Holstein":27, "Everything":431} ), DataTypeValidation( table="boundaries.vg250_krs", rule_id="TEST_DATA_MULTIPLE_TYPES", - column_types={"id":"bigint","ade":"bigint", "gf":"bigint", "bsg":"bigint","ars":"text", + column_types={"Schleswig-Holstein":{"id":"bigint","ade":"integer", "gf":"integer", "bsg":"integer","ars":"text", + "ags":"text", "sdv_ars":"text", "gen":"text", "bez":"text","ibz":"integer", + "bem":"text", "nbd":"text", "sn_l":"text", "sn_r":"text", "sn_k":"text", + "sn_v1":"text", "sn_v2":"text", "sn_g":"text", "fk_s3":"text", "nuts":"text", + "ars_0":"text", "ags_0":"text", "wsk":"timestamp without time zone", "debkg_id":"text", "rs":"text", + "sdv_rs":"text", "rs_0":"text", "geometry":"geometry"}, + "Everything":{"id":"bigint","ade":"bigint", "gf":"bigint", "bsg":"bigint","ars":"text", "ags":"text", "sdv_ars":"text", "gen":"text", "bez":"text","ibz":"bigint", "bem":"text", "nbd":"text", "sn_l":"text", "sn_r":"text", "sn_k":"text", "sn_v1":"text", "sn_v2":"text", "sn_g":"text", "fk_s3":"text", "nuts":"text", "ars_0":"text", "ags_0":"text", "wsk":"text", "debkg_id":"text", "rs":"text", 
"sdv_rs":"text", "rs_0":"text", "geometry":"geometry"} + } ), NotNullAndNotNaNValidation( table="boundaries.vg250_krs", From 6faecaa37284403d9fbed7acd4ca0369c4f441c9 Mon Sep 17 00:00:00 2001 From: sarah Date: Mon, 12 Jan 2026 14:55:22 +0100 Subject: [PATCH 30/54] add grid datasets --- src/egon/data/datasets/final_validations.py | 237 +++++++++++++++++++- 1 file changed, 235 insertions(+), 2 deletions(-) diff --git a/src/egon/data/datasets/final_validations.py b/src/egon/data/datasets/final_validations.py index a7e899b0f..c9448f441 100644 --- a/src/egon/data/datasets/final_validations.py +++ b/src/egon/data/datasets/final_validations.py @@ -21,8 +21,15 @@ HeatDemandValidation, ElectricalLoadSectorBreakdown, ) -from egon_validation.rules.formal.array_cardinality_check import ArrayCardinalityValidation -from egon_validation.rules.custom.numeric_aggregation_check import ElectricalLoadAggregationValidation +from egon_validation import ( + ArrayCardinalityValidation, + ElectricalLoadAggregationValidation, + RowCountValidation, + DataTypeValidation, + NotNullAndNotNaNValidation, + WholeTableNotNullAndNotNaNValidation, + ValueSetValidation +) def notasks(): @@ -1007,6 +1014,232 @@ def __init__(self, dependencies): rtol=0.02 # 2% tolerance ), ], + "data-quality": [ + #grid validation + RowCountValidation( + table="grid.egon_etrago_bus", + rule_id="TEST_ROW_COUNT.egon_etrago_bus", + expected_count={"Schleswig-Holstein": 2729, "Everything": 85710} + ), + DataTypeValidation( + table="grid.egon_etrago_bus", + rule_id="TEST_DATA_MULTIPLE_TYPES.egon_etrago_bus", + column_types={ + "scen_name": "character varying", "bus_id": "bigint", "v_nom": "double precision", + "type": "text", "carrier": "text", "v_mag_pu_set": "double precision", + "v_mag_pu_min": "double precision", "v_mag_pu_max": "double precision", + "x": "double precision", "y": "double precision", "geometry": "geometry", "country": "text" + }, + ), + NotNullAndNotNaNValidation( + table="grid.egon_etrago_bus", + 
rule_id="TEST_NOT_NAN.egon_etrago_bus", + columns=[ + "scn_name", "bus_id", "v_nom", "carrier", "v_mag_pu_min", "v_mag_pu_max", "x", "y", "geom" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="grid.egon_etrago_bus", + rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_etrago_bus" + ), + ValueSetValidation( + table="grid.egon_etrago_bus", + rule_id="TEST_VALUE_SET_SCENARIO.egon_etrago_bus", + column="scn_name", + expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] + ), + ValueSetValidation( + table="grid.egon_etrago_bus", + rule_id="TEST_VALUE_SET_CARRIER.egon_etrago_bus", + column="carrier", + expected_values=[ + "rural_heat", "urban_central_water_tanks", "low_voltage", "CH4", "H2_saltcavern", + "services_rural_heat", "services_rural_water_tanks", "central_heat_store", "AC", "Li_ion", + "H2_grid", "dsm", "urban_central_heat", "residential_rural_heat", "central_heat", + "rural_heat_store", "residential_rural_water_tanks" + ] + ), + RowCountValidation( + table="grid.egon_etrago_generator", + rule_id="TEST_ROW_COUNT.egon_etrago_generator", + expected_count={"Schleswig-Holstein": 2863, "Everything": 40577} + ), + DataTypeValidation( + table="grid.egon_etrago_generator", + rule_id="TEST_DATA_MULTIPLE_TYPES.egon_etrago_generator", + column_types={ + "scen_name": "character varying", "generator_id": "bigint", "control": "text", + "type": "text", "carrier": "text", "p_nom": "double precision", "p_nom_extendable": "boolean", + "p_nom_min": "double precision", "p_nom_max": "double precision", "p_min_pu": "double precision", + "p_max_pu": "double precision", "p_set": "double precision", "q_set": "double precision", + "sign": "double precision", "marginal_cost": "double precision", "build_year": "bigint", + "lifetime": "double precision", "capital_cost": "double precision", "efficiency": "double precision", + "commitable": "boolean", "start_up_cost": "double precision", "shut_down_cost": "double precision", + "min_up_time": "bigint", "min_down_time": "bigint", 
"up_time_before": "bigint", "down_time_before": "bigint", + "ramp_limit_up": "double precision", "ramp_limit_down": "double precision", + "ramp_limit_start_up": "double precision", "ramp_limit_shut_down": "double precision", + "e_nom_max": "double precision" + }, + ), + NotNullAndNotNaNValidation( + table="grid.egon_etrago_generator", + rule_id="TEST_NOT_NAN.egon_etrago_generator", + columns=[ + "scn_name", "generator_id", "bus", "control", "type", "carrier", "p_nom", "p_nom_extendable", + "p_nom_min", "p_nom_max", "p_min_pu", "p_max_pu", "sign", "marginal_cost", "build_year", + "lifetime", "capital_cost", "efficiency", "committable", "start_up_cost", "shut_down_cost", + "min_up_time", "min_down_time", "up_time_before", "down_time_before", "ramp_limit_start_up", + "ramp_limit_shut_down", "e_nom_max" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="grid.egon_etrago_generator", + rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_etrago_generator" + ), + ValueSetValidation( + table="grid.egon_etrago_generator", + rule_id="TEST_VALUE_SET_SCENARIO.egon_etrago_generator", + column="scn_name", + expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] + ), + ValueSetValidation( + table="grid.egon_egon_etrago_generator", + rule_id="TEST_VALUE_SET_CARRIER.egon_etrago_generator", + column="carrier", + expected_values=[ + "CH4", "others", "central_biomass_CHP", "wind_onshore", "lignite", "geo_thermal", "solar", + "reservoir", "services_rural_solar_thermal_collector", "residential_rural_solar_thermal_collector", + "industrial_biomass_CHP", "biomass", "urban_central_solar_thermal_collector", "run_of_river", + "oil", "central_biomass_CHP_heat", "nuclear", "coal", "solar_thermal_collector", "solar_rooftop", + "wind_offshore" + ] + ), + RowCountValidation( + table="grid.egon_etrago_generator_timeseries", + rule_id="TEST_ROW_COUNT.egon_etrago_generator_timeseries", + expected_count={"Schleswig-Holstein": 1929, "Everything": 28651} + ), + DataTypeValidation( + 
table="grid.egon_etrago_generator_timeseries", + rule_id="TEST_DATA_MULTIPLE_TYPES.egon_etrago_generator_timeseries", + column_types={ + "scn_name": "character varying", "generator_id": "integer", "temp_id": "integer", + "p_set": "double precision[]", "q_set": "double precision[]", "p_min_pu": "double_precision []", + "p_max_pu": "double precision []", "marginal_cost": "double precision[]" + }, + ), + NotNullAndNotNaNValidation( + table="grid.egon_etrago_generator_timeseries", + rule_id="TEST_NOT_NAN.egon_etrago_generator_timeseries", + columns=[ + "scn_name", "generator_id", "temp_id", "p_max_pu" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="grid.egon_etrago_generator_timeseries", + rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_etrago_generator_timeseries" + ), + ValueSetValidation( + table="grid.egon_etrago_generator_timeseries", + rule_id="TEST_VALUE_SET_SCENARIO.egon_etrago_generator_timeseries", + column="scn_name", + expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] + ), + RowCountValidation( + table="grid.egon_etrago_line", + rule_id="TEST_ROW_COUNT.egon_etrago_line", + expected_count={"Schleswig-Holstein": 1197, "Everything": 69901} + ), + DataTypeValidation( + table="grid.egon_etrago_line", + rule_id="TEST_DATA_MULTIPLE_TYPES.egon_etrago_line", + column_types={ + "scn_name": "character varying", "line_id": "bigint", "bus0": "bigint", "bus1": "bigint", + "type": "text", "carrier": "text", "x": "numeric", "r": "numeric", "g": "numeric", "b": "numeric", + "s_nom": "numeric", "s_nom_extendable": "boolean", "s_nom_min": "double precision", + "s_nom_max": "double precision", "s_max_pu": "double precision", "build_year": "bigint", + "lifetime": "double precision", "capital_cost": "double precision", "length": "double precision", + "cables": "integer", "terrain_factor": "double precision", "num_parallel": "double precision", + "v_ang_min": "double precision", "v_ang_max": "double precision", "v_nom": "double precision", + "geom": "geometry", 
"topo": "geometry" + }, + ), + NotNullAndNotNaNValidation( + table="grid.egon_etrago_line", + rule_id="TEST_NOT_NAN.egon_etrago_line", + columns=[ + "scn_name", "line_id", "bus0", "bus1", "carrier", "x", "r", "g", "b", "s_nom", + "s_nom_extendable", "s_nom_min", "s_nom_max", "s_max_pu", "build_year", "lifetime", + "capital_cost", "length", "cables", "terrain_factor", "num_parallel", "v_ang_min", + "v_ang_max", "v_nom", "geom", "topo", + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="grid.egon_etrago_line", + rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_etrago_line" + ), + ValueSetValidation( + table="grid.egon_etrago_line", + rule_id="TEST_VALUE_SET_SCENARIO.egon_etrago_line", + column="scn_name", + expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] + ), + ValueSetValidation( + table="grid.egon_egon_etrago_line", + rule_id="TEST_VALUE_SET_CARRIER.egon_etrago_line", + column="carrier", + expected_values=["AC"] + ), + RowCountValidation( + table="grid.egon_etrago_line_timeseries", + rule_id="TEST_ROW_COUNT.egon_etrago_line_timeseries", + expected_count={"Schleswig-Holstein": 1197, "Everything": 69714} + ), + DataTypeValidation( + table="grid.egon_etrago_line", + rule_id="TEST_DATA_MULTIPLE_TYPES.egon_etrago_line_timeseries", + column_types={ + "scn_name": "character varying", "line_id": "bigint", "bus0": "bigint", "bus1": "bigint", + "type": "text", "carrier": "text", "x": "numeric", "r": "numeric", "g": "numeric", + "b": "numeric", + "s_nom": "numeric", "s_nom_extendable": "boolean", "s_nom_min": "double precision", + "s_nom_max": "double precision", "s_max_pu": "double precision", "build_year": "bigint", + "lifetime": "double precision", "capital_cost": "double precision", + "length": "double precision", + "cables": "integer", "terrain_factor": "double precision", + "num_parallel": "double precision", + "v_ang_min": "double precision", "v_ang_max": "double precision", + "v_nom": "double precision", + "geom": "geometry", "topo": "geometry" + }, + ), + 
NotNullAndNotNaNValidation( + table="grid.egon_etrago_line_timeseries", + rule_id="TEST_NOT_NAN.egon_etrago_line_timeseries", + columns=[ + "scn_name", "line_id", "bus0", "bus1", "carrier", "x", "r", "g", "b", "s_nom", + "s_nom_extendable", "s_nom_min", "s_nom_max", "s_max_pu", "build_year", "lifetime", + "capital_cost", "length", "cables", "terrain_factor", "num_parallel", "v_ang_min", + "v_ang_max", "v_nom", "geom", "topo", + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="grid.egon_etrago_generator_line_timeseries", + rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_etrago_line_timeseries" + ), + ValueSetValidation( + table="grid.egon_etrago_line_timeseries", + rule_id="TEST_VALUE_SET_SCENARIO.egon_etrago_line_timeseries", + column="scn_name", + expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] + ), + ValueSetValidation( + table="grid.egon_egon_etrago_line_timeseries", + rule_id="TEST_VALUE_SET_CARRIER.egon_etrago_line_timeseries", + column="carrier", + expected_values=["AC"] + ), + ] }, on_validation_failure="continue" # Continue pipeline even if validations fail ) From 0116467ef3d53c02b2bbb5c29f02fabdb93e2c17 Mon Sep 17 00:00:00 2001 From: sarah Date: Tue, 13 Jan 2026 12:00:17 +0100 Subject: [PATCH 31/54] finalize grid datasets --- src/egon/data/datasets/final_validations.py | 504 +++++++++++++++++++- 1 file changed, 497 insertions(+), 7 deletions(-) diff --git a/src/egon/data/datasets/final_validations.py b/src/egon/data/datasets/final_validations.py index c9448f441..0589994b9 100644 --- a/src/egon/data/datasets/final_validations.py +++ b/src/egon/data/datasets/final_validations.py @@ -1103,7 +1103,7 @@ def __init__(self, dependencies): expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] ), ValueSetValidation( - table="grid.egon_egon_etrago_generator", + table="grid.egon_etrago_generator", rule_id="TEST_VALUE_SET_CARRIER.egon_etrago_generator", column="carrier", expected_values=[ @@ -1124,8 +1124,8 @@ def __init__(self, dependencies): 
rule_id="TEST_DATA_MULTIPLE_TYPES.egon_etrago_generator_timeseries", column_types={ "scn_name": "character varying", "generator_id": "integer", "temp_id": "integer", - "p_set": "double precision[]", "q_set": "double precision[]", "p_min_pu": "double_precision []", - "p_max_pu": "double precision []", "marginal_cost": "double precision[]" + "p_set": "double precision[]", "q_set": "double precision[]", "p_min_pu": "double precision[]", + "p_max_pu": "double precision[]", "marginal_cost": "double precision[]" }, ), NotNullAndNotNaNValidation( @@ -1185,7 +1185,7 @@ def __init__(self, dependencies): expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] ), ValueSetValidation( - table="grid.egon_egon_etrago_line", + table="grid.egon_etrago_line", rule_id="TEST_VALUE_SET_CARRIER.egon_etrago_line", column="carrier", expected_values=["AC"] @@ -1196,7 +1196,7 @@ def __init__(self, dependencies): expected_count={"Schleswig-Holstein": 1197, "Everything": 69714} ), DataTypeValidation( - table="grid.egon_etrago_line", + table="grid.egon_etrago_line_timeseries", rule_id="TEST_DATA_MULTIPLE_TYPES.egon_etrago_line_timeseries", column_types={ "scn_name": "character varying", "line_id": "bigint", "bus0": "bigint", "bus1": "bigint", @@ -1224,7 +1224,7 @@ def __init__(self, dependencies): ] ), WholeTableNotNullAndNotNaNValidation( - table="grid.egon_etrago_generator_line_timeseries", + table="grid.egon_etrago_line_timeseries", rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_etrago_line_timeseries" ), ValueSetValidation( @@ -1234,11 +1234,501 @@ def __init__(self, dependencies): expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] ), ValueSetValidation( - table="grid.egon_egon_etrago_line_timeseries", + table="grid.egon_etrago_line_timeseries", rule_id="TEST_VALUE_SET_CARRIER.egon_etrago_line_timeseries", column="carrier", expected_values=["AC"] ), + RowCountValidation( + table="grid.egon_etrago_link", + rule_id="TEST_ROW_COUNT.egon_etrago_link", + 
expected_count={"Schleswig-Holstein": 15496, "Everything": 83980} + ), + DataTypeValidation( + table="grid.egon_etrago_link", + rule_id="TEST_DATA_MULTIPLE_TYPES.egon_etrago_link", + column_types={ + "scn_name": "character varying", "link_id": "bigint", "bus0": "bigint", "bus1": "bigint", + "type": "text", "carrier": "text", "efficiency": "double precision", "build_year": "bigint", + "lifetime": "double precision", "p_nom": "numeric", "p_nom_extendable": "boolean", + "p_nom_min": "double precision", "p_nom_max": "double precision", "p_min_pu": "double precision", + "p_max_pu": "double precision", "p_set": "double precision", "capital_cost": "double precision", + "marginal_cost": "double precision", "length": "double precision", + "terrain_factor": "double precision", "geom": "geometry", "topo": "geometry", + }, + ), + NotNullAndNotNaNValidation( + table="grid.egon_etrago_link", + rule_id="TEST_NOT_NAN.egon_etrago_link", + columns=[ + "scn_name", "link_id", "bus0", "bus1", "carrier", "efficiency", "build_year", "p_nom", + "p_nom_extendable", "p_nom_min", "p_nom_max", "p_min_pu", "p_max_pu", "p_set", + "capital_cost", "marginal_cost", "length", "terrain_factor", "geom", "topo" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="grid.egon_etrago_link", + rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_etrago_link" + ), + ValueSetValidation( + table="grid.egon_etrago_link", + rule_id="TEST_VALUE_SET_SCENARIO.egon_etrago_link", + column="scn_name", + expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] + ), + ValueSetValidation( + table="grid.egon_etrago_link", + rule_id="TEST_VALUE_SET_CARRIER.egon_etrago_link", + column="carrier", + expected_values=[ + "industrial_gas_CHP", "residential_rural_water_tanks_discharger", "BEV_charger", "CH4", + "power_to_H2", "urban_central_gas_CHP", "rural_heat_store_discharger", "H2_gridextension", + "urban_central_gas_CHP_CC", "dsm", "services_rural_water_tanks_charger", "H2_to_CH4", + "rural_heat_store_charger", "DC", 
"central_gas_boiler", "H2_feedin", "H2_retrofit", "OCGT", + "central_gas_CHP_heat", "residential_rural_water_tanks_charger", "central_heat_pump", + "services_rural_ground_heat_pump", "rural_heat_pump", "CH4_to_H2", "central_resistive_heater", + "urban_central_air_heat_pump", "urban_central_water_tanks_discharger", + "urban_central_water_tanks_charger", "services_rural_water_tanks_discharger", + "electricity_distribution_grid", "central_heat_store_discharger", "H2_to_power", + "central_heat_store_charger", "central_gas_CHP", "residential_rural_ground_heat_pump"] + ), + RowCountValidation( + table="grid.egon_etrago_link_timeseries", + rule_id="TEST_ROW_COUNT.egon_etrago_link_timeseries", + expected_count={"Schleswig-Holstein": 947, "Everything": 25729} + ), + DataTypeValidation( + table="grid.egon_etrago_link_timeseries", + rule_id="TEST_DATA_MULTIPLE_TYPES.egon_etrago_link_timeseries", + column_types={ + "scn_name": "character varying", + "link_id": "bigint", + "temp_id": "integer", + "p_set": "double precision[]", + "p_min_pu": "double precision[]", + "p_max_pu": "double precision[]", + "efficiency": "double precision[]", + "marginal_cost": "double precision[]" + }, + ), + NotNullAndNotNaNValidation( + table="grid.egon_etrago_link_timeseries", + rule_id="TEST_NOT_NAN.egon_etrago_link_timeseries", + columns=[ + "scn_name", "link_id", "temp_id", "p_set", "p_min_pu", "p_max_pu", "efficiency", + "marginal_cost" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="grid.egon_etrago_link_timeseries", + rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_etrago_link_timeseries" + ), + ValueSetValidation( + table="grid.egon_etrago_link_timeseries", + rule_id="TEST_VALUE_SET_SCENARIO.egon_etrago_link_timeseries", + column="scn_name", + expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] + ), + RowCountValidation( + table="grid.egon_etrago_load", + rule_id="TEST_ROW_COUNT.egon_etrago_load", + expected_count={"Schleswig-Holstein": 3202, "Everything": 44019} + ), + 
DataTypeValidation( + table="grid.egon_etrago_load", + rule_id="TEST_DATA_MULTIPLE_TYPES.egon_etrago_load", + column_types={ + "scn_name": "character varying", + "load_id": "bigint", + "bus": "bigint", + "type": "text", + "carrier": "text", + "p_set": "double precision", + "q_set": "double precision", + "sign": "double precision" + }, + ), + NotNullAndNotNaNValidation( + table="grid.egon_etrago_load", + rule_id="TEST_NOT_NAN.egon_etrago_load", + columns=[ + "scn_name", "load_id", "bus", "type", "carrier", "p_set", "q_set", "sign" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="grid.egon_etrago_load", + rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_etrago_load" + ), + ValueSetValidation( + table="grid.egon_etrago_load", + rule_id="TEST_VALUE_SET_SCENARIO.egon_etrago_load", + column="scn_name", + expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] + ), + ValueSetValidation( + table="grid.egon_etrago_load", + rule_id="TEST_VALUE_SET_CARRIER.egon_etrago_load", + column="carrier", + expected_values=[ + "CH4", "H2_for_industry", "services_rural_heat", "H2_system_boundary", "AC", + "urban_central_heat", "residential_rural_heat", "low-temperature_heat_for_industry", + "CH4_for_industry", "central_heat", "CH4_system_boundary", "land_transport_EV", + "H2_hgv_load", "rural_gas_boiler", "rural_heat" + ] + ), + RowCountValidation( + table="grid.egon_etrago_load_timeseries", + rule_id="TEST_ROW_COUNT.egon_etrago_load_timeseries", + expected_count={"Schleswig-Holstein": 3176, "Everything": 44013} + ), + DataTypeValidation( + table="grid.egon_etrago_load_timeseries", + rule_id="TEST_DATA_MULTIPLE_TYPES.egon_etrago_load_timeseries", + column_types={ + "scn_name": "character varying", + "load_id": "bigint", + "temp_id": "integer", + "p_set": "double precision[]", + "q_set": "double precision[]" + }, + ), + NotNullAndNotNaNValidation( + table="grid.egon_etrago_load_timeseries", + rule_id="TEST_NOT_NAN.egon_etrago_load_timeseries", + columns=[ + "scn_name", "load_id", 
"temp_id", "p_set", "q_set" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="grid.egon_etrago_load_timeseries", + rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_etrago_load_timeseries" + ), + ValueSetValidation( + table="grid.egon_etrago_load_timeseries", + rule_id="TEST_VALUE_SET_SCENARIO.egon_etrago_load_timeseries", + column="scn_name", + expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] + ), + RowCountValidation( + table="grid.egon_etrago_storage", + rule_id="TEST_ROW_COUNT.egon_etrago_storage", + expected_count={"Schleswig-Holstein": 418, "Everything": 13044} + ), + DataTypeValidation( + table="grid.egon_etrago_storage", + rule_id="TEST_DATA_MULTIPLE_TYPES.egon_etrago_storage", + column_types={ + "scn_name": "character varying", + "storage_id": "bigint", + "bus": "bigint", + "control": "text", + "type": "text", + "carrier": "text", + "p_nom": "double precision", + "p_nom_extendable": "boolean", + "p_nom_min": "double precision", + "p_nom_max": "double precision", + "p_min_pu": "double precision", + "p_max_pu": "double precision", + "p_set": "double precision", + "q_set": "double precision", + "sign": "double precision", + "marginal_cost": "double precision", + "capital_cost": "double precision", + "build_year": "bigint", + "lifetime": "double precision", + "state_of_charge_initial": "double precision", + "cyclic_state_of_charge": "boolean", + "state_of_charge_set": "double precision", + "max_hours": "double precision", + "efficiency_store": "double precision", + "efficiency_dispatch": "double precision", + "standing_loss": "double precision", + "inflow": "double precision" + }, + ), + NotNullAndNotNaNValidation( + table="grid.egon_etrago_storage", + rule_id="TEST_NOT_NAN.egon_etrago_storage", + columns=[ + "scn_name", "storage_id", "bus", "control", "type", "carrier", "p_nom", + "p_nom_extendable", "p_nom_min", "p_nom_max", "p_min_pu", "p_max_pu", "p_set", + "q_set", "sign", "marginal_cost", "capital_cost", "build_year", "lifetime", + 
"state_of_charge_initial", "cyclic_state_of_charge", "state_of_charge_set", + "max_hours", "efficiency_store", "efficiency_dispatch", "standing_loss", "inflow" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="grid.egon_etrago_storage", + rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_etrago_storage" + ), + ValueSetValidation( + table="grid.egon_etrago_storage", + rule_id="TEST_VALUE_SET_SCENARIO.egon_etrago_storage", + column="scn_name", + expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] + ), + ValueSetValidation( + table="grid.egon_etrago_storage", + rule_id="TEST_VALUE_SET_CARRIER.egon_etrago_storage", + column="carrier", + expected_values=["battery", "home_battery", "pumped_hydro", "reservoir"] + ), + RowCountValidation( + table="grid.egon_etrago_storage_timeseries", + rule_id="TEST_ROW_COUNT.egon_etrago_storage_timeseries", + expected_count={"Schleswig-Holstein": 0, "Everything": 9} + ), + DataTypeValidation( + table="grid.egon_etrago_storage_timeseries", + rule_id="TEST_DATA_MULTIPLE_TYPES.egon_etrago_storage_timeseries", + column_types={ + "scn_name": "character varying", + "storage_id": "bigint", + "temp_id": "integer", + "p_set": "double precision[]", + "q_set": "double precision[]", + "p_min_pu": "double precision[]", + "p_max_pu": "double precision[]", + "state_of_charge_set": "double precision[]", + "inflow": "double precision[]", + "marginal_cost": "double precision[]" + }, + ), + NotNullAndNotNaNValidation( + table="grid.egon_etrago_storage_timeseries", + rule_id="TEST_NOT_NAN.egon_etrago_storage_timeseries", + columns=[ + "scn_name", "storage_id", "temp_id", "inflow", "marginal_cost" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="grid.egon_etrago_storage_timeseries", + rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_etrago_storage_timeseries" + ), + ValueSetValidation( + table="grid.egon_etrago_storage_timeseries", + rule_id="TEST_VALUE_SET_SCENARIO.egon_etrago_storage_timeseries", + column="scn_name", + expected_values=["eGon100RE"] + 
), + RowCountValidation( + table="grid.egon_etrago_store", + rule_id="TEST_ROW_COUNT.egon_etrago_store", + expected_count={"Schleswig-Holstein": 2913, "Everything": 26520} + ), + DataTypeValidation( + table="grid.egon_etrago_store", + rule_id="TEST_DATA_MULTIPLE_TYPES.egon_etrago_store", + column_types={ + "scn_name": "character varying", + "store_id": "bigint", + "bus": "bigint", + "type": "text", + "carrier": "text", + "e_nom": "double precision", + "e_nom_extendable": "boolean", + "e_nom_min": "double precision", + "e_nom_max": "double precision", + "e_min_pu": "double precision", + "e_max_pu": "double precision", + "p_set": "double precision", + "q_set": "double precision", + "e_initial": "double precision", + "e_cyclic": "boolean", + "sign": "double precision", + "marginal_cost": "double precision", + "capital_cost": "double precision", + "standing_loss": "double precision", + "build_year": "bigint", + "lifetime": "double precision" + }, + ), + NotNullAndNotNaNValidation( + table="grid.egon_etrago_store", + rule_id="TEST_NOT_NAN.egon_etrago_store", + columns=[ + "scn_name", "store_id", "bus", "type", "carrier", "e_nom", "e_nom_extendable", + "e_nom_min", "e_nom_max", "e_min_pu", "e_max_pu", "p_set", "q_set", "e_initial", + "e_cyclic", "sign", "marginal_cost", "capital_cost", "standing_loss", "build_year", + "lifetime" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="grid.egon_etrago_store", + rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_etrago_store" + ), + ValueSetValidation( + table="grid.egon_etrago_store", + rule_id="TEST_VALUE_SET_SCENARIO.egon_etrago_store", + column="scn_name", + expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] + ), + RowCountValidation( + table="grid.egon_etrago_store_timeseries", + rule_id="TEST_ROW_COUNT.egon_etrago_store_timeseries", + expected_count={"Schleswig-Holstein": 392, "Everything": 15281} + ), + DataTypeValidation( + table="grid.egon_etrago_store_timeseries", + 
rule_id="TEST_DATA_MULTIPLE_TYPES.egon_etrago_store_timeseries", + column_types={ + "scn_name": "character varying", + "store_id": "bigint", + "temp_id": "integer", + "p_set": "double precision[]", + "q_set": "double precision[]", + "e_min_pu": "double precision[]", + "e_max_pu": "double precision[]", + "marginal_cost": "double precision[]" + }, + ), + NotNullAndNotNaNValidation( + table="grid.egon_etrago_store_timeseries", + rule_id="TEST_NOT_NAN.egon_etrago_store_timeseries", + columns=[ + "scn_name", "store_id", "temp_id", "p_set", "q_set", "e_min_pu", "e_max_pu", + "marginal_cost" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="grid.egon_etrago_store_timeseries", + rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_etrago_store_timeseries" + ), + ValueSetValidation( + table="grid.egon_etrago_store_timeseries", + rule_id="TEST_VALUE_SET_SCENARIO.egon_etrago_store_timeseries", + column="scn_name", + expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] + ), + RowCountValidation( + table="grid.egon_etrago_temp_resolution", + rule_id="TEST_ROW_COUNT.egon_etrago_temp_resolution", + expected_count=1 + ), + DataTypeValidation( + table="grid.egon_etrago_temp_resolution", + rule_id="TEST_DATA_MULTIPLE_TYPES.egon_etrago_temp_resolution", + column_types={ + "temp_id": "bigint", + "timesteps": "bigint", + "resolution": "text", + "start_time": "timestamp without time zone" + }, + ), + WholeTableNotNullAndNotNaNValidation( + table="grid.egon_etrago_temp_resolution", + rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_etrago_temp_resolution" + ), + RowCountValidation( + table="grid.egon_etrago_transformer", + rule_id="TEST_ROW_COUNT.egon_etrago_transformer", + expected_count={"Schleswig-Holstein": 31, "Everything": 1545} + ), + DataTypeValidation( + table="grid.egon_etrago_transformer", + rule_id="TEST_DATA_MULTIPLE_TYPES.egon_etrago_transformer", + column_types={ + "scn_name": "character varying", + "store_id": "bigint", + "bus": "bigint", + "type": "text", + "carrier": "text", + 
"e_nom": "double precision", + "e_nom_extendable": "boolean", + "e_nom_min": "double precision", + "e_nom_max": "double precision", + "e_min_pu": "double precision", + "e_max_pu": "double precision", + "p_set": "double precision", + "q_set": "double precision", + "e_initial": "double precision", + "e_cyclic": "boolean", + "sign": "double precision", + "marginal_cost": "double precision", + "capital_cost": "double precision", + "standing_loss": "double precision", + "build_year": "bigint", + "lifetime": "double precision" + }, + ), + NotNullAndNotNaNValidation( + table="grid.egon_etrago_transformer", + rule_id="TEST_NOT_NAN.egon_etrago_transformer", + columns=[ + "scn_name", "store_id", "bus", "type", "carrier", "e_nom", "e_nom_extendable", + "e_nom_min", "e_nom_max", "e_min_pu", "e_max_pu", "p_set", "q_set", "e_initial", + "e_cyclic", "sign", "marginal_cost", "capital_cost", "standing_loss", "build_year", + "lifetime" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="grid.egon_etrago_transformer", + rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_etrago_transformer" + ), + ValueSetValidation( + table="grid.egon_etrago_transformer", + rule_id="TEST_VALUE_SET_SCENARIO.egon_etrago_transformer", + column="scn_name", + expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] + ), + RowCountValidation( + table="grid.egon_hvmv_substation", + rule_id="TEST_ROW_COUNT.hvmv_substation", + expected_count={"Schleswig-Holstein": 200, "Everything": 3854} + ), + DataTypeValidation( + table="grid.egon_hvmv_substation", + rule_id="TEST_DATA_MULTIPLE_TYPES.egon_hvmv_substation", + column_types={ + "bus_id": "integer", + "lon": "double precision", + "lat": "double precision", + "point": "geometry", + "polygon": "geometry", + "voltage": "text", + "power_type": "text", + "substation": "text", + "osm_id": "text", + "osm_www": "text", + "frequency": "text", + "subst_name": "text", + "ref": "text", + "operator": "text", + "dbahn": "text", + "status": "integer" + }, + ), + 
NotNullAndNotNaNValidation( + table="grid.egon_hvmv_substation", + rule_id="TEST_NOT_NAN.egon_hvmv_substation", + columns=[ + "bus_id", "lon", "lat", "point", "polygon", "voltage", "power_type", "substation", + "osm_id", "osm_www", "frequency", "subst_name", "ref", "operator", "dbahn", "status" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="grid.egon_hvmv_substation", + rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_hvmv_substation" + ), + RowCountValidation( + table="grid.egon_mv_grid_district", + rule_id="TEST_ROW_COUNT.egon_mv_grid_district", + expected_count={"Schleswig-Holstein": 200, "Everything": 3854} + ), + DataTypeValidation( + table="grid.egon_mv_grid_district", + rule_id="TEST_DATA_MULTIPLE_TYPES.egon_mv_grid_district", + column_types={ + "bus_id": "integer", + "geom": "geometry", + "area": "double precision" + }, + ), + WholeTableNotNullAndNotNaNValidation( + table="grid.egon_mv_grid_district", + rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_mv_grid_district" + ), ] }, on_validation_failure="continue" # Continue pipeline even if validations fail From 09ffbdf1d119434654fee64a792ca40a33faa6a3 Mon Sep 17 00:00:00 2001 From: sarah Date: Tue, 13 Jan 2026 14:25:00 +0100 Subject: [PATCH 32/54] add openstreetmap datasets --- .../osm_buildings_streets/__init__.py | 172 +++++++++++++++++- 1 file changed, 171 insertions(+), 1 deletion(-) diff --git a/src/egon/data/datasets/osm_buildings_streets/__init__.py b/src/egon/data/datasets/osm_buildings_streets/__init__.py index 0b5aa3439..034a9526f 100644 --- a/src/egon/data/datasets/osm_buildings_streets/__init__.py +++ b/src/egon/data/datasets/osm_buildings_streets/__init__.py @@ -247,7 +247,177 @@ def __init__(self, dependencies): WholeTableNotNullAndNotNaNValidation( table="boundaries.egon_map_zensus_buildings_residential", rule_id="WHOLE_TABLE_NOT_NAN.egon_map_zensus_buildings_residential" - ) + ), + RowCountValidation( + table="openstreetmap.osm_amenities_not_in_buildings", + 
rule_id="ROW_COUNT.osm_amenities_not_in_buildings", + expected_count={"Schleswig-Holstein": 3142, + "Everything": 79928} + ), + DataTypeValidation( + table="openstreetmap.osm_amenities_not_in_buildings", + rule_id="DATA_MULTIPLE_TYPES.osm_amenities_not_in_buildings", + column_types={ + "osm_id": "bigint", "amenity": "text", "name": "text", "geom_amenity": "geometry", + "tags": "hstore", "egon_amenity_id": "integer" } + ), + WholeTableNotNullAndNotNaNValidation( + table="openstreetmap.osm_amenities_not_in_buildings", + rule_id="WHOLE_TABLE_NOT_NAN.osm_amenities_not_in_buildings" + ), + RowCountValidation( + table="openstreetmap.osm_amenities_shops_filtered", + rule_id="ROW_COUNT.osm_amenities_shops_filtered", + expected_count={"Schleswig-Holstein": 27438, "Everything": 700315} + ), + DataTypeValidation( + table="openstreetmap.osm_amenities_shops_filtered", + rule_id="DATA_MULTIPLE_TYPES.osm_amenities_shops_filtered", + column_types={ + "osm_id": "bigint", "amenity": "text", "name": "text", "geom_amenity": "geometry", + "tags": "hstore", "egon_amenity_id": "integer"} + ), + WholeTableNotNullAndNotNaNValidation( + table="openstreetmap.osm_amenities_shops_filtered", + rule_id="WHOLE_TABLE_NOT_NAN.osm_amenities_shops_filtered" + ), + RowCountValidation( + table="openstreetmap.osm_buildings", + rule_id="ROW_COUNT.osm_buildings", + expected_count={"Schleswig-Holstein": 1298230, "Everything": 34328483} + ), + DataTypeValidation( + table="openstreetmap.osm_buildings", + rule_id="DATA_MULTIPLE_TYPES.osm_buildings", + column_types={ + "osm_id": "bigint", "amenity": "text", "building": "text", "name": "text", + "geom_building": "geometry", "area": "double precision", "geom_point": "geometry", + "tags": "hstore", "id": "integer"} + ), + WholeTableNotNullAndNotNaNValidation( + table="openstreetmap.osm_buildings", + rule_id="WHOLE_TABLE_NOT_NAN.osm_buildings" + ), + RowCountValidation( + table="openstreetmap.osm_buildings_filtered", + rule_id="ROW_COUNT.osm_buildings_filtered", + 
expected_count={"Schleswig-Holstein": 1169881, "Everything": 31619905} + ), + DataTypeValidation( + table="openstreetmap.osm_buildings_filtered", + rule_id="DATA_MULTIPLE_TYPES.osm_buildings_filtered", + column_types={ + "osm_id": "bigint", "amenity": "text", "building": "text", "name": "text", + "geom_building": "geometry", "area": "double precision", "geom_point": "geometry", + "tags": "hstore", "id": "integer"} + ), + WholeTableNotNullAndNotNaNValidation( + table="openstreetmap.osm_buildings_filtered", + rule_id="WHOLE_TABLE_NOT_NAN.osm_buildings_filtered" + ), + RowCountValidation( + table="openstreetmap.osm_buildings_residential", + rule_id="ROW_COUNT.osm_buildings_residential", + expected_count={"Schleswig-Holstein": 1130929, "Everything": 30713011} + ), + DataTypeValidation( + table="openstreetmap.osm_buildings_residential", + rule_id="DATA_MULTIPLE_TYPES.osm_buildings_residential", + column_types={ + "osm_id": "bigint", "amenity": "text", "building": "text", "name": "text", + "geom_building": "geometry", "area": "double precision", "geom_point": "geometry", + "tags": "hstore", "id": "integer"} + ), + WholeTableNotNullAndNotNaNValidation( + table="openstreetmap.osm_buildings_residential", + rule_id="WHOLE_TABLE_NOT_NAN.osm_buildings_residential" + ), + RowCountValidation( + table="openstreetmap.osm_buildings_synthetic", + rule_id="ROW_COUNT.osm_buildings_synthetic", + expected_count={"Schleswig-Holstein": 9498, "Everything": 706911} + ), + DataTypeValidation( + table="openstreetmap.osm_buildings_synthetic", + rule_id="DATA_MULTIPLE_TYPES.osm_buildings_synthetic", + column_types={ + "id": "character varying", "cell_id": "character varying", "geom_building": "geometry", + "geom_point": "geometry", "n_amenities_inside": "integer", "building": "character varying", + "area": "real"} + ), + WholeTableNotNullAndNotNaNValidation( + table="openstreetmap.osm_buildings_synthetic", + rule_id="WHOLE_TABLE_NOT_NAN.osm_buildings_synthetic" + ), + RowCountValidation( + 
table="openstreetmap.osm_buildings_with_amenities", + rule_id="ROW_COUNT.osm_buildings_with_amenities", + expected_count={"Schleswig-Holstein": 24314, "Everything": 621385} + ), + DataTypeValidation( + table="openstreetmap.osm_buildings_with_amenities", + rule_id="DATA_MULTIPLE_TYPES.osm_buildings_with_amenities", + column_types={ + "osm_id_amenity": "bigint", + "osm_id_building": "bigint", + "id": "integer", + "building": "text", + "area": "double precision", + "geom_building": "geometry", + "geom_amenity": "geometry", + "geom_point": "geometry", + "name": "text", + "tags_building": "hstore", + "tags_amenity": "hstore", + "n_amenities_inside": "bigint", + "apartment_count": "numeric"} + ), + WholeTableNotNullAndNotNaNValidation( + table="openstreetmap.osm_buildings_with_amenities", + rule_id="WHOLE_TABLE_NOT_NAN.osm_buildings_with_amenities" + ), + RowCountValidation( + table="openstreetmap.osm_buildings_without_amenities", + rule_id="ROW_COUNT.osm_buildings_without_amenities", + expected_count={"Schleswig-Holstein": 1152146, "Everything": 31151277} + ), + DataTypeValidation( + table="openstreetmap.osm_buildings_without_amenities", + rule_id="DATA_MULTIPLE_TYPES.osm_buildings_without_amenities", + column_types={ + "osm_id": "bigint", + "id": "integer", + "building": "text", + "area": "double precision", + "geom_building": "geometry", + "geom_point": "geometry", + "name": "text", + "tags": "hstore", + "apartment_count": "numeric"} + ), + WholeTableNotNullAndNotNaNValidation( + table="openstreetmap.osm_buildings_without_amenities", + rule_id="WHOLE_TABLE_NOT_NAN.osm_buildings_without_amenities" + ), + RowCountValidation( + table="openstreetmap.osm_ways_with_segments", + rule_id="ROW_COUNT.osm_ways_with_segments", + expected_count={"Schleswig-Holstein": 263427, "Everything": 6716196} + ), + DataTypeValidation( + table="openstreetmap.osm_ways_with_segments", + rule_id="DATA_MULTIPLE_TYPES.osm_ways_with_segments", + column_types={ + "osm_id": "bigint", + "nodes": 
"bigint[]", + "highway": "text", + "geom": "geometry", + "length_segments": "double precision[]"} + ), + WholeTableNotNullAndNotNaNValidation( + table="openstreetmap.osm_ways_with_segments", + rule_id="WHOLE_TABLE_NOT_NAN.osm_ways_with_segments" + ), ] }, on_validation_failure="continue" From 5deafe1939bd5883e4aaa92cef656bb094798f3e Mon Sep 17 00:00:00 2001 From: sarah Date: Tue, 13 Jan 2026 14:25:48 +0100 Subject: [PATCH 33/54] add scenario dataset --- .../datasets/scenario_parameters/__init__.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/egon/data/datasets/scenario_parameters/__init__.py b/src/egon/data/datasets/scenario_parameters/__init__.py index ceef011ff..6956dda9e 100755 --- a/src/egon/data/datasets/scenario_parameters/__init__.py +++ b/src/egon/data/datasets/scenario_parameters/__init__.py @@ -17,6 +17,12 @@ import egon.data.config import egon.data.datasets.scenario_parameters.parameters as parameters +from egon_validation import ( + RowCountValidation, + DataTypeValidation, + WholeTableNotNullAndNotNaNValidation +) + Base = declarative_base() @@ -314,4 +320,26 @@ def __init__(self, dependencies): download_pypsa_technology_data, insert_scenarios, ), + validation={ + "data-quality": [ + RowCountValidation( + table="scenario.egon_scenario_parameters", + rule_id="ROW_COUNT.egon_scenario_parameters", + expected_count={"Schleswig-Holstein": 5, "Everything": 3} + ), + DataTypeValidation( + table="scenario.egon_scenario_parameters", + rule_id="DATA_MULTIPLE_TYPES.egon_scenario_parameters", + column_types={ + "name": "character varying", "global_parameters": "jsonb", "electricity_parameters": "jsonb", + "gas_parameters": "jsonb", "heat_parameters": "jsonb", "mobility_parameters": "jsonb", + "description": "character varying"} + ), + WholeTableNotNullAndNotNaNValidation( + table="scenario.egon_scenario_parameters", + rule_id="WHOLE_TABLE_NOT_NAN.egon_scenario_parameters" + ) + ] + }, + on_validation_failure = "continue" ) From 
2e0411206df59921069428c20248934cd165cdc1 Mon Sep 17 00:00:00 2001 From: sarah Date: Tue, 13 Jan 2026 14:56:18 +0100 Subject: [PATCH 34/54] add society datasets --- src/egon/data/datasets/society_prognosis.py | 51 ++++++ src/egon/data/datasets/vg250/__init__.py | 34 +++- src/egon/data/datasets/zensus/__init__.py | 184 ++++++++++++++++++++ 3 files changed, 263 insertions(+), 6 deletions(-) diff --git a/src/egon/data/datasets/society_prognosis.py b/src/egon/data/datasets/society_prognosis.py index b0a42e96f..256adf4fa 100755 --- a/src/egon/data/datasets/society_prognosis.py +++ b/src/egon/data/datasets/society_prognosis.py @@ -11,6 +11,13 @@ from egon.data.datasets import Dataset import egon.data.config +from egon_validation import( + RowCountValidation, + DataTypeValidation, + NotNullAndNotNaNValidation, + WholeTableNotNullAndNotNaNValidation +) + # will be later imported from another file ### Base = declarative_base() @@ -22,6 +29,50 @@ def __init__(self, dependencies): version="0.0.1", dependencies=dependencies, tasks=(create_tables, {zensus_population, zensus_household}), + validation={ + "data-quality":[ + RowCountValidation( + table="society.egon_household_prognosis", + rule_id="TEST_ROW_COUNT.egon_household_prognosis", + expected_count={"Everything": 5319490} + ), + DataTypeValidation( + table="society.egon_household_prognosis", + rule_id="TEST_DATA_MULTIPLE_TYPES.egon_household_prognosis", + column_types={"zensus_population_id": "integer", "year": "integer", "households": "double precision"} + ), + NotNullAndNotNaNValidation( + table="society.egon_household_prognosis", + rule_id="TEST_NOT_NAN.egon_household_prognosis", + columns=["zensus_population_id", "year", "households"] + ), + WholeTableNotNullAndNotNaNValidation( + table="society.egon_household_prognosis", + rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_household_prognosis" + ), + RowCountValidation( + table="society.egon_population_prognosis", + rule_id="TEST_ROW_COUNT.egon_population_prognosis", + 
expected_count={"Everything": 6355446} + ), + DataTypeValidation( + table="society.egon_population_prognosis", + rule_id="TEST_DATA_MULTIPLE_TYPES.egon_population_prognosis", + column_types={"zensus_population_id": "integer", "year": "integer", + "population": "double precision"} + ), + NotNullAndNotNaNValidation( + table="society.egon_population_prognosis", + rule_id="TEST_NOT_NAN.egon_population_prognosis", + columns=["zensus_population_id", "year", "population"] + ), + WholeTableNotNullAndNotNaNValidation( + table="society.egon_population_prognosis", + rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_population_prognosis" + ), + ] + }, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/vg250/__init__.py b/src/egon/data/datasets/vg250/__init__.py index 54b2ac2f4..5d54e3bd9 100644 --- a/src/egon/data/datasets/vg250/__init__.py +++ b/src/egon/data/datasets/vg250/__init__.py @@ -540,12 +540,12 @@ def __init__(self, dependencies): "data_quality": [ RowCountValidation( table="boundaries.vg250_krs", - rule_id="TEST_ROW_COUNT", + rule_id="TEST_ROW_COUNT.vg250_krs", expected_count={"Schleswig-Holstein":27, "Everything":431} ), DataTypeValidation( table="boundaries.vg250_krs", - rule_id="TEST_DATA_MULTIPLE_TYPES", + rule_id="TEST_DATA_MULTIPLE_TYPES.vg250_krs", column_types={"Schleswig-Holstein":{"id":"bigint","ade":"integer", "gf":"integer", "bsg":"integer","ars":"text", "ags":"text", "sdv_ars":"text", "gen":"text", "bez":"text","ibz":"integer", "bem":"text", "nbd":"text", "sn_l":"text", "sn_r":"text", "sn_k":"text", @@ -562,19 +562,41 @@ def __init__(self, dependencies): ), NotNullAndNotNaNValidation( table="boundaries.vg250_krs", - rule_id="TEST_NOT_NAN", + rule_id="TEST_NOT_NAN.vg250_krs", columns=["gf","bsg"] ), WholeTableNotNullAndNotNaNValidation( table="boundaries.vg250_krs", - rule_id="TEST_WHOLE_TABLE_NOT_NAN" + rule_id="TEST_WHOLE_TABLE_NOT_NAN.vg250_krs" ), ValueSetValidation( table="boundaries.vg250_krs", - rule_id="TEST_VALUE_SET", + 
rule_id="TEST_VALUE_SET_NBD.vg250_krs", column="nbd", expected_values=["ja", "nein"] - ) + ), + RowCountValidation( + table="society.destatis_zensus_population_per_ha_inside_germany", + rule_id="TEST_ROW_COUNT.destatis_zensus_population_per_ha_inside_germany", + expected_count={"Schleswig-Holstein": 143521, "Everything": 3177723} + ), + DataTypeValidation( + table="society.destatis_zensus_population_per_ha_inside_germany", + rule_id="TEST_DATA_MULTIPLE_TYPES.destatis_zensus_population_per_ha_inside_germany", + column_types={ + "id": "integer", "grid_id": "character varying (254)", "population": "smallint", + "geom_point": "geometry","geom": "geometry" + } + ), + NotNullAndNotNaNValidation( + table="society.destatis_zensus_population_per_ha_inside_germany", + rule_id="TEST_NOT_NAN.destatis_zensus_population_per_ha_inside_germany", + columns=["id", "grid_id", "population", "geom_point", "geom"] + ), + WholeTableNotNullAndNotNaNValidation( + table="society.destatis_zensus_population_per_ha_inside_germany", + rule_id="TEST_WHOLE_TABLE_NOT_NAN.destatis_zensus_population_per_ha_inside_germany" + ), ] }, on_validation_failure="continue" diff --git a/src/egon/data/datasets/zensus/__init__.py b/src/egon/data/datasets/zensus/__init__.py index 3d498a12b..499ef0bbb 100755 --- a/src/egon/data/datasets/zensus/__init__.py +++ b/src/egon/data/datasets/zensus/__init__.py @@ -17,6 +17,13 @@ from egon.data.datasets import Dataset import egon.data.config +from egon_validation import( + RowCountValidation, + DataTypeValidation, + NotNullAndNotNaNValidation, + WholeTableNotNullAndNotNaNValidation +) + class ZensusPopulation(Dataset): def __init__(self, dependencies): @@ -28,6 +35,33 @@ def __init__(self, dependencies): create_zensus_pop_table, population_to_postgres, ), + validation={ + "data-quality":[ + RowCountValidation( + table="society.egon_destatis_zensus_apartment_building_population_per_ha", + rule_id="TEST_ROW_COUNT.egon_destatis_zensus_apartment_building_population_per_ha", + 
expected_count={"Schleswig-Holstein": 145634, "Everything": 3206490} + ), + DataTypeValidation( + table="society.egon_destatis_zensus_apartment_building_population_per_ha", + rule_id="TEST_DATA_MULTIPLE_TYPES.egon_destatis_zensus_apartment_building_population_per_ha", + column_types={ + "grid_id": "character varying", "zensus_population_id": "integer", "building_count": "smallint", + "apartment_count": "smallint", "geom": "geometry", "geom_point": "geometry" + } + ), + NotNullAndNotNaNValidation( + table="society.egon_destatis_zensus_apartment_building_population_per_ha", + rule_id="TEST_NOT_NAN.egon_destatis_zensus_apartment_building_population_per_ha", + columns=["grid_id", "zensus_population_id", "building_count", "apartment_count", "geom", "geom_point"] + ), + WholeTableNotNullAndNotNaNValidation( + table="society.egon_destatis_zensus_apartment_building_population_per_ha", + rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_destatis_zensus_apartment_building_population_per_ha" + ), + ] + }, + on_validation_failure="continue" ) @@ -41,6 +75,156 @@ def __init__(self, dependencies): create_zensus_misc_tables, zensus_misc_to_postgres, ), + validation={ + "data-quality":[ + + RowCountValidation( + table="society.egon_destatis_zensus_apartment_per_ha", + rule_id="TEST_ROW_COUNT.egon_destatis_zensus_apartment_per_ha", + expected_count={"Schleswig-Holstein": 1946300, "Everything": 51095280} + ), + DataTypeValidation( + table="society.egon_destatis_zensus_apartment_per_ha", + rule_id="TEST_DATA_MULTIPLE_TYPES.egon_destatis_zensus_apartment_per_ha", + column_types={ + "id": "integer", "grid_id": "character varying", "grid_id_new": "character varying", + "attribute": "character varying", "characteristics_code": "smallint", + "characteristics_text": "text", "quantity": "smallint", "quantity_q": "smallint", + "zensus_population_id": "integer" + } + ), + NotNullAndNotNaNValidation( + table="society.egon_destatis_zensus_apartment_per_ha", + 
rule_id="TEST_NOT_NAN.egon_destatis_zensus_apartment_per_ha", + columns=[ + "id", "grid_id", "grid_id_new", "attribute", "characteristics_code", "characteristics_text", + "quantity", "quantity_q", "zensus_population_id" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="society.egon_destatis_zensus_apartment_per_ha", + rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_destatis_zensus_apartment_per_ha" + ), + RowCountValidation( + table="society.egon_destatis_zensus_building_per_ha", + rule_id="TEST_ROW_COUNT.egon_destatis_zensus_building_per_ha", + expected_count={"Schleswig-Holstein": 978493, "Everything": 24297136} + ), + DataTypeValidation( + table="society.egon_destatis_zensus_building_per_ha", + rule_id="TEST_DATA_MULTIPLE_TYPES.egon_destatis_zensus_building_per_ha", + column_types={ + "id": "integer", + "grid_id": "character varying", + "grid_id_new": "character varying", + "attribute": "character varying", + "characteristics_code": "smallint", + "characteristics_text": "text", + "quantity": "smallint", + "quantity_q": "smallint", + "zensus_population_id": "integer" + } + ), + NotNullAndNotNaNValidation( + table="society.egon_destatis_zensus_building_per_ha", + rule_id="TEST_NOT_NAN.egon_destatis_zensus_building_per_ha", + columns=[ + "id", + "grid_id", + "grid_id_new", + "attribute", + "characteristics_code", + "characteristics_text", + "quantity", + "quantity_q", + "zensus_population_id" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="society.egon_destatis_zensus_building_per_ha", + rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_destatis_zensus_building_per_ha" + ), + RowCountValidation( + table="society.egon_destatis_zensus_household_per_ha", + rule_id="TEST_ROW_COUNT.egon_destatis_zensus_household_per_ha", + expected_count={"Schleswig-Holstein": 724970, "Everything": 18788917} + ), + DataTypeValidation( + table="society.egon_destatis_zensus_household_per_ha", + rule_id="TEST_DATA_MULTIPLE_TYPES.egon_destatis_zensus_household_per_ha", + column_types={ + 
"id": "integer", + "grid_id": "character varying", + "grid_id_new": "character varying", + "attribute": "character varying", + "characteristics_code": "smallint", + "characteristics_text": "text", + "quantity": "smallint", + "quantity_q": "smallint", + "zensus_population_id": "integer" + } + ), + NotNullAndNotNaNValidation( + table="society.egon_destatis_zensus_household_per_ha", + rule_id="TEST_NOT_NAN.egon_destatis_zensus_household_per_ha", + columns=[ + "id", + "grid_id", + "grid_id_new", + "attribute", + "characteristics_code", + "characteristics_text", + "quantity", + "quantity_q", + "zensus_population_id" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="society.egon_destatis_zensus_household_per_ha", + rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_destatis_zensus_household_per_ha" + ), + RowCountValidation( + table="society.egon_destatis_zensus_household_per_ha_refined", + rule_id="TEST_ROW_COUNT.egon_destatis_zensus_household_per_ha_refined", + expected_count={"Schleswig-Holstein": 551678, "Everything": 13304814} + ), + DataTypeValidation( + table="society.egon_destatis_zensus_household_per_ha_refined", + rule_id="TEST_DATA_MULTIPLE_TYPES.egon_destatis_zensus_household_per_ha_refined", + column_types={ + "id": "integer", + "cell_id": "integer", + "grid_id": "character varying", + "nuts3": "character varying", + "nuts1": "character varying", + "characteristics_code": "integer", + "hh_5types": "integer", + "hh_type": "character", + "hh_10types": "integer" + } + ), + NotNullAndNotNaNValidation( + table="society.egon_destatis_zensus_household_per_ha_refined", + rule_id="TEST_NOT_NAN.egon_destatis_zensus_household_per_ha_refined", + columns=[ + "id", + "cell_id", + "grid_id", + "nuts3", + "nuts1", + "characteristics_code", + "hh_5types", + "hh_type", + "hh_10types" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="society.egon_destatis_zensus_household_per_ha_refined", + rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_destatis_zensus_household_per_ha_refined" + 
), + ] + }, + on_validation_failure="continue" ) From 23a770d18c5d0b5ba2343191d38f26d660516b2c Mon Sep 17 00:00:00 2001 From: sarah Date: Wed, 14 Jan 2026 14:37:00 +0100 Subject: [PATCH 35/54] add supply datasets --- src/egon/data/datasets/chp/__init__.py | 72 +++++++++ src/egon/data/datasets/era5.py | 32 ++++ src/egon/data/datasets/final_validations.py | 1 + .../data/datasets/heat_supply/__init__.py | 107 +++++++++++++ .../data/datasets/power_plants/__init__.py | 72 +++++++++ .../datasets/re_potential_areas/__init__.py | 81 ++++++++++ src/egon/data/datasets/renewable_feedin.py | 43 +++++ src/egon/data/datasets/scenario_capacities.py | 150 ++++++++++++++++++ src/egon/data/datasets/storages/__init__.py | 59 +++++++ 9 files changed, 617 insertions(+) diff --git a/src/egon/data/datasets/chp/__init__.py b/src/egon/data/datasets/chp/__init__.py index ac51ff881..0f2e4fe1f 100644 --- a/src/egon/data/datasets/chp/__init__.py +++ b/src/egon/data/datasets/chp/__init__.py @@ -47,6 +47,14 @@ sources, ) +from egon_validation import( + RowCountValidation, + DataTypeValidation, + NotNullAndNotNaNValidation, + WholeTableNotNullAndNotNaNValidation, + ValueSetValidation +) + Base = declarative_base() @@ -853,4 +861,68 @@ def __init__(self, dependencies): version=self.version, dependencies=dependencies, tasks=tasks, + validation={ + "data-quality":[ + RowCountValidation( + table="supply.egon_chp_plants", + rule_id="TEST_ROW_COUNT.egon_chp_plants", + expected_count={"Schleswig-Holstein": 1720, "Everything": 40197} + ), + DataTypeValidation( + table="supply.egon_chp_plants", + rule_id="TEST_DATA_MULTIPLE_TYPES.egon_chp_plants", + column_types={ + "id": "integer", + "sources": "jsonb", + "source_id": "jsonb", + "carrier": "character varying", + "district_heating": "boolean", + "el_capacity": "double precision", + "th_capacity": "double precision", + "electrical_bus_id": "integer", + "district_heating_area_id": "integer", + "ch4_bus_id": "integer", + "voltage_level": "integer", + 
"scenario": "character varying", + "geom": "geometry" + } + ), + NotNullAndNotNaNValidation( + table="supply.egon_chp_plants", + rule_id="TEST_NOT_NAN.egon_chp_plants", + columns=[ + "id", + "sources", + "source_id", + "carrier", + "district_heating", + "el_capacity", + "th_capacity", + "electrical_bus_id", + "district_heating_area_id", + "ch4_bus_id", + "voltage_level", + "scenario", + "geom" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="supply.egon_chp_plants", + rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_chp_plants" + ), + ValueSetValidation( + table="supply.egon_chp_plants", + rule_id="VALUE_SET_VALIDATION_CARRIER.egon_chp_plants", + column="carrier", + expected_values=["oil", "others", "gas", "gas extended", "biomass"] + ), + ValueSetValidation( + table="supply.egon_chp_plants", + rule_id="VALUE_SET_VALIDATION_SCENARIO.egon_chp_plants", + column="scenario", + expected_values=["eGon2035", "eGon100RE"] + ), + ] + }, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/era5.py b/src/egon/data/datasets/era5.py index baaf3ed0c..f62345ac3 100644 --- a/src/egon/data/datasets/era5.py +++ b/src/egon/data/datasets/era5.py @@ -16,6 +16,14 @@ from egon.data.datasets.scenario_parameters import get_sector_parameters import egon.data.config +from egon_validation import( + RowCountValidation, + DataTypeValidation, + NotNullAndNotNaNValidation, + WholeTableNotNullAndNotNaNValidation, + ValueSetValidation +) + # will be later imported from another file ### Base = declarative_base() @@ -56,6 +64,30 @@ def __init__(self, dependencies): }, insert_weather_cells, ), # download_era5 should be included once issue #1250 is solved + validation={ + "data-quality": [ + RowCountValidation( + table="supply.egon_era5_weather_cells", + rule_id="TEST_ROW_COUNT.egon_era5_weather_cells", + expected_count=29673 + ), + DataTypeValidation( + table="supply.egon_era5_weather_cells", + rule_id="TEST_DATA_MULTIPLE_TYPES.egon_era5_weather_cells", + column_types={"w_id": 
"integer", "geom": "geometry", "geom_point": "geometry"} + ), + NotNullAndNotNaNValidation( + table="supply.egon_era5_weather_cells", + rule_id="TEST_NOT_NAN.egon_era5_weather_cells", + columns=["w_id", "geom", "geom_point"] + ), + WholeTableNotNullAndNotNaNValidation( + table="supply.egon_era5_weather_cells", + rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_era5_weather_cells" + ), + ] + }, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/final_validations.py b/src/egon/data/datasets/final_validations.py index 0589994b9..a712457d9 100644 --- a/src/egon/data/datasets/final_validations.py +++ b/src/egon/data/datasets/final_validations.py @@ -1190,6 +1190,7 @@ def __init__(self, dependencies): column="carrier", expected_values=["AC"] ), + #Row Count doesn't equal egon_etrago_line, RowCountValidation( table="grid.egon_etrago_line_timeseries", rule_id="TEST_ROW_COUNT.egon_etrago_line_timeseries", diff --git a/src/egon/data/datasets/heat_supply/__init__.py b/src/egon/data/datasets/heat_supply/__init__.py index 2c3a619b5..66cf10661 100644 --- a/src/egon/data/datasets/heat_supply/__init__.py +++ b/src/egon/data/datasets/heat_supply/__init__.py @@ -32,6 +32,14 @@ sources, ) +from egon_validation import( + RowCountValidation, + DataTypeValidation, + NotNullAndNotNaNValidation, + WholeTableNotNullAndNotNaNValidation, + ValueSetValidation +) + # Will later be imported from another file.
Base = declarative_base() @@ -404,6 +412,105 @@ def __init__(self, dependencies): }, metadata, ), + validation={ + "data-quality":[ + RowCountValidation( + table="supply.egon_district_heating", + rule_id="TEST_ROW_COUNT.egon_district_heating", + expected_count={"Schleswig-Holstein": 402, "Everything": 9090} + ), + DataTypeValidation( + table="supply.egon_district_heating", + rule_id="TEST_DATA_MULTIPLE_TYPES.egon_district_heating", + column_types={ + "index": "integer", + "district_heating_id": "integer", + "carrier": "character varying", + "category": "character varying", + "capacity": "double precision", + "geometry": "geometry", + "scenario": "character varying" + } + ), + NotNullAndNotNaNValidation( + table="supply.egon_district_heating", + rule_id="TEST_NOT_NAN.egon_district_heating", + columns=[ + "index", + "district_heating_id", + "carrier", + "category", + "capacity", + "geometry", + "scenario" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="supply.egon_district_heating", + rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_district_heating" + ), + ValueSetValidation( + table="supply.egon_district_heating", + rule_id="VALUE_SET_VALIDATION_CARRIER.egon_district_heating", + column="carrier", + expected_values=["geo_thermal", "CHP", "gas_boiler", "resistive_heater", "heat_pump", "solar_thermal_collector"] + ), + ValueSetValidation( + table="supply.egon_district_heating", + rule_id="VALUE_SET_VALIDATION_SCENARIO.egon_district_heating", + column="scenario", + expected_values=["eGon2035"] + ), + RowCountValidation( + table="supply.egon_individual_heating", + rule_id="TEST_ROW_COUNT.egon_individual_heating", + expected_count={"Schleswig-Holstein": 400, "Everything": 7692} + ), + DataTypeValidation( + table="supply.egon_individual_heating", + rule_id="TEST_DATA_MULTIPLE_TYPES.egon_individual_heating", + column_types={ + "index": "integer", + "mv_grid_id": "integer", + "carrier": "character varying", + "category": "character varying", + "capacity": "double 
precision", + "geometry": "geometry", + "scenario": "character varying" + } + ), + NotNullAndNotNaNValidation( + table="supply.egon_individual_heating", + rule_id="TEST_NOT_NAN.egon_individual_heating", + columns=[ + "index", + "mv_grid_id", + "carrier", + "category", + "capacity", + "geometry", + "scenario" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="supply.egon_individual_heating", + rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_individual_heating" + ), + ValueSetValidation( + table="supply.egon_individual_heating", + rule_id="VALUE_SET_VALIDATION_CARRIER.egon_individual_heating", + column="carrier", + expected_values=["gas_boiler", "heat_pump"] + ), + ValueSetValidation( + table="supply.egon_individual_heating", + rule_id="VALUE_SET_VALIDATION_SCENARIO.egon_individual_heating", + column="scenario", + expected_values=["eGon2035"] + ), + ] + }, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/power_plants/__init__.py b/src/egon/data/datasets/power_plants/__init__.py index 3ea65fba0..2f475ad7d 100755 --- a/src/egon/data/datasets/power_plants/__init__.py +++ b/src/egon/data/datasets/power_plants/__init__.py @@ -44,6 +44,14 @@ import egon.data.datasets.power_plants.wind_farms as wind_onshore import egon.data.datasets.power_plants.wind_offshore as wind_offshore +from egon_validation import( + RowCountValidation, + DataTypeValidation, + NotNullAndNotNaNValidation, + WholeTableNotNullAndNotNaNValidation, + ValueSetValidation +) + Base = declarative_base() @@ -1624,4 +1632,68 @@ def __init__(self, dependencies): version=self.version, dependencies=dependencies, tasks=tasks, + validation={ + "data-quality": [ + RowCountValidation( + table="supply.egon_power_plants", + rule_id="TEST_ROW_COUNT.egon_power_plants", + expected_count={"Schleswig-Holstein":34828, "Everything": 1103} + ), + DataTypeValidation( + table="supply.egon_power_plants", + rule_id="TEST_DATA_MULTIPLE_TYPES.egon_power_plants", + column_types={ + "id": "bigint", + "sources": 
"jsonb", + "source_id": "jsonb", + "carrier": "character varying", + "el_capacity": "double precision", + "bus_id": "integer", + "voltage_level": "integer", + "weather_cell_id": "integer", + "scenario": "character varying", + "geom": "geometry" + } + ), + NotNullAndNotNaNValidation( + table="supply.egon_power_plants", + rule_id="TEST_NOT_NAN.egon_power_plants", + columns=["id", + "sources", + "source_id", + "carrier", + "el_capacity", + "bus_id", + "voltage_level", + "weather_cell_id", + "scenario", + "geom"] + ), + WholeTableNotNullAndNotNaNValidation( + table="supply.egon_power_plants", + rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_power_plants" + ), + ValueSetValidation( + table="supply.egon_power_plants", + rule_id="VALUE_SET_VALIDATION_CARRIER.egon_power_plants", + column="carrier", + expected_values=["others", + "gas", + "biomass", + "run_of_river", + "wind_onshore", + "oil", + "wind_offshore", + "solar", + "reservoir"] + ), + ValueSetValidation( + table="supply.egon_power_plants", + rule_id="VALUE_SET_VALIDATION_SCENARIO.egon_power_plants", + column="scenario", + expected_values=["eGon2035", "eGon100RE"] + ), + ] + }, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/re_potential_areas/__init__.py b/src/egon/data/datasets/re_potential_areas/__init__.py index bcb34af86..35968b180 100644 --- a/src/egon/data/datasets/re_potential_areas/__init__.py +++ b/src/egon/data/datasets/re_potential_areas/__init__.py @@ -13,6 +13,13 @@ from egon.data.datasets import Dataset import egon.data.config +from egon_validation import( + RowCountValidation, + DataTypeValidation, + NotNullAndNotNaNValidation, + WholeTableNotNullAndNotNaNValidation +) + Base = declarative_base() @@ -152,4 +159,78 @@ def __init__(self, dependencies): version=self.version, dependencies=dependencies, tasks=self.tasks, + validation={ + "data-quality": [ + RowCountValidation( + table="supply.egon_re_potential_area_pv_agricultur", + 
rule_id="TEST_ROW_COUNT.egon_re_potential_area_pv_agricultur", + expected_count={"Schleswig-Holstein": 388, "Everything": 8259} + ), + DataTypeValidation( + table="supply.egon_re_potential_area_pv_agricultur", + rule_id="TEST_DATA_MULTIPLE_TYPES.egon_re_potential_area_pv_agricultur", + column_types={ + "id": "integer", + "geom": "geometry" + } + ), + NotNullAndNotNaNValidation( + table="supply.egon_re_potential_area_pv_agricultur", + rule_id="TEST_NOT_NAN.egon_re_potential_area_pv_agricultur", + columns=["id", + "geom"] + ), + WholeTableNotNullAndNotNaNValidation( + table="supply.egon_re_potential_area_pv_agricultur", + rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_re_potential_area_pv_agricultur" + ), + RowCountValidation( + table="supply.egon_re_potential_area_pv_road_railway", + rule_id="TEST_ROW_COUNT.egon_re_potential_area_pv_road_railway", + expected_count={"Schleswig-Holstein": 479, "Everything": 5159} + ), + DataTypeValidation( + table="supply.egon_re_potential_area_pv_road_railway", + rule_id="TEST_DATA_MULTIPLE_TYPES.egon_re_potential_area_pv_road_railway", + column_types={ + "id": "integer", + "geom": "geometry" + } + ), + NotNullAndNotNaNValidation( + table="supply.egon_re_potential_area_pv_road_railway", + rule_id="TEST_NOT_NAN.egon_re_potential_area_pv_road_railway", + columns=["id", + "geom"] + ), + WholeTableNotNullAndNotNaNValidation( + table="supply.egon_re_potential_area_pv_road_railway", + rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_re_potential_area_pv_road_railway" + ), + RowCountValidation( + table="supply.egon_re_potential_area_wind", + rule_id="TEST_ROW_COUNT.egon_re_potential_area_wind", + expected_count={"Schleswig-Holstein": 6306, "Everything": 120268} + ), + DataTypeValidation( + table="supply.egon_re_potential_area_wind", + rule_id="TEST_DATA_MULTIPLE_TYPES.egon_re_potential_area_wind", + column_types={ + "id": "integer", + "geom": "geometry" + } + ), + NotNullAndNotNaNValidation( + table="supply.egon_re_potential_area_wind", + 
rule_id="TEST_NOT_NAN.egon_re_potential_area_wind", + columns=["id", + "geom"] + ), + WholeTableNotNullAndNotNaNValidation( + table="supply.egon_re_potential_area_wind", + rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_re_potential_area_wind" + ), + ] + }, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/renewable_feedin.py b/src/egon/data/datasets/renewable_feedin.py index 549c7e073..e3fb58d03 100644 --- a/src/egon/data/datasets/renewable_feedin.py +++ b/src/egon/data/datasets/renewable_feedin.py @@ -24,6 +24,13 @@ from egon.data.metadata import context, license_ccby, meta_metadata, sources import egon.data.config +from egon_validation import( + RowCountValidation, + DataTypeValidation, + NotNullAndNotNaNValidation, + WholeTableNotNullAndNotNaNValidation, + ValueSetValidation +) class RenewableFeedin(Dataset): """ @@ -64,6 +71,42 @@ def __init__(self, dependencies): wind_offshore, mapping_zensus_weather, }, + validation = { + "data-quality": [ + RowCountValidation( + table="supply.egon_era5_renewable_feedin", + rule_id="TEST_ROW_COUNT.egon_renewable_feedin", + expected_count=6102 + ), + DataTypeValidation( + table="supply.egon_era5_renewable_feedin", + rule_id="TEST_DATA_MULTIPLE_TYPES.egon_era5_renewable_feedin", + column_types={ + "w_id": "integer", + "weather_year": "integer", + "carrier": "character varying", + "feedin": "double precision[]" + } + ), + NotNullAndNotNaNValidation( + table="supply.egon_era5_renewable_feedin", + rule_id="TEST_NOT_NAN.egon_era5_renewable_feedin", + columns=["w_id", "weather_year", "carrier", "feedin"] + ), + WholeTableNotNullAndNotNaNValidation( + table="supply.egon_era5_renewable_feedin", + rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_era5_renewable_feedin" + ), + ValueSetValidation( + table="supply.egon_district_heating", + rule_id="VALUE_SET_VALIDATION_CARRIER.egon_district_heating", + column="carrier", + expected_values=["wind_onshore", "solar_thermal", "heat_pump_cop", "wind_offshore", "pv"] + ), + + ] + }, + 
on_validation_failure = "continue" ) diff --git a/src/egon/data/datasets/scenario_capacities.py b/src/egon/data/datasets/scenario_capacities.py index c810fc2ab..912b023ae 100755 --- a/src/egon/data/datasets/scenario_capacities.py +++ b/src/egon/data/datasets/scenario_capacities.py @@ -24,6 +24,14 @@ sources, ) +from egon_validation import( + RowCountValidation, + DataTypeValidation, + NotNullAndNotNaNValidation, + WholeTableNotNullAndNotNaNValidation, + ValueSetValidation +) + Base = declarative_base() @@ -1051,4 +1059,146 @@ def __init__(self, dependencies): version=self.version, dependencies=dependencies, tasks=tasks, + validation={ + "data-quality": [ + RowCountValidation( + table="supply.egon_nep_2021_conventional_powerplants", + rule_id="TEST_ROW_COUNT.egon_nep_2021_conventional_powerplants", + expected_count={"Schleswig-Holstein": 40, "Everything": 737} + ), + DataTypeValidation( + table="supply.egon_nep_2021_conventional_powerplants", + rule_id="TEST_DATA_MULTIPLE_TYPES.egon_nep_2021_conventional_powerplants", + column_types={ + "index": "bigint", + "bnetza_id": "text", + "name": "text", + "name_unit": "text", + "carrier_nep": "text", + "chp": "text", + "postcode": "text", + "city": "text", + "federal_state": "text", + "commissioned": "double precision", + "status": "text", + "capacity": "double precision", + "a2035_chp": "text", + "a2035_capacity": "double precision", + "b2035_chp": "text", + "b2035_capacity": "double precision", + "c2035_chp": "text", + "c2035_capacity": "double precision", + "b2040_chp": "text", + "b2040_capacity": "double precision", + "carrier": "text" + } + ), + NotNullAndNotNaNValidation( + table="supply.egon_nep_2021_conventional_powerplants", + rule_id="TEST_NOT_NAN.egon_nep_2021_conventional_powerplants", + columns=[ + "index", + "bnetza_id", + "name", + "name_unit", + "carrier_nep", + "chp", + "postcode", + "city", + "federal_state", + "commissioned", + "status", + "capacity", + "a2035_chp", + "a2035_capacity", + "b2035_chp", + 
"b2035_capacity", + "c2035_chp", + "c2035_capacity", + "b2040_chp", + "b2040_capacity", + "carrier" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="supply.egon_nep_2021_conventional_powerplants", + rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_nep_2021_conventional_powerplants" + ), + RowCountValidation( + table="supply.egon_scenario_capacities", + rule_id="TEST_ROW_COUNT.egon_scenario_capacities", + expected_count={"Schleswig-Holstein": 17, "Everything": 236} + ), + DataTypeValidation( + table="supply.egon_scenario_capacities", + rule_id="TEST_DATA_MULTIPLE_TYPES.egon_scenario_capacities", + column_types={ + "index": "integer", + "component": "character varying", + "carrier": "character varying", + "capacity": "double precision", + "nuts": "character varying", + "scenario_name": "character varying" + } + ), + NotNullAndNotNaNValidation( + table="supply.egon_scenario_capacities", + rule_id="TEST_NOT_NAN.egon_scenario_capacities", + columns=[ + "index", + "component", + "carrier", + "capacity", + "nuts", + "scenario_name" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="supply.egon_scenario_capacities", + rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_scenario_capacities" + ), + ValueSetValidation( + table="supply.egon_scenario_capacities", + rule_id="VALUE_SET_VALIDATION_CARRIER.egon_scenario_capacities", + column="carrier", + expected_values=["pumped_hydro", + "gas_for_industry", + "gas_for_industry_CC", + "biogas_to_gas", + "Sabatier", + "urban_central_gas_CHP", + "solar", + "reservoir", + "biogas", + "residential_rural_heat_pump", + "urban_central_solar_thermal_collector", + "oil", + "urban_central_resistive_heater", + "wind_offshore", + "battery", + "others", + "gas", + "wind_onshore", + "small_chp", + "Li_ion", + "urban_central_heat_pump", + "urban_central_geo_thermal", + "SMR", + "biomass", + "hydro", + "run_of_river", + "rural_solar_thermal", + "solar_rooftop", + "BEV_charger"] + ), + ValueSetValidation( + table="supply.egon_scenario_capacities", + 
rule_id="VALUE_SET_VALIDATION_SCENARIO_NAME.egon_scenario_capacities", + column="scenario_name", + expected_values=["eGon2035", "eGon100RE"] + ), + + ] + }, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/storages/__init__.py b/src/egon/data/datasets/storages/__init__.py index c43d9ccf7..39309938f 100755 --- a/src/egon/data/datasets/storages/__init__.py +++ b/src/egon/data/datasets/storages/__init__.py @@ -34,6 +34,14 @@ ) from egon.data.db import session_scope +from egon_validation import( + RowCountValidation, + DataTypeValidation, + NotNullAndNotNaNValidation, + WholeTableNotNullAndNotNaNValidation, + ValueSetValidation +) + Base = declarative_base() @@ -112,6 +120,57 @@ def __init__(self, dependencies): rule_id="SANITY_HOME_BATTERIES_AGGREGATION_EGON100RE", scenario="eGon100RE" ), + RowCountValidation( + table="supply.egon_storages", + rule_id="TEST_ROW_COUNT.egon_storages", + expected_count={"Schleswig-Holstein": 290, "Everything": 7748} + ), + DataTypeValidation( + table="supply.egon_storages", + rule_id="TEST_DATA_MULTIPLE_TYPES.egon_storages", + column_types={ + "id": "bigint", + "sources": "jsonb", + "source_id": "jsonb", + "carrier": "character varying", + "el_capacity": "double precision", + "bus_id": "integer", + "voltage_level": "integer", + "scenario": "character varying", + "geom": "geometry" + } + ), + NotNullAndNotNaNValidation( + table="supply.egon_storages", + rule_id="TEST_NOT_NAN.egon_storages", + columns=[ + "id", + "sources", + "source_id", + "carrier", + "el_capacity", + "bus_id", + "voltage_level", + "scenario", + "geom" + ] + ), + WholeTableNotNullAndNotNaNValidation( + table="supply.egon_storages", + rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_storages" + ), + ValueSetValidation( + table="supply.egon_storages", + rule_id="VALUE_SET_VALIDATION_SCENARIO.egon_storages", + column="scenario", + expected_values=["eGon2035", "eGon100RE"] + ), + ValueSetValidation( + table="supply.egon_storages", + 
rule_id="VALUE_SET_VALIDATION_CARRIER.egon_storages", + column="carrier", + expected_values=["home_battery", "pumped_hydro"] + ), ] }, on_validation_failure="continue" From 6d032005b312a7bc239bf89f5f6792d06aa083e6 Mon Sep 17 00:00:00 2001 From: sarah Date: Thu, 15 Jan 2026 13:04:45 +0100 Subject: [PATCH 36/54] add SRID validation --- src/egon/data/datasets/chp/__init__.py | 8 +- .../district_heating_areas/__init__.py | 8 +- .../hh_profiles.py | 3 +- src/egon/data/datasets/era5.py | 13 ++- src/egon/data/datasets/final_validations.py | 53 +++++++++++- .../data/datasets/heat_demand/__init__.py | 1 + .../heat_demand_timeseries/__init__.py | 1 + .../data/datasets/heat_supply/__init__.py | 13 ++- .../osm_buildings_streets/__init__.py | 83 ++++++++++++++++++- .../data/datasets/power_plants/__init__.py | 8 +- .../datasets/re_potential_areas/__init__.py | 18 +++- src/egon/data/datasets/storages/__init__.py | 8 +- src/egon/data/datasets/vg250/__init__.py | 18 +++- src/egon/data/datasets/zensus/__init__.py | 13 ++- 14 files changed, 236 insertions(+), 12 deletions(-) diff --git a/src/egon/data/datasets/chp/__init__.py b/src/egon/data/datasets/chp/__init__.py index 0f2e4fe1f..8a7bb8007 100644 --- a/src/egon/data/datasets/chp/__init__.py +++ b/src/egon/data/datasets/chp/__init__.py @@ -52,7 +52,8 @@ DataTypeValidation, NotNullAndNotNaNValidation, WholeTableNotNullAndNotNaNValidation, - ValueSetValidation + ValueSetValidation, + SRIDUniqueNonZero ) Base = declarative_base() @@ -922,6 +923,11 @@ def __init__(self, dependencies): column="scenario", expected_values=["eGon2035", "eGon100RE"] ), + SRIDUniqueNonZero( + table="supply.egon_chp_plants", + rule_id="SRIDUniqueNonZero.egon_chp_plants", + column="geom" + ) ] }, on_validation_failure="continue" diff --git a/src/egon/data/datasets/district_heating_areas/__init__.py b/src/egon/data/datasets/district_heating_areas/__init__.py index bf2a02a03..6b487d487 100644 --- a/src/egon/data/datasets/district_heating_areas/__init__.py +++ 
b/src/egon/data/datasets/district_heating_areas/__init__.py @@ -44,7 +44,8 @@ RowCountValidation, DataTypeValidation, WholeTableNotNullAndNotNaNValidation, - ValueSetValidation + ValueSetValidation, + SRIDUniqueNonZero ) # import time @@ -112,6 +113,11 @@ def __init__(self, dependencies): column="scenario", expected_values=["eGon2035", "eGon100RE"] ), + SRIDUniqueNonZero( + table="demand.egon_district_heating_areas", + rule_id="SRIDUniqueNonZero.egon_district_heating_areas", + column="geom_polygon" + ), ] }, on_validation_failure="continue" diff --git a/src/egon/data/datasets/electricity_demand_timeseries/hh_profiles.py b/src/egon/data/datasets/electricity_demand_timeseries/hh_profiles.py index bbc47cea0..d52f8acf5 100644 --- a/src/egon/data/datasets/electricity_demand_timeseries/hh_profiles.py +++ b/src/egon/data/datasets/electricity_demand_timeseries/hh_profiles.py @@ -353,7 +353,8 @@ def __init__(self, dependencies): rule_id="WHOLE_TABLE_NOT_NAN.iee_household_load_profiles" ) ] - } + }, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/era5.py b/src/egon/data/datasets/era5.py index f62345ac3..5887cf712 100644 --- a/src/egon/data/datasets/era5.py +++ b/src/egon/data/datasets/era5.py @@ -21,7 +21,8 @@ DataTypeValidation, NotNullAndNotNaNValidation, WholeTableNotNullAndNotNaNValidation, - ValueSetValidation + ValueSetValidation, + SRIDUniqueNonZero ) # will be later imported from another file ### @@ -85,6 +86,16 @@ def __init__(self, dependencies): table="supply.egon_era5_weather_cells", rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_era5_weather_cells" ), + SRIDUniqueNonZero( + table="supply.egon_era5_weather_cells", + rule_id="SRIDUniqueNonZero.egon_era5_weather_cells", + column="geom" + ), + SRIDUniqueNonZero( + table="supply.egon_era5_weather_cells", + rule_id="SRIDUniqueNonZero.egon_era5_weather_cells", + column="geom_point" + ) ] }, on_validation_failure="continue" diff --git a/src/egon/data/datasets/final_validations.py 
b/src/egon/data/datasets/final_validations.py index a712457d9..a7fc7d618 100644 --- a/src/egon/data/datasets/final_validations.py +++ b/src/egon/data/datasets/final_validations.py @@ -28,7 +28,8 @@ DataTypeValidation, NotNullAndNotNaNValidation, WholeTableNotNullAndNotNaNValidation, - ValueSetValidation + ValueSetValidation, + SRIDUniqueNonZero ) @@ -1059,6 +1060,11 @@ def __init__(self, dependencies): "rural_heat_store", "residential_rural_water_tanks" ] ), + SRIDUniqueNonZero( + table="grid.egon_etrago_bus", + rule_id="SRIDUniqueNonZero.egon_etrago_bus", + column="geometry" + ), RowCountValidation( table="grid.egon_etrago_generator", rule_id="TEST_ROW_COUNT.egon_etrago_generator", @@ -1190,6 +1196,16 @@ def __init__(self, dependencies): column="carrier", expected_values=["AC"] ), + SRIDUniqueNonZero( + table="grid.egon_etrago_line", + rule_id="SRIDUniqueNonZero.egon_etrago_line.geom", + column="geom" + ), + SRIDUniqueNonZero( + table="grid.egon_etrago_line", + rule_id="SRIDUniqueNonZero.egon_etrago_line.topo", + column="topo" + ), #Row Count doen't equal egon_etrago_line, RowCountValidation( table="grid.egon_etrago_line_timeseries", @@ -1240,6 +1256,16 @@ def __init__(self, dependencies): column="carrier", expected_values=["AC"] ), + SRIDUniqueNonZero( + table="grid.egon_etrago_line_timeseries", + rule_id="SRIDUniqueNonZero.egon_etrago_line_timeseries.geom", + column="geom" + ), + SRIDUniqueNonZero( + table="grid.egon_etrago_line_timeseries", + rule_id="SRIDUniqueNonZero.egon_etrago_line_timeseries.topo", + column="topo" + ), RowCountValidation( table="grid.egon_etrago_link", rule_id="TEST_ROW_COUNT.egon_etrago_link", @@ -1293,6 +1319,16 @@ def __init__(self, dependencies): "electricity_distribution_grid", "central_heat_store_discharger", "H2_to_power", "central_heat_store_charger", "central_gas_CHP", "residential_rural_ground_heat_pump"] ), + SRIDUniqueNonZero( + table="grid.egon_etrago_link", + rule_id="SRIDUniqueNonZero.egon_etrago_link.geom", + column="geom" 
+ ), + SRIDUniqueNonZero( + table="grid.egon_etrago_link", + rule_id="SRIDUniqueNonZero.egon_etrago_link.topo", + column="topo" + ), RowCountValidation( table="grid.egon_etrago_link_timeseries", rule_id="TEST_ROW_COUNT.egon_etrago_link_timeseries", @@ -1712,6 +1748,16 @@ def __init__(self, dependencies): table="grid.egon_hvmv_substation", rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_hvmv_substation" ), + SRIDUniqueNonZero( + table="grid.egon_hvmv_substation", + rule_id="SRIDUniqueNonZero.egon_hvmv_substation.point", + column="point" + ), + SRIDUniqueNonZero( + table="grid.egon_hvmv_substation", + rule_id="SRIDUniqueNonZero.egon_hvmv_substation.polygon", + column="polygon" + ), RowCountValidation( table="grid.egon_mv_grid_district", rule_id="TEST_ROW_COUNT.egon_mv_grid_district", @@ -1730,6 +1776,11 @@ def __init__(self, dependencies): table="grid.egon_mv_grid_district", rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_mv_grid_district" ), + SRIDUniqueNonZero( + table="grid.egon_mv_grid_district", + rule_id="SRIDUniqueNonZero.egon_mv_grid_district.geom", + column="geom" + ), ] }, on_validation_failure="continue" # Continue pipeline even if validations fail diff --git a/src/egon/data/datasets/heat_demand/__init__.py b/src/egon/data/datasets/heat_demand/__init__.py index 7d23e5d3f..fbfb01bee 100644 --- a/src/egon/data/datasets/heat_demand/__init__.py +++ b/src/egon/data/datasets/heat_demand/__init__.py @@ -115,6 +115,7 @@ def __init__(self, dependencies): ), ] }, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/heat_demand_timeseries/__init__.py b/src/egon/data/datasets/heat_demand_timeseries/__init__.py index 8d442637a..62a87532b 100644 --- a/src/egon/data/datasets/heat_demand_timeseries/__init__.py +++ b/src/egon/data/datasets/heat_demand_timeseries/__init__.py @@ -1299,4 +1299,5 @@ def __init__(self, dependencies): ) ] }, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/heat_supply/__init__.py 
b/src/egon/data/datasets/heat_supply/__init__.py index 66cf10661..1b2a794e0 100644 --- a/src/egon/data/datasets/heat_supply/__init__.py +++ b/src/egon/data/datasets/heat_supply/__init__.py @@ -37,7 +37,8 @@ DataTypeValidation, NotNullAndNotNaNValidation, WholeTableNotNullAndNotNaNValidation, - ValueSetValidation + ValueSetValidation, + SRIDUniqueNonZero ) # Will later be imported from another file. @@ -449,6 +450,11 @@ def __init__(self, dependencies): table="supply.egon_district_heating", rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_district_heating" ), + SRIDUniqueNonZero( + table="supply.egon_district_heating", + rule_id="SRIDUniqueNonZero.egon_district_heating.geometry", + column="geometry" + ), ValueSetValidation( table="supply.egon_district_heating", rule_id="VALUE_SET_VALIDATION_CARRIER.egon_district_heating", @@ -496,6 +502,11 @@ def __init__(self, dependencies): table="supply.egon_individual_heating", rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_individual_heating" ), + SRIDUniqueNonZero( + table="supply.egon_individual_heating", + rule_id="SRIDUniqueNonZero.egon_individual_heating.geometry", + column="geometry" + ), ValueSetValidation( table="supply.egon_individual_heating", rule_id="VALUE_SET_VALIDATION_CARRIER.egon_individual_heating", diff --git a/src/egon/data/datasets/osm_buildings_streets/__init__.py b/src/egon/data/datasets/osm_buildings_streets/__init__.py index 034a9526f..b4411e192 100644 --- a/src/egon/data/datasets/osm_buildings_streets/__init__.py +++ b/src/egon/data/datasets/osm_buildings_streets/__init__.py @@ -10,7 +10,8 @@ from egon_validation import ( RowCountValidation, DataTypeValidation, - WholeTableNotNullAndNotNaNValidation + WholeTableNotNullAndNotNaNValidation, + SRIDUniqueNonZero ) @@ -265,6 +266,11 @@ def __init__(self, dependencies): table="openstreetmap.osm_amenities_not_in_buildings", rule_id="WHOLE_TABLE_NOT_NAN.osm_amenities_not_in_buildings" ), + SRIDUniqueNonZero( + table="openstreetmap.osm_amenities_not_in_buildings", + 
rule_id="SRIDUniqueNonZero.osm_amenities_not_in_buildings.geom_amenity", + column="geom_amenity" + ), RowCountValidation( table="openstreetmap.osm_amenities_shops_filtered", rule_id="ROW_COUNT.osm_amenities_shops_filtered", @@ -281,6 +287,11 @@ def __init__(self, dependencies): table="openstreetmap.osm_amenities_shops_filtered", rule_id="WHOLE_TABLE_NOT_NAN.osm_amenities_shops_filtered" ), + SRIDUniqueNonZero( + table="openstreetmap.osm_amenities_shops_filtered", + rule_id="SRIDUniqueNonZero.osm_amenities_shops_filtered.geom_amenity", + column="geom_amenity" + ), RowCountValidation( table="openstreetmap.osm_buildings", rule_id="ROW_COUNT.osm_buildings", @@ -298,6 +309,16 @@ def __init__(self, dependencies): table="openstreetmap.osm_buildings", rule_id="WHOLE_TABLE_NOT_NAN.osm_buildings" ), + SRIDUniqueNonZero( + table="openstreetmap.osm_buildings", + rule_id="SRIDUniqueNonZero.osm_buildings.geom_building", + column="geom_building" + ), + SRIDUniqueNonZero( + table="openstreetmap.osm_buildings", + rule_id="SRIDUniqueNonZero.osm_buildings.geom_point", + column="geom_point" + ), RowCountValidation( table="openstreetmap.osm_buildings_filtered", rule_id="ROW_COUNT.osm_buildings_filtered", @@ -315,6 +336,16 @@ def __init__(self, dependencies): table="openstreetmap.osm_buildings_filtered", rule_id="WHOLE_TABLE_NOT_NAN.osm_buildings_filtered" ), + SRIDUniqueNonZero( + table="openstreetmap.osm_buildings_filtered", + rule_id="SRIDUniqueNonZero.osm_buildings_filtered.geom_building", + column="geom_building" + ), + SRIDUniqueNonZero( + table="openstreetmap.osm_buildings_filtered", + rule_id="SRIDUniqueNonZero.osm_buildings_filtered.geom_point", + column="geom_point" + ), RowCountValidation( table="openstreetmap.osm_buildings_residential", rule_id="ROW_COUNT.osm_buildings_residential", @@ -332,6 +363,16 @@ def __init__(self, dependencies): table="openstreetmap.osm_buildings_residential", rule_id="WHOLE_TABLE_NOT_NAN.osm_buildings_residential" ), + SRIDUniqueNonZero( + 
table="openstreetmap.osm_buildings_residential", + rule_id="SRIDUniqueNonZero.osm_buildings_residential.geom_building", + column="geom_building" + ), + SRIDUniqueNonZero( + table="openstreetmap.osm_buildings_residential", + rule_id="SRIDUniqueNonZero.osm_buildings_residental.geom_point", + column="geom_point" + ), RowCountValidation( table="openstreetmap.osm_buildings_synthetic", rule_id="ROW_COUNT.osm_buildings_synthetic", @@ -349,6 +390,16 @@ def __init__(self, dependencies): table="openstreetmap.osm_buildings_synthetic", rule_id="WHOLE_TABLE_NOT_NAN.osm_buildings_synthetic" ), + SRIDUniqueNonZero( + table="openstreetmap.osm_buildings_synthetic", + rule_id="SRIDUniqueNonZero.osm_buildings_synthetic.geom_building", + column="geom_building" + ), + SRIDUniqueNonZero( + table="openstreetmap.osm_buildings_synthetic", + rule_id="SRIDUniqueNonZero.osm_buildings_synthetic.geom_point", + column="geom_point" + ), RowCountValidation( table="openstreetmap.osm_buildings_with_amenities", rule_id="ROW_COUNT.osm_buildings_with_amenities", @@ -376,6 +427,21 @@ def __init__(self, dependencies): table="openstreetmap.osm_buildings_with_amenities", rule_id="WHOLE_TABLE_NOT_NAN.osm_buildings_with_amenities" ), + SRIDUniqueNonZero( + table="openstreetmap.osm_buildings_with_amenities", + rule_id="SRIDUniqueNonZero.osm_buildings_with_amenities.geom_building", + column="geom_building" + ), + SRIDUniqueNonZero( + table="openstreetmap.osm_buildings_with_amenities", + rule_id="SRIDUniqueNonZero.osm_buildings_with_amenities.geom_amenity", + column="geom_amenity" + ), + SRIDUniqueNonZero( + table="openstreetmap.osm_buildings_with_amenities", + rule_id="SRIDUniqueNonZero.osm_buildings_with_amenities.geom_point", + column="geom_point" + ), RowCountValidation( table="openstreetmap.osm_buildings_without_amenities", rule_id="ROW_COUNT.osm_buildings_without_amenities", @@ -399,6 +465,16 @@ def __init__(self, dependencies): table="openstreetmap.osm_buildings_without_amenities", 
rule_id="WHOLE_TABLE_NOT_NAN.osm_buildings_without_amenities" ), + SRIDUniqueNonZero( + table="openstreetmap.osm_buildings_without_amenities", + rule_id="SRIDUniqueNonZero.osm_buildings_without_amenities.geom_building", + column="geom_building" + ), + SRIDUniqueNonZero( + table="openstreetmap.osm_buildings_without_amenities", + rule_id="SRIDUniqueNonZero.osm_buildings_without_amenities.geom_point", + column="geom_point" + ), RowCountValidation( table="openstreetmap.osm_ways_with_segments", rule_id="ROW_COUNT.osm_ways_with_segments", @@ -418,6 +494,11 @@ def __init__(self, dependencies): table="openstreetmap.osm_ways_with_segments", rule_id="WHOLE_TABLE_NOT_NAN.osm_ways_with_segments" ), + SRIDUniqueNonZero( + table="openstreetmap.osm_buildings_with_segments", + rule_id="SRIDUniqueNonZero.osm_buildings_with_segments.geom", + column="geom" + ), ] }, on_validation_failure="continue" diff --git a/src/egon/data/datasets/power_plants/__init__.py b/src/egon/data/datasets/power_plants/__init__.py index 2f475ad7d..19d184f4c 100755 --- a/src/egon/data/datasets/power_plants/__init__.py +++ b/src/egon/data/datasets/power_plants/__init__.py @@ -49,7 +49,8 @@ DataTypeValidation, NotNullAndNotNaNValidation, WholeTableNotNullAndNotNaNValidation, - ValueSetValidation + ValueSetValidation, + SRIDUniqueNonZero ) Base = declarative_base() @@ -1693,6 +1694,11 @@ def __init__(self, dependencies): column="scenario", expected_values=["eGon2035", "eGon100RE"] ), + SRIDUniqueNonZero( + table="supply.egon_power_plants", + rule_id="SRIDUniqueNonZero.egon_power_plants.geom", + column="geom" + ), ] }, on_validation_failure="continue" diff --git a/src/egon/data/datasets/re_potential_areas/__init__.py b/src/egon/data/datasets/re_potential_areas/__init__.py index 35968b180..5b02b2180 100644 --- a/src/egon/data/datasets/re_potential_areas/__init__.py +++ b/src/egon/data/datasets/re_potential_areas/__init__.py @@ -17,7 +17,8 @@ RowCountValidation, DataTypeValidation, NotNullAndNotNaNValidation, - 
WholeTableNotNullAndNotNaNValidation + WholeTableNotNullAndNotNaNValidation, + SRIDUniqueNonZero ) Base = declarative_base() @@ -184,6 +185,11 @@ def __init__(self, dependencies): table="supply.egon_re_potential_area_pv_agricultur", rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_re_potential_area_pv_agricultur" ), + SRIDUniqueNonZero( + table="supply.egon_re_potential_area_pv_agricultur", + rule_id="SRIDUniqueNonZero.egon_re_potential_area_pv_agricultur.geom", + column="geom" + ), RowCountValidation( table="supply.egon_re_potential_area_pv_road_railway", rule_id="TEST_ROW_COUNT.egon_re_potential_area_pv_road_railway", @@ -207,6 +213,11 @@ def __init__(self, dependencies): table="supply.egon_re_potential_area_pv_road_railway", rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_re_potential_area_pv_road_railway" ), + SRIDUniqueNonZero( + table="supply.egon_re_potential_area_pv_road_railway", + rule_id="SRIDUniqueNonZero.egon_re_potential_area_pv_road_railway.geom", + column="geom" + ), RowCountValidation( table="supply.egon_re_potential_area_wind", rule_id="TEST_ROW_COUNT.egon_re_potential_area_wind", @@ -230,6 +241,11 @@ def __init__(self, dependencies): table="supply.egon_re_potential_area_wind", rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_re_potential_area_wind" ), + SRIDUniqueNonZero( + table="supply.egon_re_potential_area_wind", + rule_id="SRIDUniqueNonZero.egon_re_potential_area_wind.geom", + column="geom" + ), ] }, on_validation_failure="continue" diff --git a/src/egon/data/datasets/storages/__init__.py b/src/egon/data/datasets/storages/__init__.py index 39309938f..316624828 100755 --- a/src/egon/data/datasets/storages/__init__.py +++ b/src/egon/data/datasets/storages/__init__.py @@ -39,7 +39,8 @@ DataTypeValidation, NotNullAndNotNaNValidation, WholeTableNotNullAndNotNaNValidation, - ValueSetValidation + ValueSetValidation, + SRIDUniqueNonZero ) Base = declarative_base() @@ -171,6 +172,11 @@ def __init__(self, dependencies): column="carrier", expected_values=["home_battery", 
"pumped_hydro"] ), + SRIDUniqueNonZero( + table="supply.egon_storages", + rule_id="SRIDUniqueNonZero.egon_storages.geom", + column="geom" + ), ] }, on_validation_failure="continue" diff --git a/src/egon/data/datasets/vg250/__init__.py b/src/egon/data/datasets/vg250/__init__.py index 5d54e3bd9..7f46d0716 100644 --- a/src/egon/data/datasets/vg250/__init__.py +++ b/src/egon/data/datasets/vg250/__init__.py @@ -34,7 +34,8 @@ DataTypeValidation, NotNullAndNotNaNValidation, WholeTableNotNullAndNotNaNValidation, - ValueSetValidation + ValueSetValidation, + SRIDUniqueNonZero ) @@ -569,6 +570,11 @@ def __init__(self, dependencies): table="boundaries.vg250_krs", rule_id="TEST_WHOLE_TABLE_NOT_NAN.vg250_krs" ), + SRIDUniqueNonZero( + table="boundaries.vg250_krs", + rule_id="SRIDUniqueNonZero.vg250_krs.geometry", + column="geometry" + ), ValueSetValidation( table="boundaries.vg250_krs", rule_id="TEST_VALUE_SET_NBD.vg250_krs", @@ -597,6 +603,16 @@ def __init__(self, dependencies): table="society.destatis_zensus_population_per_ha_inside_germany", rule_id="TEST_WHOLE_TABLE_NOT_NAN.destatis_zensus_population_per_ha_inside_germany" ), + SRIDUniqueNonZero( + table="society.destatis_zensus_population_per_ha_inside_germany", + rule_id="SRIDUniqueNonZero.destatis_zensus_population_per_ha_inside_germany.geom_point", + column="geom_point" + ), + SRIDUniqueNonZero( + table="society.destatis_zensus_population_per_ha_inside_germany", + rule_id="SRIDUniqueNonZero.destatis_zensus_population_per_ha_inside_germany.geom", + column="geom" + ), ] }, on_validation_failure="continue" diff --git a/src/egon/data/datasets/zensus/__init__.py b/src/egon/data/datasets/zensus/__init__.py index 499ef0bbb..6344ee63a 100755 --- a/src/egon/data/datasets/zensus/__init__.py +++ b/src/egon/data/datasets/zensus/__init__.py @@ -21,7 +21,8 @@ RowCountValidation, DataTypeValidation, NotNullAndNotNaNValidation, - WholeTableNotNullAndNotNaNValidation + WholeTableNotNullAndNotNaNValidation, + SRIDUniqueNonZero ) @@ -59,6 
+60,16 @@ def __init__(self, dependencies): table="society.egon_destatis_zensus_apartment_building_population_per_ha", rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_destatis_zensus_apartment_building_population_per_ha" ), + SRIDUniqueNonZero( + table="society.egon_destatis_zensus_apartment_building_population_per_ha", + rule_id="SRIDUniqueNonZero.egon_destatis_zensus_apartment_building_population_per_ha.geom", + column="geom" + ), + SRIDUniqueNonZero( + table="society.egon_destatis_zensus_apartment_building_population_per_ha", + rule_id="SRIDUniqueNonZero.egon_destatis_zensus_apartment_building_population_per_ha.geom_point", + column="geom_point" + ), ] }, on_validation_failure="continue" From 26095b23bc2844d97ff2c6c387c96b96c70a4ce4 Mon Sep 17 00:00:00 2001 From: sarah Date: Thu, 15 Jan 2026 13:38:23 +0100 Subject: [PATCH 37/54] add ArrayCardinalityValidation --- src/egon/data/datasets/DSM_cts_ind.py | 29 +++++++++++++++++++ .../data/datasets/demandregio/__init__.py | 8 ++++- .../heat_demand_timeseries/__init__.py | 15 ++++++++-- .../heat_supply/individual_heating.py | 22 ++++++++++++++ .../datasets/low_flex_scenario/__init__.py | 12 ++++++++ 5 files changed, 83 insertions(+), 3 deletions(-) diff --git a/src/egon/data/datasets/DSM_cts_ind.py b/src/egon/data/datasets/DSM_cts_ind.py index a3025968a..34e59821a 100644 --- a/src/egon/data/datasets/DSM_cts_ind.py +++ b/src/egon/data/datasets/DSM_cts_ind.py @@ -32,6 +32,10 @@ sources, ) +from egon_validation import( + ArrayCardinalityValidation +) + # CONSTANTS # TODO: move to datasets.yml CON = db.engine() @@ -142,6 +146,31 @@ def __init__(self, dependencies): version=self.version, dependencies=dependencies, tasks=(dsm_cts_ind_processing,), + validation={ + "data-quality":[ + ArrayCardinalityValidation( + table="demand.egon_demandregio_sites_ind_electricity_dsm_timeseries", + array_column= "p_set", + expected_length= 8760, + ), + ArrayCardinalityValidation( + table="demand.egon_etrago_electricity_cts_dsm_timeseries", + 
array_column="p_set", + expected_length=8760, + ), + ArrayCardinalityValidation( + table="demand.egon_osm_ind_load_curves_individual_dsm_timeseries", + array_column="p_set", + expected_length=8760, + ), + ArrayCardinalityValidation( + table="demand.egon_sites_ind_load_curves_individual_dsm_timeseries", + array_column="p_set", + expected_length=8760, + ), + ] + }, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/demandregio/__init__.py b/src/egon/data/datasets/demandregio/__init__.py index c4c8a4ed0..f65becbf7 100644 --- a/src/egon/data/datasets/demandregio/__init__.py +++ b/src/egon/data/datasets/demandregio/__init__.py @@ -24,7 +24,8 @@ RowCountValidation, DataTypeValidation, WholeTableNotNullAndNotNaNValidation, - ValueSetValidation + ValueSetValidation, + ArrayCardinalityValidation ) try: @@ -136,6 +137,11 @@ def __init__(self, dependencies): column="sector", expected_values=["industry", "CTS"] ), + ArrayCardinalityValidation( + table="demand.egon_demandregio_sites_ind_electricity_dsm_timeseries", + array_column="load_curve", + expected_length=8760, + ) ] }, on_validation_failure="continue" diff --git a/src/egon/data/datasets/heat_demand_timeseries/__init__.py b/src/egon/data/datasets/heat_demand_timeseries/__init__.py index 62a87532b..bca9b8e9f 100644 --- a/src/egon/data/datasets/heat_demand_timeseries/__init__.py +++ b/src/egon/data/datasets/heat_demand_timeseries/__init__.py @@ -41,7 +41,8 @@ RowCountValidation, DataTypeValidation, WholeTableNotNullAndNotNaNValidation, - ValueSetValidation + ValueSetValidation, + ArrayCardinalityValidation ) Base = declarative_base() @@ -1296,7 +1297,17 @@ def __init__(self, dependencies): rule_id="DATA_MULTIPLE_TYPES.egon_heat_timeseries_selected_profiles", column_types={"zensus_population_id": "integer", "bulding_id": "integer", "selected_idp_profiles": "integer[]"} - ) + ), + ArrayCardinalityValidation( + table="demand.egon_heat_timeseries_selected_profiles", + array_column="selected_idp_profiles", 
+ expected_length=365, + ), + ArrayCardinalityValidation( + table="demand.egon_timeseries_district_heating", + array_column="dist_aggregated_mw", + expected_length=8760, + ), ] }, on_validation_failure="continue" diff --git a/src/egon/data/datasets/heat_supply/individual_heating.py b/src/egon/data/datasets/heat_supply/individual_heating.py index 0b9b6f552..738a3def1 100644 --- a/src/egon/data/datasets/heat_supply/individual_heating.py +++ b/src/egon/data/datasets/heat_supply/individual_heating.py @@ -50,6 +50,8 @@ # get zensus cells with district heating from egon.data.datasets.zensus_mv_grid_districts import MapZensusGridDistricts +from egon_validation import ArrayCardinalityValidation + engine = db.engine() Base = declarative_base() @@ -219,6 +221,16 @@ def dyn_parallel_tasks_pypsa_eur(): version=self.version, dependencies=dependencies, tasks=tasks_HeatPumpsPypsaEur, + validation={ + "data-quality": [ + ArrayCardinalityValidation( + table="demand.egon_etrago_timeseries_individual_heating", + array_column="dist_aggregated_mv", + expected_length=8760, + ), + ] + }, + on_validation_failure="continue" ) @@ -458,6 +470,16 @@ def dyn_parallel_tasks_2035(): version="0.0.3", dependencies=dependencies, tasks=tasks_HeatPumps2035, + validation={ + "data-quality":[ + ArrayCardinalityValidation( + table="demand.egon_etrago_timeseries_individual_heating", + array_column="dist_aggregated_mv", + expected_length=8760, + ), + ] + }, + on_validation_failure="continue" ) diff --git a/src/egon/data/datasets/low_flex_scenario/__init__.py b/src/egon/data/datasets/low_flex_scenario/__init__.py index 9e528ad58..7f13cabba 100644 --- a/src/egon/data/datasets/low_flex_scenario/__init__.py +++ b/src/egon/data/datasets/low_flex_scenario/__init__.py @@ -8,6 +8,8 @@ from egon.data.datasets import Dataset +from egon_validation import ArrayCardinalityValidation + Base = declarative_base() @@ -29,4 +31,14 @@ def __init__(self, dependencies): ), }, ), + validation={ + "data-quality":[ + 
ArrayCardinalityValidation( + table="grid.egon_etrago_bus_timeseries", + array_column="v_mag_pu_set", + expected_length=8760, + ), + ] + }, + on_validaiton_failure="continue" ) From f762aa9aa808967b5fd66413756bd379eee78f06 Mon Sep 17 00:00:00 2001 From: sarah Date: Thu, 15 Jan 2026 13:49:38 +0100 Subject: [PATCH 38/54] add comment to grid.egon_etrago_line_timeseries RowCountValidation --- src/egon/data/datasets/final_validations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/egon/data/datasets/final_validations.py b/src/egon/data/datasets/final_validations.py index a7fc7d618..811d75e95 100644 --- a/src/egon/data/datasets/final_validations.py +++ b/src/egon/data/datasets/final_validations.py @@ -1206,7 +1206,7 @@ def __init__(self, dependencies): rule_id="SRIDUniqueNonZero.egon_etrago_line.topo", column="topo" ), - #Row Count doen't equal egon_etrago_line, + #Row Count does't equal egon_etrago_line, because buses are located outside Germany RowCountValidation( table="grid.egon_etrago_line_timeseries", rule_id="TEST_ROW_COUNT.egon_etrago_line_timeseries", From f5d8c784c43612f5a23ba04f5a71a9497fe74c68 Mon Sep 17 00:00:00 2001 From: sarah Date: Fri, 16 Jan 2026 09:29:14 +0100 Subject: [PATCH 39/54] correct typo --- src/egon/data/datasets/osm_buildings_streets/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/egon/data/datasets/osm_buildings_streets/__init__.py b/src/egon/data/datasets/osm_buildings_streets/__init__.py index b4411e192..f24fcad85 100644 --- a/src/egon/data/datasets/osm_buildings_streets/__init__.py +++ b/src/egon/data/datasets/osm_buildings_streets/__init__.py @@ -370,7 +370,7 @@ def __init__(self, dependencies): ), SRIDUniqueNonZero( table="openstreetmap.osm_buildings_residential", - rule_id="SRIDUniqueNonZero.osm_buildings_residental.geom_point", + rule_id="SRIDUniqueNonZero.osm_buildings_residential.geom_point", column="geom_point" ), RowCountValidation( @@ -495,8 +495,8 @@ def 
__init__(self, dependencies): rule_id="WHOLE_TABLE_NOT_NAN.osm_ways_with_segments" ), SRIDUniqueNonZero( - table="openstreetmap.osm_buildings_with_segments", - rule_id="SRIDUniqueNonZero.osm_buildings_with_segments.geom", + table="openstreetmap.osm_ways_with_segments", + rule_id="SRIDUniqueNonZero.osm_ways_with_segments.geom", column="geom" ), ] From 39987418344edcec6bcc8fc3bec6a56264d106ca Mon Sep 17 00:00:00 2001 From: sarah Date: Fri, 16 Jan 2026 09:54:53 +0100 Subject: [PATCH 40/54] add example as validation placeholder --- src/egon/data/datasets/substation/__init__.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/src/egon/data/datasets/substation/__init__.py b/src/egon/data/datasets/substation/__init__.py index 7e792eee7..00d9b8606 100644 --- a/src/egon/data/datasets/substation/__init__.py +++ b/src/egon/data/datasets/substation/__init__.py @@ -12,6 +12,16 @@ from egon.data.datasets import Dataset import egon.data.config +# Uncomment to add validation rules: +# from egon_validation import ( +# RowCountValidation, +# DataTypeValidation, +# NotNullAndNotNaNValidation, +# WholeTableNotNullAndNotNaNValidation, +# ValueSetValidation, +# SRIDUniqueNonZero, +# ) + Base = declarative_base() @@ -86,6 +96,18 @@ def __init__(self, dependencies): }, transfer_busses, ), + # Validation placeholder - add rules here. See vg250/__init__.py + # for examples of RowCountValidation, DataTypeValidation, etc. 
+ validation={ + # "": [ + # RowCountValidation( + # table=".", + # rule_id="TEST_ROW_COUNT.", + # expected_count={"Schleswig-Holstein": X, "Everything": Y} + # ), + # ] + }, + on_validation_failure="continue", ) From 8cd5368290abfdb10fbcf05ca8b03203ef116c90 Mon Sep 17 00:00:00 2001 From: sarah Date: Fri, 16 Jan 2026 13:27:22 +0100 Subject: [PATCH 41/54] delete scenario parameter --- src/egon/data/validation_utils.py | 51 +++++++++++-------------------- 1 file changed, 18 insertions(+), 33 deletions(-) diff --git a/src/egon/data/validation_utils.py b/src/egon/data/validation_utils.py index b9d68a708..e165c99d1 100644 --- a/src/egon/data/validation_utils.py +++ b/src/egon/data/validation_utils.py @@ -9,30 +9,24 @@ logger = logging.getLogger(__name__) -def _resolve_context_value(value: Any, boundary: str, scenarios: List[str]) -> Any: - """Resolve a value that may be context-dependent (boundary/scenario). +def _resolve_context_value(value: Any, boundary: str) -> Any: + """Resolve a value that may be boundary-dependent. Args: value: The value to resolve. Can be: - A dict with boundary keys: {"Schleswig-Holstein": 27, "Everything": 537} - - A dict with scenario keys: {"eGon2035": 100, "eGon100RE": 200} - Any other value (returned as-is) boundary: Current dataset boundary setting - scenarios: List of active scenarios Returns: - Resolved value based on current context + Resolved value based on current boundary Examples: >>> _resolve_context_value({"Schleswig-Holstein": 27, "Everything": 537}, - ... "Schleswig-Holstein", ["eGon2035"]) + ... "Schleswig-Holstein") 27 - >>> _resolve_context_value({"eGon2035": 100, "eGon100RE": 200}, - ... 
"Everything", ["eGon2035"]) - 100 - - >>> _resolve_context_value(42, "Everything", ["eGon2035"]) + >>> _resolve_context_value(42, "Everything") 42 """ # If not a dict, return as-is @@ -44,40 +38,33 @@ def _resolve_context_value(value: Any, boundary: str, scenarios: List[str]) -> A logger.debug(f"Resolved boundary-dependent value: {boundary} -> {value[boundary]}") return value[boundary] - # Try to resolve by scenario - for scenario in scenarios: - if scenario in value: - logger.debug(f"Resolved scenario-dependent value: {scenario} -> {value[scenario]}") - return value[scenario] - - # If dict doesn't match boundary/scenario pattern, return as-is + # If dict doesn't match boundary pattern, return as-is # This handles cases like column_types dicts which are not context-dependent return value -def _resolve_rule_params(rule: Rule, boundary: str, scenarios: List[str]) -> None: - """Recursively resolve context-dependent parameters in a rule. +def _resolve_rule_params(rule: Rule, boundary: str) -> None: + """Resolve boundary-dependent parameters in a rule. Modifies rule.params in-place, resolving any dict values that match - boundary or scenario patterns. + boundary patterns. 
Args: rule: The validation rule to process boundary: Current dataset boundary setting - scenarios: List of active scenarios """ if not hasattr(rule, 'params') or not isinstance(rule.params, dict): return - # Recursively resolve all parameter values + # Resolve all parameter values for param_name, param_value in rule.params.items(): - resolved_value = _resolve_context_value(param_value, boundary, scenarios) + resolved_value = _resolve_context_value(param_value, boundary) # If the value was resolved (changed), update it if resolved_value is not param_value: logger.info( f"Rule {rule.rule_id}: Resolved {param_name} for " - f"boundary='{boundary}', scenarios={scenarios}" + f"boundary='{boundary}'" ) rule.params[param_name] = resolved_value @@ -88,11 +75,10 @@ def create_validation_tasks( ) -> List[PythonOperator]: """Convert validation dict to Airflow tasks. - Automatically resolves context-dependent parameters in validation rules. - Parameters can be specified as dicts with boundary or scenario keys: + Automatically resolves boundary-dependent parameters in validation rules. + Parameters can be specified as dicts with boundary keys: - Boundary-dependent: {"Schleswig-Holstein": 27, "Everything": 537} - - Scenario-dependent: {"eGon2035": 100, "eGon100RE": 200} The appropriate value is selected based on the current configuration. 
@@ -161,20 +147,19 @@ def run_validation(**context): # Get current configuration context config = settings()["egon-data"] boundary = config["--dataset-boundary"] - scenarios = config.get("--scenarios", []) - logger.info(f"Resolving validation parameters for boundary='{boundary}', scenarios={scenarios}") + logger.info(f"Resolving validation parameters for boundary='{boundary}'") # Set task and dataset on all rules (required by Rule base class) - # Also resolve context-dependent parameters + # Also resolve boundary-dependent parameters for rule in rules: if not hasattr(rule, 'task') or rule.task is None: rule.task = task_name if not hasattr(rule, 'dataset') or rule.dataset is None: rule.dataset = dataset_name - # Automatically resolve boundary/scenario-dependent parameters - _resolve_rule_params(rule, boundary, scenarios) + # Automatically resolve boundary-dependent parameters + _resolve_rule_params(rule, boundary) ctx = RunContext(run_id=run_id, source="airflow", out_dir=out_dir) results = run_validations(engine, ctx, rules, full_task_name) From e6f2dc4d20bf8d158e7a5a4f84d420d2f9fb77a2 Mon Sep 17 00:00:00 2001 From: sarah Date: Mon, 19 Jan 2026 11:52:05 +0100 Subject: [PATCH 42/54] delete .dev --- src/egon/data/datasets/electricity_demand/__init__.py | 2 +- src/egon/data/datasets/storages/__init__.py | 2 +- src/egon/data/datasets/vg250/__init__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/egon/data/datasets/electricity_demand/__init__.py b/src/egon/data/datasets/electricity_demand/__init__.py index 5487bb5c4..ef975aa54 100644 --- a/src/egon/data/datasets/electricity_demand/__init__.py +++ b/src/egon/data/datasets/electricity_demand/__init__.py @@ -56,7 +56,7 @@ class HouseholdElectricityDemand(Dataset): #: name: str = "HouseholdElectricityDemand" #: - version: str = "0.0.5.dev" + version: str = "0.0.5" def __init__(self, dependencies): super().__init__( diff --git a/src/egon/data/datasets/storages/__init__.py 
b/src/egon/data/datasets/storages/__init__.py index 316624828..42b502e17 100755 --- a/src/egon/data/datasets/storages/__init__.py +++ b/src/egon/data/datasets/storages/__init__.py @@ -95,7 +95,7 @@ class Storages(Dataset): #: name: str = "Storages" #: - version: str = "0.0.8.dev" + version: str = "0.0.8" def __init__(self, dependencies): super().__init__( diff --git a/src/egon/data/datasets/vg250/__init__.py b/src/egon/data/datasets/vg250/__init__.py index 7f46d0716..7ac0106aa 100644 --- a/src/egon/data/datasets/vg250/__init__.py +++ b/src/egon/data/datasets/vg250/__init__.py @@ -523,7 +523,7 @@ class Vg250(Dataset): #: name: str = "VG250" #: - version: str = filename + "-0.0.4.dev" + version: str = filename + "-0.0.4" def __init__(self, dependencies): super().__init__( From 06c5235665ebadfdf58b9a9556672e79d88614a6 Mon Sep 17 00:00:00 2001 From: sarah Date: Mon, 19 Jan 2026 18:03:27 +0100 Subject: [PATCH 43/54] refactor rule_ids --- src/egon/data/datasets/chp/__init__.py | 8 +- src/egon/data/datasets/era5.py | 8 +- src/egon/data/datasets/final_validations.py | 174 +++++++++--------- .../data/datasets/heat_supply/__init__.py | 16 +- .../osm_buildings_streets/__init__.py | 46 ++--- .../data/datasets/power_plants/__init__.py | 8 +- .../datasets/re_potential_areas/__init__.py | 24 +-- src/egon/data/datasets/renewable_feedin.py | 8 +- src/egon/data/datasets/scenario_capacities.py | 16 +- src/egon/data/datasets/society_prognosis.py | 16 +- src/egon/data/datasets/storages/__init__.py | 8 +- src/egon/data/datasets/substation/__init__.py | 2 +- src/egon/data/datasets/vg250/__init__.py | 18 +- src/egon/data/datasets/zensus/__init__.py | 40 ++-- src/egon/data/validation_utils.py | 2 +- 15 files changed, 197 insertions(+), 197 deletions(-) diff --git a/src/egon/data/datasets/chp/__init__.py b/src/egon/data/datasets/chp/__init__.py index 8a7bb8007..e1362cd64 100644 --- a/src/egon/data/datasets/chp/__init__.py +++ b/src/egon/data/datasets/chp/__init__.py @@ -866,12 +866,12 @@ 
def __init__(self, dependencies): "data-quality":[ RowCountValidation( table="supply.egon_chp_plants", - rule_id="TEST_ROW_COUNT.egon_chp_plants", + rule_id="ROW_COUNT.egon_chp_plants", expected_count={"Schleswig-Holstein": 1720, "Everything": 40197} ), DataTypeValidation( table="supply.egon_chp_plants", - rule_id="TEST_DATA_MULTIPLE_TYPES.egon_chp_plants", + rule_id="DATA_TYPES.egon_chp_plants", column_types={ "id": "integer", "sources": "jsonb", @@ -890,7 +890,7 @@ def __init__(self, dependencies): ), NotNullAndNotNaNValidation( table="supply.egon_chp_plants", - rule_id="TEST_NOT_NAN.egon_chp_plants", + rule_id="NOT_NAN.egon_chp_plants", columns=[ "id", "sources", @@ -909,7 +909,7 @@ def __init__(self, dependencies): ), WholeTableNotNullAndNotNaNValidation( table="supply.egon_chp_plants", - rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_chp_plants" + rule_id="TABLE_NOT_NAN.egon_chp_plants" ), ValueSetValidation( table="supply.egon_chp_plants", diff --git a/src/egon/data/datasets/era5.py b/src/egon/data/datasets/era5.py index 5887cf712..6d40a278e 100644 --- a/src/egon/data/datasets/era5.py +++ b/src/egon/data/datasets/era5.py @@ -69,22 +69,22 @@ def __init__(self, dependencies): "data-quality": [ RowCountValidation( table="supply.egon_era5_weather_cells", - rule_id="TEST_ROW_COUNT.egon_era5_weather_cells", + rule_id="ROW_COUNT.egon_era5_weather_cells", expected_count=29673 ), DataTypeValidation( table="supply.egon_era5_weather_cells", - rule_id="TEST_DATA_MULTIPLE_TYPES.egon_era5_weather_cells", + rule_id="DATA_TYPES.egon_era5_weather_cells", column_types={"w_id": "integer", "geom": "geometry", "geom_point": "geometry"} ), NotNullAndNotNaNValidation( table="supply.egon_era5_weather_cells", - rule_id="TEST_NOT_NAN.egon_era5_weather_cells", + rule_id="NOT_NAN.egon_era5_weather_cells", columns=["w_id", "geom", "geom_point"] ), WholeTableNotNullAndNotNaNValidation( table="supply.egon_era5_weather_cells", - rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_era5_weather_cells" + 
rule_id="TABLE_NOT_NAN.egon_era5_weather_cells" ), SRIDUniqueNonZero( table="supply.egon_era5_weather_cells", diff --git a/src/egon/data/datasets/final_validations.py b/src/egon/data/datasets/final_validations.py index 811d75e95..069f821ef 100644 --- a/src/egon/data/datasets/final_validations.py +++ b/src/egon/data/datasets/final_validations.py @@ -1019,12 +1019,12 @@ def __init__(self, dependencies): #grid validation RowCountValidation( table="grid.egon_etrago_bus", - rule_id="TEST_ROW_COUNT.egon_etrago_bus", + rule_id="ROW_COUNT.egon_etrago_bus", expected_count={"Schleswig-Holstein": 2729, "Everything": 85710} ), DataTypeValidation( table="grid.egon_etrago_bus", - rule_id="TEST_DATA_MULTIPLE_TYPES.egon_etrago_bus", + rule_id="DATA_TYPES.egon_etrago_bus", column_types={ "scen_name": "character varying", "bus_id": "bigint", "v_nom": "double precision", "type": "text", "carrier": "text", "v_mag_pu_set": "double precision", @@ -1034,24 +1034,24 @@ def __init__(self, dependencies): ), NotNullAndNotNaNValidation( table="grid.egon_etrago_bus", - rule_id="TEST_NOT_NAN.egon_etrago_bus", + rule_id="NOT_NAN.egon_etrago_bus", columns=[ "scn_name", "bus_id", "v_nom", "carrier", "v_mag_pu_min", "v_mag_pu_max", "x", "y", "geom" ] ), WholeTableNotNullAndNotNaNValidation( table="grid.egon_etrago_bus", - rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_etrago_bus" + rule_id="TABLE_NOT_NAN.egon_etrago_bus" ), ValueSetValidation( table="grid.egon_etrago_bus", - rule_id="TEST_VALUE_SET_SCENARIO.egon_etrago_bus", + rule_id="VALUE_SET_SCENARIO.egon_etrago_bus", column="scn_name", expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] ), ValueSetValidation( table="grid.egon_etrago_bus", - rule_id="TEST_VALUE_SET_CARRIER.egon_etrago_bus", + rule_id="VALUE_SET_CARRIER.egon_etrago_bus", column="carrier", expected_values=[ "rural_heat", "urban_central_water_tanks", "low_voltage", "CH4", "H2_saltcavern", @@ -1067,12 +1067,12 @@ def __init__(self, dependencies): ), RowCountValidation( 
table="grid.egon_etrago_generator", - rule_id="TEST_ROW_COUNT.egon_etrago_generator", + rule_id="ROW_COUNT.egon_etrago_generator", expected_count={"Schleswig-Holstein": 2863, "Everything": 40577} ), DataTypeValidation( table="grid.egon_etrago_generator", - rule_id="TEST_DATA_MULTIPLE_TYPES.egon_etrago_generator", + rule_id="DATA_TYPES.egon_etrago_generator", column_types={ "scen_name": "character varying", "generator_id": "bigint", "control": "text", "type": "text", "carrier": "text", "p_nom": "double precision", "p_nom_extendable": "boolean", @@ -1089,7 +1089,7 @@ def __init__(self, dependencies): ), NotNullAndNotNaNValidation( table="grid.egon_etrago_generator", - rule_id="TEST_NOT_NAN.egon_etrago_generator", + rule_id="NOT_NAN.egon_etrago_generator", columns=[ "scn_name", "generator_id", "bus", "control", "type", "carrier", "p_nom", "p_nom_extendable", "p_nom_min", "p_nom_max", "p_min_pu", "p_max_pu", "sign", "marginal_cost", "build_year", @@ -1100,17 +1100,17 @@ def __init__(self, dependencies): ), WholeTableNotNullAndNotNaNValidation( table="grid.egon_etrago_generator", - rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_etrago_generator" + rule_id="TABLE_NOT_NAN.egon_etrago_generator" ), ValueSetValidation( table="grid.egon_etrago_generator", - rule_id="TEST_VALUE_SET_SCENARIO.egon_etrago_generator", + rule_id="VALUE_SET_SCENARIO.egon_etrago_generator", column="scn_name", expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] ), ValueSetValidation( table="grid.egon_etrago_generator", - rule_id="TEST_VALUE_SET_CARRIER.egon_etrago_generator", + rule_id="VALUE_SET_CARRIER.egon_etrago_generator", column="carrier", expected_values=[ "CH4", "others", "central_biomass_CHP", "wind_onshore", "lignite", "geo_thermal", "solar", @@ -1122,12 +1122,12 @@ def __init__(self, dependencies): ), RowCountValidation( table="grid.egon_etrago_generator_timeseries", - rule_id="TEST_ROW_COUNT.egon_etrago_generator_timeseries", + rule_id="ROW_COUNT.egon_etrago_generator_timeseries", 
expected_count={"Schleswig-Holstein": 1929, "Everything": 28651} ), DataTypeValidation( table="grid.egon_etrago_generator_timeseries", - rule_id="TEST_DATA_MULTIPLE_TYPES.egon_etrago_generator_timeseries", + rule_id="DATA_TYPES.egon_etrago_generator_timeseries", column_types={ "scn_name": "character varying", "generator_id": "integer", "temp_id": "integer", "p_set": "double precision[]", "q_set": "double precision[]", "p_min_pu": "double precision[]", @@ -1136,29 +1136,29 @@ def __init__(self, dependencies): ), NotNullAndNotNaNValidation( table="grid.egon_etrago_generator_timeseries", - rule_id="TEST_NOT_NAN.egon_etrago_generator_timeseries", + rule_id="NOT_NAN.egon_etrago_generator_timeseries", columns=[ "scn_name", "generator_id", "temp_id", "p_max_pu" ] ), WholeTableNotNullAndNotNaNValidation( table="grid.egon_etrago_generator_timeseries", - rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_etrago_generator_timeseries" + rule_id="TABLE_NOT_NAN.egon_etrago_generator_timeseries" ), ValueSetValidation( table="grid.egon_etrago_generator_timeseries", - rule_id="TEST_VALUE_SET_SCENARIO.egon_etrago_generator_timeseries", + rule_id="VALUE_SET_SCENARIO.egon_etrago_generator_timeseries", column="scn_name", expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] ), RowCountValidation( table="grid.egon_etrago_line", - rule_id="TEST_ROW_COUNT.egon_etrago_line", + rule_id="ROW_COUNT.egon_etrago_line", expected_count={"Schleswig-Holstein": 1197, "Everything": 69901} ), DataTypeValidation( table="grid.egon_etrago_line", - rule_id="TEST_DATA_MULTIPLE_TYPES.egon_etrago_line", + rule_id="DATA_TYPES.egon_etrago_line", column_types={ "scn_name": "character varying", "line_id": "bigint", "bus0": "bigint", "bus1": "bigint", "type": "text", "carrier": "text", "x": "numeric", "r": "numeric", "g": "numeric", "b": "numeric", @@ -1172,7 +1172,7 @@ def __init__(self, dependencies): ), NotNullAndNotNaNValidation( table="grid.egon_etrago_line", - rule_id="TEST_NOT_NAN.egon_etrago_line", + 
rule_id="NOT_NAN.egon_etrago_line", columns=[ "scn_name", "line_id", "bus0", "bus1", "carrier", "x", "r", "g", "b", "s_nom", "s_nom_extendable", "s_nom_min", "s_nom_max", "s_max_pu", "build_year", "lifetime", @@ -1182,17 +1182,17 @@ def __init__(self, dependencies): ), WholeTableNotNullAndNotNaNValidation( table="grid.egon_etrago_line", - rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_etrago_line" + rule_id="TABLE_NOT_NAN.egon_etrago_line" ), ValueSetValidation( table="grid.egon_etrago_line", - rule_id="TEST_VALUE_SET_SCENARIO.egon_etrago_line", + rule_id="VALUE_SET_SCENARIO.egon_etrago_line", column="scn_name", expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] ), ValueSetValidation( table="grid.egon_etrago_line", - rule_id="TEST_VALUE_SET_CARRIER.egon_etrago_line", + rule_id="VALUE_SET_CARRIER.egon_etrago_line", column="carrier", expected_values=["AC"] ), @@ -1209,12 +1209,12 @@ def __init__(self, dependencies): #Row Count does't equal egon_etrago_line, because buses are located outside Germany RowCountValidation( table="grid.egon_etrago_line_timeseries", - rule_id="TEST_ROW_COUNT.egon_etrago_line_timeseries", + rule_id="ROW_COUNT.egon_etrago_line_timeseries", expected_count={"Schleswig-Holstein": 1197, "Everything": 69714} ), DataTypeValidation( table="grid.egon_etrago_line_timeseries", - rule_id="TEST_DATA_MULTIPLE_TYPES.egon_etrago_line_timeseries", + rule_id="DATA_TYPES.egon_etrago_line_timeseries", column_types={ "scn_name": "character varying", "line_id": "bigint", "bus0": "bigint", "bus1": "bigint", "type": "text", "carrier": "text", "x": "numeric", "r": "numeric", "g": "numeric", @@ -1232,7 +1232,7 @@ def __init__(self, dependencies): ), NotNullAndNotNaNValidation( table="grid.egon_etrago_line_timeseries", - rule_id="TEST_NOT_NAN.egon_etrago_line_timeseries", + rule_id="NOT_NAN.egon_etrago_line_timeseries", columns=[ "scn_name", "line_id", "bus0", "bus1", "carrier", "x", "r", "g", "b", "s_nom", "s_nom_extendable", "s_nom_min", "s_nom_max", "s_max_pu", 
"build_year", "lifetime", @@ -1242,17 +1242,17 @@ def __init__(self, dependencies): ), WholeTableNotNullAndNotNaNValidation( table="grid.egon_etrago_line_timeseries", - rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_etrago_line_timeseries" + rule_id="TABLE_NOT_NAN.egon_etrago_line_timeseries" ), ValueSetValidation( table="grid.egon_etrago_line_timeseries", - rule_id="TEST_VALUE_SET_SCENARIO.egon_etrago_line_timeseries", + rule_id="VALUE_SET_SCENARIO.egon_etrago_line_timeseries", column="scn_name", expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] ), ValueSetValidation( table="grid.egon_etrago_line_timeseries", - rule_id="TEST_VALUE_SET_CARRIER.egon_etrago_line_timeseries", + rule_id="VALUE_SET_CARRIER.egon_etrago_line_timeseries", column="carrier", expected_values=["AC"] ), @@ -1268,12 +1268,12 @@ def __init__(self, dependencies): ), RowCountValidation( table="grid.egon_etrago_link", - rule_id="TEST_ROW_COUNT.egon_etrago_link", + rule_id="ROW_COUNT.egon_etrago_link", expected_count={"Schleswig-Holstein": 15496, "Everything": 83980} ), DataTypeValidation( table="grid.egon_etrago_link", - rule_id="TEST_DATA_MULTIPLE_TYPES.egon_etrago_link", + rule_id="DATA_TYPES.egon_etrago_link", column_types={ "scn_name": "character varying", "link_id": "bigint", "bus0": "bigint", "bus1": "bigint", "type": "text", "carrier": "text", "efficiency": "double precision", "build_year": "bigint", @@ -1286,7 +1286,7 @@ def __init__(self, dependencies): ), NotNullAndNotNaNValidation( table="grid.egon_etrago_link", - rule_id="TEST_NOT_NAN.egon_etrago_link", + rule_id="NOT_NAN.egon_etrago_link", columns=[ "scn_name", "link_id", "bus0", "bus1", "carrier", "efficiency", "build_year", "p_nom", "p_nom_extendable", "p_nom_min", "p_nom_max", "p_min_pu", "p_max_pu", "p_set", @@ -1295,17 +1295,17 @@ def __init__(self, dependencies): ), WholeTableNotNullAndNotNaNValidation( table="grid.egon_etrago_link", - rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_etrago_link" + 
rule_id="TABLE_NOT_NAN.egon_etrago_link" ), ValueSetValidation( table="grid.egon_etrago_link", - rule_id="TEST_VALUE_SET_SCENARIO.egon_etrago_link", + rule_id="VALUE_SET_SCENARIO.egon_etrago_link", column="scn_name", expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] ), ValueSetValidation( table="grid.egon_etrago_link", - rule_id="TEST_VALUE_SET_CARRIER.egon_etrago_link", + rule_id="VALUE_SET_CARRIER.egon_etrago_link", column="carrier", expected_values=[ "industrial_gas_CHP", "residential_rural_water_tanks_discharger", "BEV_charger", "CH4", @@ -1331,12 +1331,12 @@ def __init__(self, dependencies): ), RowCountValidation( table="grid.egon_etrago_link_timeseries", - rule_id="TEST_ROW_COUNT.egon_etrago_link_timeseries", + rule_id="ROW_COUNT.egon_etrago_link_timeseries", expected_count={"Schleswig-Holstein": 947, "Everything": 25729} ), DataTypeValidation( table="grid.egon_etrago_link_timeseries", - rule_id="TEST_DATA_MULTIPLE_TYPES.egon_etrago_link_timeseries", + rule_id="DATA_TYPES.egon_etrago_link_timeseries", column_types={ "scn_name": "character varying", "link_id": "bigint", @@ -1350,7 +1350,7 @@ def __init__(self, dependencies): ), NotNullAndNotNaNValidation( table="grid.egon_etrago_link_timeseries", - rule_id="TEST_NOT_NAN.egon_etrago_link_timeseries", + rule_id="NOT_NAN.egon_etrago_link_timeseries", columns=[ "scn_name", "link_id", "temp_id", "p_set", "p_min_pu", "p_max_pu", "efficiency", "marginal_cost" @@ -1358,22 +1358,22 @@ def __init__(self, dependencies): ), WholeTableNotNullAndNotNaNValidation( table="grid.egon_etrago_link_timeseries", - rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_etrago_link_timeseries" + rule_id="TABLE_NOT_NAN.egon_etrago_link_timeseries" ), ValueSetValidation( table="grid.egon_etrago_link_timeseries", - rule_id="TEST_VALUE_SET_SCENARIO.egon_etrago_link_timeseries", + rule_id="VALUE_SET_SCENARIO.egon_etrago_link_timeseries", column="scn_name", expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] ), RowCountValidation( 
table="grid.egon_etrago_load", - rule_id="TEST_ROW_COUNT.egon_etrago_load", + rule_id="ROW_COUNT.egon_etrago_load", expected_count={"Schleswig-Holstein": 3202, "Everything": 44019} ), DataTypeValidation( table="grid.egon_etrago_load", - rule_id="TEST_DATA_MULTIPLE_TYPES.egon_etrago_load", + rule_id="DATA_TYPES.egon_etrago_load", column_types={ "scn_name": "character varying", "load_id": "bigint", @@ -1387,24 +1387,24 @@ def __init__(self, dependencies): ), NotNullAndNotNaNValidation( table="grid.egon_etrago_load", - rule_id="TEST_NOT_NAN.egon_etrago_load", + rule_id="NOT_NAN.egon_etrago_load", columns=[ "scn_name", "load_id", "bus", "type", "carrier", "p_set", "q_set", "sign" ] ), WholeTableNotNullAndNotNaNValidation( table="grid.egon_etrago_load", - rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_etrago_load" + rule_id="TABLE_NOT_NAN.egon_etrago_load" ), ValueSetValidation( table="grid.egon_etrago_load", - rule_id="TEST_VALUE_SET_SCENARIO.egon_etrago_load", + rule_id="VALUE_SET_SCENARIO.egon_etrago_load", column="scn_name", expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] ), ValueSetValidation( table="grid.egon_etrago_load", - rule_id="TEST_VALUE_SET_CARRIER.egon_etrago_load", + rule_id="VALUE_SET_CARRIER.egon_etrago_load", column="carrier", expected_values=[ "CH4", "H2_for_industry", "services_rural_heat", "H2_system_boundary", "AC", @@ -1415,12 +1415,12 @@ def __init__(self, dependencies): ), RowCountValidation( table="grid.egon_etrago_load_timeseries", - rule_id="TEST_ROW_COUNT.egon_etrago_load_timeseries", + rule_id="ROW_COUNT.egon_etrago_load_timeseries", expected_count={"Schleswig-Holstein": 3176, "Everything": 44013} ), DataTypeValidation( table="grid.egon_etrago_load_timeseries", - rule_id="TEST_DATA_MULTIPLE_TYPES.egon_etrago_load_timeseries", + rule_id="DATA_TYPES.egon_etrago_load_timeseries", column_types={ "scn_name": "character varying", "load_id": "bigint", @@ -1431,29 +1431,29 @@ def __init__(self, dependencies): ), NotNullAndNotNaNValidation( 
table="grid.egon_etrago_load_timeseries", - rule_id="TEST_NOT_NAN.egon_etrago_load_timeseries", + rule_id="NOT_NAN.egon_etrago_load_timeseries", columns=[ "scn_name", "load_id", "temp_id", "p_set", "q_set" ] ), WholeTableNotNullAndNotNaNValidation( table="grid.egon_etrago_load_timeseries", - rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_etrago_load_timeseries" + rule_id="TABLE_NOT_NAN.egon_etrago_load_timeseries" ), ValueSetValidation( table="grid.egon_etrago_load_timeseries", - rule_id="TEST_VALUE_SET_SCENARIO.egon_etrago_load_timeseries", + rule_id="VALUE_SET_SCENARIO.egon_etrago_load_timeseries", column="scn_name", expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] ), RowCountValidation( table="grid.egon_etrago_storage", - rule_id="TEST_ROW_COUNT.egon_etrago_storage", + rule_id="ROW_COUNT.egon_etrago_storage", expected_count={"Schleswig-Holstein": 418, "Everything": 13044} ), DataTypeValidation( table="grid.egon_etrago_storage", - rule_id="TEST_DATA_MULTIPLE_TYPES.egon_etrago_storage", + rule_id="DATA_TYPES.egon_etrago_storage", column_types={ "scn_name": "character varying", "storage_id": "bigint", @@ -1486,7 +1486,7 @@ def __init__(self, dependencies): ), NotNullAndNotNaNValidation( table="grid.egon_etrago_storage", - rule_id="TEST_NOT_NAN.egon_etrago_storage", + rule_id="NOT_NAN.egon_etrago_storage", columns=[ "scn_name", "storage_id", "bus", "control", "type", "carrier", "p_nom", "p_nom_extendable", "p_nom_min", "p_nom_max", "p_min_pu", "p_max_pu", "p_set", @@ -1497,28 +1497,28 @@ def __init__(self, dependencies): ), WholeTableNotNullAndNotNaNValidation( table="grid.egon_etrago_storage", - rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_etrago_storage" + rule_id="TABLE_NOT_NAN.egon_etrago_storage" ), ValueSetValidation( table="grid.egon_etrago_storage", - rule_id="TEST_VALUE_SET_SCENARIO.egon_etrago_storage", + rule_id="VALUE_SET_SCENARIO.egon_etrago_storage", column="scn_name", expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] ), ValueSetValidation( 
table="grid.egon_etrago_storage", - rule_id="TEST_VALUE_SET_CARRIER.egon_etrago_storage", + rule_id="VALUE_SET_CARRIER.egon_etrago_storage", column="carrier", expected_values=["battery", "home_battery", "pumped_hydro", "reservoir"] ), RowCountValidation( table="grid.egon_etrago_storage_timeseries", - rule_id="TEST_ROW_COUNT.egon_etrago_storage_timeseries", + rule_id="ROW_COUNT.egon_etrago_storage_timeseries", expected_count={"Schleswig-Holstein": 0, "Everything": 9} ), DataTypeValidation( table="grid.egon_etrago_storage_timeseries", - rule_id="TEST_DATA_MULTIPLE_TYPES.egon_etrago_storage_timeseries", + rule_id="DATA_MULTIPLE_TYPES.egon_etrago_storage_timeseries", column_types={ "scn_name": "character varying", "storage_id": "bigint", @@ -1534,29 +1534,29 @@ def __init__(self, dependencies): ), NotNullAndNotNaNValidation( table="grid.egon_etrago_storage_timeseries", - rule_id="TEST_NOT_NAN.egon_etrago_storage_timeseries", + rule_id="NOT_NAN.egon_etrago_storage_timeseries", columns=[ "scn_name", "storage_id", "temp_id", "inflow", "marginal_cost" ] ), WholeTableNotNullAndNotNaNValidation( table="grid.egon_etrago_storage_timeseries", - rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_etrago_storage_timeseries" + rule_id="TABLE_NOT_NAN.egon_etrago_storage_timeseries" ), ValueSetValidation( table="grid.egon_etrago_storage_timeseries", - rule_id="TEST_VALUE_SET_SCENARIO.egon_etrago_storage_timeseries", + rule_id="VALUE_SET_SCENARIO.egon_etrago_storage_timeseries", column="scn_name", expected_values=["eGon100RE"] ), RowCountValidation( table="grid.egon_etrago_store", - rule_id="TEST_ROW_COUNT.egon_etrago_store", + rule_id="ROW_COUNT.egon_etrago_store", expected_count={"Schleswig-Holstein": 2913, "Everything": 26520} ), DataTypeValidation( table="grid.egon_etrago_store", - rule_id="TEST_DATA_MULTIPLE_TYPES.egon_etrago_store", + rule_id="DATA_TYPES.egon_etrago_store", column_types={ "scn_name": "character varying", "store_id": "bigint", @@ -1583,7 +1583,7 @@ def __init__(self, 
dependencies): ), NotNullAndNotNaNValidation( table="grid.egon_etrago_store", - rule_id="TEST_NOT_NAN.egon_etrago_store", + rule_id="NOT_NAN.egon_etrago_store", columns=[ "scn_name", "store_id", "bus", "type", "carrier", "e_nom", "e_nom_extendable", "e_nom_min", "e_nom_max", "e_min_pu", "e_max_pu", "p_set", "q_set", "e_initial", @@ -1593,22 +1593,22 @@ def __init__(self, dependencies): ), WholeTableNotNullAndNotNaNValidation( table="grid.egon_etrago_store", - rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_etrago_store" + rule_id="TABLE_NOT_NAN.egon_etrago_store" ), ValueSetValidation( table="grid.egon_etrago_store", - rule_id="TEST_VALUE_SET_SCENARIO.egon_etrago_store", + rule_id="VALUE_SET_SCENARIO.egon_etrago_store", column="scn_name", expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] ), RowCountValidation( table="grid.egon_etrago_store_timeseries", - rule_id="TEST_ROW_COUNT.egon_etrago_store_timeseries", + rule_id="ROW_COUNT.egon_etrago_store_timeseries", expected_count={"Schleswig-Holstein": 392, "Everything": 15281} ), DataTypeValidation( table="grid.egon_etrago_store_timeseries", - rule_id="TEST_DATA_MULTIPLE_TYPES.egon_etrago_store_timeseries", + rule_id="DATA_TYPES.egon_etrago_store_timeseries", column_types={ "scn_name": "character varying", "store_id": "bigint", @@ -1622,7 +1622,7 @@ def __init__(self, dependencies): ), NotNullAndNotNaNValidation( table="grid.egon_etrago_store_timeseries", - rule_id="TEST_NOT_NAN.egon_etrago_store_timeseries", + rule_id="NOT_NAN.egon_etrago_store_timeseries", columns=[ "scn_name", "store_id", "temp_id", "p_set", "q_set", "e_min_pu", "e_max_pu", "marginal_cost" @@ -1630,22 +1630,22 @@ def __init__(self, dependencies): ), WholeTableNotNullAndNotNaNValidation( table="grid.egon_etrago_store_timeseries", - rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_etrago_store_timeseries" + rule_id="TABLE_NOT_NAN.egon_etrago_store_timeseries" ), ValueSetValidation( table="grid.egon_etrago_store_timeseries", - 
rule_id="TEST_VALUE_SET_SCENARIO.egon_etrago_store_timeseries", + rule_id="VALUE_SET_SCENARIO.egon_etrago_store_timeseries", column="scn_name", expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] ), RowCountValidation( table="grid.egon_etrago_temp_resolution", - rule_id="TEST_ROW_COUNT.egon_etrago_temp_resolution", + rule_id="ROW_COUNT.egon_etrago_temp_resolution", expected_count=1 ), DataTypeValidation( table="grid.egon_etrago_temp_resolution", - rule_id="TEST_DATA_MULTIPLE_TYPES.egon_etrago_temp_resolution", + rule_id="DATA_TYPES.egon_etrago_temp_resolution", column_types={ "temp_id": "bigint", "timesteps": "bigint", @@ -1655,16 +1655,16 @@ def __init__(self, dependencies): ), WholeTableNotNullAndNotNaNValidation( table="grid.egon_etrago_temp_resolution", - rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_etrago_temp_resolution" + rule_id="TABLE_NOT_NAN.egon_etrago_temp_resolution" ), RowCountValidation( table="grid.egon_etrago_transformer", - rule_id="TEST_ROW_COUNT.egon_etrago_transformer", + rule_id="ROW_COUNT.egon_etrago_transformer", expected_count={"Schleswig-Holstein": 31, "Everything": 1545} ), DataTypeValidation( table="grid.egon_etrago_transformer", - rule_id="TEST_DATA_MULTIPLE_TYPES.egon_etrago_transformer", + rule_id="DATA_TYPES.egon_etrago_transformer", column_types={ "scn_name": "character varying", "store_id": "bigint", @@ -1691,7 +1691,7 @@ def __init__(self, dependencies): ), NotNullAndNotNaNValidation( table="grid.egon_etrago_transformer", - rule_id="TEST_NOT_NAN.egon_etrago_transformer", + rule_id="NOT_NAN.egon_etrago_transformer", columns=[ "scn_name", "store_id", "bus", "type", "carrier", "e_nom", "e_nom_extendable", "e_nom_min", "e_nom_max", "e_min_pu", "e_max_pu", "p_set", "q_set", "e_initial", @@ -1701,22 +1701,22 @@ def __init__(self, dependencies): ), WholeTableNotNullAndNotNaNValidation( table="grid.egon_etrago_transformer", - rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_etrago_transformer" + rule_id="TABLE_NOT_NAN.egon_etrago_transformer" 
), ValueSetValidation( table="grid.egon_etrago_transformer", - rule_id="TEST_VALUE_SET_SCENARIO.egon_etrago_transformer", + rule_id="VALUE_SET_SCENARIO.egon_etrago_transformer", column="scn_name", expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] ), RowCountValidation( table="grid.egon_hvmv_substation", - rule_id="TEST_ROW_COUNT.hvmv_substation", + rule_id="ROW_COUNT.hvmv_substation", expected_count={"Schleswig-Holstein": 200, "Everything": 3854} ), DataTypeValidation( table="grid.egon_hvmv_substation", - rule_id="TEST_DATA_MULTIPLE_TYPES.egon_hvmv_substation", + rule_id="DATA_TYPES.egon_hvmv_substation", column_types={ "bus_id": "integer", "lon": "double precision", @@ -1738,7 +1738,7 @@ def __init__(self, dependencies): ), NotNullAndNotNaNValidation( table="grid.egon_hvmv_substation", - rule_id="TEST_NOT_NAN.egon_hvmv_substation", + rule_id="NOT_NAN.egon_hvmv_substation", columns=[ "bus_id", "lon", "lat", "point", "polygon", "voltage", "power_type", "substation", "osm_id", "osm_www", "frequency", "subst_name", "ref", "operator", "dbahn", "status" @@ -1746,7 +1746,7 @@ def __init__(self, dependencies): ), WholeTableNotNullAndNotNaNValidation( table="grid.egon_hvmv_substation", - rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_hvmv_substation" + rule_id="TABLE_NOT_NAN.egon_hvmv_substation" ), SRIDUniqueNonZero( table="grid.egon_hvmv_substation", @@ -1760,12 +1760,12 @@ def __init__(self, dependencies): ), RowCountValidation( table="grid.egon_mv_grid_district", - rule_id="TEST_ROW_COUNT.egon_mv_grid_district", + rule_id="ROW_COUNT.egon_mv_grid_district", expected_count={"Schleswig-Holstein": 200, "Everything": 3854} ), DataTypeValidation( table="grid.egon_mv_grid_district", - rule_id="TEST_DATA_MULTIPLE_TYPES.egon_mv_grid_district", + rule_id="DATA_TYPES.egon_mv_grid_district", column_types={ "bus_id": "integer", "geom": "geometry", @@ -1774,7 +1774,7 @@ def __init__(self, dependencies): ), WholeTableNotNullAndNotNaNValidation( table="grid.egon_mv_grid_district", 
- rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_mv_grid_district" + rule_id="TABLE_NOT_NAN.egon_mv_grid_district" ), SRIDUniqueNonZero( table="grid.egon_mv_grid_district", diff --git a/src/egon/data/datasets/heat_supply/__init__.py b/src/egon/data/datasets/heat_supply/__init__.py index 1b2a794e0..8d3d8ba8b 100644 --- a/src/egon/data/datasets/heat_supply/__init__.py +++ b/src/egon/data/datasets/heat_supply/__init__.py @@ -417,12 +417,12 @@ def __init__(self, dependencies): "data-quality":[ RowCountValidation( table="supply.egon_district_heating", - rule_id="TEST_ROW_COUNT.egon_district_heating", + rule_id="ROW_COUNT.egon_district_heating", expected_count={"Schleswig-Holstein": 402, "Everything": 9090} ), DataTypeValidation( table="supply.egon_district_heating", - rule_id="TEST_DATA_MULTIPLE_TYPES.egon_district_heating", + rule_id="DATA_TYPES.egon_district_heating", column_types={ "index": "integer", "district_heating_id": "integer", @@ -435,7 +435,7 @@ def __init__(self, dependencies): ), NotNullAndNotNaNValidation( table="supply.egon_district_heating", - rule_id="TEST_NOT_NAN.egon_district_heating", + rule_id="NOT_NAN.egon_district_heating", columns=[ "index", "district_heating_id", @@ -448,7 +448,7 @@ def __init__(self, dependencies): ), WholeTableNotNullAndNotNaNValidation( table="supply.egon_district_heating", - rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_district_heating" + rule_id="TABLE_NOT_NAN.egon_district_heating" ), SRIDUniqueNonZero( table="supply.egon_district_heating", @@ -469,12 +469,12 @@ def __init__(self, dependencies): ), RowCountValidation( table="supply.egon_individual_heating", - rule_id="TEST_ROW_COUNT.egon_individual_heating", + rule_id="ROW_COUNT.egon_individual_heating", expected_count={"Schleswig-Holstein": 400, "Everything": 7692} ), DataTypeValidation( table="supply.egon_individual_heating", - rule_id="TEST_DATA_MULTIPLE_TYPES.egon_individual_heating", + rule_id="DATA_TYPES.egon_individual_heating", column_types={ "index": "integer", "mv_grid_id": 
"integer", @@ -487,7 +487,7 @@ def __init__(self, dependencies): ), NotNullAndNotNaNValidation( table="supply.egon_individual_heating", - rule_id="TEST_NOT_NAN.egon_individual_heating", + rule_id="NOT_NAN.egon_individual_heating", columns=[ "index", "mv_grid_id", @@ -500,7 +500,7 @@ def __init__(self, dependencies): ), WholeTableNotNullAndNotNaNValidation( table="supply.egon_individual_heating", - rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_individual_heating" + rule_id="TABLE_NOT_NAN.egon_individual_heating" ), SRIDUniqueNonZero( table="supply.egon_individual_heating", diff --git a/src/egon/data/datasets/osm_buildings_streets/__init__.py b/src/egon/data/datasets/osm_buildings_streets/__init__.py index f24fcad85..a3ad8541c 100644 --- a/src/egon/data/datasets/osm_buildings_streets/__init__.py +++ b/src/egon/data/datasets/osm_buildings_streets/__init__.py @@ -221,18 +221,18 @@ def __init__(self, dependencies): "data_quality": [ RowCountValidation( table="boundaries.egon_map_zensus_buildings_filtered", - rule_id="TEST_ROW_COUNT.egon_map_zensus_buildings_filtered", + rule_id="ROW_COUNT.egon_map_zensus_buildings_filtered", expected_count={"Schleswig-Holstein":1010387, "Everything":28070301} ), DataTypeValidation( table="boundaries.egon_map_zensus_buildings_filtered", - rule_id="DATA_MULTIPLE_TYPES.egon_map_zensus_buildings_filtered", + rule_id="DATA_TYPES.egon_map_zensus_buildings_filtered", column_types={"id": "integer", "grid_id": "character varying", "cell_id": "integer"} ), WholeTableNotNullAndNotNaNValidation( table="boundaries.egon_map_zensus_buildings_filtered", - rule_id="WHOLE_TABLE_NOT_NAN.egon_map_zensus_buildings_filtered" + rule_id="TABLE_NOT_NAN.egon_map_zensus_buildings_filtered" ), RowCountValidation( table="boundaries.egon_map_zensus_buildings_residential", @@ -242,12 +242,12 @@ def __init__(self, dependencies): ), DataTypeValidation( table="boundaries.egon_map_zensus_buildings_residential", - 
rule_id="DATA_MULTIPLE_TYPES.egon_map_zensus_buildings_residential", + rule_id="DATA_TYPES.egon_map_zensus_buildings_residential", column_types={"id": "integer", "grid_id": "character varying", "cell_id": "integer"} ), WholeTableNotNullAndNotNaNValidation( table="boundaries.egon_map_zensus_buildings_residential", - rule_id="WHOLE_TABLE_NOT_NAN.egon_map_zensus_buildings_residential" + rule_id="TABLE_NOT_NAN.egon_map_zensus_buildings_residential" ), RowCountValidation( table="openstreetmap.osm_amenities_not_in_buildings", @@ -257,14 +257,14 @@ def __init__(self, dependencies): ), DataTypeValidation( table="openstreetmap.osm_amenities_not_in_buildings", - rule_id="DATA_MULTIPLE_TYPES.osm_amenities_not_in_buildings", + rule_id="DATA_TYPES.osm_amenities_not_in_buildings", column_types={ "osm_id": "bigint", "amenity": "text", "name": "text", "geom_amenity": "geometry", "tags": "hstore", "egon_amenity_id": "integer" } ), WholeTableNotNullAndNotNaNValidation( table="openstreetmap.osm_amenities_not_in_buildings", - rule_id="WHOLE_TABLE_NOT_NAN.osm_amenities_not_in_buildings" + rule_id="TABLE_NOT_NAN.osm_amenities_not_in_buildings" ), SRIDUniqueNonZero( table="openstreetmap.osm_amenities_not_in_buildings", @@ -278,14 +278,14 @@ def __init__(self, dependencies): ), DataTypeValidation( table="openstreetmap.osm_amenities_shops_filtered", - rule_id="DATA_MULTIPLE_TYPES.osm_amenities_shops_filtered", + rule_id="DATA_TYPES.osm_amenities_shops_filtered", column_types={ "osm_id": "bigint", "amenity": "text", "name": "text", "geom_amenity": "geometry", "tags": "hstore", "egon_amenity_id": "integer"} ), WholeTableNotNullAndNotNaNValidation( table="openstreetmap.osm_amenities_shops_filtered", - rule_id="WHOLE_TABLE_NOT_NAN.osm_amenities_shops_filtered" + rule_id="TABLE_NOT_NAN.osm_amenities_shops_filtered" ), SRIDUniqueNonZero( table="openstreetmap.osm_amenities_shops_filtered", @@ -299,7 +299,7 @@ def __init__(self, dependencies): ), DataTypeValidation( 
table="openstreetmap.osm_buildings", - rule_id="DATA_MULTIPLE_TYPES.osm_buildings", + rule_id="DATA_TYPES.osm_buildings", column_types={ "osm_id": "bigint", "amenity": "text", "building": "text", "name": "text", "geom_building": "geometry", "area": "double precision", "geom_point": "geometry", @@ -307,7 +307,7 @@ def __init__(self, dependencies): ), WholeTableNotNullAndNotNaNValidation( table="openstreetmap.osm_buildings", - rule_id="WHOLE_TABLE_NOT_NAN.osm_buildings" + rule_id="TABLE_NOT_NAN.osm_buildings" ), SRIDUniqueNonZero( table="openstreetmap.osm_buildings", @@ -326,7 +326,7 @@ def __init__(self, dependencies): ), DataTypeValidation( table="openstreetmap.osm_buildings_filtered", - rule_id="DATA_MULTIPLE_TYPES.osm_buildings_filtered", + rule_id="DATA_TYPES.osm_buildings_filtered", column_types={ "osm_id": "bigint", "amenity": "text", "building": "text", "name": "text", "geom_building": "geometry", "area": "double precision", "geom_point": "geometry", @@ -334,7 +334,7 @@ def __init__(self, dependencies): ), WholeTableNotNullAndNotNaNValidation( table="openstreetmap.osm_buildings_filtered", - rule_id="WHOLE_TABLE_NOT_NAN.osm_buildings_filtered" + rule_id="TABLE_NOT_NAN.osm_buildings_filtered" ), SRIDUniqueNonZero( table="openstreetmap.osm_buildings_filtered", @@ -353,7 +353,7 @@ def __init__(self, dependencies): ), DataTypeValidation( table="openstreetmap.osm_buildings_residential", - rule_id="DATA_MULTIPLE_TYPES.osm_buildings_residential", + rule_id="DATA_TYPES.osm_buildings_residential", column_types={ "osm_id": "bigint", "amenity": "text", "building": "text", "name": "text", "geom_building": "geometry", "area": "double precision", "geom_point": "geometry", @@ -361,7 +361,7 @@ def __init__(self, dependencies): ), WholeTableNotNullAndNotNaNValidation( table="openstreetmap.osm_buildings_residential", - rule_id="WHOLE_TABLE_NOT_NAN.osm_buildings_residential" + rule_id="TABLE_NOT_NAN.osm_buildings_residential" ), SRIDUniqueNonZero( 
table="openstreetmap.osm_buildings_residential", @@ -380,7 +380,7 @@ def __init__(self, dependencies): ), DataTypeValidation( table="openstreetmap.osm_buildings_synthetic", - rule_id="DATA_MULTIPLE_TYPES.osm_buildings_synthetic", + rule_id="DATA_TYPES.osm_buildings_synthetic", column_types={ "id": "character varying", "cell_id": "character varying", "geom_building": "geometry", "geom_point": "geometry", "n_amenities_inside": "integer", "building": "character varying", @@ -388,7 +388,7 @@ def __init__(self, dependencies): ), WholeTableNotNullAndNotNaNValidation( table="openstreetmap.osm_buildings_synthetic", - rule_id="WHOLE_TABLE_NOT_NAN.osm_buildings_synthetic" + rule_id="TABLE_NOT_NAN.osm_buildings_synthetic" ), SRIDUniqueNonZero( table="openstreetmap.osm_buildings_synthetic", @@ -407,7 +407,7 @@ def __init__(self, dependencies): ), DataTypeValidation( table="openstreetmap.osm_buildings_with_amenities", - rule_id="DATA_MULTIPLE_TYPES.osm_buildings_with_amenities", + rule_id="DATA_TYPES.osm_buildings_with_amenities", column_types={ "osm_id_amenity": "bigint", "osm_id_building": "bigint", @@ -425,7 +425,7 @@ def __init__(self, dependencies): ), WholeTableNotNullAndNotNaNValidation( table="openstreetmap.osm_buildings_with_amenities", - rule_id="WHOLE_TABLE_NOT_NAN.osm_buildings_with_amenities" + rule_id="TABLE_NOT_NAN.osm_buildings_with_amenities" ), SRIDUniqueNonZero( table="openstreetmap.osm_buildings_with_amenities", @@ -449,7 +449,7 @@ def __init__(self, dependencies): ), DataTypeValidation( table="openstreetmap.osm_buildings_without_amenities", - rule_id="DATA_MULTIPLE_TYPES.osm_buildings_without_amenities", + rule_id="DATA_TYPES.osm_buildings_without_amenities", column_types={ "osm_id": "bigint", "id": "integer", @@ -463,7 +463,7 @@ def __init__(self, dependencies): ), WholeTableNotNullAndNotNaNValidation( table="openstreetmap.osm_buildings_without_amenities", - rule_id="WHOLE_TABLE_NOT_NAN.osm_buildings_without_amenities" + 
rule_id="TABLE_NOT_NAN.osm_buildings_without_amenities" ), SRIDUniqueNonZero( table="openstreetmap.osm_buildings_without_amenities", @@ -482,7 +482,7 @@ def __init__(self, dependencies): ), DataTypeValidation( table="openstreetmap.osm_ways_with_segments", - rule_id="DATA_MULTIPLE_TYPES.osm_ways_with_segments", + rule_id="DATA_TYPES.osm_ways_with_segments", column_types={ "osm_id": "bigint", "nodes": "bigint[]", @@ -492,7 +492,7 @@ def __init__(self, dependencies): ), WholeTableNotNullAndNotNaNValidation( table="openstreetmap.osm_ways_with_segments", - rule_id="WHOLE_TABLE_NOT_NAN.osm_ways_with_segments" + rule_id="TABLE_NOT_NAN.osm_ways_with_segments" ), SRIDUniqueNonZero( table="openstreetmap.osm_ways_with_segments", diff --git a/src/egon/data/datasets/power_plants/__init__.py b/src/egon/data/datasets/power_plants/__init__.py index 19d184f4c..2fe95ede8 100755 --- a/src/egon/data/datasets/power_plants/__init__.py +++ b/src/egon/data/datasets/power_plants/__init__.py @@ -1637,12 +1637,12 @@ def __init__(self, dependencies): "data-quality": [ RowCountValidation( table="supply.egon_power_plants", - rule_id="TEST_ROW_COUNT.egon_power_plants", + rule_id="ROW_COUNT.egon_power_plants", expected_count={"Schleswig-Holstein":34828, "Everything": 1103} ), DataTypeValidation( table="supply.egon_power_plants", - rule_id="TEST_DATA_MULTIPLE_TYPES.egon_power_plants", + rule_id="DATA_TYPES.egon_power_plants", column_types={ "id": "bigint", "sources": "jsonb", @@ -1658,7 +1658,7 @@ def __init__(self, dependencies): ), NotNullAndNotNaNValidation( table="supply.egon_power_plants", - rule_id="TEST_NOT_NAN.egon_power_plants", + rule_id="NOT_NAN.egon_power_plants", columns=["id", "sources", "source_id", @@ -1672,7 +1672,7 @@ def __init__(self, dependencies): ), WholeTableNotNullAndNotNaNValidation( table="supply.egon_power_plants", - rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_power_plants" + rule_id="TABLE_NOT_NAN.egon_power_plants" ), ValueSetValidation( table="supply.egon_power_plants", 
diff --git a/src/egon/data/datasets/re_potential_areas/__init__.py b/src/egon/data/datasets/re_potential_areas/__init__.py index 5b02b2180..5edb489bb 100644 --- a/src/egon/data/datasets/re_potential_areas/__init__.py +++ b/src/egon/data/datasets/re_potential_areas/__init__.py @@ -164,12 +164,12 @@ def __init__(self, dependencies): "data-quality": [ RowCountValidation( table="supply.egon_re_potential_area_pv_agricultur", - rule_id="TEST_ROW_COUNT.egon_re_potential_area_pv_agricultur", + rule_id="ROW_COUNT.egon_re_potential_area_pv_agricultur", expected_count={"Schleswig-Holstein": 388, "Everything": 8259} ), DataTypeValidation( table="supply.egon_re_potential_area_pv_agricultur", - rule_id="TEST_DATA_MULTIPLE_TYPES.egon_re_potential_area_pv_agricultur", + rule_id="DATA_TYPES.egon_re_potential_area_pv_agricultur", column_types={ "id": "integer", "geom": "geometry" @@ -177,13 +177,13 @@ def __init__(self, dependencies): ), NotNullAndNotNaNValidation( table="supply.egon_re_potential_area_pv_agricultur", - rule_id="TEST_NOT_NAN.egon_re_potential_area_pv_agricultur", + rule_id="NOT_NAN.egon_re_potential_area_pv_agricultur", columns=["id", "geom"] ), WholeTableNotNullAndNotNaNValidation( table="supply.egon_re_potential_area_pv_agricultur", - rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_re_potential_area_pv_agricultur" + rule_id="WHOLE_TABLE_NOT_NAN.egon_re_potential_area_pv_agricultur" ), SRIDUniqueNonZero( table="supply.egon_re_potential_area_pv_agricultur", @@ -192,12 +192,12 @@ def __init__(self, dependencies): ), RowCountValidation( table="supply.egon_re_potential_area_pv_road_railway", - rule_id="TEST_ROW_COUNT.egon_re_potential_area_pv_road_railway", + rule_id="ROW_COUNT.egon_re_potential_area_pv_road_railway", expected_count={"Schleswig-Holstein": 479, "Everything": 5159} ), DataTypeValidation( table="supply.egon_re_potential_area_pv_road_railway", - rule_id="TEST_DATA_MULTIPLE_TYPES.egon_re_potential_area_pv_road_railway", + 
rule_id="DATA_TYPES.egon_re_potential_area_pv_road_railway", column_types={ "id": "integer", "geom": "geometry" @@ -205,13 +205,13 @@ def __init__(self, dependencies): ), NotNullAndNotNaNValidation( table="supply.egon_re_potential_area_pv_road_railway", - rule_id="TEST_NOT_NAN.egon_re_potential_area_pv_road_railway", + rule_id="NOT_NAN.egon_re_potential_area_pv_road_railway", columns=["id", "geom"] ), WholeTableNotNullAndNotNaNValidation( table="supply.egon_re_potential_area_pv_road_railway", - rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_re_potential_area_pv_road_railway" + rule_id="TABLE_NOT_NAN.egon_re_potential_area_pv_road_railway" ), SRIDUniqueNonZero( table="supply.egon_re_potential_area_pv_road_railway", @@ -220,12 +220,12 @@ def __init__(self, dependencies): ), RowCountValidation( table="supply.egon_re_potential_area_wind", - rule_id="TEST_ROW_COUNT.egon_re_potential_area_wind", + rule_id="ROW_COUNT.egon_re_potential_area_wind", expected_count={"Schleswig-Holstein": 6306, "Everything": 120268} ), DataTypeValidation( table="supply.egon_re_potential_area_wind", - rule_id="TEST_DATA_MULTIPLE_TYPES.egon_re_potential_area_wind", + rule_id="DATA_TYPES.egon_re_potential_area_wind", column_types={ "id": "integer", "geom": "geometry" @@ -233,13 +233,13 @@ def __init__(self, dependencies): ), NotNullAndNotNaNValidation( table="supply.egon_re_potential_area_wind", - rule_id="TEST_NOT_NAN.egon_re_potential_area_wind", + rule_id="NOT_NAN.egon_re_potential_area_wind", columns=["id", "geom"] ), WholeTableNotNullAndNotNaNValidation( table="supply.egon_re_potential_area_wind", - rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_re_potential_area_wind" + rule_id="TABLE_NOT_NAN.egon_re_potential_area_wind" ), SRIDUniqueNonZero( table="supply.egon_re_potential_area_wind", diff --git a/src/egon/data/datasets/renewable_feedin.py b/src/egon/data/datasets/renewable_feedin.py index e3fb58d03..78a6b7ff7 100644 --- a/src/egon/data/datasets/renewable_feedin.py +++ 
b/src/egon/data/datasets/renewable_feedin.py @@ -75,12 +75,12 @@ def __init__(self, dependencies): "data-quality": [ RowCountValidation( table="supply.egon_era5_renewable_feedin", - rule_id="TEST_ROW_COUNT.egon_renewable_feedin", + rule_id="ROW_COUNT.egon_renewable_feedin", expected_count=6102 ), DataTypeValidation( table="supply.egon_era5_renewable_feedin", - rule_id="TEST_DATA_MULTIPLE_TYPES.egon_era5_renewable_feedin", + rule_id="DATA_MULTIPLE_TYPES.egon_era5_renewable_feedin", column_types={ "w_id": "integer", "weather_year": "integer", @@ -90,12 +90,12 @@ def __init__(self, dependencies): ), NotNullAndNotNaNValidation( table="supply.egon_era5_renewable_feedin", - rule_id="TEST_NOT_NAN.egon_era5_renewable_feedin", + rule_id="NOT_NAN.egon_era5_renewable_feedin", columns=["w_id", "weather_year", "carrier", "feedin"] ), WholeTableNotNullAndNotNaNValidation( table="supply.egon_era5_renewable_feedin", - rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_era5_renewable_feedin" + rule_id="TABLE_NOT_NAN.egon_era5_renewable_feedin" ), ValueSetValidation( table="supply.egon_district_heating", diff --git a/src/egon/data/datasets/scenario_capacities.py b/src/egon/data/datasets/scenario_capacities.py index 912b023ae..612c002d9 100755 --- a/src/egon/data/datasets/scenario_capacities.py +++ b/src/egon/data/datasets/scenario_capacities.py @@ -1063,12 +1063,12 @@ def __init__(self, dependencies): "data-quality": [ RowCountValidation( table="supply.egon_nep_2021_conventional_powerplants", - rule_id="TEST_ROW_COUNT.egon_nep_2021_conventional_powerplants", + rule_id="ROW_COUNT.egon_nep_2021_conventional_powerplants", expected_count={"Schleswig-Holstein": 40, "Everything": 737} ), DataTypeValidation( table="supply.egon_nep_2021_conventional_powerplants", - rule_id="TEST_DATA_MULTIPLE_TYPES.egon_nep_2021_conventional_powerplants", + rule_id="DATA_TYPES.egon_nep_2021_conventional_powerplants", column_types={ "index": "bigint", "bnetza_id": "text", @@ -1095,7 +1095,7 @@ def __init__(self, 
dependencies): ), NotNullAndNotNaNValidation( table="supply.egon_nep_2021_conventional_powerplants", - rule_id="TEST_NOT_NAN.egon_nep_2021_conventional_powerplants", + rule_id="NOT_NAN.egon_nep_2021_conventional_powerplants", columns=[ "index", "bnetza_id", @@ -1122,16 +1122,16 @@ def __init__(self, dependencies): ), WholeTableNotNullAndNotNaNValidation( table="supply.egon_nep_2021_conventional_powerplants", - rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_nep_2021_conventional_powerplants" + rule_id="TABLE_NOT_NAN.egon_nep_2021_conventional_powerplants" ), RowCountValidation( table="supply.egon_scenario_capacities", - rule_id="TEST_ROW_COUNT.egon_scenario_capacities", + rule_id="ROW_COUNT.egon_scenario_capacities", expected_count={"Schleswig-Holstein": 17, "Everything": 236} ), DataTypeValidation( table="supply.egon_scenario_capacities", - rule_id="TEST_DATA_MULTIPLE_TYPES.egon_scenario_capacities", + rule_id="DATA_TYPES.egon_scenario_capacities", column_types={ "index": "integer", "component": "character varying", @@ -1143,7 +1143,7 @@ def __init__(self, dependencies): ), NotNullAndNotNaNValidation( table="supply.egon_scenario_capacities", - rule_id="TEST_NOT_NAN.egon_scenario_capacities", + rule_id="NOT_NAN.egon_scenario_capacities", columns=[ "index", "component", @@ -1155,7 +1155,7 @@ def __init__(self, dependencies): ), WholeTableNotNullAndNotNaNValidation( table="supply.egon_scenario_capacities", - rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_scenario_capacities" + rule_id="TABLE_NOT_NAN.egon_scenario_capacities" ), ValueSetValidation( table="supply.egon_scenario_capacities", diff --git a/src/egon/data/datasets/society_prognosis.py b/src/egon/data/datasets/society_prognosis.py index 256adf4fa..d916aa1cf 100755 --- a/src/egon/data/datasets/society_prognosis.py +++ b/src/egon/data/datasets/society_prognosis.py @@ -33,42 +33,42 @@ def __init__(self, dependencies): "data-quality":[ RowCountValidation( table="society.egon_household_prognosis", - 
rule_id="TEST_ROW_COUNT.egon_household_prognosis", + rule_id="ROW_COUNT.egon_household_prognosis", expected_count={"Everything": 5319490} ), DataTypeValidation( table="society.egon_household_prognosis", - rule_id="TEST_DATA_MULTIPLE_TYPES.egon_household_prognosis", + rule_id="DATA_TYPES.egon_household_prognosis", column_types={"zensus_population_id": "integer", "year": "integer", "households": "double precision"} ), NotNullAndNotNaNValidation( table="society.egon_household_prognosis", - rule_id="TEST_NOT_NAN.egon_household_prognosis", + rule_id="NOT_NAN.egon_household_prognosis", columns=["zensus_population_id", "year", "households"] ), WholeTableNotNullAndNotNaNValidation( table="society.egon_household_prognosis", - rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_household_prognosis" + rule_id="TABLE_NOT_NAN.egon_household_prognosis" ), RowCountValidation( table="society.egon_population_prognosis", - rule_id="TEST_ROW_COUNT.egon_population_prognosis", + rule_id="ROW_COUNT.egon_population_prognosis", expected_count={"Everything": 6355446} ), DataTypeValidation( table="society.egon_population_prognosis", - rule_id="TEST_DATA_MULTIPLE_TYPES.egon_population_prognosis", + rule_id="DATA_TYPES.egon_population_prognosis", column_types={"zensus_population_id": "integer", "year": "integer", "population": "double precision"} ), NotNullAndNotNaNValidation( table="society.egon_population_prognosis", - rule_id="TEST_NOT_NAN.egon_population_prognosis", + rule_id="NOT_NAN.egon_population_prognosis", columns=["zensus_population_id", "year", "population"] ), WholeTableNotNullAndNotNaNValidation( table="society.egon_population_prognosis", - rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_population_prognosis" + rule_id="TABLE_NOT_NAN.egon_population_prognosis" ), ] }, diff --git a/src/egon/data/datasets/storages/__init__.py b/src/egon/data/datasets/storages/__init__.py index 42b502e17..2b163ccb1 100755 --- a/src/egon/data/datasets/storages/__init__.py +++ b/src/egon/data/datasets/storages/__init__.py 
@@ -123,12 +123,12 @@ def __init__(self, dependencies): ), RowCountValidation( table="supply.egon_storages", - rule_id="TEST_ROW_COUNT.egon_storages", + rule_id="ROW_COUNT.egon_storages", expected_count={"Schleswig-Holstein": 290, "Everything": 7748} ), DataTypeValidation( table="supply.egon_storages", - rule_id="TEST_DATA_MULTIPLE_TYPES.egon_storages", + rule_id="DATA_TYPES.egon_storages", column_types={ "id": "bigint", "sources": "jsonb", @@ -143,7 +143,7 @@ def __init__(self, dependencies): ), NotNullAndNotNaNValidation( table="supply.egon_storages", - rule_id="TEST_NOT_NAN.egon_storages", + rule_id="NOT_NAN.egon_storages", columns=[ "id", "sources", @@ -158,7 +158,7 @@ def __init__(self, dependencies): ), WholeTableNotNullAndNotNaNValidation( table="supply.egon_storages", - rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_storages" + rule_id="TABLE_NOT_NAN.egon_storages" ), ValueSetValidation( table="supply.egon_storages", diff --git a/src/egon/data/datasets/substation/__init__.py b/src/egon/data/datasets/substation/__init__.py index 00d9b8606..3144ff174 100644 --- a/src/egon/data/datasets/substation/__init__.py +++ b/src/egon/data/datasets/substation/__init__.py @@ -102,7 +102,7 @@ def __init__(self, dependencies): # "": [ # RowCountValidation( # table=".", - # rule_id="TEST_ROW_COUNT.", + # rule_id="ROW_COUNT.", # expected_count={"Schleswig-Holstein": X, "Everything": Y} # ), # ] diff --git a/src/egon/data/datasets/vg250/__init__.py b/src/egon/data/datasets/vg250/__init__.py index 7ac0106aa..20612dad5 100644 --- a/src/egon/data/datasets/vg250/__init__.py +++ b/src/egon/data/datasets/vg250/__init__.py @@ -541,12 +541,12 @@ def __init__(self, dependencies): "data_quality": [ RowCountValidation( table="boundaries.vg250_krs", - rule_id="TEST_ROW_COUNT.vg250_krs", + rule_id="ROW_COUNT.vg250_krs", expected_count={"Schleswig-Holstein":27, "Everything":431} ), DataTypeValidation( table="boundaries.vg250_krs", - rule_id="TEST_DATA_MULTIPLE_TYPES.vg250_krs", + 
rule_id="DATA_TYPES.vg250_krs", column_types={"Schleswig-Holstein":{"id":"bigint","ade":"integer", "gf":"integer", "bsg":"integer","ars":"text", "ags":"text", "sdv_ars":"text", "gen":"text", "bez":"text","ibz":"integer", "bem":"text", "nbd":"text", "sn_l":"text", "sn_r":"text", "sn_k":"text", @@ -563,12 +563,12 @@ def __init__(self, dependencies): ), NotNullAndNotNaNValidation( table="boundaries.vg250_krs", - rule_id="TEST_NOT_NAN.vg250_krs", + rule_id="NOT_NAN.vg250_krs", columns=["gf","bsg"] ), WholeTableNotNullAndNotNaNValidation( table="boundaries.vg250_krs", - rule_id="TEST_WHOLE_TABLE_NOT_NAN.vg250_krs" + rule_id="TABLE_NOT_NAN.vg250_krs" ), SRIDUniqueNonZero( table="boundaries.vg250_krs", @@ -577,18 +577,18 @@ def __init__(self, dependencies): ), ValueSetValidation( table="boundaries.vg250_krs", - rule_id="TEST_VALUE_SET_NBD.vg250_krs", + rule_id="VALUE_SET_NBD.vg250_krs", column="nbd", expected_values=["ja", "nein"] ), RowCountValidation( table="society.destatis_zensus_population_per_ha_inside_germany", - rule_id="TEST_ROW_COUNT.destatis_zensus_population_per_ha_inside_germany", + rule_id="ROW_COUNT.destatis_zensus_population_per_ha_inside_germany", expected_count={"Schleswig-Holstein": 143521, "Everything": 3177723} ), DataTypeValidation( table="society.destatis_zensus_population_per_ha_inside_germany", - rule_id="TEST_DATA_MULTIPLE_TYPES.destatis_zensus_population_per_ha_inside_germany", + rule_id="DATA_TYPES.destatis_zensus_population_per_ha_inside_germany", column_types={ "id": "integer", "grid_id": "character varying (254)", "population": "smallint", "geom_point": "geometry","geom": "geometry" @@ -596,12 +596,12 @@ def __init__(self, dependencies): ), NotNullAndNotNaNValidation( table="society.destatis_zensus_population_per_ha_inside_germany", - rule_id="TEST_NOT_NAN.destatis_zensus_population_per_ha_inside_germany", + rule_id="NOT_NAN.destatis_zensus_population_per_ha_inside_germany", columns=["id", "grid_id", "population", "geom_point", "geom"] ), 
WholeTableNotNullAndNotNaNValidation( table="society.destatis_zensus_population_per_ha_inside_germany", - rule_id="TEST_WHOLE_TABLE_NOT_NAN.destatis_zensus_population_per_ha_inside_germany" + rule_id="TABLE_NOT_NAN.destatis_zensus_population_per_ha_inside_germany" ), SRIDUniqueNonZero( table="society.destatis_zensus_population_per_ha_inside_germany", diff --git a/src/egon/data/datasets/zensus/__init__.py b/src/egon/data/datasets/zensus/__init__.py index 6344ee63a..6012b1ddf 100755 --- a/src/egon/data/datasets/zensus/__init__.py +++ b/src/egon/data/datasets/zensus/__init__.py @@ -40,12 +40,12 @@ def __init__(self, dependencies): "data-quality":[ RowCountValidation( table="society.egon_destatis_zensus_apartment_building_population_per_ha", - rule_id="TEST_ROW_COUNT.egon_destatis_zensus_apartment_building_population_per_ha", + rule_id="ROW_COUNT.egon_destatis_zensus_apartment_building_population_per_ha", expected_count={"Schleswig-Holstein": 145634, "Everything": 3206490} ), DataTypeValidation( table="society.egon_destatis_zensus_apartment_building_population_per_ha", - rule_id="TEST_DATA_MULTIPLE_TYPES.egon_destatis_zensus_apartment_building_population_per_ha", + rule_id="DATA_TYPES.egon_destatis_zensus_apartment_building_population_per_ha", column_types={ "grid_id": "character varying", "zensus_population_id": "integer", "building_count": "smallint", "apartment_count": "smallint", "geom": "geometry", "geom_point": "geometry" @@ -53,12 +53,12 @@ def __init__(self, dependencies): ), NotNullAndNotNaNValidation( table="society.egon_destatis_zensus_apartment_building_population_per_ha", - rule_id="TEST_NOT_NAN.egon_destatis_zensus_apartment_building_population_per_ha", + rule_id="NOT_NAN.egon_destatis_zensus_apartment_building_population_per_ha", columns=["grid_id", "zensus_population_id", "building_count", "apartment_count", "geom", "geom_point"] ), WholeTableNotNullAndNotNaNValidation( table="society.egon_destatis_zensus_apartment_building_population_per_ha", - 
rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_destatis_zensus_apartment_building_population_per_ha" + rule_id="TABLE_NOT_NAN.egon_destatis_zensus_apartment_building_population_per_ha" ), SRIDUniqueNonZero( table="society.egon_destatis_zensus_apartment_building_population_per_ha", @@ -91,12 +91,12 @@ def __init__(self, dependencies): RowCountValidation( table="society.egon_destatis_zensus_apartment_per_ha", - rule_id="TEST_ROW_COUNT.egon_destatis_zensus_apartment_per_ha", + rule_id="ROW_COUNT.egon_destatis_zensus_apartment_per_ha", expected_count={"Schleswig-Holstein": 1946300, "Everything": 51095280} ), DataTypeValidation( table="society.egon_destatis_zensus_apartment_per_ha", - rule_id="TEST_DATA_MULTIPLE_TYPES.egon_destatis_zensus_apartment_per_ha", + rule_id="DATA_TYPES.egon_destatis_zensus_apartment_per_ha", column_types={ "id": "integer", "grid_id": "character varying", "grid_id_new": "character varying", "attribute": "character varying", "characteristics_code": "smallint", @@ -106,7 +106,7 @@ def __init__(self, dependencies): ), NotNullAndNotNaNValidation( table="society.egon_destatis_zensus_apartment_per_ha", - rule_id="TEST_NOT_NAN.egon_destatis_zensus_apartment_per_ha", + rule_id="NOT_NAN.egon_destatis_zensus_apartment_per_ha", columns=[ "id", "grid_id", "grid_id_new", "attribute", "characteristics_code", "characteristics_text", "quantity", "quantity_q", "zensus_population_id" @@ -114,16 +114,16 @@ def __init__(self, dependencies): ), WholeTableNotNullAndNotNaNValidation( table="society.egon_destatis_zensus_apartment_per_ha", - rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_destatis_zensus_apartment_per_ha" + rule_id="TABLE_NOT_NAN.egon_destatis_zensus_apartment_per_ha" ), RowCountValidation( table="society.egon_destatis_zensus_building_per_ha", - rule_id="TEST_ROW_COUNT.egon_destatis_zensus_building_per_ha", + rule_id="ROW_COUNT.egon_destatis_zensus_building_per_ha", expected_count={"Schleswig-Holstein": 978493, "Everything": 24297136} ), DataTypeValidation( 
table="society.egon_destatis_zensus_building_per_ha", - rule_id="TEST_DATA_MULTIPLE_TYPES.egon_destatis_zensus_building_per_ha", + rule_id="DATA_TYPES.egon_destatis_zensus_building_per_ha", column_types={ "id": "integer", "grid_id": "character varying", @@ -138,7 +138,7 @@ def __init__(self, dependencies): ), NotNullAndNotNaNValidation( table="society.egon_destatis_zensus_building_per_ha", - rule_id="TEST_NOT_NAN.egon_destatis_zensus_building_per_ha", + rule_id="NOT_NAN.egon_destatis_zensus_building_per_ha", columns=[ "id", "grid_id", @@ -153,16 +153,16 @@ def __init__(self, dependencies): ), WholeTableNotNullAndNotNaNValidation( table="society.egon_destatis_zensus_building_per_ha", - rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_destatis_zensus_building_per_ha" + rule_id="TABLE_NOT_NAN.egon_destatis_zensus_building_per_ha" ), RowCountValidation( table="society.egon_destatis_zensus_household_per_ha", - rule_id="TEST_ROW_COUNT.egon_destatis_zensus_household_per_ha", + rule_id="ROW_COUNT.egon_destatis_zensus_household_per_ha", expected_count={"Schleswig-Holstein": 724970, "Everything": 18788917} ), DataTypeValidation( table="society.egon_destatis_zensus_household_per_ha", - rule_id="TEST_DATA_MULTIPLE_TYPES.egon_destatis_zensus_household_per_ha", + rule_id="DATA_TYPES.egon_destatis_zensus_household_per_ha", column_types={ "id": "integer", "grid_id": "character varying", @@ -177,7 +177,7 @@ def __init__(self, dependencies): ), NotNullAndNotNaNValidation( table="society.egon_destatis_zensus_household_per_ha", - rule_id="TEST_NOT_NAN.egon_destatis_zensus_household_per_ha", + rule_id="NOT_NAN.egon_destatis_zensus_household_per_ha", columns=[ "id", "grid_id", @@ -192,16 +192,16 @@ def __init__(self, dependencies): ), WholeTableNotNullAndNotNaNValidation( table="society.egon_destatis_zensus_household_per_ha", - rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_destatis_zensus_household_per_ha" + rule_id="TABLE_NOT_NAN.egon_destatis_zensus_household_per_ha" ), RowCountValidation( 
table="society.egon_destatis_zensus_household_per_ha_refined", - rule_id="TEST_ROW_COUNT.egon_destatis_zensus_household_per_ha_refined", + rule_id="ROW_COUNT.egon_destatis_zensus_household_per_ha_refined", expected_count={"Schleswig-Holstein": 551678, "Everything": 13304814} ), DataTypeValidation( table="society.egon_destatis_zensus_household_per_ha_refined", - rule_id="TEST_DATA_MULTIPLE_TYPES.egon_destatis_zensus_household_per_ha_refined", + rule_id="DATA_TYPES.egon_destatis_zensus_household_per_ha_refined", column_types={ "id": "integer", "cell_id": "integer", @@ -216,7 +216,7 @@ def __init__(self, dependencies): ), NotNullAndNotNaNValidation( table="society.egon_destatis_zensus_household_per_ha_refined", - rule_id="TEST_NOT_NAN.egon_destatis_zensus_household_per_ha_refined", + rule_id="NOT_NAN.egon_destatis_zensus_household_per_ha_refined", columns=[ "id", "cell_id", @@ -231,7 +231,7 @@ def __init__(self, dependencies): ), WholeTableNotNullAndNotNaNValidation( table="society.egon_destatis_zensus_household_per_ha_refined", - rule_id="TEST_WHOLE_TABLE_NOT_NAN.egon_destatis_zensus_household_per_ha_refined" + rule_id="TABLE_NOT_NAN.egon_destatis_zensus_household_per_ha_refined" ), ] }, diff --git a/src/egon/data/validation_utils.py b/src/egon/data/validation_utils.py index e165c99d1..9c5130c59 100644 --- a/src/egon/data/validation_utils.py +++ b/src/egon/data/validation_utils.py @@ -95,7 +95,7 @@ def create_validation_tasks( ... "data_quality": [ ... RowCountValidation( ... table="boundaries.vg250_krs", - ... rule_id="TEST_ROW_COUNT", + ... rule_id="ROW_COUNT", ... expected_count={"Schleswig-Holstein": 27, "Everything": 537} ... ) ... 
] From 65c1d707b69e634a104a09e5882fd6289d52a3ce Mon Sep 17 00:00:00 2001 From: sarah Date: Tue, 20 Jan 2026 15:14:57 +0100 Subject: [PATCH 44/54] remove .dev in final_validations.py --- src/egon/data/datasets/final_validations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/egon/data/datasets/final_validations.py b/src/egon/data/datasets/final_validations.py index 069f821ef..01dadff92 100644 --- a/src/egon/data/datasets/final_validations.py +++ b/src/egon/data/datasets/final_validations.py @@ -98,7 +98,7 @@ class FinalValidations(Dataset): #: name: str = "FinalValidations" #: - version: str = "0.0.1.dev" + version: str = "0.0.1" def __init__(self, dependencies): super().__init__( From 0860b8ab2189393c78979326cd6dfb7c55674255 Mon Sep 17 00:00:00 2001 From: sarah Date: Wed, 21 Jan 2026 13:38:01 +0100 Subject: [PATCH 45/54] add table first validation --- src/egon/data/datasets/vg250/__init__.py | 21 +++ src/egon/data/validation_utils.py | 201 ++++++++++++++++++----- 2 files changed, 182 insertions(+), 40 deletions(-) diff --git a/src/egon/data/datasets/vg250/__init__.py b/src/egon/data/datasets/vg250/__init__.py index 20612dad5..5d7bc8230 100644 --- a/src/egon/data/datasets/vg250/__init__.py +++ b/src/egon/data/datasets/vg250/__init__.py @@ -29,6 +29,7 @@ meta_metadata, ) import egon.data.config +from .validation_utils import TableValidation from egon_validation import ( RowCountValidation, DataTypeValidation, @@ -539,6 +540,26 @@ def __init__(self, dependencies): ), validation={ "data_quality": [ + TableValidation( + table_name="boundaries.vg250_krs", + row_count={"Schleswig-Holstein": 27, "Everything": 537}, + geometry_columns=["geometry"], + data_type_columns={"Schleswig-Holstein":{"id":"bigint","ade":"integer", "gf":"integer", "bsg":"integer","ars":"text", + "ags":"text", "sdv_ars":"text", "gen":"text", "bez":"text","ibz":"integer", + "bem":"text", "nbd":"text", "sn_l":"text", "sn_r":"text", "sn_k":"text", + "sn_v1":"text", "sn_v2":"text", 
"sn_g":"text", "fk_s3":"text", "nuts":"text", + "ars_0":"text", "ags_0":"text", "wsk":"timestamp without time zone", "debkg_id":"text", "rs":"text", + "sdv_rs":"text", "rs_0":"text", "geometry":"geometry"}, + "Everything":{"id":"bigint","ade":"bigint", "gf":"bigint", "bsg":"bigint","ars":"text", + "ags":"text", "sdv_ars":"text", "gen":"text", "bez":"text","ibz":"bigint", + "bem":"text", "nbd":"text", "sn_l":"text", "sn_r":"text", "sn_k":"text", + "sn_v1":"text", "sn_v2":"text", "sn_g":"text", "fk_s3":"text", "nuts":"text", + "ars_0":"text", "ags_0":"text", "wsk":"text", "debkg_id":"text", "rs":"text", + "sdv_rs":"text", "rs_0":"text", "geometry":"geometry"} + }, + not_null_columns=["gf", "bsg"], + value_set_columns={"nbd": ["ja", "nein"]}, + ), RowCountValidation( table="boundaries.vg250_krs", rule_id="ROW_COUNT.vg250_krs", diff --git a/src/egon/data/validation_utils.py b/src/egon/data/validation_utils.py index 9c5130c59..fd1f9d882 100644 --- a/src/egon/data/validation_utils.py +++ b/src/egon/data/validation_utils.py @@ -1,14 +1,154 @@ -"""Airflow integration for egon-validation.""" +"""Airflow integration for egon-validation. + +This module supports two configuration styles: + +1) Backwards compatible "rule-first": + validation_dict = {"task": [Rule(...), Rule(...)]} + +2) New "table-first": + validation_dict = {"task": [TableValidation(...), TableValidation(...)]} + +Both styles can be mixed in the same list. 
+""" + +from __future__ import annotations + +import copy +import logging +from dataclasses import dataclass +from typing import Any, Dict, List, Mapping, Optional, Sequence, Union -from typing import Any, Dict, List from airflow.operators.python import PythonOperator from egon_validation import run_validations, RunContext from egon_validation.rules.base import Rule import logging +from egon_validation import ( # noqa: F401 + DataTypeValidation, + NotNullAndNotNaNValidation, + RowCountValidation, + SRIDUniqueNonZero, + ValueSetValidation, + WholeTableNotNullAndNotNaNValidation, +) + logger = logging.getLogger(__name__) +@dataclass(frozen=True, slots=True) +class TableValidation: + """ + Table-first validation specification. + + Properties you asked for: + - table_name + - row_count + - geometry_columns + - data_type_columns + - not_null_columns + - value_set_columns + + Behavior: + - Generates rule_ids exactly like your manual convention: + ROW_COUNT. + DATA_TYPES. + NOT_NAN. + TABLE_NOT_NAN. <-- always added automatically + SRIDUniqueNonZero.. + VALUE_SET_. + - Boundary-dependent dict values are preserved and resolved later in _resolve_rule_params(). 
+ """ + + table_name: str + + row_count: Optional[Any] = None + geometry_columns: Optional[Sequence[str]] = None + data_type_columns: Optional[Mapping[str, Any]] = None + not_null_columns: Optional[Sequence[str]] = None + value_set_columns: Optional[Mapping[str, Any]] = None + + def to_rules(self) -> List[Rule]: + rules: List[Rule] = [] + table_suffix = self.table_name.split(".")[-1] + + # 1) Row count + if self.row_count is not None: + rules.append( + RowCountValidation( + table=self.table_name, + rule_id=f"ROW_COUNT.{table_suffix}", + expected_count=self.row_count, + ) + ) + + # 2) Data types + if self.data_type_columns is not None: + rules.append( + DataTypeValidation( + table=self.table_name, + rule_id=f"DATA_TYPES.{table_suffix}", + column_types=dict(self.data_type_columns), + ) + ) + + # 3) Column-level not-null / not-NaN + if self.not_null_columns: + rules.append( + NotNullAndNotNaNValidation( + table=self.table_name, + rule_id=f"NOT_NAN.{table_suffix}", + columns=list(self.not_null_columns), + ) + ) + + # 4) Geometry checks (one rule per geometry column) + if self.geometry_columns: + for geom_col in self.geometry_columns: + rules.append( + SRIDUniqueNonZero( + table=self.table_name, + rule_id=f"SRIDUniqueNonZero.{table_suffix}.{geom_col}", + column=geom_col, + ) + ) + + # 5) Value sets (one rule per column) + if self.value_set_columns: + for col_name, expected_values in self.value_set_columns.items(): + rules.append( + ValueSetValidation( + table=self.table_name, + rule_id=f"VALUE_SET_{str(col_name).upper()}.{table_suffix}", + column=str(col_name), + expected_values=expected_values, + ) + ) + + # 6) Whole-table not-null / not-NaN (automatic, as requested) + rules.append( + WholeTableNotNullAndNotNaNValidation( + table=self.table_name, + rule_id=f"TABLE_NOT_NAN.{table_suffix}", + ) + ) + + return rules + + +ValidationSpec = Union[Rule, TableValidation] + + +def _expand_specs(specs: Sequence[ValidationSpec]) -> List[Rule]: + """Turn a mixed list of 
Rule/TableValidation into a flat list of Rule.""" + expanded: List[Rule] = [] + for spec in specs: + if isinstance(spec, TableValidation): + expanded.extend(spec.to_rules()) + else: + expanded.append(spec) + return expanded + + def _resolve_context_value(value: Any, boundary: str) -> Any: """Resolve a value that may be boundary-dependent. @@ -69,46 +209,22 @@ def _resolve_rule_params(rule: Rule, boundary: str) -> None: rule.params[param_name] = resolved_value def create_validation_tasks( - validation_dict: Dict[str, List[Rule]], + validation_dict: Dict[str, Sequence[ValidationSpec]], dataset_name: str, on_failure: str = "continue" ) -> List[PythonOperator]: """Convert validation dict to Airflow tasks. - Automatically resolves boundary-dependent parameters in validation rules. - Parameters can be specified as dicts with boundary keys: - - - Boundary-dependent: {"Schleswig-Holstein": 27, "Everything": 537} - - The appropriate value is selected based on the current configuration. - - Args: - validation_dict: {"task_name": [Rule1(), Rule2()]} - dataset_name: Name of dataset - on_failure: "continue" or "fail" - - Returns: - List of PythonOperator tasks - - Example: - >>> validation_dict = { - ... "data_quality": [ - ... RowCountValidation( - ... table="boundaries.vg250_krs", - ... rule_id="ROW_COUNT", - ... expected_count={"Schleswig-Holstein": 27, "Everything": 537} - ... ) - ... ] - ... } - >>> tasks = create_validation_tasks(validation_dict, "VG250") + Values can be List[Rule], values can be List[TableValidation] or mixed. 
""" if not validation_dict: return [] - tasks = [] + tasks: List[PythonOperator] = [] - for task_name, rules in validation_dict.items(): - def make_callable(rules, task_name): + for task_name, specs in validation_dict.items(): + + def make_callable(specs: Sequence[ValidationSpec], task_name: str): def run_validation(**context): import os import time @@ -116,14 +232,17 @@ def run_validation(**context): from egon.data import db as egon_db from egon.data.config import settings - # Use same run_id as validation report for consistency - # This allows the validation report to collect results from all validation tasks + # Run id selection (unchanged logic) run_id = ( - os.environ.get('AIRFLOW_CTX_DAG_RUN_ID') or - context.get('run_id') or - (context.get('ti') and hasattr(context['ti'], 'dag_run') and context['ti'].dag_run.run_id) or - (context.get('dag_run') and context['dag_run'].run_id) or - f"airflow-{dataset_name}-{task_name}-{int(time.time())}" + os.environ.get("AIRFLOW_CTX_DAG_RUN_ID") + or context.get("run_id") + or ( + context.get("ti") + and hasattr(context["ti"], "dag_run") + and context["ti"].dag_run.run_id + ) + or (context.get("dag_run") and context["dag_run"].run_id) + or f"airflow-{dataset_name}-{task_name}-{int(time.time())}" ) # Use absolute path to ensure consistent location regardless of working directory @@ -150,6 +269,8 @@ def run_validation(**context): logger.info(f"Resolving validation parameters for boundary='{boundary}'") + rules: List[Rule] = copy.deepcopy(_expand_specs(specs)) + # Set task and dataset on all rules (required by Rule base class) # Also resolve boundary-dependent parameters for rule in rules: @@ -176,7 +297,7 @@ def run_validation(**context): return run_validation - func = make_callable(rules, task_name) + func = make_callable(specs, task_name) func.__name__ = f"validate_{task_name}" operator = PythonOperator( From 28d794f3baaeee40b91931fdf83dd728f4917337 Mon Sep 17 00:00:00 2001 From: sarah Date: Wed, 21 Jan 2026 14:54:08 +0100 
Subject: [PATCH 46/54] move functionality from validation_utils to different files --- src/egon/data/datasets/vg250/__init__.py | 6 +- src/egon/data/validation/__init__.py | 46 ++++ src/egon/data/validation/airflow.py | 125 +++++++++ src/egon/data/validation/resolver.py | 53 ++++ src/egon/data/validation/specs.py | 194 ++++++++++++++ src/egon/data/validation_utils.py | 311 ----------------------- 6 files changed, 421 insertions(+), 314 deletions(-) create mode 100644 src/egon/data/validation/__init__.py create mode 100644 src/egon/data/validation/airflow.py create mode 100644 src/egon/data/validation/resolver.py create mode 100644 src/egon/data/validation/specs.py delete mode 100644 src/egon/data/validation_utils.py diff --git a/src/egon/data/datasets/vg250/__init__.py b/src/egon/data/datasets/vg250/__init__.py index 5d7bc8230..f9a8118ab 100644 --- a/src/egon/data/datasets/vg250/__init__.py +++ b/src/egon/data/datasets/vg250/__init__.py @@ -29,7 +29,7 @@ meta_metadata, ) import egon.data.config -from .validation_utils import TableValidation +from egon.data.validation import TableValidation, resolve_boundary_dependence from egon_validation import ( RowCountValidation, DataTypeValidation, @@ -542,7 +542,7 @@ def __init__(self, dependencies): "data_quality": [ TableValidation( table_name="boundaries.vg250_krs", - row_count={"Schleswig-Holstein": 27, "Everything": 537}, + row_count=resolve_boundary_dependence({"Schleswig-Holstein": 27, "Everything": 537}), geometry_columns=["geometry"], data_type_columns={"Schleswig-Holstein":{"id":"bigint","ade":"integer", "gf":"integer", "bsg":"integer","ars":"text", "ags":"text", "sdv_ars":"text", "gen":"text", "bez":"text","ibz":"integer", @@ -563,7 +563,7 @@ def __init__(self, dependencies): RowCountValidation( table="boundaries.vg250_krs", rule_id="ROW_COUNT.vg250_krs", - expected_count={"Schleswig-Holstein":27, "Everything":431} + expected_count=resolve_boundary_dependence({"Schleswig-Holstein":27, "Everything":431}) ), 
DataTypeValidation( table="boundaries.vg250_krs", diff --git a/src/egon/data/validation/__init__.py b/src/egon/data/validation/__init__.py new file mode 100644 index 000000000..7e7145e0e --- /dev/null +++ b/src/egon/data/validation/__init__.py @@ -0,0 +1,46 @@ +""" +Validation framework for egon-data. + +Supports two configuration styles (can be mixed): + +1) "rule-first": + validation_dict = {"task_name": [Rule(...), Rule(...)]} + +2) "table-first": + validation_dict = {"task_name": [TableValidation(...), TableValidation(...)]} +""" + +from .resolver import ( + BoundaryDependent, + resolve_boundary_dependence, + resolve_value, +) +from .specs import ( + TableValidation, + ValidationSpec, + clone_rule, + expand_specs, + prepare_rules, + resolve_rule_params, +) +from .airflow import ( + create_validation_tasks, + run_validation_task, +) + +__all__ = [ + # resolver + "BoundaryDependent", + "resolve_boundary_dependence", + "resolve_value", + # specs + "TableValidation", + "ValidationSpec", + "clone_rule", + "expand_specs", + "prepare_rules", + "resolve_rule_params", + # airflow + "create_validation_tasks", + "run_validation_task", +] \ No newline at end of file diff --git a/src/egon/data/validation/airflow.py b/src/egon/data/validation/airflow.py new file mode 100644 index 000000000..ba420c4aa --- /dev/null +++ b/src/egon/data/validation/airflow.py @@ -0,0 +1,125 @@ +"""Airflow integration for validation tasks.""" + +from __future__ import annotations + +import logging +from functools import partial +from typing import Any, Dict, List, Sequence + +from airflow.operators.python import PythonOperator +from egon_validation import RunContext, run_validations + +from .specs import ValidationSpec, prepare_rules + +logger = logging.getLogger(__name__) + + +def run_validation_task( + *, + specs: Sequence[ValidationSpec], + task_name: str, + dataset_name: str, + on_failure: str, + **context: Any, +) -> Dict[str, int]: + """ + This is the function Airflow actually calls. 
+ + It's top-level (not nested), so: + - easier to test + - easier stack traces + - fewer closure surprises + """ + import os + import time + from datetime import datetime + from egon.data import db as egon_db + from egon.data.config import settings + + # Consistent run_id across tasks so reports can correlate results + run_id = ( + os.environ.get("AIRFLOW_CTX_DAG_RUN_ID") + or context.get("run_id") + or ( + context.get("ti") + and hasattr(context["ti"], "dag_run") + and context["ti"].dag_run.run_id + ) + or (context.get("dag_run") and context["dag_run"].run_id) + or f"airflow-{dataset_name}-{task_name}-{int(time.time())}" + ) + + out_dir = os.path.join( + os.environ.get("EGON_VALIDATION_DIR", os.getcwd()), + "validation_runs", + ) + + execution_date = context.get("execution_date") or datetime.now() + timestamp = execution_date.strftime("%Y%m%dT%H%M%S") + full_task_name = f"{dataset_name}.{task_name}.{timestamp}" + + logger.info("Validation: %s (run_id: %s)", full_task_name, run_id) + + engine = egon_db.engine() + + config = settings()["egon-data"] + boundary = config["--dataset-boundary"] + logger.info("Resolving validation parameters for boundary='%s'", boundary) + + rules = prepare_rules( + specs=specs, + boundary=boundary, + dataset_name=dataset_name, + task_name=task_name, + ) + + ctx = RunContext(run_id=run_id, source="airflow", out_dir=out_dir) + results = run_validations(engine, ctx, rules, full_task_name) + + total = len(results) + failed = sum(1 for r in results if not r.success) + + logger.info("Complete: %s/%s passed", total - failed, total) + + if failed > 0 and on_failure == "fail": + raise Exception(f"{failed}/{total} validations failed") + + return {"total": total, "passed": total - failed, "failed": failed} + + +def create_validation_tasks( + validation_dict: Dict[str, Sequence[ValidationSpec]], + dataset_name: str, + on_failure: str = "continue", +) -> List[PythonOperator]: + """ + Creates one PythonOperator per task_name in validation_dict. 
+ + - values can still be List[Rule] + - values can be List[TableValidation] + + Mixed lists also work. + """ + if not validation_dict: + return [] + + tasks: List[PythonOperator] = [] + + for task_name, specs in validation_dict.items(): + callable_for_airflow = partial( + run_validation_task, + specs=specs, + task_name=task_name, + dataset_name=dataset_name, + on_failure=on_failure, + ) + + tasks.append( + PythonOperator( + task_id=f"{dataset_name}.validate.{task_name}", + python_callable=callable_for_airflow, + provide_context=True, + ) + ) + + return tasks \ No newline at end of file diff --git a/src/egon/data/validation/resolver.py b/src/egon/data/validation/resolver.py new file mode 100644 index 000000000..690da6e3e --- /dev/null +++ b/src/egon/data/validation/resolver.py @@ -0,0 +1,53 @@ +"""Boundary resolution helpers for validation parameters.""" + +from __future__ import annotations + +import logging +from dataclasses import dataclass +from typing import Any, Dict + +logger = logging.getLogger(__name__) + + +@dataclass(frozen=True, slots=True) +class BoundaryDependent: + """ + Wrapper for values that vary by boundary (e.g. Schleswig-Holstein vs Everything). + + At validation runtime, the appropriate value is selected based on the + current boundary setting. + """ + values: Dict[str, Any] + + def resolve(self, boundary: str) -> Any: + """Return the value for the given boundary, or the whole dict if not found.""" + if boundary in self.values: + logger.debug("Resolved boundary-dependent value: %s -> %s", boundary, self.values[boundary]) + return self.values[boundary] + return self.values + + +def resolve_boundary_dependence(boundary_dict: Dict[str, Any]) -> BoundaryDependent: + """ + Wrap a boundary-dependent dict for deferred resolution. + + At validation runtime, the appropriate value is selected based on the + current boundary setting. 
+ + Example: + expected_count=resolve_boundary_dependence({"Schleswig-Holstein": 27, "Everything": 431}) + """ + return BoundaryDependent(boundary_dict) + + +def resolve_value(value: Any, boundary: str) -> Any: + """ + Resolve boundary-dependent values. + + If value is a BoundaryDependent, resolve it using the current boundary. + Otherwise return value unchanged. + """ + if isinstance(value, BoundaryDependent): + return value.resolve(boundary) + + return value \ No newline at end of file diff --git a/src/egon/data/validation/specs.py b/src/egon/data/validation/specs.py new file mode 100644 index 000000000..765881c47 --- /dev/null +++ b/src/egon/data/validation/specs.py @@ -0,0 +1,194 @@ +"""Validation specifications and expansion logic.""" + +from __future__ import annotations + +import copy +import logging +from dataclasses import dataclass +from typing import Any, List, Mapping, Optional, Sequence, Union + +from egon_validation.rules.base import Rule +from egon_validation import ( + RowCountValidation, + DataTypeValidation, + NotNullAndNotNaNValidation, + ValueSetValidation, + SRIDUniqueNonZero, + WholeTableNotNullAndNotNaNValidation, +) + +from .resolver import resolve_value + +logger = logging.getLogger(__name__) + + +@dataclass(frozen=True, slots=True) +class TableValidation: + """ + A compact, table-first spec that expands into Rule objects at runtime. + + Properties: + - table_name + - row_count + - geometry_columns + - data_type_columns + - not_null_columns + - value_set_columns + + Behavior: + - Adds WholeTableNotNullAndNotNaNValidation automatically. + - Generates rule_id strings matching your prior manual convention. 
+ """ + + table_name: str + row_count: Optional[Any] = None + geometry_columns: Optional[Sequence[str]] = None + data_type_columns: Optional[Mapping[str, Any]] = None + not_null_columns: Optional[Sequence[str]] = None + value_set_columns: Optional[Mapping[str, Any]] = None + + def to_rules(self) -> List[Rule]: + rules: List[Rule] = [] + table_suffix = self.table_name.split(".")[-1] + + if self.row_count is not None: + rules.append( + RowCountValidation( + table=self.table_name, + rule_id=f"ROW_COUNT.{table_suffix}", + expected_count=self.row_count, + ) + ) + + if self.data_type_columns is not None: + rules.append( + DataTypeValidation( + table=self.table_name, + rule_id=f"DATA_TYPES.{table_suffix}", + column_types=dict(self.data_type_columns), + ) + ) + + if self.not_null_columns: + rules.append( + NotNullAndNotNaNValidation( + table=self.table_name, + rule_id=f"NOT_NAN.{table_suffix}", + columns=list(self.not_null_columns), + ) + ) + + if self.geometry_columns: + for geom_col in self.geometry_columns: + rules.append( + SRIDUniqueNonZero( + table=self.table_name, + rule_id=f"SRIDUniqueNonZero.{table_suffix}.{geom_col}", + column=geom_col, + ) + ) + + if self.value_set_columns: + for col_name, expected_values in self.value_set_columns.items(): + rules.append( + ValueSetValidation( + table=self.table_name, + rule_id=f"VALUE_SET_{str(col_name).upper()}.{table_suffix}", + column=str(col_name), + expected_values=expected_values, + ) + ) + + # Always add the whole-table rule automatically + rules.append( + WholeTableNotNullAndNotNaNValidation( + table=self.table_name, + rule_id=f"TABLE_NOT_NAN.{table_suffix}", + ) + ) + + return rules + + +ValidationSpec = Union[Rule, TableValidation] + + +def clone_rule(rule: Rule) -> Rule: + """ + Creates a per-run copy of a rule so we don't mutate DAG-parse-time objects. + + We avoid deepcopy as the first choice (deepcopy can break on complex objects). 
+ Strategy: + 1) Shallow copy the object + 2) Deep copy ONLY rule.params (the part we mutate) + 3) Fallback to deepcopy(rule) if shallow copy fails + """ + try: + cloned = copy.copy(rule) # shallow copy: new object, same inner references + except Exception: + # Last resort: full deepcopy + return copy.deepcopy(rule) + + # Make params safe to mutate + if hasattr(cloned, "params") and isinstance(getattr(cloned, "params"), dict): + cloned.params = copy.deepcopy(cloned.params) + + return cloned + + +def expand_specs(specs: Sequence[ValidationSpec]) -> List[Rule]: + """ + Turn a mixed list of Rule/TableValidation into a plain list of Rule objects. + TableValidation produces fresh rule instances. + Rule instances are cloned to avoid cross-run mutation. + """ + rules: List[Rule] = [] + + for spec in specs: + if isinstance(spec, TableValidation): + rules.extend(spec.to_rules()) + else: + rules.append(clone_rule(spec)) + + return rules + + +def resolve_rule_params(rule: Rule, boundary: str) -> None: + """ + Mutates rule.params on THIS rule instance only. + We ensure these rule instances are runtime clones/fresh instances. 
+ """ + params = getattr(rule, "params", None) + if not isinstance(params, dict): + return + + for name, val in list(params.items()): + resolved = resolve_value(val, boundary) + if resolved is not val: + logger.info("Rule %s: Resolved %s for boundary='%s'", getattr(rule, "rule_id", ""), name, boundary) + params[name] = resolved + + +def prepare_rules( + specs: Sequence[ValidationSpec], + boundary: str, + dataset_name: str, + task_name: str, +) -> List[Rule]: + """ + Build rules for this run: + - expand specs + - inject dataset/task if missing + - resolve boundary-dependent params + """ + rules = expand_specs(specs) + + for rule in rules: + if getattr(rule, "task", None) is None: + rule.task = task_name + if getattr(rule, "dataset", None) is None: + rule.dataset = dataset_name + + resolve_rule_params(rule, boundary) + + return rules \ No newline at end of file diff --git a/src/egon/data/validation_utils.py b/src/egon/data/validation_utils.py deleted file mode 100644 index fd1f9d882..000000000 --- a/src/egon/data/validation_utils.py +++ /dev/null @@ -1,311 +0,0 @@ -"""Airflow integration for egon-validation. - -This module supports two configuration styles: - -1) Backwards compatible "rule-first": - validation_dict = {"task": [Rule(...), Rule(...)]} - -2) New "table-first": - validation_dict = {"task": [TableValidation(...), TableValidation(...)]} - -Both styles can be mixed in the same list. 
-""" - -from __future__ import annotations - -import copy -import logging -from dataclasses import dataclass -from typing import Any, Dict, List, Mapping, Optional, Sequence, Union - -from airflow.operators.python import PythonOperator -from egon_validation import run_validations, RunContext -from egon_validation.rules.base import Rule -import logging - -from egon_validation import ( # noqa: F401 - DataTypeValidation, - NotNullAndNotNaNValidation, - RowCountValidation, - SRIDUniqueNonZero, - ValueSetValidation, - WholeTableNotNullAndNotNaNValidation, -) - -logger = logging.getLogger(__name__) - - -@dataclass(frozen=True, slots=True) -class TableValidation: - """ - Table-first validation specification. - - Properties you asked for: - - table_name - - row_count - - geometry_columns - - data_type_columns - - not_null_columns - - value_set_columns - - Behavior: - - Generates rule_ids exactly like your manual convention: - ROW_COUNT. - DATA_TYPES. - NOT_NAN. - TABLE_NOT_NAN. <-- always added automatically - SRIDUniqueNonZero.. - VALUE_SET_. - - Boundary-dependent dict values are preserved and resolved later in _resolve_rule_params(). 
- """ - - table_name: str - - row_count: Optional[Any] = None - geometry_columns: Optional[Sequence[str]] = None - data_type_columns: Optional[Mapping[str, Any]] = None - not_null_columns: Optional[Sequence[str]] = None - value_set_columns: Optional[Mapping[str, Any]] = None - - def to_rules(self) -> List[Rule]: - rules: List[Rule] = [] - table_suffix = self.table_name.split(".")[-1] - - # 1) Row count - if self.row_count is not None: - rules.append( - RowCountValidation( - table=self.table_name, - rule_id=f"ROW_COUNT.{table_suffix}", - expected_count=self.row_count, - ) - ) - - # 2) Data types - if self.data_type_columns is not None: - rules.append( - DataTypeValidation( - table=self.table_name, - rule_id=f"DATA_TYPES.{table_suffix}", - column_types=dict(self.data_type_columns), - ) - ) - - # 3) Column-level not-null / not-NaN - if self.not_null_columns: - rules.append( - NotNullAndNotNaNValidation( - table=self.table_name, - rule_id=f"NOT_NAN.{table_suffix}", - columns=list(self.not_null_columns), - ) - ) - - # 4) Geometry checks (one rule per geometry column) - if self.geometry_columns: - for geom_col in self.geometry_columns: - rules.append( - SRIDUniqueNonZero( - table=self.table_name, - rule_id=f"SRIDUniqueNonZero.{table_suffix}.{geom_col}", - column=geom_col, - ) - ) - - # 5) Value sets (one rule per column) - if self.value_set_columns: - for col_name, expected_values in self.value_set_columns.items(): - rules.append( - ValueSetValidation( - table=self.table_name, - rule_id=f"VALUE_SET_{str(col_name).upper()}.{table_suffix}", - column=str(col_name), - expected_values=expected_values, - ) - ) - - # 6) Whole-table not-null / not-NaN (automatic, as requested) - rules.append( - WholeTableNotNullAndNotNaNValidation( - table=self.table_name, - rule_id=f"TABLE_NOT_NAN.{table_suffix}", - ) - ) - - return rules - - -ValidationSpec = Union[Rule, TableValidation] - - -def _expand_specs(specs: Sequence[ValidationSpec]) -> List[Rule]: - """Turn a mixed list of 
Rule/TableValidation into a flat list of Rule.""" - expanded: List[Rule] = [] - for spec in specs: - if isinstance(spec, TableValidation): - expanded.extend(spec.to_rules()) - else: - expanded.append(spec) - return expanded - - -def _resolve_context_value(value: Any, boundary: str) -> Any: - """Resolve a value that may be boundary-dependent. - - Args: - value: The value to resolve. Can be: - - A dict with boundary keys: {"Schleswig-Holstein": 27, "Everything": 537} - - Any other value (returned as-is) - boundary: Current dataset boundary setting - - Returns: - Resolved value based on current boundary - - Examples: - >>> _resolve_context_value({"Schleswig-Holstein": 27, "Everything": 537}, - ... "Schleswig-Holstein") - 27 - - >>> _resolve_context_value(42, "Everything") - 42 - """ - # If not a dict, return as-is - if not isinstance(value, dict): - return value - - # Try to resolve by boundary - if boundary in value: - logger.debug(f"Resolved boundary-dependent value: {boundary} -> {value[boundary]}") - return value[boundary] - - # If dict doesn't match boundary pattern, return as-is - # This handles cases like column_types dicts which are not context-dependent - return value - - -def _resolve_rule_params(rule: Rule, boundary: str) -> None: - """Resolve boundary-dependent parameters in a rule. - - Modifies rule.params in-place, resolving any dict values that match - boundary patterns. 
- - Args: - rule: The validation rule to process - boundary: Current dataset boundary setting - """ - if not hasattr(rule, 'params') or not isinstance(rule.params, dict): - return - - # Resolve all parameter values - for param_name, param_value in rule.params.items(): - resolved_value = _resolve_context_value(param_value, boundary) - - # If the value was resolved (changed), update it - if resolved_value is not param_value: - logger.info( - f"Rule {rule.rule_id}: Resolved {param_name} for " - f"boundary='{boundary}'" - ) - rule.params[param_name] = resolved_value - -def create_validation_tasks( - validation_dict: Dict[str, Sequence[ValidationSpec]], - dataset_name: str, - on_failure: str = "continue" -) -> List[PythonOperator]: - """Convert validation dict to Airflow tasks. - - Values can be List[Rule], values can be List[TableValidation] or mixed. - """ - if not validation_dict: - return [] - - tasks: List[PythonOperator] = [] - - for task_name, specs in validation_dict.items(): - - def make_callable(specs: Sequence[ValidationSpec], task_name: str): - def run_validation(**context): - import os - import time - from datetime import datetime - from egon.data import db as egon_db - from egon.data.config import settings - - # Run id selection (unchanged logic) - run_id = ( - os.environ.get("AIRFLOW_CTX_DAG_RUN_ID") - or context.get("run_id") - or ( - context.get("ti") - and hasattr(context["ti"], "dag_run") - and context["ti"].dag_run.run_id - ) - or (context.get("dag_run") and context["dag_run"].run_id) - or f"airflow-{dataset_name}-{task_name}-{int(time.time())}" - ) - - # Use absolute path to ensure consistent location regardless of working directory - # Priority: EGON_VALIDATION_DIR env var > current working directory - out_dir = os.path.join( - os.environ.get('EGON_VALIDATION_DIR', os.getcwd()), - "validation_runs" - ) - - # Include execution timestamp in task name so retries write to separate directories - # The validation report will filter to keep only the most 
recent execution per task - execution_date = context.get('execution_date') or datetime.now() - timestamp = execution_date.strftime('%Y%m%dT%H%M%S') - full_task_name = f"{dataset_name}.{task_name}.{timestamp}" - - logger.info(f"Validation: {full_task_name} (run_id: {run_id})") - - # Use existing engine from egon.data.db - engine = egon_db.engine() - - # Get current configuration context - config = settings()["egon-data"] - boundary = config["--dataset-boundary"] - - logger.info(f"Resolving validation parameters for boundary='{boundary}'") - - rules: List[Rule] = copy.deepcopy(_expand_specs(specs)) - - # Set task and dataset on all rules (required by Rule base class) - # Also resolve boundary-dependent parameters - for rule in rules: - if not hasattr(rule, 'task') or rule.task is None: - rule.task = task_name - if not hasattr(rule, 'dataset') or rule.dataset is None: - rule.dataset = dataset_name - - # Automatically resolve boundary-dependent parameters - _resolve_rule_params(rule, boundary) - - ctx = RunContext(run_id=run_id, source="airflow", out_dir=out_dir) - results = run_validations(engine, ctx, rules, full_task_name) - - total = len(results) - failed = sum(1 for r in results if not r.success) - - logger.info(f"Complete: {total - failed}/{total} passed") - - if failed > 0 and on_failure == "fail": - raise Exception(f"{failed}/{total} validations failed") - - return {"total": total, "passed": total - failed, "failed": failed} - - return run_validation - - func = make_callable(specs, task_name) - func.__name__ = f"validate_{task_name}" - - operator = PythonOperator( - task_id=f"{dataset_name}.validate.{task_name}", - python_callable=func, - provide_context=True, - ) - - tasks.append(operator) - - return tasks From d38871f5d0687a35a2cf368a554d54588a346c06 Mon Sep 17 00:00:00 2001 From: sarah Date: Thu, 22 Jan 2026 10:28:38 +0100 Subject: [PATCH 47/54] use eGon-validation v1.2.1 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/pyproject.toml b/pyproject.toml index 2549710cd..6a98602b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,7 +47,7 @@ dependencies = [ "cdsapi", "click<8.1", "disaggregator @ git+https://github.com/openego/disaggregator.git@features/update-cache-directory#egg=disaggregator", - "egon-validation @ git+https://github.com/sagemaso/eGon-validation.git@dev", + "egon-validation @ git+https://github.com/sagemaso/eGon-validation.git@v1.2.1", "entsoe-py>=0.6.2", "fiona==1.9.6", "Flask-Session<0.6.0", From 3e72509e2c51eae4b4973a3fef9bdaf689118154 Mon Sep 17 00:00:00 2001 From: sarah Date: Thu, 22 Jan 2026 10:33:50 +0100 Subject: [PATCH 48/54] fix import error bug --- src/egon/data/datasets/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/egon/data/datasets/__init__.py b/src/egon/data/datasets/__init__.py index e0a14046e..1fae16a9a 100644 --- a/src/egon/data/datasets/__init__.py +++ b/src/egon/data/datasets/__init__.py @@ -13,7 +13,7 @@ from sqlalchemy import Column, ForeignKey, Integer, String, Table, orm, tuple_ from sqlalchemy.ext.declarative import declarative_base from typing import Dict, List -from egon.data.validation_utils import create_validation_tasks +from egon.data.validation import create_validation_tasks from egon.data import config, db, logger From 6ae1b6bd50a3e6bd498e922aa34e982fad368029 Mon Sep 17 00:00:00 2001 From: sarah Date: Thu, 22 Jan 2026 10:44:00 +0100 Subject: [PATCH 49/54] fix bug circular import --- src/egon/data/datasets/storages/__init__.py | 2 +- src/egon/data/validation/rules/custom/sanity/home_batteries.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/egon/data/datasets/storages/__init__.py b/src/egon/data/datasets/storages/__init__.py index 2b163ccb1..3d37304cc 100755 --- a/src/egon/data/datasets/storages/__init__.py +++ b/src/egon/data/datasets/storages/__init__.py @@ -24,7 +24,6 @@ from egon.data.datasets.storages.home_batteries import ( 
allocate_home_batteries_to_buildings, ) -from egon.data.validation.rules.custom.sanity import HomeBatteriesAggregation from egon.data.datasets.storages.pumped_hydro import ( apply_voltage_level_thresholds, get_location, @@ -42,6 +41,7 @@ ValueSetValidation, SRIDUniqueNonZero ) +from egon.data.validation.rules.custom.sanity import HomeBatteriesAggregation Base = declarative_base() diff --git a/src/egon/data/validation/rules/custom/sanity/home_batteries.py b/src/egon/data/validation/rules/custom/sanity/home_batteries.py index 9da1b4ff5..4ffe0b5a2 100644 --- a/src/egon/data/validation/rules/custom/sanity/home_batteries.py +++ b/src/egon/data/validation/rules/custom/sanity/home_batteries.py @@ -10,7 +10,7 @@ from egon_validation.rules.base import DataFrameRule, RuleResult, Severity from egon.data import config, db -from egon.data.datasets.storages.home_batteries import get_cbat_pbat_ratio +from egon.data.datasets.storages.utils import get_cbat_pbat_ratio class HomeBatteriesAggregation(DataFrameRule): From b6806f1400bb3795425e72d224518f629d13abbe Mon Sep 17 00:00:00 2001 From: sarah Date: Thu, 22 Jan 2026 11:00:50 +0100 Subject: [PATCH 50/54] fix bug circular import --- src/egon/data/datasets/storages/__init__.py | 2 +- .../rules/custom/sanity/home_batteries.py | 2 +- .../validation/rules/custom/sanity/utils.py | 26 +++++++++++++++++++ 3 files changed, 28 insertions(+), 2 deletions(-) create mode 100644 src/egon/data/validation/rules/custom/sanity/utils.py diff --git a/src/egon/data/datasets/storages/__init__.py b/src/egon/data/datasets/storages/__init__.py index 3d37304cc..e3b426779 100755 --- a/src/egon/data/datasets/storages/__init__.py +++ b/src/egon/data/datasets/storages/__init__.py @@ -41,7 +41,7 @@ ValueSetValidation, SRIDUniqueNonZero ) -from egon.data.validation.rules.custom.sanity import HomeBatteriesAggregation +from egon.data.validation.rules.custom.sanity.home_batteries import HomeBatteriesAggregation Base = declarative_base() diff --git 
a/src/egon/data/validation/rules/custom/sanity/home_batteries.py b/src/egon/data/validation/rules/custom/sanity/home_batteries.py index 4ffe0b5a2..c4e87790e 100644 --- a/src/egon/data/validation/rules/custom/sanity/home_batteries.py +++ b/src/egon/data/validation/rules/custom/sanity/home_batteries.py @@ -10,7 +10,7 @@ from egon_validation.rules.base import DataFrameRule, RuleResult, Severity from egon.data import config, db -from egon.data.datasets.storages.utils import get_cbat_pbat_ratio +from egon.data.validation.rules.custom.sanity.utils import get_cbat_pbat_ratio class HomeBatteriesAggregation(DataFrameRule): diff --git a/src/egon/data/validation/rules/custom/sanity/utils.py b/src/egon/data/validation/rules/custom/sanity/utils.py new file mode 100644 index 000000000..9b77dd619 --- /dev/null +++ b/src/egon/data/validation/rules/custom/sanity/utils.py @@ -0,0 +1,26 @@ +"""Utility functions for sanity check validation rules.""" + +from egon.data import config, db + + +def get_cbat_pbat_ratio(): + """ + Mean ratio between the storage capacity and the power of the pv rooftop + system + + Returns + ------- + int + Mean ratio between the storage capacity and the power of the pv + rooftop system + """ + sources = config.datasets()["home_batteries"]["sources"] + + sql = f""" + SELECT max_hours + FROM {sources["etrago_storage"]["schema"]} + .{sources["etrago_storage"]["table"]} + WHERE carrier = 'home_battery' + """ + + return int(db.select_dataframe(sql).iat[0, 0]) \ No newline at end of file From dac7788a0855b1850d06404b34fb5b4332a3a41c Mon Sep 17 00:00:00 2001 From: sarah Date: Thu, 22 Jan 2026 13:21:58 +0100 Subject: [PATCH 51/54] fix bug missing rule_id in ArrayCardinalityValidation initialization --- src/egon/data/datasets/DSM_cts_ind.py | 4 ++++ src/egon/data/datasets/__init__.py | 6 +++++- src/egon/data/datasets/demandregio/__init__.py | 1 + src/egon/data/datasets/heat_demand_timeseries/__init__.py | 2 ++ src/egon/data/datasets/heat_supply/individual_heating.py 
| 2 ++ src/egon/data/datasets/low_flex_scenario/__init__.py | 1 + 6 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/egon/data/datasets/DSM_cts_ind.py b/src/egon/data/datasets/DSM_cts_ind.py index 34e59821a..3548fb400 100644 --- a/src/egon/data/datasets/DSM_cts_ind.py +++ b/src/egon/data/datasets/DSM_cts_ind.py @@ -150,21 +150,25 @@ def __init__(self, dependencies): "data-quality":[ ArrayCardinalityValidation( table="demand.egon_demandregio_sites_ind_electricity_dsm_timeseries", + rule_id="ARRAY_VALIDATION.egon_demandregio_sites_ind_electricity_dsm_timeseries", array_column= "p_set", expected_length= 8760, ), ArrayCardinalityValidation( table="demand.egon_etrago_electricity_cts_dsm_timeseries", + rule_id="ARRAY_VALIDATION.egon_etrago_electricity_cts_dsm_timeseries", array_column="p_set", expected_length=8760, ), ArrayCardinalityValidation( table="demand.egon_osm_ind_load_curves_individual_dsm_timeseries", + rule_id="ARRAY_VALIDATION.egon_osm_ind_load_curves_individual_dsm_timeseries", array_column="p_set", expected_length=8760, ), ArrayCardinalityValidation( table="demand.egon_sites_ind_load_curves_individual_dsm_timeseries", + rule_id="ARRAY_VALIDATION.egon_sites_ind_load_curves_individual_dsm_timeseries", array_column="p_set", expected_length=8760, ), diff --git a/src/egon/data/datasets/__init__.py b/src/egon/data/datasets/__init__.py index 1fae16a9a..6f754a11d 100644 --- a/src/egon/data/datasets/__init__.py +++ b/src/egon/data/datasets/__init__.py @@ -284,7 +284,11 @@ def __post_init__(self): # Append validation tasks to existing tasks if validation_tasks: - task_list = list(self.tasks.graph if hasattr(self.tasks, 'graph') else self.tasks) + graph = self.tasks.graph if hasattr(self.tasks, 'graph') else self.tasks + if isinstance(graph, (tuple, set, list)): + task_list = list(graph) + else: + task_list = [graph] task_list.extend(validation_tasks) self.tasks = Tasks_(tuple(task_list)) diff --git a/src/egon/data/datasets/demandregio/__init__.py 
b/src/egon/data/datasets/demandregio/__init__.py index f65becbf7..dcabea048 100644 --- a/src/egon/data/datasets/demandregio/__init__.py +++ b/src/egon/data/datasets/demandregio/__init__.py @@ -139,6 +139,7 @@ def __init__(self, dependencies): ), ArrayCardinalityValidation( table="demand.egon_demandregio_sites_ind_electricity_dsm_timeseries", + rule_id="ARRAY_VALIDATION.egon_demandregio_sites_ind_electricity_dsm_timeseries", array_column="load_curve", expected_length=8760, ) diff --git a/src/egon/data/datasets/heat_demand_timeseries/__init__.py b/src/egon/data/datasets/heat_demand_timeseries/__init__.py index bca9b8e9f..d4712db34 100644 --- a/src/egon/data/datasets/heat_demand_timeseries/__init__.py +++ b/src/egon/data/datasets/heat_demand_timeseries/__init__.py @@ -1300,11 +1300,13 @@ def __init__(self, dependencies): ), ArrayCardinalityValidation( table="demand.egon_heat_timeseries_selected_profiles", + rule_id="ARRAY.egon_heat_timeseries_selected_profiles", array_column="selected_idp_profiles", expected_length=365, ), ArrayCardinalityValidation( table="demand.egon_timeseries_district_heating", + rule_id="ARRAY.egon_timeseries_district_heating", array_column="dist_aggregated_mw", expected_length=8760, ), diff --git a/src/egon/data/datasets/heat_supply/individual_heating.py b/src/egon/data/datasets/heat_supply/individual_heating.py index 738a3def1..ab13d715f 100644 --- a/src/egon/data/datasets/heat_supply/individual_heating.py +++ b/src/egon/data/datasets/heat_supply/individual_heating.py @@ -225,6 +225,7 @@ def dyn_parallel_tasks_pypsa_eur(): "data-quality": [ ArrayCardinalityValidation( table="demand.egon_etrago_timeseries_individual_heating", + rule_id="ARRAY_HEAT_PUMPS_PYPSA.egon_etrago_timeseries_individual_heating", array_column="dist_aggregated_mv", expected_length=8760, ), @@ -474,6 +475,7 @@ def dyn_parallel_tasks_2035(): "data-quality":[ ArrayCardinalityValidation( table="demand.egon_etrago_timeseries_individual_heating", + 
rule_id="ARRAY_HEAT_PUMPS.egon_etrago_timeseries_individual_heating", array_column="dist_aggregated_mv", expected_length=8760, ), diff --git a/src/egon/data/datasets/low_flex_scenario/__init__.py b/src/egon/data/datasets/low_flex_scenario/__init__.py index 7f13cabba..3b27d83b0 100644 --- a/src/egon/data/datasets/low_flex_scenario/__init__.py +++ b/src/egon/data/datasets/low_flex_scenario/__init__.py @@ -35,6 +35,7 @@ def __init__(self, dependencies): "data-quality":[ ArrayCardinalityValidation( table="grid.egon_etrago_bus_timeseries", + rule_id="ARRAY.egon_etrago_bus_timeseries", array_column="v_mag_pu_set", expected_length=8760, ), From 67077908a526d2223ef2be44f5800e2ea511d452 Mon Sep 17 00:00:00 2001 From: sarah Date: Thu, 22 Jan 2026 14:01:53 +0100 Subject: [PATCH 52/54] bug fix: remove spacing from dataset for task_id --- src/egon/data/validation/airflow.py | 32 ++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/src/egon/data/validation/airflow.py b/src/egon/data/validation/airflow.py index ba420c4aa..1188f01c8 100644 --- a/src/egon/data/validation/airflow.py +++ b/src/egon/data/validation/airflow.py @@ -4,7 +4,9 @@ import logging from functools import partial -from typing import Any, Dict, List, Sequence +import re +import hashlib +from typing import Any, Dict, List, Sequence, Set from airflow.operators.python import PythonOperator from egon_validation import RunContext, run_validations @@ -105,6 +107,10 @@ def create_validation_tasks( tasks: List[PythonOperator] = [] + used_task_ids: Set[str] = set() + + safe_dataset = sanitize_airflow_key(dataset_name) + for task_name, specs in validation_dict.items(): callable_for_airflow = partial( run_validation_task, @@ -116,10 +122,30 @@ def create_validation_tasks( tasks.append( PythonOperator( - task_id=f"{dataset_name}.validate.{task_name}", + task_id=f"{safe_dataset}.validate.{task_name}", python_callable=callable_for_airflow, provide_context=True, ) ) - return tasks \ No 
newline at end of file + return tasks + +def sanitize_airflow_key(value: str) -> str: + """ + Airflow task_id/key must match: [A-Za-z0-9_.-]+ + Replace everything else with underscores. + """ + # 1) strip outer whitespace + v = value.strip() + + # 2) replace any run of invalid characters (including spaces) with "_" + v = re.sub(r"[^A-Za-z0-9_.-]+", "_", v) + + # 3) collapse multiple underscores + v = re.sub(r"_+", "_", v) + + # 4) avoid leading/trailing separators that can look ugly / confusing + v = v.strip("._-") + + # 5) don't return empty + return v or "unnamed" From f663df91cc64fbc1d31faa71e88a40ea47ed9fcd Mon Sep 17 00:00:00 2001 From: sarah Date: Thu, 22 Jan 2026 14:06:34 +0100 Subject: [PATCH 53/54] bug fix: correct typo --- src/egon/data/datasets/low_flex_scenario/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/egon/data/datasets/low_flex_scenario/__init__.py b/src/egon/data/datasets/low_flex_scenario/__init__.py index 3b27d83b0..2b1d24dbe 100644 --- a/src/egon/data/datasets/low_flex_scenario/__init__.py +++ b/src/egon/data/datasets/low_flex_scenario/__init__.py @@ -41,5 +41,5 @@ def __init__(self, dependencies): ), ] }, - on_validaiton_failure="continue" + on_validation_failure="continue" ) From e56973cb01628c93e312f4e3b0bd7de7485aa982 Mon Sep 17 00:00:00 2001 From: sarah Date: Thu, 22 Jan 2026 16:04:51 +0100 Subject: [PATCH 54/54] correct linting errors --- src/egon/data/datasets/DSM_cts_ind.py | 8 +- src/egon/data/datasets/__init__.py | 23 +- src/egon/data/datasets/chp/__init__.py | 17 +- .../data/datasets/demandregio/__init__.py | 18 +- .../district_heating_areas/__init__.py | 14 +- .../datasets/electricity_demand/__init__.py | 12 +- .../hh_buildings.py | 37 +- .../hh_profiles.py | 38 +- .../motorized_individual_travel/__init__.py | 183 +++++++-- src/egon/data/datasets/era5.py | 8 +- src/egon/data/datasets/final_validations.py | 388 ++++++++++++++---- src/egon/data/datasets/validation_report.py | 50 ++- 
src/egon/data/datasets/vg250/__init__.py | 97 ++++- src/egon/data/datasets/zensus/__init__.py | 6 +- src/egon/data/validation/__init__.py | 4 +- src/egon/data/validation/airflow.py | 6 +- src/egon/data/validation/resolver.py | 23 +- .../custom/sanity/electricity_capacity.py | 1 - .../rules/custom/sanity/gas_stores.py | 1 - .../rules/custom/sanity/home_batteries.py | 83 ++-- .../validation/rules/custom/sanity/utils.py | 2 +- src/egon/data/validation/specs.py | 24 +- 22 files changed, 785 insertions(+), 258 deletions(-) diff --git a/src/egon/data/datasets/DSM_cts_ind.py b/src/egon/data/datasets/DSM_cts_ind.py index 3548fb400..9b2e86bdf 100644 --- a/src/egon/data/datasets/DSM_cts_ind.py +++ b/src/egon/data/datasets/DSM_cts_ind.py @@ -32,7 +32,7 @@ sources, ) -from egon_validation import( +from egon_validation import ( ArrayCardinalityValidation ) @@ -147,12 +147,12 @@ def __init__(self, dependencies): dependencies=dependencies, tasks=(dsm_cts_ind_processing,), validation={ - "data-quality":[ + "data-quality": [ ArrayCardinalityValidation( table="demand.egon_demandregio_sites_ind_electricity_dsm_timeseries", rule_id="ARRAY_VALIDATION.egon_demandregio_sites_ind_electricity_dsm_timeseries", - array_column= "p_set", - expected_length= 8760, + array_column="p_set", + expected_length=8760, ), ArrayCardinalityValidation( table="demand.egon_etrago_electricity_cts_dsm_timeseries", diff --git a/src/egon/data/datasets/__init__.py b/src/egon/data/datasets/__init__.py index 6f754a11d..d64573060 100644 --- a/src/egon/data/datasets/__init__.py +++ b/src/egon/data/datasets/__init__.py @@ -18,9 +18,9 @@ from egon.data import config, db, logger try: - from egon_validation.rules.base import Rule + from egon_validation.rules.base import Rule except ImportError: - Rule = None # Type hint only + Rule = None # Type hint only Base = declarative_base() @@ -284,7 +284,10 @@ def __post_init__(self): # Append validation tasks to existing tasks if validation_tasks: - graph = self.tasks.graph if 
hasattr(self.tasks, 'graph') else self.tasks + if hasattr(self.tasks, 'graph'): + graph = self.tasks.graph + else: + graph = self.tasks if isinstance(graph, (tuple, set, list)): task_list = list(graph) else: @@ -336,20 +339,24 @@ def __post_init__(self): # Get last non-validation tasks non_validation_task_ids = [ task.task_id for task in self.tasks.values() - if not any(task.task_id.endswith(f".validate.{name}") for name in self.validation.keys()) + if not any( + task.task_id.endswith(f".validate.{name}") + for name in self.validation.keys() + ) ] last_data_tasks = [ task for task in self.tasks.values() - if task.task_id in non_validation_task_ids and task in self.tasks.last + if task.task_id in non_validation_task_ids + and task in self.tasks.last ] if not last_data_tasks: # Fallback to last non-validation task last_data_tasks = [ - task for task in self.tasks.values() - if task.task_id in non_validation_task_ids - ][-1:] + task for task in self.tasks.values() + if task.task_id in non_validation_task_ids + ][-1:] # Link each validation task downstream of last data tasks for validation_task in validation_tasks: diff --git a/src/egon/data/datasets/chp/__init__.py b/src/egon/data/datasets/chp/__init__.py index e1362cd64..066a6d99a 100644 --- a/src/egon/data/datasets/chp/__init__.py +++ b/src/egon/data/datasets/chp/__init__.py @@ -47,7 +47,7 @@ sources, ) -from egon_validation import( +from egon_validation import ( RowCountValidation, DataTypeValidation, NotNullAndNotNaNValidation, @@ -863,11 +863,14 @@ def __init__(self, dependencies): dependencies=dependencies, tasks=tasks, validation={ - "data-quality":[ + "data-quality": [ RowCountValidation( table="supply.egon_chp_plants", rule_id="ROW_COUNT.egon_chp_plants", - expected_count={"Schleswig-Holstein": 1720, "Everything": 40197} + expected_count={ + "Schleswig-Holstein": 1720, + "Everything": 40197 + } ), DataTypeValidation( table="supply.egon_chp_plants", @@ -915,7 +918,13 @@ def __init__(self, dependencies): 
table="supply.egon_chp_plants", rule_id="VALUE_SET_VALIDATION_CARRIER.egon_chp_plants", column="carrier", - expected_values=["oil", "others", "gas", "gas extended", "biomass"] + expected_values=[ + "oil", + "others", + "gas", + "gas extended", + "biomass" + ] ), ValueSetValidation( table="supply.egon_chp_plants", diff --git a/src/egon/data/datasets/demandregio/__init__.py b/src/egon/data/datasets/demandregio/__init__.py index dcabea048..efffd571f 100644 --- a/src/egon/data/datasets/demandregio/__init__.py +++ b/src/egon/data/datasets/demandregio/__init__.py @@ -99,13 +99,20 @@ def __init__(self, dependencies): RowCountValidation( table=" demand.egon_demandregio_hh", rule_id="ROW_COUNT.egon_demandregio_hh", - expected_count={"Schleswig-Holstein": 180, "everything": 7218} + expected_count={ + "Schleswig-Holstein": 180, + "everything": 7218 + } ), DataTypeValidation( table="demand.egon_demandregio_hh", rule_id="DATA_MULTIPLE_TYPES.egon_demandregio_hh", - column_types={"nuts3": "character varying", "hh_size": "integer", "scenario": "character varying", - "year": "integer", "demand": "double precision"} + column_types={"nuts3": "character varying", + "hh_size": "integer", + "scenario": "character varying", + "year": "integer", + "demand": "double precision" + } ), WholeTableNotNullAndNotNaNValidation( table="demand.egon_demandregio_hh", @@ -125,7 +132,10 @@ def __init__(self, dependencies): DataTypeValidation( table="demand.egon_demandregio_wz", rule_id="DATA_MULTIPLE_TYPES.egon_demandregio_wz", - column_types={"wz": "integer", "sector": "character varying", "definition": "character varying"} + column_types={"wz": "integer", + "sector": "character varying", + "definition": "character varying" + } ), WholeTableNotNullAndNotNaNValidation( table="demand.egon_demandregio_wz", diff --git a/src/egon/data/datasets/district_heating_areas/__init__.py b/src/egon/data/datasets/district_heating_areas/__init__.py index 6b487d487..5f8ca856a 100644 --- 
a/src/egon/data/datasets/district_heating_areas/__init__.py +++ b/src/egon/data/datasets/district_heating_areas/__init__.py @@ -95,13 +95,21 @@ def __init__(self, dependencies): RowCountValidation( table=" demand.egon_district_heating_areas", rule_id="ROW_COUNT.egon_district_heating_areas", - expected_count={"Schleswig-Holstein": 100, "Everything": 6335} + expected_count={ + "Schleswig-Holstein": 100, + "Everything": 6335 + } ), DataTypeValidation( table="demand.egon_district_heating_areas", rule_id="DATA_MULTIPLE_TYPES.egon_district_heating_areas", - column_types={"id": "integer", "area_id": "integer", "scenario": "character varying", - "geom_polygon": "geometry", "residential_and_service_demand": "double precision"} + column_types={ + "id": "integer", + "area_id": "integer", + "scenario": "character varying", + "geom_polygon": "geometry", + "residential_and_service_demand": "double precision" + } ), WholeTableNotNullAndNotNaNValidation( table="demand.egon_district_heating_areas", diff --git a/src/egon/data/datasets/electricity_demand/__init__.py b/src/egon/data/datasets/electricity_demand/__init__.py index ef975aa54..f9a630f39 100644 --- a/src/egon/data/datasets/electricity_demand/__init__.py +++ b/src/egon/data/datasets/electricity_demand/__init__.py @@ -79,12 +79,20 @@ def __init__(self, dependencies): RowCountValidation( table=" demand.egon_demandregio_zensus_electricity", rule_id="ROW_COUNT.egon_demandregio_zensus_electricity", - expected_count={"Schleswig-Holstein": 154527, "Everything": 7355160} + expected_count={ + "Schleswig-Holstein": 154527, + "Everything": 7355160 + } ), DataTypeValidation( table="demand.egon_demandregio_zensus_electricity", rule_id="DATA_MULTIPLE_TYPES.egon_demandregio_zensus_electricity", - column_types={"zensus_population_id": "integer", "scenario": "character varying", "sector": "character varying", "demand": "double precision"} + column_types={ + "zensus_population_id": "integer", + "scenario": "character varying", + "sector": 
"character varying", + "demand": "double precision" + } ), WholeTableNotNullAndNotNaNValidation( table="demand.egon_demandregio_zensus_electricity", diff --git a/src/egon/data/datasets/electricity_demand_timeseries/hh_buildings.py b/src/egon/data/datasets/electricity_demand_timeseries/hh_buildings.py index 7406747b8..d8cc2621f 100755 --- a/src/egon/data/datasets/electricity_demand_timeseries/hh_buildings.py +++ b/src/egon/data/datasets/electricity_demand_timeseries/hh_buildings.py @@ -1243,12 +1243,21 @@ def __init__(self, dependencies): RowCountValidation( table=" demand.egon_building_electricity_peak_loads", rule_id="ROW_COUNT.egon_building_electricity_peak_loads", - expected_count={"Schleswig-Holstein": 3054820, "Everything": 44683620} + expected_count={ + "Schleswig-Holstein": 3054820, + "Everything": 44683620 + } ), DataTypeValidation( table="demand.egon_building_electricity_peak_loads", rule_id="DATA_MULTIPLE_TYPES.egon_building_electricity_peak_loads", - column_types={"building_id": "integer", "scenario": "character varying", "sector": "character varying", "peak_load_in_w": "real", "voltage_level": "integer"} + column_types={ + "building_id": "integer", + "scenario": "character varying", + "sector": "character varying", + "peak_load_in_w": "real", + "voltage_level": "integer" + } ), WholeTableNotNullAndNotNaNValidation( table="demand.egon_building_electricity_peak_loads", @@ -1269,12 +1278,20 @@ def __init__(self, dependencies): RowCountValidation( table=" demand.egon_building_heat_peak_loads", rule_id="ROW_COUNT.egon_building_heat_peak_loads", - expected_count={"Schleswig-Holstein": 732905, "Everything": 42128819} + expected_count={ + "Schleswig-Holstein": 732905, + "Everything": 42128819 + } ), DataTypeValidation( table="demand.egon_building_heat_peak_loads", rule_id="DATA_MULTIPLE_TYPES.egon_building_heat_peak_loads", - column_types={"building_id": "integer", "scenario": "character varying", "sector": "character varying", "peak_load_in_w": "real"} + 
column_types={ + "building_id": "integer", + "scenario": "character varying", + "sector": "character varying", + "peak_load_in_w": "real" + } ), WholeTableNotNullAndNotNaNValidation( table="demand.egon_building_heat_peak_loads", @@ -1295,13 +1312,19 @@ def __init__(self, dependencies): RowCountValidation( table=" demand.egon_household_electricity_profile_of_buildings", rule_id="ROW_COUNT.egon_household_electricity_profile_of_buildings", - expected_count={"Schleswig-Holstein": 1371592, "Everything": 38605221} + expected_count={ + "Schleswig-Holstein": 1371592, + "Everything": 38605221 + } ), DataTypeValidation( table="demand.egon_household_electricity_profile_of_buildings", rule_id="DATA_MULTIPLE_TYPES.egon_household_electricity_profile_of_buildings", - column_types={"id": "integer", "building_id": "integer", "cell_id": "integer", - "profile_id": "character varying"} + column_types={ + "id": "integer", + "building_id": "integer", + "cell_id": "integer", + "profile_id": "character varying"} ), WholeTableNotNullAndNotNaNValidation( table="demand.egon_household_electricity_profile_of_buildings", diff --git a/src/egon/data/datasets/electricity_demand_timeseries/hh_profiles.py b/src/egon/data/datasets/electricity_demand_timeseries/hh_profiles.py index d52f8acf5..7bb5ecb84 100644 --- a/src/egon/data/datasets/electricity_demand_timeseries/hh_profiles.py +++ b/src/egon/data/datasets/electricity_demand_timeseries/hh_profiles.py @@ -312,22 +312,34 @@ def __init__(self, dependencies): RowCountValidation( table=" demand.egon_household_electricity_profile_in_census_cell", rule_id="ROW_COUNT.egon_household_electricity_profile_in_census_cell", - expected_count={"Schleswig-Holstein": 143521, "Everything": 3177723} + expected_count={ + "Schleswig-Holstein": 143521, + "Everything": 3177723 + } ), DataTypeValidation( table="demand.egon_household_electricity_profile_in_census_cell", rule_id="DATA_MULTIPLE_TYPES.egon_household_electricity_profile_in_census_cell", column_types={ 
"Schleswig-Holstein":{ - "cell_id": "integer", "grid_id": "character varying", "cell_profile_ids": "character varying", - "nuts3": "character varying", "nuts1": "character varying", - "factor_2019": "double precision","factor_2023": "double precision", - "factor_2035": "double precision", "factor_2050": "double precision" + "cell_id": "integer", + "grid_id": "character varying", + "cell_profile_ids": "character varying", + "nuts3": "character varying", + "nuts1": "character varying", + "factor_2019": "double precision", + "factor_2023": "double precision", + "factor_2035": "double precision", + "factor_2050": "double precision" }, "Everything":{ - "cell_id": "integer", "grid_id": "character varying", "cell_profile_ids": "character varying", - "nuts3": "character varying", "nuts1": "character varying", - "factor_2035": "double precision", "factor_2050": "double precision" + "cell_id": "integer", + "grid_id": "character varying", + "cell_profile_ids": "character varying", + "nuts3": "character varying", + "nuts1": "character varying", + "factor_2035": "double precision", + "factor_2050": "double precision" } } ), @@ -338,14 +350,18 @@ def __init__(self, dependencies): RowCountValidation( table=" demand.demand.iee_household_load_profiles", rule_id="ROW_COUNT.iee_household_load_profiles", - expected_count={"Schleswig-Holstein": 2511, "Everything": 1000000} + expected_count={ + "Schleswig-Holstein": 2511, + "Everything": 1000000 + } ), DataTypeValidation( table="demand.iee_household_load_profiles", rule_id="DATA_MULTIPLE_TYPES.iee_household_load_profiles", column_types={ - "id": "integer", "type": "character", - "load_in_wh": "real[]" + "id": "integer", + "type": "character", + "load_in_wh": "real[]" } ), WholeTableNotNullAndNotNaNValidation( diff --git a/src/egon/data/datasets/emobility/motorized_individual_travel/__init__.py b/src/egon/data/datasets/emobility/motorized_individual_travel/__init__.py index 8d230af3f..d772617d4 100644 --- 
a/src/egon/data/datasets/emobility/motorized_individual_travel/__init__.py +++ b/src/egon/data/datasets/emobility/motorized_individual_travel/__init__.py @@ -502,15 +502,26 @@ def generate_model_data_tasks(scenario_name): RowCountValidation( table=" demand.egon_ev_count_municipality", rule_id="ROW_COUNT.egon_ev_count_municipality", - expected_count={"Schleswig-Holstein": 1108, "Everything": 44012} + expected_count={ + "Schleswig-Holstein": 1108, + "Everything": 44012 + } ), DataTypeValidation( table="demand.egon_ev_count_municipality", rule_id="DATA_MULTIPLE_TYPES.egon_ev_count_municipality", - column_types={"scenario": "character varying", "scenario_variation": "character varying", - "ags": "integer", "bev_mini": "integer", "bev_medium": "integer", - "bev_luxury": "integer", "phev_mini": "integer", "phev_medium": "integer", - "phev_luxury": "integer", "rs7_id": "smallint"} + column_types={ + "scenario": "character varying", + "scenario_variation": "character varying", + "ags": "integer", + "bev_mini": "integer", + "bev_medium": "integer", + "bev_luxury": "integer", + "phev_mini": "integer", + "phev_medium": "integer", + "phev_luxury": "integer", + "rs7_id": "smallint" + } ), WholeTableNotNullAndNotNaNValidation( table="demand.egon_ev_count_municipality", @@ -526,20 +537,36 @@ def generate_model_data_tasks(scenario_name): table="demand.egon_ev_count_municipality", rule_id="VALUE_SET_VALIDATION_SCENARIO_VARIATION.egon_ev_count_municipality", column="scenario_variation", - expected_values=["Mobility Transition 2050", "NEP C 2035", "Electrification 2050", "Reference 2050"] + expected_values=[ + "Mobility Transition 2050", + "NEP C 2035", + "Electrification 2050", + "Reference 2050" + ] ), RowCountValidation( table=" demand.egon_ev_count_mv_grid_district", rule_id="ROW_COUNT.egon_ev_count_mv_grid_district", - expected_count={"Schleswig-Holstein": 199, "Everything": 15348} + expected_count={ + "Schleswig-Holstein": 199, + "Everything": 15348 + } ), DataTypeValidation( 
table="demand.egon_ev_count_mv_grid_district", rule_id="DATA_MULTIPLE_TYPES.egon_ev_count_mv_grid_district", - column_types={"scenario": "character varying", "scenario_variation": "character varying", - "bus_id": "integer", "bev_mini": "integer", "bev_medium": "integer", - "bev_luxury": "integer", "phev_mini": "integer", "phev_medium": "integer", - "phev_luxury": "integer", "rs7_id": "smallint"} + column_types={ + "scenario": "character varying", + "scenario_variation": "character varying", + "bus_id": "integer", + "bev_mini": "integer", + "bev_medium": "integer", + "bev_luxury": "integer", + "phev_mini": "integer", + "phev_medium": "integer", + "phev_luxury": "integer", + "rs7_id": "smallint" + } ), WholeTableNotNullAndNotNaNValidation( table="demand.egon_ev_count_mv_grid_district", @@ -555,21 +582,36 @@ def generate_model_data_tasks(scenario_name): table="demand.egon_ev_count_mv_grid_district", rule_id="VALUE_SET_VALIDATION_SCENARIO_VARIATION.egon_ev_count_mv_grid_district", column="scenario_variation", - expected_values=["Mobility Transition 2050", "NEP C 2035", "Electrification 2050", - "Reference 2050"] + expected_values=[ + "Mobility Transition 2050", + "NEP C 2035", + "Electrification 2050", + "Reference 2050" + ] ), RowCountValidation( table=" demand.egon_ev_count_registration_district", rule_id="ROW_COUNT.egon_ev_count_registration_district", - expected_count={"Schleswig-Holstein": 400, "Everything": 1600} + expected_count={ + "Schleswig-Holstein": 400, + "Everything": 1600 + } ), DataTypeValidation( table="demand.egon_ev_count_registration_district", rule_id="DATA_MULTIPLE_TYPES.egon_ev_count_registration_district", - column_types={"scenario": "character varying", "scenario_variation": "character varying", - "ags_reg_district": "integer", "reg_district": "character varying", - "bev_mini": "integer", "bev_medium": "integer", "bev_luxury": "integer", - "phev_mini": "integer", "phev_medium": "integer", "phev_luxury": "integer"} + column_types={ + "scenario": 
"character varying", + "scenario_variation": "character varying", + "ags_reg_district": "integer", + "reg_district": "character varying", + "bev_mini": "integer", + "bev_medium": "integer", + "bev_luxury": "integer", + "phev_mini": "integer", + "phev_medium": "integer", + "phev_luxury": "integer" + } ), WholeTableNotNullAndNotNaNValidation( table="demand.egon_ev_count_registration_district", @@ -585,22 +627,37 @@ def generate_model_data_tasks(scenario_name): table="demand.egon_ev_count_registration_district", rule_id="VALUE_SET_VALIDATION_SCENARIO_VARIATION.egon_ev_count_registration_district", column="scenario_variation", - expected_values=["Mobility Transition 2050", "NEP C 2035", "Electrification 2050", - "Reference 2050"] + expected_values=[ + "Mobility Transition 2050", + "NEP C 2035", + "Electrification 2050", + "Reference 2050" + ] ), RowCountValidation( table=" demand.egon_ev_mv_grid_district", rule_id="ROW_COUNT.egon_ev_mv_grid_district", - expected_count={"Schleswig-Holstein": 534899, "Everything": 125609556} + expected_count={ + "Schleswig-Holstein": 534899, + "Everything": 125609556 + } ), DataTypeValidation( table="demand.egon_ev_mv_grid_district", rule_id="DATA_MULTIPLE_TYPES.egon_ev_mv_grid_district", - column_types={"scenario": "character varying", "scenario_variation": "character varying", - "bus_id": "integer", "reg_district": "character varying", - "bev_mini": "integer", "bev_medium": "integer", "bev_luxury": "integer", - "phev_mini": "integer", "phev_medium": "integer", "phev_luxury": "integer", - "rs7_id": "smallint"} + column_types={ + "scenario": "character varying", + "scenario_variation": "character varying", + "bus_id": "integer", + "reg_district": "character varying", + "bev_mini": "integer", + "bev_medium": "integer", + "bev_luxury": "integer", + "phev_mini": "integer", + "phev_medium": "integer", + "phev_luxury": "integer", + "rs7_id": "smallint" + } ), WholeTableNotNullAndNotNaNValidation( table="demand.egon_ev_mv_grid_district", @@ 
-616,19 +673,31 @@ def generate_model_data_tasks(scenario_name): table="demand.egon_ev_mv_grid_district", rule_id="VALUE_SET_VALIDATION_SCENARIO_VARIATION.egon_ev_mv_grid_district", column="scenario_variation", - expected_values=["Mobility Transition 2050", "NEP C 2035", "Electrification 2050", - "Reference 2050"] + expected_values=[ + "Mobility Transition 2050", + "NEP C 2035", + "Electrification 2050", + "Reference 2050" + ] ), RowCountValidation( table=" demand.egon_ev_pool", rule_id="ROW_COUNT.egon_ev_pool", - expected_count={"Schleswig-Holstein": 7000, "Everything": 65376} + expected_count={ + "Schleswig-Holstein": 7000, + "Everything": 65376 + } ), DataTypeValidation( table="demand.egon_ev_pool", rule_id="DATA_MULTIPLE_TYPES.egon_ev_pool", - column_types={"scenario": "character varying", "ev_id": "integer", "rs7_id": "smallint", - "type": "character varying", "simbev_ev_id": "integer"} + column_types={ + "scenario": "character varying", + "ev_id": "integer", + "rs7_id": "smallint", + "type": "character varying", + "simbev_ev_id": "integer" + } ), WholeTableNotNullAndNotNaNValidation( table="demand.egon_ev_pool", @@ -644,23 +713,44 @@ def generate_model_data_tasks(scenario_name): table="demand.egon_ev_pool", rule_id="VALUE_SET_VALIDATION_TYPE.egon_ev_pool", column="type", - expected_values=["bev_mini", "bev_medium", "bev_luxury", "phev_mini", "phev_medium", - "phev_luxury"] + expected_values=[ + "bev_mini", + "bev_medium", + "bev_luxury", + "phev_mini", + "phev_medium", + "phev_luxury" + ] ), RowCountValidation( table=" demand.egon_ev_trip", rule_id="ROW_COUNT.egon_ev_trip", - expected_count={"Schleswig-Holstein":11642066, "Everything": 108342188} + expected_count={ + "Schleswig-Holstein":11642066, + "Everything": 108342188 + } ), DataTypeValidation( table="demand.egon_ev_trip", rule_id="DATA_MULTIPLE_TYPES.egon_ev_trip", - column_types={"scenario": "character varying", "event_id": "integer", "egon_ev_pool_ev_id": "integer", - "simbev_event_id": "integer", 
"location": "character varying", "use_case": "character varying", - "charging_capacity_nominal": "real", "charging_capacity_grid": "real", - "charging_capacity_battery": "real", "soc_start": "real", "soc_end": "real", - "charging_demand": "real", "park_start": "integer", "park_end": "integer", - "drive_start": "integer", "drive_end": "integer", "consumption": "real"} + column_types={ + "scenario": "character varying", + "event_id": "integer", + "egon_ev_pool_ev_id": "integer", + "simbev_event_id": "integer", + "location": "character varying", + "use_case": "character varying", + "charging_capacity_nominal": "real", + "charging_capacity_grid": "real", + "charging_capacity_battery": "real", + "soc_start": "real", "soc_end": "real", + "charging_demand": "real", + "park_start": "integer", + "park_end": "integer", + "drive_start": "integer", + "drive_end": "integer", + "consumption": "real" + } ), WholeTableNotNullAndNotNaNValidation( table="demand.egon_ev_trip", @@ -676,8 +766,17 @@ def generate_model_data_tasks(scenario_name): table="demand.egon_ev_trip", rule_id="VALUE_SET_LOCATION.egon_ev_trip", column="type", - expected_values=["0_work", "1_business", "2_school", "3_shopping", "4_private/ridesharing", - "5_leisure", "6_home", "7_charging_hub", "driving"] + expected_values=[ + "0_work", + "1_business", + "2_school", + "3_shopping", + "4_private/ridesharing", + "5_leisure", + "6_home", + "7_charging_hub", + "driving" + ] ) ] }, diff --git a/src/egon/data/datasets/era5.py b/src/egon/data/datasets/era5.py index 6d40a278e..1f9e74da9 100644 --- a/src/egon/data/datasets/era5.py +++ b/src/egon/data/datasets/era5.py @@ -16,7 +16,7 @@ from egon.data.datasets.scenario_parameters import get_sector_parameters import egon.data.config -from egon_validation import( +from egon_validation import ( RowCountValidation, DataTypeValidation, NotNullAndNotNaNValidation, @@ -75,7 +75,11 @@ def __init__(self, dependencies): DataTypeValidation( table="supply.egon_era5_weather_cells", 
rule_id="DATA_TYPES.egon_era5_weather_cells", - column_types={"w_id": "integer", "geom": "geometry", "geom_point": "geometry"} + column_types={ + "w_id": "integer", + "geom": "geometry", + "geom_point": "geometry" + } ), NotNullAndNotNaNValidation( table="supply.egon_era5_weather_cells", diff --git a/src/egon/data/datasets/final_validations.py b/src/egon/data/datasets/final_validations.py index 01dadff92..36fefac83 100644 --- a/src/egon/data/datasets/final_validations.py +++ b/src/egon/data/datasets/final_validations.py @@ -267,7 +267,10 @@ def __init__(self, dependencies): scenario="eGon2035", component_type="load", component_carrier="H2_for_industry", - bus_conditions=[("H2_grid", "= 'DE'"), ("AC", "!= 'DE'")] + bus_conditions=[ + ("H2_grid", "= 'DE'"), + ("AC", "!= 'DE'") + ] ), # GENERATORS - eGon2035 @@ -307,7 +310,10 @@ def __init__(self, dependencies): scenario="eGon2035", component_type="store", component_carrier="H2_overground", - bus_conditions=[("H2_saltcavern", "= 'DE'"), ("H2_grid", "= 'DE'")] + bus_conditions=[ + ("H2_saltcavern", "= 'DE'"), + ("H2_grid", "= 'DE'") + ] ), ], @@ -469,7 +475,11 @@ def __init__(self, dependencies): scenario="eGon2035", carrier="biomass", component_type="generator", - output_carriers=["biomass", "industrial_biomass_CHP", "central_biomass_CHP"], + output_carriers=[ + "biomass", + "industrial_biomass_CHP", + "central_biomass_CHP" + ], rtol=0.10 ), # Run of river @@ -1026,17 +1036,33 @@ def __init__(self, dependencies): table="grid.egon_etrago_bus", rule_id="DATA_TYPES.egon_etrago_bus", column_types={ - "scen_name": "character varying", "bus_id": "bigint", "v_nom": "double precision", - "type": "text", "carrier": "text", "v_mag_pu_set": "double precision", - "v_mag_pu_min": "double precision", "v_mag_pu_max": "double precision", - "x": "double precision", "y": "double precision", "geometry": "geometry", "country": "text" + "scen_name": "character varying", + "bus_id": "bigint", + "v_nom": "double precision", + "type": 
"text", + "carrier": "text", + "v_mag_pu_set": "double precision", + "v_mag_pu_min": "double precision", + "v_mag_pu_max": "double precision", + "x": "double precision", + "y": "double precision", + "geometry": "geometry", + "country": "text" }, ), NotNullAndNotNaNValidation( table="grid.egon_etrago_bus", rule_id="NOT_NAN.egon_etrago_bus", columns=[ - "scn_name", "bus_id", "v_nom", "carrier", "v_mag_pu_min", "v_mag_pu_max", "x", "y", "geom" + "scn_name", + "bus_id", + "v_nom", + "carrier", + "v_mag_pu_min", + "v_mag_pu_max", + "x", + "y", + "geom" ] ), WholeTableNotNullAndNotNaNValidation( @@ -1047,17 +1073,34 @@ def __init__(self, dependencies): table="grid.egon_etrago_bus", rule_id="VALUE_SET_SCENARIO.egon_etrago_bus", column="scn_name", - expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] + expected_values=[ + "eGon2035", + "eGon2035_lowflex", + "eGon100RE" + ] ), ValueSetValidation( table="grid.egon_etrago_bus", rule_id="VALUE_SET_CARRIER.egon_etrago_bus", column="carrier", expected_values=[ - "rural_heat", "urban_central_water_tanks", "low_voltage", "CH4", "H2_saltcavern", - "services_rural_heat", "services_rural_water_tanks", "central_heat_store", "AC", "Li_ion", - "H2_grid", "dsm", "urban_central_heat", "residential_rural_heat", "central_heat", - "rural_heat_store", "residential_rural_water_tanks" + "rural_heat", + "urban_central_water_tanks", + "low_voltage", + "CH4", + "H2_saltcavern", + "services_rural_heat", + "services_rural_water_tanks", + "central_heat_store", + "AC", + "Li_ion", + "H2_grid", + "dsm", + "urban_central_heat", + "residential_rural_heat", + "central_heat", + "rural_heat_store", + "residential_rural_water_tanks" ] ), SRIDUniqueNonZero( @@ -1068,22 +1111,45 @@ def __init__(self, dependencies): RowCountValidation( table="grid.egon_etrago_generator", rule_id="ROW_COUNT.egon_etrago_generator", - expected_count={"Schleswig-Holstein": 2863, "Everything": 40577} + expected_count={ + "Schleswig-Holstein": 2863, + "Everything": 40577 + 
} ), DataTypeValidation( table="grid.egon_etrago_generator", rule_id="DATA_TYPES.egon_etrago_generator", column_types={ - "scen_name": "character varying", "generator_id": "bigint", "control": "text", - "type": "text", "carrier": "text", "p_nom": "double precision", "p_nom_extendable": "boolean", - "p_nom_min": "double precision", "p_nom_max": "double precision", "p_min_pu": "double precision", - "p_max_pu": "double precision", "p_set": "double precision", "q_set": "double precision", - "sign": "double precision", "marginal_cost": "double precision", "build_year": "bigint", - "lifetime": "double precision", "capital_cost": "double precision", "efficiency": "double precision", - "commitable": "boolean", "start_up_cost": "double precision", "shut_down_cost": "double precision", - "min_up_time": "bigint", "min_down_time": "bigint", "up_time_before": "bigint", "down_time_before": "bigint", - "ramp_limit_up": "double precision", "ramp_limit_down": "double precision", - "ramp_limit_start_up": "double precision", "ramp_limit_shut_down": "double precision", + "scen_name": "character varying", + "generator_id": "bigint", + "control": "text", + "type": "text", + "carrier": "text", + "p_nom": "double precision", + "p_nom_extendable": "boolean", + "p_nom_min": "double precision", + "p_nom_max": "double precision", + "p_min_pu": "double precision", + "p_max_pu": "double precision", + "p_set": "double precision", + "q_set": "double precision", + "sign": "double precision", + "marginal_cost": "double precision", + "build_year": "bigint", + "lifetime": "double precision", + "capital_cost": "double precision", + "efficiency": "double precision", + "commitable": "boolean", + "start_up_cost": "double precision", + "shut_down_cost": "double precision", + "min_up_time": "bigint", + "min_down_time": "bigint", + "up_time_before": "bigint", + "down_time_before": "bigint", + "ramp_limit_up": "double precision", + "ramp_limit_down": "double precision", + "ramp_limit_start_up": "double 
precision", + "ramp_limit_shut_down": "double precision", "e_nom_max": "double precision" }, ), @@ -1091,11 +1157,34 @@ def __init__(self, dependencies): table="grid.egon_etrago_generator", rule_id="NOT_NAN.egon_etrago_generator", columns=[ - "scn_name", "generator_id", "bus", "control", "type", "carrier", "p_nom", "p_nom_extendable", - "p_nom_min", "p_nom_max", "p_min_pu", "p_max_pu", "sign", "marginal_cost", "build_year", - "lifetime", "capital_cost", "efficiency", "committable", "start_up_cost", "shut_down_cost", - "min_up_time", "min_down_time", "up_time_before", "down_time_before", "ramp_limit_start_up", - "ramp_limit_shut_down", "e_nom_max" + "scn_name", + "generator_id", + "bus", + "control", + "type", + "carrier", + "p_nom", + "p_nom_extendable", + "p_nom_min", + "p_nom_max", + "p_min_pu", + "p_max_pu", + "sign", + "marginal_cost", + "build_year", + "lifetime", + "capital_cost", + "efficiency", + "committable", + "start_up_cost", + "shut_down_cost", + "min_up_time", + "min_down_time", + "up_time_before", + "down_time_before", + "ramp_limit_start_up", + "ramp_limit_shut_down", + "e_nom_max" ] ), WholeTableNotNullAndNotNaNValidation( @@ -1106,39 +1195,70 @@ def __init__(self, dependencies): table="grid.egon_etrago_generator", rule_id="VALUE_SET_SCENARIO.egon_etrago_generator", column="scn_name", - expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] + expected_values=[ + "eGon2035", + "eGon2035_lowflex", + "eGon100RE" + ] ), ValueSetValidation( table="grid.egon_etrago_generator", rule_id="VALUE_SET_CARRIER.egon_etrago_generator", column="carrier", expected_values=[ - "CH4", "others", "central_biomass_CHP", "wind_onshore", "lignite", "geo_thermal", "solar", - "reservoir", "services_rural_solar_thermal_collector", "residential_rural_solar_thermal_collector", - "industrial_biomass_CHP", "biomass", "urban_central_solar_thermal_collector", "run_of_river", - "oil", "central_biomass_CHP_heat", "nuclear", "coal", "solar_thermal_collector", "solar_rooftop", + 
"CH4", + "others", + "central_biomass_CHP", + "wind_onshore", + "lignite", + "geo_thermal", + "solar", + "reservoir", + "services_rural_solar_thermal_collector", + "residential_rural_solar_thermal_collector", + "industrial_biomass_CHP", + "biomass", + "urban_central_solar_thermal_collector", + "run_of_river", + "oil", + "central_biomass_CHP_heat", + "nuclear", + "coal", + "solar_thermal_collector", + "solar_rooftop", "wind_offshore" ] ), RowCountValidation( table="grid.egon_etrago_generator_timeseries", rule_id="ROW_COUNT.egon_etrago_generator_timeseries", - expected_count={"Schleswig-Holstein": 1929, "Everything": 28651} + expected_count={ + "Schleswig-Holstein": 1929, + "Everything": 28651 + } ), DataTypeValidation( table="grid.egon_etrago_generator_timeseries", rule_id="DATA_TYPES.egon_etrago_generator_timeseries", column_types={ - "scn_name": "character varying", "generator_id": "integer", "temp_id": "integer", - "p_set": "double precision[]", "q_set": "double precision[]", "p_min_pu": "double precision[]", - "p_max_pu": "double precision[]", "marginal_cost": "double precision[]" + "scn_name": "character varying", + "generator_id": "integer", + "temp_id": "integer", + "p_set": "double precision[]", + "q_set": "double precision[]", + "p_min_pu": "double precision[]", + "p_max_pu": "double precision[]", + "marginal_cost": "double precision[]" }, ), NotNullAndNotNaNValidation( table="grid.egon_etrago_generator_timeseries", rule_id="NOT_NAN.egon_etrago_generator_timeseries", columns=[ - "scn_name", "generator_id", "temp_id", "p_max_pu" + "scn_name", + "generator_id", + "temp_id", + "p_max_pu" ] ), WholeTableNotNullAndNotNaNValidation( @@ -1149,35 +1269,83 @@ def __init__(self, dependencies): table="grid.egon_etrago_generator_timeseries", rule_id="VALUE_SET_SCENARIO.egon_etrago_generator_timeseries", column="scn_name", - expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] + expected_values=[ + "eGon2035", + "eGon2035_lowflex", + "eGon100RE" + ] ), 
RowCountValidation( table="grid.egon_etrago_line", rule_id="ROW_COUNT.egon_etrago_line", - expected_count={"Schleswig-Holstein": 1197, "Everything": 69901} + expected_count={ + "Schleswig-Holstein": 1197, + "Everything": 69901 + } ), DataTypeValidation( table="grid.egon_etrago_line", rule_id="DATA_TYPES.egon_etrago_line", column_types={ - "scn_name": "character varying", "line_id": "bigint", "bus0": "bigint", "bus1": "bigint", - "type": "text", "carrier": "text", "x": "numeric", "r": "numeric", "g": "numeric", "b": "numeric", - "s_nom": "numeric", "s_nom_extendable": "boolean", "s_nom_min": "double precision", - "s_nom_max": "double precision", "s_max_pu": "double precision", "build_year": "bigint", - "lifetime": "double precision", "capital_cost": "double precision", "length": "double precision", - "cables": "integer", "terrain_factor": "double precision", "num_parallel": "double precision", - "v_ang_min": "double precision", "v_ang_max": "double precision", "v_nom": "double precision", - "geom": "geometry", "topo": "geometry" + "scn_name": "character varying", + "line_id": "bigint", + "bus0": "bigint", + "bus1": "bigint", + "type": "text", + "carrier": "text", + "x": "numeric", + "r": "numeric", + "g": "numeric", + "b": "numeric", + "s_nom": "numeric", + "s_nom_extendable": "boolean", + "s_nom_min": "double precision", + "s_nom_max": "double precision", + "s_max_pu": "double precision", + "build_year": "bigint", + "lifetime": "double precision", + "capital_cost": "double precision", + "length": "double precision", + "cables": "integer", + "terrain_factor": "double precision", + "num_parallel": "double precision", + "v_ang_min": "double precision", + "v_ang_max": "double precision", + "v_nom": "double precision", + "geom": "geometry", + "topo": "geometry" }, ), NotNullAndNotNaNValidation( table="grid.egon_etrago_line", rule_id="NOT_NAN.egon_etrago_line", columns=[ - "scn_name", "line_id", "bus0", "bus1", "carrier", "x", "r", "g", "b", "s_nom", - 
"s_nom_extendable", "s_nom_min", "s_nom_max", "s_max_pu", "build_year", "lifetime", - "capital_cost", "length", "cables", "terrain_factor", "num_parallel", "v_ang_min", - "v_ang_max", "v_nom", "geom", "topo", + "scn_name", + "line_id", + "bus0", + "bus1", + "carrier", + "x", + "r", + "g", + "b", + "s_nom", + "s_nom_extendable", + "s_nom_min", + "s_nom_max", + "s_max_pu", + "build_year", + "lifetime", + "capital_cost", + "length", + "cables", + "terrain_factor", + "num_parallel", + "v_ang_min", + "v_ang_max", + "v_nom", + "geom", + "topo" ] ), WholeTableNotNullAndNotNaNValidation( @@ -1188,7 +1356,11 @@ def __init__(self, dependencies): table="grid.egon_etrago_line", rule_id="VALUE_SET_SCENARIO.egon_etrago_line", column="scn_name", - expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] + expected_values=[ + "eGon2035", + "eGon2035_lowflex", + "eGon100RE" + ] ), ValueSetValidation( table="grid.egon_etrago_line", @@ -1206,38 +1378,78 @@ def __init__(self, dependencies): rule_id="SRIDUniqueNonZero.egon_etrago_line.topo", column="topo" ), - #Row Count does't equal egon_etrago_line, because buses are located outside Germany + #Row Count does not equal egon_etrago_line, because buses are located outside Germany RowCountValidation( table="grid.egon_etrago_line_timeseries", rule_id="ROW_COUNT.egon_etrago_line_timeseries", - expected_count={"Schleswig-Holstein": 1197, "Everything": 69714} + expected_count={ + "Schleswig-Holstein": 1197, + "Everything": 69714 + } ), DataTypeValidation( table="grid.egon_etrago_line_timeseries", rule_id="DATA_TYPES.egon_etrago_line_timeseries", column_types={ - "scn_name": "character varying", "line_id": "bigint", "bus0": "bigint", "bus1": "bigint", - "type": "text", "carrier": "text", "x": "numeric", "r": "numeric", "g": "numeric", + "scn_name": "character varying", + "line_id": "bigint", + "bus0": "bigint", + "bus1": "bigint", + "type": "text", + "carrier": "text", + "x": "numeric", + "r": "numeric", + "g": "numeric", "b": "numeric", 
- "s_nom": "numeric", "s_nom_extendable": "boolean", "s_nom_min": "double precision", - "s_nom_max": "double precision", "s_max_pu": "double precision", "build_year": "bigint", - "lifetime": "double precision", "capital_cost": "double precision", + "s_nom": "numeric", + "s_nom_extendable": "boolean", + "s_nom_min": "double precision", + "s_nom_max": "double precision", + "s_max_pu": "double precision", + "build_year": "bigint", + "lifetime": "double precision", + "capital_cost": "double precision", "length": "double precision", - "cables": "integer", "terrain_factor": "double precision", + "cables": "integer", + "terrain_factor": "double precision", "num_parallel": "double precision", - "v_ang_min": "double precision", "v_ang_max": "double precision", + "v_ang_min": "double precision", + "v_ang_max": "double precision", "v_nom": "double precision", - "geom": "geometry", "topo": "geometry" + "geom": "geometry", + "topo": "geometry" }, ), NotNullAndNotNaNValidation( table="grid.egon_etrago_line_timeseries", rule_id="NOT_NAN.egon_etrago_line_timeseries", columns=[ - "scn_name", "line_id", "bus0", "bus1", "carrier", "x", "r", "g", "b", "s_nom", - "s_nom_extendable", "s_nom_min", "s_nom_max", "s_max_pu", "build_year", "lifetime", - "capital_cost", "length", "cables", "terrain_factor", "num_parallel", "v_ang_min", - "v_ang_max", "v_nom", "geom", "topo", + "scn_name", + "line_id", + "bus0", + "bus1", + "carrier", + "x", + "r", + "g", + "b", + "s_nom", + "s_nom_extendable", + "s_nom_min", + "s_nom_max", + "s_max_pu", + "build_year", + "lifetime", + "capital_cost", + "length", + "cables", + "terrain_factor", + "num_parallel", + "v_ang_min", + "v_ang_max", + "v_nom", + "geom", + "topo", ] ), WholeTableNotNullAndNotNaNValidation( @@ -1248,7 +1460,11 @@ def __init__(self, dependencies): table="grid.egon_etrago_line_timeseries", rule_id="VALUE_SET_SCENARIO.egon_etrago_line_timeseries", column="scn_name", - expected_values=["eGon2035", "eGon2035_lowflex", "eGon100RE"] + 
expected_values=[ + "eGon2035", + "eGon2035_lowflex", + "eGon100RE" + ] ), ValueSetValidation( table="grid.egon_etrago_line_timeseries", @@ -1269,19 +1485,37 @@ def __init__(self, dependencies): RowCountValidation( table="grid.egon_etrago_link", rule_id="ROW_COUNT.egon_etrago_link", - expected_count={"Schleswig-Holstein": 15496, "Everything": 83980} + expected_count={ + "Schleswig-Holstein": 15496, + "Everything": 83980 + } ), DataTypeValidation( table="grid.egon_etrago_link", rule_id="DATA_TYPES.egon_etrago_link", column_types={ - "scn_name": "character varying", "link_id": "bigint", "bus0": "bigint", "bus1": "bigint", - "type": "text", "carrier": "text", "efficiency": "double precision", "build_year": "bigint", - "lifetime": "double precision", "p_nom": "numeric", "p_nom_extendable": "boolean", - "p_nom_min": "double precision", "p_nom_max": "double precision", "p_min_pu": "double precision", - "p_max_pu": "double precision", "p_set": "double precision", "capital_cost": "double precision", - "marginal_cost": "double precision", "length": "double precision", - "terrain_factor": "double precision", "geom": "geometry", "topo": "geometry", + "scn_name": "character varying", + "link_id": "bigint", + "bus0": "bigint", + "bus1": "bigint", + "type": "text", + "carrier": "text", + "efficiency": "double precision", + "build_year": "bigint", + "lifetime": "double precision", + "p_nom": "numeric", + "p_nom_extendable": "boolean", + "p_nom_min": "double precision", + "p_nom_max": "double precision", + "p_min_pu": "double precision", + "p_max_pu": "double precision", + "p_set": "double precision", + "capital_cost": "double precision", + "marginal_cost": "double precision", + "length": "double precision", + "terrain_factor": "double precision", + "geom": "geometry", + "topo": "geometry", }, ), NotNullAndNotNaNValidation( diff --git a/src/egon/data/datasets/validation_report.py b/src/egon/data/datasets/validation_report.py index 5a70814ec..5efe19b54 100644 --- 
a/src/egon/data/datasets/validation_report.py +++ b/src/egon/data/datasets/validation_report.py @@ -1,9 +1,9 @@ """ Dataset for generating validation reports during pipeline execution. -This module provides the ValidationReport dataset which generates comprehensive -validation reports by aggregating all validation results from individual dataset -validation tasks executed during the pipeline run. +This module provides the ValidationReport dataset which generates +comprehensive validation reports by aggregating all validation results +from individual dataset validation tasks executed during the pipeline run. """ import os @@ -12,12 +12,15 @@ from egon.data import logger, db as egon_db from egon.data.datasets import Dataset from egon_validation import RunContext -from egon_validation.runner.aggregate import collect, build_coverage, write_outputs +from egon_validation.runner.aggregate import ( + collect, build_coverage, write_outputs +) from egon_validation.report.generate import generate from egon_validation.runner.coverage_analysis import discover_total_tables from egon_validation.config import ENV_DB_URL import os as _os + def generate_validation_report(**kwargs): """ Generate validation report aggregating all validation results. 
@@ -31,11 +34,13 @@ def generate_validation_report(**kwargs): """ # Use same run_id as other validation tasks in the pipeline # This ensures all tasks read/write to the same directory + dag_run = kwargs.get('dag_run') + ti = kwargs.get('ti') run_id = ( os.environ.get('AIRFLOW_CTX_DAG_RUN_ID') or kwargs.get('run_id') or - (kwargs.get('ti') and hasattr(kwargs['ti'], 'dag_run') and kwargs['ti'].dag_run.run_id) or - (kwargs.get('dag_run') and kwargs['dag_run'].run_id) or + (ti and hasattr(ti, 'dag_run') and ti.dag_run.run_id) or + (dag_run and dag_run.run_id) or f"pipeline_validation_report_{int(time.time())}" ) @@ -58,11 +63,13 @@ def generate_validation_report(**kwargs): try: # Get the database URL from egon.data db_url = str(egon_db.engine().url) - # Temporarily set the environment variable so discover_total_tables can use it + # Set env var so discover_total_tables can use it _os.environ[ENV_DB_URL] = db_url logger.info("Database connection available for table counting") except Exception as e: - logger.warning(f"Could not set database URL for table counting: {e}") + logger.warning( + f"Could not set database URL for table counting: {e}" + ) # Collect all validation results from existing validation runs collected = collect(ctx) @@ -71,18 +78,20 @@ def generate_validation_report(**kwargs): generate(ctx) report_path = os.path.join(final_out_dir, 'report.html') - logger.info("Pipeline validation report generated successfully", extra={ - "report_path": report_path, - "run_id": run_id, - "total_results": len(collected.get("items", [])) - }) - + logger.info( + "Pipeline validation report generated successfully", + extra={ + "report_path": report_path, + "run_id": run_id, + "total_results": len(collected.get("items", [])) + } + ) except FileNotFoundError as e: logger.warning( f"No validation results found for pipeline validation report | " f"run_id={run_id} | out_dir={out_dir} | error={e} | " - f"suggestion=This may be expected if no validation tasks were run during the 
pipeline" + f"suggestion=This may be expected if no validation tasks ran" ) # Don't raise - this is acceptable if no validations were run @@ -103,10 +112,11 @@ class ValidationReport(Dataset): """ Dataset for generating validation reports. - This dataset generates a comprehensive HTML validation report by aggregating - all validation results from individual dataset validation tasks that were - executed during the pipeline run. It should be placed before sanity_checks - in the DAG to ensure validation results are collected before final checks. + This dataset generates a comprehensive HTML validation report by + aggregating all validation results from individual dataset validation + tasks that were executed during the pipeline run. It should be placed + before sanity_checks in the DAG to ensure validation results are + collected before final checks. """ #: name: str = "ValidationReport" @@ -119,4 +129,4 @@ def __init__(self, dependencies): version=self.version, dependencies=dependencies, tasks=tasks, - ) + ) \ No newline at end of file diff --git a/src/egon/data/datasets/vg250/__init__.py b/src/egon/data/datasets/vg250/__init__.py index f9a8118ab..a58ce5b1e 100644 --- a/src/egon/data/datasets/vg250/__init__.py +++ b/src/egon/data/datasets/vg250/__init__.py @@ -568,24 +568,73 @@ def __init__(self, dependencies): DataTypeValidation( table="boundaries.vg250_krs", rule_id="DATA_TYPES.vg250_krs", - column_types={"Schleswig-Holstein":{"id":"bigint","ade":"integer", "gf":"integer", "bsg":"integer","ars":"text", - "ags":"text", "sdv_ars":"text", "gen":"text", "bez":"text","ibz":"integer", - "bem":"text", "nbd":"text", "sn_l":"text", "sn_r":"text", "sn_k":"text", - "sn_v1":"text", "sn_v2":"text", "sn_g":"text", "fk_s3":"text", "nuts":"text", - "ars_0":"text", "ags_0":"text", "wsk":"timestamp without time zone", "debkg_id":"text", "rs":"text", - "sdv_rs":"text", "rs_0":"text", "geometry":"geometry"}, - "Everything":{"id":"bigint","ade":"bigint", "gf":"bigint", 
"bsg":"bigint","ars":"text", - "ags":"text", "sdv_ars":"text", "gen":"text", "bez":"text","ibz":"bigint", - "bem":"text", "nbd":"text", "sn_l":"text", "sn_r":"text", "sn_k":"text", - "sn_v1":"text", "sn_v2":"text", "sn_g":"text", "fk_s3":"text", "nuts":"text", - "ars_0":"text", "ags_0":"text", "wsk":"text", "debkg_id":"text", "rs":"text", - "sdv_rs":"text", "rs_0":"text", "geometry":"geometry"} - } + column_types={ + "Schleswig-Holstein": { + "id": "bigint", + "ade": "integer", + "gf": "integer", + "bsg": "integer", + "ars": "text", + "ags": "text", + "sdv_ars": "text", + "gen": "text", + "bez": "text", + "ibz": "integer", + "bem": "text", + "nbd": "text", + "sn_l": "text", + "sn_r": "text", + "sn_k": "text", + "sn_v1": "text", + "sn_v2": "text", + "sn_g": "text", + "fk_s3": "text", + "nuts": "text", + "ars_0": "text", + "ags_0": "text", + "wsk": "timestamp without time zone", + "debkg_id": "text", + "rs": "text", + "sdv_rs": "text", + "rs_0": "text", + "geometry": "geometry" + }, + "Everything": { + "id": "bigint", + "ade": "bigint", + "gf": "bigint", + "bsg": "bigint", + "ars": "text", + "ags": "text", + "sdv_ars": "text", + "gen": "text", + "bez": "text", + "ibz": "bigint", + "bem": "text", + "nbd": "text", + "sn_l": "text", + "sn_r": "text", + "sn_k": "text", + "sn_v1": "text", + "sn_v2": "text", + "sn_g": "text", + "fk_s3": "text", + "nuts": "text", + "ars_0": "text", + "ags_0": "text", + "wsk": "text", + "debkg_id": "text", + "rs": "text", + "sdv_rs": "text", + "rs_0": "text", + "geometry": "geometry" + } + } ), NotNullAndNotNaNValidation( table="boundaries.vg250_krs", rule_id="NOT_NAN.vg250_krs", - columns=["gf","bsg"] + columns=["gf", "bsg"] ), WholeTableNotNullAndNotNaNValidation( table="boundaries.vg250_krs", @@ -605,20 +654,32 @@ def __init__(self, dependencies): RowCountValidation( table="society.destatis_zensus_population_per_ha_inside_germany", rule_id="ROW_COUNT.destatis_zensus_population_per_ha_inside_germany", - 
expected_count={"Schleswig-Holstein": 143521, "Everything": 3177723} + expected_count={ + "Schleswig-Holstein": 143521, + "Everything": 3177723 + } ), DataTypeValidation( table="society.destatis_zensus_population_per_ha_inside_germany", rule_id="DATA_TYPES.destatis_zensus_population_per_ha_inside_germany", column_types={ - "id": "integer", "grid_id": "character varying (254)", "population": "smallint", - "geom_point": "geometry","geom": "geometry" + "id": "integer", + "grid_id": "character varying (254)", + "population": "smallint", + "geom_point": "geometry", + "geom": "geometry" } ), NotNullAndNotNaNValidation( table="society.destatis_zensus_population_per_ha_inside_germany", rule_id="NOT_NAN.destatis_zensus_population_per_ha_inside_germany", - columns=["id", "grid_id", "population", "geom_point", "geom"] + columns=[ + "id", + "grid_id", + "population", + "geom_point", + "geom" + ] ), WholeTableNotNullAndNotNaNValidation( table="society.destatis_zensus_population_per_ha_inside_germany", diff --git a/src/egon/data/datasets/zensus/__init__.py b/src/egon/data/datasets/zensus/__init__.py index 6012b1ddf..97147d95c 100755 --- a/src/egon/data/datasets/zensus/__init__.py +++ b/src/egon/data/datasets/zensus/__init__.py @@ -17,7 +17,7 @@ from egon.data.datasets import Dataset import egon.data.config -from egon_validation import( +from egon_validation import ( RowCountValidation, DataTypeValidation, NotNullAndNotNaNValidation, @@ -37,7 +37,7 @@ def __init__(self, dependencies): population_to_postgres, ), validation={ - "data-quality":[ + "data-quality": [ RowCountValidation( table="society.egon_destatis_zensus_apartment_building_population_per_ha", rule_id="ROW_COUNT.egon_destatis_zensus_apartment_building_population_per_ha", @@ -87,7 +87,7 @@ def __init__(self, dependencies): zensus_misc_to_postgres, ), validation={ - "data-quality":[ + "data-quality": [ RowCountValidation( table="society.egon_destatis_zensus_apartment_per_ha", diff --git 
a/src/egon/data/validation/__init__.py b/src/egon/data/validation/__init__.py index 7e7145e0e..9c6c482ac 100644 --- a/src/egon/data/validation/__init__.py +++ b/src/egon/data/validation/__init__.py @@ -7,7 +7,7 @@ validation_dict = {"task_name": [Rule(...), Rule(...)]} 2) "table-first": - validation_dict = {"task_name": [TableValidation(...), TableValidation(...)]} + validation_dict = {"task_name": [TableValidation(...), ...]} """ from .resolver import ( @@ -43,4 +43,4 @@ # airflow "create_validation_tasks", "run_validation_task", -] \ No newline at end of file +] diff --git a/src/egon/data/validation/airflow.py b/src/egon/data/validation/airflow.py index 1188f01c8..cca86ad99 100644 --- a/src/egon/data/validation/airflow.py +++ b/src/egon/data/validation/airflow.py @@ -5,8 +5,7 @@ import logging from functools import partial import re -import hashlib -from typing import Any, Dict, List, Sequence, Set +from typing import Any, Dict, List, Sequence from airflow.operators.python import PythonOperator from egon_validation import RunContext, run_validations @@ -107,8 +106,6 @@ def create_validation_tasks( tasks: List[PythonOperator] = [] - used_task_ids: Set[str] = set() - safe_dataset = sanitize_airflow_key(dataset_name) for task_name, specs in validation_dict.items(): @@ -130,6 +127,7 @@ def create_validation_tasks( return tasks + def sanitize_airflow_key(value: str) -> str: """ Airflow task_id/key must match: [A-Za-z0-9_.-]+ diff --git a/src/egon/data/validation/resolver.py b/src/egon/data/validation/resolver.py index 690da6e3e..327b5ee61 100644 --- a/src/egon/data/validation/resolver.py +++ b/src/egon/data/validation/resolver.py @@ -12,22 +12,27 @@ @dataclass(frozen=True, slots=True) class BoundaryDependent: """ - Wrapper for values that vary by boundary (e.g. Schleswig-Holstein vs Everything). + Wrapper for values that vary by boundary. - At validation runtime, the appropriate value is selected based on the - current boundary setting. + E.g. 
Schleswig-Holstein vs Everything. At validation runtime, the + appropriate value is selected based on the current boundary setting. """ values: Dict[str, Any] def resolve(self, boundary: str) -> Any: - """Return the value for the given boundary, or the whole dict if not found.""" + """Return value for given boundary, or the whole dict if not found.""" if boundary in self.values: - logger.debug("Resolved boundary-dependent value: %s -> %s", boundary, self.values[boundary]) + logger.debug( + "Resolved boundary-dependent value: %s -> %s", + boundary, self.values[boundary] + ) return self.values[boundary] return self.values -def resolve_boundary_dependence(boundary_dict: Dict[str, Any]) -> BoundaryDependent: +def resolve_boundary_dependence( + boundary_dict: Dict[str, Any] +) -> BoundaryDependent: """ Wrap a boundary-dependent dict for deferred resolution. @@ -35,7 +40,9 @@ def resolve_boundary_dependence(boundary_dict: Dict[str, Any]) -> BoundaryDepend current boundary setting. Example: - expected_count=resolve_boundary_dependence({"Schleswig-Holstein": 27, "Everything": 431}) + expected_count=resolve_boundary_dependence( + {"Schleswig-Holstein": 27, "Everything": 431} + ) """ return BoundaryDependent(boundary_dict) @@ -50,4 +57,4 @@ def resolve_value(value: Any, boundary: str) -> Any: if isinstance(value, BoundaryDependent): return value.resolve(boundary) - return value \ No newline at end of file + return value diff --git a/src/egon/data/validation/rules/custom/sanity/electricity_capacity.py b/src/egon/data/validation/rules/custom/sanity/electricity_capacity.py index bd3fe3397..1e1319231 100644 --- a/src/egon/data/validation/rules/custom/sanity/electricity_capacity.py +++ b/src/egon/data/validation/rules/custom/sanity/electricity_capacity.py @@ -229,7 +229,6 @@ def evaluate_df(self, df, ctx): # Case 4: Both > 0 - Check deviation deviation = abs(output_capacity - input_capacity) / input_capacity - deviation_pct = deviation * 100 error_pct = ((output_capacity - 
input_capacity) / input_capacity) * 100 success = deviation <= self.rtol diff --git a/src/egon/data/validation/rules/custom/sanity/gas_stores.py b/src/egon/data/validation/rules/custom/sanity/gas_stores.py index a0e978862..c4eda057f 100644 --- a/src/egon/data/validation/rules/custom/sanity/gas_stores.py +++ b/src/egon/data/validation/rules/custom/sanity/gas_stores.py @@ -7,7 +7,6 @@ from egon_validation.rules.base import DataFrameRule, RuleResult, Severity -from egon.data import config from egon.data.datasets.hydrogen_etrago.storage import ( calculate_and_map_saltcavern_storage_potential ) diff --git a/src/egon/data/validation/rules/custom/sanity/home_batteries.py b/src/egon/data/validation/rules/custom/sanity/home_batteries.py index c4e87790e..828250230 100644 --- a/src/egon/data/validation/rules/custom/sanity/home_batteries.py +++ b/src/egon/data/validation/rules/custom/sanity/home_batteries.py @@ -1,15 +1,14 @@ """ Sanity check validation rules for home batteries -Validates that home battery capacities are correctly aggregated from building-level -to bus-level in the storages table. +Validates that home battery capacities are correctly aggregated +from building-level to bus-level in the storages table. """ -import numpy as np import pandas as pd from egon_validation.rules.base import DataFrameRule, RuleResult, Severity -from egon.data import config, db +from egon.data import config from egon.data.validation.rules.custom.sanity.utils import get_cbat_pbat_ratio @@ -27,8 +26,12 @@ class HomeBatteriesAggregation(DataFrameRule): Both values are rounded to 6 decimal places for comparison. 
""" - def __init__(self, table: str, rule_id: str, scenario: str = "eGon2035", **kwargs): - super().__init__(rule_id=rule_id, table=table, scenario=scenario, **kwargs) + def __init__( + self, table: str, rule_id: str, scenario: str = "eGon2035", **kwargs + ): + super().__init__( + rule_id=rule_id, table=table, scenario=scenario, **kwargs + ) self.kind = "sanity" self.scenario = scenario @@ -37,7 +40,7 @@ def evaluate(self, engine, ctx) -> RuleResult: try: return super().evaluate(engine, ctx) except IndexError as e: - # get_cbat_pbat_ratio() failed because no home_battery data exists + # get_cbat_pbat_ratio() failed - no home_battery data exists if "index 0 is out of bounds" in str(e): return RuleResult( rule_id=self.rule_id, @@ -45,7 +48,10 @@ def evaluate(self, engine, ctx) -> RuleResult: table=self.table, kind=self.kind, success=False, - message=f"⚠️ NO DATA FOUND: No home_battery carrier found in etrago_storage table for scenario {self.scenario}", + message=( + f"NO DATA FOUND: No home_battery carrier found in " + f"etrago_storage table for scenario {self.scenario}" + ), severity=Severity.WARNING, schema=self.schema, table_name=self.table_name, @@ -64,16 +70,21 @@ def get_query(self, ctx): sources = config.datasets()["home_batteries"]["sources"] targets = config.datasets()["home_batteries"]["targets"] - # Get cbat_pbat_ratio for capacity calculation (same as original sanity check) + # Get cbat_pbat_ratio for capacity calculation cbat_pbat_ratio = get_cbat_pbat_ratio() + storage_schema = sources["storage"]["schema"] + storage_table = sources["storage"]["table"] + hb_schema = targets["home_batteries"]["schema"] + hb_table = targets["home_batteries"]["table"] + return f""" WITH storage_data AS ( SELECT bus_id, el_capacity as storage_p_nom, el_capacity * {cbat_pbat_ratio} as storage_capacity - FROM {sources["storage"]["schema"]}.{sources["storage"]["table"]} + FROM {storage_schema}.{storage_table} WHERE carrier = 'home_battery' AND scenario = '{self.scenario}' ), @@ 
-82,7 +93,7 @@ def get_query(self, ctx): bus_id, SUM(p_nom) as building_p_nom, SUM(capacity) as building_capacity - FROM {targets["home_batteries"]["schema"]}.{targets["home_batteries"]["table"]} + FROM {hb_schema}.{hb_table} WHERE scenario = '{self.scenario}' GROUP BY bus_id ) @@ -120,7 +131,9 @@ def evaluate_df(self, df, ctx): table=self.table, kind=self.kind, success=False, - message=f"No home battery data found for scenario {self.scenario}", + message=( + f"No home battery data found for scenario {self.scenario}" + ), severity=Severity.WARNING, schema=self.schema, table_name=self.table_name, @@ -134,14 +147,16 @@ def evaluate_df(self, df, ctx): if not missing_in_storage.empty or not missing_in_buildings.empty: violations = [] if not missing_in_storage.empty: + bus_list = missing_in_storage['bus_id'].tolist()[:5] violations.append( - f"{len(missing_in_storage)} bus(es) in buildings but not in storage: " - f"{missing_in_storage['bus_id'].tolist()[:5]}" + f"{len(missing_in_storage)} bus(es) in buildings " + f"but not in storage: {bus_list}" ) if not missing_in_buildings.empty: + bus_list = missing_in_buildings['bus_id'].tolist()[:5] violations.append( - f"{len(missing_in_buildings)} bus(es) in storage but not in buildings: " - f"{missing_in_buildings['bus_id'].tolist()[:5]}" + f"{len(missing_in_buildings)} bus(es) in storage " + f"but not in buildings: {bus_list}" ) return RuleResult( @@ -152,7 +167,7 @@ def evaluate_df(self, df, ctx): success=False, observed=len(missing_in_storage) + len(missing_in_buildings), expected=0, - message=f"Bus mismatch between tables: {'; '.join(violations)}", + message=f"Bus mismatch: {'; '.join(violations)}", severity=Severity.ERROR, schema=self.schema, table_name=self.table_name, @@ -163,20 +178,26 @@ def evaluate_df(self, df, ctx): p_nom_mismatch = df[df["storage_p_nom"] != df["building_p_nom"]] # Check if capacity values match - capacity_mismatch = df[df["storage_capacity"] != df["building_capacity"]] + cap_mismatch = 
df[df["storage_capacity"] != df["building_capacity"]] # Combine mismatches - mismatches = pd.concat([p_nom_mismatch, capacity_mismatch]).drop_duplicates(subset=["bus_id"]) + mismatches = pd.concat( + [p_nom_mismatch, cap_mismatch] + ).drop_duplicates(subset=["bus_id"]) if not mismatches.empty: # Calculate maximum differences - max_p_nom_diff = (df["storage_p_nom"] - df["building_p_nom"]).abs().max() - max_capacity_diff = (df["storage_capacity"] - df["building_capacity"]).abs().max() + p_nom_diff = df["storage_p_nom"] - df["building_p_nom"] + cap_diff = df["storage_capacity"] - df["building_capacity"] + max_p_nom_diff = p_nom_diff.abs().max() + max_capacity_diff = cap_diff.abs().max() # Get all violations - all_violations = mismatches[ - ["bus_id", "storage_p_nom", "building_p_nom", "storage_capacity", "building_capacity"] - ].to_dict(orient="records") + cols = [ + "bus_id", "storage_p_nom", "building_p_nom", + "storage_capacity", "building_capacity" + ] + all_violations = mismatches[cols].to_dict(orient="records") return RuleResult( rule_id=self.rule_id, @@ -187,8 +208,10 @@ def evaluate_df(self, df, ctx): observed=float(max(max_p_nom_diff, max_capacity_diff)), expected=0.0, message=( - f"Home battery aggregation mismatch for {len(mismatches)} bus(es): " - f"max p_nom diff={max_p_nom_diff:.6f}, max capacity diff={max_capacity_diff:.6f}. " + f"Home battery aggregation mismatch for " + f"{len(mismatches)} bus(es): " + f"max p_nom diff={max_p_nom_diff:.6f}, " + f"max capacity diff={max_capacity_diff:.6f}. 
" f"violations: {all_violations}" ), severity=Severity.ERROR, @@ -206,8 +229,12 @@ def evaluate_df(self, df, ctx): success=True, observed=0.0, expected=0.0, - message=f"Home battery capacities correctly aggregated for all {len(df)} buses in scenario {self.scenario}", + message=( + f"Home battery capacities correctly aggregated for all " + f"{len(df)} buses in scenario {self.scenario}" + ), schema=self.schema, table_name=self.table_name, rule_class=self.__class__.__name__ - ) \ No newline at end of file + ) + diff --git a/src/egon/data/validation/rules/custom/sanity/utils.py b/src/egon/data/validation/rules/custom/sanity/utils.py index 9b77dd619..239fa7eea 100644 --- a/src/egon/data/validation/rules/custom/sanity/utils.py +++ b/src/egon/data/validation/rules/custom/sanity/utils.py @@ -23,4 +23,4 @@ def get_cbat_pbat_ratio(): WHERE carrier = 'home_battery' """ - return int(db.select_dataframe(sql).iat[0, 0]) \ No newline at end of file + return int(db.select_dataframe(sql).iat[0, 0]) diff --git a/src/egon/data/validation/specs.py b/src/egon/data/validation/specs.py index 765881c47..f2d2138fb 100644 --- a/src/egon/data/validation/specs.py +++ b/src/egon/data/validation/specs.py @@ -93,7 +93,8 @@ def to_rules(self) -> List[Rule]: rules.append( ValueSetValidation( table=self.table_name, - rule_id=f"VALUE_SET_{str(col_name).upper()}.{table_suffix}", + rule_id=f"VALUE_SET_{str(col_name).upper()}" + f".{table_suffix}", column=str(col_name), expected_values=expected_values, ) @@ -115,22 +116,24 @@ def to_rules(self) -> List[Rule]: def clone_rule(rule: Rule) -> Rule: """ - Creates a per-run copy of a rule so we don't mutate DAG-parse-time objects. + Creates a per-run copy of a rule to avoid mutating DAG-parse-time objects. - We avoid deepcopy as the first choice (deepcopy can break on complex objects). + We avoid deepcopy as first choice (can break on complex objects). 
Strategy: 1) Shallow copy the object 2) Deep copy ONLY rule.params (the part we mutate) 3) Fallback to deepcopy(rule) if shallow copy fails """ try: - cloned = copy.copy(rule) # shallow copy: new object, same inner references + # shallow copy: new object, same inner references + cloned = copy.copy(rule) except Exception: # Last resort: full deepcopy return copy.deepcopy(rule) # Make params safe to mutate - if hasattr(cloned, "params") and isinstance(getattr(cloned, "params"), dict): + params = getattr(cloned, "params", None) + if hasattr(cloned, "params") and isinstance(params, dict): cloned.params = copy.deepcopy(cloned.params) return cloned @@ -138,7 +141,8 @@ def clone_rule(rule: Rule) -> Rule: def expand_specs(specs: Sequence[ValidationSpec]) -> List[Rule]: """ - Turn a mixed list of Rule/TableValidation into a plain list of Rule objects. + Turn a mixed list of Rule/TableValidation into a plain list of Rules. + TableValidation produces fresh rule instances. Rule instances are cloned to avoid cross-run mutation. """ @@ -165,7 +169,11 @@ def resolve_rule_params(rule: Rule, boundary: str) -> None: for name, val in list(params.items()): resolved = resolve_value(val, boundary) if resolved is not val: - logger.info("Rule %s: Resolved %s for boundary='%s'", getattr(rule, "rule_id", ""), name, boundary) + rule_id = getattr(rule, "rule_id", "") + logger.info( + "Rule %s: Resolved %s for boundary='%s'", + rule_id, name, boundary + ) params[name] = resolved @@ -191,4 +199,4 @@ def prepare_rules( resolve_rule_params(rule, boundary) - return rules \ No newline at end of file + return rules