
Commit cfd4b20

feat: tweaked json schemas for contract error details. Fixed failing unit test
1 parent: 3baf205

13 files changed: +27 −38 lines changed

docs/README.md

Lines changed: 1 addition & 1 deletion

@@ -18,7 +18,7 @@ DVE configuration can be instantiated from a json (dischema) file which might be
 {
   "contract": {
     "cache_originals": true,
-    "contract_error_codes": null,
+    "error_details": null,
     "types": {},
     "schemas": {},
     "datasets": {

docs/detailed_guidance/data_contract.md

Lines changed: 3 additions & 3 deletions

@@ -4,7 +4,7 @@ Lets look at the data contract configuration from [Introduction to DVE](../READM
 {
   "contract": {
     "cache_originals": true,
-    "contract_error_codes": null,
+    "error_details": null,
     "types": {},
     "schemas": {},
     "datasets": {
@@ -78,7 +78,7 @@ Here we have only filled out datasets. We've added a few more fields such as `Pe
 {
   "contract": {
     "cache_originals": true,
-    "contract_error_codes": null,
+    "error_details": null,
     "types": {
       "isodate": {
         "description": "an isoformatted date type",
@@ -172,7 +172,7 @@ We can see here that the Activity has a number of fields. `startdate`, `enddate`
 {
   "contract": {
     "cache_originals": true,
-    "contract_error_codes": null,
+    "error_details": null,
     "types": {
       "isodate": {
         "description": "an isoformatted date type",

docs/json_schemas/contract/components/contact_error_details.schema.json

Lines changed: 3 additions & 9 deletions

@@ -2,15 +2,9 @@
   "$schema": "https://json-schema.org/draft-07/schema",
   "$id": "data-ingest:contract/components/contract_error_details.schema.json",
   "title": "base_entity",
-  "description": "An optional specification of custom error codes and messages for fields for the data contract phase of validation",
+  "description": "A mapping of field names to the custom error code and message required if these fields were to fail validation during the data contract phase. For nested fields, these should be specified using struct '.' notation (eg. fieldA.fieldB.fieldC)",
   "type": "object",
-  "properties": {
-    "field_name": {
-      "type": "object",
-      "description": "A mapping of field names to the custom error code and message required if these fields were to fail validation during the data contract phase. For nested fields, these should be specified using struct '.' notation (eg. fieldA.fieldB.fieldC)",
-      "additionalProperties": {
-        "$ref": "field_error_type.schema.json"
-      }
-    }
+  "additionalProperties": {
+    "$ref": "field_error_type.schema.json"
   }
 }
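
For orientation: the flattened schema validates a mapping whose keys are the field names themselves (dot notation for nested fields), with no intermediate `properties`/`field_name` wrapper. A minimal sketch of a conforming value, written as a Python dict. The per-category layer ("Wrong format") is inferred from the message.py and validation.py changes below; the `error_code`/`error_message` keys and the field names are hypothetical, since the real per-field shape lives in field_error_type.schema.json:

# Hedged sketch: field name (dot notation for nesting) -> category -> detail.
# "error_code"/"error_message" keys and all field names are hypothetical;
# field_error_type.schema.json defines the actual per-field shape.
error_details = {
    "startdate": {
        "Wrong format": {
            "error_code": "DVE-0042",
            "error_message": "startdate must be an ISO-formatted date",
        },
    },
    "activity.location.postcode": {  # nested field via '.' notation
        "Wrong format": {
            "error_code": "DVE-0043",
            "error_message": "postcode failed data contract validation",
        },
    },
}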

src/dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py

Lines changed: 1 addition & 1 deletion (whitespace-only change)

@@ -237,7 +237,7 @@ def _ddb_write_parquet( # pylint: disable=unused-argument
         "select dta.* from (select unnest($data) as dta)", params={"data": list(entity)}
     )

-    entity.to_parquet(file_name=target_location, compression="snappy", **kwargs) # type: ignore
+    entity.to_parquet(file_name=target_location, compression="snappy", **kwargs) # type: ignore
     return target_location

src/dve/core_engine/backends/implementations/duckdb/readers/json.py

Lines changed: 1 addition & 1 deletion (whitespace-only change)

@@ -41,4 +41,4 @@ def read_to_relation( # pylint: disable=unused-argument
         for fld in schema.__fields__.values()
     }

-    return read_json(resource, columns=ddb_schema, format=self._json_format) # type: ignore
+    return read_json(resource, columns=ddb_schema, format=self._json_format) # type: ignore

src/dve/core_engine/backends/implementations/spark/readers/csv.py

Lines changed: 3 additions & 3 deletions

@@ -1,4 +1,4 @@
-"""A reader implementation using the Databricks Spark XML reader."""
+"""A reader implementation using the Databricks Spark CSV reader."""


 from typing import Any, Dict, Iterator, Type
@@ -19,7 +19,7 @@

 @spark_write_parquet
 class SparkCSVReader(BaseFileReader):
-    """A Spark reader for JSON files."""
+    """A Spark reader for CSV files."""

     def __init__(
         self,
@@ -56,7 +56,7 @@ def read_to_dataframe(
         entity_name: EntityName, # pylint: disable=unused-argument
         schema: Type[BaseModel],
     ) -> DataFrame:
-        """Read an JSON file directly to a Spark DataFrame."""
+        """Read a CSV file directly to a Spark DataFrame."""
         if get_content_length(resource) == 0:
             raise EmptyFileError(f"File at {resource} is empty.")

src/dve/core_engine/backends/implementations/spark/readers/json.py

Lines changed: 2 additions & 2 deletions

@@ -1,4 +1,4 @@
-"""A reader implementation using the Databricks Spark XML reader."""
+"""A reader implementation using the Databricks Spark JSON reader."""


 from typing import Any, Dict, Iterator, Optional, Type
@@ -48,7 +48,7 @@ def read_to_dataframe(
         entity_name: EntityName, # pylint: disable=unused-argument
         schema: Type[BaseModel],
     ) -> DataFrame:
-        """Read an JSON file directly to a Spark DataFrame."""
+        """Read a JSON file directly to a Spark DataFrame."""
         if get_content_length(resource) == 0:
             raise EmptyFileError(f"File at {resource} is empty.")

src/dve/core_engine/backends/implementations/spark/spark_helpers.py

Lines changed: 4 additions & 3 deletions

@@ -352,13 +352,14 @@ def _spark_write_parquet( # pylint: disable=unused-argument
         """Method to write parquet files from type cast entities
         following data contract application
         """
+        _options: Dict[str, Any] = {**kwargs}
         if isinstance(entity, Generator):
            _writer = self.spark_session.createDataFrame(entity).write
         else:
-            _options = {"schema": entity.schema, **kwargs} # type: ignore
-            _writer = entity.write.options(**_options) # type: ignore
+            _options["schema"] = entity.schema # type: ignore
+            _writer = entity.write

-        (_writer.format("parquet").mode("overwrite").save(target_location))
+        (_writer.options(**_options).format("parquet").mode("overwrite").save(target_location))
         return target_location
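
A hedged sketch of what this refactor changes behaviourally: previously, caller-supplied `kwargs` reached the writer only on the non-generator branch, while the generator branch wrote with no options at all; hoisting `_options` applies a single options dict uniformly at write time. This is a simplified stand-in, not the real DVE helper: the Generator check is collapsed to an isinstance check on DataFrame so the sketch is self-contained, and passing `schema` as a writer option mirrors the diff above rather than recommended Spark practice.

from typing import Any, Dict, Iterable, Union

from pyspark.sql import DataFrame, SparkSession


def write_parquet_sketch(
    spark: SparkSession,
    entity: Union[DataFrame, Iterable[dict]],
    target_location: str,
    **kwargs: Any,
) -> str:
    """Hedged sketch of the refactored write path (not the real DVE helper)."""
    _options: Dict[str, Any] = {**kwargs}    # caller options now reach BOTH branches
    if isinstance(entity, DataFrame):
        _options["schema"] = entity.schema   # DataFrame branch also records its schema, as in the diff
        _writer = entity.write
    else:
        _writer = spark.createDataFrame(list(entity)).write
    _writer.options(**_options).format("parquet").mode("overwrite").save(target_location)
    return target_location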

src/dve/core_engine/message.py

Lines changed: 1 addition & 1 deletion (whitespace-only change)

@@ -223,7 +223,7 @@ def from_pydantic_error(
         is_informational = False
         if error_code.endswith("warning"):
             is_informational = True
-        error_detail: DataContractErrorDetail = error_details.get( # type: ignore
+        error_detail: DataContractErrorDetail = error_details.get( # type: ignore
             error_field, DEFAULT_ERROR_DETAIL
         ).get(category)

src/dve/core_engine/validation.py

Lines changed: 2 additions & 2 deletions

@@ -126,8 +126,8 @@ def handle_warnings(self, record, caught_warnings) -> List[FeedbackMessage]:
         else:
             error_location = None
         error_code = (
-            self.error_details.get(error_location, DEFAULT_ERROR_DETAIL) # type: ignore
-            .get("Wrong Format")
+            self.error_details.get(error_location, DEFAULT_ERROR_DETAIL) # type: ignore
+            .get("Wrong format")
             .error_code
         )
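
The substantive fix here is casing: category keys in the per-field detail map are exact-match dict lookups, so `"Wrong Format"` silently misses an entry stored under `"Wrong format"`, and the subsequent `.error_code` access then blows up. A runnable toy illustration; the `Detail` class and codes are stand-ins, not the real `DataContractErrorDetail`:

from dataclasses import dataclass


@dataclass
class Detail:
    """Stand-in for DataContractErrorDetail (shape assumed)."""
    error_code: str


# Hypothetical values for illustration only.
DEFAULT_ERROR_DETAIL = {"Wrong format": Detail("DVE-DEFAULT")}
error_details = {"startdate": {"Wrong format": Detail("DVE-0042")}}

detail_map = error_details.get("startdate", DEFAULT_ERROR_DETAIL)  # per-field map, default fallback
assert detail_map.get("Wrong format").error_code == "DVE-0042"     # exact-case key: found
assert detail_map.get("Wrong Format") is None                      # old casing: silent miss, AttributeError downstream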
