
Commit cfd4b20

feat: tweaked json schemas for contract error details. Fixed failing unit test
1 parent: 3baf205

13 files changed: +27 −38 lines changed

docs/README.md

Lines changed: 1 addition & 1 deletion

@@ -18,7 +18,7 @@ DVE configuration can be instantiated from a json (dischema) file which might be
 {
   "contract": {
     "cache_originals": true,
-    "contract_error_codes": null,
+    "error_details": null,
     "types": {},
     "schemas": {},
     "datasets": {

docs/detailed_guidance/data_contract.md

Lines changed: 3 additions & 3 deletions

@@ -4,7 +4,7 @@ Lets look at the data contract configuration from [Introduction to DVE](../READM
 {
   "contract": {
     "cache_originals": true,
-    "contract_error_codes": null,
+    "error_details": null,
     "types": {},
     "schemas": {},
     "datasets": {
@@ -78,7 +78,7 @@ Here we have only filled out datasets. We've added a few more fields such as `Pe
 {
   "contract": {
     "cache_originals": true,
-    "contract_error_codes": null,
+    "error_details": null,
     "types": {
       "isodate": {
         "description": "an isoformatted date type",
@@ -172,7 +172,7 @@ We can see here that the Activity has a number of fields. `startdate`, `enddate`
 {
   "contract": {
     "cache_originals": true,
-    "contract_error_codes": null,
+    "error_details": null,
     "types": {
       "isodate": {
         "description": "an isoformatted date type",

docs/json_schemas/contract/components/contact_error_details.schema.json

Lines changed: 3 additions & 9 deletions

@@ -2,15 +2,9 @@
   "$schema": "https://json-schema.org/draft-07/schema",
   "$id": "data-ingest:contract/components/contract_error_details.schema.json",
   "title": "base_entity",
-  "description": "An optional specification of custom error codes and messages for fields for the data contract phase of validation",
+  "description": "A mapping of field names to the custom error code and message required if these fields were to fail validation during the data contract phase. For nested fields, these should be specified using struct '.' notation (eg. fieldA.fieldB.fieldC)",
   "type": "object",
-  "properties": {
-    "field_name": {
-      "type": "object",
-      "description": "A mapping of field names to the custom error code and message required if these fields were to fail validation during the data contract phase. For nested fields, these should be specified using struct '.' notation (eg. fieldA.fieldB.fieldC)",
-      "additionalProperties": {
-        "$ref": "field_error_type.schema.json"
-      }
-    }
+  "additionalProperties": {
+    "$ref": "field_error_type.schema.json"
   }
 }
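
For orientation: the flattened schema validates a mapping whose keys are the field names themselves (dot notation for nested fields), with no intermediate `properties`/`field_name` wrapper. A minimal sketch of a conforming value, written as a Python dict. The per-category layer ("Wrong format") is inferred from the message.py and validation.py changes below; the `error_code`/`error_message` keys and the field names are hypothetical, since the real per-field shape lives in field_error_type.schema.json:

# Hedged sketch: field name (dot notation for nesting) -> category -> detail.
# "error_code"/"error_message" keys and all field names are hypothetical;
# field_error_type.schema.json defines the actual per-field shape.
error_details = {
    "startdate": {
        "Wrong format": {
            "error_code": "DVE-0042",
            "error_message": "startdate must be an ISO-formatted date",
        },
    },
    "activity.location.postcode": {  # nested field via '.' notation
        "Wrong format": {
            "error_code": "DVE-0043",
            "error_message": "postcode failed data contract validation",
        },
    },
}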

src/dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py

Lines changed: 1 addition & 1 deletion (whitespace-only change)

@@ -237,7 +237,7 @@ def _ddb_write_parquet( # pylint: disable=unused-argument
         "select dta.* from (select unnest($data) as dta)", params={"data": list(entity)}
     )

-    entity.to_parquet(file_name=target_location, compression="snappy", **kwargs) # type: ignore
+    entity.to_parquet(file_name=target_location, compression="snappy", **kwargs) # type: ignore
     return target_location

src/dve/core_engine/backends/implementations/duckdb/readers/json.py

Lines changed: 1 addition & 1 deletion (whitespace-only change)

@@ -41,4 +41,4 @@ def read_to_relation( # pylint: disable=unused-argument
         for fld in schema.__fields__.values()
     }

-    return read_json(resource, columns=ddb_schema, format=self._json_format) # type: ignore
+    return read_json(resource, columns=ddb_schema, format=self._json_format) # type: ignore

src/dve/core_engine/backends/implementations/spark/readers/csv.py

Lines changed: 3 additions & 3 deletions

@@ -1,4 +1,4 @@
-"""A reader implementation using the Databricks Spark XML reader."""
+"""A reader implementation using the Databricks Spark CSV reader."""


 from typing import Any, Dict, Iterator, Type
@@ -19,7 +19,7 @@

 @spark_write_parquet
 class SparkCSVReader(BaseFileReader):
-    """A Spark reader for JSON files."""
+    """A Spark reader for CSV files."""

     def __init__(
         self,
@@ -56,7 +56,7 @@ def read_to_dataframe(
         entity_name: EntityName, # pylint: disable=unused-argument
         schema: Type[BaseModel],
     ) -> DataFrame:
-        """Read an JSON file directly to a Spark DataFrame."""
+        """Read a CSV file directly to a Spark DataFrame."""
         if get_content_length(resource) == 0:
             raise EmptyFileError(f"File at {resource} is empty.")

src/dve/core_engine/backends/implementations/spark/readers/json.py

Lines changed: 2 additions & 2 deletions

@@ -1,4 +1,4 @@
-"""A reader implementation using the Databricks Spark XML reader."""
+"""A reader implementation using the Databricks Spark JSON reader."""


 from typing import Any, Dict, Iterator, Optional, Type
@@ -48,7 +48,7 @@ def read_to_dataframe(
         entity_name: EntityName, # pylint: disable=unused-argument
         schema: Type[BaseModel],
     ) -> DataFrame:
-        """Read an JSON file directly to a Spark DataFrame."""
+        """Read a JSON file directly to a Spark DataFrame."""
         if get_content_length(resource) == 0:
             raise EmptyFileError(f"File at {resource} is empty.")

src/dve/core_engine/backends/implementations/spark/spark_helpers.py

Lines changed: 4 additions & 3 deletions

@@ -352,13 +352,14 @@ def _spark_write_parquet( # pylint: disable=unused-argument
         """Method to write parquet files from type cast entities
         following data contract application
         """
+        _options: Dict[str, Any] = {**kwargs}
         if isinstance(entity, Generator):
            _writer = self.spark_session.createDataFrame(entity).write
         else:
-            _options = {"schema": entity.schema, **kwargs} # type: ignore
-            _writer = entity.write.options(**_options) # type: ignore
+            _options["schema"] = entity.schema # type: ignore
+            _writer = entity.write

-        (_writer.format("parquet").mode("overwrite").save(target_location))
+        (_writer.options(**_options).format("parquet").mode("overwrite").save(target_location))
         return target_location
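
A hedged sketch of what this refactor changes behaviourally: previously, caller-supplied `kwargs` reached the writer only on the non-generator branch, while the generator branch wrote with no options at all; hoisting `_options` applies a single options dict uniformly at write time. This is a simplified stand-in, not the real DVE helper: the Generator check is collapsed to an isinstance check on DataFrame so the sketch is self-contained, and passing `schema` as a writer option mirrors the diff above rather than recommended Spark practice.

from typing import Any, Dict, Iterable, Union

from pyspark.sql import DataFrame, SparkSession


def write_parquet_sketch(
    spark: SparkSession,
    entity: Union[DataFrame, Iterable[dict]],
    target_location: str,
    **kwargs: Any,
) -> str:
    """Hedged sketch of the refactored write path (not the real DVE helper)."""
    _options: Dict[str, Any] = {**kwargs}    # caller options now reach BOTH branches
    if isinstance(entity, DataFrame):
        _options["schema"] = entity.schema   # DataFrame branch also records its schema, as in the diff
        _writer = entity.write
    else:
        _writer = spark.createDataFrame(list(entity)).write
    _writer.options(**_options).format("parquet").mode("overwrite").save(target_location)
    return target_location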

src/dve/core_engine/message.py

Lines changed: 1 addition & 1 deletion (whitespace-only change)

@@ -223,7 +223,7 @@ def from_pydantic_error(
         is_informational = False
         if error_code.endswith("warning"):
             is_informational = True
-        error_detail: DataContractErrorDetail = error_details.get( # type: ignore
+        error_detail: DataContractErrorDetail = error_details.get( # type: ignore
             error_field, DEFAULT_ERROR_DETAIL
         ).get(category)

src/dve/core_engine/validation.py

Lines changed: 2 additions & 2 deletions

@@ -126,8 +126,8 @@ def handle_warnings(self, record, caught_warnings) -> List[FeedbackMessage]:
         else:
             error_location = None
         error_code = (
-            self.error_details.get(error_location, DEFAULT_ERROR_DETAIL) # type: ignore
-            .get("Wrong Format")
+            self.error_details.get(error_location, DEFAULT_ERROR_DETAIL) # type: ignore
+            .get("Wrong format")
             .error_code
         )
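
The substantive fix here is casing: category keys in the per-field detail map are exact-match dict lookups, so `"Wrong Format"` silently misses an entry stored under `"Wrong format"`, and the subsequent `.error_code` access then blows up. A runnable toy illustration; the `Detail` class and codes are stand-ins, not the real `DataContractErrorDetail`:

from dataclasses import dataclass


@dataclass
class Detail:
    """Stand-in for DataContractErrorDetail (shape assumed)."""
    error_code: str


# Hypothetical values for illustration only.
DEFAULT_ERROR_DETAIL = {"Wrong format": Detail("DVE-DEFAULT")}
error_details = {"startdate": {"Wrong format": Detail("DVE-0042")}}

detail_map = error_details.get("startdate", DEFAULT_ERROR_DETAIL)  # per-field map, default fallback
assert detail_map.get("Wrong format").error_code == "DVE-0042"     # exact-case key: found
assert detail_map.get("Wrong Format") is None                      # old casing: silent miss, AttributeError downstream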
