Skip to content

Commit 469bc04

Browse files
merge: changes from feature/ndit-512_dc_codes_granular_null_vs_format
2 parents 1d62fe7 + cfd4b20 commit 469bc04

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+625
-313
lines changed

docs/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ DVE configuration can be instantiated from a json (dischema) file which might be
1818
{
1919
"contract": {
2020
"cache_originals": true,
21-
"contract_error_codes": null,
21+
"error_details": null,
2222
"types": {},
2323
"schemas": {},
2424
"datasets": {

docs/detailed_guidance/data_contract.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ Lets look at the data contract configuration from [Introduction to DVE](../READM
44
{
55
"contract": {
66
"cache_originals": true,
7-
"contract_error_codes": null,
7+
"error_details": null,
88
"types": {},
99
"schemas": {},
1010
"datasets": {
@@ -78,7 +78,7 @@ Here we have only filled out datasets. We've added a few more fields such as `Pe
7878
{
7979
"contract": {
8080
"cache_originals": true,
81-
"contract_error_codes": null,
81+
"error_details": null,
8282
"types": {
8383
"isodate": {
8484
"description": "an isoformatted date type",
@@ -172,7 +172,7 @@ We can see here that the Activity has a number of fields. `startdate`, `enddate`
172172
{
173173
"contract": {
174174
"cache_originals": true,
175-
"contract_error_codes": null,
175+
"error_details": null,
176176
"types": {
177177
"isodate": {
178178
"description": "an isoformatted date type",
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
{
2+
"$schema": "https://json-schema.org/draft-07/schema",
3+
"$id": "data-ingest:contract/components/contract_error_details.schema.json",
4+
"title": "base_entity",
5+
"description": "A mapping of field names to the custom error code and message required if these fields were to fail validation during the data contract phase. For nested fields, these should be specified using struct '.' notation (eg. fieldA.fieldB.fieldC)",
6+
"type": "object",
7+
"additionalProperties": {
8+
"$ref": "field_error_type.schema.json"
9+
}
10+
}
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
{
2+
"$schema": "https://json-schema.org/draft-07/schema",
3+
"$id": "data-ingest:contract/components/field_error_detail.schema.json",
4+
"title": "field_error_detail",
5+
"description": "The custom details to be used for a field when a validation error is raised during the data contract phase",
6+
"type": "object",
7+
"properties": {
8+
"error_code": {
9+
"description": "The code to be used for the field and error type specified",
10+
"type": "string"
11+
},
12+
"error_message": {
13+
"description": "The message to be used for the field and error type specified. This can include templating (specified using jinja2 conventions). During templating, the full record will be available with an additional __error_value to easily obtain nested offending values.",
14+
"type": "string",
15+
"enum": [
16+
"record_rejection",
17+
"file_rejection",
18+
"warning"
19+
]
20+
}
21+
},
22+
"required": [
23+
"error_code",
24+
"error_message"
25+
],
26+
"additionalProperties": false
27+
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
{
2+
"$schema": "https://json-schema.org/draft-07/schema",
3+
"$id": "data-ingest:contract/components/field_error_type.schema.json",
4+
"title": "field_error_detail",
5+
"description": "The error type for a field when a validation error is raised during the data contract phase",
6+
"type": "object",
7+
"properties": {
8+
"error_type": {
9+
"description": "The type of error the details are for",
10+
"type": "string",
11+
"enum": [
12+
"Blank",
13+
"Bad value",
14+
"Wrong format"
15+
],
16+
"additionalProperties": {
17+
"$ref": "field_error_detail.schema.json"
18+
}
19+
}
20+
}
21+
}

src/dve/core_engine/backends/implementations/duckdb/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
"""Implementation of duckdb backend"""
12
from dve.core_engine.backends.implementations.duckdb.readers.json import DuckDBJSONReader
23
from dve.core_engine.backends.readers import register_reader
34

src/dve/core_engine/backends/implementations/duckdb/auditing.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,14 @@
1515
PYTHON_TYPE_TO_DUCKDB_TYPE,
1616
table_exists,
1717
)
18+
from dve.core_engine.backends.utilities import PYTHON_TYPE_TO_POLARS_TYPE
1819
from dve.core_engine.models import (
1920
AuditRecord,
2021
ProcessingStatusRecord,
2122
SubmissionInfo,
2223
SubmissionStatisticsRecord,
2324
TransferRecord,
2425
)
25-
from dve.core_engine.backends.utilities import PYTHON_TYPE_TO_POLARS_TYPE
2626
from dve.core_engine.type_hints import URI, ExecutorType
2727

2828

src/dve/core_engine/backends/implementations/duckdb/contract.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
from dve.core_engine.backends.implementations.duckdb.types import DuckDBEntities
2626
from dve.core_engine.backends.metadata.contract import DataContractMetadata
2727
from dve.core_engine.backends.types import StageSuccessful
28-
from dve.core_engine.backends.utilities import stringify_model, get_polars_type_from_annotation
28+
from dve.core_engine.backends.utilities import get_polars_type_from_annotation, stringify_model
2929
from dve.core_engine.message import FeedbackMessage
3030
from dve.core_engine.type_hints import URI, Messages
3131
from dve.core_engine.validation import RowValidator

src/dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,9 @@
1111

1212
import duckdb.typing as ddbtyp
1313
import numpy as np
14-
import polars as pl # type: ignore
1514
from duckdb import DuckDBPyConnection, DuckDBPyRelation
1615
from duckdb.typing import DuckDBPyType
1716
from pandas import DataFrame
18-
1917
from pydantic import BaseModel
2018
from typing_extensions import Annotated, get_args, get_origin, get_type_hints
2119

@@ -91,6 +89,7 @@ def __call__(self):
9189
}
9290
"""A mapping of Python types to the equivalent DuckDB types."""
9391

92+
9493
def table_exists(connection: DuckDBPyConnection, table_name: str) -> bool:
9594
"""check if a table exists in a given DuckDBPyConnection"""
9695
return table_name in map(lambda x: x[0], connection.sql("SHOW TABLES").fetchall())
@@ -190,6 +189,7 @@ def get_duckdb_type_from_annotation(type_annotation: Any) -> DuckDBPyType:
190189
return duck_type
191190
raise ValueError(f"No equivalent DuckDB type for {type_annotation!r}")
192191

192+
193193
def coerce_inferred_numpy_array_to_list(pandas_df: DataFrame) -> DataFrame:
194194
"""Function to modify numpy inferred array when cnverting from duckdb relation to
195195
pandas dataframe - these cause issues with pydantic models
@@ -224,23 +224,20 @@ def _ddb_read_parquet(
224224

225225

226226
def _ddb_write_parquet( # pylint: disable=unused-argument
227-
self,
228-
entity: Union[Iterator[Dict[str, Any]],
229-
DuckDBPyRelation],
230-
target_location: URI,
231-
**kwargs
227+
self, entity: Union[Iterator[Dict[str, Any]], DuckDBPyRelation], target_location: URI, **kwargs
232228
) -> URI:
233229
"""Method to write parquet files from type cast entities
234230
following data contract application
235231
"""
236232
if isinstance(_get_implementation(target_location), LocalFilesystemImplementation):
237233
Path(target_location).parent.mkdir(parents=True, exist_ok=True)
238-
234+
239235
if isinstance(entity, Generator):
240-
entity = self._connection.query("select dta.* from (select unnest($data) as dta)",
241-
params={"data": list(entity)})
236+
entity = self._connection.query(
237+
"select dta.* from (select unnest($data) as dta)", params={"data": list(entity)}
238+
)
242239

243-
entity.to_parquet(file_name=target_location, compression="snappy", **kwargs)
240+
entity.to_parquet(file_name=target_location, compression="snappy", **kwargs) # type: ignore
244241
return target_location
245242

246243

src/dve/core_engine/backends/implementations/duckdb/readers/csv.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
import duckdb as ddb
77
import polars as pl
8-
from duckdb import DuckDBPyConnection, DuckDBPyRelation, read_csv, default_connection
8+
from duckdb import DuckDBPyConnection, DuckDBPyRelation, default_connection, read_csv
99
from pydantic import BaseModel
1010

1111
from dve.core_engine.backends.base.reader import BaseFileReader, read_function
@@ -20,6 +20,7 @@
2020
from dve.core_engine.type_hints import URI, EntityName
2121
from dve.parser.file_handling import get_content_length
2222

23+
2324
@duckdb_write_parquet
2425
class DuckDBCSVReader(BaseFileReader):
2526
"""A reader for CSV files"""

0 commit comments

Comments
 (0)