Skip to content

Commit 3bfccde

Browse files
v1.1.0 - custom error codes for data contract, new readers and various bugfixes (#7)
* feat: initial work to add custom error messages to data contract * feat: added support for nested fields when configuring custom error details for data contract. Includes accessing nested error values in error messages. * feat: bug fixes with readers and base pipeline. Added spark csv reader. Added new dischema files for tests * feat: small fixes and movies dataset working up to end of data contract * feat: further refinement of movies test dataset. * feat: improved movies dataset test coverage. Added testing for spark and duckdb refdata loaders when table config specified. * feat: sorted linting. Added json schema docs * feat: tweaked json schemas for contract error details. Fixed failing unit test * style: fix mypy error with entity.write * feature: more duckdb csv readers (#6) * feat: add new duckdb csv readers * style: fix polars typing issue and white space in docstrings * bump: version 1.0.0 → 1.1.0 * docs: Amended changelog --------- Co-authored-by: georgeRobertson <50412379+georgeRobertson@users.noreply.github.com>
1 parent 970f0ef commit 3bfccde

File tree

68 files changed

+2191
-284
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

68 files changed

+2191
-284
lines changed

CHANGELOG.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,18 @@
1+
## v1.1.0 (2025-10-28)
2+
3+
### Feat
4+
5+
- Added ability to define custom error codes and templated messages for data contract feedback messages
6+
- Added new JSON readers
7+
- Added SparkCSVReader
8+
- Added PolarsToDuckDBCSVReader and DuckDBCSVRepeatingHeaderReader
9+
- Added quotechar option to DuckDBCSVReader
10+
11+
### Fix
12+
- Fixed issues with refdata loader table implementations
13+
- Fixed duckdb try_cast statements in data contract phase
14+
- Allowed use of entity type in file transformation
15+
116
## 1.0.0 (2025-10-09)
217

318
### Refactor

docs/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ DVE configuration can be instantiated from a json (dischema) file which might be
1818
{
1919
"contract": {
2020
"cache_originals": true,
21-
"contract_error_codes": null,
21+
"error_details": null,
2222
"types": {},
2323
"schemas": {},
2424
"datasets": {

docs/detailed_guidance/data_contract.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ Let's look at the data contract configuration from [Introduction to DVE](../READM
44
{
55
"contract": {
66
"cache_originals": true,
7-
"contract_error_codes": null,
7+
"error_details": null,
88
"types": {},
99
"schemas": {},
1010
"datasets": {
@@ -78,7 +78,7 @@ Here we have only filled out datasets. We've added a few more fields such as `Pe
7878
{
7979
"contract": {
8080
"cache_originals": true,
81-
"contract_error_codes": null,
81+
"error_details": null,
8282
"types": {
8383
"isodate": {
8484
"description": "an isoformatted date type",
@@ -172,7 +172,7 @@ We can see here that the Activity has a number of fields. `startdate`, `enddate`
172172
{
173173
"contract": {
174174
"cache_originals": true,
175-
"contract_error_codes": null,
175+
"error_details": null,
176176
"types": {
177177
"isodate": {
178178
"description": "an isoformatted date type",
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
{
2+
"$schema": "https://json-schema.org/draft-07/schema",
3+
"$id": "data-ingest:contract/components/contract_error_details.schema.json",
4+
"title": "contract_error_details",
5+
"description": "A mapping of field names to the custom error code and message required if these fields were to fail validation during the data contract phase. For nested fields, these should be specified using struct '.' notation (eg. fieldA.fieldB.fieldC)",
6+
"type": "object",
7+
"additionalProperties": {
8+
"$ref": "field_error_type.schema.json"
9+
}
10+
}
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
{
2+
"$schema": "https://json-schema.org/draft-07/schema",
3+
"$id": "data-ingest:contract/components/field_error_detail.schema.json",
4+
"title": "field_error_detail",
5+
"description": "The custom details to be used for a field when a validation error is raised during the data contract phase",
6+
"type": "object",
7+
"properties": {
8+
"error_code": {
9+
"description": "The code to be used for the field and error type specified",
10+
"type": "string"
11+
},
12+
"error_message": {
13+
"description": "The message to be used for the field and error type specified. This can include templating (specified using jinja2 conventions). During templating, the full record will be available with an additional __error_value to easily obtain nested offending values.",
14+
"type": "string",
15+
"enum": [
16+
"record_rejection",
17+
"file_rejection",
18+
"warning"
19+
]
20+
}
21+
},
22+
"required": [
23+
"error_code",
24+
"error_message"
25+
],
26+
"additionalProperties": false
27+
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
{
2+
"$schema": "https://json-schema.org/draft-07/schema",
3+
"$id": "data-ingest:contract/components/field_error_type.schema.json",
4+
"title": "field_error_type",
5+
"description": "The error type for a field when a validation error is raised during the data contract phase",
6+
"type": "object",
7+
"properties": {
8+
"error_type": {
9+
"description": "The type of error the details are for",
10+
"type": "string",
11+
"enum": [
12+
"Blank",
13+
"Bad value",
14+
"Wrong format"
15+
],
16+
"additionalProperties": {
17+
"$ref": "field_error_detail.schema.json"
18+
}
19+
}
20+
}
21+
}

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "nhs_dve"
3-
version = "1.0.0"
3+
version = "1.1.0"
44
description = "`nhs data validation engine` is a framework used to validate data"
55
authors = ["NHS England <england.contactus@nhs.net>"]
66
readme = "README.md"
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
"""Implementation of duckdb backend"""
2+
from dve.core_engine.backends.implementations.duckdb.readers.json import DuckDBJSONReader
3+
from dve.core_engine.backends.readers import register_reader
4+
5+
from .contract import DuckDBDataContract
6+
from .readers import (
7+
DuckDBCSVReader,
8+
DuckDBCSVRepeatingHeaderReader,
9+
DuckDBXMLStreamReader,
10+
PolarsToDuckDBCSVReader
11+
)
12+
from .reference_data import DuckDBRefDataLoader
13+
from .rules import DuckDBStepImplementations
14+
15+
register_reader(DuckDBCSVReader)
16+
register_reader(DuckDBCSVRepeatingHeaderReader)
17+
register_reader(DuckDBJSONReader)
18+
register_reader(DuckDBXMLStreamReader)
19+
register_reader(PolarsToDuckDBCSVReader)
20+
21+
__all__ = [
22+
"DuckDBDataContract",
23+
"DuckDBRefDataLoader",
24+
"DuckDBStepImplementations",
25+
]

src/dve/core_engine/backends/implementations/duckdb/auditing.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,9 @@
1313
)
1414
from dve.core_engine.backends.implementations.duckdb.duckdb_helpers import (
1515
PYTHON_TYPE_TO_DUCKDB_TYPE,
16-
PYTHON_TYPE_TO_POLARS_TYPE,
1716
table_exists,
1817
)
18+
from dve.core_engine.backends.utilities import PYTHON_TYPE_TO_POLARS_TYPE
1919
from dve.core_engine.models import (
2020
AuditRecord,
2121
ProcessingStatusRecord,

src/dve/core_engine/backends/implementations/duckdb/contract.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,12 @@
2020
duckdb_read_parquet,
2121
duckdb_write_parquet,
2222
get_duckdb_type_from_annotation,
23-
get_polars_type_from_annotation,
2423
relation_is_empty,
2524
)
2625
from dve.core_engine.backends.implementations.duckdb.types import DuckDBEntities
2726
from dve.core_engine.backends.metadata.contract import DataContractMetadata
2827
from dve.core_engine.backends.types import StageSuccessful
29-
from dve.core_engine.backends.utilities import stringify_model
28+
from dve.core_engine.backends.utilities import get_polars_type_from_annotation, stringify_model
3029
from dve.core_engine.message import FeedbackMessage
3130
from dve.core_engine.type_hints import URI, Messages
3231
from dve.core_engine.validation import RowValidator
@@ -95,8 +94,8 @@ def generate_ddb_cast_statement(
9594
The duckdb python API doesn't play well with this currently.
9695
"""
9796
if not null_flag:
98-
return f"try_cast({column_name} AS {dtype}) AS {column_name}"
99-
return f"cast(NULL AS {dtype}) AS {column_name}"
97+
return f'try_cast("{column_name}" AS {dtype}) AS "{column_name}"'
98+
return f'cast(NULL AS {dtype}) AS "{column_name}"'
10099

101100
def apply_data_contract(
102101
self, entities: DuckDBEntities, contract_metadata: DataContractMetadata

0 commit comments

Comments
 (0)