66from duckdb import DuckDBPyRelation , default_connection
77from pydantic import BaseModel
88
9+ from dve .core_engine .backends .exceptions import EmptyFileError , MessageBearingError
910from dve .core_engine .backends .implementations .duckdb .duckdb_helpers import (
1011 get_duckdb_type_from_annotation ,
1112)
12- from dve .core_engine .backends .implementations .duckdb .readers .csv import DuckDBCSVReader , SQLType
13+ from dve .core_engine .backends .implementations .duckdb .readers .csv import (
14+ DuckDBCSVReader ,
15+ DuckDBCSVRepeatingHeaderReader ,
16+ PolarsToDuckDBCSVReader ,
17+ )
1318from dve .core_engine .backends .utilities import stringify_model
1419from tests .test_core_engine .test_backends .fixtures import duckdb_connection
1520
21+ # pylint: disable=C0116
22+
1623
1724class SimpleModel (BaseModel ):
1825 varchar_field : str
@@ -21,6 +28,11 @@ class SimpleModel(BaseModel):
2128 timestamp_field : datetime
2229
2330
class SimpleHeaderModel(BaseModel):
    """Model declaring only the two string header fields.

    Passed as the schema to ``DuckDBCSVRepeatingHeaderReader.read_to_relation``
    in the repeating-header tests, whose CSV files carry an additional
    ``non_header_1`` column that is not declared here.
    """

    header_1: str
    header_2: str
34+
35+
2436@pytest .fixture
2537def temp_dir ():
2638 with TemporaryDirectory (prefix = "ddb_test_csv_reader" ) as temp_dir :
@@ -43,18 +55,19 @@ def temp_csv_file(temp_dir: Path):
4355 yield temp_dir .joinpath ("dummy.csv" ), header , typed_data , SimpleModel
4456
4557
46- class SimpleModel (BaseModel ):
47- varchar_field : str
48- bigint_field : int
49- date_field : date
50- timestamp_field : datetime
@pytest.fixture
def temp_empty_csv_file(temp_dir: Path):
    """Yield the path of an empty ``empty.csv`` in ``temp_dir`` together with ``SimpleModel``."""
    empty_path = temp_dir.joinpath("empty.csv")
    # Opening for writing and closing straight away leaves an empty file on disk.
    open(empty_path, mode="w").close()

    yield empty_path, SimpleModel
5164
5265
5366def test_ddb_csv_reader_all_str (temp_csv_file ):
5467 uri , header , data , mdl = temp_csv_file
5568 reader = DuckDBCSVReader (header = True , delim = "," , connection = default_connection )
5669 rel : DuckDBPyRelation = reader .read_to_entity_type (
57- DuckDBPyRelation , uri , "test" , stringify_model (mdl )
70+ DuckDBPyRelation , str ( uri ) , "test" , stringify_model (mdl )
5871 )
5972 assert rel .columns == header .split ("," )
6073 assert dict (zip (rel .columns , rel .dtypes )) == {fld : "VARCHAR" for fld in header .split ("," )}
@@ -64,7 +77,7 @@ def test_ddb_csv_reader_all_str(temp_csv_file):
6477def test_ddb_csv_reader_cast (temp_csv_file ):
6578 uri , header , data , mdl = temp_csv_file
6679 reader = DuckDBCSVReader (header = True , delim = "," , connection = default_connection )
67- rel : DuckDBPyRelation = reader .read_to_entity_type (DuckDBPyRelation , uri , "test" , mdl )
80+ rel : DuckDBPyRelation = reader .read_to_entity_type (DuckDBPyRelation , str ( uri ) , "test" , mdl )
6881 assert rel .columns == header .split ("," )
6982 assert dict (zip (rel .columns , rel .dtypes )) == {
7083 fld .name : str (get_duckdb_type_from_annotation (fld .annotation ))
@@ -77,9 +90,70 @@ def test_ddb_csv_write_parquet(temp_csv_file):
7790 uri , header , data , mdl = temp_csv_file
7891 reader = DuckDBCSVReader (header = True , delim = "," , connection = default_connection )
7992 rel : DuckDBPyRelation = reader .read_to_entity_type (
80- DuckDBPyRelation , uri , "test" , stringify_model (mdl )
93+ DuckDBPyRelation , str ( uri ) , "test" , stringify_model (mdl )
8194 )
8295 target_loc : Path = uri .parent .joinpath ("test_parquet.parquet" ).as_posix ()
8396 reader .write_parquet (rel , target_loc )
8497 parquet_rel = reader ._connection .read_parquet (target_loc )
8598 assert parquet_rel .df ().to_dict (orient = "records" ) == rel .df ().to_dict (orient = "records" )
99+
100+
def test_ddb_csv_read_empty_file(temp_empty_csv_file):
    """Reading the empty CSV produced by the fixture must raise ``EmptyFileError``."""
    empty_uri, model = temp_empty_csv_file
    csv_reader = DuckDBCSVReader(
        connection=default_connection,
        delim=",",
        header=True,
    )

    with pytest.raises(EmptyFileError):
        csv_reader.read_to_relation(str(empty_uri), "test", model)
107+
108+
def test_polars_to_ddb_csv_reader(temp_csv_file):
    """Read the fixture CSV through ``PolarsToDuckDBCSVReader`` and check the row count."""
    # Only the file location and the model are needed from the fixture tuple.
    csv_uri, _header, _data, model = temp_csv_file
    reader_options = {
        "header": True,
        "delim": ",",
        "quotechar": '"',
        "connection": default_connection,
    }
    polars_reader = PolarsToDuckDBCSVReader(**reader_options)

    relation = polars_reader.read_to_relation(str(csv_uri), "test", model)
    row_count = relation.shape[0]

    assert row_count == 2
117+
118+
def test_ddb_csv_repeating_header_reader_non_duplicate(temp_dir):
    """Check that identical header values on every row give a one-row entity.

    The CSV written here repeats the same ``header_1``/``header_2`` values on
    each of its three rows; only ``non_header_1`` varies. The file is read
    with ``SimpleHeaderModel``, which declares just the two header fields, and
    the resulting entity is expected to hold exactly one row.
    """
    header = "header_1,header_2,non_header_1"
    typed_data = [
        ["hvalue1", "hvalue1", "nhvalue1"],
        ["hvalue1", "hvalue1", "nhvalue2"],
        ["hvalue1", "hvalue1", "nhvalue3"],
    ]
    file_uri = temp_dir.joinpath("test_header.csv")

    # Every line ends with a bare newline: any character written after it
    # would become the start of the next row's first field.
    csv_lines = [header] + [",".join(row) for row in typed_data]
    with open(file_uri, mode="w", encoding="utf-8") as csv_file:
        csv_file.writelines(f"{line}\n" for line in csv_lines)

    reader = DuckDBCSVRepeatingHeaderReader(
        header=True, delim=",", quotechar='"', connection=default_connection
    )
    entity = reader.read_to_relation(str(file_uri), "test", SimpleHeaderModel)

    assert entity.shape[0] == 1
139+
140+
def test_ddb_csv_repeating_header_reader_with_more_than_one_set_of_distinct_values(temp_dir):
    """Check that differing header values across rows raise ``MessageBearingError``.

    The CSV written here carries three different ``header_1``/``header_2``
    combinations across its three rows. Reading it with ``SimpleHeaderModel``,
    which declares just the two header fields, is expected to fail.
    """
    header = "header_1,header_2,non_header_1"
    typed_data = [
        ["hvalue1", "hvalue2", "nhvalue1"],
        ["hvalue2", "hvalue2", "nhvalue2"],
        ["hvalue1", "hvalue1", "nhvalue3"],
    ]
    file_uri = temp_dir.joinpath("test_header.csv")

    # Every line ends with a bare newline: any character written after it
    # would become the start of the next row's first field.
    csv_lines = [header] + [",".join(row) for row in typed_data]
    with open(file_uri, mode="w", encoding="utf-8") as csv_file:
        csv_file.writelines(f"{line}\n" for line in csv_lines)

    reader = DuckDBCSVRepeatingHeaderReader(
        header=True, delim=",", quotechar='"', connection=default_connection
    )

    with pytest.raises(MessageBearingError):
        reader.read_to_relation(str(file_uri), "test", SimpleHeaderModel)
0 commit comments