@@ -28,13 +28,13 @@ def method_under_test(to_gbq):
 
 SeriesRoundTripTestCase = collections.namedtuple(
     "SeriesRoundTripTestCase",
-    ["input_series", "api_methods"],
-    defaults=[None, {"load_csv", "load_parquet"}],
+    ["input_series", "api_methods", "expected_dtype"],
+    defaults=[None, {"load_csv", "load_parquet"}, None],
 )
 
 
 @pytest.mark.parametrize(
-    ["input_series", "api_methods"],
+    ["input_series", "api_methods", "expected_dtype"],
     [
         # Ensure that 64-bit floating point numbers are unchanged.
         # See: https://github.com/pydata/pandas-gbq/issues/326
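Since the new `expected_dtype` field is appended with a `None` default, cases written before this change can keep constructing the namedtuple without naming it. A minimal standalone sketch of that behavior (not part of the diff; the literal input is only illustrative):

import collections

SeriesRoundTripTestCase = collections.namedtuple(
    "SeriesRoundTripTestCase",
    ["input_series", "api_methods", "expected_dtype"],
    defaults=[None, {"load_csv", "load_parquet"}, None],
)

# A case that never mentions expected_dtype falls back to None, which the
# test body below interprets as "expect the input series dtype unchanged".
case = SeriesRoundTripTestCase(input_series=[1.0, 2.0])
assert case.expected_dtype is None
assert case.api_methods == {"load_csv", "load_parquet"}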
@@ -53,40 +53,46 @@ def method_under_test(to_gbq):
                 name="test_col",
             ),
         ),
-        SeriesRoundTripTestCase(
-            input_series=pandas.Series(
-                [
-                    "abc",
-                    "defg",
-                    # Ensure that unicode characters are encoded. See:
-                    # https://github.com/googleapis/python-bigquery-pandas/issues/106
-                    "信用卡",
-                    "Skywalker™",
-                    "hülle",
-                ],
-                name="test_col",
+        pytest.param(
+            *SeriesRoundTripTestCase(
+                input_series=pandas.Series(
+                    [
+                        "abc",
+                        "defg",
+                        # Ensure that unicode characters are encoded. See:
+                        # https://github.com/googleapis/python-bigquery-pandas/issues/106
+                        "信用卡",
+                        "Skywalker™",
+                        "hülle",
+                    ],
+                    name="test_col",
+                ),
             ),
+            id="string-unicode",
         ),
-        SeriesRoundTripTestCase(
-            input_series=pandas.Series(
-                [
-                    "abc",
-                    "defg",
-                    # Ensure that empty strings are written as empty string,
-                    # not NULL. See:
-                    # https://github.com/googleapis/python-bigquery-pandas/issues/366
-                    "",
-                    None,
-                ],
-                name="empty_strings",
+        pytest.param(
+            *SeriesRoundTripTestCase(
+                input_series=pandas.Series(
+                    [
+                        "abc",
+                        "defg",
+                        # Ensure that empty strings are written as empty string,
+                        # not NULL. See:
+                        # https://github.com/googleapis/python-bigquery-pandas/issues/366
+                        "",
+                        None,
+                    ],
+                    name="empty_strings",
+                ),
+                # BigQuery CSV loader uses empty string as the "null marker" by
+                # default. Potentially one could choose a rarely used character or
+                # string as the null marker to disambiguate null from empty string,
+                # but then that string couldn't be loaded.
+                # TODO: Revisit when custom load job configuration is supported.
+                # https://github.com/googleapis/python-bigquery-pandas/issues/425
+                api_methods={"load_parquet"},
             ),
-            # BigQuery CSV loader uses empty string as the "null marker" by
-            # default. Potentially one could choose a rarely used character or
-            # string as the null marker to disambiguate null from empty string,
-            # but then that string couldn't be loaded.
-            # TODO: Revisit when custom load job configuration is supported.
-            # https://github.com/googleapis/python-bigquery-pandas/issues/425
-            api_methods={"load_parquet"},
+            id="string-empty-and-null",
         ),
     ],
 )
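Each case above is wrapped as `pytest.param(*SeriesRoundTripTestCase(...), id=...)`, which unpacks the namedtuple positionally into the argument names given to `@pytest.mark.parametrize`, so the namedtuple's field order has to stay aligned with that list. A small self-contained sketch of the same pattern, with made-up fields rather than the real test case:

import collections

import pytest

Case = collections.namedtuple("Case", ["value", "expected"])


@pytest.mark.parametrize(
    ["value", "expected"],
    [
        # Unpacking the namedtuple keeps the case definition compact while
        # pytest.param attaches a stable, human-readable test id.
        pytest.param(*Case(value=2, expected=4), id="square-of-two"),
        pytest.param(*Case(value=3, expected=9), id="square-of-three"),
    ],
)
def test_square(value, expected):
    assert value * value == expected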
@@ -97,6 +103,7 @@ def test_series_round_trip(
     input_series,
     api_method,
     api_methods,
+    expected_dtype,
 ):
     if api_method not in api_methods:
         pytest.skip(f"{api_method} not supported.")
@@ -111,9 +118,14 @@ def test_series_round_trip(
 
     round_trip = read_gbq(table_id)
     round_trip_series = round_trip["test_col"].sort_values().reset_index(drop=True)
+
+    expected_series = input_series.copy()
+    if expected_dtype is not None:
+        expected_series = expected_series.astype(expected_dtype)
+
     pandas.testing.assert_series_equal(
         round_trip_series,
-        input_series,
+        expected_series,
         check_exact=True,
         check_names=False,
     )
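The branch added above lets a case declare the dtype it expects back from BigQuery when a lossless conversion is known to change it, instead of always comparing against the input dtype. A rough sketch of how that comparison behaves, using an illustrative Int64 round trip that is not taken from the test suite:

import pandas

# Hypothetical case: the column is expected to come back as nullable Int64
# even though the input used plain int64, so expected_dtype pins the target.
input_series = pandas.Series([1, 2, 3], name="test_col", dtype="int64")
expected_dtype = "Int64"

expected_series = input_series.copy()
if expected_dtype is not None:
    expected_series = expected_series.astype(expected_dtype)

# round_trip_series would normally come from read_gbq(); simulated here.
round_trip_series = pandas.Series([1, 2, 3], name="test_col", dtype="Int64")
pandas.testing.assert_series_equal(
    round_trip_series,
    expected_series,
    check_exact=True,
    check_names=False,
)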
@@ -362,6 +374,79 @@ def test_series_round_trip(
             ),
             id="issue365-extreme-datetimes",
         ),
+        # Loading a STRING column should work with all available string dtypes.
+        pytest.param(
+            *DataFrameRoundTripTestCase(
+                input_df=pandas.DataFrame(
+                    {
+                        "row_num": [1, 2, 3],
+                        # If a cast to STRING is lossless, pandas-gbq should do that automatically.
+                        # See: https://github.com/googleapis/python-bigquery-pandas/issues/875
+                        "int_want_string": [94043, 10011, 98033],
+                        "object": pandas.Series(["a", "b", "c"], dtype="object"),
+                        "string_python": pandas.Series(
+                            ["d", "e", "f"],
+                            dtype=(
+                                pandas.StringDtype(storage="python")
+                                if hasattr(pandas, "ArrowDtype")
+                                else pandas.StringDtype()
+                            ),
+                        ),
+                        "string_pyarrow": pandas.Series(
+                            ["g", "h", "i"],
+                            dtype=(
+                                pandas.StringDtype(storage="pyarrow")
+                                if hasattr(pandas, "ArrowDtype")
+                                else pandas.StringDtype()
+                            ),
+                        ),
+                        "arrowdtype_string": pandas.Series(
+                            ["j", "k", "l"],
+                            dtype=(
+                                pandas.ArrowDtype(pyarrow.string())
+                                if hasattr(pandas, "ArrowDtype")
+                                else pandas.StringDtype()
+                            ),
+                        ),
+                        "arrowdtype_large_string": pandas.Series(
+                            ["m", "n", "o"],
+                            dtype=(
+                                pandas.ArrowDtype(pyarrow.large_string())
+                                if hasattr(pandas, "ArrowDtype")
+                                and hasattr(pyarrow, "large_string")
+                                else pandas.StringDtype()
+                            ),
+                        ),
+                    },
+                ),
+                expected_df=pandas.DataFrame(
+                    {
+                        "row_num": [1, 2, 3],
+                        "int_want_string": pandas.Series(
+                            ["94043", "10011", "98033"], dtype="object"
+                        ),
+                        "object": pandas.Series(["a", "b", "c"], dtype="object"),
+                        "string_python": pandas.Series(["d", "e", "f"], dtype="object"),
+                        "string_pyarrow": pandas.Series(["g", "h", "i"], dtype="object"),
+                        "arrowdtype_string": pandas.Series(["j", "k", "l"], dtype="object"),
+                        "arrowdtype_large_string": pandas.Series(
+                            ["m", "n", "o"], dtype="object"
+                        ),
+                    },
+                ),
+                table_schema=[
+                    {"name": "row_num", "type": "INTEGER"},
+                    {"name": "int_want_string", "type": "STRING"},
+                    {"name": "object", "type": "STRING"},
+                    {"name": "string_python", "type": "STRING"},
+                    {"name": "string_pyarrow", "type": "STRING"},
+                    {"name": "string_pyarrow_from_int", "type": "STRING"},
+                    {"name": "arrowdtype_string", "type": "STRING"},
+                    {"name": "arrowdtype_large_string", "type": "STRING"},
+                ],
+            ),
+            id="issue875-strings",
+        ),
         pytest.param(
             # Load STRUCT and ARRAY using either object column or ArrowDtype.
             # See: https://github.com/googleapis/python-bigquery-pandas/issues/452
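The repeated `hasattr(pandas, "ArrowDtype")` conditionals keep this case importable on pandas releases that predate Arrow-backed dtypes: when `ArrowDtype` is unavailable, every string column simply falls back to a plain `pandas.StringDtype()`. A condensed sketch of that guard, factored into a helper whose name is made up for illustration:

import pandas
import pyarrow


def _string_dtype_or_fallback(make_preferred):
    """Build the preferred Arrow-backed dtype, or fall back on older pandas."""
    if hasattr(pandas, "ArrowDtype"):
        return make_preferred()
    return pandas.StringDtype()


# Same idea as the inline conditionals in the parametrized case above.
arrow_string = _string_dtype_or_fallback(lambda: pandas.ArrowDtype(pyarrow.string()))
large_string = _string_dtype_or_fallback(
    lambda: pandas.ArrowDtype(pyarrow.large_string())
)
series = pandas.Series(["m", "n", "o"], dtype=large_string)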