Skip to content

Commit 89078f8

Browse files
authored
feat!: use nullable Int64 and boolean dtypes if available (#445)
* feat: use nullable Int64 and boolean dtypes if available * allow google-cloud-bigquery 3.x * document dtypes mapping
1 parent e13abaf commit 89078f8

File tree

6 files changed

+73
-26
lines changed

6 files changed

+73
-26
lines changed

docs/reading.rst

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,11 +59,13 @@ column, based on the BigQuery table schema.
5959
================== =========================
6060
BigQuery Data Type dtype
6161
================== =========================
62-
FLOAT float
63-
TIMESTAMP :class:`~pandas.DatetimeTZDtype` with ``unit='ns'`` and ``tz='UTC'``
62+
DATE datetime64[ns]
6463
DATETIME datetime64[ns]
64+
BOOL boolean
65+
FLOAT float
66+
INT64 Int64
6567
TIME datetime64[ns]
66-
DATE datetime64[ns]
68+
TIMESTAMP :class:`~pandas.DatetimeTZDtype` with ``unit='ns'`` and ``tz='UTC'``
6769
================== =========================
6870

6971
.. _reading-bqstorage-api:

pandas_gbq/features.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
BIGQUERY_BQSTORAGE_VERSION = "1.24.0"
1111
BIGQUERY_FROM_DATAFRAME_CSV_VERSION = "2.6.0"
1212
PANDAS_VERBOSITY_DEPRECATION_VERSION = "0.23.0"
13+
PANDAS_BOOLEAN_DTYPE_VERSION = "1.0.0"
1314
PANDAS_PARQUET_LOSSLESS_TIMESTAMP_VERSION = "1.1.0"
1415

1516

@@ -90,6 +91,13 @@ def pandas_has_deprecated_verbose(self):
9091
)
9192
return self.pandas_installed_version >= pandas_verbosity_deprecation
9293

94+
@property
95+
def pandas_has_boolean_dtype(self):
96+
import pkg_resources
97+
98+
desired_version = pkg_resources.parse_version(PANDAS_BOOLEAN_DTYPE_VERSION)
99+
return self.pandas_installed_version >= desired_version
100+
93101
@property
94102
def pandas_has_parquet_with_lossless_timestamp(self):
95103
import pkg_resources

pandas_gbq/gbq.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -579,12 +579,13 @@ def _bqschema_to_nullsafe_dtypes(schema_fields):
579579
#missing-data-casting-rules-and-indexing
580580
"""
581581
# If you update this mapping, also update the table at
582-
# `docs/source/reading.rst`.
582+
# `docs/reading.rst`.
583583
dtype_map = {
584584
"DATE": "datetime64[ns]",
585585
"DATETIME": "datetime64[ns]",
586586
"FLOAT": np.dtype(float),
587587
"GEOMETRY": "object",
588+
"INTEGER": "Int64",
588589
"RECORD": "object",
589590
"STRING": "object",
590591
# datetime.time objects cannot be cast to datetime64.
@@ -596,6 +597,10 @@ def _bqschema_to_nullsafe_dtypes(schema_fields):
596597
"TIMESTAMP": "datetime64[ns]",
597598
}
598599

600+
# Amend dtype_map with newer extension types if pandas version allows.
601+
if FEATURES.pandas_has_boolean_dtype:
602+
dtype_map["BOOLEAN"] = "boolean"
603+
599604
dtypes = {}
600605
for field in schema_fields:
601606
name = str(field["name"])

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
"google-auth-oauthlib",
3333
# 2.4.* has a bug where waiting for the query can hang indefinitely.
3434
# https://github.com/pydata/pandas-gbq/issues/343
35-
"google-cloud-bigquery[bqstorage,pandas] >=1.11.1,<3.0.0dev,!=2.4.*",
35+
"google-cloud-bigquery[bqstorage,pandas] >=1.11.1,<4.0.0dev,!=2.4.*",
3636
]
3737
extras = {
3838
"tqdm": "tqdm>=4.23.0",

tests/system/test_gbq.py

Lines changed: 51 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
import numpy as np
1111
import pandas
1212
import pandas.api.types
13-
import pandas.util.testing as tm
13+
import pandas.testing as tm
1414
from pandas import DataFrame, NaT
1515

1616
try:
@@ -21,6 +21,7 @@
2121
import pytz
2222

2323
from pandas_gbq import gbq
24+
from pandas_gbq.features import FEATURES
2425
import pandas_gbq.schema
2526

2627

@@ -32,6 +33,18 @@ def test_imports():
3233
gbq._test_google_api_imports()
3334

3435

36+
def make_mixed_dataframe_v1():
37+
# Re-implementation of private pandas.util.testing.makeMixedDataFrame
38+
return pandas.DataFrame(
39+
{
40+
"A": [0.0, 1.0, 2.0, 3.0, 4.0],
41+
"B": [0.0, 1.0, 0.0, 1.0, 0.0],
42+
"C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
43+
"D": pandas.bdate_range("1/1/2009", periods=5),
44+
}
45+
)
46+
47+
3548
def make_mixed_dataframe_v2(test_size):
3649
# create df to test for all BQ datatypes except RECORD
3750
bools = np.random.randint(2, size=(1, test_size)).astype(bool)
@@ -168,7 +181,7 @@ def test_should_properly_handle_valid_integers(self, project_id):
168181
credentials=self.credentials,
169182
dialect="standard",
170183
)
171-
tm.assert_frame_equal(df, DataFrame({"valid_integer": [3]}))
184+
tm.assert_frame_equal(df, DataFrame({"valid_integer": [3]}, dtype="Int64"))
172185

173186
def test_should_properly_handle_nullable_integers(self, project_id):
174187
query = """SELECT * FROM
@@ -194,7 +207,7 @@ def test_should_properly_handle_valid_longs(self, project_id):
194207
credentials=self.credentials,
195208
dialect="standard",
196209
)
197-
tm.assert_frame_equal(df, DataFrame({"valid_long": [1 << 62]}))
210+
tm.assert_frame_equal(df, DataFrame({"valid_long": [1 << 62]}, dtype="Int64"))
198211

199212
def test_should_properly_handle_nullable_longs(self, project_id):
200213
query = """SELECT * FROM
@@ -433,7 +446,10 @@ def test_should_properly_handle_null_boolean(self, project_id):
433446
credentials=self.credentials,
434447
dialect="legacy",
435448
)
436-
tm.assert_frame_equal(df, DataFrame({"null_boolean": [None]}))
449+
expected_dtype = "boolean" if FEATURES.pandas_has_boolean_dtype else None
450+
tm.assert_frame_equal(
451+
df, DataFrame({"null_boolean": [None]}, dtype=expected_dtype)
452+
)
437453

438454
def test_should_properly_handle_nullable_booleans(self, project_id):
439455
query = """SELECT * FROM
@@ -445,8 +461,9 @@ def test_should_properly_handle_nullable_booleans(self, project_id):
445461
credentials=self.credentials,
446462
dialect="legacy",
447463
)
464+
expected_dtype = "boolean" if FEATURES.pandas_has_boolean_dtype else None
448465
tm.assert_frame_equal(
449-
df, DataFrame({"nullable_boolean": [True, None]}).astype(object)
466+
df, DataFrame({"nullable_boolean": [True, None]}, dtype=expected_dtype)
450467
)
451468

452469
def test_unicode_string_conversion_and_normalization(self, project_id):
@@ -629,7 +646,7 @@ def test_one_row_one_column(self, project_id):
629646
credentials=self.credentials,
630647
dialect="standard",
631648
)
632-
expected_result = DataFrame(dict(v=[3]))
649+
expected_result = DataFrame(dict(v=[3]), dtype="Int64")
633650
tm.assert_frame_equal(df, expected_result)
634651

635652
def test_legacy_sql(self, project_id):
@@ -719,7 +736,7 @@ def test_query_with_parameters(self, project_id):
719736
configuration=config,
720737
dialect="legacy",
721738
)
722-
tm.assert_frame_equal(df, DataFrame({"valid_result": [3]}))
739+
tm.assert_frame_equal(df, DataFrame({"valid_result": [3]}, dtype="Int64"))
723740

724741
def test_query_inside_configuration(self, project_id):
725742
query_no_use = 'SELECT "PI_WRONG" AS valid_string'
@@ -842,7 +859,11 @@ def test_struct(self, project_id):
842859
dialect="standard",
843860
)
844861
expected = DataFrame(
845-
[[1, {"letter": "a", "num": 1}]], columns=["int_field", "struct_field"],
862+
{
863+
"int_field": pandas.Series([1], dtype="Int64"),
864+
"struct_field": [{"letter": "a", "num": 1}],
865+
},
866+
columns=["int_field", "struct_field"],
846867
)
847868
tm.assert_frame_equal(df, expected)
848869

@@ -874,7 +895,12 @@ def test_array_length_zero(self, project_id):
874895
dialect="standard",
875896
)
876897
expected = DataFrame(
877-
[["a", [""], 1], ["b", [], 0]], columns=["letter", "array_field", "len"],
898+
{
899+
"letter": ["a", "b"],
900+
"array_field": [[""], []],
901+
"len": pandas.Series([1, 0], dtype="Int64"),
902+
},
903+
columns=["letter", "array_field", "len"],
878904
)
879905
tm.assert_frame_equal(df, expected)
880906

@@ -908,7 +934,13 @@ def test_array_of_floats(self, project_id):
908934
credentials=self.credentials,
909935
dialect="standard",
910936
)
911-
tm.assert_frame_equal(df, DataFrame([[[1.1, 2.2, 3.3], 4]], columns=["a", "b"]))
937+
tm.assert_frame_equal(
938+
df,
939+
DataFrame(
940+
{"a": [[1.1, 2.2, 3.3]], "b": pandas.Series([4], dtype="Int64")},
941+
columns=["a", "b"],
942+
),
943+
)
912944

913945
def test_tokyo(self, tokyo_dataset, tokyo_table, project_id):
914946
df = gbq.read_gbq(
@@ -1021,7 +1053,7 @@ def test_upload_data_if_table_exists_append(self, project_id):
10211053
test_id = "3"
10221054
test_size = 10
10231055
df = make_mixed_dataframe_v2(test_size)
1024-
df_different_schema = tm.makeMixedDataFrame()
1056+
df_different_schema = make_mixed_dataframe_v1()
10251057

10261058
# Initialize table with sample data
10271059
gbq.to_gbq(
@@ -1101,7 +1133,7 @@ def test_upload_data_if_table_exists_replace(self, project_id):
11011133
test_id = "4"
11021134
test_size = 10
11031135
df = make_mixed_dataframe_v2(test_size)
1104-
df_different_schema = tm.makeMixedDataFrame()
1136+
df_different_schema = make_mixed_dataframe_v1()
11051137

11061138
# Initialize table with sample data
11071139
gbq.to_gbq(
@@ -1225,7 +1257,7 @@ def test_upload_data_with_newlines(self, project_id):
12251257
result = result_df["s"].sort_values()
12261258
expected = df["s"].sort_values()
12271259

1228-
tm.assert_numpy_array_equal(expected.values, result.values)
1260+
tm.assert_series_equal(expected, result)
12291261

12301262
def test_upload_data_flexible_column_order(self, project_id):
12311263
test_id = "13"
@@ -1254,7 +1286,7 @@ def test_upload_data_flexible_column_order(self, project_id):
12541286
def test_upload_data_with_valid_user_schema(self, project_id):
12551287
# Issue #46; tests test scenarios with user-provided
12561288
# schemas
1257-
df = tm.makeMixedDataFrame()
1289+
df = make_mixed_dataframe_v1()
12581290
test_id = "18"
12591291
test_schema = [
12601292
{"name": "A", "type": "FLOAT"},
@@ -1276,7 +1308,7 @@ def test_upload_data_with_valid_user_schema(self, project_id):
12761308
)
12771309

12781310
def test_upload_data_with_invalid_user_schema_raises_error(self, project_id):
1279-
df = tm.makeMixedDataFrame()
1311+
df = make_mixed_dataframe_v1()
12801312
test_id = "19"
12811313
test_schema = [
12821314
{"name": "A", "type": "FLOAT"},
@@ -1295,7 +1327,7 @@ def test_upload_data_with_invalid_user_schema_raises_error(self, project_id):
12951327
)
12961328

12971329
def test_upload_data_with_missing_schema_fields_raises_error(self, project_id):
1298-
df = tm.makeMixedDataFrame()
1330+
df = make_mixed_dataframe_v1()
12991331
test_id = "20"
13001332
test_schema = [
13011333
{"name": "A", "type": "FLOAT"},
@@ -1351,7 +1383,7 @@ def test_upload_data_with_timestamp(self, project_id):
13511383
tm.assert_series_equal(expected, result)
13521384

13531385
def test_upload_data_with_different_df_and_user_schema(self, project_id):
1354-
df = tm.makeMixedDataFrame()
1386+
df = make_mixed_dataframe_v1()
13551387
df["A"] = df["A"].astype(str)
13561388
df["B"] = df["B"].astype(str)
13571389
test_id = "22"
@@ -1460,13 +1492,13 @@ def test_dataset_does_not_exist(gbq_dataset, random_dataset_id):
14601492

14611493

14621494
def test_create_table(gbq_table):
1463-
schema = gbq._generate_bq_schema(tm.makeMixedDataFrame())
1495+
schema = gbq._generate_bq_schema(make_mixed_dataframe_v1())
14641496
gbq_table.create("test_create_table", schema)
14651497
assert gbq_table.exists("test_create_table")
14661498

14671499

14681500
def test_create_table_already_exists(gbq_table):
1469-
schema = gbq._generate_bq_schema(tm.makeMixedDataFrame())
1501+
schema = gbq._generate_bq_schema(make_mixed_dataframe_v1())
14701502
gbq_table.create("test_create_table_exists", schema)
14711503
with pytest.raises(gbq.TableCreationError):
14721504
gbq_table.create("test_create_table_exists", schema)

tests/unit/test_gbq.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,8 @@ def no_auth(monkeypatch):
6464
@pytest.mark.parametrize(
6565
("type_", "expected"),
6666
[
67-
("INTEGER", None), # Can't handle NULL
68-
("BOOLEAN", None), # Can't handle NULL
67+
("SOME_NEW_UNKNOWN_TYPE", None),
68+
("INTEGER", "Int64"),
6969
("FLOAT", numpy.dtype(float)),
7070
# TIMESTAMP will be localized after DataFrame construction.
7171
("TIMESTAMP", "datetime64[ns]"),

0 commit comments

Comments
 (0)