Skip to content

Commit 107bb40

Browse files
authored
fix!: to_gbq loads uint8 columns to BigQuery INT64 instead of STRING (#814)
* fix!: `to_gbq` loads `uint8` columns to BigQuery INT64 instead of STRING fix!: `to_gbq` loads naive (no timezone) columns to BigQuery DATETIME instead of TIMESTAMP (#814) fix!: `to_gbq` loads object column containing bool values to BOOLEAN instead of STRING (#814) fix!: `to_gbq` loads object column containing dictionary values to STRUCT instead of STRING (#814) deps: min pyarrow is now 4.0.0 to support compliant nested types (#814) Release-As: 0.24.0
1 parent 8a4389b commit 107bb40

18 files changed

+997
-76
lines changed

noxfile.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
UNIT_TEST_EXTRAS = [
5252
"bqstorage",
5353
"tqdm",
54+
"geopandas",
5455
]
5556
UNIT_TEST_EXTRAS_BY_PYTHON = {
5657
"3.9": [],

owlbot.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
# Use a middle version of Python to test when no extras are installed.
3333
"3.9": []
3434
}
35-
extras = ["tqdm"]
35+
extras = ["tqdm", "geopandas"]
3636
templated_files = common.py_library(
3737
unit_test_python_versions=["3.8", "3.9", "3.10", "3.11", "3.12"],
3838
system_test_python_versions=["3.8", "3.9", "3.10", "3.11", "3.12"],

pandas_gbq/core/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Copyright (c) 2024 pandas-gbq Authors All rights reserved.
2+
# Use of this source code is governed by a BSD-style
3+
# license that can be found in the LICENSE file.

pandas_gbq/core/pandas.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# Copyright (c) 2019 pandas-gbq Authors All rights reserved.
2+
# Use of this source code is governed by a BSD-style
3+
# license that can be found in the LICENSE file.
4+
5+
import itertools
6+
7+
import pandas
8+
9+
10+
def list_columns_and_indexes(dataframe, index=True):
    """Return all index and column names with dtypes.

    Returns:
        Sequence[Tuple[str, dtype]]:
            A list of ``(name, dtype)`` pairs: qualifying index levels
            first, followed by the columns in order. An index level is
            omitted when it has no name or when its name collides with a
            column name.
    """
    taken = frozenset(dataframe.columns)
    pairs = []

    if index:
        idx = dataframe.index
        if isinstance(idx, pandas.MultiIndex):
            # Include each named level that does not shadow a column.
            pairs.extend(
                (name, idx.get_level_values(name).dtype)
                for name in idx.names
                if name and name not in taken
            )
        elif idx.name and idx.name not in taken:
            pairs.append((idx.name, idx.dtype))

    pairs.extend(zip(dataframe.columns, dataframe.dtypes))
    return pairs
35+
36+
37+
def first_valid(series):
    """Return the first non-NA value in *series*, or None if every value is NA."""
    index = series.first_valid_index()
    return None if index is None else series.at[index]
41+
42+
43+
def first_array_valid(series):
    """Return the first "meaningful" element from the array series.

    Here, "meaningful" means the first non-None element in one of the arrays
    that can be used for type detection.
    """
    first_valid_index = series.first_valid_index()
    if first_valid_index is None:
        return None

    valid_array = series.at[first_valid_index]
    valid_item = next((item for item in valid_array if not pandas.isna(item)), None)

    if valid_item is not None:
        return valid_item

    # Valid item is None because all items in the "valid" array are invalid.
    # Try to find a true valid array manually in the rest of the series.
    #
    # first_valid_index() returns an index *label*, so convert it to a
    # positional offset before slicing; using the label directly would skip
    # or crash on any series without the default RangeIndex.
    start = series.index.get_loc(first_valid_index) + 1
    for array in itertools.islice(series, start, None):
        try:
            array_iter = iter(array)
        except TypeError:
            continue  # Not an array, apparently, e.g. None, thus skip.
        valid_item = next((item for item in array_iter if not pandas.isna(item)), None)
        if valid_item is not None:
            break

    return valid_item

pandas_gbq/gbq.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from pandas_gbq.features import FEATURES
2626
import pandas_gbq.query
2727
import pandas_gbq.schema
28+
import pandas_gbq.schema.pandas_to_bigquery
2829
import pandas_gbq.timestamp
2930

3031
try:
@@ -1219,9 +1220,16 @@ def _generate_bq_schema(df, default_type="STRING"):
12191220
be overridden: https://github.com/pydata/pandas-gbq/issues/218, this
12201221
method can be removed after there is time to migrate away from this
12211222
method."""
1222-
from pandas_gbq import schema
1223+
fields = pandas_gbq.schema.pandas_to_bigquery.dataframe_to_bigquery_fields(
1224+
df,
1225+
default_type=default_type,
1226+
)
1227+
fields_json = []
1228+
1229+
for field in fields:
1230+
fields_json.append(field.to_api_repr())
12231231

1224-
return schema.generate_bq_schema(df, default_type=default_type)
1232+
return {"fields": fields_json}
12251233

12261234

12271235
class _Table(GbqConnector):

pandas_gbq/load.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515

1616
from pandas_gbq import exceptions
1717
import pandas_gbq.schema
18+
import pandas_gbq.schema.bigquery
19+
import pandas_gbq.schema.pandas_to_bigquery
1820

1921

2022
def encode_chunk(dataframe):
@@ -214,11 +216,9 @@ def load_csv_from_file(
214216
This method is needed for writing with google-cloud-bigquery versions that
215217
don't implement load_table_from_dataframe with the CSV serialization format.
216218
"""
217-
if schema is None:
218-
schema = pandas_gbq.schema.generate_bq_schema(dataframe)
219-
220-
schema = pandas_gbq.schema.remove_policy_tags(schema)
221-
bq_schema = pandas_gbq.schema.to_google_cloud_bigquery(schema)
219+
bq_schema = pandas_gbq.schema.pandas_to_bigquery.dataframe_to_bigquery_fields(
220+
dataframe, schema
221+
)
222222

223223
def load_chunk(chunk, job_config):
224224
try:

pandas_gbq/schema.py renamed to pandas_gbq/schema/__init__.py

Lines changed: 0 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -92,37 +92,6 @@ def schema_is_subset(schema_remote, schema_local):
9292
return all(field in fields_remote for field in fields_local)
9393

9494

95-
def generate_bq_schema(dataframe, default_type="STRING"):
    """Given a passed dataframe, generate the associated Google BigQuery schema.

    Arguments:
        dataframe (pandas.DataFrame):
            DataFrame whose columns are converted to schema fields.
        default_type : string
            The default big query type in case the type of the column
            does not exist in the schema.
    """
    # If you update this mapping, also update the table at
    # `docs/source/writing.rst`.
    kind_to_bq_type = {
        "i": "INTEGER",
        "b": "BOOLEAN",
        "f": "FLOAT",
        "O": "STRING",
        "S": "STRING",
        "U": "STRING",
        "M": "TIMESTAMP",
    }

    fields = [
        {"name": column_name, "type": kind_to_bq_type.get(dtype.kind, default_type)}
        for column_name, dtype in dataframe.dtypes.items()
    ]
    return {"fields": fields}
124-
125-
12695
def update_schema(schema_old, schema_new):
12796
"""
12897
Given an old BigQuery schema, update it with a new one.

pandas_gbq/schema/bigquery.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# Copyright (c) 2019 pandas-gbq Authors All rights reserved.
2+
# Use of this source code is governed by a BSD-style
3+
# license that can be found in the LICENSE file.
4+
5+
import collections
6+
7+
import google.cloud.bigquery
8+
9+
10+
def to_schema_fields(schema):
    """Coerce `schema` to a list of schema field instances.

    Args:
        schema(Sequence[Union[ \
            :class:`~google.cloud.bigquery.schema.SchemaField`, \
            Mapping[str, Any] \
        ]]):
            Table schema to convert. If some items are passed as mappings,
            their content must be compatible with
            :meth:`~google.cloud.bigquery.schema.SchemaField.from_api_repr`.

    Returns:
        Sequence[:class:`~google.cloud.bigquery.schema.SchemaField`]

    Raises:
        Exception: If ``schema`` is not a sequence, or if any item in the
            sequence is not a :class:`~google.cloud.bigquery.schema.SchemaField`
            instance or a compatible mapping representation of the field.
    """
    # Validate every item up front so a bad entry is reported before any
    # conversion work happens.
    for field in schema:
        is_acceptable = isinstance(
            field, (google.cloud.bigquery.SchemaField, collections.abc.Mapping)
        )
        if not is_acceptable:
            raise ValueError(
                "Schema items must either be fields or compatible "
                "mapping representations."
            )

    converted = []
    for field in schema:
        if isinstance(field, google.cloud.bigquery.SchemaField):
            converted.append(field)
        else:
            converted.append(google.cloud.bigquery.SchemaField.from_api_repr(field))
    return converted

0 commit comments

Comments
 (0)