Is your feature request related to a problem? Please describe.
If I have a DataFrame with an int64 column (or an object-dtype column containing integers), I'd like to be able to upload it with to_gbq
to a table with string columns. For example:
import pandas
import pandas_gbq

df = pandas.DataFrame(
    {"zip_code": [94043, 10011, 98033]},
)
pandas_gbq.to_gbq(
    df,
    "swast-scratch.my_dataset.pandas_gbq_string_test",
    table_schema=[
        {"name": "zip_code", "type": "STRING"},
    ],
)
This fails with:
Error converting Pandas column with name: "zip_code" and datatype: "int64" to an appropriate pyarrow datatype: Array, ListArray, or StructArray
---------------------------------------------------------------------------
ArrowTypeError Traceback (most recent call last)
File ~/src/github.com/googleapis/python-bigquery-pandas/venv/lib/python3.12/site-packages/google/cloud/bigquery/_pandas_helpers.py:343, in bq_to_arrow_array(series, bq_field)
342 return pyarrow.StructArray.from_pandas(series, type=arrow_type)
--> 343 return pyarrow.Array.from_pandas(series, type=arrow_type)
344 except pyarrow.ArrowTypeError:
File ~/src/github.com/googleapis/python-bigquery-pandas/venv/lib/python3.12/site-packages/pyarrow/array.pxi:1126, in pyarrow.lib.Array.from_pandas()
File ~/src/github.com/googleapis/python-bigquery-pandas/venv/lib/python3.12/site-packages/pyarrow/array.pxi:360, in pyarrow.lib.array()
File ~/src/github.com/googleapis/python-bigquery-pandas/venv/lib/python3.12/site-packages/pyarrow/array.pxi:87, in pyarrow.lib._ndarray_to_array()
File ~/src/github.com/googleapis/python-bigquery-pandas/venv/lib/python3.12/site-packages/pyarrow/error.pxi:92, in pyarrow.lib.check_status()
ArrowTypeError: Expected a string or bytes dtype, got int64
During handling of the above exception, another exception occurred:
ArrowTypeError Traceback (most recent call last)
Cell In[2], line 7
2 import pandas_gbq
4 df = pandas.DataFrame(
5 {"zip_code": [94043, 10011, 98033]},
6 )
----> 7 pandas_gbq.to_gbq(
8 df,
9 "swast-scratch.my_dataset.pandas_gbq_string_test",
10 table_schema=[
11 {"name": "zip_code", "type": "STRING"},
12 ],
13 )
File ~/src/github.com/googleapis/python-bigquery-pandas/pandas_gbq/gbq.py:1209, in to_gbq(dataframe, destination_table, project_id, chunksize, reauth, if_exists, auth_local_webserver, table_schema, location, progress_bar, credentials, api_method, verbose, private_key, auth_redirect_uri, client_id, client_secret, user_agent, rfc9110_delimiter)
1204 if dataframe.empty:
1205 # Create the table (if needed), but don't try to run a load job with an
1206 # empty file. See: https://github.com/pydata/pandas-gbq/issues/237
1207 return
-> 1209 connector.load_data(
1210 dataframe,
1211 destination_table_ref,
1212 write_disposition=write_disposition,
1213 chunksize=chunksize,
1214 schema=table_schema,
1215 progress_bar=progress_bar,
1216 api_method=api_method,
1217 billing_project=project_id,
1218 )
File ~/src/github.com/googleapis/python-bigquery-pandas/pandas_gbq/gbq.py:554, in GbqConnector.load_data(self, dataframe, destination_table_ref, write_disposition, chunksize, schema, progress_bar, api_method, billing_project)
551 total_rows = len(dataframe)
553 try:
--> 554 chunks = load.load_chunks(
555 self.client,
556 dataframe,
557 destination_table_ref,
558 chunksize=chunksize,
559 schema=schema,
560 location=self.location,
561 api_method=api_method,
562 write_disposition=write_disposition,
563 billing_project=billing_project,
564 )
565 if progress_bar and tqdm:
566 chunks = tqdm.tqdm(chunks)
File ~/src/github.com/googleapis/python-bigquery-pandas/pandas_gbq/load.py:251, in load_chunks(client, dataframe, destination_table_ref, chunksize, schema, location, api_method, write_disposition, billing_project)
239 def load_chunks(
240 client,
241 dataframe,
(...)
248 billing_project: Optional[str] = None,
249 ):
250 if api_method == "load_parquet":
--> 251 load_parquet(
252 client,
253 dataframe,
254 destination_table_ref,
255 write_disposition,
256 location,
257 schema,
258 billing_project=billing_project,
259 )
260 # TODO: yield progress depending on result() with timeout
261 return [0]
File ~/src/github.com/googleapis/python-bigquery-pandas/pandas_gbq/load.py:141, in load_parquet(client, dataframe, destination_table_ref, write_disposition, location, schema, billing_project)
138 dataframe = cast_dataframe_for_parquet(dataframe, schema)
140 try:
--> 141 client.load_table_from_dataframe(
142 dataframe,
143 destination_table_ref,
144 job_config=job_config,
145 location=location,
146 project=billing_project,
147 ).result()
148 except pyarrow.lib.ArrowInvalid as exc:
149 raise exceptions.ConversionError(
150 "Could not convert DataFrame to Parquet."
151 ) from exc
File ~/src/github.com/googleapis/python-bigquery-pandas/venv/lib/python3.12/site-packages/google/cloud/bigquery/client.py:2806, in Client.load_table_from_dataframe(self, dataframe, destination, num_retries, job_id, job_id_prefix, location, project, job_config, parquet_compression, timeout)
2803 if parquet_compression == "snappy": # adjust the default value
2804 parquet_compression = parquet_compression.upper()
-> 2806 _pandas_helpers.dataframe_to_parquet(
2807 dataframe,
2808 new_job_config.schema,
2809 tmppath,
2810 parquet_compression=parquet_compression,
2811 parquet_use_compliant_nested_type=True,
2812 )
2813 else:
2814 dataframe.to_parquet(
2815 tmppath,
2816 engine="pyarrow",
(...)
2822 ),
2823 )
File ~/src/github.com/googleapis/python-bigquery-pandas/venv/lib/python3.12/site-packages/google/cloud/bigquery/_pandas_helpers.py:689, in dataframe_to_parquet(dataframe, bq_schema, filepath, parquet_compression, parquet_use_compliant_nested_type)
682 kwargs = (
683 {"use_compliant_nested_type": parquet_use_compliant_nested_type}
684 if _versions_helpers.PYARROW_VERSIONS.use_compliant_nested_type
685 else {}
686 )
688 bq_schema = schema._to_schema_fields(bq_schema)
--> 689 arrow_table = dataframe_to_arrow(dataframe, bq_schema)
690 pyarrow.parquet.write_table(
691 arrow_table,
692 filepath,
693 compression=parquet_compression,
694 **kwargs,
695 )
File ~/src/github.com/googleapis/python-bigquery-pandas/venv/lib/python3.12/site-packages/google/cloud/bigquery/_pandas_helpers.py:632, in dataframe_to_arrow(dataframe, bq_schema)
629 for bq_field in bq_schema:
630 arrow_names.append(bq_field.name)
631 arrow_arrays.append(
--> 632 bq_to_arrow_array(get_column_or_index(dataframe, bq_field.name), bq_field)
633 )
634 arrow_fields.append(bq_to_arrow_field(bq_field, arrow_arrays[-1].type))
636 if all((field is not None for field in arrow_fields)):
File ~/src/github.com/googleapis/python-bigquery-pandas/venv/lib/python3.12/site-packages/google/cloud/bigquery/_pandas_helpers.py:347, in bq_to_arrow_array(series, bq_field)
345 msg = f"""Error converting Pandas column with name: "{series.name}" and datatype: "{series.dtype}" to an appropriate pyarrow datatype: Array, ListArray, or StructArray"""
346 _LOGGER.error(msg)
--> 347 raise pyarrow.ArrowTypeError(msg)
ArrowTypeError: Error converting Pandas column with name: "zip_code" and datatype: "int64" to an appropriate pyarrow datatype: Array, ListArray, or StructArray
Describe the solution you'd like
If a cast from the local dtype to the BigQuery type would be lossless (as is the case for integer to string), pandas-gbq should perform that cast automatically before building the Parquet file for the load job.
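A minimal sketch of that behaviour, assuming a hypothetical helper (not part of pandas-gbq today) that walks table_schema and casts integer columns to pandas' string dtype whenever the target BigQuery type is STRING:

import pandas

def cast_for_bigquery_schema(dataframe, table_schema):
    # Hypothetical helper: only the integer -> STRING case from this issue
    # is handled; a real implementation would cover more lossless type pairs.
    dataframe = dataframe.copy()
    for field in table_schema:
        name, bq_type = field["name"], field["type"].upper()
        if name not in dataframe.columns:
            continue
        if bq_type == "STRING" and pandas.api.types.is_integer_dtype(dataframe[name]):
            # int -> str never loses information, so the cast is safe.
            dataframe[name] = dataframe[name].astype("string")
    return dataframe

Something like this could run right before the DataFrame is handed to load_table_from_dataframe, so the pyarrow conversion sees data that already matches the STRING schema. This is only an illustration of the requested behaviour, not the current pandas-gbq code path.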
Describe alternatives you've considered
Two workarounds (the first is sketched in code after this list):
- Cast the column to pandas' string dtype before calling to_gbq.
- Pass api_method="load_csv", i.e. to_gbq(df, table_schema=..., api_method="load_csv").
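For reference, the first workaround applied to the example above; the explicit astype call is the only change:

import pandas
import pandas_gbq

df = pandas.DataFrame(
    {"zip_code": [94043, 10011, 98033]},
)
# Casting to pandas' string dtype up front avoids the ArrowTypeError.
df["zip_code"] = df["zip_code"].astype("string")

pandas_gbq.to_gbq(
    df,
    "swast-scratch.my_dataset.pandas_gbq_string_test",
    table_schema=[
        {"name": "zip_code", "type": "STRING"},
    ],
)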
Additional context
Reported by a customer in an email sent to me.