Can't upload an int64 dtype column (or an object dtype column containing integers) to a STRING column in BigQuery #875

Closed
@tswast

Description

Is your feature request related to a problem? Please describe.

If I have a DataFrame with an int64 column (or an object dtype column containing integers), I'd like to be able to upload it with to_gbq to a table with STRING columns. For example:

import pandas
import pandas_gbq

df = pandas.DataFrame(
    {"zip_code": [94043, 10011, 98033]},
)
pandas_gbq.to_gbq(
    df,
    "swast-scratch.my_dataset.pandas_gbq_string_test",
    table_schema=[
        {"name": "zip_code", "type": "STRING"},
    ],
)

This fails with:

Error converting Pandas column with name: "zip_code" and datatype: "int64" to an appropriate pyarrow datatype: Array, ListArray, or StructArray
---------------------------------------------------------------------------
ArrowTypeError                            Traceback (most recent call last)
File ~/src/github.com/googleapis/python-bigquery-pandas/venv/lib/python3.12/site-packages/google/cloud/bigquery/_pandas_helpers.py:343, in bq_to_arrow_array(series, bq_field)
    342         return pyarrow.StructArray.from_pandas(series, type=arrow_type)
--> 343     return pyarrow.Array.from_pandas(series, type=arrow_type)
    344 except pyarrow.ArrowTypeError:

File ~/src/github.com/googleapis/python-bigquery-pandas/venv/lib/python3.12/site-packages/pyarrow/array.pxi:1126, in pyarrow.lib.Array.from_pandas()

File ~/src/github.com/googleapis/python-bigquery-pandas/venv/lib/python3.12/site-packages/pyarrow/array.pxi:360, in pyarrow.lib.array()

File ~/src/github.com/googleapis/python-bigquery-pandas/venv/lib/python3.12/site-packages/pyarrow/array.pxi:87, in pyarrow.lib._ndarray_to_array()

File ~/src/github.com/googleapis/python-bigquery-pandas/venv/lib/python3.12/site-packages/pyarrow/error.pxi:92, in pyarrow.lib.check_status()

ArrowTypeError: Expected a string or bytes dtype, got int64

During handling of the above exception, another exception occurred:

ArrowTypeError                            Traceback (most recent call last)
Cell In[2], line 7
      2 import pandas_gbq
      4 df = pandas.DataFrame(
      5     {"zip_code": [94043, 10011, 98033]},
      6 )
----> 7 pandas_gbq.to_gbq(
      8     df,
      9     "swast-scratch.my_dataset.pandas_gbq_string_test",
     10     table_schema=[
     11         {"name": "zip_code", "type": "STRING"},
     12     ],
     13 )

File ~/src/github.com/googleapis/python-bigquery-pandas/pandas_gbq/gbq.py:1209, in to_gbq(dataframe, destination_table, project_id, chunksize, reauth, if_exists, auth_local_webserver, table_schema, location, progress_bar, credentials, api_method, verbose, private_key, auth_redirect_uri, client_id, client_secret, user_agent, rfc9110_delimiter)
   1204 if dataframe.empty:
   1205     # Create the table (if needed), but don't try to run a load job with an
   1206     # empty file. See: https://github.com/pydata/pandas-gbq/issues/237
   1207     return
-> 1209 connector.load_data(
   1210     dataframe,
   1211     destination_table_ref,
   1212     write_disposition=write_disposition,
   1213     chunksize=chunksize,
   1214     schema=table_schema,
   1215     progress_bar=progress_bar,
   1216     api_method=api_method,
   1217     billing_project=project_id,
   1218 )

File ~/src/github.com/googleapis/python-bigquery-pandas/pandas_gbq/gbq.py:554, in GbqConnector.load_data(self, dataframe, destination_table_ref, write_disposition, chunksize, schema, progress_bar, api_method, billing_project)
    551 total_rows = len(dataframe)
    553 try:
--> 554     chunks = load.load_chunks(
    555         self.client,
    556         dataframe,
    557         destination_table_ref,
    558         chunksize=chunksize,
    559         schema=schema,
    560         location=self.location,
    561         api_method=api_method,
    562         write_disposition=write_disposition,
    563         billing_project=billing_project,
    564     )
    565     if progress_bar and tqdm:
    566         chunks = tqdm.tqdm(chunks)

File ~/src/github.com/googleapis/python-bigquery-pandas/pandas_gbq/load.py:251, in load_chunks(client, dataframe, destination_table_ref, chunksize, schema, location, api_method, write_disposition, billing_project)
    239 def load_chunks(
    240     client,
    241     dataframe,
   (...)
    248     billing_project: Optional[str] = None,
    249 ):
    250     if api_method == "load_parquet":
--> 251         load_parquet(
    252             client,
    253             dataframe,
    254             destination_table_ref,
    255             write_disposition,
    256             location,
    257             schema,
    258             billing_project=billing_project,
    259         )
    260         # TODO: yield progress depending on result() with timeout
    261         return [0]

File ~/src/github.com/googleapis/python-bigquery-pandas/pandas_gbq/load.py:141, in load_parquet(client, dataframe, destination_table_ref, write_disposition, location, schema, billing_project)
    138     dataframe = cast_dataframe_for_parquet(dataframe, schema)
    140 try:
--> 141     client.load_table_from_dataframe(
    142         dataframe,
    143         destination_table_ref,
    144         job_config=job_config,
    145         location=location,
    146         project=billing_project,
    147     ).result()
    148 except pyarrow.lib.ArrowInvalid as exc:
    149     raise exceptions.ConversionError(
    150         "Could not convert DataFrame to Parquet."
    151     ) from exc

File ~/src/github.com/googleapis/python-bigquery-pandas/venv/lib/python3.12/site-packages/google/cloud/bigquery/client.py:2806, in Client.load_table_from_dataframe(self, dataframe, destination, num_retries, job_id, job_id_prefix, location, project, job_config, parquet_compression, timeout)
   2803     if parquet_compression == "snappy":  # adjust the default value
   2804         parquet_compression = parquet_compression.upper()
-> 2806     _pandas_helpers.dataframe_to_parquet(
   2807         dataframe,
   2808         new_job_config.schema,
   2809         tmppath,
   2810         parquet_compression=parquet_compression,
   2811         parquet_use_compliant_nested_type=True,
   2812     )
   2813 else:
   2814     dataframe.to_parquet(
   2815         tmppath,
   2816         engine="pyarrow",
   (...)
   2822         ),
   2823     )

File ~/src/github.com/googleapis/python-bigquery-pandas/venv/lib/python3.12/site-packages/google/cloud/bigquery/_pandas_helpers.py:689, in dataframe_to_parquet(dataframe, bq_schema, filepath, parquet_compression, parquet_use_compliant_nested_type)
    682 kwargs = (
    683     {"use_compliant_nested_type": parquet_use_compliant_nested_type}
    684     if _versions_helpers.PYARROW_VERSIONS.use_compliant_nested_type
    685     else {}
    686 )
    688 bq_schema = schema._to_schema_fields(bq_schema)
--> 689 arrow_table = dataframe_to_arrow(dataframe, bq_schema)
    690 pyarrow.parquet.write_table(
    691     arrow_table,
    692     filepath,
    693     compression=parquet_compression,
    694     **kwargs,
    695 )

File ~/src/github.com/googleapis/python-bigquery-pandas/venv/lib/python3.12/site-packages/google/cloud/bigquery/_pandas_helpers.py:632, in dataframe_to_arrow(dataframe, bq_schema)
    629 for bq_field in bq_schema:
    630     arrow_names.append(bq_field.name)
    631     arrow_arrays.append(
--> 632         bq_to_arrow_array(get_column_or_index(dataframe, bq_field.name), bq_field)
    633     )
    634     arrow_fields.append(bq_to_arrow_field(bq_field, arrow_arrays[-1].type))
    636 if all((field is not None for field in arrow_fields)):

File ~/src/github.com/googleapis/python-bigquery-pandas/venv/lib/python3.12/site-packages/google/cloud/bigquery/_pandas_helpers.py:347, in bq_to_arrow_array(series, bq_field)
    345 msg = f"""Error converting Pandas column with name: "{series.name}" and datatype: "{series.dtype}" to an appropriate pyarrow datatype: Array, ListArray, or StructArray"""
    346 _LOGGER.error(msg)
--> 347 raise pyarrow.ArrowTypeError(msg)

ArrowTypeError: Error converting Pandas column with name: "zip_code" and datatype: "int64" to an appropriate pyarrow datatype: Array, ListArray, or StructArray

Describe the solution you'd like

If a cast from the local dtype to the BigQuery type would be lossless (as is the case for integer to string), pandas-gbq should perform that cast automatically before trying to create a Parquet file.
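
For illustration, a minimal sketch of what such an automatic lossless cast could look like; the helper name cast_lossless_for_schema and its placement are hypothetical, not part of pandas-gbq today:

import pandas

def cast_lossless_for_schema(dataframe, table_schema):
    """Hypothetical helper: cast columns whose pandas dtype differs from the
    BigQuery target type when the conversion is lossless (e.g. int64 -> STRING)."""
    dataframe = dataframe.copy()
    for field in table_schema:
        name = field["name"]
        if name not in dataframe.columns:
            continue
        if field["type"].upper() == "STRING" and pandas.api.types.is_integer_dtype(
            dataframe[name]
        ):
            # Integer -> string never loses information, so convert before
            # handing the DataFrame to the Parquet writer.
            dataframe[name] = dataframe[name].astype("string")
    return dataframe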

Describe alternatives you've considered

Two workarounds:

  • Cast the column to the string dtype in pandas before calling to_gbq (see the sketch after this list).
  • Pass api_method="load_csv", e.g. to_gbq(df, table_schema=..., api_method="load_csv").
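
A minimal sketch of both workarounds applied to the example DataFrame above; the table name is reused from the example, and if_exists="replace" is only there so the snippet can be re-run against the same scratch table:

import pandas
import pandas_gbq

df = pandas.DataFrame({"zip_code": [94043, 10011, 98033]})

# Workaround 1: cast to the pandas string dtype before uploading.
df_str = df.assign(zip_code=df["zip_code"].astype("string"))
pandas_gbq.to_gbq(
    df_str,
    "swast-scratch.my_dataset.pandas_gbq_string_test",
    table_schema=[{"name": "zip_code", "type": "STRING"}],
    if_exists="replace",
)

# Workaround 2: use the CSV load path, which sidesteps the Parquet conversion.
pandas_gbq.to_gbq(
    df,
    "swast-scratch.my_dataset.pandas_gbq_string_test",
    table_schema=[{"name": "zip_code", "type": "STRING"}],
    api_method="load_csv",
    if_exists="replace",
)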

Additional context

Reported by a customer in an email sent to me.

Labels

  • api: bigquery (Issues related to the googleapis/python-bigquery-pandas API.)
  • type: feature request ('Nice-to-have' improvement, new feature or different behavior or design.)
