Can't upload an int64 dtype column (or an object dtype column containing integers) to a STRING column in BigQuery #875

Closed
@tswast

Description

Is your feature request related to a problem? Please describe.

If I have a DataFrame with an int64 column (or an object dtype column containing integers), I'd like to be able to upload it with to_gbq to a table with STRING columns. For example:

import pandas
import pandas_gbq

df = pandas.DataFrame(
    {"zip_code": [94043, 10011, 98033]},
)
pandas_gbq.to_gbq(
    df,
    "swast-scratch.my_dataset.pandas_gbq_string_test",
    table_schema=[
        {"name": "zip_code", "type": "STRING"},
    ],
)

This fails with:

Error converting Pandas column with name: "zip_code" and datatype: "int64" to an appropriate pyarrow datatype: Array, ListArray, or StructArray
---------------------------------------------------------------------------
ArrowTypeError                            Traceback (most recent call last)
File ~/src/github.com/googleapis/python-bigquery-pandas/venv/lib/python3.12/site-packages/google/cloud/bigquery/_pandas_helpers.py:343, in bq_to_arrow_array(series, bq_field)
    342         return pyarrow.StructArray.from_pandas(series, type=arrow_type)
--> 343     return pyarrow.Array.from_pandas(series, type=arrow_type)
    344 except pyarrow.ArrowTypeError:

File ~/src/github.com/googleapis/python-bigquery-pandas/venv/lib/python3.12/site-packages/pyarrow/array.pxi:1126, in pyarrow.lib.Array.from_pandas()

File ~/src/github.com/googleapis/python-bigquery-pandas/venv/lib/python3.12/site-packages/pyarrow/array.pxi:360, in pyarrow.lib.array()

File ~/src/github.com/googleapis/python-bigquery-pandas/venv/lib/python3.12/site-packages/pyarrow/array.pxi:87, in pyarrow.lib._ndarray_to_array()

File ~/src/github.com/googleapis/python-bigquery-pandas/venv/lib/python3.12/site-packages/pyarrow/error.pxi:92, in pyarrow.lib.check_status()

ArrowTypeError: Expected a string or bytes dtype, got int64

During handling of the above exception, another exception occurred:

ArrowTypeError                            Traceback (most recent call last)
Cell In[2], line 7
      2 import pandas_gbq
      4 df = pandas.DataFrame(
      5     {"zip_code": [94043, 10011, 98033]},
      6 )
----> 7 pandas_gbq.to_gbq(
      8     df,
      9     "swast-scratch.my_dataset.pandas_gbq_string_test",
     10     table_schema=[
     11         {"name": "zip_code", "type": "STRING"},
     12     ],
     13 )

File ~/src/github.com/googleapis/python-bigquery-pandas/pandas_gbq/gbq.py:1209, in to_gbq(dataframe, destination_table, project_id, chunksize, reauth, if_exists, auth_local_webserver, table_schema, location, progress_bar, credentials, api_method, verbose, private_key, auth_redirect_uri, client_id, client_secret, user_agent, rfc9110_delimiter)
   1204 if dataframe.empty:
   1205     # Create the table (if needed), but don't try to run a load job with an
   1206     # empty file. See: https://github.com/pydata/pandas-gbq/issues/237
   1207     return
-> 1209 connector.load_data(
   1210     dataframe,
   1211     destination_table_ref,
   1212     write_disposition=write_disposition,
   1213     chunksize=chunksize,
   1214     schema=table_schema,
   1215     progress_bar=progress_bar,
   1216     api_method=api_method,
   1217     billing_project=project_id,
   1218 )

File ~/src/github.com/googleapis/python-bigquery-pandas/pandas_gbq/gbq.py:554, in GbqConnector.load_data(self, dataframe, destination_table_ref, write_disposition, chunksize, schema, progress_bar, api_method, billing_project)
    551 total_rows = len(dataframe)
    553 try:
--> 554     chunks = load.load_chunks(
    555         self.client,
    556         dataframe,
    557         destination_table_ref,
    558         chunksize=chunksize,
    559         schema=schema,
    560         location=self.location,
    561         api_method=api_method,
    562         write_disposition=write_disposition,
    563         billing_project=billing_project,
    564     )
    565     if progress_bar and tqdm:
    566         chunks = tqdm.tqdm(chunks)

File ~/src/github.com/googleapis/python-bigquery-pandas/pandas_gbq/load.py:251, in load_chunks(client, dataframe, destination_table_ref, chunksize, schema, location, api_method, write_disposition, billing_project)
    239 def load_chunks(
    240     client,
    241     dataframe,
   (...)
    248     billing_project: Optional[str] = None,
    249 ):
    250     if api_method == "load_parquet":
--> 251         load_parquet(
    252             client,
    253             dataframe,
    254             destination_table_ref,
    255             write_disposition,
    256             location,
    257             schema,
    258             billing_project=billing_project,
    259         )
    260         # TODO: yield progress depending on result() with timeout
    261         return [0]

File ~/src/github.com/googleapis/python-bigquery-pandas/pandas_gbq/load.py:141, in load_parquet(client, dataframe, destination_table_ref, write_disposition, location, schema, billing_project)
    138     dataframe = cast_dataframe_for_parquet(dataframe, schema)
    140 try:
--> 141     client.load_table_from_dataframe(
    142         dataframe,
    143         destination_table_ref,
    144         job_config=job_config,
    145         location=location,
    146         project=billing_project,
    147     ).result()
    148 except pyarrow.lib.ArrowInvalid as exc:
    149     raise exceptions.ConversionError(
    150         "Could not convert DataFrame to Parquet."
    151     ) from exc

File ~/src/github.com/googleapis/python-bigquery-pandas/venv/lib/python3.12/site-packages/google/cloud/bigquery/client.py:2806, in Client.load_table_from_dataframe(self, dataframe, destination, num_retries, job_id, job_id_prefix, location, project, job_config, parquet_compression, timeout)
   2803     if parquet_compression == "snappy":  # adjust the default value
   2804         parquet_compression = parquet_compression.upper()
-> 2806     _pandas_helpers.dataframe_to_parquet(
   2807         dataframe,
   2808         new_job_config.schema,
   2809         tmppath,
   2810         parquet_compression=parquet_compression,
   2811         parquet_use_compliant_nested_type=True,
   2812     )
   2813 else:
   2814     dataframe.to_parquet(
   2815         tmppath,
   2816         engine="pyarrow",
   (...)
   2822         ),
   2823     )

File ~/src/github.com/googleapis/python-bigquery-pandas/venv/lib/python3.12/site-packages/google/cloud/bigquery/_pandas_helpers.py:689, in dataframe_to_parquet(dataframe, bq_schema, filepath, parquet_compression, parquet_use_compliant_nested_type)
    682 kwargs = (
    683     {"use_compliant_nested_type": parquet_use_compliant_nested_type}
    684     if _versions_helpers.PYARROW_VERSIONS.use_compliant_nested_type
    685     else {}
    686 )
    688 bq_schema = schema._to_schema_fields(bq_schema)
--> 689 arrow_table = dataframe_to_arrow(dataframe, bq_schema)
    690 pyarrow.parquet.write_table(
    691     arrow_table,
    692     filepath,
    693     compression=parquet_compression,
    694     **kwargs,
    695 )

File ~/src/github.com/googleapis/python-bigquery-pandas/venv/lib/python3.12/site-packages/google/cloud/bigquery/_pandas_helpers.py:632, in dataframe_to_arrow(dataframe, bq_schema)
    629 for bq_field in bq_schema:
    630     arrow_names.append(bq_field.name)
    631     arrow_arrays.append(
--> 632         bq_to_arrow_array(get_column_or_index(dataframe, bq_field.name), bq_field)
    633     )
    634     arrow_fields.append(bq_to_arrow_field(bq_field, arrow_arrays[-1].type))
    636 if all((field is not None for field in arrow_fields)):

File ~/src/github.com/googleapis/python-bigquery-pandas/venv/lib/python3.12/site-packages/google/cloud/bigquery/_pandas_helpers.py:347, in bq_to_arrow_array(series, bq_field)
    345 msg = f"""Error converting Pandas column with name: "{series.name}" and datatype: "{series.dtype}" to an appropriate pyarrow datatype: Array, ListArray, or StructArray"""
    346 _LOGGER.error(msg)
--> 347 raise pyarrow.ArrowTypeError(msg)

ArrowTypeError: Error converting Pandas column with name: "zip_code" and datatype: "int64" to an appropriate pyarrow datatype: Array, ListArray, or StructArray

Describe the solution you'd like

If a cast from the local dtype to the BigQuery type would be lossless (as is the case for integer to string), pandas-gbq should perform that cast automatically before trying to create a Parquet file.
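
For illustration, a minimal sketch of what such an automatic lossless cast could look like; the helper name cast_lossless_for_schema and its placement are hypothetical, not part of pandas-gbq today:

import pandas

def cast_lossless_for_schema(dataframe, table_schema):
    """Hypothetical helper: cast columns whose pandas dtype differs from the
    BigQuery target type when the conversion is lossless (e.g. int64 -> STRING)."""
    dataframe = dataframe.copy()
    for field in table_schema:
        name = field["name"]
        if name not in dataframe.columns:
            continue
        if field["type"].upper() == "STRING" and pandas.api.types.is_integer_dtype(
            dataframe[name]
        ):
            # Integer -> string never loses information, so convert before
            # handing the DataFrame to the Parquet writer.
            dataframe[name] = dataframe[name].astype("string")
    return dataframe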

Describe alternatives you've considered

Two workarounds:

  • Cast the column to the string dtype in pandas before calling to_gbq (see the sketch after this list).
  • Pass api_method="load_csv", e.g. to_gbq(df, table_schema=..., api_method="load_csv").
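
A minimal sketch of both workarounds applied to the example DataFrame above; the table name is reused from the example, and if_exists="replace" is only there so the snippet can be re-run against the same scratch table:

import pandas
import pandas_gbq

df = pandas.DataFrame({"zip_code": [94043, 10011, 98033]})

# Workaround 1: cast to the pandas string dtype before uploading.
df_str = df.assign(zip_code=df["zip_code"].astype("string"))
pandas_gbq.to_gbq(
    df_str,
    "swast-scratch.my_dataset.pandas_gbq_string_test",
    table_schema=[{"name": "zip_code", "type": "STRING"}],
    if_exists="replace",
)

# Workaround 2: use the CSV load path, which sidesteps the Parquet conversion.
pandas_gbq.to_gbq(
    df,
    "swast-scratch.my_dataset.pandas_gbq_string_test",
    table_schema=[{"name": "zip_code", "type": "STRING"}],
    api_method="load_csv",
    if_exists="replace",
)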

Additional context

Reported by a customer in an email sent to me.

Labels

  • api: bigquery (Issues related to the googleapis/python-bigquery-pandas API.)
  • type: feature request ('Nice-to-have' improvement, new feature or different behavior or design.)
