BUG: use greater precision when serializing floating points (#336)

tswast · web-flow · commit f1995f8dac64 · 2020-11-09T09:30:16.000-06:00
* BUG: use greater precision when serializing floating points

This allows the exact binary representation of each 64-bit float to
survive a round trip through serialization unchanged.

* blacken

* remove f-string

* adjust string formatting
diff --git a/conftest.py b/conftest.py
@@ -1,4 +1,4 @@
-"""Shared pytest fixtures for system tests."""
+"""Shared pytest fixtures for `tests/system` and `samples/tests` tests."""
 
 import os
 import os.path
diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
@@ -1,6 +1,17 @@
 Changelog
 =========
 
+.. _changelog-0.14.1:
+
+0.14.1 / TBD
+------------
+
+Bug fixes
+~~~~~~~~~
+
+- Encode floating point values with greater precision. (:issue:`326`)
+
+
 .. _changelog-0.14.0:
 
 0.14.0 / 2020-10-05
diff --git a/pandas_gbq/load.py b/pandas_gbq/load.py
@@ -19,7 +19,7 @@ def encode_chunk(dataframe):
         index=False,
         header=False,
         encoding="utf-8",
-        float_format="%.15g",
+        float_format="%.17g",
         date_format="%Y-%m-%d %H:%M:%S.%f",
     )
 
diff --git a/tests/system/test_to_gbq.py b/tests/system/test_to_gbq.py
@@ -0,0 +1,49 @@
+import functools
+import pandas
+import pandas.testing
+
+import pytest
+
+
+pytest.importorskip("google.cloud.bigquery", minversion="1.24.0")
+
+
+@pytest.fixture
+def method_under_test(credentials):
+    """Return ``pandas_gbq.to_gbq`` with ``credentials`` already bound."""
+    from pandas_gbq import to_gbq  # deferred until the fixture is used
+    return functools.partial(to_gbq, credentials=credentials)
+
+
+def test_float_round_trip(
+    method_under_test, random_dataset_id, bigquery_client
+):
+    """Ensure that 64-bit floating point numbers are unchanged.
+
+    See: https://github.com/pydata/pandas-gbq/issues/326
+    """
+    table_id = "{}.float_round_trip".format(random_dataset_id)
+    input_floats = pandas.Series(
+        [
+            0.14285714285714285,
+            0.4406779661016949,
+            1.05148,
+            1.05153,
+            1.8571428571428572,
+            2.718281828459045,
+            3.141592653589793,
+            2.0988936657440586e43,
+        ],
+        name="float_col",
+    )
+    df = pandas.DataFrame({"float_col": input_floats})
+    method_under_test(df, table_id)
+
+    # Rows may come back in any order: sort, then drop the shuffled index.
+    round_trip = bigquery_client.list_rows(table_id).to_dataframe()
+    round_trip_floats = (
+        round_trip["float_col"].sort_values().reset_index(drop=True)
+    )
+    pandas.testing.assert_series_equal(
+        round_trip_floats, input_floats, check_exact=True
+    )
diff --git a/tests/unit/test_load.py b/tests/unit/test_load.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 
+import textwrap
 from io import StringIO
 
 import numpy
@@ -24,17 +25,32 @@ def test_encode_chunk_with_unicode():
 
 
 def test_encode_chunk_with_floats():
-    """Test that floats in a dataframe are encoded with at most 15 significant
+    """Test that floats in a dataframe are encoded with at most 17 significant
         figures.
 
-    See: https://github.com/pydata/pandas-gbq/issues/192
+    See: https://github.com/pydata/pandas-gbq/issues/192 and
+    https://github.com/pydata/pandas-gbq/issues/326
     """
-    input_csv = StringIO(u"01/01/17 23:00,1.05148,1.05153,1.05148,1.05153,4")
-    df = pandas.read_csv(input_csv, header=None)
-    csv_buffer = load.encode_chunk(df)
-    csv_bytes = csv_buffer.read()
-    csv_string = csv_bytes.decode("utf-8")
-    assert "1.05153" in csv_string
+    # The backslash keeps row one indented like the rest so dedent() works.
+    input_csv = textwrap.dedent(
+        """\
+        01/01/17 23:00,0.14285714285714285,4
+        01/02/17 22:00,1.05148,3
+        01/03/17 21:00,1.05153,2
+        01/04/17 20:00,3.141592653589793,1
+        01/05/17 19:00,2.0988936657440586e+43,0
+        """
+    )
+    input_df = pandas.read_csv(
+        StringIO(input_csv), header=None, float_precision="round_trip"
+    )
+    csv_buffer = load.encode_chunk(input_df)
+    round_trip = pandas.read_csv(
+        csv_buffer, header=None, float_precision="round_trip"
+    )
+    pandas.testing.assert_frame_equal(
+        round_trip, input_df, check_exact=True
+    )
 
 
 def test_encode_chunk_with_newlines():

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-"""Shared pytest fixtures for system tests."""`
	`1`	+"""Shared pytest fixtures for `tests/system` and `samples/tests` tests."""
`2`	`2`
`3`	`3`	`import os`
`4`	`4`	`import os.path`
Original file line number	Diff line number	Diff line change
`@@ -19,7 +19,7 @@ def encode_chunk(dataframe):`
`19`	`19`	`index=False,`
`20`	`20`	`header=False,`
`21`	`21`	`encoding="utf-8",`
`22`		`- float_format="%.15g",`
	`22`	`+ float_format="%.17g",`
`23`	`23`	`date_format="%Y-%m-%d %H:%M:%S.%f",`
`24`	`24`	`)`
`25`	`25`