BUG: fix read_csv to parse timezone-aware datetimes correctly

swyoon · swyoon · commit aae7d6e20089 · 2018-08-16T12:25:26.000+09:00
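- Make the CSV date converter call `to_datetime` with `box=True`, so that timezone-aware results are returned as a `DatetimeIndex`; non-datetime results are converted back to an ndarray, and the downstream type inference is skipped for values that are not ndarrays.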
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -673,6 +673,7 @@ I/O
 
 - :func:`read_html()` no longer ignores all-whitespace ``<tr>`` within ``<thead>`` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`)
 - :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`)
+- :func:`read_csv()` will correctly parse timezone-aware datetimes. (:issue:`22256`)
 -
 
 Plotting
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -25,7 +25,8 @@
 from pandas.core.dtypes.missing import isna
 from pandas.core.dtypes.cast import astype_nansafe
 from pandas.core.index import (Index, MultiIndex, RangeIndex,
-                               ensure_index_from_sequences)
+                               ensure_index_from_sequences,
+                               DatetimeIndex)
 from pandas.core.series import Series
 from pandas.core.frame import DataFrame
 from pandas.core.arrays import Categorical
@@ -1589,11 +1590,13 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
             else:
                 # skip inference if specified dtype is object
                 try_num_bool = not (cast_type and is_string_dtype(cast_type))
-
-                # general type inference and conversion
-                cvals, na_count = self._infer_types(
-                    values, set(col_na_values) | col_na_fvalues,
-                    try_num_bool)
+                if isinstance(values, np.ndarray):
+                    # general type inference and conversion
+                    cvals, na_count = self._infer_types(
+                        values, set(col_na_values) | col_na_fvalues,
+                        try_num_bool)
+                else:  # _infer_types only accepts ndarray.
+                    cvals = values
 
                 # type specified in dtype param
                 if cast_type and not is_dtype_equal(cvals, cast_type):
@@ -3030,14 +3033,19 @@ def converter(*date_cols):
             strs = _concat_date_cols(date_cols)
 
             try:
-                return tools.to_datetime(
+                converted = tools.to_datetime(
                     ensure_object(strs),
                     utc=None,
-                    box=False,
+                    box=True,
                     dayfirst=dayfirst,
                     errors='ignore',
                     infer_datetime_format=infer_datetime_format
                 )
+                if not isinstance(converted, DatetimeIndex):
+                    # GH-22256 : non-datetime Index needs to be
+                    # converted to ndarray to avoid downstream errors
+                    return np.array(converted)
+                return converted
             except:
                 return tools.to_datetime(
                     parsing.try_parse_dates(strs, dayfirst=dayfirst))
diff --git a/pandas/tests/io/parser/parse_dates.py b/pandas/tests/io/parser/parse_dates.py
@@ -674,3 +674,19 @@ def test_parse_date_float(self, data, expected, parse_dates):
         # (i.e. float precision should remain unchanged).
         result = self.read_csv(StringIO(data), parse_dates=parse_dates)
         tm.assert_frame_equal(result, expected)
+
+    def test_parse_timezone(self):
+        import pytz
+        data = """dt,val
+                  2018-01-04 09:01:00+09:00,23350
+                  2018-01-04 09:02:00+09:00,23400
+                  2018-01-04 09:03:00+09:00,23400
+                  2018-01-04 09:04:00+09:00,23400
+                  2018-01-04 09:05:00+09:00,23400"""
+        parsed = self.read_csv(StringIO(data), parse_dates=['dt'])
+        dti = pd.DatetimeIndex(start='2018-01-04 09:01:00',
+                               end='2018-01-04 09:05:00', freq='1min',
+                               tz=pytz.FixedOffset(540))
+        expected_data = {'dt': dti, 'val': [23350, 23400, 23400, 23400, 23400]}
+        expected = DataFrame(expected_data)
+        tm.assert_frame_equal(parsed, expected)

Original file line number	Diff line number	Diff line change
`@@ -673,6 +673,7 @@ I/O`
`673`	`673`
`674`	`674`	- :func:`read_html()` no longer ignores all-whitespace ``<tr>`` within ``<thead>`` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`)
`675`	`675`	- :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`)
	`676`	+- :func:`read_csv()` will correctly parse timezone-aware datetimes. (:issue:`22256`)
`676`	`677`	`-`
`677`	`678`
`678`	`679`	`Plotting`