Skip to content

PERF: improve DTI string parse #13692

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/source/whatsnew/v0.19.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -566,6 +566,7 @@ Performance Improvements
- Improved performance of float64 hash table operations, fixing some very slow indexing and groupby operations in python 3 (:issue:`13166`, :issue:`13334`)
- Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`)
- Improved performance of ``Index.difference`` (:issue:`12044`)
- Improved performance of datetime string parsing in ``DatetimeIndex`` (:issue:`13692`)

.. _whatsnew_0190.bug_fixes:

Expand Down Expand Up @@ -631,6 +632,7 @@ Bug Fixes
- Bug in checking for any null objects in a ``TimedeltaIndex``, which always returned ``True`` (:issue:`13603`)



- Bug in ``Series`` arithmetic raises ``TypeError`` if it contains datetime-like as ``object`` dtype (:issue:`13043`)


Expand All @@ -654,6 +656,8 @@ Bug Fixes

- Bug in ``pd.to_numeric`` when ``errors='coerce'`` and input contains non-hashable objects (:issue:`13324`)
- Bug in invalid ``Timedelta`` arithmetic and comparison may raise ``ValueError`` rather than ``TypeError`` (:issue:`13624`)
- Bug in invalid datetime parsing in ``to_datetime`` and ``DatetimeIndex`` may raise ``TypeError`` rather than ``ValueError`` (:issue:`11169`, :issue:`11287`)
- Bug in ``Index`` created with tz-aware ``Timestamp`` and mismatched ``tz`` option incorrectly coerces timezone (:issue:`13692`)

- Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`)
- Bug in ``groupby`` with ``as_index=False`` returns all NaN's when grouping on multiple columns including a categorical one (:issue:`13204`)
Expand Down
2 changes: 1 addition & 1 deletion pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2440,7 +2440,7 @@ def converter(*date_cols):
strs = _concat_date_cols(date_cols)

try:
return tools._to_datetime(
return tools.to_datetime(
_ensure_object(strs),
utc=None,
box=False,
Expand Down
31 changes: 9 additions & 22 deletions pandas/tests/indexes/test_datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,16 +170,6 @@ def test_construction_index_with_mixed_timezones(self):
self.assert_index_equal(result, exp, exact=True)
self.assertFalse(isinstance(result, DatetimeIndex))

# passing tz results in DatetimeIndex
result = Index([Timestamp('2011-01-01 10:00'),
Timestamp('2011-01-02 10:00', tz='US/Eastern')],
tz='Asia/Tokyo', name='idx')
exp = DatetimeIndex([Timestamp('2011-01-01 19:00'),
Timestamp('2011-01-03 00:00')],
tz='Asia/Tokyo', name='idx')
self.assert_index_equal(result, exp, exact=True)
self.assertTrue(isinstance(result, DatetimeIndex))

# length = 1
result = Index([Timestamp('2011-01-01')], name='idx')
exp = DatetimeIndex([Timestamp('2011-01-01')], name='idx')
Expand Down Expand Up @@ -253,17 +243,6 @@ def test_construction_index_with_mixed_timezones_with_NaT(self):
self.assert_index_equal(result, exp, exact=True)
self.assertFalse(isinstance(result, DatetimeIndex))

# passing tz results in DatetimeIndex
result = Index([pd.NaT, Timestamp('2011-01-01 10:00'),
pd.NaT, Timestamp('2011-01-02 10:00',
tz='US/Eastern')],
tz='Asia/Tokyo', name='idx')
exp = DatetimeIndex([pd.NaT, Timestamp('2011-01-01 19:00'),
pd.NaT, Timestamp('2011-01-03 00:00')],
tz='Asia/Tokyo', name='idx')
self.assert_index_equal(result, exp, exact=True)
self.assertTrue(isinstance(result, DatetimeIndex))

# all NaT
result = Index([pd.NaT, pd.NaT], name='idx')
exp = DatetimeIndex([pd.NaT, pd.NaT], name='idx')
Expand Down Expand Up @@ -323,12 +302,13 @@ def test_construction_dti_with_mixed_timezones(self):
self.assertTrue(isinstance(result, DatetimeIndex))

# tz mismatch affecting tz-aware data raises TypeError/ValueError

with tm.assertRaises(ValueError):
DatetimeIndex([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'),
Timestamp('2011-01-02 10:00', tz='US/Eastern')],
name='idx')

with tm.assertRaises(TypeError):
with tm.assertRaisesRegexp(TypeError, 'data is already tz-aware'):
DatetimeIndex([Timestamp('2011-01-01 10:00'),
Timestamp('2011-01-02 10:00', tz='US/Eastern')],
tz='Asia/Tokyo', name='idx')
Expand All @@ -338,6 +318,13 @@ def test_construction_dti_with_mixed_timezones(self):
Timestamp('2011-01-02 10:00', tz='US/Eastern')],
tz='US/Eastern', name='idx')

with tm.assertRaisesRegexp(TypeError, 'data is already tz-aware'):
# passing tz should result in DatetimeIndex, then mismatch raises
# TypeError
Index([pd.NaT, Timestamp('2011-01-01 10:00'),
pd.NaT, Timestamp('2011-01-02 10:00', tz='US/Eastern')],
tz='Asia/Tokyo', name='idx')

def test_construction_base_constructor(self):
arr = [pd.Timestamp('2011-01-01'), pd.NaT, pd.Timestamp('2011-01-03')]
tm.assert_index_equal(pd.Index(arr), pd.DatetimeIndex(arr))
Expand Down
93 changes: 17 additions & 76 deletions pandas/tseries/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,55 +292,32 @@ def __new__(cls, data=None,
raise ValueError('DatetimeIndex() must be called with a '
'collection of some kind, %s was passed'
% repr(data))

# other iterable of some kind
if not isinstance(data, (list, tuple)):
data = list(data)

data = np.asarray(data, dtype='O')
elif isinstance(data, ABCSeries):
data = data._values

# try a few ways to make it datetime64
if lib.is_string_array(data):
data = tslib.parse_str_array_to_datetime(data, freq=freq,
dayfirst=dayfirst,
yearfirst=yearfirst)
else:
data = tools.to_datetime(data, errors='raise')
data.offset = freq
if isinstance(data, DatetimeIndex):
if name is not None:
data.name = name

if tz is not None:

# we might already be localized to this tz
# so passing the same tz is ok
# however any other tz is a no-no
if data.tz is None:
return data.tz_localize(tz, ambiguous=ambiguous)
elif str(tz) != str(data.tz):
raise TypeError("Already tz-aware, use tz_convert "
"to convert.")

return data._deepcopy_if_needed(ref_to_data, copy)

if issubclass(data.dtype.type, compat.string_types):
data = tslib.parse_str_array_to_datetime(data, freq=freq,
dayfirst=dayfirst,
yearfirst=yearfirst)
# data must be Index or np.ndarray here
if not (is_datetime64_dtype(data) or is_datetimetz(data) or
is_integer_dtype(data)):
data = tools.to_datetime(data, dayfirst=dayfirst,
yearfirst=yearfirst)

if issubclass(data.dtype.type, np.datetime64) or is_datetimetz(data):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

any reason not to use is_datetime64_dtype here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In addition to normal datetime64, datetime64tz (DatetimeIndex) and int can be directly converted to DatetimeIndex.

if isinstance(data, ABCSeries):
data = data._values

if isinstance(data, DatetimeIndex):
if tz is None:
tz = data.tz

elif data.tz is None:
data = data.tz_localize(tz, ambiguous=ambiguous)
else:
# the tz's must match
if str(tz) != str(data.tz):
raise TypeError("Already tz-aware, use tz_convert "
"to convert.")
msg = ('data is already tz-aware {0}, unable to '
'set specified tz: {1}')
raise TypeError(msg.format(data.tz, tz))

subarr = data.values

Expand All @@ -356,35 +333,6 @@ def __new__(cls, data=None,
if isinstance(data, Int64Index):
raise TypeError('cannot convert Int64Index->DatetimeIndex')
subarr = data.view(_NS_DTYPE)
else:
if isinstance(data, (ABCSeries, Index)):
values = data._values
else:
values = data

if lib.is_string_array(values):
subarr = tslib.parse_str_array_to_datetime(
values, freq=freq, dayfirst=dayfirst, yearfirst=yearfirst)
else:
try:
subarr = tools.to_datetime(data, box=False)

# make sure that we have a index/ndarray like (and not a
# Series)
if isinstance(subarr, ABCSeries):
subarr = subarr._values
if subarr.dtype == np.object_:
subarr = tools._to_datetime(subarr, box=False)

except ValueError:
# tz aware
subarr = tools._to_datetime(data, box=False, utc=True)

# we may not have been able to convert
if not (is_datetimetz(subarr) or
np.issubdtype(subarr.dtype, np.datetime64)):
raise ValueError('Unable to convert %s to datetime dtype'
% str(data))

if isinstance(subarr, DatetimeIndex):
if tz is None:
Expand All @@ -399,27 +347,21 @@ def __new__(cls, data=None,
ints = subarr.view('i8')
subarr = tslib.tz_localize_to_utc(ints, tz,
ambiguous=ambiguous)

subarr = subarr.view(_NS_DTYPE)

subarr = cls._simple_new(subarr, name=name, freq=freq, tz=tz)

# if dtype is provided, coerce here
if dtype is not None:

if not is_dtype_equal(subarr.dtype, dtype):

# dtype must be coerced to DatetimeTZDtype above
if subarr.tz is not None:
raise ValueError("cannot localize from non-UTC data")
dtype = DatetimeTZDtype.construct_from_string(dtype)
subarr = subarr.tz_localize(dtype.tz)

if verify_integrity and len(subarr) > 0:
if freq is not None and not freq_infer:
inferred = subarr.inferred_freq
if inferred != freq.freqstr:
on_freq = cls._generate(subarr[0], None, len(
subarr), None, freq, tz=tz, ambiguous=ambiguous)
on_freq = cls._generate(subarr[0], None, len(subarr), None,
freq, tz=tz, ambiguous=ambiguous)
if not np.array_equal(subarr.asi8, on_freq.asi8):
raise ValueError('Inferred frequency {0} from passed '
'dates does not conform to passed '
Expand Down Expand Up @@ -563,7 +505,6 @@ def _generate(cls, start, end, periods, name, offset,
index = index[1:]
if not right_closed and len(index) and index[-1] == end:
index = index[:-1]

index = cls._simple_new(index, name=name, freq=offset, tz=tz)
return index

Expand Down Expand Up @@ -669,7 +610,7 @@ def _cached_range(cls, start=None, end=None, periods=None, offset=None,
xdr = generate_range(offset=offset, start=_CACHE_START,
end=_CACHE_END)

arr = tools._to_datetime(list(xdr), box=False)
arr = tools.to_datetime(list(xdr), box=False)

cachedRange = DatetimeIndex._simple_new(arr)
cachedRange.offset = offset
Expand Down
7 changes: 6 additions & 1 deletion pandas/tseries/resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -1046,7 +1046,12 @@ def _get_binner_for_grouping(self, obj):
l = []
for key, group in grouper.get_iterator(self.ax):
l.extend([key] * len(group))
grouper = binner.__class__(l, freq=binner.freq, name=binner.name)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe isolate this a bit with some functions like _get_binner_for_resample does (maybe we should abstract this out even a bit more and have some PeriodIndexGrouper, DatetimeinexGrouper, which subclass TimeGrouper, but this might require some effort).

if isinstance(self.ax, PeriodIndex):
grouper = binner.__class__(l, freq=binner.freq, name=binner.name)
else:
# resampling causes duplicated values, specifying freq is invalid
grouper = binner.__class__(l, name=binner.name)

# since we may have had to sort
# may need to reorder groups here
Expand Down
5 changes: 3 additions & 2 deletions pandas/tseries/tests/test_timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -4087,8 +4087,9 @@ def test_dti_set_index_reindex(self):

# 11314
# with tz
index = date_range(datetime(2015, 10, 1), datetime(
2015, 10, 1, 23), freq='H', tz='US/Eastern')
index = date_range(datetime(2015, 10, 1),
datetime(2015, 10, 1, 23),
freq='H', tz='US/Eastern')
df = DataFrame(np.random.randn(24, 1), columns=['a'], index=index)
new_index = date_range(datetime(2015, 10, 2),
datetime(2015, 10, 2, 23),
Expand Down
22 changes: 13 additions & 9 deletions pandas/tseries/tests/test_tslib.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
import datetime

import pandas as pd
from pandas.core.api import Timestamp, Series, Timedelta, Period, to_datetime
from pandas.core.api import (Timestamp, Index, Series, Timedelta, Period,
to_datetime)
from pandas.tslib import get_timezone
from pandas._period import period_asfreq, period_ordinal
from pandas.tseries.index import date_range, DatetimeIndex
Expand Down Expand Up @@ -698,14 +699,19 @@ def test_parsers(self):
yearfirst=yearfirst)
result2 = to_datetime(date_str, yearfirst=yearfirst)
result3 = to_datetime([date_str], yearfirst=yearfirst)
# result5 is used below
result4 = to_datetime(np.array([date_str], dtype=object),
yearfirst=yearfirst)
result6 = DatetimeIndex([date_str], yearfirst=yearfirst)[0]
self.assertEqual(result1, expected)
self.assertEqual(result2, expected)
self.assertEqual(result3, expected)
self.assertEqual(result4, expected)
self.assertEqual(result6, expected)
result6 = DatetimeIndex([date_str], yearfirst=yearfirst)
# result7 is used below
result8 = DatetimeIndex(Index([date_str]), yearfirst=yearfirst)
result9 = DatetimeIndex(Series([date_str]), yearfirst=yearfirst)

for res in [result1, result2]:
self.assertEqual(res, expected)
for res in [result3, result4, result6, result8, result9]:
exp = DatetimeIndex([pd.Timestamp(expected)])
tm.assert_index_equal(res, exp)

# these really need to have yearfirst, but we don't support
if not yearfirst:
Expand Down Expand Up @@ -893,9 +899,7 @@ def test_parsers_monthfreq(self):

for date_str, expected in compat.iteritems(cases):
result1, _, _ = tools.parse_time_string(date_str, freq='M')
result2 = tools._to_datetime(date_str, freq='M')
self.assertEqual(result1, expected)
self.assertEqual(result2, expected)

def test_parsers_quarterly_with_freq(self):
msg = ('Incorrect quarterly string is given, quarter '
Expand Down
Loading