Merge branch 'dtype-patch' of https://github.com/trevorbye/pandas into dtype-patch

Trevor Bye · Trevor Bye · commit 794cf9439fcf · 2020-01-28T22:21:54.000-08:00
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -4307,6 +4307,7 @@ def set_index(
             "one-dimensional arrays."
         )
 
+        current_dtype = None
         missing: List[Optional[Hashable]] = []
         for col in keys:
             if isinstance(
@@ -4320,6 +4321,9 @@ def set_index(
                 # everything else gets tried as a key; see GH 24969
                 try:
                     found = col in self.columns
+                    if found:
+                        # get current dtype to preserve through index creation
+                        current_dtype = self.dtypes.get(col).type
                 except TypeError:
                     raise TypeError(f"{err_msg}. Received column of type {type(col)}")
                 else:
@@ -4375,7 +4379,7 @@ def set_index(
                     f"received array of length {len(arrays[-1])}"
                 )
 
-        index = ensure_index_from_sequences(arrays, names)
+        index = ensure_index_from_sequences(arrays, names, current_dtype)
 
         if verify_integrity and not index.is_unique:
             duplicates = index[index.duplicated()].unique()
@@ -4550,9 +4554,6 @@ class    max    type
 
         def _maybe_casted_values(index, labels=None):
             values = index._values
-            if not isinstance(index, (PeriodIndex, DatetimeIndex)):
-                if values.dtype == np.object_:
-                    values = lib.maybe_convert_objects(values)
 
             # if we have the labels, extract the values with a mask
             if labels is not None:
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -5501,7 +5501,7 @@ def shape(self):
 Index._add_comparison_methods()
 
 
-def ensure_index_from_sequences(sequences, names=None):
+def ensure_index_from_sequences(sequences, names=None, dtype=None):
     """
     Construct an index from sequences of data.
 
@@ -5512,6 +5512,7 @@ def ensure_index_from_sequences(sequences, names=None):
     ----------
     sequences : sequence of sequences
     names : sequence of str
+    dtype : NumPy dtype, optional
 
     Returns
     -------
@@ -5537,7 +5538,7 @@ def ensure_index_from_sequences(sequences, names=None):
     if len(sequences) == 1:
         if names is not None:
             names = names[0]
-        return Index(sequences[0], name=names)
+        return Index(sequences[0], name=names, dtype=dtype)
     else:
         return MultiIndex.from_arrays(sequences, names=names)
 
diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py
@@ -1486,6 +1486,41 @@ def test_droplevel(self):
         result = df.droplevel("level_2", axis="columns")
         tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.parametrize('test_dtype', [object, 'int64'])
+    def test_dtypes(self, test_dtype):
+        df = DataFrame({'A': Series([1, 2, 3], dtype=test_dtype), 'B': [1, 2, 3]})
+        expected = df.dtypes.values[0].type
+
+        result = df.set_index('A').index.dtype.type
+        assert result == expected
+
+    @pytest.fixture
+    def mixed_series(self):
+        return Series([1, 2, 3, 'apple', 'corn'], dtype=object)
+
+    @pytest.fixture
+    def int_series(self):
+        return Series([100, 200, 300, 400, 500])
+
+    def test_dtypes_between_queries(self, mixed_series, int_series):
+        df = DataFrame({'item': mixed_series, 'cost': int_series})
+
+        orig_dtypes = df.dtypes
+        item_dtype = orig_dtypes.get('item').type
+        cost_dtype = orig_dtypes.get('cost').type
+        expected = {'item': item_dtype, 'cost': cost_dtype}
+
+        # After a query that filters the strings out of the object-dtype 'item'
+        # column, the column must keep dtype object both when it becomes the index
+        # via set_index() and when it is restored as a column by reset_index().
+        dtypes_transformed = df.query('cost < 400').set_index(
+            'item').reset_index().dtypes
+        item_dtype_transformed = dtypes_transformed.get('item').type
+        cost_dtype_transformed = dtypes_transformed.get('cost').type
+        result = {'item': item_dtype_transformed, 'cost': cost_dtype_transformed}
+
+        assert result == expected
+
 
 class TestIntervalIndex:
     def test_setitem(self):