Skip to content

BUG: interpolate should preserve dtypes #6378

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Feb 17, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,8 @@ API Changes
- ``df['col'] = value`` and ``df.loc[:,'col'] = value`` are now completely equivalent;
previously the ``.loc`` would not necessarily coerce the dtype of the resultant series (:issue:`6149`)
- ``dtypes`` and ``ftypes`` now return a series with ``dtype=object`` on empty containers (:issue:`5740`)

- The ``interpolate`` ``downcast`` keyword default has been changed from ``infer`` to
``None``. This is to preserve the original dtype unless explicitly requested otherwise (:issue:`6290`).

Experimental Features
~~~~~~~~~~~~~~~~~~~~~
Expand Down Expand Up @@ -121,6 +122,7 @@ Bug Fixes
- Bug in ``DataFrame.replace()`` when passing a nested ``dict`` that contained
keys not in the values to be replaced (:issue:`6342`)
- Bug in take with duplicate columns not consolidated (:issue:`6240`)
- Bug in interpolate changing dtypes (:issue:`6290`)

pandas 0.13.1
-------------
Expand Down
3 changes: 3 additions & 0 deletions doc/source/v0.14.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ API changes
df.iloc[:,2:3]
df.iloc[:,1:3]

- The ``DataFrame.interpolate()`` ``downcast`` keyword default has been changed from ``infer`` to
``None``. This is to preserve the original dtype unless explicitly requested otherwise (:issue:`6290`).

MultiIndexing Using Slicers
~~~~~~~~~~~~~~~~~~~~~~~~~~~

Expand Down
6 changes: 2 additions & 4 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -2435,7 +2435,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None,
return self._constructor(new_data).__finalize__(self)

def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
downcast='infer', **kwargs):
downcast=None, **kwargs):
"""
Interpolate values according to different methods.

Expand Down Expand Up @@ -2468,7 +2468,7 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
Maximum number of consecutive NaNs to fill.
inplace : bool, default False
Update the NDFrame in place if possible.
downcast : optional, 'infer' or None, defaults to 'infer'
downcast : optional, 'infer' or None, defaults to None
Downcast dtypes if possible.

Returns
Expand All @@ -2492,7 +2492,6 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
dtype: float64

"""

if self.ndim > 2:
raise NotImplementedError("Interpolate has not been implemented "
"on Panel and Panel 4D objects.")
Expand Down Expand Up @@ -2534,7 +2533,6 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
inplace=inplace,
downcast=downcast,
**kwargs)

if inplace:
if axis == 1:
self._update_inplace(new_data)
Expand Down
15 changes: 15 additions & 0 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -805,13 +805,25 @@ def interpolate(self, method='pad', axis=0, index=None,
values=None, inplace=False, limit=None,
fill_value=None, coerce=False, downcast=None, **kwargs):

def check_int_bool(self, inplace):
# Only FloatBlocks will contain NaNs.
# timedelta subclasses IntBlock
if (self.is_bool or self.is_integer) and not self.is_timedelta:
if inplace:
return self
else:
return self.copy()

# a fill na type method
try:
m = com._clean_fill_method(method)
except:
m = None

if m is not None:
r = check_int_bool(self, inplace)
if r is not None:
return r
return self._interpolate_with_fill(method=m,
axis=axis,
inplace=inplace,
Expand All @@ -826,6 +838,9 @@ def interpolate(self, method='pad', axis=0, index=None,
m = None

if m is not None:
r = check_int_bool(self, inplace)
if r is not None:
return r
return self._interpolate(method=m,
index=index,
values=values,
Expand Down
79 changes: 61 additions & 18 deletions pandas/tests/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -459,7 +459,10 @@ def test_interpolate(self):
self.assert_numpy_array_equal(time_interp, ord_ts)

# try time interpolation on a non-TimeSeries
self.assertRaises(ValueError, self.series.interpolate, method='time')
# Only raises ValueError if there are NaNs.
non_ts = self.series.copy()
non_ts[0] = np.NaN
self.assertRaises(ValueError, non_ts.interpolate, method='time')

def test_interp_regression(self):
_skip_if_no_scipy()
Expand Down Expand Up @@ -512,7 +515,7 @@ def test_interpolate_non_ts(self):
def test_nan_interpolate(self):
s = Series([0, 1, np.nan, 3])
result = s.interpolate()
expected = Series([0, 1, 2, 3])
expected = Series([0., 1., 2., 3.])
assert_series_equal(result, expected)

_skip_if_no_scipy()
Expand All @@ -522,20 +525,20 @@ def test_nan_interpolate(self):
def test_nan_irregular_index(self):
s = Series([1, 2, np.nan, 4], index=[1, 3, 5, 9])
result = s.interpolate()
expected = Series([1, 2, 3, 4], index=[1, 3, 5, 9])
expected = Series([1., 2., 3., 4.], index=[1, 3, 5, 9])
assert_series_equal(result, expected)

def test_nan_str_index(self):
s = Series([0, 1, 2, np.nan], index=list('abcd'))
result = s.interpolate()
expected = Series([0, 1, 2, 2], index=list('abcd'))
expected = Series([0., 1., 2., 2.], index=list('abcd'))
assert_series_equal(result, expected)

def test_interp_quad(self):
_skip_if_no_scipy()
sq = Series([1, 4, np.nan, 16], index=[1, 2, 3, 4])
result = sq.interpolate(method='quadratic')
expected = Series([1, 4, 9, 16], index=[1, 2, 3, 4])
expected = Series([1., 4., 9., 16.], index=[1, 2, 3, 4])
assert_series_equal(result, expected)

def test_interp_scipy_basic(self):
Expand All @@ -545,18 +548,30 @@ def test_interp_scipy_basic(self):
expected = Series([1., 3., 7.5, 12., 18.5, 25.])
result = s.interpolate(method='slinear')
assert_series_equal(result, expected)

result = s.interpolate(method='slinear', donwcast='infer')
assert_series_equal(result, expected)
# nearest
expected = Series([1, 3, 3, 12, 12, 25])
result = s.interpolate(method='nearest')
assert_series_equal(result, expected.astype('float'))

result = s.interpolate(method='nearest', downcast='infer')
assert_series_equal(result, expected)
# zero
expected = Series([1, 3, 3, 12, 12, 25])
result = s.interpolate(method='zero')
assert_series_equal(result, expected.astype('float'))

result = s.interpolate(method='zero', downcast='infer')
assert_series_equal(result, expected)
# quadratic
expected = Series([1, 3., 6.769231, 12., 18.230769, 25.])
result = s.interpolate(method='quadratic')
assert_series_equal(result, expected)

result = s.interpolate(method='quadratic', downcast='infer')
assert_series_equal(result, expected)
# cubic
expected = Series([1., 3., 6.8, 12., 18.2, 25.])
result = s.interpolate(method='cubic')
Expand Down Expand Up @@ -585,7 +600,6 @@ def test_interp_multiIndex(self):

expected = s.copy()
expected.loc[2] = 2
expected = expected.astype(np.int64)
result = s.interpolate()
assert_series_equal(result, expected)

Expand All @@ -595,15 +609,15 @@ def test_interp_multiIndex(self):

def test_interp_nonmono_raise(self):
_skip_if_no_scipy()
s = pd.Series([1, 2, 3], index=[0, 2, 1])
s = Series([1, np.nan, 3], index=[0, 2, 1])
with tm.assertRaises(ValueError):
s.interpolate(method='krogh')

def test_interp_datetime64(self):
_skip_if_no_scipy()
df = Series([1, np.nan, 3], index=date_range('1/1/2000', periods=3))
result = df.interpolate(method='nearest')
expected = Series([1, 1, 3], index=date_range('1/1/2000', periods=3))
expected = Series([1., 1., 3.], index=date_range('1/1/2000', periods=3))
assert_series_equal(result, expected)

class TestDataFrame(tm.TestCase, Generic):
Expand Down Expand Up @@ -639,7 +653,7 @@ def test_get_numeric_data_preserve_dtype(self):
def test_interp_basic(self):
df = DataFrame({'A': [1, 2, np.nan, 4], 'B': [1, 4, 9, np.nan],
'C': [1, 2, 3, 5], 'D': list('abcd')})
expected = DataFrame({'A': [1, 2, 3, 4], 'B': [1, 4, 9, 9],
expected = DataFrame({'A': [1., 2., 3., 4.], 'B': [1., 4., 9., 9.],
'C': [1, 2, 3, 5], 'D': list('abcd')})
result = df.interpolate()
assert_frame_equal(result, expected)
Expand All @@ -648,8 +662,6 @@ def test_interp_basic(self):
expected = df.set_index('C')
expected.A.loc[3] = 3
expected.B.loc[5] = 9
expected[['A', 'B']] = expected[['A', 'B']].astype(np.int64)

assert_frame_equal(result, expected)

def test_interp_bad_method(self):
Expand All @@ -663,9 +675,14 @@ def test_interp_combo(self):
'C': [1, 2, 3, 5], 'D': list('abcd')})

result = df['A'].interpolate()
expected = Series([1., 2., 3., 4.])
assert_series_equal(result, expected)

result = df['A'].interpolate(downcast='infer')
expected = Series([1, 2, 3, 4])
assert_series_equal(result, expected)


def test_interp_nan_idx(self):
df = DataFrame({'A': [1, 2, np.nan, 4], 'B': [np.nan, 2, 3, 4]})
df = df.set_index('A')
Expand Down Expand Up @@ -722,13 +739,16 @@ def test_interp_alt_scipy(self):
expected = df.copy()
expected['A'].iloc[2] = 3
expected['A'].iloc[5] = 6
assert_frame_equal(result, expected)

result = df.interpolate(method='barycentric', downcast='infer')
assert_frame_equal(result, expected.astype(np.int64))

result = df.interpolate(method='krogh')
expectedk = df.copy()
expectedk['A'].iloc[2] = 3
expectedk['A'].iloc[5] = 6
expectedk['A'] = expected['A'].astype(np.int64)
# expectedk['A'].iloc[2] = 3
# expectedk['A'].iloc[5] = 6
expectedk['A'] = expected['A']
assert_frame_equal(result, expectedk)

_skip_if_no_pchip()
Expand Down Expand Up @@ -786,9 +806,32 @@ def test_interp_raise_on_only_mixed(self):

def test_interp_inplace(self):
df = DataFrame({'a': [1., 2., np.nan, 4.]})
expected = DataFrame({'a': [1, 2, 3, 4]})
df['a'].interpolate(inplace=True)
assert_frame_equal(df, expected)
expected = DataFrame({'a': [1., 2., 3., 4.]})
result = df.copy()
result['a'].interpolate(inplace=True)
assert_frame_equal(result, expected)

result = df.copy()
result['a'].interpolate(inplace=True, downcast='infer')
assert_frame_equal(result, expected.astype('int'))

def test_interp_ignore_all_good(self):
# GH
df = DataFrame({'A': [1, 2, np.nan, 4],
'B': [1, 2, 3, 4],
'C': [1., 2., np.nan, 4.],
'D': [1., 2., 3., 4.]})
expected = DataFrame({'A': np.array([1, 2, 3, 4], dtype='float'),
'B': np.array([1, 2, 3, 4], dtype='int'),
'C': np.array([1., 2., 3, 4.], dtype='float'),
'D': np.array([1., 2., 3., 4.], dtype='float')})

result = df.interpolate(downcast=None)
assert_frame_equal(result, expected)

# all good
result = df[['B', 'D']].interpolate(downcast=None)
assert_frame_equal(result, df[['B', 'D']])

def test_no_order(self):
_skip_if_no_scipy()
Expand All @@ -802,7 +845,7 @@ def test_spline(self):
_skip_if_no_scipy()
s = Series([1, 2, np.nan, 4, 5, np.nan, 7])
result = s.interpolate(method='spline', order=1)
expected = Series([1, 2, 3, 4, 5, 6, 7])
expected = Series([1., 2., 3., 4., 5., 6., 7.])
assert_series_equal(result, expected)

def test_metadata_propagation_indiv(self):
Expand Down
26 changes: 26 additions & 0 deletions vb_suite/frame_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,3 +403,29 @@ def test_unequal(name):
frame_object_unequal = Benchmark('test_unequal("object_df")', setup)
frame_nonunique_unequal = Benchmark('test_unequal("nonunique_cols")', setup)

#-----------------------------------------------------------------------------
# interpolate
# this is the worst case, where every column has NaNs.
# Setup: a frame built from ``randn(10000, 100)`` in which every other row
# (positions 0, 2, 4, ...) is overwritten with NaN, so all 100 columns
# contain missing values.
# NOTE(review): the assignment goes through ``df.values``; this assumes
# ``.values`` is a view onto the frame's data here -- confirm.
setup = common_setup + """
df = DataFrame(randn(10000, 100))
df.values[::2] = np.nan
"""

# Time a plain ``interpolate()`` call (default arguments) on that frame.
frame_interpolate = Benchmark('df.interpolate()', setup,
start_date=datetime(2014, 2, 7))

# Mixed case: only columns 'A' and 'C' receive NaNs (every 5th label
# starting at 1); columns 'B' and 'D' are left without any NaNs.
setup = common_setup + """
df = DataFrame({'A': np.arange(0, 10000),
'B': np.random.randint(0, 100, 10000),
'C': randn(10000),
'D': randn(10000)})
df.loc[1::5, 'A'] = np.nan
df.loc[1::5, 'C'] = np.nan
"""

# Same mixed frame, timed once with the default ``downcast`` and once with
# ``downcast="infer"`` passed explicitly.
frame_interpolate_some_good = Benchmark('df.interpolate()', setup,
start_date=datetime(2014, 2, 7))
frame_interpolate_some_good_infer = Benchmark('df.interpolate(downcast="infer")',
setup,
start_date=datetime(2014, 2, 7))