Skip to content

BUG: preserve dtypes in interpolate #6291

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -2435,7 +2435,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None,
return self._constructor(new_data).__finalize__(self)

def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
downcast='infer', **kwargs):
downcast=None, **kwargs):
"""
Interpolate values according to different methods.

Expand Down Expand Up @@ -2468,7 +2468,7 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
Maximum number of consecutive NaNs to fill.
inplace : bool, default False
Update the NDFrame in place if possible.
downcast : optional, 'infer' or None, defaults to 'infer'
downcast : optional, 'infer' or None, defaults to None
Downcast dtypes if possible.

Returns
Expand All @@ -2492,7 +2492,6 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
dtype: float64

"""

if self.ndim > 2:
raise NotImplementedError("Interpolate has not been implemented "
"on Panel and Panel 4D objects.")
Expand Down Expand Up @@ -2534,7 +2533,6 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
inplace=inplace,
downcast=downcast,
**kwargs)

if inplace:
if axis == 1:
self._update_inplace(new_data)
Expand Down
6 changes: 6 additions & 0 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -826,6 +826,12 @@ def interpolate(self, method='pad', axis=0, index=None,
m = None

if m is not None:
# Skip interpolating this block if no NaNs.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is fine, but you will of course still have the problem when, say, part of a block is interpolated and part is not. Still, this is better.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep. I'm still looking into how to handle blocks with some good cols, and some that need interpolating.

if (~isnull(self.values)).all():
if inplace:
return self
else:
return self.copy()
return self._interpolate(method=m,
index=index,
values=values,
Expand Down
79 changes: 61 additions & 18 deletions pandas/tests/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -459,7 +459,10 @@ def test_interpolate(self):
self.assert_numpy_array_equal(time_interp, ord_ts)

# try time interpolation on a non-TimeSeries
self.assertRaises(ValueError, self.series.interpolate, method='time')
# Only raises ValueError if there are NaNs.
non_ts = self.series.copy()
non_ts[0] = np.NaN
self.assertRaises(ValueError, non_ts.interpolate, method='time')

def test_interp_regression(self):
_skip_if_no_scipy()
Expand Down Expand Up @@ -512,7 +515,7 @@ def test_interpolate_non_ts(self):
def test_nan_interpolate(self):
s = Series([0, 1, np.nan, 3])
result = s.interpolate()
expected = Series([0, 1, 2, 3])
expected = Series([0., 1., 2., 3.])
assert_series_equal(result, expected)

_skip_if_no_scipy()
Expand All @@ -522,20 +525,20 @@ def test_nan_interpolate(self):
def test_nan_irregular_index(self):
s = Series([1, 2, np.nan, 4], index=[1, 3, 5, 9])
result = s.interpolate()
expected = Series([1, 2, 3, 4], index=[1, 3, 5, 9])
expected = Series([1., 2., 3., 4.], index=[1, 3, 5, 9])
assert_series_equal(result, expected)

def test_nan_str_index(self):
s = Series([0, 1, 2, np.nan], index=list('abcd'))
result = s.interpolate()
expected = Series([0, 1, 2, 2], index=list('abcd'))
expected = Series([0., 1., 2., 2.], index=list('abcd'))
assert_series_equal(result, expected)

def test_interp_quad(self):
_skip_if_no_scipy()
sq = Series([1, 4, np.nan, 16], index=[1, 2, 3, 4])
result = sq.interpolate(method='quadratic')
expected = Series([1, 4, 9, 16], index=[1, 2, 3, 4])
expected = Series([1., 4., 9., 16.], index=[1, 2, 3, 4])
assert_series_equal(result, expected)

def test_interp_scipy_basic(self):
Expand All @@ -545,18 +548,30 @@ def test_interp_scipy_basic(self):
expected = Series([1., 3., 7.5, 12., 18.5, 25.])
result = s.interpolate(method='slinear')
assert_series_equal(result, expected)

result = s.interpolate(method='slinear', downcast='infer')
assert_series_equal(result, expected)
# nearest
expected = Series([1, 3, 3, 12, 12, 25])
result = s.interpolate(method='nearest')
assert_series_equal(result, expected.astype('float'))

result = s.interpolate(method='nearest', downcast='infer')
assert_series_equal(result, expected)
# zero
expected = Series([1, 3, 3, 12, 12, 25])
result = s.interpolate(method='zero')
assert_series_equal(result, expected.astype('float'))

result = s.interpolate(method='zero', downcast='infer')
assert_series_equal(result, expected)
# quadratic
expected = Series([1, 3., 6.769231, 12., 18.230769, 25.])
result = s.interpolate(method='quadratic')
assert_series_equal(result, expected)

result = s.interpolate(method='quadratic', downcast='infer')
assert_series_equal(result, expected)
# cubic
expected = Series([1., 3., 6.8, 12., 18.2, 25.])
result = s.interpolate(method='cubic')
Expand Down Expand Up @@ -585,7 +600,6 @@ def test_interp_multiIndex(self):

expected = s.copy()
expected.loc[2] = 2
expected = expected.astype(np.int64)
result = s.interpolate()
assert_series_equal(result, expected)

Expand All @@ -595,15 +609,15 @@ def test_interp_multiIndex(self):

def test_interp_nonmono_raise(self):
_skip_if_no_scipy()
s = pd.Series([1, 2, 3], index=[0, 2, 1])
s = Series([1, np.nan, 3], index=[0, 2, 1])
with tm.assertRaises(ValueError):
s.interpolate(method='krogh')

def test_interp_datetime64(self):
_skip_if_no_scipy()
df = Series([1, np.nan, 3], index=date_range('1/1/2000', periods=3))
result = df.interpolate(method='nearest')
expected = Series([1, 1, 3], index=date_range('1/1/2000', periods=3))
expected = Series([1., 1., 3.], index=date_range('1/1/2000', periods=3))
assert_series_equal(result, expected)

class TestDataFrame(tm.TestCase, Generic):
Expand Down Expand Up @@ -639,7 +653,7 @@ def test_get_numeric_data_preserve_dtype(self):
def test_interp_basic(self):
df = DataFrame({'A': [1, 2, np.nan, 4], 'B': [1, 4, 9, np.nan],
'C': [1, 2, 3, 5], 'D': list('abcd')})
expected = DataFrame({'A': [1, 2, 3, 4], 'B': [1, 4, 9, 9],
expected = DataFrame({'A': [1., 2., 3., 4.], 'B': [1., 4., 9., 9.],
'C': [1, 2, 3, 5], 'D': list('abcd')})
result = df.interpolate()
assert_frame_equal(result, expected)
Expand All @@ -648,8 +662,6 @@ def test_interp_basic(self):
expected = df.set_index('C')
expected.A.loc[3] = 3
expected.B.loc[5] = 9
expected[['A', 'B']] = expected[['A', 'B']].astype(np.int64)

assert_frame_equal(result, expected)

def test_interp_bad_method(self):
Expand All @@ -663,9 +675,14 @@ def test_interp_combo(self):
'C': [1, 2, 3, 5], 'D': list('abcd')})

result = df['A'].interpolate()
expected = Series([1., 2., 3., 4.])
assert_series_equal(result, expected)

result = df['A'].interpolate(downcast='infer')
expected = Series([1, 2, 3, 4])
assert_series_equal(result, expected)


def test_interp_nan_idx(self):
df = DataFrame({'A': [1, 2, np.nan, 4], 'B': [np.nan, 2, 3, 4]})
df = df.set_index('A')
Expand Down Expand Up @@ -722,13 +739,16 @@ def test_interp_alt_scipy(self):
expected = df.copy()
expected['A'].iloc[2] = 3
expected['A'].iloc[5] = 6
assert_frame_equal(result, expected)

result = df.interpolate(method='barycentric', downcast='infer')
assert_frame_equal(result, expected.astype(np.int64))

result = df.interpolate(method='krogh')
expectedk = df.copy()
expectedk['A'].iloc[2] = 3
expectedk['A'].iloc[5] = 6
expectedk['A'] = expected['A'].astype(np.int64)
# expectedk['A'].iloc[2] = 3
# expectedk['A'].iloc[5] = 6
expectedk['A'] = expected['A']
assert_frame_equal(result, expectedk)

_skip_if_no_pchip()
Expand Down Expand Up @@ -786,9 +806,32 @@ def test_interp_raise_on_only_mixed(self):

def test_interp_inplace(self):
df = DataFrame({'a': [1., 2., np.nan, 4.]})
expected = DataFrame({'a': [1, 2, 3, 4]})
df['a'].interpolate(inplace=True)
assert_frame_equal(df, expected)
expected = DataFrame({'a': [1., 2., 3., 4.]})
result = df.copy()
result['a'].interpolate(inplace=True)
assert_frame_equal(result, expected)

result = df.copy()
result['a'].interpolate(inplace=True, downcast='infer')
assert_frame_equal(result, expected.astype('int'))

def test_interp_ignore_all_good(self):
# GH: see pull request #6291 (preserve dtypes in interpolate).
# Columns that contain no NaNs must come back with their dtype unchanged
# when interpolate is called with downcast=None.
# 'A' (ints plus a NaN) and 'C' (floats plus a NaN) need interpolating;
# 'B' (ints) and 'D' (floats) are already complete.
df = DataFrame({'A': [1, 2, np.nan, 4],
'B': [1, 2, 3, 4],
'C': [1., 2., np.nan, 4.],
'D': [1., 2., 3., 4.]})
# Expected dtypes: the interpolated columns are float, while 'B' stays int
# and 'D' stays float.
expected = DataFrame({'A': np.array([1, 2, 3, 4], dtype='float'),
'B': np.array([1, 2, 3, 4], dtype='int'),
'C': np.array([1., 2., 3, 4.], dtype='float'),
'D': np.array([1., 2., 3., 4.], dtype='float')})

result = df.interpolate(downcast=None)
assert_frame_equal(result, expected)

# all good: a frame made only of the NaN-free columns must compare equal
# to the input after interpolate.
result = df[['B', 'D']].interpolate(downcast=None)
assert_frame_equal(result, df[['B', 'D']])

def test_no_order(self):
_skip_if_no_scipy()
Expand All @@ -802,7 +845,7 @@ def test_spline(self):
_skip_if_no_scipy()
s = Series([1, 2, np.nan, 4, 5, np.nan, 7])
result = s.interpolate(method='spline', order=1)
expected = Series([1, 2, 3, 4, 5, 6, 7])
expected = Series([1., 2., 3., 4., 5., 6., 7.])
assert_series_equal(result, expected)

def test_metadata_propagation_indiv(self):
Expand Down
26 changes: 26 additions & 0 deletions vb_suite/frame_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,3 +403,29 @@ def test_unequal(name):
frame_object_unequal = Benchmark('test_unequal("object_df")', setup)
frame_nonunique_unequal = Benchmark('test_unequal("nonunique_cols")', setup)

#-----------------------------------------------------------------------------
# interpolate
# this is the worst case, where every column has NaNs.
# The setup code builds a 10000 x 100 frame from randn and then overwrites
# every other row (0, 2, 4, ...) with NaN, so all 100 columns need filling.
setup = common_setup + """
df = DataFrame(randn(10000, 100))
df.values[::2] = np.nan
"""

# Times interpolate() with its default arguments on the all-NaN-columns frame.
frame_interpolate = Benchmark('df.interpolate()', setup,
start_date=datetime(2014, 2, 7))

# `setup` is rebound below; the benchmark above was already given the
# previous string, so the two cases do not share setup code.
# Mixed case: columns 'A' (arange) and 'C' (randn) get a NaN at every fifth
# row label starting from 1, while 'B' (random ints) and 'D' (randn) are left
# without NaNs.
setup = common_setup + """
df = DataFrame({'A': np.arange(0, 10000),
'B': np.random.randint(0, 100, 10000),
'C': randn(10000),
'D': randn(10000)})
df.loc[1::5, 'A'] = np.nan
df.loc[1::5, 'C'] = np.nan
"""

# Same mixed frame timed twice: once with default arguments and once with
# downcast="infer" requested explicitly.
frame_interpolate_some_good = Benchmark('df.interpolate()', setup,
start_date=datetime(2014, 2, 7))
frame_interpolate_some_good_infer = Benchmark('df.interpolate(downcast="infer")',
setup,
start_date=datetime(2014, 2, 7))