Skip to content

API: for HDFStore, add the keyword dropna=True to append to change whether to write ALL nan rows to the store (GH4625) #4714

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 31, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,9 @@ pandas 0.13
be raised if you try to use ``mode='w'`` with an OPEN file handle (:issue:`4367`)
- allow a passed locations array or mask as a ``where`` condition (:issue:`4467`)
- the ``fmt`` keyword now replaces the ``table`` keyword; allowed values are ``s|t``
- add the keyword ``dropna=True`` to ``append`` to control whether rows that are entirely nan
  are written to the store (the default is ``True``, meaning ALL-nan rows are NOT written); also settable
  via the option ``io.hdf.dropna_table`` (:issue:`4625`)
- ``JSON``

- added ``date_unit`` parameter to specify resolution of timestamps. Options
Expand Down
3 changes: 3 additions & 0 deletions doc/source/v0.13.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,9 @@ API changes

import os
os.remove(path)
- add the keyword ``dropna=True`` to ``append`` to control whether rows that are entirely nan
  are written to the store (the default is ``True``, meaning ALL-nan rows are NOT written); also settable
  via the option ``io.hdf.dropna_table`` (:issue:`4625`)

- Changes to how ``Index`` and ``MultiIndex`` handle metadata (``levels``,
``labels``, and ``names``) (:issue:`4039`):
Expand Down
58 changes: 40 additions & 18 deletions pandas/io/pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from pandas.tools.merge import concat
from pandas import compat
from pandas.io.common import PerformanceWarning
from pandas.core.config import get_option

import pandas.lib as lib
import pandas.algos as algos
Expand Down Expand Up @@ -165,6 +166,17 @@ class DuplicateWarning(Warning):
Panel4D: [1, 2, 3],
}

# register our configuration options
from pandas.core import config
dropna_doc = """
: boolean
drop ALL nan rows when appending to a table
"""

with config.config_prefix('io.hdf'):
config.register_option('dropna_table', True, dropna_doc,
validator=config.is_bool)

# oh the troubles to reduce import time
_table_mod = None
_table_supports_index = False
Expand Down Expand Up @@ -730,7 +742,7 @@ def remove(self, key, where=None, start=None, stop=None):
'can only remove with where on objects written as tables')
return s.delete(where=where, start=start, stop=stop)

def append(self, key, value, fmt=None, append=True, columns=None, **kwargs):
def append(self, key, value, fmt=None, append=True, columns=None, dropna=None, **kwargs):
"""
Append to Table in file. Node must already exist and be Table
format.
Expand All @@ -751,7 +763,8 @@ def append(self, key, value, fmt=None, append=True, columns=None, **kwargs):
chunksize : size to chunk the writing
expectedrows : expected TOTAL row size of this table
encoding : default None, provide an encoding for strings

dropna : boolean, default True. If True, rows that are entirely nan are not written to the store;
    also settable via the option 'io.hdf.dropna_table'
Notes
-----
Does *not* check if data being appended overlaps with existing
Expand All @@ -761,8 +774,10 @@ def append(self, key, value, fmt=None, append=True, columns=None, **kwargs):
raise Exception(
"columns is not a supported keyword in append, try data_columns")

if dropna is None:
dropna = get_option("io.hdf.dropna_table")
kwargs = self._validate_format(fmt or 't', kwargs)
self._write_to_group(key, value, append=append, **kwargs)
self._write_to_group(key, value, append=append, dropna=dropna, **kwargs)

def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, **kwargs):
"""
Expand Down Expand Up @@ -3219,7 +3234,7 @@ class AppendableTable(LegacyTable):

def write(self, obj, axes=None, append=False, complib=None,
complevel=None, fletcher32=None, min_itemsize=None, chunksize=None,
expectedrows=None, **kwargs):
expectedrows=None, dropna=True, **kwargs):

if not append and self.is_exists:
self._handle.removeNode(self.group, 'table')
Expand Down Expand Up @@ -3254,29 +3269,36 @@ def write(self, obj, axes=None, append=False, complib=None,
a.validate_and_set(table, append)

# add the rows
self.write_data(chunksize)
self.write_data(chunksize, dropna=dropna)

def write_data(self, chunksize):
def write_data(self, chunksize, dropna=True):
""" we form the data into a 2-d including indexes,values,mask
write chunk-by-chunk """

names = self.dtype.names
nrows = self.nrows_expected

# create the masks & values
masks = []
for a in self.values_axes:
# if dropna==True, then drop ALL nan rows
if dropna:

masks = []
for a in self.values_axes:

# figure the mask: only do if we can successfully process this
# column, otherwise ignore the mask
mask = com.isnull(a.data).all(axis=0)
masks.append(mask.astype('u1'))

# figure the mask: only do if we can successfully process this
# column, otherwise ignore the mask
mask = com.isnull(a.data).all(axis=0)
masks.append(mask.astype('u1'))
# consolidate masks
mask = masks[0]
for m in masks[1:]:
mask = mask & m
mask = mask.ravel()

else:

# consolidate masks
mask = masks[0]
for m in masks[1:]:
mask = mask & m
mask = mask.ravel()
mask = np.empty(nrows, dtype='u1')
mask.fill(False)

# broadcast the indexes if needed
indexes = [a.cvalues for a in self.index_axes]
Expand Down
61 changes: 46 additions & 15 deletions pandas/io/tests/test_pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -757,45 +757,76 @@ def test_append_some_nans(self):
store.append('df3', df3[10:])
tm.assert_frame_equal(store['df3'], df3)

##### THIS IS A BUG, should not drop these all-nan rows
##### BUT need to store the index which we don't want to do....
# nan some entire rows
def test_append_all_nans(self):

with ensure_clean(self.path) as store:

df = DataFrame({'A1' : np.random.randn(20),
'A2' : np.random.randn(20)},
index=np.arange(20))
df.ix[0:15,:] = np.nan


# nan some entire rows (dropna=True)
_maybe_remove(store, 'df')
store.append('df', df[:10], dropna=True)
store.append('df', df[10:], dropna=True)
tm.assert_frame_equal(store['df'], df[-4:])

# nan some entire rows (dropna=False)
_maybe_remove(store, 'df2')
store.append('df2', df[:10], dropna=False)
store.append('df2', df[10:], dropna=False)
tm.assert_frame_equal(store['df2'], df)

# tests the option io.hdf.dropna_table
pandas.set_option('io.hdf.dropna_table',False)
_maybe_remove(store, 'df3')
store.append('df3', df[:10])
store.append('df3', df[10:])
tm.assert_frame_equal(store['df3'], df)

pandas.set_option('io.hdf.dropna_table',True)
_maybe_remove(store, 'df4')
df.ix[0:15,:] = np.nan
store.append('df4', df[:10])
store.append('df4', df[10:])
tm.assert_frame_equal(store['df4'], df[-4:])
self.assert_(store.get_storer('df4').nrows == 4)

# nan some entire rows (string are still written!)
df = DataFrame({'A1' : np.random.randn(20),
'A2' : np.random.randn(20),
'B' : 'foo', 'C' : 'bar'},
index=np.arange(20))

_maybe_remove(store, 'df5')
df.ix[0:15,:] = np.nan
store.append('df5', df[:10])
store.append('df5', df[10:])
tm.assert_frame_equal(store['df5'], df)
self.assert_(store.get_storer('df5').nrows == 20)

_maybe_remove(store, 'df')
store.append('df', df[:10], dropna=True)
store.append('df', df[10:], dropna=True)
tm.assert_frame_equal(store['df'], df)

_maybe_remove(store, 'df2')
store.append('df2', df[:10], dropna=False)
store.append('df2', df[10:], dropna=False)
tm.assert_frame_equal(store['df2'], df)

# nan some entire rows (but since we have dates they are still written!)
df = DataFrame({'A1' : np.random.randn(20),
'A2' : np.random.randn(20),
'B' : 'foo', 'C' : 'bar', 'D' : Timestamp("20010101"), 'E' : datetime.datetime(2001,1,2,0,0) },
index=np.arange(20))

_maybe_remove(store, 'df6')
df.ix[0:15,:] = np.nan
store.append('df6', df[:10])
store.append('df6', df[10:])
tm.assert_frame_equal(store['df6'], df)
self.assert_(store.get_storer('df6').nrows == 20)

_maybe_remove(store, 'df')
store.append('df', df[:10], dropna=True)
store.append('df', df[10:], dropna=True)
tm.assert_frame_equal(store['df'], df)

_maybe_remove(store, 'df2')
store.append('df2', df[:10], dropna=False)
store.append('df2', df[10:], dropna=False)
tm.assert_frame_equal(store['df2'], df)

def test_append_frame_column_oriented(self):

Expand Down