Skip to content

BUG: to_csv handles cols= reordering,dupe cols GH3454 #3458

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
3 commits merged into from Apr 25, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion RELEASE.rst
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ pandas 0.12.0
- Fixed an esoteric excel reading bug, xlrd>= 0.9.0 now required for excel
support. Should provide python3 support (for reading) which has been
lacking. (GH3164_)
- Fix to_csv issue when having a large number of rows and ``NaT`` in some
- Addressed handling of duplicate columns in ``df.to_csv`` for both the new and the legacy (``engine='python'``) writers (GH3454_, GH3457_)
- Fix to_csv issue when having a large number of rows and ``NaT`` in some
columns (GH3437_)
- ``.loc`` was not raising when passed an integer list (GH3449_)
- Unordered time series selection was misbehaving when using label slicing (GH3448_)
Expand All @@ -57,6 +58,8 @@ pandas 0.12.0
.. _GH3164: https://github.com/pydata/pandas/issues/3164
.. _GH3251: https://github.com/pydata/pandas/issues/3251
.. _GH3379: https://github.com/pydata/pandas/issues/3379
.. _GH3454: https://github.com/pydata/pandas/issues/3454
.. _GH3457: https://github.com/pydata/pandas/issues/3457
.. _GH3038: https://github.com/pydata/pandas/issues/3038
.. _GH3437: https://github.com/pydata/pandas/issues/3437
.. _GH3455: https://github.com/pydata/pandas/issues/3455
Expand Down
25 changes: 21 additions & 4 deletions pandas/core/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -772,6 +772,7 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,

self.engine = engine # remove for 0.12
self.obj = obj

self.path_or_buf = path_or_buf
self.sep = sep
self.na_rep = na_rep
Expand All @@ -789,13 +790,27 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,

self.line_terminator = line_terminator

if cols is None:
cols = obj.columns
#GH3457
if not self.obj.columns.is_unique and engine == 'python':
msg= "columns.is_unique == False not supported with engine='python'"
raise NotImplementedError(msg)

if cols is not None:
if isinstance(cols,Index):
cols = cols.to_native_types(na_rep=na_rep,float_format=float_format)
else:
cols=list(cols)
self.obj = self.obj.loc[:,cols]

# update columns to include possible multiplicity of dupes
# and make sure cols is just a list of labels
cols = self.obj.columns
if isinstance(cols,Index):
cols = cols.to_native_types(na_rep=na_rep,float_format=float_format)
else:
cols=list(cols)

# save it
self.cols = cols

# preallocate data 2d list
Expand All @@ -804,7 +819,7 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,
self.data =[None] * ncols

if self.obj.columns.is_unique:
self.colname_map = dict((k,i) for i,k in enumerate(obj.columns))
self.colname_map = dict((k,i) for i,k in enumerate(self.obj.columns))
else:
ks = [set(x.items) for x in self.blocks]
u = len(reduce(lambda a,x: a.union(x),ks,set()))
Expand Down Expand Up @@ -1024,7 +1039,9 @@ def _save_chunk(self, start_i, end_i):
# self.data is a preallocated list
self.data[self.colname_map[k]] = d[j]
else:
for i in range(len(self.cols)):
# self.obj should contain a proper view of the DataFrame,
# with the specified ordering of cols if cols was specified
for i in range(len(self.obj.columns)):
self.data[i] = self.obj.icol(i).values[slicer].tolist()

ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep, float_format=self.float_format)
Expand Down
53 changes: 52 additions & 1 deletion pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from pandas.util.testing import (assert_almost_equal,
assert_series_equal,
assert_frame_equal,
makeCustomDataframe as mkdf,
ensure_clean)
from pandas.util import py3compat
from pandas.util.compat import OrderedDict
Expand Down Expand Up @@ -4621,9 +4622,59 @@ def test_to_csv_from_csv(self):
xp.columns = map(int,xp.columns)
assert_frame_equal(xp,rs)

def test_to_csv_cols_reordering(self):
    """Check that ``to_csv(cols=...)`` honours the requested column order.

    Regression test for GH3454. The frame is written twice to the same
    temporary path -- once with ``engine='python'`` and once with the
    default engine plus an explicit ``chunksize`` -- and each result is
    read back with ``read_csv``. Both read-backs must agree on their
    column labels, and the chunked read-back must equal the input frame
    restricted to ``cols``.
    """
    import pandas as pd

    chunksize = 5
    # 12 rows: deliberately not a multiple of chunksize
    nrows = int(chunksize * 2.5)

    def _check_df(df, cols=None):
        with ensure_clean() as path:
            df.to_csv(path, cols=cols, engine='python')
            rs_p = pd.read_csv(path, index_col=0)
            df.to_csv(path, cols=cols, chunksize=chunksize)
            rs_c = pd.read_csv(path, index_col=0)

        # Compare against None explicitly: plain truthiness would skip the
        # selection for an empty list and is ambiguous for an Index/array.
        if cols is not None:
            df = df[cols]
        assert (rs_c.columns == rs_p.columns).all()
        assert_frame_equal(df, rs_c, check_names=False)

    df = mkdf(nrows, 3)
    cs = df.columns
    # select a subset of the columns, in reversed order
    cols = [cs[2], cs[0]]
    _check_df(df, cols)

def test_to_csv_legacy_raises_on_dupe_cols(self):
    """``to_csv(engine='python')`` must raise on duplicate column labels.

    The frame gets the non-unique labels ``['a', 'a', 'b']``; writing it
    with ``engine='python'`` is expected to raise ``NotImplementedError``.
    """
    frame = mkdf(10, 3)
    frame.columns = ['a', 'a', 'b']
    with ensure_clean() as tmp_path:
        self.assertRaises(NotImplementedError, frame.to_csv, tmp_path,
                          engine='python')

def test_to_csv_new_dupe_cols(self):
    """Round-trip a frame with duplicate column labels through ``to_csv``.

    The frame is written with the default engine and an explicit
    ``chunksize``, read back with ``read_csv``, relabelled with the
    original column labels and compared with the input frame. This is
    done once without a column selection and once with ``cols=['b', 'a']``.
    """
    import pandas as pd

    chunksize = 5
    # 12 rows: deliberately not a multiple of chunksize
    nrows = int(chunksize * 2.5)

    def _check_df(df, cols=None):
        with ensure_clean() as path:
            df.to_csv(path, cols=cols, chunksize=chunksize)
            rs_c = pd.read_csv(path, index_col=0)
            # Put the original labels back before comparing (presumably
            # because read_csv does not return duplicate labels unchanged
            # -- TODO confirm).
            # NOTE(review): with a reordering ``cols`` this assumes the
            # columns read back line up positionally with ``df.columns``
            # -- confirm this is what the selection case intends to check.
            rs_c.columns = df.columns
            assert_frame_equal(df, rs_c, check_names=False)

    frame = mkdf(nrows, 3)
    frame.columns = ['a', 'a', 'b']

    # first: dupe cols, no selection; second: dupe cols with selection
    for selection in (None, ['b', 'a']):
        _check_df(frame, selection)

@slow
def test_to_csv_moar(self):
from pandas.util.testing import makeCustomDataframe as mkdf
path = '__tmp_to_csv_moar__'

def _do_test(df,path,r_dtype=None,c_dtype=None,rnlvl=None,cnlvl=None,
Expand Down