ENH: Add date_format keyword to to_csv()

qwhelan · qwhelan · commit ce669d6f295f · 2013-10-11T22:11:05.000-07:00
DOC: add date_format to release notes
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -60,6 +60,9 @@ New features
   - Clipboard functionality now works with PySide (:issue:`4282`)
   - New ``extract`` string method returns regex matches more conveniently (:issue:`4685`)
   - Auto-detect field widths in read_fwf when unspecified (:issue:`4488`)
+  - ``to_csv()`` now outputs datetime objects according to a specified format string
+    via the ``date_format`` keyword (:issue:`4313`)
+
 
 Experimental Features
 ~~~~~~~~~~~~~~~~~~~~~
diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt
@@ -87,6 +87,9 @@ API changes
     and arithmetic flex methods (add, sub, mul, etc.). ``SparsePanel`` does not
     support ``pow`` or ``mod`` with non-scalars. (:issue:`3765`)
 
+  - ``to_csv`` now takes a ``date_format`` keyword argument that specifies how
+    output datetime objects should be formatted. Datetimes encountered in the
+    index, columns, and values will all have this formatting applied. (:issue:`4313`)
 
 Prior Version Deprecations/Changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/pandas/core/format.py b/pandas/core/format.py
@@ -18,7 +18,7 @@
 import itertools
 import csv
 
-from pandas.tseries.period import PeriodIndex
+from pandas.tseries.period import PeriodIndex, DatetimeIndex
 
 docstring_to_string = """
      Parameters
@@ -850,7 +850,7 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,
                  cols=None, header=True, index=True, index_label=None,
                  mode='w', nanRep=None, encoding=None, quoting=None,
                  line_terminator='\n', chunksize=None, engine=None,
-                 tupleize_cols=False, quotechar='"'):
+                 tupleize_cols=False, quotechar='"', date_format=None):
 
         self.engine = engine  # remove for 0.13
         self.obj = obj
@@ -877,6 +877,8 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,
 
         self.line_terminator = line_terminator
 
+        self.date_format = date_format
+
         #GH3457
         if not self.obj.columns.is_unique and engine == 'python':
             msg= "columns.is_unique == False not supported with engine='python'"
@@ -893,7 +895,8 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,
 
         if cols is not None:
             if isinstance(cols,Index):
-                cols = cols.to_native_types(na_rep=na_rep,float_format=float_format)
+                cols = cols.to_native_types(na_rep=na_rep,float_format=float_format,
+                                            date_format=date_format)
             else:
                 cols=list(cols)
             self.obj = self.obj.loc[:,cols]
@@ -902,7 +905,8 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,
         # and make sure sure cols is just a list of labels
         cols = self.obj.columns
         if isinstance(cols,Index):
-            cols = cols.to_native_types(na_rep=na_rep,float_format=float_format)
+            cols = cols.to_native_types(na_rep=na_rep,float_format=float_format,
+                                        date_format=date_format)
         else:
             cols=list(cols)
 
@@ -923,6 +927,9 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,
         if isinstance(obj.index, PeriodIndex):
             self.data_index = obj.index.to_timestamp()
 
+        if isinstance(self.data_index, DatetimeIndex) and date_format is not None:
+            self.data_index = Index([x.strftime(date_format) if notnull(x) else '' for x in self.data_index])
+
         self.nlevels = getattr(self.data_index, 'nlevels', 1)
         if not index:
             self.nlevels = 0
@@ -931,15 +938,10 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,
     # invoked by df.to_csv(engine=python)
     def _helper_csv(self, writer, na_rep=None, cols=None,
                     header=True, index=True,
-                    index_label=None, float_format=None):
+                    index_label=None, float_format=None, date_format=None):
         if cols is None:
             cols = self.columns
 
-        series = {}
-        for k, v in compat.iteritems(self.obj._series):
-            series[k] = v.values
-
-
         has_aliases = isinstance(header, (tuple, list, np.ndarray))
         if has_aliases or header:
             if index:
@@ -981,10 +983,34 @@ def _helper_csv(self, writer, na_rep=None, cols=None,
                 encoded_cols = list(cols)
                 writer.writerow(encoded_cols)
 
+        if date_format is None:
+            date_formatter = lambda x: lib.Timestamp(x)._repr_base
+        else:
+            def strftime_with_nulls(x):
+                x = lib.Timestamp(x)
+                if notnull(x):
+                    return x.strftime(date_format)
+
+            date_formatter = lambda x: strftime_with_nulls(x)
+
         data_index = self.obj.index
+
         if isinstance(self.obj.index, PeriodIndex):
             data_index = self.obj.index.to_timestamp()
 
+        if isinstance(data_index, DatetimeIndex) and date_format is not None:
+            data_index = Index([date_formatter(x) for x in data_index])
+
+        values = self.obj.copy()
+        values.index = data_index
+        values.columns = values.columns.to_native_types(na_rep=na_rep,float_format=float_format,
+                                            date_format=date_format)
+        values = values[cols]
+
+        series = {}
+        for k, v in compat.iteritems(values._series):
+            series[k] = v.values
+
         nlevels = getattr(data_index, 'nlevels', 1)
         for j, idx in enumerate(data_index):
             row_fields = []
@@ -1000,8 +1026,8 @@ def _helper_csv(self, writer, na_rep=None, cols=None,
 
                 if float_format is not None and com.is_float(val):
                     val = float_format % val
-                elif isinstance(val, np.datetime64):
-                    val = lib.Timestamp(val)._repr_base
+                elif isinstance(val, (np.datetime64, lib.Timestamp)):
+                    val = date_formatter(val)
 
                 row_fields.append(val)
 
@@ -1031,7 +1057,7 @@ def save(self):
                 self._helper_csv(self.writer, na_rep=self.na_rep,
                                  float_format=self.float_format, cols=self.cols,
                                  header=self.header, index=self.index,
-                                 index_label=self.index_label)
+                                 index_label=self.index_label, date_format=self.date_format)
 
             else:
                 self._save()
@@ -1150,13 +1176,16 @@ def _save_chunk(self, start_i, end_i):
         slicer = slice(start_i,end_i)
         for i in range(len(self.blocks)):
             b = self.blocks[i]
-            d = b.to_native_types(slicer=slicer, na_rep=self.na_rep, float_format=self.float_format)
+            d = b.to_native_types(slicer=slicer, na_rep=self.na_rep,
+                                  float_format=self.float_format, date_format=self.date_format)
+
             for i, item in enumerate(b.items):
 
                 # self.data is a preallocated list
                 self.data[self.column_map[b][i]] = d[i]
 
-        ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep, float_format=self.float_format)
+        ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep,
+                                        float_format=self.float_format, date_format=self.date_format)
 
         lib.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer)
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -1030,7 +1030,7 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
                cols=None, header=True, index=True, index_label=None,
                mode='w', nanRep=None, encoding=None, quoting=None,
                line_terminator='\n', chunksize=None,
-               tupleize_cols=False, **kwds):
+               tupleize_cols=False, date_format=None, **kwds):
         r"""Write DataFrame to a comma-separated values (csv) file
 
         Parameters
@@ -1073,6 +1073,8 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
         tupleize_cols : boolean, default False
             write multi_index columns as a list of tuples (if True)
             or new (expanded format) if False)
+        date_format : string, default None
+            Format string for datetime objects.
         """
         if nanRep is not None:  # pragma: no cover
             warnings.warn("nanRep is deprecated, use na_rep",
@@ -1088,7 +1090,8 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
                                      index_label=index_label, mode=mode,
                                      chunksize=chunksize, engine=kwds.get(
                                          "engine"),
-                                     tupleize_cols=tupleize_cols)
+                                     tupleize_cols=tupleize_cols,
+                                     date_format=date_format)
         formatter.save()
 
     def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -22,7 +22,7 @@
 
 from pandas.tslib import Timestamp
 from pandas import compat
-from pandas.compat import range, lrange, lmap, callable, map, zip
+from pandas.compat import range, lrange, lmap, callable, map, zip, u
 from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type
 
 class Block(PandasObject):
@@ -1396,7 +1396,7 @@ def fillna(self, value, inplace=False, downcast=None):
         return [self if inplace else make_block(values, self.items,
                                                 self.ref_items, fastpath=True)]
 
-    def to_native_types(self, slicer=None, na_rep=None, **kwargs):
+    def to_native_types(self, slicer=None, na_rep=None, date_format=None, **kwargs):
         """ convert to our native types format, slicing if desired """
 
         values = self.values
@@ -1409,8 +1409,14 @@ def to_native_types(self, slicer=None, na_rep=None, **kwargs):
             na_rep = 'NaT'
         rvalues[mask] = na_rep
         imask = (-mask).ravel()
-        rvalues.flat[imask] = np.array(
-            [Timestamp(val)._repr_base for val in values.ravel()[imask]], dtype=object)
+
+        if date_format is None:
+            date_formatter = lambda x: Timestamp(x)._repr_base
+        else:
+            date_formatter = lambda x: Timestamp(x).strftime(date_format)
+
+        rvalues.flat[imask] = np.array([date_formatter(val) for val in
+                                        values.ravel()[imask]], dtype=object)
 
         return rvalues.tolist()
 
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -2129,7 +2129,8 @@ def from_csv(cls, path, sep=',', parse_dates=True, header=None,
 
     def to_csv(self, path, index=True, sep=",", na_rep='',
                float_format=None, header=False,
-               index_label=None, mode='w', nanRep=None, encoding=None):
+               index_label=None, mode='w', nanRep=None, encoding=None,
+               date_format=None):
         """
         Write Series to a comma-separated values (csv) file
 
@@ -2154,13 +2155,15 @@ def to_csv(self, path, index=True, sep=",", na_rep='',
         encoding : string, optional
             a string representing the encoding to use if the contents are
             non-ascii, for python versions prior to 3
+        date_format: string, default None
+            Format string for datetime objects.
         """
         from pandas.core.frame import DataFrame
         df = DataFrame(self)
         df.to_csv(path, index=index, sep=sep, na_rep=na_rep,
                   float_format=float_format, header=header,
                   index_label=index_label, mode=mode, nanRep=nanRep,
-                  encoding=encoding)
+                  encoding=encoding, date_format=date_format)
 
     def dropna(self):
         """
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -11407,6 +11407,53 @@ def test_isin_with_string_scalar(self):
         with tm.assertRaises(TypeError):
             df.isin('aaa')
 
+    def test_to_csv_date_format(self):
+        from pandas import to_datetime
+        pname = '__tmp_to_csv_date_format__'
+        with ensure_clean(pname) as path:
+            for engine in [None, 'python']:
+                dt_index = self.tsframe.index
+                datetime_frame = DataFrame({'A': dt_index, 'B': dt_index.shift(1)}, index=dt_index)
+
+                datetime_frame.to_csv(path, date_format='%Y%m%d', engine=engine)
+                # Check that the data was put in the specified format
+                test = read_csv(path, index_col=0)
+
+                datetime_frame_int = datetime_frame.applymap(lambda x: int(x.strftime('%Y%m%d')))
+                datetime_frame_int.index = datetime_frame_int.index.map(lambda x: int(x.strftime('%Y%m%d')))
+
+                assert_frame_equal(test, datetime_frame_int)
+
+                datetime_frame.to_csv(path, date_format='%Y-%m-%d', engine=engine)
+                # Check that the data was put in the specified format
+                test = read_csv(path, index_col=0)
+                datetime_frame_str = datetime_frame.applymap(lambda x: x.strftime('%Y-%m-%d'))
+                datetime_frame_str.index = datetime_frame_str.index.map(lambda x: x.strftime('%Y-%m-%d'))
+
+                assert_frame_equal(test, datetime_frame_str)
+
+                # Check that columns get converted
+                datetime_frame_columns = datetime_frame.T
+
+                datetime_frame_columns.to_csv(path, date_format='%Y%m%d', engine=engine)
+
+                test = read_csv(path, index_col=0)
+
+                datetime_frame_columns = datetime_frame_columns.applymap(lambda x: int(x.strftime('%Y%m%d')))
+                # Columns don't get converted to ints by read_csv
+                datetime_frame_columns.columns = datetime_frame_columns.columns.map(lambda x: x.strftime('%Y%m%d'))
+
+                assert_frame_equal(test, datetime_frame_columns)
+
+                # test NaTs
+                nat_index = to_datetime(['NaT'] * 10 + ['2000-01-01', '1/1/2000', '1-1-2000'])
+                nat_frame = DataFrame({'A': nat_index}, index=nat_index)
+
+                nat_frame.to_csv(path, date_format='%Y-%m-%d', engine=engine)
+
+                test = read_csv(path, parse_dates=[0, 1], index_col=0)
+
+                assert_frame_equal(test, nat_frame)
 
 def skip_if_no_ne(engine='numexpr'):
     if engine == 'numexpr':
diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py
@@ -7,7 +7,8 @@
 import numpy as np
 
 from pandas.core.common import (isnull, _NS_DTYPE, _INT64_DTYPE,
-                                is_list_like,_values_from_object, _maybe_box)
+                                is_list_like,_values_from_object, _maybe_box,
+                                notnull)
 from pandas.core.index import Index, Int64Index, _Identity
 import pandas.compat as compat
 from pandas.compat import u
@@ -599,23 +600,29 @@ def __contains__(self, key):
     def _format_with_header(self, header, **kwargs):
         return header + self._format_native_types(**kwargs)
 
-    def _format_native_types(self, na_rep=u('NaT'), **kwargs):
+    def _format_native_types(self, na_rep=u('NaT'), date_format=None, **kwargs):
         data = list(self)
 
         # tz formatter or time formatter
         zero_time = time(0, 0)
-        for d in data:
-            if d.time() != zero_time or d.tzinfo is not None:
-                return [u('%s') % x for x in data]
+        if date_format is None:
+            for d in data:
+                if d.time() != zero_time or d.tzinfo is not None:
+                    return [u('%s') % x for x in data]
 
         values = np.array(data, dtype=object)
         mask = isnull(self.values)
         values[mask] = na_rep
 
         imask = -mask
-        values[imask] = np.array([u('%d-%.2d-%.2d') % (dt.year, dt.month,
-                                                       dt.day)
-                                  for dt in values[imask]])
+
+        if date_format is None:
+            date_formatter = lambda x: u('%d-%.2d-%.2d' % (x.year, x.month, x.day))
+        else:
+            date_formatter = lambda x: u(x.strftime(date_format))
+
+        values[imask] = np.array([date_formatter(dt) for dt in values[imask]])
+
         return values.tolist()
 
     def isin(self, values):
diff --git a/vb_suite/io_bench.py b/vb_suite/io_bench.py
@@ -88,3 +88,13 @@ def create_cols(name):
         "         parse_dates=['foo'])")
 read_parse_dates_iso8601 = Benchmark(stmt, setup,
                                      start_date=datetime(2012, 3, 1))
+
+setup = common_setup + """
+rng = date_range('1/1/2000', periods=1000)
+data = DataFrame(rng, index=rng)
+"""
+
+stmt = ("data.to_csv('__test__.csv', date_format='%Y%m%d')")
+
+frame_to_csv_date_formatting = Benchmark(stmt, setup,
+                                     start_date=datetime(2013, 9, 1))