Merge pull request #7862 from bashtage/stata-minimal-width-strings

jreback · jreback · commit 9f4264000c5e · 2014-08-01T09:30:09.000-04:00
BUG: Fixed incorrect string length calculation when writing strings in Stata
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -3529,6 +3529,13 @@ outside of this range, the data is cast to ``int16``.
    Conversion from ``int64`` to ``float64`` may result in a loss of precision
    if ``int64`` values are larger than 2**53.
 
+.. warning::
+  :class:`~pandas.io.stata.StataWriter` and
+  :func:`~pandas.core.frame.DataFrame.to_stata` only support fixed width
+  strings containing up to 244 characters, a limitation imposed by the version
+  115 dta file format. Attempting to write *Stata* dta files with strings
+  longer than 244 characters raises a ``ValueError``.
+
 
 .. _io.stata_reader:
 
diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt
@@ -119,6 +119,11 @@ API changes
 - The ``infer_types`` argument to :func:`~pandas.io.html.read_html` now has no
   effect (:issue:`7762`, :issue:`7032`).
 
+- ``DataFrame.to_stata`` and ``StataWriter`` check string length for
+  compatibility with limitations imposed in dta files where fixed-width
+  strings must contain 244 or fewer characters.  Attempting to write Stata
+  dta files with strings longer than 244 characters raises a ``ValueError``. (:issue:`7858`)
+
 
 .. _whatsnew_0150.cat:
 
@@ -312,7 +317,7 @@ Bug Fixes
 
 - Bug in ``DataFrame.plot`` with ``subplots=True`` may draw unnecessary minor xticks and yticks (:issue:`7801`)
 - Bug in ``StataReader`` which did not read variable labels in 117 files due to difference between Stata documentation and implementation (:issue:`7816`)
-
+- Bug in ``StataWriter`` where strings were always written as 244-character fixed-width fields irrespective of the underlying string size (:issue:`7858`)
 - Bug in ``expanding_cov``, ``expanding_corr``, ``rolling_cov``, ``rolling_cov``, ``ewmcov``, and ``ewmcorr``
   returning results with columns sorted by name and producing an error for non-unique columns;
   now handles non-unique columns and returns columns in original order
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -23,6 +23,7 @@
 from pandas.compat import long, lrange, lmap, lzip, text_type, string_types
 from pandas import isnull
 from pandas.io.common import get_filepath_or_buffer
+from pandas.lib import max_len_string_array, is_string_array
 from pandas.tslib import NaT
 
 def read_stata(filepath_or_buffer, convert_dates=True,
@@ -181,6 +182,11 @@ def _datetime_to_stata_elapsed(date, fmt):
         raise ValueError("fmt %s not understood" % fmt)
 
 
+excessive_string_length_error = """
+Fixed width strings in Stata .dta files are limited to 244 (or fewer) characters.
+Column '%s' does not satisfy this restriction.
+"""
+
 class PossiblePrecisionLoss(Warning):
     pass
 
@@ -1040,12 +1046,14 @@ def _dtype_to_stata_type(dtype):
                          "Please report an error to the developers." % dtype)
 
 
-def _dtype_to_default_stata_fmt(dtype):
+def _dtype_to_default_stata_fmt(dtype, column):
     """
     Maps numpy dtype to stata's default format for this type. Not terribly
     important since users can change this in Stata. Semantics are
 
     string  -> "%DDs" where DD is the length of the string
+    object  -> "%DDs" where DD is the length of the string, if a string, or 244
+                for anything that cannot be converted to a string.
     float64 -> "%10.0g"
     float32 -> "%9.0g"
     int64   -> "%9.0g"
@@ -1055,9 +1063,21 @@ def _dtype_to_default_stata_fmt(dtype):
     """
     #TODO: expand this to handle a default datetime format?
     if dtype.type == np.string_:
+        if max_len_string_array(column.values) > 244:
+            raise ValueError(excessive_string_length_error % column.name)
+
         return "%" + str(dtype.itemsize) + "s"
     elif dtype.type == np.object_:
-        return "%244s"
+        try:
+            # Try to use optimal size if available
+            itemsize = max_len_string_array(column.values)
+        except:
+            # Default size
+            itemsize = 244
+        if itemsize > 244:
+            raise ValueError(excessive_string_length_error % column.name)
+
+        return "%" + str(itemsize) + "s"
     elif dtype == np.float64:
         return "%10.0g"
     elif dtype == np.float32:
@@ -1264,7 +1284,9 @@ def __iter__(self):
                 )
                 dtypes[key] = np.dtype(new_type)
         self.typlist = [_dtype_to_stata_type(dt) for dt in dtypes]
-        self.fmtlist = [_dtype_to_default_stata_fmt(dt) for dt in dtypes]
+        self.fmtlist = []
+        for col, dtype in dtypes.iteritems():
+            self.fmtlist.append(_dtype_to_default_stata_fmt(dtype, data[col]))
         # set the given format for the datetime cols
         if self._convert_dates is not None:
             for key in self._convert_dates:
diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py
@@ -565,6 +565,30 @@ def test_variable_labels(self):
             self.assertTrue(k in keys)
             self.assertTrue(v in labels)
 
+    def test_minimal_size_col(self):
+        str_lens = (1, 100, 244)
+        s = {}
+        for str_len in str_lens:
+            s['s' + str(str_len)] = Series(['a' * str_len, 'b' * str_len, 'c' * str_len])
+        original = DataFrame(s)
+        with tm.ensure_clean() as path:
+            original.to_stata(path, write_index=False)
+            sr = StataReader(path)
+            variables = sr.varlist
+            formats = sr.fmtlist
+            for variable, fmt in zip(variables, formats):
+                self.assertTrue(int(variable[1:]) == int(fmt[1:-1]))
+
+    def test_excessively_long_string(self):
+        str_lens = (1, 244, 500)
+        s = {}
+        for str_len in str_lens:
+            s['s' + str(str_len)] = Series(['a' * str_len, 'b' * str_len, 'c' * str_len])
+        original = DataFrame(s)
+        with tm.assertRaises(ValueError):
+            with tm.ensure_clean() as path:
+                original.to_stata(path)
+
 
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],