BUG: Fixed incorrect string length calculation when writing strings to Stata

bashtage · bashtage · commit cb5fc6023953 · 2014-07-31T10:04:23.000-04:00
Strings were incorrectly written using 244 characters irrespective of the actual length of the underlying string, due to changes in pandas where the underlying NumPy datatype of strings is always np.object_, and never np.string_. Closes #7858. String types were also not being checked for excessive length, and DataFrames with strings containing more than 244 characters were producing invalid dta files. Attempting to write long strings now raises an error.
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -3529,6 +3529,13 @@ outside of this range, the data is cast to ``int16``.
    Conversion from ``int64`` to ``float64`` may result in a loss of precision
    if ``int64`` values are larger than 2**53.
 
+.. warning::
+  :class:`~pandas.io.stata.StataWriter` and
+  :func:`~pandas.core.frame.DataFrame.to_stata` only support fixed width
+  strings containing up to 244 characters, a limitation imposed by the version
+  115 dta file format. Attempting to write *Stata* dta files with strings
+  longer than 244 characters raises a ``ValueError``.
+
 
 .. _io.stata_reader:
 
diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt
@@ -119,6 +119,11 @@ API changes
 - The ``infer_types`` argument to :func:`~pandas.io.html.read_html` now has no
   effect (:issue:`7762`, :issue:`7032`).
 
+- ``DataFrame.to_stata`` and ``StataWriter`` check string length for
+  compatibility with limitations imposed in dta files where fixed-width
+  strings must contain 244 or fewer characters.  Attempting to write Stata
+  dta files with strings longer than 244 characters raises a ``ValueError``. (:issue:`7858`)
+
 
 .. _whatsnew_0150.cat:
 
@@ -312,7 +317,7 @@ Bug Fixes
 
 - Bug in ``DataFrame.plot`` with ``subplots=True`` may draw unnecessary minor xticks and yticks (:issue:`7801`)
 - Bug in ``StataReader`` which did not read variable labels in 117 files due to difference between Stata documentation and implementation (:issue:`7816`)
-
+- Bug in ``StataWriter`` where strings were always converted to 244-character fixed-width strings irrespective of underlying string size (:issue:`7858`)
 - Bug in ``expanding_cov``, ``expanding_corr``, ``rolling_cov``, ``rolling_cov``, ``ewmcov``, and ``ewmcorr``
   returning results with columns sorted by name and producing an error for non-unique columns;
   now handles non-unique columns and returns columns in original order
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -23,6 +23,7 @@
 from pandas.compat import long, lrange, lmap, lzip, text_type, string_types
 from pandas import isnull
 from pandas.io.common import get_filepath_or_buffer
+from pandas.lib import max_len_string_array, is_string_array
 from pandas.tslib import NaT
 
 def read_stata(filepath_or_buffer, convert_dates=True,
@@ -181,6 +182,11 @@ def _datetime_to_stata_elapsed(date, fmt):
         raise ValueError("fmt %s not understood" % fmt)
 
 
+excessive_string_length_error = """
+Fixed width strings in Stata .dta files are limited to 244 (or fewer) characters.
+Column '%s' does not satisfy this restriction.
+"""
+
 class PossiblePrecisionLoss(Warning):
     pass
 
@@ -1040,12 +1046,14 @@ def _dtype_to_stata_type(dtype):
                          "Please report an error to the developers." % dtype)
 
 
-def _dtype_to_default_stata_fmt(dtype):
+def _dtype_to_default_stata_fmt(dtype, column):
     """
     Maps numpy dtype to stata's default format for this type. Not terribly
     important since users can change this in Stata. Semantics are
 
     string  -> "%DDs" where DD is the length of the string
+    object  -> "%DDs" where DD is the length of the string, if a string, or 244
+                for anything that cannot be converted to a string.
     float64 -> "%10.0g"
     float32 -> "%9.0g"
     int64   -> "%9.0g"
@@ -1055,9 +1063,21 @@ def _dtype_to_default_stata_fmt(dtype):
     """
     #TODO: expand this to handle a default datetime format?
     if dtype.type == np.string_:
+        if max_len_string_array(column.values) > 244:
+            raise ValueError(excessive_string_length_error % column.name)
+
         return "%" + str(dtype.itemsize) + "s"
     elif dtype.type == np.object_:
-        return "%244s"
+        try:
+            # Try to use optimal size if available
+            itemsize = max_len_string_array(column.values)
+        except:
+            # Default size
+            itemsize = 244
+        if itemsize > 244:
+            raise ValueError(excessive_string_length_error % column.name)
+
+        return "%" + str(itemsize) + "s"
     elif dtype == np.float64:
         return "%10.0g"
     elif dtype == np.float32:
@@ -1264,7 +1284,9 @@ def __iter__(self):
                 )
                 dtypes[key] = np.dtype(new_type)
         self.typlist = [_dtype_to_stata_type(dt) for dt in dtypes]
-        self.fmtlist = [_dtype_to_default_stata_fmt(dt) for dt in dtypes]
+        self.fmtlist = []
+        for col, dtype in dtypes.iteritems():
+            self.fmtlist.append(_dtype_to_default_stata_fmt(dtype, data[col]))
         # set the given format for the datetime cols
         if self._convert_dates is not None:
             for key in self._convert_dates:
diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py
@@ -565,6 +565,30 @@ def test_variable_labels(self):
             self.assertTrue(k in keys)
             self.assertTrue(v in labels)
 
+    def test_minimal_size_col(self):
+        str_lens = (1, 100, 244)
+        s = {}
+        for str_len in str_lens:
+            s['s' + str(str_len)] = Series(['a' * str_len, 'b' * str_len, 'c' * str_len])
+        original = DataFrame(s)
+        with tm.ensure_clean() as path:
+            original.to_stata(path, write_index=False)
+            sr = StataReader(path)
+            variables = sr.varlist
+            formats = sr.fmtlist
+            for variable, fmt in zip(variables, formats):
+                self.assertTrue(int(variable[1:]) == int(fmt[1:-1]))
+
+    def test_excessively_long_string(self):
+        str_lens = (1, 244, 500)
+        s = {}
+        for str_len in str_lens:
+            s['s' + str(str_len)] = Series(['a' * str_len, 'b' * str_len, 'c' * str_len])
+        original = DataFrame(s)
+        with tm.assertRaises(ValueError):
+            with tm.ensure_clean() as path:
+                original.to_stata(path)
+
 
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],