Skip to content

Commit cb5fc60

Browse files
committed
BUG: Fixed incorrect string length calculation when writing strings to Stata
Strings were incorrectly written using 244 characters irrespective of the actual length of the underlying string, due to changes in pandas where the underlying NumPy datatype of strings is always np.object_, and never np.string_. Closes #7858. String types were also not being checked for excessive length, and DataFrames with strings containing more than 244 characters were producing invalid dta files. Attempting to write long strings now raises an error.
1 parent 0621f9f commit cb5fc60

File tree

4 files changed

+62
-4
lines changed

4 files changed

+62
-4
lines changed

doc/source/io.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3529,6 +3529,13 @@ outside of this range, the data is cast to ``int16``.
35293529
Conversion from ``int64`` to ``float64`` may result in a loss of precision
35303530
if ``int64`` values are larger than 2**53.
35313531

3532+
.. warning::
3533+
:class:`~pandas.io.stata.StataWriter` and
3534+
:func:`~pandas.core.frame.DataFrame.to_stata` only support fixed width
3535+
strings containing up to 244 characters, a limitation imposed by the version
3536+
115 dta file format. Attempting to write *Stata* dta files with strings
3537+
longer than 244 characters raises a ``ValueError``.
3538+
35323539

35333540
.. _io.stata_reader:
35343541

doc/source/v0.15.0.txt

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,11 @@ API changes
119119
- The ``infer_types`` argument to :func:`~pandas.io.html.read_html` now has no
120120
effect (:issue:`7762`, :issue:`7032`).
121121

122+
- ``DataFrame.to_stata`` and ``StataWriter`` check string length for
123+
compatibility with limitations imposed in dta files where fixed-width
124+
strings must contain 244 or fewer characters. Attempting to write Stata
125+
dta files with strings longer than 244 characters raises a ``ValueError``. (:issue:`7858`)
126+
122127

123128
.. _whatsnew_0150.cat:
124129

@@ -312,7 +317,7 @@ Bug Fixes
312317

313318
- Bug in ``DataFrame.plot`` with ``subplots=True`` may draw unnecessary minor xticks and yticks (:issue:`7801`)
314319
- Bug in ``StataReader`` which did not read variable labels in 117 files due to difference between Stata documentation and implementation (:issue:`7816`)
315-
320+
- Bug in ``StataWriter`` where strings were always written as 244-character fixed-width strings irrespective of the underlying string size (:issue:`7858`)
316321
- Bug in ``expanding_cov``, ``expanding_corr``, ``rolling_cov``, ``rolling_corr``, ``ewmcov``, and ``ewmcorr``
317322
returning results with columns sorted by name and producing an error for non-unique columns;
318323
now handles non-unique columns and returns columns in original order

pandas/io/stata.py

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from pandas.compat import long, lrange, lmap, lzip, text_type, string_types
2424
from pandas import isnull
2525
from pandas.io.common import get_filepath_or_buffer
26+
from pandas.lib import max_len_string_array, is_string_array
2627
from pandas.tslib import NaT
2728

2829
def read_stata(filepath_or_buffer, convert_dates=True,
@@ -181,6 +182,11 @@ def _datetime_to_stata_elapsed(date, fmt):
181182
raise ValueError("fmt %s not understood" % fmt)
182183

183184

185+
excessive_string_length_error = """
186+
Fixed width strings in Stata .dta files are limited to 244 (or fewer) characters.
187+
Column '%s' does not satisfy this restriction.
188+
"""
189+
184190
class PossiblePrecisionLoss(Warning):
185191
pass
186192

@@ -1040,12 +1046,14 @@ def _dtype_to_stata_type(dtype):
10401046
"Please report an error to the developers." % dtype)
10411047

10421048

1043-
def _dtype_to_default_stata_fmt(dtype):
1049+
def _dtype_to_default_stata_fmt(dtype, column):
10441050
"""
10451051
Maps numpy dtype to stata's default format for this type. Not terribly
10461052
important since users can change this in Stata. Semantics are
10471053
10481054
string -> "%DDs" where DD is the length of the string
1055+
object -> "%DDs" where DD is the length of the string, if a string, or 244
1056+
for anything that cannot be converted to a string.
10491057
float64 -> "%10.0g"
10501058
float32 -> "%9.0g"
10511059
int64 -> "%9.0g"
@@ -1055,9 +1063,21 @@ def _dtype_to_default_stata_fmt(dtype):
10551063
"""
10561064
#TODO: expand this to handle a default datetime format?
10571065
if dtype.type == np.string_:
1066+
if max_len_string_array(column.values) > 244:
1067+
raise ValueError(excessive_string_length_error % column.name)
1068+
10581069
return "%" + str(dtype.itemsize) + "s"
10591070
elif dtype.type == np.object_:
1060-
return "%244s"
1071+
try:
1072+
# Try to use optimal size if available
1073+
itemsize = max_len_string_array(column.values)
1074+
except:
1075+
# Default size
1076+
itemsize = 244
1077+
if itemsize > 244:
1078+
raise ValueError(excessive_string_length_error % column.name)
1079+
1080+
return "%" + str(itemsize) + "s"
10611081
elif dtype == np.float64:
10621082
return "%10.0g"
10631083
elif dtype == np.float32:
@@ -1264,7 +1284,9 @@ def __iter__(self):
12641284
)
12651285
dtypes[key] = np.dtype(new_type)
12661286
self.typlist = [_dtype_to_stata_type(dt) for dt in dtypes]
1267-
self.fmtlist = [_dtype_to_default_stata_fmt(dt) for dt in dtypes]
1287+
self.fmtlist = []
1288+
for col, dtype in dtypes.iteritems():
1289+
self.fmtlist.append(_dtype_to_default_stata_fmt(dtype, data[col]))
12681290
# set the given format for the datetime cols
12691291
if self._convert_dates is not None:
12701292
for key in self._convert_dates:

pandas/io/tests/test_stata.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -565,6 +565,30 @@ def test_variable_labels(self):
565565
self.assertTrue(k in keys)
566566
self.assertTrue(v in labels)
567567

568+
def test_minimal_size_col(self):
569+
str_lens = (1, 100, 244)
570+
s = {}
571+
for str_len in str_lens:
572+
s['s' + str(str_len)] = Series(['a' * str_len, 'b' * str_len, 'c' * str_len])
573+
original = DataFrame(s)
574+
with tm.ensure_clean() as path:
575+
original.to_stata(path, write_index=False)
576+
sr = StataReader(path)
577+
variables = sr.varlist
578+
formats = sr.fmtlist
579+
for variable, fmt in zip(variables, formats):
580+
self.assertTrue(int(variable[1:]) == int(fmt[1:-1]))
581+
582+
def test_excessively_long_string(self):
583+
str_lens = (1, 244, 500)
584+
s = {}
585+
for str_len in str_lens:
586+
s['s' + str(str_len)] = Series(['a' * str_len, 'b' * str_len, 'c' * str_len])
587+
original = DataFrame(s)
588+
with tm.assertRaises(ValueError):
589+
with tm.ensure_clean() as path:
590+
original.to_stata(path)
591+
568592

569593
if __name__ == '__main__':
570594
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],

0 commit comments

Comments
 (0)