Skip to content

Commit 9f42640

Browse files
committed
Merge pull request #7862 from bashtage/stata-minimal-width-strings
BUG: Fixed incorrect string length calculation when writing strings in Stata
2 parents a921117 + cb5fc60 commit 9f42640

File tree

4 files changed

+62
-4
lines changed

4 files changed

+62
-4
lines changed

doc/source/io.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3529,6 +3529,13 @@ outside of this range, the data is cast to ``int16``.
35293529
Conversion from ``int64`` to ``float64`` may result in a loss of precision
35303530
if ``int64`` values are larger than 2**53.
35313531

3532+
.. warning::
3533+
:class:`~pandas.io.stata.StataWriter` and
3534+
:func:`~pandas.core.frame.DataFrame.to_stata` only support fixed width
3535+
strings containing up to 244 characters, a limitation imposed by the version
3536+
115 dta file format. Attempting to write *Stata* dta files with strings
3537+
longer than 244 characters raises a ``ValueError``.
3538+
35323539

35333540
.. _io.stata_reader:
35343541

doc/source/v0.15.0.txt

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,11 @@ API changes
119119
- The ``infer_types`` argument to :func:`~pandas.io.html.read_html` now has no
120120
effect (:issue:`7762`, :issue:`7032`).
121121

122+
- ``DataFrame.to_stata`` and ``StataWriter`` check string length for
123+
compatibility with limitations imposed in dta files where fixed-width
124+
strings must contain 244 or fewer characters. Attempting to write Stata
125+
dta files with strings longer than 244 characters raises a ``ValueError``. (:issue:`7858`)
126+
122127

123128
.. _whatsnew_0150.cat:
124129

@@ -312,7 +317,7 @@ Bug Fixes
312317

313318
- Bug in ``DataFrame.plot`` with ``subplots=True`` may draw unnecessary minor xticks and yticks (:issue:`7801`)
314319
- Bug in ``StataReader`` which did not read variable labels in 117 files due to difference between Stata documentation and implementation (:issue:`7816`)
315-
320+
- Bug in ``StataWriter`` where strings were always written as 244-character fixed-width strings irrespective of the underlying string size (:issue:`7858`)
316321
- Bug in ``expanding_cov``, ``expanding_corr``, ``rolling_cov``, ``rolling_corr``, ``ewmcov``, and ``ewmcorr``
317322
returning results with columns sorted by name and producing an error for non-unique columns;
318323
now handles non-unique columns and returns columns in original order

pandas/io/stata.py

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from pandas.compat import long, lrange, lmap, lzip, text_type, string_types
2424
from pandas import isnull
2525
from pandas.io.common import get_filepath_or_buffer
26+
from pandas.lib import max_len_string_array, is_string_array
2627
from pandas.tslib import NaT
2728

2829
def read_stata(filepath_or_buffer, convert_dates=True,
@@ -181,6 +182,11 @@ def _datetime_to_stata_elapsed(date, fmt):
181182
raise ValueError("fmt %s not understood" % fmt)
182183

183184

185+
excessive_string_length_error = """
186+
Fixed width strings in Stata .dta files are limited to 244 (or fewer) characters.
187+
Column '%s' does not satisfy this restriction.
188+
"""
189+
184190
class PossiblePrecisionLoss(Warning):
185191
pass
186192

@@ -1040,12 +1046,14 @@ def _dtype_to_stata_type(dtype):
10401046
"Please report an error to the developers." % dtype)
10411047

10421048

1043-
def _dtype_to_default_stata_fmt(dtype):
1049+
def _dtype_to_default_stata_fmt(dtype, column):
10441050
"""
10451051
Maps numpy dtype to stata's default format for this type. Not terribly
10461052
important since users can change this in Stata. Semantics are
10471053
10481054
string -> "%DDs" where DD is the length of the string
1055+
object -> "%DDs" where DD is the length of the string, if a string, or 244
1056+
for anything that cannot be converted to a string.
10491057
float64 -> "%10.0g"
10501058
float32 -> "%9.0g"
10511059
int64 -> "%9.0g"
@@ -1055,9 +1063,21 @@ def _dtype_to_default_stata_fmt(dtype):
10551063
"""
10561064
#TODO: expand this to handle a default datetime format?
10571065
if dtype.type == np.string_:
1066+
if max_len_string_array(column.values) > 244:
1067+
raise ValueError(excessive_string_length_error % column.name)
1068+
10581069
return "%" + str(dtype.itemsize) + "s"
10591070
elif dtype.type == np.object_:
1060-
return "%244s"
1071+
try:
1072+
# Try to use optimal size if available
1073+
itemsize = max_len_string_array(column.values)
1074+
except:
1075+
# Default size
1076+
itemsize = 244
1077+
if itemsize > 244:
1078+
raise ValueError(excessive_string_length_error % column.name)
1079+
1080+
return "%" + str(itemsize) + "s"
10611081
elif dtype == np.float64:
10621082
return "%10.0g"
10631083
elif dtype == np.float32:
@@ -1264,7 +1284,9 @@ def __iter__(self):
12641284
)
12651285
dtypes[key] = np.dtype(new_type)
12661286
self.typlist = [_dtype_to_stata_type(dt) for dt in dtypes]
1267-
self.fmtlist = [_dtype_to_default_stata_fmt(dt) for dt in dtypes]
1287+
self.fmtlist = []
1288+
for col, dtype in dtypes.iteritems():
1289+
self.fmtlist.append(_dtype_to_default_stata_fmt(dtype, data[col]))
12681290
# set the given format for the datetime cols
12691291
if self._convert_dates is not None:
12701292
for key in self._convert_dates:

pandas/io/tests/test_stata.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -565,6 +565,30 @@ def test_variable_labels(self):
565565
self.assertTrue(k in keys)
566566
self.assertTrue(v in labels)
567567

568+
def test_minimal_size_col(self):
    """Check that string columns are written with their minimal width.

    Every column is named ``s<N>`` and holds three strings of exactly
    ``N`` characters.  After writing the frame and re-opening the file,
    the format read back for each column must be ``%<N>s``, i.e. the
    width embedded in the format has to equal the ``N`` in the name.
    """
    str_lens = (1, 100, 244)
    columns = {}
    for str_len in str_lens:
        columns['s' + str(str_len)] = Series(
            [letter * str_len for letter in ('a', 'b', 'c')])
    original = DataFrame(columns)
    with tm.ensure_clean() as path:
        original.to_stata(path, write_index=False)
        reader = StataReader(path)
        for variable, fmt in zip(reader.varlist, reader.fmtlist):
            # variable is 's<N>' and fmt is '%<N>s': strip the one-char
            # prefix (and the trailing 's') and compare the numbers.
            # assertEqual reports both values when they differ, unlike
            # assertTrue(a == b).
            self.assertEqual(int(variable[1:]), int(fmt[1:-1]))
581+
582+
def test_excessively_long_string(self):
    """Writing a frame that has a 500-character string column must fail.

    The frame holds columns of 1-, 244- and 500-character strings; the
    call to ``to_stata`` is expected to raise ``ValueError``.
    """
    widths = (1, 244, 500)
    original = DataFrame(dict(
        ('s' + str(width), Series(['a' * width, 'b' * width, 'c' * width]))
        for width in widths))
    with tm.assertRaises(ValueError):
        with tm.ensure_clean() as path:
            original.to_stata(path)
591+
568592

569593
if __name__ == '__main__':
570594
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],

0 commit comments

Comments
 (0)