Skip to content

Commit 0cff907

Browse files
committed
Merge pull request #7818 from bashtage/stata-117-variable-labels
BUG: Fixed failure in StataReader when reading variable labels in 117
2 parents d109ab0 + 6265450 commit 0cff907

File tree

5 files changed

+26
-4
lines changed

5 files changed

+26
-4
lines changed

doc/source/v0.15.0.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -233,7 +233,7 @@ Bug Fixes
233233

234234

235235
- Bug in ``DataFrame.plot`` with ``subplots=True`` may draw unnecessary minor xticks and yticks (:issue:`7801`)
236-
236+
- Bug in ``StataReader`` where variable labels were not read from 117 files due to a difference between the Stata documentation and the implementation (:issue:`7816`)
237237

238238

239239

pandas/io/stata.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -520,8 +520,15 @@ def _read_header(self):
520520
self.byteorder + 'q', self.path_or_buf.read(8))[0] + 9
521521
seek_value_label_names = struct.unpack(
522522
self.byteorder + 'q', self.path_or_buf.read(8))[0] + 19
523-
seek_variable_labels = struct.unpack(
524-
self.byteorder + 'q', self.path_or_buf.read(8))[0] + 17
523+
# Stata 117 data files do not follow the described format. This is
524+
# a workaround that uses the previous label, 33 bytes for each
525+
# variable, 20 for the closing tag and 17 for the opening tag
526+
self.path_or_buf.read(8) # <variable_labels>, throw away
527+
seek_variable_labels = seek_value_label_names + (33*self.nvar) + 20 + 17
528+
# Below is the original, correct code (per the Stata dta format doc,
529+
# although this is not followed in actual 117 dtas)
530+
#seek_variable_labels = struct.unpack(
531+
# self.byteorder + 'q', self.path_or_buf.read(8))[0] + 17
525532
self.path_or_buf.read(8) # <characteristics>
526533
self.data_location = struct.unpack(
527534
self.byteorder + 'q', self.path_or_buf.read(8))[0] + 6

pandas/io/tests/data/stata7_115.dta

722 Bytes
Binary file not shown.

pandas/io/tests/data/stata7_117.dta

1.13 KB
Binary file not shown.

pandas/io/tests/test_stata.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,9 @@ def setUp(self):
6868
self.dta15_115 = os.path.join(self.dirpath, 'stata6_115.dta')
6969
self.dta15_117 = os.path.join(self.dirpath, 'stata6_117.dta')
7070

71+
self.dta16_115 = os.path.join(self.dirpath, 'stata7_115.dta')
72+
self.dta16_117 = os.path.join(self.dirpath, 'stata7_117.dta')
73+
7174
def read_dta(self, file):
7275
return read_stata(file, convert_dates=True)
7376

@@ -199,7 +202,7 @@ def test_read_dta4(self):
199202
'labeled_with_missings', 'float_labelled'])
200203

201204
# these are all categoricals
202-
expected = pd.concat([ Series(pd.Categorical(value)) for col, value in expected.iteritems() ],axis=1)
205+
expected = pd.concat([ Series(pd.Categorical(value)) for col, value in compat.iteritems(expected)],axis=1)
203206

204207
tm.assert_frame_equal(parsed_113, expected)
205208
tm.assert_frame_equal(parsed_114, expected)
@@ -551,6 +554,18 @@ def test_bool_uint(self):
551554
written_and_read_again = written_and_read_again.set_index('index')
552555
tm.assert_frame_equal(written_and_read_again, expected)
553556

557+
def test_variable_labels(self):
558+
sr_115 = StataReader(self.dta16_115).variable_labels()
559+
sr_117 = StataReader(self.dta16_117).variable_labels()
560+
keys = ('var1', 'var2', 'var3')
561+
labels = ('label1', 'label2', 'label3')
562+
for k,v in compat.iteritems(sr_115):
563+
self.assertTrue(k in sr_117)
564+
self.assertTrue(v == sr_117[k])
565+
self.assertTrue(k in keys)
566+
self.assertTrue(v in labels)
567+
568+
554569
if __name__ == '__main__':
555570
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
556571
exit=False)

0 commit comments

Comments
 (0)