Skip to content

Commit f05ebc8

Browse files
committed
BUG: Fix parsing of sas7bdat files with odd data pages (#16615)
SAS can apparently generate data pages that have bit 7 (128) set in the page type. It seems that the presence of bit 8 (256) alone determines whether a page is a data page. So treat a page as a data page whenever bit 8 is set, regardless of the lower bits.
1 parent 2baa169 commit f05ebc8

File tree

5 files changed

+20
-8
lines changed

5 files changed

+20
-8
lines changed

doc/source/whatsnew/v0.24.0.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -735,7 +735,7 @@ I/O
735735
- :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`)
736736
- :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`)
737737
- :func:`read_sas` will correctly parse sas7bdat files with many columns (:issue:`22628`)
738-
738+
- :func:`read_sas` will correctly parse sas7bdat files whose data page types also have bit 7 set (so the page type is 128 + 256 = 384) (:issue:`16615`)
739739

740740
Plotting
741741
^^^^^^^^

pandas/io/sas/sas.pyx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -375,7 +375,7 @@ cdef class Parser(object):
375375
if done:
376376
return True
377377
return False
378-
elif self.current_page_type == page_data_type:
378+
elif self.current_page_type & page_data_type == page_data_type:
379379
self.process_byte_array_with_data(
380380
bit_offset + subheader_pointers_offset +
381381
self.current_row_on_page_index * self.row_length,

pandas/io/sas/sas7bdat.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -301,8 +301,10 @@ def _process_page_meta(self):
301301
pt = [const.page_meta_type, const.page_amd_type] + const.page_mix_types
302302
if self._current_page_type in pt:
303303
self._process_page_metadata()
304-
return ((self._current_page_type in [256] + const.page_mix_types) or
305-
(self._current_page_data_subheader_pointers != []))
304+
is_data_page = self._current_page_type & const.page_data_type
305+
is_mix_page = self._current_page_type in const.page_mix_types
306+
return (is_data_page or is_mix_page
307+
or self._current_page_data_subheader_pointers != [])
306308

307309
def _read_page_header(self):
308310
bit_offset = self._page_bit_offset
@@ -644,11 +646,13 @@ def _read_next_page(self):
644646
self._page_length))
645647

646648
self._read_page_header()
647-
if self._current_page_type == const.page_meta_type:
649+
page_type = self._current_page_type
650+
if page_type == const.page_meta_type:
648651
self._process_page_metadata()
649-
pt = [const.page_meta_type, const.page_data_type]
650-
pt += [const.page_mix_types]
651-
if self._current_page_type not in pt:
652+
653+
is_data_page = page_type & const.page_data_type
654+
pt = [const.page_meta_type] + const.page_mix_types
655+
if not is_data_page and self._current_page_type not in pt:
652656
return self._read_next_page()
653657

654658
return False
576 KB
Binary file not shown.

pandas/tests/io/sas/test_sas7bdat.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,7 @@ def test_date_time(datapath):
184184

185185

186186
def test_many_columns(datapath):
187+
# Test for looking for column information in more places (PR #22628)
187188
fname = datapath("io", "sas", "data", "many_columns.sas7bdat")
188189
df = pd.read_sas(fname, encoding='latin-1')
189190
fname = datapath("io", "sas", "data", "many_columns.csv")
@@ -199,6 +200,13 @@ def test_many_columns(datapath):
199200
tm.assert_frame_equal(df, df0)
200201

201202

203+
def test_inconsistent_number_of_rows(datapath):
204+
# Regression test for issue #16615. (PR #22628)
205+
fname = datapath("io", "sas", "data", "load_log.sas7bdat")
206+
df = pd.read_sas(fname, encoding='latin-1')
207+
assert len(df) == 2097
208+
209+
202210
def test_zero_variables(datapath):
203211
# Check if the SAS file has zero variables (PR #18184)
204212
fname = datapath("io", "sas", "data", "zero_variables.sas7bdat")

0 commit comments

Comments
 (0)