Skip to content

Commit aa99afe

Browse files
committed
FIX: StataReader: defer opening file to when data is required
1 parent 2f434f1 commit aa99afe

File tree

1 file changed

+39
-3
lines changed

1 file changed

+39
-3
lines changed

pandas/io/stata.py

Lines changed: 39 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1144,6 +1144,9 @@ def __init__(
11441144
self._preserve_dtypes = preserve_dtypes
11451145
self._columns = columns
11461146
self._order_categoricals = order_categoricals
1147+
self._original_path_or_buf = path_or_buf
1148+
self._compression = compression
1149+
self._storage_options = storage_options
11471150
self._encoding = ""
11481151
self._chunksize = chunksize
11491152
self._using_iterator = False
@@ -1153,6 +1156,9 @@ def __init__(
11531156
raise ValueError("chunksize must be a positive integer when set.")
11541157

11551158
# State variables for the file
1159+
# NB: _path_or_buf is mistyped on purpose, since the alternative is to placate
1160+
# mypy by having an assert before every read.
1161+
self._path_or_buf: IO[bytes] = None # type: ignore[assignment]
11561162
self._has_string_data = False
11571163
self._missing_values = False
11581164
self._can_read_value_labels = False
@@ -1163,12 +1169,24 @@ def __init__(
11631169
self._lines_read = 0
11641170

11651171
self._native_byteorder = _set_endianness(sys.byteorder)
1172+
1173+
def _ensure_open(self) -> None:
1174+
"""
1175+
Ensure the file has been opened and its header data read.
1176+
"""
1177+
if self._path_or_buf is None:
1178+
self._open_file()
1179+
1180+
def _open_file(self) -> None:
1181+
"""
1182+
Open the file (with compression options, etc.), and read header information.
1183+
"""
11661184
with get_handle(
1167-
path_or_buf,
1185+
self._original_path_or_buf,
11681186
"rb",
1169-
storage_options=storage_options,
1187+
storage_options=self._storage_options,
11701188
is_text=False,
1171-
compression=compression,
1189+
compression=self._compression,
11721190
) as handles:
11731191
# Copy to BytesIO, and ensure no encoding
11741192
self._path_or_buf = BytesIO(handles.handle.read())
@@ -1536,6 +1554,7 @@ def _decode(self, s: bytes) -> str:
15361554
return s.decode("latin-1")
15371555

15381556
def _read_value_labels(self) -> None:
1557+
self._ensure_open()
15391558
if self._value_labels_read:
15401559
# Don't read twice
15411560
return
@@ -1655,6 +1674,7 @@ def read(
16551674
columns: Sequence[str] | None = None,
16561675
order_categoricals: bool | None = None,
16571676
) -> DataFrame:
1677+
self._ensure_open()
16581678
# Handle empty file or chunk. If reading incrementally raise
16591679
# StopIteration. If reading the whole thing return an empty
16601680
# data frame.
@@ -1983,57 +2003,72 @@ def data_label(self) -> str:
19832003
"""
19842004
Return data label of Stata file.
19852005
"""
2006+
self._ensure_open()
19862007
return self._data_label
19872008

19882009
@property
19892010
def typlist(self) -> list[int | str]:
19902011
"""
19912012
Return list of variable types.
19922013
"""
2014+
self._ensure_open()
19932015
return self._typlist
19942016

19952017
@property
19962018
def dtyplist(self) -> list[str | np.dtype]:
19972019
"""
19982020
Return list of variable dtypes.
19992021
"""
2022+
self._ensure_open()
20002023
return self._dtyplist
20012024

20022025
@property
20032026
def lbllist(self) -> list[str]:
20042027
"""
20052028
Return list of variable labels.
20062029
"""
2030+
self._ensure_open()
20072031
return self._lbllist
20082032

20092033
@property
20102034
def varlist(self) -> list[str]:
20112035
"""
20122036
Return list of variable names.
20132037
"""
2038+
self._ensure_open()
20142039
return self._varlist
20152040

20162041
@property
20172042
def fmtlist(self) -> list[str]:
20182043
"""
20192044
Return list of variable formats.
20202045
"""
2046+
self._ensure_open()
20212047
return self._fmtlist
20222048

20232049
@property
20242050
def time_stamp(self) -> str:
20252051
"""
20262052
Return time stamp of Stata file.
20272053
"""
2054+
self._ensure_open()
20282055
return self._time_stamp
20292056

20302057
@property
20312058
def format_version(self) -> int:
20322059
"""
20332060
Return format version of Stata file.
20342061
"""
2062+
self._ensure_open()
20352063
return self._format_version
20362064

2065+
@property
2066+
def path_or_buf(self) -> IO[bytes]:
2067+
"""
2068+
Return the file handle of the Stata file being read (None until the file has been opened).
2069+
"""
2070+
return self._path_or_buf
2071+
20372072
def variable_labels(self) -> dict[str, str]:
20382073
"""
20392074
Return a dict associating each variable name with corresponding label.
@@ -2042,6 +2077,7 @@ def variable_labels(self) -> dict[str, str]:
20422077
-------
20432078
dict
20442079
"""
2080+
self._ensure_open()
20452081
return dict(zip(self._varlist, self._variable_labels))
20462082

20472083
def value_labels(self) -> dict[str, dict[float, str]]:

0 commit comments

Comments
 (0)