Skip to content

ENH: .read_pickle(...) from zip containing hidden OS X/macOS metadata files/folders #37101

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 17 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,8 @@ Other enhancements
- Improve error reporting for :meth:`DataFrame.merge()` when invalid merge column definitions were given (:issue:`16228`)
- Improve numerical stability for :meth:`Rolling.skew()`, :meth:`Rolling.kurt()`, :meth:`Expanding.skew()` and :meth:`Expanding.kurt()` through implementation of Kahan summation (:issue:`6929`)
- Improved error reporting for subsetting columns of a :class:`DataFrameGroupBy` with ``axis=1`` (:issue:`37725`)
- :func:`read_pickle` (and other ``read_*`` functions that handle compressed inputs) can now load from ``.zip`` files created by OS X/macOS that contain ``__MACOSX/`` or ``.DS_STORE`` hidden folders/files (:issue:`37098`).


.. ---------------------------------------------------------------------------

Expand Down
9 changes: 8 additions & 1 deletion pandas/io/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -616,7 +616,14 @@ def get_handle(
handle = _BytesZipFile(handle, ioargs.mode, **compression_args)
if handle.mode == "r":
handles.append(handle)
zip_names = handle.namelist()

# Ignore the metadata entries OS X/macOS adds on .zip creation.
# Finder spells the per-folder metadata file ".DS_Store"; the
# upper-case ".DS_STORE" spelling is kept so archives that were
# already accepted keep working.  str.startswith takes a tuple, so
# one call covers every prefix.
hidden_prefixes = ("__MACOSX/", ".DS_Store", ".DS_STORE")
zip_names = [
name
for name in handle.namelist()
if not name.startswith(hidden_prefixes)
]

if len(zip_names) == 1:
handle = handle.open(zip_names.pop())
elif len(zip_names) == 0:
Expand Down
14 changes: 14 additions & 0 deletions pandas/tests/io/parser/test_compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,20 @@ def test_zip_error_multiple_files(parser_and_data, compression):
parser.read_csv(path, compression=compression)


@pytest.mark.parametrize("compression", ["zip", "infer"])
def test_zip_no_error_hidden_files(parser_and_data, compression, python_parser_only):
    # GH 37098: hidden OS X/macOS entries inside a .zip must be skipped, so
    # the archive is treated as holding exactly one readable member.
    # NOTE(review): assumes ``parser_and_data`` yields
    # (parser, raw CSV payload, expected frame) -- confirm against the fixture.
    _, data, expected = parser_and_data
    hidden_names = ["__MACOSX/dummy", ".DS_STORE"]

    with tm.ensure_clean("combined_zip.zip") as path:
        with zipfile.ZipFile(path, mode="w") as tmp:
            tmp.writestr("test_file", data)
            # Give the hidden members a different payload so that reading
            # one of them by mistake changes the parsed result.
            for hidden_name in hidden_names:
                tmp.writestr(hidden_name, "not,the,real,data\n")

        result = python_parser_only.read_csv(path, compression=compression)

    tm.assert_frame_equal(result, expected)


def test_zip_error_no_files(parser_and_data):
parser, _, _ = parser_and_data

Expand Down
27 changes: 27 additions & 0 deletions pandas/tests/io/test_pickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,33 @@ def test_read_infer(self, ext, get_random_path):

tm.assert_frame_equal(df, df2)

@pytest.mark.parametrize("cruft", ["__MACOSX/", ".DS_STORE"])
def test_load_zip_with_hidden_folders(self, cruft, get_random_path):
    # GH 37098: a pickled frame stored in a .zip must still load when the
    # archive also carries a platform-specific hidden entry.
    base = get_random_path
    dummy = f"{base}.dummy"
    hidden_member = f"{cruft}{dummy}"

    with tm.ensure_clean(f"{base}.raw") as raw_path:
        with tm.ensure_clean(f"{base}.zip") as zip_path:
            with tm.ensure_clean(dummy) as dummy_path:
                expected = tm.makeDataFrame()
                expected.to_pickle(raw_path, compression=None)
                self.compress_file(raw_path, zip_path, compression="zip")

                # Append the dummy file to the archive under the hidden name,
                # then reopen the archive to confirm the entry is there.
                appending = zipfile.ZipFile(
                    zip_path, "a", compression=zipfile.ZIP_DEFLATED
                )
                with appending as archive:
                    archive.write(dummy_path, hidden_member)
                with zipfile.ZipFile(zip_path, "r") as archive:
                    assert hidden_member in archive.namelist()

                # The hidden entry has to be ignored on reading; otherwise
                # read_pickle fails.
                result = pd.read_pickle(zip_path)
                tm.assert_frame_equal(expected, result)


# ---------------------
# test pickle compression
Expand Down