Skip to content

Backport PR #27882 on branch 0.25.x #28012

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion doc/source/whatsnew/v0.25.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,6 @@ MultiIndex

I/O
^^^

- Avoid calling ``S3File.s3`` when reading parquet, as this was removed in s3fs version 0.3.0 (:issue:`27756`)
- Better error message when a negative header is passed in :func:`pandas.read_csv` (:issue:`27779`)
-
Expand Down Expand Up @@ -160,6 +159,14 @@ Other
-
-

I/O and LZMA
~~~~~~~~~~~~

Some users may unknowingly have an incomplete Python installation, which lacks the ``lzma`` module from the standard library. In this case, ``import pandas`` failed due to an ``ImportError`` (:issue:`27575`).
Pandas will now warn, rather than raise an ``ImportError``, if the ``lzma`` module is not present. Any subsequent attempt to use ``lzma`` methods will raise a ``RuntimeError``.
A possible fix for the lack of the ``lzma`` module is to ensure you have the necessary libraries and then re-install Python.
For example, on macOS installing Python with ``pyenv`` may lead to an incomplete Python installation due to unmet system dependencies at compilation time (like ``xz``). Compilation will succeed, but Python might fail at run time. The issue can be solved by installing the necessary dependencies and then re-installing Python.

.. _whatsnew_0.251.contributors:

Contributors
Expand Down
8 changes: 5 additions & 3 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
# See LICENSE for the license
import bz2
import gzip
import lzma
import os
import sys
import time
Expand Down Expand Up @@ -59,9 +58,12 @@ from pandas.core.arrays import Categorical
from pandas.core.dtypes.concat import union_categoricals
import pandas.io.common as icom

from pandas.compat import _import_lzma, _get_lzma_file
from pandas.errors import (ParserError, DtypeWarning,
EmptyDataError, ParserWarning)

lzma = _import_lzma()

# Import CParserError as alias of ParserError for backwards compatibility.
# Ultimately, we want to remove this import. See gh-12665 and gh-14479.
CParserError = ParserError
Expand Down Expand Up @@ -645,9 +647,9 @@ cdef class TextReader:
'zip file %s', str(zip_names))
elif self.compression == 'xz':
if isinstance(source, str):
source = lzma.LZMAFile(source, 'rb')
source = _get_lzma_file(lzma)(source, 'rb')
else:
source = lzma.LZMAFile(filename=source)
source = _get_lzma_file(lzma)(filename=source)
else:
raise ValueError('Unrecognized compression type: %s' %
self.compression)
Expand Down
30 changes: 30 additions & 0 deletions pandas/compat/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import platform
import struct
import sys
import warnings

PY35 = sys.version_info[:2] == (3, 5)
PY36 = sys.version_info >= (3, 6)
Expand Down Expand Up @@ -65,3 +66,32 @@ def is_platform_mac():

def is_platform_32bit():
    """
    Check whether the running interpreter uses pointers narrower than 64 bits.

    Returns
    -------
    bool
        True if the size of a native pointer (``struct`` format ``"P"``),
        expressed in bits, is less than 64.
    """
    # struct.calcsize reports the size in bytes; convert to bits.
    pointer_bits = struct.calcsize("P") * 8
    return pointer_bits < 64


def _import_lzma():
    """
    Import and return the ``lzma`` module, warning if it is unavailable.

    Returns
    -------
    module or None
        The standard-library ``lzma`` module, or None when importing it
        raises ``ImportError``.

    Warns
    -----
    UserWarning
        When the ``lzma`` module cannot be imported.
    """
    # Keep the try body to the single statement that can raise.
    try:
        import lzma
    except ImportError:
        msg = (
            "Could not import the lzma module. "
            "Your installed Python is incomplete. "
            "Attempting to use lzma compression will result in a RuntimeError."
        )
        warnings.warn(msg)
        # Explicit sentinel: callers hand this value to _get_lzma_file,
        # which turns None into a RuntimeError at the point of use.
        return None
    return lzma


def _get_lzma_file(lzma):
    """
    Return the ``LZMAFile`` class of the supplied ``lzma`` module.

    Parameters
    ----------
    lzma : module or None
        The imported ``lzma`` module, or None if it could not be imported.

    Returns
    -------
    class
        ``lzma.LZMAFile``.

    Raises
    ------
    RuntimeError
        If ``lzma`` is None.
    """
    if lzma is not None:
        return lzma.LZMAFile

    raise RuntimeError(
        "lzma module not available. "
        "A Python re-install with the proper "
        "dependencies might be required to solve this issue."
    )
6 changes: 4 additions & 2 deletions pandas/io/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import gzip
from http.client import HTTPException # noqa
from io import BytesIO
import lzma
import mmap
import os
import pathlib
Expand All @@ -22,6 +21,7 @@
from urllib.request import pathname2url, urlopen
import zipfile

from pandas.compat import _get_lzma_file, _import_lzma
from pandas.errors import ( # noqa
AbstractMethodError,
DtypeWarning,
Expand All @@ -32,6 +32,8 @@

from pandas.core.dtypes.common import is_file_like

lzma = _import_lzma()

# gh-12665: Alias for now and remove later.
CParserError = ParserError

Expand Down Expand Up @@ -382,7 +384,7 @@ def _get_handle(

# XZ Compression
elif compression == "xz":
f = lzma.LZMAFile(path_or_buf, mode)
f = _get_lzma_file(lzma)(path_or_buf, mode)

# Unrecognized Compression
else:
Expand Down
32 changes: 32 additions & 0 deletions pandas/tests/io/test_compression.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import contextlib
import os
import subprocess
import textwrap
import warnings

import pytest
Expand Down Expand Up @@ -125,3 +127,33 @@ def test_compression_warning(compression_only):
with tm.assert_produces_warning(RuntimeWarning, check_stacklevel=False):
with f:
df.to_csv(f, compression=compression_only)


def test_with_missing_lzma():
    """Tests if import pandas works when lzma is not present."""
    # https://github.com/pandas-dev/pandas/issues/27575
    # Imported locally: only needed to locate the running interpreter.
    import sys

    # Setting sys.modules['lzma'] to None makes any later ``import lzma``
    # raise ImportError, simulating a Python built without lzma support.
    code = textwrap.dedent(
        """\
        import sys
        sys.modules['lzma'] = None
        import pandas
        """
    )
    # Run in a fresh process so the already-imported pandas/lzma of the test
    # session do not interfere. sys.executable guarantees the child is the
    # same interpreter as the one running the tests; a bare "python" may
    # resolve to a different installation or not exist at all.
    subprocess.check_output([sys.executable, "-c", code])


def test_with_missing_lzma_runtime():
    """Tests if RuntimeError is hit when calling lzma without
    having the module available.
    """
    # Imported locally: only needed to locate the running interpreter.
    import sys

    # The child script hides lzma before importing pandas, then checks that
    # requesting xz compression raises the RuntimeError about the lzma module.
    code = textwrap.dedent(
        """
        import sys
        import pytest
        sys.modules['lzma'] = None
        import pandas
        df = pandas.DataFrame()
        with pytest.raises(RuntimeError, match='lzma module'):
            df.to_csv('foo.csv', compression='xz')
        """
    )
    # Use the interpreter running this test rather than whatever "python"
    # happens to be first on PATH, so the child sees the same pandas/pytest.
    subprocess.check_output([sys.executable, "-c", code])
7 changes: 4 additions & 3 deletions pandas/tests/io/test_pickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
import bz2
import glob
import gzip
import lzma
import os
import pickle
import shutil
Expand All @@ -22,14 +21,16 @@

import pytest

from pandas.compat import is_platform_little_endian
from pandas.compat import _get_lzma_file, _import_lzma, is_platform_little_endian

import pandas as pd
from pandas import Index
import pandas.util.testing as tm

from pandas.tseries.offsets import Day, MonthEnd

lzma = _import_lzma()


@pytest.fixture(scope="module")
def current_pickle_data():
Expand Down Expand Up @@ -270,7 +271,7 @@ def compress_file(self, src_path, dest_path, compression):
with zipfile.ZipFile(dest_path, "w", compression=zipfile.ZIP_DEFLATED) as f:
f.write(src_path, os.path.basename(src_path))
elif compression == "xz":
f = lzma.LZMAFile(dest_path, "w")
f = _get_lzma_file(lzma)(dest_path, "w")
else:
msg = "Unrecognized compression type: {}".format(compression)
raise ValueError(msg)
Expand Down
11 changes: 5 additions & 6 deletions pandas/util/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from functools import wraps
import gzip
import http.client
import lzma
import os
import re
from shutil import rmtree
Expand All @@ -26,7 +25,7 @@
)

import pandas._libs.testing as _testing
from pandas.compat import raise_with_traceback
from pandas.compat import _get_lzma_file, _import_lzma, raise_with_traceback

from pandas.core.dtypes.common import (
is_bool,
Expand Down Expand Up @@ -70,6 +69,8 @@
from pandas.io.common import urlopen
from pandas.io.formats.printing import pprint_thing

lzma = _import_lzma()

N = 30
K = 4
_RAISE_NETWORK_ERROR_DEFAULT = False
Expand Down Expand Up @@ -211,7 +212,7 @@ def decompress_file(path, compression):
elif compression == "bz2":
f = bz2.BZ2File(path, "rb")
elif compression == "xz":
f = lzma.LZMAFile(path, "rb")
f = _get_lzma_file(lzma)(path, "rb")
elif compression == "zip":
zip_file = zipfile.ZipFile(path)
zip_names = zip_file.namelist()
Expand Down Expand Up @@ -264,9 +265,7 @@ def write_to_compressed(compression, path, data, dest="test"):

compress_method = bz2.BZ2File
elif compression == "xz":
import lzma

compress_method = lzma.LZMAFile
compress_method = _get_lzma_file(lzma)
else:
msg = "Unrecognized compression type: {}".format(compression)
raise ValueError(msg)
Expand Down