-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
ENH: Add transparent compression to json reading/writing #17798
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
9f2af42
3ed830c
2a7c3b2
8e9fd4a
ff98b60
402fa11
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,156 @@ | ||
import pytest | ||
import moto | ||
|
||
import pandas as pd | ||
from pandas import compat | ||
import pandas.util.testing as tm | ||
from pandas.util.testing import assert_frame_equal, assert_raises_regex | ||
|
||
|
||
COMPRESSION_TYPES = [None, 'bz2', 'gzip', 'xz'] | ||
|
||
|
||
def test_compress_gzip():
    """Round-trip a frame through gzip-compressed JSON on disk."""
    expected = pd.DataFrame([[0.123456, 0.234567, 0.567567],
                             [12.32112, 123123.2, 321321.2]],
                            index=['A', 'B'], columns=['X', 'Y', 'Z'])

    with tm.ensure_clean() as path:
        expected.to_json(path, compression='gzip')
        assert_frame_equal(expected, pd.read_json(path, compression='gzip'))

        # Decompress by hand to prove the bytes on disk really are gzip.
        import gzip
        with gzip.GzipFile(path, 'rb') as fh:
            contents = fh.read().decode('utf8')
        assert_frame_equal(expected, pd.read_json(contents))
|
||
|
||
def test_compress_bz2():
    """Round-trip a frame through bz2-compressed JSON on disk."""
    expected = pd.DataFrame([[0.123456, 0.234567, 0.567567],
                             [12.32112, 123123.2, 321321.2]],
                            index=['A', 'B'], columns=['X', 'Y', 'Z'])

    with tm.ensure_clean() as path:
        expected.to_json(path, compression='bz2')
        assert_frame_equal(expected, pd.read_json(path, compression='bz2'))

        # Decompress by hand to prove the bytes on disk really are bz2.
        import bz2
        with bz2.BZ2File(path, 'rb') as fh:
            contents = fh.read().decode('utf8')
        assert_frame_equal(expected, pd.read_json(contents))
|
||
|
||
def test_compress_xz():
    """Round-trip a frame through xz-compressed JSON on disk."""
    tm._skip_if_no_lzma()

    expected = pd.DataFrame([[0.123456, 0.234567, 0.567567],
                             [12.32112, 123123.2, 321321.2]],
                            index=['A', 'B'], columns=['X', 'Y', 'Z'])

    with tm.ensure_clean() as path:
        expected.to_json(path, compression='xz')
        assert_frame_equal(expected, pd.read_json(path, compression='xz'))

        # Decompress by hand to prove the bytes on disk really are xz.
        lzma = compat.import_lzma()
        with lzma.open(path, 'rb') as fh:
            contents = fh.read().decode('utf8')
        assert_frame_equal(expected, pd.read_json(contents))
|
||
|
||
def test_compress_zip_value_error():
    """Writing with compression='zip' is unsupported and must raise.

    NOTE(review): despite the function name, the exception asserted here is
    ``zipfile.BadZipfile`` rather than ``ValueError`` -- confirm which error
    the writer is intended to surface.
    """
    import zipfile

    df = pd.DataFrame([[0.123456, 0.234567, 0.567567],
                       [12.32112, 123123.2, 321321.2]],
                      index=['A', 'B'], columns=['X', 'Y', 'Z'])

    with tm.ensure_clean() as path:
        pytest.raises(zipfile.BadZipfile, df.to_json, path, compression="zip")
|
||
|
||
def test_read_zipped_json():
    """A zipped JSON fixture reads identically to its uncompressed twin."""
    plain = pd.read_json(tm.get_data_path("tsframe_v012.json"))
    zipped = pd.read_json(tm.get_data_path("tsframe_v012.json.zip"),
                          compression='zip')
    assert_frame_equal(plain, zipped)
|
||
|
||
@pytest.mark.parametrize('compression', COMPRESSION_TYPES)
def test_with_file_url(compression):
    """Reading back through a ``file://`` URL honours the compression arg."""
    if compression == 'xz':
        tm._skip_if_no_lzma()

    with tm.ensure_clean() as path:
        expected = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
        expected.to_json(path, compression=compression)
        url = 'file://localhost' + path
        assert_frame_equal(expected,
                           pd.read_json(url, compression=compression))
|
||
|
||
@pytest.mark.parametrize('compression', COMPRESSION_TYPES)
def test_with_s3_url(compression):
    """Compressed JSON uploaded to (mocked) S3 round-trips via an s3:// URL."""
    boto3 = pytest.importorskip('boto3')
    pytest.importorskip('s3fs')
    if compression == 'xz':
        tm._skip_if_no_lzma()

    expected = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
    with moto.mock_s3():
        conn = boto3.resource("s3", region_name="us-east-1")
        bucket = conn.create_bucket(Bucket="pandas-test")

        # Write the compressed file locally, then push the raw bytes to S3.
        with tm.ensure_clean() as path:
            expected.to_json(path, compression=compression)
            with open(path, 'rb') as fh:
                bucket.put_object(Key='test-1', Body=fh)

            result = pd.read_json('s3://pandas-test/test-1',
                                  compression=compression)
            assert_frame_equal(expected, result)
|
||
|
||
@pytest.mark.parametrize('compression', COMPRESSION_TYPES)
def test_lines_with_compression(compression):
    """Line-delimited (orient='records') JSON round-trips under compression."""
    if compression == 'xz':
        tm._skip_if_no_lzma()

    with tm.ensure_clean() as path:
        expected = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
        expected.to_json(path, orient='records', lines=True,
                         compression=compression)
        result = pd.read_json(path, lines=True, compression=compression)
        assert_frame_equal(expected, result)
|
||
|
||
@pytest.mark.parametrize('compression', COMPRESSION_TYPES)
def test_chunksize_with_compression(compression):
    """Chunked line-delimited reads concatenate back to the original frame."""
    if compression == 'xz':
        tm._skip_if_no_lzma()

    with tm.ensure_clean() as path:
        expected = pd.read_json('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}')
        expected.to_json(path, orient='records', lines=True,
                         compression=compression)

        # chunksize=1 yields a reader of single-row frames; stitch them back.
        reader = pd.read_json(path, lines=True, chunksize=1,
                              compression=compression)
        assert_frame_equal(expected, pd.concat(reader))
|
||
|
||
def test_write_unsupported_compression_type():
    """An unrecognized compression name raises ValueError on write."""
    df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
    with tm.ensure_clean() as path:
        msg = "Unrecognized compression type: unsupported"
        assert_raises_regex(ValueError, msg,
                            df.to_json, path, compression="unsupported")
|
||
|
||
def test_read_unsupported_compression_type():
    """An unrecognized compression name raises ValueError on read."""
    with tm.ensure_clean() as path:
        msg = "Unrecognized compression type: unsupported"
        assert_raises_regex(ValueError, msg,
                            pd.read_json, path, compression="unsupported")
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
update this
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
updated