Skip to content

Commit f996427

Browse files
committed
Google Cloud Storage support using gcsfs
1 parent f1aa08c commit f996427

17 files changed

+100
-60
lines changed

ci/appveyor-27.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ dependencies:
66
- beautifulsoup4
77
- bottleneck
88
- dateutil
9+
- gcsfs
910
- html5lib
1011
- jinja2=2.8
1112
- lxml

ci/check_imports.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
blacklist = {
77
'bs4',
8+
'gcsfs',
89
'html5lib',
910
'ipython',
1011
'jinja2'

ci/circle-36-locale_slow.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ channels:
55
dependencies:
66
- beautifulsoup4
77
- cython
8+
- gcsfs
89
- html5lib
910
- ipython
1011
- jinja2

ci/requirements-optional-conda.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ blosc
33
bottleneck
44
fastparquet
55
feather-format
6+
gcsfs
67
html5lib
78
ipython>=5.6.0
89
ipykernel

ci/requirements-optional-pip.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ blosc
55
bottleneck
66
fastparquet
77
feather-format
8+
gcsfs
89
html5lib
910
ipython>=5.6.0
1011
ipykernel

ci/travis-27.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ dependencies:
99
- fastparquet
1010
- feather-format
1111
- flake8=3.4.1
12+
- gcsfs
1213
- html5lib
1314
- ipython
1415
- jemalloc=4.5.0.post

ci/travis-36.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ dependencies:
88
- dask
99
- fastparquet
1010
- feather-format
11+
- gcsfs
1112
- geopandas
1213
- html5lib
1314
- ipython

doc/source/install.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,7 @@ Optional Dependencies
276276
* `Jinja2 <http://jinja.pocoo.org/>`__: Template engine for conditional HTML formatting.
277277
* `s3fs <http://s3fs.readthedocs.io/>`__: necessary for Amazon S3 access (s3fs >= 0.0.7).
278278
* `blosc <https://pypi.org/project/blosc>`__: for msgpack compression using ``blosc``
279+
* `gcsfs <http://gcsfs.readthedocs.io/>`__: necessary for Google Cloud Storage access (gcsfs >= 0.1.0).
279280
* One of
280281
`qtpy <https://github.com/spyder-ide/qtpy>`__ (requires PyQt or PySide),
281282
`PyQt5 <https://www.riverbankcomputing.com/software/pyqt/download5>`__,

doc/source/whatsnew/v0.24.0.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ Other Enhancements
1818
- :func:`Series.mode` and :func:`DataFrame.mode` now support the ``dropna`` parameter which can be used to specify whether NaN/NaT values should be considered (:issue:`17534`)
1919
- :func:`to_csv` now supports ``compression`` keyword when a file handle is passed. (:issue:`21227`)
2020
- :meth:`Index.droplevel` is now implemented also for flat indexes, for compatibility with MultiIndex (:issue:`21115`)
21-
21+
- Added support for reading from Google Cloud Storage via the ``gcsfs`` library (:issue:`19454`)
2222

2323
.. _whatsnew_0240.api_breaking:
2424

pandas/io/common.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ def _is_url(url):
8888
"""
8989
try:
9090
return parse_url(url).scheme in _VALID_URLS
91-
except:
91+
except Exception:
9292
return False
9393

9494

@@ -165,7 +165,15 @@ def is_s3_url(url):
165165
"""Check for an s3, s3n, or s3a url"""
166166
try:
167167
return parse_url(url).scheme in ['s3', 's3n', 's3a']
168-
except: # noqa
168+
except Exception:
169+
return False
170+
171+
172+
def is_gcs_url(url):
173+
"""Check for a gcs url"""
174+
try:
175+
return parse_url(url).scheme in ['gcs', 'gs']
176+
except Exception:
169177
return False
170178

171179

@@ -208,6 +216,13 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
208216
compression=compression,
209217
mode=mode)
210218

219+
if is_gcs_url(filepath_or_buffer):
220+
from pandas.io import gcs
221+
return gcs.get_filepath_or_buffer(filepath_or_buffer,
222+
encoding=encoding,
223+
compression=compression,
224+
mode=mode)
225+
211226
if isinstance(filepath_or_buffer, (compat.string_types,
212227
compat.binary_type,
213228
mmap.mmap)):

pandas/io/excel.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@
4646
io : string, path object (pathlib.Path or py._path.local.LocalPath),
4747
file-like object, pandas ExcelFile, or xlrd workbook.
4848
The string could be a URL. Valid URL schemes include http, ftp, s3,
49-
and file. For file URLs, a host is expected. For instance, a local
49+
gcs, and file. For file URLs, a host is expected. For instance, a local
5050
file could be file://localhost/path/to/workbook.xlsx
5151
sheet_name : string, int, mixed list of strings/ints, or None, default 0
5252

pandas/io/gcs.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
""" GCS support for remote file interactivity """
2+
try:
3+
import gcsfs
4+
except ImportError:
5+
raise ImportError("The gcsfs library is required to handle GCS files")
6+
7+
8+
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
9+
compression=None, mode=None):
10+
11+
if mode is None:
12+
mode = 'rb'
13+
14+
fs = gcsfs.GCSFileSystem()
15+
filepath_or_buffer = fs.open(filepath_or_buffer, mode)
16+
return filepath_or_buffer, None, compression, True

pandas/io/json/json.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -231,9 +231,9 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
231231
Parameters
232232
----------
233233
path_or_buf : a valid JSON string or file-like, default: None
234-
The string could be a URL. Valid URL schemes include http, ftp, s3, and
235-
file. For file URLs, a host is expected. For instance, a local file
236-
could be ``file://localhost/path/to/table.json``
234+
The string could be a URL. Valid URL schemes include http, ftp, s3,
235+
gcs, and file. For file URLs, a host is expected. For instance, a local
236+
file could be ``file://localhost/path/to/table.json``
237237
238238
orient : string,
239239
Indication of expected JSON string format.

pandas/tests/io/test_gcs.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import numpy as np
2+
import pytest
3+
4+
from pandas import DataFrame, date_range, read_csv
5+
from pandas.compat import StringIO
6+
from pandas.io.common import is_gcs_url
7+
from pandas.util import _test_decorators as td
8+
from pandas.util.testing import assert_frame_equal, patch
9+
10+
11+
def test_is_gcs_url():
12+
assert is_gcs_url("gcs://pandas/somethingelse.com")
13+
assert is_gcs_url("gs://pandas/somethingelse.com")
14+
assert not is_gcs_url("s3://pandas/somethingelse.com")
15+
16+
17+
@td.skip_if_no('gcsfs')
18+
def test_read_csv_gcs():
19+
df1 = DataFrame({'int': [1, 3], 'float': [2.0, np.nan], 'str': ['t', 's'],
20+
'dt': date_range('2018-06-18', periods=2)})
21+
with patch('gcsfs.GCSFileSystem') as MockFileSystem:
22+
instance = MockFileSystem.return_value
23+
instance.open.return_value = StringIO(df1.to_csv(index=False))
24+
df2 = read_csv('gs://test/test.csv', parse_dates=['dt'])
25+
26+
assert_frame_equal(df1, df2)
27+
28+
29+
@td.skip_if_no('gcsfs')
30+
def test_gcs_get_filepath_or_buffer():
31+
df1 = DataFrame({'int': [1, 3], 'float': [2.0, np.nan], 'str': ['t', 's'],
32+
'dt': date_range('2018-06-18', periods=2)})
33+
with patch('pandas.io.gcs.get_filepath_or_buffer') as MockGetFilepath:
34+
MockGetFilepath.return_value = (StringIO(df1.to_csv(index=False)),
35+
None, None, False)
36+
df2 = read_csv('gs://test/test.csv', parse_dates=['dt'])
37+
38+
assert_frame_equal(df1, df2)
39+
assert MockGetFilepath.called
40+
41+
42+
@pytest.mark.skipif(td.safe_import('gcsfs'),
43+
reason='Only check when gcsfs not installed')
44+
def test_gcs_not_present_exception():
45+
with pytest.raises(ImportError) as e:
46+
read_csv('gs://test/test.csv')
47+
assert 'gcsfs library is required' in str(e.value)

pandas/tests/io/test_packers.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -682,7 +682,7 @@ def decompress(ob):
682682
np.dtype('timedelta64[ns]'): np.timedelta64(1, 'ns'),
683683
}
684684

685-
with patch(compress_module, 'decompress', decompress), \
685+
with patch(compress + '.decompress', new=decompress), \
686686
tm.assert_produces_warning(PerformanceWarning) as ws:
687687

688688
i_rec = self.encode_decode(self.frame, compress=compress)

pandas/util/_print_versions.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ def show_versions(as_json=False):
9696
("fastparquet", lambda mod: mod.__version__),
9797
("pandas_gbq", lambda mod: mod.__version__),
9898
("pandas_datareader", lambda mod: mod.__version__),
99+
("gcsfs", lambda mod: mod.__version__),
99100
]
100101

101102
deps_blob = list()

pandas/util/testing.py

Lines changed: 5 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,11 @@
4848
from pandas._libs import testing as _testing
4949
from pandas.io.common import urlopen
5050

51+
try:
52+
from unittest.mock import patch
53+
except ImportError:
54+
from mock import patch
55+
5156

5257
N = 30
5358
K = 4
@@ -2667,58 +2672,6 @@ def _constructor(self):
26672672
return SubclassedCategorical
26682673

26692674

2670-
@contextmanager
2671-
def patch(ob, attr, value):
2672-
"""Temporarily patch an attribute of an object.
2673-
2674-
Parameters
2675-
----------
2676-
ob : any
2677-
The object to patch. This must support attribute assignment for `attr`.
2678-
attr : str
2679-
The name of the attribute to patch.
2680-
value : any
2681-
The temporary attribute to assign.
2682-
2683-
Examples
2684-
--------
2685-
>>> class C(object):
2686-
... attribute = 'original'
2687-
...
2688-
>>> C.attribute
2689-
'original'
2690-
>>> with patch(C, 'attribute', 'patched'):
2691-
... in_context = C.attribute
2692-
...
2693-
>>> in_context
2694-
'patched'
2695-
>>> C.attribute # the value is reset when the context manager exits
2696-
'original'
2697-
2698-
Correctly replaces attribute when the manager exits with an exception.
2699-
>>> with patch(C, 'attribute', 'patched'):
2700-
... in_context = C.attribute
2701-
... raise ValueError()
2702-
Traceback (most recent call last):
2703-
...
2704-
ValueError
2705-
>>> in_context
2706-
'patched'
2707-
>>> C.attribute
2708-
'original'
2709-
"""
2710-
noattr = object() # mark that the attribute never existed
2711-
old = getattr(ob, attr, noattr)
2712-
setattr(ob, attr, value)
2713-
try:
2714-
yield
2715-
finally:
2716-
if old is noattr:
2717-
delattr(ob, attr)
2718-
else:
2719-
setattr(ob, attr, old)
2720-
2721-
27222675
@contextmanager
27232676
def set_timezone(tz):
27242677
"""Context manager for temporarily setting a timezone.

0 commit comments

Comments
 (0)