Skip to content

Commit e35e888

Browse files
ap--gutzbenj
authored and committed
LocalFileSystem restore _strip_protocol signature (fsspec#1567)
1 parent bc40f36 commit e35e888

19 files changed

+689
-248
lines changed

ci/environment-py38.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ channels:
33
- conda-forge
44
dependencies:
55
- pip
6-
- git
6+
- git <2.45.0
77
- py
88
- pip:
99
- hadoop-test-cluster

docs/source/api.rst

+10-2
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,9 @@ Base Classes
4747
fsspec.core.OpenFiles
4848
fsspec.core.get_fs_token_paths
4949
fsspec.core.url_to_fs
50-
fsspec.dircache.DirCache
50+
fsspec.dircache.DisabledListingsCache
51+
fsspec.dircache.MemoryListingsCache
52+
fsspec.dircache.FileListingsCache
5153
fsspec.FSMap
5254
fsspec.generic.GenericFileSystem
5355
fsspec.registry.register_implementation
@@ -82,7 +84,13 @@ Base Classes
8284

8385
.. autofunction:: fsspec.core.url_to_fs
8486

85-
.. autoclass:: fsspec.dircache.DirCache
87+
.. autoclass:: fsspec.dircache.DisabledListingsCache
88+
:members: __init__
89+
90+
.. autoclass:: fsspec.dircache.MemoryListingsCache
91+
:members: __init__
92+
93+
.. autoclass:: fsspec.dircache.FileListingsCache
8694
:members: __init__
8795

8896
.. autoclass:: fsspec.FSMap

docs/source/changelog.rst

+8
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,14 @@
11
Changelog
22
=========
33

4+
Dev
5+
--------
6+
7+
Enhancements
8+
9+
- add file-based listing cache using diskcache (#895)
10+
warning: use new ``listings_cache_options`` instead of ``use_listings_cache`` etc.
11+
412
2024.3.1
513
--------
614

docs/source/features.rst

+20-9
Original file line numberDiff line numberDiff line change
@@ -181,15 +181,26 @@ Listings Caching
181181
----------------
182182

183183
For some implementations, getting file listings (i.e., ``ls`` and anything that
184-
depends on it) is expensive. These implementations use dict-like instances of
185-
:class:`fsspec.dircache.DirCache` to manage the listings.
186-
187-
The cache allows for time-based expiry of entries with the ``listings_expiry_time``
188-
parameter, or LRU expiry with the ``max_paths`` parameter. These can be
189-
set on any implementation instance that uses listings caching; or to skip the
190-
caching altogether, use ``use_listings_cache=False``. That would be appropriate
191-
when the target location is known to be volatile because it is being written
192-
to from other sources.
184+
depends on it) is expensive. These implementations may use either dict-like instances of
185+
:class:`fsspec.dircache.MemoryListingsCache` or file-based caching with instances of
186+
:class:`fsspec.dircache.FileListingsCache` to manage the listings.
187+
188+
The listings cache can be controlled via the keyword ``listings_cache_options`` which is a dictionary.
189+
The type of cache that is used can be controlled via the keyword ``cache_type`` (``disabled``, ``memory`` or ``file``).
190+
The cache allows for time-based expiry of entries with the keyword ``expiry_time``. If the target location is known to
191+
be volatile, e.g. because it is being written to from other sources, we recommend disabling the listings cache.
192+
If you want to use the file-based caching, you can also provide the argument
193+
``directory`` to determine where the cache file is stored.
194+
195+
Example for ``listings_cache_options``:
196+
197+
.. code-block:: json
198+
199+
{
200+
"cache_type": "file",
201+
"expiry_time": 3600,
202+
"directory": "/tmp/cache"
203+
}
193204
194205
When the ``fsspec`` instance writes to the backend, the method ``invalidate_cache``
195206
is called, so that subsequent listing of the given paths will force a refresh. In

fsspec/archive.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -37,19 +37,19 @@ def _all_dirnames(self, paths):
3737
def info(self, path, **kwargs):
3838
self._get_dirs()
3939
path = self._strip_protocol(path)
40-
if path in {"", "/"} and self.dir_cache:
40+
if path in {"", "/"} and self.dircache:
4141
return {"name": "", "type": "directory", "size": 0}
42-
if path in self.dir_cache:
43-
return self.dir_cache[path]
44-
elif path + "/" in self.dir_cache:
45-
return self.dir_cache[path + "/"]
42+
if path in self.dircache:
43+
return self.dircache[path]
44+
elif path + "/" in self.dircache:
45+
return self.dircache[path + "/"]
4646
else:
4747
raise FileNotFoundError(path)
4848

4949
def ls(self, path, detail=True, **kwargs):
5050
self._get_dirs()
5151
paths = {}
52-
for p, f in self.dir_cache.items():
52+
for p, f in self.dircache.items():
5353
p = p.rstrip("/")
5454
if "/" in p:
5555
root = p.rsplit("/", 1)[0]

fsspec/asyn.py

+10-2
Original file line numberDiff line numberDiff line change
@@ -312,15 +312,23 @@ class AsyncFileSystem(AbstractFileSystem):
312312
mirror_sync_methods = True
313313
disable_throttling = False
314314

315-
def __init__(self, *args, asynchronous=False, loop=None, batch_size=None, **kwargs):
315+
def __init__(
316+
self,
317+
*args,
318+
asynchronous=False,
319+
loop=None,
320+
batch_size=None,
321+
listings_cache_options=None,
322+
**kwargs,
323+
):
316324
self.asynchronous = asynchronous
317325
self._pid = os.getpid()
318326
if not asynchronous:
319327
self._loop = loop or get_loop()
320328
else:
321329
self._loop = None
322330
self.batch_size = batch_size
323-
super().__init__(*args, **kwargs)
331+
super().__init__(listings_cache_options, *args, **kwargs)
324332

325333
@property
326334
def loop(self):

fsspec/dircache.py

-98
This file was deleted.

fsspec/implementations/http.py

+33-14
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import logging
44
import re
55
import weakref
6-
from copy import copy
76
from urllib.parse import urlparse
87

98
import aiohttp
@@ -58,6 +57,7 @@ def __init__(
5857
client_kwargs=None,
5958
get_client=get_client,
6059
encoded=False,
60+
listings_cache_options=None,
6161
**storage_options,
6262
):
6363
"""
@@ -83,11 +83,39 @@ def __init__(
8383
A callable which takes keyword arguments and constructs
8484
an aiohttp.ClientSession. Its state will be managed by
8585
the HTTPFileSystem class.
86+
listings_cache_options: dict
87+
Options for the listings cache.
8688
storage_options: key-value
8789
Any other parameters passed on to requests
8890
cache_type, cache_options: defaults used in open
8991
"""
90-
super().__init__(self, asynchronous=asynchronous, loop=loop, **storage_options)
92+
# TODO: remove in future release
93+
# Clean caching-related parameters from `storage_options`
94+
# before propagating them as `request_options` through `self.kwargs`.
95+
old_listings_cache_kwargs = {
96+
"use_listings_cache",
97+
"listings_expiry_time",
98+
"max_paths",
99+
"skip_instance_cache",
100+
}
101+
# intersection of old_listings_cache_kwargs and storage_options
102+
old_listings_cache_kwargs = old_listings_cache_kwargs.intersection(
103+
storage_options
104+
)
105+
if old_listings_cache_kwargs:
106+
logger.warning(
107+
f"The following parameters are not used anymore and will be ignored: {old_listings_cache_kwargs}. "
108+
f"Use new `listings_cache_options` instead."
109+
)
110+
for key in old_listings_cache_kwargs:
111+
del storage_options[key]
112+
super().__init__(
113+
self,
114+
asynchronous=asynchronous,
115+
loop=loop,
116+
listings_cache_options=listings_cache_options,
117+
**storage_options,
118+
)
91119
self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE
92120
self.simple_links = simple_links
93121
self.same_schema = same_scheme
@@ -96,19 +124,10 @@ def __init__(
96124
self.client_kwargs = client_kwargs or {}
97125
self.get_client = get_client
98126
self.encoded = encoded
99-
self.kwargs = storage_options
100-
self._session = None
101-
102-
# Clean caching-related parameters from `storage_options`
103-
# before propagating them as `request_options` through `self.kwargs`.
104127
# TODO: Maybe rename `self.kwargs` to `self.request_options` to make
105128
# it clearer.
106-
request_options = copy(storage_options)
107-
self.use_listings_cache = request_options.pop("use_listings_cache", False)
108-
request_options.pop("listings_expiry_time", None)
109-
request_options.pop("max_paths", None)
110-
request_options.pop("skip_instance_cache", None)
111-
self.kwargs = request_options
129+
self.kwargs = storage_options
130+
self._session = None
112131

113132
@property
114133
def fsid(self):
@@ -201,7 +220,7 @@ async def _ls_real(self, url, detail=True, **kwargs):
201220
return sorted(out)
202221

203222
async def _ls(self, url, detail=True, **kwargs):
204-
if self.use_listings_cache and url in self.dircache:
223+
if url in self.dircache:
205224
out = self.dircache[url]
206225
else:
207226
out = await self._ls_real(url, detail=detail, **kwargs)

fsspec/implementations/libarchive.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ def __init__(
115115
Kwargs passed when instantiating the target FS, if ``fo`` is
116116
a string.
117117
"""
118-
super().__init__(self, **kwargs)
118+
super().__init__(False, self, **kwargs)
119119
if mode != "r":
120120
raise ValueError("Only read from archive files accepted")
121121
if isinstance(fo, str):

0 commit comments

Comments
 (0)