I am done

2024-10-30 22:14:35 +01:00
parent 720dc28c09
commit 40e2a747cf
36901 changed files with 5011519 additions and 0 deletions


@@ -0,0 +1,69 @@
from importlib.metadata import entry_points
from . import caching
from ._version import __version__ # noqa: F401
from .callbacks import Callback
from .compression import available_compressions
from .core import get_fs_token_paths, open, open_files, open_local, url_to_fs
from .exceptions import FSTimeoutError
from .mapping import FSMap, get_mapper
from .registry import (
available_protocols,
filesystem,
get_filesystem_class,
register_implementation,
registry,
)
from .spec import AbstractFileSystem
__all__ = [
"AbstractFileSystem",
"FSTimeoutError",
"FSMap",
"filesystem",
"register_implementation",
"get_filesystem_class",
"get_fs_token_paths",
"get_mapper",
"open",
"open_files",
"open_local",
"registry",
"caching",
"Callback",
"available_protocols",
"available_compressions",
"url_to_fs",
]
def process_entries():
if entry_points is not None:
try:
eps = entry_points()
except TypeError:
pass # importlib-metadata < 0.8
else:
if hasattr(eps, "select"): # Python 3.10+ / importlib_metadata >= 3.9.0
specs = eps.select(group="fsspec.specs")
else:
specs = eps.get("fsspec.specs", [])
registered_names = {}
for spec in specs:
err_msg = f"Unable to load filesystem from {spec}"
name = spec.name
if name in registered_names:
continue
registered_names[name] = True
register_implementation(
name,
spec.value.replace(":", "."),
errtxt=err_msg,
# We take our implementations as the ones to overload with if
# for some reason we encounter some, may be the same, already
# registered
clobber=True,
)
process_entries()
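# Usage sketch: registering an implementation by hand, which is what
# process_entries() does for every "fsspec.specs" entry point it finds.
# The "myproto" protocol and "mypkg.myfs.MyFileSystem" path are hypothetical.
register_implementation(
    "myproto",
    "mypkg.myfs.MyFileSystem",
    errtxt="Install mypkg to use the myproto:// protocol",
    clobber=True,
)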


@@ -0,0 +1,16 @@
# file generated by setuptools_scm
# don't change, don't track in version control
TYPE_CHECKING = False
if TYPE_CHECKING:
from typing import Tuple, Union
VERSION_TUPLE = Tuple[Union[int, str], ...]
else:
VERSION_TUPLE = object
version: str
__version__: str
__version_tuple__: VERSION_TUPLE
version_tuple: VERSION_TUPLE
__version__ = version = '2024.10.0'
__version_tuple__ = version_tuple = (2024, 10, 0)


@@ -0,0 +1,73 @@
from fsspec import AbstractFileSystem
from fsspec.utils import tokenize
class AbstractArchiveFileSystem(AbstractFileSystem):
"""
A generic superclass for implementing Archive-based filesystems.
Currently, it is shared amongst
:class:`~fsspec.implementations.zip.ZipFileSystem`,
:class:`~fsspec.implementations.libarchive.LibArchiveFileSystem` and
:class:`~fsspec.implementations.tar.TarFileSystem`.
"""
def __str__(self):
return f"<Archive-like object {type(self).__name__} at {id(self)}>"
__repr__ = __str__
def ukey(self, path):
return tokenize(path, self.fo, self.protocol)
def _all_dirnames(self, paths):
"""Returns *all* directory names for each path in paths, including intermediate
ones.
Parameters
----------
paths: Iterable of path strings
"""
if len(paths) == 0:
return set()
dirnames = {self._parent(path) for path in paths} - {self.root_marker}
return dirnames | self._all_dirnames(dirnames)
def info(self, path, **kwargs):
self._get_dirs()
path = self._strip_protocol(path)
if path in {"", "/"} and self.dir_cache:
return {"name": "", "type": "directory", "size": 0}
if path in self.dir_cache:
return self.dir_cache[path]
elif path + "/" in self.dir_cache:
return self.dir_cache[path + "/"]
else:
raise FileNotFoundError(path)
def ls(self, path, detail=True, **kwargs):
self._get_dirs()
paths = {}
for p, f in self.dir_cache.items():
p = p.rstrip("/")
if "/" in p:
root = p.rsplit("/", 1)[0]
else:
root = ""
if root == path.rstrip("/"):
paths[p] = f
elif all(
(a == b)
for a, b in zip(path.split("/"), [""] + p.strip("/").split("/"))
):
# root directory entry
ppath = p.rstrip("/").split("/", 1)[0]
if ppath not in paths:
out = {"name": ppath, "size": 0, "type": "directory"}
paths[ppath] = out
if detail:
out = sorted(paths.values(), key=lambda _: _["name"])
return out
else:
return sorted(paths)
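# Usage sketch: the concrete subclasses named in the class docstring reuse the
# ls()/info() machinery above, e.g. for a zip archive. The file name
# "archive.zip" is an assumption for illustration only.
import fsspec

zfs = fsspec.filesystem("zip", fo="archive.zip")
print(zfs.ls("/", detail=False))  # entries synthesised from the archive's dir_cache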

File diff suppressed because it is too large


@@ -0,0 +1,951 @@
from __future__ import annotations
import collections
import functools
import logging
import math
import os
import threading
import warnings
from concurrent.futures import Future, ThreadPoolExecutor
from typing import (
TYPE_CHECKING,
Any,
Callable,
ClassVar,
Generic,
NamedTuple,
Optional,
OrderedDict,
TypeVar,
)
if TYPE_CHECKING:
import mmap
from typing_extensions import ParamSpec
P = ParamSpec("P")
else:
P = TypeVar("P")
T = TypeVar("T")
logger = logging.getLogger("fsspec")
Fetcher = Callable[[int, int], bytes] # Maps (start, end) to bytes
class BaseCache:
"""Pass-though cache: doesn't keep anything, calls every time
Acts as base class for other cachers
Parameters
----------
blocksize: int
How far to read ahead in numbers of bytes
fetcher: func
Function of the form f(start, end) which gets bytes from remote as
specified
size: int
How big this file is
"""
name: ClassVar[str] = "none"
def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None:
self.blocksize = blocksize
self.nblocks = 0
self.fetcher = fetcher
self.size = size
self.hit_count = 0
self.miss_count = 0
# the bytes that we actually requested
self.total_requested_bytes = 0
def _fetch(self, start: int | None, stop: int | None) -> bytes:
if start is None:
start = 0
if stop is None:
stop = self.size
if start >= self.size or start >= stop:
return b""
return self.fetcher(start, stop)
def _reset_stats(self) -> None:
"""Reset hit and miss counts for a more ganular report e.g. by file."""
self.hit_count = 0
self.miss_count = 0
self.total_requested_bytes = 0
def _log_stats(self) -> str:
"""Return a formatted string of the cache statistics."""
if self.hit_count == 0 and self.miss_count == 0:
# a cache that does nothing, this is for logs only
return ""
return " , %s: %d hits, %d misses, %d total requested bytes" % (
self.name,
self.hit_count,
self.miss_count,
self.total_requested_bytes,
)
def __repr__(self) -> str:
# TODO: use rich for better formatting
return f"""
<{self.__class__.__name__}:
block size : {self.blocksize}
block count : {self.nblocks}
file size : {self.size}
cache hits : {self.hit_count}
cache misses: {self.miss_count}
total requested bytes: {self.total_requested_bytes}>
"""
class MMapCache(BaseCache):
"""memory-mapped sparse file cache
Opens temporary file, which is filled blocks-wise when data is requested.
Ensure there is enough disc space in the temporary location.
This cache method might only work on posix
"""
name = "mmap"
def __init__(
self,
blocksize: int,
fetcher: Fetcher,
size: int,
location: str | None = None,
blocks: set[int] | None = None,
) -> None:
super().__init__(blocksize, fetcher, size)
self.blocks = set() if blocks is None else blocks
self.location = location
self.cache = self._makefile()
def _makefile(self) -> mmap.mmap | bytearray:
import mmap
import tempfile
if self.size == 0:
return bytearray()
# posix version
if self.location is None or not os.path.exists(self.location):
if self.location is None:
fd = tempfile.TemporaryFile()
self.blocks = set()
else:
fd = open(self.location, "wb+")
fd.seek(self.size - 1)
fd.write(b"1")
fd.flush()
else:
fd = open(self.location, "r+b")
return mmap.mmap(fd.fileno(), self.size)
def _fetch(self, start: int | None, end: int | None) -> bytes:
logger.debug(f"MMap cache fetching {start}-{end}")
if start is None:
start = 0
if end is None:
end = self.size
if start >= self.size or start >= end:
return b""
start_block = start // self.blocksize
end_block = end // self.blocksize
need = [i for i in range(start_block, end_block + 1) if i not in self.blocks]
hits = [i for i in range(start_block, end_block + 1) if i in self.blocks]
self.miss_count += len(need)
self.hit_count += len(hits)
while need:
# TODO: not a for loop so we can consolidate blocks later to
# make fewer fetch calls; this could be parallel
i = need.pop(0)
sstart = i * self.blocksize
send = min(sstart + self.blocksize, self.size)
self.total_requested_bytes += send - sstart
logger.debug(f"MMap get block #{i} ({sstart}-{send})")
self.cache[sstart:send] = self.fetcher(sstart, send)
self.blocks.add(i)
return self.cache[start:end]
def __getstate__(self) -> dict[str, Any]:
state = self.__dict__.copy()
# Remove the unpicklable entries.
del state["cache"]
return state
def __setstate__(self, state: dict[str, Any]) -> None:
# Restore instance attributes
self.__dict__.update(state)
self.cache = self._makefile()
class ReadAheadCache(BaseCache):
"""Cache which reads only when we get beyond a block of data
This is a much simpler version of BytesCache, and does not attempt to
fill holes in the cache or keep fragments alive. It is best suited to
many small reads in a sequential order (e.g., reading lines from a file).
"""
name = "readahead"
def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None:
super().__init__(blocksize, fetcher, size)
self.cache = b""
self.start = 0
self.end = 0
def _fetch(self, start: int | None, end: int | None) -> bytes:
if start is None:
start = 0
if end is None or end > self.size:
end = self.size
if start >= self.size or start >= end:
return b""
l = end - start
if start >= self.start and end <= self.end:
# cache hit
self.hit_count += 1
return self.cache[start - self.start : end - self.start]
elif self.start <= start < self.end:
# partial hit
self.miss_count += 1
part = self.cache[start - self.start :]
l -= len(part)
start = self.end
else:
# miss
self.miss_count += 1
part = b""
end = min(self.size, end + self.blocksize)
self.total_requested_bytes += end - start
self.cache = self.fetcher(start, end) # new block replaces old
self.start = start
self.end = self.start + len(self.cache)
return part + self.cache[:l]
class FirstChunkCache(BaseCache):
"""Caches the first block of a file only
This may be useful for file types where the metadata is stored in the header,
but is randomly accessed.
"""
name = "first"
def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None:
if blocksize > size:
# this will buffer the whole thing
blocksize = size
super().__init__(blocksize, fetcher, size)
self.cache: bytes | None = None
def _fetch(self, start: int | None, end: int | None) -> bytes:
start = start or 0
if start > self.size:
logger.debug("FirstChunkCache: requested start > file size")
return b""
end = min(end, self.size)
if start < self.blocksize:
if self.cache is None:
self.miss_count += 1
if end > self.blocksize:
self.total_requested_bytes += end
data = self.fetcher(0, end)
self.cache = data[: self.blocksize]
return data[start:]
self.cache = self.fetcher(0, self.blocksize)
self.total_requested_bytes += self.blocksize
part = self.cache[start:end]
if end > self.blocksize:
self.total_requested_bytes += end - self.blocksize
part += self.fetcher(self.blocksize, end)
self.hit_count += 1
return part
else:
self.miss_count += 1
self.total_requested_bytes += end - start
return self.fetcher(start, end)
class BlockCache(BaseCache):
"""
Cache holding memory as a set of blocks.
Requests are only ever made ``blocksize`` at a time, and are
stored in an LRU cache. The least recently accessed block is
discarded when more than ``maxblocks`` are stored.
Parameters
----------
blocksize : int
The number of bytes to store in each block.
Requests are only ever made for ``blocksize``, so this
should balance the overhead of making a request against
the granularity of the blocks.
fetcher : Callable
size : int
The total size of the file being cached.
maxblocks : int
The maximum number of blocks to cache for. The maximum memory
use for this cache is then ``blocksize * maxblocks``.
"""
name = "blockcache"
def __init__(
self, blocksize: int, fetcher: Fetcher, size: int, maxblocks: int = 32
) -> None:
super().__init__(blocksize, fetcher, size)
self.nblocks = math.ceil(size / blocksize)
self.maxblocks = maxblocks
self._fetch_block_cached = functools.lru_cache(maxblocks)(self._fetch_block)
def cache_info(self):
"""
The statistics on the block cache.
Returns
-------
NamedTuple
Returned directly from the LRU Cache used internally.
"""
return self._fetch_block_cached.cache_info()
def __getstate__(self) -> dict[str, Any]:
state = self.__dict__
del state["_fetch_block_cached"]
return state
def __setstate__(self, state: dict[str, Any]) -> None:
self.__dict__.update(state)
self._fetch_block_cached = functools.lru_cache(state["maxblocks"])(
self._fetch_block
)
def _fetch(self, start: int | None, end: int | None) -> bytes:
if start is None:
start = 0
if end is None:
end = self.size
if start >= self.size or start >= end:
return b""
# byte position -> block numbers
start_block_number = start // self.blocksize
end_block_number = end // self.blocksize
# these are cached, so safe to do multiple calls for the same start and end.
for block_number in range(start_block_number, end_block_number + 1):
self._fetch_block_cached(block_number)
return self._read_cache(
start,
end,
start_block_number=start_block_number,
end_block_number=end_block_number,
)
def _fetch_block(self, block_number: int) -> bytes:
"""
Fetch the block of data for `block_number`.
"""
if block_number > self.nblocks:
raise ValueError(
f"'block_number={block_number}' is greater than "
f"the number of blocks ({self.nblocks})"
)
start = block_number * self.blocksize
end = start + self.blocksize
self.total_requested_bytes += end - start
self.miss_count += 1
logger.info("BlockCache fetching block %d", block_number)
block_contents = super()._fetch(start, end)
return block_contents
def _read_cache(
self, start: int, end: int, start_block_number: int, end_block_number: int
) -> bytes:
"""
Read from our block cache.
Parameters
----------
start, end : int
The start and end byte positions.
start_block_number, end_block_number : int
The start and end block numbers.
"""
start_pos = start % self.blocksize
end_pos = end % self.blocksize
self.hit_count += 1
if start_block_number == end_block_number:
block: bytes = self._fetch_block_cached(start_block_number)
return block[start_pos:end_pos]
else:
# read from the initial
out = [self._fetch_block_cached(start_block_number)[start_pos:]]
# intermediate blocks
# Note: it'd be nice to combine these into one big request. However
# that doesn't play nicely with our LRU cache.
out.extend(
map(
self._fetch_block_cached,
range(start_block_number + 1, end_block_number),
)
)
# final block
out.append(self._fetch_block_cached(end_block_number)[:end_pos])
return b"".join(out)
class BytesCache(BaseCache):
"""Cache which holds data in a in-memory bytes object
Implements read-ahead by the block size, for semi-random reads progressing
through the file.
Parameters
----------
trim: bool
As we read more data, whether to discard the start of the buffer when
we are more than a blocksize ahead of it.
"""
name: ClassVar[str] = "bytes"
def __init__(
self, blocksize: int, fetcher: Fetcher, size: int, trim: bool = True
) -> None:
super().__init__(blocksize, fetcher, size)
self.cache = b""
self.start: int | None = None
self.end: int | None = None
self.trim = trim
def _fetch(self, start: int | None, end: int | None) -> bytes:
# TODO: only set start/end after fetch, in case it fails?
# is this where retry logic might go?
if start is None:
start = 0
if end is None:
end = self.size
if start >= self.size or start >= end:
return b""
if (
self.start is not None
and start >= self.start
and self.end is not None
and end < self.end
):
# cache hit: we have all the required data
offset = start - self.start
self.hit_count += 1
return self.cache[offset : offset + end - start]
if self.blocksize:
bend = min(self.size, end + self.blocksize)
else:
bend = end
if bend == start or start > self.size:
return b""
if (self.start is None or start < self.start) and (
self.end is None or end > self.end
):
# First read, or extending both before and after
self.total_requested_bytes += bend - start
self.miss_count += 1
self.cache = self.fetcher(start, bend)
self.start = start
else:
assert self.start is not None
assert self.end is not None
self.miss_count += 1
if start < self.start:
if self.end is None or self.end - end > self.blocksize:
self.total_requested_bytes += bend - start
self.cache = self.fetcher(start, bend)
self.start = start
else:
self.total_requested_bytes += self.start - start
new = self.fetcher(start, self.start)
self.start = start
self.cache = new + self.cache
elif self.end is not None and bend > self.end:
if self.end > self.size:
pass
elif end - self.end > self.blocksize:
self.total_requested_bytes += bend - start
self.cache = self.fetcher(start, bend)
self.start = start
else:
self.total_requested_bytes += bend - self.end
new = self.fetcher(self.end, bend)
self.cache = self.cache + new
self.end = self.start + len(self.cache)
offset = start - self.start
out = self.cache[offset : offset + end - start]
if self.trim:
num = (self.end - self.start) // (self.blocksize + 1)
if num > 1:
self.start += self.blocksize * num
self.cache = self.cache[self.blocksize * num :]
return out
def __len__(self) -> int:
return len(self.cache)
class AllBytes(BaseCache):
"""Cache entire contents of the file"""
name: ClassVar[str] = "all"
def __init__(
self,
blocksize: int | None = None,
fetcher: Fetcher | None = None,
size: int | None = None,
data: bytes | None = None,
) -> None:
super().__init__(blocksize, fetcher, size) # type: ignore[arg-type]
if data is None:
self.miss_count += 1
self.total_requested_bytes += self.size
data = self.fetcher(0, self.size)
self.data = data
def _fetch(self, start: int | None, stop: int | None) -> bytes:
self.hit_count += 1
return self.data[start:stop]
class KnownPartsOfAFile(BaseCache):
"""
Cache holding known file parts.
Parameters
----------
blocksize: int
How far to read ahead in numbers of bytes
fetcher: func
Function of the form f(start, end) which gets bytes from remote as
specified
size: int
How big this file is
data: dict
A dictionary mapping explicit `(start, stop)` file-offset tuples
to known bytes.
strict: bool, default True
Whether to fetch reads that go beyond a known byte-range boundary.
If `False`, any read that ends outside a known part will be zero
padded. Note that zero padding will not be used for reads that
begin outside a known byte-range.
"""
name: ClassVar[str] = "parts"
def __init__(
self,
blocksize: int,
fetcher: Fetcher,
size: int,
data: Optional[dict[tuple[int, int], bytes]] = None,
strict: bool = True,
**_: Any,
):
super().__init__(blocksize, fetcher, size)
self.strict = strict
# simple consolidation of contiguous blocks
if data:
old_offsets = sorted(data.keys())
offsets = [old_offsets[0]]
blocks = [data.pop(old_offsets[0])]
for start, stop in old_offsets[1:]:
start0, stop0 = offsets[-1]
if start == stop0:
offsets[-1] = (start0, stop)
blocks[-1] += data.pop((start, stop))
else:
offsets.append((start, stop))
blocks.append(data.pop((start, stop)))
self.data = dict(zip(offsets, blocks))
else:
self.data = {}
def _fetch(self, start: int | None, stop: int | None) -> bytes:
if start is None:
start = 0
if stop is None:
stop = self.size
out = b""
for (loc0, loc1), data in self.data.items():
# If self.strict=False, use zero-padded data
# for reads beyond the end of a "known" buffer
if loc0 <= start < loc1:
off = start - loc0
out = data[off : off + stop - start]
if not self.strict or loc0 <= stop <= loc1:
# The request is within a known range, or
# it begins within a known range, and we
# are allowed to pad reads beyond the
# buffer with zero
out += b"\x00" * (stop - start - len(out))
self.hit_count += 1
return out
else:
# The request ends outside a known range,
# and we are being "strict" about reads
# beyond the buffer
start = loc1
break
# We only get here if there is a request outside the
# known parts of the file. In an ideal world, this
# should never happen
if self.fetcher is None:
# We cannot fetch the data, so raise an error
raise ValueError(f"Read is outside the known file parts: {(start, stop)}. ")
# We can fetch the data, but should warn the user
# that this may be slow
warnings.warn(
f"Read is outside the known file parts: {(start, stop)}. "
f"IO/caching performance may be poor!"
)
logger.debug(f"KnownPartsOfAFile cache fetching {start}-{stop}")
self.total_requested_bytes += stop - start
self.miss_count += 1
return out + super()._fetch(start, stop)
class UpdatableLRU(Generic[P, T]):
"""
Custom implementation of LRU cache that allows updating keys
Used by BackgroundBlockCache
"""
class CacheInfo(NamedTuple):
hits: int
misses: int
maxsize: int
currsize: int
def __init__(self, func: Callable[P, T], max_size: int = 128) -> None:
self._cache: OrderedDict[Any, T] = collections.OrderedDict()
self._func = func
self._max_size = max_size
self._hits = 0
self._misses = 0
self._lock = threading.Lock()
def __call__(self, *args: P.args, **kwargs: P.kwargs) -> T:
if kwargs:
raise TypeError(f"Got unexpected keyword argument {kwargs.keys()}")
with self._lock:
if args in self._cache:
self._cache.move_to_end(args)
self._hits += 1
return self._cache[args]
result = self._func(*args, **kwargs)
with self._lock:
self._cache[args] = result
self._misses += 1
if len(self._cache) > self._max_size:
self._cache.popitem(last=False)
return result
def is_key_cached(self, *args: Any) -> bool:
with self._lock:
return args in self._cache
def add_key(self, result: T, *args: Any) -> None:
with self._lock:
self._cache[args] = result
if len(self._cache) > self._max_size:
self._cache.popitem(last=False)
def cache_info(self) -> UpdatableLRU.CacheInfo:
with self._lock:
return self.CacheInfo(
maxsize=self._max_size,
currsize=len(self._cache),
hits=self._hits,
misses=self._misses,
)
class BackgroundBlockCache(BaseCache):
"""
Cache holding memory as a set of blocks with pre-loading of
the next block in the background.
Requests are only ever made ``blocksize`` at a time, and are
stored in an LRU cache. The least recently accessed block is
discarded when more than ``maxblocks`` are stored. If the
next block is not in cache, it is loaded in a separate thread
in non-blocking way.
Parameters
----------
blocksize : int
The number of bytes to store in each block.
Requests are only ever made for ``blocksize``, so this
should balance the overhead of making a request against
the granularity of the blocks.
fetcher : Callable
size : int
The total size of the file being cached.
maxblocks : int
The maximum number of blocks to cache for. The maximum memory
use for this cache is then ``blocksize * maxblocks``.
"""
name: ClassVar[str] = "background"
def __init__(
self, blocksize: int, fetcher: Fetcher, size: int, maxblocks: int = 32
) -> None:
super().__init__(blocksize, fetcher, size)
self.nblocks = math.ceil(size / blocksize)
self.maxblocks = maxblocks
self._fetch_block_cached = UpdatableLRU(self._fetch_block, maxblocks)
self._thread_executor = ThreadPoolExecutor(max_workers=1)
self._fetch_future_block_number: int | None = None
self._fetch_future: Future[bytes] | None = None
self._fetch_future_lock = threading.Lock()
def cache_info(self) -> UpdatableLRU.CacheInfo:
"""
The statistics on the block cache.
Returns
-------
NamedTuple
Returned directly from the LRU Cache used internally.
"""
return self._fetch_block_cached.cache_info()
def __getstate__(self) -> dict[str, Any]:
state = self.__dict__
del state["_fetch_block_cached"]
del state["_thread_executor"]
del state["_fetch_future_block_number"]
del state["_fetch_future"]
del state["_fetch_future_lock"]
return state
def __setstate__(self, state) -> None:
self.__dict__.update(state)
self._fetch_block_cached = UpdatableLRU(self._fetch_block, state["maxblocks"])
self._thread_executor = ThreadPoolExecutor(max_workers=1)
self._fetch_future_block_number = None
self._fetch_future = None
self._fetch_future_lock = threading.Lock()
def _fetch(self, start: int | None, end: int | None) -> bytes:
if start is None:
start = 0
if end is None:
end = self.size
if start >= self.size or start >= end:
return b""
# byte position -> block numbers
start_block_number = start // self.blocksize
end_block_number = end // self.blocksize
fetch_future_block_number = None
fetch_future = None
with self._fetch_future_lock:
# Background thread is running. Check whether we can or must join it.
if self._fetch_future is not None:
assert self._fetch_future_block_number is not None
if self._fetch_future.done():
logger.info("BlockCache joined background fetch without waiting.")
self._fetch_block_cached.add_key(
self._fetch_future.result(), self._fetch_future_block_number
)
# Cleanup the fetch variables. Done with fetching the block.
self._fetch_future_block_number = None
self._fetch_future = None
else:
# Must join if we need the block for the current fetch
must_join = bool(
start_block_number
<= self._fetch_future_block_number
<= end_block_number
)
if must_join:
# Copy to the local variables to release lock
# before waiting for result
fetch_future_block_number = self._fetch_future_block_number
fetch_future = self._fetch_future
# Cleanup the fetch variables. Have a local copy.
self._fetch_future_block_number = None
self._fetch_future = None
# Need to wait for the future for the current read
if fetch_future is not None:
logger.info("BlockCache waiting for background fetch.")
# Wait until result and put it in cache
self._fetch_block_cached.add_key(
fetch_future.result(), fetch_future_block_number
)
# these are cached, so safe to do multiple calls for the same start and end.
for block_number in range(start_block_number, end_block_number + 1):
self._fetch_block_cached(block_number)
# fetch next block in the background if nothing is running in the background,
# the block is within the file and it is not already cached
end_block_plus_1 = end_block_number + 1
with self._fetch_future_lock:
if (
self._fetch_future is None
and end_block_plus_1 <= self.nblocks
and not self._fetch_block_cached.is_key_cached(end_block_plus_1)
):
self._fetch_future_block_number = end_block_plus_1
self._fetch_future = self._thread_executor.submit(
self._fetch_block, end_block_plus_1, "async"
)
return self._read_cache(
start,
end,
start_block_number=start_block_number,
end_block_number=end_block_number,
)
def _fetch_block(self, block_number: int, log_info: str = "sync") -> bytes:
"""
Fetch the block of data for `block_number`.
"""
if block_number > self.nblocks:
raise ValueError(
f"'block_number={block_number}' is greater than "
f"the number of blocks ({self.nblocks})"
)
start = block_number * self.blocksize
end = start + self.blocksize
logger.info("BlockCache fetching block (%s) %d", log_info, block_number)
self.total_requested_bytes += end - start
self.miss_count += 1
block_contents = super()._fetch(start, end)
return block_contents
def _read_cache(
self, start: int, end: int, start_block_number: int, end_block_number: int
) -> bytes:
"""
Read from our block cache.
Parameters
----------
start, end : int
The start and end byte positions.
start_block_number, end_block_number : int
The start and end block numbers.
"""
start_pos = start % self.blocksize
end_pos = end % self.blocksize
# kind of pointless to count this as a hit, but it is
self.hit_count += 1
if start_block_number == end_block_number:
block = self._fetch_block_cached(start_block_number)
return block[start_pos:end_pos]
else:
# read from the initial
out = [self._fetch_block_cached(start_block_number)[start_pos:]]
# intermediate blocks
# Note: it'd be nice to combine these into one big request. However
# that doesn't play nicely with our LRU cache.
out.extend(
map(
self._fetch_block_cached,
range(start_block_number + 1, end_block_number),
)
)
# final block
out.append(self._fetch_block_cached(end_block_number)[:end_pos])
return b"".join(out)
caches: dict[str | None, type[BaseCache]] = {
# one custom case
None: BaseCache,
}
def register_cache(cls: type[BaseCache], clobber: bool = False) -> None:
"""'Register' cache implementation.
Parameters
----------
clobber: bool, optional
If set to True (default is False) - allow to overwrite existing
entry.
Raises
------
ValueError
"""
name = cls.name
if not clobber and name in caches:
raise ValueError(f"Cache with name {name!r} is already known: {caches[name]}")
caches[name] = cls
for c in (
BaseCache,
MMapCache,
BytesCache,
ReadAheadCache,
BlockCache,
FirstChunkCache,
AllBytes,
KnownPartsOfAFile,
BackgroundBlockCache,
):
register_cache(c)
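# Example sketch: third-party code can add its own cache type in the same way.
# The "logged" name and LoggedCache class below are hypothetical.
class LoggedCache(BaseCache):
    name = "logged"

    def _fetch(self, start: int | None, end: int | None) -> bytes:
        # same pass-through behaviour as BaseCache, but record every request
        logger.debug("LoggedCache fetch %s-%s", start, end)
        return super()._fetch(start, end)


register_cache(LoggedCache)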


@@ -0,0 +1,324 @@
from functools import wraps
class Callback:
"""
Base class and interface for callback mechanism
This class can be used directly for monitoring file transfers by
providing ``callback=Callback(hooks=...)`` (see the ``hooks`` argument,
below), or subclassed for more specialised behaviour.
Parameters
----------
size: int (optional)
Nominal quantity for the value that corresponds to a complete
transfer, e.g., total number of tiles or total number of
bytes
value: int (0)
Starting internal counter value
hooks: dict or None
A dict of named functions to be called on each update. The signature
of these must be ``f(size, value, **kwargs)``
"""
def __init__(self, size=None, value=0, hooks=None, **kwargs):
self.size = size
self.value = value
self.hooks = hooks or {}
self.kw = kwargs
def __enter__(self):
return self
def __exit__(self, *exc_args):
self.close()
def close(self):
"""Close callback."""
def branched(self, path_1, path_2, **kwargs):
"""
Return callback for child transfers
Used when this callback is operating at a higher level, e.g., put, which may
trigger transfers that can also be monitored. The function returns a callback
that has to be passed to the child method, e.g., put_file,
as `callback=` argument.
The implementation uses `callback.branch` for compatibility.
When implementing callbacks, it is recommended to override this function instead
of `branch` and avoid calling `super().branched(...)`.
Prefer using this function over `branch`.
Parameters
----------
path_1: str
Child's source path
path_2: str
Child's destination path
**kwargs:
Arbitrary keyword arguments
Returns
-------
callback: Callback
A callback instance to be passed to the child method
"""
self.branch(path_1, path_2, kwargs)
# mutate kwargs so that we can force the caller to pass "callback=" explicitly
return kwargs.pop("callback", DEFAULT_CALLBACK)
def branch_coro(self, fn):
"""
Wraps a coroutine and passes a new child callback to it.
"""
@wraps(fn)
async def func(path1, path2: str, **kwargs):
with self.branched(path1, path2, **kwargs) as child:
return await fn(path1, path2, callback=child, **kwargs)
return func
def set_size(self, size):
"""
Set the internal maximum size attribute
Usually called if not initially set at instantiation. Note that this
triggers a ``call()``.
Parameters
----------
size: int
"""
self.size = size
self.call()
def absolute_update(self, value):
"""
Set the internal value state
Triggers ``call()``
Parameters
----------
value: int
"""
self.value = value
self.call()
def relative_update(self, inc=1):
"""
Delta increment the internal counter
Triggers ``call()``
Parameters
----------
inc: int
"""
self.value += inc
self.call()
def call(self, hook_name=None, **kwargs):
"""
Execute hook(s) with current state
Each function is passed the internal size and current value
Parameters
----------
hook_name: str or None
If given, execute on this hook
kwargs: passed on to (all) hook(s)
"""
if not self.hooks:
return
kw = self.kw.copy()
kw.update(kwargs)
if hook_name:
if hook_name not in self.hooks:
return
return self.hooks[hook_name](self.size, self.value, **kw)
for hook in self.hooks.values() or []:
hook(self.size, self.value, **kw)
def wrap(self, iterable):
"""
Wrap an iterable to call ``relative_update`` on each iteration
Parameters
----------
iterable: Iterable
The iterable that is being wrapped
"""
for item in iterable:
self.relative_update()
yield item
def branch(self, path_1, path_2, kwargs):
"""
Set callbacks for child transfers
Used when this callback is operating at a higher level, e.g., put, which may
trigger transfers that can also be monitored. The passed kwargs are
to be *mutated* to add ``callback=``, if this class supports branching
to children.
Parameters
----------
path_1: str
Child's source path
path_2: str
Child's destination path
kwargs: dict
arguments passed to child method, e.g., put_file.
Returns
-------
"""
return None
def no_op(self, *_, **__):
pass
def __getattr__(self, item):
"""
If undefined methods are called on this class, nothing happens
"""
return self.no_op
@classmethod
def as_callback(cls, maybe_callback=None):
"""Transform callback=... into Callback instance
For the special value of ``None``, return the global instance of
``NoOpCallback``. This is an alternative to including
``callback=DEFAULT_CALLBACK`` directly in a method signature.
"""
if maybe_callback is None:
return DEFAULT_CALLBACK
return maybe_callback
class NoOpCallback(Callback):
"""
This implementation of Callback does exactly nothing
"""
def call(self, *args, **kwargs):
return None
class DotPrinterCallback(Callback):
"""
Simple example Callback implementation
Almost identical to Callback with a hook that prints a char; here we
demonstrate how the outer layer may print "#" and the inner layer "."
"""
def __init__(self, chr_to_print="#", **kwargs):
self.chr = chr_to_print
super().__init__(**kwargs)
def branch(self, path_1, path_2, kwargs):
"""Mutate kwargs to add new instance with different print char"""
kwargs["callback"] = DotPrinterCallback(".")
def call(self, **kwargs):
"""Just outputs a character"""
print(self.chr, end="")
class TqdmCallback(Callback):
"""
A callback to display a progress bar using tqdm
Parameters
----------
tqdm_kwargs : dict, (optional)
Any argument accepted by the tqdm constructor.
See the `tqdm doc <https://tqdm.github.io/docs/tqdm/#__init__>`_.
Will be forwarded to `tqdm_cls`.
tqdm_cls: (optional)
subclass of `tqdm.tqdm`. If not passed, it will default to `tqdm.tqdm`.
Examples
--------
>>> import fsspec
>>> from fsspec.callbacks import TqdmCallback
>>> fs = fsspec.filesystem("memory")
>>> path2distant_data = "/your-path"
>>> fs.upload(
".",
path2distant_data,
recursive=True,
callback=TqdmCallback(),
)
You can forward args to tqdm using the ``tqdm_kwargs`` parameter.
>>> fs.upload(
".",
path2distant_data,
recursive=True,
callback=TqdmCallback(tqdm_kwargs={"desc": "Your tqdm description"}),
)
You can also customize the progress bar by passing a subclass of `tqdm`.
.. code-block:: python
class TqdmFormat(tqdm):
'''Provides a `total_time` format parameter'''
@property
def format_dict(self):
d = super().format_dict
total_time = d["elapsed"] * (d["total"] or 0) / max(d["n"], 1)
d.update(total_time=self.format_interval(total_time) + " in total")
return d
>>> with TqdmCallback(
tqdm_kwargs={
"desc": "desc",
"bar_format": "{total_time}: {percentage:.0f}%|{bar}{r_bar}",
},
tqdm_cls=TqdmFormat,
) as callback:
fs.upload(".", path2distant_data, recursive=True, callback=callback)
"""
def __init__(self, tqdm_kwargs=None, *args, **kwargs):
try:
from tqdm import tqdm
except ImportError as exce:
raise ImportError(
"Using TqdmCallback requires tqdm to be installed"
) from exce
self._tqdm_cls = kwargs.pop("tqdm_cls", tqdm)
self._tqdm_kwargs = tqdm_kwargs or {}
self.tqdm = None
super().__init__(*args, **kwargs)
def call(self, *args, **kwargs):
if self.tqdm is None:
self.tqdm = self._tqdm_cls(total=self.size, **self._tqdm_kwargs)
self.tqdm.total = self.size
self.tqdm.update(self.value - self.tqdm.n)
def close(self):
if self.tqdm is not None:
self.tqdm.close()
self.tqdm = None
def __del__(self):
return self.close()
DEFAULT_CALLBACK = _DEFAULT_CALLBACK = NoOpCallback()
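# Usage sketch: driving the hook mechanism directly. The hook name "progress"
# and the numbers below are arbitrary illustrations.
def _print_progress(size, value, **kwargs):
    print(f"{value}/{size}")

cb = Callback(size=3, hooks={"progress": _print_progress})
for _ in range(3):
    cb.relative_update()  # each update calls every registered hook with (size, value)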


@@ -0,0 +1,175 @@
"""Helper functions for a standard streaming compression API"""
from zipfile import ZipFile
import fsspec.utils
from fsspec.spec import AbstractBufferedFile
def noop_file(file, mode, **kwargs):
return file
# TODO: files should also be available as contexts
# should be functions of the form func(infile, mode=, **kwargs) -> file-like
compr = {None: noop_file}
def register_compression(name, callback, extensions, force=False):
"""Register an "inferable" file compression type.
Registers transparent file compression type for use with fsspec.open.
Compression can be specified by name in open, or "infer"-ed for any files
ending with the given extensions.
Args:
name: (str) The compression type name. Eg. "gzip".
callback: A callable of form (infile, mode, **kwargs) -> file-like.
Accepts an input file-like object, the target mode and kwargs.
Returns a wrapped file-like object.
extensions: (str, Iterable[str]) A file extension, or list of file
extensions for which to infer this compression scheme. Eg. "gz".
force: (bool) Force re-registration of compression type or extensions.
Raises:
ValueError: If name or extensions already registered, and not force.
"""
if isinstance(extensions, str):
extensions = [extensions]
# Validate registration
if name in compr and not force:
raise ValueError(f"Duplicate compression registration: {name}")
for ext in extensions:
if ext in fsspec.utils.compressions and not force:
raise ValueError(f"Duplicate compression file extension: {ext} ({name})")
compr[name] = callback
for ext in extensions:
fsspec.utils.compressions[ext] = name
def unzip(infile, mode="rb", filename=None, **kwargs):
if "r" not in mode:
filename = filename or "file"
z = ZipFile(infile, mode="w", **kwargs)
fo = z.open(filename, mode="w")
fo.close = lambda closer=fo.close: closer() or z.close()
return fo
z = ZipFile(infile)
if filename is None:
filename = z.namelist()[0]
return z.open(filename, mode="r", **kwargs)
register_compression("zip", unzip, "zip")
try:
from bz2 import BZ2File
except ImportError:
pass
else:
register_compression("bz2", BZ2File, "bz2")
try: # pragma: no cover
from isal import igzip
def isal(infile, mode="rb", **kwargs):
return igzip.IGzipFile(fileobj=infile, mode=mode, **kwargs)
register_compression("gzip", isal, "gz")
except ImportError:
from gzip import GzipFile
register_compression(
"gzip", lambda f, **kwargs: GzipFile(fileobj=f, **kwargs), "gz"
)
try:
from lzma import LZMAFile
register_compression("lzma", LZMAFile, "lzma")
register_compression("xz", LZMAFile, "xz")
except ImportError:
pass
try:
import lzmaffi
register_compression("lzma", lzmaffi.LZMAFile, "lzma", force=True)
register_compression("xz", lzmaffi.LZMAFile, "xz", force=True)
except ImportError:
pass
class SnappyFile(AbstractBufferedFile):
def __init__(self, infile, mode, **kwargs):
import snappy
super().__init__(
fs=None, path="snappy", mode=mode.strip("b") + "b", size=999999999, **kwargs
)
self.infile = infile
if "r" in mode:
self.codec = snappy.StreamDecompressor()
else:
self.codec = snappy.StreamCompressor()
def _upload_chunk(self, final=False):
self.buffer.seek(0)
out = self.codec.add_chunk(self.buffer.read())
self.infile.write(out)
return True
def seek(self, loc, whence=0):
raise NotImplementedError("SnappyFile is not seekable")
def seekable(self):
return False
def _fetch_range(self, start, end):
"""Get the specified set of bytes from remote"""
data = self.infile.read(end - start)
return self.codec.decompress(data)
try:
import snappy
snappy.compress(b"")
# Snappy may use the .sz file extension, but this is not part of the
# standard implementation.
register_compression("snappy", SnappyFile, [])
except (ImportError, NameError, AttributeError):
pass
try:
import lz4.frame
register_compression("lz4", lz4.frame.open, "lz4")
except ImportError:
pass
try:
import zstandard as zstd
def zstandard_file(infile, mode="rb"):
if "r" in mode:
cctx = zstd.ZstdDecompressor()
return cctx.stream_reader(infile)
else:
cctx = zstd.ZstdCompressor(level=10)
return cctx.stream_writer(infile)
register_compression("zstd", zstandard_file, "zst")
except ImportError:
pass
def available_compressions():
"""Return a list of the implemented compressions."""
return list(compr)
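# Example sketch: registering an extra codec name and extension, mirroring the
# optional registrations above. The "gzip-alt" name and ".gzip" extension are
# assumptions for illustration, not standard conventions.
from gzip import GzipFile

register_compression(
    "gzip-alt",
    lambda f, **kwargs: GzipFile(fileobj=f, **kwargs),
    "gzip",
)
assert "gzip-alt" in available_compressions()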


@@ -0,0 +1,131 @@
from __future__ import annotations
import configparser
import json
import os
import warnings
from typing import Any
conf: dict[str, dict[str, Any]] = {}
default_conf_dir = os.path.join(os.path.expanduser("~"), ".config/fsspec")
conf_dir = os.environ.get("FSSPEC_CONFIG_DIR", default_conf_dir)
def set_conf_env(conf_dict, envdict=os.environ):
"""Set config values from environment variables
Looks for variables of the form ``FSSPEC_<protocol>`` and
``FSSPEC_<protocol>_<kwarg>``. For ``FSSPEC_<protocol>`` the value is parsed
as a json dictionary and used to ``update`` the config of the
corresponding protocol. For ``FSSPEC_<protocol>_<kwarg>`` there is no
attempt to convert the string value, but the kwarg keys will be lower-cased.
The ``FSSPEC_<protocol>_<kwarg>`` variables are applied after the
``FSSPEC_<protocol>`` ones.
Parameters
----------
conf_dict : dict(str, dict)
This dict will be mutated
envdict : dict-like(str, str)
Source for the values - usually the real environment
"""
kwarg_keys = []
for key in envdict:
if key.startswith("FSSPEC_") and len(key) > 7 and key[7] != "_":
if key.count("_") > 1:
kwarg_keys.append(key)
continue
try:
value = json.loads(envdict[key])
except json.decoder.JSONDecodeError as ex:
warnings.warn(
f"Ignoring environment variable {key} due to a parse failure: {ex}"
)
else:
if isinstance(value, dict):
_, proto = key.split("_", 1)
conf_dict.setdefault(proto.lower(), {}).update(value)
else:
warnings.warn(
f"Ignoring environment variable {key} due to not being a dict:"
f" {type(value)}"
)
elif key.startswith("FSSPEC"):
warnings.warn(
f"Ignoring environment variable {key} due to having an unexpected name"
)
for key in kwarg_keys:
_, proto, kwarg = key.split("_", 2)
conf_dict.setdefault(proto.lower(), {})[kwarg.lower()] = envdict[key]
def set_conf_files(cdir, conf_dict):
"""Set config values from files
Scans for INI and JSON files in the given directory, and uses their
contents to set the config. In case of repeated values, later values
win.
In the case of INI files, all values are strings, and these will not
be converted.
Parameters
----------
cdir : str
Directory to search
conf_dict : dict(str, dict)
This dict will be mutated
"""
if not os.path.isdir(cdir):
return
allfiles = sorted(os.listdir(cdir))
for fn in allfiles:
if fn.endswith(".ini"):
ini = configparser.ConfigParser()
ini.read(os.path.join(cdir, fn))
for key in ini:
if key == "DEFAULT":
continue
conf_dict.setdefault(key, {}).update(dict(ini[key]))
if fn.endswith(".json"):
with open(os.path.join(cdir, fn)) as f:
js = json.load(f)
for key in js:
conf_dict.setdefault(key, {}).update(dict(js[key]))
def apply_config(cls, kwargs, conf_dict=None):
"""Supply default values for kwargs when instantiating class
Augments the passed kwargs, by finding entries in the config dict
which match the class's ``.protocol`` attribute (one or more str)
Parameters
----------
cls : file system implementation
kwargs : dict
conf_dict : dict of dict
Typically this is the global configuration
Returns
-------
dict : the modified set of kwargs
"""
if conf_dict is None:
conf_dict = conf
protos = cls.protocol if isinstance(cls.protocol, (tuple, list)) else [cls.protocol]
kw = {}
for proto in protos:
# default kwargs from the current state of the config
if proto in conf_dict:
kw.update(conf_dict[proto])
# explicit kwargs always win
kw.update(**kwargs)
kwargs = kw
return kwargs
set_conf_files(conf_dir, conf)
set_conf_env(conf)
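# Usage sketch: how the two environment-variable forms are interpreted. The
# "gcs" protocol and the values are illustrative only.
_example_env = {
    "FSSPEC_GCS": '{"project": "my-project"}',  # JSON dict form, parsed and merged
    "FSSPEC_GCS_TOKEN": "anon",                 # per-kwarg string form, applied second
}
_example_conf: dict[str, dict[str, Any]] = {}
set_conf_env(_example_conf, envdict=_example_env)
assert _example_conf == {"gcs": {"project": "my-project", "token": "anon"}}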


@@ -0,0 +1,55 @@
import os
import shutil
import subprocess
import sys
import time
import pytest
import fsspec
from fsspec.implementations.cached import CachingFileSystem
@pytest.fixture()
def m():
"""
Fixture providing a memory filesystem.
"""
m = fsspec.filesystem("memory")
m.store.clear()
m.pseudo_dirs.clear()
m.pseudo_dirs.append("")
try:
yield m
finally:
m.store.clear()
m.pseudo_dirs.clear()
m.pseudo_dirs.append("")
@pytest.fixture
def ftp_writable(tmpdir):
"""
Fixture providing a writable FTP filesystem.
"""
pytest.importorskip("pyftpdlib")
from fsspec.implementations.ftp import FTPFileSystem
FTPFileSystem.clear_instance_cache() # remove lingering connections
CachingFileSystem.clear_instance_cache()
d = str(tmpdir)
with open(os.path.join(d, "out"), "wb") as f:
f.write(b"hello" * 10000)
P = subprocess.Popen(
[sys.executable, "-m", "pyftpdlib", "-d", d, "-u", "user", "-P", "pass", "-w"]
)
try:
time.sleep(1)
yield "localhost", 2121, "user", "pass"
finally:
P.terminate()
P.wait()
try:
shutil.rmtree(tmpdir)
except Exception:
pass
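# Usage sketch: a test consuming the ``m`` fixture above (the test body is
# hypothetical, not part of the test suite).
def test_memory_roundtrip(m):
    m.pipe("/data.bin", b"hello")
    assert m.cat("/data.bin") == b"hello"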


@@ -0,0 +1,741 @@
from __future__ import annotations
import io
import logging
import os
import re
from glob import has_magic
from pathlib import Path
# for backwards compat, we export cache things from here too
from fsspec.caching import ( # noqa: F401
BaseCache,
BlockCache,
BytesCache,
MMapCache,
ReadAheadCache,
caches,
)
from fsspec.compression import compr
from fsspec.config import conf
from fsspec.registry import filesystem, get_filesystem_class
from fsspec.utils import (
_unstrip_protocol,
build_name_function,
infer_compression,
stringify_path,
)
logger = logging.getLogger("fsspec")
class OpenFile:
"""
File-like object to be used in a context
Can layer (buffered) text-mode and compression over any file-system, which
are typically binary-only.
These instances are safe to serialize, as the low-level file object
is not created until invoked using ``with``.
Parameters
----------
fs: FileSystem
The file system to use for opening the file. Should be a subclass or duck-type
with ``fsspec.spec.AbstractFileSystem``
path: str
Location to open
mode: str like 'rb', optional
Mode of the opened file
compression: str or None, optional
Compression to apply
encoding: str or None, optional
The encoding to use if opened in text mode.
errors: str or None, optional
How to handle encoding errors if opened in text mode.
newline: None or str
Passed to TextIOWrapper in text mode, how to handle line endings.
autoopen: bool
If True, calls open() immediately. Mostly used by pickle
pos: int
If given and autoopen is True, seek to this location immediately
"""
def __init__(
self,
fs,
path,
mode="rb",
compression=None,
encoding=None,
errors=None,
newline=None,
):
self.fs = fs
self.path = path
self.mode = mode
self.compression = get_compression(path, compression)
self.encoding = encoding
self.errors = errors
self.newline = newline
self.fobjects = []
def __reduce__(self):
return (
OpenFile,
(
self.fs,
self.path,
self.mode,
self.compression,
self.encoding,
self.errors,
self.newline,
),
)
def __repr__(self):
return f"<OpenFile '{self.path}'>"
def __enter__(self):
mode = self.mode.replace("t", "").replace("b", "") + "b"
try:
f = self.fs.open(self.path, mode=mode)
except FileNotFoundError as e:
if has_magic(self.path):
raise FileNotFoundError(
"%s not found. The URL contains glob characters: you maybe needed\n"
"to pass expand=True in fsspec.open() or the storage_options of \n"
"your library. You can also set the config value 'open_expand'\n"
"before import, or fsspec.core.DEFAULT_EXPAND at runtime, to True.",
self.path,
) from e
raise
self.fobjects = [f]
if self.compression is not None:
compress = compr[self.compression]
f = compress(f, mode=mode[0])
self.fobjects.append(f)
if "b" not in self.mode:
# assume, for example, that 'r' is equivalent to 'rt' as in builtin
f = PickleableTextIOWrapper(
f, encoding=self.encoding, errors=self.errors, newline=self.newline
)
self.fobjects.append(f)
return self.fobjects[-1]
def __exit__(self, *args):
self.close()
@property
def full_name(self):
return _unstrip_protocol(self.path, self.fs)
def open(self):
"""Materialise this as a real open file without context
The OpenFile object should be explicitly closed to avoid enclosed file
instances persisting. You must, therefore, keep a reference to the OpenFile
during the life of the file-like it generates.
"""
return self.__enter__()
def close(self):
"""Close all encapsulated file objects"""
for f in reversed(self.fobjects):
if "r" not in self.mode and not f.closed:
f.flush()
f.close()
self.fobjects.clear()
class OpenFiles(list):
"""List of OpenFile instances
Can be used in a single context, which opens and closes all of the
contained files. Normal list access to get the elements works as
normal.
A special case is made for caching filesystems - the files will
be down/uploaded together at the start or end of the context, and
this may happen concurrently, if the target filesystem supports it.
"""
def __init__(self, *args, mode="rb", fs=None):
self.mode = mode
self.fs = fs
self.files = []
super().__init__(*args)
def __enter__(self):
if self.fs is None:
raise ValueError("Context has already been used")
fs = self.fs
while True:
if hasattr(fs, "open_many"):
# check for concurrent cache download; or set up for upload
self.files = fs.open_many(self)
return self.files
if hasattr(fs, "fs") and fs.fs is not None:
fs = fs.fs
else:
break
return [s.__enter__() for s in self]
def __exit__(self, *args):
fs = self.fs
[s.__exit__(*args) for s in self]
if "r" not in self.mode:
while True:
if hasattr(fs, "open_many"):
# check for concurrent cache upload
fs.commit_many(self.files)
return
if hasattr(fs, "fs") and fs.fs is not None:
fs = fs.fs
else:
break
def __getitem__(self, item):
out = super().__getitem__(item)
if isinstance(item, slice):
return OpenFiles(out, mode=self.mode, fs=self.fs)
return out
def __repr__(self):
return f"<List of {len(self)} OpenFile instances>"
def open_files(
urlpath,
mode="rb",
compression=None,
encoding="utf8",
errors=None,
name_function=None,
num=1,
protocol=None,
newline=None,
auto_mkdir=True,
expand=True,
**kwargs,
):
"""Given a path or paths, return a list of ``OpenFile`` objects.
For writing, a str path must contain the "*" character, which will be filled
in by increasing numbers, e.g., "part*" -> "part1", "part2" if num=2.
For either reading or writing, can instead provide explicit list of paths.
Parameters
----------
urlpath: string or list
Absolute or relative filepath(s). Prefix with a protocol like ``s3://``
to read from alternative filesystems. To read from multiple files you
can pass a globstring or a list of paths, with the caveat that they
must all have the same protocol.
mode: 'rb', 'wt', etc.
compression: string or None
If given, open file using compression codec. Can either be a compression
name (a key in ``fsspec.compression.compr``) or "infer" to guess the
compression from the filename suffix.
encoding: str
For text mode only
errors: None or str
Passed to TextIOWrapper in text mode
name_function: function or None
if opening a set of files for writing, those files do not yet exist,
so we need to generate their names by formatting the urlpath for
each sequence number
num: int [1]
in writing mode, number of files we expect to create (passed to
name_function)
protocol: str or None
If given, overrides the protocol found in the URL.
newline: bytes or None
Used for line terminator in text mode. If None, uses system default;
if blank, uses no translation.
auto_mkdir: bool (True)
If in write mode, this will ensure the target directory exists before
writing, by calling ``fs.mkdirs(exist_ok=True)``.
expand: bool
**kwargs: dict
Extra options that make sense to a particular storage connection, e.g.
host, port, username, password, etc.
Examples
--------
>>> files = open_files('2015-*-*.csv') # doctest: +SKIP
>>> files = open_files(
... 's3://bucket/2015-*-*.csv.gz', compression='gzip'
... ) # doctest: +SKIP
Returns
-------
An ``OpenFiles`` instance, which is a list of ``OpenFile`` objects that can
be used as a single context
Notes
-----
For a full list of the available protocols and the implementations that
they map across to see the latest online documentation:
- For implementations built into ``fsspec`` see
https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations
- For implementations in separate packages see
https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations
"""
fs, fs_token, paths = get_fs_token_paths(
urlpath,
mode,
num=num,
name_function=name_function,
storage_options=kwargs,
protocol=protocol,
expand=expand,
)
if fs.protocol == "file":
fs.auto_mkdir = auto_mkdir
elif "r" not in mode and auto_mkdir:
parents = {fs._parent(path) for path in paths}
for parent in parents:
try:
fs.makedirs(parent, exist_ok=True)
except PermissionError:
pass
return OpenFiles(
[
OpenFile(
fs,
path,
mode=mode,
compression=compression,
encoding=encoding,
errors=errors,
newline=newline,
)
for path in paths
],
mode=mode,
fs=fs,
)
def _un_chain(path, kwargs):
x = re.compile(".*[^a-z]+.*") # test for non protocol-like single word
bits = (
[p if "://" in p or x.match(p) else p + "://" for p in path.split("::")]
if "::" in path
else [path]
)
# [[url, protocol, kwargs], ...]
out = []
previous_bit = None
kwargs = kwargs.copy()
for bit in reversed(bits):
protocol = kwargs.pop("protocol", None) or split_protocol(bit)[0] or "file"
cls = get_filesystem_class(protocol)
extra_kwargs = cls._get_kwargs_from_urls(bit)
kws = kwargs.pop(protocol, {})
if bit is bits[0]:
kws.update(kwargs)
kw = dict(
**{k: v for k, v in extra_kwargs.items() if k not in kws or v != kws[k]},
**kws,
)
bit = cls._strip_protocol(bit)
if (
protocol in {"blockcache", "filecache", "simplecache"}
and "target_protocol" not in kw
):
bit = previous_bit
out.append((bit, protocol, kw))
previous_bit = bit
out.reverse()
return out
def url_to_fs(url, **kwargs):
"""
Turn fully-qualified and potentially chained URL into filesystem instance
Parameters
----------
url : str
The fsspec-compatible URL
**kwargs: dict
Extra options that make sense to a particular storage connection, e.g.
host, port, username, password, etc.
Returns
-------
filesystem : FileSystem
The new filesystem discovered from ``url`` and created with
``**kwargs``.
urlpath : str
The file-systems-specific URL for ``url``.
"""
url = stringify_path(url)
# non-FS arguments that appear in fsspec.open()
# inspect could keep this in sync with open()'s signature
known_kwargs = {
"compression",
"encoding",
"errors",
"expand",
"mode",
"name_function",
"newline",
"num",
}
kwargs = {k: v for k, v in kwargs.items() if k not in known_kwargs}
chain = _un_chain(url, kwargs)
inkwargs = {}
# Reverse iterate the chain, creating a nested target_* structure
for i, ch in enumerate(reversed(chain)):
urls, protocol, kw = ch
if i == len(chain) - 1:
inkwargs = dict(**kw, **inkwargs)
continue
inkwargs["target_options"] = dict(**kw, **inkwargs)
inkwargs["target_protocol"] = protocol
inkwargs["fo"] = urls
urlpath, protocol, _ = chain[0]
fs = filesystem(protocol, **inkwargs)
return fs, urlpath
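# Usage sketch: resolving a chained URL. The bucket and key are made up, and
# the example assumes the s3fs package provides the "s3" protocol.
fs, path = url_to_fs("simplecache::s3://bucket/key.csv", s3={"anon": True})
# fs would be a caching filesystem wrapping S3; path would be "bucket/key.csv"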
DEFAULT_EXPAND = conf.get("open_expand", False)
def open(
urlpath,
mode="rb",
compression=None,
encoding="utf8",
errors=None,
protocol=None,
newline=None,
expand=None,
**kwargs,
):
"""Given a path or paths, return one ``OpenFile`` object.
Parameters
----------
urlpath: string or list
Absolute or relative filepath. Prefix with a protocol like ``s3://``
to read from alternative filesystems. Should not include glob
character(s).
mode: 'rb', 'wt', etc.
compression: string or None
If given, open file using compression codec. Can either be a compression
name (a key in ``fsspec.compression.compr``) or "infer" to guess the
compression from the filename suffix.
encoding: str
For text mode only
errors: None or str
Passed to TextIOWrapper in text mode
protocol: str or None
If given, overrides the protocol found in the URL.
newline: bytes or None
Used for line terminator in text mode. If None, uses system default;
if blank, uses no translation.
expand: bool or None
Whether to regard file paths containing special glob characters as needing
expansion (finding the first match) or absolute. Setting False allows using
paths which do embed such characters. If None (default), this argument
takes its value from the DEFAULT_EXPAND module variable, which takes
its initial value from the "open_expand" config value at startup, which will
be False if not set.
**kwargs: dict
Extra options that make sense to a particular storage connection, e.g.
host, port, username, password, etc.
Examples
--------
>>> openfile = open('2015-01-01.csv') # doctest: +SKIP
>>> openfile = open(
... 's3://bucket/2015-01-01.csv.gz', compression='gzip'
... ) # doctest: +SKIP
>>> with openfile as f:
... df = pd.read_csv(f) # doctest: +SKIP
...
Returns
-------
``OpenFile`` object.
Notes
-----
For a full list of the available protocols and the implementations that
they map across to see the latest online documentation:
- For implementations built into ``fsspec`` see
https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations
- For implementations in separate packages see
https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations
"""
expand = DEFAULT_EXPAND if expand is None else expand
out = open_files(
urlpath=[urlpath],
mode=mode,
compression=compression,
encoding=encoding,
errors=errors,
protocol=protocol,
newline=newline,
expand=expand,
**kwargs,
)
if not out:
raise FileNotFoundError(urlpath)
return out[0]
def open_local(
url: str | list[str] | Path | list[Path],
mode: str = "rb",
**storage_options: dict,
) -> str | list[str]:
"""Open file(s) which can be resolved to local
For files which either are local, or get downloaded upon open
(e.g., by file caching)
Parameters
----------
url: str or list(str)
mode: str
Must be read mode
storage_options:
passed on to the filesystem, or used by ``open_files`` (e.g., compression)
"""
if "r" not in mode:
raise ValueError("Can only ensure local files when reading")
of = open_files(url, mode=mode, **storage_options)
if not getattr(of[0].fs, "local_file", False):
raise ValueError(
"open_local can only be used on a filesystem which"
" has attribute local_file=True"
)
with of as files:
paths = [f.name for f in files]
if (isinstance(url, str) and not has_magic(url)) or isinstance(url, Path):
return paths[0]
return paths
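# Example (minimal sketch): combining ``open_local`` with a caching filesystem
# so that remote data is materialised as a real local path.  The URL is
# hypothetical.
#
# >>> local_path = open_local(
# ...     "simplecache::https://example.com/data.csv"
# ... )  # doctest: +SKIP
# >>> # ``local_path`` now refers to the cached copy on local disk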
def get_compression(urlpath, compression):
if compression == "infer":
compression = infer_compression(urlpath)
if compression is not None and compression not in compr:
raise ValueError(f"Compression type {compression} not supported")
return compression
def split_protocol(urlpath):
"""Return protocol, path pair"""
urlpath = stringify_path(urlpath)
if "://" in urlpath:
protocol, path = urlpath.split("://", 1)
if len(protocol) > 1:
# excludes Windows paths
return protocol, path
if urlpath.startswith("data:"):
return urlpath.split(":", 1)
return None, urlpath
def strip_protocol(urlpath):
"""Return only path part of full URL, according to appropriate backend"""
protocol, _ = split_protocol(urlpath)
cls = get_filesystem_class(protocol)
return cls._strip_protocol(urlpath)
def expand_paths_if_needed(paths, mode, num, fs, name_function):
"""Expand paths if they have a ``*`` in them (write mode) or any of ``*?[]``
in them (read mode).
:param paths: list of paths
mode: str
Mode in which to open files.
num: int
If opening in writing mode, number of files we expect to create.
fs: filesystem object
name_function: callable
If opening in writing mode, this callable is used to generate path
names. Names are generated for each partition by
``urlpath.replace('*', name_function(partition_index))``.
:return: list of paths
"""
expanded_paths = []
paths = list(paths)
if "w" in mode: # read mode
if sum(1 for p in paths if "*" in p) > 1:
raise ValueError(
"When writing data, only one filename mask can be specified."
)
num = max(num, len(paths))
for curr_path in paths:
if "*" in curr_path:
# expand using name_function
expanded_paths.extend(_expand_paths(curr_path, name_function, num))
else:
expanded_paths.append(curr_path)
# if we generated more paths than asked for, trim the list
if len(expanded_paths) > num:
expanded_paths = expanded_paths[:num]
else: # read mode
for curr_path in paths:
if has_magic(curr_path):
# expand using glob
expanded_paths.extend(fs.glob(curr_path))
else:
expanded_paths.append(curr_path)
return expanded_paths
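# Example (minimal sketch of the write-mode branch above): a single "*" mask is
# expanded with ``name_function``, here simply the built-in ``str``.
#
# >>> expand_paths_if_needed(
# ...     ["out.*.csv"], mode="wb", num=3, fs=None, name_function=str
# ... )
# ['out.0.csv', 'out.1.csv', 'out.2.csv']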
def get_fs_token_paths(
urlpath,
mode="rb",
num=1,
name_function=None,
storage_options=None,
protocol=None,
expand=True,
):
"""Filesystem, deterministic token, and paths from a urlpath and options.
Parameters
----------
urlpath: string or iterable
Absolute or relative filepath, URL (may include protocols like
``s3://``), or globstring pointing to data.
mode: str, optional
Mode in which to open files.
num: int, optional
If opening in writing mode, number of files we expect to create.
name_function: callable, optional
If opening in writing mode, this callable is used to generate path
names. Names are generated for each partition by
``urlpath.replace('*', name_function(partition_index))``.
storage_options: dict, optional
Additional keywords to pass to the filesystem class.
protocol: str or None
To override the protocol specifier in the URL
expand: bool
Expand string paths for writing, assuming the path is a directory
"""
if isinstance(urlpath, (list, tuple, set)):
if not urlpath:
raise ValueError("empty urlpath sequence")
urlpath0 = stringify_path(next(iter(urlpath)))
else:
urlpath0 = stringify_path(urlpath)
storage_options = storage_options or {}
if protocol:
storage_options["protocol"] = protocol
chain = _un_chain(urlpath0, storage_options or {})
inkwargs = {}
# Reverse iterate the chain, creating a nested target_* structure
for i, ch in enumerate(reversed(chain)):
urls, nested_protocol, kw = ch
if i == len(chain) - 1:
inkwargs = dict(**kw, **inkwargs)
continue
inkwargs["target_options"] = dict(**kw, **inkwargs)
inkwargs["target_protocol"] = nested_protocol
inkwargs["fo"] = urls
paths, protocol, _ = chain[0]
fs = filesystem(protocol, **inkwargs)
if isinstance(urlpath, (list, tuple, set)):
pchains = [
_un_chain(stringify_path(u), storage_options or {})[0] for u in urlpath
]
if len({pc[1] for pc in pchains}) > 1:
raise ValueError("Protocol mismatch getting fs from %s", urlpath)
paths = [pc[0] for pc in pchains]
else:
paths = fs._strip_protocol(paths)
if isinstance(paths, (list, tuple, set)):
if expand:
paths = expand_paths_if_needed(paths, mode, num, fs, name_function)
elif not isinstance(paths, list):
paths = list(paths)
else:
if "w" in mode and expand:
paths = _expand_paths(paths, name_function, num)
elif "x" in mode and expand:
paths = _expand_paths(paths, name_function, num)
elif "*" in paths:
paths = [f for f in sorted(fs.glob(paths)) if not fs.isdir(f)]
else:
paths = [paths]
return fs, fs._fs_token, paths
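# Example (minimal sketch): ``get_fs_token_paths`` is what downstream libraries
# use to turn a URL, glob or write template into (filesystem, token, paths).
# The locations below are hypothetical.
#
# >>> fs, token, paths = get_fs_token_paths("memory://data/*.csv")  # doctest: +SKIP
# >>> fs, token, paths = get_fs_token_paths(
# ...     "memory://out/part-*.csv", mode="wb", num=4
# ... )  # doctest: +SKIP  # expands "*" into four partition names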
def _expand_paths(path, name_function, num):
if isinstance(path, str):
if path.count("*") > 1:
raise ValueError("Output path spec must contain exactly one '*'.")
elif "*" not in path:
path = os.path.join(path, "*.part")
if name_function is None:
name_function = build_name_function(num - 1)
paths = [path.replace("*", name_function(i)) for i in range(num)]
if paths != sorted(paths):
logger.warning(
"In order to preserve order between partitions"
" paths created with ``name_function`` should "
"sort to partition order"
)
elif isinstance(path, (tuple, list)):
assert len(path) == num
paths = list(path)
else:
raise ValueError(
"Path should be either\n"
"1. A list of paths: ['foo.json', 'bar.json', ...]\n"
"2. A directory: 'foo/\n"
"3. A path with a '*' in it: 'foo.*.json'"
)
return paths
class PickleableTextIOWrapper(io.TextIOWrapper):
"""TextIOWrapper cannot be pickled. This solves it.
Requires that ``buffer`` be pickleable, which all instances of
AbstractBufferedFile are.
"""
def __init__(
self,
buffer,
encoding=None,
errors=None,
newline=None,
line_buffering=False,
write_through=False,
):
self.args = buffer, encoding, errors, newline, line_buffering, write_through
super().__init__(*self.args)
def __reduce__(self):
return PickleableTextIOWrapper, self.args

View File

@ -0,0 +1,98 @@
import time
from collections.abc import MutableMapping
from functools import lru_cache
class DirCache(MutableMapping):
"""
Caching of directory listings, in a structure like::
{"path0": [
{"name": "path0/file0",
"size": 123,
"type": "file",
...
},
{"name": "path0/file1",
},
...
],
"path1": [...]
}
Parameters to this class control listing expiry or indeed turn
caching off
"""
def __init__(
self,
use_listings_cache=True,
listings_expiry_time=None,
max_paths=None,
**kwargs,
):
"""
Parameters
----------
use_listings_cache: bool
If False, this cache never returns items, but always reports KeyError,
and setting items has no effect
listings_expiry_time: int or float (optional)
Time in seconds that a listing is considered valid. If None,
listings do not expire.
max_paths: int (optional)
The number of most recent listings that are considered valid; 'recent'
refers to when the entry was set.
"""
self._cache = {}
self._times = {}
if max_paths:
self._q = lru_cache(max_paths + 1)(lambda key: self._cache.pop(key, None))
self.use_listings_cache = use_listings_cache
self.listings_expiry_time = listings_expiry_time
self.max_paths = max_paths
def __getitem__(self, item):
if self.listings_expiry_time is not None:
if self._times.get(item, 0) - time.time() < -self.listings_expiry_time:
del self._cache[item]
if self.max_paths:
self._q(item)
return self._cache[item] # maybe raises KeyError
def clear(self):
self._cache.clear()
def __len__(self):
return len(self._cache)
def __contains__(self, item):
try:
self[item]
return True
except KeyError:
return False
def __setitem__(self, key, value):
if not self.use_listings_cache:
return
if self.max_paths:
self._q(key)
self._cache[key] = value
if self.listings_expiry_time is not None:
self._times[key] = time.time()
def __delitem__(self, key):
del self._cache[key]
def __iter__(self):
entries = list(self._cache)
return (k for k in entries if k in self)
def __reduce__(self):
return (
DirCache,
(self.use_listings_cache, self.listings_expiry_time, self.max_paths),
)
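# Example (minimal sketch): entries silently disappear once
# ``listings_expiry_time`` has elapsed, and with ``max_paths`` set only the
# most recently stored listings keep being returned.
#
# >>> cache = DirCache(listings_expiry_time=1)
# >>> cache["/data"] = [{"name": "/data/a", "size": 1, "type": "file"}]
# >>> "/data" in cache      # True straight away
# >>> time.sleep(1.5)
# >>> "/data" in cache      # False: the listing has expired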

View File

@ -0,0 +1,18 @@
"""
fsspec user-defined exception classes
"""
import asyncio
class BlocksizeMismatchError(ValueError):
"""
Raised when a cached file is opened with a different blocksize than it was
written with
"""
class FSTimeoutError(asyncio.TimeoutError):
"""
Raised when an fsspec operation times out
"""

View File

@ -0,0 +1,324 @@
import argparse
import logging
import os
import stat
import threading
import time
from errno import EIO, ENOENT
from fuse import FUSE, FuseOSError, LoggingMixIn, Operations
from fsspec import __version__
from fsspec.core import url_to_fs
logger = logging.getLogger("fsspec.fuse")
class FUSEr(Operations):
def __init__(self, fs, path, ready_file=False):
self.fs = fs
self.cache = {}
self.root = path.rstrip("/") + "/"
self.counter = 0
logger.info("Starting FUSE at %s", path)
self._ready_file = ready_file
def getattr(self, path, fh=None):
logger.debug("getattr %s", path)
if self._ready_file and path in ["/.fuse_ready", ".fuse_ready"]:
return {"type": "file", "st_size": 5}
path = "".join([self.root, path.lstrip("/")]).rstrip("/")
try:
info = self.fs.info(path)
except FileNotFoundError as exc:
raise FuseOSError(ENOENT) from exc
data = {"st_uid": info.get("uid", 1000), "st_gid": info.get("gid", 1000)}
perm = info.get("mode", 0o777)
if info["type"] != "file":
data["st_mode"] = stat.S_IFDIR | perm
data["st_size"] = 0
data["st_blksize"] = 0
else:
data["st_mode"] = stat.S_IFREG | perm
data["st_size"] = info["size"]
data["st_blksize"] = 5 * 2**20
data["st_nlink"] = 1
data["st_atime"] = info["atime"] if "atime" in info else time.time()
data["st_ctime"] = info["ctime"] if "ctime" in info else time.time()
data["st_mtime"] = info["mtime"] if "mtime" in info else time.time()
return data
def readdir(self, path, fh):
logger.debug("readdir %s", path)
path = "".join([self.root, path.lstrip("/")])
files = self.fs.ls(path, False)
files = [os.path.basename(f.rstrip("/")) for f in files]
return [".", ".."] + files
def mkdir(self, path, mode):
path = "".join([self.root, path.lstrip("/")])
self.fs.mkdir(path)
return 0
def rmdir(self, path):
path = "".join([self.root, path.lstrip("/")])
self.fs.rmdir(path)
return 0
def read(self, path, size, offset, fh):
logger.debug("read %s", (path, size, offset))
if self._ready_file and path in ["/.fuse_ready", ".fuse_ready"]:
# status indicator
return b"ready"
f = self.cache[fh]
f.seek(offset)
out = f.read(size)
return out
def write(self, path, data, offset, fh):
logger.debug("write %s", (path, offset))
f = self.cache[fh]
f.seek(offset)
f.write(data)
return len(data)
def create(self, path, flags, fi=None):
logger.debug("create %s", (path, flags))
fn = "".join([self.root, path.lstrip("/")])
self.fs.touch(fn) # OS will want to get attributes immediately
f = self.fs.open(fn, "wb")
self.cache[self.counter] = f
self.counter += 1
return self.counter - 1
def open(self, path, flags):
logger.debug("open %s", (path, flags))
fn = "".join([self.root, path.lstrip("/")])
if flags % 2 == 0:
# read
mode = "rb"
else:
# write/create
mode = "wb"
self.cache[self.counter] = self.fs.open(fn, mode)
self.counter += 1
return self.counter - 1
def truncate(self, path, length, fh=None):
fn = "".join([self.root, path.lstrip("/")])
if length != 0:
raise NotImplementedError
# maybe should be no-op since open with write sets size to zero anyway
self.fs.touch(fn)
def unlink(self, path):
fn = "".join([self.root, path.lstrip("/")])
try:
self.fs.rm(fn, False)
except (OSError, FileNotFoundError) as exc:
raise FuseOSError(EIO) from exc
def release(self, path, fh):
try:
if fh in self.cache:
f = self.cache[fh]
f.close()
self.cache.pop(fh)
except Exception as e:
print(e)
return 0
def chmod(self, path, mode):
if hasattr(self.fs, "chmod"):
path = "".join([self.root, path.lstrip("/")])
return self.fs.chmod(path, mode)
raise NotImplementedError
def run(
fs,
path,
mount_point,
foreground=True,
threads=False,
ready_file=False,
ops_class=FUSEr,
):
"""Mount stuff in a local directory
This uses fusepy to make it appear as if a given path on an fsspec
instance is in fact resident within the local file-system.
This requires that fusepy be installed, and that FUSE be available on
the system (typically requiring a package to be installed with
apt, yum, brew, etc.).
Parameters
----------
fs: file-system instance
From one of the compatible implementations
path: str
Location on that file-system to regard as the root directory to
mount. Note that you typically should include the terminating "/"
character.
mount_point: str
An empty directory on the local file-system where the contents of
the remote path will appear.
foreground: bool
Whether or not calling this function will block. Operation will
typically be more stable if True.
threads: bool
Whether or not to create threads when responding to file operations
within the mounted directory. Operation will typically be more
stable if False.
ready_file: bool
If True, the ``.fuse_ready`` file will be created in the ``mount_point``
directory once the FUSE process is ready (intended for debugging).
ops_class: FUSEr or Subclass of FUSEr
To override the default behavior of FUSEr. For example, logging
to file.
"""
func = lambda: FUSE(
ops_class(fs, path, ready_file=ready_file),
mount_point,
nothreads=not threads,
foreground=foreground,
)
if not foreground:
th = threading.Thread(target=func)
th.daemon = True
th.start()
return th
else: # pragma: no cover
try:
func()
except KeyboardInterrupt:
pass
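# Example (minimal sketch): mount an in-memory filesystem under a local
# directory.  Requires fusepy and a working FUSE install; the mount point is
# hypothetical.
#
# >>> import fsspec
# >>> fs = fsspec.filesystem("memory")
# >>> fs.pipe("/demo/hello.txt", b"hello")                    # doctest: +SKIP
# >>> run(fs, "/demo/", "/mnt/fuse_demo", foreground=False)   # doctest: +SKIP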
def main(args):
"""Mount filesystem from chained URL to MOUNT_POINT.
Examples:
python3 -m fsspec.fuse memory /usr/share /tmp/mem
python3 -m fsspec.fuse local /tmp/source /tmp/local \\
-l /tmp/fsspecfuse.log
You can also mount chained-URLs and use special settings:
python3 -m fsspec.fuse 'filecache::zip::file://data.zip' \\
/ /tmp/zip \\
-o 'filecache-cache_storage=/tmp/simplecache'
You can specify the type of the setting by using `[int]` or `[bool]`,
(`true`, `yes`, `1` represents the Boolean value `True`):
python3 -m fsspec.fuse 'simplecache::ftp://ftp1.at.proftpd.org' \\
/historic/packages/RPMS /tmp/ftp \\
-o 'simplecache-cache_storage=/tmp/simplecache' \\
-o 'simplecache-check_files=false[bool]' \\
-o 'ftp-listings_expiry_time=60[int]' \\
-o 'ftp-username=anonymous' \\
-o 'ftp-password=xieyanbo'
"""
class RawDescriptionArgumentParser(argparse.ArgumentParser):
def format_help(self):
usage = super().format_help()
parts = usage.split("\n\n")
parts[1] = self.description.rstrip()
return "\n\n".join(parts)
parser = RawDescriptionArgumentParser(prog="fsspec.fuse", description=main.__doc__)
parser.add_argument("--version", action="version", version=__version__)
parser.add_argument("url", type=str, help="fs url")
parser.add_argument("source_path", type=str, help="source directory in fs")
parser.add_argument("mount_point", type=str, help="local directory")
parser.add_argument(
"-o",
"--option",
action="append",
help="Any options of protocol included in the chained URL",
)
parser.add_argument(
"-l", "--log-file", type=str, help="Logging FUSE debug info (Default: '')"
)
parser.add_argument(
"-f",
"--foreground",
action="store_false",
help="Running in foreground or not (Default: False)",
)
parser.add_argument(
"-t",
"--threads",
action="store_false",
help="Running with threads support (Default: False)",
)
parser.add_argument(
"-r",
"--ready-file",
action="store_false",
help="The `.fuse_ready` file will exist after FUSE is ready. "
"(Debugging purpose, Default: False)",
)
args = parser.parse_args(args)
kwargs = {}
for item in args.option or []:
key, sep, value = item.partition("=")
if not sep:
parser.error(message=f"Wrong option: {item!r}")
val = value.lower()
if val.endswith("[int]"):
value = int(value[: -len("[int]")])
elif val.endswith("[bool]"):
value = val[: -len("[bool]")] in ["1", "yes", "true"]
if "-" in key:
fs_name, setting_name = key.split("-", 1)
if fs_name in kwargs:
kwargs[fs_name][setting_name] = value
else:
kwargs[fs_name] = {setting_name: value}
else:
kwargs[key] = value
if args.log_file:
logging.basicConfig(
level=logging.DEBUG,
filename=args.log_file,
format="%(asctime)s %(message)s",
)
class LoggingFUSEr(FUSEr, LoggingMixIn):
pass
fuser = LoggingFUSEr
else:
fuser = FUSEr
fs, url_path = url_to_fs(args.url, **kwargs)
logger.debug("Mounting %s to %s", url_path, str(args.mount_point))
run(
fs,
args.source_path,
args.mount_point,
foreground=args.foreground,
threads=args.threads,
ready_file=args.ready_file,
ops_class=fuser,
)
if __name__ == "__main__":
import sys
main(sys.argv[1:])

View File

@ -0,0 +1,411 @@
from __future__ import annotations
import inspect
import logging
import os
import shutil
import uuid
from typing import Optional
from .asyn import AsyncFileSystem, _run_coros_in_chunks, sync_wrapper
from .callbacks import DEFAULT_CALLBACK
from .core import filesystem, get_filesystem_class, split_protocol, url_to_fs
_generic_fs = {}
logger = logging.getLogger("fsspec.generic")
def set_generic_fs(protocol, **storage_options):
_generic_fs[protocol] = filesystem(protocol, **storage_options)
default_method = "default"
def _resolve_fs(url, method=None, protocol=None, storage_options=None):
"""Pick instance of backend FS"""
method = method or default_method
protocol = protocol or split_protocol(url)[0]
storage_options = storage_options or {}
if method == "default":
return filesystem(protocol)
if method == "generic":
return _generic_fs[protocol]
if method == "current":
cls = get_filesystem_class(protocol)
return cls.current()
if method == "options":
fs, _ = url_to_fs(url, **storage_options.get(protocol, {}))
return fs
raise ValueError(f"Unknown FS resolution method: {method}")
def rsync(
source,
destination,
delete_missing=False,
source_field="size",
dest_field="size",
update_cond="different",
inst_kwargs=None,
fs=None,
**kwargs,
):
"""Sync files between two directory trees
(experimental)
Parameters
----------
source: str
Root of the directory tree to take files from. This must be a directory, but
do not include any terminating "/" character
destination: str
Root path to copy into. The contents of this location should be
identical to the contents of ``source`` when done. This will be made a
directory, and the terminal "/" should not be included.
delete_missing: bool
If there are paths in the destination that don't exist in the
source and this is True, delete them. Otherwise, leave them alone.
source_field: str | callable
If ``update_field`` is "different", this is the key in the info
of source files to consider for difference. Maybe a function of the
info dict.
dest_field: str | callable
If ``update_field`` is "different", this is the key in the info
of destination files to consider for difference. May be a function of
the info dict.
update_cond: "different"|"always"|"never"
If "always", every file is copied, regardless of whether it exists in
the destination. If "never", files that exist in the destination are
not copied again. If "different" (default), only copy if the info
fields given by ``source_field`` and ``dest_field`` (usually "size")
are different. Other comparisons may be added in the future.
inst_kwargs: dict|None
If ``fs`` is None, use this set of keyword arguments to make a
GenericFileSystem instance
fs: GenericFileSystem|None
Instance to use if explicitly given. The instance defines how to
make downstream file system instances from paths.
Returns
-------
dict of the copy operations that were performed, {source: destination}
"""
fs = fs or GenericFileSystem(**(inst_kwargs or {}))
source = fs._strip_protocol(source)
destination = fs._strip_protocol(destination)
allfiles = fs.find(source, withdirs=True, detail=True)
if not fs.isdir(source):
raise ValueError("Can only rsync on a directory")
otherfiles = fs.find(destination, withdirs=True, detail=True)
dirs = [
a
for a, v in allfiles.items()
if v["type"] == "directory" and a.replace(source, destination) not in otherfiles
]
logger.debug(f"{len(dirs)} directories to create")
if dirs:
fs.make_many_dirs(
[dirn.replace(source, destination) for dirn in dirs], exist_ok=True
)
allfiles = {a: v for a, v in allfiles.items() if v["type"] == "file"}
logger.debug(f"{len(allfiles)} files to consider for copy")
to_delete = [
o
for o, v in otherfiles.items()
if o.replace(destination, source) not in allfiles and v["type"] == "file"
]
for k, v in allfiles.copy().items():
otherfile = k.replace(source, destination)
if otherfile in otherfiles:
if update_cond == "always":
allfiles[k] = otherfile
elif update_cond == "different":
inf1 = source_field(v) if callable(source_field) else v[source_field]
v2 = otherfiles[otherfile]
inf2 = dest_field(v2) if callable(dest_field) else v2[dest_field]
if inf1 != inf2:
# details mismatch, make copy
allfiles[k] = otherfile
else:
# details match, don't copy
allfiles.pop(k)
else:
# file not in target yet
allfiles[k] = otherfile
logger.debug(f"{len(allfiles)} files to copy")
if allfiles:
source_files, target_files = zip(*allfiles.items())
fs.cp(source_files, target_files, **kwargs)
logger.debug(f"{len(to_delete)} files to delete")
if delete_missing and to_delete:
fs.rm(to_delete)
return allfiles
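# Example (minimal sketch): mirror a local tree into a memory filesystem; on a
# second run only files whose "size" differs are copied.  Paths are
# hypothetical.
#
# >>> rsync("file:///tmp/source", "memory://backup")                       # doctest: +SKIP
# >>> rsync("file:///tmp/source", "memory://backup", delete_missing=True)  # doctest: +SKIP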
class GenericFileSystem(AsyncFileSystem):
"""Wrapper over all other FS types
<experimental!>
This implementation is a single unified interface to be able to run FS operations
over generic URLs, and dispatch to the specific implementations using the URL
protocol prefix.
Note: instances of this FS are always async, even if you never use it with any async
backend.
"""
protocol = "generic" # there is no real reason to ever use a protocol with this FS
def __init__(self, default_method="default", **kwargs):
"""
Parameters
----------
default_method: str (optional)
Defines how to configure backend FS instances. Options are:
- "default": instantiate like FSClass(), with no
extra arguments; this is the default instance of that FS, and can be
configured via the config system
- "generic": takes instances from the `_generic_fs` dict in this module,
which you must populate before use. Keys are by protocol
- "current": takes the most recently instantiated version of each FS
"""
self.method = default_method
super().__init__(**kwargs)
def _parent(self, path):
fs = _resolve_fs(path, self.method)
return fs.unstrip_protocol(fs._parent(path))
def _strip_protocol(self, path):
# normalization only
fs = _resolve_fs(path, self.method)
return fs.unstrip_protocol(fs._strip_protocol(path))
async def _find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
fs = _resolve_fs(path, self.method)
if fs.async_impl:
out = await fs._find(
path, maxdepth=maxdepth, withdirs=withdirs, detail=True, **kwargs
)
else:
out = fs.find(
path, maxdepth=maxdepth, withdirs=withdirs, detail=True, **kwargs
)
result = {}
for k, v in out.items():
v = v.copy() # don't corrupt target FS dircache
name = fs.unstrip_protocol(k)
v["name"] = name
result[name] = v
if detail:
return result
return list(result)
async def _info(self, url, **kwargs):
fs = _resolve_fs(url, self.method)
if fs.async_impl:
out = await fs._info(url, **kwargs)
else:
out = fs.info(url, **kwargs)
out = out.copy() # don't edit originals
out["name"] = fs.unstrip_protocol(out["name"])
return out
async def _ls(
self,
url,
detail=True,
**kwargs,
):
fs = _resolve_fs(url, self.method)
if fs.async_impl:
out = await fs._ls(url, detail=True, **kwargs)
else:
out = fs.ls(url, detail=True, **kwargs)
out = [o.copy() for o in out] # don't edit originals
for o in out:
o["name"] = fs.unstrip_protocol(o["name"])
if detail:
return out
else:
return [o["name"] for o in out]
async def _cat_file(
self,
url,
**kwargs,
):
fs = _resolve_fs(url, self.method)
if fs.async_impl:
return await fs._cat_file(url, **kwargs)
else:
return fs.cat_file(url, **kwargs)
async def _pipe_file(
self,
path,
value,
**kwargs,
):
fs = _resolve_fs(path, self.method)
if fs.async_impl:
return await fs._pipe_file(path, value, **kwargs)
else:
return fs.pipe_file(path, value, **kwargs)
async def _rm(self, url, **kwargs):
urls = url
if isinstance(urls, str):
urls = [urls]
fs = _resolve_fs(urls[0], self.method)
if fs.async_impl:
await fs._rm(urls, **kwargs)
else:
fs.rm(url, **kwargs)
async def _makedirs(self, path, exist_ok=False):
logger.debug("Make dir %s", path)
fs = _resolve_fs(path, self.method)
if fs.async_impl:
await fs._makedirs(path, exist_ok=exist_ok)
else:
fs.makedirs(path, exist_ok=exist_ok)
def rsync(self, source, destination, **kwargs):
"""Sync files between two directory trees
See :func:`rsync` for more details.
"""
rsync(source, destination, fs=self, **kwargs)
async def _cp_file(
self,
url,
url2,
blocksize=2**20,
callback=DEFAULT_CALLBACK,
**kwargs,
):
fs = _resolve_fs(url, self.method)
fs2 = _resolve_fs(url2, self.method)
if fs is fs2:
# pure remote
if fs.async_impl:
return await fs._cp_file(url, url2, **kwargs)
else:
return fs.cp_file(url, url2, **kwargs)
kw = {"blocksize": 0, "cache_type": "none"}
try:
f1 = (
await fs.open_async(url, "rb")
if hasattr(fs, "open_async")
else fs.open(url, "rb", **kw)
)
callback.set_size(await maybe_await(f1.size))
f2 = (
await fs2.open_async(url2, "wb")
if hasattr(fs2, "open_async")
else fs2.open(url2, "wb", **kw)
)
while f1.size is None or f2.tell() < f1.size:
data = await maybe_await(f1.read(blocksize))
if f1.size is None and not data:
break
await maybe_await(f2.write(data))
callback.absolute_update(f2.tell())
finally:
try:
await maybe_await(f2.close())
await maybe_await(f1.close())
except NameError:
# fail while opening f1 or f2
pass
async def _make_many_dirs(self, urls, exist_ok=True):
fs = _resolve_fs(urls[0], self.method)
if fs.async_impl:
coros = [fs._makedirs(u, exist_ok=exist_ok) for u in urls]
await _run_coros_in_chunks(coros)
else:
for u in urls:
fs.makedirs(u, exist_ok=exist_ok)
make_many_dirs = sync_wrapper(_make_many_dirs)
async def _copy(
self,
path1: list[str],
path2: list[str],
recursive: bool = False,
on_error: str = "ignore",
maxdepth: Optional[int] = None,
batch_size: Optional[int] = None,
tempdir: Optional[str] = None,
**kwargs,
):
if recursive:
raise NotImplementedError
fs = _resolve_fs(path1[0], self.method)
fs2 = _resolve_fs(path2[0], self.method)
# not expanding paths atm., assume call is from rsync()
if fs is fs2:
# pure remote
if fs.async_impl:
return await fs._copy(path1, path2, **kwargs)
else:
return fs.copy(path1, path2, **kwargs)
await copy_file_op(
fs, path1, fs2, path2, tempdir, batch_size, on_error=on_error
)
async def copy_file_op(
fs1, url1, fs2, url2, tempdir=None, batch_size=20, on_error="ignore"
):
import tempfile
tempdir = tempdir or tempfile.mkdtemp()
try:
coros = [
_copy_file_op(
fs1,
u1,
fs2,
u2,
os.path.join(tempdir, uuid.uuid4().hex),
on_error=on_error,
)
for u1, u2 in zip(url1, url2)
]
await _run_coros_in_chunks(coros, batch_size=batch_size)
finally:
shutil.rmtree(tempdir)
async def _copy_file_op(fs1, url1, fs2, url2, local, on_error="ignore"):
ex = () if on_error == "raise" else Exception
logger.debug("Copy %s -> %s", url1, url2)
try:
if fs1.async_impl:
await fs1._get_file(url1, local)
else:
fs1.get_file(url1, local)
if fs2.async_impl:
await fs2._put_file(local, url2)
else:
fs2.put_file(local, url2)
os.unlink(local)
logger.debug("Copy %s -> %s; done", url1, url2)
except ex as e:
logger.debug("ignoring cp exception for %s: %s", url1, e)
async def maybe_await(cor):
if inspect.iscoroutine(cor):
return await cor
else:
return cor

View File

@ -0,0 +1,416 @@
import ast
import contextlib
import logging
import os
import re
from typing import ClassVar, Sequence
import panel as pn
from .core import OpenFile, get_filesystem_class, split_protocol
from .registry import known_implementations
pn.extension()
logger = logging.getLogger("fsspec.gui")
class SigSlot:
"""Signal-slot mixin, for Panel event passing
Include this class in a widget manager's superclasses to be able to
register events and callbacks on Panel widgets managed by that class.
The method ``_register`` should be called as widgets are added, and external
code should call ``connect`` to associate callbacks.
By default, all signals emit a DEBUG logging statement.
"""
# names of signals that this class may emit, each of which must be
# set by _register for any new instance
signals: ClassVar[Sequence[str]] = []
# names of actions that this class may respond to
slots: ClassVar[Sequence[str]] = []
# each of which must be a method name
def __init__(self):
self._ignoring_events = False
self._sigs = {}
self._map = {}
self._setup()
def _setup(self):
"""Create GUI elements and register signals"""
self.panel = pn.pane.PaneBase()
# no signals to set up in the base class
def _register(
self, widget, name, thing="value", log_level=logging.DEBUG, auto=False
):
"""Watch the given attribute of a widget and assign it a named event
This is normally called at the time a widget is instantiated, in the
class which owns it.
Parameters
----------
widget : pn.layout.Panel or None
Widget to watch. If None, an anonymous signal not associated with
any widget.
name : str
Name of this event
thing : str
Attribute of the given widget to watch
log_level : int
When the signal is triggered, a logging event of the given level
will be fired in the ``fsspec.gui`` logger.
auto : bool
If True, automatically connects with a method in this class of the
same name.
"""
if name not in self.signals:
raise ValueError(f"Attempt to assign an undeclared signal: {name}")
self._sigs[name] = {
"widget": widget,
"callbacks": [],
"thing": thing,
"log": log_level,
}
wn = "-".join(
[
getattr(widget, "name", str(widget)) if widget is not None else "none",
thing,
]
)
self._map[wn] = name
if widget is not None:
widget.param.watch(self._signal, thing, onlychanged=True)
if auto and hasattr(self, name):
self.connect(name, getattr(self, name))
def _repr_mimebundle_(self, *args, **kwargs):
"""Display in a notebook or a server"""
try:
return self.panel._repr_mimebundle_(*args, **kwargs)
except (ValueError, AttributeError) as exc:
raise NotImplementedError(
"Panel does not seem to be set up properly"
) from exc
def connect(self, signal, slot):
"""Associate call back with given event
The callback must be a function which takes the "new" value of the
watched attribute as the only parameter. If the callback returns False,
this cancels any further processing of the given event.
Alternatively, the callback can be a string, in which case it means
emitting the correspondingly-named event (i.e., connect to self)
"""
self._sigs[signal]["callbacks"].append(slot)
def _signal(self, event):
"""This is called by a an action on a widget
Within an self.ignore_events context, nothing happens.
Tests can execute this method by directly changing the values of
widget components.
"""
if not self._ignoring_events:
wn = "-".join([event.obj.name, event.name])
if wn in self._map and self._map[wn] in self._sigs:
self._emit(self._map[wn], event.new)
@contextlib.contextmanager
def ignore_events(self):
"""Temporarily turn off events processing in this instance
(does not propagate to children)
"""
self._ignoring_events = True
try:
yield
finally:
self._ignoring_events = False
def _emit(self, sig, value=None):
"""An event happened, call its callbacks
This method can be used in tests to simulate message passing without
directly changing visual elements.
Calling of callbacks will halt whenever one returns False.
"""
logger.log(self._sigs[sig]["log"], f"{sig}: {value}")
for callback in self._sigs[sig]["callbacks"]:
if isinstance(callback, str):
self._emit(callback)
else:
try:
# running callbacks should not break the interface
ret = callback(value)
if ret is False:
break
except Exception as e:
logger.exception(
"Exception (%s) while executing callback for signal: %s",
e,
sig,
)
def show(self, threads=False):
"""Open a new browser tab and display this instance's interface"""
self.panel.show(threads=threads, verbose=False)
return self
class SingleSelect(SigSlot):
"""A multiselect which only allows you to select one item for an event"""
signals = ["_selected", "selected"] # the first is internal
slots = ["set_options", "set_selection", "add", "clear", "select"]
def __init__(self, **kwargs):
self.kwargs = kwargs
super().__init__()
def _setup(self):
self.panel = pn.widgets.MultiSelect(**self.kwargs)
self._register(self.panel, "_selected", "value")
self._register(None, "selected")
self.connect("_selected", self.select_one)
def _signal(self, *args, **kwargs):
super()._signal(*args, **kwargs)
def select_one(self, *_):
with self.ignore_events():
val = [self.panel.value[-1]] if self.panel.value else []
self.panel.value = val
self._emit("selected", self.panel.value)
def set_options(self, options):
self.panel.options = options
def clear(self):
self.panel.options = []
@property
def value(self):
return self.panel.value
def set_selection(self, selection):
self.panel.value = [selection]
class FileSelector(SigSlot):
"""Panel-based graphical file selector widget
Instances of this widget are interactive and can be displayed in jupyter by having
them as the output of a cell, or in a separate browser tab using ``.show()``.
"""
signals = [
"protocol_changed",
"selection_changed",
"directory_entered",
"home_clicked",
"up_clicked",
"go_clicked",
"filters_changed",
]
slots = ["set_filters", "go_home"]
def __init__(self, url=None, filters=None, ignore=None, kwargs=None):
"""
Parameters
----------
url : str (optional)
Initial value of the URL to populate the dialog; should include protocol
filters : list(str) (optional)
File endings to include in the listings. If not included, all files are
allowed. Does not affect directories.
If given, the endings will appear as checkboxes in the interface
ignore : list(str) (optional)
Regex(s) of file basename patterns to ignore, e.g., "\\." for typical
hidden files on posix
kwargs : dict (optional)
To pass to file system instance
"""
if url:
self.init_protocol, url = split_protocol(url)
else:
self.init_protocol, url = "file", os.getcwd()
self.init_url = url
self.init_kwargs = (kwargs if isinstance(kwargs, str) else str(kwargs)) or "{}"
self.filters = filters
self.ignore = [re.compile(i) for i in ignore or []]
self._fs = None
super().__init__()
def _setup(self):
self.url = pn.widgets.TextInput(
name="url",
value=self.init_url,
align="end",
sizing_mode="stretch_width",
width_policy="max",
)
self.protocol = pn.widgets.Select(
options=sorted(known_implementations),
value=self.init_protocol,
name="protocol",
align="center",
)
self.kwargs = pn.widgets.TextInput(
name="kwargs", value=self.init_kwargs, align="center"
)
self.go = pn.widgets.Button(name="", align="end", width=45)
self.main = SingleSelect(size=10)
self.home = pn.widgets.Button(name="🏠", width=40, height=30, align="end")
self.up = pn.widgets.Button(name="", width=30, height=30, align="end")
self._register(self.protocol, "protocol_changed", auto=True)
self._register(self.go, "go_clicked", "clicks", auto=True)
self._register(self.up, "up_clicked", "clicks", auto=True)
self._register(self.home, "home_clicked", "clicks", auto=True)
self._register(None, "selection_changed")
self.main.connect("selected", self.selection_changed)
self._register(None, "directory_entered")
self.prev_protocol = self.protocol.value
self.prev_kwargs = self.storage_options
self.filter_sel = pn.widgets.CheckBoxGroup(
value=[], options=[], inline=False, align="end", width_policy="min"
)
self._register(self.filter_sel, "filters_changed", auto=True)
self.panel = pn.Column(
pn.Row(self.protocol, self.kwargs),
pn.Row(self.home, self.up, self.url, self.go, self.filter_sel),
self.main.panel,
)
self.set_filters(self.filters)
self.go_clicked()
def set_filters(self, filters=None):
self.filters = filters
if filters:
self.filter_sel.options = filters
self.filter_sel.value = filters
else:
self.filter_sel.options = []
self.filter_sel.value = []
@property
def storage_options(self):
"""Value of the kwargs box as a dictionary"""
return ast.literal_eval(self.kwargs.value) or {}
@property
def fs(self):
"""Current filesystem instance"""
if self._fs is None:
cls = get_filesystem_class(self.protocol.value)
self._fs = cls(**self.storage_options)
return self._fs
@property
def urlpath(self):
"""URL of currently selected item"""
return (
(f"{self.protocol.value}://{self.main.value[0]}")
if self.main.value
else None
)
def open_file(self, mode="rb", compression=None, encoding=None):
"""Create OpenFile instance for the currently selected item
For example, in a notebook you might do something like
.. code-block::
[ ]: sel = FileSelector(); sel
# user selects their file
[ ]: with sel.open_file('rb') as f:
... out = f.read()
Parameters
----------
mode: str (optional)
Open mode for the file.
compression: str (optional)
If given, interact with the file as compressed. Set to 'infer' to guess
compression from the file ending
encoding: str (optional)
If using text mode, use this encoding; defaults to UTF8.
"""
if self.urlpath is None:
raise ValueError("No file selected")
return OpenFile(self.fs, self.urlpath, mode, compression, encoding)
def filters_changed(self, values):
self.filters = values
self.go_clicked()
def selection_changed(self, *_):
if self.urlpath is None:
return
if self.fs.isdir(self.urlpath):
self.url.value = self.fs._strip_protocol(self.urlpath)
self.go_clicked()
def go_clicked(self, *_):
if (
self.prev_protocol != self.protocol.value
or self.prev_kwargs != self.storage_options
):
self._fs = None # causes fs to be recreated
self.prev_protocol = self.protocol.value
self.prev_kwargs = self.storage_options
listing = sorted(
self.fs.ls(self.url.value, detail=True), key=lambda x: x["name"]
)
listing = [
l
for l in listing
if not any(i.match(l["name"].rsplit("/", 1)[-1]) for i in self.ignore)
]
folders = {
"📁 " + o["name"].rsplit("/", 1)[-1]: o["name"]
for o in listing
if o["type"] == "directory"
}
files = {
"📄 " + o["name"].rsplit("/", 1)[-1]: o["name"]
for o in listing
if o["type"] == "file"
}
if self.filters:
files = {
k: v
for k, v in files.items()
if any(v.endswith(ext) for ext in self.filters)
}
self.main.set_options(dict(**folders, **files))
def protocol_changed(self, *_):
self._fs = None
self.main.options = []
self.url.value = ""
def home_clicked(self, *_):
self.protocol.value = self.init_protocol
self.kwargs.value = self.init_kwargs
self.url.value = self.init_url
self.go_clicked()
def up_clicked(self, *_):
self.url.value = self.fs._parent(self.url.value)
self.go_clicked()
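# Example (minimal sketch): typical notebook flow with the widget above.
#
# >>> sel = FileSelector("file://" + os.getcwd())   # doctest: +SKIP
# >>> sel.show()                                    # doctest: +SKIP
# >>> # ... user navigates and selects a file ...
# >>> with sel.open_file("rb") as f:                # doctest: +SKIP
# ...     data = f.read()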

View File

@ -0,0 +1,304 @@
import errno
import io
import os
import secrets
import shutil
from contextlib import suppress
from functools import cached_property, wraps
from urllib.parse import parse_qs
from fsspec.spec import AbstractFileSystem
from fsspec.utils import (
get_package_version_without_import,
infer_storage_options,
mirror_from,
tokenize,
)
def wrap_exceptions(func):
@wraps(func)
def wrapper(*args, **kwargs):
try:
return func(*args, **kwargs)
except OSError as exception:
if not exception.args:
raise
message, *args = exception.args
if isinstance(message, str) and "does not exist" in message:
raise FileNotFoundError(errno.ENOENT, message) from exception
else:
raise
return wrapper
PYARROW_VERSION = None
class ArrowFSWrapper(AbstractFileSystem):
"""FSSpec-compatible wrapper of pyarrow.fs.FileSystem.
Parameters
----------
fs : pyarrow.fs.FileSystem
"""
root_marker = "/"
def __init__(self, fs, **kwargs):
global PYARROW_VERSION
PYARROW_VERSION = get_package_version_without_import("pyarrow")
self.fs = fs
super().__init__(**kwargs)
@property
def protocol(self):
return self.fs.type_name
@cached_property
def fsid(self):
return "hdfs_" + tokenize(self.fs.host, self.fs.port)
@classmethod
def _strip_protocol(cls, path):
ops = infer_storage_options(path)
path = ops["path"]
if path.startswith("//"):
# special case for "hdfs://path" (without the triple slash)
path = path[1:]
return path
def ls(self, path, detail=False, **kwargs):
path = self._strip_protocol(path)
from pyarrow.fs import FileSelector
entries = [
self._make_entry(entry)
for entry in self.fs.get_file_info(FileSelector(path))
]
if detail:
return entries
else:
return [entry["name"] for entry in entries]
def info(self, path, **kwargs):
path = self._strip_protocol(path)
[info] = self.fs.get_file_info([path])
return self._make_entry(info)
def exists(self, path):
path = self._strip_protocol(path)
try:
self.info(path)
except FileNotFoundError:
return False
else:
return True
def _make_entry(self, info):
from pyarrow.fs import FileType
if info.type is FileType.Directory:
kind = "directory"
elif info.type is FileType.File:
kind = "file"
elif info.type is FileType.NotFound:
raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), info.path)
else:
kind = "other"
return {
"name": info.path,
"size": info.size,
"type": kind,
"mtime": info.mtime,
}
@wrap_exceptions
def cp_file(self, path1, path2, **kwargs):
path1 = self._strip_protocol(path1).rstrip("/")
path2 = self._strip_protocol(path2).rstrip("/")
with self._open(path1, "rb") as lstream:
tmp_fname = f"{path2}.tmp.{secrets.token_hex(6)}"
try:
with self.open(tmp_fname, "wb") as rstream:
shutil.copyfileobj(lstream, rstream)
self.fs.move(tmp_fname, path2)
except BaseException:
with suppress(FileNotFoundError):
self.fs.delete_file(tmp_fname)
raise
@wrap_exceptions
def mv(self, path1, path2, **kwargs):
path1 = self._strip_protocol(path1).rstrip("/")
path2 = self._strip_protocol(path2).rstrip("/")
self.fs.move(path1, path2)
@wrap_exceptions
def rm_file(self, path):
path = self._strip_protocol(path)
self.fs.delete_file(path)
@wrap_exceptions
def rm(self, path, recursive=False, maxdepth=None):
path = self._strip_protocol(path).rstrip("/")
if self.isdir(path):
if recursive:
self.fs.delete_dir(path)
else:
raise ValueError("Can't delete directories without recursive=False")
else:
self.fs.delete_file(path)
@wrap_exceptions
def _open(self, path, mode="rb", block_size=None, seekable=True, **kwargs):
if mode == "rb":
if seekable:
method = self.fs.open_input_file
else:
method = self.fs.open_input_stream
elif mode == "wb":
method = self.fs.open_output_stream
elif mode == "ab":
method = self.fs.open_append_stream
else:
raise ValueError(f"unsupported mode for Arrow filesystem: {mode!r}")
_kwargs = {}
if mode != "rb" or not seekable:
if int(PYARROW_VERSION.split(".")[0]) >= 4:
# disable compression auto-detection
_kwargs["compression"] = None
stream = method(path, **_kwargs)
return ArrowFile(self, stream, path, mode, block_size, **kwargs)
@wrap_exceptions
def mkdir(self, path, create_parents=True, **kwargs):
path = self._strip_protocol(path)
if create_parents:
self.makedirs(path, exist_ok=True)
else:
self.fs.create_dir(path, recursive=False)
@wrap_exceptions
def makedirs(self, path, exist_ok=False):
path = self._strip_protocol(path)
self.fs.create_dir(path, recursive=True)
@wrap_exceptions
def rmdir(self, path):
path = self._strip_protocol(path)
self.fs.delete_dir(path)
@wrap_exceptions
def modified(self, path):
path = self._strip_protocol(path)
return self.fs.get_file_info(path).mtime
def cat_file(self, path, start=None, end=None, **kwargs):
kwargs["seekable"] = start not in [None, 0]
return super().cat_file(path, start=start, end=end, **kwargs)
def get_file(self, rpath, lpath, **kwargs):
kwargs["seekable"] = False
super().get_file(rpath, lpath, **kwargs)
@mirror_from(
"stream",
[
"read",
"seek",
"tell",
"write",
"readable",
"writable",
"close",
"size",
"seekable",
],
)
class ArrowFile(io.IOBase):
def __init__(self, fs, stream, path, mode, block_size=None, **kwargs):
self.path = path
self.mode = mode
self.fs = fs
self.stream = stream
self.blocksize = self.block_size = block_size
self.kwargs = kwargs
def __enter__(self):
return self
def __exit__(self, *args):
return self.close()
class HadoopFileSystem(ArrowFSWrapper):
"""A wrapper on top of the pyarrow.fs.HadoopFileSystem
to connect its interface with fsspec"""
protocol = "hdfs"
def __init__(
self,
host="default",
port=0,
user=None,
kerb_ticket=None,
replication=3,
extra_conf=None,
**kwargs,
):
"""
Parameters
----------
host: str
Hostname, IP or "default" to try to read from Hadoop config
port: int
Port to connect on, or default from Hadoop config if 0
user: str or None
If given, connect as this username
kerb_ticket: str or None
If given, use this ticket for authentication
replication: int
set replication factor of file for write operations. default value is 3.
extra_conf: None or dict
Passed on to HadoopFileSystem
"""
from pyarrow.fs import HadoopFileSystem
fs = HadoopFileSystem(
host=host,
port=port,
user=user,
kerb_ticket=kerb_ticket,
replication=replication,
extra_conf=extra_conf,
)
super().__init__(fs=fs, **kwargs)
@staticmethod
def _get_kwargs_from_urls(path):
ops = infer_storage_options(path)
out = {}
if ops.get("host", None):
out["host"] = ops["host"]
if ops.get("username", None):
out["user"] = ops["username"]
if ops.get("port", None):
out["port"] = ops["port"]
if ops.get("url_query", None):
queries = parse_qs(ops["url_query"])
if queries.get("replication", None):
out["replication"] = int(queries["replication"][0])
return out
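# Example (minimal sketch): any pyarrow filesystem can be adapted; wrapping
# pyarrow's local filesystem is the simplest smoke test.  The HDFS parameters
# in the last line are hypothetical.
#
# >>> from pyarrow.fs import LocalFileSystem                 # doctest: +SKIP
# >>> fs = ArrowFSWrapper(LocalFileSystem())                 # doctest: +SKIP
# >>> fs.ls("/tmp", detail=False)                            # doctest: +SKIP
# >>> hdfs = HadoopFileSystem(host="namenode", port=8020, user="alice")  # doctest: +SKIP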

View File

@ -0,0 +1,75 @@
from __future__ import annotations
import abc
import hashlib
from fsspec.implementations.local import make_path_posix
class AbstractCacheMapper(abc.ABC):
"""Abstract super-class for mappers from remote URLs to local cached
basenames.
"""
@abc.abstractmethod
def __call__(self, path: str) -> str: ...
def __eq__(self, other: object) -> bool:
# Identity only depends on class. When derived classes have attributes
# they will need to be included.
return isinstance(other, type(self))
def __hash__(self) -> int:
# Identity only depends on class. When derived classes have attributes
# they will need to be included.
return hash(type(self))
class BasenameCacheMapper(AbstractCacheMapper):
"""Cache mapper that uses the basename of the remote URL and a fixed number
of directory levels above this.
The default is zero directory levels, meaning different paths with the same
basename will have the same cached basename.
"""
def __init__(self, directory_levels: int = 0):
if directory_levels < 0:
raise ValueError(
"BasenameCacheMapper requires zero or positive directory_levels"
)
self.directory_levels = directory_levels
# Separator for directories when encoded as strings.
self._separator = "_@_"
def __call__(self, path: str) -> str:
path = make_path_posix(path)
prefix, *bits = path.rsplit("/", self.directory_levels + 1)
if bits:
return self._separator.join(bits)
else:
return prefix # No separator found, simple filename
def __eq__(self, other: object) -> bool:
return super().__eq__(other) and self.directory_levels == other.directory_levels
def __hash__(self) -> int:
return super().__hash__() ^ hash(self.directory_levels)
class HashCacheMapper(AbstractCacheMapper):
"""Cache mapper that uses a hash of the remote URL."""
def __call__(self, path: str) -> str:
return hashlib.sha256(path.encode()).hexdigest()
def create_cache_mapper(same_names: bool) -> AbstractCacheMapper:
"""Factory method to create cache mapper for backward compatibility with
``CachingFileSystem`` constructor using ``same_names`` kwarg.
"""
if same_names:
return BasenameCacheMapper()
else:
return HashCacheMapper()
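# Example (minimal sketch): the two built-in mappers applied to an
# already-stripped remote path (values follow from the code above).
#
# >>> BasenameCacheMapper()("bucket/a/data.csv")    # -> 'data.csv'
# >>> BasenameCacheMapper(1)("bucket/a/data.csv")   # -> 'a_@_data.csv'
# >>> HashCacheMapper()("bucket/a/data.csv")        # -> 64-character sha256 hex digest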

View File

@ -0,0 +1,232 @@
from __future__ import annotations
import os
import pickle
import time
from typing import TYPE_CHECKING
from fsspec.utils import atomic_write
try:
import ujson as json
except ImportError:
if not TYPE_CHECKING:
import json
if TYPE_CHECKING:
from typing import Any, Dict, Iterator, Literal
from typing_extensions import TypeAlias
from .cached import CachingFileSystem
Detail: TypeAlias = Dict[str, Any]
class CacheMetadata:
"""Cache metadata.
All reading and writing of cache metadata is performed by this class,
accessing the cached files and blocks is not.
Metadata is stored in a single file per storage directory in JSON format.
For backward compatibility, also reads metadata stored in pickle format
which is converted to JSON when next saved.
"""
def __init__(self, storage: list[str]):
"""
Parameters
----------
storage: list[str]
Directories containing cached files; there must be at least one. Metadata
is stored in the last of these directories by convention.
"""
if not storage:
raise ValueError("CacheMetadata expects at least one storage location")
self._storage = storage
self.cached_files: list[Detail] = [{}]
# Private attribute to force saving of metadata in pickle format rather than
# JSON for use in tests to confirm can read both pickle and JSON formats.
self._force_save_pickle = False
def _load(self, fn: str) -> Detail:
"""Low-level function to load metadata from specific file"""
try:
with open(fn, "r") as f:
loaded = json.load(f)
except ValueError:
with open(fn, "rb") as f:
loaded = pickle.load(f)
for c in loaded.values():
if isinstance(c.get("blocks"), list):
c["blocks"] = set(c["blocks"])
return loaded
def _save(self, metadata_to_save: Detail, fn: str) -> None:
"""Low-level function to save metadata to specific file"""
if self._force_save_pickle:
with atomic_write(fn) as f:
pickle.dump(metadata_to_save, f)
else:
with atomic_write(fn, mode="w") as f:
json.dump(metadata_to_save, f)
def _scan_locations(
self, writable_only: bool = False
) -> Iterator[tuple[str, str, bool]]:
"""Yield locations (filenames) where metadata is stored, and whether
writable or not.
Parameters
----------
writable_only: bool
Set to True to only yield writable locations.
Returns
-------
Yields (str, str, bool)
"""
n = len(self._storage)
for i, storage in enumerate(self._storage):
writable = i == n - 1
if writable_only and not writable:
continue
yield os.path.join(storage, "cache"), storage, writable
def check_file(
self, path: str, cfs: CachingFileSystem | None
) -> Literal[False] | tuple[Detail, str]:
"""If path is in cache return its details, otherwise return ``False``.
If the optional CachingFileSystem is specified then it is used to
perform extra checks to reject possible matches, such as if they are
too old.
"""
for (fn, base, _), cache in zip(self._scan_locations(), self.cached_files):
if path not in cache:
continue
detail = cache[path].copy()
if cfs is not None:
if cfs.check_files and detail["uid"] != cfs.fs.ukey(path):
# Wrong file as determined by hash of file properties
continue
if cfs.expiry and time.time() - detail["time"] > cfs.expiry:
# Cached file has expired
continue
fn = os.path.join(base, detail["fn"])
if os.path.exists(fn):
return detail, fn
return False
def clear_expired(self, expiry_time: int) -> tuple[list[str], bool]:
"""Remove expired metadata from the cache.
Returns names of files corresponding to expired metadata and a boolean
flag indicating whether the writable cache is empty. Caller is
responsible for deleting the expired files.
"""
expired_files = []
for path, detail in self.cached_files[-1].copy().items():
if time.time() - detail["time"] > expiry_time:
fn = detail.get("fn", "")
if not fn:
raise RuntimeError(
f"Cache metadata does not contain 'fn' for {path}"
)
fn = os.path.join(self._storage[-1], fn)
expired_files.append(fn)
self.cached_files[-1].pop(path)
if self.cached_files[-1]:
cache_path = os.path.join(self._storage[-1], "cache")
self._save(self.cached_files[-1], cache_path)
writable_cache_empty = not self.cached_files[-1]
return expired_files, writable_cache_empty
def load(self) -> None:
"""Load all metadata from disk and store in ``self.cached_files``"""
cached_files = []
for fn, _, _ in self._scan_locations():
if os.path.exists(fn):
# TODO: consolidate blocks here
cached_files.append(self._load(fn))
else:
cached_files.append({})
self.cached_files = cached_files or [{}]
def on_close_cached_file(self, f: Any, path: str) -> None:
"""Perform side-effect actions on closing a cached file.
The actual closing of the file is the responsibility of the caller.
"""
# File must be writable, so in self.cached_files[-1]
c = self.cached_files[-1][path]
if c["blocks"] is not True and len(c["blocks"]) * f.blocksize >= f.size:
c["blocks"] = True
def pop_file(self, path: str) -> str | None:
"""Remove metadata of cached file.
If path is in the cache, return the filename of the cached file,
otherwise return ``None``. Caller is responsible for deleting the
cached file.
"""
details = self.check_file(path, None)
if not details:
return None
_, fn = details
if fn.startswith(self._storage[-1]):
self.cached_files[-1].pop(path)
self.save()
else:
raise PermissionError(
"Can only delete cached file in last, writable cache location"
)
return fn
def save(self) -> None:
"""Save metadata to disk"""
for (fn, _, writable), cache in zip(self._scan_locations(), self.cached_files):
if not writable:
continue
if os.path.exists(fn):
cached_files = self._load(fn)
for k, c in cached_files.items():
if k in cache:
if c["blocks"] is True or cache[k]["blocks"] is True:
c["blocks"] = True
else:
# self.cached_files[*][*]["blocks"] must continue to
# point to the same set object so that updates
# performed by MMapCache are propagated back to
# self.cached_files.
blocks = cache[k]["blocks"]
blocks.update(c["blocks"])
c["blocks"] = blocks
c["time"] = max(c["time"], cache[k]["time"])
c["uid"] = cache[k]["uid"]
# Files can be added to cache after it was written once
for k, c in cache.items():
if k not in cached_files:
cached_files[k] = c
else:
cached_files = cache
cache = {k: v.copy() for k, v in cached_files.items()}
for c in cache.values():
if isinstance(c["blocks"], set):
c["blocks"] = list(c["blocks"])
self._save(cache, fn)
self.cached_files[-1] = cached_files
def update_file(self, path: str, detail: Detail) -> None:
"""Update metadata for specific file in memory, do not save"""
self.cached_files[-1][path] = detail
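# Example (minimal sketch of the life-cycle driven by CachingFileSystem; the
# path, uid and cached filename are hypothetical).
#
# >>> md = CacheMetadata(["/tmp/fsspec_cache"])                        # doctest: +SKIP
# >>> md.load()                                                        # doctest: +SKIP
# >>> md.update_file("bucket/key", {"fn": "abc123", "time": time.time(),
# ...                               "uid": "etag-1", "blocks": True})  # doctest: +SKIP
# >>> md.save()                                                        # doctest: +SKIP
# >>> md.check_file("bucket/key", None)  # (detail, local_fn) or False # doctest: +SKIP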

View File

@ -0,0 +1,929 @@
from __future__ import annotations
import inspect
import logging
import os
import tempfile
import time
import weakref
from shutil import rmtree
from typing import TYPE_CHECKING, Any, Callable, ClassVar
from fsspec import AbstractFileSystem, filesystem
from fsspec.callbacks import DEFAULT_CALLBACK
from fsspec.compression import compr
from fsspec.core import BaseCache, MMapCache
from fsspec.exceptions import BlocksizeMismatchError
from fsspec.implementations.cache_mapper import create_cache_mapper
from fsspec.implementations.cache_metadata import CacheMetadata
from fsspec.spec import AbstractBufferedFile
from fsspec.transaction import Transaction
from fsspec.utils import infer_compression
if TYPE_CHECKING:
from fsspec.implementations.cache_mapper import AbstractCacheMapper
logger = logging.getLogger("fsspec.cached")
class WriteCachedTransaction(Transaction):
def complete(self, commit=True):
rpaths = [f.path for f in self.files]
lpaths = [f.fn for f in self.files]
if commit:
self.fs.put(lpaths, rpaths)
self.files.clear()
self.fs._intrans = False
self.fs._transaction = None
self.fs = None # break cycle
class CachingFileSystem(AbstractFileSystem):
"""Locally caching filesystem, layer over any other FS
This class implements chunk-wise local storage of remote files, for quick
access after the initial download. The files are stored in a given
directory with hashes of URLs for the filenames. If no directory is given,
a temporary one is used, which should be cleaned up by the OS after the
process ends. The files themselves are sparse (as implemented in
:class:`~fsspec.caching.MMapCache`), so only the data which is accessed
takes up space.
Restrictions:
- the block-size must be the same for each access of a given file, unless
all blocks of the file have already been read
- caching can only be applied to file-systems which produce files
derived from fsspec.spec.AbstractBufferedFile ; LocalFileSystem is also
allowed, for testing
"""
protocol: ClassVar[str | tuple[str, ...]] = ("blockcache", "cached")
def __init__(
self,
target_protocol=None,
cache_storage="TMP",
cache_check=10,
check_files=False,
expiry_time=604800,
target_options=None,
fs=None,
same_names: bool | None = None,
compression=None,
cache_mapper: AbstractCacheMapper | None = None,
**kwargs,
):
"""
Parameters
----------
target_protocol: str (optional)
Target filesystem protocol. Provide either this or ``fs``.
cache_storage: str or list(str)
Location to store files. If "TMP", this is a temporary directory,
and will be cleaned up by the OS when this process ends (or later).
If a list, each location will be tried in the order given, but
only the last will be considered writable.
cache_check: int
Number of seconds between reload of cache metadata
check_files: bool
Whether to explicitly see if the UID of the remote file matches
the stored one before using. Warning: some file systems such as
HTTP cannot reliably give a unique hash of the contents of some
path, so be sure to set this option to False.
expiry_time: int
The time in seconds after which a local copy is considered useless.
Set to falsy to prevent expiry. The default is equivalent to one
week.
target_options: dict or None
Passed to the instantiation of the FS, if fs is None.
fs: filesystem instance
The target filesystem to run against. Provide this or ``protocol``.
same_names: bool (optional)
By default, target URLs are hashed using a ``HashCacheMapper`` so
that files from different backends with the same basename do not
conflict. If this argument is ``true``, a ``BasenameCacheMapper``
is used instead. Other cache mapper options are available by using
the ``cache_mapper`` keyword argument. Only one of this and
``cache_mapper`` should be specified.
compression: str (optional)
To decompress on download. Can be 'infer' (guess from the URL name),
one of the entries in ``fsspec.compression.compr``, or None for no
decompression.
cache_mapper: AbstractCacheMapper (optional)
The object use to map from original filenames to cached filenames.
Only one of this and ``same_names`` should be specified.
"""
super().__init__(**kwargs)
if fs is None and target_protocol is None:
raise ValueError(
"Please provide filesystem instance(fs) or target_protocol"
)
if not (fs is None) ^ (target_protocol is None):
raise ValueError(
"Both filesystems (fs) and target_protocol may not be both given."
)
if cache_storage == "TMP":
tempdir = tempfile.mkdtemp()
storage = [tempdir]
weakref.finalize(self, self._remove_tempdir, tempdir)
else:
if isinstance(cache_storage, str):
storage = [cache_storage]
else:
storage = cache_storage
os.makedirs(storage[-1], exist_ok=True)
self.storage = storage
self.kwargs = target_options or {}
self.cache_check = cache_check
self.check_files = check_files
self.expiry = expiry_time
self.compression = compression
# Size of cache in bytes. If None then the size is unknown and will be
# recalculated the next time cache_size() is called. On writes to the
# cache this is reset to None.
self._cache_size = None
if same_names is not None and cache_mapper is not None:
raise ValueError(
"Cannot specify both same_names and cache_mapper in "
"CachingFileSystem.__init__"
)
if cache_mapper is not None:
self._mapper = cache_mapper
else:
self._mapper = create_cache_mapper(
same_names if same_names is not None else False
)
self.target_protocol = (
target_protocol
if isinstance(target_protocol, str)
else (fs.protocol if isinstance(fs.protocol, str) else fs.protocol[0])
)
self._metadata = CacheMetadata(self.storage)
self.load_cache()
self.fs = fs if fs is not None else filesystem(target_protocol, **self.kwargs)
def _strip_protocol(path):
            # acts as a method, since each instance has a different target
return self.fs._strip_protocol(type(self)._strip_protocol(path))
self._strip_protocol: Callable = _strip_protocol
@staticmethod
def _remove_tempdir(tempdir):
try:
rmtree(tempdir)
except Exception:
pass
def _mkcache(self):
os.makedirs(self.storage[-1], exist_ok=True)
def cache_size(self):
"""Return size of cache in bytes.
If more than one cache directory is in use, only the size of the last
one (the writable cache directory) is returned.
"""
if self._cache_size is None:
cache_dir = self.storage[-1]
self._cache_size = filesystem("file").du(cache_dir, withdirs=True)
return self._cache_size
def load_cache(self):
"""Read set of stored blocks from file"""
self._metadata.load()
self._mkcache()
self.last_cache = time.time()
def save_cache(self):
"""Save set of stored blocks from file"""
self._mkcache()
self._metadata.save()
self.last_cache = time.time()
self._cache_size = None
def _check_cache(self):
"""Reload caches if time elapsed or any disappeared"""
self._mkcache()
if not self.cache_check:
# explicitly told not to bother checking
return
timecond = time.time() - self.last_cache > self.cache_check
existcond = all(os.path.exists(storage) for storage in self.storage)
if timecond or not existcond:
self.load_cache()
def _check_file(self, path):
"""Is path in cache and still valid"""
path = self._strip_protocol(path)
self._check_cache()
return self._metadata.check_file(path, self)
def clear_cache(self):
"""Remove all files and metadata from the cache
In the case of multiple cache locations, this clears only the last one,
which is assumed to be the read/write one.
"""
rmtree(self.storage[-1])
self.load_cache()
self._cache_size = None
def clear_expired_cache(self, expiry_time=None):
"""Remove all expired files and metadata from the cache
In the case of multiple cache locations, this clears only the last one,
which is assumed to be the read/write one.
Parameters
----------
expiry_time: int
The time in seconds after which a local copy is considered useless.
If not defined the default is equivalent to the attribute from the
file caching instantiation.
"""
if not expiry_time:
expiry_time = self.expiry
self._check_cache()
expired_files, writable_cache_empty = self._metadata.clear_expired(expiry_time)
for fn in expired_files:
if os.path.exists(fn):
os.remove(fn)
if writable_cache_empty:
rmtree(self.storage[-1])
self.load_cache()
self._cache_size = None
def pop_from_cache(self, path):
"""Remove cached version of given file
Deletes local copy of the given (remote) path. If it is found in a cache
        location which is not the last, that location is assumed to be
        read-only, and a PermissionError is raised.
"""
path = self._strip_protocol(path)
fn = self._metadata.pop_file(path)
if fn is not None:
os.remove(fn)
self._cache_size = None
def _open(
self,
path,
mode="rb",
block_size=None,
autocommit=True,
cache_options=None,
**kwargs,
):
"""Wrap the target _open
If the whole file exists in the cache, just open it locally and
return that.
Otherwise, open the file on the target FS, and make it have a mmap
cache pointing to the location which we determine, in our cache.
The ``blocks`` instance is shared, so as the mmap cache instance
updates, so does the entry in our ``cached_files`` attribute.
We monkey-patch this file, so that when it closes, we call
``close_and_update`` to save the state of the blocks.
"""
path = self._strip_protocol(path)
path = self.fs._strip_protocol(path)
if "r" not in mode:
return self.fs._open(
path,
mode=mode,
block_size=block_size,
autocommit=autocommit,
cache_options=cache_options,
**kwargs,
)
detail = self._check_file(path)
if detail:
# file is in cache
detail, fn = detail
hash, blocks = detail["fn"], detail["blocks"]
if blocks is True:
# stored file is complete
logger.debug("Opening local copy of %s", path)
return open(fn, mode)
# TODO: action where partial file exists in read-only cache
logger.debug("Opening partially cached copy of %s", path)
else:
hash = self._mapper(path)
fn = os.path.join(self.storage[-1], hash)
blocks = set()
detail = {
"original": path,
"fn": hash,
"blocks": blocks,
"time": time.time(),
"uid": self.fs.ukey(path),
}
self._metadata.update_file(path, detail)
logger.debug("Creating local sparse file for %s", path)
# call target filesystems open
self._mkcache()
f = self.fs._open(
path,
mode=mode,
block_size=block_size,
autocommit=autocommit,
cache_options=cache_options,
cache_type="none",
**kwargs,
)
if self.compression:
comp = (
infer_compression(path)
if self.compression == "infer"
else self.compression
)
f = compr[comp](f, mode="rb")
if "blocksize" in detail:
if detail["blocksize"] != f.blocksize:
raise BlocksizeMismatchError(
f"Cached file must be reopened with same block"
f" size as original (old: {detail['blocksize']},"
f" new {f.blocksize})"
)
else:
detail["blocksize"] = f.blocksize
f.cache = MMapCache(f.blocksize, f._fetch_range, f.size, fn, blocks)
close = f.close
f.close = lambda: self.close_and_update(f, close)
self.save_cache()
return f
def _parent(self, path):
return self.fs._parent(path)
def hash_name(self, path: str, *args: Any) -> str:
# Kept for backward compatibility with downstream libraries.
# Ignores extra arguments, previously same_name boolean.
return self._mapper(path)
def close_and_update(self, f, close):
"""Called when a file is closing, so store the set of blocks"""
if f.closed:
return
path = self._strip_protocol(f.path)
self._metadata.on_close_cached_file(f, path)
try:
logger.debug("going to save")
self.save_cache()
logger.debug("saved")
except OSError:
logger.debug("Cache saving failed while closing file")
except NameError:
logger.debug("Cache save failed due to interpreter shutdown")
close()
f.closed = True
def ls(self, path, detail=True):
return self.fs.ls(path, detail)
def __getattribute__(self, item):
if item in {
"load_cache",
"_open",
"save_cache",
"close_and_update",
"__init__",
"__getattribute__",
"__reduce__",
"_make_local_details",
"open",
"cat",
"cat_file",
"cat_ranges",
"get",
"read_block",
"tail",
"head",
"info",
"ls",
"exists",
"isfile",
"isdir",
"_check_file",
"_check_cache",
"_mkcache",
"clear_cache",
"clear_expired_cache",
"pop_from_cache",
"local_file",
"_paths_from_path",
"get_mapper",
"open_many",
"commit_many",
"hash_name",
"__hash__",
"__eq__",
"to_json",
"to_dict",
"cache_size",
"pipe_file",
"pipe",
"start_transaction",
"end_transaction",
}:
# all the methods defined in this class. Note `open` here, since
# it calls `_open`, but is actually in superclass
return lambda *args, **kw: getattr(type(self), item).__get__(self)(
*args, **kw
)
if item in ["__reduce_ex__"]:
raise AttributeError
if item in ["transaction"]:
# property
return type(self).transaction.__get__(self)
if item in ["_cache", "transaction_type"]:
# class attributes
return getattr(type(self), item)
if item == "__class__":
return type(self)
d = object.__getattribute__(self, "__dict__")
fs = d.get("fs", None) # fs is not immediately defined
if item in d:
return d[item]
elif fs is not None:
if item in fs.__dict__:
# attribute of instance
return fs.__dict__[item]
            # attribute belonging to the target filesystem
cls = type(fs)
m = getattr(cls, item)
if (inspect.isfunction(m) or inspect.isdatadescriptor(m)) and (
not hasattr(m, "__self__") or m.__self__ is None
):
# instance method
return m.__get__(fs, cls)
return m # class method or attribute
else:
# attributes of the superclass, while target is being set up
return super().__getattribute__(item)
def __eq__(self, other):
"""Test for equality."""
if self is other:
return True
if not isinstance(other, type(self)):
return False
return (
self.storage == other.storage
and self.kwargs == other.kwargs
and self.cache_check == other.cache_check
and self.check_files == other.check_files
and self.expiry == other.expiry
and self.compression == other.compression
and self._mapper == other._mapper
and self.target_protocol == other.target_protocol
)
def __hash__(self):
"""Calculate hash."""
return (
hash(tuple(self.storage))
^ hash(str(self.kwargs))
^ hash(self.cache_check)
^ hash(self.check_files)
^ hash(self.expiry)
^ hash(self.compression)
^ hash(self._mapper)
^ hash(self.target_protocol)
)
class WholeFileCacheFileSystem(CachingFileSystem):
"""Caches whole remote files on first access
This class is intended as a layer over any other file system, and
will make a local copy of each file accessed, so that all subsequent
reads are local. This is similar to ``CachingFileSystem``, but without
the block-wise functionality and so can work even when sparse files
are not allowed. See its docstring for definition of the init
arguments.
The class still needs access to the remote store for listing files,
and may refresh cached files.
"""
protocol = "filecache"
local_file = True
def open_many(self, open_files, **kwargs):
paths = [of.path for of in open_files]
if "r" in open_files.mode:
self._mkcache()
else:
return [
LocalTempFile(
self.fs,
path,
mode=open_files.mode,
fn=os.path.join(self.storage[-1], self._mapper(path)),
**kwargs,
)
for path in paths
]
if self.compression:
raise NotImplementedError
details = [self._check_file(sp) for sp in paths]
downpath = [p for p, d in zip(paths, details) if not d]
downfn0 = [
os.path.join(self.storage[-1], self._mapper(p))
for p, d in zip(paths, details)
] # keep these path names for opening later
downfn = [fn for fn, d in zip(downfn0, details) if not d]
if downpath:
# skip if all files are already cached and up to date
self.fs.get(downpath, downfn)
# update metadata - only happens when downloads are successful
newdetail = [
{
"original": path,
"fn": self._mapper(path),
"blocks": True,
"time": time.time(),
"uid": self.fs.ukey(path),
}
for path in downpath
]
for path, detail in zip(downpath, newdetail):
self._metadata.update_file(path, detail)
self.save_cache()
def firstpart(fn):
# helper to adapt both whole-file and simple-cache
return fn[1] if isinstance(fn, tuple) else fn
return [
open(firstpart(fn0) if fn0 else fn1, mode=open_files.mode)
for fn0, fn1 in zip(details, downfn0)
]
def commit_many(self, open_files):
self.fs.put([f.fn for f in open_files], [f.path for f in open_files])
[f.close() for f in open_files]
for f in open_files:
# in case autocommit is off, and so close did not already delete
try:
os.remove(f.name)
except FileNotFoundError:
pass
self._cache_size = None
def _make_local_details(self, path):
hash = self._mapper(path)
fn = os.path.join(self.storage[-1], hash)
detail = {
"original": path,
"fn": hash,
"blocks": True,
"time": time.time(),
"uid": self.fs.ukey(path),
}
self._metadata.update_file(path, detail)
logger.debug("Copying %s to local cache", path)
return fn
def cat(
self,
path,
recursive=False,
on_error="raise",
callback=DEFAULT_CALLBACK,
**kwargs,
):
paths = self.expand_path(
path, recursive=recursive, maxdepth=kwargs.get("maxdepth", None)
)
getpaths = []
storepaths = []
fns = []
out = {}
for p in paths.copy():
try:
detail = self._check_file(p)
if not detail:
fn = self._make_local_details(p)
getpaths.append(p)
storepaths.append(fn)
else:
detail, fn = detail if isinstance(detail, tuple) else (None, detail)
fns.append(fn)
except Exception as e:
if on_error == "raise":
raise
if on_error == "return":
out[p] = e
paths.remove(p)
if getpaths:
self.fs.get(getpaths, storepaths)
self.save_cache()
callback.set_size(len(paths))
for p, fn in zip(paths, fns):
with open(fn, "rb") as f:
out[p] = f.read()
callback.relative_update(1)
if isinstance(path, str) and len(paths) == 1 and recursive is False:
out = out[paths[0]]
return out
def _open(self, path, mode="rb", **kwargs):
path = self._strip_protocol(path)
if "r" not in mode:
hash = self._mapper(path)
fn = os.path.join(self.storage[-1], hash)
user_specified_kwargs = {
k: v
for k, v in kwargs.items()
# those kwargs were added by open(), we don't want them
if k not in ["autocommit", "block_size", "cache_options"]
}
return LocalTempFile(self, path, mode=mode, fn=fn, **user_specified_kwargs)
detail = self._check_file(path)
if detail:
detail, fn = detail
_, blocks = detail["fn"], detail["blocks"]
if blocks is True:
logger.debug("Opening local copy of %s", path)
# In order to support downstream filesystems to be able to
# infer the compression from the original filename, like
# the `TarFileSystem`, let's extend the `io.BufferedReader`
# fileobject protocol by adding a dedicated attribute
# `original`.
f = open(fn, mode)
f.original = detail.get("original")
return f
else:
raise ValueError(
f"Attempt to open partially cached file {path}"
f" as a wholly cached file"
)
else:
fn = self._make_local_details(path)
kwargs["mode"] = mode
# call target filesystems open
self._mkcache()
if self.compression:
with self.fs._open(path, **kwargs) as f, open(fn, "wb") as f2:
if isinstance(f, AbstractBufferedFile):
# want no type of caching if just downloading whole thing
f.cache = BaseCache(0, f.cache.fetcher, f.size)
comp = (
infer_compression(path)
if self.compression == "infer"
else self.compression
)
f = compr[comp](f, mode="rb")
data = True
while data:
block = getattr(f, "blocksize", 5 * 2**20)
data = f.read(block)
f2.write(data)
else:
self.fs.get_file(path, fn)
self.save_cache()
return self._open(path, mode)
class SimpleCacheFileSystem(WholeFileCacheFileSystem):
"""Caches whole remote files on first access
This class is intended as a layer over any other file system, and
will make a local copy of each file accessed, so that all subsequent
reads are local. This implementation only copies whole files, and
does not keep any metadata about the download time or file details.
It is therefore safer to use in multi-threaded/concurrent situations.
    This is the only one of the caching filesystems that supports write: you
    will be given a real local open file, and upon close and commit, it will
    be uploaded to the target filesystem; the writability of the target URL
    is not checked until that time.
"""
protocol = "simplecache"
local_file = True
transaction_type = WriteCachedTransaction
def __init__(self, **kwargs):
kw = kwargs.copy()
for key in ["cache_check", "expiry_time", "check_files"]:
kw[key] = False
super().__init__(**kw)
for storage in self.storage:
if not os.path.exists(storage):
os.makedirs(storage, exist_ok=True)
def _check_file(self, path):
self._check_cache()
sha = self._mapper(path)
for storage in self.storage:
fn = os.path.join(storage, sha)
if os.path.exists(fn):
return fn
def save_cache(self):
pass
def load_cache(self):
pass
def pipe_file(self, path, value=None, **kwargs):
if self._intrans:
with self.open(path, "wb") as f:
f.write(value)
else:
super().pipe_file(path, value)
def ls(self, path, detail=True, **kwargs):
path = self._strip_protocol(path)
details = []
try:
details = self.fs.ls(
path, detail=True, **kwargs
).copy() # don't edit original!
except FileNotFoundError as e:
ex = e
else:
ex = None
if self._intrans:
path1 = path.rstrip("/") + "/"
for f in self.transaction.files:
if f.path == path:
details.append(
{"name": path, "size": f.size or f.tell(), "type": "file"}
)
elif f.path.startswith(path1):
if f.path.count("/") == path1.count("/"):
details.append(
{"name": f.path, "size": f.size or f.tell(), "type": "file"}
)
else:
dname = "/".join(f.path.split("/")[: path1.count("/") + 1])
details.append({"name": dname, "size": 0, "type": "directory"})
if ex is not None and not details:
raise ex
if detail:
return details
return sorted(_["name"] for _ in details)
def info(self, path, **kwargs):
path = self._strip_protocol(path)
if self._intrans:
f = [_ for _ in self.transaction.files if _.path == path]
if f:
size = os.path.getsize(f[0].fn) if f[0].closed else f[0].tell()
return {"name": path, "size": size, "type": "file"}
f = any(_.path.startswith(path + "/") for _ in self.transaction.files)
if f:
return {"name": path, "size": 0, "type": "directory"}
return self.fs.info(path, **kwargs)
def pipe(self, path, value=None, **kwargs):
if isinstance(path, str):
self.pipe_file(self._strip_protocol(path), value, **kwargs)
elif isinstance(path, dict):
for k, v in path.items():
self.pipe_file(self._strip_protocol(k), v, **kwargs)
else:
raise ValueError("path must be str or dict")
def cat_ranges(
self, paths, starts, ends, max_gap=None, on_error="return", **kwargs
):
lpaths = [self._check_file(p) for p in paths]
rpaths = [p for l, p in zip(lpaths, paths) if l is False]
lpaths = [l for l, p in zip(lpaths, paths) if l is False]
self.fs.get(rpaths, lpaths)
return super().cat_ranges(
paths, starts, ends, max_gap=max_gap, on_error=on_error, **kwargs
)
def _open(self, path, mode="rb", **kwargs):
path = self._strip_protocol(path)
sha = self._mapper(path)
if "r" not in mode:
fn = os.path.join(self.storage[-1], sha)
user_specified_kwargs = {
k: v
for k, v in kwargs.items()
if k not in ["autocommit", "block_size", "cache_options"]
} # those were added by open()
return LocalTempFile(
self,
path,
mode=mode,
autocommit=not self._intrans,
fn=fn,
**user_specified_kwargs,
)
fn = self._check_file(path)
if fn:
return open(fn, mode)
fn = os.path.join(self.storage[-1], sha)
logger.debug("Copying %s to local cache", path)
kwargs["mode"] = mode
self._mkcache()
self._cache_size = None
if self.compression:
with self.fs._open(path, **kwargs) as f, open(fn, "wb") as f2:
if isinstance(f, AbstractBufferedFile):
# want no type of caching if just downloading whole thing
f.cache = BaseCache(0, f.cache.fetcher, f.size)
comp = (
infer_compression(path)
if self.compression == "infer"
else self.compression
)
f = compr[comp](f, mode="rb")
data = True
while data:
block = getattr(f, "blocksize", 5 * 2**20)
data = f.read(block)
f2.write(data)
else:
self.fs.get_file(path, fn)
return self._open(path, mode)
class LocalTempFile:
"""A temporary local file, which will be uploaded on commit"""
def __init__(self, fs, path, fn, mode="wb", autocommit=True, seek=0, **kwargs):
self.fn = fn
self.fh = open(fn, mode)
self.mode = mode
if seek:
self.fh.seek(seek)
self.path = path
self.size = None
self.fs = fs
self.closed = False
self.autocommit = autocommit
self.kwargs = kwargs
def __reduce__(self):
# always open in r+b to allow continuing writing at a location
return (
LocalTempFile,
(self.fs, self.path, self.fn, "r+b", self.autocommit, self.tell()),
)
def __enter__(self):
return self.fh
def __exit__(self, exc_type, exc_val, exc_tb):
self.close()
def close(self):
# self.size = self.fh.tell()
if self.closed:
return
self.fh.close()
self.closed = True
if self.autocommit:
self.commit()
def discard(self):
self.fh.close()
os.remove(self.fn)
def commit(self):
self.fs.put(self.fn, self.path, **self.kwargs)
# we do not delete local copy - it's still in the cache
@property
def name(self):
return self.fn
def __repr__(self) -> str:
return f"LocalTempFile: {self.path}"
def __getattr__(self, item):
return getattr(self.fh, item)
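# Editor's usage sketch for the caching filesystems defined above. The URLs and
# cache directories are hypothetical placeholders; URL chaining ("simplecache::")
# and fsspec.filesystem(...) are the usual entry points.
if __name__ == "__main__":
    import fsspec

    # Whole-file caching via URL chaining: download once, then read locally.
    with fsspec.open(
        "simplecache::https://example.com/data.csv",  # hypothetical URL
        "rb",
        simplecache={"cache_storage": "/tmp/fsspec-cache"},  # hypothetical directory
    ) as f:
        head = f.read(1024)

    # Block-wise caching as an explicit filesystem instance; the target must be
    # able to report a file size so that the sparse MMapCache can be sized.
    fs = fsspec.filesystem(
        "blockcache",
        target_protocol="https",
        cache_storage="/tmp/fsspec-blocks",  # hypothetical directory
    )
    with fs.open("https://example.com/data.csv", "rb") as f:  # hypothetical URL
        part = f.read(4096)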

View File

@ -0,0 +1,152 @@
import dask
from distributed.client import Client, _get_global_client
from distributed.worker import Worker
from fsspec import filesystem
from fsspec.spec import AbstractBufferedFile, AbstractFileSystem
from fsspec.utils import infer_storage_options
def _get_client(client):
if client is None:
return _get_global_client()
elif isinstance(client, Client):
return client
else:
# e.g., connection string
return Client(client)
def _in_worker():
return bool(Worker._instances)
class DaskWorkerFileSystem(AbstractFileSystem):
"""View files accessible to a worker as any other remote file-system
When instances are run on the worker, uses the real filesystem. When
run on the client, they call the worker to provide information or data.
**Warning** this implementation is experimental, and read-only for now.
"""
def __init__(
self, target_protocol=None, target_options=None, fs=None, client=None, **kwargs
):
super().__init__(**kwargs)
if not (fs is None) ^ (target_protocol is None):
raise ValueError(
"Please provide one of filesystem instance (fs) or"
" target_protocol, not both"
)
self.target_protocol = target_protocol
self.target_options = target_options
self.worker = None
self.client = client
self.fs = fs
self._determine_worker()
@staticmethod
def _get_kwargs_from_urls(path):
so = infer_storage_options(path)
if "host" in so and "port" in so:
return {"client": f"{so['host']}:{so['port']}"}
else:
return {}
def _determine_worker(self):
if _in_worker():
self.worker = True
if self.fs is None:
self.fs = filesystem(
self.target_protocol, **(self.target_options or {})
)
else:
self.worker = False
self.client = _get_client(self.client)
self.rfs = dask.delayed(self)
def mkdir(self, *args, **kwargs):
if self.worker:
self.fs.mkdir(*args, **kwargs)
else:
self.rfs.mkdir(*args, **kwargs).compute()
def rm(self, *args, **kwargs):
if self.worker:
self.fs.rm(*args, **kwargs)
else:
self.rfs.rm(*args, **kwargs).compute()
def copy(self, *args, **kwargs):
if self.worker:
self.fs.copy(*args, **kwargs)
else:
self.rfs.copy(*args, **kwargs).compute()
def mv(self, *args, **kwargs):
if self.worker:
self.fs.mv(*args, **kwargs)
else:
self.rfs.mv(*args, **kwargs).compute()
def ls(self, *args, **kwargs):
if self.worker:
return self.fs.ls(*args, **kwargs)
else:
return self.rfs.ls(*args, **kwargs).compute()
def _open(
self,
path,
mode="rb",
block_size=None,
autocommit=True,
cache_options=None,
**kwargs,
):
if self.worker:
return self.fs._open(
path,
mode=mode,
block_size=block_size,
autocommit=autocommit,
cache_options=cache_options,
**kwargs,
)
else:
return DaskFile(
fs=self,
path=path,
mode=mode,
block_size=block_size,
autocommit=autocommit,
cache_options=cache_options,
**kwargs,
)
def fetch_range(self, path, mode, start, end):
if self.worker:
with self._open(path, mode) as f:
f.seek(start)
return f.read(end - start)
else:
return self.rfs.fetch_range(path, mode, start, end).compute()
class DaskFile(AbstractBufferedFile):
def __init__(self, mode="rb", **kwargs):
if mode != "rb":
raise ValueError('Remote dask files can only be opened in "rb" mode')
super().__init__(**kwargs)
def _upload_chunk(self, final=False):
pass
def _initiate_upload(self):
"""Create remote file/upload"""
pass
def _fetch_range(self, start, end):
"""Get the specified set of bytes from remote"""
return self.fs.fetch_range(self.path, self.mode, start, end)
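# Editor's usage sketch: from a distributed client, view a filesystem as the
# workers see it. The scheduler address and target protocol are hypothetical,
# and this backend is read-only for now (see the class docstring above).
if __name__ == "__main__":
    import fsspec

    fs = fsspec.filesystem(
        "dask",
        client="tcp://127.0.0.1:8786",  # hypothetical scheduler address
        target_protocol="file",         # the workers' local filesystem
    )
    print(fs.ls("/tmp"))                # the listing is computed on a worker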

View File

@ -0,0 +1,58 @@
import base64
import io
from typing import Optional
from urllib.parse import unquote
from fsspec import AbstractFileSystem
class DataFileSystem(AbstractFileSystem):
"""A handy decoder for data-URLs
Example
-------
>>> with fsspec.open("data:,Hello%2C%20World%21") as f:
... print(f.read())
b"Hello, World!"
See https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs
"""
protocol = "data"
def __init__(self, **kwargs):
"""No parameters for this filesystem"""
super().__init__(**kwargs)
def cat_file(self, path, start=None, end=None, **kwargs):
pref, data = path.split(",", 1)
if pref.endswith("base64"):
return base64.b64decode(data)[start:end]
return unquote(data).encode()[start:end]
def info(self, path, **kwargs):
pref, name = path.split(",", 1)
data = self.cat_file(path)
mime = pref.split(":", 1)[1].split(";", 1)[0]
return {"name": name, "size": len(data), "type": "file", "mimetype": mime}
def _open(
self,
path,
mode="rb",
block_size=None,
autocommit=True,
cache_options=None,
**kwargs,
):
if "r" not in mode:
raise ValueError("Read only filesystem")
return io.BytesIO(self.cat_file(path))
@staticmethod
def encode(data: bytes, mime: Optional[str] = None):
"""Format the given data into data-URL syntax
This version always base64 encodes, even when the data is ascii/url-safe.
"""
return f"data:{mime or ''};base64,{base64.b64encode(data).decode()}"

View File

@ -0,0 +1,467 @@
import base64
import urllib
import requests
import requests.exceptions
from requests.adapters import HTTPAdapter, Retry
from fsspec import AbstractFileSystem
from fsspec.spec import AbstractBufferedFile
class DatabricksException(Exception):
"""
Helper class for exceptions raised in this module.
"""
def __init__(self, error_code, message):
"""Create a new DatabricksException"""
super().__init__(message)
self.error_code = error_code
self.message = message
class DatabricksFileSystem(AbstractFileSystem):
"""
Get access to the Databricks filesystem implementation over HTTP.
Can be used inside and outside of a databricks cluster.
"""
def __init__(self, instance, token, **kwargs):
"""
Create a new DatabricksFileSystem.
Parameters
----------
instance: str
The instance URL of the databricks cluster.
For example for an Azure databricks cluster, this
has the form adb-<some-number>.<two digits>.azuredatabricks.net.
token: str
Your personal token. Find out more
here: https://docs.databricks.com/dev-tools/api/latest/authentication.html
"""
self.instance = instance
self.token = token
self.session = requests.Session()
self.retries = Retry(
total=10,
backoff_factor=0.05,
status_forcelist=[408, 429, 500, 502, 503, 504],
)
self.session.mount("https://", HTTPAdapter(max_retries=self.retries))
self.session.headers.update({"Authorization": f"Bearer {self.token}"})
super().__init__(**kwargs)
def ls(self, path, detail=True, **kwargs):
"""
List the contents of the given path.
Parameters
----------
path: str
Absolute path
detail: bool
Return not only the list of filenames,
but also additional information on file sizes
and types.
"""
out = self._ls_from_cache(path)
if not out:
try:
r = self._send_to_api(
method="get", endpoint="list", json={"path": path}
)
except DatabricksException as e:
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
raise FileNotFoundError(e.message) from e
raise
files = r["files"]
out = [
{
"name": o["path"],
"type": "directory" if o["is_dir"] else "file",
"size": o["file_size"],
}
for o in files
]
self.dircache[path] = out
if detail:
return out
return [o["name"] for o in out]
def makedirs(self, path, exist_ok=True):
"""
Create a given absolute path and all of its parents.
Parameters
----------
path: str
Absolute path to create
exist_ok: bool
If false, checks if the folder
exists before creating it (and raises an
Exception if this is the case)
"""
if not exist_ok:
try:
# If the following succeeds, the path is already present
self._send_to_api(
method="get", endpoint="get-status", json={"path": path}
)
raise FileExistsError(f"Path {path} already exists")
except DatabricksException as e:
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
pass
try:
self._send_to_api(method="post", endpoint="mkdirs", json={"path": path})
except DatabricksException as e:
if e.error_code == "RESOURCE_ALREADY_EXISTS":
raise FileExistsError(e.message) from e
raise
self.invalidate_cache(self._parent(path))
def mkdir(self, path, create_parents=True, **kwargs):
"""
Create a given absolute path and all of its parents.
Parameters
----------
path: str
Absolute path to create
create_parents: bool
Whether to create all parents or not.
"False" is not implemented so far.
"""
if not create_parents:
raise NotImplementedError
self.mkdirs(path, **kwargs)
def rm(self, path, recursive=False, **kwargs):
"""
Remove the file or folder at the given absolute path.
Parameters
----------
path: str
            Absolute path of what to remove
recursive: bool
Recursively delete all files in a folder.
"""
try:
self._send_to_api(
method="post",
endpoint="delete",
json={"path": path, "recursive": recursive},
)
except DatabricksException as e:
# This is not really an exception, it just means
# not everything was deleted so far
if e.error_code == "PARTIAL_DELETE":
self.rm(path=path, recursive=recursive)
elif e.error_code == "IO_ERROR":
# Using the same exception as the os module would use here
raise OSError(e.message) from e
raise
self.invalidate_cache(self._parent(path))
def mv(
self, source_path, destination_path, recursive=False, maxdepth=None, **kwargs
):
"""
Move a source to a destination path.
A note from the original [databricks API manual]
(https://docs.databricks.com/dev-tools/api/latest/dbfs.html#move).
When moving a large number of files the API call will time out after
approximately 60s, potentially resulting in partially moved data.
Therefore, for operations that move more than 10k files, we strongly
discourage using the DBFS REST API.
Parameters
----------
source_path: str
From where to move (absolute path)
destination_path: str
To where to move (absolute path)
recursive: bool
            Not implemented so far.
        maxdepth:
            Not implemented so far.
"""
if recursive:
raise NotImplementedError
if maxdepth:
raise NotImplementedError
try:
self._send_to_api(
method="post",
endpoint="move",
json={"source_path": source_path, "destination_path": destination_path},
)
except DatabricksException as e:
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
raise FileNotFoundError(e.message) from e
elif e.error_code == "RESOURCE_ALREADY_EXISTS":
raise FileExistsError(e.message) from e
raise
self.invalidate_cache(self._parent(source_path))
self.invalidate_cache(self._parent(destination_path))
def _open(self, path, mode="rb", block_size="default", **kwargs):
"""
Overwrite the base class method to make sure to create a DBFile.
All arguments are copied from the base method.
Only the default blocksize is allowed.
"""
return DatabricksFile(self, path, mode=mode, block_size=block_size, **kwargs)
def _send_to_api(self, method, endpoint, json):
"""
Send the given json to the DBFS API
using a get or post request (specified by the argument `method`).
Parameters
----------
method: str
Which http method to use for communication; "get" or "post".
endpoint: str
Where to send the request to (last part of the API URL)
json: dict
Dictionary of information to send
"""
if method == "post":
session_call = self.session.post
elif method == "get":
session_call = self.session.get
else:
raise ValueError(f"Do not understand method {method}")
url = urllib.parse.urljoin(f"https://{self.instance}/api/2.0/dbfs/", endpoint)
r = session_call(url, json=json)
# The DBFS API will return a json, also in case of an exception.
        # We want to preserve this information as well as possible.
try:
r.raise_for_status()
except requests.HTTPError as e:
# try to extract json error message
# if that fails, fall back to the original exception
try:
exception_json = e.response.json()
except Exception:
raise e from None
raise DatabricksException(**exception_json) from e
return r.json()
def _create_handle(self, path, overwrite=True):
"""
Internal function to create a handle, which can be used to
write blocks of a file to DBFS.
A handle has a unique identifier which needs to be passed
whenever written during this transaction.
The handle is active for 10 minutes - after that a new
write transaction needs to be created.
Make sure to close the handle after you are finished.
Parameters
----------
path: str
Absolute path for this file.
overwrite: bool
If a file already exist at this location, either overwrite
it or raise an exception.
"""
try:
r = self._send_to_api(
method="post",
endpoint="create",
json={"path": path, "overwrite": overwrite},
)
return r["handle"]
except DatabricksException as e:
if e.error_code == "RESOURCE_ALREADY_EXISTS":
raise FileExistsError(e.message) from e
raise
def _close_handle(self, handle):
"""
Close a handle, which was opened by :func:`_create_handle`.
Parameters
----------
handle: str
Which handle to close.
"""
try:
self._send_to_api(method="post", endpoint="close", json={"handle": handle})
except DatabricksException as e:
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
raise FileNotFoundError(e.message) from e
raise
def _add_data(self, handle, data):
"""
Upload data to an already opened file handle
(opened by :func:`_create_handle`).
The maximal allowed data size is 1MB after
conversion to base64.
Remember to close the handle when you are finished.
Parameters
----------
handle: str
Which handle to upload data to.
data: bytes
Block of data to add to the handle.
"""
data = base64.b64encode(data).decode()
try:
self._send_to_api(
method="post",
endpoint="add-block",
json={"handle": handle, "data": data},
)
except DatabricksException as e:
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
raise FileNotFoundError(e.message) from e
elif e.error_code == "MAX_BLOCK_SIZE_EXCEEDED":
raise ValueError(e.message) from e
raise
def _get_data(self, path, start, end):
"""
        Download a block of data in bytes from a given absolute path,
        covering the range from ``start`` to ``end``.
The maximum number of allowed bytes to read is 1MB.
Parameters
----------
path: str
Absolute path to download data from
start: int
Start position of the block
end: int
End position of the block
"""
try:
r = self._send_to_api(
method="get",
endpoint="read",
json={"path": path, "offset": start, "length": end - start},
)
return base64.b64decode(r["data"])
except DatabricksException as e:
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
raise FileNotFoundError(e.message) from e
elif e.error_code in ["INVALID_PARAMETER_VALUE", "MAX_READ_SIZE_EXCEEDED"]:
raise ValueError(e.message) from e
raise
def invalidate_cache(self, path=None):
if path is None:
self.dircache.clear()
else:
self.dircache.pop(path, None)
super().invalidate_cache(path)
class DatabricksFile(AbstractBufferedFile):
"""
Helper class for files referenced in the DatabricksFileSystem.
"""
DEFAULT_BLOCK_SIZE = 1 * 2**20 # only allowed block size
def __init__(
self,
fs,
path,
mode="rb",
block_size="default",
autocommit=True,
cache_type="readahead",
cache_options=None,
**kwargs,
):
"""
Create a new instance of the DatabricksFile.
The blocksize needs to be the default one.
"""
if block_size is None or block_size == "default":
block_size = self.DEFAULT_BLOCK_SIZE
assert (
block_size == self.DEFAULT_BLOCK_SIZE
), f"Only the default block size is allowed, not {block_size}"
super().__init__(
fs,
path,
mode=mode,
block_size=block_size,
autocommit=autocommit,
cache_type=cache_type,
cache_options=cache_options or {},
**kwargs,
)
def _initiate_upload(self):
"""Internal function to start a file upload"""
self.handle = self.fs._create_handle(self.path)
def _upload_chunk(self, final=False):
"""Internal function to add a chunk of data to a started upload"""
self.buffer.seek(0)
data = self.buffer.getvalue()
data_chunks = [
data[start:end] for start, end in self._to_sized_blocks(len(data))
]
for data_chunk in data_chunks:
self.fs._add_data(handle=self.handle, data=data_chunk)
if final:
self.fs._close_handle(handle=self.handle)
return True
def _fetch_range(self, start, end):
"""Internal function to download a block of data"""
return_buffer = b""
length = end - start
for chunk_start, chunk_end in self._to_sized_blocks(length, start):
return_buffer += self.fs._get_data(
path=self.path, start=chunk_start, end=chunk_end
)
return return_buffer
def _to_sized_blocks(self, length, start=0):
"""Helper function to split a range from 0 to total_length into bloksizes"""
end = start + length
for data_chunk in range(start, end, self.blocksize):
data_start = data_chunk
data_end = min(end, data_chunk + self.blocksize)
yield data_start, data_end
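# Editor's usage sketch: connecting to DBFS from outside a cluster. The instance
# URL, token and paths are hypothetical placeholders.
if __name__ == "__main__":
    import fsspec

    fs = fsspec.filesystem(
        "dbfs",
        instance="adb-1234567890123456.7.azuredatabricks.net",  # hypothetical instance
        token="dapiXXXXXXXXXXXXXXXX",                            # hypothetical token
    )
    print(fs.ls("/"))
    with fs.open("/tmp/example.txt", "wb") as f:  # hypothetical path
        f.write(b"hello from fsspec")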

View File

@ -0,0 +1,384 @@
from .. import filesystem
from ..asyn import AsyncFileSystem
class DirFileSystem(AsyncFileSystem):
"""Directory prefix filesystem
The DirFileSystem is a filesystem-wrapper. It assumes every path it is dealing with
is relative to the `path`. After performing the necessary paths operation it
delegates everything to the wrapped filesystem.
"""
protocol = "dir"
def __init__(
self,
path=None,
fs=None,
fo=None,
target_protocol=None,
target_options=None,
**storage_options,
):
"""
Parameters
----------
path: str
Path to the directory.
fs: AbstractFileSystem
An instantiated filesystem to wrap.
target_protocol, target_options:
if fs is none, construct it from these
fo: str
Alternate for path; do not provide both
"""
super().__init__(**storage_options)
if fs is None:
fs = filesystem(protocol=target_protocol, **(target_options or {}))
if (path is not None) ^ (fo is not None) is False:
raise ValueError("Provide path or fo, not both")
path = path or fo
if self.asynchronous and not fs.async_impl:
raise ValueError("can't use asynchronous with non-async fs")
if fs.async_impl and self.asynchronous != fs.asynchronous:
raise ValueError("both dirfs and fs should be in the same sync/async mode")
self.path = fs._strip_protocol(path)
self.fs = fs
def _join(self, path):
if isinstance(path, str):
if not self.path:
return path
if not path:
return self.path
return self.fs.sep.join((self.path, self._strip_protocol(path)))
if isinstance(path, dict):
return {self._join(_path): value for _path, value in path.items()}
return [self._join(_path) for _path in path]
def _relpath(self, path):
if isinstance(path, str):
if not self.path:
return path
# We need to account for S3FileSystem returning paths that do not
# start with a '/'
if path == self.path or (
self.path.startswith(self.fs.sep) and path == self.path[1:]
):
return ""
prefix = self.path + self.fs.sep
if self.path.startswith(self.fs.sep) and not path.startswith(self.fs.sep):
prefix = prefix[1:]
assert path.startswith(prefix)
return path[len(prefix) :]
return [self._relpath(_path) for _path in path]
# Wrappers below
@property
def sep(self):
return self.fs.sep
async def set_session(self, *args, **kwargs):
return await self.fs.set_session(*args, **kwargs)
async def _rm_file(self, path, **kwargs):
return await self.fs._rm_file(self._join(path), **kwargs)
def rm_file(self, path, **kwargs):
return self.fs.rm_file(self._join(path), **kwargs)
async def _rm(self, path, *args, **kwargs):
return await self.fs._rm(self._join(path), *args, **kwargs)
def rm(self, path, *args, **kwargs):
return self.fs.rm(self._join(path), *args, **kwargs)
async def _cp_file(self, path1, path2, **kwargs):
return await self.fs._cp_file(self._join(path1), self._join(path2), **kwargs)
def cp_file(self, path1, path2, **kwargs):
return self.fs.cp_file(self._join(path1), self._join(path2), **kwargs)
async def _copy(
self,
path1,
path2,
*args,
**kwargs,
):
return await self.fs._copy(
self._join(path1),
self._join(path2),
*args,
**kwargs,
)
def copy(self, path1, path2, *args, **kwargs):
return self.fs.copy(
self._join(path1),
self._join(path2),
*args,
**kwargs,
)
async def _pipe(self, path, *args, **kwargs):
return await self.fs._pipe(self._join(path), *args, **kwargs)
def pipe(self, path, *args, **kwargs):
return self.fs.pipe(self._join(path), *args, **kwargs)
async def _pipe_file(self, path, *args, **kwargs):
return await self.fs._pipe_file(self._join(path), *args, **kwargs)
def pipe_file(self, path, *args, **kwargs):
return self.fs.pipe_file(self._join(path), *args, **kwargs)
async def _cat_file(self, path, *args, **kwargs):
return await self.fs._cat_file(self._join(path), *args, **kwargs)
def cat_file(self, path, *args, **kwargs):
return self.fs.cat_file(self._join(path), *args, **kwargs)
async def _cat(self, path, *args, **kwargs):
ret = await self.fs._cat(
self._join(path),
*args,
**kwargs,
)
if isinstance(ret, dict):
return {self._relpath(key): value for key, value in ret.items()}
return ret
def cat(self, path, *args, **kwargs):
ret = self.fs.cat(
self._join(path),
*args,
**kwargs,
)
if isinstance(ret, dict):
return {self._relpath(key): value for key, value in ret.items()}
return ret
async def _put_file(self, lpath, rpath, **kwargs):
return await self.fs._put_file(lpath, self._join(rpath), **kwargs)
def put_file(self, lpath, rpath, **kwargs):
return self.fs.put_file(lpath, self._join(rpath), **kwargs)
async def _put(
self,
lpath,
rpath,
*args,
**kwargs,
):
return await self.fs._put(
lpath,
self._join(rpath),
*args,
**kwargs,
)
def put(self, lpath, rpath, *args, **kwargs):
return self.fs.put(
lpath,
self._join(rpath),
*args,
**kwargs,
)
async def _get_file(self, rpath, lpath, **kwargs):
return await self.fs._get_file(self._join(rpath), lpath, **kwargs)
def get_file(self, rpath, lpath, **kwargs):
return self.fs.get_file(self._join(rpath), lpath, **kwargs)
async def _get(self, rpath, *args, **kwargs):
return await self.fs._get(self._join(rpath), *args, **kwargs)
def get(self, rpath, *args, **kwargs):
return self.fs.get(self._join(rpath), *args, **kwargs)
async def _isfile(self, path):
return await self.fs._isfile(self._join(path))
def isfile(self, path):
return self.fs.isfile(self._join(path))
async def _isdir(self, path):
return await self.fs._isdir(self._join(path))
def isdir(self, path):
return self.fs.isdir(self._join(path))
async def _size(self, path):
return await self.fs._size(self._join(path))
def size(self, path):
return self.fs.size(self._join(path))
async def _exists(self, path):
return await self.fs._exists(self._join(path))
def exists(self, path):
return self.fs.exists(self._join(path))
async def _info(self, path, **kwargs):
return await self.fs._info(self._join(path), **kwargs)
def info(self, path, **kwargs):
return self.fs.info(self._join(path), **kwargs)
async def _ls(self, path, detail=True, **kwargs):
ret = (await self.fs._ls(self._join(path), detail=detail, **kwargs)).copy()
if detail:
out = []
for entry in ret:
entry = entry.copy()
entry["name"] = self._relpath(entry["name"])
out.append(entry)
return out
return self._relpath(ret)
def ls(self, path, detail=True, **kwargs):
ret = self.fs.ls(self._join(path), detail=detail, **kwargs).copy()
if detail:
out = []
for entry in ret:
entry = entry.copy()
entry["name"] = self._relpath(entry["name"])
out.append(entry)
return out
return self._relpath(ret)
async def _walk(self, path, *args, **kwargs):
async for root, dirs, files in self.fs._walk(self._join(path), *args, **kwargs):
yield self._relpath(root), dirs, files
def walk(self, path, *args, **kwargs):
for root, dirs, files in self.fs.walk(self._join(path), *args, **kwargs):
yield self._relpath(root), dirs, files
async def _glob(self, path, **kwargs):
detail = kwargs.get("detail", False)
ret = await self.fs._glob(self._join(path), **kwargs)
if detail:
return {self._relpath(path): info for path, info in ret.items()}
return self._relpath(ret)
def glob(self, path, **kwargs):
detail = kwargs.get("detail", False)
ret = self.fs.glob(self._join(path), **kwargs)
if detail:
return {self._relpath(path): info for path, info in ret.items()}
return self._relpath(ret)
async def _du(self, path, *args, **kwargs):
total = kwargs.get("total", True)
ret = await self.fs._du(self._join(path), *args, **kwargs)
if total:
return ret
return {self._relpath(path): size for path, size in ret.items()}
def du(self, path, *args, **kwargs):
total = kwargs.get("total", True)
ret = self.fs.du(self._join(path), *args, **kwargs)
if total:
return ret
return {self._relpath(path): size for path, size in ret.items()}
async def _find(self, path, *args, **kwargs):
detail = kwargs.get("detail", False)
ret = await self.fs._find(self._join(path), *args, **kwargs)
if detail:
return {self._relpath(path): info for path, info in ret.items()}
return self._relpath(ret)
def find(self, path, *args, **kwargs):
detail = kwargs.get("detail", False)
ret = self.fs.find(self._join(path), *args, **kwargs)
if detail:
return {self._relpath(path): info for path, info in ret.items()}
return self._relpath(ret)
async def _expand_path(self, path, *args, **kwargs):
return self._relpath(
await self.fs._expand_path(self._join(path), *args, **kwargs)
)
def expand_path(self, path, *args, **kwargs):
return self._relpath(self.fs.expand_path(self._join(path), *args, **kwargs))
async def _mkdir(self, path, *args, **kwargs):
return await self.fs._mkdir(self._join(path), *args, **kwargs)
def mkdir(self, path, *args, **kwargs):
return self.fs.mkdir(self._join(path), *args, **kwargs)
async def _makedirs(self, path, *args, **kwargs):
return await self.fs._makedirs(self._join(path), *args, **kwargs)
def makedirs(self, path, *args, **kwargs):
return self.fs.makedirs(self._join(path), *args, **kwargs)
def rmdir(self, path):
return self.fs.rmdir(self._join(path))
def mv(self, path1, path2, **kwargs):
return self.fs.mv(
self._join(path1),
self._join(path2),
**kwargs,
)
def touch(self, path, **kwargs):
return self.fs.touch(self._join(path), **kwargs)
def created(self, path):
return self.fs.created(self._join(path))
def modified(self, path):
return self.fs.modified(self._join(path))
def sign(self, path, *args, **kwargs):
return self.fs.sign(self._join(path), *args, **kwargs)
def __repr__(self):
return f"{self.__class__.__qualname__}(path='{self.path}', fs={self.fs})"
def open(
self,
path,
*args,
**kwargs,
):
return self.fs.open(
self._join(path),
*args,
**kwargs,
)
async def open_async(
self,
path,
*args,
**kwargs,
):
return await self.fs.open_async(
self._join(path),
*args,
**kwargs,
)
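# Editor's usage sketch: restrict an existing filesystem to a sub-directory so
# that all paths are interpreted relative to it. The local directory and file
# names used here are hypothetical.
if __name__ == "__main__":
    import fsspec

    dirfs = fsspec.filesystem("dir", path="/data/project", target_protocol="file")
    print(dirfs.ls(""))                   # entries reported relative to /data/project
    print(dirfs.cat_file("readme.txt"))   # reads /data/project/readme.txt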

View File

@ -0,0 +1,395 @@
import os
import sys
import uuid
import warnings
from ftplib import FTP, FTP_TLS, Error, error_perm
from typing import Any
from ..spec import AbstractBufferedFile, AbstractFileSystem
from ..utils import infer_storage_options, isfilelike
class FTPFileSystem(AbstractFileSystem):
"""A filesystem over classic FTP"""
root_marker = "/"
cachable = False
protocol = "ftp"
def __init__(
self,
host,
port=21,
username=None,
password=None,
acct=None,
block_size=None,
tempdir=None,
timeout=30,
encoding="utf-8",
tls=False,
**kwargs,
):
"""
You can use _get_kwargs_from_urls to get some kwargs from
a reasonable FTP url.
Authentication will be anonymous if username/password are not
given.
Parameters
----------
host: str
The remote server name/ip to connect to
port: int
Port to connect with
username: str or None
If authenticating, the user's identifier
        password: str or None
User's password on the server, if using
acct: str or None
Some servers also need an "account" string for auth
block_size: int or None
If given, the read-ahead or write buffer size.
tempdir: str
Directory on remote to put temporary files when in a transaction
timeout: int
Timeout of the ftp connection in seconds
encoding: str
Encoding to use for directories and filenames in FTP connection
tls: bool
Use FTP-TLS, by default False
"""
super().__init__(**kwargs)
self.host = host
self.port = port
self.tempdir = tempdir or "/tmp"
self.cred = username or "", password or "", acct or ""
self.timeout = timeout
self.encoding = encoding
if block_size is not None:
self.blocksize = block_size
else:
self.blocksize = 2**16
self.tls = tls
self._connect()
if self.tls:
self.ftp.prot_p()
def _connect(self):
if self.tls:
ftp_cls = FTP_TLS
else:
ftp_cls = FTP
if sys.version_info >= (3, 9):
self.ftp = ftp_cls(timeout=self.timeout, encoding=self.encoding)
elif self.encoding:
warnings.warn("`encoding` not supported for python<3.9, ignoring")
self.ftp = ftp_cls(timeout=self.timeout)
else:
self.ftp = ftp_cls(timeout=self.timeout)
self.ftp.connect(self.host, self.port)
self.ftp.login(*self.cred)
@classmethod
def _strip_protocol(cls, path):
return "/" + infer_storage_options(path)["path"].lstrip("/").rstrip("/")
@staticmethod
def _get_kwargs_from_urls(urlpath):
out = infer_storage_options(urlpath)
out.pop("path", None)
out.pop("protocol", None)
return out
def ls(self, path, detail=True, **kwargs):
path = self._strip_protocol(path)
out = []
if path not in self.dircache:
try:
try:
out = [
(fn, details)
for (fn, details) in self.ftp.mlsd(path)
if fn not in [".", ".."]
and details["type"] not in ["pdir", "cdir"]
]
except error_perm:
out = _mlsd2(self.ftp, path) # Not platform independent
for fn, details in out:
details["name"] = "/".join(
["" if path == "/" else path, fn.lstrip("/")]
)
if details["type"] == "file":
details["size"] = int(details["size"])
else:
details["size"] = 0
if details["type"] == "dir":
details["type"] = "directory"
self.dircache[path] = out
except Error:
try:
info = self.info(path)
if info["type"] == "file":
out = [(path, info)]
except (Error, IndexError) as exc:
raise FileNotFoundError(path) from exc
files = self.dircache.get(path, out)
if not detail:
return sorted([fn for fn, details in files])
return [details for fn, details in files]
def info(self, path, **kwargs):
# implement with direct method
path = self._strip_protocol(path)
if path == "/":
# special case, since this dir has no real entry
return {"name": "/", "size": 0, "type": "directory"}
files = self.ls(self._parent(path).lstrip("/"), True)
try:
out = next(f for f in files if f["name"] == path)
except StopIteration as exc:
raise FileNotFoundError(path) from exc
return out
def get_file(self, rpath, lpath, **kwargs):
if self.isdir(rpath):
if not os.path.exists(lpath):
os.mkdir(lpath)
return
if isfilelike(lpath):
outfile = lpath
else:
outfile = open(lpath, "wb")
def cb(x):
outfile.write(x)
self.ftp.retrbinary(
f"RETR {rpath}",
blocksize=self.blocksize,
callback=cb,
)
if not isfilelike(lpath):
outfile.close()
def cat_file(self, path, start=None, end=None, **kwargs):
if end is not None:
return super().cat_file(path, start, end, **kwargs)
out = []
def cb(x):
out.append(x)
try:
self.ftp.retrbinary(
f"RETR {path}",
blocksize=self.blocksize,
rest=start,
callback=cb,
)
except (Error, error_perm) as orig_exc:
raise FileNotFoundError(path) from orig_exc
return b"".join(out)
def _open(
self,
path,
mode="rb",
block_size=None,
cache_options=None,
autocommit=True,
**kwargs,
):
path = self._strip_protocol(path)
block_size = block_size or self.blocksize
return FTPFile(
self,
path,
mode=mode,
block_size=block_size,
tempdir=self.tempdir,
autocommit=autocommit,
cache_options=cache_options,
)
def _rm(self, path):
path = self._strip_protocol(path)
self.ftp.delete(path)
self.invalidate_cache(self._parent(path))
def rm(self, path, recursive=False, maxdepth=None):
paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
for p in reversed(paths):
if self.isfile(p):
self.rm_file(p)
else:
self.rmdir(p)
def mkdir(self, path: str, create_parents: bool = True, **kwargs: Any) -> None:
path = self._strip_protocol(path)
parent = self._parent(path)
if parent != self.root_marker and not self.exists(parent) and create_parents:
self.mkdir(parent, create_parents=create_parents)
self.ftp.mkd(path)
self.invalidate_cache(self._parent(path))
def makedirs(self, path: str, exist_ok: bool = False) -> None:
path = self._strip_protocol(path)
if self.exists(path):
# NB: "/" does not "exist" as it has no directory entry
if not exist_ok:
raise FileExistsError(f"{path} exists without `exist_ok`")
            # exist_ok=True -> no-op
else:
self.mkdir(path, create_parents=True)
def rmdir(self, path):
path = self._strip_protocol(path)
self.ftp.rmd(path)
self.invalidate_cache(self._parent(path))
def mv(self, path1, path2, **kwargs):
path1 = self._strip_protocol(path1)
path2 = self._strip_protocol(path2)
self.ftp.rename(path1, path2)
self.invalidate_cache(self._parent(path1))
self.invalidate_cache(self._parent(path2))
def __del__(self):
self.ftp.close()
def invalidate_cache(self, path=None):
if path is None:
self.dircache.clear()
else:
self.dircache.pop(path, None)
super().invalidate_cache(path)
class TransferDone(Exception):
"""Internal exception to break out of transfer"""
pass
class FTPFile(AbstractBufferedFile):
"""Interact with a remote FTP file with read/write buffering"""
def __init__(
self,
fs,
path,
mode="rb",
block_size="default",
autocommit=True,
cache_type="readahead",
cache_options=None,
**kwargs,
):
super().__init__(
fs,
path,
mode=mode,
block_size=block_size,
autocommit=autocommit,
cache_type=cache_type,
cache_options=cache_options,
**kwargs,
)
if not autocommit:
self.target = self.path
self.path = "/".join([kwargs["tempdir"], str(uuid.uuid4())])
def commit(self):
self.fs.mv(self.path, self.target)
def discard(self):
self.fs.rm(self.path)
def _fetch_range(self, start, end):
"""Get bytes between given byte limits
Implemented by raising an exception in the fetch callback when the
number of bytes received reaches the requested amount.
Will fail if the server does not respect the REST command on
retrieve requests.
"""
out = []
total = [0]
def callback(x):
total[0] += len(x)
if total[0] > end - start:
out.append(x[: (end - start) - total[0]])
if end < self.size:
raise TransferDone
else:
out.append(x)
if total[0] == end - start and end < self.size:
raise TransferDone
try:
self.fs.ftp.retrbinary(
f"RETR {self.path}",
blocksize=self.blocksize,
rest=start,
callback=callback,
)
except TransferDone:
try:
# stop transfer, we got enough bytes for this block
self.fs.ftp.abort()
self.fs.ftp.getmultiline()
except Error:
self.fs._connect()
return b"".join(out)
def _upload_chunk(self, final=False):
self.buffer.seek(0)
self.fs.ftp.storbinary(
f"STOR {self.path}", self.buffer, blocksize=self.blocksize, rest=self.offset
)
return True
def _mlsd2(ftp, path="."):
"""
Fall back to using `dir` instead of `mlsd` if not supported.
This parses a Linux style `ls -l` response to `dir`, but the response may
be platform dependent.
Parameters
----------
ftp: ftplib.FTP
path: str
        Remote path to list; defaults to the current directory ".".
"""
lines = []
minfo = []
ftp.dir(path, lines.append)
for line in lines:
split_line = line.split()
if len(split_line) < 9:
continue
this = (
split_line[-1],
{
"modify": " ".join(split_line[5:8]),
"unix.owner": split_line[2],
"unix.group": split_line[3],
"unix.mode": split_line[0],
"size": split_line[4],
},
)
if "d" == this[1]["unix.mode"][0]:
this[1]["type"] = "dir"
else:
this[1]["type"] = "file"
minfo.append(this)
return minfo
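# Editor's usage sketch: anonymous FTP access with the filesystem defined above.
# The host and file paths are hypothetical placeholders.
if __name__ == "__main__":
    import fsspec

    fs = fsspec.filesystem("ftp", host="ftp.example.com", timeout=30)  # anonymous login
    print(fs.ls("/pub", detail=False))
    with fs.open("/pub/README", "rb") as f:  # hypothetical file
        print(f.read(200))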

View File

@ -0,0 +1,115 @@
import os
import pygit2
from fsspec.spec import AbstractFileSystem
from .memory import MemoryFile
class GitFileSystem(AbstractFileSystem):
"""Browse the files of a local git repo at any hash/tag/branch
(experimental backend)
"""
root_marker = ""
cachable = True
def __init__(self, path=None, fo=None, ref=None, **kwargs):
"""
Parameters
----------
path: str (optional)
Local location of the repo (uses current directory if not given).
May be deprecated in favour of ``fo``. When used with a higher
level function such as fsspec.open(), may be of the form
"git://[path-to-repo[:]][ref@]path/to/file" (but the actual
file path should not contain "@" or ":").
fo: str (optional)
Same as ``path``, but passed as part of a chained URL. This one
takes precedence if both are given.
ref: str (optional)
Reference to work with, could be a hash, tag or branch name. Defaults
to current working tree. Note that ``ls`` and ``open`` also take hash,
so this becomes the default for those operations
kwargs
"""
super().__init__(**kwargs)
self.repo = pygit2.Repository(fo or path or os.getcwd())
self.ref = ref or "master"
@classmethod
def _strip_protocol(cls, path):
path = super()._strip_protocol(path).lstrip("/")
if ":" in path:
path = path.split(":", 1)[1]
if "@" in path:
path = path.split("@", 1)[1]
return path.lstrip("/")
def _path_to_object(self, path, ref):
comm, ref = self.repo.resolve_refish(ref or self.ref)
parts = path.split("/")
tree = comm.tree
for part in parts:
if part and isinstance(tree, pygit2.Tree):
if part not in tree:
raise FileNotFoundError(path)
tree = tree[part]
return tree
@staticmethod
def _get_kwargs_from_urls(path):
if path.startswith("git://"):
path = path[6:]
out = {}
if ":" in path:
out["path"], path = path.split(":", 1)
if "@" in path:
out["ref"], path = path.split("@", 1)
return out
@staticmethod
def _object_to_info(obj, path=None):
# obj.name and obj.filemode are None for the root tree!
is_dir = isinstance(obj, pygit2.Tree)
return {
"type": "directory" if is_dir else "file",
"name": (
"/".join([path, obj.name or ""]).lstrip("/") if path else obj.name
),
"hex": str(obj.id),
"mode": "100644" if obj.filemode is None else f"{obj.filemode:o}",
"size": 0 if is_dir else obj.size,
}
def ls(self, path, detail=True, ref=None, **kwargs):
tree = self._path_to_object(self._strip_protocol(path), ref)
return [
GitFileSystem._object_to_info(obj, path)
if detail
else GitFileSystem._object_to_info(obj, path)["name"]
for obj in (tree if isinstance(tree, pygit2.Tree) else [tree])
]
def info(self, path, ref=None, **kwargs):
tree = self._path_to_object(self._strip_protocol(path), ref)
return GitFileSystem._object_to_info(tree, path)
def ukey(self, path, ref=None):
return self.info(path, ref=ref)["hex"]
def _open(
self,
path,
mode="rb",
block_size=None,
autocommit=True,
cache_options=None,
ref=None,
**kwargs,
):
obj = self._path_to_object(path, ref or self.ref)
return MemoryFile(data=obj.data)
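# Illustrative usage sketch (not part of the original module). The repository
# location and file name are placeholders; they assume a local clone that
# contains "README.md" on the default branch and that pygit2 is installed.
if __name__ == "__main__":
    import fsspec

    # Chained-URL form: "git://[path-to-repo[:]][ref@]path/to/file"
    with fsspec.open("git:///path/to/repo:README.md", "rb") as f:
        print(f.read()[:80])

    # Explicit construction, pinning a ref
    fs = GitFileSystem(path="/path/to/repo", ref="HEAD")
    print(fs.ls(""))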

View File

@ -0,0 +1,239 @@
import requests
import fsspec
from ..spec import AbstractFileSystem
from ..utils import infer_storage_options
from .memory import MemoryFile
# TODO: add GIST backend, would be very similar
class GithubFileSystem(AbstractFileSystem):
"""Interface to files in github
An instance of this class provides the files residing within a remote github
repository. You may specify a point in the repo's history, by SHA, branch
or tag (default is the repository's default branch).
Given that code files tend to be small, and that github does not support
retrieving partial content, we always fetch whole files.
When using fsspec.open, allows URIs of the form:
- "github://path/file", in which case you must specify org, repo and
may specify sha in the extra args
- 'github://org:repo@/precip/catalog.yml', where the org and repo are
part of the URI
- 'github://org:repo@sha/precip/catalog.yml', where the sha is also included
``sha`` can be the full or abbreviated hex of the commit you want to fetch
from, or a branch or tag name (so long as it doesn't contain special characters
like "/", "?", which would have to be HTTP-encoded).
For authorised access, you must provide a username and token; tokens can
be created at https://github.com/settings/tokens
"""
url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}"
rurl = "https://raw.githubusercontent.com/{org}/{repo}/{sha}/{path}"
protocol = "github"
timeout = (60, 60) # connect, read timeouts
def __init__(
self, org, repo, sha=None, username=None, token=None, timeout=None, **kwargs
):
super().__init__(**kwargs)
self.org = org
self.repo = repo
if (username is None) ^ (token is None):
raise ValueError("Auth required both username and token")
self.username = username
self.token = token
if timeout is not None:
self.timeout = timeout
if sha is None:
# look up default branch (not necessarily "master")
u = "https://api.github.com/repos/{org}/{repo}"
r = requests.get(
u.format(org=org, repo=repo), timeout=self.timeout, **self.kw
)
r.raise_for_status()
sha = r.json()["default_branch"]
self.root = sha
self.ls("")
@property
def kw(self):
if self.username:
return {"auth": (self.username, self.token)}
return {}
@classmethod
def repos(cls, org_or_user, is_org=True):
"""List repo names for given org or user
This may become the top level of the FS
Parameters
----------
org_or_user: str
Name of the github org or user to query
is_org: bool (default True)
Whether the name is an organisation (True) or user (False)
Returns
-------
List of string
"""
r = requests.get(
f"https://api.github.com/{['users', 'orgs'][is_org]}/{org_or_user}/repos",
timeout=cls.timeout,
)
r.raise_for_status()
return [repo["name"] for repo in r.json()]
@property
def tags(self):
"""Names of tags in the repo"""
r = requests.get(
f"https://api.github.com/repos/{self.org}/{self.repo}/tags",
timeout=self.timeout,
**self.kw,
)
r.raise_for_status()
return [t["name"] for t in r.json()]
@property
def branches(self):
"""Names of branches in the repo"""
r = requests.get(
f"https://api.github.com/repos/{self.org}/{self.repo}/branches",
timeout=self.timeout,
**self.kw,
)
r.raise_for_status()
return [t["name"] for t in r.json()]
@property
def refs(self):
"""Named references, tags and branches"""
return {"tags": self.tags, "branches": self.branches}
def ls(self, path, detail=False, sha=None, _sha=None, **kwargs):
"""List files at given path
Parameters
----------
path: str
Location to list, relative to repo root
detail: bool
If True, returns list of dicts, one per file; if False, returns
list of full filenames only
sha: str (optional)
List at the given point in the repo history, branch or tag name or commit
SHA
_sha: str (optional)
List this specific tree object (used internally to descend into trees)
"""
path = self._strip_protocol(path)
if path == "":
_sha = sha or self.root
if _sha is None:
parts = path.rstrip("/").split("/")
so_far = ""
_sha = sha or self.root
for part in parts:
out = self.ls(so_far, True, sha=sha, _sha=_sha)
so_far += "/" + part if so_far else part
out = [o for o in out if o["name"] == so_far]
if not out:
raise FileNotFoundError(path)
out = out[0]
if out["type"] == "file":
if detail:
return [out]
else:
return path
_sha = out["sha"]
if path not in self.dircache or sha not in [self.root, None]:
r = requests.get(
self.url.format(org=self.org, repo=self.repo, sha=_sha),
timeout=self.timeout,
**self.kw,
)
if r.status_code == 404:
raise FileNotFoundError(path)
r.raise_for_status()
types = {"blob": "file", "tree": "directory"}
out = [
{
"name": path + "/" + f["path"] if path else f["path"],
"mode": f["mode"],
"type": types[f["type"]],
"size": f.get("size", 0),
"sha": f["sha"],
}
for f in r.json()["tree"]
if f["type"] in types
]
if sha in [self.root, None]:
self.dircache[path] = out
else:
out = self.dircache[path]
if detail:
return out
else:
return sorted([f["name"] for f in out])
def invalidate_cache(self, path=None):
self.dircache.clear()
@classmethod
def _strip_protocol(cls, path):
opts = infer_storage_options(path)
if "username" not in opts:
return super()._strip_protocol(path)
return opts["path"].lstrip("/")
@staticmethod
def _get_kwargs_from_urls(path):
opts = infer_storage_options(path)
if "username" not in opts:
return {}
out = {"org": opts["username"], "repo": opts["password"]}
if opts["host"]:
out["sha"] = opts["host"]
return out
def _open(
self,
path,
mode="rb",
block_size=None,
autocommit=True,
cache_options=None,
sha=None,
**kwargs,
):
if mode != "rb":
raise NotImplementedError
url = self.rurl.format(
org=self.org, repo=self.repo, path=path, sha=sha or self.root
)
r = requests.get(url, timeout=self.timeout, **self.kw)
if r.status_code == 404:
raise FileNotFoundError(path)
r.raise_for_status()
return MemoryFile(None, None, r.content)
def cat(self, path, recursive=False, on_error="raise", **kwargs):
paths = self.expand_path(path, recursive=recursive)
urls = [
self.rurl.format(org=self.org, repo=self.repo, path=u, sha=self.root)
for u in paths
]
fs = fsspec.filesystem("http")
data = fs.cat(urls, on_error="return")
return {u: v for ((k, v), u) in zip(data.items(), urls)}
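# Illustrative usage sketch (not part of the original module). The org/repo
# pair "fsspec/filesystem_spec" is used purely as an example of a public
# repository; unauthenticated calls are subject to GitHub API rate limits.
if __name__ == "__main__":
    fs = GithubFileSystem(org="fsspec", repo="filesystem_spec")
    print(fs.ls(""))                       # tree listing at the default branch
    print(fs.cat_file("README.md")[:60])   # whole-file fetch via raw.githubusercontent.com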

View File

@ -0,0 +1,874 @@
import asyncio
import io
import logging
import re
import weakref
from copy import copy
from urllib.parse import urlparse
import aiohttp
import yarl
from fsspec.asyn import AbstractAsyncStreamedFile, AsyncFileSystem, sync, sync_wrapper
from fsspec.callbacks import DEFAULT_CALLBACK
from fsspec.exceptions import FSTimeoutError
from fsspec.spec import AbstractBufferedFile
from fsspec.utils import (
DEFAULT_BLOCK_SIZE,
glob_translate,
isfilelike,
nullcontext,
tokenize,
)
from ..caching import AllBytes
# https://stackoverflow.com/a/15926317/3821154
ex = re.compile(r"""<(a|A)\s+(?:[^>]*?\s+)?(href|HREF)=["'](?P<url>[^"']+)""")
ex2 = re.compile(r"""(?P<url>http[s]?://[-a-zA-Z0-9@:%_+.~#?&/=]+)""")
logger = logging.getLogger("fsspec.http")
async def get_client(**kwargs):
return aiohttp.ClientSession(**kwargs)
class HTTPFileSystem(AsyncFileSystem):
"""
Simple File-System for fetching data via HTTP(S)
``ls()`` is implemented by loading the parent page and doing a regex
match on the result. If simple_links=True, anything of the form
"http(s)://server.com/stuff?thing=other" is considered a link; otherwise
only links within HTML href tags will be used.
"""
sep = "/"
def __init__(
self,
simple_links=True,
block_size=None,
same_scheme=True,
size_policy=None,
cache_type="bytes",
cache_options=None,
asynchronous=False,
loop=None,
client_kwargs=None,
get_client=get_client,
encoded=False,
**storage_options,
):
"""
NB: if this is called async, you must await set_client
Parameters
----------
block_size: int
Blocks to read bytes; if 0, will return streaming file-like
objects instead of buffered HTTPFile instances
simple_links: bool
If True, will consider both HTML <a> tags and anything that looks
like a URL; if False, will consider only the former.
same_scheme: True
When doing ls/glob, if this is True, only consider paths that have
http/https matching the input URLs.
size_policy: this argument is deprecated
client_kwargs: dict
Passed to aiohttp.ClientSession, see
https://docs.aiohttp.org/en/stable/client_reference.html
For example, ``{'auth': aiohttp.BasicAuth('user', 'pass')}``
get_client: Callable[..., aiohttp.ClientSession]
A callable which takes keyword arguments and constructs
an aiohttp.ClientSession. Its state will be managed by
the HTTPFileSystem class.
storage_options: key-value
Any other parameters passed on to requests
cache_type, cache_options: defaults used in open
"""
super().__init__(self, asynchronous=asynchronous, loop=loop, **storage_options)
self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE
self.simple_links = simple_links
self.same_schema = same_scheme
self.cache_type = cache_type
self.cache_options = cache_options
self.client_kwargs = client_kwargs or {}
self.get_client = get_client
self.encoded = encoded
self.kwargs = storage_options
self._session = None
# Clean caching-related parameters from `storage_options`
# before propagating them as `request_options` through `self.kwargs`.
# TODO: Maybe rename `self.kwargs` to `self.request_options` to make
# it clearer.
request_options = copy(storage_options)
self.use_listings_cache = request_options.pop("use_listings_cache", False)
request_options.pop("listings_expiry_time", None)
request_options.pop("max_paths", None)
request_options.pop("skip_instance_cache", None)
self.kwargs = request_options
@property
def fsid(self):
return "http"
def encode_url(self, url):
return yarl.URL(url, encoded=self.encoded)
@staticmethod
def close_session(loop, session):
if loop is not None and loop.is_running():
try:
sync(loop, session.close, timeout=0.1)
return
except (TimeoutError, FSTimeoutError, NotImplementedError):
pass
connector = getattr(session, "_connector", None)
if connector is not None:
# close after loop is dead
connector._close()
async def set_session(self):
if self._session is None:
self._session = await self.get_client(loop=self.loop, **self.client_kwargs)
if not self.asynchronous:
weakref.finalize(self, self.close_session, self.loop, self._session)
return self._session
@classmethod
def _strip_protocol(cls, path):
"""For HTTP, we always want to keep the full URL"""
return path
@classmethod
def _parent(cls, path):
# override, since _strip_protocol is different for URLs
par = super()._parent(path)
if len(par) > 7: # "http://..."
return par
return ""
async def _ls_real(self, url, detail=True, **kwargs):
# ignoring URL-encoded arguments
kw = self.kwargs.copy()
kw.update(kwargs)
logger.debug(url)
session = await self.set_session()
async with session.get(self.encode_url(url), **self.kwargs) as r:
self._raise_not_found_for_status(r, url)
try:
text = await r.text()
if self.simple_links:
links = ex2.findall(text) + [u[2] for u in ex.findall(text)]
else:
links = [u[2] for u in ex.findall(text)]
except UnicodeDecodeError:
links = [] # binary, not HTML
out = set()
parts = urlparse(url)
for l in links:
if isinstance(l, tuple):
l = l[1]
if l.startswith("/") and len(l) > 1:
# absolute URL on this server
l = f"{parts.scheme}://{parts.netloc}{l}"
if l.startswith("http"):
if self.same_schema and l.startswith(url.rstrip("/") + "/"):
out.add(l)
elif l.replace("https", "http").startswith(
url.replace("https", "http").rstrip("/") + "/"
):
# allowed to cross http <-> https
out.add(l)
else:
if l not in ["..", "../"]:
# Ignore FTP-like "parent"
out.add("/".join([url.rstrip("/"), l.lstrip("/")]))
if not out and url.endswith("/"):
out = await self._ls_real(url.rstrip("/"), detail=False)
if detail:
return [
{
"name": u,
"size": None,
"type": "directory" if u.endswith("/") else "file",
}
for u in out
]
else:
return sorted(out)
async def _ls(self, url, detail=True, **kwargs):
if self.use_listings_cache and url in self.dircache:
out = self.dircache[url]
else:
out = await self._ls_real(url, detail=detail, **kwargs)
self.dircache[url] = out
return out
ls = sync_wrapper(_ls)
def _raise_not_found_for_status(self, response, url):
"""
Raises FileNotFoundError for 404s, otherwise uses raise_for_status.
"""
if response.status == 404:
raise FileNotFoundError(url)
response.raise_for_status()
async def _cat_file(self, url, start=None, end=None, **kwargs):
kw = self.kwargs.copy()
kw.update(kwargs)
logger.debug(url)
if start is not None or end is not None:
if start == end:
return b""
headers = kw.pop("headers", {}).copy()
headers["Range"] = await self._process_limits(url, start, end)
kw["headers"] = headers
session = await self.set_session()
async with session.get(self.encode_url(url), **kw) as r:
out = await r.read()
self._raise_not_found_for_status(r, url)
return out
async def _get_file(
self, rpath, lpath, chunk_size=5 * 2**20, callback=DEFAULT_CALLBACK, **kwargs
):
kw = self.kwargs.copy()
kw.update(kwargs)
logger.debug(rpath)
session = await self.set_session()
async with session.get(self.encode_url(rpath), **kw) as r:
try:
size = int(r.headers["content-length"])
except (ValueError, KeyError):
size = None
callback.set_size(size)
self._raise_not_found_for_status(r, rpath)
if isfilelike(lpath):
outfile = lpath
else:
outfile = open(lpath, "wb") # noqa: ASYNC101, ASYNC230
try:
chunk = True
while chunk:
chunk = await r.content.read(chunk_size)
outfile.write(chunk)
callback.relative_update(len(chunk))
finally:
if not isfilelike(lpath):
outfile.close()
async def _put_file(
self,
lpath,
rpath,
chunk_size=5 * 2**20,
callback=DEFAULT_CALLBACK,
method="post",
**kwargs,
):
async def gen_chunks():
# Support passing arbitrary file-like objects
# and use them instead of streams.
if isinstance(lpath, io.IOBase):
context = nullcontext(lpath)
use_seek = False # might not support seeking
else:
context = open(lpath, "rb") # noqa: ASYNC101, ASYNC230
use_seek = True
with context as f:
if use_seek:
callback.set_size(f.seek(0, 2))
f.seek(0)
else:
callback.set_size(getattr(f, "size", None))
chunk = f.read(chunk_size)
while chunk:
yield chunk
callback.relative_update(len(chunk))
chunk = f.read(chunk_size)
kw = self.kwargs.copy()
kw.update(kwargs)
session = await self.set_session()
method = method.lower()
if method not in ("post", "put"):
raise ValueError(
f"method has to be either 'post' or 'put', not: {method!r}"
)
meth = getattr(session, method)
async with meth(self.encode_url(rpath), data=gen_chunks(), **kw) as resp:
self._raise_not_found_for_status(resp, rpath)
async def _exists(self, path, **kwargs):
kw = self.kwargs.copy()
kw.update(kwargs)
try:
logger.debug(path)
session = await self.set_session()
r = await session.get(self.encode_url(path), **kw)
async with r:
return r.status < 400
except aiohttp.ClientError:
return False
async def _isfile(self, path, **kwargs):
return await self._exists(path, **kwargs)
def _open(
self,
path,
mode="rb",
block_size=None,
autocommit=None, # XXX: This differs from the base class.
cache_type=None,
cache_options=None,
size=None,
**kwargs,
):
"""Make a file-like object
Parameters
----------
path: str
Full URL with protocol
mode: string
must be "rb"
block_size: int or None
Bytes to download in one request; use instance value if None. If
zero, will return a streaming file-like instance.
kwargs: key-value
Any other parameters, passed to requests calls
"""
if mode != "rb":
raise NotImplementedError
block_size = block_size if block_size is not None else self.block_size
kw = self.kwargs.copy()
kw["asynchronous"] = self.asynchronous
kw.update(kwargs)
info = {}
size = size or info.update(self.info(path, **kwargs)) or info["size"]
session = sync(self.loop, self.set_session)
if block_size and size and info.get("partial", True):
return HTTPFile(
self,
path,
session=session,
block_size=block_size,
mode=mode,
size=size,
cache_type=cache_type or self.cache_type,
cache_options=cache_options or self.cache_options,
loop=self.loop,
**kw,
)
else:
return HTTPStreamFile(
self,
path,
mode=mode,
loop=self.loop,
session=session,
**kw,
)
async def open_async(self, path, mode="rb", size=None, **kwargs):
session = await self.set_session()
if size is None:
try:
size = (await self._info(path, **kwargs))["size"]
except FileNotFoundError:
pass
return AsyncStreamFile(
self,
path,
loop=self.loop,
session=session,
size=size,
**kwargs,
)
def ukey(self, url):
"""Unique identifier; assume HTTP files are static, unchanging"""
return tokenize(url, self.kwargs, self.protocol)
async def _info(self, url, **kwargs):
"""Get info of URL
Tries to access location via HEAD, and then GET methods, but does
not fetch the data.
It is possible that the server does not supply any size information, in
which case size will be given as None (and certain operations on the
corresponding file will not work).
"""
info = {}
session = await self.set_session()
for policy in ["head", "get"]:
try:
info.update(
await _file_info(
self.encode_url(url),
size_policy=policy,
session=session,
**self.kwargs,
**kwargs,
)
)
if info.get("size") is not None:
break
except Exception as exc:
if policy == "get":
# If get failed, then raise a FileNotFoundError
raise FileNotFoundError(url) from exc
logger.debug("", exc_info=exc)
return {"name": url, "size": None, **info, "type": "file"}
async def _glob(self, path, maxdepth=None, **kwargs):
"""
Find files by glob-matching.
This implementation is identical to the one in AbstractFileSystem,
but "?" is not considered as a character for globbing, because it is
so common in URLs, often identifying the "query" part.
"""
if maxdepth is not None and maxdepth < 1:
raise ValueError("maxdepth must be at least 1")
import re
ends_with_slash = path.endswith("/") # _strip_protocol strips trailing slash
path = self._strip_protocol(path)
append_slash_to_dirname = ends_with_slash or path.endswith(("/**", "/*"))
idx_star = path.find("*") if path.find("*") >= 0 else len(path)
idx_brace = path.find("[") if path.find("[") >= 0 else len(path)
min_idx = min(idx_star, idx_brace)
detail = kwargs.pop("detail", False)
if not has_magic(path):
if await self._exists(path, **kwargs):
if not detail:
return [path]
else:
return {path: await self._info(path, **kwargs)}
else:
if not detail:
return [] # glob of non-existent returns empty
else:
return {}
elif "/" in path[:min_idx]:
min_idx = path[:min_idx].rindex("/")
root = path[: min_idx + 1]
depth = path[min_idx + 1 :].count("/") + 1
else:
root = ""
depth = path[min_idx + 1 :].count("/") + 1
if "**" in path:
if maxdepth is not None:
idx_double_stars = path.find("**")
depth_double_stars = path[idx_double_stars:].count("/") + 1
depth = depth - depth_double_stars + maxdepth
else:
depth = None
allpaths = await self._find(
root, maxdepth=depth, withdirs=True, detail=True, **kwargs
)
pattern = glob_translate(path + ("/" if ends_with_slash else ""))
pattern = re.compile(pattern)
out = {
(
p.rstrip("/")
if not append_slash_to_dirname
and info["type"] == "directory"
and p.endswith("/")
else p
): info
for p, info in sorted(allpaths.items())
if pattern.match(p.rstrip("/"))
}
if detail:
return out
else:
return list(out)
async def _isdir(self, path):
# override, since all URLs are (also) files
try:
return bool(await self._ls(path))
except (FileNotFoundError, ValueError):
return False
class HTTPFile(AbstractBufferedFile):
"""
A file-like object pointing to a remote HTTP(S) resource
Supports only reading, with read-ahead of a predetermined block-size.
In the case that the server does not supply the filesize, only reading of
the complete file in one go is supported.
Parameters
----------
url: str
Full URL of the remote resource, including the protocol
session: aiohttp.ClientSession or None
All calls will be made within this session, to avoid restarting
connections where the server allows this
block_size: int or None
The amount of read-ahead to do, in bytes. Default is 5MB, or the value
configured for the FileSystem creating this file
size: None or int
If given, this is the size of the file in bytes, and we don't attempt
to call the server to find the value.
kwargs: all other key-values are passed to requests calls.
"""
def __init__(
self,
fs,
url,
session=None,
block_size=None,
mode="rb",
cache_type="bytes",
cache_options=None,
size=None,
loop=None,
asynchronous=False,
**kwargs,
):
if mode != "rb":
raise NotImplementedError("File mode not supported")
self.asynchronous = asynchronous
self.loop = loop
self.url = url
self.session = session
self.details = {"name": url, "size": size, "type": "file"}
super().__init__(
fs=fs,
path=url,
mode=mode,
block_size=block_size,
cache_type=cache_type,
cache_options=cache_options,
**kwargs,
)
def read(self, length=-1):
"""Read bytes from file
Parameters
----------
length: int
Read up to this many bytes. If negative, read all content to end of
file. If the server has not supplied the filesize, attempting to
read only part of the data will raise a ValueError.
"""
if (
(length < 0 and self.loc == 0) # explicit read all
# but not when the size is known and fits into a block anyways
and not (self.size is not None and self.size <= self.blocksize)
):
self._fetch_all()
if self.size is None:
if length < 0:
self._fetch_all()
else:
length = min(self.size - self.loc, length)
return super().read(length)
async def async_fetch_all(self):
"""Read whole file in one shot, without caching
This is only called when position is still at zero,
and read() is called without a byte-count.
"""
logger.debug(f"Fetch all for {self}")
if not isinstance(self.cache, AllBytes):
r = await self.session.get(self.fs.encode_url(self.url), **self.kwargs)
async with r:
r.raise_for_status()
out = await r.read()
self.cache = AllBytes(
size=len(out), fetcher=None, blocksize=None, data=out
)
self.size = len(out)
_fetch_all = sync_wrapper(async_fetch_all)
def _parse_content_range(self, headers):
"""Parse the Content-Range header"""
s = headers.get("Content-Range", "")
m = re.match(r"bytes (\d+-\d+|\*)/(\d+|\*)", s)
if not m:
return None, None, None
if m[1] == "*":
start = end = None
else:
start, end = [int(x) for x in m[1].split("-")]
total = None if m[2] == "*" else int(m[2])
return start, end, total
async def async_fetch_range(self, start, end):
"""Download a block of data
The expectation is that the server returns only the requested bytes,
with HTTP code 206. If this is not the case, we first check the headers,
and then stream the output - if the data size is bigger than we
requested, an exception is raised.
"""
logger.debug(f"Fetch range for {self}: {start}-{end}")
kwargs = self.kwargs.copy()
headers = kwargs.pop("headers", {}).copy()
headers["Range"] = f"bytes={start}-{end - 1}"
logger.debug(f"{self.url} : {headers['Range']}")
r = await self.session.get(
self.fs.encode_url(self.url), headers=headers, **kwargs
)
async with r:
if r.status == 416:
# range request outside file
return b""
r.raise_for_status()
# If the server has handled the range request, it should reply
# with status 206 (partial content). But we'll guess that a suitable
# Content-Range header or a Content-Length no more than the
# requested range also mean we have got the desired range.
response_is_range = (
r.status == 206
or self._parse_content_range(r.headers)[0] == start
or int(r.headers.get("Content-Length", end + 1)) <= end - start
)
if response_is_range:
# partial content, as expected
out = await r.read()
elif start > 0:
raise ValueError(
"The HTTP server doesn't appear to support range requests. "
"Only reading this file from the beginning is supported. "
"Open with block_size=0 for a streaming file interface."
)
else:
# Response is not a range, but we want the start of the file,
# so we can read the required amount anyway.
cl = 0
out = []
while True:
chunk = await r.content.read(2**20)
# data size unknown, let's read until we have enough
if chunk:
out.append(chunk)
cl += len(chunk)
if cl > end - start:
break
else:
break
out = b"".join(out)[: end - start]
return out
_fetch_range = sync_wrapper(async_fetch_range)
def __reduce__(self):
return (
reopen,
(
self.fs,
self.url,
self.mode,
self.blocksize,
self.cache.name if self.cache else "none",
self.size,
),
)
def reopen(fs, url, mode, blocksize, cache_type, size=None):
return fs.open(
url, mode=mode, block_size=blocksize, cache_type=cache_type, size=size
)
magic_check = re.compile("([*[])")
def has_magic(s):
match = magic_check.search(s)
return match is not None
class HTTPStreamFile(AbstractBufferedFile):
def __init__(self, fs, url, mode="rb", loop=None, session=None, **kwargs):
self.asynchronous = kwargs.pop("asynchronous", False)
self.url = url
self.loop = loop
self.session = session
if mode != "rb":
raise ValueError
self.details = {"name": url, "size": None}
super().__init__(fs=fs, path=url, mode=mode, cache_type="none", **kwargs)
async def cor():
r = await self.session.get(self.fs.encode_url(url), **kwargs).__aenter__()
self.fs._raise_not_found_for_status(r, url)
return r
self.r = sync(self.loop, cor)
self.loop = fs.loop
def seek(self, loc, whence=0):
if loc == 0 and whence == 1:
return
if loc == self.loc and whence == 0:
return
raise ValueError("Cannot seek streaming HTTP file")
async def _read(self, num=-1):
out = await self.r.content.read(num)
self.loc += len(out)
return out
read = sync_wrapper(_read)
async def _close(self):
self.r.close()
def close(self):
asyncio.run_coroutine_threadsafe(self._close(), self.loop)
super().close()
def __reduce__(self):
return reopen, (self.fs, self.url, self.mode, self.blocksize, self.cache.name)
class AsyncStreamFile(AbstractAsyncStreamedFile):
def __init__(
self, fs, url, mode="rb", loop=None, session=None, size=None, **kwargs
):
self.url = url
self.session = session
self.r = None
if mode != "rb":
raise ValueError
self.details = {"name": url, "size": None}
self.kwargs = kwargs
super().__init__(fs=fs, path=url, mode=mode, cache_type="none")
self.size = size
async def read(self, num=-1):
if self.r is None:
r = await self.session.get(
self.fs.encode_url(self.url), **self.kwargs
).__aenter__()
self.fs._raise_not_found_for_status(r, self.url)
self.r = r
out = await self.r.content.read(num)
self.loc += len(out)
return out
async def close(self):
if self.r is not None:
self.r.close()
self.r = None
await super().close()
async def get_range(session, url, start, end, file=None, **kwargs):
# explicit get a range when we know it must be safe
kwargs = kwargs.copy()
headers = kwargs.pop("headers", {}).copy()
headers["Range"] = f"bytes={start}-{end - 1}"
r = await session.get(url, headers=headers, **kwargs)
r.raise_for_status()
async with r:
out = await r.read()
if file:
with open(file, "r+b") as f: # noqa: ASYNC101, ASYNC230
f.seek(start)
f.write(out)
else:
return out
async def _file_info(url, session, size_policy="head", **kwargs):
"""Call HEAD on the server to get details about the file (size/checksum etc.)
Default operation is to explicitly allow redirects and use encoding
'identity' (no compression) to get the true size of the target.
"""
logger.debug("Retrieve file size for %s", url)
kwargs = kwargs.copy()
ar = kwargs.pop("allow_redirects", True)
head = kwargs.get("headers", {}).copy()
head["Accept-Encoding"] = "identity"
kwargs["headers"] = head
info = {}
if size_policy == "head":
r = await session.head(url, allow_redirects=ar, **kwargs)
elif size_policy == "get":
r = await session.get(url, allow_redirects=ar, **kwargs)
else:
raise TypeError(f'size_policy must be "head" or "get", got {size_policy}')
async with r:
r.raise_for_status()
if "Content-Length" in r.headers:
# Some servers may choose to ignore Accept-Encoding and return
# compressed content, in which case the returned size is unreliable.
if "Content-Encoding" not in r.headers or r.headers["Content-Encoding"] in [
"identity",
"",
]:
info["size"] = int(r.headers["Content-Length"])
elif "Content-Range" in r.headers:
info["size"] = int(r.headers["Content-Range"].split("/")[1])
if "Content-Type" in r.headers:
info["mimetype"] = r.headers["Content-Type"].partition(";")[0]
if r.headers.get("Accept-Ranges") == "none":
# Some servers may explicitly discourage partial content requests, but
# the lack of "Accept-Ranges" does not always indicate they would fail
info["partial"] = False
info["url"] = str(r.url)
for checksum_field in ["ETag", "Content-MD5", "Digest"]:
if r.headers.get(checksum_field):
info[checksum_field] = r.headers[checksum_field]
return info
async def _file_size(url, session=None, *args, **kwargs):
if session is None:
session = await get_client()
info = await _file_info(url, session=session, *args, **kwargs)
return info.get("size")
file_size = sync_wrapper(_file_size)
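# Illustrative usage sketch (not part of the original module). The URL is a
# placeholder; a server that answers Range requests exercises HTTPFile, while
# one that does not (or block_size=0) falls back to the streaming classes.
if __name__ == "__main__":
    import fsspec

    url = "https://example.com/data.bin"
    fs = fsspec.filesystem("http")
    info = fs.info(url)                    # HEAD, then GET, to discover size/mimetype
    with fs.open(url, "rb", block_size=2**20) as f:
        first = f.read(64)                 # served via a "Range: bytes=0-1048575" request
    print(info.get("size"), len(first))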

View File

@ -0,0 +1,124 @@
import base64
import io
import re
import requests
import fsspec
class JupyterFileSystem(fsspec.AbstractFileSystem):
"""View of the files as seen by a Jupyter server (notebook or lab)"""
protocol = ("jupyter", "jlab")
def __init__(self, url, tok=None, **kwargs):
"""
Parameters
----------
url : str
Base URL of the server, like "http://127.0.0.1:8888". May include
token in the string, which is given by the process when starting up
tok : str
If the token is obtained separately, can be given here
kwargs
"""
if "?" in url:
if tok is None:
try:
tok = re.findall("token=([a-z0-9]+)", url)[0]
except IndexError as e:
raise ValueError("Could not determine token") from e
url = url.split("?", 1)[0]
self.url = url.rstrip("/") + "/api/contents"
self.session = requests.Session()
if tok:
self.session.headers["Authorization"] = f"token {tok}"
super().__init__(**kwargs)
def ls(self, path, detail=True, **kwargs):
path = self._strip_protocol(path)
r = self.session.get(f"{self.url}/{path}")
if r.status_code == 404:
raise FileNotFoundError(path)
r.raise_for_status()
out = r.json()
if out["type"] == "directory":
out = out["content"]
else:
out = [out]
for o in out:
o["name"] = o.pop("path")
o.pop("content")
if o["type"] == "notebook":
o["type"] = "file"
if detail:
return out
return [o["name"] for o in out]
def cat_file(self, path, start=None, end=None, **kwargs):
path = self._strip_protocol(path)
r = self.session.get(f"{self.url}/{path}")
if r.status_code == 404:
raise FileNotFoundError(path)
r.raise_for_status()
out = r.json()
if out["format"] == "text":
# data should be binary
b = out["content"].encode()
else:
b = base64.b64decode(out["content"])
return b[start:end]
def pipe_file(self, path, value, **_):
path = self._strip_protocol(path)
json = {
"name": path.rsplit("/", 1)[-1],
"path": path,
"size": len(value),
"content": base64.b64encode(value).decode(),
"format": "base64",
"type": "file",
}
self.session.put(f"{self.url}/{path}", json=json)
def mkdir(self, path, create_parents=True, **kwargs):
path = self._strip_protocol(path)
if create_parents and "/" in path:
self.mkdir(path.rsplit("/", 1)[0], True)
json = {
"name": path.rsplit("/", 1)[-1],
"path": path,
"size": None,
"content": None,
"type": "directory",
}
self.session.put(f"{self.url}/{path}", json=json)
def _rm(self, path):
path = self._strip_protocol(path)
self.session.delete(f"{self.url}/{path}")
def _open(self, path, mode="rb", **kwargs):
path = self._strip_protocol(path)
if mode == "rb":
data = self.cat_file(path)
return io.BytesIO(data)
else:
return SimpleFileWriter(self, path, mode="wb")
class SimpleFileWriter(fsspec.spec.AbstractBufferedFile):
def _upload_chunk(self, final=False):
"""Never uploads a chunk until file is done
Not suitable for large files
"""
if final is False:
return False
self.buffer.seek(0)
data = self.buffer.read()
self.fs.pipe_file(self.path, data)
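# Illustrative usage sketch (not part of the original module). The server URL
# and token are placeholders of the form Jupyter prints on start-up.
if __name__ == "__main__":
    import fsspec

    fs = fsspec.filesystem(
        "jupyter", url="http://127.0.0.1:8888", tok="0123456789abcdef"
    )
    fs.mkdir("demo")
    fs.pipe_file("demo/hello.txt", b"hello from fsspec")  # PUT base64 content
    print(fs.cat_file("demo/hello.txt"))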

View File

@ -0,0 +1,213 @@
from contextlib import contextmanager
from ctypes import (
CFUNCTYPE,
POINTER,
c_int,
c_longlong,
c_void_p,
cast,
create_string_buffer,
)
import libarchive
import libarchive.ffi as ffi
from fsspec import open_files
from fsspec.archive import AbstractArchiveFileSystem
from fsspec.implementations.memory import MemoryFile
from fsspec.utils import DEFAULT_BLOCK_SIZE
# Libarchive requires seekable files or memory only for certain archive
# types. However, since we read the directory first to cache the contents
# and also allow random access to any file, the file-like object needs
# to be seekable no matter what.
# Seek call-backs (not provided in the libarchive python wrapper)
SEEK_CALLBACK = CFUNCTYPE(c_longlong, c_int, c_void_p, c_longlong, c_int)
read_set_seek_callback = ffi.ffi(
"read_set_seek_callback", [ffi.c_archive_p, SEEK_CALLBACK], c_int, ffi.check_int
)
new_api = hasattr(ffi, "NO_OPEN_CB")
@contextmanager
def custom_reader(file, format_name="all", filter_name="all", block_size=ffi.page_size):
"""Read an archive from a seekable file-like object.
The `file` object must support the standard `readinto` and 'seek' methods.
"""
buf = create_string_buffer(block_size)
buf_p = cast(buf, c_void_p)
def read_func(archive_p, context, ptrptr):
# readinto the buffer, returns number of bytes read
length = file.readinto(buf)
# write the address of the buffer into the pointer
ptrptr = cast(ptrptr, POINTER(c_void_p))
ptrptr[0] = buf_p
# tell libarchive how much data was written into the buffer
return length
def seek_func(archive_p, context, offset, whence):
file.seek(offset, whence)
# tell libarchive the current position
return file.tell()
read_cb = ffi.READ_CALLBACK(read_func)
seek_cb = SEEK_CALLBACK(seek_func)
if new_api:
open_cb = ffi.NO_OPEN_CB
close_cb = ffi.NO_CLOSE_CB
else:
open_cb = libarchive.read.OPEN_CALLBACK(ffi.VOID_CB)
close_cb = libarchive.read.CLOSE_CALLBACK(ffi.VOID_CB)
with libarchive.read.new_archive_read(format_name, filter_name) as archive_p:
read_set_seek_callback(archive_p, seek_cb)
ffi.read_open(archive_p, None, open_cb, read_cb, close_cb)
yield libarchive.read.ArchiveRead(archive_p)
class LibArchiveFileSystem(AbstractArchiveFileSystem):
"""Compressed archives as a file-system (read-only)
Supports the following formats:
tar, pax, cpio, ISO9660, zip, mtree, shar, ar, raw, xar, lha/lzh, rar,
Microsoft CAB, 7-Zip, WARC
See the libarchive documentation for further restrictions.
https://www.libarchive.org/
Keeps the file object open while the instance lives. It only works with
seekable file-like objects; if the target filesystem does not provide them,
it is recommended to cache the archive locally first.
This class is pickleable, but not necessarily thread-safe (depends on the
platform). See libarchive documentation for details.
"""
root_marker = ""
protocol = "libarchive"
cachable = False
def __init__(
self,
fo="",
mode="r",
target_protocol=None,
target_options=None,
block_size=DEFAULT_BLOCK_SIZE,
**kwargs,
):
"""
Parameters
----------
fo: str or file-like
Contains the archive, and must exist. If a str, will fetch file using
:meth:`~fsspec.open_files`, which must return one file exactly.
mode: str
Currently, only 'r' accepted
target_protocol: str (optional)
If ``fo`` is a string, this value can be used to override the
FS protocol inferred from a URL
target_options: dict (optional)
Kwargs passed when instantiating the target FS, if ``fo`` is
a string.
"""
super().__init__(self, **kwargs)
if mode != "r":
raise ValueError("Only read from archive files accepted")
if isinstance(fo, str):
files = open_files(fo, protocol=target_protocol, **(target_options or {}))
if len(files) != 1:
raise ValueError(
f'Path "{fo}" did not resolve to exactly one file: "{files}"'
)
fo = files[0]
self.of = fo
self.fo = fo.__enter__() # the whole instance is a context
self.block_size = block_size
self.dir_cache = None
@contextmanager
def _open_archive(self):
self.fo.seek(0)
with custom_reader(self.fo, block_size=self.block_size) as arc:
yield arc
@classmethod
def _strip_protocol(cls, path):
# file paths are always relative to the archive root
return super()._strip_protocol(path).lstrip("/")
def _get_dirs(self):
fields = {
"name": "pathname",
"size": "size",
"created": "ctime",
"mode": "mode",
"uid": "uid",
"gid": "gid",
"mtime": "mtime",
}
if self.dir_cache is not None:
return
self.dir_cache = {}
list_names = []
with self._open_archive() as arc:
for entry in arc:
if not entry.isdir and not entry.isfile:
# Skip symbolic links, fifo entries, etc.
continue
self.dir_cache.update(
{
dirname: {"name": dirname, "size": 0, "type": "directory"}
for dirname in self._all_dirnames([entry.name])
}
)
f = {key: getattr(entry, fields[key]) for key in fields}
f["type"] = "directory" if entry.isdir else "file"
list_names.append(entry.name)
self.dir_cache[f["name"]] = f
# libarchive does not seem to return an entry for the directories (at least
# not in all formats), so get the directories names from the files names
self.dir_cache.update(
{
dirname: {"name": dirname, "size": 0, "type": "directory"}
for dirname in self._all_dirnames(list_names)
}
)
def _open(
self,
path,
mode="rb",
block_size=None,
autocommit=True,
cache_options=None,
**kwargs,
):
path = self._strip_protocol(path)
if mode != "rb":
raise NotImplementedError
data = bytes()
with self._open_archive() as arc:
for entry in arc:
if entry.pathname != path:
continue
if entry.size == 0:
# empty file, so there are no blocks
break
for block in entry.get_blocks(entry.size):
data = block
break
else:
raise ValueError
return MemoryFile(fs=self, path=path, data=data)
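# Illustrative usage sketch (not part of the original module). "archive.7z"
# and the member name are placeholders; any libarchive-supported format works
# as long as the underlying file object is seekable.
if __name__ == "__main__":
    fs = LibArchiveFileSystem("archive.7z")
    print(fs.ls(""))                        # listing built once by _get_dirs
    with fs.open("some/member.txt") as f:   # member fully loaded into a MemoryFile
        print(f.read()[:80])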

View File

@ -0,0 +1,471 @@
import datetime
import io
import logging
import os
import os.path as osp
import shutil
import stat
import tempfile
from fsspec import AbstractFileSystem
from fsspec.compression import compr
from fsspec.core import get_compression
from fsspec.utils import isfilelike, stringify_path
logger = logging.getLogger("fsspec.local")
class LocalFileSystem(AbstractFileSystem):
"""Interface to files on local storage
Parameters
----------
auto_mkdir: bool
Whether, when opening a file, the directory containing it should
be created (if it doesn't already exist). This is assumed by pyarrow
code.
"""
root_marker = "/"
protocol = "file", "local"
local_file = True
def __init__(self, auto_mkdir=False, **kwargs):
super().__init__(**kwargs)
self.auto_mkdir = auto_mkdir
@property
def fsid(self):
return "local"
def mkdir(self, path, create_parents=True, **kwargs):
path = self._strip_protocol(path)
if self.exists(path):
raise FileExistsError(path)
if create_parents:
self.makedirs(path, exist_ok=True)
else:
os.mkdir(path, **kwargs)
def makedirs(self, path, exist_ok=False):
path = self._strip_protocol(path)
os.makedirs(path, exist_ok=exist_ok)
def rmdir(self, path):
path = self._strip_protocol(path)
os.rmdir(path)
def ls(self, path, detail=False, **kwargs):
path = self._strip_protocol(path)
info = self.info(path)
if info["type"] == "directory":
with os.scandir(path) as it:
infos = [self.info(f) for f in it]
else:
infos = [info]
if not detail:
return [i["name"] for i in infos]
return infos
def info(self, path, **kwargs):
if isinstance(path, os.DirEntry):
# scandir DirEntry
out = path.stat(follow_symlinks=False)
link = path.is_symlink()
if path.is_dir(follow_symlinks=False):
t = "directory"
elif path.is_file(follow_symlinks=False):
t = "file"
else:
t = "other"
size = out.st_size
if link:
try:
out2 = path.stat(follow_symlinks=True)
size = out2.st_size
except OSError:
size = 0
path = self._strip_protocol(path.path)
else:
# str or path-like
path = self._strip_protocol(path)
out = os.stat(path, follow_symlinks=False)
link = stat.S_ISLNK(out.st_mode)
if link:
out = os.stat(path, follow_symlinks=True)
size = out.st_size
if stat.S_ISDIR(out.st_mode):
t = "directory"
elif stat.S_ISREG(out.st_mode):
t = "file"
else:
t = "other"
result = {
"name": path,
"size": size,
"type": t,
"created": out.st_ctime,
"islink": link,
}
for field in ["mode", "uid", "gid", "mtime", "ino", "nlink"]:
result[field] = getattr(out, f"st_{field}")
if link:
result["destination"] = os.readlink(path)
return result
def lexists(self, path, **kwargs):
return osp.lexists(path)
def cp_file(self, path1, path2, **kwargs):
path1 = self._strip_protocol(path1)
path2 = self._strip_protocol(path2)
if self.auto_mkdir:
self.makedirs(self._parent(path2), exist_ok=True)
if self.isfile(path1):
shutil.copyfile(path1, path2)
elif self.isdir(path1):
self.mkdirs(path2, exist_ok=True)
else:
raise FileNotFoundError(path1)
def isfile(self, path):
path = self._strip_protocol(path)
return os.path.isfile(path)
def isdir(self, path):
path = self._strip_protocol(path)
return os.path.isdir(path)
def get_file(self, path1, path2, callback=None, **kwargs):
if isfilelike(path2):
with open(path1, "rb") as f:
shutil.copyfileobj(f, path2)
else:
return self.cp_file(path1, path2, **kwargs)
def put_file(self, path1, path2, callback=None, **kwargs):
return self.cp_file(path1, path2, **kwargs)
def mv(self, path1, path2, **kwargs):
path1 = self._strip_protocol(path1)
path2 = self._strip_protocol(path2)
shutil.move(path1, path2)
def link(self, src, dst, **kwargs):
src = self._strip_protocol(src)
dst = self._strip_protocol(dst)
os.link(src, dst, **kwargs)
def symlink(self, src, dst, **kwargs):
src = self._strip_protocol(src)
dst = self._strip_protocol(dst)
os.symlink(src, dst, **kwargs)
def islink(self, path) -> bool:
return os.path.islink(self._strip_protocol(path))
def rm_file(self, path):
os.remove(self._strip_protocol(path))
def rm(self, path, recursive=False, maxdepth=None):
if not isinstance(path, list):
path = [path]
for p in path:
p = self._strip_protocol(p)
if self.isdir(p):
if not recursive:
raise ValueError("Cannot delete directory, set recursive=True")
if osp.abspath(p) == os.getcwd():
raise ValueError("Cannot delete current working directory")
shutil.rmtree(p)
else:
os.remove(p)
def unstrip_protocol(self, name):
name = self._strip_protocol(name) # normalise for local/win/...
return f"file://{name}"
def _open(self, path, mode="rb", block_size=None, **kwargs):
path = self._strip_protocol(path)
if self.auto_mkdir and "w" in mode:
self.makedirs(self._parent(path), exist_ok=True)
return LocalFileOpener(path, mode, fs=self, **kwargs)
def touch(self, path, truncate=True, **kwargs):
path = self._strip_protocol(path)
if self.auto_mkdir:
self.makedirs(self._parent(path), exist_ok=True)
if self.exists(path):
os.utime(path, None)
else:
open(path, "a").close()
if truncate:
os.truncate(path, 0)
def created(self, path):
info = self.info(path=path)
return datetime.datetime.fromtimestamp(
info["created"], tz=datetime.timezone.utc
)
def modified(self, path):
info = self.info(path=path)
return datetime.datetime.fromtimestamp(info["mtime"], tz=datetime.timezone.utc)
@classmethod
def _parent(cls, path):
path = cls._strip_protocol(path)
if os.sep == "/":
# posix native
return path.rsplit("/", 1)[0] or "/"
else:
# NT
path_ = path.rsplit("/", 1)[0]
if len(path_) <= 3:
if path_[1:2] == ":":
# nt root (something like c:/)
return path_[0] + ":/"
# More cases may be required here
return path_
@classmethod
def _strip_protocol(cls, path):
path = stringify_path(path)
if path.startswith("file://"):
path = path[7:]
elif path.startswith("file:"):
path = path[5:]
elif path.startswith("local://"):
path = path[8:]
elif path.startswith("local:"):
path = path[6:]
path = make_path_posix(path)
if os.sep != "/":
# This code-path is a stripped down version of
# > drive, path = ntpath.splitdrive(path)
if path[1:2] == ":":
# Absolute drive-letter path, e.g. X:\Windows
# Relative path with drive, e.g. X:Windows
drive, path = path[:2], path[2:]
elif path[:2] == "//":
# UNC drives, e.g. \\server\share or \\?\UNC\server\share
# Device drives, e.g. \\.\device or \\?\device
if (index1 := path.find("/", 2)) == -1 or (
index2 := path.find("/", index1 + 1)
) == -1:
drive, path = path, ""
else:
drive, path = path[:index2], path[index2:]
else:
# Relative path, e.g. Windows
drive = ""
path = path.rstrip("/") or cls.root_marker
return drive + path
else:
return path.rstrip("/") or cls.root_marker
def _isfilestore(self):
# Inheriting from DaskFileSystem makes this False (S3, etc. were the
# original motivation), but we are a posix-like file system.
# See https://github.com/dask/dask/issues/5526
return True
def chmod(self, path, mode):
path = stringify_path(path)
return os.chmod(path, mode)
def make_path_posix(path):
"""Make path generic and absolute for current OS"""
if not isinstance(path, str):
if isinstance(path, (list, set, tuple)):
return type(path)(make_path_posix(p) for p in path)
else:
path = stringify_path(path)
if not isinstance(path, str):
raise TypeError(f"could not convert {path!r} to string")
if os.sep == "/":
# Native posix
if path.startswith("/"):
# most common fast case for posix
return path
elif path.startswith("~"):
return osp.expanduser(path)
elif path.startswith("./"):
path = path[2:]
elif path == ".":
path = ""
return f"{os.getcwd()}/{path}"
else:
# NT handling
if path[0:1] == "/" and path[2:3] == ":":
# path is like "/c:/local/path"
path = path[1:]
if path[1:2] == ":":
# windows full path like "C:\\local\\path"
if len(path) <= 3:
# nt root (something like c:/)
return path[0] + ":/"
path = path.replace("\\", "/")
return path
elif path[0:1] == "~":
return make_path_posix(osp.expanduser(path))
elif path.startswith(("\\\\", "//")):
# windows UNC/DFS-style paths
return "//" + path[2:].replace("\\", "/")
elif path.startswith(("\\", "/")):
# windows relative path with root
path = path.replace("\\", "/")
return f"{osp.splitdrive(os.getcwd())[0]}{path}"
else:
path = path.replace("\\", "/")
if path.startswith("./"):
path = path[2:]
elif path == ".":
path = ""
return f"{make_path_posix(os.getcwd())}/{path}"
def trailing_sep(path):
"""Return True if the path ends with a path separator.
A forward slash is always considered a path separator, even on Operating
Systems that normally use a backslash.
"""
# TODO: if all incoming paths were posix-compliant then separator would
# always be a forward slash, simplifying this function.
# See https://github.com/fsspec/filesystem_spec/pull/1250
return path.endswith(os.sep) or (os.altsep is not None and path.endswith(os.altsep))
class LocalFileOpener(io.IOBase):
def __init__(
self, path, mode, autocommit=True, fs=None, compression=None, **kwargs
):
logger.debug("open file: %s", path)
self.path = path
self.mode = mode
self.fs = fs
self.f = None
self.autocommit = autocommit
self.compression = get_compression(path, compression)
self.blocksize = io.DEFAULT_BUFFER_SIZE
self._open()
def _open(self):
if self.f is None or self.f.closed:
if self.autocommit or "w" not in self.mode:
self.f = open(self.path, mode=self.mode)
if self.compression:
compress = compr[self.compression]
self.f = compress(self.f, mode=self.mode)
else:
# TODO: check if path is writable?
i, name = tempfile.mkstemp()
os.close(i) # we want normal open and normal buffered file
self.temp = name
self.f = open(name, mode=self.mode)
if "w" not in self.mode:
self.size = self.f.seek(0, 2)
self.f.seek(0)
self.f.size = self.size
def _fetch_range(self, start, end):
# probably only used by cached FS
if "r" not in self.mode:
raise ValueError
self._open()
self.f.seek(start)
return self.f.read(end - start)
def __setstate__(self, state):
self.f = None
loc = state.pop("loc", None)
self.__dict__.update(state)
if "r" in state["mode"]:
self.f = None
self._open()
self.f.seek(loc)
def __getstate__(self):
d = self.__dict__.copy()
d.pop("f")
if "r" in self.mode:
d["loc"] = self.f.tell()
else:
if not self.f.closed:
raise ValueError("Cannot serialise open write-mode local file")
return d
def commit(self):
if self.autocommit:
raise RuntimeError("Can only commit if not already set to autocommit")
shutil.move(self.temp, self.path)
def discard(self):
if self.autocommit:
raise RuntimeError("Cannot discard if set to autocommit")
os.remove(self.temp)
def readable(self) -> bool:
return True
def writable(self) -> bool:
return "r" not in self.mode
def read(self, *args, **kwargs):
return self.f.read(*args, **kwargs)
def write(self, *args, **kwargs):
return self.f.write(*args, **kwargs)
def tell(self, *args, **kwargs):
return self.f.tell(*args, **kwargs)
def seek(self, *args, **kwargs):
return self.f.seek(*args, **kwargs)
def seekable(self, *args, **kwargs):
return self.f.seekable(*args, **kwargs)
def readline(self, *args, **kwargs):
return self.f.readline(*args, **kwargs)
def readlines(self, *args, **kwargs):
return self.f.readlines(*args, **kwargs)
def close(self):
return self.f.close()
def truncate(self, size=None) -> int:
return self.f.truncate(size)
@property
def closed(self):
return self.f.closed
def fileno(self):
return self.raw.fileno()
def flush(self) -> None:
self.f.flush()
def __iter__(self):
return self.f.__iter__()
def __getattr__(self, item):
return getattr(self.f, item)
def __enter__(self):
self._incontext = True
return self
def __exit__(self, exc_type, exc_value, traceback):
self._incontext = False
self.f.__exit__(exc_type, exc_value, traceback)
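# Illustrative usage sketch (not part of the original module). It uses a
# temporary directory so it has no side effects outside of it, and shows the
# auto_mkdir behaviour documented above.
if __name__ == "__main__":
    import tempfile

    with tempfile.TemporaryDirectory() as d:
        fs = LocalFileSystem(auto_mkdir=True)
        with fs.open(f"{d}/nested/dirs/file.txt", "wb") as f:  # parents created on demand
            f.write(b"local data")
        print(fs.info(f"{d}/nested/dirs/file.txt")["size"])    # -> 10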

View File

@ -0,0 +1,307 @@
from __future__ import annotations
import logging
from datetime import datetime, timezone
from errno import ENOTEMPTY
from io import BytesIO
from pathlib import PurePath, PureWindowsPath
from typing import Any, ClassVar
from fsspec import AbstractFileSystem
from fsspec.implementations.local import LocalFileSystem
from fsspec.utils import stringify_path
logger = logging.getLogger("fsspec.memoryfs")
class MemoryFileSystem(AbstractFileSystem):
"""A filesystem based on a dict of BytesIO objects
This is a global filesystem so instances of this class all point to the same
in memory filesystem.
"""
store: ClassVar[dict[str, Any]] = {} # global, do not overwrite!
pseudo_dirs = [""] # global, do not overwrite!
protocol = "memory"
root_marker = "/"
@classmethod
def _strip_protocol(cls, path):
if isinstance(path, PurePath):
if isinstance(path, PureWindowsPath):
return LocalFileSystem._strip_protocol(path)
else:
path = stringify_path(path)
if path.startswith("memory://"):
path = path[len("memory://") :]
if "::" in path or "://" in path:
return path.rstrip("/")
path = path.lstrip("/").rstrip("/")
return "/" + path if path else ""
def ls(self, path, detail=True, **kwargs):
path = self._strip_protocol(path)
if path in self.store:
# there is a key with this exact name
if not detail:
return [path]
return [
{
"name": path,
"size": self.store[path].size,
"type": "file",
"created": self.store[path].created.timestamp(),
}
]
paths = set()
starter = path + "/"
out = []
for p2 in tuple(self.store):
if p2.startswith(starter):
if "/" not in p2[len(starter) :]:
# exact child
out.append(
{
"name": p2,
"size": self.store[p2].size,
"type": "file",
"created": self.store[p2].created.timestamp(),
}
)
elif len(p2) > len(starter):
# implied child directory
ppath = starter + p2[len(starter) :].split("/", 1)[0]
if ppath not in paths:
out = out or []
out.append(
{
"name": ppath,
"size": 0,
"type": "directory",
}
)
paths.add(ppath)
for p2 in self.pseudo_dirs:
if p2.startswith(starter):
if "/" not in p2[len(starter) :]:
# exact child pdir
if p2 not in paths:
out.append({"name": p2, "size": 0, "type": "directory"})
paths.add(p2)
else:
# directory implied by deeper pdir
ppath = starter + p2[len(starter) :].split("/", 1)[0]
if ppath not in paths:
out.append({"name": ppath, "size": 0, "type": "directory"})
paths.add(ppath)
if not out:
if path in self.pseudo_dirs:
# empty dir
return []
raise FileNotFoundError(path)
if detail:
return out
return sorted([f["name"] for f in out])
def mkdir(self, path, create_parents=True, **kwargs):
path = self._strip_protocol(path)
if path in self.store or path in self.pseudo_dirs:
raise FileExistsError(path)
if self._parent(path).strip("/") and self.isfile(self._parent(path)):
raise NotADirectoryError(self._parent(path))
if create_parents and self._parent(path).strip("/"):
try:
self.mkdir(self._parent(path), create_parents, **kwargs)
except FileExistsError:
pass
if path and path not in self.pseudo_dirs:
self.pseudo_dirs.append(path)
def makedirs(self, path, exist_ok=False):
try:
self.mkdir(path, create_parents=True)
except FileExistsError:
if not exist_ok:
raise
def pipe_file(self, path, value, **kwargs):
"""Set the bytes of given file
Avoids copies of the data if possible
"""
self.open(path, "wb", data=value)
def rmdir(self, path):
path = self._strip_protocol(path)
if path == "":
# silently avoid deleting FS root
return
if path in self.pseudo_dirs:
if not self.ls(path):
self.pseudo_dirs.remove(path)
else:
raise OSError(ENOTEMPTY, "Directory not empty", path)
else:
raise FileNotFoundError(path)
def info(self, path, **kwargs):
logger.debug("info: %s", path)
path = self._strip_protocol(path)
if path in self.pseudo_dirs or any(
p.startswith(path + "/") for p in list(self.store) + self.pseudo_dirs
):
return {
"name": path,
"size": 0,
"type": "directory",
}
elif path in self.store:
filelike = self.store[path]
return {
"name": path,
"size": filelike.size,
"type": "file",
"created": getattr(filelike, "created", None),
}
else:
raise FileNotFoundError(path)
def _open(
self,
path,
mode="rb",
block_size=None,
autocommit=True,
cache_options=None,
**kwargs,
):
path = self._strip_protocol(path)
if path in self.pseudo_dirs:
raise IsADirectoryError(path)
parent = path
while len(parent) > 1:
parent = self._parent(parent)
if self.isfile(parent):
raise FileExistsError(parent)
if mode in ["rb", "ab", "r+b"]:
if path in self.store:
f = self.store[path]
if mode == "ab":
# position at the end of file
f.seek(0, 2)
else:
# position at the beginning of file
f.seek(0)
return f
else:
raise FileNotFoundError(path)
elif mode == "wb":
m = MemoryFile(self, path, kwargs.get("data"))
if not self._intrans:
m.commit()
return m
else:
name = self.__class__.__name__
raise ValueError(f"unsupported file mode for {name}: {mode!r}")
def cp_file(self, path1, path2, **kwargs):
path1 = self._strip_protocol(path1)
path2 = self._strip_protocol(path2)
if self.isfile(path1):
self.store[path2] = MemoryFile(
self, path2, self.store[path1].getvalue()
) # implicit copy
elif self.isdir(path1):
if path2 not in self.pseudo_dirs:
self.pseudo_dirs.append(path2)
else:
raise FileNotFoundError(path1)
def cat_file(self, path, start=None, end=None, **kwargs):
logger.debug("cat: %s", path)
path = self._strip_protocol(path)
try:
return bytes(self.store[path].getbuffer()[start:end])
except KeyError as e:
raise FileNotFoundError(path) from e
def _rm(self, path):
path = self._strip_protocol(path)
try:
del self.store[path]
except KeyError as e:
raise FileNotFoundError(path) from e
def modified(self, path):
path = self._strip_protocol(path)
try:
return self.store[path].modified
except KeyError as e:
raise FileNotFoundError(path) from e
def created(self, path):
path = self._strip_protocol(path)
try:
return self.store[path].created
except KeyError as e:
raise FileNotFoundError(path) from e
def isfile(self, path):
path = self._strip_protocol(path)
return path in self.store
def rm(self, path, recursive=False, maxdepth=None):
if isinstance(path, str):
path = self._strip_protocol(path)
else:
path = [self._strip_protocol(p) for p in path]
paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
for p in reversed(paths):
if self.isfile(p):
self.rm_file(p)
# If the expanded path doesn't exist, it is only because the expanded
# path was a directory that does not exist in self.pseudo_dirs. This
# is possible if you directly create files without making the
# directories first.
elif not self.exists(p):
continue
else:
self.rmdir(p)
class MemoryFile(BytesIO):
"""A BytesIO which can't close and works as a context manager
Can initialise with data. Each path should only be active once at any moment.
No need to provide fs, path if auto-committing (default)
"""
def __init__(self, fs=None, path=None, data=None):
logger.debug("open file %s", path)
self.fs = fs
self.path = path
self.created = datetime.now(tz=timezone.utc)
self.modified = datetime.now(tz=timezone.utc)
if data:
super().__init__(data)
self.seek(0)
@property
def size(self):
return self.getbuffer().nbytes
def __enter__(self):
return self
def close(self):
pass
def discard(self):
pass
def commit(self):
self.fs.store[self.path] = self
self.modified = datetime.now(tz=timezone.utc)
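# Illustrative usage sketch (not part of the original module). Because the
# store is a class attribute, every MemoryFileSystem instance (and every
# "memory://" URL) shares the same global in-memory state.
if __name__ == "__main__":
    import fsspec

    fs = fsspec.filesystem("memory")
    fs.pipe_file("/project/data.bin", b"\x00" * 16)
    print(fs.ls("/project", detail=False))          # ['/project/data.bin']
    with fsspec.open("memory://project/data.bin", "rb") as f:
        print(len(f.read()))                        # 16 -- same global store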

File diff suppressed because it is too large

View File

@ -0,0 +1,180 @@
import datetime
import logging
import os
import types
import uuid
from stat import S_ISDIR, S_ISLNK
import paramiko
from .. import AbstractFileSystem
from ..utils import infer_storage_options
logger = logging.getLogger("fsspec.sftp")
class SFTPFileSystem(AbstractFileSystem):
"""Files over SFTP/SSH
Peer-to-peer filesystem over SSH using paramiko.
Note: if using this with the ``open`` or ``open_files``, with full URLs,
there is no way to tell if a path is relative, so all paths are assumed
to be absolute.
"""
protocol = "sftp", "ssh"
def __init__(self, host, **ssh_kwargs):
"""
Parameters
----------
host: str
Hostname or IP as a string
temppath: str
Location on the server to put files, when within a transaction
ssh_kwargs: dict
Parameters passed on to connection. See details in
https://docs.paramiko.org/en/3.3/api/client.html#paramiko.client.SSHClient.connect
May include port, username, password...
"""
if self._cached:
return
super().__init__(**ssh_kwargs)
self.temppath = ssh_kwargs.pop("temppath", "/tmp") # remote temp directory
self.host = host
self.ssh_kwargs = ssh_kwargs
self._connect()
def _connect(self):
logger.debug("Connecting to SFTP server %s", self.host)
self.client = paramiko.SSHClient()
self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
self.client.connect(self.host, **self.ssh_kwargs)
self.ftp = self.client.open_sftp()
@classmethod
def _strip_protocol(cls, path):
return infer_storage_options(path)["path"]
@staticmethod
def _get_kwargs_from_urls(urlpath):
out = infer_storage_options(urlpath)
out.pop("path", None)
out.pop("protocol", None)
return out
def mkdir(self, path, create_parents=True, mode=511):
logger.debug("Creating folder %s", path)
if self.exists(path):
raise FileExistsError(f"File exists: {path}")
if create_parents:
self.makedirs(path)
else:
self.ftp.mkdir(path, mode)
def makedirs(self, path, exist_ok=False, mode=511):
if self.exists(path) and not exist_ok:
raise FileExistsError(f"File exists: {path}")
parts = path.split("/")
new_path = "/" if path[:1] == "/" else ""
for part in parts:
if part:
new_path = f"{new_path}/{part}" if new_path else part
if not self.exists(new_path):
self.ftp.mkdir(new_path, mode)
def rmdir(self, path):
logger.debug("Removing folder %s", path)
self.ftp.rmdir(path)
def info(self, path):
stat = self._decode_stat(self.ftp.stat(path))
stat["name"] = path
return stat
@staticmethod
def _decode_stat(stat, parent_path=None):
if S_ISDIR(stat.st_mode):
t = "directory"
elif S_ISLNK(stat.st_mode):
t = "link"
else:
t = "file"
out = {
"name": "",
"size": stat.st_size,
"type": t,
"uid": stat.st_uid,
"gid": stat.st_gid,
"time": datetime.datetime.fromtimestamp(
stat.st_atime, tz=datetime.timezone.utc
),
"mtime": datetime.datetime.fromtimestamp(
stat.st_mtime, tz=datetime.timezone.utc
),
}
if parent_path:
out["name"] = "/".join([parent_path.rstrip("/"), stat.filename])
return out
def ls(self, path, detail=False):
logger.debug("Listing folder %s", path)
stats = [self._decode_stat(stat, path) for stat in self.ftp.listdir_iter(path)]
if detail:
return stats
else:
paths = [stat["name"] for stat in stats]
return sorted(paths)
def put(self, lpath, rpath, callback=None, **kwargs):
logger.debug("Put file %s into %s", lpath, rpath)
self.ftp.put(lpath, rpath)
def get_file(self, rpath, lpath, **kwargs):
if self.isdir(rpath):
os.makedirs(lpath, exist_ok=True)
else:
self.ftp.get(self._strip_protocol(rpath), lpath)
def _open(self, path, mode="rb", block_size=None, **kwargs):
"""
block_size: int or None
If 0, no buffering, if 1, line buffering, if >1, buffer that many
bytes, if None use default from paramiko.
"""
logger.debug("Opening file %s", path)
if kwargs.get("autocommit", True) is False:
# writes to temporary file, move on commit
path2 = "/".join([self.temppath, str(uuid.uuid4())])
f = self.ftp.open(path2, mode, bufsize=block_size if block_size else -1)
f.temppath = path2
f.targetpath = path
f.fs = self
f.commit = types.MethodType(commit_a_file, f)
f.discard = types.MethodType(discard_a_file, f)
else:
f = self.ftp.open(path, mode, bufsize=block_size if block_size else -1)
return f
def _rm(self, path):
if self.isdir(path):
self.ftp.rmdir(path)
else:
self.ftp.remove(path)
def mv(self, old, new):
logger.debug("Renaming %s into %s", old, new)
self.ftp.posix_rename(old, new)
def commit_a_file(self):
self.fs.mv(self.temppath, self.targetpath)
def discard_a_file(self):
self.fs._rm(self.temppath)
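# A minimal usage sketch (not executed on import); "sftp.example.com" and the
# credentials are placeholders and require a reachable SSH server.
def _sftp_example():
    fs = SFTPFileSystem(
        "sftp.example.com",
        username="user",
        password="secret",  # any paramiko SSHClient.connect kwarg may be passed
    )
    fs.mkdir("/tmp/fsspec-demo", create_parents=True)
    with fs.open("/tmp/fsspec-demo/data.txt", "wb") as f:
        f.write(b"hello over sftp")
    print(fs.ls("/tmp/fsspec-demo", detail=False))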

View File

@ -0,0 +1,416 @@
"""
This module contains SMBFileSystem class responsible for handling access to
Windows Samba network shares by using package smbprotocol
"""
import datetime
import re
import uuid
from stat import S_ISDIR, S_ISLNK
import smbclient
import smbprotocol.exceptions
from .. import AbstractFileSystem
from ..utils import infer_storage_options
# ! pylint: disable=bad-continuation
class SMBFileSystem(AbstractFileSystem):
"""Allow reading and writing to Windows and Samba network shares.
When using `fsspec.open()` for getting a file-like object the URI
should be specified as this format:
``smb://workgroup;user:password@server:port/share/folder/file.csv``.
Example::
>>> import fsspec
>>> with fsspec.open(
... 'smb://myuser:mypassword@myserver.com/' 'share/folder/file.csv'
... ) as smbfile:
... df = pd.read_csv(smbfile, sep='|', header=None)
Note that you need to pass in a valid hostname or IP address for the host
component of the URL. Do not use the Windows/NetBIOS machine name for the
host component.
The first component of the path in the URL points to the name of the shared
folder. Subsequent path components will point to the directory/folder/file.
The URL components ``workgroup`` , ``user``, ``password`` and ``port`` may be
optional.
.. note::
        This implementation requires `smbprotocol`_ to be installed, e.g.::
$ pip install smbprotocol
# or
# pip install smbprotocol[kerberos]
.. _smbprotocol: https://github.com/jborean93/smbprotocol#requirements
Note: if using this with the ``open`` or ``open_files``, with full URLs,
there is no way to tell if a path is relative, so all paths are assumed
to be absolute.
"""
protocol = "smb"
# pylint: disable=too-many-arguments
def __init__(
self,
host,
port=None,
username=None,
password=None,
timeout=60,
encrypt=None,
share_access=None,
register_session_retries=4,
register_session_retry_wait=1,
register_session_retry_factor=10,
auto_mkdir=False,
**kwargs,
):
"""
You can use _get_kwargs_from_urls to get some kwargs from
a reasonable SMB url.
Authentication will be anonymous or integrated if username/password are not
given.
Parameters
----------
host: str
The remote server name/ip to connect to
port: int or None
Port to connect with. Usually 445, sometimes 139.
username: str or None
Username to connect with. Required if Kerberos auth is not being used.
password: str or None
User's password on the server, if using username
timeout: int
Connection timeout in seconds
encrypt: bool
Whether to force encryption or not, once this has been set to True
the session cannot be changed back to False.
share_access: str or None
Specifies the default access applied to file open operations
performed with this file system object.
This affects whether other processes can concurrently open a handle
to the same file.
- None (the default): exclusively locks the file until closed.
- 'r': Allow other handles to be opened with read access.
- 'w': Allow other handles to be opened with write access.
- 'd': Allow other handles to be opened with delete access.
register_session_retries: int
            Number of retries to register a session with the server. Retries are not
            performed for authentication errors, as these indicate invalid credentials
            rather than network issues. If set to a negative value, no registration
            attempts are performed.
register_session_retry_wait: int
Time in seconds to wait between each retry. Number must be non-negative.
register_session_retry_factor: int
Base factor for the wait time between each retry. The wait time
is calculated using exponential function. For factor=1 all wait times
will be equal to `register_session_retry_wait`. For any number of retries,
the last wait time will be equal to `register_session_retry_wait` and for retries>1
the first wait time will be equal to `register_session_retry_wait / factor`.
Number must be equal to or greater than 1. Optimal factor is 10.
auto_mkdir: bool
Whether, when opening a file, the directory containing it should
be created (if it doesn't already exist). This is assumed by pyarrow
and zarr-python code.
"""
super().__init__(**kwargs)
self.host = host
self.port = port
self.username = username
self.password = password
self.timeout = timeout
self.encrypt = encrypt
self.temppath = kwargs.pop("temppath", "")
self.share_access = share_access
self.register_session_retries = register_session_retries
if register_session_retry_wait < 0:
raise ValueError(
"register_session_retry_wait must be a non-negative integer"
)
self.register_session_retry_wait = register_session_retry_wait
if register_session_retry_factor < 1:
raise ValueError(
"register_session_retry_factor must be a positive "
"integer equal to or greater than 1"
)
self.register_session_retry_factor = register_session_retry_factor
self.auto_mkdir = auto_mkdir
self._connect()
@property
def _port(self):
return 445 if self.port is None else self.port
def _connect(self):
import time
if self.register_session_retries <= -1:
return
retried_errors = []
wait_time = self.register_session_retry_wait
n_waits = (
self.register_session_retries - 1
) # -1 = No wait time after the last retry
factor = self.register_session_retry_factor
# Generate wait times for each retry attempt.
        # Wait times follow an exponential schedule. For factor=1 all wait times
        # equal `wait`. For any number of retries the last wait time equals `wait`,
        # and for retries > 1 the first wait time equals `wait / factor`.
wait_times = iter(
factor ** (n / n_waits - 1) * wait_time for n in range(0, n_waits + 1)
)
for attempt in range(self.register_session_retries + 1):
try:
smbclient.register_session(
self.host,
username=self.username,
password=self.password,
port=self._port,
encrypt=self.encrypt,
connection_timeout=self.timeout,
)
return
except (
smbprotocol.exceptions.SMBAuthenticationError,
smbprotocol.exceptions.LogonFailure,
):
# These exceptions should not be repeated, as they clearly indicate
# that the credentials are invalid and not a network issue.
raise
except ValueError as exc:
if re.findall(r"\[Errno -\d+]", str(exc)):
# This exception is raised by the smbprotocol.transport:Tcp.connect
# and originates from socket.gaierror (OSError). These exceptions might
# be raised due to network instability. We will retry to connect.
retried_errors.append(exc)
else:
                    # All other ValueError exceptions should be raised, as they are
                    # not related to network issues.
raise
except Exception as exc:
                # Save the exception and retry the connection. This except block
                # might be dropped in the future, once all exceptions suited for
                # retry are identified.
retried_errors.append(exc)
if attempt < self.register_session_retries:
time.sleep(next(wait_times))
# Raise last exception to inform user about the connection issues.
# Note: Should we use ExceptionGroup to raise all exceptions?
raise retried_errors[-1]
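    # Worked example of the retry schedule above (illustration only): with
    # register_session_retries=4, register_session_retry_wait=1 and
    # register_session_retry_factor=10, n_waits=3 and the generated waits are
    #   10 ** (0/3 - 1) * 1 ~= 0.10 s
    #   10 ** (1/3 - 1) * 1 ~= 0.22 s
    #   10 ** (2/3 - 1) * 1 ~= 0.46 s
    #   10 ** (3/3 - 1) * 1  = 1.00 s
    # so the last wait equals the base wait and the first equals wait / factor,
    # as documented in ``__init__``.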
@classmethod
def _strip_protocol(cls, path):
return infer_storage_options(path)["path"]
@staticmethod
def _get_kwargs_from_urls(path):
# smb://workgroup;user:password@host:port/share/folder/file.csv
out = infer_storage_options(path)
out.pop("path", None)
out.pop("protocol", None)
return out
def mkdir(self, path, create_parents=True, **kwargs):
wpath = _as_unc_path(self.host, path)
if create_parents:
smbclient.makedirs(wpath, exist_ok=False, port=self._port, **kwargs)
else:
smbclient.mkdir(wpath, port=self._port, **kwargs)
def makedirs(self, path, exist_ok=False):
if _share_has_path(path):
wpath = _as_unc_path(self.host, path)
smbclient.makedirs(wpath, exist_ok=exist_ok, port=self._port)
def rmdir(self, path):
if _share_has_path(path):
wpath = _as_unc_path(self.host, path)
smbclient.rmdir(wpath, port=self._port)
def info(self, path, **kwargs):
wpath = _as_unc_path(self.host, path)
stats = smbclient.stat(wpath, port=self._port, **kwargs)
if S_ISDIR(stats.st_mode):
stype = "directory"
elif S_ISLNK(stats.st_mode):
stype = "link"
else:
stype = "file"
res = {
"name": path + "/" if stype == "directory" else path,
"size": stats.st_size,
"type": stype,
"uid": stats.st_uid,
"gid": stats.st_gid,
"time": stats.st_atime,
"mtime": stats.st_mtime,
}
return res
def created(self, path):
"""Return the created timestamp of a file as a datetime.datetime"""
wpath = _as_unc_path(self.host, path)
stats = smbclient.stat(wpath, port=self._port)
return datetime.datetime.fromtimestamp(stats.st_ctime, tz=datetime.timezone.utc)
def modified(self, path):
"""Return the modified timestamp of a file as a datetime.datetime"""
wpath = _as_unc_path(self.host, path)
stats = smbclient.stat(wpath, port=self._port)
return datetime.datetime.fromtimestamp(stats.st_mtime, tz=datetime.timezone.utc)
def ls(self, path, detail=True, **kwargs):
unc = _as_unc_path(self.host, path)
listed = smbclient.listdir(unc, port=self._port, **kwargs)
dirs = ["/".join([path.rstrip("/"), p]) for p in listed]
if detail:
dirs = [self.info(d) for d in dirs]
return dirs
# pylint: disable=too-many-arguments
def _open(
self,
path,
mode="rb",
block_size=-1,
autocommit=True,
cache_options=None,
**kwargs,
):
"""
block_size: int or None
If 0, no buffering, 1, line buffering, >1, buffer that many bytes
Notes
-----
By specifying 'share_access' in 'kwargs' it is possible to override the
default shared access setting applied in the constructor of this object.
"""
if self.auto_mkdir and "w" in mode:
self.makedirs(self._parent(path), exist_ok=True)
bls = block_size if block_size is not None and block_size >= 0 else -1
wpath = _as_unc_path(self.host, path)
share_access = kwargs.pop("share_access", self.share_access)
if "w" in mode and autocommit is False:
temp = _as_temp_path(self.host, path, self.temppath)
return SMBFileOpener(
wpath, temp, mode, port=self._port, block_size=bls, **kwargs
)
return smbclient.open_file(
wpath,
mode,
buffering=bls,
share_access=share_access,
port=self._port,
**kwargs,
)
def copy(self, path1, path2, **kwargs):
"""Copy within two locations in the same filesystem"""
wpath1 = _as_unc_path(self.host, path1)
wpath2 = _as_unc_path(self.host, path2)
if self.auto_mkdir:
self.makedirs(self._parent(path2), exist_ok=True)
smbclient.copyfile(wpath1, wpath2, port=self._port, **kwargs)
def _rm(self, path):
if _share_has_path(path):
wpath = _as_unc_path(self.host, path)
stats = smbclient.stat(wpath, port=self._port)
if S_ISDIR(stats.st_mode):
smbclient.rmdir(wpath, port=self._port)
else:
smbclient.remove(wpath, port=self._port)
def mv(self, path1, path2, recursive=None, maxdepth=None, **kwargs):
wpath1 = _as_unc_path(self.host, path1)
wpath2 = _as_unc_path(self.host, path2)
smbclient.rename(wpath1, wpath2, port=self._port, **kwargs)
def _as_unc_path(host, path):
rpath = path.replace("/", "\\")
unc = f"\\\\{host}{rpath}"
return unc
def _as_temp_path(host, path, temppath):
share = path.split("/")[1]
temp_file = f"/{share}{temppath}/{uuid.uuid4()}"
unc = _as_unc_path(host, temp_file)
return unc
def _share_has_path(path):
parts = path.count("/")
if path.endswith("/"):
return parts > 2
return parts > 1
class SMBFileOpener:
"""writes to remote temporary file, move on commit"""
def __init__(self, path, temp, mode, port=445, block_size=-1, **kwargs):
self.path = path
self.temp = temp
self.mode = mode
self.block_size = block_size
self.kwargs = kwargs
self.smbfile = None
self._incontext = False
self.port = port
self._open()
def _open(self):
if self.smbfile is None or self.smbfile.closed:
self.smbfile = smbclient.open_file(
self.temp,
self.mode,
port=self.port,
buffering=self.block_size,
**self.kwargs,
)
def commit(self):
"""Move temp file to definitive on success."""
# TODO: use transaction support in SMB protocol
smbclient.replace(self.temp, self.path, port=self.port)
def discard(self):
"""Remove the temp file on failure."""
smbclient.remove(self.temp, port=self.port)
def __fspath__(self):
return self.path
def __iter__(self):
return self.smbfile.__iter__()
def __getattr__(self, item):
return getattr(self.smbfile, item)
def __enter__(self):
self._incontext = True
return self.smbfile.__enter__()
def __exit__(self, exc_type, exc_value, traceback):
self._incontext = False
self.smbfile.__exit__(exc_type, exc_value, traceback)
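# A minimal usage sketch (not executed on import); "fileserver", "share" and
# the credentials are placeholders.  The helpers above translate fsspec-style
# paths into the UNC form expected by smbclient.
def _smb_example():
    fs = SMBFileSystem("fileserver", username="user", password="secret")
    # '/share/folder/file.csv' -> '\\fileserver\share\folder\file.csv'
    print(_as_unc_path("fileserver", "/share/folder/file.csv"))
    with fs.open("/share/folder/file.csv", "rb") as f:
        return f.read(1024)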

View File

@ -0,0 +1,124 @@
import logging
import tarfile
import fsspec
from fsspec.archive import AbstractArchiveFileSystem
from fsspec.compression import compr
from fsspec.utils import infer_compression
typemap = {b"0": "file", b"5": "directory"}
logger = logging.getLogger("tar")
class TarFileSystem(AbstractArchiveFileSystem):
"""Compressed Tar archives as a file-system (read-only)
Supports the following formats:
tar.gz, tar.bz2, tar.xz
"""
root_marker = ""
protocol = "tar"
cachable = False
def __init__(
self,
fo="",
index_store=None,
target_options=None,
target_protocol=None,
compression=None,
**kwargs,
):
super().__init__(**kwargs)
target_options = target_options or {}
if isinstance(fo, str):
self.of = fsspec.open(fo, protocol=target_protocol, **target_options)
fo = self.of.open() # keep the reference
# Try to infer compression.
if compression is None:
name = None
# Try different ways to get hold of the filename. `fo` might either
# be a `fsspec.LocalFileOpener`, an `io.BufferedReader` or an
# `fsspec.AbstractFileSystem` instance.
try:
# Amended io.BufferedReader or similar.
# This uses a "protocol extension" where original filenames are
# propagated to archive-like filesystems in order to let them
# infer the right compression appropriately.
if hasattr(fo, "original"):
name = fo.original
# fsspec.LocalFileOpener
elif hasattr(fo, "path"):
name = fo.path
# io.BufferedReader
elif hasattr(fo, "name"):
name = fo.name
# fsspec.AbstractFileSystem
elif hasattr(fo, "info"):
name = fo.info()["name"]
except Exception as ex:
logger.warning(
f"Unable to determine file name, not inferring compression: {ex}"
)
if name is not None:
compression = infer_compression(name)
logger.info(f"Inferred compression {compression} from file name {name}")
if compression is not None:
            # TODO: tarfile already implements compression with modes like "r:gz",
            # but would seeking to an offset in the file still work then?
fo = compr[compression](fo)
self._fo_ref = fo
self.fo = fo # the whole instance is a context
self.tar = tarfile.TarFile(fileobj=self.fo)
self.dir_cache = None
self.index_store = index_store
self.index = None
self._index()
def _index(self):
# TODO: load and set saved index, if exists
out = {}
for ti in self.tar:
info = ti.get_info()
info["type"] = typemap.get(info["type"], "file")
name = ti.get_info()["name"].rstrip("/")
out[name] = (info, ti.offset_data)
self.index = out
# TODO: save index to self.index_store here, if set
def _get_dirs(self):
if self.dir_cache is not None:
return
# This enables ls to get directories as children as well as files
self.dir_cache = {
dirname: {"name": dirname, "size": 0, "type": "directory"}
for dirname in self._all_dirnames(self.tar.getnames())
}
for member in self.tar.getmembers():
info = member.get_info()
info["name"] = info["name"].rstrip("/")
info["type"] = typemap.get(info["type"], "file")
self.dir_cache[info["name"]] = info
def _open(self, path, mode="rb", **kwargs):
if mode != "rb":
raise ValueError("Read-only filesystem implementation")
details, offset = self.index[path]
if details["type"] != "file":
raise ValueError("Can only handle regular files")
return self.tar.extractfile(path)
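# A minimal usage sketch (not executed on import): builds a tiny gzipped
# archive with the standard library, then reads it back through TarFileSystem.
# The member name is arbitrary.
def _tar_example():
    import io
    buf = io.BytesIO()
    with tarfile.open(fileobj=buf, mode="w:gz") as tf:
        payload = b"hello from a tar member"
        info = tarfile.TarInfo(name="dir/hello.txt")
        info.size = len(payload)
        tf.addfile(info, io.BytesIO(payload))
    buf.seek(0)
    fs = TarFileSystem(fo=buf, compression="gzip")
    print(fs.ls("dir", detail=False))
    with fs.open("dir/hello.txt") as f:
        print(f.read())  # b'hello from a tar member'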

View File

@ -0,0 +1,484 @@
# https://hadoop.apache.org/docs/r1.0.4/webhdfs.html
import logging
import os
import secrets
import shutil
import tempfile
import uuid
from contextlib import suppress
from urllib.parse import quote
import requests
from ..spec import AbstractBufferedFile, AbstractFileSystem
from ..utils import infer_storage_options, tokenize
logger = logging.getLogger("webhdfs")
class WebHDFS(AbstractFileSystem):
"""
    Interface to HDFS over HTTP using the WebHDFS API. Also supports HttpFS gateways.
Four auth mechanisms are supported:
insecure: no auth is done, and the user is assumed to be whoever they
say they are (parameter ``user``), or a predefined value such as
"dr.who" if not given
spnego: when kerberos authentication is enabled, auth is negotiated by
requests_kerberos https://github.com/requests/requests-kerberos .
This establishes a session based on existing kinit login and/or
specified principal/password; parameters are passed with ``kerb_kwargs``
token: uses an existing Hadoop delegation token from another secured
service. Indeed, this client can also generate such tokens when
not insecure. Note that tokens expire, but can be renewed (by a
previously specified user) and may allow for proxying.
basic-auth: used when both parameter ``user`` and parameter ``password``
are provided.
"""
tempdir = str(tempfile.gettempdir())
protocol = "webhdfs", "webHDFS"
def __init__(
self,
host,
port=50070,
kerberos=False,
token=None,
user=None,
password=None,
proxy_to=None,
kerb_kwargs=None,
data_proxy=None,
use_https=False,
session_cert=None,
session_verify=True,
**kwargs,
):
"""
Parameters
----------
host: str
Name-node address
port: int
Port for webHDFS
kerberos: bool
Whether to authenticate with kerberos for this connection
token: str or None
If given, use this token on every call to authenticate. A user
and user-proxy may be encoded in the token and should not be also
given
user: str or None
If given, assert the user name to connect with
password: str or None
If given, assert the password to use for basic auth. If password
is provided, user must be provided also
proxy_to: str or None
If given, the user has the authority to proxy, and this value is
            the user in whose name actions are taken
kerb_kwargs: dict
Any extra arguments for HTTPKerberosAuth, see
`<https://github.com/requests/requests-kerberos/blob/master/requests_kerberos/kerberos_.py>`_
data_proxy: dict, callable or None
If given, map data-node addresses. This can be necessary if the
HDFS cluster is behind a proxy, running on Docker or otherwise has
a mismatch between the host-names given by the name-node and the
address by which to refer to them from the client. If a dict,
maps host names ``host->data_proxy[host]``; if a callable, full
URLs are passed, and function must conform to
``url->data_proxy(url)``.
use_https: bool
Whether to connect to the Name-node using HTTPS instead of HTTP
session_cert: str or Tuple[str, str] or None
Path to a certificate file, or tuple of (cert, key) files to use
for the requests.Session
session_verify: str, bool or None
Path to a certificate file to use for verifying the requests.Session.
kwargs
"""
if self._cached:
return
super().__init__(**kwargs)
self.url = f"{'https' if use_https else 'http'}://{host}:{port}/webhdfs/v1"
self.kerb = kerberos
self.kerb_kwargs = kerb_kwargs or {}
self.pars = {}
self.proxy = data_proxy or {}
if token is not None:
if user is not None or proxy_to is not None:
raise ValueError(
"If passing a delegation token, must not set "
"user or proxy_to, as these are encoded in the"
" token"
)
self.pars["delegation"] = token
self.user = user
self.password = password
if password is not None:
if user is None:
                raise ValueError(
                    "If passing a password, the user must also be "
                    "set in order to set up the basic-auth"
                )
else:
if user is not None:
self.pars["user.name"] = user
if proxy_to is not None:
self.pars["doas"] = proxy_to
if kerberos and user is not None:
raise ValueError(
"If using Kerberos auth, do not specify the "
"user, this is handled by kinit."
)
self.session_cert = session_cert
self.session_verify = session_verify
self._connect()
self._fsid = f"webhdfs_{tokenize(host, port)}"
@property
def fsid(self):
return self._fsid
def _connect(self):
self.session = requests.Session()
if self.session_cert:
self.session.cert = self.session_cert
self.session.verify = self.session_verify
if self.kerb:
from requests_kerberos import HTTPKerberosAuth
self.session.auth = HTTPKerberosAuth(**self.kerb_kwargs)
if self.user is not None and self.password is not None:
from requests.auth import HTTPBasicAuth
self.session.auth = HTTPBasicAuth(self.user, self.password)
def _call(self, op, method="get", path=None, data=None, redirect=True, **kwargs):
url = self._apply_proxy(self.url + quote(path or "", safe="/="))
args = kwargs.copy()
args.update(self.pars)
args["op"] = op.upper()
logger.debug("sending %s with %s", url, method)
out = self.session.request(
method=method.upper(),
url=url,
params=args,
data=data,
allow_redirects=redirect,
)
if out.status_code in [400, 401, 403, 404, 500]:
try:
err = out.json()
msg = err["RemoteException"]["message"]
exp = err["RemoteException"]["exception"]
except (ValueError, KeyError):
pass
else:
if exp in ["IllegalArgumentException", "UnsupportedOperationException"]:
raise ValueError(msg)
elif exp in ["SecurityException", "AccessControlException"]:
raise PermissionError(msg)
elif exp in ["FileNotFoundException"]:
raise FileNotFoundError(msg)
else:
raise RuntimeError(msg)
out.raise_for_status()
return out
def _open(
self,
path,
mode="rb",
block_size=None,
autocommit=True,
replication=None,
permissions=None,
**kwargs,
):
"""
Parameters
----------
path: str
File location
mode: str
'rb', 'wb', etc.
block_size: int
Client buffer size for read-ahead or write buffer
autocommit: bool
If False, writes to temporary file that only gets put in final
location upon commit
replication: int
Number of copies of file on the cluster, write mode only
permissions: str or int
posix permissions, write mode only
kwargs
Returns
-------
WebHDFile instance
"""
block_size = block_size or self.blocksize
return WebHDFile(
self,
path,
mode=mode,
block_size=block_size,
tempdir=self.tempdir,
autocommit=autocommit,
replication=replication,
permissions=permissions,
)
@staticmethod
def _process_info(info):
info["type"] = info["type"].lower()
info["size"] = info["length"]
return info
@classmethod
def _strip_protocol(cls, path):
return infer_storage_options(path)["path"]
@staticmethod
def _get_kwargs_from_urls(urlpath):
out = infer_storage_options(urlpath)
out.pop("path", None)
out.pop("protocol", None)
if "username" in out:
out["user"] = out.pop("username")
return out
def info(self, path):
out = self._call("GETFILESTATUS", path=path)
info = out.json()["FileStatus"]
info["name"] = path
return self._process_info(info)
def ls(self, path, detail=False):
out = self._call("LISTSTATUS", path=path)
infos = out.json()["FileStatuses"]["FileStatus"]
for info in infos:
self._process_info(info)
info["name"] = path.rstrip("/") + "/" + info["pathSuffix"]
if detail:
return sorted(infos, key=lambda i: i["name"])
else:
return sorted(info["name"] for info in infos)
def content_summary(self, path):
"""Total numbers of files, directories and bytes under path"""
out = self._call("GETCONTENTSUMMARY", path=path)
return out.json()["ContentSummary"]
def ukey(self, path):
"""Checksum info of file, giving method and result"""
out = self._call("GETFILECHECKSUM", path=path, redirect=False)
if "Location" in out.headers:
location = self._apply_proxy(out.headers["Location"])
out2 = self.session.get(location)
out2.raise_for_status()
return out2.json()["FileChecksum"]
else:
out.raise_for_status()
return out.json()["FileChecksum"]
def home_directory(self):
"""Get user's home directory"""
out = self._call("GETHOMEDIRECTORY")
return out.json()["Path"]
    def get_delegation_token(self, renewer=None):
        """Retrieve token which can give the same authority to other users
Parameters
----------
renewer: str or None
User who may use this token; if None, will be current user
"""
if renewer:
out = self._call("GETDELEGATIONTOKEN", renewer=renewer)
else:
out = self._call("GETDELEGATIONTOKEN")
t = out.json()["Token"]
if t is None:
raise ValueError("No token available for this user/security context")
return t["urlString"]
def renew_delegation_token(self, token):
"""Make token live longer. Returns new expiry time"""
out = self._call("RENEWDELEGATIONTOKEN", method="put", token=token)
return out.json()["long"]
def cancel_delegation_token(self, token):
"""Stop the token from being useful"""
self._call("CANCELDELEGATIONTOKEN", method="put", token=token)
def chmod(self, path, mod):
"""Set the permission at path
Parameters
----------
path: str
location to set (file or directory)
mod: str or int
            posix representation of permission, given as an octal string,
            e.g. '777' or 0o777
"""
self._call("SETPERMISSION", method="put", path=path, permission=mod)
def chown(self, path, owner=None, group=None):
"""Change owning user and/or group"""
kwargs = {}
if owner is not None:
kwargs["owner"] = owner
if group is not None:
kwargs["group"] = group
self._call("SETOWNER", method="put", path=path, **kwargs)
def set_replication(self, path, replication):
"""
Set file replication factor
Parameters
----------
path: str
File location (not for directories)
replication: int
Number of copies of file on the cluster. Should be smaller than
number of data nodes; normally 3 on most systems.
"""
self._call("SETREPLICATION", path=path, method="put", replication=replication)
def mkdir(self, path, **kwargs):
self._call("MKDIRS", method="put", path=path)
def makedirs(self, path, exist_ok=False):
if exist_ok is False and self.exists(path):
raise FileExistsError(path)
self.mkdir(path)
def mv(self, path1, path2, **kwargs):
self._call("RENAME", method="put", path=path1, destination=path2)
def rm(self, path, recursive=False, **kwargs):
self._call(
"DELETE",
method="delete",
path=path,
recursive="true" if recursive else "false",
)
def rm_file(self, path, **kwargs):
self.rm(path)
def cp_file(self, lpath, rpath, **kwargs):
with self.open(lpath) as lstream:
tmp_fname = "/".join([self._parent(rpath), f".tmp.{secrets.token_hex(16)}"])
# Perform an atomic copy (stream to a temporary file and
# move it to the actual destination).
try:
with self.open(tmp_fname, "wb") as rstream:
shutil.copyfileobj(lstream, rstream)
self.mv(tmp_fname, rpath)
except BaseException:
with suppress(FileNotFoundError):
self.rm(tmp_fname)
raise
def _apply_proxy(self, location):
if self.proxy and callable(self.proxy):
location = self.proxy(location)
elif self.proxy:
# as a dict
for k, v in self.proxy.items():
location = location.replace(k, v, 1)
return location
class WebHDFile(AbstractBufferedFile):
"""A file living in HDFS over webHDFS"""
def __init__(self, fs, path, **kwargs):
super().__init__(fs, path, **kwargs)
kwargs = kwargs.copy()
if kwargs.get("permissions", None) is None:
kwargs.pop("permissions", None)
if kwargs.get("replication", None) is None:
kwargs.pop("replication", None)
self.permissions = kwargs.pop("permissions", 511)
tempdir = kwargs.pop("tempdir")
if kwargs.pop("autocommit", False) is False:
self.target = self.path
self.path = os.path.join(tempdir, str(uuid.uuid4()))
def _upload_chunk(self, final=False):
"""Write one part of a multi-block file upload
Parameters
==========
final: bool
This is the last block, so should complete file, if
self.autocommit is True.
"""
out = self.fs.session.post(
self.location,
data=self.buffer.getvalue(),
headers={"content-type": "application/octet-stream"},
)
out.raise_for_status()
return True
def _initiate_upload(self):
"""Create remote file/upload"""
kwargs = self.kwargs.copy()
if "a" in self.mode:
op, method = "APPEND", "POST"
else:
op, method = "CREATE", "PUT"
kwargs["overwrite"] = "true"
out = self.fs._call(op, method, self.path, redirect=False, **kwargs)
location = self.fs._apply_proxy(out.headers["Location"])
if "w" in self.mode:
# create empty file to append to
out2 = self.fs.session.put(
location, headers={"content-type": "application/octet-stream"}
)
out2.raise_for_status()
# after creating empty file, change location to append to
out2 = self.fs._call("APPEND", "POST", self.path, redirect=False, **kwargs)
self.location = self.fs._apply_proxy(out2.headers["Location"])
def _fetch_range(self, start, end):
start = max(start, 0)
end = min(self.size, end)
if start >= end or start >= self.size:
return b""
out = self.fs._call(
"OPEN", path=self.path, offset=start, length=end - start, redirect=False
)
out.raise_for_status()
if "Location" in out.headers:
location = out.headers["Location"]
out2 = self.fs.session.get(self.fs._apply_proxy(location))
return out2.content
else:
return out.content
def commit(self):
self.fs.mv(self.path, self.target)
def discard(self):
self.fs.rm(self.path)
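# A minimal usage sketch (not executed on import); "namenode.example.com" is a
# placeholder and requires a reachable WebHDFS/HttpFS endpoint.  Shown with the
# insecure auth mode described in the class docstring.
def _webhdfs_example():
    fs = WebHDFS("namenode.example.com", port=50070, user="hadoop")
    fs.mkdir("/tmp/fsspec-demo")
    with fs.open("/tmp/fsspec-demo/data.bin", "wb") as f:
        f.write(b"some bytes")
    print(fs.ls("/tmp/fsspec-demo", detail=False))
    print(fs.content_summary("/tmp/fsspec-demo"))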

View File

@ -0,0 +1,177 @@
import os
import zipfile
import fsspec
from fsspec.archive import AbstractArchiveFileSystem
class ZipFileSystem(AbstractArchiveFileSystem):
"""Read/Write contents of ZIP archive as a file-system
Keeps file object open while instance lives.
This class is pickleable, but not necessarily thread-safe
"""
root_marker = ""
protocol = "zip"
cachable = False
def __init__(
self,
fo="",
mode="r",
target_protocol=None,
target_options=None,
compression=zipfile.ZIP_STORED,
allowZip64=True,
compresslevel=None,
**kwargs,
):
"""
Parameters
----------
fo: str or file-like
Contains ZIP, and must exist. If a str, will fetch file using
:meth:`~fsspec.open_files`, which must return one file exactly.
mode: str
Accept: "r", "w", "a"
target_protocol: str (optional)
If ``fo`` is a string, this value can be used to override the
FS protocol inferred from a URL
target_options: dict (optional)
Kwargs passed when instantiating the target FS, if ``fo`` is
a string.
compression, allowZip64, compresslevel: passed to ZipFile
Only relevant when creating a ZIP
"""
super().__init__(self, **kwargs)
if mode not in set("rwa"):
            raise ValueError(f"mode '{mode}' not understood")
self.mode = mode
if isinstance(fo, (str, os.PathLike)):
if mode == "a":
m = "r+b"
else:
m = mode + "b"
fo = fsspec.open(
fo, mode=m, protocol=target_protocol, **(target_options or {})
)
self.force_zip_64 = allowZip64
self.of = fo
self.fo = fo.__enter__() # the whole instance is a context
self.zip = zipfile.ZipFile(
self.fo,
mode=mode,
compression=compression,
allowZip64=allowZip64,
compresslevel=compresslevel,
)
self.dir_cache = None
@classmethod
def _strip_protocol(cls, path):
# zip file paths are always relative to the archive root
return super()._strip_protocol(path).lstrip("/")
def __del__(self):
if hasattr(self, "zip"):
self.close()
del self.zip
def close(self):
"""Commits any write changes to the file. Done on ``del`` too."""
self.zip.close()
def _get_dirs(self):
if self.dir_cache is None or self.mode in set("wa"):
# when writing, dir_cache is always in the ZipFile's attributes,
# not read from the file.
files = self.zip.infolist()
self.dir_cache = {
dirname.rstrip("/"): {
"name": dirname.rstrip("/"),
"size": 0,
"type": "directory",
}
for dirname in self._all_dirnames(self.zip.namelist())
}
for z in files:
f = {s: getattr(z, s, None) for s in zipfile.ZipInfo.__slots__}
f.update(
{
"name": z.filename.rstrip("/"),
"size": z.file_size,
"type": ("directory" if z.is_dir() else "file"),
}
)
self.dir_cache[f["name"]] = f
def pipe_file(self, path, value, **kwargs):
# override upstream, because we know the exact file size in this case
self.zip.writestr(path, value, **kwargs)
def _open(
self,
path,
mode="rb",
block_size=None,
autocommit=True,
cache_options=None,
**kwargs,
):
path = self._strip_protocol(path)
if "r" in mode and self.mode in set("wa"):
if self.exists(path):
raise OSError("ZipFS can only be open for reading or writing, not both")
raise FileNotFoundError(path)
if "r" in self.mode and "w" in mode:
raise OSError("ZipFS can only be open for reading or writing, not both")
out = self.zip.open(path, mode.strip("b"), force_zip64=self.force_zip_64)
if "r" in mode:
info = self.info(path)
out.size = info["size"]
out.name = info["name"]
return out
def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
if maxdepth is not None and maxdepth < 1:
raise ValueError("maxdepth must be at least 1")
# Remove the leading slash, as the zip file paths are always
# given without a leading slash
path = path.lstrip("/")
path_parts = list(filter(lambda s: bool(s), path.split("/")))
def _matching_starts(file_path):
file_parts = filter(lambda s: bool(s), file_path.split("/"))
return all(a == b for a, b in zip(path_parts, file_parts))
self._get_dirs()
result = {}
# To match posix find, if an exact file name is given, we should
# return only that file
if path in self.dir_cache and self.dir_cache[path]["type"] == "file":
result[path] = self.dir_cache[path]
return result if detail else [path]
for file_path, file_info in self.dir_cache.items():
if not (path == "" or _matching_starts(file_path)):
continue
if file_info["type"] == "directory":
if withdirs:
if file_path not in result:
result[file_path.strip("/")] = file_info
continue
if file_path not in result:
result[file_path] = file_info if detail else None
if maxdepth:
path_depth = path.count("/")
result = {
k: v for k, v in result.items() if k.count("/") - path_depth < maxdepth
}
return result if detail else sorted(result)
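# A minimal usage sketch (not executed on import): writes a small archive into
# an in-memory buffer and reads it back.  The member name is arbitrary.
def _zip_example():
    import io
    buf = io.BytesIO()
    fs = ZipFileSystem(buf, mode="w")
    fs.pipe_file("folder/hello.txt", b"hello zip")
    fs.close()  # flushes the central directory into ``buf``
    fs = ZipFileSystem(buf, mode="r")
    print(fs.find("", withdirs=True))
    print(fs.cat_file("folder/hello.txt"))  # b'hello zip'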

View File

@ -0,0 +1,121 @@
import json
from contextlib import suppress
from pathlib import PurePath
from typing import (
Any,
Callable,
ClassVar,
Dict,
List,
Mapping,
Optional,
Sequence,
Tuple,
)
from .registry import _import_class, get_filesystem_class
from .spec import AbstractFileSystem
class FilesystemJSONEncoder(json.JSONEncoder):
include_password: ClassVar[bool] = True
def default(self, o: Any) -> Any:
if isinstance(o, AbstractFileSystem):
return o.to_dict(include_password=self.include_password)
if isinstance(o, PurePath):
cls = type(o)
return {"cls": f"{cls.__module__}.{cls.__name__}", "str": str(o)}
return super().default(o)
def make_serializable(self, obj: Any) -> Any:
"""
Recursively converts an object so that it can be JSON serialized via
:func:`json.dumps` and :func:`json.dump`, without actually calling
said functions.
"""
if isinstance(obj, (str, int, float, bool)):
return obj
if isinstance(obj, Mapping):
return {k: self.make_serializable(v) for k, v in obj.items()}
if isinstance(obj, Sequence):
return [self.make_serializable(v) for v in obj]
return self.default(obj)
class FilesystemJSONDecoder(json.JSONDecoder):
def __init__(
self,
*,
object_hook: Optional[Callable[[Dict[str, Any]], Any]] = None,
parse_float: Optional[Callable[[str], Any]] = None,
parse_int: Optional[Callable[[str], Any]] = None,
parse_constant: Optional[Callable[[str], Any]] = None,
strict: bool = True,
object_pairs_hook: Optional[Callable[[List[Tuple[str, Any]]], Any]] = None,
) -> None:
self.original_object_hook = object_hook
super().__init__(
object_hook=self.custom_object_hook,
parse_float=parse_float,
parse_int=parse_int,
parse_constant=parse_constant,
strict=strict,
object_pairs_hook=object_pairs_hook,
)
@classmethod
def try_resolve_path_cls(cls, dct: Dict[str, Any]):
with suppress(Exception):
fqp = dct["cls"]
path_cls = _import_class(fqp)
if issubclass(path_cls, PurePath):
return path_cls
return None
@classmethod
def try_resolve_fs_cls(cls, dct: Dict[str, Any]):
with suppress(Exception):
if "cls" in dct:
try:
fs_cls = _import_class(dct["cls"])
if issubclass(fs_cls, AbstractFileSystem):
return fs_cls
except Exception:
if "protocol" in dct: # Fallback if cls cannot be imported
return get_filesystem_class(dct["protocol"])
raise
return None
def custom_object_hook(self, dct: Dict[str, Any]):
if "cls" in dct:
if (obj_cls := self.try_resolve_fs_cls(dct)) is not None:
return AbstractFileSystem.from_dict(dct)
if (obj_cls := self.try_resolve_path_cls(dct)) is not None:
return obj_cls(dct["str"])
if self.original_object_hook is not None:
return self.original_object_hook(dct)
return dct
def unmake_serializable(self, obj: Any) -> Any:
"""
Inverse function of :meth:`FilesystemJSONEncoder.make_serializable`.
"""
if isinstance(obj, dict):
obj = self.custom_object_hook(obj)
if isinstance(obj, dict):
return {k: self.unmake_serializable(v) for k, v in obj.items()}
if isinstance(obj, (list, tuple)):
return [self.unmake_serializable(v) for v in obj]
return obj
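# A minimal usage sketch (not executed on import): round-trips a filesystem
# instance and a path through JSON.  The "memory" protocol is used only
# because it needs no external resources.
def _json_example():
    import fsspec
    fs = fsspec.filesystem("memory")
    payload = json.dumps(
        {"fs": fs, "root": PurePath("/data")}, cls=FilesystemJSONEncoder
    )
    restored = json.loads(payload, cls=FilesystemJSONDecoder)
    print(type(restored["fs"]).__name__)  # the original filesystem class
    print(restored["root"])  # a PurePath again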

View File

@ -0,0 +1,251 @@
import array
import logging
import posixpath
import warnings
from collections.abc import MutableMapping
from functools import cached_property
from fsspec.core import url_to_fs
logger = logging.getLogger("fsspec.mapping")
class FSMap(MutableMapping):
    """Wrap a FileSystem instance as a mutable mapping.
The keys of the mapping become files under the given root, and the
values (which must be bytes) the contents of those files.
Parameters
----------
root: string
prefix for all the files
fs: FileSystem instance
    check: bool (=False)
        if True, performs a touch at the root location, to check for write access.
Examples
--------
>>> fs = FileSystem(**parameters) # doctest: +SKIP
>>> d = FSMap('my-data/path/', fs) # doctest: +SKIP
or, more likely
>>> d = fs.get_mapper('my-data/path/')
>>> d['loc1'] = b'Hello World' # doctest: +SKIP
>>> list(d.keys()) # doctest: +SKIP
['loc1']
>>> d['loc1'] # doctest: +SKIP
b'Hello World'
"""
def __init__(self, root, fs, check=False, create=False, missing_exceptions=None):
self.fs = fs
self.root = fs._strip_protocol(root)
self._root_key_to_str = fs._strip_protocol(posixpath.join(root, "x"))[:-1]
if missing_exceptions is None:
missing_exceptions = (
FileNotFoundError,
IsADirectoryError,
NotADirectoryError,
)
self.missing_exceptions = missing_exceptions
self.check = check
self.create = create
if create:
if not self.fs.exists(root):
self.fs.mkdir(root)
if check:
if not self.fs.exists(root):
                raise ValueError(
                    f"Path {root} does not exist. Create "
                    f"with the ``create=True`` keyword"
                )
self.fs.touch(root + "/a")
self.fs.rm(root + "/a")
@cached_property
def dirfs(self):
"""dirfs instance that can be used with the same keys as the mapper"""
from .implementations.dirfs import DirFileSystem
return DirFileSystem(path=self._root_key_to_str, fs=self.fs)
def clear(self):
"""Remove all keys below root - empties out mapping"""
logger.info("Clear mapping at %s", self.root)
try:
self.fs.rm(self.root, True)
self.fs.mkdir(self.root)
except: # noqa: E722
pass
def getitems(self, keys, on_error="raise"):
"""Fetch multiple items from the store
If the backend is async-able, this might proceed concurrently
Parameters
----------
keys: list(str)
            The keys to be fetched
on_error : "raise", "omit", "return"
If raise, an underlying exception will be raised (converted to KeyError
if the type is in self.missing_exceptions); if omit, keys with exception
will simply not be included in the output; if "return", all keys are
included in the output, but the value will be bytes or an exception
instance.
Returns
-------
dict(key, bytes|exception)
"""
keys2 = [self._key_to_str(k) for k in keys]
oe = on_error if on_error == "raise" else "return"
try:
out = self.fs.cat(keys2, on_error=oe)
if isinstance(out, bytes):
out = {keys2[0]: out}
except self.missing_exceptions as e:
raise KeyError from e
out = {
k: (KeyError() if isinstance(v, self.missing_exceptions) else v)
for k, v in out.items()
}
return {
key: out[k2]
for key, k2 in zip(keys, keys2)
if on_error == "return" or not isinstance(out[k2], BaseException)
}
def setitems(self, values_dict):
"""Set the values of multiple items in the store
Parameters
----------
values_dict: dict(str, bytes)
"""
values = {self._key_to_str(k): maybe_convert(v) for k, v in values_dict.items()}
self.fs.pipe(values)
def delitems(self, keys):
"""Remove multiple keys from the store"""
self.fs.rm([self._key_to_str(k) for k in keys])
def _key_to_str(self, key):
"""Generate full path for the key"""
if not isinstance(key, str):
# raise TypeError("key must be of type `str`, got `{type(key).__name__}`"
warnings.warn(
"from fsspec 2023.5 onward FSMap non-str keys will raise TypeError",
DeprecationWarning,
)
if isinstance(key, list):
key = tuple(key)
key = str(key)
return f"{self._root_key_to_str}{key}".rstrip("/")
    def _str_to_key(self, s):
        """Strip path off to leave key name"""
return s[len(self.root) :].lstrip("/")
def __getitem__(self, key, default=None):
"""Retrieve data"""
k = self._key_to_str(key)
try:
result = self.fs.cat(k)
except self.missing_exceptions as exc:
if default is not None:
return default
raise KeyError(key) from exc
return result
def pop(self, key, default=None):
"""Pop data"""
result = self.__getitem__(key, default)
try:
del self[key]
except KeyError:
pass
return result
def __setitem__(self, key, value):
"""Store value in key"""
key = self._key_to_str(key)
self.fs.mkdirs(self.fs._parent(key), exist_ok=True)
self.fs.pipe_file(key, maybe_convert(value))
def __iter__(self):
return (self._str_to_key(x) for x in self.fs.find(self.root))
def __len__(self):
return len(self.fs.find(self.root))
def __delitem__(self, key):
"""Remove key"""
try:
self.fs.rm(self._key_to_str(key))
except Exception as exc:
raise KeyError from exc
def __contains__(self, key):
"""Does key exist in mapping?"""
path = self._key_to_str(key)
return self.fs.isfile(path)
def __reduce__(self):
return FSMap, (self.root, self.fs, False, False, self.missing_exceptions)
def maybe_convert(value):
if isinstance(value, array.array) or hasattr(value, "__array__"):
# bytes-like things
if hasattr(value, "dtype") and value.dtype.kind in "Mm":
            # The buffer interface doesn't support datetime64/timedelta64 numpy
            # arrays
value = value.view("int64")
value = bytes(memoryview(value))
return value
def get_mapper(
url="",
check=False,
create=False,
missing_exceptions=None,
alternate_root=None,
**kwargs,
):
"""Create key-value interface for given URL and options
The URL will be of the form "protocol://location" and point to the root
of the mapper required. All keys will be file-names below this location,
and their values the contents of each key.
Also accepts compound URLs like zip::s3://bucket/file.zip , see ``fsspec.open``.
Parameters
----------
url: str
Root URL of mapping
check: bool
Whether to attempt to read from the location before instantiation, to
check that the mapping does exist
create: bool
Whether to make the directory corresponding to the root before
instantiating
missing_exceptions: None or tuple
If given, these exception types will be regarded as missing keys and
        raise KeyError when trying to read data. By default, you get
(FileNotFoundError, IsADirectoryError, NotADirectoryError)
alternate_root: None or str
In cases of complex URLs, the parser may fail to pick the correct part
for the mapper root, so this arg can override
Returns
-------
``FSMap`` instance, the dict-like key-value store.
"""
# Removing protocol here - could defer to each open() on the backend
fs, urlpath = url_to_fs(url, **kwargs)
root = alternate_root if alternate_root is not None else urlpath
return FSMap(root, fs, check, create, missing_exceptions=missing_exceptions)
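# A minimal usage sketch (not executed on import): the "memory" protocol is
# used only because it needs no external resources; the key names are
# arbitrary.
def _mapper_example():
    m = get_mapper("memory://demo-store")
    m["a/b"] = b"value"
    print(list(m))  # ['a/b']
    print(m["a/b"])  # b'value'
    del m["a/b"]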

View File

@ -0,0 +1,541 @@
import io
import json
import warnings
from .core import url_to_fs
from .utils import merge_offset_ranges
# Parquet-Specific Utilities for fsspec
#
# Most of the functions defined in this module are NOT
# intended for public consumption. The only exception
# to this is `open_parquet_file`, which should be used
# place of `fs.open()` to open parquet-formatted files
# on remote file systems.
def open_parquet_file(
path,
mode="rb",
fs=None,
metadata=None,
columns=None,
row_groups=None,
storage_options=None,
strict=False,
engine="auto",
max_gap=64_000,
max_block=256_000_000,
footer_sample_size=1_000_000,
**kwargs,
):
"""
Return a file-like object for a single Parquet file.
The specified parquet `engine` will be used to parse the
footer metadata, and determine the required byte ranges
from the file. The target path will then be opened with
the "parts" (`KnownPartsOfAFile`) caching strategy.
Note that this method is intended for usage with remote
file systems, and is unlikely to improve parquet-read
performance on local file systems.
Parameters
----------
path: str
Target file path.
mode: str, optional
Mode option to be passed through to `fs.open`. Default is "rb".
metadata: Any, optional
Parquet metadata object. Object type must be supported
by the backend parquet engine. For now, only the "fastparquet"
engine supports an explicit `ParquetFile` metadata object.
If a metadata object is supplied, the remote footer metadata
will not need to be transferred into local memory.
fs: AbstractFileSystem, optional
Filesystem object to use for opening the file. If nothing is
specified, an `AbstractFileSystem` object will be inferred.
engine : str, default "auto"
Parquet engine to use for metadata parsing. Allowed options
include "fastparquet", "pyarrow", and "auto". The specified
engine must be installed in the current environment. If
"auto" is specified, and both engines are installed,
"fastparquet" will take precedence over "pyarrow".
columns: list, optional
List of all column names that may be read from the file.
row_groups : list, optional
List of all row-groups that may be read from the file. This
may be a list of row-group indices (integers), or it may be
a list of `RowGroup` metadata objects (if the "fastparquet"
engine is used).
storage_options : dict, optional
Used to generate an `AbstractFileSystem` object if `fs` was
not specified.
strict : bool, optional
Whether the resulting `KnownPartsOfAFile` cache should
fetch reads that go beyond a known byte-range boundary.
If `False` (the default), any read that ends outside a
known part will be zero padded. Note that using
`strict=True` may be useful for debugging.
max_gap : int, optional
Neighboring byte ranges will only be merged when their
inter-range gap is <= `max_gap`. Default is 64KB.
max_block : int, optional
Neighboring byte ranges will only be merged when the size of
the aggregated range is <= `max_block`. Default is 256MB.
footer_sample_size : int, optional
Number of bytes to read from the end of the path to look
for the footer metadata. If the sampled bytes do not contain
the footer, a second read request will be required, and
performance will suffer. Default is 1MB.
**kwargs :
Optional key-word arguments to pass to `fs.open`
"""
# Make sure we have an `AbstractFileSystem` object
# to work with
if fs is None:
fs = url_to_fs(path, **(storage_options or {}))[0]
# For now, `columns == []` not supported. Just use
# default `open` command with `path` input
if columns is not None and len(columns) == 0:
return fs.open(path, mode=mode)
# Set the engine
engine = _set_engine(engine)
# Fetch the known byte ranges needed to read
# `columns` and/or `row_groups`
data = _get_parquet_byte_ranges(
[path],
fs,
metadata=metadata,
columns=columns,
row_groups=row_groups,
engine=engine,
max_gap=max_gap,
max_block=max_block,
footer_sample_size=footer_sample_size,
)
# Extract file name from `data`
fn = next(iter(data)) if data else path
# Call self.open with "parts" caching
options = kwargs.pop("cache_options", {}).copy()
return fs.open(
fn,
mode=mode,
cache_type="parts",
cache_options={
**options,
"data": data.get(fn, {}),
"strict": strict,
},
**kwargs,
)
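# A minimal usage sketch (not executed on import); the S3 URL, the column names
# and the use of pandas are placeholders: any remote parquet file and any
# installed engine (fastparquet or pyarrow) would do.
def _open_parquet_example():
    import pandas as pd
    with open_parquet_file(
        "s3://bucket/part.0.parquet",
        columns=["col1", "col2"],
        storage_options={"anon": True},
        engine="auto",
    ) as f:
        return pd.read_parquet(f, columns=["col1", "col2"])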
def _get_parquet_byte_ranges(
paths,
fs,
metadata=None,
columns=None,
row_groups=None,
max_gap=64_000,
max_block=256_000_000,
footer_sample_size=1_000_000,
engine="auto",
):
"""Get a dictionary of the known byte ranges needed
to read a specific column/row-group selection from a
Parquet dataset. Each value in the output dictionary
is intended for use as the `data` argument for the
`KnownPartsOfAFile` caching strategy of a single path.
"""
# Set engine if necessary
if isinstance(engine, str):
engine = _set_engine(engine)
# Pass to specialized function if metadata is defined
if metadata is not None:
# Use the provided parquet metadata object
# to avoid transferring/parsing footer metadata
return _get_parquet_byte_ranges_from_metadata(
metadata,
fs,
engine,
columns=columns,
row_groups=row_groups,
max_gap=max_gap,
max_block=max_block,
)
# Get file sizes asynchronously
file_sizes = fs.sizes(paths)
# Populate global paths, starts, & ends
result = {}
data_paths = []
data_starts = []
data_ends = []
add_header_magic = True
if columns is None and row_groups is None:
# We are NOT selecting specific columns or row-groups.
#
# We can avoid sampling the footers, and just transfer
# all file data with cat_ranges
for i, path in enumerate(paths):
result[path] = {}
for b in range(0, file_sizes[i], max_block):
data_paths.append(path)
data_starts.append(b)
data_ends.append(min(b + max_block, file_sizes[i]))
add_header_magic = False # "Magic" should already be included
else:
# We ARE selecting specific columns or row-groups.
#
# Gather file footers.
# We just take the last `footer_sample_size` bytes of each
# file (or the entire file if it is smaller than that)
footer_starts = []
footer_ends = []
for i, path in enumerate(paths):
footer_ends.append(file_sizes[i])
sample_size = max(0, file_sizes[i] - footer_sample_size)
footer_starts.append(sample_size)
footer_samples = fs.cat_ranges(paths, footer_starts, footer_ends)
# Check our footer samples and re-sample if necessary.
missing_footer_starts = footer_starts.copy()
large_footer = 0
for i, path in enumerate(paths):
footer_size = int.from_bytes(footer_samples[i][-8:-4], "little")
real_footer_start = file_sizes[i] - (footer_size + 8)
if real_footer_start < footer_starts[i]:
missing_footer_starts[i] = real_footer_start
large_footer = max(large_footer, (footer_size + 8))
if large_footer:
warnings.warn(
f"Not enough data was used to sample the parquet footer. "
f"Try setting footer_sample_size >= {large_footer}."
)
for i, block in enumerate(
fs.cat_ranges(
paths,
missing_footer_starts,
footer_starts,
)
):
footer_samples[i] = block + footer_samples[i]
footer_starts[i] = missing_footer_starts[i]
# Calculate required byte ranges for each path
for i, path in enumerate(paths):
# Deal with small-file case.
# Just include all remaining bytes of the file
# in a single range.
if file_sizes[i] < max_block:
if footer_starts[i] > 0:
# Only need to transfer the data if the
# footer sample isn't already the whole file
data_paths.append(path)
data_starts.append(0)
data_ends.append(footer_starts[i])
continue
# Use "engine" to collect data byte ranges
path_data_starts, path_data_ends = engine._parquet_byte_ranges(
columns,
row_groups=row_groups,
footer=footer_samples[i],
footer_start=footer_starts[i],
)
data_paths += [path] * len(path_data_starts)
data_starts += path_data_starts
data_ends += path_data_ends
# Merge adjacent offset ranges
data_paths, data_starts, data_ends = merge_offset_ranges(
data_paths,
data_starts,
data_ends,
max_gap=max_gap,
max_block=max_block,
sort=False, # Should already be sorted
)
# Start by populating `result` with footer samples
for i, path in enumerate(paths):
result[path] = {(footer_starts[i], footer_ends[i]): footer_samples[i]}
# Transfer the data byte-ranges into local memory
_transfer_ranges(fs, result, data_paths, data_starts, data_ends)
# Add b"PAR1" to header if necessary
if add_header_magic:
_add_header_magic(result)
return result
def _get_parquet_byte_ranges_from_metadata(
metadata,
fs,
engine,
columns=None,
row_groups=None,
max_gap=64_000,
max_block=256_000_000,
):
"""Simplified version of `_get_parquet_byte_ranges` for
the case that an engine-specific `metadata` object is
provided, and the remote footer metadata does not need to
be transferred before calculating the required byte ranges.
"""
# Use "engine" to collect data byte ranges
data_paths, data_starts, data_ends = engine._parquet_byte_ranges(
columns,
row_groups=row_groups,
metadata=metadata,
)
# Merge adjacent offset ranges
data_paths, data_starts, data_ends = merge_offset_ranges(
data_paths,
data_starts,
data_ends,
max_gap=max_gap,
max_block=max_block,
sort=False, # Should be sorted
)
# Transfer the data byte-ranges into local memory
result = {fn: {} for fn in list(set(data_paths))}
_transfer_ranges(fs, result, data_paths, data_starts, data_ends)
# Add b"PAR1" to header
_add_header_magic(result)
return result
def _transfer_ranges(fs, blocks, paths, starts, ends):
# Use cat_ranges to gather the data byte_ranges
ranges = (paths, starts, ends)
for path, start, stop, data in zip(*ranges, fs.cat_ranges(*ranges)):
blocks[path][(start, stop)] = data
def _add_header_magic(data):
# Add b"PAR1" to file headers
for path in list(data.keys()):
add_magic = True
for k in data[path].keys():
if k[0] == 0 and k[1] >= 4:
add_magic = False
break
if add_magic:
data[path][(0, 4)] = b"PAR1"
def _set_engine(engine_str):
# Define a list of parquet engines to try
if engine_str == "auto":
try_engines = ("fastparquet", "pyarrow")
elif not isinstance(engine_str, str):
raise ValueError(
"Failed to set parquet engine! "
"Please pass 'fastparquet', 'pyarrow', or 'auto'"
)
elif engine_str not in ("fastparquet", "pyarrow"):
raise ValueError(f"{engine_str} engine not supported by `fsspec.parquet`")
else:
try_engines = [engine_str]
# Try importing the engines in `try_engines`,
# and choose the first one that succeeds
for engine in try_engines:
try:
if engine == "fastparquet":
return FastparquetEngine()
elif engine == "pyarrow":
return PyarrowEngine()
except ImportError:
pass
# Raise an error if a supported parquet engine
# was not found
    raise ImportError(
        f"The following parquet engines are not installed "
        f"in your python environment: {try_engines}. "
        f"Please install 'fastparquet' or 'pyarrow' to "
f"utilize the `fsspec.parquet` module."
)
class FastparquetEngine:
# The purpose of the FastparquetEngine class is
# to check if fastparquet can be imported (on initialization)
# and to define a `_parquet_byte_ranges` method. In the
# future, this class may also be used to define other
# methods/logic that are specific to fastparquet.
def __init__(self):
import fastparquet as fp
self.fp = fp
def _row_group_filename(self, row_group, pf):
return pf.row_group_filename(row_group)
def _parquet_byte_ranges(
self,
columns,
row_groups=None,
metadata=None,
footer=None,
footer_start=None,
):
        # Initialize offset ranges and define ParquetFile metadata
pf = metadata
data_paths, data_starts, data_ends = [], [], []
if pf is None:
pf = self.fp.ParquetFile(io.BytesIO(footer))
# Convert columns to a set and add any index columns
# specified in the pandas metadata (just in case)
column_set = None if columns is None else set(columns)
if column_set is not None and hasattr(pf, "pandas_metadata"):
md_index = [
ind
for ind in pf.pandas_metadata.get("index_columns", [])
# Ignore RangeIndex information
if not isinstance(ind, dict)
]
column_set |= set(md_index)
# Check if row_groups is a list of integers
# or a list of row-group metadata
if row_groups and not isinstance(row_groups[0], int):
# Input row_groups contains row-group metadata
row_group_indices = None
else:
# Input row_groups contains row-group indices
row_group_indices = row_groups
row_groups = pf.row_groups
# Loop through column chunks to add required byte ranges
for r, row_group in enumerate(row_groups):
# Skip this row-group if we are targeting
# specific row-groups
if row_group_indices is None or r in row_group_indices:
# Find the target parquet-file path for `row_group`
fn = self._row_group_filename(row_group, pf)
for column in row_group.columns:
name = column.meta_data.path_in_schema[0]
                    # Skip this column if we are targeting a
                    # specific set of columns
if column_set is None or name in column_set:
file_offset0 = column.meta_data.dictionary_page_offset
if file_offset0 is None:
file_offset0 = column.meta_data.data_page_offset
num_bytes = column.meta_data.total_compressed_size
if footer_start is None or file_offset0 < footer_start:
data_paths.append(fn)
data_starts.append(file_offset0)
data_ends.append(
min(
file_offset0 + num_bytes,
footer_start or (file_offset0 + num_bytes),
)
)
if metadata:
# The metadata in this call may map to multiple
# file paths. Need to include `data_paths`
return data_paths, data_starts, data_ends
return data_starts, data_ends
class PyarrowEngine:
# The purpose of the PyarrowEngine class is
# to check if pyarrow can be imported (on initialization)
# and to define a `_parquet_byte_ranges` method. In the
# future, this class may also be used to define other
# methods/logic that are specific to pyarrow.
def __init__(self):
import pyarrow.parquet as pq
self.pq = pq
def _row_group_filename(self, row_group, metadata):
raise NotImplementedError
def _parquet_byte_ranges(
self,
columns,
row_groups=None,
metadata=None,
footer=None,
footer_start=None,
):
if metadata is not None:
raise ValueError("metadata input not supported for PyarrowEngine")
data_starts, data_ends = [], []
md = self.pq.ParquetFile(io.BytesIO(footer)).metadata
# Convert columns to a set and add any index columns
# specified in the pandas metadata (just in case)
column_set = None if columns is None else set(columns)
if column_set is not None:
schema = md.schema.to_arrow_schema()
has_pandas_metadata = (
schema.metadata is not None and b"pandas" in schema.metadata
)
if has_pandas_metadata:
md_index = [
ind
for ind in json.loads(
schema.metadata[b"pandas"].decode("utf8")
).get("index_columns", [])
# Ignore RangeIndex information
if not isinstance(ind, dict)
]
column_set |= set(md_index)
# Loop through column chunks to add required byte ranges
for r in range(md.num_row_groups):
# Skip this row-group if we are targeting
# specific row-groups
if row_groups is None or r in row_groups:
row_group = md.row_group(r)
for c in range(row_group.num_columns):
column = row_group.column(c)
name = column.path_in_schema
                    # Skip this column if we are targeting a
                    # specific set of columns
split_name = name.split(".")[0]
if (
column_set is None
or name in column_set
or split_name in column_set
):
file_offset0 = column.dictionary_page_offset
if file_offset0 is None:
file_offset0 = column.data_page_offset
num_bytes = column.total_compressed_size
if file_offset0 < footer_start:
data_starts.append(file_offset0)
data_ends.append(
min(file_offset0 + num_bytes, footer_start)
)
return data_starts, data_ends
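# Editor's sketch (illustrative, not part of the original module): how the
# ``_set_engine`` helper above resolves to one of the engine classes. It
# assumes at least one of fastparquet/pyarrow is importable; otherwise the
# ImportError branch in ``_set_engine`` is taken.
if __name__ == "__main__":
    engine = _set_engine("auto")  # FastparquetEngine if available, else PyarrowEngine
    print("selected engine:", type(engine).__name__)
    try:
        _set_engine("polars")  # hypothetical, unsupported engine name
    except ValueError as exc:
        print("rejected:", exc)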

View File

@ -0,0 +1,311 @@
from __future__ import annotations
import importlib
import types
import warnings
__all__ = ["registry", "get_filesystem_class", "default"]
# internal, mutable
_registry: dict[str, type] = {}
# external, immutable
registry = types.MappingProxyType(_registry)
default = "file"
def register_implementation(name, cls, clobber=False, errtxt=None):
"""Add implementation class to the registry
Parameters
----------
name: str
Protocol name to associate with the class
cls: class or str
if a class: fsspec-compliant implementation class (normally inherits from
``fsspec.AbstractFileSystem``, gets added straight to the registry. If a
str, the full path to an implementation class like package.module.class,
which gets added to known_implementations,
so the import is deferred until the filesystem is actually used.
clobber: bool (optional)
Whether to overwrite a protocol with the same name; if False, will raise
instead.
errtxt: str (optional)
If given, then a failure to import the given class will result in this
text being given.
"""
if isinstance(cls, str):
if name in known_implementations and clobber is False:
if cls != known_implementations[name]["class"]:
raise ValueError(
f"Name ({name}) already in the known_implementations and clobber "
f"is False"
)
else:
known_implementations[name] = {
"class": cls,
"err": errtxt or f"{cls} import failed for protocol {name}",
}
else:
if name in registry and clobber is False:
if _registry[name] is not cls:
raise ValueError(
f"Name ({name}) already in the registry and clobber is False"
)
else:
_registry[name] = cls
# protocols mapped to the class which implements them. This dict can be
# updated with register_implementation
known_implementations = {
"abfs": {
"class": "adlfs.AzureBlobFileSystem",
"err": "Install adlfs to access Azure Datalake Gen2 and Azure Blob Storage",
},
"adl": {
"class": "adlfs.AzureDatalakeFileSystem",
"err": "Install adlfs to access Azure Datalake Gen1",
},
"arrow_hdfs": {
"class": "fsspec.implementations.arrow.HadoopFileSystem",
"err": "pyarrow and local java libraries required for HDFS",
},
"asynclocal": {
"class": "morefs.asyn_local.AsyncLocalFileSystem",
"err": "Install 'morefs[asynclocalfs]' to use AsyncLocalFileSystem",
},
"az": {
"class": "adlfs.AzureBlobFileSystem",
"err": "Install adlfs to access Azure Datalake Gen2 and Azure Blob Storage",
},
"blockcache": {"class": "fsspec.implementations.cached.CachingFileSystem"},
"box": {
"class": "boxfs.BoxFileSystem",
"err": "Please install boxfs to access BoxFileSystem",
},
"cached": {"class": "fsspec.implementations.cached.CachingFileSystem"},
"dask": {
"class": "fsspec.implementations.dask.DaskWorkerFileSystem",
"err": "Install dask distributed to access worker file system",
},
"data": {"class": "fsspec.implementations.data.DataFileSystem"},
"dbfs": {
"class": "fsspec.implementations.dbfs.DatabricksFileSystem",
"err": "Install the requests package to use the DatabricksFileSystem",
},
"dir": {"class": "fsspec.implementations.dirfs.DirFileSystem"},
"dropbox": {
"class": "dropboxdrivefs.DropboxDriveFileSystem",
"err": (
            'DropboxFileSystem requires "dropboxdrivefs", "requests" and '
            '"dropbox" to be installed'
),
},
"dvc": {
"class": "dvc.api.DVCFileSystem",
"err": "Install dvc to access DVCFileSystem",
},
"file": {"class": "fsspec.implementations.local.LocalFileSystem"},
"filecache": {"class": "fsspec.implementations.cached.WholeFileCacheFileSystem"},
"ftp": {"class": "fsspec.implementations.ftp.FTPFileSystem"},
"gcs": {
"class": "gcsfs.GCSFileSystem",
"err": "Please install gcsfs to access Google Storage",
},
"gdrive": {
"class": "gdrivefs.GoogleDriveFileSystem",
"err": "Please install gdrivefs for access to Google Drive",
},
"generic": {"class": "fsspec.generic.GenericFileSystem"},
"git": {
"class": "fsspec.implementations.git.GitFileSystem",
"err": "Install pygit2 to browse local git repos",
},
"github": {
"class": "fsspec.implementations.github.GithubFileSystem",
"err": "Install the requests package to use the github FS",
},
"gs": {
"class": "gcsfs.GCSFileSystem",
"err": "Please install gcsfs to access Google Storage",
},
"hdfs": {
"class": "fsspec.implementations.arrow.HadoopFileSystem",
"err": "pyarrow and local java libraries required for HDFS",
},
"hf": {
"class": "huggingface_hub.HfFileSystem",
"err": "Install huggingface_hub to access HfFileSystem",
},
"http": {
"class": "fsspec.implementations.http.HTTPFileSystem",
"err": 'HTTPFileSystem requires "requests" and "aiohttp" to be installed',
},
"https": {
"class": "fsspec.implementations.http.HTTPFileSystem",
"err": 'HTTPFileSystem requires "requests" and "aiohttp" to be installed',
},
"jlab": {
"class": "fsspec.implementations.jupyter.JupyterFileSystem",
"err": "Jupyter FS requires requests to be installed",
},
"jupyter": {
"class": "fsspec.implementations.jupyter.JupyterFileSystem",
"err": "Jupyter FS requires requests to be installed",
},
"lakefs": {
"class": "lakefs_spec.LakeFSFileSystem",
"err": "Please install lakefs-spec to access LakeFSFileSystem",
},
"libarchive": {
"class": "fsspec.implementations.libarchive.LibArchiveFileSystem",
"err": "LibArchive requires to be installed",
},
"local": {"class": "fsspec.implementations.local.LocalFileSystem"},
"memory": {"class": "fsspec.implementations.memory.MemoryFileSystem"},
"oci": {
"class": "ocifs.OCIFileSystem",
"err": "Install ocifs to access OCI Object Storage",
},
"ocilake": {
"class": "ocifs.OCIFileSystem",
"err": "Install ocifs to access OCI Data Lake",
},
"oss": {
"class": "ossfs.OSSFileSystem",
"err": "Install ossfs to access Alibaba Object Storage System",
},
"reference": {"class": "fsspec.implementations.reference.ReferenceFileSystem"},
"root": {
"class": "fsspec_xrootd.XRootDFileSystem",
"err": (
"Install fsspec-xrootd to access xrootd storage system. "
"Note: 'root' is the protocol name for xrootd storage systems, "
"not referring to root directories"
),
},
"s3": {"class": "s3fs.S3FileSystem", "err": "Install s3fs to access S3"},
"s3a": {"class": "s3fs.S3FileSystem", "err": "Install s3fs to access S3"},
"sftp": {
"class": "fsspec.implementations.sftp.SFTPFileSystem",
"err": 'SFTPFileSystem requires "paramiko" to be installed',
},
"simplecache": {"class": "fsspec.implementations.cached.SimpleCacheFileSystem"},
"smb": {
"class": "fsspec.implementations.smb.SMBFileSystem",
"err": 'SMB requires "smbprotocol" or "smbprotocol[kerberos]" installed',
},
"ssh": {
"class": "fsspec.implementations.sftp.SFTPFileSystem",
"err": 'SFTPFileSystem requires "paramiko" to be installed',
},
"tar": {"class": "fsspec.implementations.tar.TarFileSystem"},
"wandb": {"class": "wandbfs.WandbFS", "err": "Install wandbfs to access wandb"},
"webdav": {
"class": "webdav4.fsspec.WebdavFileSystem",
"err": "Install webdav4 to access WebDAV",
},
"webhdfs": {
"class": "fsspec.implementations.webhdfs.WebHDFS",
"err": 'webHDFS access requires "requests" to be installed',
},
"zip": {"class": "fsspec.implementations.zip.ZipFileSystem"},
}
assert list(known_implementations) == sorted(
known_implementations
), "Not in alphabetical order"
def get_filesystem_class(protocol):
"""Fetch named protocol implementation from the registry
The dict ``known_implementations`` maps protocol names to the locations
of classes implementing the corresponding file-system. When used for the
first time, appropriate imports will happen and the class will be placed in
the registry. All subsequent calls will fetch directly from the registry.
Some protocol implementations require additional dependencies, and so the
import may fail. In this case, the string in the "err" field of the
``known_implementations`` will be given as the error message.
"""
if not protocol:
protocol = default
if protocol not in registry:
if protocol not in known_implementations:
raise ValueError(f"Protocol not known: {protocol}")
bit = known_implementations[protocol]
try:
register_implementation(protocol, _import_class(bit["class"]))
except ImportError as e:
raise ImportError(bit["err"]) from e
cls = registry[protocol]
if getattr(cls, "protocol", None) in ("abstract", None):
cls.protocol = protocol
return cls
s3_msg = """Your installed version of s3fs is very old and known to cause
severe performance issues, see also https://github.com/dask/dask/issues/10276
To fix, you should specify a minimum (lower-bound) version for s3fs, or
update the current installation.
"""
def _import_class(fqp: str):
"""Take a fully-qualified path and return the imported class or identifier.
``fqp`` is of the form "package.module.klass" or
"package.module:subobject.klass".
Warnings
--------
This can import arbitrary modules. Make sure you haven't installed any modules
that may execute malicious code at import time.
"""
if ":" in fqp:
mod, name = fqp.rsplit(":", 1)
else:
mod, name = fqp.rsplit(".", 1)
is_s3 = mod == "s3fs"
mod = importlib.import_module(mod)
if is_s3 and mod.__version__.split(".") < ["0", "5"]:
warnings.warn(s3_msg)
for part in name.split("."):
mod = getattr(mod, part)
if not isinstance(mod, type):
raise TypeError(f"{fqp} is not a class")
return mod
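# Editor's sketch (illustrative): the two fully-qualified-path forms accepted
# by ``_import_class`` above, and the lazy import performed the first time a
# protocol is requested from ``get_filesystem_class``. Uses only fsspec itself.
if __name__ == "__main__":
    # "package.module.klass" form
    cls_dot = _import_class("fsspec.implementations.memory.MemoryFileSystem")
    # "package.module:subobject.klass" form (same class via the colon syntax)
    cls_colon = _import_class("fsspec.implementations.memory:MemoryFileSystem")
    assert cls_dot is cls_colon
    # The first call imports and registers the class; later calls hit the
    # registry directly.
    assert get_filesystem_class("memory") is cls_dot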
def filesystem(protocol, **storage_options):
"""Instantiate filesystems for given protocol and arguments
``storage_options`` are specific to the protocol being chosen, and are
passed directly to the class.
"""
if protocol == "arrow_hdfs":
warnings.warn(
"The 'arrow_hdfs' protocol has been deprecated and will be "
"removed in the future. Specify it as 'hdfs'.",
DeprecationWarning,
)
cls = get_filesystem_class(protocol)
return cls(**storage_options)
def available_protocols():
"""Return a list of the implemented protocols.
Note that any given protocol may require extra packages to be importable.
"""
return list(known_implementations)
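# Editor's sketch (illustrative): typical use of the registry API defined in
# this module, assuming only fsspec is installed. The "memdemo" protocol name
# is hypothetical.
if __name__ == "__main__":
    print(len(available_protocols()), "protocols known to fsspec")

    # Instantiate by protocol name; storage_options are passed to the class.
    mem_fs = filesystem("memory")
    mem_fs.pipe("/demo/hello.txt", b"hello")
    print(mem_fs.cat("/demo/hello.txt"))

    # Register a deferred (string) implementation under a new name; the class
    # is only imported when the protocol is first used.
    register_implementation(
        "memdemo",
        "fsspec.implementations.memory.MemoryFileSystem",
        clobber=True,
    )
    print(get_filesystem_class("memdemo"))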

File diff suppressed because it is too large

View File

@ -0,0 +1,287 @@
import os
from hashlib import md5
import pytest
from fsspec.implementations.local import LocalFileSystem
from fsspec.tests.abstract.copy import AbstractCopyTests # noqa: F401
from fsspec.tests.abstract.get import AbstractGetTests # noqa: F401
from fsspec.tests.abstract.put import AbstractPutTests # noqa: F401
class BaseAbstractFixtures:
"""
Abstract base class containing fixtures that are used by but never need to
be overridden in derived filesystem-specific classes to run the abstract
tests on such filesystems.
"""
@pytest.fixture
def fs_bulk_operations_scenario_0(self, fs, fs_join, fs_path):
"""
Scenario on remote filesystem that is used for many cp/get/put tests.
        Cleans up at the end of each test in which it is used.
"""
source = self._bulk_operations_scenario_0(fs, fs_join, fs_path)
yield source
fs.rm(source, recursive=True)
@pytest.fixture
def fs_glob_edge_cases_files(self, fs, fs_join, fs_path):
"""
Scenario on remote filesystem that is used for glob edge cases cp/get/put tests.
        Cleans up at the end of each test in which it is used.
"""
source = self._glob_edge_cases_files(fs, fs_join, fs_path)
yield source
fs.rm(source, recursive=True)
@pytest.fixture
def fs_dir_and_file_with_same_name_prefix(self, fs, fs_join, fs_path):
"""
Scenario on remote filesystem that is used to check cp/get/put on directory
and file with the same name prefixes.
        Cleans up at the end of each test in which it is used.
"""
source = self._dir_and_file_with_same_name_prefix(fs, fs_join, fs_path)
yield source
fs.rm(source, recursive=True)
@pytest.fixture
def fs_10_files_with_hashed_names(self, fs, fs_join, fs_path):
"""
Scenario on remote filesystem that is used to check cp/get/put files order
when source and destination are lists.
        Cleans up at the end of each test in which it is used.
"""
source = self._10_files_with_hashed_names(fs, fs_join, fs_path)
yield source
fs.rm(source, recursive=True)
@pytest.fixture
def fs_target(self, fs, fs_join, fs_path):
"""
Return name of remote directory that does not yet exist to copy into.
Cleans up at the end of each test it which it is used.
"""
target = fs_join(fs_path, "target")
yield target
if fs.exists(target):
fs.rm(target, recursive=True)
@pytest.fixture
def local_bulk_operations_scenario_0(self, local_fs, local_join, local_path):
"""
Scenario on local filesystem that is used for many cp/get/put tests.
        Cleans up at the end of each test in which it is used.
"""
source = self._bulk_operations_scenario_0(local_fs, local_join, local_path)
yield source
local_fs.rm(source, recursive=True)
@pytest.fixture
def local_glob_edge_cases_files(self, local_fs, local_join, local_path):
"""
Scenario on local filesystem that is used for glob edge cases cp/get/put tests.
        Cleans up at the end of each test in which it is used.
"""
source = self._glob_edge_cases_files(local_fs, local_join, local_path)
yield source
local_fs.rm(source, recursive=True)
@pytest.fixture
def local_dir_and_file_with_same_name_prefix(
self, local_fs, local_join, local_path
):
"""
Scenario on local filesystem that is used to check cp/get/put on directory
and file with the same name prefixes.
        Cleans up at the end of each test in which it is used.
"""
source = self._dir_and_file_with_same_name_prefix(
local_fs, local_join, local_path
)
yield source
local_fs.rm(source, recursive=True)
@pytest.fixture
def local_10_files_with_hashed_names(self, local_fs, local_join, local_path):
"""
Scenario on local filesystem that is used to check cp/get/put files order
when source and destination are lists.
        Cleans up at the end of each test in which it is used.
"""
source = self._10_files_with_hashed_names(local_fs, local_join, local_path)
yield source
local_fs.rm(source, recursive=True)
@pytest.fixture
def local_target(self, local_fs, local_join, local_path):
"""
Return name of local directory that does not yet exist to copy into.
Cleans up at the end of each test it which it is used.
"""
target = local_join(local_path, "target")
yield target
if local_fs.exists(target):
local_fs.rm(target, recursive=True)
def _glob_edge_cases_files(self, some_fs, some_join, some_path):
"""
Scenario that is used for glob edge cases cp/get/put tests.
Creates the following directory and file structure:
📁 source
├── 📄 file1
├── 📄 file2
├── 📁 subdir0
│ ├── 📄 subfile1
│ ├── 📄 subfile2
│ └── 📁 nesteddir
│ └── 📄 nestedfile
└── 📁 subdir1
├── 📄 subfile1
├── 📄 subfile2
└── 📁 nesteddir
└── 📄 nestedfile
"""
source = some_join(some_path, "source")
some_fs.touch(some_join(source, "file1"))
some_fs.touch(some_join(source, "file2"))
for subdir_idx in range(2):
subdir = some_join(source, f"subdir{subdir_idx}")
nesteddir = some_join(subdir, "nesteddir")
some_fs.makedirs(nesteddir)
some_fs.touch(some_join(subdir, "subfile1"))
some_fs.touch(some_join(subdir, "subfile2"))
some_fs.touch(some_join(nesteddir, "nestedfile"))
return source
def _bulk_operations_scenario_0(self, some_fs, some_join, some_path):
"""
Scenario that is used for many cp/get/put tests. Creates the following
directory and file structure:
📁 source
├── 📄 file1
├── 📄 file2
└── 📁 subdir
├── 📄 subfile1
├── 📄 subfile2
└── 📁 nesteddir
└── 📄 nestedfile
"""
source = some_join(some_path, "source")
subdir = some_join(source, "subdir")
nesteddir = some_join(subdir, "nesteddir")
some_fs.makedirs(nesteddir)
some_fs.touch(some_join(source, "file1"))
some_fs.touch(some_join(source, "file2"))
some_fs.touch(some_join(subdir, "subfile1"))
some_fs.touch(some_join(subdir, "subfile2"))
some_fs.touch(some_join(nesteddir, "nestedfile"))
return source
def _dir_and_file_with_same_name_prefix(self, some_fs, some_join, some_path):
"""
Scenario that is used to check cp/get/put on directory and file with
the same name prefixes. Creates the following directory and file structure:
📁 source
├── 📄 subdir.txt
└── 📁 subdir
└── 📄 subfile.txt
"""
source = some_join(some_path, "source")
subdir = some_join(source, "subdir")
file = some_join(source, "subdir.txt")
subfile = some_join(subdir, "subfile.txt")
some_fs.makedirs(subdir)
some_fs.touch(file)
some_fs.touch(subfile)
return source
def _10_files_with_hashed_names(self, some_fs, some_join, some_path):
"""
Scenario that is used to check cp/get/put files order when source and
destination are lists. Creates the following directory and file structure:
📁 source
└── 📄 {hashed([0-9])}.txt
"""
source = some_join(some_path, "source")
for i in range(10):
hashed_i = md5(str(i).encode("utf-8")).hexdigest()
path = some_join(source, f"{hashed_i}.txt")
some_fs.pipe(path=path, value=f"{i}".encode("utf-8"))
return source
class AbstractFixtures(BaseAbstractFixtures):
"""
Abstract base class containing fixtures that may be overridden in derived
filesystem-specific classes to run the abstract tests on such filesystems.
For any particular filesystem some of these fixtures must be overridden,
such as ``fs`` and ``fs_path``, and others may be overridden if the
default functions here are not appropriate, such as ``fs_join``.
"""
@pytest.fixture
def fs(self):
raise NotImplementedError("This function must be overridden in derived classes")
@pytest.fixture
def fs_join(self):
"""
Return a function that joins its arguments together into a path.
Most fsspec implementations join paths in a platform-dependent way,
but some will override this to always use a forward slash.
"""
return os.path.join
@pytest.fixture
def fs_path(self):
raise NotImplementedError("This function must be overridden in derived classes")
@pytest.fixture(scope="class")
def local_fs(self):
# Maybe need an option for auto_mkdir=False? This is only relevant
# for certain implementations.
return LocalFileSystem(auto_mkdir=True)
@pytest.fixture
def local_join(self):
"""
Return a function that joins its arguments together into a path, on
the local filesystem.
"""
return os.path.join
@pytest.fixture
def local_path(self, tmpdir):
return tmpdir
@pytest.fixture
def supports_empty_directories(self):
"""
Return whether this implementation supports empty directories.
"""
return True
@pytest.fixture
def fs_sanitize_path(self):
return lambda x: x
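# Editor's sketch (illustrative only; class names are hypothetical and
# underscored so pytest does not collect them): a filesystem-specific test
# module typically subclasses AbstractFixtures to provide ``fs`` and
# ``fs_path``, then mixes in the abstract test classes such as
# AbstractCopyTests.
class _ExampleMemoryFixtures(AbstractFixtures):
    @pytest.fixture
    def fs(self):
        from fsspec.implementations.memory import MemoryFileSystem

        m = MemoryFileSystem()
        m.store.clear()  # MemoryFileSystem state is global; start from a clean store
        return m

    @pytest.fixture
    def fs_path(self):
        return ""

    @pytest.fixture
    def fs_join(self):
        # In-memory paths always use forward slashes
        return lambda *parts: "/".join(parts)


class _ExampleMemoryCopyTests(_ExampleMemoryFixtures, AbstractCopyTests):
    pass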

View File

@ -0,0 +1,175 @@
GLOB_EDGE_CASES_TESTS = {
"argnames": ("path", "recursive", "maxdepth", "expected"),
"argvalues": [
("fil?1", False, None, ["file1"]),
("fil?1", True, None, ["file1"]),
("file[1-2]", False, None, ["file1", "file2"]),
("file[1-2]", True, None, ["file1", "file2"]),
("*", False, None, ["file1", "file2"]),
(
"*",
True,
None,
[
"file1",
"file2",
"subdir0/subfile1",
"subdir0/subfile2",
"subdir0/nesteddir/nestedfile",
"subdir1/subfile1",
"subdir1/subfile2",
"subdir1/nesteddir/nestedfile",
],
),
("*", True, 1, ["file1", "file2"]),
(
"*",
True,
2,
[
"file1",
"file2",
"subdir0/subfile1",
"subdir0/subfile2",
"subdir1/subfile1",
"subdir1/subfile2",
],
),
("*1", False, None, ["file1"]),
(
"*1",
True,
None,
[
"file1",
"subdir1/subfile1",
"subdir1/subfile2",
"subdir1/nesteddir/nestedfile",
],
),
("*1", True, 2, ["file1", "subdir1/subfile1", "subdir1/subfile2"]),
(
"**",
False,
None,
[
"file1",
"file2",
"subdir0/subfile1",
"subdir0/subfile2",
"subdir0/nesteddir/nestedfile",
"subdir1/subfile1",
"subdir1/subfile2",
"subdir1/nesteddir/nestedfile",
],
),
(
"**",
True,
None,
[
"file1",
"file2",
"subdir0/subfile1",
"subdir0/subfile2",
"subdir0/nesteddir/nestedfile",
"subdir1/subfile1",
"subdir1/subfile2",
"subdir1/nesteddir/nestedfile",
],
),
("**", True, 1, ["file1", "file2"]),
(
"**",
True,
2,
[
"file1",
"file2",
"subdir0/subfile1",
"subdir0/subfile2",
"subdir0/nesteddir/nestedfile",
"subdir1/subfile1",
"subdir1/subfile2",
"subdir1/nesteddir/nestedfile",
],
),
(
"**",
False,
2,
[
"file1",
"file2",
"subdir0/subfile1",
"subdir0/subfile2",
"subdir1/subfile1",
"subdir1/subfile2",
],
),
("**/*1", False, None, ["file1", "subdir0/subfile1", "subdir1/subfile1"]),
(
"**/*1",
True,
None,
[
"file1",
"subdir0/subfile1",
"subdir1/subfile1",
"subdir1/subfile2",
"subdir1/nesteddir/nestedfile",
],
),
("**/*1", True, 1, ["file1"]),
(
"**/*1",
True,
2,
["file1", "subdir0/subfile1", "subdir1/subfile1", "subdir1/subfile2"],
),
("**/*1", False, 2, ["file1", "subdir0/subfile1", "subdir1/subfile1"]),
("**/subdir0", False, None, []),
("**/subdir0", True, None, ["subfile1", "subfile2", "nesteddir/nestedfile"]),
("**/subdir0/nested*", False, 2, []),
("**/subdir0/nested*", True, 2, ["nestedfile"]),
("subdir[1-2]", False, None, []),
("subdir[1-2]", True, None, ["subfile1", "subfile2", "nesteddir/nestedfile"]),
("subdir[1-2]", True, 2, ["subfile1", "subfile2"]),
("subdir[0-1]", False, None, []),
(
"subdir[0-1]",
True,
None,
[
"subdir0/subfile1",
"subdir0/subfile2",
"subdir0/nesteddir/nestedfile",
"subdir1/subfile1",
"subdir1/subfile2",
"subdir1/nesteddir/nestedfile",
],
),
(
"subdir[0-1]/*fil[e]*",
False,
None,
[
"subdir0/subfile1",
"subdir0/subfile2",
"subdir1/subfile1",
"subdir1/subfile2",
],
),
(
"subdir[0-1]/*fil[e]*",
True,
None,
[
"subdir0/subfile1",
"subdir0/subfile2",
"subdir1/subfile1",
"subdir1/subfile2",
],
),
],
}
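# Editor's note (illustrative): each row of ``argvalues`` is
# (glob pattern, recursive, maxdepth, expected relative paths); the table is
# consumed via ``pytest.mark.parametrize`` by the copy/get/put test classes in
# this package. A quick way to inspect the first few cases:
if __name__ == "__main__":
    names = GLOB_EDGE_CASES_TESTS["argnames"]
    for row in GLOB_EDGE_CASES_TESTS["argvalues"][:3]:
        print(dict(zip(names, row)))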

View File

@ -0,0 +1,557 @@
from hashlib import md5
from itertools import product
import pytest
from fsspec.tests.abstract.common import GLOB_EDGE_CASES_TESTS
class AbstractCopyTests:
def test_copy_file_to_existing_directory(
self,
fs,
fs_join,
fs_bulk_operations_scenario_0,
fs_target,
supports_empty_directories,
):
# Copy scenario 1a
source = fs_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
if not supports_empty_directories:
# Force target directory to exist by adding a dummy file
fs.touch(fs_join(target, "dummy"))
assert fs.isdir(target)
target_file2 = fs_join(target, "file2")
target_subfile1 = fs_join(target, "subfile1")
# Copy from source directory
fs.cp(fs_join(source, "file2"), target)
assert fs.isfile(target_file2)
# Copy from sub directory
fs.cp(fs_join(source, "subdir", "subfile1"), target)
assert fs.isfile(target_subfile1)
# Remove copied files
fs.rm([target_file2, target_subfile1])
assert not fs.exists(target_file2)
assert not fs.exists(target_subfile1)
# Repeat with trailing slash on target
fs.cp(fs_join(source, "file2"), target + "/")
assert fs.isdir(target)
assert fs.isfile(target_file2)
fs.cp(fs_join(source, "subdir", "subfile1"), target + "/")
assert fs.isfile(target_subfile1)
def test_copy_file_to_new_directory(
self, fs, fs_join, fs_bulk_operations_scenario_0, fs_target
):
# Copy scenario 1b
source = fs_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
fs.cp(
fs_join(source, "subdir", "subfile1"), fs_join(target, "newdir/")
) # Note trailing slash
assert fs.isdir(target)
assert fs.isdir(fs_join(target, "newdir"))
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
def test_copy_file_to_file_in_existing_directory(
self,
fs,
fs_join,
fs_bulk_operations_scenario_0,
fs_target,
supports_empty_directories,
):
# Copy scenario 1c
source = fs_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
if not supports_empty_directories:
# Force target directory to exist by adding a dummy file
fs.touch(fs_join(target, "dummy"))
assert fs.isdir(target)
fs.cp(fs_join(source, "subdir", "subfile1"), fs_join(target, "newfile"))
assert fs.isfile(fs_join(target, "newfile"))
def test_copy_file_to_file_in_new_directory(
self, fs, fs_join, fs_bulk_operations_scenario_0, fs_target
):
# Copy scenario 1d
source = fs_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
fs.cp(
fs_join(source, "subdir", "subfile1"), fs_join(target, "newdir", "newfile")
)
assert fs.isdir(fs_join(target, "newdir"))
assert fs.isfile(fs_join(target, "newdir", "newfile"))
def test_copy_directory_to_existing_directory(
self,
fs,
fs_join,
fs_bulk_operations_scenario_0,
fs_target,
supports_empty_directories,
):
# Copy scenario 1e
source = fs_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
if not supports_empty_directories:
# Force target directory to exist by adding a dummy file
dummy = fs_join(target, "dummy")
fs.touch(dummy)
assert fs.isdir(target)
for source_slash, target_slash in zip([False, True], [False, True]):
s = fs_join(source, "subdir")
if source_slash:
s += "/"
t = target + "/" if target_slash else target
# Without recursive does nothing
fs.cp(s, t)
assert fs.ls(target, detail=False) == (
[] if supports_empty_directories else [dummy]
)
# With recursive
fs.cp(s, t, recursive=True)
if source_slash:
assert fs.isfile(fs_join(target, "subfile1"))
assert fs.isfile(fs_join(target, "subfile2"))
assert fs.isdir(fs_join(target, "nesteddir"))
assert fs.isfile(fs_join(target, "nesteddir", "nestedfile"))
assert not fs.exists(fs_join(target, "subdir"))
fs.rm(
[
fs_join(target, "subfile1"),
fs_join(target, "subfile2"),
fs_join(target, "nesteddir"),
],
recursive=True,
)
else:
assert fs.isdir(fs_join(target, "subdir"))
assert fs.isfile(fs_join(target, "subdir", "subfile1"))
assert fs.isfile(fs_join(target, "subdir", "subfile2"))
assert fs.isdir(fs_join(target, "subdir", "nesteddir"))
assert fs.isfile(fs_join(target, "subdir", "nesteddir", "nestedfile"))
fs.rm(fs_join(target, "subdir"), recursive=True)
assert fs.ls(target, detail=False) == (
[] if supports_empty_directories else [dummy]
)
# Limit recursive by maxdepth
fs.cp(s, t, recursive=True, maxdepth=1)
if source_slash:
assert fs.isfile(fs_join(target, "subfile1"))
assert fs.isfile(fs_join(target, "subfile2"))
assert not fs.exists(fs_join(target, "nesteddir"))
assert not fs.exists(fs_join(target, "subdir"))
fs.rm(
[
fs_join(target, "subfile1"),
fs_join(target, "subfile2"),
],
recursive=True,
)
else:
assert fs.isdir(fs_join(target, "subdir"))
assert fs.isfile(fs_join(target, "subdir", "subfile1"))
assert fs.isfile(fs_join(target, "subdir", "subfile2"))
assert not fs.exists(fs_join(target, "subdir", "nesteddir"))
fs.rm(fs_join(target, "subdir"), recursive=True)
assert fs.ls(target, detail=False) == (
[] if supports_empty_directories else [dummy]
)
def test_copy_directory_to_new_directory(
self,
fs,
fs_join,
fs_bulk_operations_scenario_0,
fs_target,
supports_empty_directories,
):
# Copy scenario 1f
source = fs_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
for source_slash, target_slash in zip([False, True], [False, True]):
s = fs_join(source, "subdir")
if source_slash:
s += "/"
t = fs_join(target, "newdir")
if target_slash:
t += "/"
# Without recursive does nothing
fs.cp(s, t)
if supports_empty_directories:
assert fs.ls(target) == []
else:
with pytest.raises(FileNotFoundError):
fs.ls(target)
# With recursive
fs.cp(s, t, recursive=True)
assert fs.isdir(fs_join(target, "newdir"))
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
assert fs.isfile(fs_join(target, "newdir", "subfile2"))
assert fs.isdir(fs_join(target, "newdir", "nesteddir"))
assert fs.isfile(fs_join(target, "newdir", "nesteddir", "nestedfile"))
assert not fs.exists(fs_join(target, "subdir"))
fs.rm(fs_join(target, "newdir"), recursive=True)
assert not fs.exists(fs_join(target, "newdir"))
# Limit recursive by maxdepth
fs.cp(s, t, recursive=True, maxdepth=1)
assert fs.isdir(fs_join(target, "newdir"))
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
assert fs.isfile(fs_join(target, "newdir", "subfile2"))
assert not fs.exists(fs_join(target, "newdir", "nesteddir"))
assert not fs.exists(fs_join(target, "subdir"))
fs.rm(fs_join(target, "newdir"), recursive=True)
assert not fs.exists(fs_join(target, "newdir"))
def test_copy_glob_to_existing_directory(
self,
fs,
fs_join,
fs_bulk_operations_scenario_0,
fs_target,
supports_empty_directories,
):
# Copy scenario 1g
source = fs_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
if not supports_empty_directories:
# Force target directory to exist by adding a dummy file
dummy = fs_join(target, "dummy")
fs.touch(dummy)
assert fs.isdir(target)
for target_slash in [False, True]:
t = target + "/" if target_slash else target
# Without recursive
fs.cp(fs_join(source, "subdir", "*"), t)
assert fs.isfile(fs_join(target, "subfile1"))
assert fs.isfile(fs_join(target, "subfile2"))
assert not fs.isdir(fs_join(target, "nesteddir"))
assert not fs.exists(fs_join(target, "nesteddir", "nestedfile"))
assert not fs.exists(fs_join(target, "subdir"))
fs.rm(
[
fs_join(target, "subfile1"),
fs_join(target, "subfile2"),
],
recursive=True,
)
assert fs.ls(target, detail=False) == (
[] if supports_empty_directories else [dummy]
)
# With recursive
for glob, recursive in zip(["*", "**"], [True, False]):
fs.cp(fs_join(source, "subdir", glob), t, recursive=recursive)
assert fs.isfile(fs_join(target, "subfile1"))
assert fs.isfile(fs_join(target, "subfile2"))
assert fs.isdir(fs_join(target, "nesteddir"))
assert fs.isfile(fs_join(target, "nesteddir", "nestedfile"))
assert not fs.exists(fs_join(target, "subdir"))
fs.rm(
[
fs_join(target, "subfile1"),
fs_join(target, "subfile2"),
fs_join(target, "nesteddir"),
],
recursive=True,
)
assert fs.ls(target, detail=False) == (
[] if supports_empty_directories else [dummy]
)
# Limit recursive by maxdepth
fs.cp(
fs_join(source, "subdir", glob), t, recursive=recursive, maxdepth=1
)
assert fs.isfile(fs_join(target, "subfile1"))
assert fs.isfile(fs_join(target, "subfile2"))
assert not fs.exists(fs_join(target, "nesteddir"))
assert not fs.exists(fs_join(target, "subdir"))
fs.rm(
[
fs_join(target, "subfile1"),
fs_join(target, "subfile2"),
],
recursive=True,
)
assert fs.ls(target, detail=False) == (
[] if supports_empty_directories else [dummy]
)
def test_copy_glob_to_new_directory(
self, fs, fs_join, fs_bulk_operations_scenario_0, fs_target
):
# Copy scenario 1h
source = fs_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
for target_slash in [False, True]:
t = fs_join(target, "newdir")
if target_slash:
t += "/"
# Without recursive
fs.cp(fs_join(source, "subdir", "*"), t)
assert fs.isdir(fs_join(target, "newdir"))
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
assert fs.isfile(fs_join(target, "newdir", "subfile2"))
assert not fs.exists(fs_join(target, "newdir", "nesteddir"))
assert not fs.exists(fs_join(target, "newdir", "nesteddir", "nestedfile"))
assert not fs.exists(fs_join(target, "subdir"))
assert not fs.exists(fs_join(target, "newdir", "subdir"))
fs.rm(fs_join(target, "newdir"), recursive=True)
assert not fs.exists(fs_join(target, "newdir"))
# With recursive
for glob, recursive in zip(["*", "**"], [True, False]):
fs.cp(fs_join(source, "subdir", glob), t, recursive=recursive)
assert fs.isdir(fs_join(target, "newdir"))
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
assert fs.isfile(fs_join(target, "newdir", "subfile2"))
assert fs.isdir(fs_join(target, "newdir", "nesteddir"))
assert fs.isfile(fs_join(target, "newdir", "nesteddir", "nestedfile"))
assert not fs.exists(fs_join(target, "subdir"))
assert not fs.exists(fs_join(target, "newdir", "subdir"))
fs.rm(fs_join(target, "newdir"), recursive=True)
assert not fs.exists(fs_join(target, "newdir"))
# Limit recursive by maxdepth
fs.cp(
fs_join(source, "subdir", glob), t, recursive=recursive, maxdepth=1
)
assert fs.isdir(fs_join(target, "newdir"))
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
assert fs.isfile(fs_join(target, "newdir", "subfile2"))
assert not fs.exists(fs_join(target, "newdir", "nesteddir"))
assert not fs.exists(fs_join(target, "subdir"))
assert not fs.exists(fs_join(target, "newdir", "subdir"))
fs.rm(fs_join(target, "newdir"), recursive=True)
assert not fs.exists(fs_join(target, "newdir"))
@pytest.mark.parametrize(
GLOB_EDGE_CASES_TESTS["argnames"],
GLOB_EDGE_CASES_TESTS["argvalues"],
)
def test_copy_glob_edge_cases(
self,
path,
recursive,
maxdepth,
expected,
fs,
fs_join,
fs_glob_edge_cases_files,
fs_target,
fs_sanitize_path,
):
# Copy scenario 1g
source = fs_glob_edge_cases_files
target = fs_target
for new_dir, target_slash in product([True, False], [True, False]):
fs.mkdir(target)
t = fs_join(target, "newdir") if new_dir else target
t = t + "/" if target_slash else t
fs.copy(fs_join(source, path), t, recursive=recursive, maxdepth=maxdepth)
output = fs.find(target)
if new_dir:
prefixed_expected = [
fs_sanitize_path(fs_join(target, "newdir", p)) for p in expected
]
else:
prefixed_expected = [
fs_sanitize_path(fs_join(target, p)) for p in expected
]
assert sorted(output) == sorted(prefixed_expected)
try:
fs.rm(target, recursive=True)
except FileNotFoundError:
pass
def test_copy_list_of_files_to_existing_directory(
self,
fs,
fs_join,
fs_bulk_operations_scenario_0,
fs_target,
supports_empty_directories,
):
# Copy scenario 2a
source = fs_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
if not supports_empty_directories:
# Force target directory to exist by adding a dummy file
dummy = fs_join(target, "dummy")
fs.touch(dummy)
assert fs.isdir(target)
source_files = [
fs_join(source, "file1"),
fs_join(source, "file2"),
fs_join(source, "subdir", "subfile1"),
]
for target_slash in [False, True]:
t = target + "/" if target_slash else target
fs.cp(source_files, t)
assert fs.isfile(fs_join(target, "file1"))
assert fs.isfile(fs_join(target, "file2"))
assert fs.isfile(fs_join(target, "subfile1"))
fs.rm(
[
fs_join(target, "file1"),
fs_join(target, "file2"),
fs_join(target, "subfile1"),
],
recursive=True,
)
assert fs.ls(target, detail=False) == (
[] if supports_empty_directories else [dummy]
)
def test_copy_list_of_files_to_new_directory(
self, fs, fs_join, fs_bulk_operations_scenario_0, fs_target
):
# Copy scenario 2b
source = fs_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
source_files = [
fs_join(source, "file1"),
fs_join(source, "file2"),
fs_join(source, "subdir", "subfile1"),
]
fs.cp(source_files, fs_join(target, "newdir") + "/") # Note trailing slash
assert fs.isdir(fs_join(target, "newdir"))
assert fs.isfile(fs_join(target, "newdir", "file1"))
assert fs.isfile(fs_join(target, "newdir", "file2"))
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
def test_copy_two_files_new_directory(
self, fs, fs_join, fs_bulk_operations_scenario_0, fs_target
):
# This is a duplicate of test_copy_list_of_files_to_new_directory and
# can eventually be removed.
source = fs_bulk_operations_scenario_0
target = fs_target
assert not fs.exists(target)
fs.cp([fs_join(source, "file1"), fs_join(source, "file2")], target)
assert fs.isdir(target)
assert fs.isfile(fs_join(target, "file1"))
assert fs.isfile(fs_join(target, "file2"))
def test_copy_directory_without_files_with_same_name_prefix(
self,
fs,
fs_join,
fs_target,
fs_dir_and_file_with_same_name_prefix,
supports_empty_directories,
):
# Create the test dirs
source = fs_dir_and_file_with_same_name_prefix
target = fs_target
# Test without glob
fs.cp(fs_join(source, "subdir"), target, recursive=True)
assert fs.isfile(fs_join(target, "subfile.txt"))
assert not fs.isfile(fs_join(target, "subdir.txt"))
fs.rm([fs_join(target, "subfile.txt")])
if supports_empty_directories:
assert fs.ls(target) == []
else:
assert not fs.exists(target)
# Test with glob
fs.cp(fs_join(source, "subdir*"), target, recursive=True)
assert fs.isdir(fs_join(target, "subdir"))
assert fs.isfile(fs_join(target, "subdir", "subfile.txt"))
assert fs.isfile(fs_join(target, "subdir.txt"))
def test_copy_with_source_and_destination_as_list(
self, fs, fs_target, fs_join, fs_10_files_with_hashed_names
):
# Create the test dir
source = fs_10_files_with_hashed_names
target = fs_target
# Create list of files for source and destination
source_files = []
destination_files = []
for i in range(10):
hashed_i = md5(str(i).encode("utf-8")).hexdigest()
source_files.append(fs_join(source, f"{hashed_i}.txt"))
destination_files.append(fs_join(target, f"{hashed_i}.txt"))
# Copy and assert order was kept
fs.copy(path1=source_files, path2=destination_files)
for i in range(10):
file_content = fs.cat(destination_files[i]).decode("utf-8")
assert file_content == str(i)

View File

@ -0,0 +1,587 @@
from hashlib import md5
from itertools import product
import pytest
from fsspec.implementations.local import make_path_posix
from fsspec.tests.abstract.common import GLOB_EDGE_CASES_TESTS
class AbstractGetTests:
def test_get_file_to_existing_directory(
self,
fs,
fs_join,
fs_bulk_operations_scenario_0,
local_fs,
local_join,
local_target,
):
# Copy scenario 1a
source = fs_bulk_operations_scenario_0
target = local_target
local_fs.mkdir(target)
assert local_fs.isdir(target)
target_file2 = local_join(target, "file2")
target_subfile1 = local_join(target, "subfile1")
# Copy from source directory
fs.get(fs_join(source, "file2"), target)
assert local_fs.isfile(target_file2)
# Copy from sub directory
fs.get(fs_join(source, "subdir", "subfile1"), target)
assert local_fs.isfile(target_subfile1)
# Remove copied files
local_fs.rm([target_file2, target_subfile1])
assert not local_fs.exists(target_file2)
assert not local_fs.exists(target_subfile1)
# Repeat with trailing slash on target
fs.get(fs_join(source, "file2"), target + "/")
assert local_fs.isdir(target)
assert local_fs.isfile(target_file2)
fs.get(fs_join(source, "subdir", "subfile1"), target + "/")
assert local_fs.isfile(target_subfile1)
def test_get_file_to_new_directory(
self,
fs,
fs_join,
fs_bulk_operations_scenario_0,
local_fs,
local_join,
local_target,
):
# Copy scenario 1b
source = fs_bulk_operations_scenario_0
target = local_target
local_fs.mkdir(target)
fs.get(
fs_join(source, "subdir", "subfile1"), local_join(target, "newdir/")
) # Note trailing slash
assert local_fs.isdir(target)
assert local_fs.isdir(local_join(target, "newdir"))
assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
def test_get_file_to_file_in_existing_directory(
self,
fs,
fs_join,
fs_bulk_operations_scenario_0,
local_fs,
local_join,
local_target,
):
# Copy scenario 1c
source = fs_bulk_operations_scenario_0
target = local_target
local_fs.mkdir(target)
fs.get(fs_join(source, "subdir", "subfile1"), local_join(target, "newfile"))
assert local_fs.isfile(local_join(target, "newfile"))
def test_get_file_to_file_in_new_directory(
self,
fs,
fs_join,
fs_bulk_operations_scenario_0,
local_fs,
local_join,
local_target,
):
# Copy scenario 1d
source = fs_bulk_operations_scenario_0
target = local_target
local_fs.mkdir(target)
fs.get(
fs_join(source, "subdir", "subfile1"),
local_join(target, "newdir", "newfile"),
)
assert local_fs.isdir(local_join(target, "newdir"))
assert local_fs.isfile(local_join(target, "newdir", "newfile"))
def test_get_directory_to_existing_directory(
self,
fs,
fs_join,
fs_bulk_operations_scenario_0,
local_fs,
local_join,
local_target,
):
# Copy scenario 1e
source = fs_bulk_operations_scenario_0
target = local_target
local_fs.mkdir(target)
assert local_fs.isdir(target)
for source_slash, target_slash in zip([False, True], [False, True]):
s = fs_join(source, "subdir")
if source_slash:
s += "/"
t = target + "/" if target_slash else target
# Without recursive does nothing
fs.get(s, t)
assert local_fs.ls(target) == []
# With recursive
fs.get(s, t, recursive=True)
if source_slash:
assert local_fs.isfile(local_join(target, "subfile1"))
assert local_fs.isfile(local_join(target, "subfile2"))
assert local_fs.isdir(local_join(target, "nesteddir"))
assert local_fs.isfile(local_join(target, "nesteddir", "nestedfile"))
assert not local_fs.exists(local_join(target, "subdir"))
local_fs.rm(
[
local_join(target, "subfile1"),
local_join(target, "subfile2"),
local_join(target, "nesteddir"),
],
recursive=True,
)
else:
assert local_fs.isdir(local_join(target, "subdir"))
assert local_fs.isfile(local_join(target, "subdir", "subfile1"))
assert local_fs.isfile(local_join(target, "subdir", "subfile2"))
assert local_fs.isdir(local_join(target, "subdir", "nesteddir"))
assert local_fs.isfile(
local_join(target, "subdir", "nesteddir", "nestedfile")
)
local_fs.rm(local_join(target, "subdir"), recursive=True)
assert local_fs.ls(target) == []
# Limit recursive by maxdepth
fs.get(s, t, recursive=True, maxdepth=1)
if source_slash:
assert local_fs.isfile(local_join(target, "subfile1"))
assert local_fs.isfile(local_join(target, "subfile2"))
assert not local_fs.exists(local_join(target, "nesteddir"))
assert not local_fs.exists(local_join(target, "subdir"))
local_fs.rm(
[
local_join(target, "subfile1"),
local_join(target, "subfile2"),
],
recursive=True,
)
else:
assert local_fs.isdir(local_join(target, "subdir"))
assert local_fs.isfile(local_join(target, "subdir", "subfile1"))
assert local_fs.isfile(local_join(target, "subdir", "subfile2"))
assert not local_fs.exists(local_join(target, "subdir", "nesteddir"))
local_fs.rm(local_join(target, "subdir"), recursive=True)
assert local_fs.ls(target) == []
def test_get_directory_to_new_directory(
self,
fs,
fs_join,
fs_bulk_operations_scenario_0,
local_fs,
local_join,
local_target,
):
# Copy scenario 1f
source = fs_bulk_operations_scenario_0
target = local_target
local_fs.mkdir(target)
for source_slash, target_slash in zip([False, True], [False, True]):
s = fs_join(source, "subdir")
if source_slash:
s += "/"
t = local_join(target, "newdir")
if target_slash:
t += "/"
# Without recursive does nothing
fs.get(s, t)
assert local_fs.ls(target) == []
# With recursive
fs.get(s, t, recursive=True)
assert local_fs.isdir(local_join(target, "newdir"))
assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
assert local_fs.isfile(local_join(target, "newdir", "subfile2"))
assert local_fs.isdir(local_join(target, "newdir", "nesteddir"))
assert local_fs.isfile(
local_join(target, "newdir", "nesteddir", "nestedfile")
)
assert not local_fs.exists(local_join(target, "subdir"))
local_fs.rm(local_join(target, "newdir"), recursive=True)
assert local_fs.ls(target) == []
# Limit recursive by maxdepth
fs.get(s, t, recursive=True, maxdepth=1)
assert local_fs.isdir(local_join(target, "newdir"))
assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
assert local_fs.isfile(local_join(target, "newdir", "subfile2"))
assert not local_fs.exists(local_join(target, "newdir", "nesteddir"))
assert not local_fs.exists(local_join(target, "subdir"))
local_fs.rm(local_join(target, "newdir"), recursive=True)
assert not local_fs.exists(local_join(target, "newdir"))
def test_get_glob_to_existing_directory(
self,
fs,
fs_join,
fs_bulk_operations_scenario_0,
local_fs,
local_join,
local_target,
):
# Copy scenario 1g
source = fs_bulk_operations_scenario_0
target = local_target
local_fs.mkdir(target)
for target_slash in [False, True]:
t = target + "/" if target_slash else target
# Without recursive
fs.get(fs_join(source, "subdir", "*"), t)
assert local_fs.isfile(local_join(target, "subfile1"))
assert local_fs.isfile(local_join(target, "subfile2"))
assert not local_fs.isdir(local_join(target, "nesteddir"))
assert not local_fs.exists(local_join(target, "nesteddir", "nestedfile"))
assert not local_fs.exists(local_join(target, "subdir"))
local_fs.rm(
[
local_join(target, "subfile1"),
local_join(target, "subfile2"),
],
recursive=True,
)
assert local_fs.ls(target) == []
# With recursive
for glob, recursive in zip(["*", "**"], [True, False]):
fs.get(fs_join(source, "subdir", glob), t, recursive=recursive)
assert local_fs.isfile(local_join(target, "subfile1"))
assert local_fs.isfile(local_join(target, "subfile2"))
assert local_fs.isdir(local_join(target, "nesteddir"))
assert local_fs.isfile(local_join(target, "nesteddir", "nestedfile"))
assert not local_fs.exists(local_join(target, "subdir"))
local_fs.rm(
[
local_join(target, "subfile1"),
local_join(target, "subfile2"),
local_join(target, "nesteddir"),
],
recursive=True,
)
assert local_fs.ls(target) == []
# Limit recursive by maxdepth
fs.get(
fs_join(source, "subdir", glob), t, recursive=recursive, maxdepth=1
)
assert local_fs.isfile(local_join(target, "subfile1"))
assert local_fs.isfile(local_join(target, "subfile2"))
assert not local_fs.exists(local_join(target, "nesteddir"))
assert not local_fs.exists(local_join(target, "subdir"))
local_fs.rm(
[
local_join(target, "subfile1"),
local_join(target, "subfile2"),
],
recursive=True,
)
assert local_fs.ls(target) == []
def test_get_glob_to_new_directory(
self,
fs,
fs_join,
fs_bulk_operations_scenario_0,
local_fs,
local_join,
local_target,
):
# Copy scenario 1h
source = fs_bulk_operations_scenario_0
target = local_target
local_fs.mkdir(target)
for target_slash in [False, True]:
t = fs_join(target, "newdir")
if target_slash:
t += "/"
# Without recursive
fs.get(fs_join(source, "subdir", "*"), t)
assert local_fs.isdir(local_join(target, "newdir"))
assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
assert local_fs.isfile(local_join(target, "newdir", "subfile2"))
assert not local_fs.exists(local_join(target, "newdir", "nesteddir"))
assert not local_fs.exists(
local_join(target, "newdir", "nesteddir", "nestedfile")
)
assert not local_fs.exists(local_join(target, "subdir"))
assert not local_fs.exists(local_join(target, "newdir", "subdir"))
local_fs.rm(local_join(target, "newdir"), recursive=True)
assert local_fs.ls(target) == []
# With recursive
for glob, recursive in zip(["*", "**"], [True, False]):
fs.get(fs_join(source, "subdir", glob), t, recursive=recursive)
assert local_fs.isdir(local_join(target, "newdir"))
assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
assert local_fs.isfile(local_join(target, "newdir", "subfile2"))
assert local_fs.isdir(local_join(target, "newdir", "nesteddir"))
assert local_fs.isfile(
local_join(target, "newdir", "nesteddir", "nestedfile")
)
assert not local_fs.exists(local_join(target, "subdir"))
assert not local_fs.exists(local_join(target, "newdir", "subdir"))
local_fs.rm(local_join(target, "newdir"), recursive=True)
assert not local_fs.exists(local_join(target, "newdir"))
# Limit recursive by maxdepth
fs.get(
fs_join(source, "subdir", glob), t, recursive=recursive, maxdepth=1
)
assert local_fs.isdir(local_join(target, "newdir"))
assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
assert local_fs.isfile(local_join(target, "newdir", "subfile2"))
assert not local_fs.exists(local_join(target, "newdir", "nesteddir"))
assert not local_fs.exists(local_join(target, "subdir"))
assert not local_fs.exists(local_join(target, "newdir", "subdir"))
local_fs.rm(local_fs.ls(target, detail=False), recursive=True)
assert not local_fs.exists(local_join(target, "newdir"))
@pytest.mark.parametrize(
GLOB_EDGE_CASES_TESTS["argnames"],
GLOB_EDGE_CASES_TESTS["argvalues"],
)
def test_get_glob_edge_cases(
self,
path,
recursive,
maxdepth,
expected,
fs,
fs_join,
fs_glob_edge_cases_files,
local_fs,
local_join,
local_target,
):
# Copy scenario 1g
source = fs_glob_edge_cases_files
target = local_target
for new_dir, target_slash in product([True, False], [True, False]):
local_fs.mkdir(target)
t = local_join(target, "newdir") if new_dir else target
t = t + "/" if target_slash else t
fs.get(fs_join(source, path), t, recursive=recursive, maxdepth=maxdepth)
output = local_fs.find(target)
if new_dir:
prefixed_expected = [
make_path_posix(local_join(target, "newdir", p)) for p in expected
]
else:
prefixed_expected = [
make_path_posix(local_join(target, p)) for p in expected
]
assert sorted(output) == sorted(prefixed_expected)
try:
local_fs.rm(target, recursive=True)
except FileNotFoundError:
pass
def test_get_list_of_files_to_existing_directory(
self,
fs,
fs_join,
fs_bulk_operations_scenario_0,
local_fs,
local_join,
local_target,
):
# Copy scenario 2a
source = fs_bulk_operations_scenario_0
target = local_target
local_fs.mkdir(target)
source_files = [
fs_join(source, "file1"),
fs_join(source, "file2"),
fs_join(source, "subdir", "subfile1"),
]
for target_slash in [False, True]:
t = target + "/" if target_slash else target
fs.get(source_files, t)
assert local_fs.isfile(local_join(target, "file1"))
assert local_fs.isfile(local_join(target, "file2"))
assert local_fs.isfile(local_join(target, "subfile1"))
local_fs.rm(
[
local_join(target, "file1"),
local_join(target, "file2"),
local_join(target, "subfile1"),
],
recursive=True,
)
assert local_fs.ls(target) == []
def test_get_list_of_files_to_new_directory(
self,
fs,
fs_join,
fs_bulk_operations_scenario_0,
local_fs,
local_join,
local_target,
):
# Copy scenario 2b
source = fs_bulk_operations_scenario_0
target = local_target
local_fs.mkdir(target)
source_files = [
fs_join(source, "file1"),
fs_join(source, "file2"),
fs_join(source, "subdir", "subfile1"),
]
fs.get(source_files, local_join(target, "newdir") + "/") # Note trailing slash
assert local_fs.isdir(local_join(target, "newdir"))
assert local_fs.isfile(local_join(target, "newdir", "file1"))
assert local_fs.isfile(local_join(target, "newdir", "file2"))
assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
def test_get_directory_recursive(
self, fs, fs_join, fs_path, local_fs, local_join, local_target
):
# https://github.com/fsspec/filesystem_spec/issues/1062
# Recursive cp/get/put of source directory into non-existent target directory.
src = fs_join(fs_path, "src")
src_file = fs_join(src, "file")
fs.mkdir(src)
fs.touch(src_file)
target = local_target
# get without slash
assert not local_fs.exists(target)
for loop in range(2):
fs.get(src, target, recursive=True)
assert local_fs.isdir(target)
if loop == 0:
assert local_fs.isfile(local_join(target, "file"))
assert not local_fs.exists(local_join(target, "src"))
else:
assert local_fs.isfile(local_join(target, "file"))
assert local_fs.isdir(local_join(target, "src"))
assert local_fs.isfile(local_join(target, "src", "file"))
local_fs.rm(target, recursive=True)
# get with slash
assert not local_fs.exists(target)
for loop in range(2):
fs.get(src + "/", target, recursive=True)
assert local_fs.isdir(target)
assert local_fs.isfile(local_join(target, "file"))
assert not local_fs.exists(local_join(target, "src"))
def test_get_directory_without_files_with_same_name_prefix(
self,
fs,
fs_join,
local_fs,
local_join,
local_target,
fs_dir_and_file_with_same_name_prefix,
):
# Create the test dirs
source = fs_dir_and_file_with_same_name_prefix
target = local_target
# Test without glob
fs.get(fs_join(source, "subdir"), target, recursive=True)
assert local_fs.isfile(local_join(target, "subfile.txt"))
assert not local_fs.isfile(local_join(target, "subdir.txt"))
local_fs.rm([local_join(target, "subfile.txt")])
assert local_fs.ls(target) == []
# Test with glob
fs.get(fs_join(source, "subdir*"), target, recursive=True)
assert local_fs.isdir(local_join(target, "subdir"))
assert local_fs.isfile(local_join(target, "subdir", "subfile.txt"))
assert local_fs.isfile(local_join(target, "subdir.txt"))
def test_get_with_source_and_destination_as_list(
self,
fs,
fs_join,
local_fs,
local_join,
local_target,
fs_10_files_with_hashed_names,
):
# Create the test dir
source = fs_10_files_with_hashed_names
target = local_target
# Create list of files for source and destination
source_files = []
destination_files = []
for i in range(10):
hashed_i = md5(str(i).encode("utf-8")).hexdigest()
source_files.append(fs_join(source, f"{hashed_i}.txt"))
destination_files.append(
make_path_posix(local_join(target, f"{hashed_i}.txt"))
)
# Copy and assert order was kept
fs.get(rpath=source_files, lpath=destination_files)
for i in range(10):
file_content = local_fs.cat(destination_files[i]).decode("utf-8")
assert file_content == str(i)

View File

@ -0,0 +1,57 @@
import os
import pytest
import fsspec
def test_move_raises_error_with_tmpdir(tmpdir):
# Create a file in the temporary directory
source = tmpdir.join("source_file.txt")
source.write("content")
# Define a destination that simulates a protected or invalid path
destination = tmpdir.join("non_existent_directory/destination_file.txt")
# Instantiate the filesystem (assuming the local file system interface)
fs = fsspec.filesystem("file")
# Use the actual file paths as string
with pytest.raises(FileNotFoundError):
fs.mv(str(source), str(destination))
@pytest.mark.parametrize("recursive", (True, False))
def test_move_raises_error_with_tmpdir_permission(recursive, tmpdir):
# Create a file in the temporary directory
source = tmpdir.join("source_file.txt")
source.write("content")
# Create a protected directory (non-writable)
protected_dir = tmpdir.mkdir("protected_directory")
protected_path = str(protected_dir)
# Set the directory to read-only
if os.name == "nt":
os.system(f'icacls "{protected_path}" /deny Everyone:(W)')
else:
os.chmod(protected_path, 0o555) # Sets the directory to read-only
# Define a destination inside the protected directory
destination = protected_dir.join("destination_file.txt")
# Instantiate the filesystem (assuming the local file system interface)
fs = fsspec.filesystem("file")
# Try to move the file to the read-only directory, expecting a permission error
with pytest.raises(PermissionError):
fs.mv(str(source), str(destination), recursive=recursive)
# Assert the file was not created in the destination
assert not os.path.exists(destination)
# Cleanup: Restore permissions so the directory can be cleaned up
if os.name == "nt":
os.system(f'icacls "{protected_path}" /remove:d Everyone')
else:
os.chmod(protected_path, 0o755) # Restore write permission for cleanup

View File

@ -0,0 +1,591 @@
from hashlib import md5
from itertools import product

import pytest

from fsspec.tests.abstract.common import GLOB_EDGE_CASES_TESTS


class AbstractPutTests:
def test_put_file_to_existing_directory(
self,
fs,
fs_join,
fs_target,
local_join,
local_bulk_operations_scenario_0,
supports_empty_directories,
):
# Copy scenario 1a
source = local_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
if not supports_empty_directories:
# Force target directory to exist by adding a dummy file
fs.touch(fs_join(target, "dummy"))
assert fs.isdir(target)
target_file2 = fs_join(target, "file2")
target_subfile1 = fs_join(target, "subfile1")
# Copy from source directory
fs.put(local_join(source, "file2"), target)
assert fs.isfile(target_file2)
# Copy from sub directory
fs.put(local_join(source, "subdir", "subfile1"), target)
assert fs.isfile(target_subfile1)
# Remove copied files
fs.rm([target_file2, target_subfile1])
assert not fs.exists(target_file2)
assert not fs.exists(target_subfile1)
# Repeat with trailing slash on target
fs.put(local_join(source, "file2"), target + "/")
assert fs.isdir(target)
assert fs.isfile(target_file2)
fs.put(local_join(source, "subdir", "subfile1"), target + "/")
assert fs.isfile(target_subfile1)
def test_put_file_to_new_directory(
self, fs, fs_join, fs_target, local_join, local_bulk_operations_scenario_0
):
# Copy scenario 1b
source = local_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
fs.put(
local_join(source, "subdir", "subfile1"), fs_join(target, "newdir/")
) # Note trailing slash
assert fs.isdir(target)
assert fs.isdir(fs_join(target, "newdir"))
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
def test_put_file_to_file_in_existing_directory(
self,
fs,
fs_join,
fs_target,
local_join,
supports_empty_directories,
local_bulk_operations_scenario_0,
):
# Copy scenario 1c
source = local_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
if not supports_empty_directories:
# Force target directory to exist by adding a dummy file
fs.touch(fs_join(target, "dummy"))
assert fs.isdir(target)
fs.put(local_join(source, "subdir", "subfile1"), fs_join(target, "newfile"))
assert fs.isfile(fs_join(target, "newfile"))
def test_put_file_to_file_in_new_directory(
self, fs, fs_join, fs_target, local_join, local_bulk_operations_scenario_0
):
# Copy scenario 1d
source = local_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
fs.put(
local_join(source, "subdir", "subfile1"),
fs_join(target, "newdir", "newfile"),
)
assert fs.isdir(fs_join(target, "newdir"))
assert fs.isfile(fs_join(target, "newdir", "newfile"))
def test_put_directory_to_existing_directory(
self,
fs,
fs_join,
fs_target,
local_bulk_operations_scenario_0,
supports_empty_directories,
):
# Copy scenario 1e
source = local_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
if not supports_empty_directories:
# Force target directory to exist by adding a dummy file
dummy = fs_join(target, "dummy")
fs.touch(dummy)
assert fs.isdir(target)
for source_slash, target_slash in zip([False, True], [False, True]):
s = fs_join(source, "subdir")
if source_slash:
s += "/"
t = target + "/" if target_slash else target
# Without recursive does nothing
fs.put(s, t)
assert fs.ls(target, detail=False) == (
[] if supports_empty_directories else [dummy]
)
# With recursive
fs.put(s, t, recursive=True)
if source_slash:
assert fs.isfile(fs_join(target, "subfile1"))
assert fs.isfile(fs_join(target, "subfile2"))
assert fs.isdir(fs_join(target, "nesteddir"))
assert fs.isfile(fs_join(target, "nesteddir", "nestedfile"))
assert not fs.exists(fs_join(target, "subdir"))
fs.rm(
[
fs_join(target, "subfile1"),
fs_join(target, "subfile2"),
fs_join(target, "nesteddir"),
],
recursive=True,
)
else:
assert fs.isdir(fs_join(target, "subdir"))
assert fs.isfile(fs_join(target, "subdir", "subfile1"))
assert fs.isfile(fs_join(target, "subdir", "subfile2"))
assert fs.isdir(fs_join(target, "subdir", "nesteddir"))
assert fs.isfile(fs_join(target, "subdir", "nesteddir", "nestedfile"))
fs.rm(fs_join(target, "subdir"), recursive=True)
assert fs.ls(target, detail=False) == (
[] if supports_empty_directories else [dummy]
)
# Limit recursive by maxdepth
fs.put(s, t, recursive=True, maxdepth=1)
if source_slash:
assert fs.isfile(fs_join(target, "subfile1"))
assert fs.isfile(fs_join(target, "subfile2"))
assert not fs.exists(fs_join(target, "nesteddir"))
assert not fs.exists(fs_join(target, "subdir"))
fs.rm(
[
fs_join(target, "subfile1"),
fs_join(target, "subfile2"),
],
recursive=True,
)
else:
assert fs.isdir(fs_join(target, "subdir"))
assert fs.isfile(fs_join(target, "subdir", "subfile1"))
assert fs.isfile(fs_join(target, "subdir", "subfile2"))
assert not fs.exists(fs_join(target, "subdir", "nesteddir"))
fs.rm(fs_join(target, "subdir"), recursive=True)
assert fs.ls(target, detail=False) == (
[] if supports_empty_directories else [dummy]
)
def test_put_directory_to_new_directory(
self,
fs,
fs_join,
fs_target,
local_bulk_operations_scenario_0,
supports_empty_directories,
):
# Copy scenario 1f
source = local_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
for source_slash, target_slash in zip([False, True], [False, True]):
s = fs_join(source, "subdir")
if source_slash:
s += "/"
t = fs_join(target, "newdir")
if target_slash:
t += "/"
# Without recursive does nothing
fs.put(s, t)
if supports_empty_directories:
assert fs.ls(target) == []
else:
with pytest.raises(FileNotFoundError):
fs.ls(target)
# With recursive
fs.put(s, t, recursive=True)
assert fs.isdir(fs_join(target, "newdir"))
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
assert fs.isfile(fs_join(target, "newdir", "subfile2"))
assert fs.isdir(fs_join(target, "newdir", "nesteddir"))
assert fs.isfile(fs_join(target, "newdir", "nesteddir", "nestedfile"))
assert not fs.exists(fs_join(target, "subdir"))
fs.rm(fs_join(target, "newdir"), recursive=True)
assert not fs.exists(fs_join(target, "newdir"))
# Limit recursive by maxdepth
fs.put(s, t, recursive=True, maxdepth=1)
assert fs.isdir(fs_join(target, "newdir"))
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
assert fs.isfile(fs_join(target, "newdir", "subfile2"))
assert not fs.exists(fs_join(target, "newdir", "nesteddir"))
assert not fs.exists(fs_join(target, "subdir"))
fs.rm(fs_join(target, "newdir"), recursive=True)
assert not fs.exists(fs_join(target, "newdir"))
def test_put_glob_to_existing_directory(
self,
fs,
fs_join,
fs_target,
local_join,
supports_empty_directories,
local_bulk_operations_scenario_0,
):
# Copy scenario 1g
source = local_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
if not supports_empty_directories:
# Force target directory to exist by adding a dummy file
dummy = fs_join(target, "dummy")
fs.touch(dummy)
assert fs.isdir(target)
for target_slash in [False, True]:
t = target + "/" if target_slash else target
# Without recursive
fs.put(local_join(source, "subdir", "*"), t)
assert fs.isfile(fs_join(target, "subfile1"))
assert fs.isfile(fs_join(target, "subfile2"))
assert not fs.isdir(fs_join(target, "nesteddir"))
assert not fs.exists(fs_join(target, "nesteddir", "nestedfile"))
assert not fs.exists(fs_join(target, "subdir"))
fs.rm(
[
fs_join(target, "subfile1"),
fs_join(target, "subfile2"),
],
recursive=True,
)
assert fs.ls(target, detail=False) == (
[] if supports_empty_directories else [dummy]
)
# With recursive
for glob, recursive in zip(["*", "**"], [True, False]):
fs.put(local_join(source, "subdir", glob), t, recursive=recursive)
assert fs.isfile(fs_join(target, "subfile1"))
assert fs.isfile(fs_join(target, "subfile2"))
assert fs.isdir(fs_join(target, "nesteddir"))
assert fs.isfile(fs_join(target, "nesteddir", "nestedfile"))
assert not fs.exists(fs_join(target, "subdir"))
fs.rm(
[
fs_join(target, "subfile1"),
fs_join(target, "subfile2"),
fs_join(target, "nesteddir"),
],
recursive=True,
)
assert fs.ls(target, detail=False) == (
[] if supports_empty_directories else [dummy]
)
# Limit recursive by maxdepth
fs.put(
local_join(source, "subdir", glob),
t,
recursive=recursive,
maxdepth=1,
)
assert fs.isfile(fs_join(target, "subfile1"))
assert fs.isfile(fs_join(target, "subfile2"))
assert not fs.exists(fs_join(target, "nesteddir"))
assert not fs.exists(fs_join(target, "subdir"))
fs.rm(
[
fs_join(target, "subfile1"),
fs_join(target, "subfile2"),
],
recursive=True,
)
assert fs.ls(target, detail=False) == (
[] if supports_empty_directories else [dummy]
)
def test_put_glob_to_new_directory(
self, fs, fs_join, fs_target, local_join, local_bulk_operations_scenario_0
):
# Copy scenario 1h
source = local_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
for target_slash in [False, True]:
t = fs_join(target, "newdir")
if target_slash:
t += "/"
# Without recursive
fs.put(local_join(source, "subdir", "*"), t)
assert fs.isdir(fs_join(target, "newdir"))
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
assert fs.isfile(fs_join(target, "newdir", "subfile2"))
assert not fs.exists(fs_join(target, "newdir", "nesteddir"))
assert not fs.exists(fs_join(target, "newdir", "nesteddir", "nestedfile"))
assert not fs.exists(fs_join(target, "subdir"))
assert not fs.exists(fs_join(target, "newdir", "subdir"))
fs.rm(fs_join(target, "newdir"), recursive=True)
assert not fs.exists(fs_join(target, "newdir"))
# With recursive
for glob, recursive in zip(["*", "**"], [True, False]):
fs.put(local_join(source, "subdir", glob), t, recursive=recursive)
assert fs.isdir(fs_join(target, "newdir"))
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
assert fs.isfile(fs_join(target, "newdir", "subfile2"))
assert fs.isdir(fs_join(target, "newdir", "nesteddir"))
assert fs.isfile(fs_join(target, "newdir", "nesteddir", "nestedfile"))
assert not fs.exists(fs_join(target, "subdir"))
assert not fs.exists(fs_join(target, "newdir", "subdir"))
fs.rm(fs_join(target, "newdir"), recursive=True)
assert not fs.exists(fs_join(target, "newdir"))
# Limit recursive by maxdepth
fs.put(
local_join(source, "subdir", glob),
t,
recursive=recursive,
maxdepth=1,
)
assert fs.isdir(fs_join(target, "newdir"))
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
assert fs.isfile(fs_join(target, "newdir", "subfile2"))
assert not fs.exists(fs_join(target, "newdir", "nesteddir"))
assert not fs.exists(fs_join(target, "subdir"))
assert not fs.exists(fs_join(target, "newdir", "subdir"))
fs.rm(fs_join(target, "newdir"), recursive=True)
assert not fs.exists(fs_join(target, "newdir"))
@pytest.mark.parametrize(
GLOB_EDGE_CASES_TESTS["argnames"],
GLOB_EDGE_CASES_TESTS["argvalues"],
)
def test_put_glob_edge_cases(
self,
path,
recursive,
maxdepth,
expected,
fs,
fs_join,
fs_target,
local_glob_edge_cases_files,
local_join,
fs_sanitize_path,
):
# Copy scenario 1g
source = local_glob_edge_cases_files
target = fs_target
for new_dir, target_slash in product([True, False], [True, False]):
fs.mkdir(target)
t = fs_join(target, "newdir") if new_dir else target
t = t + "/" if target_slash else t
fs.put(local_join(source, path), t, recursive=recursive, maxdepth=maxdepth)
output = fs.find(target)
if new_dir:
prefixed_expected = [
fs_sanitize_path(fs_join(target, "newdir", p)) for p in expected
]
else:
prefixed_expected = [
fs_sanitize_path(fs_join(target, p)) for p in expected
]
assert sorted(output) == sorted(prefixed_expected)
try:
fs.rm(target, recursive=True)
except FileNotFoundError:
pass
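    # GLOB_EDGE_CASES_TESTS (from fsspec.tests.abstract.common) is consumed by
    # the parametrize decorator above, so it is a mapping with "argnames" and
    # "argvalues" keys matching the (path, recursive, maxdepth, expected)
    # signature of the test. The values below are illustrative only, not the
    # real table:
    #
    #     GLOB_EDGE_CASES_TESTS = {
    #         "argnames": ("path", "recursive", "maxdepth", "expected"),
    #         "argvalues": [
    #             ("subdir/subfile1.txt", False, None, ["subfile1.txt"]),
    #             ("subdir/*", True, 1, ["subfile1.txt", "subfile2.txt"]),
    #         ],
    #     }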
def test_put_list_of_files_to_existing_directory(
self,
fs,
fs_join,
fs_target,
local_join,
local_bulk_operations_scenario_0,
supports_empty_directories,
):
# Copy scenario 2a
source = local_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
if not supports_empty_directories:
# Force target directory to exist by adding a dummy file
dummy = fs_join(target, "dummy")
fs.touch(dummy)
assert fs.isdir(target)
source_files = [
local_join(source, "file1"),
local_join(source, "file2"),
local_join(source, "subdir", "subfile1"),
]
for target_slash in [False, True]:
t = target + "/" if target_slash else target
fs.put(source_files, t)
assert fs.isfile(fs_join(target, "file1"))
assert fs.isfile(fs_join(target, "file2"))
assert fs.isfile(fs_join(target, "subfile1"))
fs.rm(
[
fs_join(target, "file1"),
fs_join(target, "file2"),
fs_join(target, "subfile1"),
],
recursive=True,
)
assert fs.ls(target, detail=False) == (
[] if supports_empty_directories else [dummy]
)
def test_put_list_of_files_to_new_directory(
self, fs, fs_join, fs_target, local_join, local_bulk_operations_scenario_0
):
# Copy scenario 2b
source = local_bulk_operations_scenario_0
target = fs_target
fs.mkdir(target)
source_files = [
local_join(source, "file1"),
local_join(source, "file2"),
local_join(source, "subdir", "subfile1"),
]
fs.put(source_files, fs_join(target, "newdir") + "/") # Note trailing slash
assert fs.isdir(fs_join(target, "newdir"))
assert fs.isfile(fs_join(target, "newdir", "file1"))
assert fs.isfile(fs_join(target, "newdir", "file2"))
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
def test_put_directory_recursive(
self, fs, fs_join, fs_target, local_fs, local_join, local_path
):
# https://github.com/fsspec/filesystem_spec/issues/1062
# Recursive cp/get/put of source directory into non-existent target directory.
src = local_join(local_path, "src")
src_file = local_join(src, "file")
local_fs.mkdir(src)
local_fs.touch(src_file)
target = fs_target
# put without slash
assert not fs.exists(target)
for loop in range(2):
fs.put(src, target, recursive=True)
assert fs.isdir(target)
if loop == 0:
assert fs.isfile(fs_join(target, "file"))
assert not fs.exists(fs_join(target, "src"))
else:
assert fs.isfile(fs_join(target, "file"))
assert fs.isdir(fs_join(target, "src"))
assert fs.isfile(fs_join(target, "src", "file"))
fs.rm(target, recursive=True)
# put with slash
assert not fs.exists(target)
for loop in range(2):
fs.put(src + "/", target, recursive=True)
assert fs.isdir(target)
assert fs.isfile(fs_join(target, "file"))
assert not fs.exists(fs_join(target, "src"))
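    # The two loops above pin down the rsync-like trailing-slash semantics from
    # fsspec/filesystem_spec#1062: without a trailing slash the directory itself
    # is copied, so once the target exists a second call nests "src" inside it;
    # with a trailing slash only the directory's contents are copied on every
    # call. A minimal sketch (paths illustrative, assuming src/file exists):
    #
    #     fs.put("src", "target", recursive=True)    # -> target/file
    #     fs.put("src", "target", recursive=True)    # -> target/src/file
    #     fs.put("src/", "target2", recursive=True)  # -> target2/file
    #     fs.put("src/", "target2", recursive=True)  # -> target2/file (same layout)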
def test_put_directory_without_files_with_same_name_prefix(
self,
fs,
fs_join,
fs_target,
local_join,
local_dir_and_file_with_same_name_prefix,
supports_empty_directories,
):
# Create the test dirs
source = local_dir_and_file_with_same_name_prefix
target = fs_target
# Test without glob
fs.put(local_join(source, "subdir"), fs_target, recursive=True)
assert fs.isfile(fs_join(fs_target, "subfile.txt"))
assert not fs.isfile(fs_join(fs_target, "subdir.txt"))
fs.rm([fs_join(target, "subfile.txt")])
if supports_empty_directories:
assert fs.ls(target) == []
else:
assert not fs.exists(target)
# Test with glob
fs.put(local_join(source, "subdir*"), fs_target, recursive=True)
assert fs.isdir(fs_join(fs_target, "subdir"))
assert fs.isfile(fs_join(fs_target, "subdir", "subfile.txt"))
assert fs.isfile(fs_join(fs_target, "subdir.txt"))
def test_copy_with_source_and_destination_as_list(
self, fs, fs_target, fs_join, local_join, local_10_files_with_hashed_names
):
# Create the test dir
source = local_10_files_with_hashed_names
target = fs_target
# Create list of files for source and destination
source_files = []
destination_files = []
for i in range(10):
hashed_i = md5(str(i).encode("utf-8")).hexdigest()
source_files.append(local_join(source, f"{hashed_i}.txt"))
destination_files.append(fs_join(target, f"{hashed_i}.txt"))
# Copy and assert order was kept
fs.put(lpath=source_files, rpath=destination_files)
for i in range(10):
file_content = fs.cat(destination_files[i]).decode("utf-8")
assert file_content == str(i)
