703 lines
22 KiB
Python
703 lines
22 KiB
Python
"""A file interface for handling local and remote data files.

The goal of datasource is to abstract some of the file system operations
when dealing with data files so the researcher doesn't have to know all the
low-level details.  Through datasource, a researcher can obtain and use a
file with one function call, regardless of the location of the file.

DataSource is meant to augment standard python libraries, not replace them.
It should work seamlessly with standard file IO operations and the os
module.

DataSource files can originate locally or remotely:

- local files : '/home/guido/src/local/data.txt'
- URLs (http, ftp, ...) : 'http://www.scipy.org/not/real/data.txt'

DataSource files can also be compressed or uncompressed.  Currently only
gzip, bz2 and xz are supported.

Example::

    >>> # Create a DataSource, use os.curdir (default) for local storage.
    >>> from numpy import DataSource
    >>> ds = DataSource()
    >>>
    >>> # Open a remote file.
    >>> # DataSource downloads the file, stores it locally in:
    >>> #     './www.google.com/index.html'
    >>> # opens the file and returns a file object.
    >>> fp = ds.open('http://www.google.com/') # doctest: +SKIP
    >>>
    >>> # Use the file as you normally would
    >>> fp.read() # doctest: +SKIP
    >>> fp.close() # doctest: +SKIP

"""
|
|
import os
|
|
import shutil
|
|
import io
|
|
|
|
from numpy.core.overrides import set_module
|
|
|
|
|
|
# Keep a handle on the builtin ``open``: this module defines its own
# ``open`` function further down, which rebinds the module-level name.
_open = open
|
|
|
|
|
|
def _check_mode(mode, encoding, newline):
|
|
"""Check mode and that encoding and newline are compatible.
|
|
|
|
Parameters
|
|
----------
|
|
mode : str
|
|
File open mode.
|
|
encoding : str
|
|
File encoding.
|
|
newline : str
|
|
Newline for text files.
|
|
|
|
"""
|
|
if "t" in mode:
|
|
if "b" in mode:
|
|
raise ValueError("Invalid mode: %r" % (mode,))
|
|
else:
|
|
if encoding is not None:
|
|
raise ValueError("Argument 'encoding' not supported in binary mode")
|
|
if newline is not None:
|
|
raise ValueError("Argument 'newline' not supported in binary mode")
|
|
|
|
|
|
# Using a class instead of a module-level dictionary
# to reduce the initial 'import numpy' overhead by
# deferring the import of lzma, bz2 and gzip until needed

# TODO: .zip support, .tar support?
|
|
class _FileOpeners:
|
|
"""
|
|
Container for different methods to open (un-)compressed files.
|
|
|
|
`_FileOpeners` contains a dictionary that holds one method for each
|
|
supported file format. Attribute lookup is implemented in such a way
|
|
that an instance of `_FileOpeners` itself can be indexed with the keys
|
|
of that dictionary. Currently uncompressed files as well as files
|
|
compressed with ``gzip``, ``bz2`` or ``xz`` compression are supported.
|
|
|
|
Notes
|
|
-----
|
|
`_file_openers`, an instance of `_FileOpeners`, is made available for
|
|
use in the `_datasource` module.
|
|
|
|
Examples
|
|
--------
|
|
>>> import gzip
|
|
>>> np.lib._datasource._file_openers.keys()
|
|
[None, '.bz2', '.gz', '.xz', '.lzma']
|
|
>>> np.lib._datasource._file_openers['.gz'] is gzip.open
|
|
True
|
|
|
|
"""
|
|
|
|
def __init__(self):
|
|
self._loaded = False
|
|
self._file_openers = {None: io.open}
|
|
|
|
def _load(self):
|
|
if self._loaded:
|
|
return
|
|
|
|
try:
|
|
import bz2
|
|
self._file_openers[".bz2"] = bz2.open
|
|
except ImportError:
|
|
pass
|
|
|
|
try:
|
|
import gzip
|
|
self._file_openers[".gz"] = gzip.open
|
|
except ImportError:
|
|
pass
|
|
|
|
try:
|
|
import lzma
|
|
self._file_openers[".xz"] = lzma.open
|
|
self._file_openers[".lzma"] = lzma.open
|
|
except (ImportError, AttributeError):
|
|
# There are incompatible backports of lzma that do not have the
|
|
# lzma.open attribute, so catch that as well as ImportError.
|
|
pass
|
|
|
|
self._loaded = True
|
|
|
|
def keys(self):
|
|
"""
|
|
Return the keys of currently supported file openers.
|
|
|
|
Parameters
|
|
----------
|
|
None
|
|
|
|
Returns
|
|
-------
|
|
keys : list
|
|
The keys are None for uncompressed files and the file extension
|
|
strings (i.e. ``'.gz'``, ``'.xz'``) for supported compression
|
|
methods.
|
|
|
|
"""
|
|
self._load()
|
|
return list(self._file_openers.keys())
|
|
|
|
def __getitem__(self, key):
|
|
self._load()
|
|
return self._file_openers[key]
|
|
|
|
# Shared module-level registry of file openers.  Indexing it or calling
# ``keys()`` triggers the lazy import of the compression modules.
_file_openers = _FileOpeners()
|
|
|
|
def open(path, mode='r', destpath=os.curdir, encoding=None, newline=None):
    """
    Open `path` with `mode` and return the file object.

    If ``path`` is an URL, it will be downloaded, stored in the
    `DataSource` `destpath` directory and opened from there.

    Parameters
    ----------
    path : str
        Local file path or URL to open.
    mode : str, optional
        Mode to open `path`. Mode 'r' for reading, 'w' for writing, 'a' to
        append. Available modes depend on the type of object specified by
        path.  Default is 'r'.
    destpath : str, optional
        Path to the directory where the source file gets downloaded to for
        use.  If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.
    encoding : {None, str}, optional
        Open text file with given encoding. The default encoding will be
        what `io.open` uses.
    newline : {None, str}, optional
        Newline to use when reading text file.

    Returns
    -------
    out : file object
        The opened file.

    Notes
    -----
    Convenience wrapper: a throw-away `DataSource` rooted at `destpath` is
    created and its ``open`` method does the actual work.

    """
    source = DataSource(destpath)
    handle = source.open(path, mode, encoding=encoding, newline=newline)
    return handle
|
|
|
|
|
|
@set_module('numpy')
class DataSource:
    """
    DataSource(destpath='.')

    A generic data source file (file, http, ftp, ...).

    DataSources can be local files or remote files/URLs.  The files may
    also be compressed or uncompressed. DataSource hides some of the
    low-level details of downloading the file, allowing you to simply pass
    in a valid file path (or URL) and obtain a file object.

    Parameters
    ----------
    destpath : str or None, optional
        Path to the directory where the source file gets downloaded to for
        use.  If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Notes
    -----
    URLs require a scheme string (``http://``) to be used, without it they
    will fail::

        >>> repos = np.DataSource()
        >>> repos.exists('www.google.com/index.html')
        False
        >>> repos.exists('http://www.google.com/index.html')
        True

    Temporary directories are deleted when the DataSource is deleted.

    Examples
    --------
    ::

        >>> ds = np.DataSource('/home/guido')
        >>> urlname = 'http://www.google.com/'
        >>> gfile = ds.open('http://www.google.com/')
        >>> ds.abspath(urlname)
        '/home/guido/www.google.com/index.html'

        >>> ds = np.DataSource(None)  # use with temporary file
        >>> ds.open('/home/guido/foobar.txt')
        <open file '/home/guido/foobar.txt', mode 'r' at 0x91d4430>
        >>> ds.abspath('/home/guido/foobar.txt')
        '/tmp/.../home/guido/foobar.txt'

    """

    def __init__(self, destpath=os.curdir):
        """Create a DataSource with a local path at destpath."""
        if destpath:
            self._destpath = os.path.abspath(destpath)
            self._istmpdest = False
        else:
            # Any falsy destpath (None, '') selects a private temp directory.
            import tempfile  # deferring import to improve startup time
            self._destpath = tempfile.mkdtemp()
            self._istmpdest = True

    def __del__(self):
        # Remove temp directories.  The hasattr guard covers instances whose
        # __init__ did not get as far as setting the attribute.
        if hasattr(self, '_istmpdest') and self._istmpdest:
            shutil.rmtree(self._destpath)

    def _iszip(self, filename):
        """Test if the filename is a zip file by looking at the file extension.

        """
        _base, ext = os.path.splitext(filename)
        return ext in _file_openers.keys()

    def _iswritemode(self, mode):
        """Test if the given mode will open a file for writing."""
        # Used by ``open`` to reject write access to URLs.
        _writemodes = ("w", "+")
        return any(c in _writemodes for c in mode)

    def _splitzipext(self, filename):
        """Split zip extension from filename and return filename.

        *Returns*:
            base, zip_ext : {tuple}
                ``zip_ext`` is None when `filename` carries no recognised
                compression extension.

        """
        if self._iszip(filename):
            return os.path.splitext(filename)
        else:
            return filename, None

    def _possible_names(self, filename):
        """Return a list containing compressed filename variations."""
        names = [filename]
        if not self._iszip(filename):
            for zipext in _file_openers.keys():
                # Skip the None key, which stands for "uncompressed".
                if zipext:
                    names.append(filename+zipext)
        return names

    def _isurl(self, path):
        """Test if path is a net location.  Tests the scheme and netloc."""

        # We do this here to reduce the 'import numpy' initial import time.
        from urllib.parse import urlparse

        # BUG : URLs require a scheme string ('http://') to be used.
        #       www.google.com will fail.
        #       Should we prepend the scheme for those that don't have it and
        #       test that also?  Similar to the way we append .gz and test
        #       for compressed versions of files.

        scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
        return bool(scheme and netloc)

    def _cache(self, path):
        """Cache the file specified by path.

        Creates a copy of the file in the datasource cache and returns the
        path of that copy.

        """
        # We import this here because importing urllib is slow and
        # a significant fraction of numpy's total import time.
        from urllib.request import urlopen

        upath = self.abspath(path)

        # Ensure the directory exists.  ``exist_ok`` avoids the race between
        # checking for the directory and creating it.
        os.makedirs(os.path.dirname(upath), exist_ok=True)

        # TODO: Doesn't handle compressed files!
        if self._isurl(path):
            with urlopen(path) as openedurl:
                with _open(upath, 'wb') as f:
                    shutil.copyfileobj(openedurl, f)
        else:
            shutil.copyfile(path, upath)
        return upath

    def _findfile(self, path):
        """Searches for ``path`` and returns full path if found.

        If path is an URL, _findfile will cache a local copy and return the
        path to the cached file.  If path is a local file, _findfile will
        return a path to that local file.

        The search will include possible compressed versions of the file
        and return the first occurrence found.  Returns None when nothing
        matches.

        """

        # Build list of possible local file paths
        if not self._isurl(path):
            # Valid local paths
            filelist = self._possible_names(path)
            # Paths in self._destpath
            filelist += self._possible_names(self.abspath(path))
        else:
            # Cached URLs in self._destpath
            filelist = self._possible_names(self.abspath(path))
            # Remote URLs
            filelist = filelist + self._possible_names(path)

        for name in filelist:
            if self.exists(name):
                if self._isurl(name):
                    name = self._cache(name)
                return name
        return None

    def abspath(self, path):
        """
        Return absolute path of file in the DataSource directory.

        If `path` is an URL, then `abspath` will return either the location
        the file exists locally or the location it would exist when opened
        using the `open` method.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL.

        Returns
        -------
        out : str
            Complete path, including the `DataSource` destination directory.

        Notes
        -----
        The functionality is based on `os.path.abspath`.

        """
        # We do this here to reduce the 'import numpy' initial import time.
        from urllib.parse import urlparse

        # TODO:  This should be more robust.  Handles case where path includes
        #        the destpath, but not other sub-paths. Failing case:
        #        path = /home/guido/datafile.txt
        #        destpath = /home/alex/
        #        upath = self.abspath(path)
        #        upath == '/home/alex/home/guido/datafile.txt'

        # handle case where path includes self._destpath
        splitpath = path.split(self._destpath, 2)
        if len(splitpath) > 1:
            path = splitpath[1]
        scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
        netloc = self._sanitize_relative_path(netloc)
        upath = self._sanitize_relative_path(upath)
        return os.path.join(self._destpath, netloc, upath)

    def _sanitize_relative_path(self, path):
        """Return a sanitised relative path for which
        os.path.abspath(os.path.join(base, path)).startswith(base)
        """
        last = None
        path = os.path.normpath(path)
        # Repeat until a full pass leaves the path unchanged.
        while path != last:
            last = path
            # Note: os.path.join treats '/' as os.sep on Windows
            path = path.lstrip(os.sep).lstrip('/')
            # NOTE: lstrip() removes a *set of characters*, so every leading
            # '.' is dropped here, not only whole '..' components.
            path = path.lstrip(os.pardir).lstrip('..')
            drive, path = os.path.splitdrive(path)  # for Windows
        return path

    def exists(self, path):
        """
        Test if path exists.

        Test if `path` exists as (and in this order):

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          `DataSource` directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL.

        Returns
        -------
        out : bool
            True if `path` exists.

        Notes
        -----
        When `path` is an URL, `exists` will return True if it's either
        stored locally in the `DataSource` directory, or is a valid remote
        URL.  `DataSource` does not discriminate between the two, the file
        is accessible if it exists in either location.

        """

        # First test for local path
        if os.path.exists(path):
            return True

        # We import this here because importing urllib is slow and
        # a significant fraction of numpy's total import time.
        from urllib.request import urlopen
        from urllib.error import URLError

        # Test cached url
        upath = self.abspath(path)
        if os.path.exists(upath):
            return True

        # Test remote url
        if self._isurl(path):
            try:
                # Opening is the whole test; the handle is closed right away.
                with urlopen(path):
                    pass
                return True
            except URLError:
                return False
        return False

    def open(self, path, mode='r', encoding=None, newline=None):
        """
        Open and return file-like object.

        If `path` is an URL, it will be downloaded, stored in the
        `DataSource` directory and opened from there.

        Parameters
        ----------
        path : str
            Local file path or URL to open.
        mode : {'r', 'w', 'a'}, optional
            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
            'a' to append. Available modes depend on the type of object
            specified by `path`. Default is 'r'.
        encoding : {None, str}, optional
            Open text file with given encoding. The default encoding will be
            what `io.open` uses.
        newline : {None, str}, optional
            Newline to use when reading text file.

        Returns
        -------
        out : file object
            File object.

        Raises
        ------
        ValueError
            If `path` is an URL and `mode` requests write access.
        FileNotFoundError
            If `path` (or a compressed variant of it) cannot be found.

        """

        # TODO: There is no support for opening a file for writing which
        #       doesn't exist yet (creating a file).  Should there be?

        # TODO: Add a ``subdir`` parameter for specifying the subdirectory
        #       used to store URLs in self._destpath.

        if self._isurl(path) and self._iswritemode(mode):
            raise ValueError("URLs are not writeable")

        # NOTE: _findfile will fail on a new file opened for writing.
        found = self._findfile(path)
        if not found:
            # FileNotFoundError is a subclass of OSError (alias IOError), so
            # existing ``except IOError`` handlers keep working.
            raise FileNotFoundError("%s not found." % path)

        _fname, ext = self._splitzipext(found)
        # os.path.splitext keeps the leading dot, and str.replace returns a
        # new string, so compare against '.bz2' and rebind ``mode``.
        if ext == '.bz2':
            mode = mode.replace("+", "")
        return _file_openers[ext](found, mode=mode,
                                  encoding=encoding, newline=newline)
|
|
|
|
|
|
class Repository(DataSource):
    """
    Repository(baseurl, destpath='.')

    A `DataSource` whose files all live under one base URL or directory.

    Every path handed to a `Repository` method is first completed with the
    base URL (unless it already contains it) and then passed on to the
    corresponding `DataSource` method.  Create the `Repository` once with
    the base location and afterwards refer to files by name only.

    Parameters
    ----------
    baseurl : str
        Path to the local directory or remote location that contains the
        data files.
    destpath : str or None, optional
        Path to the directory where the source file gets downloaded to for
        use.  If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Examples
    --------
    To analyze all files in the repository, do something like this
    (note: this is not self-contained code)::

        >>> repos = np.lib._datasource.Repository('/home/user/data/dir/')
        >>> for filename in filelist:
        ...     fp = repos.open(filename)
        ...     fp.analyze()
        ...     fp.close()

    Similarly you could use a URL for a repository::

        >>> repos = np.lib._datasource.Repository('http://www.xyz.edu/data')

    """

    def __init__(self, baseurl, destpath=os.curdir):
        """Create a Repository with a shared url or directory of baseurl."""
        super().__init__(destpath=destpath)
        self._baseurl = baseurl

    def __del__(self):
        # Cleanup is entirely the base class's business.
        super().__del__()

    def _fullpath(self, path):
        """Return complete path for path.  Prepends baseurl if necessary."""
        _before, matched, _after = path.partition(self._baseurl)
        if matched:
            # path contains baseurl already
            return path
        return os.path.join(self._baseurl, path)

    def _findfile(self, path):
        """Extend DataSource method to prepend baseurl to ``path``."""
        full = self._fullpath(path)
        return super()._findfile(full)

    def abspath(self, path):
        """
        Return absolute path of file in the Repository directory.

        If `path` is an URL, then `abspath` will return either the location
        the file exists locally or the location it would exist when opened
        using the `open` method.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL. This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : str
            Complete path, including the `DataSource` destination directory.

        """
        full = self._fullpath(path)
        return super().abspath(full)

    def exists(self, path):
        """
        Test if path exists prepending Repository base URL to path.

        Test if `path` exists as (and in this order):

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          `DataSource` directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL. This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : bool
            True if `path` exists.

        Notes
        -----
        When `path` is an URL, `exists` will return True if it's either
        stored locally in the `DataSource` directory, or is a valid remote
        URL.  `DataSource` does not discriminate between the two, the file
        is accessible if it exists in either location.

        """
        full = self._fullpath(path)
        return super().exists(full)

    def open(self, path, mode='r', encoding=None, newline=None):
        """
        Open and return file-like object prepending Repository base URL.

        If `path` is an URL, it will be downloaded, stored in the
        DataSource directory and opened from there.

        Parameters
        ----------
        path : str
            Local file path or URL to open. This may, but does not have to,
            include the `baseurl` with which the `Repository` was
            initialized.
        mode : {'r', 'w', 'a'}, optional
            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
            'a' to append. Available modes depend on the type of object
            specified by `path`. Default is 'r'.
        encoding : {None, str}, optional
            Open text file with given encoding. The default encoding will be
            what `io.open` uses.
        newline : {None, str}, optional
            Newline to use when reading text file.

        Returns
        -------
        out : file object
            File object.

        """
        full = self._fullpath(path)
        return super().open(full, mode, encoding=encoding, newline=newline)

    def listdir(self):
        """
        List files in the source Repository.

        Returns
        -------
        files : list of str
            List of file names (not containing a directory part).

        Raises
        ------
        NotImplementedError
            If the repository's base location is a URL.

        Notes
        -----
        Does not currently work for remote repositories.

        """
        if not self._isurl(self._baseurl):
            return os.listdir(self._baseurl)
        raise NotImplementedError(
            "Directory listing of URLs, not supported yet.")
|