201 lines
6.1 KiB
Python
201 lines
6.1 KiB
Python
"""Download files with progress indicators.
|
|
"""
|
|
import cgi
|
|
import logging
|
|
import mimetypes
|
|
import os
|
|
|
|
from pip._vendor import requests
|
|
from pip._vendor.requests.models import CONTENT_CHUNK_SIZE
|
|
|
|
from pip._internal.models.index import PyPI
|
|
from pip._internal.network.cache import is_from_cache
|
|
from pip._internal.network.utils import response_chunks
|
|
from pip._internal.utils.misc import (
|
|
format_size,
|
|
redact_auth_from_url,
|
|
splitext,
|
|
)
|
|
from pip._internal.utils.typing import MYPY_CHECK_RUNNING
|
|
from pip._internal.utils.ui import DownloadProgressProvider
|
|
|
|
if MYPY_CHECK_RUNNING:
|
|
from typing import Iterable, Optional
|
|
|
|
from pip._vendor.requests.models import Response
|
|
|
|
from pip._internal.models.link import Link
|
|
from pip._internal.network.session import PipSession
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def _get_http_response_size(resp):
    # type: (Response) -> Optional[int]
    """Return the size announced by the response's content-length header.

    ``None`` is returned when the header is absent or when its value
    cannot be converted to an integer.
    """
    try:
        announced = int(resp.headers['content-length'])
    except (ValueError, KeyError, TypeError):
        # Missing header, non-numeric text, or a non-string value.
        return None
    return announced
|
|
|
|
|
|
def _prepare_download(
    resp,  # type: Response
    link,  # type: Link
    progress_bar  # type: str
):
    # type: (...) -> Iterable[bytes]
    """Log the download and return an iterable over the response body.

    One INFO message is emitted ("Using cached ..." or "Downloading ...")
    containing the URL with credentials redacted and, when the size is
    known and non-zero, the formatted size.

    The returned chunks are wrapped by ``DownloadProgressProvider`` only
    when all of the following hold: the logger's effective level is INFO
    or lower, the response did not come from the cache, and the size is
    either unknown/zero or greater than 40 * 1000.
    """
    total_length = _get_http_response_size(resp)

    # Links hosted on the PyPI file storage domain are displayed through
    # ``show_url``; everything else uses the URL minus its fragment.
    url = (
        link.show_url
        if link.netloc == PyPI.file_storage_domain
        else link.url_without_fragment
    )

    logged_url = redact_auth_from_url(url)
    if total_length:
        logged_url = '{} ({})'.format(logged_url, format_size(total_length))

    if is_from_cache(resp):
        logger.info("Using cached %s", logged_url)
    else:
        logger.info("Downloading %s", logged_url)

    # Short-circuit evaluation keeps the checks in their original order:
    # verbosity first, then cache status, then the size threshold.
    show_progress = (
        logger.getEffectiveLevel() <= logging.INFO
        and not is_from_cache(resp)
        and (not total_length or total_length > (40 * 1000))
    )

    chunks = response_chunks(resp, CONTENT_CHUNK_SIZE)
    if show_progress:
        return DownloadProgressProvider(
            progress_bar, max=total_length
        )(chunks)
    return chunks
|
|
|
|
|
|
def sanitize_content_filename(filename):
    # type: (str) -> str
    """
    Sanitize the "filename" value from a Content-Disposition header.

    Only the final path component is kept; any leading directory parts
    are discarded.
    """
    _directory, final_component = os.path.split(filename)
    return final_component
|
|
|
|
|
|
def parse_content_disposition(content_disposition, default_filename):
    # type: (str, str) -> str
    """
    Extract the "filename" parameter from a Content-Disposition header.

    ``default_filename`` is returned when the header carries no filename,
    or when the filename is empty once it has been sanitized.
    """
    _disposition, parameters = cgi.parse_header(content_disposition)
    requested_name = parameters.get('filename')
    if not requested_name:
        return default_filename
    # Sanitizing the name guards against directory traversal when the
    # header value contains ".." path parts.
    return sanitize_content_filename(requested_name) or default_filename
|
|
|
|
|
|
def _get_http_response_filename(resp, link):
    # type: (Response, Link) -> str
    """Choose a filename for the downloaded response.

    The link's own filename is the starting point and is overridden by a
    Content-Disposition filename when the response supplies one. If the
    chosen name has no extension, one is appended from, in order of
    preference, the response's content-type or (when the response URL
    differs from the link URL) the response URL itself.
    """
    filename = link.filename  # fallback

    # A Content-Disposition header may offer a better guess.
    disposition = resp.headers.get('content-disposition')
    if disposition:
        filename = parse_content_disposition(disposition, filename)

    if splitext(filename)[1]:
        # Already has an extension; nothing more to work out.
        return filename

    guessed_ext = mimetypes.guess_extension(
        resp.headers.get('content-type', '')
    )
    if guessed_ext:
        return filename + guessed_ext

    # Last resort: borrow the extension of the URL the response was
    # actually served from, when it is not the link's own URL.
    if link.url != resp.url:
        url_ext = os.path.splitext(resp.url)[1]
        if url_ext:
            filename += url_ext
    return filename
|
|
|
|
|
|
def _http_get_download(session, link):
    # type: (PipSession, Link) -> Response
    """Issue a streaming GET for ``link`` and return the response.

    The URL fragment (everything from the first "#") is dropped before
    the request is made. ``raise_for_status()`` is called on the
    response, so an HTTP error status propagates as an exception.
    """
    url_without_fragment = link.url.partition('#')[0]

    # Ask for the identity encoding explicitly, because requests accepts
    # compressed responses by default and servers handle that differently:
    #   - some see the file is not compressible, send it untouched and
    #     leave Content-Encoding empty;
    #   - some see the file is already compressed, send it untouched but
    #     add "Content-Encoding: gzip";
    #   - some notice nothing, compress an already-compressed file a
    #     second time and set "Content-Encoding: gzip".
    # Requesting identity is meant to rule out the third case. The hope
    # is that no server, seeing an already-compressed file and a client
    # that did not ask for compression, decompresses it before sending;
    # if one does, this probably cannot be made to work at all.
    request_headers = {"Accept-Encoding": "identity"}

    response = session.get(
        url_without_fragment,
        headers=request_headers,
        stream=True,
    )
    response.raise_for_status()
    return response
|
|
|
|
|
|
class Download(object):
    """Plain container bundling the pieces of one download."""

    def __init__(
        self,
        response,  # type: Response
        filename,  # type: str
        chunks,  # type: Iterable[bytes]
    ):
        # type: (...) -> None
        # response: the HTTP response the download came from
        # filename: the name chosen for the downloaded file
        # chunks:   iterable yielding the body as byte strings
        self.response, self.filename, self.chunks = (
            response, filename, chunks
        )
|
|
|
|
|
|
class Downloader(object):
    """Callable that fetches a link and packages the result as a Download."""

    def __init__(
        self,
        session,  # type: PipSession
        progress_bar,  # type: str
    ):
        # type: (...) -> None
        # Both values are only stored here; they are used on each call.
        self._progress_bar = progress_bar
        self._session = session

    def __call__(self, link):
        # type: (Link) -> Download
        """Request ``link`` and return a ``Download`` describing it.

        An HTTP error is logged at CRITICAL level with its status code
        and then re-raised unchanged.
        """
        try:
            resp = _http_get_download(self._session, link)
        except requests.HTTPError as exc:
            status_code = exc.response.status_code
            logger.critical(
                "HTTP error %s while getting %s", status_code, link
            )
            raise

        # The filename is worked out before the chunk iterable is built.
        filename = _get_http_response_filename(resp, link)
        chunks = _prepare_download(resp, link, self._progress_bar)
        return Download(resp, filename, chunks)
|