# Source code for plasmapy.utils.data.downloader

"""
Contains functionality for downloading files from a URL. Intended for
downloading files from |PlasmaPy's data repository|.
"""

import contextlib
import json
import time
import warnings
from pathlib import Path
from urllib.parse import urljoin

import requests

__all__ = ["Downloader"]


# TODO: use a config file variable to allow users to set a location
# for the data download folder?


[docs] class Downloader: """ Accesses the PlasmaPy resource files. Retrieves local paths to resource files, and downloads those files from |PlasmaPy's data repository| if they cannot be found locally. Parameters ---------- directory : `~pathlib.Path`, optional The directory into which files will be downloaded. The default is :file:`/~//.plasmapy//downloads//` validate : `bool`, default: `True` If `True`, verify that local files are up-to-date with the data repository, and use the GitHub API to verify download URLs before downloading files. If `False`, return any matching local file without verification and, if a local file cannot be found, attempt to download from the repository without validation. api_token : `str`, optional A GitHub authorization token that, if provided, will be used for queries to the GitHub API. If none is provided, public API calls will be used. """ # URL for the PlasmaPy-data repository through the GitHub API _API_BASE_URL = "https://api.github.com/repos/PlasmaPy/PlasmaPy-data/contents/" # Name for the local file that stores the SHA hashes of downloaded files # and information about the SHA on the server _blob_file_name = "RESOURCE_BLOB_SHA.json" # Base URL for RAW files _RAW_BASE_URL = "https://raw.githubusercontent.com/PlasmaPy/PlasmaPy-data/main/" def __init__( self, directory: Path | None = None, validate: bool = True, api_token: str | None = None, ): if directory is None: # No test coverage for default directory, since pytest always # saves into a temporary directory self._download_directory = ( Path.home() / ".plasmapy" / "downloads" ) # coverage: ignore else: self._download_directory = Path(directory) self._validate = validate self._api_token = api_token # Flag to record whether the blob file has been updated from the repo # by this instantiation of the class. Once the file has been updated # once, we won't update it again to limit API calls. 
self._updated_blob_file_from_repo = False self._download_directory.mkdir(parents=True, exist_ok=True) # Path to the local SHA blob file self._blob_file = Path(self._download_directory, self._blob_file_name) # Create the SHA blob file if it doesn't already exist if not self._blob_file.is_file(): self._blob_dict = {} self._write_blobfile() # Otherwise, read the SHA blob file else: self._read_blobfile() def _write_blobfile(self) -> None: """ Write the _local_blob_dict to disk. """ with self._blob_file.open("w") as f: json.dump(self._blob_dict, fp=f) def _read_blobfile(self) -> None: """ Read the _local_blob_dict from disk. """ with self._blob_file.open("r") as f: self._blob_dict = json.load(f) @property def _api_connected(self) -> bool: """ Return `True` if a connection exists to the API, otherwise `False`. """ try: # Requesting this URL does not count as an API query self._http_request("https://api.github.com/rate_limit") # No testing because CI always has a connection to the API except requests.ConnectionError: # coverage: ignore return False return True @property def _api_usage(self) -> tuple[int, int]: """ Return the API call limit and the number currently used from this IP. """ # Ensure that the GitHub API is not rate limited reply = self._http_request("https://api.github.com/rate_limit") info = reply.json() rate_info = info["resources"]["core"] limit = int(rate_info["limit"]) used = int(rate_info["used"]) return limit, used @property def _api_is_rate_limited(self) -> bool: """ Whether or not the API is currently rate limited. """ limit, used = self._api_usage return used >= limit @property def _do_validation(self) -> bool: """ Determine whether or not to enforce validation using the GitHub API. """ return self._validate and self._api_connected and not self._api_is_rate_limited def _update_repo_blob_dict(self) -> None: """ Update the blob file with a call to the repository. 
Raises ------ ValueError If the URL does not return the expected JSON file with the expected keys. Returns ------- repo_blob_dict : dict Dictionary with filenames as keys. Each item is another entry with keys ``"sha"`` and ``"download_url"``. """ # If the current blob file has been updated in the past 5 minutes, # don't bother doing it again # Ignore in tests, as this won't happen in CI with contextlib.suppress(KeyError): # If the _timestamp key hasn't been set yet, the blob file has # never been updated before if time.time() - self._blob_dict["_timestamp"] < 300: return None # coverage : ignore # If this instance of Downloader has already updated from the API once, # don't do it again. Almost certainly nothing has changed! # Not tested, since CI never waits >5 min with the same Downloader # instantiated. if self._updated_blob_file_from_repo: # coverage: ignore return None reply = self._http_request(self._API_BASE_URL) # Extract the SHA hash and the download URL from the response # Extract contents to JSON # Not tested, since any URL on the GitHub API that doesn't raise a 404 error # should return a JSON try: # coverage: ignore info = reply.json() except requests.exceptions.JSONDecodeError as err: # coverage: ignore warnings.warn( "URL did not return the expected JSON file: " f"{self._API_BASE_URL}. " f"Response content: {reply.content}. Exception: {err}" ) self._validate = False return None for item in info: try: filename = item["name"] repo_sha = item["sha"] download_url = item["download_url"] # Not tested, since any URL on the GitHub API that doesn't return a 404 # should be a JSON with these keys except (KeyError, TypeError) as err: # coverage: ignore warnings.warn( f"URL {self._API_BASE_URL} returned JSON file, " "missing expected keys 'sha' and 'download_url`." f" JSON contents: {info}. 
Exception: {err}" ) filename = None repo_sha = None download_url = None if filename is not None: self._update_blob_entry( filename, repo_sha=repo_sha, download_url=download_url ) # Save the current epoch time in the blob file as a record of when # it was updated self._blob_dict["_timestamp"] = time.time() # At the end, write back to the blobfile self._write_blobfile() # The blob file has been updated: set this flag so we won't do it # again on this instance of Downloader self._updated_blob_file_from_repo = True def _update_blob_entry( self, filename: str, local_sha: str | None = None, repo_sha: str | None = None, download_url: str | None = None, ) -> None: """ Update an entry in the blobfile, or create a new one if one doesn't exist. """ if filename in self._blob_dict: if local_sha is not None: self._blob_dict[filename]["local_sha"] = local_sha if repo_sha is not None: self._blob_dict[filename]["repo_sha"] = repo_sha if download_url is not None: self._blob_dict[filename]["download_url"] = download_url else: self._blob_dict[filename] = { "local_sha": local_sha, "repo_sha": repo_sha, "download_url": download_url, } def _http_request(self, url: str) -> requests.Response: """ Issue an HTTP request to the specified URL, handling exceptions. 
""" # Only send GitHub api authorization if querying GitHub # auth = self._api_auth if "github.com" in url else None headers = {"Content-Type": "application/json"} if self._api_token is not None: headers["authorization"] = f"Bearer {self._api_token}" try: reply = requests.get(url, headers=headers, timeout=10) # No test coverage for this exception since we can't test it without # severing the network connectivity in pytest except requests.ConnectionError as err: # coverage: ignore raise requests.ConnectionError( f"Unable to connect to data repository {self._API_BASE_URL}" ) from err # Extract the 'message' value if it is there # If the file does not exist on the repository, the GitHub API # will return `Not Found` in response to this but not raise a 404 error if reply.status_code == 404: raise ValueError(f"URL returned 404: {url}") return reply def _filepath(self, filename: str) -> Path: """Formats a filepath from a filename.""" return Path(self._download_directory, filename) def _download_file(self, filename: str, dl_url: str) -> Path: """ Download a file from a given URL to a specified path. Parameters ---------- filename : str Name of the file to download: determines the filepath dl_url : str URL from which to download Returns ------- filepath : str Path to the downloaded file """ # Request the contents of the file from the download URL reply = self._http_request(dl_url) filepath = self._filepath(filename) # Write the contents to file with filepath.open(mode="wb") as f: f.write(reply.content) return filepath def _get_file_without_validation(self, filename: str) -> Path: """ Return file logic without validation. Returns ------- filepath : Path Path to file. Raises ------ ValueError If the resource cannot be found locally or on the repository. 
""" filepath = self._filepath(filename) # If the file exists locally, return that if filepath.is_file(): return filepath # Try blindly downloading from the base URL # Note that downloading directly from the RAW url does not # require an API call. with contextlib.suppress(ValueError): dl_url = urljoin(self._RAW_BASE_URL, filename) return self._download_file(filename, dl_url) raise ValueError( "Resource could not be found locally or " "retrieved from the PlasmaPy-data repository: " f"{filename}." ) def _get_file_with_validation(self, filename: str) -> Path: """ Return file logic with validation. """ filepath = self._filepath(filename) self._update_repo_blob_dict() # Retrieve the values: try/except catches KeyError both for # key `filename` and `local_sha`/`repo_sha` try: local_sha = self._blob_dict[filename]["local_sha"] except KeyError: local_sha = None try: repo_sha = self._blob_dict[filename]["repo_sha"] except KeyError: repo_sha = None # If local sha and online sha are equal, return the local filepath if local_sha == repo_sha and local_sha is not None: return filepath # If the file is found online, try downloading from the repository elif repo_sha is not None: dl_url = self._blob_dict[filename]["download_url"] # Download the file filepath = self._download_file(filename, dl_url) # This is a verified download, so we now know the local_sha is # the same as the repo_sha self._update_blob_entry(filename, local_sha=repo_sha) self._write_blobfile() return filepath # Otherwise fall back to retrieving the file without validation else: self._validate = False warnings.warn( f"Could not retrieve file {filename} with validation: " "trying again without validation." ) return self._get_file_without_validation(filename)
[docs] def get_file(self, filename: str) -> Path: """ Returns a local path to a resource file, downloading it if necessary. Parameters ---------- filename : str The name of the file in the |PlasmaPy's data repository|. Returns ------- Path : `~pathlib.Path` The local path to the resource file. """ if self._do_validation: return self._get_file_with_validation(filename) return self._get_file_without_validation(filename)