diff --git a/CHANGELOG.md b/CHANGELOG.md index 2436a3c..fd7b2b7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,17 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [Unreleased (Targeting 0.2.5)] + +### Added +- Optional `modified` parameter to `fetch_file()` carrying the modified time of the remote file pulled from the TIND API +- `fetch_file()` uses this to avoid unnecessary downloads if a file already exists at the target + location and has a modified time that is the same as or newer than that of the requested file + +### Changed +- `fetch_file()` now raises `RecordNotFoundError` if `tind_download()` fails to return a written file path + + ## [0.2.4] ### Added diff --git a/tind_client/client.py b/tind_client/client.py index 77b9883..b3da18f 100644 --- a/tind_client/client.py +++ b/tind_client/client.py @@ -3,12 +3,14 @@ """ import json +import logging import os import re from io import StringIO from pathlib import Path from typing import Any, Iterator import xml.etree.ElementTree as E +from datetime import datetime, timezone from pymarc import Record from pymarc.marcxml import parse_xml_to_array @@ -16,6 +18,7 @@ from .api import tind_get, tind_download from .errors import RecordNotFoundError, TINDError +logger = logging.getLogger(__name__) NS = "http://www.loc.gov/MARC21/slim" E.register_namespace("", NS) @@ -69,12 +72,15 @@ def fetch_metadata(self, record: str) -> Record: return records[0] - def fetch_file(self, file_url: str, output_dir: str = "") -> str: - """Download a file from TIND and save it locally. + def fetch_file(self, file_url: str, output_dir: str = "", modified: str = "") -> str: + """Download a file from TIND and save it locally. 
If the file already exists in the output + directory and has a local modified timestamp that is no older than the supplied + ``modified`` timestamp, the file will not be re-downloaded. :param str file_url: The TIND file download URL. :param str output_dir: Directory in which to save the file. Falls back to ``default_storage_dir`` when empty. + :param str modified: Optional ISO 8601 modified timestamp from the TIND file metadata. :raises AuthorizationError: When the TIND API key is invalid or the file is restricted. :raises ValueError: When ``file_url`` is not a valid TIND file download URL. :raises RecordNotFoundError: When the file is invalid or not found. @@ -84,9 +90,20 @@ def fetch_file(self, file_url: str, output_dir: str = "") -> str: raise ValueError("URL is not a valid TIND file download URL.") output_target = output_dir or self.default_storage_dir + + expected_filename = file_url.rstrip("/").split("/")[-2] + expected_path = Path(output_target) / expected_filename + + if modified and expected_path.exists(): + meta_mtime = datetime.fromisoformat(modified).replace(tzinfo=timezone.utc) + local_mtime = datetime.fromtimestamp(expected_path.stat().st_mtime, tz=timezone.utc) + if local_mtime >= meta_mtime: + logger.debug("Cached file at (%s) is newer; skipping download.", expected_path) + return str(expected_path) + (status, saved_to) = tind_download(file_url, output_dir=output_target, api_key=self.api_key) - if status != 200: + if status != 200 or not saved_to: raise RecordNotFoundError("Referenced file could not be downloaded.") return saved_to