From cb1df88d86c69b2b2c64b5ca9afaa9d75d2e7bda Mon Sep 17 00:00:00 2001 From: Damien Goutte-Gattat Date: Wed, 31 Dec 2025 12:50:40 +0000 Subject: [PATCH 01/11] Add a helper command for downloading files. This commit adds a new command to the `odk-helper` tool: `download`. The new command is intended to be used from within ODK workflows to download files (e.g. when mirroring an ontology, or when refreshing a remotely sourced component). The aim is to simplify the standard Makefile by removing from it all the stuff needed to ensure that the rules that depend on a downloaded file are not needlessly triggered when we download a file that has, in fact, not changed since the last time it was downloaded. The `odk-helper download` command takes care of that by keeping track, for every downloaded file, of up to three pieces of information: * the time at which the file was downloaded; * the ETag returned by the server for that file (if any); * the SHA-256 checksum of the downloaded file. When we ask to download the same file again, the command uses the time of last download and the ETag (if available) to specifically ask the server for a newer version of the file, thereby avoiding download completely if the server is able to honor the If-Modified-Since and/or If-None-Match headers. If we do download the file, then we first write it into a temporary location, compute its SHA-256 checksum, and compare it with the recorded checksum from the last download; if the checksums match, then the newly downloaded file is in fact not new and the temporary file is deleted without ever touching the existing file. In addition, the command can automatically: * uncompress a .gz or .bz2 file; * try to download a compressed version of a remote resource first (by appending `.gz` to the requested URL), then fall back to the original URL. 
--- src/incatools/odk/download.py | 255 ++++++++++++++++++++++++++++++++++ src/incatools/odk/helper.py | 81 ++++++++++- 2 files changed, 335 insertions(+), 1 deletion(-) create mode 100644 src/incatools/odk/download.py diff --git a/src/incatools/odk/download.py b/src/incatools/odk/download.py new file mode 100644 index 0000000..65a6bbe --- /dev/null +++ b/src/incatools/odk/download.py @@ -0,0 +1,255 @@ +# odkcore - Ontology Development Kit Core +# Copyright © 2025 ODK Developers +# +# This file is part of the ODK Core project and distributed under the +# terms of a 3-clause BSD license. See the LICENSE file in that project +# for the detailed conditions. + +import logging +import os +from bz2 import BZ2File +from dataclasses import dataclass +from datetime import UTC, datetime +from enum import Enum +from gzip import GzipFile +from hashlib import sha256 +from io import BufferedIOBase, BytesIO +from pathlib import Path +from typing import Dict, Iterator, Optional, Union +from urllib.parse import urlparse + +import requests + +RFC5322_DATE_FORMAT = "%a, %d %b %Y %H:%M:%S GMT" +# Those are the HTTP errors that Curl considers as "transient" and +# eligible for retrying when the --retry option is used. +RETRIABLE_HTTP_ERRORS = (408, 429, 500, 502, 503, 504) + + +class Compression(Enum): + NONE = (0, None) + GZIP = (1, ".gz") + BZIP2 = (2, ".bz2") + + extension: Optional[str] + + def __new__(cls, value: int, extension: Optional[str] = None): + self = object.__new__(cls) + self._value_ = value + self.extension = extension + return self + + @classmethod + def from_extension(cls, path: Path) -> "Compression": + for v in Compression: + if v.extension is not None and v.extension == path.suffix: + return v + return Compression.NONE + + +class DownloadError(Exception): + pass + + +@dataclass +class RemoteFileInfo: + """Informations about a downloaded file. 
+ + This class is a simple structure holding the informations we need in + order to decide whether to update a file that has already been + downloaded previously. + """ + + sha256: Optional[str] = None + """The SHA-256 checksum of the downloaded file.""" + + etag: Optional[str] = None + """The tag returned by the server for the file.""" + + time: Optional[datetime] = None + """The time the file was last downloaded.""" + + def to_file(self, dest: Path) -> None: + """Writes the information to a cache file.""" + with dest.open("w") as fd: + if self.sha256 is not None: + fd.write(f"sha256: {self.sha256}\n") + if self.etag is not None: + fd.write(f"etag: {self.etag}\n") + if self.time is not None: + fd.write(f"time: {self.time.strftime(RFC5322_DATE_FORMAT)}\n") + + def from_cache_file(self, source: Path) -> "RemoteFileInfo": + """Reads the information from a cache file.""" + if source.exists(): + with source.open("r") as fd: + for line in fd: + if line.startswith("#"): + continue + items = line.strip().split(": ", maxsplit=1) + if len(items) != 2: + continue + if items[0] == "sha256": + self.sha256 = items[1] + elif items[0] == "etag": + self.etag = items[1] + elif items[0] == "time": + self.time = datetime.strptime(items[1], RFC5322_DATE_FORMAT) + return self + + @classmethod + def from_file(cls, source: Path) -> "RemoteFileInfo": + """Gets the information from an existing file.""" + info = RemoteFileInfo() + if source.exists(): + info.time = datetime.fromtimestamp(source.stat().st_mtime, UTC) + h = sha256() + with source.open("rb") as fd: + while True: + chunk = fd.read(512) + if not chunk: + break + h.update(chunk) + info.sha256 = h.hexdigest() + return info + + +def download_file( + url: str, + output: Path, + info: RemoteFileInfo, + max_retry: int = 0, + compression: Compression = Compression.NONE, +) -> int: + """Downloads a remote file. 
+ + This function will avoid needlessly downloading the file if the + remote server can tell us that the remote file has not changed since + the last time it was downloaded. In addition, even if the file is + downloaded, if it is found to be identical to the locally available + version, the existing file is not touched at all. + + :param url: The URL to download the file from. + :param output: Where the downloaded file should be written. + :param info: Informations about the last time the file was + downloaded. If the fields of that structure are set to None, + this means there is no local version of the file, and the + remote file should always be downloaded. If the download is + successful, the structure will be updated with informations from + the newly downloaded file. + :param max_retry: Number of download attempts to perform. + :param compression: How the remote file is compressed (if at all). + The file will be automatically uncompressed after being + downloaded. + + :returns: The HTTP status code. 
+ """ + headers: Dict[str, str] = {} + if info.time: + headers["If-Modified-Since"] = info.time.strftime(RFC5322_DATE_FORMAT) + if info.etag: + headers["If-None-Match"] = info.etag + + n_try = 0 + hostname = urlparse(url).hostname + while True: + try: + response = requests.get(url, timeout=5, headers=headers) + if response.status_code == 200: + return _handle_successful_download(response, output, info, compression) + elif response.status_code == 304: + logging.info(f"{output.name}: Not modified at {url}") + return 304 + elif response.status_code == 404: + logging.warn(f"{output.name}: Not found at {url}") + return 404 + elif response.status_code in RETRIABLE_HTTP_ERRORS and n_try < max_retry: + n_try += 1 + logging.warn( + f"{output.name}: Transient HTTP error, retrying ({n_try}/{max_retry}" + ) + else: + response.raise_for_status() + except requests.exceptions.ConnectTimeout: + # `curl --retry` retries on timeout errors, and so do we + if n_try < max_retry: + n_try += 1 + logging.warn( + f"{output.name}: Timeout when connecting to {hostname}, retrying ({n_try}/{max_retry})" + ) + else: + raise DownloadError(f"Timeout when connecting to {hostname}") + except requests.exceptions.ConnectionError: + raise DownloadError(f"Cannot connect to {hostname}") + except requests.exceptions.HTTPError: + raise DownloadError(f"HTTP error when downloading {url}") + except requests.exceptions.ReadTimeout: + raise DownloadError(f"Timeout when downloading {url}") + + +def _handle_successful_download( + response: requests.Response, + output: Path, + info: RemoteFileInfo, + comp: Compression, +) -> int: + h = sha256() + + # We download into a temporary file so that we do not touch the + # output file until (1) the download is complete and (2) we have + # verified that the downloaded file is different from the output + # file, if it already exists + tmpfile = output.with_suffix(output.suffix + ".tmp") + with tmpfile.open("wb") as fd: + for chunk in _ResponseWrapper.maybe_wrap(response, 
comp).iter_content(512): + h.update(chunk) + fd.write(chunk) + checksum = h.hexdigest() + if info.sha256 == checksum: + logging.info( + f"{output.name}: File newly downloaded is identical to previously downloaded file" + ) + # Remove the file we just downloaded, and report to caller as a + # 304 Not-Modified status + tmpfile.unlink() + return 304 + else: + logging.info(f"{output.name}: Download OK, file is new") + os.replace(tmpfile, output) + info.sha256 = checksum + info.time = datetime.now(tz=UTC) + info.etag = response.headers.get("ETag", None) + return 200 + + +class _ResponseWrapper: + """Helper class to handle compressed files. + + This class allows to use the same `iter_content` method (as found on + a requests.Response object) to get the content of a downloaded file, + regardless of how the file has been compressed (if at all). + """ + + _stream: BufferedIOBase + + def __init__(self, stream: BufferedIOBase): + self._stream = stream + + def iter_content(self, size: int = 512) -> Iterator[bytes]: + while True: + chunk = self._stream.read(size) + if not chunk: + break + yield chunk + self._stream.close() + + @classmethod + def maybe_wrap( + cls, response: requests.Response, compression: Compression + ) -> Union[requests.Response, "_ResponseWrapper"]: + if compression == Compression.GZIP: + return _ResponseWrapper(GzipFile(fileobj=BytesIO(response.content))) + elif compression == Compression.BZIP2: + return _ResponseWrapper(BZ2File(BytesIO(response.content))) + else: + return response diff --git a/src/incatools/odk/helper.py b/src/incatools/odk/helper.py index e30feec..c1e31c0 100644 --- a/src/incatools/odk/helper.py +++ b/src/incatools/odk/helper.py @@ -6,22 +6,26 @@ # for the detailed conditions. import json +import logging import shutil import subprocess from pathlib import Path +from typing import List, Tuple +from urllib.parse import urlparse import click from lightrdf import Parser as LightRDFParser # type: ignore from rdflib import Graph from . 
import __version__ +from .download import Compression, DownloadError, RemoteFileInfo, download_file from .template import DEFAULT_TEMPLATE_DIR, RESOURCES_DIR @click.group() def main() -> None: """Helper commands for ODK workflows.""" - pass + logging.basicConfig(level=logging.INFO) @main.command() @@ -132,3 +136,78 @@ def info(tools) -> None: if tools: cmd.append("--tools") subprocess.run(cmd) + + +@main.command() +@click.argument("url") +@click.option( + "-o", + "--output", + type=click.Path(path_type=Path), + help="""Write the downloaded file to the specified location. + The default is derived from the URL.""", +) +@click.option( + "-r", + "--reference", + type=click.Path(path_type=Path), + help="""Use the specified file as reference to decide whether to + download the file again. The default is to use the file + specified with the --output option.""", +) +@click.option( + "-i", + "--cache-info", + type=click.Path(path_type=Path), + help="""Read/write cache data from/to the specified file. The + default is the same location as specified with the + --reference option plus an added '.info' extension.""", +) +@click.option("--max-retry", default=0, metavar="N", help="Retry at most N times.") +@click.option( + "--try-gzip/--no-try-gzip", + default=True, + help="""Given the URL "U", automatically attempt to download "U.gz", + then fallback to "U". 
This is enabled by default, unless the + provided URL already points to a compressed file.""", +) +def download(url, output, reference, cache_info, max_retry, try_gzip) -> None: + """Download a file.""" + + urlpath = Path(urlparse(url).path) + compression = Compression.from_extension(urlpath) + + attempts: List[Tuple[str, Compression]] = [] + if compression == Compression.NONE and try_gzip: + attempts.append((url + ".gz", Compression.GZIP)) + attempts.append((url, compression)) + + if output is None: + output = Path(urlpath.name) + if not output.name: + raise click.ClickException( + f"Explicit output name required for downloading from {url}" + ) + if reference is None: + reference = output + if cache_info is None: + cache_info = output.with_suffix(output.suffix + ".info") + + info = RemoteFileInfo() + if reference.exists(): + info.from_cache_file(cache_info) + if reference != output and output.exists(): + output.unlink() + + try: + for u, c in attempts: + status = download_file(u, output, info, max_retry, c) + if status == 200: + info.to_file(cache_info) + return + elif status == 304: + return + if status == 404: # Last attempt failed + raise click.ClickException(f"Cannot download {url}: 404 Not Found") + except DownloadError as e: + raise click.ClickException(f"Cannot download {url}: {e}") From 50bd04ec9ff4e781ff79891648de6bc1651bdd63 Mon Sep 17 00:00:00 2001 From: Damien Goutte-Gattat Date: Wed, 31 Dec 2025 13:02:34 +0000 Subject: [PATCH 02/11] Refactor all downloading rules. Now that we have a `odk-helper download` command, we can use it to overhaul the way we download mirrors and remotely sourced components in the standard Makefile. Basically, to download and prepare a mirror M, we have two rules: (1) A "phony" rule `download-mirror-M`. That rule is responsible for actually downloading the remote file. It does so with the `odk-helper download` command. 
The newly downloaded file is expected to be written into `$(TMPDIR)/download-mirror-M.owl`, _unless_ the file has not been changed since the last time it was downloaded (in which case nothing will be written at that location). It is also in that rule that we deal with the `use_base` and `use_gzipped` options. (2) A "normal" rule `$(MIRRORDIR)/M.owl`, which depends on the first rule. That rule should convert the downloaded file into the final mirror file, by running `robot remove` (if `make_base` is enabled for the mirror) and `robot convert`. Importantly, that rule must check beforehand whether the `$(TMPDIR)/download-mirror-M.owl` file exists, and do nothing if it does not (which will be the case when the remote file has not changed since the last time it was downloaded). The same principles applies for downloading remotely sourced components. --- .../templates/src/ontology/Makefile.jinja2 | 120 +++++++++--------- 1 file changed, 57 insertions(+), 63 deletions(-) diff --git a/src/incatools/odk/templates/src/ontology/Makefile.jinja2 b/src/incatools/odk/templates/src/ontology/Makefile.jinja2 index 9f292eb..7f1ee79 100644 --- a/src/incatools/odk/templates/src/ontology/Makefile.jinja2 +++ b/src/incatools/odk/templates/src/ontology/Makefile.jinja2 @@ -773,6 +773,7 @@ refresh-imports-excluding-large: .PHONY: refresh-% refresh-%: + rm -f $(TMPDIR)/download-mirror-$*.owl.info $(MAKE) --assume-new=$(SRC) --assume-new=$(IMPORTDIR)/$*_terms.txt \ IMP=true IMP_LARGE=true MIR=true PAT=false $(IMPORTDIR)/$*_import.owl @@ -805,6 +806,7 @@ no-mirror-recreate-components: .PHONY: recreate-% recreate-%: + rm -f $(TMPDIR)/download-component-$*.owl.info $(MAKE) --assume-new=$(TMPDIR)/stamp-component-$*.owl \ COMP=true IMP=false MIR=true PAT=true $(COMPONENTSDIR)/$*.owl @@ -824,23 +826,21 @@ $(TMPDIR)/stamp-component-%.owl: | $(TMPDIR) {% for component in project.components.products -%} {% if component.source is not none -%} ifeq ($(MIR),true) -.PHONY: component-download-{{ 
component.filename }} -component-download-{{ component.filename }}: | $(TMPDIR) - $(ROBOT) merge -I {{ component.source }} \{% if component.make_base %} - remove {% for iri in component.base_iris %}--base-iri {{ iri }} \ - {% endfor %}--axioms external --preserve-structure false --trim false \{% endif %} - annotate --annotation owl:versionInfo $(VERSION) --output $(TMPDIR)/$@.owl - -$(COMPONENTSDIR)/{{ component.filename }}: component-download-{{ component.filename }} $(TMPDIR)/stamp-component-{{ component.filename }} - @if cmp -s $(TMPDIR)/component-download-{{ component.filename }}.owl $(TMPDIR)/component-download-{{ component.filename }}.tmp.owl ; then \ - echo "Component identical." ; \ - else \ - echo "Component different, updating." && \ - cp $(TMPDIR)/component-download-{{ component.filename }}.owl $(TMPDIR)/component-download-{{ component.filename }}.tmp.owl && \ - $(ROBOT) annotate --input $(TMPDIR)/component-download-{{ component.filename }}.owl \ - --ontology-iri $(ONTBASE)/$@ $(ANNOTATE_ONTOLOGY_VERSION) \ - --output $@ ; \ +.PHONY: download-component-{{ component.filename }} +download-component-{{ component.filename }}: | $(TMPDIR) + @odk-helper download --output $(TMPDIR)/$@ \ + --reference $(COMPONENTSDIR)/{{ component.filename }} \ + {{ component.source }} + +$(COMPONENTSDIR)/{{ component.filename }}: download-component-{{ component.filename }} $(TMPDIR)/stamp-component-{{ component.filename }} + @if [ -f $(TMPDIR)/download-component-{{ component.filename }} ]; then \ + $(ROBOT) merge -i $(TMPDIR)/download-component-{{ component.filename }} \{% if component.make_base %} + remove {% for iri in component.base_iris %}--base-iri {{ iri }} \ + {% endfor %}--axioms external --preserve-structure false --trim false \{% endif %} + annotate --ontology-iri $(ONTBASE)/$@ $(ANNOTATE_ONTOLOGY_VERSION) \ + --output $@ ; \ fi + .PRECIOUS: $(COMPONENTSDIR)/{{ component.filename }} endif # MIR=true @@ -864,61 +864,58 @@ $(COMPONENTSDIR)/{{ component.filename }}:{% 
for mapping in component.mappings % {% endfor -%} endif # COMP=true {% endif %}{# ! project.components is not none -#} + {% if project.import_group is defined -%} # ---------------------------------------- # Mirroring upstream ontologies # ---------------------------------------- ifeq ($(MIR),true) -{% for ont in project.import_group.products %} +{% for ont in project.import_group.products -%} ## ONTOLOGY: {{ ont.id }} -{% if ont.description -%} +{% if ont.description -%} ## {{ ont.description }} -{% endif -%} +{% endif -%} -{%- if ont.mirror_type != 'no_mirror' -%} -.PHONY: mirror-{{ ont.id }} +{% if ont.mirror_type != 'no_mirror' -%} +.PHONY: download-mirror-{{ ont.id }} .PRECIOUS: $(MIRRORDIR)/{{ ont.id }}.owl -{%- endif -%} +download-mirror-{{ ont.id }}: | $(TMPDIR) + odk-helper download --output $(TMPDIR)/$@.owl \ + --reference $(MIRRORDIR)/{{ ont.id }}.owl \ + {% if ont.mirror_from %}{{ ont.mirror_from }}{% elif ont.use_base %}$(OBOBASE)/{{ ont.id }}/{{ ont.id }}-base.owl{% else %}$(OBOBASE)/{{ ont.id }}.owl{% endif %} + +{% if ont.mirror_type == 'custom' -%} +$(MIRRORDIR)/{{ ont.id }}.owl: + @echo "ERROR: You have configured your default mirror type to be custom; + @echo " this behavior needs to be overwritten in {{ project.id }}.Makefile!" 
+ @false -{%- if ont.is_large %} +{% else -%} +{% if ont.is_large -%} ifeq ($(IMP_LARGE),true) -{%- endif %} -{%- if ont.mirror_from %} -mirror-{{ ont.id }}: | $(TMPDIR) - $(ROBOT) {% if ont.make_base or 'base' == ont.mirror_type %}remove -I {{ ont.mirror_from }} {% if ont.base_iris is not none %}{% for iri in ont.base_iris %}--base-iri {{iri}} {% endfor %}{% else %}--base-iri $(OBOBASE)/{{ ont.id.upper() }} {% endif %} --axioms external --preserve-structure false --trim false{% else %}convert -I {{ ont.mirror_from }}{% endif %} -o $(TMPDIR)/$@.owl -{%- elif ont.use_base %} -{%- if ont.use_gzipped %} -mirror-{{ ont.id }}: | $(TMPDIR) - curl -L $(OBOBASE)/{{ ont.id }}/{{ ont.id }}-base.owl.gz --create-dirs -o $(MIRRORDIR)/{{ ont.id }}-base.owl.gz --retry {{project.import_group.mirror_retry_download}} --max-time {{ project.import_group.mirror_max_time_download }} && \ - $(ROBOT) {% if ont.make_base or 'base' == ont.mirror_type %}remove -i $(MIRROR_DIR)/{{ ont.id }}-base.owl.gz {% if ont.base_iris is not none %}{% for iri in ont.base_iris %}--base-iri {{iri}} {% endfor %}{ else %}--base-iri $(OBOBASE)/{{ ont.id.upper() }} {% endif %} --axioms external --preserve-structure false --trim false{% else %}convert -i $(MIRRORDIR)/{{ ont.id }}-base.owl.gz{% endif %} -o $(TMPDIR)/$@.owl -{%- else %} -mirror-{{ ont.id }}: | $(TMPDIR) - curl -L $(OBOBASE)/{{ ont.id }}/{{ ont.id }}-base.owl --create-dirs -o $(TMPDIR)/{{ ont.id }}-download.owl --retry {{ project.import_group.mirror_retry_download }} --max-time {{ project.import_group.mirror_max_time_download }} && \ - $(ROBOT) convert -i $(TMPDIR)/{{ ont.id }}-download.owl -o $(TMPDIR)/$@.owl -{%- endif %} -{%- else %} -{%- if ont.use_gzipped %} -mirror-{{ ont.id }}: | $(TMPDIR) - curl -L $(OBOBASE)/{{ ont.id }}.owl.gz --create-dirs -o $(MIRRORDIR)/{{ ont.id }}.owl.gz --retry {{ project.import_group.mirror_retry_download }} --max-time {{ project.import_group.mirror_max_time_download }} && \ - $(ROBOT) {% if ont.make_base or 'base' 
== ont.mirror_type %}remove -i $(MIRRORDIR)/{{ ont.id }}.owl.gz {% if ont.base_iris is not none %}{% for iri in ont.base_iris %}--base-iri {{iri}} {% endfor %}{% else %}--base-iri $(OBOBASE)/{{ ont.id.upper() }} {% endif %}--axioms external --preserve-structure false --trim false{% else %}convert -i $(MIRRORDIR)/{{ ont.id }}.owl.gz{% endif %} -o $(TMPDIR)/$@.owl -{%- elif 'custom' == ont.mirror_type %} -$(MIRRORDIR)/{{ ont.id }}.owl: - echo "ERROR: You have configured your default mirror type to be custom; this behavior needs to be overwritten in {{ project.id }}.Makefile!" && false -{%- elif 'no_mirror' == ont.mirror_type -%} -## You have configured your default mirror type to no_mirror. -{%- else %} -mirror-{{ ont.id }}: | $(TMPDIR) - curl -L $(OBOBASE)/{{ ont.id }}.owl --create-dirs -o $(TMPDIR)/{{ ont.id }}-download.owl --retry {{ project.import_group.mirror_retry_download }} --max-time {{ project.import_group.mirror_max_time_download }} && \ - $(ROBOT) {% if ont.make_base or 'base' == ont.mirror_type %}remove -i $(TMPDIR)/{{ ont.id }}-download.owl {% if ont.base_iris is not none %}{% for iri in ont.base_iris %}--base-iri {{iri}} {% endfor %}{% else %}--base-iri $(OBOBASE)/{{ ont.id.upper() }} {% endif %}--axioms external --preserve-structure false --trim false{% else %}convert -i $(TMPDIR)/{{ ont.id }}-download.owl{% endif %} -o $(TMPDIR)/$@.owl -{%- endif %} -{%- endif %} -{%- if ont.is_large %} +{% endif -%} +$(MIRRORDIR)/{{ ont.id }}.owl: download-mirror-{{ ont.id }} + @if [ -f $(TMPDIR)/download-mirror-{{ ont.id }}.owl ]; then \ + $(ROBOT) {% if ont.make_base or 'base' == ont.mirror_type %}remove -i $(TMPDIR)/download-mirror-{{ ont.id }}.owl \{% if ont.base_iris is not none %}{% for iri in ont.base_iris %} + --base-iri {{ iri }} \{% endfor %}{% else %} + --base-iri $(OBOBASE)/{{ ont.id.upper() }} \{% endif %} + --axioms external --preserve-structure false --trim false \ + convert {% else %}convert -i $(TMPDIR)/download-mirror-{{ ont.id }}.owl {% endif %}-o 
$@ ; \ + fi + +{% if ont.is_large -%} endif -{%- endif %} -{% endfor -%} -{% if project.import_group.use_base_merging %} +{% endif -%} + +{% endif -%} + +{% endif %}{# !ont.mirror_type != 'no_mirror' -#} + +{% endfor %}{# for ont in project.import_group.products -#} + +{% if project.import_group.use_base_merging -%} ALL_MIRRORS = $(patsubst %, $(MIRRORDIR)/%.owl, $(IMPORTS)) MERGE_MIRRORS = true @@ -927,14 +924,10 @@ $(MIRRORDIR)/merged.owl: $(ALL_MIRRORS) $(ROBOT) merge $(patsubst %, -i %, $^) {% if project.import_group.annotate_defined_by %}--annotate-defined-by true{% endif %} {% if project.import_group.base_merge_drop_equivalent_class_axioms %}remove --axioms equivalent --preserve-structure false {% endif %}-o $@ .PRECIOUS: $(MIRRORDIR)/merged.owl endif -{% endif %} - -$(MIRRORDIR)/%.owl: mirror-% | $(MIRRORDIR) - if [ -f $(TMPDIR)/mirror-$*.owl ]; then if cmp -s $(TMPDIR)/mirror-$*.owl $@ ; then echo "Mirror identical, ignoring."; else echo "Mirrors different, updating." &&\ - cp $(TMPDIR)/mirror-$*.owl $@; fi; fi +{% endif -%} endif # MIR=true -{% endif %} +{% endif %}{# project.import_group is defined -#} {% if project.subset_group is defined %} # ---------------------------------------- @@ -1633,6 +1626,7 @@ clean:{% if project.use_dosdps %} case $$reldir in .*|"") ;; *) rm -rf $$reldir/* ;; esac \ done rm -f $(CLEANFILES) + rm -f $(TMPDIR)/download-*.info .PHONY: help help: From fc03a404e95416c070565e8f084fc8ba1bfd8004 Mon Sep 17 00:00:00 2001 From: Damien Goutte-Gattat Date: Thu, 1 Jan 2026 12:12:07 +0000 Subject: [PATCH 03/11] Honour the `mirror_retry_download` and `use_gzipped` settings. Update the Makefile template so that the rule for downloading a mirror takes into account the `mirror_retry_download` option (at the ImportGroup level) and the `use_gzipped` option (at the individual import level). 
Mark the `mirror_max_time_download` option as deprecated, as (1) Python's Requests library does not offer a way to set a timeout for the entire request and (2) I happen to agree with the Requests developers when they say that total timeouts are in fact a misfeature (https://github.com/psf/requests/issues/3099#issuecomment-215498005). Read timeouts (when we timeout after not receiving a data packet for a while) should be enough to deal with flaky servers. --- src/incatools/odk/model.py | 5 ++--- src/incatools/odk/templates/src/ontology/Makefile.jinja2 | 3 ++- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/incatools/odk/model.py b/src/incatools/odk/model.py index f95d45d..4979deb 100644 --- a/src/incatools/odk/model.py +++ b/src/incatools/odk/model.py @@ -439,15 +439,14 @@ class ImportGroup(ProductGroup): mirror_retry_download: int = 4 """How many times to try downloading the source of an import module. - This corresponds to the cURL --retry parameter, + This behaves similarly to cURL’s --retry parameter, see . """ mirror_max_time_download: int = 200 """How long downloading the source of an import module is allowed to take. - This corresponds to the cURL --max-time parameter (in seconds), see - . + DEPRECATED: This option no longer has any effect. 
""" release_imports: bool = False diff --git a/src/incatools/odk/templates/src/ontology/Makefile.jinja2 b/src/incatools/odk/templates/src/ontology/Makefile.jinja2 index 7f1ee79..c356833 100644 --- a/src/incatools/odk/templates/src/ontology/Makefile.jinja2 +++ b/src/incatools/odk/templates/src/ontology/Makefile.jinja2 @@ -884,7 +884,8 @@ ifeq ($(MIR),true) download-mirror-{{ ont.id }}: | $(TMPDIR) odk-helper download --output $(TMPDIR)/$@.owl \ --reference $(MIRRORDIR)/{{ ont.id }}.owl \ - {% if ont.mirror_from %}{{ ont.mirror_from }}{% elif ont.use_base %}$(OBOBASE)/{{ ont.id }}/{{ ont.id }}-base.owl{% else %}$(OBOBASE)/{{ ont.id }}.owl{% endif %} + --max-retry {{ project.import_group.mirror_retry_download }} \ + {% if ont.mirror_from %}{{ ont.mirror_from }}{% elif ont.use_base %}$(OBOBASE)/{{ ont.id }}/{{ ont.id }}-base.owl{% if ont.use_gzipped %}.gz{% endif %}{% else %}$(OBOBASE)/{{ ont.id }}.owl{% if ont.use_gzipped %}.gz{% endif %}{% endif %} {% if ont.mirror_type == 'custom' -%} $(MIRRORDIR)/{{ ont.id }}.owl: From 8aac485963f2d67a26d65a907c286463b6500cd1 Mon Sep 17 00:00:00 2001 From: Damien Goutte-Gattat Date: Thu, 1 Jan 2026 16:22:04 +0000 Subject: [PATCH 04/11] Retry downloads by default. We almost regularly run into transient network errors when running the test suite. Better to systematically retry downloads by default. --- src/incatools/odk/helper.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/incatools/odk/helper.py b/src/incatools/odk/helper.py index c1e31c0..8352dcf 100644 --- a/src/incatools/odk/helper.py +++ b/src/incatools/odk/helper.py @@ -163,7 +163,12 @@ def info(tools) -> None: default is the same location as specified with the --reference option plus an added '.info' extension.""", ) -@click.option("--max-retry", default=0, metavar="N", help="Retry at most N times.") +@click.option( + "--max-retry", + default=4, + metavar="N", + help="""Retry at most N times. Default is 4. 
Set to zero to disable retrying.""", +) @click.option( "--try-gzip/--no-try-gzip", default=True, From d82f6e89e3543e768107d5b5696822309ddae66f Mon Sep 17 00:00:00 2001 From: Damien Goutte-Gattat Date: Thu, 1 Jan 2026 16:26:45 +0000 Subject: [PATCH 05/11] Wait for one second before re-trying. If we have to retry a download because of a transient network issue, wait for one second before attempting again. --- src/incatools/odk/download.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/incatools/odk/download.py b/src/incatools/odk/download.py index 65a6bbe..0098599 100644 --- a/src/incatools/odk/download.py +++ b/src/incatools/odk/download.py @@ -15,6 +15,7 @@ from hashlib import sha256 from io import BufferedIOBase, BytesIO from pathlib import Path +from time import sleep from typing import Dict, Iterator, Optional, Union from urllib.parse import urlparse @@ -168,6 +169,7 @@ def download_file( logging.warn( f"{output.name}: Transient HTTP error, retrying ({n_try}/{max_retry}" ) + sleep(1) else: response.raise_for_status() except requests.exceptions.ConnectTimeout: @@ -177,6 +179,7 @@ def download_file( logging.warn( f"{output.name}: Timeout when connecting to {hostname}, retrying ({n_try}/{max_retry})" ) + sleep(1) else: raise DownloadError(f"Timeout when connecting to {hostname}") except requests.exceptions.ConnectionError: From ebd58d95680ac92fcac24f0f088bec10e65e243d Mon Sep 17 00:00:00 2001 From: Damien Goutte-Gattat Date: Fri, 9 Jan 2026 21:30:27 +0000 Subject: [PATCH 06/11] Use an explicit selection of linting rules. The default rules selected by `ruff check` do not include the `isort` rules (I), which I like to always use. Better to hardcode that in the pyproject file rather than having to remember to select them on the command line. 
--- pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 160cb25..44d0005 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,6 +46,9 @@ source-include = [ "tests/**", ] +[tool.ruff.lint] +select = [ "E4", "E7", "E9", "F", "I" ] + [dependency-groups] dev = [ "ruff", From c3c0c7d35a3aad011dd5f3a2e6095c22b9144a88 Mon Sep 17 00:00:00 2001 From: Damien Goutte-Gattat Date: Fri, 9 Jan 2026 22:35:41 +0000 Subject: [PATCH 07/11] Add unit tests for new download_file function. Add a test case for the `download_file` function that is at the core of the `odk-helper download` command. Update the `code-quality.yml` workflow to (1) add the `tests` directory to the linting, formatting, and static typing checks, and (2) run the unit tests. --- .github/workflows/code-quality.yml | 12 +++-- pyproject.toml | 5 ++ tests/test_download.py | 76 ++++++++++++++++++++++++++++++ 3 files changed, 89 insertions(+), 4 deletions(-) create mode 100644 tests/test_download.py diff --git a/.github/workflows/code-quality.yml b/.github/workflows/code-quality.yml index 15dc21b..1adda54 100644 --- a/.github/workflows/code-quality.yml +++ b/.github/workflows/code-quality.yml @@ -1,4 +1,4 @@ -name: Static code quality tests +name: Static code quality tests and unit tests on: push: @@ -35,8 +35,12 @@ jobs: - name: Install Python dependencies run: uv sync --dev --all-extras - name: Code linting - run: uv run ruff check src + run: uv run ruff check src tests - name: Code formatting - run: uv run ruff format --check src + run: uv run ruff format --check src tests - name: Static type checking - run: uv run mypy src + run: | + uv run mypy src + uv run mypy tests + - name: Unit tests + run: uv run pytest tests diff --git a/pyproject.toml b/pyproject.toml index 160cb25..8622ba5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,10 +46,15 @@ source-include = [ "tests/**", ] +[[tool.mypy.overrides]] +module = ["incatools.odk.*"] +follow_untyped_imports = 
true + [dependency-groups] dev = [ "ruff", "mypy", + "pytest", "types-PyYAML", "types-defusedxml", "types-requests", diff --git a/tests/test_download.py b/tests/test_download.py new file mode 100644 index 0000000..7e87bac --- /dev/null +++ b/tests/test_download.py @@ -0,0 +1,76 @@ +# odkcore - Ontology Development Kit Core +# Copyright © 2026 ODK Developers +# +# This file is part of the ODK Core project and distributed under the +# terms of a 3-clause BSD license. See the LICENSE file in that project +# for the detailed conditions. + +import unittest + +from pathlib import Path +from time import sleep + +from incatools.odk.download import download_file, RemoteFileInfo + + +class TestDownload(unittest.TestCase): + """A test case for the download function.""" + + downloaded_file: Path + url: str + + def setUp(self) -> None: + self.downloaded_file = Path("xxx-downloaded_file.txt") + self.url = "https://raw.githubusercontent.com/INCATools/odkcore/refs/heads/main/README.md" + + def tearDown(self) -> None: + if self.downloaded_file.exists(): + self.downloaded_file.unlink() + + def test_download_once(self) -> None: + """Test that we do not re-download a file needlessly.""" + + ri = RemoteFileInfo() + code = download_file(self.url, self.downloaded_file, ri, max_retry=5) + if code != 200: + self.skipTest(f"Could not dowload from {self.url}, cannot test") + return + + self.assertTrue(self.downloaded_file.exists()) + mtime = self.downloaded_file.stat().st_mtime + + sleep(1) + code = download_file(self.url, self.downloaded_file, ri, max_retry=5) + + # Either the server tols us the file had not been modified, or + # the download function checked the newly downloaded file is the + # same as the one downloaded just before. Either way, we should + # get a 304, and the file should not have been touched. 
+ self.assertEqual(304, code) + self.assertEqual(mtime, self.downloaded_file.stat().st_mtime) + + def test_no_overwrite_identical_file(self) -> None: + """Test that a re-downloaded file is not overwritten needlessly.""" + + ri = RemoteFileInfo() + code = download_file(self.url, self.downloaded_file, ri, max_retry=5) + if code != 200: + self.skipTest(f"Could not download from {self.url}, cannot test") + return + + mtime = self.downloaded_file.stat().st_mtime + + # Remove server-side data from the RemoteFileInfo, so that the + # server cannot tell us whether the file has been modified or + # not, thereby forcing us to re-download the file. + ri.etag = None + ri.time = None + + sleep(1) + code = download_file(self.url, self.downloaded_file, ri, max_retry=5) + + # Even if we have re-downloaded the file, the function should + # still report that as a 304 and the previously downloaded file + # should not have been touched. + self.assertEqual(304, code) + self.assertEqual(mtime, self.downloaded_file.stat().st_mtime) From ea227668181b0c5ce4317dd9fbc5f88e3b85c832 Mon Sep 17 00:00:00 2001 From: Damien Goutte-Gattat Date: Fri, 9 Jan 2026 22:42:16 +0000 Subject: [PATCH 08/11] Fix import declarations (isort). --- tests/test_download.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_download.py b/tests/test_download.py index 7e87bac..e2be5a5 100644 --- a/tests/test_download.py +++ b/tests/test_download.py @@ -6,11 +6,10 @@ # for the detailed conditions. import unittest - from pathlib import Path from time import sleep -from incatools.odk.download import download_file, RemoteFileInfo +from incatools.odk.download import RemoteFileInfo, download_file class TestDownload(unittest.TestCase): From 9ae0876cbaaea7103c51e1eb39264b64e15e7799 Mon Sep 17 00:00:00 2001 From: Damien Goutte-Gattat Date: Mon, 12 Jan 2026 19:39:04 +0000 Subject: [PATCH 09/11] Do not use `logging.warn()`. 
The `logging.warn()` method is deprecated, the correct method is `logging.warning()` instead. --- src/incatools/odk/download.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/incatools/odk/download.py b/src/incatools/odk/download.py index 0098599..eabcb6b 100644 --- a/src/incatools/odk/download.py +++ b/src/incatools/odk/download.py @@ -162,11 +162,11 @@ def download_file( logging.info(f"{output.name}: Not modified at {url}") return 304 elif response.status_code == 404: - logging.warn(f"{output.name}: Not found at {url}") + logging.warning(f"{output.name}: Not found at {url}") return 404 elif response.status_code in RETRIABLE_HTTP_ERRORS and n_try < max_retry: n_try += 1 - logging.warn( + logging.warning( f"{output.name}: Transient HTTP error, retrying ({n_try}/{max_retry}" ) sleep(1) @@ -176,7 +176,7 @@ def download_file( # `curl --retry` retries on timeout errors, and so do we if n_try < max_retry: n_try += 1 - logging.warn( + logging.warning( f"{output.name}: Timeout when connecting to {hostname}, retrying ({n_try}/{max_retry})" ) sleep(1) From ecc1886e50d367c6520aaaa756c1215f43ca6c9d Mon Sep 17 00:00:00 2001 From: Damien Goutte-Gattat Date: Mon, 12 Jan 2026 19:40:31 +0000 Subject: [PATCH 10/11] Remove unneeded `return`. `skipTest` works by raising an exception, so it automatically interrupts the current code flow, there's no need for an explicit `return`. 
--- tests/test_download.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_download.py b/tests/test_download.py index e2be5a5..04e533b 100644 --- a/tests/test_download.py +++ b/tests/test_download.py @@ -33,7 +33,6 @@ def test_download_once(self) -> None: code = download_file(self.url, self.downloaded_file, ri, max_retry=5) if code != 200: self.skipTest(f"Could not dowload from {self.url}, cannot test") - return self.assertTrue(self.downloaded_file.exists()) mtime = self.downloaded_file.stat().st_mtime @@ -55,7 +54,6 @@ def test_no_overwrite_identical_file(self) -> None: code = download_file(self.url, self.downloaded_file, ri, max_retry=5) if code != 200: self.skipTest(f"Could not download from {self.url}, cannot test") - return mtime = self.downloaded_file.stat().st_mtime From 07ecf28c2c4855139382ac403b04dd0bee470927 Mon Sep 17 00:00:00 2001 From: Damien Goutte-Gattat Date: Mon, 12 Jan 2026 19:41:49 +0000 Subject: [PATCH 11/11] Typo fixes. --- tests/test_download.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_download.py b/tests/test_download.py index 04e533b..b10f7c5 100644 --- a/tests/test_download.py +++ b/tests/test_download.py @@ -32,7 +32,7 @@ def test_download_once(self) -> None: ri = RemoteFileInfo() code = download_file(self.url, self.downloaded_file, ri, max_retry=5) if code != 200: - self.skipTest(f"Could not dowload from {self.url}, cannot test") + self.skipTest(f"Could not download from {self.url}, cannot test") self.assertTrue(self.downloaded_file.exists()) mtime = self.downloaded_file.stat().st_mtime @@ -40,7 +40,7 @@ def test_download_once(self) -> None: sleep(1) code = download_file(self.url, self.downloaded_file, ri, max_retry=5) - # Either the server tols us the file had not been modified, or + # Either the server told us the file had not been modified, or # the download function checked the newly downloaded file is the # same as the one downloaded just before. 
Either way, we should # get a 304, and the file should not have been touched.