From ee60050b68d1bd73aa83e6e779fed379f3f91f26 Mon Sep 17 00:00:00 2001 From: Philippe Saade Date: Fri, 3 Apr 2026 11:35:14 +0200 Subject: [PATCH 1/9] Adding Ruff Lint: adding missing docstrings, improving consistency, updating README --- .github/workflows/pylint.yml | 23 +++++ README.md | 96 ++++++------------- main.py | 45 +++++---- pyproject.toml | 25 +++++ src/Normalizer/JSONNormalizer.py | 30 +++++- src/Normalizer/TTLNormalizer.py | 45 ++++++--- src/Normalizer/__init__.py | 6 +- src/Textifier/WikidataTextifier.py | 45 ++++++++- src/Textifier/__init__.py | 20 +++- src/WikidataLabel.py | 147 ++++++++++++++++++----------- src/__init__.py | 40 +++++++- src/utils.py | 62 ++++++++---- uv.lock | 33 +++++++ 13 files changed, 438 insertions(+), 179 deletions(-) create mode 100644 .github/workflows/pylint.yml diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml new file mode 100644 index 0000000..c73e032 --- /dev/null +++ b/.github/workflows/pylint.yml @@ -0,0 +1,23 @@ +name: Pylint + +on: [push] + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.8", "3.9", "3.10"] + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pylint + - name: Analysing the code with pylint + run: | + pylint $(git ls-files '*.py') diff --git a/README.md b/README.md index 8ab9065..274b422 100644 --- a/README.md +++ b/README.md @@ -1,81 +1,47 @@ # Wikidata Textifier -**Wikidata Textifier** is an API that transforms Wikidata items into compact format for use in LLMs and GenAI applications. It resolves missing labels of properties and claim values by querying the Wikidata Action API, making it efficient and suitable for AI pipelines. 
+**Wikidata Textifier** is an API that transforms Wikidata entities into compact outputs for LLM and GenAI use cases. +It resolves missing labels for properties and claim values using the Wikidata Action API and caches labels to reduce repeated lookups. -🔗 Live API: [https://wd-textify.toolforge.org/](https://wd-textify.toolforge.org/) +Live API: [wd-textify.wmcloud.org](https://wd-textify.wmcloud.org/) +API Docs: [wd-textify.wmcloud.org/docs](https://wd-textify.wmcloud.org/docs) ---- +## Features -## Functionalities +- Textify Wikidata entities as `json`, `text`, or `triplet`. +- Resolve labels for linked entities and properties. +- Cache labels in MariaDB for faster repeated requests. +- Support multilingual output with fallback language support. +- Avoid SPARQL and use Wikidata Action API / EntityData endpoints. -- **Textifies** any Wikidata item into a readable or JSON format suitable for LLMs. -- **Resolves all labels**, including those missing when querying the Wikidata API. -- **Caches labels** for 90 days to boost performance and reduce API load. -- **Avoids SPARQL** and uses the Wikidata Action API for better efficiency and compatibility. -- **Hosted on Toolforge**: [https://wd-textify.toolforge.org/](https://wd-textify.toolforge.org/) +## Output Formats ---- +- `json`: Structured representation with claims (and optionally qualifiers/references). +- `text`: Readable summary including label, description, aliases, and attributes. +- `triplet`: Triplet-style lines with labels and IDs for graph-style traversal. -## Formats - -- **Text**: A textual representation or summary of the Wikidata item, including its label, description, aliases, and claims. Useful for helping LLMs understand what the item represents. -- **Triplet**: Outputs each triplet as a structured line, including labels and IDs, but omits descriptions and aliases. Ideal for agentic LLMs to traverse and explore Wikidata. 
-- **JSON**: A structured and compact representation of the full item, suitable for custom formats. - ---- - -## API Usage +## API ### `GET /` -#### Query Parameters - -| Name | Type | Required | Description | -|----------------|---------|----------|-----------------------------------------------------------------------------| -| `id` | string | Yes | Wikidata item ID (e.g., `Q42`) | -| `lang` | string | No | Language code for labels (default: `en`) | -| `format` | string | No | The format of the response, either 'json', 'text', or 'triplet' (default: `json`) | -| `external_ids` | bool | No | Whether to include external IDs in the output (default: `true`) | -| `all_ranks` | bool | No | If false, returns ranked preferred statements, falling back to normal when unavailable (default: `false`) | -| `references` | bool | No | Whether to include references (default: `false`) | -| `fallback_lang` | string | No | Fallback language code if the preferred language is not available (default: `en`) | - ---- - -## Deploy to Toolforge - -1. Shell into the Toolforge system: - -```bash -ssh [UNIX shell username]@login.toolforge.org -``` - -2. Switch to tool user account: - -```bash -become wd-textify -``` - -3. Build from Git: - -```bash -toolforge build start https://github.com/philippesaade-wmde/WikidataTextifier.git -``` +#### Query parameters -4. Start the web service: +| Name | Type | Required | Description | +|---|---|---|---| +| `id` | string | Yes | Comma-separated Wikidata IDs (for example: `Q42` or `Q42,Q2`). | +| `pid` | string | No | Comma-separated property IDs to filter claims (for example: `P31,P279`). | +| `lang` | string | No | Preferred language code (default: `en`). | +| `fallback_lang` | string | No | Fallback language code (default: `en`). | +| `format` | string | No | Output format: `json`, `text`, or `triplet` (default: `json`). | +| `external_ids` | bool | No | Include `external-id` datatype claims (default: `true`). 
| +| `all_ranks` | bool | No | Include all statement ranks instead of preferred/normal filtering (default: `false`). | +| `qualifiers` | bool | No | Include qualifiers in claim values (default: `true`). | +| `references` | bool | No | Include references in claim values (default: `false`). | -```bash -webservice buildservice start --mount all -``` - -5. Debugging the web service: - -Read the logs: -```bash -webservice logs -``` +#### Example requests -Open the service shell: ```bash -webservice shell +curl "https://wd-textify.wmcloud.org/?id=Q42" +curl "https://wd-textify.wmcloud.org/?id=Q42&format=text&lang=en" +curl "https://wd-textify.wmcloud.org/?id=Q42,Q2&pid=P31,P279&format=triplet" ``` diff --git a/main.py b/main.py index f999d75..be09041 100644 --- a/main.py +++ b/main.py @@ -1,14 +1,16 @@ -from fastapi import FastAPI, HTTPException, Query, Request -from fastapi.middleware.cors import CORSMiddleware -from fastapi import BackgroundTasks +"""FastAPI application that exposes Wikidata textification endpoints.""" + +import os +import time import traceback + import requests -import time -import os +from fastapi import BackgroundTasks, FastAPI, HTTPException, Query, Request +from fastapi.middleware.cors import CORSMiddleware -from src.Normalizer import TTLNormalizer, JSONNormalizer -from src.WikidataLabel import WikidataLabel, LazyLabelFactory from src import utils +from src.Normalizer import JSONNormalizer, TTLNormalizer +from src.WikidataLabel import LazyLabelFactory, WikidataLabel # Start Fastapi app app = FastAPI( @@ -34,6 +36,7 @@ @app.on_event("startup") async def startup(): + """Initialize database resources required by the API.""" WikidataLabel.initialize_database() @app.get( @@ -71,22 +74,26 @@ async def get_textified_wd( qualifiers: bool = True, fallback_lang: str = 'en' ): - """ - Retrieve a Wikidata item with all labels or textual representations for an LLM. + """Return normalized Wikidata entities in JSON, text, or triplet format. 
Args: - id (str): The Wikidata item ID (e.g., "Q42"). - pid (str): Comma-separated list of property IDs to filter claims (e.g., "P31,P279"). - format (str): The format of the response, either 'json', 'text', or 'triplet'. - lang (str): The language code for labels (default is 'en'). - external_ids (bool): If True, includes external IDs in the response. - all_ranks (bool): If True, includes statements of all ranks (preferred, normal, deprecated). - references (bool): If True, includes references in the response. (only available in JSON format) - qualifiers (bool): If True, includes qualifiers in the response. - fallback_lang (str): The fallback language code if the preferred language is not available. + request (Request): Incoming request object (currently unused). + background_tasks (BackgroundTasks): Background task queue for periodic cache cleanup. + id (str): Comma-separated entity IDs (for example, ``"Q42,Q2"``). + pid (str): Optional comma-separated property IDs used to filter claims. + lang (str): Preferred language code for labels and formatted values. + format (str): Output format: ``"json"``, ``"text"``, or ``"triplet"``. + external_ids (bool): Whether to include claims with the ``external-id`` datatype. + references (bool): Whether to include references in claim values. + all_ranks (bool): Whether to include all statement ranks (preferred, normal, deprecated). + qualifiers (bool): Whether to include qualifiers in claim values. + fallback_lang (str): Fallback language when ``lang`` is unavailable. Returns: - list: A list of dictionaries containing QIDs and the similarity scores. + dict[str, object | None]: Mapping of requested QIDs to their normalized payloads. + + Raises: + HTTPException: If an entity is not found, an upstream request fails, or internal processing fails. 
""" try: filter_pids = [] diff --git a/pyproject.toml b/pyproject.toml index 32e80d5..098c4c8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,3 +13,28 @@ dependencies = [ "sqlalchemy>=2.0.41", "uvicorn>=0.35.0", ] + +[dependency-groups] +dev = [ + "ruff>=0.9.0" +] + +[tool.ruff] +target-version = "py313" +line-length = 120 + +[tool.ruff.lint] +select = [ + "E", # pycodestyle errors + "F", # Pyflakes (catches undefined names, unused imports, etc.) + "I", # isort (import sorting) + "D", # pydocstyle (function/class documentation) +] + +[tool.ruff.lint.pydocstyle] +convention = "google" + +[tool.ruff.lint.isort] +known-first-party = [ + "wikidatasearch" +] diff --git a/src/Normalizer/JSONNormalizer.py b/src/Normalizer/JSONNormalizer.py index 87773f8..15a5cdd 100644 --- a/src/Normalizer/JSONNormalizer.py +++ b/src/Normalizer/JSONNormalizer.py @@ -1,9 +1,11 @@ +"""Normalize Wikidata Action API JSON into internal textifier objects.""" + from __future__ import annotations from typing import Any, Dict, List, Optional + import requests -from ..WikidataLabel import WikidataLabel, LazyLabelFactory from ..Textifier.WikidataTextifier import ( WikidataClaim, WikidataClaimValue, @@ -14,11 +16,11 @@ WikidataTime, ) from ..utils import wikidata_geolocation_to_text, wikidata_time_to_text +from ..WikidataLabel import LazyLabelFactory, WikidataLabel class JSONNormalizer: - """Build WikidataEntity + claims tree from Wikidata JSON (wbgetentities style). - """ + """Normalize ``wbgetentities`` JSON into internal textifier objects.""" def __init__( self, @@ -29,6 +31,16 @@ def __init__( label_factory: Optional[LazyLabelFactory] = None, debug: bool = False, ): + """Initialize a normalizer for a single entity payload. + + Args: + entity_id (str): Entity ID being normalized. + entity_json (dict[str, Any]): Raw ``wbgetentities`` JSON for ``entity_id``. + lang (str): Preferred language for label selection. + fallback_lang (str): Fallback language when ``lang`` is unavailable. 
+ label_factory (LazyLabelFactory | None): Shared lazy label factory for nested entities. + debug (bool): Whether to print additional debug output while parsing. + """ self.entity_id = entity_id self.entity_json = entity_json @@ -51,6 +63,18 @@ def normalize( qualifiers: bool = True, filter_pids: List[str] = [], ) -> WikidataEntity: + """Normalize the entity JSON payload into a ``WikidataEntity`` tree. + + Args: + external_ids (bool): Whether to include ``external-id`` datatype claims. + references (bool): Whether to include references for each statement value. + all_ranks (bool): Whether to include statements of all ranks. + qualifiers (bool): Whether to include qualifiers for statement values. + filter_pids (list[str]): Optional allow-list of property IDs to keep. + + Returns: + WikidataEntity: Parsed entity object with claims and values. + """ e = self.entity_json if not isinstance(e, dict) or "labels" not in e: if self.debug: diff --git a/src/Normalizer/TTLNormalizer.py b/src/Normalizer/TTLNormalizer.py index c3c45a1..2d8af38 100644 --- a/src/Normalizer/TTLNormalizer.py +++ b/src/Normalizer/TTLNormalizer.py @@ -1,13 +1,14 @@ +"""Normalize Wikidata TTL into internal textifier objects.""" + from __future__ import annotations from collections import defaultdict from typing import Any, DefaultDict, Dict, List, Optional, Set -import requests +import requests from rdflib import Graph, Literal, Namespace, URIRef from rdflib.namespace import RDF, RDFS -from ..WikidataLabel import WikidataLabel, LazyLabelFactory from ..Textifier.WikidataTextifier import ( WikidataClaim, WikidataClaimValue, @@ -18,9 +19,9 @@ WikidataTime, ) from ..utils import wikidata_geolocation_to_text, wikidata_time_to_text +from ..WikidataLabel import LazyLabelFactory, WikidataLabel - -# Namespaces used by Wikidata TTL exports +# Namespaces used by Wikidata TTL WD = Namespace("http://www.wikidata.org/entity/") P = Namespace("http://www.wikidata.org/prop/") PS = 
Namespace("http://www.wikidata.org/prop/statement/") @@ -39,18 +40,18 @@ class TTLNormalizer: - """Parse a Wikidata Special:EntityData TTL and build a WikidataEntity with claims. + """Normalize ``Special:EntityData`` TTL into internal textifier objects. Label resolution order: - 1) labels present in TTL - 2) LazyLabelFactory bulk lookup for the remainder + 1) Labels present in TTL. + 2) ``LazyLabelFactory`` bulk lookup for unresolved IDs. Notes: - - Claims are extracted from wd: p:

triples only. - - Statement nodes are validated structurally before value extraction. - - Special values (somevalue/novalue) are treated as "no main value" when + - Claims are extracted from ``wd: p:

`` triples only. + - Statement nodes are validated structurally before value extraction. + - Special values (somevalue/novalue) are treated as "no main value" when neither ps: nor psv: is present on the statement node. - - Property datatype is read from wikibase:propertyType when available, + - Property datatype is read from ``wikibase:propertyType`` when available, otherwise inferred from the statement's value nodes when possible. """ @@ -63,6 +64,16 @@ def __init__( label_factory: Optional[LazyLabelFactory] = None, debug: bool = False, ): + """Initialize a normalizer for a single TTL document. + + Args: + entity_id (str): Entity ID being normalized. + ttl_text (str): Raw TTL document from ``Special:EntityData``. + lang (str): Preferred language for label selection. + fallback_lang (str): Fallback language when ``lang`` is unavailable. + label_factory (LazyLabelFactory | None): Shared lazy label factory for nested entities. + debug (bool): Whether to print additional debug output while parsing. + """ self.entity_id = entity_id self.g = Graph() self.g.parse(data=ttl_text, format="turtle") @@ -85,6 +96,18 @@ def normalize( qualifiers: bool = True, filter_pids: List[str] = [] ) -> WikidataEntity: + """Normalize the parsed graph into a ``WikidataEntity`` tree. + + Args: + external_ids (bool): Whether to include ``external-id`` datatype claims. + references (bool): Whether to include references for each statement value. + all_ranks (bool): Whether to include statements of all ranks. + qualifiers (bool): Whether to include qualifiers for statement values. + filter_pids (list[str]): Optional allow-list of property IDs to keep. + + Returns: + WikidataEntity: Parsed entity object with claims and values. + """ # Preload labels found inside TTL so LazyLabelFactory can avoid lookups. 
self.label_factory._resolved_labels = self._build_label_cache_from_ttl() diff --git a/src/Normalizer/__init__.py b/src/Normalizer/__init__.py index d1d327c..b35407a 100644 --- a/src/Normalizer/__init__.py +++ b/src/Normalizer/__init__.py @@ -1,2 +1,6 @@ +"""Public exports for normalizer classes.""" + +from .JSONNormalizer import JSONNormalizer from .TTLNormalizer import TTLNormalizer -from .JSONNormalizer import JSONNormalizer \ No newline at end of file + +__all__ = ["JSONNormalizer", "TTLNormalizer"] diff --git a/src/Textifier/WikidataTextifier.py b/src/Textifier/WikidataTextifier.py index d09b909..a34ed1b 100644 --- a/src/Textifier/WikidataTextifier.py +++ b/src/Textifier/WikidataTextifier.py @@ -1,9 +1,11 @@ +"""Data structures for Wikidata entities and serialization helpers.""" + from __future__ import annotations +import json from dataclasses import dataclass, field from pathlib import Path from typing import Any, Dict, List, Optional, Union -import json LANGUAGE_VARIABLES_PATH = Path(__file__).with_name("language_variables.json") with LANGUAGE_VARIABLES_PATH.open("r", encoding="utf-8") as f: @@ -15,32 +17,42 @@ @dataclass(slots=True) class WikidataText: + """Object for Wikidata plain text values.""" + text: Optional[str] = None def __str__(self) -> str: + """Return the text representation.""" return self.text or "" def __bool__(self) -> bool: + """Return whether this text wrapper contains content.""" return bool(self.text) def to_json(self) -> Optional[str]: + """Serialize to a JSON-friendly scalar.""" return self.text @dataclass(slots=True) class WikidataCoordinates: + """Object for Wikidata coordinate values.""" + latitude: Optional[float] = None longitude: Optional[float] = None string_val: Optional[str] = None def __str__(self) -> str: + """Return a readable coordinate string.""" return self.string_val or "" def __bool__(self) -> bool: + """Return whether both latitude and longitude are present.""" # coordinates are meaningful if we have both lat/lon 
return self.latitude is not None and self.longitude is not None def to_json(self) -> Dict[str, Any]: + """Serialize coordinates to a JSON object.""" return { "latitude": self.latitude, "longitude": self.longitude, @@ -50,18 +62,23 @@ def to_json(self) -> Dict[str, Any]: @dataclass(slots=True) class WikidataTime: + """Object for Wikidata time values.""" + time: Optional[str] = None precision: Optional[int] = None calendarmodel: Optional[str] = None string_val: Optional[str] = None def __str__(self) -> str: + """Return a readable time string.""" return self.string_val or "" def __bool__(self) -> bool: + """Return whether this instance contains a time value.""" return bool(self.time) or bool(self.string_val) def to_json(self) -> Dict[str, Any]: + """Serialize time to a JSON object.""" return { "time": self.time, "precision": self.precision, @@ -72,11 +89,14 @@ def to_json(self) -> Dict[str, Any]: @dataclass(slots=True) class WikidataQuantity: + """Object for Wikidata quantity values.""" + amount: Optional[str] = None unit: Optional[Any] = None unit_id: Optional[str] = None def __str__(self) -> str: + """Return a readable quantity string.""" if not self.amount: return "" if self.unit_id: @@ -84,9 +104,11 @@ def __str__(self) -> str: return str(self.amount) def __bool__(self) -> bool: + """Return whether this quantity has an amount.""" return bool(self.amount) def to_json(self) -> Any: + """Serialize quantity to a scalar or object.""" if not self.amount: return None if self.unit_id: @@ -104,6 +126,8 @@ def to_json(self) -> Any: @dataclass(slots=True) class WikidataEntity: + """Object for Wikidata entities.""" + id: str label: Optional[Any] = None description: Optional[str] = None @@ -111,9 +135,11 @@ class WikidataEntity: claims: List["WikidataClaim"] = field(default_factory=list) def __bool__(self) -> bool: + """Return whether this entity has a usable id and label.""" return bool(self.id) and self.label is not None and str(self.label) != "" def to_text(self, lang='en', 
keep_empty: bool = False) -> str: + """Render the entity into a readable text.""" lang_var = LANGUAGE_VARIABLES.get(lang, LANGUAGE_VARIABLES.get('en')) label_str = str(self.label) if self.label else '' @@ -125,9 +151,7 @@ def to_text(self, lang='en', keep_empty: bool = False) -> str: string += f"{lang_var[', ']}{lang_var['also known as']}" string += f" {lang_var[', '].join(map(str, self.aliases))}" - attributes = [c.to_text(lang, keep_empty=keep_empty) \ - for c in self.claims \ - if keep_empty or c] + attributes = [c.to_text(lang) for c in self.claims if keep_empty or c] if len(attributes) > 0: attributes = "\n- ".join(attributes) string += f". {lang_var['Attributes include']}:\n- {attributes}" @@ -137,6 +161,7 @@ def to_text(self, lang='en', keep_empty: bool = False) -> str: return string def to_json(self) -> Dict[str, Any]: + """Serialize the entity to a JSON object.""" id_key = "PID" if self.id.startswith("P") else "QID" return { id_key: self.id, @@ -147,6 +172,7 @@ def to_json(self) -> Dict[str, Any]: } def to_triplet(self) -> str: + """Render the entity as triplet lines.""" head = f"{str(self.label) if self.label else ''} ({self.id})" lines: List[str] = [] if self.description: @@ -164,12 +190,15 @@ def to_triplet(self) -> str: @dataclass(slots=True) class WikidataClaim: + """Object for Wikidata claims.""" + subject: WikidataEntity property: WikidataEntity values: List["WikidataClaimValue"] = field(default_factory=list) datatype: str = "string" def __bool__(self) -> bool: + """Return whether this claim contains a value.""" return ( self.property is not None and str(self.property.label) != "" @@ -178,6 +207,7 @@ def __bool__(self) -> bool: ) def to_text(self, lang='en') -> str: + """Render the claim into a readable text.""" lang_var = LANGUAGE_VARIABLES.get(lang, LANGUAGE_VARIABLES.get('en')) if self.values: @@ -187,6 +217,7 @@ def to_text(self, lang='en') -> str: return f"{lang_var['has']} {str(self.property.label)}" def to_json(self) -> Dict[str, Any]: + 
"""Serialize the claim to a JSON object.""" prop_json = self.property.to_json() prop_id = prop_json.get("PID") or prop_json.get("QID") return { @@ -197,6 +228,7 @@ def to_json(self) -> Dict[str, Any]: } def to_triplet(self, as_qualifier: bool = False) -> str: + """Render the claim as triplet text.""" if not self: return "" @@ -216,6 +248,7 @@ def to_triplet(self, as_qualifier: bool = False) -> str: @dataclass(slots=True) class WikidataClaimValue: + """Object for Wikidata claim values.""" claim: WikidataClaim value: Optional[ Union[WikidataEntity, WikidataQuantity, WikidataTime, WikidataCoordinates, WikidataText] @@ -225,9 +258,11 @@ class WikidataClaimValue: rank: Optional[str] = None # preferred|normal|deprecated def __bool__(self) -> bool: + """Return whether this claim value has non-empty values.""" return self.value is not None and str(self.value) != "" def to_text(self, lang='en') -> str: + """Render the value and qualifiers as readable text.""" lang_var = LANGUAGE_VARIABLES.get(lang, LANGUAGE_VARIABLES.get('en')) if not self: @@ -248,6 +283,7 @@ def to_text(self, lang='en') -> str: return s def to_json(self) -> Optional[Dict[str, Any]]: + """Serialize the claim value to a JSON object.""" if not self: return None @@ -280,6 +316,7 @@ def to_json(self) -> Optional[Dict[str, Any]]: return out def to_triplet(self) -> str: + """Render the value as triplet text.""" if not self: return "" diff --git a/src/Textifier/__init__.py b/src/Textifier/__init__.py index 18af4cb..218caa9 100644 --- a/src/Textifier/__init__.py +++ b/src/Textifier/__init__.py @@ -1 +1,19 @@ -from .WikidataTextifier import WikidataEntity, WikidataClaim, WikidataClaimValue, WikidataCoordinates, WikidataTime, WikidataQuantity \ No newline at end of file +"""Public exports for textifier data structures.""" + +from .WikidataTextifier import ( + WikidataClaim, + WikidataClaimValue, + WikidataCoordinates, + WikidataEntity, + WikidataQuantity, + WikidataTime, +) + +__all__ = [ + "WikidataClaim", + 
"WikidataClaimValue", + "WikidataCoordinates", + "WikidataEntity", + "WikidataQuantity", + "WikidataTime", +] diff --git a/src/WikidataLabel.py b/src/WikidataLabel.py index c082e57..04d04e8 100644 --- a/src/WikidataLabel.py +++ b/src/WikidataLabel.py @@ -1,15 +1,14 @@ -from sqlalchemy import Column, String, DateTime, create_engine, text -from sqlalchemy.dialects.mysql import JSON -from sqlalchemy.orm import sessionmaker, declarative_base +"""Label cache and lazy label resolution for Wikidata entities.""" -from .utils import get_wikidata_json_by_ids -from datetime import datetime, timedelta -import os import json +import os +from datetime import datetime, timedelta -""" -MySQL database setup for storing Wikidata labels in all languages. -""" +from sqlalchemy import Column, DateTime, String, create_engine, text +from sqlalchemy.dialects.mysql import JSON +from sqlalchemy.orm import declarative_base, sessionmaker + +from .utils import get_wikidata_json_by_ids DB_HOST = os.environ.get("DB_HOST", "localhost") DB_NAME = os.environ.get("DB_NAME", "label") @@ -38,7 +37,10 @@ Base = declarative_base() Session = sessionmaker(bind=engine, expire_on_commit=False) + class WikidataLabel(Base): + """Database-backed cache for multilingual Wikidata labels.""" + __tablename__ = 'labels' id = Column(String(64), primary_key=True) labels = Column(JSON, default=dict) @@ -46,9 +48,7 @@ class WikidataLabel(Base): @staticmethod def initialize_database(): - """ - Create tables if they don't already exist. - """ + """Create tables if they do not already exist.""" try: Base.metadata.create_all(engine) return True @@ -58,14 +58,13 @@ def initialize_database(): @staticmethod def add_bulk_labels(data): - """ - Insert multiple label records in bulk. + """Insert or update multiple label records in one transaction. - Parameters: - - data (list[dict]): A list of dictionaries, each containing 'id', 'labels' keys. + Args: + data (list[dict]): Records containing at least ``id`` and ``labels`` keys. 
Returns: - - bool: True if the operation was successful, False otherwise. + bool: ``True`` when the operation succeeds, otherwise ``False``. """ if not data: return True @@ -95,15 +94,14 @@ def add_bulk_labels(data): @staticmethod def add_label(id, labels): - """ - Insert a labels and descriptions into the database. + """Insert or update labels for a single entity. - Parameters: - - id (str): The unique identifier for the entity. - - labels (dict): A dictionary of labels (e.g. { "en": "Label in English", "fr": "Label in French", ... }). + Args: + id (str): Entity ID. + labels (dict): Mapping of language code to label text. Returns: - - bool: True if the operation was successful, False otherwise. + bool: ``True`` when the operation succeeds, otherwise ``False``. """ with Session() as session: try: @@ -121,14 +119,13 @@ def add_label(id, labels): @staticmethod def get_labels(id): - """ - Retrieve labels and descriptions for a given entity by its ID. + """Retrieve cached labels for one entity, with API fallback. - Parameters: - - id (str): The unique identifier of the entity. + Args: + id (str): Entity ID. Returns: - - dict: The labels dictionary if found, otherwise an empty dict. + dict | None: Cached or fetched labels for the entity, if available. """ try: with Session() as session: @@ -153,14 +150,13 @@ def get_labels(id): @staticmethod def get_bulk_labels(ids): - """ - Retrieve labels for multiple entities by their IDs. + """Retrieve cached labels for multiple entities, with API fallback. - Parameters: - - ids (list[str]): A list of entity IDs to retrieve. + Args: + ids (list[str]): Entity IDs to fetch. Returns: - - dict[str, dict]: A dictionary mapping each ID to its labels. + dict[str, dict]: Mapping of each requested ID to its labels. """ if not ids: return {} @@ -195,9 +191,10 @@ def get_bulk_labels(ids): @staticmethod def delete_old_labels(): - """ - Delete labels older than X days. 
- If the database exceeds 10 million rows, delete the oldest rows until it is below the threshold. + """Delete expired labels and enforce maximum cache size. + + Returns: + bool: ``True`` when cleanup succeeds or is skipped, otherwise ``False``. """ if LABEL_UNLIMITED: return True @@ -244,14 +241,13 @@ def delete_old_labels(): @staticmethod def _get_labels_wdapi(ids): - """ - Retrieve labels from the Wikidata API for a list of IDs. + """Retrieve labels from the Wikidata API. - Parameters: - - ids (list[str] or str): A list of Wikidata entity IDs or a single string of IDs separated by '|'. + Args: + ids (list[str] | str): IDs as a list or ``|``-separated string. Returns: - - dict: A dictionary mapping each ID to its labels. + dict[str, dict]: Mapping of each ID to compressed labels. """ entities_data = get_wikidata_json_by_ids(ids, props="labels") entities_data = WikidataLabel._compress_labels(entities_data) @@ -259,14 +255,13 @@ def _get_labels_wdapi(ids): @staticmethod def _compress_labels(data): - """ - Compress labels by extracting the 'value' field from each label. + """Compress API labels by extracting each language's ``value`` field. - Parameters: - - data (dict): A dictionary of labels from Wikidata API. + Args: + data (dict): Raw entities payload from the Wikidata API. Returns: - - dict: A new dictionary with labels compressed to their 'value' field. + dict[str, dict]: Mapping of entity ID to ``{lang: label}``. """ new_labels = {} for qid, labels in data.items(): @@ -281,8 +276,15 @@ def _compress_labels(data): @staticmethod def get_lang_val(data, lang='en', fallback_lang=None): - """ - Extracts the value for a given language from a dictionary of labels. + """Return the best label text from a labels dictionary. + + Args: + data (dict): Label dictionary keyed by language. + lang (str): Preferred language code. + fallback_lang (str | None): Optional fallback language code. + + Returns: + str: Selected label text, or an empty string when missing. 
""" label = data.get(lang, data.get('mul', {})) if fallback_lang and not label: @@ -294,14 +296,13 @@ def get_lang_val(data, lang='en', fallback_lang=None): @staticmethod def get_all_missing_labels_ids(data): - """ - Get the IDs of the entity dictionary where their labels are missing. + """Collect all referenced IDs that may require label lookup. - Parameters: - - data (dict or list): The data structure to search for missing labels. + Args: + data (dict | list): Nested entity structure to scan. Returns: - - set: A set of IDs that are missing labels. + set[str]: Referenced IDs that may be missing resolved labels. """ ids_list = set() @@ -327,26 +328,53 @@ def get_all_missing_labels_ids(data): return ids_list class LazyLabel: + """Deferred label string that resolves via a shared factory.""" + def __init__(self, qid, factory): + """Store the target entity ID and the lookup factory. + + Args: + qid (str): Entity ID whose label should be resolved lazily. + factory (LazyLabelFactory): Factory that performs batched label resolution. + """ self.qid = qid self.factory = factory def __str__(self): + """Resolve and return the label text for the configured entity.""" self.factory.resolve_all() return self.factory.get_label(self.qid) + class LazyLabelFactory: + """Create and batch-resolve lazy Wikidata labels.""" + def __init__(self, lang='en', fallback_lang='en'): + """Initialize a lazy label factory. + + Args: + lang (str): Preferred language code. + fallback_lang (str): Fallback language code. + """ self.lang = lang self.fallback_lang = fallback_lang self._pending_ids = set() self._resolved_labels = {} def create(self, qid: str) -> "LazyLabel": + """Create a lazy label handle and queue its ID for resolution. + + Args: + qid (str): Entity ID to resolve. + + Returns: + LazyLabel: Lazy label wrapper bound to this factory. 
+ """ self._pending_ids.add(qid) return LazyLabel(qid, factory=self) def resolve_all(self): + """Resolve all pending IDs in a single bulk lookup.""" if not self._pending_ids: return @@ -356,10 +384,23 @@ def resolve_all(self): self._pending_ids.clear() def get_label(self, qid: str) -> str: + """Return the resolved label text for an entity ID. + + Args: + qid (str): Entity ID. + + Returns: + str: Best label text according to current language settings. + """ label_dict = self._resolved_labels.get(qid, {}) label = WikidataLabel.get_lang_val(label_dict, lang=self.lang, fallback_lang=self.fallback_lang) return label def set_lang(self, lang: str): + """Update preferred language and resolve pending IDs. + + Args: + lang (str): Preferred language code. + """ self.lang = lang self.resolve_all() diff --git a/src/__init__.py b/src/__init__.py index dde2dfc..acaf1ea 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -1,4 +1,36 @@ -from .WikidataLabel import * -from .Textifier import * -from .Normalizer import * -from .utils import * \ No newline at end of file +"""Public package exports for Wikidata textification primitives.""" + +from .Normalizer import JSONNormalizer, TTLNormalizer +from .Textifier import ( + WikidataClaim, + WikidataClaimValue, + WikidataCoordinates, + WikidataEntity, + WikidataQuantity, + WikidataTime, +) +from .utils import ( + get_wikidata_json_by_ids, + get_wikidata_ttl_by_id, + wikidata_geolocation_to_text, + wikidata_time_to_text, +) +from .WikidataLabel import LazyLabel, LazyLabelFactory, WikidataLabel + +__all__ = [ + "JSONNormalizer", + "TTLNormalizer", + "WikidataClaim", + "WikidataClaimValue", + "WikidataCoordinates", + "WikidataEntity", + "WikidataLabel", + "WikidataQuantity", + "WikidataTime", + "LazyLabel", + "LazyLabelFactory", + "get_wikidata_json_by_ids", + "get_wikidata_ttl_by_id", + "wikidata_geolocation_to_text", + "wikidata_time_to_text", +] diff --git a/src/utils.py b/src/utils.py index 880a951..b53d86c 100644 --- a/src/utils.py +++ 
b/src/utils.py @@ -1,10 +1,12 @@ -import requests -from requests.adapters import HTTPAdapter +"""HTTP helpers and value-formatting utilities for Wikidata APIs.""" -import json import html +import json import os +import requests +from requests.adapters import HTTPAdapter + REQUEST_TIMEOUT_SECONDS = float(os.environ.get("REQUEST_TIMEOUT_SECONDS", "15")) SESSION = requests.Session() @@ -16,14 +18,17 @@ def get_wikidata_ttl_by_id( id, lang='en', ): - """Fetches a Wikidata entity by its ID and returns its TTL representation. + """Fetch a Wikidata entity as TTL from ``Special:EntityData``. Args: - id (str): A Wikidata entity ID (e.g., Q42, P31). - lang (str, optional): The language to use for the response. Defaults to 'en'. + id (str): Wikidata entity ID, for example ``"Q42"`` or ``"P31"``. + lang (str, optional): Language code for server-side label rendering. Returns: - str: The TTL representation of the entity. + str: TTL document for the requested entity. + + Raises: + requests.HTTPError: If Wikidata returns an error response. """ params = { 'uselang': lang, @@ -46,17 +51,18 @@ def get_wikidata_json_by_ids( ids, props='labels|descriptions|aliases|claims' ): - """ - Fetches Wikidata entities by their IDs and returns a dictionary of entities. + """Fetch one or more Wikidata entities from ``wbgetentities``. - Parameters: - - ids (list[str] or str): A list of Wikidata entity IDs (e.g., Q42, P31) or a single ID as a string. - - props (str): The properties to retrieve (default is 'labels|descriptions|aliases|claims'). + Args: + ids (list[str] | str): Entity IDs as a list or ``|``-separated string. + props (str): Pipe-delimited properties requested from the API. Returns: - - dict: A dictionary containing the entities, where keys are entity IDs and values are dictionaries of properties. - """ + dict[str, dict]: Mapping of entity IDs to API entity payloads. + Raises: + requests.HTTPError: If Wikidata returns an error response. 
+ """ if isinstance(ids, str): ids = ids.split('|') ids = list(dict.fromkeys(ids)) # Ensure unique IDs @@ -97,8 +103,18 @@ def get_wikidata_json_by_ids( ##################################### def wikidata_time_to_text(value: dict, lang: str = "en"): - """ - Convert a Wikidata time value into natural language text. + """Format a time datavalue into localized display text using a local Wikibase instance. + + Args: + value (dict): Time value payload in Wikibase datavalue format. + lang (str): Language code used by ``wbformatvalue``. + + Returns: + str: Localized human-readable representation of the time value. + + Raises: + ValueError: If the input payload is invalid or the API response is malformed. + requests.HTTPError: If the formatting API returns an error response. """ WIKIBASE_HOST = os.environ.get("WIKIBASE_HOST", "wikibase") WIKIBASE_API = f"http://{WIKIBASE_HOST}/w/api.php" @@ -138,8 +154,18 @@ def wikidata_time_to_text(value: dict, lang: str = "en"): def wikidata_geolocation_to_text(value: dict, lang: str = "en"): - """ - Convert a Wikidata geolocation value into natural language text. + """Format a globe-coordinate value into localized display text using a local Wikibase instance. + + Args: + value (dict): Coordinate payload in Wikibase datavalue format. + lang (str): Language code used by ``wbformatvalue``. + + Returns: + str: Localized human-readable representation of the coordinate value. + + Raises: + ValueError: If the formatting API response is malformed. + requests.HTTPError: If the formatting API returns an error response. 
""" WIKIBASE_HOST = os.environ.get("WIKIBASE_HOST", "wikibase") WIKIBASE_API = f"http://{WIKIBASE_HOST}/w/api.php" diff --git a/uv.lock b/uv.lock index 5694f09..d6fd462 100644 --- a/uv.lock +++ b/uv.lock @@ -241,6 +241,31 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7c/e4/56027c4a6b4ae70ca9de302488c5ca95ad4a39e190093d6c1a8ace08341b/requests-2.32.4-py3-none-any.whl", hash = "sha256:27babd3cda2a6d50b30443204ee89830707d396671944c998b5975b031ac2b2c", size = 64847, upload-time = "2025-06-09T16:43:05.728Z" }, ] +[[package]] +name = "ruff" +version = "0.15.8" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/14/b0/73cf7550861e2b4824950b8b52eebdcc5adc792a00c514406556c5b80817/ruff-0.15.8.tar.gz", hash = "sha256:995f11f63597ee362130d1d5a327a87cb6f3f5eae3094c620bcc632329a4d26e", size = 4610921, upload-time = "2026-03-26T18:39:38.675Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4a/92/c445b0cd6da6e7ae51e954939cb69f97e008dbe750cfca89b8cedc081be7/ruff-0.15.8-py3-none-linux_armv6l.whl", hash = "sha256:cbe05adeba76d58162762d6b239c9056f1a15a55bd4b346cfd21e26cd6ad7bc7", size = 10527394, upload-time = "2026-03-26T18:39:41.566Z" }, + { url = "https://files.pythonhosted.org/packages/eb/92/f1c662784d149ad1414cae450b082cf736430c12ca78367f20f5ed569d65/ruff-0.15.8-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:d3e3d0b6ba8dca1b7ef9ab80a28e840a20070c4b62e56d675c24f366ef330570", size = 10905693, upload-time = "2026-03-26T18:39:30.364Z" }, + { url = "https://files.pythonhosted.org/packages/ca/f2/7a631a8af6d88bcef997eb1bf87cc3da158294c57044aafd3e17030613de/ruff-0.15.8-py3-none-macosx_11_0_arm64.whl", hash = "sha256:6ee3ae5c65a42f273f126686353f2e08ff29927b7b7e203b711514370d500de3", size = 10323044, upload-time = "2026-03-26T18:39:33.37Z" }, + { url = 
"https://files.pythonhosted.org/packages/67/18/1bf38e20914a05e72ef3b9569b1d5c70a7ef26cd188d69e9ca8ef588d5bf/ruff-0.15.8-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fdce027ada77baa448077ccc6ebb2fa9c3c62fd110d8659d601cf2f475858d94", size = 10629135, upload-time = "2026-03-26T18:39:44.142Z" }, + { url = "https://files.pythonhosted.org/packages/d2/e9/138c150ff9af60556121623d41aba18b7b57d95ac032e177b6a53789d279/ruff-0.15.8-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:12e617fc01a95e5821648a6df341d80456bd627bfab8a829f7cfc26a14a4b4a3", size = 10348041, upload-time = "2026-03-26T18:39:52.178Z" }, + { url = "https://files.pythonhosted.org/packages/02/f1/5bfb9298d9c323f842c5ddeb85f1f10ef51516ac7a34ba446c9347d898df/ruff-0.15.8-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:432701303b26416d22ba696c39f2c6f12499b89093b61360abc34bcc9bf07762", size = 11121987, upload-time = "2026-03-26T18:39:55.195Z" }, + { url = "https://files.pythonhosted.org/packages/10/11/6da2e538704e753c04e8d86b1fc55712fdbdcc266af1a1ece7a51fff0d10/ruff-0.15.8-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d910ae974b7a06a33a057cb87d2a10792a3b2b3b35e33d2699fdf63ec8f6b17a", size = 11951057, upload-time = "2026-03-26T18:39:19.18Z" }, + { url = "https://files.pythonhosted.org/packages/83/f0/c9208c5fd5101bf87002fed774ff25a96eea313d305f1e5d5744698dc314/ruff-0.15.8-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2033f963c43949d51e6fdccd3946633c6b37c484f5f98c3035f49c27395a8ab8", size = 11464613, upload-time = "2026-03-26T18:40:06.301Z" }, + { url = "https://files.pythonhosted.org/packages/f8/22/d7f2fabdba4fae9f3b570e5605d5eb4500dcb7b770d3217dca4428484b17/ruff-0.15.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f29b989a55572fb885b77464cf24af05500806ab4edf9a0fd8977f9759d85b1", size = 11257557, upload-time = "2026-03-26T18:39:57.972Z" }, + { url = 
"https://files.pythonhosted.org/packages/71/8c/382a9620038cf6906446b23ce8632ab8c0811b8f9d3e764f58bedd0c9a6f/ruff-0.15.8-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:ac51d486bf457cdc985a412fb1801b2dfd1bd8838372fc55de64b1510eff4bec", size = 11169440, upload-time = "2026-03-26T18:39:22.205Z" }, + { url = "https://files.pythonhosted.org/packages/4d/0d/0994c802a7eaaf99380085e4e40c845f8e32a562e20a38ec06174b52ef24/ruff-0.15.8-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:c9861eb959edab053c10ad62c278835ee69ca527b6dcd72b47d5c1e5648964f6", size = 10605963, upload-time = "2026-03-26T18:39:46.682Z" }, + { url = "https://files.pythonhosted.org/packages/19/aa/d624b86f5b0aad7cef6bbf9cd47a6a02dfdc4f72c92a337d724e39c9d14b/ruff-0.15.8-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:8d9a5b8ea13f26ae90838afc33f91b547e61b794865374f114f349e9036835fb", size = 10357484, upload-time = "2026-03-26T18:39:49.176Z" }, + { url = "https://files.pythonhosted.org/packages/35/c3/e0b7835d23001f7d999f3895c6b569927c4d39912286897f625736e1fd04/ruff-0.15.8-py3-none-musllinux_1_2_i686.whl", hash = "sha256:c2a33a529fb3cbc23a7124b5c6ff121e4d6228029cba374777bd7649cc8598b8", size = 10830426, upload-time = "2026-03-26T18:40:03.702Z" }, + { url = "https://files.pythonhosted.org/packages/f0/51/ab20b322f637b369383adc341d761eaaa0f0203d6b9a7421cd6e783d81b9/ruff-0.15.8-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:75e5cd06b1cf3f47a3996cfc999226b19aa92e7cce682dcd62f80d7035f98f49", size = 11345125, upload-time = "2026-03-26T18:39:27.799Z" }, + { url = "https://files.pythonhosted.org/packages/37/e6/90b2b33419f59d0f2c4c8a48a4b74b460709a557e8e0064cf33ad894f983/ruff-0.15.8-py3-none-win32.whl", hash = "sha256:bc1f0a51254ba21767bfa9a8b5013ca8149dcf38092e6a9eb704d876de94dc34", size = 10571959, upload-time = "2026-03-26T18:39:36.117Z" }, + { url = "https://files.pythonhosted.org/packages/1f/a2/ef467cb77099062317154c63f234b8a7baf7cb690b99af760c5b68b9ee7f/ruff-0.15.8-py3-none-win_amd64.whl", hash = 
"sha256:04f79eff02a72db209d47d665ba7ebcad609d8918a134f86cb13dd132159fc89", size = 11743893, upload-time = "2026-03-26T18:39:25.01Z" }, + { url = "https://files.pythonhosted.org/packages/15/e2/77be4fff062fa78d9b2a4dea85d14785dac5f1d0c1fb58ed52331f0ebe28/ruff-0.15.8-py3-none-win_arm64.whl", hash = "sha256:cf891fa8e3bb430c0e7fac93851a5978fc99c8fa2c053b57b118972866f8e5f2", size = 11048175, upload-time = "2026-03-26T18:40:01.06Z" }, +] + [[package]] name = "sniffio" version = "1.3.1" @@ -340,6 +365,11 @@ dependencies = [ { name = "uvicorn" }, ] +[package.dev-dependencies] +dev = [ + { name = "ruff" }, +] + [package.metadata] requires-dist = [ { name = "fastapi", specifier = ">=0.116.1" }, @@ -350,3 +380,6 @@ requires-dist = [ { name = "sqlalchemy", specifier = ">=2.0.41" }, { name = "uvicorn", specifier = ">=0.35.0" }, ] + +[package.metadata.requires-dev] +dev = [{ name = "ruff", specifier = ">=0.9.0" }] From 8e4da6c94ee1714918550ccfe6b631b2362e9c9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Saad=C3=A9?= Date: Fri, 3 Apr 2026 11:36:48 +0200 Subject: [PATCH 2/9] Update README with API Docs link Added API documentation link to the README. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 274b422..7cd85ed 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ **Wikidata Textifier** is an API that transforms Wikidata entities into compact outputs for LLM and GenAI use cases. It resolves missing labels for properties and claim values using the Wikidata Action API and caches labels to reduce repeated lookups. 
-Live API: [wd-textify.wmcloud.org](https://wd-textify.wmcloud.org/) +Live API: [wd-textify.wmcloud.org](https://wd-textify.wmcloud.org/) \ API Docs: [wd-textify.wmcloud.org/docs](https://wd-textify.wmcloud.org/docs) ## Features From bf7d11b86ee2283512cbf56eeeaa8a3f221cb61c Mon Sep 17 00:00:00 2001 From: Philippe Saade Date: Fri, 3 Apr 2026 11:38:31 +0200 Subject: [PATCH 3/9] Exclude mysql from Ruff check --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 098c4c8..7524048 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,6 +22,7 @@ dev = [ [tool.ruff] target-version = "py313" line-length = 120 +exclude = ["data/mysql"] [tool.ruff.lint] select = [ From 9f1f412f9714cda380fd0c9c7a83dae6481ac93b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Saad=C3=A9?= Date: Fri, 3 Apr 2026 12:26:52 +0200 Subject: [PATCH 4/9] Add Ruff linting workflow for pull requests --- .github/workflows/lint.yml | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 .github/workflows/lint.yml diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000..0538a3f --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,26 @@ +name: "Ruff Lint" + +on: + pull_request: + branches: ["main"] + +permissions: + contents: read + +jobs: + ruff: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v5 + + - name: Set up Python + run: uv python install + + - name: Run Ruff linter + run: uv run ruff check . + + - name: Run Ruff formatter check + run: uv run ruff format --check . 
From e3eecba3054b1590eca43ade75ff096f32a81762 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Saad=C3=A9?= Date: Fri, 3 Apr 2026 12:27:08 +0200 Subject: [PATCH 5/9] Delete .github/workflows/pylint.yml --- .github/workflows/pylint.yml | 23 ----------------------- 1 file changed, 23 deletions(-) delete mode 100644 .github/workflows/pylint.yml diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml deleted file mode 100644 index c73e032..0000000 --- a/.github/workflows/pylint.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: Pylint - -on: [push] - -jobs: - build: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.8", "3.9", "3.10"] - steps: - - uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v3 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install pylint - - name: Analysing the code with pylint - run: | - pylint $(git ls-files '*.py') From 00c2cb80ccd955a9316d406a818af03cc8cf459b Mon Sep 17 00:00:00 2001 From: Philippe Saade Date: Fri, 3 Apr 2026 12:35:39 +0200 Subject: [PATCH 6/9] format route's docstring as a markdown for Swagger --- main.py | 47 +++++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/main.py b/main.py index be09041..9cbee1b 100644 --- a/main.py +++ b/main.py @@ -74,26 +74,33 @@ async def get_textified_wd( qualifiers: bool = True, fallback_lang: str = 'en' ): - """Return normalized Wikidata entities in JSON, text, or triplet format. - - Args: - request (Request): Incoming request object (currently unused). - background_tasks (BackgroundTasks): Background task queue for periodic cache cleanup. - id (str): Comma-separated entity IDs (for example, ``"Q42,Q2"``). - pid (str): Optional comma-separated property IDs used to filter claims. - lang (str): Preferred language code for labels and formatted values. 
- format (str): Output format: ``"json"``, ``"text"``, or ``"triplet"``. - external_ids (bool): Whether to include claims with the ``external-id`` datatype. - references (bool): Whether to include references in claim values. - all_ranks (bool): Whether to include all statement ranks (preferred, normal, deprecated). - qualifiers (bool): Whether to include qualifiers in claim values. - fallback_lang (str): Fallback language when ``lang`` is unavailable. - - Returns: - dict[str, object | None]: Mapping of requested QIDs to their normalized payloads. - - Raises: - HTTPException: If an entity is not found, an upstream request fails, or internal processing fails. + """Retrieve Wikidata entities as structured JSON, natural text, or triplet lines. + + This endpoint fetches one or more entities, resolves missing labels, and normalizes + claims into a compact representation suitable for downstream LLM use. + + **Args:** + + - **id** (str): Comma-separated Wikidata IDs to fetch (for example: `"Q42"` or `"Q42,Q2"`). + - **pid** (str, optional): Comma-separated property IDs used to filter returned claims (for example: `"P31,P279"`). + - **lang** (str): Preferred language code for labels and formatted values. + - **format** (str): Output format. One of `"json"`, `"text"`, or `"triplet"`. + - **external_ids** (bool): If `true`, include claims with datatype `external-id`. + - **references** (bool): If `true`, include references in claim values (JSON output only). + - **all_ranks** (bool): If `true`, include preferred, normal, and deprecated statement ranks. + - **qualifiers** (bool): If `true`, include qualifiers for claim values. + - **fallback_lang** (str): Fallback language used when `lang` is unavailable. + - **request** (Request): FastAPI request context object. + - **background_tasks** (BackgroundTasks): Background task manager used for cache cleanup. + + **Returns:** + + A dictionary keyed by requested entity ID (for example, `"Q42"`). 
+ Each value depends on `format`: + + - **json**: Structured entity payload with label, description, aliases, and claims. + - **text**: Human-readable summary text. + - **triplet**: Triplet-style text lines with labels and IDs. """ try: filter_pids = [] From ca993f2858641ba8b89bfcbc98821586c639d5dc Mon Sep 17 00:00:00 2001 From: Philippe Saade Date: Fri, 3 Apr 2026 11:35:14 +0200 Subject: [PATCH 7/9] Adding Ruff Lint: adding missing docstrings, improving consistency, updating README --- .github/workflows/pylint.yml | 23 +++++++++++++++++++++++ pyproject.toml | 1 + src/WikidataLabel.py | 4 ++-- 3 files changed, 26 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/pylint.yml diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml new file mode 100644 index 0000000..c73e032 --- /dev/null +++ b/.github/workflows/pylint.yml @@ -0,0 +1,23 @@ +name: Pylint + +on: [push] + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.8", "3.9", "3.10"] + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pylint + - name: Analysing the code with pylint + run: | + pylint $(git ls-files '*.py') diff --git a/pyproject.toml b/pyproject.toml index 7524048..6eedfea 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,6 +22,7 @@ dev = [ [tool.ruff] target-version = "py313" line-length = 120 + exclude = ["data/mysql"] [tool.ruff.lint] diff --git a/src/WikidataLabel.py b/src/WikidataLabel.py index 04d04e8..ad41707 100644 --- a/src/WikidataLabel.py +++ b/src/WikidataLabel.py @@ -39,7 +39,7 @@ class WikidataLabel(Base): - """Database-backed cache for multilingual Wikidata labels.""" + """Database cache for multilingual Wikidata labels.""" __tablename__ = 'labels' id = Column(String(64), primary_key=True) @@ 
-58,7 +58,7 @@ def initialize_database(): @staticmethod def add_bulk_labels(data): - """Insert or update multiple label records in one transaction. + """Insert or update multiple label records. Args: data (list[dict]): Records containing at least ``id`` and ``labels`` keys. From f6baa83634891a333417142f77f9029d6b8a1390 Mon Sep 17 00:00:00 2001 From: Philippe Saade Date: Fri, 3 Apr 2026 13:53:35 +0200 Subject: [PATCH 8/9] Remove obsolete pylint workflow --- .github/workflows/pylint.yml | 23 ----------------------- 1 file changed, 23 deletions(-) delete mode 100644 .github/workflows/pylint.yml diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml deleted file mode 100644 index c73e032..0000000 --- a/.github/workflows/pylint.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: Pylint - -on: [push] - -jobs: - build: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.8", "3.9", "3.10"] - steps: - - uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v3 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install pylint - - name: Analysing the code with pylint - run: | - pylint $(git ls-files '*.py') From 45c3f7bae7fe64682b4714f4dfcc2e13fcc393da Mon Sep 17 00:00:00 2001 From: Philippe Saade Date: Fri, 3 Apr 2026 14:02:14 +0200 Subject: [PATCH 9/9] Reformatting with Ruff --- main.py | 45 +++++++------ src/Normalizer/JSONNormalizer.py | 5 +- src/Normalizer/TTLNormalizer.py | 8 +-- src/Textifier/WikidataTextifier.py | 23 +++---- src/WikidataLabel.py | 100 +++++++++++++---------------- src/utils.py | 70 ++++++++++---------- 6 files changed, 125 insertions(+), 126 deletions(-) diff --git a/main.py b/main.py index 9cbee1b..23cffdc 100644 --- a/main.py +++ b/main.py @@ -34,11 +34,13 @@ LABEL_CLEANUP_INTERVAL_SECONDS = int(os.environ.get("LABEL_CLEANUP_INTERVAL_SECONDS", 3600)) _last_label_cleanup = 0.0 + 
@app.on_event("startup") async def startup(): """Initialize database resources required by the API.""" WikidataLabel.initialize_database() + @app.get( "/", responses={ @@ -46,33 +48,32 @@ async def startup(): "description": "Returns a list of relevant Wikidata property PIDs with similarity scores", "content": { "application/json": { - "example": [{ - "Q42": "Douglas Adams (human), English writer, humorist, and dramatist...", - }] + "example": [ + { + "Q42": "Douglas Adams (human), English writer, humorist, and dramatist...", + } + ] } }, }, 422: { "description": "Missing or invalid query parameter", - "content": { - "application/json": { - "example": {"detail": "Invalid format specified"} - } - }, + "content": {"application/json": {"example": {"detail": "Invalid format specified"}}}, }, }, ) async def get_textified_wd( - request: Request, background_tasks: BackgroundTasks, + request: Request, + background_tasks: BackgroundTasks, id: str = Query(..., examples="Q42,Q2"), pid: str = Query(None, examples="P31,P279"), - lang: str = 'en', - format: str = 'json', + lang: str = "en", + format: str = "json", external_ids: bool = True, references: bool = False, all_ranks: bool = False, qualifiers: bool = True, - fallback_lang: str = 'en' + fallback_lang: str = "en", ): """Retrieve Wikidata entities as structured JSON, natural text, or triplet lines. @@ -95,7 +96,7 @@ async def get_textified_wd( **Returns:** - A dictionary keyed by requested entity ID (for example, `"Q42"`). + A dictionary keyed by requested entity ID (for example, `"Q42"`). Each value depends on `format`: - **json**: Structured entity payload with label, description, aliases, and claims. 
@@ -105,9 +106,9 @@ async def get_textified_wd( try: filter_pids = [] if pid: - filter_pids = [p.strip() for p in pid.split(',')] + filter_pids = [p.strip() for p in pid.split(",")] - qids = [q.strip() for q in id.split(',')] + qids = [q.strip() for q in id.split(",")] label_factory = LazyLabelFactory(lang=lang, fallback_lang=fallback_lang) entities = {} @@ -158,7 +159,9 @@ async def get_textified_wd( fallback_lang=fallback_lang, label_factory=label_factory, debug=False, - ) if entity_data.get(qid) else None + ) + if entity_data.get(qid) + else None for qid in qids } @@ -168,8 +171,10 @@ async def get_textified_wd( all_ranks=all_ranks, references=references, filter_pids=filter_pids, - qualifiers=qualifiers - ) if entity else None + qualifiers=qualifiers, + ) + if entity + else None for qid, entity in entity_data.items() } @@ -179,9 +184,9 @@ async def get_textified_wd( return_data[qid] = None continue - if format == 'text': + if format == "text": results = entity.to_text(lang) - elif format == 'triplet': + elif format == "triplet": results = entity.to_triplet() else: results = entity.to_json() diff --git a/src/Normalizer/JSONNormalizer.py b/src/Normalizer/JSONNormalizer.py index 15a5cdd..bb11b79 100644 --- a/src/Normalizer/JSONNormalizer.py +++ b/src/Normalizer/JSONNormalizer.py @@ -48,7 +48,6 @@ def __init__( self.fallback_lang = fallback_lang self.debug = debug - self.label_factory = label_factory or LazyLabelFactory(lang=lang, fallback_lang=fallback_lang) # ------------------------------------------------------------------------- @@ -119,7 +118,7 @@ def normalize( external_ids=external_ids, include_references=references, all_ranks=all_ranks, - qualifiers=qualifiers + qualifiers=qualifiers, ) if claim_obj is not None and claim_obj.values: claims_out.append(claim_obj) @@ -171,7 +170,7 @@ def _build_claim( statement=st, datatype=datatype, include_references=include_references, - qualifiers=qualifiers + qualifiers=qualifiers, ) if cv is not None: values.append(cv) 
diff --git a/src/Normalizer/TTLNormalizer.py b/src/Normalizer/TTLNormalizer.py index 2d8af38..b6b4f6f 100644 --- a/src/Normalizer/TTLNormalizer.py +++ b/src/Normalizer/TTLNormalizer.py @@ -94,7 +94,7 @@ def normalize( references: bool = False, all_ranks: bool = False, qualifiers: bool = True, - filter_pids: List[str] = [] + filter_pids: List[str] = [], ) -> WikidataEntity: """Normalize the parsed graph into a ``WikidataEntity`` tree. @@ -134,7 +134,7 @@ def normalize( include_references=references, all_ranks=all_ranks, qualifiers=qualifiers, - filter_pids=filter_pids + filter_pids=filter_pids, ) entity = WikidataEntity( @@ -191,7 +191,7 @@ def _claims_for_subject( include_references: bool, all_ranks: bool, qualifiers: bool, - filter_pids: List[str] = [] + filter_pids: List[str] = [], ) -> Dict[str, List[Dict[str, Any]]]: """Return mapping: pid -> list of statement dicts.""" out: DefaultDict[str, List[Dict[str, Any]]] = defaultdict(list) @@ -335,7 +335,7 @@ def _build_claim_object( refs_obj: List[List[WikidataClaim]] = [] if include_references: - for ref in (st.get("references") or []): + for ref in st.get("references") or []: ref_claims = [ self._build_snak_claim( pid=rpid, diff --git a/src/Textifier/WikidataTextifier.py b/src/Textifier/WikidataTextifier.py index a34ed1b..7687fca 100644 --- a/src/Textifier/WikidataTextifier.py +++ b/src/Textifier/WikidataTextifier.py @@ -15,6 +15,7 @@ # Atomic value types # --------------------------------------------------------------------------- + @dataclass(slots=True) class WikidataText: """Object for Wikidata plain text values.""" @@ -124,6 +125,7 @@ def to_json(self) -> Any: # Core graph types # --------------------------------------------------------------------------- + @dataclass(slots=True) class WikidataEntity: """Object for Wikidata entities.""" @@ -138,11 +140,11 @@ def __bool__(self) -> bool: """Return whether this entity has a usable id and label.""" return bool(self.id) and self.label is not None and 
str(self.label) != "" - def to_text(self, lang='en', keep_empty: bool = False) -> str: + def to_text(self, lang="en", keep_empty: bool = False) -> str: """Render the entity into a readable text.""" - lang_var = LANGUAGE_VARIABLES.get(lang, LANGUAGE_VARIABLES.get('en')) + lang_var = LANGUAGE_VARIABLES.get(lang, LANGUAGE_VARIABLES.get("en")) - label_str = str(self.label) if self.label else '' + label_str = str(self.label) if self.label else "" string = label_str if self.description: @@ -206,12 +208,12 @@ def __bool__(self) -> bool: and any(bool(v) for v in self.values) ) - def to_text(self, lang='en') -> str: + def to_text(self, lang="en") -> str: """Render the claim into a readable text.""" - lang_var = LANGUAGE_VARIABLES.get(lang, LANGUAGE_VARIABLES.get('en')) + lang_var = LANGUAGE_VARIABLES.get(lang, LANGUAGE_VARIABLES.get("en")) if self.values: - values = lang_var[', '].join(v.to_text(lang) for v in self.values if v) + values = lang_var[", "].join(v.to_text(lang) for v in self.values if v) return f"{str(self.property.label)}: {values}" return f"{lang_var['has']} {str(self.property.label)}" @@ -249,10 +251,9 @@ def to_triplet(self, as_qualifier: bool = False) -> str: @dataclass(slots=True) class WikidataClaimValue: """Object for Wikidata claim values.""" + claim: WikidataClaim - value: Optional[ - Union[WikidataEntity, WikidataQuantity, WikidataTime, WikidataCoordinates, WikidataText] - ] = None + value: Optional[Union[WikidataEntity, WikidataQuantity, WikidataTime, WikidataCoordinates, WikidataText]] = None qualifiers: List[WikidataClaim] = field(default_factory=list) references: List[List[WikidataClaim]] = field(default_factory=list) rank: Optional[str] = None # preferred|normal|deprecated @@ -261,9 +262,9 @@ def __bool__(self) -> bool: """Return whether this claim value has non-empty values.""" return self.value is not None and str(self.value) != "" - def to_text(self, lang='en') -> str: + def to_text(self, lang="en") -> str: """Render the value and qualifiers 
as readable text.""" - lang_var = LANGUAGE_VARIABLES.get(lang, LANGUAGE_VARIABLES.get('en')) + lang_var = LANGUAGE_VARIABLES.get(lang, LANGUAGE_VARIABLES.get("en")) if not self: return "" diff --git a/src/WikidataLabel.py b/src/WikidataLabel.py index ad41707..c4d9f18 100644 --- a/src/WikidataLabel.py +++ b/src/WikidataLabel.py @@ -21,10 +21,7 @@ LABEL_MAX_ROWS = int(os.environ.get("LABEL_MAX_ROWS", "10000000")) REQUEST_TIMEOUT_SECONDS = float(os.environ.get("REQUEST_TIMEOUT_SECONDS", "15")) -DATABASE_URL = ( - f"mariadb+pymysql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}" - f"?charset=utf8mb4" -) +DATABASE_URL = f"mariadb+pymysql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}?charset=utf8mb4" engine = create_engine( DATABASE_URL, @@ -41,7 +38,7 @@ class WikidataLabel(Base): """Database cache for multilingual Wikidata labels.""" - __tablename__ = 'labels' + __tablename__ = "labels" id = Column(String(64), primary_key=True) labels = Column(JSON, default=dict) date_added = Column(DateTime, default=datetime.now, index=True) @@ -70,20 +67,22 @@ def add_bulk_labels(data): return True for i in range(len(data)): - data[i]['date_added'] = datetime.now() + data[i]["date_added"] = datetime.now() if isinstance(data[i].get("labels"), dict): data[i]["labels"] = json.dumps(data[i]["labels"], ensure_ascii=False, separators=(",", ":")) - with Session() as session: try: - session.execute(text(''' + session.execute( + text(""" INSERT INTO labels (id, labels, date_added) VALUES (:id, :labels, :date_added) ON DUPLICATE KEY UPDATE labels = VALUES(labels), date_added = VALUES(date_added) - '''), data) + """), + data, + ) session.commit() return True @@ -105,10 +104,7 @@ def add_label(id, labels): """ with Session() as session: try: - new_entry = WikidataLabel( - id=id, - labels=labels - ) + new_entry = WikidataLabel(id=id, labels=labels) session.add(new_entry) session.commit() return True @@ -130,12 +126,12 @@ def get_labels(id): try: with Session() as session: # Get labels 
that are less than LABEL_TTL_DAYS old - date_limit = (datetime.now() - timedelta(days=LABEL_TTL_DAYS)) - item = session.query(WikidataLabel)\ - .filter( - WikidataLabel.id == id, - WikidataLabel.date_added >= date_limit - ).first() + date_limit = datetime.now() - timedelta(days=LABEL_TTL_DAYS) + item = ( + session.query(WikidataLabel) + .filter(WikidataLabel.id == id, WikidataLabel.date_added >= date_limit) + .first() + ) if item is not None: return item.labels or {} @@ -165,12 +161,12 @@ def get_bulk_labels(ids): try: with Session() as session: # Get labels that are less than LABEL_TTL_DAYS old - date_limit = (datetime.now() - timedelta(days=LABEL_TTL_DAYS)) - rows = session.query(WikidataLabel.id, WikidataLabel.labels)\ - .filter( - WikidataLabel.id.in_(ids), - WikidataLabel.date_added >= date_limit - ).all() + date_limit = datetime.now() - timedelta(days=LABEL_TTL_DAYS) + rows = ( + session.query(WikidataLabel.id, WikidataLabel.labels) + .filter(WikidataLabel.id.in_(ids), WikidataLabel.date_added >= date_limit) + .all() + ) labels = {id: labels for id, labels in rows} except Exception as e: print(f"Error while fetching cached labels in bulk: {e}") @@ -182,10 +178,9 @@ def get_bulk_labels(ids): labels.update(missing_labels) # Cache labels - WikidataLabel.add_bulk_labels([ - {'id': entity_id, 'labels': entity_labels} - for entity_id, entity_labels in missing_labels.items() - ]) + WikidataLabel.add_bulk_labels( + [{"id": entity_id, "labels": entity_labels} for entity_id, entity_labels in missing_labels.items()] + ) return labels @@ -202,11 +197,8 @@ def delete_old_labels(): with Session() as session: try: # Step 1: Delete labels older than X days - date_limit = (datetime.now() - timedelta(days=LABEL_TTL_DAYS)) - session.execute( - text("DELETE FROM labels WHERE date_added < :date_limit"), - {"date_limit": date_limit} - ) + date_limit = datetime.now() - timedelta(days=LABEL_TTL_DAYS) + session.execute(text("DELETE FROM labels WHERE date_added < :date_limit"), 
{"date_limit": date_limit}) session.commit() # Step 2: Check total count @@ -228,7 +220,7 @@ def delete_old_labels(): LIMIT :rows_to_delete ) AS old_labels ON l.id = old_labels.id """), - {"rows_to_delete": rows_to_delete} + {"rows_to_delete": rows_to_delete}, ) session.commit() @@ -265,17 +257,14 @@ def _compress_labels(data): """ new_labels = {} for qid, labels in data.items(): - if 'labels' in labels: - new_labels[qid] = { - lang: label.get('value') \ - for lang, label in labels['labels'].items() - } + if "labels" in labels: + new_labels[qid] = {lang: label.get("value") for lang, label in labels["labels"].items()} else: new_labels[qid] = {} return new_labels @staticmethod - def get_lang_val(data, lang='en', fallback_lang=None): + def get_lang_val(data, lang="en", fallback_lang=None): """Return the best label text from a labels dictionary. Args: @@ -286,13 +275,13 @@ def get_lang_val(data, lang='en', fallback_lang=None): Returns: str: Selected label text, or an empty string when missing. 
""" - label = data.get(lang, data.get('mul', {})) + label = data.get(lang, data.get("mul", {})) if fallback_lang and not label: label = data.get(fallback_lang, {}) if isinstance(label, str): return label - return label.get('value', '') + return label.get("value", "") @staticmethod def get_all_missing_labels_ids(data): @@ -307,16 +296,18 @@ def get_all_missing_labels_ids(data): ids_list = set() if isinstance(data, dict): - if 'property' in data: - ids_list.add(data['property']) - if ('unit' in data) and (data['unit'] != '1'): - ids_list.add(data['unit'].split('/')[-1]) - if ('datatype' in data) and \ - ('datavalue' in data) and \ - (data['datatype'] in ['wikibase-item', 'wikibase-property']): - ids_list.add(data['datavalue']['value']['id']) - if ('claims' in data) and isinstance(data['claims'], dict): - ids_list = ids_list | data['claims'].keys() + if "property" in data: + ids_list.add(data["property"]) + if ("unit" in data) and (data["unit"] != "1"): + ids_list.add(data["unit"].split("/")[-1]) + if ( + ("datatype" in data) + and ("datavalue" in data) + and (data["datatype"] in ["wikibase-item", "wikibase-property"]) + ): + ids_list.add(data["datavalue"]["value"]["id"]) + if ("claims" in data) and isinstance(data["claims"], dict): + ids_list = ids_list | data["claims"].keys() for _, value in data.items(): ids_list = ids_list | WikidataLabel.get_all_missing_labels_ids(value) @@ -327,6 +318,7 @@ def get_all_missing_labels_ids(data): return ids_list + class LazyLabel: """Deferred label string that resolves via a shared factory.""" @@ -349,7 +341,7 @@ def __str__(self): class LazyLabelFactory: """Create and batch-resolve lazy Wikidata labels.""" - def __init__(self, lang='en', fallback_lang='en'): + def __init__(self, lang="en", fallback_lang="en"): """Initialize a lazy label factory. 
Args: diff --git a/src/utils.py b/src/utils.py index b53d86c..5e125e5 100644 --- a/src/utils.py +++ b/src/utils.py @@ -14,10 +14,11 @@ SESSION.mount("http://", adapter) SESSION.mount("https://", adapter) + def get_wikidata_ttl_by_id( - id, - lang='en', - ): + id, + lang="en", +): """Fetch a Wikidata entity as TTL from ``Special:EntityData``. Args: @@ -31,11 +32,9 @@ def get_wikidata_ttl_by_id( requests.HTTPError: If Wikidata returns an error response. """ params = { - 'uselang': lang, - } - headers = { - 'User-Agent': 'Wikidata Textifier (embeddings@wikimedia.de)' + "uselang": lang, } + headers = {"User-Agent": "Wikidata Textifier (embeddings@wikimedia.de)"} response = SESSION.get( f"https://www.wikidata.org/wiki/Special:EntityData/{id}.ttl", @@ -47,10 +46,7 @@ def get_wikidata_ttl_by_id( return response.text -def get_wikidata_json_by_ids( - ids, - props='labels|descriptions|aliases|claims' - ): +def get_wikidata_json_by_ids(ids, props="labels|descriptions|aliases|claims"): """Fetch one or more Wikidata entities from ``wbgetentities``. Args: @@ -64,7 +60,7 @@ def get_wikidata_json_by_ids( requests.HTTPError: If Wikidata returns an error response. """ if isinstance(ids, str): - ids = ids.split('|') + ids = ids.split("|") ids = list(dict.fromkeys(ids)) # Ensure unique IDs entities_data = {} @@ -72,18 +68,15 @@ def get_wikidata_json_by_ids( # Wikidata API has a limit on the number of IDs per request, # typically 50 for wbgetentities. 
for chunk_idx in range(0, len(ids), 50): - - ids_chunk = ids[chunk_idx:chunk_idx+50] + ids_chunk = ids[chunk_idx : chunk_idx + 50] params = { - 'action': 'wbgetentities', - 'ids': "|".join(ids_chunk), - 'props': props, - 'format': 'json', - 'origin': '*', - } - headers = { - 'User-Agent': 'Wikidata Textifier (embeddings@wikimedia.de)' + "action": "wbgetentities", + "ids": "|".join(ids_chunk), + "props": props, + "format": "json", + "origin": "*", } + headers = {"User-Agent": "Wikidata Textifier (embeddings@wikimedia.de)"} response = SESSION.get( "https://www.wikidata.org/w/api.php?", @@ -102,6 +95,7 @@ def get_wikidata_json_by_ids( # Formatting ##################################### + def wikidata_time_to_text(value: dict, lang: str = "en"): """Format a time datavalue into localized display text using a local Wikibase instance. @@ -139,12 +133,16 @@ def wikidata_time_to_text(value: dict, lang: str = "en"): }, } - r = SESSION.post(WIKIBASE_API, data={ - "action": "wbformatvalue", - "format": "json", - "uselang": lang, - "datavalue": json.dumps(datavalue), - }, timeout=REQUEST_TIMEOUT_SECONDS) + r = SESSION.post( + WIKIBASE_API, + data={ + "action": "wbformatvalue", + "format": "json", + "uselang": lang, + "datavalue": json.dumps(datavalue), + }, + timeout=REQUEST_TIMEOUT_SECONDS, + ) r.raise_for_status() data = r.json() @@ -181,12 +179,16 @@ def wikidata_geolocation_to_text(value: dict, lang: str = "en"): }, } - r = SESSION.post(WIKIBASE_API, data={ - "action": "wbformatvalue", - "format": "json", - "uselang": lang, - "datavalue": json.dumps(datavalue), - }, timeout=REQUEST_TIMEOUT_SECONDS) + r = SESSION.post( + WIKIBASE_API, + data={ + "action": "wbformatvalue", + "format": "json", + "uselang": lang, + "datavalue": json.dumps(datavalue), + }, + timeout=REQUEST_TIMEOUT_SECONDS, + ) r.raise_for_status() data = r.json()