diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000..0538a3f --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,26 @@ +name: "Ruff Lint" + +on: + pull_request: + branches: ["main"] + +permissions: + contents: read + +jobs: + ruff: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v5 + + - name: Set up Python + run: uv python install + + - name: Run Ruff linter + run: uv run ruff check . + + - name: Run Ruff formatter check + run: uv run ruff format --check . diff --git a/README.md b/README.md index 8ab9065..7cd85ed 100644 --- a/README.md +++ b/README.md @@ -1,81 +1,47 @@ # Wikidata Textifier -**Wikidata Textifier** is an API that transforms Wikidata items into compact format for use in LLMs and GenAI applications. It resolves missing labels of properties and claim values by querying the Wikidata Action API, making it efficient and suitable for AI pipelines. +**Wikidata Textifier** is an API that transforms Wikidata entities into compact outputs for LLM and GenAI use cases. +It resolves missing labels for properties and claim values using the Wikidata Action API and caches labels to reduce repeated lookups. -🔗 Live API: [https://wd-textify.toolforge.org/](https://wd-textify.toolforge.org/) +Live API: [wd-textify.wmcloud.org](https://wd-textify.wmcloud.org/) \ +API Docs: [wd-textify.wmcloud.org/docs](https://wd-textify.wmcloud.org/docs) ---- +## Features -## Functionalities +- Textify Wikidata entities as `json`, `text`, or `triplet`. +- Resolve labels for linked entities and properties. +- Cache labels in MariaDB for faster repeated requests. +- Support multilingual output with fallback language support. +- Avoid SPARQL and use Wikidata Action API / EntityData endpoints. -- **Textifies** any Wikidata item into a readable or JSON format suitable for LLMs. -- **Resolves all labels**, including those missing when querying the Wikidata API. 
-- **Caches labels** for 90 days to boost performance and reduce API load. -- **Avoids SPARQL** and uses the Wikidata Action API for better efficiency and compatibility. -- **Hosted on Toolforge**: [https://wd-textify.toolforge.org/](https://wd-textify.toolforge.org/) +## Output Formats ---- +- `json`: Structured representation with claims (and optionally qualifiers/references). +- `text`: Readable summary including label, description, aliases, and attributes. +- `triplet`: Triplet-style lines with labels and IDs for graph-style traversal. -## Formats - -- **Text**: A textual representation or summary of the Wikidata item, including its label, description, aliases, and claims. Useful for helping LLMs understand what the item represents. -- **Triplet**: Outputs each triplet as a structured line, including labels and IDs, but omits descriptions and aliases. Ideal for agentic LLMs to traverse and explore Wikidata. -- **JSON**: A structured and compact representation of the full item, suitable for custom formats. - ---- - -## API Usage +## API ### `GET /` -#### Query Parameters - -| Name | Type | Required | Description | -|----------------|---------|----------|-----------------------------------------------------------------------------| -| `id` | string | Yes | Wikidata item ID (e.g., `Q42`) | -| `lang` | string | No | Language code for labels (default: `en`) | -| `format` | string | No | The format of the response, either 'json', 'text', or 'triplet' (default: `json`) | -| `external_ids` | bool | No | Whether to include external IDs in the output (default: `true`) | -| `all_ranks` | bool | No | If false, returns ranked preferred statements, falling back to normal when unavailable (default: `false`) | -| `references` | bool | No | Whether to include references (default: `false`) | -| `fallback_lang` | string | No | Fallback language code if the preferred language is not available (default: `en`) | - ---- - -## Deploy to Toolforge - -1. 
Shell into the Toolforge system: - -```bash -ssh [UNIX shell username]@login.toolforge.org -``` - -2. Switch to tool user account: - -```bash -become wd-textify -``` - -3. Build from Git: - -```bash -toolforge build start https://github.com/philippesaade-wmde/WikidataTextifier.git -``` +#### Query parameters -4. Start the web service: +| Name | Type | Required | Description | +|---|---|---|---| +| `id` | string | Yes | Comma-separated Wikidata IDs (for example: `Q42` or `Q42,Q2`). | +| `pid` | string | No | Comma-separated property IDs to filter claims (for example: `P31,P279`). | +| `lang` | string | No | Preferred language code (default: `en`). | +| `fallback_lang` | string | No | Fallback language code (default: `en`). | +| `format` | string | No | Output format: `json`, `text`, or `triplet` (default: `json`). | +| `external_ids` | bool | No | Include `external-id` datatype claims (default: `true`). | +| `all_ranks` | bool | No | Include all statement ranks instead of preferred/normal filtering (default: `false`). | +| `qualifiers` | bool | No | Include qualifiers in claim values (default: `true`). | +| `references` | bool | No | Include references in claim values (default: `false`). | -```bash -webservice buildservice start --mount all -``` - -5. 
Debugging the web service: - -Read the logs: -```bash -webservice logs -``` +#### Example requests -Open the service shell: ```bash -webservice shell +curl "https://wd-textify.wmcloud.org/?id=Q42" +curl "https://wd-textify.wmcloud.org/?id=Q42&format=text&lang=en" +curl "https://wd-textify.wmcloud.org/?id=Q42,Q2&pid=P31,P279&format=triplet" ``` diff --git a/main.py b/main.py index f999d75..23cffdc 100644 --- a/main.py +++ b/main.py @@ -1,14 +1,16 @@ -from fastapi import FastAPI, HTTPException, Query, Request -from fastapi.middleware.cors import CORSMiddleware -from fastapi import BackgroundTasks +"""FastAPI application that exposes Wikidata textification endpoints.""" + +import os +import time import traceback + import requests -import time -import os +from fastapi import BackgroundTasks, FastAPI, HTTPException, Query, Request +from fastapi.middleware.cors import CORSMiddleware -from src.Normalizer import TTLNormalizer, JSONNormalizer -from src.WikidataLabel import WikidataLabel, LazyLabelFactory from src import utils +from src.Normalizer import JSONNormalizer, TTLNormalizer +from src.WikidataLabel import LazyLabelFactory, WikidataLabel # Start Fastapi app app = FastAPI( @@ -32,10 +34,13 @@ LABEL_CLEANUP_INTERVAL_SECONDS = int(os.environ.get("LABEL_CLEANUP_INTERVAL_SECONDS", 3600)) _last_label_cleanup = 0.0 + @app.on_event("startup") async def startup(): + """Initialize database resources required by the API.""" WikidataLabel.initialize_database() + @app.get( "/", responses={ @@ -43,57 +48,67 @@ async def startup(): "description": "Returns a list of relevant Wikidata property PIDs with similarity scores", "content": { "application/json": { - "example": [{ - "Q42": "Douglas Adams (human), English writer, humorist, and dramatist...", - }] + "example": [ + { + "Q42": "Douglas Adams (human), English writer, humorist, and dramatist...", + } + ] } }, }, 422: { "description": "Missing or invalid query parameter", - "content": { - "application/json": { - "example": 
{"detail": "Invalid format specified"} - } - }, + "content": {"application/json": {"example": {"detail": "Invalid format specified"}}}, }, }, ) async def get_textified_wd( - request: Request, background_tasks: BackgroundTasks, + request: Request, + background_tasks: BackgroundTasks, id: str = Query(..., examples="Q42,Q2"), pid: str = Query(None, examples="P31,P279"), - lang: str = 'en', - format: str = 'json', + lang: str = "en", + format: str = "json", external_ids: bool = True, references: bool = False, all_ranks: bool = False, qualifiers: bool = True, - fallback_lang: str = 'en' + fallback_lang: str = "en", ): - """ - Retrieve a Wikidata item with all labels or textual representations for an LLM. - - Args: - id (str): The Wikidata item ID (e.g., "Q42"). - pid (str): Comma-separated list of property IDs to filter claims (e.g., "P31,P279"). - format (str): The format of the response, either 'json', 'text', or 'triplet'. - lang (str): The language code for labels (default is 'en'). - external_ids (bool): If True, includes external IDs in the response. - all_ranks (bool): If True, includes statements of all ranks (preferred, normal, deprecated). - references (bool): If True, includes references in the response. (only available in JSON format) - qualifiers (bool): If True, includes qualifiers in the response. - fallback_lang (str): The fallback language code if the preferred language is not available. - - Returns: - list: A list of dictionaries containing QIDs and the similarity scores. + """Retrieve Wikidata entities as structured JSON, natural text, or triplet lines. + + This endpoint fetches one or more entities, resolves missing labels, and normalizes + claims into a compact representation suitable for downstream LLM use. + + **Args:** + + - **id** (str): Comma-separated Wikidata IDs to fetch (for example: `"Q42"` or `"Q42,Q2"`). + - **pid** (str, optional): Comma-separated property IDs used to filter returned claims (for example: `"P31,P279"`). 
+ - **lang** (str): Preferred language code for labels and formatted values. + - **format** (str): Output format. One of `"json"`, `"text"`, or `"triplet"`. + - **external_ids** (bool): If `true`, include claims with datatype `external-id`. + - **references** (bool): If `true`, include references in claim values (JSON output only). + - **all_ranks** (bool): If `true`, include preferred, normal, and deprecated statement ranks. + - **qualifiers** (bool): If `true`, include qualifiers for claim values. + - **fallback_lang** (str): Fallback language used when `lang` is unavailable. + - **request** (Request): FastAPI request context object. + - **background_tasks** (BackgroundTasks): Background task manager used for cache cleanup. + + **Returns:** + + A dictionary keyed by requested entity ID (for example, `"Q42"`). + Each value depends on `format`: + + - **json**: Structured entity payload with label, description, aliases, and claims. + - **text**: Human-readable summary text. + - **triplet**: Triplet-style text lines with labels and IDs. 
""" try: filter_pids = [] if pid: - filter_pids = [p.strip() for p in pid.split(',')] + filter_pids = [p.strip() for p in pid.split(",")] - qids = [q.strip() for q in id.split(',')] + qids = [q.strip() for q in id.split(",")] label_factory = LazyLabelFactory(lang=lang, fallback_lang=fallback_lang) entities = {} @@ -144,7 +159,9 @@ async def get_textified_wd( fallback_lang=fallback_lang, label_factory=label_factory, debug=False, - ) if entity_data.get(qid) else None + ) + if entity_data.get(qid) + else None for qid in qids } @@ -154,8 +171,10 @@ async def get_textified_wd( all_ranks=all_ranks, references=references, filter_pids=filter_pids, - qualifiers=qualifiers - ) if entity else None + qualifiers=qualifiers, + ) + if entity + else None for qid, entity in entity_data.items() } @@ -165,9 +184,9 @@ async def get_textified_wd( return_data[qid] = None continue - if format == 'text': + if format == "text": results = entity.to_text(lang) - elif format == 'triplet': + elif format == "triplet": results = entity.to_triplet() else: results = entity.to_json() diff --git a/pyproject.toml b/pyproject.toml index 32e80d5..6eedfea 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,3 +13,30 @@ dependencies = [ "sqlalchemy>=2.0.41", "uvicorn>=0.35.0", ] + +[dependency-groups] +dev = [ + "ruff>=0.9.0" +] + +[tool.ruff] +target-version = "py313" +line-length = 120 + +exclude = ["data/mysql"] + +[tool.ruff.lint] +select = [ + "E", # pycodestyle errors + "F", # Pyflakes (catches undefined names, unused imports, etc.) 
+ "I", # isort (import sorting) + "D", # pydocstyle (function/class documentation) +] + +[tool.ruff.lint.pydocstyle] +convention = "google" + +[tool.ruff.lint.isort] +known-first-party = [ + "wikidatasearch" +] diff --git a/src/Normalizer/JSONNormalizer.py b/src/Normalizer/JSONNormalizer.py index 87773f8..bb11b79 100644 --- a/src/Normalizer/JSONNormalizer.py +++ b/src/Normalizer/JSONNormalizer.py @@ -1,9 +1,11 @@ +"""Normalize Wikidata Action API JSON into internal textifier objects.""" + from __future__ import annotations from typing import Any, Dict, List, Optional + import requests -from ..WikidataLabel import WikidataLabel, LazyLabelFactory from ..Textifier.WikidataTextifier import ( WikidataClaim, WikidataClaimValue, @@ -14,11 +16,11 @@ WikidataTime, ) from ..utils import wikidata_geolocation_to_text, wikidata_time_to_text +from ..WikidataLabel import LazyLabelFactory, WikidataLabel class JSONNormalizer: - """Build WikidataEntity + claims tree from Wikidata JSON (wbgetentities style). - """ + """Normalize ``wbgetentities`` JSON into internal textifier objects.""" def __init__( self, @@ -29,6 +31,16 @@ def __init__( label_factory: Optional[LazyLabelFactory] = None, debug: bool = False, ): + """Initialize a normalizer for a single entity payload. + + Args: + entity_id (str): Entity ID being normalized. + entity_json (dict[str, Any]): Raw ``wbgetentities`` JSON for ``entity_id``. + lang (str): Preferred language for label selection. + fallback_lang (str): Fallback language when ``lang`` is unavailable. + label_factory (LazyLabelFactory | None): Shared lazy label factory for nested entities. + debug (bool): Whether to print additional debug output while parsing. 
+ """ self.entity_id = entity_id self.entity_json = entity_json @@ -36,7 +48,6 @@ def __init__( self.fallback_lang = fallback_lang self.debug = debug - self.label_factory = label_factory or LazyLabelFactory(lang=lang, fallback_lang=fallback_lang) # ------------------------------------------------------------------------- @@ -51,6 +62,18 @@ def normalize( qualifiers: bool = True, filter_pids: List[str] = [], ) -> WikidataEntity: + """Normalize the entity JSON payload into a ``WikidataEntity`` tree. + + Args: + external_ids (bool): Whether to include ``external-id`` datatype claims. + references (bool): Whether to include references for each statement value. + all_ranks (bool): Whether to include statements of all ranks. + qualifiers (bool): Whether to include qualifiers for statement values. + filter_pids (list[str]): Optional allow-list of property IDs to keep. + + Returns: + WikidataEntity: Parsed entity object with claims and values. + """ e = self.entity_json if not isinstance(e, dict) or "labels" not in e: if self.debug: @@ -95,7 +118,7 @@ def normalize( external_ids=external_ids, include_references=references, all_ranks=all_ranks, - qualifiers=qualifiers + qualifiers=qualifiers, ) if claim_obj is not None and claim_obj.values: claims_out.append(claim_obj) @@ -147,7 +170,7 @@ def _build_claim( statement=st, datatype=datatype, include_references=include_references, - qualifiers=qualifiers + qualifiers=qualifiers, ) if cv is not None: values.append(cv) diff --git a/src/Normalizer/TTLNormalizer.py b/src/Normalizer/TTLNormalizer.py index c3c45a1..b6b4f6f 100644 --- a/src/Normalizer/TTLNormalizer.py +++ b/src/Normalizer/TTLNormalizer.py @@ -1,13 +1,14 @@ +"""Normalize Wikidata TTL into internal textifier objects.""" + from __future__ import annotations from collections import defaultdict from typing import Any, DefaultDict, Dict, List, Optional, Set -import requests +import requests from rdflib import Graph, Literal, Namespace, URIRef from rdflib.namespace import 
RDF, RDFS -from ..WikidataLabel import WikidataLabel, LazyLabelFactory from ..Textifier.WikidataTextifier import ( WikidataClaim, WikidataClaimValue, @@ -18,9 +19,9 @@ WikidataTime, ) from ..utils import wikidata_geolocation_to_text, wikidata_time_to_text +from ..WikidataLabel import LazyLabelFactory, WikidataLabel - -# Namespaces used by Wikidata TTL exports +# Namespaces used by Wikidata TTL WD = Namespace("http://www.wikidata.org/entity/") P = Namespace("http://www.wikidata.org/prop/") PS = Namespace("http://www.wikidata.org/prop/statement/") @@ -39,18 +40,18 @@ class TTLNormalizer: - """Parse a Wikidata Special:EntityData TTL and build a WikidataEntity with claims. + """Normalize ``Special:EntityData`` TTL into internal textifier objects. Label resolution order: - 1) labels present in TTL - 2) LazyLabelFactory bulk lookup for the remainder + 1) Labels present in TTL. + 2) ``LazyLabelFactory`` bulk lookup for unresolved IDs. Notes: - - Claims are extracted from wd:{QID} p:{PID} {statement} triples only. - - Statement nodes are validated structurally before value extraction. - - Special values (somevalue/novalue) are treated as "no main value" when + - Claims are extracted from ``wd:{QID} p:{PID} {statement}`` triples only. + - Statement nodes are validated structurally before value extraction. + - Special values (somevalue/novalue) are treated as "no main value" when neither ps: nor psv: is present on the statement node. - - Property datatype is read from wikibase:propertyType when available, + - Property datatype is read from ``wikibase:propertyType`` when available, otherwise inferred from the statement's value nodes when possible. """ @@ -63,6 +64,16 @@ def __init__( label_factory: Optional[LazyLabelFactory] = None, debug: bool = False, ): + """Initialize a normalizer for a single TTL document. + + Args: + entity_id (str): Entity ID being normalized. + ttl_text (str): Raw TTL document from ``Special:EntityData``. + lang (str): Preferred language for label selection. + fallback_lang (str): Fallback language when ``lang`` is unavailable. + label_factory (LazyLabelFactory | None): Shared lazy label factory for nested entities. + debug (bool): Whether to print additional debug output while parsing. + """ self.entity_id = entity_id self.g = Graph() self.g.parse(data=ttl_text, format="turtle") @@ -83,8 +94,20 @@ def normalize( references: bool = False, all_ranks: bool = False, qualifiers: bool = True, - filter_pids: List[str] = [] + filter_pids: List[str] = [], ) -> WikidataEntity: + """Normalize the parsed graph into a ``WikidataEntity`` tree. + + Args: + external_ids (bool): Whether to include ``external-id`` datatype claims. + references (bool): Whether to include references for each statement value. + all_ranks (bool): Whether to include statements of all ranks. + qualifiers (bool): Whether to include qualifiers for statement values. + filter_pids (list[str]): Optional allow-list of property IDs to keep. + + Returns: + WikidataEntity: Parsed entity object with claims and values. + """ # Preload labels found inside TTL so LazyLabelFactory can avoid lookups. 
self.label_factory._resolved_labels = self._build_label_cache_from_ttl() @@ -111,7 +134,7 @@ def normalize( include_references=references, all_ranks=all_ranks, qualifiers=qualifiers, - filter_pids=filter_pids + filter_pids=filter_pids, ) entity = WikidataEntity( @@ -168,7 +191,7 @@ def _claims_for_subject( include_references: bool, all_ranks: bool, qualifiers: bool, - filter_pids: List[str] = [] + filter_pids: List[str] = [], ) -> Dict[str, List[Dict[str, Any]]]: """Return mapping: pid -> list of statement dicts.""" out: DefaultDict[str, List[Dict[str, Any]]] = defaultdict(list) @@ -312,7 +335,7 @@ def _build_claim_object( refs_obj: List[List[WikidataClaim]] = [] if include_references: - for ref in (st.get("references") or []): + for ref in st.get("references") or []: ref_claims = [ self._build_snak_claim( pid=rpid, diff --git a/src/Normalizer/__init__.py b/src/Normalizer/__init__.py index d1d327c..b35407a 100644 --- a/src/Normalizer/__init__.py +++ b/src/Normalizer/__init__.py @@ -1,2 +1,6 @@ +"""Public exports for normalizer classes.""" + +from .JSONNormalizer import JSONNormalizer from .TTLNormalizer import TTLNormalizer -from .JSONNormalizer import JSONNormalizer \ No newline at end of file + +__all__ = ["JSONNormalizer", "TTLNormalizer"] diff --git a/src/Textifier/WikidataTextifier.py b/src/Textifier/WikidataTextifier.py index d09b909..7687fca 100644 --- a/src/Textifier/WikidataTextifier.py +++ b/src/Textifier/WikidataTextifier.py @@ -1,9 +1,11 @@ +"""Data structures for Wikidata entities and serialization helpers.""" + from __future__ import annotations +import json from dataclasses import dataclass, field from pathlib import Path from typing import Any, Dict, List, Optional, Union -import json LANGUAGE_VARIABLES_PATH = Path(__file__).with_name("language_variables.json") with LANGUAGE_VARIABLES_PATH.open("r", encoding="utf-8") as f: @@ -13,34 +15,45 @@ # Atomic value types # --------------------------------------------------------------------------- + 
@dataclass(slots=True) class WikidataText: + """Object for Wikidata plain text values.""" + text: Optional[str] = None def __str__(self) -> str: + """Return the text representation.""" return self.text or "" def __bool__(self) -> bool: + """Return whether this text wrapper contains content.""" return bool(self.text) def to_json(self) -> Optional[str]: + """Serialize to a JSON-friendly scalar.""" return self.text @dataclass(slots=True) class WikidataCoordinates: + """Object for Wikidata coordinate values.""" + latitude: Optional[float] = None longitude: Optional[float] = None string_val: Optional[str] = None def __str__(self) -> str: + """Return a readable coordinate string.""" return self.string_val or "" def __bool__(self) -> bool: + """Return whether both latitude and longitude are present.""" # coordinates are meaningful if we have both lat/lon return self.latitude is not None and self.longitude is not None def to_json(self) -> Dict[str, Any]: + """Serialize coordinates to a JSON object.""" return { "latitude": self.latitude, "longitude": self.longitude, @@ -50,18 +63,23 @@ def to_json(self) -> Dict[str, Any]: @dataclass(slots=True) class WikidataTime: + """Object for Wikidata time values.""" + time: Optional[str] = None precision: Optional[int] = None calendarmodel: Optional[str] = None string_val: Optional[str] = None def __str__(self) -> str: + """Return a readable time string.""" return self.string_val or "" def __bool__(self) -> bool: + """Return whether this instance contains a time value.""" return bool(self.time) or bool(self.string_val) def to_json(self) -> Dict[str, Any]: + """Serialize time to a JSON object.""" return { "time": self.time, "precision": self.precision, @@ -72,11 +90,14 @@ def to_json(self) -> Dict[str, Any]: @dataclass(slots=True) class WikidataQuantity: + """Object for Wikidata quantity values.""" + amount: Optional[str] = None unit: Optional[Any] = None unit_id: Optional[str] = None def __str__(self) -> str: + """Return a readable 
quantity string.""" if not self.amount: return "" if self.unit_id: @@ -84,9 +105,11 @@ def __str__(self) -> str: return str(self.amount) def __bool__(self) -> bool: + """Return whether this quantity has an amount.""" return bool(self.amount) def to_json(self) -> Any: + """Serialize quantity to a scalar or object.""" if not self.amount: return None if self.unit_id: @@ -102,8 +125,11 @@ def to_json(self) -> Any: # Core graph types # --------------------------------------------------------------------------- + @dataclass(slots=True) class WikidataEntity: + """Object for Wikidata entities.""" + id: str label: Optional[Any] = None description: Optional[str] = None @@ -111,12 +137,14 @@ class WikidataEntity: claims: List["WikidataClaim"] = field(default_factory=list) def __bool__(self) -> bool: + """Return whether this entity has a usable id and label.""" return bool(self.id) and self.label is not None and str(self.label) != "" - def to_text(self, lang='en', keep_empty: bool = False) -> str: - lang_var = LANGUAGE_VARIABLES.get(lang, LANGUAGE_VARIABLES.get('en')) + def to_text(self, lang="en", keep_empty: bool = False) -> str: + """Render the entity into a readable text.""" + lang_var = LANGUAGE_VARIABLES.get(lang, LANGUAGE_VARIABLES.get("en")) - label_str = str(self.label) if self.label else '' + label_str = str(self.label) if self.label else "" string = label_str if self.description: @@ -125,9 +153,7 @@ def to_text(self, lang='en', keep_empty: bool = False) -> str: string += f"{lang_var[', ']}{lang_var['also known as']}" string += f" {lang_var[', '].join(map(str, self.aliases))}" - attributes = [c.to_text(lang, keep_empty=keep_empty) \ - for c in self.claims \ - if keep_empty or c] + attributes = [c.to_text(lang) for c in self.claims if keep_empty or c] if len(attributes) > 0: attributes = "\n- ".join(attributes) string += f". 
{lang_var['Attributes include']}:\n- {attributes}" @@ -137,6 +163,7 @@ def to_text(self, lang='en', keep_empty: bool = False) -> str: return string def to_json(self) -> Dict[str, Any]: + """Serialize the entity to a JSON object.""" id_key = "PID" if self.id.startswith("P") else "QID" return { id_key: self.id, @@ -147,6 +174,7 @@ def to_json(self) -> Dict[str, Any]: } def to_triplet(self) -> str: + """Render the entity as triplet lines.""" head = f"{str(self.label) if self.label else ''} ({self.id})" lines: List[str] = [] if self.description: @@ -164,12 +192,15 @@ def to_triplet(self) -> str: @dataclass(slots=True) class WikidataClaim: + """Object for Wikidata claims.""" + subject: WikidataEntity property: WikidataEntity values: List["WikidataClaimValue"] = field(default_factory=list) datatype: str = "string" def __bool__(self) -> bool: + """Return whether this claim contains a value.""" return ( self.property is not None and str(self.property.label) != "" @@ -177,16 +208,18 @@ def __bool__(self) -> bool: and any(bool(v) for v in self.values) ) - def to_text(self, lang='en') -> str: - lang_var = LANGUAGE_VARIABLES.get(lang, LANGUAGE_VARIABLES.get('en')) + def to_text(self, lang="en") -> str: + """Render the claim into a readable text.""" + lang_var = LANGUAGE_VARIABLES.get(lang, LANGUAGE_VARIABLES.get("en")) if self.values: - values = lang_var[', '].join(v.to_text(lang) for v in self.values if v) + values = lang_var[", "].join(v.to_text(lang) for v in self.values if v) return f"{str(self.property.label)}: {values}" return f"{lang_var['has']} {str(self.property.label)}" def to_json(self) -> Dict[str, Any]: + """Serialize the claim to a JSON object.""" prop_json = self.property.to_json() prop_id = prop_json.get("PID") or prop_json.get("QID") return { @@ -197,6 +230,7 @@ def to_json(self) -> Dict[str, Any]: } def to_triplet(self, as_qualifier: bool = False) -> str: + """Render the claim as triplet text.""" if not self: return "" @@ -216,19 +250,21 @@ def 
to_triplet(self, as_qualifier: bool = False) -> str: @dataclass(slots=True) class WikidataClaimValue: + """Object for Wikidata claim values.""" + claim: WikidataClaim - value: Optional[ - Union[WikidataEntity, WikidataQuantity, WikidataTime, WikidataCoordinates, WikidataText] - ] = None + value: Optional[Union[WikidataEntity, WikidataQuantity, WikidataTime, WikidataCoordinates, WikidataText]] = None qualifiers: List[WikidataClaim] = field(default_factory=list) references: List[List[WikidataClaim]] = field(default_factory=list) rank: Optional[str] = None # preferred|normal|deprecated def __bool__(self) -> bool: + """Return whether this claim value has non-empty values.""" return self.value is not None and str(self.value) != "" - def to_text(self, lang='en') -> str: - lang_var = LANGUAGE_VARIABLES.get(lang, LANGUAGE_VARIABLES.get('en')) + def to_text(self, lang="en") -> str: + """Render the value and qualifiers as readable text.""" + lang_var = LANGUAGE_VARIABLES.get(lang, LANGUAGE_VARIABLES.get("en")) if not self: return "" @@ -248,6 +284,7 @@ def to_text(self, lang='en') -> str: return s def to_json(self) -> Optional[Dict[str, Any]]: + """Serialize the claim value to a JSON object.""" if not self: return None @@ -280,6 +317,7 @@ def to_json(self) -> Optional[Dict[str, Any]]: return out def to_triplet(self) -> str: + """Render the value as triplet text.""" if not self: return "" diff --git a/src/Textifier/__init__.py b/src/Textifier/__init__.py index 18af4cb..218caa9 100644 --- a/src/Textifier/__init__.py +++ b/src/Textifier/__init__.py @@ -1 +1,19 @@ -from .WikidataTextifier import WikidataEntity, WikidataClaim, WikidataClaimValue, WikidataCoordinates, WikidataTime, WikidataQuantity \ No newline at end of file +"""Public exports for textifier data structures.""" + +from .WikidataTextifier import ( + WikidataClaim, + WikidataClaimValue, + WikidataCoordinates, + WikidataEntity, + WikidataQuantity, + WikidataTime, +) + +__all__ = [ + "WikidataClaim", + 
"WikidataClaimValue", + "WikidataCoordinates", + "WikidataEntity", + "WikidataQuantity", + "WikidataTime", +] diff --git a/src/WikidataLabel.py b/src/WikidataLabel.py index c082e57..c4d9f18 100644 --- a/src/WikidataLabel.py +++ b/src/WikidataLabel.py @@ -1,15 +1,14 @@ -from sqlalchemy import Column, String, DateTime, create_engine, text -from sqlalchemy.dialects.mysql import JSON -from sqlalchemy.orm import sessionmaker, declarative_base +"""Label cache and lazy label resolution for Wikidata entities.""" -from .utils import get_wikidata_json_by_ids -from datetime import datetime, timedelta -import os import json +import os +from datetime import datetime, timedelta + +from sqlalchemy import Column, DateTime, String, create_engine, text +from sqlalchemy.dialects.mysql import JSON +from sqlalchemy.orm import declarative_base, sessionmaker -""" -MySQL database setup for storing Wikidata labels in all languages. -""" +from .utils import get_wikidata_json_by_ids DB_HOST = os.environ.get("DB_HOST", "localhost") DB_NAME = os.environ.get("DB_NAME", "label") @@ -22,10 +21,7 @@ LABEL_MAX_ROWS = int(os.environ.get("LABEL_MAX_ROWS", "10000000")) REQUEST_TIMEOUT_SECONDS = float(os.environ.get("REQUEST_TIMEOUT_SECONDS", "15")) -DATABASE_URL = ( - f"mariadb+pymysql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}" - f"?charset=utf8mb4" -) +DATABASE_URL = f"mariadb+pymysql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}?charset=utf8mb4" engine = create_engine( DATABASE_URL, @@ -38,17 +34,18 @@ Base = declarative_base() Session = sessionmaker(bind=engine, expire_on_commit=False) + class WikidataLabel(Base): - __tablename__ = 'labels' + """Database cache for multilingual Wikidata labels.""" + + __tablename__ = "labels" id = Column(String(64), primary_key=True) labels = Column(JSON, default=dict) date_added = Column(DateTime, default=datetime.now, index=True) @staticmethod def initialize_database(): - """ - Create tables if they don't already exist. 
- """ + """Create tables if they do not already exist.""" try: Base.metadata.create_all(engine) return True @@ -58,33 +55,34 @@ def initialize_database(): @staticmethod def add_bulk_labels(data): - """ - Insert multiple label records in bulk. + """Insert or update multiple label records. - Parameters: - - data (list[dict]): A list of dictionaries, each containing 'id', 'labels' keys. + Args: + data (list[dict]): Records containing at least ``id`` and ``labels`` keys. Returns: - - bool: True if the operation was successful, False otherwise. + bool: ``True`` when the operation succeeds, otherwise ``False``. """ if not data: return True for i in range(len(data)): - data[i]['date_added'] = datetime.now() + data[i]["date_added"] = datetime.now() if isinstance(data[i].get("labels"), dict): data[i]["labels"] = json.dumps(data[i]["labels"], ensure_ascii=False, separators=(",", ":")) - with Session() as session: try: - session.execute(text(''' + session.execute( + text(""" INSERT INTO labels (id, labels, date_added) VALUES (:id, :labels, :date_added) ON DUPLICATE KEY UPDATE labels = VALUES(labels), date_added = VALUES(date_added) - '''), data) + """), + data, + ) session.commit() return True @@ -95,22 +93,18 @@ def add_bulk_labels(data): @staticmethod def add_label(id, labels): - """ - Insert a labels and descriptions into the database. + """Insert or update labels for a single entity. - Parameters: - - id (str): The unique identifier for the entity. - - labels (dict): A dictionary of labels (e.g. { "en": "Label in English", "fr": "Label in French", ... }). + Args: + id (str): Entity ID. + labels (dict): Mapping of language code to label text. Returns: - - bool: True if the operation was successful, False otherwise. + bool: ``True`` when the operation succeeds, otherwise ``False``. 
""" with Session() as session: try: - new_entry = WikidataLabel( - id=id, - labels=labels - ) + new_entry = WikidataLabel(id=id, labels=labels) session.add(new_entry) session.commit() return True @@ -121,24 +115,23 @@ def add_label(id, labels): @staticmethod def get_labels(id): - """ - Retrieve labels and descriptions for a given entity by its ID. + """Retrieve cached labels for one entity, with API fallback. - Parameters: - - id (str): The unique identifier of the entity. + Args: + id (str): Entity ID. Returns: - - dict: The labels dictionary if found, otherwise an empty dict. + dict | None: Cached or fetched labels for the entity, if available. """ try: with Session() as session: # Get labels that are less than LABEL_TTL_DAYS old - date_limit = (datetime.now() - timedelta(days=LABEL_TTL_DAYS)) - item = session.query(WikidataLabel)\ - .filter( - WikidataLabel.id == id, - WikidataLabel.date_added >= date_limit - ).first() + date_limit = datetime.now() - timedelta(days=LABEL_TTL_DAYS) + item = ( + session.query(WikidataLabel) + .filter(WikidataLabel.id == id, WikidataLabel.date_added >= date_limit) + .first() + ) if item is not None: return item.labels or {} @@ -153,14 +146,13 @@ def get_labels(id): @staticmethod def get_bulk_labels(ids): - """ - Retrieve labels for multiple entities by their IDs. + """Retrieve cached labels for multiple entities, with API fallback. - Parameters: - - ids (list[str]): A list of entity IDs to retrieve. + Args: + ids (list[str]): Entity IDs to fetch. Returns: - - dict[str, dict]: A dictionary mapping each ID to its labels. + dict[str, dict]: Mapping of each requested ID to its labels. 
""" if not ids: return {} @@ -169,12 +161,12 @@ def get_bulk_labels(ids): try: with Session() as session: # Get labels that are less than LABEL_TTL_DAYS old - date_limit = (datetime.now() - timedelta(days=LABEL_TTL_DAYS)) - rows = session.query(WikidataLabel.id, WikidataLabel.labels)\ - .filter( - WikidataLabel.id.in_(ids), - WikidataLabel.date_added >= date_limit - ).all() + date_limit = datetime.now() - timedelta(days=LABEL_TTL_DAYS) + rows = ( + session.query(WikidataLabel.id, WikidataLabel.labels) + .filter(WikidataLabel.id.in_(ids), WikidataLabel.date_added >= date_limit) + .all() + ) labels = {id: labels for id, labels in rows} except Exception as e: print(f"Error while fetching cached labels in bulk: {e}") @@ -186,18 +178,18 @@ def get_bulk_labels(ids): labels.update(missing_labels) # Cache labels - WikidataLabel.add_bulk_labels([ - {'id': entity_id, 'labels': entity_labels} - for entity_id, entity_labels in missing_labels.items() - ]) + WikidataLabel.add_bulk_labels( + [{"id": entity_id, "labels": entity_labels} for entity_id, entity_labels in missing_labels.items()] + ) return labels @staticmethod def delete_old_labels(): - """ - Delete labels older than X days. - If the database exceeds 10 million rows, delete the oldest rows until it is below the threshold. + """Delete expired labels and enforce maximum cache size. + + Returns: + bool: ``True`` when cleanup succeeds or is skipped, otherwise ``False``. 
""" if LABEL_UNLIMITED: return True @@ -205,11 +197,8 @@ def delete_old_labels(): with Session() as session: try: # Step 1: Delete labels older than X days - date_limit = (datetime.now() - timedelta(days=LABEL_TTL_DAYS)) - session.execute( - text("DELETE FROM labels WHERE date_added < :date_limit"), - {"date_limit": date_limit} - ) + date_limit = datetime.now() - timedelta(days=LABEL_TTL_DAYS) + session.execute(text("DELETE FROM labels WHERE date_added < :date_limit"), {"date_limit": date_limit}) session.commit() # Step 2: Check total count @@ -231,7 +220,7 @@ def delete_old_labels(): LIMIT :rows_to_delete ) AS old_labels ON l.id = old_labels.id """), - {"rows_to_delete": rows_to_delete} + {"rows_to_delete": rows_to_delete}, ) session.commit() @@ -244,14 +233,13 @@ def delete_old_labels(): @staticmethod def _get_labels_wdapi(ids): - """ - Retrieve labels from the Wikidata API for a list of IDs. + """Retrieve labels from the Wikidata API. - Parameters: - - ids (list[str] or str): A list of Wikidata entity IDs or a single string of IDs separated by '|'. + Args: + ids (list[str] | str): IDs as a list or ``|``-separated string. Returns: - - dict: A dictionary mapping each ID to its labels. + dict[str, dict]: Mapping of each ID to compressed labels. """ entities_data = get_wikidata_json_by_ids(ids, props="labels") entities_data = WikidataLabel._compress_labels(entities_data) @@ -259,63 +247,67 @@ def _get_labels_wdapi(ids): @staticmethod def _compress_labels(data): - """ - Compress labels by extracting the 'value' field from each label. + """Compress API labels by extracting each language's ``value`` field. - Parameters: - - data (dict): A dictionary of labels from Wikidata API. + Args: + data (dict): Raw entities payload from the Wikidata API. Returns: - - dict: A new dictionary with labels compressed to their 'value' field. + dict[str, dict]: Mapping of entity ID to ``{lang: label}``. 
""" new_labels = {} for qid, labels in data.items(): - if 'labels' in labels: - new_labels[qid] = { - lang: label.get('value') \ - for lang, label in labels['labels'].items() - } + if "labels" in labels: + new_labels[qid] = {lang: label.get("value") for lang, label in labels["labels"].items()} else: new_labels[qid] = {} return new_labels @staticmethod - def get_lang_val(data, lang='en', fallback_lang=None): - """ - Extracts the value for a given language from a dictionary of labels. + def get_lang_val(data, lang="en", fallback_lang=None): + """Return the best label text from a labels dictionary. + + Args: + data (dict): Label dictionary keyed by language. + lang (str): Preferred language code. + fallback_lang (str | None): Optional fallback language code. + + Returns: + str: Selected label text, or an empty string when missing. """ - label = data.get(lang, data.get('mul', {})) + label = data.get(lang, data.get("mul", {})) if fallback_lang and not label: label = data.get(fallback_lang, {}) if isinstance(label, str): return label - return label.get('value', '') + return label.get("value", "") @staticmethod def get_all_missing_labels_ids(data): - """ - Get the IDs of the entity dictionary where their labels are missing. + """Collect all referenced IDs that may require label lookup. - Parameters: - - data (dict or list): The data structure to search for missing labels. + Args: + data (dict | list): Nested entity structure to scan. Returns: - - set: A set of IDs that are missing labels. + set[str]: Referenced IDs that may be missing resolved labels. 
""" ids_list = set() if isinstance(data, dict): - if 'property' in data: - ids_list.add(data['property']) - if ('unit' in data) and (data['unit'] != '1'): - ids_list.add(data['unit'].split('/')[-1]) - if ('datatype' in data) and \ - ('datavalue' in data) and \ - (data['datatype'] in ['wikibase-item', 'wikibase-property']): - ids_list.add(data['datavalue']['value']['id']) - if ('claims' in data) and isinstance(data['claims'], dict): - ids_list = ids_list | data['claims'].keys() + if "property" in data: + ids_list.add(data["property"]) + if ("unit" in data) and (data["unit"] != "1"): + ids_list.add(data["unit"].split("/")[-1]) + if ( + ("datatype" in data) + and ("datavalue" in data) + and (data["datatype"] in ["wikibase-item", "wikibase-property"]) + ): + ids_list.add(data["datavalue"]["value"]["id"]) + if ("claims" in data) and isinstance(data["claims"], dict): + ids_list = ids_list | data["claims"].keys() for _, value in data.items(): ids_list = ids_list | WikidataLabel.get_all_missing_labels_ids(value) @@ -326,27 +318,55 @@ def get_all_missing_labels_ids(data): return ids_list + class LazyLabel: + """Deferred label string that resolves via a shared factory.""" + def __init__(self, qid, factory): + """Store the target entity ID and the lookup factory. + + Args: + qid (str): Entity ID whose label should be resolved lazily. + factory (LazyLabelFactory): Factory that performs batched label resolution. + """ self.qid = qid self.factory = factory def __str__(self): + """Resolve and return the label text for the configured entity.""" self.factory.resolve_all() return self.factory.get_label(self.qid) + class LazyLabelFactory: - def __init__(self, lang='en', fallback_lang='en'): + """Create and batch-resolve lazy Wikidata labels.""" + + def __init__(self, lang="en", fallback_lang="en"): + """Initialize a lazy label factory. + + Args: + lang (str): Preferred language code. + fallback_lang (str): Fallback language code. 
+ """ self.lang = lang self.fallback_lang = fallback_lang self._pending_ids = set() self._resolved_labels = {} def create(self, qid: str) -> "LazyLabel": + """Create a lazy label handle and queue its ID for resolution. + + Args: + qid (str): Entity ID to resolve. + + Returns: + LazyLabel: Lazy label wrapper bound to this factory. + """ self._pending_ids.add(qid) return LazyLabel(qid, factory=self) def resolve_all(self): + """Resolve all pending IDs in a single bulk lookup.""" if not self._pending_ids: return @@ -356,10 +376,23 @@ def resolve_all(self): self._pending_ids.clear() def get_label(self, qid: str) -> str: + """Return the resolved label text for an entity ID. + + Args: + qid (str): Entity ID. + + Returns: + str: Best label text according to current language settings. + """ label_dict = self._resolved_labels.get(qid, {}) label = WikidataLabel.get_lang_val(label_dict, lang=self.lang, fallback_lang=self.fallback_lang) return label def set_lang(self, lang: str): + """Update preferred language and resolve pending IDs. + + Args: + lang (str): Preferred language code. 
+ """ self.lang = lang self.resolve_all() diff --git a/src/__init__.py b/src/__init__.py index dde2dfc..acaf1ea 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -1,4 +1,36 @@ -from .WikidataLabel import * -from .Textifier import * -from .Normalizer import * -from .utils import * \ No newline at end of file +"""Public package exports for Wikidata textification primitives.""" + +from .Normalizer import JSONNormalizer, TTLNormalizer +from .Textifier import ( + WikidataClaim, + WikidataClaimValue, + WikidataCoordinates, + WikidataEntity, + WikidataQuantity, + WikidataTime, +) +from .utils import ( + get_wikidata_json_by_ids, + get_wikidata_ttl_by_id, + wikidata_geolocation_to_text, + wikidata_time_to_text, +) +from .WikidataLabel import LazyLabel, LazyLabelFactory, WikidataLabel + +__all__ = [ + "JSONNormalizer", + "TTLNormalizer", + "WikidataClaim", + "WikidataClaimValue", + "WikidataCoordinates", + "WikidataEntity", + "WikidataLabel", + "WikidataQuantity", + "WikidataTime", + "LazyLabel", + "LazyLabelFactory", + "get_wikidata_json_by_ids", + "get_wikidata_ttl_by_id", + "wikidata_geolocation_to_text", + "wikidata_time_to_text", +] diff --git a/src/utils.py b/src/utils.py index 880a951..5e125e5 100644 --- a/src/utils.py +++ b/src/utils.py @@ -1,10 +1,12 @@ -import requests -from requests.adapters import HTTPAdapter +"""HTTP helpers and value-formatting utilities for Wikidata APIs.""" -import json import html +import json import os +import requests +from requests.adapters import HTTPAdapter + REQUEST_TIMEOUT_SECONDS = float(os.environ.get("REQUEST_TIMEOUT_SECONDS", "15")) SESSION = requests.Session() @@ -12,25 +14,27 @@ SESSION.mount("http://", adapter) SESSION.mount("https://", adapter) + def get_wikidata_ttl_by_id( - id, - lang='en', - ): - """Fetches a Wikidata entity by its ID and returns its TTL representation. + id, + lang="en", +): + """Fetch a Wikidata entity as TTL from ``Special:EntityData``. Args: - id (str): A Wikidata entity ID (e.g., Q42, P31). 
- lang (str, optional): The language to use for the response. Defaults to 'en'. + id (str): Wikidata entity ID, for example ``"Q42"`` or ``"P31"``. + lang (str, optional): Language code for server-side label rendering. Returns: - str: The TTL representation of the entity. + str: TTL document for the requested entity. + + Raises: + requests.HTTPError: If Wikidata returns an error response. """ params = { - 'uselang': lang, - } - headers = { - 'User-Agent': 'Wikidata Textifier (embeddings@wikimedia.de)' + "uselang": lang, } + headers = {"User-Agent": "Wikidata Textifier (embeddings@wikimedia.de)"} response = SESSION.get( f"https://www.wikidata.org/wiki/Special:EntityData/{id}.ttl", @@ -42,23 +46,21 @@ def get_wikidata_ttl_by_id( return response.text -def get_wikidata_json_by_ids( - ids, - props='labels|descriptions|aliases|claims' - ): - """ - Fetches Wikidata entities by their IDs and returns a dictionary of entities. +def get_wikidata_json_by_ids(ids, props="labels|descriptions|aliases|claims"): + """Fetch one or more Wikidata entities from ``wbgetentities``. - Parameters: - - ids (list[str] or str): A list of Wikidata entity IDs (e.g., Q42, P31) or a single ID as a string. - - props (str): The properties to retrieve (default is 'labels|descriptions|aliases|claims'). + Args: + ids (list[str] | str): Entity IDs as a list or ``|``-separated string. + props (str): Pipe-delimited properties requested from the API. Returns: - - dict: A dictionary containing the entities, where keys are entity IDs and values are dictionaries of properties. - """ + dict[str, dict]: Mapping of entity IDs to API entity payloads. + Raises: + requests.HTTPError: If Wikidata returns an error response. + """ if isinstance(ids, str): - ids = ids.split('|') + ids = ids.split("|") ids = list(dict.fromkeys(ids)) # Ensure unique IDs entities_data = {} @@ -66,18 +68,15 @@ def get_wikidata_json_by_ids( # Wikidata API has a limit on the number of IDs per request, # typically 50 for wbgetentities. 
for chunk_idx in range(0, len(ids), 50): - - ids_chunk = ids[chunk_idx:chunk_idx+50] + ids_chunk = ids[chunk_idx : chunk_idx + 50] params = { - 'action': 'wbgetentities', - 'ids': "|".join(ids_chunk), - 'props': props, - 'format': 'json', - 'origin': '*', - } - headers = { - 'User-Agent': 'Wikidata Textifier (embeddings@wikimedia.de)' + "action": "wbgetentities", + "ids": "|".join(ids_chunk), + "props": props, + "format": "json", + "origin": "*", } + headers = {"User-Agent": "Wikidata Textifier (embeddings@wikimedia.de)"} response = SESSION.get( "https://www.wikidata.org/w/api.php?", @@ -96,9 +95,20 @@ def get_wikidata_json_by_ids( # Formatting ##################################### + def wikidata_time_to_text(value: dict, lang: str = "en"): - """ - Convert a Wikidata time value into natural language text. + """Format a time datavalue into localized display text using a local Wikibase instance. + + Args: + value (dict): Time value payload in Wikibase datavalue format. + lang (str): Language code used by ``wbformatvalue``. + + Returns: + str: Localized human-readable representation of the time value. + + Raises: + ValueError: If the input payload is invalid or the API response is malformed. + requests.HTTPError: If the formatting API returns an error response. 
""" WIKIBASE_HOST = os.environ.get("WIKIBASE_HOST", "wikibase") WIKIBASE_API = f"http://{WIKIBASE_HOST}/w/api.php" @@ -123,12 +133,16 @@ def wikidata_time_to_text(value: dict, lang: str = "en"): }, } - r = SESSION.post(WIKIBASE_API, data={ - "action": "wbformatvalue", - "format": "json", - "uselang": lang, - "datavalue": json.dumps(datavalue), - }, timeout=REQUEST_TIMEOUT_SECONDS) + r = SESSION.post( + WIKIBASE_API, + data={ + "action": "wbformatvalue", + "format": "json", + "uselang": lang, + "datavalue": json.dumps(datavalue), + }, + timeout=REQUEST_TIMEOUT_SECONDS, + ) r.raise_for_status() data = r.json() @@ -138,8 +152,18 @@ def wikidata_time_to_text(value: dict, lang: str = "en"): def wikidata_geolocation_to_text(value: dict, lang: str = "en"): - """ - Convert a Wikidata geolocation value into natural language text. + """Format a globe-coordinate value into localized display text using a local Wikibase instance. + + Args: + value (dict): Coordinate payload in Wikibase datavalue format. + lang (str): Language code used by ``wbformatvalue``. + + Returns: + str: Localized human-readable representation of the coordinate value. + + Raises: + ValueError: If the formatting API response is malformed. + requests.HTTPError: If the formatting API returns an error response. 
""" WIKIBASE_HOST = os.environ.get("WIKIBASE_HOST", "wikibase") WIKIBASE_API = f"http://{WIKIBASE_HOST}/w/api.php" @@ -155,12 +179,16 @@ def wikidata_geolocation_to_text(value: dict, lang: str = "en"): }, } - r = SESSION.post(WIKIBASE_API, data={ - "action": "wbformatvalue", - "format": "json", - "uselang": lang, - "datavalue": json.dumps(datavalue), - }, timeout=REQUEST_TIMEOUT_SECONDS) + r = SESSION.post( + WIKIBASE_API, + data={ + "action": "wbformatvalue", + "format": "json", + "uselang": lang, + "datavalue": json.dumps(datavalue), + }, + timeout=REQUEST_TIMEOUT_SECONDS, + ) r.raise_for_status() data = r.json() diff --git a/uv.lock b/uv.lock index 5694f09..d6fd462 100644 --- a/uv.lock +++ b/uv.lock @@ -241,6 +241,31 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7c/e4/56027c4a6b4ae70ca9de302488c5ca95ad4a39e190093d6c1a8ace08341b/requests-2.32.4-py3-none-any.whl", hash = "sha256:27babd3cda2a6d50b30443204ee89830707d396671944c998b5975b031ac2b2c", size = 64847, upload-time = "2025-06-09T16:43:05.728Z" }, ] +[[package]] +name = "ruff" +version = "0.15.8" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/14/b0/73cf7550861e2b4824950b8b52eebdcc5adc792a00c514406556c5b80817/ruff-0.15.8.tar.gz", hash = "sha256:995f11f63597ee362130d1d5a327a87cb6f3f5eae3094c620bcc632329a4d26e", size = 4610921, upload-time = "2026-03-26T18:39:38.675Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4a/92/c445b0cd6da6e7ae51e954939cb69f97e008dbe750cfca89b8cedc081be7/ruff-0.15.8-py3-none-linux_armv6l.whl", hash = "sha256:cbe05adeba76d58162762d6b239c9056f1a15a55bd4b346cfd21e26cd6ad7bc7", size = 10527394, upload-time = "2026-03-26T18:39:41.566Z" }, + { url = "https://files.pythonhosted.org/packages/eb/92/f1c662784d149ad1414cae450b082cf736430c12ca78367f20f5ed569d65/ruff-0.15.8-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:d3e3d0b6ba8dca1b7ef9ab80a28e840a20070c4b62e56d675c24f366ef330570", size = 10905693, 
upload-time = "2026-03-26T18:39:30.364Z" }, + { url = "https://files.pythonhosted.org/packages/ca/f2/7a631a8af6d88bcef997eb1bf87cc3da158294c57044aafd3e17030613de/ruff-0.15.8-py3-none-macosx_11_0_arm64.whl", hash = "sha256:6ee3ae5c65a42f273f126686353f2e08ff29927b7b7e203b711514370d500de3", size = 10323044, upload-time = "2026-03-26T18:39:33.37Z" }, + { url = "https://files.pythonhosted.org/packages/67/18/1bf38e20914a05e72ef3b9569b1d5c70a7ef26cd188d69e9ca8ef588d5bf/ruff-0.15.8-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fdce027ada77baa448077ccc6ebb2fa9c3c62fd110d8659d601cf2f475858d94", size = 10629135, upload-time = "2026-03-26T18:39:44.142Z" }, + { url = "https://files.pythonhosted.org/packages/d2/e9/138c150ff9af60556121623d41aba18b7b57d95ac032e177b6a53789d279/ruff-0.15.8-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:12e617fc01a95e5821648a6df341d80456bd627bfab8a829f7cfc26a14a4b4a3", size = 10348041, upload-time = "2026-03-26T18:39:52.178Z" }, + { url = "https://files.pythonhosted.org/packages/02/f1/5bfb9298d9c323f842c5ddeb85f1f10ef51516ac7a34ba446c9347d898df/ruff-0.15.8-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:432701303b26416d22ba696c39f2c6f12499b89093b61360abc34bcc9bf07762", size = 11121987, upload-time = "2026-03-26T18:39:55.195Z" }, + { url = "https://files.pythonhosted.org/packages/10/11/6da2e538704e753c04e8d86b1fc55712fdbdcc266af1a1ece7a51fff0d10/ruff-0.15.8-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d910ae974b7a06a33a057cb87d2a10792a3b2b3b35e33d2699fdf63ec8f6b17a", size = 11951057, upload-time = "2026-03-26T18:39:19.18Z" }, + { url = "https://files.pythonhosted.org/packages/83/f0/c9208c5fd5101bf87002fed774ff25a96eea313d305f1e5d5744698dc314/ruff-0.15.8-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2033f963c43949d51e6fdccd3946633c6b37c484f5f98c3035f49c27395a8ab8", size = 11464613, upload-time = "2026-03-26T18:40:06.301Z" 
}, + { url = "https://files.pythonhosted.org/packages/f8/22/d7f2fabdba4fae9f3b570e5605d5eb4500dcb7b770d3217dca4428484b17/ruff-0.15.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f29b989a55572fb885b77464cf24af05500806ab4edf9a0fd8977f9759d85b1", size = 11257557, upload-time = "2026-03-26T18:39:57.972Z" }, + { url = "https://files.pythonhosted.org/packages/71/8c/382a9620038cf6906446b23ce8632ab8c0811b8f9d3e764f58bedd0c9a6f/ruff-0.15.8-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:ac51d486bf457cdc985a412fb1801b2dfd1bd8838372fc55de64b1510eff4bec", size = 11169440, upload-time = "2026-03-26T18:39:22.205Z" }, + { url = "https://files.pythonhosted.org/packages/4d/0d/0994c802a7eaaf99380085e4e40c845f8e32a562e20a38ec06174b52ef24/ruff-0.15.8-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:c9861eb959edab053c10ad62c278835ee69ca527b6dcd72b47d5c1e5648964f6", size = 10605963, upload-time = "2026-03-26T18:39:46.682Z" }, + { url = "https://files.pythonhosted.org/packages/19/aa/d624b86f5b0aad7cef6bbf9cd47a6a02dfdc4f72c92a337d724e39c9d14b/ruff-0.15.8-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:8d9a5b8ea13f26ae90838afc33f91b547e61b794865374f114f349e9036835fb", size = 10357484, upload-time = "2026-03-26T18:39:49.176Z" }, + { url = "https://files.pythonhosted.org/packages/35/c3/e0b7835d23001f7d999f3895c6b569927c4d39912286897f625736e1fd04/ruff-0.15.8-py3-none-musllinux_1_2_i686.whl", hash = "sha256:c2a33a529fb3cbc23a7124b5c6ff121e4d6228029cba374777bd7649cc8598b8", size = 10830426, upload-time = "2026-03-26T18:40:03.702Z" }, + { url = "https://files.pythonhosted.org/packages/f0/51/ab20b322f637b369383adc341d761eaaa0f0203d6b9a7421cd6e783d81b9/ruff-0.15.8-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:75e5cd06b1cf3f47a3996cfc999226b19aa92e7cce682dcd62f80d7035f98f49", size = 11345125, upload-time = "2026-03-26T18:39:27.799Z" }, + { url = 
"https://files.pythonhosted.org/packages/37/e6/90b2b33419f59d0f2c4c8a48a4b74b460709a557e8e0064cf33ad894f983/ruff-0.15.8-py3-none-win32.whl", hash = "sha256:bc1f0a51254ba21767bfa9a8b5013ca8149dcf38092e6a9eb704d876de94dc34", size = 10571959, upload-time = "2026-03-26T18:39:36.117Z" }, + { url = "https://files.pythonhosted.org/packages/1f/a2/ef467cb77099062317154c63f234b8a7baf7cb690b99af760c5b68b9ee7f/ruff-0.15.8-py3-none-win_amd64.whl", hash = "sha256:04f79eff02a72db209d47d665ba7ebcad609d8918a134f86cb13dd132159fc89", size = 11743893, upload-time = "2026-03-26T18:39:25.01Z" }, + { url = "https://files.pythonhosted.org/packages/15/e2/77be4fff062fa78d9b2a4dea85d14785dac5f1d0c1fb58ed52331f0ebe28/ruff-0.15.8-py3-none-win_arm64.whl", hash = "sha256:cf891fa8e3bb430c0e7fac93851a5978fc99c8fa2c053b57b118972866f8e5f2", size = 11048175, upload-time = "2026-03-26T18:40:01.06Z" }, +] + [[package]] name = "sniffio" version = "1.3.1" @@ -340,6 +365,11 @@ dependencies = [ { name = "uvicorn" }, ] +[package.dev-dependencies] +dev = [ + { name = "ruff" }, +] + [package.metadata] requires-dist = [ { name = "fastapi", specifier = ">=0.116.1" }, @@ -350,3 +380,6 @@ requires-dist = [ { name = "sqlalchemy", specifier = ">=2.0.41" }, { name = "uvicorn", specifier = ">=0.35.0" }, ] + +[package.metadata.requires-dev] +dev = [{ name = "ruff", specifier = ">=0.9.0" }]