-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
207 lines (178 loc) · 6.97 KB
/
main.py
File metadata and controls
207 lines (178 loc) · 6.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
"""FastAPI application that exposes Wikidata textification endpoints."""
import os
import time
import traceback
import requests
from fastapi import BackgroundTasks, FastAPI, HTTPException, Query, Request
from fastapi.middleware.cors import CORSMiddleware
from src import utils
from src.Normalizer import JSONNormalizer, TTLNormalizer
from src.WikidataLabel import LazyLabelFactory, WikidataLabel
# Application object. Interactive documentation is served by Swagger UI and
# ReDoc at the paths configured below.
app = FastAPI(
    title="Wikidata Textifier",
    version="1.0.0",
    description="Transforms Wikidata entities into text representations.",
    docs_url="/docs",  # Swagger UI path
    redoc_url="/redoc",  # ReDoc path
    swagger_ui_parameters={"persistAuthorization": True},
)

# CORS policy: any origin and any request header are accepted, but only for
# GET requests and without credentials.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["GET"],
    allow_headers=["*"],
    allow_credentials=False,
)
# Minimum number of seconds between two scheduled runs of the label cleanup
# task; can be overridden through the environment (defaults to one hour).
LABEL_CLEANUP_INTERVAL_SECONDS = int(os.getenv("LABEL_CLEANUP_INTERVAL_SECONDS", "3600"))

# time.time() value recorded when the cleanup task was last scheduled.
# Starting at 0.0 makes the first handled request schedule a cleanup.
_last_label_cleanup = 0.0
@app.on_event("startup")
async def startup() -> None:
    """Set up the label database when the application starts."""
    WikidataLabel.initialize_database()
@app.get(
    "/",
    responses={
        200: {
            "description": "Returns the requested Wikidata entities keyed by entity ID, rendered in the requested format",
            "content": {
                "application/json": {
                    "example": {
                        "Q42": "Douglas Adams (human), English writer, humorist, and dramatist...",
                    }
                }
            },
        },
        422: {
            "description": "Missing or invalid query parameter",
            "content": {"application/json": {"example": {"detail": "No valid Wikidata ID provided"}}},
        },
    },
)
async def get_textified_wd(
    request: Request,
    background_tasks: BackgroundTasks,
    id: str = Query(..., examples=["Q42,Q2"]),
    pid: str = Query(None, examples=["P31,P279"]),
    lang: str = "en",
    format: str = "json",
    external_ids: bool = True,
    references: bool = False,
    all_ranks: bool = False,
    qualifiers: bool = True,
    fallback_lang: str = "en",
):
    """Retrieve Wikidata entities as structured JSON, natural text, or triplet lines.

    This endpoint fetches one or more entities, resolves missing labels, and normalizes
    claims into a compact representation suitable for downstream LLM use.

    **Args:**

    - **id** (str): Comma-separated Wikidata IDs to fetch (for example: `"Q42"` or `"Q42,Q2"`).
    - **pid** (str, optional): Comma-separated property IDs used to filter returned claims (for example: `"P31,P279"`).
    - **lang** (str): Preferred language code for labels and formatted values.
    - **format** (str): Output format. One of `"json"`, `"text"`, or `"triplet"`.
    - **external_ids** (bool): If `true`, include claims with datatype `external-id`.
    - **references** (bool): If `true`, include references in claim values (JSON output only).
    - **all_ranks** (bool): If `true`, include preferred, normal, and deprecated statement ranks.
    - **qualifiers** (bool): If `true`, include qualifiers for claim values.
    - **fallback_lang** (str): Fallback language used when `lang` is unavailable.

    **Returns:**

    A dictionary keyed by requested entity ID (for example, `"Q42"`).
    Each value depends on `format`:

    - **json**: Structured entity payload with label, description, aliases, and claims.
    - **text**: Human-readable summary text.
    - **triplet**: Triplet-style text lines with labels and IDs.

    When several IDs are requested, an ID that could not be retrieved maps to `null`.

    **Errors:**

    - **404**: The requested ID(s) could not be retrieved.
    - **422**: `id` contains no usable Wikidata ID.
    - **502**: The upstream Wikidata request failed.
    - **500**: Any other unexpected failure.
    """
    global _last_label_cleanup

    try:
        filter_pids = [p.strip() for p in pid.split(",")] if pid else []

        # Drop blank tokens so inputs such as "" or "Q42," never reach the
        # upstream service as an empty ID.
        qids = [token for token in (q.strip() for q in id.split(",")) if token]
        if not qids:
            raise HTTPException(status_code=422, detail="No valid Wikidata ID provided")

        label_factory = LazyLabelFactory(lang=lang, fallback_lang=fallback_lang)

        # The same claim-selection options apply to both retrieval paths.
        normalize_options = {
            "external_ids": external_ids,
            "all_ranks": all_ranks,
            "references": references,
            "filter_pids": filter_pids,
            "qualifiers": qualifiers,
        }

        if len(qids) == 1:
            # When one QID is requested, TTL is used
            qid = qids[0]
            try:
                ttl_text = utils.get_wikidata_ttl_by_id(qid, lang=lang)
            except requests.HTTPError:
                # An HTTP error status from upstream is reported as "not found" below.
                ttl_text = None

            if not ttl_text:
                raise HTTPException(status_code=404, detail="ID not found")

            normalizer = TTLNormalizer(
                entity_id=qid,
                ttl_text=ttl_text,
                lang=lang,
                fallback_lang=fallback_lang,
                label_factory=label_factory,
                debug=False,
            )
            entities = {qid: normalizer.normalize(**normalize_options)}
        else:
            # JSON is used with Action API for bulk retrieval
            try:
                entities_json = utils.get_wikidata_json_by_ids(qids)
            except requests.HTTPError:
                entities_json = None

            if not entities_json:
                raise HTTPException(status_code=404, detail="IDs not found")

            # Build every normalizer first, then normalize, so that IDs missing
            # from the upstream payload stay in the result as None.
            normalizers = {
                qid: JSONNormalizer(
                    entity_id=qid,
                    entity_json=entities_json[qid],
                    lang=lang,
                    fallback_lang=fallback_lang,
                    label_factory=label_factory,
                    debug=False,
                )
                if entities_json.get(qid)
                else None
                for qid in qids
            }
            entities = {
                qid: normalizer.normalize(**normalize_options) if normalizer else None
                for qid, normalizer in normalizers.items()
            }

        return_data = {}
        for qid, entity in entities.items():
            if not entity:
                return_data[qid] = None
            elif format == "text":
                return_data[qid] = entity.to_text(lang)
            elif format == "triplet":
                return_data[qid] = entity.to_triplet()
            else:
                # Any other value, including the default "json", yields JSON.
                return_data[qid] = entity.to_json()

        # Schedule the label cleanup at most once per configured interval; it
        # runs as a background task after the response has been sent.
        now = time.time()
        if now - _last_label_cleanup > LABEL_CLEANUP_INTERVAL_SECONDS:
            background_tasks.add_task(WikidataLabel.delete_old_labels)
            _last_label_cleanup = now

        return return_data
    except HTTPException:
        # Deliberate HTTP errors raised above pass through unchanged.
        raise
    except requests.RequestException as exc:
        raise HTTPException(status_code=502, detail="Upstream service unavailable") from exc
    except Exception as exc:
        traceback.print_exc()
        raise HTTPException(status_code=500, detail="Internal Server Error") from exc