Files
herbapi/tools/enrichment/enrich_wikidata.py
T

157 lines
5.2 KiB
Python

#!/usr/bin/env python3
"""Enrich HerbAPI species with Wikidata QID, GBIF ID, and EPPO code."""
import json
import os
import time
import urllib.parse
import urllib.request
# Base URL that every HerbAPI request path is appended to.
HERBAPI_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"

# Bearer token sent in the Authorization header of every HerbAPI request.
# SECURITY: a credential is committed in source. Prefer the HERBAPI_TOKEN
# environment variable; the literal below remains only as a fallback so
# existing invocations keep working. Rotate the token and delete the literal.
HERBAPI_TOKEN = (
    os.environ.get("HERBAPI_TOKEN")
    or "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
)

# Wikidata SPARQL query endpoint.
WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"

# Headers sent with every Wikidata request: an identifying User-Agent with a
# contact address, and a request for a JSON response.
HEADERS_WD = {
    "User-Agent": "HerbAPI-Enrichment/1.0 (florian.berthold@sub-net.at)",
    "Accept": "application/json",
}
def herbapi_request(path, method="GET", data=None, timeout=60):
    """Send an authenticated JSON request to HerbAPI and return the decoded reply.

    Args:
        path: Path (including any query string) appended to ``HERBAPI_BASE``.
        method: HTTP method name.
        data: Optional JSON-serialisable payload. Any value other than
            ``None`` is sent, so empty containers go out as ``{}`` / ``[]``
            instead of being silently dropped.
        timeout: Seconds ``urlopen`` waits before giving up, so a stalled
            server cannot hang the whole run.

    Returns:
        The response body parsed as JSON, or ``None`` when the server
        replies with an empty body.

    Raises:
        urllib.error.URLError: On connection failures; ``urlopen`` raises its
            ``HTTPError`` subclass for HTTP error statuses.
        json.JSONDecodeError: If a non-empty body is not valid JSON.
    """
    url = f"{HERBAPI_BASE}{path}"
    body = json.dumps(data).encode() if data is not None else None
    req = urllib.request.Request(
        url,
        data=body,
        method=method,
        headers={
            "Authorization": f"Bearer {HERBAPI_TOKEN}",
            "Content-Type": "application/json",
        },
    )
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        raw = resp.read()
    # A successful request may legitimately carry no body; do not treat that
    # as a failure by feeding an empty string to the JSON parser.
    if not raw.strip():
        return None
    return json.loads(raw)
def _sparql_string_literal(text):
    """Return *text* as a double-quoted SPARQL string literal with escapes."""
    escaped = (
        str(text)
        .replace("\\", "\\\\")  # backslash first so later escapes survive
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )
    return f'"{escaped}"'


def query_wikidata_batch(names):
    """Query Wikidata for a batch of scientific names.

    Each name is matched against the taxon-name property (P225); the GBIF
    taxon ID (P846) and EPPO code (P3031) are fetched when present.

    Args:
        names: Iterable of scientific names to look up.

    Returns:
        A dict mapping each matched name to
        ``{"qid": ..., "gbif_id": ..., "eppo_code": ...}``. ``gbif_id`` and
        ``eppo_code`` are ``None`` when Wikidata has no value. When several
        result rows share a name, the last row returned wins.

    Raises:
        urllib.error.URLError: If the SPARQL endpoint cannot be reached or
            answers with an HTTP error status.
        json.JSONDecodeError: If the response is not valid JSON.
    """
    names = list(names)
    if not names:
        # Nothing to look up; avoid a pointless round trip to the endpoint.
        return {}

    # Names are embedded as string literals, so quotes and backslashes must be
    # escaped or they would terminate the literal and corrupt the query.
    values = " ".join(_sparql_string_literal(n) for n in names)
    sparql = f"""SELECT ?name ?item ?gbifId ?eppoCode WHERE {{
VALUES ?name {{ {values} }}
?item wdt:P225 ?name .
OPTIONAL {{ ?item wdt:P846 ?gbifId }}
OPTIONAL {{ ?item wdt:P3031 ?eppoCode }}
}}"""
    query_string = urllib.parse.urlencode({"query": sparql, "format": "json"})
    req = urllib.request.Request(
        f"{WIKIDATA_SPARQL}?{query_string}", headers=HEADERS_WD
    )
    with urllib.request.urlopen(req, timeout=60) as resp:
        data = json.loads(resp.read())

    results = {}
    for binding in data.get("results", {}).get("bindings", []):
        name = binding["name"]["value"]
        # The item value is an entity URL; the QID is its last path segment.
        qid = binding["item"]["value"].rsplit("/", 1)[-1]
        results[name] = {
            "qid": qid,
            "gbif_id": binding.get("gbifId", {}).get("value"),
            "eppo_code": binding.get("eppoCode", {}).get("value"),
        }
    return results
def main():
    """Fill in missing Wikidata QID, GBIF ID and EPPO code on HerbAPI species.

    Fetches the species list, looks up every species that lacks at least one
    of the three identifiers on Wikidata in batches, and writes back only the
    identifiers that are currently empty. Progress and a final summary are
    printed to stdout.
    """
    # (HerbAPI field, key in the Wikidata result, label used in log output)
    field_map = (
        ("wikidata_qid", "qid", "qid"),
        ("gbif_id", "gbif_id", "gbif"),
        ("eppo_code", "eppo_code", "eppo"),
    )

    # 1. Fetch all species
    # NOTE(review): only one page of up to 200 entries is requested; if the
    # API paginates, species beyond that page are never processed - confirm.
    resp = herbapi_request("/species?per_page=200")
    species_list = resp["data"]
    print(f"Fetched {len(species_list)} species from HerbAPI\n")

    # 2. Collect species needing enrichment (any identifier missing or empty)
    to_enrich = [
        sp for sp in species_list
        if not all(sp.get(api_field) for api_field, _, _ in field_map)
    ]
    if not to_enrich:
        print("All species already enriched.")
        return
    print(f"{len(to_enrich)} species need enrichment\n")

    # 3. Batch query Wikidata
    batch_size = 20
    wikidata_results = {}
    failed_names = set()  # names whose batch request raised
    names = [sp["name_scientific"] for sp in to_enrich]
    for start in range(0, len(names), batch_size):
        batch = names[start:start + batch_size]
        print(f"Querying Wikidata batch {start // batch_size + 1}: {len(batch)} species...")
        try:
            results = query_wikidata_batch(batch)
        except Exception as e:
            # Best effort: one failed batch must not abort the whole run, but
            # remember its names so they are not misreported as "no match".
            print(f" ERROR: {e}")
            failed_names.update(batch)
        else:
            wikidata_results.update(results)
            print(f" Got {len(results)} matches")
        # Pause between batches (not after the last) to go easy on the endpoint.
        if start + batch_size < len(names):
            time.sleep(2)
    print(f"\nWikidata returned data for {len(wikidata_results)} / {len(names)} species\n")

    # 4. Update HerbAPI - GET full object by slug, merge, PUT by UUID
    updated = 0
    skipped = 0
    not_found = 0
    errors = 0
    for sp in to_enrich:
        name = sp["name_scientific"]
        wd = wikidata_results.get(name)
        if not wd:
            if name in failed_names:
                # The lookup itself failed, so absence of data proves nothing.
                print(f" ERROR (Wikidata query failed): {name}")
                errors += 1
            else:
                print(f" SKIP (no Wikidata match): {name}")
                not_found += 1
            continue

        # Work out which empty fields Wikidata can actually fill.
        new_values = {}
        summary = []
        for api_field, wd_key, label in field_map:
            value = wd[wd_key]
            if sp.get(api_field) or not value:
                continue
            # API expects the GBIF ID as a string
            new_values[api_field] = str(value) if api_field == "gbif_id" else value
            summary.append(f"{label}={value}")
        if not new_values:
            print(f" SKIP (nothing new): {name}")
            skipped += 1
            continue

        try:
            # GET full species by slug for the complete object
            full_sp = herbapi_request(f"/species/{sp['slug']}")
            # Remove read-only fields
            species_id = full_sp.pop("id")
            for read_only in ("slug", "created_at", "updated_at"):
                full_sp.pop(read_only, None)
            # Merge new data (only fields that were empty in the listing)
            full_sp.update(new_values)
            # PUT by UUID
            herbapi_request(f"/species/{species_id}", method="PUT", data=full_sp)
        except Exception as e:
            # Best effort per species: log and carry on with the next one.
            print(f" ERROR updating {name}: {e}")
            errors += 1
        else:
            print(f" UPDATED: {name} -> {', '.join(summary)}")
            updated += 1

    print(f"\n{'=' * 60}")
    print("RESULTS:")
    print(f" Updated: {updated}")
    print(f" Skipped (no new data): {skipped}")
    print(f" Not found on Wikidata: {not_found}")
    print(f" Errors: {errors}")
    print(f" Total species: {len(species_list)}")


# Run the enrichment only when executed as a script, not when imported.
if __name__ == "__main__":
    main()