157 lines
5.2 KiB
Python
157 lines
5.2 KiB
Python
#!/usr/bin/env python3
|
|
"""Enrich HerbAPI species with Wikidata QID, GBIF ID, and EPPO code."""
|
|
|
|
import json
import os
import time
import urllib.parse
import urllib.request
|
|
|
|
# Base URL of the HerbAPI instance. Override with the HERBAPI_BASE environment
# variable; a trailing slash is stripped because request paths start with "/".
# NOTE(review): the default uses plain HTTP, so the bearer token is sent
# unencrypted - confirm this host is only reachable on a trusted network.
HERBAPI_BASE = (
    os.environ.get("HERBAPI_BASE")
    or "http://herbapi01.corp.sub-net.at:8080/api/v1"
).rstrip("/")

# Bearer token for HerbAPI. Override with the HERBAPI_TOKEN environment variable.
# SECURITY: the fallback literal is a credential committed to source control.
# It is kept only so existing invocations keep working; rotate it and remove
# the default once every caller supplies HERBAPI_TOKEN.
HERBAPI_TOKEN = (
    os.environ.get("HERBAPI_TOKEN")
    or "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
)

# Public Wikidata SPARQL endpoint.
WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"

# Headers sent with every Wikidata query: a descriptive User-Agent carrying a
# contact address, and a request for JSON-formatted results.
HEADERS_WD = {
    "User-Agent": "HerbAPI-Enrichment/1.0 (florian.berthold@sub-net.at)",
    "Accept": "application/json",
}
|
|
|
|
|
|
def herbapi_request(path, method="GET", data=None, timeout=60):
    """Send an authenticated JSON request to HerbAPI and decode the reply.

    Args:
        path: Path (and optional query string) appended verbatim to
            ``HERBAPI_BASE``, e.g. ``"/species?per_page=200"``.
        method: HTTP verb to use.
        data: JSON-serialisable payload, or ``None`` to send no body.
        timeout: Seconds to wait on the connection before giving up.

    Returns:
        The response body parsed from JSON, or ``None`` when the server
        replies with an empty body.

    Raises:
        urllib.error.URLError: On connection failures and, as its subclass
            ``HTTPError``, on HTTP error statuses.
        TimeoutError: If the server does not answer within ``timeout``.
        json.JSONDecodeError: If a non-empty response body is not valid JSON.
    """
    url = f"{HERBAPI_BASE}{path}"
    # Compare against None so an intentionally empty payload ({} or []) is
    # still serialised and sent instead of being silently dropped.
    body = None if data is None else json.dumps(data).encode("utf-8")
    req = urllib.request.Request(
        url,
        data=body,
        method=method,
        headers={
            "Authorization": f"Bearer {HERBAPI_TOKEN}",
            "Content-Type": "application/json",
        },
    )
    # A timeout keeps a stalled server from hanging the whole run.
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        raw = resp.read()
    # An empty body is not valid JSON; report it as "no content" rather than
    # raising, so a successful body-less reply is not treated as a failure.
    return json.loads(raw) if raw else None
|
|
|
|
|
|
def _sparql_string_literal(text):
    """Return *text* as a double-quoted SPARQL string literal, escaped."""
    escaped = (
        str(text)
        .replace("\\", "\\\\")  # backslash first, so later escapes survive
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )
    return f'"{escaped}"'


def query_wikidata_batch(names):
    """Query Wikidata for a batch of scientific names.

    Each name is matched against property ``P225``; ``P846`` and ``P3031``
    are fetched optionally as the GBIF ID and EPPO code.

    Args:
        names: Iterable of scientific-name strings.

    Returns:
        A dict mapping each matched name to
        ``{"qid": ..., "gbif_id": ..., "eppo_code": ...}``. ``gbif_id`` and
        ``eppo_code`` are ``None`` when Wikidata has no value. If several
        result rows share a name, the last row returned wins.

    Raises:
        urllib.error.URLError: On connection failures or HTTP error statuses.
        json.JSONDecodeError: If the endpoint does not return valid JSON.
    """
    names = list(names)
    if not names:
        # Nothing to look up; skip the network round-trip entirely.
        return {}

    # Escape every name: a stray quote or backslash would otherwise break
    # out of the string literal and corrupt (or alter) the query.
    values = " ".join(_sparql_string_literal(n) for n in names)
    sparql = f"""SELECT ?name ?item ?gbifId ?eppoCode WHERE {{
  VALUES ?name {{ {values} }}
  ?item wdt:P225 ?name .
  OPTIONAL {{ ?item wdt:P846 ?gbifId }}
  OPTIONAL {{ ?item wdt:P3031 ?eppoCode }}
}}"""
    params = urllib.parse.urlencode({"query": sparql, "format": "json"})
    url = f"{WIKIDATA_SPARQL}?{params}"
    req = urllib.request.Request(url, headers=HEADERS_WD)
    with urllib.request.urlopen(req, timeout=60) as resp:
        data = json.loads(resp.read())

    results = {}
    for binding in data.get("results", {}).get("bindings", []):
        name = binding["name"]["value"]
        # The item is returned as an entity URI; the QID is its last segment.
        qid = binding["item"]["value"].rsplit("/", 1)[-1]
        gbif = binding.get("gbifId", {}).get("value")
        eppo = binding.get("eppoCode", {}).get("value")
        results[name] = {"qid": qid, "gbif_id": gbif, "eppo_code": eppo}
    return results
|
|
|
|
|
|
def main():
    """Fill in missing Wikidata QID, GBIF ID and EPPO code on HerbAPI species.

    Fetches the species list, selects entries lacking any of the three
    identifiers, looks their scientific names up on Wikidata in batches, and
    PUTs the merged record back for every species that gains at least one
    identifier. Progress and a final summary are printed to stdout.
    """
    # 1. Fetch all species
    # NOTE(review): only one page of up to 200 entries is requested; if the
    # API paginates, species on later pages are never processed - confirm.
    resp = herbapi_request("/species?per_page=200")
    species_list = resp["data"]
    print(f"Fetched {len(species_list)} species from HerbAPI\n")

    # 2. Collect species needing enrichment
    to_enrich = [
        sp
        for sp in species_list
        if not sp["wikidata_qid"] or not sp["gbif_id"] or not sp["eppo_code"]
    ]

    if not to_enrich:
        print("All species already enriched.")
        return

    print(f"{len(to_enrich)} species need enrichment\n")

    # 3. Batch query Wikidata
    BATCH_SIZE = 20
    wikidata_results = {}
    # Names whose batch raised: a missing result for these means "query
    # failed", which must not be reported as "not on Wikidata".
    failed_names = set()
    names = [sp["name_scientific"] for sp in to_enrich]

    for i in range(0, len(names), BATCH_SIZE):
        batch = names[i:i + BATCH_SIZE]
        print(f"Querying Wikidata batch {i // BATCH_SIZE + 1}: {len(batch)} species...")
        try:
            results = query_wikidata_batch(batch)
        except Exception as e:
            # Best effort: one failed batch must not abort the whole run.
            print(f" ERROR: {e}")
            failed_names.update(batch)
        else:
            wikidata_results.update(results)
            print(f" Got {len(results)} matches")
        # Pause between batches (not after the last one) to go easy on the
        # public endpoint.
        if i + BATCH_SIZE < len(names):
            time.sleep(2)

    print(f"\nWikidata returned data for {len(wikidata_results)} / {len(names)} species\n")

    # 4. Update HerbAPI - GET full object by slug, merge, PUT by UUID
    updated = 0
    skipped = 0
    not_found = 0
    errors = 0

    for sp in to_enrich:
        name = sp["name_scientific"]
        wd = wikidata_results.get(name)
        if not wd:
            if name in failed_names:
                # The lookup itself failed, so nothing is known about this
                # species; count it as an error rather than as "not found".
                print(f" SKIP (Wikidata query failed): {name}")
                errors += 1
            else:
                print(f" SKIP (no Wikidata match): {name}")
                not_found += 1
            continue

        # Check what needs updating: only fields that are empty locally and
        # have a value on Wikidata.
        needs_qid = not sp["wikidata_qid"] and wd["qid"]
        needs_gbif = not sp["gbif_id"] and wd["gbif_id"]
        needs_eppo = not sp["eppo_code"] and wd["eppo_code"]

        if not (needs_qid or needs_gbif or needs_eppo):
            print(f" SKIP (nothing new): {name}")
            skipped += 1
            continue

        try:
            # GET full species by slug for the complete object
            # NOTE(review): assumes the single-species endpoint returns the
            # bare object (not wrapped in "data" like the list) - confirm.
            full_sp = herbapi_request(f"/species/{sp['slug']}")

            # Remove read-only fields
            species_id = full_sp.pop("id")
            full_sp.pop("slug", None)
            full_sp.pop("created_at", None)
            full_sp.pop("updated_at", None)

            # Merge new data (only null fields)
            fields = []
            if needs_qid:
                full_sp["wikidata_qid"] = wd["qid"]
                fields.append(f"qid={wd['qid']}")
            if needs_gbif:
                full_sp["gbif_id"] = str(wd["gbif_id"])  # API expects string
                fields.append(f"gbif={wd['gbif_id']}")
            if needs_eppo:
                full_sp["eppo_code"] = wd["eppo_code"]
                fields.append(f"eppo={wd['eppo_code']}")

            # PUT by UUID
            herbapi_request(f"/species/{species_id}", method="PUT", data=full_sp)

            print(f" UPDATED: {name} -> {', '.join(fields)}")
            updated += 1
        except Exception as e:
            # Keep going: a single failed update should not stop the rest.
            print(f" ERROR updating {name}: {e}")
            errors += 1

    print(f"\n{'=' * 60}")
    print("RESULTS:")
    print(f" Updated: {updated}")
    print(f" Skipped (no new data): {skipped}")
    print(f" Not found on Wikidata: {not_found}")
    print(f" Errors: {errors}")
    print(f" Total species: {len(species_list)}")
|
|
|
|
|
|
# Script entry point: run the enrichment only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|