#!/usr/bin/env python3
"""Enrich HerbAPI species with Wikidata QID, GBIF ID, and EPPO code."""

import json
import os
import time
import urllib.parse
import urllib.request

# NOTE(review): the base URL is plain http, so the bearer token below travels
# unencrypted -- confirm this host is only reachable on a trusted network.
HERBAPI_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"

# NOTE(review): a credential is committed in source. It stays as the fallback
# so existing invocations keep working, but it should be rotated and supplied
# only through the HERBAPI_TOKEN environment variable.
HERBAPI_TOKEN = (
    os.environ.get("HERBAPI_TOKEN")
    or "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
)

# Seconds to wait on HerbAPI before giving up instead of hanging forever.
HERBAPI_TIMEOUT = 60

WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"
HEADERS_WD = {
    "User-Agent": "HerbAPI-Enrichment/1.0 (florian.berthold@sub-net.at)",
    "Accept": "application/json",
}


def herbapi_request(path, method="GET", data=None):
    """Send an authenticated JSON request to HerbAPI.

    Args:
        path: Path (and optional query string) appended to HERBAPI_BASE.
        method: HTTP method name.
        data: JSON-serialisable payload, or None to send no body.

    Returns:
        The decoded JSON response, or None when the response body is empty.

    Raises:
        urllib.error.URLError: On connection failures, timeouts and HTTP
            error statuses (HTTPError is a subclass).
        json.JSONDecodeError: If a non-empty response body is not JSON.
    """
    url = f"{HERBAPI_BASE}{path}"
    # "is not None" so that an explicit empty payload ({} or []) is still sent.
    body = json.dumps(data).encode() if data is not None else None
    req = urllib.request.Request(url, data=body, method=method, headers={
        "Authorization": f"Bearer {HERBAPI_TOKEN}",
        "Content-Type": "application/json",
    })
    with urllib.request.urlopen(req, timeout=HERBAPI_TIMEOUT) as resp:
        raw = resp.read()
    # A successful request may carry no body; that must not surface as a
    # JSON decode error to the caller.
    if not raw.strip():
        return None
    return json.loads(raw)


def sparql_string_literal(value):
    """Return *value* as a double-quoted, escaped SPARQL string literal.

    Backslash, double quote, newline and carriage return are the characters
    that cannot appear raw inside a double-quoted SPARQL literal.
    """
    escaped = (
        str(value)
        .replace("\\", "\\\\")  # must come first so later escapes survive
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )
    return f'"{escaped}"'


def build_wikidata_query(names):
    """Build the SPARQL query that looks up *names* via taxon name (P225)."""
    values = " ".join(sparql_string_literal(n) for n in names)
    return f"""SELECT ?name ?item ?gbifId ?eppoCode WHERE {{
  VALUES ?name {{ {values} }}
  ?item wdt:P225 ?name .
  OPTIONAL {{ ?item wdt:P846 ?gbifId }}
  OPTIONAL {{ ?item wdt:P3031 ?eppoCode }}
}}"""


def _identifier_count(entry):
    """Count how many of the optional identifiers are present in *entry*."""
    return sum(entry[key] is not None for key in ("gbif_id", "eppo_code"))


def parse_wikidata_bindings(payload):
    """Convert a SPARQL JSON result into ``{name: {qid, gbif_id, eppo_code}}``.

    A name can come back in several rows (several items sharing the name, or
    several values for one identifier). The row carrying the most identifiers
    is kept and ties go to the first row seen, so a sparse row can never
    replace a richer one.
    """
    results = {}
    for binding in payload.get("results", {}).get("bindings", []):
        name = binding["name"]["value"]
        candidate = {
            # The item value is an entity URI; the QID is its last segment.
            "qid": binding["item"]["value"].rsplit("/", 1)[-1],
            "gbif_id": binding.get("gbifId", {}).get("value"),
            "eppo_code": binding.get("eppoCode", {}).get("value"),
        }
        current = results.get(name)
        if current is None or _identifier_count(candidate) > _identifier_count(current):
            results[name] = candidate
    return results


def query_wikidata_batch(names):
    """Query Wikidata for a batch of scientific names.

    Args:
        names: Iterable of scientific names; empty/None entries are ignored.

    Returns:
        Dict mapping each matched name to a dict with the keys ``qid``,
        ``gbif_id`` and ``eppo_code`` (the latter two may be None). Returns
        an empty dict, without any network access, when there is nothing to
        look up.

    Raises:
        urllib.error.URLError: On connection failures, timeouts and HTTP
            error statuses.
        json.JSONDecodeError: If the response body is not JSON.
    """
    # Drop blanks and duplicates while preserving order.
    wanted = list(dict.fromkeys(n for n in names if n))
    if not wanted:
        return {}

    encoded = urllib.parse.quote(build_wikidata_query(wanted))
    url = f"{WIKIDATA_SPARQL}?query={encoded}&format=json"
    req = urllib.request.Request(url, headers=HEADERS_WD)
    with urllib.request.urlopen(req, timeout=60) as resp:
        data = json.loads(resp.read())
    return parse_wikidata_bindings(data)


def main():
    """Fetch species from HerbAPI, look them up on Wikidata, write back IDs.

    Only fields that are currently empty on a species are filled in;
    existing values are never overwritten. Progress and a final summary are
    printed to stdout.
    """
    # 1. Fetch all species
    # NOTE(review): only a single request with per_page=200 is made; if the
    # API paginates, species beyond the first page are never seen -- confirm.
    resp = herbapi_request("/species?per_page=200")
    species_list = resp["data"]
    print(f"Fetched {len(species_list)} species from HerbAPI\n")

    # 2. Collect species needing enrichment
    # .get() so that a field the API omits counts as "missing" instead of
    # crashing the whole run with a KeyError.
    to_enrich = [
        sp for sp in species_list
        if not sp.get("wikidata_qid")
        or not sp.get("gbif_id")
        or not sp.get("eppo_code")
    ]
    if not to_enrich:
        print("All species already enriched.")
        return
    print(f"{len(to_enrich)} species need enrichment\n")

    # 3. Batch query Wikidata
    BATCH_SIZE = 20
    wikidata_results = {}
    names = [sp.get("name_scientific") for sp in to_enrich]
    for i in range(0, len(names), BATCH_SIZE):
        batch = names[i:i + BATCH_SIZE]
        print(f"Querying Wikidata batch {i // BATCH_SIZE + 1}: {len(batch)} species...")
        try:
            results = query_wikidata_batch(batch)
            wikidata_results.update(results)
            print(f" Got {len(results)} matches")
        except Exception as e:
            # Best effort: one failed batch must not abort the others.
            print(f" ERROR: {e}")
        if i + BATCH_SIZE < len(names):
            # Pause between batches; no pause after the last one.
            time.sleep(2)
    print(f"\nWikidata returned data for {len(wikidata_results)} / {len(names)} species\n")

    # 4. Update HerbAPI - GET full object by slug, merge, PUT by UUID
    updated = 0
    skipped = 0
    not_found = 0
    errors = 0
    for sp in to_enrich:
        name = sp.get("name_scientific")
        wd = wikidata_results.get(name)
        if not wd:
            print(f" SKIP (no Wikidata match): {name}")
            not_found += 1
            continue

        # Check what needs updating
        needs_qid = bool(not sp.get("wikidata_qid") and wd["qid"])
        needs_gbif = bool(not sp.get("gbif_id") and wd["gbif_id"])
        needs_eppo = bool(not sp.get("eppo_code") and wd["eppo_code"])
        if not (needs_qid or needs_gbif or needs_eppo):
            print(f" SKIP (nothing new): {name}")
            skipped += 1
            continue

        try:
            # GET full species by slug for the complete object
            slug = urllib.parse.quote(str(sp["slug"]), safe="")
            full_sp = herbapi_request(f"/species/{slug}")

            # Remove read-only fields
            species_id = full_sp.pop("id")
            full_sp.pop("slug", None)
            full_sp.pop("created_at", None)
            full_sp.pop("updated_at", None)

            # Merge new data (only null fields)
            fields = []
            if needs_qid:
                full_sp["wikidata_qid"] = wd["qid"]
                fields.append(f"qid={wd['qid']}")
            if needs_gbif:
                full_sp["gbif_id"] = str(wd["gbif_id"])  # API expects string
                fields.append(f"gbif={wd['gbif_id']}")
            if needs_eppo:
                full_sp["eppo_code"] = wd["eppo_code"]
                fields.append(f"eppo={wd['eppo_code']}")

            # PUT by UUID
            herbapi_request(f"/species/{species_id}", method="PUT", data=full_sp)

            print(f" UPDATED: {name} -> {', '.join(fields)}")
            updated += 1
        except Exception as e:
            # Best effort: report the failure and carry on with the next one.
            print(f" ERROR updating {name}: {e}")
            errors += 1

    print(f"\n{'=' * 60}")
    print("RESULTS:")
    print(f" Updated: {updated}")
    print(f" Skipped (no new data): {skipped}")
    print(f" Not found on Wikidata: {not_found}")
    print(f" Errors: {errors}")
    print(f" Total species: {len(species_list)}")


if __name__ == "__main__":
    main()