# herbapi/tools/enrichment/enrich_wikidata.py

#!/usr/bin/env python3
"""Enrich HerbAPI species with Wikidata QID, GBIF ID, and EPPO code."""

import json
import os
import time
import urllib.parse
import urllib.request

# Connection settings. HERBAPI_BASE and HERBAPI_TOKEN can be overridden with
# environment variables of the same name; the literals below are only the
# backward-compatible fallbacks used when the variable is not set.
HERBAPI_BASE = os.environ.get(
    "HERBAPI_BASE", "http://herbapi01.corp.sub-net.at:8080/api/v1"
)
# SECURITY(review): a bearer token is committed in source. Prefer supplying it
# via the HERBAPI_TOKEN environment variable, then rotate this token and drop
# the fallback literal.
HERBAPI_TOKEN = os.environ.get(
    "HERBAPI_TOKEN", "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
)
WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"

# Headers sent with every Wikidata SPARQL request: a User-Agent naming the
# tool and a contact address, and a request for a JSON response.
HEADERS_WD = {
    "User-Agent": "HerbAPI-Enrichment/1.0 (florian.berthold@sub-net.at)",
    "Accept": "application/json",
}


def herbapi_request(path, method="GET", data=None, timeout=60):
    """Send an authenticated JSON request to HerbAPI and return the decoded reply.

    Args:
        path: Path (including any query string) appended to ``HERBAPI_BASE``.
        method: HTTP method name.
        data: JSON-serialisable request payload, or ``None`` for no body.
        timeout: Socket timeout in seconds, so a stalled server cannot hang
            the script forever.

    Returns:
        The parsed JSON response, or ``None`` when the response body is empty.

    Raises:
        urllib.error.URLError: On connection failures or HTTP error statuses
            (``HTTPError`` is a subclass).
        json.JSONDecodeError: If a non-empty response body is not valid JSON.
    """
    url = f"{HERBAPI_BASE}{path}"
    # Compare against None explicitly: an empty dict/list is a valid payload
    # and must still be sent as a body.
    body = json.dumps(data).encode() if data is not None else None
    req = urllib.request.Request(url, data=body, method=method, headers={
        "Authorization": f"Bearer {HERBAPI_TOKEN}",
        "Content-Type": "application/json",
    })
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        raw = resp.read()
    # A successful reply may carry no body; do not turn that into a JSON
    # decoding failure.
    return json.loads(raw) if raw else None


def _sparql_string_literal(text):
    """Return *text* as a double-quoted SPARQL string literal with escapes applied."""
    escaped = (
        str(text)
        .replace("\\", "\\\\")  # backslash first, so later escapes are not doubled
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )
    return f'"{escaped}"'


def query_wikidata_batch(names):
    """Query Wikidata for a batch of scientific names.

    Looks up items whose taxon name (P225) equals one of *names* and reads
    their optional GBIF taxon ID (P846) and EPPO code (P3031).

    Args:
        names: Iterable of scientific names to look up.

    Returns:
        A dict mapping each matched name to
        ``{"qid": ..., "gbif_id": ..., "eppo_code": ...}``; ``gbif_id`` and
        ``eppo_code`` are ``None`` when Wikidata has no value. Names without
        a match are absent. An empty *names* yields ``{}`` without any
        network request.

    Raises:
        urllib.error.URLError: On connection failures or HTTP error statuses.
        json.JSONDecodeError: If the response is not valid JSON.
    """
    names = list(names)
    if not names:
        return {}

    # Escape every name so quotes or backslashes cannot break out of the
    # string literal and change the query.
    values = " ".join(_sparql_string_literal(n) for n in names)
    sparql = (
        "SELECT ?name ?item ?gbifId ?eppoCode WHERE {\n"
        f"  VALUES ?name {{ {values} }}\n"
        "  ?item wdt:P225 ?name .\n"
        "  OPTIONAL { ?item wdt:P846 ?gbifId }\n"
        "  OPTIONAL { ?item wdt:P3031 ?eppoCode }\n"
        "}"
    )
    query_string = urllib.parse.urlencode({"query": sparql, "format": "json"})
    url = f"{WIKIDATA_SPARQL}?{query_string}"
    req = urllib.request.Request(url, headers=HEADERS_WD)
    with urllib.request.urlopen(req, timeout=60) as resp:
        data = json.loads(resp.read())

    results = {}
    # NOTE(review): when several rows share a name (e.g. more than one item
    # carries the same taxon name), the last row wins, as before.
    for binding in data.get("results", {}).get("bindings", []):
        name = binding["name"]["value"]
        # The item value is an entity URI; the QID is its last path segment.
        qid = binding["item"]["value"].rsplit("/", 1)[-1]
        results[name] = {
            "qid": qid,
            "gbif_id": binding.get("gbifId", {}).get("value"),
            "eppo_code": binding.get("eppoCode", {}).get("value"),
        }
    return results


# (HerbAPI field, key in the Wikidata result dict, label used in log output)
_ENRICH_FIELDS = (
    ("wikidata_qid", "qid", "qid"),
    ("gbif_id", "gbif_id", "gbif"),
    ("eppo_code", "eppo_code", "eppo"),
)


def _needs_enrichment(sp):
    """Return True if any enrichable field of species record *sp* is empty or absent."""
    return any(not sp.get(field) for field, _, _ in _ENRICH_FIELDS)


def _new_values(sp, wd):
    """Return ``{field: (label, value)}`` for Wikidata values that fill empty fields of *sp*."""
    return {
        field: (label, wd.get(wd_key))
        for field, wd_key, label in _ENRICH_FIELDS
        if not sp.get(field) and wd.get(wd_key)
    }


def main():
    """Fill missing Wikidata QID, GBIF ID and EPPO code on HerbAPI species.

    Steps: fetch the species list, select records with at least one empty
    enrichment field, query Wikidata for their scientific names in batches,
    then for each species with new data GET the full record, merge the new
    values into the empty fields only, and PUT it back. Progress and a final
    summary are printed to stdout.

    Raises:
        Exception: Anything raised by the initial species-list request
            propagates; failures of single Wikidata batches or single
            species updates are logged and counted instead.
    """
    # 1. Fetch all species
    # NOTE(review): only one page of up to 200 records is requested; whether
    # the API paginates beyond that is not visible here - confirm.
    resp = herbapi_request("/species?per_page=200")
    species_list = resp["data"]
    print(f"Fetched {len(species_list)} species from HerbAPI\n")

    # 2. Collect species needing enrichment
    to_enrich = [sp for sp in species_list if _needs_enrichment(sp)]

    if not to_enrich:
        print("All species already enriched.")
        return

    print(f"{len(to_enrich)} species need enrichment\n")

    # 3. Batch query Wikidata
    batch_size = 20
    wikidata_results = {}
    failed_names = set()  # names whose batch query raised, i.e. never looked up
    names = [sp["name_scientific"] for sp in to_enrich]

    for start in range(0, len(names), batch_size):
        batch = names[start:start + batch_size]
        print(f"Querying Wikidata batch {start // batch_size + 1}: {len(batch)} species...")
        try:
            results = query_wikidata_batch(batch)
        except Exception as e:
            # Best effort: keep going with the remaining batches, but remember
            # which names were not actually looked up.
            print(f"  ERROR: {e}")
            failed_names.update(batch)
        else:
            wikidata_results.update(results)
            print(f"  Got {len(results)} matches")
        # Pause between batches (not after the last one).
        if start + batch_size < len(names):
            time.sleep(2)

    print(f"\nWikidata returned data for {len(wikidata_results)} / {len(names)} species\n")

    # 4. Update HerbAPI - GET full object by slug, merge, PUT by UUID
    updated = 0
    skipped = 0
    not_found = 0
    query_failed = 0
    errors = 0

    for sp in to_enrich:
        name = sp["name_scientific"]
        wd = wikidata_results.get(name)
        if not wd:
            # Distinguish "Wikidata has no such taxon" from "the query for
            # this name failed", so a failed batch is not misreported.
            if name in failed_names:
                print(f"  SKIP (Wikidata query failed): {name}")
                query_failed += 1
            else:
                print(f"  SKIP (no Wikidata match): {name}")
                not_found += 1
            continue

        # Check what needs updating
        new_values = _new_values(sp, wd)
        if not new_values:
            print(f"  SKIP (nothing new): {name}")
            skipped += 1
            continue

        try:
            # GET full species by slug for the complete object
            full_sp = herbapi_request(f"/species/{sp['slug']}")

            # Remove read-only fields
            species_id = full_sp.pop("id")
            for read_only in ("slug", "created_at", "updated_at"):
                full_sp.pop(read_only, None)

            # Merge new data (only null fields)
            for field, (_, value) in new_values.items():
                # API expects the GBIF ID as a string
                full_sp[field] = str(value) if field == "gbif_id" else value

            # PUT by UUID
            herbapi_request(f"/species/{species_id}", method="PUT", data=full_sp)
        except Exception as e:
            print(f"  ERROR updating {name}: {e}")
            errors += 1
        else:
            fields = [f"{label}={value}" for label, value in new_values.values()]
            print(f"  UPDATED: {name} -> {', '.join(fields)}")
            updated += 1

    print(f"\n{'=' * 60}")
    print("RESULTS:")
    print(f"  Updated:               {updated}")
    print(f"  Skipped (no new data): {skipped}")
    print(f"  Not found on Wikidata: {not_found}")
    print(f"  Wikidata query failed: {query_failed}")
    print(f"  Errors:                {errors}")
    print(f"  Total species:         {len(species_list)}")

if __name__ == "__main__":
    main()