Files
herbapi/tools/enrichment/enrich_botanical.py
T
florian.berthold 91aa9cb733 Enrich: NaturaDB writes structured wildlife fields + full pagination; min_temp from USDA zone
- scrape_naturadb.py: write structured count fields (nectar/pollen/bee/
  butterfly/caterpillar/hoverfly/beetle/bird/mammal), native_status,
  naturadb_tags (not just the text wildlife_value); paginate all species;
  env-overridable base/token; only fill empty fields.
- enrich_botanical.py: derive min_temp from USDA hardiness zone min temp.
2026-06-05 18:18:00 +02:00

128 lines
4.2 KiB
Python

#!/usr/bin/env python3
"""
Deterministic, authoritative enrichment for HerbAPI species — no external
scraping, just botanical facts:
* hardiness_zone_at — Austria/Europe uses the USDA zone system, so this
equals hardiness_zone_usda.
* nitrogen_fixer — true for Fabaceae (legumes) and actinorhizal genera
(Frankia symbiosis), false otherwise. This is family/genus-level botany.
* min_temp — cold hardiness in °C, derived from the lower USDA zone
in hardiness_zone_usda (each zone has a standardized minimum temperature).
Idempotent: GET full species -> merge only missing/derivable fields -> PUT.
Run: HERBAPI_TOKEN=... python3 enrich_botanical.py [--base URL] [--dry-run]
"""
import argparse
import json
import os
import sys
import urllib.error
import urllib.request
# API root and bearer token, both taken from the environment. BASE falls back
# to the public HerbAPI endpoint; TOKEN falls back to an empty string.
BASE = os.getenv("HERBAPI_BASE", "https://herbapi.naturalised.at/api/v1")
TOKEN = os.getenv("HERBAPI_TOKEN", "")

# Actinorhizal genera: nitrogen fixers through Frankia symbiosis that do not
# belong to Fabaceae. Kept in alphabetical order, one genus per line.
ACTINORHIZAL_GENERA = {
    "Allocasuarina",
    "Alnus",
    "Casuarina",
    "Ceanothus",
    "Cercocarpus",
    "Chamaebatia",
    "Colletia",
    "Comptonia",
    "Coriaria",
    "Cowania",
    "Datisca",
    "Discaria",
    "Dryas",
    "Elaeagnus",
    "Gymnostoma",
    "Hippophae",
    "Morella",
    "Myrica",
    "Purshia",
    "Shepherdia",
    "Trevoa",
}
def req(path, method="GET", data=None, timeout=30):
    """Send one authenticated JSON request and return the decoded reply.

    Args:
        path: Path (and query string) appended verbatim to the module-level
            ``BASE`` URL.
        method: HTTP method name.
        data: Optional JSON-serialisable payload; sent as the request body
            when it is not ``None``.
        timeout: Seconds to wait on the connection before giving up, so a
            stalled server cannot hang the whole batch run.

    Returns:
        The parsed JSON document, or ``None`` when the response body is
        empty.

    Raises:
        urllib.error.URLError: On connection problems or HTTP error statuses
            (``HTTPError`` is a subclass); propagated from ``urlopen``.
    """
    body = None if data is None else json.dumps(data).encode()
    request = urllib.request.Request(BASE + path, data=body, method=method)
    request.add_header("Authorization", "Bearer " + TOKEN)
    request.add_header("Content-Type", "application/json")
    request.add_header("Accept", "application/json")
    with urllib.request.urlopen(request, timeout=timeout) as resp:
        raw = resp.read()
    # A successful write may answer without a body (e.g. 204 No Content);
    # feeding "" to json.loads would raise after the update already happened.
    if not raw.strip():
        return None
    return json.loads(raw.decode())
def all_species():
    """Fetch the complete species list by walking the paginated endpoint.

    Pages of 100 are requested starting at page 1. Collection stops at the
    first page that is empty or holds fewer than 100 entries.

    Returns:
        A list with the entries of every page's ``"data"`` array, in the
        order they were received.
    """
    collected = []
    page_no = 0
    while True:
        page_no += 1
        batch = req(f"/species?per_page=100&page={page_no}")["data"]
        if batch:
            collected.extend(batch)
            # A full page means there may be more to fetch.
            if len(batch) >= 100:
                continue
        return collected
def fixes_nitrogen(family_slug, scientific):
    """Tell whether a species counts as a nitrogen fixer.

    Args:
        family_slug: Slug of the species' family; may be ``None``.
        scientific: Scientific name; the text before the first space is
            taken as the genus. May be ``None`` or empty.

    Returns:
        ``True`` when the family slug is ``"fabaceae"`` or the genus is
        listed in ``ACTINORHIZAL_GENERA``; ``False`` otherwise.
    """
    if family_slug == "fabaceae":
        return True
    genus, _, _ = (scientific or "").partition(" ")
    return genus in ACTINORHIZAL_GENERA
# Standardized USDA hardiness-zone minimum temperatures (°C), rounded.
# Keys are whole zone numbers; values are the lower bound of that zone.
USDA_ZONE_MIN_C = {
    1: -51, 2: -46, 3: -40, 4: -34, 5: -29, 6: -23, 7: -18,
    8: -12, 9: -7, 10: -1, 11: 4, 12: 10, 13: 16,
}


def min_temp_from_zone(zone_str):
    """Return the minimum temperature (°C) of the first zone in *zone_str*.

    The first run of ASCII digits is taken as the zone number, so ranges
    are understood regardless of the separator used: ``'5-9'``, ``'5–9'``
    and ``'5 to 9'`` all yield ``-29.0``. Sub-zone letters (``'7b'``) are
    ignored; the whole-zone minimum is returned.

    Args:
        zone_str: Zone description such as ``'5-9'``; may be ``None``,
            empty, or a bare number.

    Returns:
        The zone's minimum temperature as a ``float``, or ``None`` when no
        zone number is present or it is not in ``USDA_ZONE_MIN_C``.
    """
    if not zone_str:
        return None
    run = []
    for ch in str(zone_str):
        # Only ASCII digits: str.isdigit() also accepts characters such as
        # '²', which int() cannot parse.
        if ch in "0123456789":
            run.append(ch)
        elif run:
            # The first number has ended; anything after it (separator,
            # upper zone, sub-zone letter) is irrelevant here.
            break
    if not run:
        return None
    celsius = USDA_ZONE_MIN_C.get(int("".join(run)))
    return None if celsius is None else float(celsius)
def _family_slugs():
    """Return an ``{id: slug}`` map of every family, following pagination."""
    slugs, page = {}, 1
    while True:
        chunk = req(f"/families?per_page=100&page={page}")["data"]
        known = len(slugs)
        slugs.update((f["id"], f["slug"]) for f in chunk or ())
        # Stop on a short page, or when a page contributes nothing new; the
        # latter also ends the loop if the endpoint ignores ``page``.
        if not chunk or len(chunk) < 100 or len(slugs) == known:
            return slugs
        page += 1


def main():
    """Derive missing botanical fields for every species and write them back.

    Command line: ``--base URL`` overrides the module-level ``BASE``;
    ``--dry-run`` only counts what would change. Exits with a message when
    ``TOKEN`` is empty.

    For each species from the list endpoint:
      * ``hardiness_zone_at`` is copied from ``hardiness_zone_usda`` when
        the former is empty and the latter is set;
      * ``nitrogen_fixer`` is derived via ``fixes_nitrogen`` when it is
        ``None``;
      * ``min_temp`` is derived via ``min_temp_from_zone`` when it is
        ``None`` and a value can be computed.
    Species with at least one update are re-fetched in full, merged and sent
    back with PUT (skipped in dry-run mode). Per-field counts are printed at
    the end.
    """
    global BASE
    ap = argparse.ArgumentParser()
    ap.add_argument("--base", default=BASE)
    ap.add_argument("--dry-run", action="store_true")
    args = ap.parse_args()
    BASE = args.base
    if not TOKEN:
        sys.exit("HERBAPI_TOKEN not set")

    # All families are needed: a family missing from this map would make its
    # legumes look like non-fixers.
    families = _family_slugs()
    species = all_species()
    print(f"{len(species)} species, {len(families)} families")

    changed = {"hardiness_zone_at": 0, "nitrogen_fixer": 0, "min_temp": 0}
    for s in species:
        updates = {}
        if not s.get("hardiness_zone_at") and s.get("hardiness_zone_usda"):
            updates["hardiness_zone_at"] = s["hardiness_zone_usda"]
        if s.get("nitrogen_fixer") is None:
            family_slug = families.get(s.get("family_id"))
            fixer = fixes_nitrogen(family_slug, s.get("name_scientific"))
            # A "no" is only trustworthy when the family was resolved. Since
            # non-null values are never revisited, a wrong False would stick.
            if fixer or family_slug is not None:
                updates["nitrogen_fixer"] = fixer
        if s.get("min_temp") is None:
            mt = min_temp_from_zone(s.get("hardiness_zone_usda"))
            if mt is not None:
                updates["min_temp"] = mt
        if not updates:
            continue
        for k in updates:
            changed[k] += 1
        if args.dry_run:
            continue
        # PUT replaces the record, so start from the full representation and
        # drop the fields that are stripped before sending it back.
        full = req(f"/species/{s['slug']}")
        for k in ("created_at", "updated_at", "family"):
            full.pop(k, None)
        full.update(updates)
        req(f"/species/{full['id']}", "PUT", full)
    print(("DRY-RUN " if args.dry_run else "") + "updated counts:", changed)


if __name__ == "__main__":
    main()