91aa9cb733
- scrape_naturadb.py: write structured count fields (nectar/pollen/bee/butterfly/caterpillar/hoverfly/beetle/bird/mammal), native_status, naturadb_tags (not just the text wildlife_value); paginate all species; env-overridable base/token; only fill empty fields. - enrich_botanical.py: derive min_temp from USDA hardiness zone min temp.
128 lines
4.2 KiB
Python
128 lines
4.2 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Deterministic, authoritative enrichment for HerbAPI species — no external
scraping, just botanical facts:

  * hardiness_zone_at — Austria/Europe uses the USDA zone system, so this
    equals hardiness_zone_usda.
  * nitrogen_fixer    — true for Fabaceae (legumes) and actinorhizal genera
    (Frankia symbiosis), false otherwise. This is family/genus-level botany.
  * min_temp          — cold hardiness in °C, derived from the lower USDA zone
    in hardiness_zone_usda (each zone has a standardized minimum temperature).

Idempotent: GET full species -> merge only missing/derivable fields -> PUT.
Run: HERBAPI_TOKEN=... python3 enrich_botanical.py [--base URL] [--dry-run]
|
|
"""
|
|
import argparse
import json
import os
import re
import sys
import urllib.error
import urllib.request
|
|
|
|
# API root; override with the HERBAPI_BASE environment variable (main() may
# also replace it from the --base command-line option).
BASE = os.getenv("HERBAPI_BASE", "https://herbapi.naturalised.at/api/v1")
# Bearer token sent with every request; empty string when HERBAPI_TOKEN is unset.
TOKEN = os.getenv("HERBAPI_TOKEN", "")
|
|
|
|
# Non-Fabaceae genera treated as nitrogen fixers (actinorhizal plants, i.e.
# Frankia symbiosis). Matched against the first word of the scientific name,
# so entries must be capitalised exactly as the genus is written.
ACTINORHIZAL_GENERA = {
    "Allocasuarina",
    "Alnus",
    "Casuarina",
    "Ceanothus",
    "Cercocarpus",
    "Chamaebatia",
    "Colletia",
    "Comptonia",
    "Coriaria",
    "Cowania",
    "Datisca",
    "Discaria",
    "Dryas",
    "Elaeagnus",
    "Gymnostoma",
    "Hippophae",
    "Morella",
    "Myrica",
    "Purshia",
    "Shepherdia",
    "Trevoa",
}
|
|
|
|
|
|
def req(path, method="GET", data=None, timeout=30):
    """Send one authenticated JSON request to the API and decode the reply.

    Args:
        path: Path (including any query string) appended to the module-level
            BASE, which is read at call time.
        method: HTTP method name, e.g. "GET" or "PUT".
        data: Optional JSON-serialisable payload sent as the request body.
        timeout: Socket timeout in seconds, so a stalled connection fails
            instead of hanging the whole run.

    Returns:
        The decoded JSON response, or None when the response body is empty
        (for example a 204 reply to a PUT).

    Raises:
        urllib.error.HTTPError: The server answered with an error status.
        urllib.error.URLError: The request could not be completed
            (connection failure, timeout, ...).
        json.JSONDecodeError: The response body is non-empty but not JSON.
    """
    body = None if data is None else json.dumps(data).encode()
    request = urllib.request.Request(BASE + path, data=body, method=method)
    request.add_header("Authorization", "Bearer " + TOKEN)
    request.add_header("Content-Type", "application/json")
    request.add_header("Accept", "application/json")
    with urllib.request.urlopen(request, timeout=timeout) as resp:
        raw = resp.read()
    # A successful write may legitimately come back without a body; decoding
    # an empty string would raise even though the request itself succeeded.
    if not raw.strip():
        return None
    return json.loads(raw.decode())
|
|
|
|
|
|
def all_species():
    """Fetch every species record from the paginated /species listing.

    Pages of 100 are requested starting at page 1. Fetching stops at the
    first page that is empty or holds fewer than 100 entries.

    Returns:
        A list with the "data" entries of all fetched pages, in page order.
    """
    page_size = 100
    collected = []
    page = 0
    more = True
    while more:
        page += 1
        batch = req(f"/species?per_page={page_size}&page={page}")["data"]
        if batch:
            collected.extend(batch)
        # A short (or empty) page means the listing is exhausted.
        more = bool(batch) and len(batch) >= page_size
    return collected
|
|
|
|
|
|
def fixes_nitrogen(family_slug, scientific):
    """Tell whether a species counts as a nitrogen fixer.

    Args:
        family_slug: Slug of the species' family; only the exact value
            "fabaceae" is recognised.
        scientific: Scientific name, or None. The genus is taken to be the
            text before the first space.

    Returns:
        True for any Fabaceae member or when the genus is listed in
        ACTINORHIZAL_GENERA, otherwise False.
    """
    if family_slug == "fabaceae":
        return True
    genus, _, _ = (scientific or "").partition(" ")
    return genus in ACTINORHIZAL_GENERA
|
|
|
|
|
|
# Standardized USDA hardiness-zone minimum temperatures (°C), rounded.
USDA_ZONE_MIN_C = {
    1: -51, 2: -46, 3: -40, 4: -34, 5: -29, 6: -23, 7: -18,
    8: -12, 9: -7, 10: -1, 11: 4, 12: 10, 13: 16,
}

# One run of ASCII digits = one zone number. Restricting to [0-9] keeps
# characters such as "²" (which int() cannot parse) out of the match.
_ZONE_NUMBER_RE = re.compile(r"[0-9]+")


def min_temp_from_zone(zone_str):
    """Return the minimum temperature (°C) of the coldest zone in *zone_str*.

    Every number found in the value is treated as a zone, so the range
    separator does not matter ("5-9", "5–9", "5 to 9") and neither does the
    order ("9-5"). Half-zone letters are ignored: "7a" is read as zone 7.

    Args:
        zone_str: Zone description such as "5-9", "7a" or 8; may be None.

    Returns:
        The table value for the lowest zone number as a float, e.g.
        "5-9" -> -29.0. None when the input is empty, contains no number,
        or the lowest number is not a key of USDA_ZONE_MIN_C.
    """
    if not zone_str:
        return None
    zones = [int(number) for number in _ZONE_NUMBER_RE.findall(str(zone_str))]
    if not zones:
        return None
    min_c = USDA_ZONE_MIN_C.get(min(zones))
    return None if min_c is None else float(min_c)
|
|
|
|
|
|
def _family_slugs():
    """Return {family id: family slug} collected from every /families page."""
    slugs, page = {}, 1
    while True:
        chunk = req(f"/families?per_page=100&page={page}")["data"]
        if not chunk:
            break
        known_before = len(slugs)
        slugs.update((f["id"], f["slug"]) for f in chunk)
        # Stop on a short page, and also when a page brought nothing new so
        # an endpoint that ignores `page` cannot cause an endless loop.
        if len(chunk) < 100 or len(slugs) == known_before:
            break
        page += 1
    return slugs


def main():
    """Command-line entry point: derive missing botanical fields per species.

    Parses --base and --dry-run, exits with a message when HERBAPI_TOKEN is
    not set, then walks all species and fills in, only where currently
    empty:

      * hardiness_zone_at — copied from hardiness_zone_usda
      * nitrogen_fixer    — from fixes_nitrogen()
      * min_temp          — from min_temp_from_zone()

    Unless --dry-run is given, each affected species is re-fetched in full,
    merged with the derived values and written back with PUT. A per-field
    count of derived values is printed at the end.
    """
    global BASE
    ap = argparse.ArgumentParser()
    ap.add_argument("--base", default=BASE)
    ap.add_argument("--dry-run", action="store_true")
    args = ap.parse_args()
    BASE = args.base
    if not TOKEN:
        sys.exit("HERBAPI_TOKEN not set")

    # All pages are needed: a family missing from this map would make its
    # species look like non-fixers.
    families = _family_slugs()
    species = all_species()
    print(f"{len(species)} species, {len(families)} families")

    changed = {"hardiness_zone_at": 0, "nitrogen_fixer": 0, "min_temp": 0}
    for s in species:
        updates = {}
        if not s.get("hardiness_zone_at") and s.get("hardiness_zone_usda"):
            updates["hardiness_zone_at"] = s["hardiness_zone_usda"]
        if s.get("nitrogen_fixer") is None:
            family_slug = families.get(s.get("family_id"))
            fixer = fixes_nitrogen(family_slug, s.get("name_scientific"))
            # A False written here is never revisited (only null values are
            # filled), so do not assert it when the family is unresolved.
            if fixer or family_slug is not None:
                updates["nitrogen_fixer"] = fixer
        if s.get("min_temp") is None:
            mt = min_temp_from_zone(s.get("hardiness_zone_usda"))
            if mt is not None:
                updates["min_temp"] = mt
        if not updates:
            continue
        for k in updates:
            changed[k] += 1
        if args.dry_run:
            continue
        full = req(f"/species/{s['slug']}")
        # These keys are removed from the payload before it is sent back.
        for k in ("created_at", "updated_at", "family"):
            full.pop(k, None)
        full.update(updates)
        req(f"/species/{full['id']}", "PUT", full)

    print(("DRY-RUN " if args.dry_run else "") + "updated counts:", changed)
|
|
|
|
|
|
# Run the enrichment only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|