Enrich: NaturaDB writes structured wildlife fields + full pagination; min_temp from USDA zone

- scrape_naturadb.py: write structured count fields (nectar/pollen/bee/butterfly/caterpillar/hoverfly/beetle/bird/mammal), native_status, and naturadb_tags (not just the wildlife_value text); paginate over all species; make the base URL and token overridable via environment variables; only fill empty fields.
- enrich_botanical.py: derive min_temp from the minimum temperature of the lower USDA hardiness zone.
2026-06-05 18:18:00 +02:00
parent 5992c486ae
commit 91aa9cb733
2 changed files with 93 additions and 34 deletions
@@ -7,6 +7,8 @@ scraping, just botanical facts:
    equals hardiness_zone_usda.
  * nitrogen_fixer     — true for Fabaceae (legumes) and actinorhizal genera
    (Frankia symbiosis), false otherwise. This is family/genus-level botany.
  * min_temp           — cold hardiness in °C, derived from the lower USDA zone
    in hardiness_zone_usda (each zone has a standardized minimum temperature).
 Idempotent: GET full species -> merge only missing/derivable fields -> PUT.
 Run:  HERBAPI_TOKEN=... python3 enrich_botanical.py [--base URL] [--dry-run]
@@ -61,6 +63,24 @@ def fixes_nitrogen(family_slug, scientific):
    return family_slug == "fabaceae" or genus in ACTINORHIZAL_GENERA
 # Standardized USDA hardiness-zone minimum temperatures (°C), rounded.
 # Keys are whole zone numbers; half-zone suffixes (a/b) are not represented.
 USDA_ZONE_MIN_C = {
    1: -51, 2: -46, 3: -40, 4: -34, 5: -29, 6: -23, 7: -18,
    8: -12, 9: -7, 10: -1, 11: 4, 12: 10, 13: 16,
 }
 def min_temp_from_zone(zone_str):
    """Return the minimum temperature (°C) of the first zone in *zone_str*.

    The first run of ASCII digits is taken as the zone number, so '5-9',
    '5a-9b', 'Zone 5' and '5–9' (en dash) all resolve to zone 5 -> -29.0.

    Args:
        zone_str: Hardiness zone text (or number); may be None or empty.

    Returns:
        The zone's minimum temperature as a float, or None when the input is
        empty, contains no ASCII digits, or names a zone that is not a key of
        USDA_ZONE_MIN_C.
    """
    if not zone_str:
        return None
    # Collect only the first run of ASCII digits. str.isdigit() also accepts
    # characters such as '²' that int() rejects, and splitting on '-' alone
    # would glue both ends of an en-dash range together ('5–9' -> 59).
    digits = ""
    for ch in str(zone_str):
        if ch in "0123456789":
            digits += ch
        elif digits:
            break
    if not digits:
        return None
    celsius = USDA_ZONE_MIN_C.get(int(digits))
    return None if celsius is None else float(celsius)
 def main():
    global BASE
    ap = argparse.ArgumentParser()
@@ -75,7 +95,7 @@ def main():
    species = all_species()
    print(f"{len(species)} species, {len(families)} families")
-    changed = {"hardiness_zone_at": 0, "nitrogen_fixer": 0}
+    changed = {"hardiness_zone_at": 0, "nitrogen_fixer": 0, "min_temp": 0}
    for s in species:
        updates = {}
        if not s.get("hardiness_zone_at") and s.get("hardiness_zone_usda"):
@@ -84,6 +104,10 @@ def main():
            updates["nitrogen_fixer"] = fixes_nitrogen(
                families.get(s["family_id"]), s.get("name_scientific")
            )
        if s.get("min_temp") is None:
            mt = min_temp_from_zone(s.get("hardiness_zone_usda"))
            if mt is not None:
                updates["min_temp"] = mt
        if not updates:
            continue
        for k in updates:
@@ -10,8 +10,9 @@ import urllib.request
 import urllib.error
 import sys
-HERBAPI_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
+import os
-HERBAPI_TOKEN = "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
+HERBAPI_BASE = os.environ.get("HERBAPI_BASE", "http://herbapi01.corp.sub-net.at:8080/api/v1")
 HERBAPI_TOKEN = os.environ.get("HERBAPI_TOKEN", "")
 NATURADB_BASE = "https://www.naturadb.de/pflanzen"
 USER_AGENT = "Mozilla/5.0 (compatible; HerbAPI-Enrichment/1.0; +https://sub-net.at)"
 DELAY = 0.5
@@ -252,74 +253,108 @@ def has_any_data(data):
    return False
 # NaturaDB scraped key -> HerbAPI structured field.
 # Keys are the names found in the scraped data dict (several are German:
 # wildbienen = wild bees, schmetterlinge = butterflies/moths,
 # raupen = caterpillars, schwebfliegen = hoverflies, kaefer = beetles,
 # vogelarten = bird species, saeugetier = mammals); values are the species
 # fields they are written to.
 FIELD_MAP = {
    "nectar": "nectar_value",
    "pollen": "pollen_value",
    "wildbienen_count": "wild_bee_count",
    "wildbienen_specialists": "wild_bee_specialist_count",
    "schmetterlinge_count": "butterfly_moth_count",
    "raupen_count": "caterpillar_host_count",
    "raupen_specialists": "caterpillar_specialist_count",
    "schwebfliegen_count": "hoverfly_count",
    "kaefer_count": "beetle_count",
    "vogelarten_count": "bird_count",
    "saeugetier_count": "mammal_count",
 }
 # A species is considered already structurally enriched if ANY of these
 # fields is present with a non-None value.
 STRUCTURED_MARKERS = ("nectar_value", "wild_bee_count", "butterfly_moth_count", "bird_count")
 def all_species():
    """Collect every species record by walking the paginated /species endpoint.

    Pages of 100 are requested starting at page 1; fetching stops at the
    first empty page or the first page holding fewer than 100 records.

    Returns:
        A list with the "data" entries of all fetched pages, in page order.
    """
    collected = []
    page = 0
    more = True
    while more:
        page += 1
        batch = api_get(f"/species?per_page=100&page={page}")["data"]
        if batch:
            collected.extend(batch)
        # A short (or empty) page means there is nothing left to fetch.
        more = bool(batch) and len(batch) >= 100
    return collected
 def main():
    print("Fetching species list from HerbAPI...")
-    species_list = api_get("/species?per_page=200")["data"]
+    species_list = all_species()
    print(f"Found {len(species_list)} species.\n")
-    enriched = 0
+    enriched = skipped_has_data = skipped_not_found = skipped_no_data = errors = 0
    skipped_has_data = 0
    skipped_not_found = 0
    skipped_no_data = 0
    errors = 0
    for i, sp in enumerate(species_list):
-        slug = sp["slug"]
+        slug, name = sp["slug"], sp["name_scientific"]
        name = sp["name_scientific"]
        existing_wv = sp.get("wildlife_value")
-        # Only enrich if wildlife_value is empty/null
+        # Skip species already structurally enriched (any marker present).
-        if existing_wv:
+        if any(sp.get(m) is not None for m in STRUCTURED_MARKERS):
            print(f"[{i+1:3d}/{len(species_list)}] {slug:40s} SKIP (already has data)")
            skipped_has_data += 1
            continue
        print(f"[{i+1:3d}/{len(species_list)}] {slug:40s} ", end="", flush=True)
        # Fetch NaturaDB page
        html = fetch_naturadb(name)
        time.sleep(DELAY)
        if html is None:
            print("NOT FOUND on NaturaDB")
            skipped_not_found += 1
            continue
        # Parse wildlife data
        data = scrape_species(html)
        if not has_any_data(data):
            print("no wildlife data on page")
            skipped_no_data += 1
            continue
        # Build wildlife_value string
        wildlife_value = build_wildlife_value(data)
        if not wildlife_value:
            print("no wildlife data extracted")
            skipped_no_data += 1
            continue
        # GET full species, merge, PUT back
        try:
            full = api_get(f"/species/{slug}")
            full["wildlife_value"] = wildlife_value
            # Remove read-only / computed fields that the PUT endpoint might reject
            for key in ("created_at", "updated_at", "family"):
                full.pop(key, None)
            set_fields = []
            # Structured counts — only fill if currently empty.
            for src, dst in FIELD_MAP.items():
                if data.get(src) is not None and full.get(dst) is None:
                    full[dst] = data[src]
                    set_fields.append(dst)
            # Native status (German text, matches existing domain).
            if data.get("native_status") and not full.get("native_status"):
                full["native_status"] = " ".join(data["native_status"])[:120]
                set_fields.append("native_status")
            # NaturaDB badge tags.
            if data.get("badges") and not full.get("naturadb_tags"):
                full["naturadb_tags"] = ", ".join(data["badges"])[:500]
                set_fields.append("naturadb_tags")
            # Human-readable summary.
            wv = build_wildlife_value(data)
            if wv and not full.get("wildlife_value"):
                full["wildlife_value"] = wv
                set_fields.append("wildlife_value")
            if not set_fields:
                print("nothing new")
                skipped_no_data += 1
                continue
            api_put(f"/species/{full['id']}", full)
-            print(f"ENRICHED -> {wildlife_value[:80]}...")
+            print(f"ENRICHED -> {', '.join(set_fields)}")
            enriched += 1
        except Exception as e:
            print(f"API ERROR: {e}")
            errors += 1
    print("\n" + "=" * 70)
-    print(f"DONE. Results:")
+    print("DONE. Results:")
    print(f"  Enriched:           {enriched}")
-    print(f"  Already had data:   {skipped_has_data}")
+    print(f"  Already structured: {skipped_has_data}")
    print(f"  Not on NaturaDB:    {skipped_not_found}")
    print(f"  No wildlife data:   {skipped_no_data}")
    print(f"  Errors:             {errors}")