Enrich: NaturaDB writes structured wildlife fields + full pagination; min_temp from USDA zone

- scrape_naturadb.py: write structured count fields (nectar/pollen/bee/
  butterfly/caterpillar/hoverfly/beetle/bird/mammal), native_status,
  naturadb_tags (not just the text wildlife_value); paginate all species;
  env-overridable base/token; only fill empty fields.
- enrich_botanical.py: derive min_temp (°C) from the standardized minimum
  temperature of the lowest USDA hardiness zone in hardiness_zone_usda.
This commit is contained in:
2026-06-05 18:18:00 +02:00
parent 5992c486ae
commit 91aa9cb733
2 changed files with 93 additions and 34 deletions
+25 -1
View File
@@ -7,6 +7,8 @@ scraping, just botanical facts:
equals hardiness_zone_usda. equals hardiness_zone_usda.
* nitrogen_fixer — true for Fabaceae (legumes) and actinorhizal genera * nitrogen_fixer — true for Fabaceae (legumes) and actinorhizal genera
(Frankia symbiosis), false otherwise. This is family/genus-level botany. (Frankia symbiosis), false otherwise. This is family/genus-level botany.
* min_temp — cold hardiness in °C, derived from the lower USDA zone
in hardiness_zone_usda (each zone has a standardized minimum temperature).
Idempotent: GET full species -> merge only missing/derivable fields -> PUT. Idempotent: GET full species -> merge only missing/derivable fields -> PUT.
Run: HERBAPI_TOKEN=... python3 enrich_botanical.py [--base URL] [--dry-run] Run: HERBAPI_TOKEN=... python3 enrich_botanical.py [--base URL] [--dry-run]
@@ -61,6 +63,24 @@ def fixes_nitrogen(family_slug, scientific):
return family_slug == "fabaceae" or genus in ACTINORHIZAL_GENERA return family_slug == "fabaceae" or genus in ACTINORHIZAL_GENERA
# Standardized USDA hardiness-zone minimum temperatures (°C), rounded.
# Keyed by whole zone number only; "a"/"b" half-zone suffixes are not
# distinguished.
USDA_ZONE_MIN_C = {
    1: -51, 2: -46, 3: -40, 4: -34, 5: -29, 6: -23, 7: -18,
    8: -12, 9: -7, 10: -1, 11: 4, 12: 10, 13: 16,
}


def min_temp_from_zone(zone_str):
    """Return the minimum temperature (°C) of the first zone in *zone_str*.

    The first zone of a range is taken to be the coldest one, e.g.
    ``'5-9'`` -> ``-29.0``. Half-zone suffixes (``'7b'``) are ignored and
    any range separator (hyphen, en dash, ``'to'``, ``'/'``) is accepted,
    because only the first run of ASCII digits is parsed.

    Args:
        zone_str: Zone description such as ``'5-9'``, ``'7b'`` or ``6``.
            Non-string values are converted with ``str()``.

    Returns:
        The zone's minimum temperature as a float, or ``None`` when the
        value is empty, contains no digits, or names a zone that is not
        in ``USDA_ZONE_MIN_C``.
    """
    import re  # local import: this block cannot see the file's import section

    if not zone_str:
        return None
    # Restrict to ASCII digits: str.isdigit() also accepts characters such as
    # superscripts that int() refuses to parse.
    match = re.search(r"[0-9]+", str(zone_str))
    if match is None:
        return None
    min_c = USDA_ZONE_MIN_C.get(int(match.group()))
    return None if min_c is None else float(min_c)
def main(): def main():
global BASE global BASE
ap = argparse.ArgumentParser() ap = argparse.ArgumentParser()
@@ -75,7 +95,7 @@ def main():
species = all_species() species = all_species()
print(f"{len(species)} species, {len(families)} families") print(f"{len(species)} species, {len(families)} families")
changed = {"hardiness_zone_at": 0, "nitrogen_fixer": 0} changed = {"hardiness_zone_at": 0, "nitrogen_fixer": 0, "min_temp": 0}
for s in species: for s in species:
updates = {} updates = {}
if not s.get("hardiness_zone_at") and s.get("hardiness_zone_usda"): if not s.get("hardiness_zone_at") and s.get("hardiness_zone_usda"):
@@ -84,6 +104,10 @@ def main():
updates["nitrogen_fixer"] = fixes_nitrogen( updates["nitrogen_fixer"] = fixes_nitrogen(
families.get(s["family_id"]), s.get("name_scientific") families.get(s["family_id"]), s.get("name_scientific")
) )
if s.get("min_temp") is None:
mt = min_temp_from_zone(s.get("hardiness_zone_usda"))
if mt is not None:
updates["min_temp"] = mt
if not updates: if not updates:
continue continue
for k in updates: for k in updates:
+68 -33
View File
@@ -10,8 +10,9 @@ import urllib.request
import urllib.error import urllib.error
import sys import sys
HERBAPI_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1" import os
HERBAPI_TOKEN = "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk" HERBAPI_BASE = os.environ.get("HERBAPI_BASE", "http://herbapi01.corp.sub-net.at:8080/api/v1")
HERBAPI_TOKEN = os.environ.get("HERBAPI_TOKEN", "")
NATURADB_BASE = "https://www.naturadb.de/pflanzen" NATURADB_BASE = "https://www.naturadb.de/pflanzen"
USER_AGENT = "Mozilla/5.0 (compatible; HerbAPI-Enrichment/1.0; +https://sub-net.at)" USER_AGENT = "Mozilla/5.0 (compatible; HerbAPI-Enrichment/1.0; +https://sub-net.at)"
DELAY = 0.5 DELAY = 0.5
@@ -252,74 +253,108 @@ def has_any_data(data):
return False return False
# NaturaDB scraped key -> HerbAPI structured field.
# Keys are the names found in the scraped data dict (partly German);
# values are the HerbAPI species field each one is written to.
FIELD_MAP = {
"nectar": "nectar_value",
"pollen": "pollen_value",
# wildbienen = wild bees
"wildbienen_count": "wild_bee_count",
"wildbienen_specialists": "wild_bee_specialist_count",
# schmetterlinge = butterflies/moths
"schmetterlinge_count": "butterfly_moth_count",
# raupen = caterpillars
"raupen_count": "caterpillar_host_count",
"raupen_specialists": "caterpillar_specialist_count",
# schwebfliegen = hoverflies
"schwebfliegen_count": "hoverfly_count",
# kaefer = beetles
"kaefer_count": "beetle_count",
# vogelarten = bird species
"vogelarten_count": "bird_count",
# saeugetier = mammal
"saeugetier_count": "mammal_count",
}
# A species is considered already structurally enriched if ANY of these
# fields is non-None; main() then skips it without contacting NaturaDB.
STRUCTURED_MARKERS = ("nectar_value", "wild_bee_count", "butterfly_moth_count", "bird_count")
def all_species(per_page=100):
    """Return every species from HerbAPI by walking the paginated list.

    Pages are requested from ``/species`` starting at page 1. Iteration
    stops at the first empty page or the first page holding fewer than
    *per_page* items, which is treated as the last one.

    Args:
        per_page: Page size sent to the API and used for the last-page
            test. Defaults to 100, the value previously hard-coded.

    Returns:
        A list with the ``data`` entries of all pages, in page order.
    """
    collected = []
    page = 1
    while True:
        chunk = api_get(f"/species?per_page={per_page}&page={page}")["data"]
        if not chunk:
            break
        collected.extend(chunk)
        # A short page means there is nothing further to fetch.
        if len(chunk) < per_page:
            break
        page += 1
    return collected
def main(): def main():
print("Fetching species list from HerbAPI...") print("Fetching species list from HerbAPI...")
species_list = api_get("/species?per_page=200")["data"] species_list = all_species()
print(f"Found {len(species_list)} species.\n") print(f"Found {len(species_list)} species.\n")
enriched = 0 enriched = skipped_has_data = skipped_not_found = skipped_no_data = errors = 0
skipped_has_data = 0
skipped_not_found = 0
skipped_no_data = 0
errors = 0
for i, sp in enumerate(species_list): for i, sp in enumerate(species_list):
slug = sp["slug"] slug, name = sp["slug"], sp["name_scientific"]
name = sp["name_scientific"]
existing_wv = sp.get("wildlife_value")
# Only enrich if wildlife_value is empty/null # Skip species already structurally enriched (any marker present).
if existing_wv: if any(sp.get(m) is not None for m in STRUCTURED_MARKERS):
print(f"[{i+1:3d}/{len(species_list)}] {slug:40s} SKIP (already has data)")
skipped_has_data += 1 skipped_has_data += 1
continue continue
print(f"[{i+1:3d}/{len(species_list)}] {slug:40s} ", end="", flush=True) print(f"[{i+1:3d}/{len(species_list)}] {slug:40s} ", end="", flush=True)
# Fetch NaturaDB page
html = fetch_naturadb(name) html = fetch_naturadb(name)
time.sleep(DELAY) time.sleep(DELAY)
if html is None: if html is None:
print("NOT FOUND on NaturaDB") print("NOT FOUND on NaturaDB")
skipped_not_found += 1 skipped_not_found += 1
continue continue
# Parse wildlife data
data = scrape_species(html) data = scrape_species(html)
if not has_any_data(data): if not has_any_data(data):
print("no wildlife data on page") print("no wildlife data on page")
skipped_no_data += 1 skipped_no_data += 1
continue continue
# Build wildlife_value string
wildlife_value = build_wildlife_value(data)
if not wildlife_value:
print("no wildlife data extracted")
skipped_no_data += 1
continue
# GET full species, merge, PUT back
try: try:
full = api_get(f"/species/{slug}") full = api_get(f"/species/{slug}")
full["wildlife_value"] = wildlife_value
# Remove read-only / computed fields that the PUT endpoint might reject
for key in ("created_at", "updated_at", "family"): for key in ("created_at", "updated_at", "family"):
full.pop(key, None) full.pop(key, None)
set_fields = []
# Structured counts — only fill if currently empty.
for src, dst in FIELD_MAP.items():
if data.get(src) is not None and full.get(dst) is None:
full[dst] = data[src]
set_fields.append(dst)
# Native status (German text, matches existing domain).
if data.get("native_status") and not full.get("native_status"):
full["native_status"] = " ".join(data["native_status"])[:120]
set_fields.append("native_status")
# NaturaDB badge tags.
if data.get("badges") and not full.get("naturadb_tags"):
full["naturadb_tags"] = ", ".join(data["badges"])[:500]
set_fields.append("naturadb_tags")
# Human-readable summary.
wv = build_wildlife_value(data)
if wv and not full.get("wildlife_value"):
full["wildlife_value"] = wv
set_fields.append("wildlife_value")
if not set_fields:
print("nothing new")
skipped_no_data += 1
continue
api_put(f"/species/{full['id']}", full) api_put(f"/species/{full['id']}", full)
print(f"ENRICHED -> {wildlife_value[:80]}...") print(f"ENRICHED -> {', '.join(set_fields)}")
enriched += 1 enriched += 1
except Exception as e: except Exception as e:
print(f"API ERROR: {e}") print(f"API ERROR: {e}")
errors += 1 errors += 1
print("\n" + "=" * 70) print("\n" + "=" * 70)
print(f"DONE. Results:") print("DONE. Results:")
print(f" Enriched: {enriched}") print(f" Enriched: {enriched}")
print(f" Already had data: {skipped_has_data}") print(f" Already structured: {skipped_has_data}")
print(f" Not on NaturaDB: {skipped_not_found}") print(f" Not on NaturaDB: {skipped_not_found}")
print(f" No wildlife data: {skipped_no_data}") print(f" No wildlife data: {skipped_no_data}")
print(f" Errors: {errors}") print(f" Errors: {errors}")