Enrich: NaturaDB writes structured wildlife fields + full pagination; min_temp from USDA zone
- scrape_naturadb.py: write structured count fields (nectar/pollen/bee/butterfly/caterpillar/hoverfly/beetle/bird/mammal), native_status, and naturadb_tags (not just the text wildlife_value); paginate all species; env-overridable base/token; only fill empty fields.
- enrich_botanical.py: derive min_temp from the USDA hardiness zone's minimum temperature.
This commit is contained in:
@@ -7,6 +7,8 @@ scraping, just botanical facts:
|
|||||||
equals hardiness_zone_usda.
|
equals hardiness_zone_usda.
|
||||||
* nitrogen_fixer — true for Fabaceae (legumes) and actinorhizal genera
|
* nitrogen_fixer — true for Fabaceae (legumes) and actinorhizal genera
|
||||||
(Frankia symbiosis), false otherwise. This is family/genus-level botany.
|
(Frankia symbiosis), false otherwise. This is family/genus-level botany.
|
||||||
|
* min_temp — cold hardiness in °C, derived from the lower USDA zone
|
||||||
|
in hardiness_zone_usda (each zone has a standardized minimum temperature).
|
||||||
|
|
||||||
Idempotent: GET full species -> merge only missing/derivable fields -> PUT.
|
Idempotent: GET full species -> merge only missing/derivable fields -> PUT.
|
||||||
Run: HERBAPI_TOKEN=... python3 enrich_botanical.py [--base URL] [--dry-run]
|
Run: HERBAPI_TOKEN=... python3 enrich_botanical.py [--base URL] [--dry-run]
|
||||||
@@ -61,6 +63,24 @@ def fixes_nitrogen(family_slug, scientific):
|
|||||||
return family_slug == "fabaceae" or genus in ACTINORHIZAL_GENERA
|
return family_slug == "fabaceae" or genus in ACTINORHIZAL_GENERA
|
||||||
|
|
||||||
|
|
||||||
|
# Standardized USDA hardiness-zone minimum temperatures (°C), rounded.
# Keys are whole zone numbers; half-zone suffixes (a/b) are not represented.
USDA_ZONE_MIN_C = {
    1: -51, 2: -46, 3: -40, 4: -34, 5: -29, 6: -23, 7: -18,
    8: -12, 9: -7, 10: -1, 11: 4, 12: 10, 13: 16,
}


def min_temp_from_zone(zone_str):
    """Return the minimum temperature (°C) of the first zone in *zone_str*.

    The first run of ASCII digits in the value is taken as the zone number,
    so '5-9', '5–9' (en dash), '5 to 9', '5b' and 'Zone 5' all resolve to
    zone 5 -> -29.0. The first zone listed is assumed to be the coldest one.

    Args:
        zone_str: Hardiness zone value, e.g. '5-9'. Non-string values are
            converted with ``str()``.

    Returns:
        The zone's minimum temperature as a float, or None when the value is
        empty, contains no digits, or names a zone missing from
        USDA_ZONE_MIN_C.
    """
    if not zone_str:
        return None

    # Collect only the first contiguous digit run. Concatenating every digit
    # before an ASCII hyphen would turn '5–9' or '5 to 9' into zone 59.
    # Restricting to ASCII digits also avoids int() failing on characters
    # such as '²' that str.isdigit() accepts.
    digits = ""
    for ch in str(zone_str):
        if ch in "0123456789":
            digits += ch
        elif digits:
            break
    if not digits:
        return None

    min_c = USDA_ZONE_MIN_C.get(int(digits))
    return None if min_c is None else float(min_c)
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
global BASE
|
global BASE
|
||||||
ap = argparse.ArgumentParser()
|
ap = argparse.ArgumentParser()
|
||||||
@@ -75,7 +95,7 @@ def main():
|
|||||||
species = all_species()
|
species = all_species()
|
||||||
print(f"{len(species)} species, {len(families)} families")
|
print(f"{len(species)} species, {len(families)} families")
|
||||||
|
|
||||||
changed = {"hardiness_zone_at": 0, "nitrogen_fixer": 0}
|
changed = {"hardiness_zone_at": 0, "nitrogen_fixer": 0, "min_temp": 0}
|
||||||
for s in species:
|
for s in species:
|
||||||
updates = {}
|
updates = {}
|
||||||
if not s.get("hardiness_zone_at") and s.get("hardiness_zone_usda"):
|
if not s.get("hardiness_zone_at") and s.get("hardiness_zone_usda"):
|
||||||
@@ -84,6 +104,10 @@ def main():
|
|||||||
updates["nitrogen_fixer"] = fixes_nitrogen(
|
updates["nitrogen_fixer"] = fixes_nitrogen(
|
||||||
families.get(s["family_id"]), s.get("name_scientific")
|
families.get(s["family_id"]), s.get("name_scientific")
|
||||||
)
|
)
|
||||||
|
if s.get("min_temp") is None:
|
||||||
|
mt = min_temp_from_zone(s.get("hardiness_zone_usda"))
|
||||||
|
if mt is not None:
|
||||||
|
updates["min_temp"] = mt
|
||||||
if not updates:
|
if not updates:
|
||||||
continue
|
continue
|
||||||
for k in updates:
|
for k in updates:
|
||||||
|
|||||||
@@ -10,8 +10,9 @@ import urllib.request
|
|||||||
import urllib.error
|
import urllib.error
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
HERBAPI_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
|
import os
|
||||||
HERBAPI_TOKEN = "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
|
HERBAPI_BASE = os.environ.get("HERBAPI_BASE", "http://herbapi01.corp.sub-net.at:8080/api/v1")
|
||||||
|
HERBAPI_TOKEN = os.environ.get("HERBAPI_TOKEN", "")
|
||||||
NATURADB_BASE = "https://www.naturadb.de/pflanzen"
|
NATURADB_BASE = "https://www.naturadb.de/pflanzen"
|
||||||
USER_AGENT = "Mozilla/5.0 (compatible; HerbAPI-Enrichment/1.0; +https://sub-net.at)"
|
USER_AGENT = "Mozilla/5.0 (compatible; HerbAPI-Enrichment/1.0; +https://sub-net.at)"
|
||||||
DELAY = 0.5
|
DELAY = 0.5
|
||||||
@@ -252,74 +253,108 @@ def has_any_data(data):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
# Maps each key produced by the NaturaDB scraper to the structured HerbAPI
# field it is written to. Iteration order is the order fields are filled.
FIELD_MAP = {
    # Flower resources.
    "nectar": "nectar_value",
    "pollen": "pollen_value",
    # Insects.
    "wildbienen_count": "wild_bee_count",
    "wildbienen_specialists": "wild_bee_specialist_count",
    "schmetterlinge_count": "butterfly_moth_count",
    "raupen_count": "caterpillar_host_count",
    "raupen_specialists": "caterpillar_specialist_count",
    "schwebfliegen_count": "hoverfly_count",
    "kaefer_count": "beetle_count",
    # Vertebrates.
    "vogelarten_count": "bird_count",
    "saeugetier_count": "mammal_count",
}

# HerbAPI fields whose presence marks a species as already structurally
# enriched.
STRUCTURED_MARKERS = (
    "nectar_value",
    "wild_bee_count",
    "butterfly_moth_count",
    "bird_count",
)
|
||||||
|
|
||||||
|
|
||||||
|
def all_species(per_page=100):
    """Fetch every species from HerbAPI by walking the paginated list.

    Requests ``/species`` page by page, starting at page 1, and concatenates
    the ``"data"`` list of each response.

    Args:
        per_page: Page size to request. The same value drives the last-page
            check, so the two cannot drift apart.

    Returns:
        A list with the entries of all pages, in page order.
    """
    out, page = [], 1
    while True:
        chunk = api_get(f"/species?per_page={per_page}&page={page}")["data"]
        if not chunk:
            break
        out.extend(chunk)
        # A short page is the last one, so the extra empty request is skipped.
        # NOTE(review): if the server caps per_page below the requested value,
        # this stops after the first page — confirm against the API.
        if len(chunk) < per_page:
            break
        page += 1
    return out
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
print("Fetching species list from HerbAPI...")
|
print("Fetching species list from HerbAPI...")
|
||||||
species_list = api_get("/species?per_page=200")["data"]
|
species_list = all_species()
|
||||||
print(f"Found {len(species_list)} species.\n")
|
print(f"Found {len(species_list)} species.\n")
|
||||||
|
|
||||||
enriched = 0
|
enriched = skipped_has_data = skipped_not_found = skipped_no_data = errors = 0
|
||||||
skipped_has_data = 0
|
|
||||||
skipped_not_found = 0
|
|
||||||
skipped_no_data = 0
|
|
||||||
errors = 0
|
|
||||||
|
|
||||||
for i, sp in enumerate(species_list):
|
for i, sp in enumerate(species_list):
|
||||||
slug = sp["slug"]
|
slug, name = sp["slug"], sp["name_scientific"]
|
||||||
name = sp["name_scientific"]
|
|
||||||
existing_wv = sp.get("wildlife_value")
|
|
||||||
|
|
||||||
# Only enrich if wildlife_value is empty/null
|
# Skip species already structurally enriched (any marker present).
|
||||||
if existing_wv:
|
if any(sp.get(m) is not None for m in STRUCTURED_MARKERS):
|
||||||
print(f"[{i+1:3d}/{len(species_list)}] {slug:40s} SKIP (already has data)")
|
|
||||||
skipped_has_data += 1
|
skipped_has_data += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
print(f"[{i+1:3d}/{len(species_list)}] {slug:40s} ", end="", flush=True)
|
print(f"[{i+1:3d}/{len(species_list)}] {slug:40s} ", end="", flush=True)
|
||||||
|
|
||||||
# Fetch NaturaDB page
|
|
||||||
html = fetch_naturadb(name)
|
html = fetch_naturadb(name)
|
||||||
time.sleep(DELAY)
|
time.sleep(DELAY)
|
||||||
|
|
||||||
if html is None:
|
if html is None:
|
||||||
print("NOT FOUND on NaturaDB")
|
print("NOT FOUND on NaturaDB")
|
||||||
skipped_not_found += 1
|
skipped_not_found += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Parse wildlife data
|
|
||||||
data = scrape_species(html)
|
data = scrape_species(html)
|
||||||
|
|
||||||
if not has_any_data(data):
|
if not has_any_data(data):
|
||||||
print("no wildlife data on page")
|
print("no wildlife data on page")
|
||||||
skipped_no_data += 1
|
skipped_no_data += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Build wildlife_value string
|
|
||||||
wildlife_value = build_wildlife_value(data)
|
|
||||||
if not wildlife_value:
|
|
||||||
print("no wildlife data extracted")
|
|
||||||
skipped_no_data += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
# GET full species, merge, PUT back
|
|
||||||
try:
|
try:
|
||||||
full = api_get(f"/species/{slug}")
|
full = api_get(f"/species/{slug}")
|
||||||
full["wildlife_value"] = wildlife_value
|
|
||||||
|
|
||||||
# Remove read-only / computed fields that the PUT endpoint might reject
|
|
||||||
for key in ("created_at", "updated_at", "family"):
|
for key in ("created_at", "updated_at", "family"):
|
||||||
full.pop(key, None)
|
full.pop(key, None)
|
||||||
|
|
||||||
|
set_fields = []
|
||||||
|
# Structured counts — only fill if currently empty.
|
||||||
|
for src, dst in FIELD_MAP.items():
|
||||||
|
if data.get(src) is not None and full.get(dst) is None:
|
||||||
|
full[dst] = data[src]
|
||||||
|
set_fields.append(dst)
|
||||||
|
# Native status (German text, matches existing domain).
|
||||||
|
if data.get("native_status") and not full.get("native_status"):
|
||||||
|
full["native_status"] = " ".join(data["native_status"])[:120]
|
||||||
|
set_fields.append("native_status")
|
||||||
|
# NaturaDB badge tags.
|
||||||
|
if data.get("badges") and not full.get("naturadb_tags"):
|
||||||
|
full["naturadb_tags"] = ", ".join(data["badges"])[:500]
|
||||||
|
set_fields.append("naturadb_tags")
|
||||||
|
# Human-readable summary.
|
||||||
|
wv = build_wildlife_value(data)
|
||||||
|
if wv and not full.get("wildlife_value"):
|
||||||
|
full["wildlife_value"] = wv
|
||||||
|
set_fields.append("wildlife_value")
|
||||||
|
|
||||||
|
if not set_fields:
|
||||||
|
print("nothing new")
|
||||||
|
skipped_no_data += 1
|
||||||
|
continue
|
||||||
|
|
||||||
api_put(f"/species/{full['id']}", full)
|
api_put(f"/species/{full['id']}", full)
|
||||||
print(f"ENRICHED -> {wildlife_value[:80]}...")
|
print(f"ENRICHED -> {', '.join(set_fields)}")
|
||||||
enriched += 1
|
enriched += 1
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"API ERROR: {e}")
|
print(f"API ERROR: {e}")
|
||||||
errors += 1
|
errors += 1
|
||||||
|
|
||||||
print("\n" + "=" * 70)
|
print("\n" + "=" * 70)
|
||||||
print(f"DONE. Results:")
|
print("DONE. Results:")
|
||||||
print(f" Enriched: {enriched}")
|
print(f" Enriched: {enriched}")
|
||||||
print(f" Already had data: {skipped_has_data}")
|
print(f" Already structured: {skipped_has_data}")
|
||||||
print(f" Not on NaturaDB: {skipped_not_found}")
|
print(f" Not on NaturaDB: {skipped_not_found}")
|
||||||
print(f" No wildlife data: {skipped_no_data}")
|
print(f" No wildlife data: {skipped_no_data}")
|
||||||
print(f" Errors: {errors}")
|
print(f" Errors: {errors}")
|
||||||
|
|||||||
Reference in New Issue
Block a user