91aa9cb733
- scrape_naturadb.py: write structured count fields (nectar/pollen/bee/butterfly/caterpillar/hoverfly/beetle/bird/mammal), native_status, and naturadb_tags (not just the text wildlife_value); paginate over all species; make base URL and token overridable via environment variables; only fill empty fields. - enrich_botanical.py: derive min_temp from the minimum temperature of the USDA hardiness zone.
366 lines
12 KiB
Python
366 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Scrape NaturaDB wildlife interaction data and enrich HerbAPI species.
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
import time
|
|
import urllib.request
|
|
import urllib.error
|
|
import sys
|
|
|
|
import os
|
|
# HerbAPI endpoint and bearer token; both may be overridden via the environment.
HERBAPI_BASE = os.getenv("HERBAPI_BASE", "http://herbapi01.corp.sub-net.at:8080/api/v1")
HERBAPI_TOKEN = os.getenv("HERBAPI_TOKEN", "")

# Base URL under which the NaturaDB plant pages are looked up.
NATURADB_BASE = "https://www.naturadb.de/pflanzen"

# User-Agent header sent with every NaturaDB request.
USER_AGENT = "Mozilla/5.0 (compatible; HerbAPI-Enrichment/1.0; +https://sub-net.at)"

# Seconds to sleep after each NaturaDB fetch.
DELAY = 0.5
|
|
|
|
|
|
def api_get(path, timeout=30):
    """GET a JSON document from HerbAPI.

    Args:
        path: Path appended verbatim to ``HERBAPI_BASE``; include the leading
            slash and any query string.
        timeout: Socket timeout in seconds. Without one, a stalled server
            would block the whole run indefinitely.

    Returns:
        The decoded JSON response body.

    Raises:
        urllib.error.URLError: On connection failures or HTTP error statuses
            (``HTTPError`` is a subclass).
        TimeoutError: If the server does not respond within ``timeout``.
        ValueError: If the response body is not valid JSON.
    """
    req = urllib.request.Request(
        f"{HERBAPI_BASE}{path}",
        headers={
            "Authorization": f"Bearer {HERBAPI_TOKEN}",
            "Accept": "application/json",
        },
    )
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read().decode())
|
|
|
|
|
|
def api_put(path, data, timeout=30):
    """PUT a JSON document to HerbAPI.

    Args:
        path: Path appended verbatim to ``HERBAPI_BASE``; include the leading
            slash.
        data: JSON-serialisable payload sent as the request body.
        timeout: Socket timeout in seconds. Without one, a stalled server
            would block the whole run indefinitely.

    Returns:
        The decoded JSON response body.

    Raises:
        urllib.error.URLError: On connection failures or HTTP error statuses
            (``HTTPError`` is a subclass).
        TimeoutError: If the server does not respond within ``timeout``.
        TypeError: If ``data`` cannot be serialised to JSON.
        ValueError: If the response body is not valid JSON.
    """
    req = urllib.request.Request(
        f"{HERBAPI_BASE}{path}",
        data=json.dumps(data).encode(),
        method="PUT",
        headers={
            "Authorization": f"Bearer {HERBAPI_TOKEN}",
            "Content-Type": "application/json",
            "Accept": "application/json",
        },
    )
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read().decode())
|
|
|
|
|
|
def fetch_naturadb(latin_name):
    """Download the NaturaDB page for a species.

    The page URL is derived from the scientific name by lower-casing it and
    replacing every space with a hyphen.

    Returns the page decoded as UTF-8 (undecodable bytes are replaced), or
    None when the request fails for any reason. Failures other than HTTP 404
    are reported on stdout.
    """
    slug = "-".join(latin_name.lower().split(" "))
    url = f"{NATURADB_BASE}/{slug}/"
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})

    try:
        with urllib.request.urlopen(request, timeout=15) as response:
            payload = response.read()
            return payload.decode("utf-8", errors="replace")
    except urllib.error.HTTPError as e:
        # A missing page is expected and stays silent; anything else is logged.
        if e.code != 404:
            print(f" HTTP {e.code} for {url}")
    except Exception as e:
        print(f" Error fetching {url}: {e}")
    return None
|
|
|
|
|
|
def extract_td_value(html, label):
    """Return the text of the table cell that follows a ``<td>label</td>`` cell.

    The label cell may end with an optional colon. Markup inside the value
    cell is removed and surrounding whitespace is trimmed. Returns None when
    the page contains no such pair of cells.
    """
    cell_pair = re.compile(
        rf"<td>{re.escape(label)}:?</td>\s*<td[^>]*>(.*?)</td>", re.DOTALL
    )
    found = cell_pair.search(html)
    if found is None:
        return None
    inner_markup = found.group(1)
    # Drop any nested tags so only the visible text remains.
    return re.sub(r"<[^>]+>", "", inner_markup).strip()
|
|
|
|
|
|
def extract_native_status(html):
    """Collect native-status labels from the large coloured chip badges.

    Only chips whose trimmed text is one of the recognised status labels are
    kept. Returns them in page order as a list (possibly empty).
    """
    recognised = (
        "heimische Wildform",
        "Archäophyt",
        "Neophyt",
        "nicht heimisch (Neophyt)",
    )
    chip_texts = re.findall(
        r'chip--large\s+chip--no-border\s+clr-white\s+bg-\w+"[^>]*>([^<]+)', html
    )
    trimmed = (text.strip() for text in chip_texts)
    return [label for label in trimmed if label in recognised]
|
|
|
|
|
|
def extract_badge_tags(html):
    """Collect the texts of the large plain-text chip badges.

    Chip texts are trimmed; empty ones and the "winterhart" badge are left
    out. Returns the remaining texts in page order as a list.
    """
    chip_texts = re.findall(r'chip--large\s+clr-text"[^>]*>([^<]+)', html)
    trimmed = (text.strip() for text in chip_texts)
    return [tag for tag in trimmed if tag and tag != "winterhart"]
|
|
|
|
|
|
def parse_count(text):
    """Return the integer at the start of *text*.

    Example: '82 (Nektar und/oder ...)' gives 82. Leading and trailing
    whitespace is ignored. Returns None for empty input or when the text
    does not begin with digits.
    """
    if not text:
        return None
    leading_digits = re.match(r"(\d+)", text.strip())
    if leading_digits is None:
        return None
    return int(leading_digits.group(1))
|
|
|
|
|
|
def parse_specialist_count(text):
    """Return the specialist count embedded in *text*.

    Example: '39 (davon 5 spezialisiert)' gives 5. Returns None for empty
    input or when the 'davon N spezialisiert' phrase is absent.
    """
    if not text:
        return None
    phrase = re.search(r"davon\s+(\d+)\s+spezialisiert", text)
    if phrase is None:
        return None
    return int(phrase.group(1))
|
|
|
|
|
|
def parse_nectar_pollen(text):
    """Return the numerator of an 'N/4' rating.

    Example: '2/4 - mäßig' gives 2. Leading and trailing whitespace is
    ignored. Returns None for empty input or when the text does not start
    with an 'N/4' rating.
    """
    if not text:
        return None
    rating = re.match(r"(\d+)/4", text.strip())
    if rating is None:
        return None
    return int(rating.group(1))
|
|
|
|
|
|
def build_wildlife_value(data):
    """Build a human-readable wildlife summary from scraped NaturaDB data.

    Args:
        data: Dict keyed like the result of ``scrape_species()``. Missing
            keys and ``None`` values are skipped.

    Returns:
        The summary sentences joined by single spaces, or None when there is
        nothing to report.
    """
    parts = []

    # Nectar and pollen ratings share one sentence.
    np_parts = [
        f"{label}: {data[key]}/4"
        for label, key in (("Nectar", "nectar"), ("Pollen", "pollen"))
        if data.get(key) is not None
    ]
    if np_parts:
        parts.append(", ".join(np_parts) + ".")

    # Wild bees
    if data.get("wildbienen_count") is not None:
        s = f"Supports {data['wildbienen_count']} wild bee species"
        if data.get("wildbienen_specialists") is not None:
            s += f" ({data['wildbienen_specialists']} specialists)"
        parts.append(s + ".")

    # Butterflies / moths and caterpillar hosts. The caterpillar figures are
    # reported even when the butterfly count is missing, so that data is not
    # silently dropped from the summary.
    butterflies = data.get("schmetterlinge_count")
    caterpillars = data.get("raupen_count")
    specialized = ""
    if data.get("raupen_specialists") is not None:
        specialized = f" ({data['raupen_specialists']} specialized)"
    if butterflies is not None:
        s = f"{butterflies} butterfly/moth species"
        if caterpillars is not None:
            s += f", {caterpillars} as caterpillar host{specialized}"
        parts.append(s + ".")
    elif caterpillars is not None:
        parts.append(f"Caterpillar host for {caterpillars} species{specialized}.")

    # Plain "<n> <group> species." sentences, in a fixed order.
    for key, group in (
        ("schwebfliegen_count", "hoverfly"),
        ("kaefer_count", "beetle"),
        ("vogelarten_count", "bird"),
        ("saeugetier_count", "mammal"),
    ):
        if data.get(key) is not None:
            parts.append(f"{data[key]} {group} species.")

    # Native status
    if data.get("native_status"):
        parts.append(" ".join(data["native_status"]) + ".")

    # Only badges that mention a wildlife-related keyword are listed.
    keywords = (
        "insektenpflanze",
        "raupenfutter",
        "vogelschutz",
        "vogelnähr",
        "bienenweide",
    )
    notable = [
        t
        for t in (data.get("badges") or ())
        if any(kw in t.lower() for kw in keywords)
    ]
    if notable:
        parts.append("Tags: " + ", ".join(notable) + ".")

    return " ".join(parts) if parts else None
|
|
|
|
|
|
def scrape_species(html):
    """Parse a NaturaDB plant page into a dict of wildlife data.

    Numeric entries are ints, or None when the corresponding table cell is
    missing or unparsable; "native_status" and "badges" are lists of
    strings (possibly empty).
    """

    def cell(label):
        # Text of the table cell labelled *label*, or None if absent.
        return extract_td_value(html, label)

    # These two cells each yield a total and a specialist count.
    bee_text = cell("Wildbienen")
    caterpillar_text = cell("Raupen")

    return {
        "nectar": parse_nectar_pollen(cell("Nektarwert")),
        "pollen": parse_nectar_pollen(cell("Pollenwert")),
        "wildbienen_count": parse_count(bee_text),
        "wildbienen_specialists": parse_specialist_count(bee_text),
        "schmetterlinge_count": parse_count(cell("Schmetterlinge")),
        "raupen_count": parse_count(caterpillar_text),
        "raupen_specialists": parse_specialist_count(caterpillar_text),
        "schwebfliegen_count": parse_count(cell("Schwebfliegen")),
        "kaefer_count": parse_count(cell("Käfer")),
        "vogelarten_count": parse_count(cell("fressende Vogelarten")),
        "saeugetier_count": parse_count(cell("fressende Säugetierarten")),
        "native_status": extract_native_status(html),
        "badges": extract_badge_tags(html),
    }
|
|
|
|
|
|
def has_any_data(data):
    """Tell whether a scraped data dict contains anything meaningful.

    The list-valued entries ("native_status", "badges") count only when
    non-empty; every other entry counts as soon as it is not None.
    """
    list_keys = ("native_status", "badges")
    return any(
        bool(value) if key in list_keys else value is not None
        for key, value in data.items()
    )
|
|
|
|
|
|
# Maps each key of the scraped NaturaDB data dict to the HerbAPI field that
# stores the same value.
FIELD_MAP = {
    # Nectar / pollen ratings
    "nectar": "nectar_value",
    "pollen": "pollen_value",
    # Wild bees
    "wildbienen_count": "wild_bee_count",
    "wildbienen_specialists": "wild_bee_specialist_count",
    # Butterflies, moths and their caterpillars
    "schmetterlinge_count": "butterfly_moth_count",
    "raupen_count": "caterpillar_host_count",
    "raupen_specialists": "caterpillar_specialist_count",
    # Other animal groups
    "schwebfliegen_count": "hoverfly_count",
    "kaefer_count": "beetle_count",
    "vogelarten_count": "bird_count",
    "saeugetier_count": "mammal_count",
}

# HerbAPI fields whose presence marks a species as already structurally
# enriched; main() skips a species as soon as any one of them is set.
STRUCTURED_MARKERS = (
    "nectar_value",
    "wild_bee_count",
    "butterfly_moth_count",
    "bird_count",
)
|
|
|
|
|
|
def all_species():
    """Return every species record from HerbAPI, following pagination.

    Pages of 100 are requested starting at page 1. Fetching stops at the
    first empty page or the first page holding fewer than 100 records.
    """
    collected = []
    page = 1
    while True:
        response = api_get(f"/species?per_page=100&page={page}")
        batch = response["data"]
        if not batch:
            return collected
        collected.extend(batch)
        # A short page means there is nothing left to fetch.
        if len(batch) < 100:
            return collected
        page += 1
|
|
|
|
|
|
def main():
    """Enrich HerbAPI species with wildlife data scraped from NaturaDB.

    Every species from all_species() that has none of the STRUCTURED_MARKERS
    fields set is looked up on NaturaDB. Scraped values are copied only into
    fields that are still empty on the full HerbAPI record, which is then
    written back with a PUT. Progress is printed per species, followed by a
    summary of the counters. Records for which nothing new could be set are
    counted together with pages that had no wildlife data.
    """
    print("Fetching species list from HerbAPI...")
    species_list = all_species()
    print(f"Found {len(species_list)} species.\n")

    enriched = 0
    skipped_has_data = 0
    skipped_not_found = 0
    skipped_no_data = 0
    errors = 0

    for i, sp in enumerate(species_list):
        slug = sp["slug"]
        name = sp["name_scientific"]

        # A single populated marker is enough to treat the species as done.
        populated = [m for m in STRUCTURED_MARKERS if sp.get(m) is not None]
        if populated:
            skipped_has_data += 1
            continue

        print(f"[{i+1:3d}/{len(species_list)}] {slug:40s} ", end="", flush=True)
        html = fetch_naturadb(name)
        # Throttle after every fetch, whether or not it succeeded.
        time.sleep(DELAY)
        if html is None:
            print("NOT FOUND on NaturaDB")
            skipped_not_found += 1
            continue

        data = scrape_species(html)
        if not has_any_data(data):
            print("no wildlife data on page")
            skipped_no_data += 1
            continue

        try:
            full = api_get(f"/species/{slug}")
            # These keys are removed from the record before it is sent back.
            for key in ("created_at", "updated_at", "family"):
                full.pop(key, None)

            set_fields = []

            # Structured counts: fill only what is currently empty.
            for src, dst in FIELD_MAP.items():
                value = data.get(src)
                if value is None or full.get(dst) is not None:
                    continue
                full[dst] = value
                set_fields.append(dst)

            # Native status, kept as the German labels and capped at 120 chars.
            statuses = data.get("native_status")
            if statuses and not full.get("native_status"):
                full["native_status"] = " ".join(statuses)[:120]
                set_fields.append("native_status")

            # NaturaDB badge tags, capped at 500 chars.
            badges = data.get("badges")
            if badges and not full.get("naturadb_tags"):
                full["naturadb_tags"] = ", ".join(badges)[:500]
                set_fields.append("naturadb_tags")

            # Human-readable summary.
            wv = build_wildlife_value(data)
            if wv and not full.get("wildlife_value"):
                full["wildlife_value"] = wv
                set_fields.append("wildlife_value")

            if set_fields:
                api_put(f"/species/{full['id']}", full)
                print(f"ENRICHED -> {', '.join(set_fields)}")
                enriched += 1
            else:
                print("nothing new")
                skipped_no_data += 1
        except Exception as e:
            print(f"API ERROR: {e}")
            errors += 1

    print("\n" + "=" * 70)
    print("DONE. Results:")
    print(f" Enriched: {enriched}")
    print(f" Already structured: {skipped_has_data}")
    print(f" Not on NaturaDB: {skipped_not_found}")
    print(f" No wildlife data: {skipped_no_data}")
    print(f" Errors: {errors}")
    print(f" Total: {len(species_list)}")
|
|
|
|
|
|
# Run the enrichment only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|