Add scraper and enrichment scripts to tools/ directory

This commit is contained in:
2026-03-16 11:10:18 +01:00
parent 83ab8c4cf9
commit 0ef902cc91
13 changed files with 6031 additions and 0 deletions
+330
View File
@@ -0,0 +1,330 @@
#!/usr/bin/env python3
"""
Scrape NaturaDB wildlife interaction data and enrich HerbAPI species.
"""
import json
import re
import time
import urllib.request
import urllib.error
import sys
# Base URL that api_get/api_put prefix to every HerbAPI request path.
# NOTE(review): this is plain http://, so the bearer token below travels
# unencrypted — confirm this host is only reachable on a trusted network.
HERBAPI_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
# Bearer token sent in the Authorization header of every HerbAPI request.
# NOTE(review): a live credential is committed to source control here;
# consider loading it from the environment and rotating this value.
HERBAPI_TOKEN = "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
# Base URL of NaturaDB plant pages; fetch_naturadb appends "/<slug>/".
NATURADB_BASE = "https://www.naturadb.de/pflanzen"
# User-Agent header sent with every NaturaDB page request.
USER_AGENT = "Mozilla/5.0 (compatible; HerbAPI-Enrichment/1.0; +https://sub-net.at)"
# Seconds main() sleeps after each NaturaDB fetch.
DELAY = 0.5
def api_get(path, timeout=30):
    """GET a resource from HerbAPI and return the decoded JSON body.

    Args:
        path: Request path (including any query string) appended verbatim
            to ``HERBAPI_BASE``.
        timeout: Seconds to wait on the connection before giving up.
            Without a bound, a stalled server would block the whole
            enrichment run indefinitely.

    Returns:
        The parsed JSON document.

    Raises:
        urllib.error.HTTPError: The server answered with an error status.
        urllib.error.URLError: The connection failed or timed out.
        TimeoutError: Reading the response exceeded ``timeout``.
        json.JSONDecodeError: The response body is not valid JSON.
    """
    req = urllib.request.Request(f"{HERBAPI_BASE}{path}")
    req.add_header("Authorization", f"Bearer {HERBAPI_TOKEN}")
    req.add_header("Accept", "application/json")
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read().decode())
def api_put(path, data, timeout=30):
    """PUT a JSON document to HerbAPI and return the decoded JSON reply.

    Args:
        path: Request path appended verbatim to ``HERBAPI_BASE``.
        data: JSON-serialisable payload sent as the request body.
        timeout: Seconds to wait on the connection before giving up.
            Without a bound, a stalled server would block the whole
            enrichment run indefinitely.

    Returns:
        The parsed JSON response, or ``None`` when the server replies with
        an empty body (for example a 204), so that a successful update is
        not reported as a JSON decoding failure.

    Raises:
        urllib.error.HTTPError: The server answered with an error status.
        urllib.error.URLError: The connection failed or timed out.
        TimeoutError: Reading the response exceeded ``timeout``.
        json.JSONDecodeError: A non-empty response body is not valid JSON.
    """
    req = urllib.request.Request(
        f"{HERBAPI_BASE}{path}",
        data=json.dumps(data).encode(),
        method="PUT",
    )
    req.add_header("Authorization", f"Bearer {HERBAPI_TOKEN}")
    req.add_header("Content-Type", "application/json")
    req.add_header("Accept", "application/json")
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        raw = resp.read()
    if not raw.strip():
        return None
    return json.loads(raw.decode())
def fetch_naturadb(latin_name):
    """Download the NaturaDB page for a species.

    The page slug is the scientific name lower-cased with spaces turned
    into hyphens. Returns the page HTML as a string, or ``None`` when the
    page cannot be retrieved. A 404 is silent; any other failure is
    printed before ``None`` is returned.
    """
    page_url = f"{NATURADB_BASE}/{'-'.join(latin_name.lower().split(' '))}/"
    request = urllib.request.Request(page_url, headers={"User-Agent": USER_AGENT})
    try:
        with urllib.request.urlopen(request, timeout=15) as response:
            payload = response.read()
            # Undecodable bytes are replaced rather than aborting the page.
            return payload.decode("utf-8", errors="replace")
    except urllib.error.HTTPError as http_err:
        # A missing page is the expected case and is not worth reporting.
        if http_err.code != 404:
            print(f" HTTP {http_err.code} for {page_url}")
    except Exception as err:
        print(f" Error fetching {page_url}: {err}")
    return None
def extract_td_value(html, label):
    """Return the text of the cell that follows a ``<td>label</td>`` cell.

    The label cell must be a bare ``<td>`` (no attributes) and may end with
    an optional colon. Markup nested inside the value cell is removed and
    surrounding whitespace trimmed. Returns ``None`` when no such cell pair
    is present.
    """
    cell_pair = re.compile(
        rf"<td>{re.escape(label)}:?</td>\s*<td[^>]*>(.*?)</td>", re.DOTALL
    )
    found = cell_pair.search(html)
    if found is None:
        return None
    # Drop any tags inside the value cell, keeping only its text.
    return re.sub(r"<[^>]+>", "", found.group(1)).strip()
def extract_native_status(html):
    """Collect native-status labels from the large coloured chip badges.

    Only the four status labels listed below are kept; any other chip with
    the same styling is ignored. Labels are returned in page order.
    """
    recognised = (
        "heimische Wildform",
        "Archäophyt",
        "Neophyt",
        "nicht heimisch (Neophyt)",
    )
    chip_texts = re.findall(
        r'chip--large\s+chip--no-border\s+clr-white\s+bg-\w+"[^>]*>([^<]+)', html
    )
    labels = (text.strip() for text in chip_texts)
    return [label for label in labels if label in recognised]
def extract_badge_tags(html):
    """Collect the text of the large plain-text chip badges, in page order.

    Blank chips and the "winterhart" chip are left out.
    """
    chip_texts = re.findall(r'chip--large\s+clr-text"[^>]*>([^<]+)', html)
    cleaned = (text.strip() for text in chip_texts)
    return [tag for tag in cleaned if tag and tag != "winterhart"]
def parse_count(text):
    """Return the integer that starts *text*, e.g. ``'82 (Nektar ...)'`` -> 82.

    Leading whitespace is ignored. Returns ``None`` for empty or missing
    text, or when the text does not begin with a digit.
    """
    if not text:
        return None
    leading_digits = re.match(r"(\d+)", text.strip())
    if leading_digits is None:
        return None
    return int(leading_digits.group(1))
def parse_specialist_count(text):
    """Return the specialist count from text like ``'39 (davon 5 spezialisiert)'``.

    The number is taken from the phrase "davon N spezialisiert" anywhere in
    the text. Returns ``None`` when the text is empty or the phrase is absent.
    """
    if not text:
        return None
    phrase = re.search(r"davon\s+(\d+)\s+spezialisiert", text)
    if phrase is None:
        return None
    return int(phrase.group(1))
def parse_nectar_pollen(text):
    """Return the rating from a value on a four-point scale, e.g. ``'2/4 - mäßig'`` -> 2.

    The text must start (after optional whitespace) with ``<digits>/4``.
    Returns ``None`` otherwise, or when the text is empty or missing.
    """
    if not text:
        return None
    rating = re.match(r"(\d+)/4", text.strip())
    if rating is None:
        return None
    return int(rating.group(1))
def build_wildlife_value(data):
    """Render scraped wildlife data as a single English summary string.

    *data* is a dict with the keys produced by ``scrape_species``; missing
    keys are treated like ``None``. Each available fact becomes one short
    sentence, and the sentences are joined with single spaces. Caterpillar
    host numbers are only reported as part of the butterfly/moth sentence.
    Returns ``None`` when no sentence could be produced.
    """

    def present(key):
        # A count of 0 is still a value; only None/missing means "unknown".
        return data.get(key) is not None

    sentences = []

    # Nectar and pollen ratings share one sentence.
    ratings = [
        f"{title}: {data[key]}/4"
        for title, key in (("Nectar", "nectar"), ("Pollen", "pollen"))
        if present(key)
    ]
    if ratings:
        sentences.append(", ".join(ratings) + ".")

    # Wild bees, with the specialist count in parentheses when known.
    if present("wildbienen_count"):
        bees = f"Supports {data['wildbienen_count']} wild bee species"
        if present("wildbienen_specialists"):
            bees += f" ({data['wildbienen_specialists']} specialists)"
        sentences.append(bees + ".")

    # Butterflies/moths, optionally extended by the caterpillar host figures.
    if present("schmetterlinge_count"):
        leps = f"{data['schmetterlinge_count']} butterfly/moth species"
        if present("raupen_count"):
            specialised = (
                f" ({data['raupen_specialists']} specialized)"
                if present("raupen_specialists")
                else ""
            )
            leps += f", {data['raupen_count']} as caterpillar host{specialised}"
        sentences.append(leps + ".")

    # Remaining groups all follow the same "<n> <group> species." pattern.
    for key, group in (
        ("schwebfliegen_count", "hoverfly"),
        ("kaefer_count", "beetle"),
        ("vogelarten_count", "bird"),
        ("saeugetier_count", "mammal"),
    ):
        if present(key):
            sentences.append(f"{data[key]} {group} species.")

    # Native status labels are passed through untranslated.
    if data.get("native_status"):
        sentences.append(" ".join(data["native_status"]) + ".")

    # Keep only badges that mention one of the ecological keywords.
    keywords = (
        "insektenpflanze",
        "raupenfutter",
        "vogelschutz",
        "vogelnähr",
        "bienenweide",
    )
    notable = [
        badge
        for badge in data.get("badges", [])
        if any(keyword in badge.lower() for keyword in keywords)
    ]
    if notable:
        sentences.append("Tags: " + ", ".join(notable) + ".")

    return " ".join(sentences) if sentences else None
def scrape_species(html):
    """Extract the wildlife figures from a NaturaDB page.

    Returns a dict with one entry per figure. Numeric entries are ``None``
    when the page has no matching table row or the row does not parse;
    ``native_status`` and ``badges`` are (possibly empty) lists.
    """

    def cell(label):
        # Raw text of the table cell that follows the given label cell.
        return extract_td_value(html, label)

    # The bee and caterpillar rows each carry a total and a specialist
    # count, so their text is read once and parsed twice.
    bee_cell = cell("Wildbienen")
    caterpillar_cell = cell("Raupen")

    return {
        "nectar": parse_nectar_pollen(cell("Nektarwert")),
        "pollen": parse_nectar_pollen(cell("Pollenwert")),
        "wildbienen_count": parse_count(bee_cell),
        "wildbienen_specialists": parse_specialist_count(bee_cell),
        "schmetterlinge_count": parse_count(cell("Schmetterlinge")),
        "raupen_count": parse_count(caterpillar_cell),
        "raupen_specialists": parse_specialist_count(caterpillar_cell),
        "schwebfliegen_count": parse_count(cell("Schwebfliegen")),
        "kaefer_count": parse_count(cell("Käfer")),
        "vogelarten_count": parse_count(cell("fressende Vogelarten")),
        "saeugetier_count": parse_count(cell("fressende Säugetierarten")),
        "native_status": extract_native_status(html),
        "badges": extract_badge_tags(html),
    }
def has_any_data(data):
    """Report whether a scraped data dict holds at least one usable value.

    The list-valued entries (``native_status``, ``badges``) count only when
    non-empty; every other entry counts as soon as it is not ``None``, so a
    numeric 0 is still treated as data.
    """
    list_fields = ("native_status", "badges")
    return any(
        bool(value) if field in list_fields else value is not None
        for field, value in data.items()
    )
def main():
    """Enrich HerbAPI species that have no ``wildlife_value`` yet.

    Loads the species list from HerbAPI, and for each species with an empty
    ``wildlife_value`` fetches its NaturaDB page, builds a summary string
    from the scraped figures and writes it back. The full record is read
    by slug and written by id. Progress is printed per species, followed by
    a summary of the outcome counts.
    """
    print("Fetching species list from HerbAPI...")
    species_list = api_get("/species?per_page=200")["data"]
    total = len(species_list)
    print(f"Found {total} species.\n")

    enriched = skipped_has_data = skipped_not_found = skipped_no_data = errors = 0

    for position, sp in enumerate(species_list, start=1):
        slug = sp["slug"]
        name = sp["name_scientific"]
        progress = f"[{position:3d}/{total}] {slug:40s} "

        # Existing values are never overwritten.
        if sp.get("wildlife_value"):
            print(progress + "SKIP (already has data)")
            skipped_has_data += 1
            continue

        # The outcome is appended to this line once it is known.
        print(progress, end="", flush=True)

        html = fetch_naturadb(name)
        # Pause after every NaturaDB request, successful or not.
        time.sleep(DELAY)
        if html is None:
            print("NOT FOUND on NaturaDB")
            skipped_not_found += 1
            continue

        data = scrape_species(html)
        if not has_any_data(data):
            print("no wildlife data on page")
            skipped_no_data += 1
            continue

        wildlife_value = build_wildlife_value(data)
        if not wildlife_value:
            print("no wildlife data extracted")
            skipped_no_data += 1
            continue

        # Read the full record, merge the new value in, and write it back.
        try:
            full = api_get(f"/species/{slug}")
            full["wildlife_value"] = wildlife_value
            # Leave out read-only / computed fields the PUT endpoint might reject.
            payload = {
                field: value
                for field, value in full.items()
                if field not in ("created_at", "updated_at", "family")
            }
            api_put(f"/species/{full['id']}", payload)
            print(f"ENRICHED -> {wildlife_value[:80]}...")
            enriched += 1
        except Exception as e:
            # One failing species must not abort the rest of the run.
            print(f"API ERROR: {e}")
            errors += 1

    print("\n" + "=" * 70)
    print("DONE. Results:")
    for label, value in (
        ("Enriched", enriched),
        ("Already had data", skipped_has_data),
        ("Not on NaturaDB", skipped_not_found),
        ("No wildlife data", skipped_no_data),
        ("Errors", errors),
        ("Total", total),
    ):
        print(f" {label}: {value}")


if __name__ == "__main__":
    main()