Add scraper and enrichment scripts to tools/ directory

2026-03-16 11:10:18 +01:00
parent 83ab8c4cf9
commit 0ef902cc91
13 changed files with 6031 additions and 0 deletions
@@ -0,0 +1,330 @@
+#!/usr/bin/env python3
+"""
+Scrape NaturaDB wildlife interaction data and enrich HerbAPI species.
+"""
+
+import json
+import os
+import re
+import sys
+import time
+import urllib.error
+import urllib.parse
+import urllib.request
+
+# Connection settings. HERBAPI_BASE and HERBAPI_TOKEN may be overridden with
+# environment variables of the same name; the literals below are only the
+# backward-compatible defaults.
+HERBAPI_BASE = os.environ.get(
+    "HERBAPI_BASE", "http://herbapi01.corp.sub-net.at:8080/api/v1"
+)
+# SECURITY NOTE(review): the fallback value is a bearer token committed to
+# version control. Rotate it and remove the default once HERBAPI_TOKEN is
+# supplied through the environment.
+HERBAPI_TOKEN = os.environ.get(
+    "HERBAPI_TOKEN", "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
+)
+NATURADB_BASE = "https://www.naturadb.de/pflanzen"
+USER_AGENT = "Mozilla/5.0 (compatible; HerbAPI-Enrichment/1.0; +https://sub-net.at)"
+# Pause after each NaturaDB request, in seconds (passed to time.sleep).
+DELAY = 0.5
+
+
+def api_get(path, timeout=30):
+    """GET a JSON document from HerbAPI.
+
+    Args:
+        path: Path (plus optional query string) appended to HERBAPI_BASE.
+        timeout: Socket timeout in seconds handed to urlopen, so that a
+            stalled server cannot block the whole run indefinitely.
+
+    Returns:
+        The decoded JSON response body.
+
+    Raises:
+        urllib.error.URLError: If the request fails (HTTPError, a subclass,
+            for HTTP error statuses).
+        TimeoutError: If the server does not answer within ``timeout``.
+        json.JSONDecodeError: If the response body is not valid JSON.
+    """
+    req = urllib.request.Request(
+        f"{HERBAPI_BASE}{path}",
+        headers={
+            "Authorization": f"Bearer {HERBAPI_TOKEN}",
+            "Accept": "application/json",
+        },
+    )
+    with urllib.request.urlopen(req, timeout=timeout) as resp:
+        return json.loads(resp.read().decode())
+
+
+def api_put(path, data, timeout=30):
+    """PUT a JSON payload to HerbAPI.
+
+    Args:
+        path: Path appended to HERBAPI_BASE.
+        data: JSON-serialisable object sent as the request body.
+        timeout: Socket timeout in seconds handed to urlopen, so that a
+            stalled server cannot block the whole run indefinitely.
+
+    Returns:
+        The decoded JSON response body, or None when the server answers
+        with an empty body.
+
+    Raises:
+        urllib.error.URLError: If the request fails (HTTPError, a subclass,
+            for HTTP error statuses).
+        TimeoutError: If the server does not answer within ``timeout``.
+        json.JSONDecodeError: If a non-empty body is not valid JSON.
+    """
+    req = urllib.request.Request(
+        f"{HERBAPI_BASE}{path}",
+        data=json.dumps(data).encode(),
+        method="PUT",
+        headers={
+            "Authorization": f"Bearer {HERBAPI_TOKEN}",
+            "Content-Type": "application/json",
+            "Accept": "application/json",
+        },
+    )
+    with urllib.request.urlopen(req, timeout=timeout) as resp:
+        raw = resp.read()
+    # An empty body is a successful update with nothing to decode; feeding
+    # it to json.loads would turn that success into an exception.
+    if not raw.strip():
+        return None
+    return json.loads(raw.decode())
+
+
+def fetch_naturadb(latin_name):
+    """Fetch the NaturaDB page for a plant.
+
+    The page URL is derived from the scientific name: lower-cased, with
+    whitespace runs turned into single hyphens and percent-encoded.
+
+    Args:
+        latin_name: Scientific name, e.g. "Achillea millefolium".
+
+    Returns:
+        The page HTML as a string, or None when the name is empty, the page
+        does not exist (HTTP 404), or the request fails for any other
+        reason. Failures other than 404 are reported on stdout.
+    """
+    # split()/join trims the name and collapses repeated whitespace, so a
+    # stray double space cannot produce a "--" slug.
+    slug = "-".join((latin_name or "").lower().split())
+    if not slug:
+        # Without a slug the URL would point at the plant index page.
+        return None
+    # Percent-encode so non-ASCII characters (e.g. a hybrid sign) yield a
+    # valid request instead of an encoding error.
+    url = f"{NATURADB_BASE}/{urllib.parse.quote(slug, safe='')}/"
+    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
+    try:
+        with urllib.request.urlopen(req, timeout=15) as resp:
+            return resp.read().decode("utf-8", errors="replace")
+    except urllib.error.HTTPError as e:
+        # A 404 only means the species has no page; stay quiet about it.
+        if e.code != 404:
+            print(f"  HTTP {e.code} for {url}")
+    except Exception as e:
+        # Best effort: one unreachable page must not abort the whole run.
+        print(f"  Error fetching {url}: {e}")
+    return None
+
+
+def extract_td_value(html, label):
+    """Extract the value cell that follows a label cell in an HTML table.
+
+    Matches ``<td>label:</td><td>value</td>``; the colon is optional, and
+    both cells may carry attributes. Whitespace around the label is ignored.
+
+    Args:
+        html: Page markup to search.
+        label: Label text, matched literally.
+
+    Returns:
+        The value with markup removed, character references decoded and
+        surrounding whitespace stripped, or None when the label is absent.
+    """
+    # Imported locally because the ``html`` parameter shadows the module.
+    from html import unescape
+
+    pattern = (
+        rf"<td[^>]*>\s*{re.escape(label)}\s*:?\s*</td>"
+        r"\s*<td[^>]*>(.*?)</td>"
+    )
+    m = re.search(pattern, html, re.DOTALL)
+    if m is None:
+        return None
+    # Drop tags first and decode entities afterwards, so an escaped "&lt;"
+    # is never mistaken for markup.
+    text = re.sub(r"<[^>]+>", "", m.group(1))
+    return unescape(text).strip()
+
+
+def extract_native_status(html):
+    """Collect native-status labels from the large coloured chip badges.
+
+    Only the four known status labels are kept; any other chip text is
+    ignored. Labels are returned in page order.
+    """
+    recognised = (
+        "heimische Wildform",
+        "Archäophyt",
+        "Neophyt",
+        "nicht heimisch (Neophyt)",
+    )
+    chip_re = r'chip--large\s+chip--no-border\s+clr-white\s+bg-\w+"[^>]*>([^<]+)'
+    labels = (hit.strip() for hit in re.findall(chip_re, html))
+    return [label for label in labels if label in recognised]
+
+
+def extract_badge_tags(html):
+    """Collect the text of the large plain-text chips in page order.
+
+    Blank chips and the "winterhart" chip are left out.
+    """
+    found = []
+    for raw in re.findall(r'chip--large\s+clr-text"[^>]*>([^<]+)', html):
+        label = raw.strip()
+        if not label or label == "winterhart":
+            continue
+        found.append(label)
+    return found
+
+
+def parse_count(text):
+    """Return the integer a string starts with, e.g. '82 (Nektar ...)' -> 82.
+
+    Leading whitespace is ignored. Returns None for empty or missing input
+    and for text that does not begin with a digit.
+    """
+    if not text:
+        return None
+    leading = re.match(r"(\d+)", text.strip())
+    if leading is None:
+        return None
+    return int(leading.group(1))
+
+
+def parse_specialist_count(text):
+    """Return the 'davon N spezialisiert' figure from a count string.
+
+    For '39 (davon 5 spezialisiert)' the result is 5. Returns None for
+    empty or missing input and when the phrase is not present.
+    """
+    if text:
+        hit = re.search(r"davon\s+(\d+)\s+spezialisiert", text)
+        if hit:
+            return int(hit.group(1))
+    return None
+
+
+def parse_nectar_pollen(text):
+    """Return the score from an 'N/4' rating, e.g. '2/4 - mäßig' -> 2.
+
+    Leading whitespace is ignored. Returns None for empty or missing input
+    and for text that does not start with such a rating.
+    """
+    rating = re.match(r"(\d+)/4", text.strip()) if text else None
+    return None if rating is None else int(rating.group(1))
+
+
+def build_wildlife_value(data):
+    """Build the wildlife_value summary string from scraped data.
+
+    Sentences are emitted in a fixed order: nectar/pollen ratings, wild
+    bees, butterflies/moths with caterpillar hosts, hoverflies, beetles,
+    birds, mammals, native status and notable badge tags. Entries that are
+    missing or None are skipped.
+
+    Args:
+        data: Dict using the keys produced by scrape_species.
+
+    Returns:
+        The sentences joined by single spaces, or None if nothing applies.
+    """
+    parts = []
+
+    # Nectar and pollen share one sentence.
+    ratings = [
+        f"{title}: {data[key]}/4"
+        for key, title in (("nectar", "Nectar"), ("pollen", "Pollen"))
+        if data.get(key) is not None
+    ]
+    if ratings:
+        parts.append(", ".join(ratings) + ".")
+
+    # Wild bees
+    if data.get("wildbienen_count") is not None:
+        s = f"Supports {data['wildbienen_count']} wild bee species"
+        if data.get("wildbienen_specialists") is not None:
+            s += f" ({data['wildbienen_specialists']} specialists)"
+        parts.append(s + ".")
+
+    # Butterflies / moths and caterpillar hosts
+    raupen = data.get("raupen_count")
+    spec = ""
+    if raupen is not None and data.get("raupen_specialists") is not None:
+        spec = f" ({data['raupen_specialists']} specialized)"
+    if data.get("schmetterlinge_count") is not None:
+        s = f"{data['schmetterlinge_count']} butterfly/moth species"
+        if raupen is not None:
+            s += f", {raupen} as caterpillar host{spec}"
+        parts.append(s + ".")
+    elif raupen is not None:
+        # Caterpillar figures without a butterfly count used to be dropped
+        # silently; report them in a sentence of their own.
+        parts.append(f"Caterpillar host for {raupen} species{spec}.")
+
+    # Hoverflies, beetles, birds and mammals share one sentence shape.
+    for key, noun in (
+        ("schwebfliegen_count", "hoverfly"),
+        ("kaefer_count", "beetle"),
+        ("vogelarten_count", "bird"),
+        ("saeugetier_count", "mammal"),
+    ):
+        if data.get(key) is not None:
+            parts.append(f"{data[key]} {noun} species.")
+
+    # Native status
+    if data.get("native_status"):
+        parts.append(" ".join(data["native_status"]) + ".")
+
+    # Notable badges; `or []` also tolerates an explicit None.
+    keywords = (
+        "insektenpflanze",
+        "raupenfutter",
+        "vogelschutz",
+        "vogelnähr",
+        "bienenweide",
+    )
+    notable = [
+        t
+        for t in (data.get("badges") or [])
+        if any(kw in t.lower() for kw in keywords)
+    ]
+    if notable:
+        parts.append("Tags: " + ", ".join(notable) + ".")
+
+    return " ".join(parts) if parts else None
+
+
+def scrape_species(html):
+    """Parse a NaturaDB plant page into a dict of wildlife data.
+
+    Each table row of interest is looked up by its German label and reduced
+    to an integer (or None when absent). "native_status" and "badges" are
+    lists taken from the chip badges.
+    """
+
+    def cell(label):
+        # Raw text of the table cell following the given label, or None.
+        return extract_td_value(html, label)
+
+    # Rows that feed two fields each are looked up only once.
+    bees = cell("Wildbienen")
+    caterpillars = cell("Raupen")
+
+    return {
+        "nectar": parse_nectar_pollen(cell("Nektarwert")),
+        "pollen": parse_nectar_pollen(cell("Pollenwert")),
+        "wildbienen_count": parse_count(bees),
+        "wildbienen_specialists": parse_specialist_count(bees),
+        "schmetterlinge_count": parse_count(cell("Schmetterlinge")),
+        "raupen_count": parse_count(caterpillars),
+        "raupen_specialists": parse_specialist_count(caterpillars),
+        "schwebfliegen_count": parse_count(cell("Schwebfliegen")),
+        "kaefer_count": parse_count(cell("Käfer")),
+        "vogelarten_count": parse_count(cell("fressende Vogelarten")),
+        "saeugetier_count": parse_count(cell("fressende Säugetierarten")),
+        "native_status": extract_native_status(html),
+        "badges": extract_badge_tags(html),
+    }
+
+
+def has_any_data(data):
+    """Tell whether a scraped data dict holds anything worth storing.
+
+    The list-valued fields ("native_status", "badges") count only when
+    non-empty; every other field counts as soon as it is not None.
+    """
+    list_fields = ("native_status", "badges")
+    return any(
+        bool(value) if field in list_fields else value is not None
+        for field, value in data.items()
+    )
+
+
+def main():
+    """Enrich HerbAPI species that lack a wildlife_value with NaturaDB data.
+
+    Fetches the species list, skips entries that already carry a
+    wildlife_value, scrapes the NaturaDB page of every remaining species
+    and PUTs the generated summary back. One progress line is printed per
+    species, followed by a tally at the end.
+    """
+    print("Fetching species list from HerbAPI...")
+    # NOTE(review): a single page of at most 200 entries is requested;
+    # confirm the API never holds more species than that.
+    species_list = api_get("/species?per_page=200")["data"]
+    total = len(species_list)
+    print(f"Found {total} species.\n")
+
+    enriched = 0
+    skipped_has_data = 0
+    skipped_not_found = 0
+    skipped_no_data = 0
+    errors = 0
+
+    for i, sp in enumerate(species_list, start=1):
+        slug = sp["slug"]
+        name = sp["name_scientific"]
+        progress = f"[{i:3d}/{total}] {slug:40s}"
+
+        # Only enrich if wildlife_value is empty/null
+        if sp.get("wildlife_value"):
+            print(f"{progress} SKIP (already has data)")
+            skipped_has_data += 1
+            continue
+
+        # The status of this species is appended to the same line below.
+        print(f"{progress} ", end="", flush=True)
+
+        # Fetch the NaturaDB page, then pause before the next request.
+        html = fetch_naturadb(name)
+        time.sleep(DELAY)
+
+        if html is None:
+            print("NOT FOUND on NaturaDB")
+            skipped_not_found += 1
+            continue
+
+        # Parse wildlife data
+        data = scrape_species(html)
+
+        if not has_any_data(data):
+            print("no wildlife data on page")
+            skipped_no_data += 1
+            continue
+
+        # Build wildlife_value string
+        wildlife_value = build_wildlife_value(data)
+        if not wildlife_value:
+            print("no wildlife data extracted")
+            skipped_no_data += 1
+            continue
+
+        # GET full species, merge, PUT back
+        try:
+            full = api_get(f"/species/{slug}")
+            full["wildlife_value"] = wildlife_value
+
+            # Remove read-only / computed fields that the PUT endpoint might reject
+            for key in ("created_at", "updated_at", "family"):
+                full.pop(key, None)
+
+            api_put(f"/species/{full['id']}", full)
+        except Exception as e:
+            # Best effort: a failing species is counted, the run continues.
+            print(f"API ERROR: {e}")
+            errors += 1
+        else:
+            # Mark the preview with an ellipsis only when it was cut off.
+            preview = wildlife_value
+            if len(preview) > 80:
+                preview = preview[:80] + "..."
+            print(f"ENRICHED -> {preview}")
+            enriched += 1
+
+    print("\n" + "=" * 70)
+    print("DONE. Results:")
+    print(f"  Enriched:           {enriched}")
+    print(f"  Already had data:   {skipped_has_data}")
+    print(f"  Not on NaturaDB:    {skipped_not_found}")
+    print(f"  No wildlife data:   {skipped_no_data}")
+    print(f"  Errors:             {errors}")
+    print(f"  Total:              {total}")
+
+
+# Run the enrichment only when this file is executed as a script.
+if __name__ == "__main__":
+    main()