# herbapi/tools/scrapers/scrape_naturadb.py

#!/usr/bin/env python3
"""
Scrape NaturaDB wildlife interaction data and enrich HerbAPI species.
"""

import json
import re
import time
import urllib.request
import urllib.error
import sys

import os
# Base URL of the HerbAPI REST API; override with the HERBAPI_BASE environment variable.
HERBAPI_BASE = os.environ.get("HERBAPI_BASE", "http://herbapi01.corp.sub-net.at:8080/api/v1")
# Token sent as "Authorization: Bearer <token>" on every HerbAPI request.
# Falls back to an empty string when HERBAPI_TOKEN is not set.
HERBAPI_TOKEN = os.environ.get("HERBAPI_TOKEN", "")
# Plant pages are requested as "<NATURADB_BASE>/<slug>/".
NATURADB_BASE = "https://www.naturadb.de/pflanzen"
# User-Agent header sent with every NaturaDB request.
USER_AGENT = "Mozilla/5.0 (compatible; HerbAPI-Enrichment/1.0; +https://sub-net.at)"
# Seconds to sleep after each NaturaDB page fetch.
DELAY = 0.5


def api_get(path, timeout=30):
    """GET a JSON resource from HerbAPI.

    Args:
        path: Path (including any query string) appended verbatim to
            HERBAPI_BASE, e.g. "/species?page=1".
        timeout: Socket timeout in seconds. Without one, a stalled
            connection would block the whole run indefinitely.

    Returns:
        The decoded JSON response body.

    Raises:
        urllib.error.HTTPError: The server answered with an error status.
        urllib.error.URLError: The connection failed or timed out.
        TimeoutError: Reading the response timed out.
        ValueError: The response body is not valid JSON.
    """
    url = f"{HERBAPI_BASE}{path}"
    req = urllib.request.Request(url)
    req.add_header("Authorization", f"Bearer {HERBAPI_TOKEN}")
    req.add_header("Accept", "application/json")
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read().decode())


def api_put(path, data, timeout=30):
    """PUT a JSON document to HerbAPI.

    Args:
        path: Path appended verbatim to HERBAPI_BASE, e.g. "/species/42".
        data: JSON-serialisable payload sent as the request body.
        timeout: Socket timeout in seconds. Without one, a stalled
            connection would block the whole run indefinitely.

    Returns:
        The decoded JSON response body, or None when the server replied
        with an empty body.

    Raises:
        urllib.error.HTTPError: The server answered with an error status.
        urllib.error.URLError: The connection failed or timed out.
        TimeoutError: Reading the response timed out.
        ValueError: The response body is non-empty but not valid JSON.
    """
    url = f"{HERBAPI_BASE}{path}"
    body = json.dumps(data).encode()
    req = urllib.request.Request(url, data=body, method="PUT")
    req.add_header("Authorization", f"Bearer {HERBAPI_TOKEN}")
    req.add_header("Content-Type", "application/json")
    req.add_header("Accept", "application/json")
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        raw = resp.read()
    # A successful PUT may legitimately come back without a body; feeding
    # that to json.loads would raise and make a successful write look like
    # a failure to the caller.
    if not raw.strip():
        return None
    return json.loads(raw.decode())


def fetch_naturadb(latin_name):
    """Fetch the NaturaDB plant page for a scientific name.

    The page slug is the lower-cased name with whitespace runs replaced by
    single hyphens, percent-encoded so that non-ASCII characters still
    produce a valid URL.

    Args:
        latin_name: Scientific plant name, e.g. "Achillea millefolium".

    Returns:
        The page HTML as a string, or None when the name is empty, the
        page does not exist (HTTP 404), or the request fails for any
        other reason. Failures other than 404 are reported on stdout.
    """
    from urllib.parse import quote

    # An empty name would build "<base>//" and fetch an unrelated page that
    # would then be scraped as if it belonged to the species.
    if not latin_name or not latin_name.strip():
        return None

    # split()/join collapses repeated whitespace and trims both ends, so
    # "Achillea  millefolium " does not turn into "achillea--millefolium-".
    slug = quote("-".join(latin_name.lower().split()), safe="")
    url = f"{NATURADB_BASE}/{slug}/"
    req = urllib.request.Request(url)
    req.add_header("User-Agent", USER_AGENT)
    try:
        with urllib.request.urlopen(req, timeout=15) as resp:
            return resp.read().decode("utf-8", errors="replace")
    except urllib.error.HTTPError as e:
        # A missing page is the expected "not on NaturaDB" case; stay quiet.
        if e.code != 404:
            print(f"  HTTP {e.code} for {url}")
        return None
    except Exception as e:
        # Deliberately broad: scraping is best-effort and a single failing
        # page must not abort the whole run.
        print(f"  Error fetching {url}: {e}")
        return None


def extract_td_value(html, label):
    """Return the text of the table cell that follows a ``<td>label</td>`` cell.

    The label cell must be a bare ``<td>`` and may end with an optional
    colon; the value cell may carry attributes. Markup nested inside the
    value cell is removed and surrounding whitespace is stripped.

    Returns None when no matching pair of cells is found.
    """
    cell_pair = rf"<td>{re.escape(label)}:?</td>\s*<td[^>]*>(.*?)</td>"
    found = re.search(cell_pair, html, flags=re.DOTALL)
    if found is None:
        return None
    # Keep only the text content of the value cell.
    inner = found.group(1)
    return re.sub(r"<[^>]+>", "", inner).strip()


def extract_native_status(html):
    """Collect native-status labels from the large coloured chip badges.

    Only chips whose text is one of the known status labels are kept; the
    result preserves page order and may be empty.
    """
    known_labels = (
        "heimische Wildform",
        "Archäophyt",
        "Neophyt",
        "nicht heimisch (Neophyt)",
    )
    # Large, borderless, white-on-colour chips carry the primary status.
    chip_texts = re.findall(
        r'chip--large\s+chip--no-border\s+clr-white\s+bg-\w+"[^>]*>([^<]+)', html
    )
    return [text.strip() for text in chip_texts if text.strip() in known_labels]


def extract_badge_tags(html):
    """Collect the text of the large plain-text chip badges.

    Whitespace-only chips and the "winterhart" badge are left out; the
    result preserves page order.
    """
    chip_texts = re.findall(r'chip--large\s+clr-text"[^>]*>([^<]+)', html)
    stripped = (text.strip() for text in chip_texts)
    return [text for text in stripped if text and text != "winterhart"]


def parse_count(text):
    """Return the integer a text starts with, e.g. '82 (Nektar ...)' -> 82.

    Leading whitespace is ignored. Returns None for empty/None input or
    when the text does not begin with digits.
    """
    leading = re.match(r"(\d+)", text.strip()) if text else None
    if leading is None:
        return None
    return int(leading.group(1))


def parse_specialist_count(text):
    """Return N from a 'davon N spezialisiert' phrase, e.g. '39 (davon 5 spezialisiert)' -> 5.

    Returns None for empty/None input or when the phrase is absent.
    """
    if text:
        phrase = re.search(r"davon\s+(\d+)\s+spezialisiert", text)
        if phrase:
            return int(phrase.group(1))
    return None


def parse_nectar_pollen(text):
    """Return the numerator of a leading 'N/4' rating, e.g. '2/4 - mäßig' -> 2.

    Leading whitespace is ignored. Returns None for empty/None input or
    when the text does not start with such a rating.
    """
    if not text:
        return None
    rating = re.match(r"(\d+)/4", text.strip())
    return None if rating is None else int(rating.group(1))


def build_wildlife_value(data):
    """Build a human-readable wildlife summary from scraped data.

    Args:
        data: Dict of scraped values. Counts and ratings that are missing
            or None are skipped; "native_status" and "badges" are lists
            of strings and may be empty or absent.

    Returns:
        A single string of space-separated sentences, or None when there
        is nothing to report.
    """
    parts = []

    # Nectar and pollen ratings share one sentence.
    ratings = [
        f"{label}: {data[key]}/4"
        for label, key in (("Nectar", "nectar"), ("Pollen", "pollen"))
        if data.get(key) is not None
    ]
    if ratings:
        parts.append(", ".join(ratings) + ".")

    # Wild bees
    bees = data.get("wildbienen_count")
    if bees is not None:
        sentence = f"Supports {bees} wild bee species"
        bee_specialists = data.get("wildbienen_specialists")
        if bee_specialists is not None:
            sentence += f" ({bee_specialists} specialists)"
        parts.append(sentence + ".")

    # Butterflies / moths and their caterpillars
    butterflies = data.get("schmetterlinge_count")
    hosts = data.get("raupen_count")
    host_specialists = data.get("raupen_specialists")
    host_suffix = ""
    if hosts is not None and host_specialists is not None:
        host_suffix = f" ({host_specialists} specialized)"
    if butterflies is not None:
        sentence = f"{butterflies} butterfly/moth species"
        if hosts is not None:
            sentence += f", {hosts} as caterpillar host{host_suffix}"
        parts.append(sentence + ".")
    elif hosts is not None:
        # Caterpillar data can be present without an adult count; report
        # it on its own instead of dropping it.
        parts.append(
            f"Caterpillar host for {hosts} butterfly/moth species{host_suffix}."
        )

    # Remaining groups all use the same "<n> <group> species." sentence.
    for key, group in (
        ("schwebfliegen_count", "hoverfly"),
        ("kaefer_count", "beetle"),
        ("vogelarten_count", "bird"),
        ("saeugetier_count", "mammal"),
    ):
        count = data.get(key)
        if count is not None:
            parts.append(f"{count} {group} species.")

    # Native status
    native_status = data.get("native_status")
    if native_status:
        parts.append(" ".join(native_status) + ".")

    # Only badges that mention one of these keywords are listed.
    keywords = (
        "insektenpflanze",
        "raupenfutter",
        "vogelschutz",
        "vogelnähr",
        "bienenweide",
    )
    # "or []" also covers an explicit None stored under "badges".
    notable = [
        tag
        for tag in (data.get("badges") or [])
        if any(kw in tag.lower() for kw in keywords)
    ]
    if notable:
        parts.append("Tags: " + ", ".join(notable) + ".")

    return " ".join(parts) if parts else None


def scrape_species(html):
    """Extract all wildlife-related values from a NaturaDB plant page.

    Returns a dict with numeric ratings/counts (None where the page has no
    usable value) plus the "native_status" and "badges" lists.
    """

    def cell(label):
        # Raw text of the table cell that follows the given label cell.
        return extract_td_value(html, label)

    # These two cells each feed a total count and a specialist count.
    bees_cell = cell("Wildbienen")
    caterpillar_cell = cell("Raupen")

    return {
        "nectar": parse_nectar_pollen(cell("Nektarwert")),
        "pollen": parse_nectar_pollen(cell("Pollenwert")),
        "wildbienen_count": parse_count(bees_cell),
        "wildbienen_specialists": parse_specialist_count(bees_cell),
        "schmetterlinge_count": parse_count(cell("Schmetterlinge")),
        "raupen_count": parse_count(caterpillar_cell),
        "raupen_specialists": parse_specialist_count(caterpillar_cell),
        "schwebfliegen_count": parse_count(cell("Schwebfliegen")),
        "kaefer_count": parse_count(cell("Käfer")),
        "vogelarten_count": parse_count(cell("fressende Vogelarten")),
        "saeugetier_count": parse_count(cell("fressende Säugetierarten")),
        "native_status": extract_native_status(html),
        "badges": extract_badge_tags(html),
    }


def has_any_data(data):
    """Tell whether a scraped data dict contains at least one usable value.

    The list-valued entries ("native_status", "badges") count only when
    non-empty; every other entry counts as soon as it is not None.
    """
    list_keys = ("native_status", "badges")
    return any(
        bool(value) if key in list_keys else value is not None
        for key, value in data.items()
    )


# NaturaDB scraped key -> HerbAPI structured field.
# Keys are the numeric entries of the dict returned by scrape_species();
# values are the field names main() sets on the species payload it PUTs back.
FIELD_MAP = {
    "nectar": "nectar_value",
    "pollen": "pollen_value",
    "wildbienen_count": "wild_bee_count",
    "wildbienen_specialists": "wild_bee_specialist_count",
    "schmetterlinge_count": "butterfly_moth_count",
    "raupen_count": "caterpillar_host_count",
    "raupen_specialists": "caterpillar_specialist_count",
    "schwebfliegen_count": "hoverfly_count",
    "kaefer_count": "beetle_count",
    "vogelarten_count": "bird_count",
    "saeugetier_count": "mammal_count",
}

# A species is considered already structurally enriched if ANY of these
# fields is non-null in the species list entry; main() then skips it
# without contacting NaturaDB.
STRUCTURED_MARKERS = ("nectar_value", "wild_bee_count", "butterfly_moth_count", "bird_count")


def all_species():
    """Return every species from HerbAPI by walking the paginated list.

    Pages of 100 entries are requested from "/species" and the "data" list
    of each response is accumulated. Paging stops at the first empty page
    or the first page holding fewer than 100 entries.
    """
    page_size = 100
    collected = []
    page_no = 1
    chunk = api_get(f"/species?per_page={page_size}&page={page_no}")["data"]
    while chunk:
        collected.extend(chunk)
        # A short page means there is nothing left to fetch.
        if len(chunk) < page_size:
            break
        page_no += 1
        chunk = api_get(f"/species?per_page={page_size}&page={page_no}")["data"]
    return collected


def main():
    """Enrich HerbAPI species with wildlife data scraped from NaturaDB.

    For every species returned by all_species() that has none of the
    STRUCTURED_MARKERS fields set, the matching NaturaDB page is fetched
    and parsed. Values found there are written into fields that are still
    empty on the HerbAPI record, and the record is PUT back. Progress is
    printed per species and a summary of outcome counters at the end.

    Failures of the per-species HerbAPI GET/PUT are caught and counted;
    a failure while loading the initial species list is not handled here
    and propagates to the caller.
    """
    print("Fetching species list from HerbAPI...")
    species_list = all_species()
    print(f"Found {len(species_list)} species.\n")

    # One counter per outcome; each species ends up in exactly one of them.
    enriched = skipped_has_data = skipped_not_found = skipped_no_data = errors = 0

    for i, sp in enumerate(species_list):
        slug, name = sp["slug"], sp["name_scientific"]

        # Skip species already structurally enriched (any marker present).
        if any(sp.get(m) is not None for m in STRUCTURED_MARKERS):
            skipped_has_data += 1
            continue

        # Progress prefix without newline; the outcome is appended below.
        print(f"[{i+1:3d}/{len(species_list)}] {slug:40s} ", end="", flush=True)
        html = fetch_naturadb(name)
        # Pause after every NaturaDB request, successful or not.
        time.sleep(DELAY)
        if html is None:
            # fetch_naturadb returns None for a 404 and for any other
            # fetch failure, so this bucket also contains network errors.
            print("NOT FOUND on NaturaDB")
            skipped_not_found += 1
            continue

        data = scrape_species(html)
        if not has_any_data(data):
            print("no wildlife data on page")
            skipped_no_data += 1
            continue

        try:
            # The record is read by slug but written back by its "id".
            # NOTE(review): the list endpoint wraps its payload in "data"
            # (see all_species) while this response is used unwrapped —
            # confirm the single-species endpoint really returns a flat object.
            full = api_get(f"/species/{slug}")
            # These keys are removed before the record is sent back —
            # presumably server-managed / not accepted on PUT; TODO confirm.
            for key in ("created_at", "updated_at", "family"):
                full.pop(key, None)

            # Names of the fields filled in during this pass.
            set_fields = []
            # Structured counts — only fill if currently empty.
            for src, dst in FIELD_MAP.items():
                if data.get(src) is not None and full.get(dst) is None:
                    full[dst] = data[src]
                    set_fields.append(dst)
            # Native status (German text, matches existing domain).
            # Truncated to 120 characters — presumably a column limit; verify.
            if data.get("native_status") and not full.get("native_status"):
                full["native_status"] = " ".join(data["native_status"])[:120]
                set_fields.append("native_status")
            # NaturaDB badge tags, comma-separated and truncated to 500
            # characters — presumably a column limit; verify.
            if data.get("badges") and not full.get("naturadb_tags"):
                full["naturadb_tags"] = ", ".join(data["badges"])[:500]
                set_fields.append("naturadb_tags")
            # Human-readable summary.
            wv = build_wildlife_value(data)
            if wv and not full.get("wildlife_value"):
                full["wildlife_value"] = wv
                set_fields.append("wildlife_value")

            if not set_fields:
                # Everything scraped is already present on the record. This
                # is counted under "No wildlife data" in the summary.
                print("nothing new")
                skipped_no_data += 1
                continue

            api_put(f"/species/{full['id']}", full)
            print(f"ENRICHED -> {', '.join(set_fields)}")
            enriched += 1
        except Exception as e:
            # Any failure while reading, updating or writing this record is
            # reported and counted; the loop moves on to the next species.
            print(f"API ERROR: {e}")
            errors += 1

    print("\n" + "=" * 70)
    print("DONE. Results:")
    print(f"  Enriched:           {enriched}")
    print(f"  Already structured: {skipped_has_data}")
    print(f"  Not on NaturaDB:    {skipped_not_found}")
    print(f"  No wildlife data:   {skipped_no_data}")
    print(f"  Errors:             {errors}")
    print(f"  Total:              {len(species_list)}")


# Run the enrichment only when executed as a script, not on import.
if __name__ == "__main__":
    main()