#!/usr/bin/env python3
"""
Scrape NaturaDB wildlife interaction data and enrich HerbAPI species.
"""

import json
import os
import re
import sys
import time
import urllib.error
import urllib.request

HERBAPI_BASE = os.environ.get("HERBAPI_BASE", "http://herbapi01.corp.sub-net.at:8080/api/v1")
HERBAPI_TOKEN = os.environ.get("HERBAPI_TOKEN", "")
NATURADB_BASE = "https://www.naturadb.de/pflanzen"
USER_AGENT = "Mozilla/5.0 (compatible; HerbAPI-Enrichment/1.0; +https://sub-net.at)"
DELAY = 0.5

# Socket timeouts (seconds) so a stalled server cannot hang the whole run.
API_TIMEOUT = 30
NATURADB_TIMEOUT = 15

# Patterns are compiled once; they are applied to every scraped page.
_TAG_RE = re.compile(r"<[^>]+>")
_NATIVE_CHIP_RE = re.compile(
    r'chip--large\s+chip--no-border\s+clr-white\s+bg-\w+"[^>]*>([^<]+)'
)
_BADGE_CHIP_RE = re.compile(r'chip--large\s+clr-text"[^>]*>([^<]+)')
_NATIVE_STATUSES = frozenset(
    {
        "heimische Wildform",
        "Archäophyt",
        "Neophyt",
        "nicht heimisch (Neophyt)",
    }
)
# Substrings (lower-case) that mark a badge as worth listing in the summary.
_NOTABLE_KEYWORDS = (
    "insektenpflanze",
    "raupenfutter",
    "vogelschutz",
    "vogelnähr",
    "bienenweide",
)


def _api_request(path, method="GET", data=None, timeout=API_TIMEOUT):
    """Send one authenticated JSON request to HerbAPI.

    Returns the decoded JSON response, or None when the response body is
    empty. HTTP and network errors from urllib propagate to the caller.
    """
    body = None if data is None else json.dumps(data).encode()
    req = urllib.request.Request(f"{HERBAPI_BASE}{path}", data=body, method=method)
    req.add_header("Authorization", f"Bearer {HERBAPI_TOKEN}")
    if body is not None:
        req.add_header("Content-Type", "application/json")
    req.add_header("Accept", "application/json")
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        raw = resp.read()
    # An empty body is not valid JSON; report it as None instead of raising.
    return json.loads(raw.decode()) if raw.strip() else None


def api_get(path, timeout=API_TIMEOUT):
    """GET from HerbAPI and return the decoded JSON response."""
    return _api_request(path, timeout=timeout)


def api_put(path, data, timeout=API_TIMEOUT):
    """PUT ``data`` as JSON to HerbAPI and return the decoded JSON response."""
    return _api_request(path, method="PUT", data=data, timeout=timeout)


def fetch_naturadb(latin_name):
    """Fetch a NaturaDB plant page.

    The page slug is the lower-cased name with whitespace runs replaced by
    single hyphens. Returns the HTML string, or None when the name is empty,
    the page does not exist (404), or the request fails for any other reason
    (non-404 failures are also printed).
    """
    slug = "-".join((latin_name or "").lower().split())
    if not slug:
        return None
    url = f"{NATURADB_BASE}/{slug}/"
    req = urllib.request.Request(url)
    req.add_header("User-Agent", USER_AGENT)
    try:
        with urllib.request.urlopen(req, timeout=NATURADB_TIMEOUT) as resp:
            return resp.read().decode("utf-8", errors="replace")
    except urllib.error.HTTPError as e:
        if e.code != 404:
            print(f" HTTP {e.code} for {url}")
        return None
    except Exception as e:
        # Best effort: one unreachable page must not abort the whole run.
        print(f" Error fetching {url}: {e}")
        return None


def extract_td_value(html, label):
    """Extract the value cell that follows a ``label:`` table cell.

    Returns the cell text with HTML tags removed and surrounding whitespace
    stripped, or None when no matching label/value cell pair is found.
    """
    # NOTE(review): the cell delimiters in this pattern were reconstructed;
    # the previous pattern had lost its tag text and its lazy group, having
    # no terminator, always captured the empty string. Confirm the
    # <td>label:</td><td>value</td> layout against a live NaturaDB page.
    pattern = rf"{re.escape(label)}:?\s*</t[dh]>\s*<td[^>]*>(.*?)</td>"
    m = re.search(pattern, html, re.DOTALL)
    if m is None:
        return None
    return _TAG_RE.sub("", m.group(1)).strip()


def extract_native_status(html):
    """Extract native status from chip badges.

    Only chip texts that exactly equal one of the known status labels are
    returned, in page order.
    """
    found = (m.group(1).strip() for m in _NATIVE_CHIP_RE.finditer(html))
    return [tag for tag in found if tag in _NATIVE_STATUSES]


def extract_badge_tags(html):
    """Extract ecological badge chips (large, plain text).

    Empty chip texts and the "winterhart" chip are dropped.
    """
    found = (m.group(1).strip() for m in _BADGE_CHIP_RE.finditer(html))
    return [tag for tag in found if tag and tag != "winterhart"]


def parse_count(text):
    """Extract leading integer from text like '82 (Nektar und/oder ...)'."""
    if not text:
        return None
    m = re.match(r"(\d+)", text.strip())
    return int(m.group(1)) if m else None


def parse_specialist_count(text):
    """Extract the specialist count from text like '39 (davon 5 spezialisiert)'."""
    if not text:
        return None
    m = re.search(r"davon\s+(\d+)\s+spezialisiert", text)
    return int(m.group(1)) if m else None


def parse_nectar_pollen(text):
    """Extract numeric value from '2/4 - mäßig' -> 2."""
    if not text:
        return None
    m = re.match(r"(\d+)/4", text.strip())
    return int(m.group(1)) if m else None


def build_wildlife_value(data):
    """Build a structured wildlife_value string from scraped data.

    ``data`` is a dict as produced by ``scrape_species``. Returns the
    sentences joined by single spaces, or None when nothing was scraped.
    """
    parts = []

    # Nectar and pollen
    np_parts = []
    if data.get("nectar") is not None:
        np_parts.append(f"Nectar: {data['nectar']}/4")
    if data.get("pollen") is not None:
        np_parts.append(f"Pollen: {data['pollen']}/4")
    if np_parts:
        parts.append(", ".join(np_parts) + ".")

    # Wild bees
    if data.get("wildbienen_count") is not None:
        s = f"Supports {data['wildbienen_count']} wild bee species"
        if data.get("wildbienen_specialists") is not None:
            s += f" ({data['wildbienen_specialists']} specialists)"
        parts.append(s + ".")

    # Butterflies / moths; caterpillar hosts are only reported as part of
    # this sentence, i.e. when a butterfly/moth count exists.
    if data.get("schmetterlinge_count") is not None:
        s = f"{data['schmetterlinge_count']} butterfly/moth species"
        if data.get("raupen_count") is not None:
            spec = ""
            if data.get("raupen_specialists") is not None:
                spec = f" ({data['raupen_specialists']} specialized)"
            s += f", {data['raupen_count']} as caterpillar host{spec}"
        parts.append(s + ".")

    # Hoverflies, beetles, birds, mammals: one plain sentence each.
    for key, noun in (
        ("schwebfliegen_count", "hoverfly"),
        ("kaefer_count", "beetle"),
        ("vogelarten_count", "bird"),
        ("saeugetier_count", "mammal"),
    ):
        if data.get(key) is not None:
            parts.append(f"{data[key]} {noun} species.")

    # Native status
    if data.get("native_status"):
        parts.append(" ".join(data["native_status"]) + ".")

    # Notable badges
    notable = [
        t
        for t in data.get("badges", [])
        if any(kw in t.lower() for kw in _NOTABLE_KEYWORDS)
    ]
    if notable:
        parts.append("Tags: " + ", ".join(notable) + ".")

    return " ".join(parts) if parts else None


def scrape_species(html):
    """Parse NaturaDB HTML and return structured wildlife data dict.

    Every numeric key is always present and is None when the page has no
    value for it; "native_status" and "badges" are (possibly empty) lists.
    """
    data = {}

    # Nectar and pollen values
    data["nectar"] = parse_nectar_pollen(extract_td_value(html, "Nektarwert"))
    data["pollen"] = parse_nectar_pollen(extract_td_value(html, "Pollenwert"))

    # Wild bees
    bees_raw = extract_td_value(html, "Wildbienen")
    data["wildbienen_count"] = parse_count(bees_raw)
    data["wildbienen_specialists"] = parse_specialist_count(bees_raw)

    # Butterflies/moths
    data["schmetterlinge_count"] = parse_count(extract_td_value(html, "Schmetterlinge"))

    # Caterpillar hosts
    raupen_raw = extract_td_value(html, "Raupen")
    data["raupen_count"] = parse_count(raupen_raw)
    data["raupen_specialists"] = parse_specialist_count(raupen_raw)

    # Hoverflies, beetles, birds, mammals
    for key, label in (
        ("schwebfliegen_count", "Schwebfliegen"),
        ("kaefer_count", "Käfer"),
        ("vogelarten_count", "fressende Vogelarten"),
        ("saeugetier_count", "fressende Säugetierarten"),
    ):
        data[key] = parse_count(extract_td_value(html, label))

    # Native status
    data["native_status"] = extract_native_status(html)

    # Badge tags
    data["badges"] = extract_badge_tags(html)

    return data


def has_any_data(data):
    """Check if we scraped anything meaningful.

    The list-valued keys count when non-empty; every other key counts when
    it is not None.
    """
    return any(
        bool(v) if k in ("native_status", "badges") else v is not None
        for k, v in data.items()
    )


# NaturaDB scraped key -> HerbAPI structured field.
FIELD_MAP = {
    "nectar": "nectar_value",
    "pollen": "pollen_value",
    "wildbienen_count": "wild_bee_count",
    "wildbienen_specialists": "wild_bee_specialist_count",
    "schmetterlinge_count": "butterfly_moth_count",
    "raupen_count": "caterpillar_host_count",
    "raupen_specialists": "caterpillar_specialist_count",
    "schwebfliegen_count": "hoverfly_count",
    "kaefer_count": "beetle_count",
    "vogelarten_count": "bird_count",
    "saeugetier_count": "mammal_count",
}

# A species is considered already structurally enriched if it has these.
STRUCTURED_MARKERS = ("nectar_value", "wild_bee_count", "butterfly_moth_count", "bird_count")


def all_species(per_page=100):
    """Return every species record from the paginated HerbAPI list endpoint.

    Pages are requested starting at 1 and read from the "data" key of each
    response. Paging stops at the first empty page or the first page holding
    fewer than ``per_page`` records.
    """
    out, page = [], 1
    while True:
        chunk = api_get(f"/species?per_page={per_page}&page={page}")["data"]
        if not chunk:
            break
        out.extend(chunk)
        if len(chunk) < per_page:
            # A short page is the last page; skip one extra empty request.
            break
        page += 1
    return out


def apply_scraped_fields(full, data, summary=None):
    """Copy scraped values into ``full`` wherever the target is still empty.

    ``full`` is the species record (mutated in place), ``data`` the scraped
    dict keyed like ``FIELD_MAP``, and ``summary`` the human-readable
    wildlife text (or None). Existing values in ``full`` are never
    overwritten. Returns the list of field names that were set, in the order
    they were applied.
    """
    set_fields = []

    # Structured counts — only fill if currently empty.
    for src, dst in FIELD_MAP.items():
        if data.get(src) is not None and full.get(dst) is None:
            full[dst] = data[src]
            set_fields.append(dst)

    # Native status (German text, matches existing domain).
    if data.get("native_status") and not full.get("native_status"):
        full["native_status"] = " ".join(data["native_status"])[:120]
        set_fields.append("native_status")

    # NaturaDB badge tags.
    if data.get("badges") and not full.get("naturadb_tags"):
        full["naturadb_tags"] = ", ".join(data["badges"])[:500]
        set_fields.append("naturadb_tags")

    # Human-readable summary.
    if summary and not full.get("wildlife_value"):
        full["wildlife_value"] = summary
        set_fields.append("wildlife_value")

    return set_fields


def main():
    """Enrich every not-yet-structured HerbAPI species from NaturaDB.

    For each species without any of ``STRUCTURED_MARKERS`` set, fetch its
    NaturaDB page, scrape it, merge new values into the full record and PUT
    the record back. One progress line is printed per processed species and
    a summary of the counters at the end.
    """
    print("Fetching species list from HerbAPI...")
    species_list = all_species()
    total = len(species_list)
    print(f"Found {total} species.\n")

    enriched = skipped_has_data = skipped_not_found = skipped_no_data = errors = 0

    for i, sp in enumerate(species_list, start=1):
        slug, name = sp["slug"], sp["name_scientific"]

        # Skip species already structurally enriched (any marker present).
        if any(sp.get(m) is not None for m in STRUCTURED_MARKERS):
            skipped_has_data += 1
            continue

        print(f"[{i:3d}/{total}] {slug:40s} ", end="", flush=True)

        html = fetch_naturadb(name)
        # Throttle requests to NaturaDB regardless of the fetch outcome.
        time.sleep(DELAY)
        if html is None:
            print("NOT FOUND on NaturaDB")
            skipped_not_found += 1
            continue

        data = scrape_species(html)
        if not has_any_data(data):
            print("no wildlife data on page")
            skipped_no_data += 1
            continue

        try:
            full = api_get(f"/species/{slug}")
            # Drop these keys so they are not sent back in the PUT payload.
            for key in ("created_at", "updated_at", "family"):
                full.pop(key, None)
            set_fields = apply_scraped_fields(full, data, build_wildlife_value(data))
            if set_fields:
                api_put(f"/species/{full['id']}", full)
        except Exception as e:
            # Per-species boundary: record the failure and keep going.
            print(f"API ERROR: {e}")
            errors += 1
            continue

        if set_fields:
            print(f"ENRICHED -> {', '.join(set_fields)}")
            enriched += 1
        else:
            print("nothing new")
            skipped_no_data += 1

    print("\n" + "=" * 70)
    print("DONE. Results:")
    print(f" Enriched: {enriched}")
    print(f" Already structured: {skipped_has_data}")
    print(f" Not on NaturaDB: {skipped_not_found}")
    print(f" No wildlife data: {skipped_no_data}")
    print(f" Errors: {errors}")
    print(f" Total: {total}")


if __name__ == "__main__":
    main()