Add scraper and enrichment scripts to tools/ directory

2026-03-16 11:10:18 +01:00
parent 83ab8c4cf9
commit 0ef902cc91
13 changed files with 6031 additions and 0 deletions
@@ -0,0 +1,156 @@
 #!/usr/bin/env python3
 """Enrich HerbAPI species with Wikidata QID, GBIF ID, and EPPO code."""
 import json
 import time
 import urllib.parse
 import urllib.request
 # Base URL that herbapi_request() prefixes to every request path.
 HERBAPI_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
 # Bearer token sent in the Authorization header by herbapi_request().
 # NOTE(review): this credential is hard-coded in a committed file — consider
 # loading it from the environment and rotating the token.
 HERBAPI_TOKEN = "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
 # SPARQL endpoint queried by query_wikidata_batch().
 WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"
 # Headers for Wikidata requests: an identifying User-Agent and a JSON Accept type.
 HEADERS_WD = {
    "User-Agent": "HerbAPI-Enrichment/1.0 (florian.berthold@sub-net.at)",
    "Accept": "application/json",
 }
 def herbapi_request(path, method="GET", data=None, timeout=30):
    """Send an authenticated JSON request to HerbAPI and return the decoded reply.

    Args:
        path: Path appended verbatim to HERBAPI_BASE (may include a query string).
        method: HTTP method name.
        data: JSON-serialisable request body, or None to send no body.
        timeout: Socket timeout in seconds, so a stalled server cannot hang
            the script indefinitely.

    Returns:
        The decoded JSON response, or None when the response body is empty.

    Raises:
        urllib.error.URLError: On connection failures and HTTP error statuses
            (HTTPError is a URLError subclass); not handled here.
    """
    # Compare against None so falsy-but-valid payloads ({} or []) are still sent.
    body = None if data is None else json.dumps(data).encode()
    req = urllib.request.Request(
        f"{HERBAPI_BASE}{path}",
        data=body,
        method=method,
        headers={
            "Authorization": f"Bearer {HERBAPI_TOKEN}",
            "Content-Type": "application/json",
        },
    )
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        raw = resp.read()
    # An empty body is not valid JSON; report it as None instead of raising.
    return json.loads(raw) if raw.strip() else None
 def query_wikidata_batch(names):
    """Query Wikidata for a batch of scientific names.

    Args:
        names: Iterable of names matched exactly against property P225.

    Returns:
        Dict mapping each matched name to
        {"qid": ..., "gbif_id": ..., "eppo_code": ...}. The two optional IDs
        (P846 and P3031) are None when the result row has no value. Names
        without a match are absent. If several result rows share a name, the
        last row wins.

    Raises:
        urllib.error.URLError: When the SPARQL endpoint cannot be reached or
            answers with an HTTP error status.
    """
    names = list(names)
    if not names:
        # An empty VALUES clause cannot match anything; skip the round trip.
        return {}
    def sparql_literal(text):
        # Escape backslash, quote and line breaks so a name cannot terminate
        # the string literal early and alter the query.
        escaped = (
            str(text)
            .replace("\\", "\\\\")
            .replace('"', '\\"')
            .replace("\n", "\\n")
            .replace("\r", "\\r")
        )
        return f'"{escaped}"'
    values = " ".join(sparql_literal(n) for n in names)
    sparql = (
        "SELECT ?name ?item ?gbifId ?eppoCode WHERE {\n"
        f"  VALUES ?name {{ {values} }}\n"
        "  ?item wdt:P225 ?name .\n"
        "  OPTIONAL { ?item wdt:P846 ?gbifId }\n"
        "  OPTIONAL { ?item wdt:P3031 ?eppoCode }\n"
        "}"
    )
    encoded = urllib.parse.quote(sparql)
    url = f"{WIKIDATA_SPARQL}?query={encoded}&format=json"
    req = urllib.request.Request(url, headers=HEADERS_WD)
    with urllib.request.urlopen(req, timeout=60) as resp:
        data = json.loads(resp.read())
    results = {}
    for binding in data.get("results", {}).get("bindings", []):
        name = binding["name"]["value"]
        # The item value is an entity URL; its last path segment is the QID.
        qid = binding["item"]["value"].rsplit("/", 1)[-1]
        results[name] = {
            "qid": qid,
            "gbif_id": binding.get("gbifId", {}).get("value"),
            "eppo_code": binding.get("eppoCode", {}).get("value"),
        }
    return results
 def main():
    """Fill in missing Wikidata QID, GBIF ID and EPPO code on HerbAPI species.

    Fetches the species list, queries Wikidata in batches for every species
    lacking at least one identifier, then PUTs the merged record back for each
    species where Wikidata supplies a value HerbAPI does not have yet.
    Progress and a final summary are printed to stdout.
    """
    # 1. Fetch all species
    # NOTE(review): only one page of 200 is requested; presumably a larger
    # catalogue needs pagination — confirm against the API.
    resp = herbapi_request("/species?per_page=200")
    species_list = resp["data"]
    print(f"Fetched {len(species_list)} species from HerbAPI\n")
    # 2. Collect species needing enrichment
    # .get() treats an absent key like a null value instead of raising KeyError.
    id_fields = ("wikidata_qid", "gbif_id", "eppo_code")
    to_enrich = [sp for sp in species_list
                 if not all(sp.get(field) for field in id_fields)]
    if not to_enrich:
        print("All species already enriched.")
        return
    print(f"{len(to_enrich)} species need enrichment\n")
    # 3. Batch query Wikidata
    BATCH_SIZE = 20
    wikidata_results = {}
    # Names whose batch raised, so they are not misreported as "no match".
    failed_lookups = set()
    names = [sp["name_scientific"] for sp in to_enrich]
    for i in range(0, len(names), BATCH_SIZE):
        batch = names[i:i + BATCH_SIZE]
        print(f"Querying Wikidata batch {i // BATCH_SIZE + 1}: {len(batch)} species...")
        try:
            results = query_wikidata_batch(batch)
        except Exception as e:
            # Best effort: one failing batch must not abort the whole run.
            print(f"  ERROR: {e}")
            failed_lookups.update(batch)
        else:
            wikidata_results.update(results)
            print(f"  Got {len(results)} matches")
        # Pause between batches, but not after the last one.
        if i + BATCH_SIZE < len(names):
            time.sleep(2)
    print(f"\nWikidata returned data for {len(wikidata_results)} / {len(names)} species\n")
    # 4. Update HerbAPI - GET full object by slug, merge, PUT by UUID
    updated = 0
    skipped = 0
    not_found = 0
    errors = 0
    log_labels = {"wikidata_qid": "qid", "gbif_id": "gbif", "eppo_code": "eppo"}
    for sp in to_enrich:
        name = sp["name_scientific"]
        wd = wikidata_results.get(name)
        if not wd:
            if name in failed_lookups:
                print(f"  SKIP (Wikidata lookup failed): {name}")
                errors += 1
            else:
                print(f"  SKIP (no Wikidata match): {name}")
                not_found += 1
            continue
        # Only fields that are empty in HerbAPI and present on Wikidata change.
        new_values = {}
        if not sp.get("wikidata_qid") and wd["qid"]:
            new_values["wikidata_qid"] = wd["qid"]
        if not sp.get("gbif_id") and wd["gbif_id"]:
            new_values["gbif_id"] = str(wd["gbif_id"])  # API expects string
        if not sp.get("eppo_code") and wd["eppo_code"]:
            new_values["eppo_code"] = wd["eppo_code"]
        if not new_values:
            print(f"  SKIP (nothing new): {name}")
            skipped += 1
            continue
        try:
            # GET full species by slug for the complete object
            full_sp = herbapi_request(f"/species/{sp['slug']}")
            # Remove read-only fields
            species_id = full_sp.pop("id")
            for read_only in ("slug", "created_at", "updated_at"):
                full_sp.pop(read_only, None)
            full_sp.update(new_values)
            # PUT by UUID
            herbapi_request(f"/species/{species_id}", method="PUT", data=full_sp)
        except Exception as e:
            print(f"  ERROR updating {name}: {e}")
            errors += 1
        else:
            fields = [f"{log_labels[key]}={value}" for key, value in new_values.items()]
            print(f"  UPDATED: {name} -> {', '.join(fields)}")
            updated += 1
    print(f"\n{'=' * 60}")
    print("RESULTS:")
    print(f"  Updated:               {updated}")
    print(f"  Skipped (no new data): {skipped}")
    print(f"  Not found on Wikidata: {not_found}")
    print(f"  Errors:                {errors}")
    print(f"  Total species:         {len(species_list)}")
 if __name__ == "__main__":
    main()
@@ -0,0 +1,305 @@
 #!/usr/bin/env python3
 """Expand HerbAPI species database with common permaculture/garden species."""
 import json
 import time
 import urllib.request
 import urllib.parse
 import urllib.error
 import ssl
 # Base URL that api_get() and api_post() prefix to every request path.
 BASE_URL = "http://herbapi01.corp.sub-net.at:8080/api/v1"
 # Value of the Authorization header sent with every HerbAPI request.
 # NOTE(review): hard-coded bearer token in a committed file — consider
 # reading it from the environment and rotating it.
 AUTH = "Bearer km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
 # Pause in seconds that main() sleeps after each POST.
 DELAY = 0.15
 # SSL context for GBIF (https)
 ssl_ctx = ssl.create_default_context()
 def api_get(path, timeout=30):
    """GET a HerbAPI resource and return its decoded JSON body.

    Args:
        path: Path appended verbatim to BASE_URL (may include a query string).
        timeout: Socket timeout in seconds, so a stalled server cannot hang
            the script indefinitely.

    Raises:
        urllib.error.URLError: On connection failures and HTTP error statuses;
            unlike api_post(), nothing is caught here.
    """
    req = urllib.request.Request(f"{BASE_URL}{path}", headers={"Authorization": AUTH})
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())
 def api_post(path, data, timeout=30):
    """POST *data* as JSON to HerbAPI.

    Args:
        path: Path appended verbatim to BASE_URL.
        data: JSON-serialisable request body.
        timeout: Socket timeout in seconds.

    Returns:
        A (result, status) tuple:
        - (decoded JSON or None for an empty body, HTTP status) on success;
        - (None, HTTP status) when the server answers with an error status;
        - (None, None) when no HTTP response was obtained at all
          (connection refused, DNS failure, timeout).
    """
    body = json.dumps(data).encode()
    req = urllib.request.Request(
        f"{BASE_URL}{path}",
        data=body,
        headers={"Authorization": AUTH, "Content-Type": "application/json"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            raw = resp.read()
            status = resp.status
    except urllib.error.HTTPError as e:
        # errors="replace": an undecodable error page must not mask the HTTP error.
        err_body = e.read().decode(errors="replace")
        print(f"  ERROR {e.code}: {err_body}")
        return None, e.code
    except OSError as e:
        # URLError and socket timeouts are OSError subclasses; report them the
        # same way as HTTP errors so one network hiccup does not abort the run.
        print(f"  ERROR: {e}")
        return None, None
    # A success response without a body is not valid JSON; return None for it.
    return (json.loads(raw) if raw.strip() else None), status
 def gbif_get_german_name(scientific_name):
    """Return the first GBIF vernacular name tagged "deu" for a species.

    The name is first resolved to a GBIF usage key via the species/match
    endpoint, then that key's vernacular names are scanned. Returns None when
    there is no usage key, no "deu" entry, or any step raises (the failure is
    printed, never propagated).
    """
    def load(url):
        # Both GBIF calls share the HTTPS context and a 10 second timeout.
        with urllib.request.urlopen(urllib.request.Request(url), context=ssl_ctx, timeout=10) as resp:
            return json.loads(resp.read())
    try:
        matched = load(f"https://api.gbif.org/v1/species/match?name={urllib.parse.quote(scientific_name)}")
        key = matched.get("usageKey")
        if key:
            listing = load(f"https://api.gbif.org/v1/species/{key}/vernacularNames?limit=100")
            for entry in listing.get("results", []):
                if entry.get("language") == "deu":
                    return entry["vernacularName"]
    except Exception as e:
        print(f"  GBIF lookup failed for {scientific_name}: {e}")
    return None
 # ── Families to ensure exist ─────────────────────────────────────────
 # Maps a scientific family name to its English and German common names.
 # main() POSTs an entry to /families when the name is not already in the
 # family list returned by the API.
 FAMILIES_NEEDED = {
    "Fabaceae":        {"name_en": "Legumes", "name_de": "Hülsenfrüchtler"},
    "Solanaceae":      {"name_en": "Nightshade family", "name_de": "Nachtschattengewächse"},
    "Cucurbitaceae":   {"name_en": "Gourd family", "name_de": "Kürbisgewächse"},
    "Asteraceae":      {"name_en": "Daisy family", "name_de": "Korbblütler"},
    "Chenopodiaceae":  {"name_en": "Goosefoot family", "name_de": "Gänsefußgewächse"},
    "Brassicaceae":    {"name_en": "Cabbage family", "name_de": "Kreuzblütler"},
    "Amaryllidaceae":  {"name_en": "Amaryllis family", "name_de": "Amaryllisgewächse"},
    "Apiaceae":        {"name_en": "Carrot family", "name_de": "Doldenblütler"},
    "Poaceae":         {"name_en": "Grass family", "name_de": "Süßgräser"},
    "Lamiaceae":       {"name_en": "Mint family", "name_de": "Lippenblütler"},
    "Caprifoliaceae":  {"name_en": "Honeysuckle family", "name_de": "Geißblattgewächse"},
    "Rosaceae":        {"name_en": "Rose family", "name_de": "Rosengewächse"},
    "Grossulariaceae": {"name_en": "Gooseberry family", "name_de": "Stachelbeergewächse"},
    "Ericaceae":       {"name_en": "Heath family", "name_de": "Heidekrautgewächse"},
    "Moraceae":        {"name_en": "Mulberry family", "name_de": "Maulbeergewächse"},
    # New families not yet in the DB:
    "Hypericaceae":    {"name_en": "St John's wort family", "name_de": "Johanniskrautgewächse"},
    "Tropaeolaceae":   {"name_en": "Nasturtium family", "name_de": "Kapuzinerkressengewächse"},
    "Elaeagnaceae":    {"name_en": "Oleaster family", "name_de": "Ölweidengewächse"},
 }
 # ── Species to add ───────────────────────────────────────────────────
 # Format: (scientific_name, family, name_en, name_de, plant_layer, extra_fields)
 # main() skips names the API already lists, resolves `family` to a family id,
 # and copies every key of `extra_fields` into the POST /species payload.
 SPECIES = [
    # Vegetables
    ("Phaseolus vulgaris", "Fabaceae", "common bean", "Gartenbohne", "herbaceous",
     {"nitrogen_fixer": True, "food_uses": "Beans (pods, seeds)"}),
    ("Phaseolus coccineus", "Fabaceae", "runner bean", "Feuerbohne", "herbaceous",
     {"nitrogen_fixer": True, "food_uses": "Beans (pods, seeds), flowers", "attracts_pollinators": True}),
    ("Pisum sativum", "Fabaceae", "pea", "Erbse", "herbaceous",
     {"nitrogen_fixer": True, "food_uses": "Peas, shoots"}),
    ("Capsicum annuum", "Solanaceae", "pepper", "Paprika", "herbaceous",
     {"food_uses": "Fruit"}),
    ("Cucumis sativus", "Cucurbitaceae", "cucumber", "Gurke", "ground_cover",
     {"food_uses": "Fruit"}),
    ("Cucurbita maxima", "Cucurbitaceae", "winter squash", "Riesenkürbis", "ground_cover",
     {"food_uses": "Fruit, seeds, flowers"}),
    ("Cucurbita moschata", "Cucurbitaceae", "butternut squash", "Moschuskürbis", "ground_cover",
     {"food_uses": "Fruit, seeds"}),
    ("Lactuca sativa", "Asteraceae", "lettuce", "Salat", "herbaceous",
     {"food_uses": "Leaves"}),
    ("Spinacia oleracea", "Chenopodiaceae", "spinach", "Spinat", "herbaceous",
     {"food_uses": "Leaves"}),
    ("Brassica oleracea", "Brassicaceae", "cabbage / kale", "Kohl", "herbaceous",
     {"food_uses": "Leaves, flower buds, stems"}),
    ("Brassica rapa", "Brassicaceae", "turnip", "Rübe", "herbaceous",
     {"food_uses": "Root, leaves"}),
    ("Raphanus sativus", "Brassicaceae", "radish", "Rettich", "herbaceous",
     {"food_uses": "Root, leaves, seed pods"}),
    ("Allium cepa", "Amaryllidaceae", "onion", "Zwiebel", "herbaceous",
     {"food_uses": "Bulb, leaves"}),
    ("Allium sativum", "Amaryllidaceae", "garlic", "Knoblauch", "herbaceous",
     {"food_uses": "Bulb, scapes", "medicinal_uses": "Antimicrobial, cardiovascular"}),
    ("Allium schoenoprasum", "Amaryllidaceae", "chives", "Schnittlauch", "herbaceous",
     {"food_uses": "Leaves, flowers", "attracts_pollinators": True}),
    ("Petroselinum crispum", "Apiaceae", "parsley", "Petersilie", "herbaceous",
     {"food_uses": "Leaves, root"}),
    ("Apium graveolens", "Apiaceae", "celery", "Sellerie", "herbaceous",
     {"food_uses": "Stalks, root, leaves"}),
    ("Foeniculum vulgare", "Apiaceae", "fennel", "Fenchel", "herbaceous",
     {"food_uses": "Bulb, fronds, seeds", "attracts_beneficial_insects": True}),
    ("Pastinaca sativa", "Apiaceae", "parsnip", "Pastinake", "herbaceous",
     {"food_uses": "Root"}),
    ("Zea mays", "Poaceae", "corn", "Mais", "herbaceous",
     {"food_uses": "Kernels, cobs"}),
    ("Solanum melongena", "Solanaceae", "eggplant", "Melanzani", "herbaceous",
     {"food_uses": "Fruit"}),
    # Herbs
    ("Ocimum basilicum", "Lamiaceae", "basil", "Basilikum", "herbaceous",
     {"food_uses": "Leaves", "attracts_pollinators": True}),
    ("Origanum vulgare", "Lamiaceae", "oregano", "Oregano", "herbaceous",
     {"food_uses": "Leaves", "attracts_pollinators": True, "attracts_beneficial_insects": True}),
    ("Mentha x piperita", "Lamiaceae", "peppermint", "Pfefferminze", "herbaceous",
     {"food_uses": "Leaves (tea, culinary)", "medicinal_uses": "Digestive, headache relief", "invasiveness": "spreading"}),
    ("Rosmarinus officinalis", "Lamiaceae", "rosemary", "Rosmarin", "herbaceous",
     {"food_uses": "Leaves", "attracts_pollinators": True}),
    ("Anethum graveolens", "Apiaceae", "dill", "Dill", "herbaceous",
     {"food_uses": "Leaves, seeds", "attracts_beneficial_insects": True}),
    ("Coriandrum sativum", "Apiaceae", "coriander", "Koriander", "herbaceous",
     {"food_uses": "Leaves, seeds", "attracts_beneficial_insects": True}),
    ("Artemisia absinthium", "Asteraceae", "wormwood", "Wermut", "herbaceous",
     {"medicinal_uses": "Digestive, anti-parasitic", "other_uses": "Companion plant pest deterrent", "allelopathic": True}),
    ("Achillea millefolium", "Asteraceae", "yarrow", "Schafgarbe", "herbaceous",
     {"food_uses": "Young leaves (salad)", "medicinal_uses": "Wound healing, anti-inflammatory",
      "dynamic_accumulator": True, "dynamic_accumulator_nutrients": "K, P, Cu",
      "attracts_beneficial_insects": True, "attracts_pollinators": True}),
    ("Hypericum perforatum", "Hypericaceae", "St John's wort", "Johanniskraut", "herbaceous",
     {"medicinal_uses": "Antidepressant, wound healing", "attracts_pollinators": True}),
    ("Echinacea purpurea", "Asteraceae", "echinacea", "Sonnenhut", "herbaceous",
     {"medicinal_uses": "Immune stimulant", "attracts_pollinators": True, "wildlife_value": "Seeds for birds"}),
    ("Valeriana officinalis", "Caprifoliaceae", "valerian", "Baldrian", "herbaceous",
     {"medicinal_uses": "Sedative, sleep aid", "attracts_pollinators": True,
      "other_uses": "Earthworm attractant (biodynamic)"}),
    # Flowers & cover crops
    ("Tagetes patula", "Asteraceae", "French marigold", "Studentenblume", "herbaceous",
     {"other_uses": "Nematode suppression, companion plant", "attracts_pollinators": True}),
    ("Helianthus annuus", "Asteraceae", "sunflower", "Sonnenblume", "herbaceous",
     {"food_uses": "Seeds, oil", "attracts_pollinators": True, "wildlife_value": "Seeds for birds"}),
    ("Tropaeolum majus", "Tropaeolaceae", "nasturtium", "Kapuzinerkresse", "ground_cover",
     {"food_uses": "Leaves, flowers, seeds (capers)", "other_uses": "Trap crop for aphids"}),
    ("Centaurea cyanus", "Asteraceae", "cornflower", "Kornblume", "herbaceous",
     {"food_uses": "Flowers (edible garnish)", "attracts_pollinators": True, "attracts_beneficial_insects": True}),
    ("Sinapis alba", "Brassicaceae", "white mustard", "Weißer Senf", "herbaceous",
     {"food_uses": "Seeds, young leaves", "other_uses": "Green manure, biofumigant"}),
    ("Trifolium repens", "Fabaceae", "white clover", "Weißklee", "ground_cover",
     {"nitrogen_fixer": True, "food_uses": "Flowers (tea), young leaves",
      "ground_cover_quality": "excellent", "attracts_pollinators": True}),
    ("Medicago sativa", "Fabaceae", "alfalfa", "Luzerne", "herbaceous",
     {"nitrogen_fixer": True, "food_uses": "Sprouts",
      "dynamic_accumulator": True, "dynamic_accumulator_nutrients": "N, K, Ca, Mg, Fe",
      "other_uses": "Green manure, deep-rooting soil improver"}),
    # Fruit / Trees
    ("Prunus avium", "Rosaceae", "sweet cherry", "Süßkirsche", "canopy",
     {"food_uses": "Fruit", "attracts_pollinators": True, "wildlife_value": "Fruit for birds"}),
    ("Prunus cerasus", "Rosaceae", "sour cherry", "Sauerkirsche", "understory",
     {"food_uses": "Fruit (cooking, preserves)", "attracts_pollinators": True}),
    ("Pyrus communis", "Rosaceae", "pear", "Birne", "canopy",
     {"food_uses": "Fruit", "attracts_pollinators": True}),
    ("Ribes uva-crispa", "Grossulariaceae", "gooseberry", "Stachelbeere", "shrub",
     {"food_uses": "Berries"}),
    ("Rubus fruticosus", "Rosaceae", "blackberry", "Brombeere", "shrub",
     {"food_uses": "Berries, leaves (tea)", "attracts_pollinators": True,
      "wildlife_value": "Berries for birds, nesting habitat", "invasiveness": "spreading"}),
    ("Vaccinium myrtillus", "Ericaceae", "bilberry", "Heidelbeere", "shrub",
     {"food_uses": "Berries", "medicinal_uses": "Antioxidant, eye health"}),
    ("Hippophae rhamnoides", "Elaeagnaceae", "sea buckthorn", "Sanddorn", "shrub",
     {"nitrogen_fixer": True, "food_uses": "Berries (juice, oil)",
      "medicinal_uses": "High vitamin C, skin care",
      "other_uses": "Erosion control, windbreak"}),
    ("Morus alba", "Moraceae", "white mulberry", "Weiße Maulbeere", "canopy",
     {"food_uses": "Fruit, young leaves", "wildlife_value": "Fruit for birds"}),
 ]
 def main():
    """Create the missing plant families and catalogue species in HerbAPI.

    Loads the families and species the API already has, POSTs every entry of
    FAMILIES_NEEDED and SPECIES that is absent, and prints per-item progress
    followed by a summary.
    """
    # 1. Load existing families
    print("=== Loading existing families ===")
    families_page = api_get("/families?per_page=100")
    # name_scientific -> id
    family_map = {row["name_scientific"]: row["id"] for row in families_page["data"]}
    print(f"  Found {len(family_map)} existing families")
    # 2. Create missing families
    print("\n=== Creating missing families ===")
    families_created = 0
    for fam_name, fam_info in FAMILIES_NEEDED.items():
        if fam_name in family_map:
            print(f"  SKIP (exists): {fam_name}")
            continue
        family_payload = dict(
            name_scientific=fam_name,
            name_en=fam_info["name_en"],
            name_de=fam_info["name_de"],
        )
        print(f"  CREATE: {fam_name} ...", end=" ")
        reply, http_status = api_post("/families", family_payload)
        if not (reply and "id" in reply):
            print(f"FAILED (status={http_status})")
        else:
            # Remember the new id so species of this family can reference it.
            family_map[fam_name] = reply["id"]
            print(f"OK ({reply['id']})")
            families_created += 1
        time.sleep(DELAY)
    print(f"\n  Families created: {families_created}")
    # 3. Load existing species
    print("\n=== Loading existing species ===")
    species_page = api_get("/species?per_page=200")
    existing_species = {row["name_scientific"] for row in species_page["data"]}
    print(f"  Found {len(existing_species)} existing species")
    # 4. Add new species
    print("\n=== Adding new species ===")
    created = skipped = failed = 0
    for sci_name, family, name_en, name_de, plant_layer, extras in SPECIES:
        if sci_name in existing_species:
            print(f"  SKIP (exists): {sci_name}")
            skipped += 1
            continue
        # Look up family ID
        fam_id = family_map.get(family)
        if not fam_id:
            print(f"  SKIP (no family '{family}'): {sci_name}")
            failed += 1
            continue
        # The GBIF name is only logged for comparison; the curated name_de
        # from SPECIES is what gets stored.
        gbif_de = gbif_get_german_name(sci_name)
        if gbif_de:
            print(f"  GBIF name for {sci_name}: {gbif_de}")
        # Base fields first, then the per-species extras on top.
        species_payload = {
            "name_scientific": sci_name,
            "family_id": fam_id,
            "name_en": name_en,
            "name_de": name_de,
            "plant_layer": plant_layer,
            **extras,
        }
        print(f"  CREATE: {sci_name} ({name_de}) ...", end=" ")
        reply, http_status = api_post("/species", species_payload)
        if not (reply and "id" in reply):
            print(f"FAILED (status={http_status})")
            failed += 1
        else:
            print(f"OK ({reply['id']})")
            created += 1
        time.sleep(DELAY)
    print(f"\n{'='*50}")
    print("SUMMARY")
    print(f"  Families created: {families_created}")
    print(f"  Species created:  {created}")
    print(f"  Species skipped:  {skipped}")
    print(f"  Species failed:   {failed}")
    print(f"  Total species now: {len(existing_species) + created}")
 if __name__ == "__main__":
    main()
@@ -0,0 +1,362 @@
 #!/usr/bin/env python3
 """Import CC-licensed plant images from Wikimedia Commons via Wikidata into HerbAPI."""
 import json
 import os
 import re
 import subprocess
 import sys
 import time
 import urllib.parse
 import urllib.request
 # Switch stdout/stderr to line buffering so every printed line is flushed
 # as soon as it is complete.
 sys.stdout.reconfigure(line_buffering=True)
 sys.stderr.reconfigure(line_buffering=True)
 # --- Configuration ---
 # S3 target that s3_upload() passes to the AWS CLI.
 # NOTE(review): the access key, secret key and database password below are
 # hard-coded in a committed file — consider loading them from the environment
 # and rotating them.
 S3_ENDPOINT = "http://garage.sub-net.at:3900"
 S3_BUCKET = "herbapi"
 S3_ACCESS_KEY = "GK1a89859373a6ac56bf11958f"
 S3_SECRET_KEY = "bea45a333b5c7b1efdd7466bdbcac54d8642fa19f0c617ca2fd64bd07951b899"
 S3_REGION = "garage"
 # Connection settings that psql() hands to the psql command-line client.
 DB_HOST = "10.31.3.90"
 DB_USER = "herbapi"
 DB_PASS = "_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj"
 DB_NAME = "herbapi"
 # User-Agent header sent by fetch_json() and download_image().
 USER_AGENT = "HerbAPI/1.0 (https://herbapi.naturalised.at; florian.berthold@sub-net.at)"
 # Thumbnail width requested from the Commons API (sent as iiurlwidth).
 THUMB_WIDTH = 800
 # Seconds main() sleeps before each Wikidata/Commons/download request.
 REQUEST_DELAY = 0.3
 # Lower-case license names that is_license_allowed() accepts by exact match.
 ALLOWED_LICENSES = {
    "cc0", "cc-zero", "cc0 1.0", "cc-zero 1.0",
    "public domain", "pd", "pd-self", "pd-old", "pd-old-auto", "pd-old-100",
    "pd-us", "pd-usgov", "pd-author",
    "cc by 1.0", "cc by 2.0", "cc by 2.5", "cc by 3.0", "cc by 4.0",
    "cc-by-1.0", "cc-by-2.0", "cc-by-2.5", "cc-by-3.0", "cc-by-4.0",
    "cc by-sa 1.0", "cc by-sa 2.0", "cc by-sa 2.5", "cc by-sa 3.0", "cc by-sa 4.0",
    "cc-by-sa-1.0", "cc-by-sa-2.0", "cc-by-sa-2.5", "cc-by-sa-3.0", "cc-by-sa-4.0",
 }
 def slugify(name: str) -> str:
    """Build a slug: lower-case, with each run of non [a-z0-9] characters
    turned into a single hyphen and no hyphen at either end."""
    # Splitting on the separator runs leaves empty pieces only at the edges.
    pieces = re.split(r'[^a-z0-9]+', name.lower())
    return "-".join(piece for piece in pieces if piece)
 def psql(query: str) -> str:
    """Execute *query* through the psql command-line client.

    The password is supplied via the PGPASSWORD environment variable. The
    flags -t -A make psql print tuples only, unaligned. Returns the stripped
    stdout; when psql exits non-zero its stderr is echoed to stderr and
    whatever stdout was produced is still returned.
    """
    completed = subprocess.run(
        ["psql", "-h", DB_HOST, "-U", DB_USER, DB_NAME, "-t", "-A", "-c", query],
        capture_output=True,
        text=True,
        env={**os.environ, "PGPASSWORD": DB_PASS},
    )
    if completed.returncode:
        print(f"  psql error: {completed.stderr.strip()}", file=sys.stderr)
    return completed.stdout.strip()
 def fetch_json(url: str) -> dict | None:
    """GET *url* with the project User-Agent and decode the body as JSON.

    Any failure while fetching or decoding is printed and turned into None.
    """
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        response = urllib.request.urlopen(request, timeout=30)
        with response:
            return json.loads(response.read())
    except Exception as exc:
        print(f"  HTTP error fetching {url}: {exc}")
    return None
 def get_wikidata_image(qid: str) -> str | None:
    """Query Wikidata SPARQL for P18 image filename."""
    sparql = f"SELECT ?image WHERE {{ wd:{qid} wdt:P18 ?image }} LIMIT 1"
    url = "https://query.wikidata.org/sparql?" + urllib.parse.urlencode({
        "query": sparql, "format": "json"
    })
    data = fetch_json(url)
    if not data:
        return None
    bindings = data.get("results", {}).get("bindings", [])
    if not bindings:
        return None
    image_url = bindings[0]["image"]["value"]
    # URL like http://commons.wikimedia.org/wiki/Special:FilePath/Filename.jpg
    filename = urllib.parse.unquote(image_url.rsplit("/", 1)[-1])
    return filename
 def get_commons_info(filename: str) -> dict | None:
    """Get thumbnail URL, license and author of a file from the Commons API.

    Args:
        filename: Commons file name without the "File:" prefix.

    Returns:
        Dict with the keys thumb_url, description_url, license, artist and
        filename, built from the first page of the reply. None when the
        request fails, that page has id "-1", or it carries no imageinfo.
    """
    import html  # local import: html is not part of this file's import block
    url = "https://commons.wikimedia.org/w/api.php?" + urllib.parse.urlencode({
        "action": "query",
        "titles": f"File:{filename}",
        "prop": "imageinfo",
        "iiprop": "url|extmetadata",
        "iiurlwidth": str(THUMB_WIDTH),
        "format": "json",
    })
    data = fetch_json(url)
    if not data:
        return None
    pages = data.get("query", {}).get("pages", {})
    # Only one title is requested, so the first page decides the result.
    for page_id, page in pages.items():
        if page_id == "-1":
            return None
        imageinfo = page.get("imageinfo", [])
        if not imageinfo:
            return None
        info = imageinfo[0]
        meta = info.get("extmetadata", {})
        artist_html = meta.get("Artist", {}).get("value", "")
        # Drop markup first, then decode entities (so "&amp;" is stored as "&"
        # and an escaped "&lt;" is not mistaken for a tag), then tidy spaces.
        artist = re.sub(r'<[^>]+>', '', artist_html)
        artist = re.sub(r'\s+', ' ', html.unescape(artist)).strip()
        return {
            # Fall back to the full-size URL when no thumbnail was rendered.
            "thumb_url": info.get("thumburl") or info.get("url"),
            "description_url": info.get("descriptionurl", ""),
            "license": meta.get("LicenseShortName", {}).get("value", ""),
            "artist": artist,
            "filename": filename,
        }
    return None
 def is_license_allowed(license_str: str) -> bool:
    """Check whether a license name is one we may import.

    Allowed are public-domain and "pd..." names, CC0, CC BY and CC BY-SA in
    any version, plus the exact names in ALLOWED_LICENSES. Anything carrying
    a NonCommercial (NC) or NoDerivatives (ND) element is rejected, as is an
    empty or missing name.
    """
    if not license_str:
        return False
    normalized = license_str.lower().strip()
    # Match NC/ND only as standalone elements. A plain substring test also
    # fires inside unrelated words ("nd" in "pd-finland") and wrongly rejects
    # them.
    if re.search(r'(?<![a-z])n[cd](?![a-z])', normalized):
        return False
    if re.search(r'non-?commercial|no-?deriv', normalized):
        return False
    # Check patterns
    if normalized.startswith(("public domain", "pd")):
        return True
    if re.match(r'^cc[- ]?by[- ]?sa[- ]?\d', normalized):
        return True
    if re.match(r'^cc[- ]?by[- ]?\d', normalized):
        return True
    if re.match(r'^cc[- ]?0', normalized) or normalized == "cc zero":
        return True
    # Exact names not covered by a pattern above (e.g. "cc-zero"). No entry
    # of the set carries NC/ND, so checking it last gives the same result.
    return normalized in ALLOWED_LICENSES
 def normalize_license(license_str: str) -> str:
    """Map a license name to its canonical stored spelling.

    Public-domain variants become "Public domain", CC0 variants "CC0 1.0",
    and CC BY / CC BY-SA names "CC BY <version>" / "CC BY-SA <version>".
    Any other name is returned exactly as passed in.
    """
    key = license_str.lower().strip()
    if key.startswith("pd") or "public domain" in key:
        return "Public domain"
    looks_like_cc0 = re.match(r'^cc[- ]?0', key) is not None
    if looks_like_cc0 or any(marker in key for marker in ("cc-zero", "cc zero")):
        return "CC0 1.0"
    # BY-SA has to be tried before plain BY.
    families = (
        ("CC BY-SA", r'^cc[- ]?by[- ]?sa[- ]?(\d+\.?\d*)'),
        ("CC BY", r'^cc[- ]?by[- ]?(\d+\.?\d*)'),
    )
    for label, pattern in families:
        hit = re.match(pattern, key)
        if hit:
            return f"{label} {hit.group(1)}"
    return license_str
 def s3_upload(s3_key: str, data: bytes, content_type: str = "image/jpeg"):
    """Upload *data* to the S3 bucket under *s3_key* using the AWS CLI.

    The bytes are written to a temporary file, copied with `aws s3 cp`
    against S3_ENDPOINT, and the temporary file is removed afterwards.

    Args:
        s3_key: Object key inside S3_BUCKET.
        data: Raw object content.
        content_type: Value passed to the CLI's --content-type option.

    Raises:
        RuntimeError: When the aws command exits with a non-zero status.
    """
    import tempfile  # local import: tempfile is not in this file's import block
    env = os.environ.copy()
    env["AWS_ACCESS_KEY_ID"] = S3_ACCESS_KEY
    env["AWS_SECRET_ACCESS_KEY"] = S3_SECRET_KEY
    env["AWS_DEFAULT_REGION"] = S3_REGION
    # A unique, privately created temp file replaces the fixed /tmp path, so
    # parallel runs cannot clobber each other and the name is not predictable.
    fd, tmp_path = tempfile.mkstemp(prefix="herbapi_upload_")
    try:
        with os.fdopen(fd, "wb") as f:
            f.write(data)
        result = subprocess.run(
            [
                "aws", "s3", "cp", tmp_path,
                f"s3://{S3_BUCKET}/{s3_key}",
                "--endpoint-url", S3_ENDPOINT,
                "--content-type", content_type,
            ],
            capture_output=True, text=True, env=env
        )
    finally:
        # Remove the temp file even when writing or launching the CLI raises.
        os.unlink(tmp_path)
    if result.returncode != 0:
        raise RuntimeError(f"S3 upload failed: {result.stderr.strip()}")
 def download_image(url: str) -> bytes | None:
    """Fetch the raw bytes behind *url* using the project User-Agent.

    Any failure while downloading is printed and turned into None.
    """
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        response = urllib.request.urlopen(request, timeout=60)
        with response:
            return response.read()
    except Exception as err:
        print(f"  Download error: {err}")
    return None
 def main():
    """Import one Commons image per species that has a Wikidata QID.

    For every species without an image row: look up the P18 file on Wikidata,
    fetch its Commons metadata, check the license, download the thumbnail,
    upload it to S3 and insert a row into the images table. Progress and a
    summary are printed to stdout.
    """
    # 1. Get species
    rows = psql(
        "SELECT id, name_scientific, wikidata_qid FROM species "
        "WHERE wikidata_qid IS NOT NULL AND wikidata_qid <> '' "
        "ORDER BY name_scientific"
    )
    if not rows:
        print("No species with wikidata_qid found.")
        return
    # psql prints one "id|name|qid" line per row; lines of another shape are ignored.
    species_list = []
    for line in rows.split("\n"):
        parts = line.split("|")
        if len(parts) == 3:
            species_list.append({
                "id": parts[0],
                "name": parts[1],
                "qid": parts[2],
            })
    print(f"Found {len(species_list)} species with Wikidata QIDs.")
    # 2. Get existing images
    existing_rows = psql("SELECT entity_id FROM images WHERE entity_type = 'species'")
    existing = {line.strip() for line in existing_rows.split("\n") if line.strip()}
    print(f"Found {len(existing)} species that already have images.")
    imported = 0
    skipped_existing = 0
    skipped_no_image = 0
    skipped_license = 0
    skipped_download = 0
    errors = 0
    content_types = {
        "jpg": "image/jpeg",
        "png": "image/png",
        "svg": "image/svg+xml",
        "gif": "image/gif",
    }
    def sql_quote(value):
        # Double single quotes so the value stays inside its SQL string literal.
        return "'" + str(value).replace("'", "''") + "'"
    for i, sp in enumerate(species_list):
        name = sp["name"]
        qid = sp["qid"]
        sp_id = sp["id"]
        slug = slugify(name)
        print(f"\n[{i+1}/{len(species_list)}] {name} ({qid})")
        if sp_id in existing:
            print("  Already has image, skipping.")
            skipped_existing += 1
            continue
        # Query Wikidata for image
        time.sleep(REQUEST_DELAY)
        filename = get_wikidata_image(qid)
        if not filename:
            print("  No image on Wikidata.")
            skipped_no_image += 1
            continue
        # Get Commons info
        time.sleep(REQUEST_DELAY)
        info = get_commons_info(filename)
        if not info:
            print(f"  Could not get Commons info for {filename}")
            skipped_no_image += 1
            continue
        # Check license
        raw_license = info["license"]
        if not is_license_allowed(raw_license):
            print(f"  License not allowed: {raw_license}")
            skipped_license += 1
            continue
        norm_license = normalize_license(raw_license)
        artist = info["artist"]
        thumb_url = info["thumb_url"]
        desc_url = info["description_url"]
        # The Commons reply may carry neither a thumbnail nor a file URL;
        # slicing None below would crash the whole run.
        if not thumb_url:
            print(f"  No downloadable URL for {filename}")
            skipped_no_image += 1
            continue
        print(f"  License: {raw_license} -> {norm_license}")
        print(f"  Artist: {artist[:80]}")
        print(f"  Thumbnail: {thumb_url[:100]}...")
        # Download image
        time.sleep(REQUEST_DELAY)
        image_data = download_image(thumb_url)
        if not image_data:
            print("  Failed to download image.")
            skipped_download += 1
            continue
        print(f"  Downloaded {len(image_data)} bytes")
        # Determine file extension from URL (first marker found wins, default jpg)
        lowered_url = thumb_url.lower()
        ext = "jpg"
        for candidate in ("png", "svg", "gif"):
            if f".{candidate}" in lowered_url:
                ext = candidate
                break
        s3_key = f"species/{slug}.{ext}"
        content_type = content_types.get(ext, "image/jpeg")
        # Upload to S3
        try:
            s3_upload(s3_key, image_data, content_type)
            print(f"  Uploaded to s3://{S3_BUCKET}/{s3_key}")
        except RuntimeError as e:
            print(f"  S3 upload failed: {e}")
            errors += 1
            continue
        # Insert into database
        caption = f"Photo: {artist}" if artist else "Wikimedia Commons"
        # NOTE(review): caption and source URL come from Commons metadata and
        # are embedded as quoted SQL literals; a parameterised insert would be
        # safer than string building.
        insert_sql = (
            "INSERT INTO images (id, entity_type, entity_id, s3_key, caption, source_url, license, is_primary) "
            f"VALUES (gen_random_uuid(), 'species', {sql_quote(sp_id)}, {sql_quote(s3_key)}, "
            f"{sql_quote(caption)}, {sql_quote(desc_url)}, {sql_quote(norm_license)}, true) "
            "RETURNING id"
        )
        # RETURNING makes psql print the new id, so empty output means the
        # insert failed (psql() already echoed the error to stderr).
        result = psql(insert_sql)
        if not result:
            # The object uploaded above stays in S3 without a database row.
            print("  Database insert failed.")
            errors += 1
            continue
        print("  Inserted into images table.")
        imported += 1
    print(f"\n{'='*60}")
    print("DONE!")
    print(f"  Imported:          {imported}")
    print(f"  Skipped (existing):{skipped_existing}")
    print(f"  Skipped (no image):{skipped_no_image}")
    print(f"  Skipped (license): {skipped_license}")
    print(f"  Skipped (download):{skipped_download}")
    print(f"  Errors:            {errors}")
    print(f"  Total processed:   {len(species_list)}")
 if __name__ == "__main__":
    main()
@@ -0,0 +1,290 @@
 #!/usr/bin/env python3
 """Import CC-licensed plant images from Wikimedia Commons into HerbAPI."""
 import hashlib
 import json
 import os
 import re
 import subprocess
 import sys
 import time
 import urllib.parse
 import urllib.request
 # Config
 # Connection settings can be overridden through HERBAPI_* environment variables
 # so that credentials do not have to be edited in (or read from) this file.
 # NOTE(review): the fallback values below look like live credentials committed
 # to source control -- rotate them and drop the defaults once the environment
 # variables are set wherever this script runs.
 DB_HOST = os.environ.get("HERBAPI_DB_HOST", "10.31.3.90")
 DB_USER = os.environ.get("HERBAPI_DB_USER", "herbapi")
 DB_PASS = os.environ.get("HERBAPI_DB_PASS", "_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj")
 DB_NAME = os.environ.get("HERBAPI_DB_NAME", "herbapi")
 S3_BUCKET = os.environ.get("HERBAPI_S3_BUCKET", "herbapi")
 S3_ENDPOINT = os.environ.get("HERBAPI_S3_ENDPOINT", "http://10.31.3.170:3900")
 # Sent with every Wikidata/Commons request made through fetch_url().
 USER_AGENT = "HerbAPI/1.0 (https://herbapi.naturalised.at; florian.berthold@sub-net.at)"
 # Pause (seconds) inserted between consecutive remote requests.
 REQUEST_DELAY = 0.3
 # AWS env for subprocess calls
 # The S3 keys use HERBAPI_-prefixed overrides on purpose: an ambient
 # AWS_ACCESS_KEY_ID in the caller's shell is still replaced, as before.
 AWS_ENV = {
    **os.environ,
    "AWS_ACCESS_KEY_ID": os.environ.get(
        "HERBAPI_S3_ACCESS_KEY_ID", "GK1a89859373a6ac56bf11958f"
    ),
    "AWS_SECRET_ACCESS_KEY": os.environ.get(
        "HERBAPI_S3_SECRET_ACCESS_KEY",
        "bea45a333b5c7b1efdd7466bdbcac54d8642fa19f0c617ca2fd64bd07951b899",
    ),
    "AWS_DEFAULT_REGION": "garage",
 }
 # Stats
 # Per-run counters updated by process_species() and printed by main().
 stats = {"total": 0, "imported": 0, "no_p18": 0, "bad_license": 0, "download_fail": 0, "upload_fail": 0, "errors": 0}
 def fetch_url(url):
    """Download *url* and return the raw response body as bytes.

    The request carries the module-level ``USER_AGENT`` header and is
    abandoned after 30 seconds.
    """
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    with urllib.request.urlopen(request, timeout=30) as response:
        payload = response.read()
    return payload
 def fetch_json(url):
    """Download *url* with ``fetch_url`` and return the decoded JSON document."""
    raw_body = fetch_url(url)
    return json.loads(raw_body)
 def psql(sql):
    """Run one SQL command through the ``psql`` CLI and return its output.

    The command runs with ``-t -A`` (tuples only, unaligned), so rows come
    back one per line with ``|`` between columns. The password is passed via
    the ``PGPASSWORD`` environment variable.

    Args:
        sql: SQL text handed to ``psql -c``.

    Returns:
        The stripped standard output of psql.

    Raises:
        RuntimeError: if psql exits with a non-zero status (connection
            problem or failed statement). Previously such failures were
            silently returned as empty output, so callers could not tell a
            failed INSERT from a successful one.
    """
    result = subprocess.run(
        ["psql", "-h", DB_HOST, "-U", DB_USER, DB_NAME, "-t", "-A", "-c", sql],
        capture_output=True, text=True,
        env={**os.environ, "PGPASSWORD": DB_PASS},
    )
    if result.returncode != 0:
        raise RuntimeError(
            f"psql exited with status {result.returncode}: {result.stderr.strip()}"
        )
    return result.stdout.strip()
 def is_license_allowed(license_str):
    """Return True when *license_str* names a license this importer accepts.

    Accepted: CC0, Public Domain (including names starting with "PD"),
    CC BY and CC BY-SA in any version. Anything carrying an NC or ND
    component is rejected before the positive checks run, and every other
    string (GFDL, FAL, empty, None, ...) is rejected as well.
    Matching is case-insensitive and ignores surrounding whitespace.
    """
    if not license_str:
        return False
    normalized = license_str.lower().strip()
    words = normalized.split()
    # Non-commercial / no-derivatives markers may appear as separate words
    # ("cc by nc") or hyphenated ("cc by-nc"); either form disqualifies.
    restricted = (
        "nc" in words
        or "-nc" in normalized
        or "nd" in words
        or "-nd" in normalized
    )
    if restricted:
        return False
    if normalized in {"cc0", "cc-zero", "cc0 1.0", "cc0 1.0 universal"}:
        return True
    if normalized.startswith("pd") or "public domain" in normalized:
        return True
    # CC BY-SA first, then plain CC BY (any version, any jurisdiction).
    accepted_patterns = (r"cc\s+by-sa\b", r"cc\s+by\b")
    return any(re.match(pattern, normalized) for pattern in accepted_patterns)
 def get_wikidata_image(qid):
    """Look up the P18 (image) statement of Wikidata item *qid* via SPARQL.

    Returns the URL-decoded last path segment of the first image URL, or
    ``None`` when the query yields no bindings.
    """
    query = f"SELECT ?image WHERE {{ wd:{qid} wdt:P18 ?image }} LIMIT 1"
    endpoint = f"https://query.wikidata.org/sparql?query={urllib.parse.quote(query)}&format=json"
    rows = fetch_json(endpoint).get("results", {}).get("bindings", [])
    if rows:
        # The file name is whatever follows the final "/" of the image URL.
        last_segment = rows[0]["image"]["value"].split("/")[-1]
        return urllib.parse.unquote(last_segment)
    return None
 def get_commons_info(filename):
    """Get image info from the Commons API: license, artist, thumbnail URL.

    Args:
        filename: Commons file name without the ``File:`` prefix.

    Returns:
        A dict with the keys ``license``, ``artist``, ``thumb_url``,
        ``desc_url`` and ``filename`` for the first page in the response, or
        ``None`` when the API reports page id ``-1`` or returns no pages.
        Missing metadata fields come back as empty strings.
    """
    import html  # local import: only needed to decode entities in the artist credit

    title = f"File:{filename}"
    url = (
        f"https://commons.wikimedia.org/w/api.php?action=query"
        f"&titles={urllib.parse.quote(title)}"
        f"&prop=imageinfo&iiprop=url|extmetadata"
        f"&iiurlwidth=800&format=json"
    )
    data = fetch_json(url)
    pages = data.get("query", {}).get("pages", {})
    for page_id, page in pages.items():
        if page_id == "-1":
            return None
        # "imageinfo" may be absent *or* an empty list; treat both as "no
        # metadata" instead of raising IndexError on the empty list.
        imageinfo = (page.get("imageinfo") or [{}])[0]
        meta = imageinfo.get("extmetadata", {})
        license_short = meta.get("LicenseShortName", {}).get("value", "").strip()
        artist_html = meta.get("Artist", {}).get("value", "")
        # Clean up artist: strip HTML tags, then decode entities such as
        # "&amp;" or "&nbsp;" that would otherwise end up in the caption.
        artist = html.unescape(re.sub(r"<[^>]+>", "", artist_html))
        # Collapse whitespace (including non-breaking spaces from &nbsp;)
        artist = re.sub(r"\s+", " ", artist).strip()
        if len(artist) > 120:
            artist = artist[:117] + "..."
        # Use the API-provided thumbnail URL (iiurlwidth=800)
        thumb_url = imageinfo.get("thumburl", "")
        # Also get the description URL
        desc_url = imageinfo.get("descriptionurl", "")
        return {
            "license": license_short,
            "artist": artist,
            "thumb_url": thumb_url,
            "desc_url": desc_url,
            "filename": filename,
        }
    return None
 def _sql_literal(value):
    """Return *value* as a single-quoted SQL string literal with quotes doubled."""
    return "'" + str(value).replace("'", "''") + "'"


 def _remove_quietly(path):
    """Delete *path* if one was created, ignoring files that are already gone."""
    if not path:
        return
    try:
        os.unlink(path)
    except OSError:
        pass


 def process_species(species_id, slug, name_sci, qid):
    """Process a single species: fetch image from Wikidata/Commons, upload to S3, insert to DB.

    Args:
        species_id: value stored in ``images.entity_id``.
        slug: used to build the S3 key ``species/<slug>.<ext>``.
        name_sci: scientific name; accepted for the caller's convenience but
            not used here.
        qid: Wikidata item id whose P18 image is imported.

    Returns:
        True when the image was uploaded and the INSERT was issued, False
        otherwise. Every outcome increments the matching counter in ``stats``.
    """
    import tempfile  # local import: only this function needs a temp file

    stats["total"] += 1
    # Step 1: Get image filename from Wikidata
    try:
        filename = get_wikidata_image(qid)
    except Exception as e:
        print(f"  ERROR querying Wikidata for {qid}: {e}")
        stats["errors"] += 1
        return False
    time.sleep(REQUEST_DELAY)
    if not filename:
        print(f"  No P18 image for {qid}")
        stats["no_p18"] += 1
        return False
    # Step 2: Get Commons info (license, artist, thumb URL)
    try:
        info = get_commons_info(filename)
    except Exception as e:
        print(f"  ERROR querying Commons for {filename}: {e}")
        stats["errors"] += 1
        return False
    time.sleep(REQUEST_DELAY)
    if not info:
        print(f"  No Commons info for {filename}")
        stats["errors"] += 1
        return False
    # Step 3: Check license
    if not is_license_allowed(info["license"]):
        print(f"  Bad license: {info['license']} for {filename}")
        stats["bad_license"] += 1
        return False
    # Step 4: Download thumbnail using API-provided URL
    thumb_url = info["thumb_url"]
    if not thumb_url:
        print(f"  No thumbnail URL available for {filename}")
        stats["download_fail"] += 1
        return False
    # Determine file extension from the last path segment of the thumbnail
    # URL (query string ignored); anything that is not PNG/GIF is kept as jpg.
    thumb_name = thumb_url.lower().split("?")[0].split("/")[-1]
    ext = "jpg"
    if ".png" in thumb_name:
        ext = "png"
    elif ".gif" in thumb_name:
        ext = "gif"
    tmp_path = None
    try:
        img_data = fetch_url(thumb_url)
        # A uniquely named temp file avoids the predictable /tmp path that
        # was previously built from the slug (which could also contain "/").
        # The suffix is kept so tools can still infer the type from the name.
        with tempfile.NamedTemporaryFile(
            prefix="herbapi_img_", suffix=f".{ext}", delete=False
        ) as f:
            tmp_path = f.name
            f.write(img_data)
    except Exception as e:
        print(f"  ERROR downloading {thumb_url}: {e}")
        stats["download_fail"] += 1
        _remove_quietly(tmp_path)
        return False
    time.sleep(REQUEST_DELAY)
    # Step 5: Upload to S3
    s3_key = f"species/{slug}.{ext}"
    try:
        result = subprocess.run(
            ["aws", "s3", "cp", tmp_path, f"s3://{S3_BUCKET}/{s3_key}", "--endpoint-url", S3_ENDPOINT],
            capture_output=True, text=True, env=AWS_ENV, timeout=60,
        )
        if result.returncode != 0:
            print(f"  S3 upload failed: {result.stderr}")
            stats["upload_fail"] += 1
            return False
    except Exception as e:
        print(f"  ERROR uploading to S3: {e}")
        stats["upload_fail"] += 1
        return False
    finally:
        # The local copy is no longer needed whether or not the upload worked.
        _remove_quietly(tmp_path)
    # Step 6: Insert into DB
    caption = f"Photo: {info['artist']}" if info["artist"] else ""
    source_url = info["desc_url"] or f"https://commons.wikimedia.org/wiki/File:{urllib.parse.quote(filename)}"
    # Every interpolated value goes through the same quoting helper; the id
    # and the S3 key were previously inserted without escaping.
    sql = (
        f"INSERT INTO images (entity_type, entity_id, s3_key, caption, source_url, license, is_primary) "
        f"VALUES ('species', {_sql_literal(species_id)}, {_sql_literal(s3_key)}, "
        f"{_sql_literal(caption)}, {_sql_literal(source_url)}, {_sql_literal(info['license'])}, true);"
    )
    try:
        psql(sql)
    except Exception as e:
        print(f"  ERROR inserting to DB: {e}")
        stats["errors"] += 1
        return False
    stats["imported"] += 1
    return True
 def main():
    """Import an image for every species that has a Wikidata QID but no image row.

    Reads the candidates through ``psql``, hands each one to
    ``process_species`` and prints the counters collected in ``stats``.
    """
    # Get species without images
    query = (
        "SELECT s.id, s.slug, s.name_scientific, s.wikidata_qid "
        "FROM species s "
        "LEFT JOIN images i ON i.entity_type = 'species' AND i.entity_id = s.id "
        "WHERE s.wikidata_qid IS NOT NULL AND s.wikidata_qid != '' AND i.id IS NULL "
        "ORDER BY s.name_scientific;"
    )
    rows = psql(query)
    if not rows:
        print("No species need images.")
        return
    # psql emits one "|"-separated record per line; keep only complete ones.
    split_lines = (line.strip().split("|") for line in rows.split("\n"))
    species_list = [fields for fields in split_lines if len(fields) == 4]
    print(f"Processing {len(species_list)} species...\n")
    for position, record in enumerate(species_list, 1):
        sid, slug, name_sci, qid = record
        print(f"[{position}/{len(species_list)}] {name_sci} ({qid})")
        if process_species(sid, slug, name_sci, qid):
            print(f"  OK - imported")
    print(f"\n{'='*50}")
    print(f"RESULTS:")
    print(f"  Total species processed: {stats['total']}")
    print(f"  Successfully imported:   {stats['imported']}")
    print(f"  No P18 image:            {stats['no_p18']}")
    print(f"  Bad license (NC/ND/GFDL):{stats['bad_license']}")
    print(f"  Download failures:       {stats['download_fail']}")
    print(f"  Upload failures:         {stats['upload_fail']}")
    print(f"  Other errors:            {stats['errors']}")
 # Run the import only when this file is executed as a script, not on import.
 if __name__ == "__main__":
    main()
@@ -0,0 +1,126 @@
 #!/usr/bin/env python3
 """Seed HerbAPI with common permaculture plant families and species via GBIF + API."""
 import json, urllib.request, urllib.parse, time, sys
 # Base URL that api_post() prefixes to every request path.
 API = "http://herbapi01.corp.sub-net.at:8080/api/v1"
 # Bearer token sent in the Authorization header by api_post().
 # NOTE(review): this looks like a live credential committed to source control --
 # consider reading it from the environment and rotating the token.
 TOKEN = "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
 # Base URL of the GBIF API queried by gbif_de_name().
 GBIF = "https://api.gbif.org/v1"
 def api_post(path, data):
    """POST *data* as JSON to the HerbAPI endpoint *path*.

    Args:
        path: request path appended to ``API`` (e.g. ``"/families"``).
        data: JSON-serialisable payload.

    Returns:
        The decoded JSON response, or ``None`` when the server answers with
        an HTTP error status (the status and the start of the error body are
        written to stderr).
    """
    req = urllib.request.Request(
        f"{API}{path}",
        data=json.dumps(data).encode(),
        headers={"Content-Type": "application/json", "Authorization": f"Bearer {TOKEN}"},
    )
    try:
        # The context manager closes the connection even if decoding fails.
        with urllib.request.urlopen(req) as resp:
            return json.loads(resp.read())
    except urllib.error.HTTPError as e:
        # errors="replace" keeps a non-UTF-8 error body from raising here.
        detail = e.read().decode(errors="replace")[:120]
        print(f"  ERR {e.code}: {detail}", file=sys.stderr)
        return None
 def gbif_de_name(name):
    """Get German common name from GBIF.

    Matches *name* against the GBIF backbone, then scans up to 100
    vernacular names of the matched usage for the first entry whose
    language is ``"deu"``.

    Returns:
        The German vernacular name, or ``None`` when there is no match, no
        German name, or the lookup fails. Failures are reported on stderr;
        the lookup stays best-effort so seeding can continue without a name.
    """
    try:
        match_url = f"{GBIF}/species/match?name={urllib.parse.quote(name)}"
        with urllib.request.urlopen(match_url) as resp:
            match = json.loads(resp.read())
        usage_key = match.get("usageKey")
        if not usage_key:
            return None
        names_url = f"{GBIF}/species/{usage_key}/vernacularNames?limit=100"
        with urllib.request.urlopen(names_url) as resp:
            data = json.loads(resp.read())
        for r in data.get("results", []):
            if r.get("language") == "deu" and r.get("vernacularName"):
                return r["vernacularName"]
    except Exception as e:
        # Unlike the former bare ``except``, this lets KeyboardInterrupt and
        # SystemExit propagate so the script can still be aborted.
        print(f"  WARN GBIF lookup for {name!r} failed: {e}", file=sys.stderr)
    return None
 # Plant families to seed. Each tuple is
 # (scientific name, German name, English name); the seeding loop below posts
 # them as name_scientific / name_de / name_en to the /families endpoint.
 FAMILIES = [
    ("Fabaceae", "Hülsenfrüchtler", "Legumes"),
    ("Rosaceae", "Rosengewächse", "Rose family"),
    ("Brassicaceae", "Kreuzblütler", "Cabbage family"),
    ("Apiaceae", "Doldenblütler", "Carrot family"),
    ("Lamiaceae", "Lippenblütler", "Mint family"),
    ("Asteraceae", "Korbblütler", "Daisy family"),
    ("Solanaceae", "Nachtschattengewächse", "Nightshade family"),
    ("Cucurbitaceae", "Kürbisgewächse", "Gourd family"),
    ("Poaceae", "Süßgräser", "Grass family"),
    ("Amaryllidaceae", "Amaryllisgewächse", "Amaryllis family"),
    ("Boraginaceae", "Raublattgewächse", "Borage family"),
    ("Adoxaceae", "Moschuskrautgewächse", "Moschatel family"),
    ("Betulaceae", "Birkengewächse", "Birch family"),
    ("Fagaceae", "Buchengewächse", "Beech family"),
    ("Juglandaceae", "Walnussgewächse", "Walnut family"),
    ("Caprifoliaceae", "Geißblattgewächse", "Honeysuckle family"),
    ("Grossulariaceae", "Stachelbeergewächse", "Gooseberry family"),
    ("Ericaceae", "Heidekrautgewächse", "Heath family"),
    ("Moraceae", "Maulbeergewächse", "Mulberry family"),
    ("Urticaceae", "Brennnesselgewächse", "Nettle family"),
    ("Malvaceae", "Malvengewächse", "Mallow family"),
    ("Polygonaceae", "Knöterichgewächse", "Buckwheat family"),
    ("Chenopodiaceae", "Gänsefußgewächse", "Goosefoot family"),
    ("Asparagaceae", "Spargelgewächse", "Asparagus family"),
    ("Plantaginaceae", "Wegerichgewächse", "Plantain family"),
 ]
 # Species to seed. Each tuple is
 # (scientific name, scientific name of its family, extra fields).
 # The family name must match an entry in FAMILIES; the extra dict is merged
 # unchanged into the payload posted to the /species endpoint.
 SPECIES = [
    ("Sambucus nigra", "Adoxaceae", {"plant_layer": "understory", "nitrogen_fixer": False, "food_uses": "Flowers (cordial, fritters), berries (cooked — syrup, wine)", "medicinal_uses": "Cold/flu remedy, immune support, diaphoretic", "succession_stage": "secondary"}),
    ("Symphytum officinale", "Boraginaceae", {"plant_layer": "herbaceous", "dynamic_accumulator": True, "food_uses": "Young leaves (limited, contains pyrrolizidine alkaloids)", "medicinal_uses": "Wound healing, bone knitting (external only)", "other_uses": "Dynamic accumulator, mulch/compost activator, animal fodder"}),
    ("Trifolium pratense", "Fabaceae", {"plant_layer": "ground_cover", "nitrogen_fixer": True, "food_uses": "Flowers, young leaves", "medicinal_uses": "Respiratory, menopausal symptoms", "other_uses": "Green manure, nitrogen fixer, bee forage"}),
    ("Corylus avellana", "Betulaceae", {"plant_layer": "shrub", "food_uses": "Nuts", "other_uses": "Coppice wood, hedging, wildlife habitat", "succession_stage": "secondary"}),
    ("Ribes nigrum", "Grossulariaceae", {"plant_layer": "shrub", "food_uses": "Berries, leaves (tea)", "medicinal_uses": "High vitamin C, anti-inflammatory"}),
    ("Rubus idaeus", "Rosaceae", {"plant_layer": "shrub", "food_uses": "Berries, leaves (tea)", "medicinal_uses": "Leaf tea for pregnancy/digestion", "succession_stage": "pioneer"}),
    ("Urtica dioica", "Urticaceae", {"plant_layer": "herbaceous", "dynamic_accumulator": True, "food_uses": "Young leaves, seeds", "medicinal_uses": "Anti-inflammatory, prostate, allergies", "other_uses": "Compost activator, fibre, liquid fertiliser"}),
    ("Borago officinalis", "Boraginaceae", {"plant_layer": "herbaceous", "food_uses": "Flowers, young leaves", "other_uses": "Bee forage, companion plant", "attracts_pollinators": True}),
    ("Lavandula angustifolia", "Lamiaceae", {"plant_layer": "herbaceous", "food_uses": "Flowers", "medicinal_uses": "Calming, antiseptic, sleep aid", "other_uses": "Bee forage, pest repellent, fragrance", "attracts_pollinators": True}),
    ("Malus domestica", "Rosaceae", {"plant_layer": "canopy", "food_uses": "Fruit", "pollination_type": "Insect-pollinated"}),
    ("Prunus domestica", "Rosaceae", {"plant_layer": "canopy", "food_uses": "Fruit", "pollination_type": "Insect-pollinated"}),
    ("Juglans regia", "Juglandaceae", {"plant_layer": "canopy", "food_uses": "Nuts", "other_uses": "Timber, dye", "allelopathic": True}),
    ("Fragaria vesca", "Rosaceae", {"plant_layer": "ground_cover", "food_uses": "Berries, leaves (tea)", "ground_cover_quality": "Good"}),
    ("Allium ursinum", "Amaryllidaceae", {"plant_layer": "ground_cover", "food_uses": "Leaves, flowers, bulbs", "medicinal_uses": "Antimicrobial, blood pressure"}),
    ("Phacelia tanacetifolia", "Boraginaceae", {"plant_layer": "herbaceous", "other_uses": "Green manure, bee forage, cover crop", "attracts_pollinators": True}),
    ("Lupinus polyphyllus", "Fabaceae", {"plant_layer": "herbaceous", "nitrogen_fixer": True, "other_uses": "Nitrogen fixer, green manure, ornamental"}),
    ("Vicia faba", "Fabaceae", {"plant_layer": "herbaceous", "nitrogen_fixer": True, "food_uses": "Beans", "other_uses": "Nitrogen fixer, green manure"}),
    ("Solanum lycopersicum", "Solanaceae", {"plant_layer": "herbaceous", "food_uses": "Fruit"}),
    ("Cucurbita pepo", "Cucurbitaceae", {"plant_layer": "ground_cover", "food_uses": "Fruit, seeds, flowers"}),
    ("Beta vulgaris", "Chenopodiaceae", {"plant_layer": "herbaceous", "food_uses": "Roots, leaves"}),
    ("Daucus carota", "Apiaceae", {"plant_layer": "herbaceous", "food_uses": "Root"}),
    ("Calendula officinalis", "Asteraceae", {"plant_layer": "herbaceous", "food_uses": "Flowers", "medicinal_uses": "Wound healing, anti-inflammatory, skin care", "other_uses": "Companion plant, pest deterrent", "attracts_pollinators": True}),
    ("Melissa officinalis", "Lamiaceae", {"plant_layer": "herbaceous", "food_uses": "Leaves", "medicinal_uses": "Calming, antiviral, digestive", "attracts_pollinators": True}),
    ("Salvia officinalis", "Lamiaceae", {"plant_layer": "herbaceous", "food_uses": "Leaves", "medicinal_uses": "Sore throat, digestive, antimicrobial"}),
    ("Thymus vulgaris", "Lamiaceae", {"plant_layer": "ground_cover", "food_uses": "Leaves", "medicinal_uses": "Respiratory, antimicrobial, cough"}),
 ]
 # Create families
 # Each family is posted once; the id from the response is remembered under
 # the scientific name so species can reference it. A family whose POST
 # returned nothing (api_post() gave None) is left out of family_map.
 print("=== Creating families ===")
 family_map = {}
 for sci, de, en in FAMILIES:
    r = api_post("/families", {"name_scientific": sci, "name_de": de, "name_en": en})
    if r:
        family_map[sci] = r["id"]
        print(f"  ✓ {sci}")
    time.sleep(0.05)
 print(f"Created {len(family_map)} families\n")
 # Create species
 # A species is skipped when its family is missing from family_map. The German
 # name comes from GBIF and falls back to an empty string; name_en is always
 # sent empty.
 print("=== Creating species (with GBIF German names) ===")
 created = 0
 for sci_name, family_sci, extra in SPECIES:
    fam_id = family_map.get(family_sci)
    if not fam_id:
        print(f"  ✗ {sci_name} — family {family_sci} missing")
        continue
    de_name = gbif_de_name(sci_name)
    data = {"name_scientific": sci_name, "name_de": de_name or "", "name_en": "", "family_id": fam_id, **extra}
    r = api_post("/species", data)
    if r:
        created += 1
        print(f"  ✓ {sci_name} → {de_name or '(no DE name)'}")
    time.sleep(0.15)
 print(f"Created {created} species\n")
 # Create suppliers
 # Tuple layout: (name, url, country, is_organic, is_demeter, notes).
 print("=== Creating suppliers ===")
 for name, url, country, organic, demeter, notes in [
    ("Reinsaat", "https://www.reinsaat.at", "AT", True, True, "Austrian biodynamic seed producer, open-pollinated varieties"),
    ("Magic Garden Seeds", "https://www.magicgardenseeds.com", "DE", False, False, "Specialist seed shop with rare and heritage varieties"),
 ]:
    r = api_post("/suppliers", {"name": name, "url": url, "country": country, "is_organic": organic, "is_demeter": demeter, "notes": notes})
    if r: print(f"  ✓ {name}")
 print("\nDone!")
@@ -0,0 +1,514 @@
 #!/usr/bin/env python3
 """
 Scrape Arche Noah seed catalog and import cultivars into HerbAPI.
 Uses the shop.arche-noah.at Angular SPA's backend API (ACM) to fetch
 product listings and details, then creates cultivars in HerbAPI matched
 to existing species.
 """
 import json
 import re
 import time
 import urllib.request
 import urllib.error
 import urllib.parse
 import sys
 from datetime import datetime, timezone
 # --- Configuration -----------------------------------------------------------
 # Base URL that herbapi_request() prefixes to every request path.
 HERBAPI_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
 # Bearer token sent in the Authorization header by herbapi_request().
 # NOTE(review): this looks like a live credential committed to source control --
 # consider reading it from the environment and rotating the token.
 HERBAPI_TOKEN = "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
 # Base URL of the shop backend; endpoint names are appended directly.
 SHOP_BASE = "https://shop.arche-noah.at/ACM/api/"
 # User-Agent header sent with every shop request.
 SHOP_UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
 REQUEST_DELAY = 0.5  # seconds between requests
 # Only import products from these Arche Noah article lines (their own seeds)
 ARCHE_NOAH_LINES = {
    "Bio-Saatgut von ARCHE NOAH",
    "Kostbarkeiten aus dem ARCHE NOAH Samenarchiv",
 }
 # Search terms to discover all seed products across the shop
 # The list contains a few repeated terms (e.g. "Bohne", "Mangold");
 # fetch_all_arche_noah_products() skips terms it has already searched,
 # comparing case-insensitively, so repeats cost no extra requests.
 SEARCH_TERMS = [
    "Tomate", "Paradeiser", "Paprika", "Chili", "Gurke", "Kürbis", "Zucchini",
    "Bohne", "Erbse", "Fisole", "Salat", "Kohl", "Kraut", "Melanzani", "Aubergine",
    "Mais", "Zwiebel", "Lauch", "Karotte", "Rübe", "Basilikum", "Kräuter",
    "Blume", "Sonnenblume", "Dill", "Petersilie", "Spinat", "Mangold",
    "Melone", "Fenchel", "Sellerie", "Rettich", "Radieschen",
    "Koriander", "Oregano", "Thymian", "Salbei", "Rosmarin", "Minze",
    "Ringelblume", "Kornblume", "Kapuzinerkresse", "Senf",
    "Erdbeere", "Lupine", "Luzerne", "Klee", "Bohne", "Mohn",
    "Radicchio", "Rucola", "Endivie", "Artischocke", "Pastinake",
    "Schnittlauch", "Knoblauch", "Bärlauch", "Wermut",
    "Baldrian", "Johanniskraut", "Sonnenhut", "Beinwell",
    "Studentenblume", "Tagetes", "Phacelia", "Buchweizen",
    "Rote Bete", "Rote Rübe", "Mangold", "Melde",
    "Kohlrabi", "Brokkoli", "Blumenkohl", "Rosenkohl", "Wirsing",
    "Pflücksalat", "Kopfsalat", "Feldsalat", "Asiasalat",
    "Zuckermais", "Popcorn",
 ]
 # --- Helpers -----------------------------------------------------------------
 def herbapi_request(method, path, data=None):
    """Make a request to HerbAPI.

    Args:
        method: HTTP method, e.g. ``"GET"`` or ``"POST"``.
        path: path relative to ``HERBAPI_BASE`` (without a leading slash).
        data: optional JSON-serialisable request body.

    Returns:
        The decoded JSON response, or ``None`` when the response body is
        empty or whitespace only.

    Raises:
        urllib.error.HTTPError: re-raised after the status and the start of
            the error body have been written to stderr.
    """
    url = f"{HERBAPI_BASE}/{path}"
    # ``is not None`` so an explicitly empty payload ({} or []) is still sent
    # as a body instead of being silently dropped.
    body = json.dumps(data).encode() if data is not None else None
    req = urllib.request.Request(url, data=body, method=method, headers={
        "Authorization": f"Bearer {HERBAPI_TOKEN}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    })
    try:
        # The context manager closes the connection once the body is read.
        with urllib.request.urlopen(req, timeout=30) as resp:
            raw = resp.read().decode("utf-8")
    except urllib.error.HTTPError as e:
        error_body = e.read().decode("utf-8", errors="replace")
        print(f"  HerbAPI {method} {path}: HTTP {e.code} - {error_body[:200]}", file=sys.stderr)
        raise
    return json.loads(raw) if raw.strip() else None
 def shop_create_session():
    """Create an anonymous session on the Arche Noah shop.

    Posts an empty JSON object to ``webshop/createanonymoususer`` and reads
    the ``JSESSIONID`` cookie from the response.

    Returns:
        The session id string.

    Raises:
        RuntimeError: if no non-empty ``JSESSIONID`` cookie was returned.
    """
    req = urllib.request.Request(
        SHOP_BASE + "webshop/createanonymoususer",
        data=json.dumps({}).encode(),
        headers={
            "User-Agent": SHOP_UA,
            "Content-Type": "application/json",
            "Origin": "https://shop.arche-noah.at",
            "Referer": "https://shop.arche-noah.at/",
        },
    )
    with urllib.request.urlopen(req, timeout=15) as resp:
        # A response may carry several Set-Cookie headers; ``get`` returns
        # only one of them, so inspect all to find the session cookie.
        cookies = resp.headers.get_all("Set-Cookie") or []
    for cookie in cookies:
        if "JSESSIONID=" not in cookie:
            continue
        session = cookie.split("JSESSIONID=")[1].split(";")[0]
        if session:
            return session
    raise RuntimeError("Failed to get shop session")
 def shop_request(session, endpoint, payload):
    """Make a POST request to the shop API.

    Args:
        session: ``JSESSIONID`` value sent as a cookie.
        endpoint: path appended to ``SHOP_BASE``.
        payload: JSON-serialisable request body.

    Returns:
        The decoded JSON response, or ``None`` when the response body is
        empty or whitespace only.
    """
    req = urllib.request.Request(
        SHOP_BASE + endpoint,
        data=json.dumps(payload).encode(),
        headers={
            "User-Agent": SHOP_UA,
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Cookie": f"JSESSIONID={session}",
            "Origin": "https://shop.arche-noah.at",
            "Referer": "https://shop.arche-noah.at/",
        },
    )
    # This function is called once per search page and once per product, so
    # close each connection explicitly instead of leaving it to the GC.
    with urllib.request.urlopen(req, timeout=30) as resp:
        raw = resp.read().decode("utf-8")
    return json.loads(raw) if raw.strip() else None
 def extract_latin_name(detail_headline3):
    """Return the botanical name found in a product's ``detailHeadline3`` text.

    HTML tags are removed and everything from "Hier geht" onwards is cut
    off. The remainder is returned only if it starts like a binomial
    (capitalised genus, a space, a lowercase epithet); otherwise, and for
    empty input, the result is ``None``.
    """
    if detail_headline3:
        without_tags = re.sub(r"<[^>]+>", "", detail_headline3).strip()
        # Drop the trailing "Hier geht es zu unseren..." link text.
        candidate = without_tags.split("Hier geht")[0].strip()
        # Expect something like "Solanum lycopersicum" or "Capsicum annuum".
        if candidate and re.match(r"^[A-Z][a-z]+ [a-z]", candidate):
            return candidate
    return None
 def match_species(latin_name, species_by_scientific):
    """
    Find the species record for a Latin name, tolerating infraspecific suffixes.

    The lookup key is the stripped, lower-cased name. If that exact key is
    not in ``species_by_scientific``, the leading "genus epithet" pair is
    tried, so "Phaseolus vulgaris var. nanus" resolves to
    "phaseolus vulgaris". Returns the matching record or ``None``.
    """
    if not latin_name:
        return None
    full_key = latin_name.strip().lower()
    lookup_keys = [full_key]
    # Genus + species only, dropping var./subsp./convar./f./ssp. qualifiers.
    binomial = re.match(r"^([A-Za-z]+ [a-z]+)", full_key)
    if binomial:
        lookup_keys.append(binomial.group(1).strip())
    for key in lookup_keys:
        record = species_by_scientific.get(key)
        if record:
            return record
    return None
 def extract_cultivar_name(product_name):
    """
    Derive the cultivar/variety name from a shop product name.

    A name wrapped in quote characters wins:
      "Salatparadeiser 'Naama' HG026" -> "Naama"
      "Cocktailparadeiser 'Golden Perfection' TO019" -> "Golden Perfection"
      "Buschbohne 'Marmorierter Mond' HG055" -> "Marmorierter Mond"
    Without quotes, the product name is returned with a trailing article
    number (such as HG026 or TO019) removed; the type prefix is kept.
    """
    # Accept straight, typographic, backtick and acute-accent quote styles.
    quoted = re.search(r"['\u2018\u2019`\u00b4]+([^'\u2018\u2019`\u00b4]+)['\u2018\u2019`\u00b4]+", product_name)
    if quoted is not None:
        return quoted.group(1).strip()
    # Fallback: only the trailing article number is stripped.
    without_article_no = re.sub(r"\s+[A-Z]{1,3}\d{2,4}\s*$", "", product_name)
    return without_article_no.strip()
 def parse_pack_info(unit_desc):
    """
    Parse pack size info from unitDesc like '20-30 Korn' or '2g'.

    For a range such as "20-30 Korn" the lower bound is used.

    Returns:
        ``(pack_size, pack_unit)`` with the size as float and the unit as
        string, or ``(None, None)`` when the text is empty or does not start
        with a number followed by a unit.
    """
    if not unit_desc:
        return None, None
    # The unit must start with a letter. With a plain ``\w+`` the regex
    # engine backtracked on unit-less input and split the number itself,
    # e.g. "100" was reported as size 10 with unit "0".
    m = re.match(r"(\d+)(?:-\d+)?\s*([^\W\d_]\w*)", unit_desc)
    if m:
        return float(m.group(1)), m.group(2)
    return None, None
 # --- Main scraping logic -----------------------------------------------------
 def fetch_all_arche_noah_products(session):
    """Collect Arche Noah's own seed products by running every search term.

    Each distinct term (case-insensitive) from ``SEARCH_TERMS`` is searched
    in pages of 200 results. Products are de-duplicated by their ``sid``,
    then filtered to those whose ``articleLineDesc`` is listed in
    ``ARCHE_NOAH_LINES``. Returns a dict mapping sid to product record.
    """
    all_products = {}
    seen_terms = set()
    for term in SEARCH_TERMS:
        folded = term.lower()
        if folded in seen_terms:
            continue
        seen_terms.add(folded)
        offset = 0
        while True:
            try:
                data = shop_request(session, "webshop/getproducts", {
                    "searchCriteria": term,
                    "startIndex": offset,
                    "numDataSets": 200,
                    "allowAllProducts": False,
                })
            except Exception as e:
                print(f"  Search '{term}' offset={offset} failed: {e}", file=sys.stderr)
                break
            if not data:
                break
            # The first occurrence of a product wins; later duplicates are ignored.
            for p in data:
                all_products.setdefault(p["sid"], p)
            # A short page means the result set for this term is exhausted.
            if len(data) < 200:
                break
            offset += len(data)
            time.sleep(REQUEST_DELAY)
        time.sleep(REQUEST_DELAY)
    # Filter to Arche Noah's own seed products only
    an_products = {}
    for sid, p in all_products.items():
        if (p.get("articleLineDesc") or "") in ARCHE_NOAH_LINES:
            an_products[sid] = p
    print(f"Found {len(all_products)} total products, {len(an_products)} Arche Noah seed products")
    return an_products
 def fetch_product_details(session, products):
    """Request the detail record of every product and return them keyed by sid.

    Products whose request fails or returns nothing are left out; failures
    are reported on stderr. Progress is printed after every 20 products and
    each request is followed by a ``REQUEST_DELAY`` pause.
    """
    details = {}
    total = len(products)
    for count, (sid, _product) in enumerate(products.items(), start=1):
        try:
            detail = shop_request(session, "webshop/getproductdetail", {"productSid": sid})
        except Exception as e:
            print(f"  Detail for {sid} failed: {e}", file=sys.stderr)
        else:
            if detail:
                details[sid] = detail
        if count % 20 == 0:
            print(f"  Fetched details: {count}/{total}")
        time.sleep(REQUEST_DELAY)
    print(f"Fetched {len(details)} product details")
    return details
 def _load_all_pages(resource):
    """Fetch every page (100 items each) of a HerbAPI collection as one list.

    Handles both response shapes: a dict with ``data`` and ``total``, where
    paging continues until ``total`` items were collected or a page comes
    back empty, and a bare list, which is taken as the complete result.
    Any other response ends the loop.
    """
    page = 1
    items = []
    while True:
        result = herbapi_request("GET", f"{resource}?per_page=100&page={page}")
        if isinstance(result, dict) and "data" in result:
            # ``or`` guards against explicit nulls in the response.
            data = result["data"] or []
            total = result.get("total") or 0
        elif isinstance(result, list):
            data = result
            total = len(data)
        else:
            break
        items.extend(data)
        if len(items) >= total or not data:
            break
        page += 1
    return items


 def load_herbapi_species():
    """Load all species from HerbAPI and build lookup maps (handles pagination).

    Returns:
        ``(species_list, by_scientific)`` where ``by_scientific`` maps the
        stripped, lower-cased ``name_scientific`` to the species record.
    """
    species_list = _load_all_pages("species")
    # Build lookup by scientific name (normalized lowercase)
    by_scientific = {s["name_scientific"].strip().lower(): s for s in species_list}
    return species_list, by_scientific


 def load_herbapi_cultivars():
    """Load all existing cultivars from HerbAPI (handles pagination, max 100/page).

    Returns:
        ``(all_cultivars, by_key)`` where ``by_key`` maps
        ``(species_id, stripped lower-cased name)`` to the cultivar record.
    """
    all_cultivars = _load_all_pages("cultivars")
    # Build lookup by (species_id, normalized cultivar name)
    by_key = {(c["species_id"], c["name"].strip().lower()): c for c in all_cultivars}
    return all_cultivars, by_key
 def ensure_supplier():
    """Return the id of the Arche Noah supplier, creating the record if needed.

    An existing supplier counts as a match when its lower-cased name
    contains both "arche" and "noah".
    """
    response = herbapi_request("GET", "suppliers")
    # The API may wrap the list in a {"data": [...]} envelope.
    wrapped = isinstance(response, dict) and "data" in response
    suppliers = response["data"] if wrapped else response
    for supplier in suppliers:
        lowered = supplier["name"].lower()
        if "arche" in lowered and "noah" in lowered:
            print(f"Supplier 'Arche Noah' already exists: {supplier['id']}")
            return supplier["id"]
    print("Creating supplier 'Arche Noah'...")
    new_supplier = {
        "name": "Arche Noah",
        "url": "https://www.arche-noah.at",
        "country": "AT",
        "is_organic": True,
        "is_demeter": False,
        "notes": "Austrian society for heritage seed preservation and biodiversity",
    }
    result = herbapi_request("POST", "suppliers", new_supplier)
    print(f"Created supplier: {result['id']}")
    return result["id"]
 def load_existing_supplier_links(cultivar_id):
    """Return the supplier links of a cultivar, or an empty list.

    Accepts either a bare list or a ``{"data": [...]}`` envelope from the
    API. Any request error, or any other response shape, yields ``[]``.
    """
    try:
        result = herbapi_request("GET", f"cultivars/{cultivar_id}/suppliers")
    except Exception:
        return []
    if isinstance(result, list):
        return result
    if isinstance(result, dict) and "data" in result:
        return result["data"]
    return []
 def main():
    """Scrape the Arche Noah shop and sync its cultivars into HerbAPI.

    The run has six steps: ensure the supplier record exists, load the
    HerbAPI species and the already-known cultivars, scrape the shop catalog,
    fetch the product details (source of the Latin names), then create every
    missing cultivar and link each cultivar to the supplier. Cultivars and
    supplier links that already exist are re-used rather than re-created.
    A summary of all counters is printed at the end.
    """
    now_str = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    print("=== Arche Noah Seed Catalog Scraper ===")
    print(f"Started at {now_str}\n")
    # Step 1: Create Arche Noah supplier in HerbAPI
    print("[1/6] Ensuring Arche Noah supplier exists...")
    supplier_id = ensure_supplier()
    print()
    # Step 2: Load HerbAPI species for matching
    print("[2/6] Loading HerbAPI species...")
    species_list, species_by_scientific = load_herbapi_species()
    print(f"Loaded {len(species_list)} species")
    print()
    # Step 3: Load existing cultivars for idempotency
    print("[3/6] Loading existing cultivars...")
    existing_cultivars, cultivars_by_key = load_herbapi_cultivars()
    print(f"Loaded {len(existing_cultivars)} existing cultivars")
    print()
    # Step 4: Scrape Arche Noah shop
    print("[4/6] Scraping Arche Noah shop catalog...")
    session = shop_create_session()
    print("Got shop session")
    products = fetch_all_arche_noah_products(session)
    print()
    # Step 5: Fetch product details (to get Latin names)
    print("[5/6] Fetching product details for Latin name matching...")
    details = fetch_product_details(session, products)
    print()
    # Step 6: Create cultivars in HerbAPI
    print("[6/6] Creating cultivars in HerbAPI...")
    stats = {
        "created": 0,
        "skipped_existing": 0,
        "skipped_no_species": 0,
        # Kept separate from "skipped_no_species" so the summary does not
        # report unusable product names as species-matching failures.
        "skipped_no_name": 0,
        "supplier_linked": 0,
        "supplier_link_existed": 0,
        "errors": 0,
    }
    for sid, product in sorted(products.items()):
        detail = details.get(sid, {})
        # The Latin name only comes from the detail record; an empty result is
        # normalised to None and ends up as a "no species match" skip below.
        latin_name = extract_latin_name(detail.get("detailHeadline3", "")) or None
        # Match to HerbAPI species (handles subspecies/variety suffixes)
        species = match_species(latin_name, species_by_scientific)
        if not species:
            print(f"  SKIP (no species match): {product['name']} | latin={latin_name}")
            stats["skipped_no_species"] += 1
            continue
        # Extract cultivar name
        cultivar_name = extract_cultivar_name(product["name"])
        if not cultivar_name:
            print(f"  SKIP (no cultivar name): {product['name']}")
            stats["skipped_no_name"] += 1
            continue
        # The shop URL is needed both for the cultivar record and for the
        # supplier link, so build it once.
        alias = product.get("alias") or detail.get("alias", "")
        product_url = f"https://shop.arche-noah.at/produkt/{alias}" if alias else None
        # Check if cultivar already exists (idempotency)
        lookup_key = (species["id"], cultivar_name.strip().lower())
        existing = cultivars_by_key.get(lookup_key)
        if existing:
            cultivar_id = existing["id"]
            stats["skipped_existing"] += 1
        else:
            cultivar_data = {
                "species_id": species["id"],
                "name": cultivar_name,
                "name_de": cultivar_name,
                # Organic status is derived from the shop's article line label.
                "is_organic": product.get("articleLineDesc") == "Bio-Saatgut von ARCHE NOAH",
                "source_urls": [product_url] if product_url else None,
            }
            try:
                result = herbapi_request("POST", "cultivars", cultivar_data)
                cultivar_id = result["id"]
                stats["created"] += 1
                # Add to lookup for idempotency within this run
                cultivars_by_key[lookup_key] = result
                print(f"  CREATED: {cultivar_name} ({species['name_scientific']})")
            except Exception as e:
                # Best effort: report the failure and carry on with the next product.
                print(f"  ERROR creating '{cultivar_name}': {e}", file=sys.stderr)
                stats["errors"] += 1
                continue
        # Link cultivar to supplier
        existing_links = load_existing_supplier_links(cultivar_id)
        already_linked = any(
            link["supplier_id"] == supplier_id for link in existing_links
        )
        if already_linked:
            stats["supplier_link_existed"] += 1
        else:
            # Parse pack info
            unit_desc = product.get("unitDesc") or detail.get("unitDesc", "")
            pack_size, pack_unit = parse_pack_info(unit_desc)
            # Price is taken from the first price-list position, if there is one.
            price = None
            price_list = product.get("priceListPos") or detail.get("priceListPos", [])
            if price_list:
                price = price_list[0].get("singleUnitPrice")
            link_data = {
                "supplier_id": supplier_id,
                "article_number": str(product.get("articleNr", "")),
                "product_url": product_url,
                "price_eur": price,
                "pack_size": pack_size,
                "pack_unit": pack_unit,
            }
            try:
                herbapi_request("POST", f"cultivars/{cultivar_id}/suppliers", link_data)
                stats["supplier_linked"] += 1
            except Exception as e:
                print(f"  ERROR linking supplier for '{cultivar_name}': {e}", file=sys.stderr)
                stats["errors"] += 1
        time.sleep(0.1)  # small delay between HerbAPI calls
    # Summary
    print(f"\n{'='*60}")
    print("Scraping complete!")
    print(f"  Cultivars created:          {stats['created']}")
    print(f"  Cultivars already existed:   {stats['skipped_existing']}")
    print(f"  Skipped (no species match):  {stats['skipped_no_species']}")
    print(f"  Skipped (no cultivar name):  {stats['skipped_no_name']}")
    print(f"  Supplier links created:      {stats['supplier_linked']}")
    print(f"  Supplier links existed:      {stats['supplier_link_existed']}")
    print(f"  Errors:                      {stats['errors']}")
 # Script entry point: run the import only when this file is executed
 # directly, not when it is imported as a module.
 if __name__ == "__main__":
    main()
@@ -0,0 +1,843 @@
 #!/usr/bin/env python3
 """
 Scraper for Bingenheimer Saatgut (https://www.bingenheimersaatgut.de/)
 Extracts cultivar data and imports into HerbAPI.
 Categories scraped: Gemüse (vegetables), Kräuter (herbs), Gründüngung (green manure).
 """
 import json
 import os
 import re
 import sys
 import time
 import urllib.error
 import urllib.parse
 import urllib.request
 from html.parser import HTMLParser
 from typing import Optional
 # ── Configuration ─────────────────────────────────────────────────────────
 # The HerbAPI endpoint and bearer token can be overridden through the
 # HERBAPI_BASE / HERBAPI_TOKEN environment variables, so credentials do not
 # have to be edited into the source; the literals below are only fallbacks.
 # NOTE(review): the fallback token is a credential committed to version
 # control — rotate it and remove the default once deployments set
 # HERBAPI_TOKEN.
 API_BASE = os.environ.get("HERBAPI_BASE", "http://herbapi01.corp.sub-net.at:8080/api/v1")
 API_TOKEN = os.environ.get("HERBAPI_TOKEN", "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk")
 SITE_BASE = "https://www.bingenheimersaatgut.de"
 DELAY = 0.5  # seconds to sleep between requests to the shop
 USER_AGENT = "HerbAPI-Scraper/1.0 (+https://sub-net.at)"
 # ── Category URLs to scrape ───────────────────────────────────────────────
 # (url_path, default_species_scientific_name)
 # url_path is the category path that scrape_category() inserts into
 # "{SITE_BASE}/de/bio-saatgut/{url_path}.html". The default species is used
 # for a product only when its detail page yields no Latin name.
 VEGETABLE_CATEGORIES = [
    ("gemuese/tomaten", "Solanum lycopersicum"),
    ("gemuese/gurken/gewuerzgurke", "Cucumis sativus"),
    ("gemuese/gurken/salatgurken", "Cucumis sativus"),
    ("gemuese/aubergine", "Solanum melongena"),
    ("gemuese/bohnen/buschbohne", "Phaseolus vulgaris"),
    ("gemuese/bohnen/stangenbohne", "Phaseolus vulgaris"),
    ("gemuese/bohnen/dicke-bohne", "Vicia faba"),
    ("gemuese/bohnen/feuerbohne", "Phaseolus coccineus"),
    ("gemuese/bohnen/edamame-sojabohne", "Glycine max"),
    ("gemuese/bohnen/spaghettibohne", "Vigna unguiculata"),
    ("gemuese/erbsen/markerbse", "Pisum sativum"),
    ("gemuese/erbsen/schalerbse", "Pisum sativum"),
    ("gemuese/erbsen/zuckererbse", "Pisum sativum"),
    ("gemuese/feldsalat", "Valerianella locusta"),
    ("gemuese/knollenfenchel", "Foeniculum vulgare"),
    ("gemuese/kohl/blumenkohl", "Brassica oleracea"),
    ("gemuese/kohl/brokkoli", "Brassica oleracea"),
    ("gemuese/kohl/chinakohlpak-choi", "Brassica rapa"),
    ("gemuese/kohl/gruenkohl", "Brassica oleracea"),
    ("gemuese/kohl/kohlrabi", "Brassica oleracea"),
    ("gemuese/kohl/rotkohl", "Brassica oleracea"),
    ("gemuese/kohl/weisskohl", "Brassica oleracea"),
    ("gemuese/kohl/wirsing", "Brassica oleracea"),
    ("gemuese/kohl/rosenkohl", "Brassica oleracea"),
    ("gemuese/kresse", "Lepidium sativum"),
    ("gemuese/kuerbis", "Cucurbita maxima"),
    ("gemuese/zuckermais", "Zea mays"),
    ("gemuese/mangold", "Beta vulgaris"),
    ("gemuese/melone", "Cucumis melo"),
    ("gemuese/moehren", "Daucus carota"),
    ("gemuese/paprika/gemuesepaprika", "Capsicum annuum"),
    ("gemuese/paprika/chili", "Capsicum annuum"),
    ("gemuese/pastinaken", "Pastinaca sativa"),
    ("gemuese/petersilienwurzel", "Petroselinum crispum"),
    ("gemuese/physalis", "Physalis peruviana"),
    ("gemuese/porreelauch", "Allium porrum"),
    ("gemuese/radies", "Raphanus sativus"),
    ("gemuese/rettich", "Raphanus sativus"),
    ("gemuese/rote-bete", "Beta vulgaris"),
    ("gemuese/rueben/mai-herbstruebennavets", "Brassica rapa"),
    ("gemuese/rueben/kohlruebe", "Brassica napus"),
    ("gemuese/rucola", "Eruca vesicaria"),
    ("gemuese/salat/bataviasalat", "Lactuca sativa"),
    ("gemuese/salat/eichblattsalat", "Lactuca sativa"),
    ("gemuese/salat/eissalat", "Lactuca sativa"),
    ("gemuese/salat/endivien", "Cichorium endivia"),
    ("gemuese/salat/hirschhornwegerich", "Plantago coronopus"),
    ("gemuese/salat/kopfsalat", "Lactuca sativa"),
    ("gemuese/salat/lollosalat", "Lactuca sativa"),
    ("gemuese/salat/romanasalat", "Lactuca sativa"),
    ("gemuese/salat/baby-leaf", "Lactuca sativa"),
    ("gemuese/sellerie/knollensellerie", "Apium graveolens"),
    ("gemuese/sellerie/stangen--bleichsellerie", "Apium graveolens"),
    ("gemuese/spinatspinat-aehnliche/spinat", "Spinacia oleracea"),
    ("gemuese/spinatspinat-aehnliche/neuseelaender-spinat", "Tetragonia tetragonioides"),
    ("gemuese/blattstielgemuese", "Beta vulgaris"),
    ("gemuese/zwiebeln", "Allium cepa"),
    ("gemuese/lauchzwiebeln", "Allium fistulosum"),
    ("gemuese/artischocke", "Cynara cardunculus"),
    ("gemuese/asia-salate", "Brassica juncea"),
    ("gemuese/chicoree", "Cichorium intybus"),
    ("gemuese/schwarz-haferwurzel", "Scorzonera hispanica"),
    ("gemuese/winterpostelein", "Claytonia perfoliata"),
    ("gemuese/zucchini", "Cucurbita pepo"),
    ("gemuese/catalogna", "Cichorium intybus"),
    ("gemuese/zichoriensalate", "Cichorium intybus"),
 ]
 # Herb categories: (category url path, default species scientific name).
 HERB_CATEGORIES = [
    ("kraeuter/basilikum", "Ocimum basilicum"),
    ("kraeuter/bohnenkraut", "Satureja hortensis"),
    ("kraeuter/borretsch", "Borago officinalis"),
    ("kraeuter/dill", "Anethum graveolens"),
    ("kraeuter/kuemmel", "Carum carvi"),
    ("kraeuter/kerbel", "Anthriscus cerefolium"),
    ("kraeuter/koriander", "Coriandrum sativum"),
    ("kraeuter/gewuerzfenchel", "Foeniculum vulgare"),
    ("kraeuter/kultursauerampfer", "Rumex acetosa"),
    ("kraeuter/lavendel", "Lavandula angustifolia"),
    ("kraeuter/liebstock", "Levisticum officinale"),
    ("kraeuter/majoran", "Origanum majorana"),
    ("kraeuter/oregano", "Origanum vulgare"),
    ("kraeuter/pimpinelle", "Sanguisorba minor"),
    ("kraeuter/estragon", "Artemisia dracunculus"),
    ("kraeuter/salbei", "Salvia officinalis"),
    ("kraeuter/schnittlauch", "Allium schoenoprasum"),
    ("kraeuter/schnittknoblauch", "Allium tuberosum"),
    ("kraeuter/schwarzkuemmel", "Nigella sativa"),
    ("kraeuter/speisechrysantheme", "Glebionis coronaria"),
    ("kraeuter/thymian", "Thymus vulgaris"),
    ("kraeuter/ysop", "Hyssopus officinalis"),
    ("kraeuter/winterkresse", "Barbarea vulgaris"),
    ("kraeuter/brunnenkresse", "Nasturtium officinale"),
    ("kraeuter/melisse", "Melissa officinalis"),
    ("kraeuter/petersilie", "Petroselinum crispum"),
    ("kraeuter/schnittsellerie", "Apium graveolens"),
    ("kraeuter/beifuss", "Artemisia vulgaris"),
 ]
 # Green manure has no default species (None): a product from this category
 # is only imported when its detail page yields a Latin name, otherwise
 # process_product() skips it.
 GREEN_MANURE_CATEGORIES = [
    ("gruenduengung", None),
 ]
 # Full list of (url_path, default_species) pairs: vegetables, then herbs,
 # then green manure.
 ALL_CATEGORIES = VEGETABLE_CATEGORIES + HERB_CATEGORIES + GREEN_MANURE_CATEGORIES
 # ── Stats ─────────────────────────────────────────────────────────────────
 # Module-level run statistics, mutated in place by the scraping functions:
 # integer counters plus two lists that collect problems.
 stats = {
    "categories_scraped": 0,
    "products_found": 0,
    "detail_pages_fetched": 0,
    "cultivars_created": 0,
    "cultivars_existed": 0,
    "supplier_links_created": 0,
    "supplier_links_existed": 0,
    "species_created": 0,
    "families_created": 0,
    # Latin names (unknown genus) or product names (no species at all) that
    # could not be resolved to a species.
    "species_not_matched": [],
    # Human-readable error messages.
    "errors": [],
 }
 # ── HTTP helpers ──────────────────────────────────────────────────────────
 def fetch_page(url: str) -> str:
    """Download *url* with the scraper's User-Agent and return the body as text.

    The body is decoded as UTF-8, with undecodable bytes replaced. An HTTP 404
    yields an empty string; every other HTTP error is re-raised.
    """
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            payload = response.read()
    except urllib.error.HTTPError as http_err:
        if http_err.code != 404:
            raise
        return ""
    return payload.decode("utf-8", errors="replace")
 def api_get(path: str, params: dict = None) -> dict:
    """Send an authenticated GET to HerbAPI and return the decoded JSON body.

    *path* is appended to API_BASE; *params*, when given and non-empty, is
    URL-encoded into the query string. HTTP errors are not caught here.
    """
    query = "?" + urllib.parse.urlencode(params) if params else ""
    request = urllib.request.Request(
        f"{API_BASE}{path}{query}",
        headers={
            "Authorization": f"Bearer {API_TOKEN}",
            "Accept": "application/json",
        },
    )
    with urllib.request.urlopen(request, timeout=30) as response:
        return json.loads(response.read())
 def api_post(path: str, data: dict) -> tuple:
    """Send *data* as a JSON POST to HerbAPI.

    Returns ``(response_dict, status_code)``. An HTTP error does not raise:
    it is reported as ``({"error": <body text>, "_status": <code>}, <code>)``.
    """
    request = urllib.request.Request(
        f"{API_BASE}{path}",
        data=json.dumps(data).encode("utf-8"),
        method="POST",
        headers={
            "Authorization": f"Bearer {API_TOKEN}",
            "Content-Type": "application/json",
            "Accept": "application/json",
        },
    )
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            return json.loads(response.read()), response.status
    except urllib.error.HTTPError as http_err:
        detail = http_err.read().decode("utf-8", errors="replace")
        return {"error": detail, "_status": http_err.code}, http_err.code
 # ── HTML parsing helpers ──────────────────────────────────────────────────
 def parse_product_links(html: str) -> list:
    """Extract product links from a category listing page.

    Anchors carrying the Magento ``product-item-link`` class are tried first.
    Only when none of them yields a link does a broader pattern for product
    detail URLs under gemuese/kraeuter/gruenduengung take over.

    HTML character references in link text and href (``&#039;``, ``&amp;`` …)
    are decoded, so names come back as plain text.

    Returns a list of ``(absolute_url, product_name)`` tuples in document
    order, with at most one entry per URL (the first name seen wins).
    """
    # Function-scope import: the ``html`` parameter shadows the module name.
    from html import unescape

    def to_absolute(href):
        href = unescape(href)
        return href if href.startswith("http") else SITE_BASE + href

    found = []
    # Magento product-item-link pattern
    primary = re.compile(
        r'<a[^>]+href="([^"]*?/de/bio-saatgut/[^"]+?)"[^>]*class="[^"]*product-item-link[^"]*"[^>]*>\s*(.*?)\s*</a>',
        re.DOTALL | re.IGNORECASE
    )
    for match in primary.finditer(html):
        # Drop nested tags first, then decode entities, so escaped "<" in
        # the text is not mistaken for markup.
        name = unescape(re.sub(r'<[^>]+>', '', match.group(2))).strip()
        if name:
            found.append((to_absolute(match.group(1)), name))
    if not found:
        # Broader pattern for product detail links
        fallback = re.compile(
            r'href="([^"]*?/de/bio-saatgut/(?:gemuese|kraeuter|gruenduengung)/[^"]+?/[^"/.]+)"[^>]*>\s*([^<]{3,})',
            re.IGNORECASE
        )
        for match in fallback.finditer(html):
            href = match.group(1).strip()
            name = unescape(match.group(2)).strip()
            if name and not href.endswith(".html"):
                found.append((to_absolute(href), name))
    # Deduplicate by URL, keeping the first occurrence.
    unique = {}
    for url, name in found:
        unique.setdefault(url, name)
    return list(unique.items())
 def extract_latin_from_detail(html: str) -> Optional[str]:
    """Extract the Latin/botanical name from a product detail page.

    Three patterns are tried in order: a binomial inside ``<em>``/``<i>``,
    a binomial following an element whose class mentions botanical, latin or
    species, and a binomial after a "Botanischer Name" / "Lateinischer Name" /
    "Art" label. Every hit of a pattern is examined, so an unrelated italic
    phrase earlier on the page does not hide the real name further down.

    Returns the name with runs of whitespace collapsed to single spaces, or
    None when no candidate looks like ``Genus species``.
    """
    patterns = [
        r'<(?:em|i)[^>]*>\s*([A-Z][a-z]+\s+[a-z]{2,}(?:\s+(?:var\.|subsp\.)\s+[a-z]+)?)\s*</(?:em|i)>',
        r'class="[^"]*(?:botanical|latin|species)[^"]*"[^>]*>\s*([A-Z][a-z]+\s+[a-z]{2,})',
        r'(?:Botanischer?\s+Name|Lateinischer?\s+Name|Art)\s*:?\s*(?:<[^>]+>)*\s*([A-Z][a-z]+\s+[a-z]{2,})',
    ]
    for pat in patterns:
        for m in re.finditer(pat, html, re.IGNORECASE):
            parts = m.group(1).split()
            # IGNORECASE also relaxes the [A-Z]/[a-z] classes in the capture
            # group, so the capitalisation is re-checked here.
            if len(parts) >= 2 and parts[0][0].isupper() and parts[1][0].islower():
                # Re-join so line breaks or double spaces in the markup do
                # not end up inside the name.
                return " ".join(parts)
    return None
 def extract_description_from_detail(html: str) -> str:
    """Extract the product description from a detail page as plain text.

    The first match of each description-container pattern is taken in turn;
    its markup is removed, HTML character references are decoded and runs of
    whitespace collapsed. The first result longer than 20 characters is
    returned, truncated to 2000 characters. Returns "" if none qualifies.
    """
    # Function-scope import: the ``html`` parameter shadows the module name.
    from html import unescape

    desc_patterns = [
        r'<div[^>]*class="[^"]*product[- ]description[^"]*"[^>]*>(.*?)</div>',
        r'<div[^>]*class="[^"]*beschreibung[^"]*"[^>]*>(.*?)</div>',
        r'data-content-type="description"[^>]*>(.*?)</div>',
    ]
    for pat in desc_patterns:
        m = re.search(pat, html, re.DOTALL | re.IGNORECASE)
        if not m:
            continue
        # Strip tags before decoding so escaped "<" is not treated as markup;
        # decode before collapsing so &nbsp; counts as whitespace.
        text = re.sub(r'<[^>]+>', ' ', m.group(1))
        text = re.sub(r'\s+', ' ', unescape(text)).strip()
        if len(text) > 20:
            return text[:2000]
    return ""
 def extract_article_number(product_name: str, url: str) -> Optional[str]:
    """Return the article number of a product, or None if none is found.

    A parenthesised code in the product name such as ``(G802)`` takes
    precedence (spaces inside it are removed). Otherwise a trailing
    ``-<letter><digits>[letter]`` in the last URL path segment is used,
    upper-cased.
    """
    in_name = re.search(r'\(([A-Z]\s*\d+[A-Z]?)\)', product_name)
    if in_name is not None:
        return in_name.group(1).replace(" ", "")
    last_segment = url.rstrip("/").rsplit("/", 1)[-1]
    in_slug = re.search(r'-([a-z]\d+[a-z]?)$', last_segment, re.IGNORECASE)
    return in_slug.group(1).upper() if in_slug is not None else None
 def extract_variety_name(product_name: str) -> str:
    """Extract the variety/cultivar name from the full product name.

    Steps: trim the name, drop a trailing parenthesised article number such
    as ``(G802)``, remove known German crop-type prefixes from the start, then
    trim whitespace and surrounding single/double quotes.

    Every prefix pattern is applied in list order, anchored at the start of
    the current name and case-insensitive, so more than one prefix can be
    removed from the same name. The result may be an empty string.
    """
    name = product_name.strip()
    # Remove article number suffix like (G802)
    name = re.sub(r'\s*\([A-Z]\s*\d+[A-Z]?\)\s*$', '', name)
    # Common German vegetable/herb type prefixes to strip.
    # Each pattern ends in \s+, so a prefix is only removed when more text
    # follows it.
    prefixes = [
        # Tomatoes
        r'(?:Normal(?:früchtige)?|Fleisch|Cherry|Balkon|Wild|Freiland|Roma|Ochsenherz|'
        r'Cocktail|Dattel|Mini|Snack|Stab|Busch|Salat|Zwerg)[\s-]*[Tt]omate\s+',
        # Beans
        r'(?:Busch|Stangen|Dicke|Feuer|Spaghetti)[\s-]*[Bb]ohne\s+',
        r'Edamame(?:-Sojabohne)?\s+',
        # Peas
        r'(?:Mark|Schal|Zucker|Pal)[\s-]*[Ee]rbse\s+',
        # Cucurbits
        r'(?:Salat|Einlege|Gewürz|Freiland|Schlangen)[\s-]*[Gg]urke\s+',
        r'Zucchini\s+',
        r'Kürbis\s+',
        r'(?:Wasser)?[Mm]elone\s+',
        # Brassicas
        r'(?:Blumen|Grün|Rot|Weiß|Rosen)[\s-]*[Kk]ohl\s+',
        r'Kohlrabi\s+',
        r'Wirsing\s+',
        r'Brokkoli\s+',
        r'Chinakohl\s+',
        r'Pak\s+Choi\s+',
        r'Kohlrübe\s+',
        r'Mai-/Herbstrüben?(?:/Navets)?\s+',
        # Root vegetables
        r'Möhre\s+',
        r'Karotten?(?:\s*-?\s*Mix)?\s+',
        r'Pastinake\s+',
        r'Radies(?:chen)?\s+',
        r'Rettich\s+',
        r'Schwarzwurzel\s+',
        r'Haferwurzel\s+',
        r'Petersilienwurzel\s+',
        # Beets
        r'(?:Rote|Gelbe|Weiße)\s+Bete?\s+',
        r'Mangold\s+',
        # Lettuce & leafy
        r'(?:Kopf|Eichblatt|Batavia|Eis|Lollo|Romana|Baby-Leaf)[\s-]*[Ss]alat\s+',
        r'Feldsalat\s+',
        r'Endivie\s+',
        r'Asia[\s-]*Salat\s+',
        r'Spinat\s+',
        # Alliums
        r'Zwiebel\s+',
        r'Lauchzwiebel\s+',
        r'Porree(?:/Lauch)?\s+',
        r'Schnittlauch\s+',
        r'Schnittknoblauch\s+',
        # Peppers
        r'(?:Gemüse|Block|Spitz|Papier)[\s-]*[Pp]aprika\s+',
        r'Chili\s+',
        # Celery
        r'(?:Knollen|Stangen|Bleich|Schnitt)[\s-]*[Ss]ellerie\s+',
        # Herbs
        r'Basilikum\s+',
        r'Koriander\s+',
        r'Dill\s+',
        r'Petersilie\s+',
        r'(?:Knollen|Gewürz)[\s-]*[Ff]enchel\s+',
        r'Salbei\s+',
        r'Thymian\s+',
        r'Oregano\s+',
        r'Lavendel\s+',
        r'Melisse\s+',
        r'Majoran\s+',
        r'Estragon\s+',
        r'Kresse\s+',
        r'Bohnenkraut\s+',
        r'Borretsch\s+',
        r'Kümmel\s+',
        r'Kerbel\s+',
        r'Liebstock\s+',
        r'Ysop\s+',
        r'Pimpinelle\s+',
        r'Beifuß\s+',
        r'Schwarzkümmel\s+',
        # Other
        r'Zuckermais\s+',
        r'Artischocke\s+',
        r'Physalis\s+',
        r'Aubergine\s+',
        r'Catalogna\s+',
    ]
    # Apply every pattern in turn to whatever is left of the name.
    for prefix in prefixes:
        name = re.sub(r'^' + prefix, '', name, flags=re.IGNORECASE)
    # Shop names often wrap the variety in quotes; drop them.
    name = name.strip().strip("'\"")
    return name
 # ── API data caches ───────────────────────────────────────────────────────
 # Module-level lookup tables, filled by load_api_data() and extended when
 # records are created or re-fetched during the run.
 species_cache = {}      # scientific_name_lower -> {id, name_scientific, ...}
 family_cache = {}       # name_scientific_lower -> {id, name_scientific}
 cultivar_cache = {}     # slug -> {id, name, species_id, ...}
 # ID of the Bingenheimer supplier record; None until load_api_data() sets it.
 supplier_id = None
 def _iter_api_pages(path: str, page_size: int = 100):
    """Yield every record of a paginated HerbAPI collection.

    Pages are requested one after another until one comes back with fewer
    than *page_size* records.
    """
    page = 1
    while True:
        records = api_get(path, {"per_page": page_size, "page": page})["data"]
        yield from records
        if len(records) < page_size:
            return
        page += 1

 def load_api_data():
    """Load all existing data from HerbAPI for matching.

    Fills ``family_cache`` and ``species_cache`` (keyed by lower-cased
    scientific name) and ``cultivar_cache`` (keyed by slug), then sets the
    global ``supplier_id`` to the supplier whose name contains
    "bingenheimer", creating that supplier if it does not exist yet.

    Exits the process with status 1 when the supplier cannot be created.
    """
    global supplier_id
    print("Loading existing HerbAPI data...")
    # Load families
    for family in _iter_api_pages("/families"):
        family_cache[family["name_scientific"].lower()] = family
    print(f"  Loaded {len(family_cache)} families")
    # Load species
    for species in _iter_api_pages("/species"):
        species_cache[species["name_scientific"].lower()] = species
    print(f"  Loaded {len(species_cache)} species")
    # Load ALL cultivars (slug + id + name + species_id)
    for cultivar in _iter_api_pages("/cultivars"):
        cultivar_cache[cultivar["slug"]] = {
            "id": cultivar["id"],
            "name": cultivar["name"],
            "species_id": cultivar["species_id"],
        }
    print(f"  Loaded {len(cultivar_cache)} cultivars")
    # Create or find Bingenheimer supplier
    resp = api_get("/suppliers")
    # Accept a bare list as well as the {"data": [...]} envelope the other
    # collections use, instead of iterating over the envelope's keys.
    suppliers = resp["data"] if isinstance(resp, dict) else resp
    for supplier in suppliers:
        if "bingenheimer" in supplier["name"].lower():
            supplier_id = supplier["id"]
            print(f"  Found existing supplier: {supplier['name']} ({supplier['id']})")
            break
    if not supplier_id:
        print("  Creating Bingenheimer Saatgut supplier...")
        created, _code = api_post("/suppliers", {
            "name": "Bingenheimer Saatgut",
            "url": "https://www.bingenheimersaatgut.de",
            "country": "DE",
            "is_organic": True,
            "is_demeter": True,
            "notes": "German biodynamic seed company, Demeter certified, open-pollinated varieties"
        })
        if "id" in created:
            supplier_id = created["id"]
            print(f"  Created supplier: {created['id']}")
        else:
            # Nothing can be linked without a supplier record, so stop here.
            print(f"  ERROR creating supplier: {created}")
            sys.exit(1)
 def find_or_create_species(latin_name: str) -> Optional[str]:
    """Find a species by Latin name or create it. Returns the species ID.

    Lookup order against ``species_cache`` (all case-insensitive): the full
    name, the first two words (drops var./subsp. suffixes), then a table of
    synonyms. If nothing matches, the species is created under the family
    derived from its genus; when creation is rejected the species list is
    re-fetched in case the record already exists.

    Returns None when the name is empty or blank, the genus is not in the
    family table, or the family/species could not be found or created.
    """
    if not latin_name:
        return None
    name = latin_name.strip()
    if not name:
        # Whitespace-only input has no genus to look up.
        return None
    key = name.lower()
    # Direct match
    if key in species_cache:
        return species_cache[key]["id"]
    # Try without subspecies/variety
    base = " ".join(key.split()[:2])
    if base in species_cache:
        return species_cache[base]["id"]
    # Handle synonyms
    synonyms = {
        "lycopersicon esculentum": "solanum lycopersicum",
        "capsicum annuum var. annuum": "capsicum annuum",
        "brassica oleracea var. botrytis": "brassica oleracea",
        "brassica oleracea var. italica": "brassica oleracea",
        "brassica oleracea var. gemmifera": "brassica oleracea",
        "brassica oleracea var. gongylodes": "brassica oleracea",
        "brassica oleracea var. capitata": "brassica oleracea",
        "brassica oleracea var. sabauda": "brassica oleracea",
        "brassica oleracea var. sabellica": "brassica oleracea",
        "brassica rapa var. rapa": "brassica rapa",
        "brassica rapa subsp. pekinensis": "brassica rapa",
        "brassica rapa subsp. chinensis": "brassica rapa",
        "beta vulgaris var. conditiva": "beta vulgaris",
        "beta vulgaris subsp. vulgaris": "beta vulgaris",
        "beta vulgaris var. vulgaris": "beta vulgaris",
        # NOTE(review): the next three entries are not botanical synonyms —
        # they file leek under onion and marjoram under oregano. Left as-is
        # because changing them alters where cultivars are attached; confirm
        # whether this is an intended workaround.
        "allium porrum": "allium cepa",
        "allium ampeloprasum": "allium cepa",
        "origanum majorana": "origanum vulgare",
        "cichorium intybus var. foliosum": "cichorium intybus",
        "petroselinum crispum var. tuberosum": "petroselinum crispum",
        "apium graveolens var. rapaceum": "apium graveolens",
        "apium graveolens var. dulce": "apium graveolens",
        "lactuca sativa var. capitata": "lactuca sativa",
        "lactuca sativa var. crispa": "lactuca sativa",
        "lactuca sativa var. longifolia": "lactuca sativa",
    }
    if key in synonyms:
        syn_key = synonyms[key]
        if syn_key in species_cache:
            return species_cache[syn_key]["id"]
    # Try to create the species
    genus = name.split()[0]
    family_map = {
        "Solanum": "Solanaceae", "Capsicum": "Solanaceae", "Physalis": "Solanaceae",
        "Nicandra": "Solanaceae",
        "Cucumis": "Cucurbitaceae", "Cucurbita": "Cucurbitaceae", "Citrullus": "Cucurbitaceae",
        "Phaseolus": "Fabaceae", "Pisum": "Fabaceae", "Vicia": "Fabaceae",
        "Glycine": "Fabaceae", "Lens": "Fabaceae", "Lupinus": "Fabaceae",
        "Trifolium": "Fabaceae", "Medicago": "Fabaceae", "Vigna": "Fabaceae",
        "Brassica": "Brassicaceae", "Raphanus": "Brassicaceae", "Eruca": "Brassicaceae",
        "Lepidium": "Brassicaceae", "Nasturtium": "Brassicaceae", "Barbarea": "Brassicaceae",
        "Sinapis": "Brassicaceae", "Crambe": "Brassicaceae", "Diplotaxis": "Brassicaceae",
        "Allium": "Amaryllidaceae",
        "Daucus": "Apiaceae", "Petroselinum": "Apiaceae", "Apium": "Apiaceae",
        "Foeniculum": "Apiaceae", "Pastinaca": "Apiaceae", "Coriandrum": "Apiaceae",
        "Anethum": "Apiaceae", "Levisticum": "Apiaceae", "Anthriscus": "Apiaceae",
        "Carum": "Apiaceae", "Myrrhis": "Apiaceae", "Pimpinella": "Apiaceae",
        "Sanguisorba": "Rosaceae",
        "Lactuca": "Asteraceae", "Cichorium": "Asteraceae", "Cynara": "Asteraceae",
        "Helianthus": "Asteraceae", "Calendula": "Asteraceae", "Tagetes": "Asteraceae",
        "Scorzonera": "Asteraceae", "Tragopogon": "Asteraceae", "Glebionis": "Asteraceae",
        "Artemisia": "Asteraceae",
        "Beta": "Chenopodiaceae", "Spinacia": "Chenopodiaceae",
        "Atriplex": "Chenopodiaceae", "Chenopodium": "Chenopodiaceae",
        "Ocimum": "Lamiaceae", "Origanum": "Lamiaceae", "Thymus": "Lamiaceae",
        "Salvia": "Lamiaceae", "Melissa": "Lamiaceae", "Lavandula": "Lamiaceae",
        "Satureja": "Lamiaceae", "Hyssopus": "Lamiaceae", "Rosmarinus": "Lamiaceae",
        "Mentha": "Lamiaceae",
        "Zea": "Poaceae",
        "Borago": "Boraginaceae", "Phacelia": "Boraginaceae",
        "Valerianella": "Caprifoliaceae",
        "Tropaeolum": "Tropaeolaceae",
        "Rumex": "Polygonaceae",
        "Nigella": "Ranunculaceae",
        "Claytonia": "Montiaceae",
        "Tetragonia": "Aizoaceae",
        "Basella": "Basellaceae",
        "Plantago": "Plantaginaceae",
    }
    family_name = family_map.get(genus)
    if not family_name:
        print(f"    WARNING: Unknown genus '{genus}' for species '{name}'")
        stats["species_not_matched"].append(name)
        return None
    family_id = find_or_create_family(family_name)
    if not family_id:
        return None
    print(f"    Creating species: {name}")
    resp, code = api_post("/species", {
        "name_scientific": name,
        "family_id": family_id,
    })
    if "id" in resp:
        # Store under the same normalised key the lookups above use.
        species_cache[key] = resp
        stats["species_created"] += 1
        return resp["id"]
    # Creation was rejected; the species might already exist, so reload the
    # whole species list and look again.
    print(f"    Species creation returned {code}: {str(resp.get('error', ''))[:100]}")
    page = 1
    while True:
        batch = api_get("/species", {"per_page": 100, "page": page})["data"]
        for s in batch:
            species_cache[s["name_scientific"].lower()] = s
        if len(batch) < 100:
            break
        page += 1
    if key in species_cache:
        return species_cache[key]["id"]
    stats["errors"].append(f"Species creation failed: {name}")
    return None
 def find_or_create_family(family_name: str) -> Optional[str]:
    """Find or create a plant family. Returns the family ID.

    The family is looked up in ``family_cache`` by lower-cased scientific
    name. On a miss it is created through the API; if creation is rejected
    the family list is re-fetched page by page in case the record already
    exists. Returns None (and records an error in ``stats``) when the family
    can neither be created nor found.
    """
    key = family_name.lower()
    if key in family_cache:
        return family_cache[key]["id"]
    print(f"    Creating family: {family_name}")
    resp, code = api_post("/families", {"name_scientific": family_name})
    if "id" in resp:
        family_cache[key] = resp
        stats["families_created"] += 1
        return resp["id"]
    # Creation was rejected; report it and reload all families, paginated
    # like the initial load so no page beyond the first is missed.
    print(f"    Family creation returned {code}: {str(resp.get('error', ''))[:100]}")
    page = 1
    while True:
        batch = api_get("/families", {"per_page": 100, "page": page})["data"]
        for fam in batch:
            family_cache[fam["name_scientific"].lower()] = fam
        if len(batch) < 100:
            break
        page += 1
    if key in family_cache:
        return family_cache[key]["id"]
    stats["errors"].append(f"Family creation failed: {family_name}")
    return None
 def slugify(text: str) -> str:
    """Turn *text* into a lower-case, hyphen-separated ASCII slug.

    Accented letters listed in the table are transliterated, every other
    character outside ``a-z``, ``0-9``, whitespace and ``-`` is dropped,
    whitespace runs become a single hyphen, and leading/trailing hyphens are
    removed.
    """
    ascii_map = str.maketrans({
        "ä": "a", "ö": "o", "ü": "u", "ß": "ss",
        "é": "e", "è": "e", "ê": "e", "ë": "e",
        "à": "a", "â": "a", "á": "a",
        "ô": "o", "ù": "u", "û": "u", "ú": "u",
        "ï": "i", "î": "i", "í": "i",
        "ç": "c", "ñ": "n", "ó": "o",
        "œ": "oe", "æ": "ae",
    })
    slug = text.lower().translate(ascii_map)
    slug = re.sub(r'[^a-z0-9\s-]', '', slug).strip()
    slug = re.sub(r'-+', '-', re.sub(r'[\s]+', '-', slug))
    return slug.strip('-')
 def find_existing_cultivar(species_name: str, variety_name: str, species_id: str) -> Optional[str]:
    """Look up an already-known cultivar in ``cultivar_cache``.

    The slug built from "<species_name> <variety_name>" is tried first; if
    it is unknown, the cache is searched for an entry of the same species
    whose name equals *variety_name* ignoring case.
    Returns the cultivar ID, or None when there is no match.
    """
    by_slug = cultivar_cache.get(slugify(f"{species_name} {variety_name}"))
    if by_slug is not None:
        return by_slug["id"]
    wanted = variety_name.lower()
    return next(
        (
            entry["id"]
            for entry in cultivar_cache.values()
            if entry["species_id"] == species_id and entry["name"].lower() == wanted
        ),
        None,
    )
 def scrape_category(cat_path: str, default_species: Optional[str]):
    """Scrape one category listing page and process each product on it.

    The listing is fetched from "{SITE_BASE}/de/bio-saatgut/{cat_path}.html".
    An empty page (which fetch_page returns for a 404) is reported and
    skipped. Otherwise the product and category counters in ``stats`` are
    updated and each product link is handed to process_product() together
    with *default_species*.
    """
    category_url = f"{SITE_BASE}/de/bio-saatgut/{cat_path}.html"
    print(f"\n{'='*60}")
    print(f"Category: {cat_path}")
    listing_html = fetch_page(category_url)
    if listing_html:
        time.sleep(DELAY)
        product_links = parse_product_links(listing_html)
        print(f"  Found {len(product_links)} products")
        stats["products_found"] += len(product_links)
        stats["categories_scraped"] += 1
        for link, title in product_links:
            process_product(link, title, default_species)
    else:
        print("  SKIP: Page not found (404)")
 def process_product(prod_url: str, prod_name: str, default_species: Optional[str]):
    """Process a single product: fetch detail, extract data, create cultivar."""
    article_number = extract_article_number(prod_name, prod_url)
    variety_name = extract_variety_name(prod_name)
    if not variety_name:
        print(f"  SKIP (no variety): {prod_name}")
        return
    # Skip mixes, sets, bundles
    skip_keywords = ["mischung", "saatscheibe", "saatband", "saatplatte",
                     "saat-set", " mix ", "trio ", "quartett", "gutschein",
                     "buch ", "düngung", "erde ", "-garten"]
    name_lower = prod_name.lower()
    # Exception: if the variety name itself is the whole thing, keep it
    if any(kw in name_lower for kw in skip_keywords) and variety_name.lower() != prod_name.lower():
        # Only skip if it really seems like a mix
        if "mischung" in name_lower or "mix" in name_lower or "trio" in name_lower:
            print(f"  SKIP (mix/set): {prod_name}")
            return
    print(f"\n  Product: {prod_name}")
    print(f"    Variety: {variety_name}, SKU: {article_number}")
    # Fetch detail page
    latin_name = None
    description = ""
    time.sleep(DELAY)
    try:
        detail_html = fetch_page(prod_url)
        stats["detail_pages_fetched"] += 1
        if detail_html:
            latin_name = extract_latin_from_detail(detail_html)
            description = extract_description_from_detail(detail_html)
    except Exception as e:
        print(f"    WARNING: Detail page error: {e}")
    species_name = latin_name or default_species
    if not species_name:
        print(f"    SKIP: No species for '{prod_name}'")
        stats["species_not_matched"].append(prod_name)
        return
    print(f"    Species: {species_name}")
    species_id = find_or_create_species(species_name)
    if not species_id:
        print(f"    SKIP: Could not resolve species '{species_name}'")
        return
    # Check if cultivar already exists
    existing_id = find_existing_cultivar(species_name, variety_name, species_id)
    cultivar_id = None
    if existing_id:
        cultivar_id = existing_id
        print(f"    EXISTS: cultivar already in DB")
        stats["cultivars_existed"] += 1
    else:
        # Create cultivar
        data = {
            "species_id": species_id,
            "name": variety_name,
            "name_de": variety_name,
            "is_organic": True,
        }
        if description:
            data["description"] = description
        resp, code = api_post("/cultivars", data)
        if "id" in resp:
            cultivar_id = resp["id"]
            cultivar_cache[resp["slug"]] = {
                "id": resp["id"],
                "name": variety_name,
                "species_id": species_id,
            }
            stats["cultivars_created"] += 1
            print(f"    CREATED: {resp['slug']}")
        elif code == 500 and "Database error" in str(resp.get("error", "")):
            # Likely slug conflict - try to find existing
            print(f"    DB conflict - searching for existing cultivar...")
            # Reload cultivars for this species
            page = 1
            while True:
                r = api_get("/cultivars", {"per_page": 100, "page": page})
                for c in r["data"]:
                    cultivar_cache[c["slug"]] = {
                        "id": c["id"],
                        "name": c["name"],
                        "species_id": c["species_id"],
                    }
                    if c["species_id"] == species_id and c["name"].lower() == variety_name.lower():
                        cultivar_id = c["id"]
                if cultivar_id or len(r["data"]) < 100:
                    break
                page += 1
            if cultivar_id:
                print(f"    Found existing after conflict: {cultivar_id}")
                stats["cultivars_existed"] += 1
            else:
                print(f"    ERROR: DB error and could not find existing cultivar")
                stats["errors"].append(f"DB error + not found: {species_name} / {variety_name}")
                return
        else:
            print(f"    ERROR ({code}): {str(resp.get('error',''))[:100]}")
            stats["errors"].append(f"Create failed: {variety_name}: {resp.get('error','')[:80]}")
            return
    # Link to supplier
    if cultivar_id and supplier_id:
        link_data = {
            "supplier_id": supplier_id,
            "product_url": prod_url,
        }
        if article_number:
            link_data["article_number"] = article_number
        resp, code = api_post(f"/cultivars/{cultivar_id}/suppliers", link_data)
        if "id" in resp:
            stats["supplier_links_created"] += 1
            print(f"    LINKED (SKU: {article_number})")
        elif code == 500 or "already" in str(resp.get("error", "")).lower():
            stats["supplier_links_existed"] += 1
            print(f"    LINK EXISTS")
        else:
            print(f"    LINK ERROR ({code}): {str(resp.get('error',''))[:80]}")
            stats["errors"].append(f"Link failed: {variety_name}: {resp.get('error','')[:60]}")
 def main():
    """Scrape every Bingenheimer category and print a summary report.

    Loads the existing HerbAPI data, then runs scrape_category() for each
    entry of ALL_CATEGORIES. An exception in one category is printed and
    recorded in stats["errors"] without stopping the remaining categories.

    Returns:
        0 when stats["errors"] is empty, 1 otherwise.
    """
    rule = "=" * 60
    print(rule)
    print("Bingenheimer Saatgut Scraper for HerbAPI")
    print(rule)
    load_api_data()
    print(f"\nScraping {len(ALL_CATEGORIES)} categories...")
    for cat_path, default_species in ALL_CATEGORIES:
        try:
            scrape_category(cat_path, default_species)
        except Exception as exc:
            # Keep going: one broken category must not abort the whole run.
            print(f"  ERROR in category {cat_path}: {exc}")
            stats["errors"].append(f"Category error: {cat_path}: {exc}")
    # Summary
    print("\n" + rule)
    print("SCRAPING COMPLETE - SUMMARY")
    print(rule)
    # (label padded to a fixed column, stats key) in display order.
    counter_rows = (
        ("Categories scraped:      ", "categories_scraped"),
        ("Products found:          ", "products_found"),
        ("Detail pages fetched:    ", "detail_pages_fetched"),
        ("Cultivars created:       ", "cultivars_created"),
        ("Cultivars existed:       ", "cultivars_existed"),
        ("Supplier links created:  ", "supplier_links_created"),
        ("Supplier links existed:  ", "supplier_links_existed"),
        ("Species created:         ", "species_created"),
        ("Families created:        ", "families_created"),
    )
    for label, key in counter_rows:
        print(f"{label}{stats[key]}")
    print(f"Errors:                  {len(stats['errors'])}")
    # Both detail lists are capped at 30 entries to keep the report short.
    unmatched = stats["species_not_matched"]
    if unmatched:
        print(f"\nUnmatched species ({len(unmatched)}):")
        for entry in unmatched[:30]:
            print(f"  - {entry}")
    failures = stats["errors"]
    if failures:
        print(f"\nErrors ({len(failures)}):")
        for entry in failures[:30]:
            print(f"  - {entry}")
    return 1 if stats["errors"] else 0
 # Script entry point: main()'s return value becomes the process exit status.
 if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,760 @@
 #!/usr/bin/env python3
 """
 Scraper for Dreschflegel organic seed catalog (dreschflegel-saatgut.de).
 Extracts cultivar data and imports into HerbAPI.
 Run 2 - fixes pagination (API caps at 100/page), better species matching,
 caches scraped products, handles duplicates gracefully.
 """
 import urllib.request
 import urllib.parse
 import urllib.error
 import gzip
 import json
 import re
 import time
 import sys
 import os
 import html as html_mod
 from collections import defaultdict
 # --- Configuration ---
 # The API endpoint and token can be overridden through the environment
 # (HERBAPI_BASE / HERBAPI_TOKEN); the literals below are only fallbacks.
 # NOTE(review): the fallback token is a credential committed to the
 # repository - rotate it and drop the default once HERBAPI_TOKEN is set
 # wherever this script runs.
 API_BASE = os.environ.get("HERBAPI_BASE", "http://herbapi01.corp.sub-net.at:8080/api/v1")
 API_TOKEN = os.environ.get("HERBAPI_TOKEN", "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk")
 SITE_BASE = "https://www.dreschflegel-saatgut.de"
 # Seconds slept before each product-page download.
 DELAY = 0.5
 USER_AGENT = "Mozilla/5.0 (compatible; HerbAPI-Scraper/1.0)"
 # JSON file mapping scraped URL -> parsed product (or null).
 CACHE_FILE = "/tmp/dreschflegel_products_cache.json"
 # Line-buffered output so progress lines appear as soon as they are printed
 sys.stdout.reconfigure(line_buffering=True)
 sys.stderr.reconfigure(line_buffering=True)
 # Run counters; unknown keys start at 0.
 stats = defaultdict(int)
 def api_request(method, path, data=None):
    """Send a JSON request to HerbAPI and return the decoded response.

    Args:
        method: HTTP verb, e.g. "GET" or "POST".
        path: Path (and optional query string) appended to API_BASE.
        data: JSON-serialisable payload, or None for no request body.

    Returns:
        The decoded JSON response, or None on any failure. Rejections
        that look like duplicates (HTTP 409, "already exists" or
        "duplicate" in the body, or a 500 "database error", which is
        assumed to be a unique-constraint violation) return None
        silently; every other failure is printed first.
    """
    url = f"{API_BASE}{path}"
    # "is not None" so that an empty payload ({} or []) is still sent.
    body = json.dumps(data).encode("utf-8") if data is not None else None
    req = urllib.request.Request(url, data=body, method=method)
    req.add_header("Authorization", f"Bearer {API_TOKEN}")
    req.add_header("Content-Type", "application/json")
    req.add_header("Accept", "application/json")
    try:
        # The timeout keeps a stalled API from hanging the whole import;
        # the context manager closes the connection on every path.
        with urllib.request.urlopen(req, timeout=30) as resp:
            return json.loads(resp.read().decode("utf-8"))
    except urllib.error.HTTPError as e:
        body_text = e.read().decode("utf-8", errors="replace")
        lowered = body_text.lower()
        looks_like_duplicate = (
            e.code == 409
            or "already exists" in lowered
            or "duplicate" in lowered
            or (e.code == 500 and "database error" in lowered)
        )
        if not looks_like_duplicate:
            print(f"  API error {e.code} {method} {path}: {body_text[:200]}")
        return None
    except (OSError, ValueError) as e:
        # OSError: unreachable host, reset connection, timeout.
        # ValueError: response body that is not valid UTF-8 JSON.
        print(f"  API request failed {method} {path}: {e}")
        return None
 def fetch_page(url):
    """Download *url* and return its body as text, or None on any error.

    Sends USER_AGENT, waits at most 30 seconds and decodes the body as
    UTF-8 with undecodable bytes replaced. This function does not sleep;
    callers that need rate limiting must wait before calling it.
    """
    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        # Context manager so the connection is released even if read() fails.
        with urllib.request.urlopen(req, timeout=30) as resp:
            return resp.read().decode("utf-8", errors="replace")
    except Exception as e:
        # Deliberately broad: scraping is best-effort, and any failure is
        # reported to the caller as "no page".
        print(f"  Fetch error {url}: {e}")
        return None
 def get_sitemap_urls():
    """Collect every <loc> URL from the site's gzip-compressed sitemaps.

    Fetches SITE_BASE/sitemap.xml, treats each <loc> in it as a child
    sitemap and downloads those ending in ".xml.gz"; other entries are
    ignored. A child sitemap that cannot be downloaded or decompressed
    is reported and skipped.

    Returns:
        The URLs found in the child sitemaps, or an empty list when the
        sitemap index could not be fetched.
    """
    print("Fetching sitemap index...")
    index_xml = fetch_page(f"{SITE_BASE}/sitemap.xml")
    if not index_xml:
        return []
    loc_pattern = re.compile(r"<loc>(.*?)</loc>")
    all_urls = []
    for smap_url in loc_pattern.findall(index_xml):
        if not smap_url.endswith(".xml.gz"):
            continue
        print("  Fetching compressed sitemap...")
        req = urllib.request.Request(smap_url, headers={"User-Agent": USER_AGENT})
        try:
            # Context manager so the connection is closed on every path.
            with urllib.request.urlopen(req, timeout=30) as resp:
                data = gzip.decompress(resp.read()).decode("utf-8")
        except Exception as e:
            print(f"    Error: {e}")
            continue
        urls = loc_pattern.findall(data)
        all_urls.extend(urls)
        print(f"    Found {len(urls)} URLs")
    return all_urls
 def classify_urls(urls):
    """Keep only the URLs that look like product pages.

    A URL is kept when, after dropping trailing slashes and the site
    origin, exactly one path segment remains and that segment does not
    begin with one of the known non-product prefixes.

    Args:
        urls: Iterable of absolute URLs taken from the sitemap.

    Returns:
        The kept URLs, trailing slashes removed, in input order.
    """
    non_product_prefixes = (
        "impressum", "agb", "datenschutz", "kontakt", "widerrufs",
        "versand", "abkuerz", "zertifikat", "wichtige-hinweise",
        "muster-", "gutscheine", "kalender", "flyer", "katalog",
        "sommer-herbst", "unsere-hoefe", "bestellschein",
        "dreschflegel-news", "termine", "rezepte", "anbautipps",
        "tipps-zur", "gartentelefon", "gartenfreude", "buecher",
        "navigation", "vielfalt", "sut20", "saatgut",
        "neuheiten", "kennenlernangebote", "sut25", "vielfalt25",
        "saatgut-vielfalt", "saat",
    )
    site_origins = (
        "https://dreschflegel-saatgut.de/",
        "https://www.dreschflegel-saatgut.de/",
    )
    kept = []
    for raw_url in urls:
        trimmed = raw_url.rstrip("/")
        segment = trimmed
        for origin in site_origins:
            segment = segment.replace(origin, "")
        # Anything still containing "/" is nested (or a foreign origin).
        is_single_segment = bool(segment) and "/" not in segment
        if is_single_segment and not segment.startswith(non_product_prefixes):
            kept.append(trimmed)
    return kept
 def parse_product_page(html_content):
    """Pull the product fields out of a Dreschflegel product page.

    Args:
        html_content: Page HTML; may be None or empty.

    Returns:
        A dict with "name" and "botanical_name" plus, when present on the
        page, "article_number", "price" (float), "description" and
        "pack_info". None when the page has no botname marker or when
        either the name or the botanical name cannot be extracted.
    """
    if not html_content:
        return None
    if 'class="botname"' not in html_content:
        return None

    def tidy(fragment):
        # Trim surrounding whitespace, then decode HTML entities.
        return html_mod.unescape(fragment.strip())

    fields = {}
    heading = re.search(r"<h1>(.*?)</h1>", html_content)
    if heading:
        fields["name"] = tidy(heading.group(1))
    botname = re.search(r'<div class="botname">\s*(.*?)\s*</div>', html_content, re.DOTALL)
    if botname:
        fields["botanical_name"] = tidy(botname.group(1))
    order_no = re.search(
        r'class="product-detail-ordernumber"[^>]*>\s*(\d+)',
        html_content,
        re.DOTALL,
    )
    if order_no:
        fields["article_number"] = order_no.group(1)
    price = re.search(r'itemprop="price"[^>]*content="([^"]+)"', html_content)
    if price:
        try:
            fields["price"] = float(price.group(1))
        except ValueError:
            # Non-numeric price attribute: leave the price out.
            pass
    description = re.search(
        r"product-detail-description-text.*?<p>(.*?)</p>",
        html_content,
        re.DOTALL,
    )
    if description:
        # Strip inline tags before decoding entities.
        plain = re.sub(r"<[^>]+>", "", description.group(1).strip())
        plain = html_mod.unescape(plain).strip()
        if plain:
            fields["description"] = plain
    pack = re.search(r"Inhalt reicht f[üu]r:</th><td>\s*(.*?)\s*</td>", html_content)
    if pack:
        fields["pack_info"] = tidy(pack.group(1))
    if "name" in fields and "botanical_name" in fields:
        return fields
    return None
 def scrape_all_products(candidate_urls):
    """Scrape product pages, reusing results cached in CACHE_FILE.

    The cache maps URL -> parsed product dict, or None for a page that
    was downloaded but is not a product page. Failed downloads are not
    cached, so a transient network error is retried on the next run
    instead of permanently hiding the product. Caches written by earlier
    versions of this function may still hold None for failed downloads;
    delete CACHE_FILE to force a full re-scrape.

    Args:
        candidate_urls: URLs to consider.

    Returns:
        The product dicts for all candidate URLs that are product pages,
        cached ones first, then newly scraped ones.
    """
    # Load cache; an unreadable or malformed cache only costs a re-scrape.
    cache = {}
    if os.path.exists(CACHE_FILE):
        try:
            with open(CACHE_FILE, "r", encoding="utf-8") as f:
                loaded = json.load(f)
        except (OSError, ValueError) as e:
            print(f"  Ignoring unreadable cache {CACHE_FILE}: {e}")
        else:
            if isinstance(loaded, dict):
                cache = loaded
                print(f"  Loaded {len(cache)} cached products")
            else:
                print(f"  Ignoring malformed cache {CACHE_FILE}")

    def save_cache():
        # Write to a sibling temp file and rename it into place so an
        # interrupted run never leaves a truncated cache behind.
        tmp_path = CACHE_FILE + ".tmp"
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(cache, f)
        os.replace(tmp_path, CACHE_FILE)

    products = []
    to_fetch = [u for u in candidate_urls if u not in cache]
    already_cached = [u for u in candidate_urls if u in cache]
    # Add cached products
    for u in already_cached:
        if cache[u]:  # None means "not a product page"
            products.append(cache[u])
    cached_products = len(products)
    cached_non_products = len(already_cached) - cached_products
    print(f"  {cached_products} products from cache, "
          f"{cached_non_products} non-products cached, "
          f"{len(to_fetch)} to fetch")
    for i, url in enumerate(to_fetch):
        if (i + 1) % 50 == 0 or i == 0:
            print(f"  Fetching {i + 1}/{len(to_fetch)}...")
        time.sleep(DELAY)
        html_content = fetch_page(url)
        if not html_content:
            # Download failed: count it, but leave the URL out of the
            # cache so the next run tries again.
            stats["fetch_errors"] += 1
            continue
        product = parse_product_page(html_content)
        if product:
            product["url"] = url
            products.append(product)
            cache[url] = product
            stats["products_scraped"] += 1
        else:
            cache[url] = None
            stats["not_product_pages"] += 1
        # Save cache periodically
        if (i + 1) % 100 == 0:
            save_cache()
    # Final cache save
    save_cache()
    print(f"  Total: {len(products)} products ({stats['products_scraped']} newly scraped)")
    return products
 def paginated_get(path):
    """Collect the "data" items from every page of a HerbAPI list endpoint.

    Requests 100 items per page starting at page 1 and stops at the first
    response that is empty, lacks "data", or holds fewer than 100 items.

    Args:
        path: Endpoint path; may already carry a query string.

    Returns:
        All items gathered up to that point as one list (possibly empty).
    """
    joiner = "&" if "?" in path else "?"
    collected = []
    page_no = 1
    while True:
        reply = api_request("GET", f"{path}{joiner}per_page=100&page={page_no}")
        if not reply or "data" not in reply:
            return collected
        batch = reply["data"]
        if not batch:
            return collected
        collected.extend(batch)
        # A short page is the last one.
        if len(batch) < 100:
            return collected
        page_no += 1
 def load_api_data():
    """Fetch families, species and cultivars from HerbAPI into lookup dicts.

    Returns:
        A tuple (families, species, cultivars): families keyed by
        lower-cased scientific name, species keyed by lower-cased and
        stripped scientific name, cultivars keyed by
        (species_id, lower-cased and stripped cultivar name). When two
        records share a key, the later one wins.
    """
    print("Loading HerbAPI data...")
    families = {
        rec["name_scientific"].lower(): rec
        for rec in paginated_get("/families")
    }
    print(f"  {len(families)} families")
    species = {
        rec["name_scientific"].lower().strip(): rec
        for rec in paginated_get("/species")
    }
    print(f"  {len(species)} species")
    cultivars = {
        (rec["species_id"], rec["name"].lower().strip()): rec
        for rec in paginated_get("/cultivars")
    }
    print(f"  {len(cultivars)} cultivars")
    return families, species, cultivars
 def ensure_supplier():
    """Return the Dreschflegel supplier record, creating it when absent.

    Looks through the /suppliers listing for the first entry whose name
    contains "dreschflegel" (case-insensitive); if none is found, posts a
    new supplier.

    Returns:
        The supplier record, or None when it neither exists nor could be
        created.
    """
    listing = api_request("GET", "/suppliers")
    if listing:
        existing = next(
            (entry for entry in listing if "dreschflegel" in entry["name"].lower()),
            None,
        )
        if existing is not None:
            print(f"  Supplier exists: {existing['name']} ({existing['id']})")
            return existing
    created = api_request("POST", "/suppliers", {
        "name": "Dreschflegel",
        "url": "https://www.dreschflegel-saatgut.de",
        "country": "DE",
        "is_organic": True,
        "is_demeter": False,
        "notes": "German organic seed cooperative, open-pollinated heritage varieties",
    })
    if created:
        print(f"  Created supplier: {created['name']} ({created['id']})")
    return created
 # Genus → family mapping for species creation.
 # Keys are genus names exactly as they appear as the first word of a
 # botanical name (case-sensitive); values are scientific family names.
 # Each genus is listed once - a repeated key would silently override the
 # earlier entry.
 GENUS_TO_FAMILY = {
    # Asteraceae
    "Achillea": "Asteraceae", "Artemisia": "Asteraceae", "Aster": "Asteraceae",
    "Calendula": "Asteraceae", "Carthamus": "Asteraceae", "Centaurea": "Asteraceae",
    "Chamomilla": "Asteraceae", "Chrysanthemum": "Asteraceae", "Cichorium": "Asteraceae",
    "Cnicus": "Asteraceae", "Cosmos": "Asteraceae", "Cynara": "Asteraceae",
    "Dahlia": "Asteraceae", "Dimorphotheca": "Asteraceae", "Echinacea": "Asteraceae",
    "Echinops": "Asteraceae", "Erigeron": "Asteraceae", "Eupatorium": "Asteraceae",
    "Gaillardia": "Asteraceae", "Helenium": "Asteraceae", "Helianthus": "Asteraceae",
    "Helichrysum": "Asteraceae", "Inula": "Asteraceae", "Lactuca": "Asteraceae",
    "Leontodon": "Asteraceae", "Matricaria": "Asteraceae", "Onopordum": "Asteraceae",
    "Petasites": "Asteraceae", "Rudbeckia": "Asteraceae", "Scorzonera": "Asteraceae",
    "Silphium": "Asteraceae", "Solidago": "Asteraceae", "Tagetes": "Asteraceae",
    "Tanacetum": "Asteraceae", "Taraxacum": "Asteraceae", "Telekia": "Asteraceae",
    "Tragopogon": "Asteraceae", "Tussilago": "Asteraceae", "Zinnia": "Asteraceae",
    "Xerochrysum": "Asteraceae", "Coreopsis": "Asteraceae",
    # Solanaceae
    "Capsicum": "Solanaceae", "Lycium": "Solanaceae", "Nicotiana": "Solanaceae",
    "Physalis": "Solanaceae", "Solanum": "Solanaceae", "Atropa": "Solanaceae",
    # Cucurbitaceae
    "Citrullus": "Cucurbitaceae", "Cucumis": "Cucurbitaceae", "Cucurbita": "Cucurbitaceae",
    "Luffa": "Cucurbitaceae", "Momordica": "Cucurbitaceae",
    # Fabaceae
    "Cicer": "Fabaceae", "Glycine": "Fabaceae", "Lathyrus": "Fabaceae",
    "Lens": "Fabaceae", "Lupinus": "Fabaceae", "Medicago": "Fabaceae",
    "Phaseolus": "Fabaceae", "Pisum": "Fabaceae", "Trifolium": "Fabaceae",
    "Trigonella": "Fabaceae", "Vicia": "Fabaceae", "Vigna": "Fabaceae",
    "Caragana": "Fabaceae", "Cytisus": "Fabaceae", "Robinia": "Fabaceae",
    # Brassicaceae
    "Armoracia": "Brassicaceae", "Barbarea": "Brassicaceae", "Brassica": "Brassicaceae",
    "Crambe": "Brassicaceae", "Eruca": "Brassicaceae", "Hesperis": "Brassicaceae",
    "Iberis": "Brassicaceae", "Isatis": "Brassicaceae", "Lepidium": "Brassicaceae",
    "Lunaria": "Brassicaceae", "Raphanus": "Brassicaceae", "Sinapis": "Brassicaceae",
    "Nasturtium": "Brassicaceae", "Diplotaxis": "Brassicaceae",
    # Apiaceae
    "Anethum": "Apiaceae", "Anthriscus": "Apiaceae", "Apium": "Apiaceae",
    "Carum": "Apiaceae", "Chaerophyllum": "Apiaceae", "Coriandrum": "Apiaceae",
    "Daucus": "Apiaceae", "Foeniculum": "Apiaceae", "Levisticum": "Apiaceae",
    "Myrrhis": "Apiaceae", "Pastinaca": "Apiaceae", "Petroselinum": "Apiaceae",
    "Pimpinella": "Apiaceae", "Angelica": "Apiaceae", "Aegopodium": "Apiaceae",
    # Lamiaceae
    "Agastache": "Lamiaceae", "Ajuga": "Lamiaceae", "Dracocephalum": "Lamiaceae",
    "Elsholtzia": "Lamiaceae", "Hyssopus": "Lamiaceae", "Lavandula": "Lamiaceae",
    "Melissa": "Lamiaceae", "Mentha": "Lamiaceae", "Monarda": "Lamiaceae",
    "Nepeta": "Lamiaceae", "Ocimum": "Lamiaceae", "Origanum": "Lamiaceae",
    "Perilla": "Lamiaceae", "Rosmarinus": "Lamiaceae", "Salvia": "Lamiaceae",
    "Satureja": "Lamiaceae", "Stachys": "Lamiaceae", "Thymus": "Lamiaceae",
    # Amaryllidaceae / Alliaceae
    "Allium": "Amaryllidaceae",
    # Poaceae
    "Avena": "Poaceae", "Hordeum": "Poaceae", "Panicum": "Poaceae",
    "Secale": "Poaceae", "Sorghum": "Poaceae", "Triticum": "Poaceae",
    "Zea": "Poaceae", "Setaria": "Poaceae",
    # Chenopodiaceae
    "Atriplex": "Chenopodiaceae", "Beta": "Chenopodiaceae",
    "Chenopodium": "Chenopodiaceae", "Spinacia": "Chenopodiaceae",
    # Rosaceae
    "Filipendula": "Rosaceae", "Fragaria": "Rosaceae", "Malus": "Rosaceae",
    "Prunus": "Rosaceae", "Pyrus": "Rosaceae", "Rosa": "Rosaceae",
    "Rubus": "Rosaceae", "Sanguisorba": "Rosaceae", "Sorbus": "Rosaceae",
    "Waldsteinia": "Rosaceae",
    # Boraginaceae
    "Borago": "Boraginaceae", "Phacelia": "Boraginaceae", "Symphytum": "Boraginaceae",
    "Pulmonaria": "Boraginaceae", "Myosotis": "Boraginaceae",
    # Malvaceae
    "Alcea": "Malvaceae", "Althaea": "Malvaceae", "Malva": "Malvaceae",
    "Hibiscus": "Malvaceae", "Lavatera": "Malvaceae", "Abelmoschus": "Malvaceae",
    # Polygonaceae
    "Fagopyrum": "Polygonaceae", "Rheum": "Polygonaceae", "Rumex": "Polygonaceae",
    # Caryophyllaceae
    "Agrostemma": "Caryophyllaceae", "Dianthus": "Caryophyllaceae",
    "Gypsophila": "Caryophyllaceae", "Lychnis": "Caryophyllaceae",
    "Saponaria": "Caryophyllaceae", "Silene": "Caryophyllaceae",
    # Tropaeolaceae
    "Tropaeolum": "Tropaeolaceae",
    # Papaveraceae
    "Eschscholzia": "Papaveraceae", "Papaver": "Papaveraceae",
    "Meconopsis": "Papaveraceae",
    # Caprifoliaceae
    "Valerianella": "Caprifoliaceae", "Valeriana": "Caprifoliaceae",
    "Lonicera": "Caprifoliaceae",
    # Plantaginaceae
    "Digitalis": "Plantaginaceae", "Plantago": "Plantaginaceae",
    "Antirrhinum": "Plantaginaceae", "Linaria": "Plantaginaceae",
    # Violaceae
    "Viola": "Violaceae",
    # Ranunculaceae
    "Aquilegia": "Ranunculaceae", "Consolida": "Ranunculaceae",
    "Delphinium": "Ranunculaceae", "Nigella": "Ranunculaceae",
    # Linaceae
    "Linum": "Linaceae",
    # Convolvulaceae
    "Ipomoea": "Convolvulaceae", "Convolvulus": "Convolvulaceae",
    # Portulacaceae / Montiaceae
    "Claytonia": "Montiaceae", "Portulaca": "Portulacaceae",
    # Amaranthaceae
    "Amaranthus": "Amaranthaceae", "Celosia": "Amaranthaceae",
    "Gomphrena": "Amaranthaceae",
    # Asparagaceae
    "Asparagus": "Asparagaceae",
    # Resedaceae
    "Reseda": "Resedaceae",
    # Balsaminaceae
    "Impatiens": "Balsaminaceae",
    # Hydrangeaceae
    "Hydrangea": "Hydrangeaceae",
    # Campanulaceae
    "Campanula": "Campanulaceae", "Phyteuma": "Campanulaceae",
    # Scrophulariaceae
    "Verbascum": "Scrophulariaceae",
    # Verbenaceae
    "Verbena": "Verbenaceae",
    # Onagraceae
    "Oenothera": "Onagraceae", "Clarkia": "Onagraceae",
    # Cucurbitaceae extras
    "Benincasa": "Cucurbitaceae", "Lagenaria": "Cucurbitaceae",
    # Hypericaceae
    "Hypericum": "Hypericaceae",
    # Adoxaceae
    "Sambucus": "Adoxaceae",
    # Others
    "Dipsacus": "Caprifoliaceae",
    "Knautia": "Caprifoliaceae",
    "Scabiosa": "Caprifoliaceae",
    "Succisa": "Caprifoliaceae",
    "Asclepias": "Apocynaceae",
    "Cynoglossum": "Boraginaceae",
    "Echium": "Boraginaceae",
    "Anchusa": "Boraginaceae",
    "Lithospermum": "Boraginaceae",
    "Onobrychis": "Fabaceae",
    "Ornithopus": "Fabaceae",
    "Lotus": "Fabaceae",
    "Anthyllis": "Fabaceae",
    "Melilotus": "Fabaceae",
    "Galega": "Fabaceae",
    "Lespedeza": "Fabaceae",
    "Arachis": "Fabaceae",
    "Senna": "Fabaceae",
    # Additional genera found in Dreschflegel catalog
    # NOTE(review): "Nicotinia" below looks like a misspelling of
    # "Nicotiana" (already mapped above) - presumably kept to match the
    # catalog's own spelling; confirm before removing.
    "Acmella": "Asteraceae", "Adonis": "Ranunculaceae", "Ageratum": "Asteraceae",
    "Amethystia": "Lamiaceae", "Anacyclus": "Asteraceae", "Anthemis": "Asteraceae",
    "Asphodeline": "Asphodelaceae", "Brachyscome": "Asteraceae", "Bupleurum": "Apiaceae",
    "Callistephus": "Asteraceae", "Camelina": "Brassicaceae", "Cardaria": "Brassicaceae",
    "Cardiospermum": "Sapindaceae", "Cerinthe": "Boraginaceae",
    "Chamaemelum": "Asteraceae", "Cistanthe": "Montiaceae", "Cleome": "Cleomaceae",
    "Cochlearia": "Brassicaceae", "Codonopsis": "Campanulaceae", "Coix": "Poaceae",
    "Cyperus": "Cyperaceae", "Digitaria": "Poaceae", "Dorotheanthus": "Aizoaceae",
    "Emilia": "Asteraceae", "Eragrostis": "Poaceae", "Erysimum": "Brassicaceae",
    "Euphorbia": "Euphorbiaceae", "Gentiana": "Gentianaceae", "Geum": "Rosaceae",
    "Gilia": "Polemoniaceae", "Godetia": "Onagraceae", "Helipterum": "Asteraceae",
    "Lallemantia": "Lamiaceae", "Leonurus": "Lamiaceae", "Leuzea": "Asteraceae",
    "Liatris": "Asteraceae", "Malope": "Malvaceae", "Marrubium": "Lamiaceae",
    "Matthiola": "Brassicaceae", "Maurandya": "Plantaginaceae",
    "Melothria": "Cucurbitaceae", "Meum": "Apiaceae", "Nemesia": "Scrophulariaceae",
    "Nicandra": "Solanaceae", "Nicotinia": "Solanaceae", "Oenanthe": "Apiaceae",
    "Oxalis": "Oxalidaceae", "Pennisetum": "Poaceae", "Penstemon": "Plantaginaceae",
    "Phlox": "Polemoniaceae", "Polemonium": "Polemoniaceae",
    "Porophyllum": "Asteraceae", "Primula": "Primulaceae", "Psyllium": "Plantaginaceae",
    "Quamoclit": "Convolvulaceae", "Ruta": "Rutaceae", "Salpiglossis": "Solanaceae",
    "Sanvitalia": "Asteraceae", "Sideritis": "Lamiaceae", "Silybum": "Asteraceae",
    "Talinum": "Talinaceae", "Thelesperma": "Asteraceae", "Vaccaria": "Caryophyllaceae",
    "Veronica": "Plantaginaceae", "Xeranthemum": "Asteraceae",
 }
 def normalize_species_name(botanical_name):
    """Split a botanical name into (genus, species epithet) for matching.

    Only the first two name parts are used, so anything after the epithet
    (e.g. "var. capitata") is dropped. Hybrids are returned with an "x "
    prefix on the epithet, whether the source writes a plain "x" or the
    multiplication sign, detached ("Mentha x piperita") or, for the
    multiplication sign only, attached to the epithet.

    Args:
        botanical_name: Botanical name as scraped, e.g.
            "Brassica oleracea var. capitata".

    Returns:
        (genus, species) when an epithet is present; (genus, None) when
        the second part is only a rank/placeholder ("var.", "subsp.",
        "ssp.", "spec.", "sp.") or a hybrid marker with nothing after it;
        (None, None) when the name has fewer than two parts.
    """
    parts = botanical_name.strip().split()
    if len(parts) < 2:
        return None, None
    genus, second = parts[0], parts[1]
    hybrid_sign = "\u00d7"  # multiplication sign used in hybrid names
    if second in ("x", hybrid_sign):
        if len(parts) >= 3:
            return genus, f"x {parts[2]}"
        # A hybrid marker with no epithet after it is not a species name.
        return genus, None
    if second.startswith(hybrid_sign):
        # Sign attached to the epithet. A leading ASCII "x" is left alone
        # because ordinary epithets may begin with that letter.
        return genus, f"x {second[1:]}"
    if second in ("var.", "subsp.", "ssp.", "spec.", "sp."):
        # Only genus level - can't match to species
        return genus, None
    return genus, second
 def find_species(botanical_name, species_cache):
    """Look up the cached species record matching a botanical name.

    Match order:
      1. the exact "genus epithet" key;
      2. the only cached name that extends "genus epithet" (for example
         one stored with a subspecies or variety suffix);
      3. for genus-only names (no usable epithet), the only cached
         species of that genus.

    A name whose epithet is known is never matched to a species with a
    different epithet, even when that species is the only one cached for
    the genus; returning None there lets the caller create the species.

    Args:
        botanical_name: Botanical name as scraped.
        species_cache: Dict of lower-cased scientific name -> species
            record.

    Returns:
        The matching species record, or None when there is no
        unambiguous match.
    """
    genus, sp = normalize_species_name(botanical_name)
    if not genus:
        return None
    if sp:
        search_key = f"{genus} {sp}".lower()
        if search_key in species_cache:
            return species_cache[search_key]
        # Same genus and epithet, stored with extra qualifiers.
        prefix = search_key + " "
    else:
        # Genus-only name: any species of the genus is a candidate.
        prefix = genus.lower() + " "
    matches = [rec for key, rec in species_cache.items() if key.startswith(prefix)]
    if len(matches) == 1:
        return matches[0]
    return None
 def find_or_create_species(botanical_name, families, species_cache):
    """Return the species record for a botanical name, creating it if needed.

    Tries the cache first. Otherwise the genus is mapped to a family via
    GENUS_TO_FAMILY, the family is created when missing, and the species
    is posted to HerbAPI. When a create call returns nothing, the full
    list is reloaded from the API to pick up a record that may already
    exist. Both ``families`` and ``species_cache`` are updated in place.

    Args:
        botanical_name: Botanical name as scraped.
        families: Dict of lower-cased family name -> family record.
        species_cache: Dict of lower-cased scientific name -> species
            record.

    Returns:
        The species record, or None when the name has no usable epithet,
        the genus has no family mapping, or the API calls fail.
    """
    known = find_species(botanical_name, species_cache)
    if known:
        return known
    genus, species_epithet = normalize_species_name(botanical_name)
    if not (genus and species_epithet):
        stats["species_no_epithet"] += 1
        return None
    sci_name = f"{genus} {species_epithet}"
    sci_key = sci_name.lower()
    # Check cache again with normalized name
    if sci_key in species_cache:
        return species_cache[sci_key]
    # A new species needs a family.
    family_name = GENUS_TO_FAMILY.get(genus)
    if not family_name:
        stats["species_no_family"] += 1
        print(f"    [SKIP] No family mapping for genus: {genus} ({botanical_name})")
        return None
    family_key = family_name.lower()
    family = families.get(family_key)
    if not family:
        print(f"    Creating family: {family_name}")
        new_family = api_request("POST", "/families", {"name_scientific": family_name})
        if new_family:
            families[family_key] = new_family
            family = new_family
            stats["families_created"] += 1
        else:
            # May already exist (duplicate from previous run) - reload
            family = next(
                (
                    fam for fam in paginated_get("/families")
                    if fam["name_scientific"].lower() == family_key
                ),
                None,
            )
            if not family:
                print(f"    [SKIP] Cannot create family: {family_name}")
                return None
            families[family_key] = family
    print(f"    Creating species: {sci_name} (family: {family_name})")
    new_species = api_request("POST", "/species", {
        "name_scientific": sci_name,
        "family_id": family["id"],
    })
    if new_species:
        species_cache[sci_key] = new_species
        stats["species_created"] += 1
        return new_species
    # May already exist - try to find it
    time.sleep(0.1)
    for candidate in paginated_get("/species"):
        if candidate["name_scientific"].lower() == sci_key:
            species_cache[sci_key] = candidate
            return candidate
    return None
 def extract_cultivar_name(product_name):
    """Strip a leading German crop-type word from a product name.

    When the trimmed name starts with one of the known crop prefixes
    followed by a space, the longest such prefix is removed and the rest
    is returned trimmed. Otherwise the trimmed name is returned as is.

    Args:
        product_name: Full product name, e.g. "Tomate Rote Murmel".

    Returns:
        The cultivar part of the name, e.g. "Rote Murmel".
    """
    name = product_name.strip()
    crop_prefixes = (
        # Tomatoes
        "Salattomate", "Stabtomate", "Buschtomate", "Cocktailtomate",
        "Cherrytomate", "Fleischtomate", "Wildtomate", "Balkontomate",
        "Flaschentomate", "Eitomate", "Datteltomate", "Tomate",
        # Lettuce
        "Winterkopfsalat", "Kopfsalat", "Bataviasalat", "Eissalat",
        "Blattsalat", "Schnittsalat", "Pflücksalat", "Römersalat",
        "Spargelsalat", "Romanasalat",
        # Beans
        "Buschbohne", "Stangenbohne", "Feuerbohne", "Puffbohne",
        "Prunkbohne",
        # Peas
        "Markerbse", "Zuckererbse", "Palerbse", "Schalerbse",
        "Knackerbse", "Kapuzinererbse",
        # Cucumbers
        "Einlegegurke", "Salatgurke", "Schälgurke", "Landgurke",
        "Freilandgurke",
        # Squash
        "Hokkaidokürbis", "Butternutkürbis", "Speisekürbis",
        "Riesenkürbis", "Zierkürbis", "Muskatkürbis", "Ölkürbis",
        # Melon
        "Wassermelone", "Zuckermelone",
        # Peppers
        "Gemüsepaprika", "Blockpaprika", "Spitzpaprika", "Tomatenpaprika",
        "Snackpaprika", "Peperoni", "Chili",
        # Brassicas
        "Kohlrabi", "Brokkoli", "Blumenkohl", "Grünkohl", "Rosenkohl",
        "Wirsing", "Rotkohl", "Weißkohl", "Spitzkohl", "Palmkohl",
        "Chinakohl", "Pak Choi", "Markstammkohl",
        # Root veg
        "Möhre", "Karotte", "Pastinake", "Rote Bete", "Rote Beete",
        "Herbstrübe", "Mairübe", "Stoppelrübe", "Schwarzer Rettich",
        "Steckrübe", "Knollensellerie", "Petersilienwurzel",
        "Rettich", "Radieschen",
        # Onions
        "Winterheckenzwiebel", "Lauchzwiebel", "Speisezwiebel",
        "Schalotte", "Wintersteckzwiebel", "Zwiebel",
        # Herbs
        "Rotes Basilikum", "Buschbasilikum", "Zitronen-Basilikum",
        "Thai-Basilikum", "Wildes Basilikum", "Zimtbasilikum",
        "Basilikum", "Schnittknoblauch",
        # Grains
        "Sommerweizen", "Winterweizen", "Sommerroggen", "Winterroggen",
        "Nackthafer", "Nacktgerste", "Dinkel", "Emmer", "Einkorn",
        # Misc
        "Zuckermais", "Popcornmais", "Zucchini",
    )
    matching = [word for word in crop_prefixes if name.startswith(word + " ")]
    if not matching:
        return name
    # Longest match wins, so a compound crop word beats a shorter one.
    longest = max(matching, key=len)
    return name[len(longest):].strip()
 def get_existing_supplier_links(cultivar_id, supplier_id):
    """Return True if the cultivar is already linked to the supplier.

    Fetches the cultivar's supplier links from HerbAPI; a failed or empty
    response counts as "no link".
    """
    links = api_request("GET", f"/cultivars/{cultivar_id}/suppliers")
    if not links:
        return False
    return any(link["supplier_id"] == supplier_id for link in links)
 def main():
    """Run the whole Dreschflegel import and print the collected counters.

    Steps: (1) find or create the supplier, (2) load existing HerbAPI
    records, (3) collect candidate URLs from the sitemap, (4) scrape the
    product pages, (5) create missing species and cultivars and link each
    cultivar to the supplier. Exits with status 1 when the supplier or
    the sitemap is unavailable.
    """
    rule = "=" * 60
    print(rule)
    print("Dreschflegel Seed Catalog Scraper for HerbAPI (v2)")
    print(rule)
    # Step 1: Supplier
    print("\n[1] Setting up supplier...")
    supplier = ensure_supplier()
    if not supplier:
        print("FATAL: Could not create/find supplier")
        sys.exit(1)
    supplier_id = supplier["id"]
    # Step 2: Load API data
    print("\n[2] Loading existing HerbAPI data...")
    families, species_cache, cultivar_cache = load_api_data()
    # Step 3: Get product URLs
    print("\n[3] Fetching sitemap...")
    all_urls = get_sitemap_urls()
    if not all_urls:
        print("FATAL: Could not fetch sitemap")
        sys.exit(1)
    candidate_urls = classify_urls(all_urls)
    print(f"  {len(all_urls)} total URLs, {len(candidate_urls)} product candidates")
    # Step 4: Scrape
    print("\n[4] Scraping product pages...")
    products = scrape_all_products(candidate_urls)
    # Step 5: Import
    total = len(products)
    print(f"\n[5] Importing {total} products into HerbAPI...")
    # Leading quantity and unit of the "Inhalt reicht für" text.
    pack_pattern = re.compile(r"ca\.?\s*(\d+)\s*(Pfl|Korn|Samen|g|kg|ml)")
    unit_names = {"Pfl": "Pflanzen", "Korn": "Korn", "Samen": "Korn"}
    for position, product in enumerate(products, start=1):
        if position % 50 == 0:
            print(f"  Processing {position}/{total}...")
        botanical = product.get("botanical_name", "")
        if not botanical:
            stats["no_botanical"] += 1
            continue
        # Find or create species
        species_rec = find_or_create_species(botanical, families, species_cache)
        if not species_rec:
            stats["species_not_matched"] += 1
            continue
        species_id = species_rec["id"]
        cultivar_name = extract_cultivar_name(product["name"])
        wanted = cultivar_name.lower().strip()
        cv_key = (species_id, wanted)
        if cv_key in cultivar_cache:
            cv = cultivar_cache[cv_key]
            stats["cultivars_existing"] += 1
        else:
            cv_data = {
                "species_id": species_id,
                "name": cultivar_name,
                "is_organic": True,
            }
            description = product.get("description")
            if description:
                cv_data["description"] = description
            cv = api_request("POST", "/cultivars", cv_data)
            if cv:
                cultivar_cache[cv_key] = cv
                stats["cultivars_created"] += 1
            else:
                # Might already exist from previous run - try to find it
                cv = next(
                    (
                        c for c in paginated_get(f"/cultivars?species_id={species_id}")
                        if c["name"].lower().strip() == wanted
                    ),
                    None,
                )
                if cv is None:
                    stats["cultivar_create_errors"] += 1
                    continue
                cultivar_cache[cv_key] = cv
                stats["cultivars_existing"] += 1
        # Link to supplier (check first for idempotency)
        cultivar_id = cv["id"]
        if get_existing_supplier_links(cultivar_id, supplier_id):
            stats["supplier_links_existing"] += 1
            continue
        link_data = {
            "supplier_id": supplier_id,
            "article_number": product.get("article_number", ""),
            "product_url": product.get("url", ""),
            "price_eur": product.get("price"),
        }
        pack_info = product.get("pack_info", "")
        pack_match = pack_pattern.search(pack_info) if pack_info else None
        if pack_match:
            amount, unit = pack_match.groups()
            link_data["pack_size"] = float(amount)
            # Units without a mapping are passed through unchanged.
            link_data["pack_unit"] = unit_names.get(unit, unit)
        if api_request("POST", f"/cultivars/{cultivar_id}/suppliers", link_data):
            stats["supplier_links_created"] += 1
        else:
            stats["supplier_link_errors"] += 1
    # Summary
    print("\n" + rule)
    print("RESULTS")
    print(rule)
    for key in sorted(stats):
        print(f"  {key}: {stats[key]}")
    print(f"\n  Total species in DB: {len(species_cache)}")
    print(f"  Total cultivars tracked: {len(cultivar_cache)}")
 # Script entry point: run the import only when executed directly.
 if __name__ == "__main__":
    main()
@@ -0,0 +1,380 @@
 #!/usr/bin/env python3
 """Scrape Magic Garden Seeds product pages and update herbapi database."""
 import subprocess
 import re
 import time
 import os
 import sys
 # Base psql command used by run_sql(): -t prints tuples only, -A disables
 # column alignment and -F| sets "|" as the field separator.
 DB_CMD = [
    'psql', '-h', '10.31.3.90', '-U', 'herbapi', 'herbapi',
    '-t', '-A', '-F|'
 ]
 # Environment for the psql subprocess: the current environment plus PGPASSWORD.
 # SECURITY NOTE(review): the database password is committed in plain text here;
 # consider loading it from the environment or ~/.pgpass and rotating it.
 DB_ENV = {**os.environ, 'PGPASSWORD': '_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj'}
 # Lower-case English month names mapped to month numbers 1-12 (see parse_months).
 MONTH_MAP = {
    'january': 1, 'february': 2, 'march': 3, 'april': 4,
    'may': 5, 'june': 6, 'july': 7, 'august': 8,
    'september': 9, 'october': 10, 'november': 11, 'december': 12,
 }
 def run_sql(sql):
    """Run one SQL command through psql and return its stripped stdout.

    Args:
        sql: SQL text passed to psql via ``-c``.

    Returns:
        psql's standard output with surrounding whitespace removed. With the
        flags in DB_CMD, rows are newline-separated and columns "|"-separated.

    Raises:
        RuntimeError: if psql exits with a non-zero status. Without this check
            a failed UPDATE would be reported as success and a failed SELECT
            would look like "no existing values".
    """
    result = subprocess.run(
        DB_CMD + ['-c', sql],
        capture_output=True, text=True, env=DB_ENV
    )
    if result.returncode != 0:
        raise RuntimeError(
            f"psql exited with status {result.returncode}: {result.stderr.strip()}"
        )
    return result.stdout.strip()
 def fetch_page(url):
    """Download *url* with curl and return whatever curl wrote to stdout.

    curl runs silently (-s), follows redirects (-L) and is limited to
    15 seconds in total (--max-time 15).
    """
    curl_cmd = ['curl', '-sL', '--max-time', '15', url]
    completed = subprocess.run(curl_cmd, capture_output=True, text=True)
    return completed.stdout
 def parse_months(text):
    """Return the sorted month numbers of all month names found in *text*.

    Matching is case-insensitive and uses the English names in MONTH_MAP.
    Returns None when *text* is empty or names no month.
    """
    if not text:
        return None
    remaining = text.lower().strip()
    found = set()
    # Longest names first; each matched name is removed from the working text.
    for month_name in sorted(MONTH_MAP, key=len, reverse=True):
        if month_name in remaining:
            found.add(MONTH_MAP[month_name])
            remaining = remaining.replace(month_name, '')
    return sorted(found) or None
 def parse_depth(text):
    """Parse a sowing depth in centimetres from free text.

    Accepts a range ("1 - 2 cm", "1–2 cm"), which yields the average rounded
    to one decimal, or a single value ("0,5 cm"). Both "." and "," are
    accepted as decimal separators.

    Returns:
        The depth as a float, or None when *text* is empty or has no cm value.
    """
    if not text:
        return None
    number = r'(\d+(?:[.,]\d+)?)'
    # Ranges may be written with a hyphen or an en dash; without the en dash
    # "1–2 cm" would fall through to the single-value branch and yield 2.0.
    match = re.search(rf'{number}\s*[-–]\s*{number}\s*cm', text)
    if match:
        low, high = (float(raw.replace(',', '.')) for raw in match.groups())
        return round((low + high) / 2, 1)
    match = re.search(rf'{number}\s*cm', text)
    if match:
        return float(match.group(1).replace(',', '.'))
    return None
 def parse_spacing(text):
    """Parse a planting distance. Returns (row_spacing, plant_spacing).

    Recognised forms, checked in this order (all require a trailing "cm"):
      * "X x Y cm" / "X × Y cm" -> (Y, X)
      * "X - Y cm" / "X–Y cm"   -> (None, average rounded to one decimal)
      * "X cm"                  -> (None, X)

    Both "." and "," are accepted as decimal separators, matching parse_depth;
    previously "12,5 cm" was read as 5.0. Returns (None, None) when *text* is
    empty or contains no cm value.
    """
    if not text:
        return None, None
    text = text.lower().strip()
    number = r'(\d+(?:[.,]\d+)?)'

    def to_float(raw):
        return float(raw.replace(',', '.'))

    # "X x Y cm"
    match = re.search(rf'{number}\s*(?:x|×)\s*{number}\s*cm', text)
    if match:
        return to_float(match.group(2)), to_float(match.group(1))
    # "X - Y cm" range -> average as plant spacing
    match = re.search(rf'{number}\s*[-–]\s*{number}\s*cm', text)
    if match:
        return None, round((to_float(match.group(1)) + to_float(match.group(2))) / 2, 1)
    # Single value
    match = re.search(rf'{number}\s*cm', text)
    if match:
        return None, to_float(match.group(1))
    return None, None
 def parse_germination_days(text):
    """Convert a germination-time phrase into a whole number of days.

    Week expressions are tried before day expressions, and ranges before
    single values. A range yields its rounded midpoint; weeks are multiplied
    by 7. Returns None when *text* is empty or has no recognised duration.
    """
    if not text:
        return None
    lowered = text.lower()
    # (range pattern, single-value pattern, days per unit), in priority order.
    unit_rules = (
        (r'(\d+)\s*-\s*(\d+)\s*weeks?', r'(\d+)\s*weeks?', 7),
        (r'(\d+)\s*-\s*(\d+)\s*days?', r'(\d+)\s*days?', 1),
    )
    for range_pattern, single_pattern, days_per_unit in unit_rules:
        span = re.search(range_pattern, lowered)
        if span:
            first, last = int(span.group(1)), int(span.group(2))
            return int(round((first + last) / 2 * days_per_unit))
        single = re.search(single_pattern, lowered)
        if single:
            return int(single.group(1)) * days_per_unit
    return None
 def parse_germ_temp(text):
    """Parse a germination temperature from text containing a degree sign.

    A hyphenated range ("18-22 °C") yields the average rounded to one
    decimal; a single value ("20°C") is returned as a float. Returns None
    when *text* is empty or has no number followed by "°".
    """
    if not text:
        return None
    span = re.search(r'(\d+)\s*-\s*(\d+)\s*°', text)
    if span:
        low, high = map(float, span.groups())
        return round((low + high) / 2, 1)
    single = re.search(r'(\d+)\s*°', text)
    return float(single.group(1)) if single else None
 def parse_lifecycle(text):
    """Map a lifecycle description to a "perennial" flag.

    Returns True if the text mentions "perennial", False if it mentions
    "annual" or "biennial", and None when the text is empty or none match.
    """
    if not text:
        return None
    lowered = text.lower().strip()
    if 'perennial' in lowered:
        return True
    short_lived = any(word in lowered for word in ('annual', 'biennial'))
    return False if short_lived else None
 def parse_light(text):
    """Normalise a sunlight description to one of a few canonical labels.

    Possible results are 'full sun to partial shade', 'full sun',
    'partial shade' and 'shade'. Text matching none of the keywords is
    returned lower-cased and stripped; empty input yields None.
    """
    if not text:
        return None
    normalized = text.lower().strip()
    if 'full sun' in normalized:
        return 'full sun to partial shade' if 'partial' in normalized else 'full sun'
    if any(word in normalized for word in ('partial', 'semi', 'half')):
        return 'partial shade'
    # Order matters: a bare "shade" wins over a bare "sun".
    for keyword, label in (('shade', 'shade'), ('sun', 'full sun')):
        if keyword in normalized:
            return label
    return normalized
 def extract_data(html):
    """Extract cultivar attributes from a product page's HTML.

    Consecutive ``<td>`` cells are read as label/value pairs ("specs"), and a
    description is taken from the ``itemprop="description"`` block or, failing
    that, the ``<meta name="description">`` tag. HTML entities are decoded in
    both so that values such as "&amp;", "&nbsp;" or "&deg;" neither end up in
    stored text nor defeat the spec parsers.

    Args:
        html: Full page markup as a string.

    Returns:
        A dict containing only the fields that could be parsed. Possible keys:
        description, plant_spacing_cm, row_spacing_cm, planting_depth_cm,
        harvesting_months, direct_sowing_months, indoor_sowing_months,
        perennial, light_requirement, days_to_germination, germination_temp_c.
    """
    # Imported locally because the ``html`` parameter shadows the module name.
    from html import unescape

    def plain_text(fragment):
        """Drop tags, decode entities and collapse runs of whitespace."""
        without_tags = re.sub(r'<[^>]+>', ' ', fragment)
        return re.sub(r'\s+', ' ', unescape(without_tags)).strip()

    data = {}
    # Extract table cell pairs
    cells = re.findall(r'<td[^>]*>(.*?)</td>', html, re.DOTALL)
    clean_cells = [plain_text(c) for c in cells]
    specs = {}
    for i in range(0, len(clean_cells) - 1, 2):
        key = clean_cells[i].rstrip(':').strip()
        val = clean_cells[i + 1].strip()
        # Skip pairs whose "label" is only digits/price punctuation.
        if key and val and not re.match(r'^[\d,.\s€*]+$', key):
            specs[key.lower()] = val
    # Extract description from itemprop="description"
    desc_match = re.search(r'itemprop="description">(.*?)</div>\s*</div>\s*</div>', html, re.DOTALL)
    if desc_match:
        content = desc_match.group(1)
        content = re.sub(r'<style[^>]*>.*?</style>', '', content, flags=re.DOTALL)
        content = re.sub(r'<script[^>]*>.*?</script>', '', content, flags=re.DOTALL)
        content = plain_text(content)
        # Cut trailing boilerplate that follows the actual description.
        for marker in ['Other names', 'Additional contact mail', 'Question about']:
            idx = content.find(marker)
            if idx > 0:
                content = content[:idx].strip()
        if len(content) > 20:
            data['description'] = content
    if 'description' not in data:
        meta_match = re.search(r'<meta[^>]*name="description"[^>]*content="([^"]*)"', html)
        if meta_match and len(meta_match.group(1)) > 20:
            data['description'] = unescape(meta_match.group(1))
    # Parse specs
    if 'planting distance' in specs:
        row_sp, plant_sp = parse_spacing(specs['planting distance'])
        if plant_sp:
            data['plant_spacing_cm'] = plant_sp
        if row_sp:
            data['row_spacing_cm'] = row_sp
    if 'row spacing' in specs:
        # An explicit row spacing overrides one derived from planting distance.
        match = re.search(r'(\d+(?:\.\d+)?)\s*cm', specs['row spacing'])
        if match:
            data['row_spacing_cm'] = float(match.group(1))
    if 'sowing depth' in specs:
        depth = parse_depth(specs['sowing depth'])
        if depth is not None:
            data['planting_depth_cm'] = depth
    # Harvesting months - prefer explicit harvest time over flowering
    if 'harvest time' in specs:
        months = parse_months(specs['harvest time'])
        if months:
            data['harvesting_months'] = months
    elif 'harvesting months' in specs:
        months = parse_months(specs['harvesting months'])
        if months:
            data['harvesting_months'] = months
    elif 'flowering months' in specs:
        months = parse_months(specs['flowering months'])
        if months:
            data['harvesting_months'] = months
    if 'when to sow outdoors' in specs:
        months = parse_months(specs['when to sow outdoors'])
        if months:
            data['direct_sowing_months'] = months
    # First indoor-sowing label that yields months wins.
    for indoor_key in ['when to sow indoors', 'pre-cultivation indoors']:
        if indoor_key in specs:
            months = parse_months(specs[indoor_key])
            if months:
                data['indoor_sowing_months'] = months
                break
    if 'lifecycle' in specs:
        perennial = parse_lifecycle(specs['lifecycle'])
        if perennial is not None:
            data['perennial'] = perennial
    if 'sunlight' in specs:
        light = parse_light(specs['sunlight'])
        if light:
            data['light_requirement'] = light
    if 'germination time' in specs:
        days = parse_germination_days(specs['germination time'])
        if days:
            data['days_to_germination'] = days
    if 'germination temperature' in specs:
        temp = parse_germ_temp(specs['germination temperature'])
        if temp:
            data['germination_temp_c'] = temp
    return data
 def get_current_values(cultivar_id):
    """Return the non-empty enrichable columns currently stored for a cultivar.

    The row is read through run_sql() and split on "|"; values are kept as
    the stripped strings psql printed. Columns that are empty are omitted,
    and an empty dict is returned when the query produces no output.
    """
    fields = ['description', 'row_spacing_cm', 'plant_spacing_cm', 'planting_depth_cm',
              'perennial', 'harvesting_months', 'direct_sowing_months', 'light_requirement',
              'days_to_germination', 'germination_temp_c', 'indoor_sowing_months']
    sql = f"""SELECT description, row_spacing_cm, plant_spacing_cm, planting_depth_cm,
                     perennial, harvesting_months, direct_sowing_months, light_requirement,
                     days_to_germination, germination_temp_c, indoor_sowing_months
              FROM cultivars WHERE id = '{cultivar_id}'"""
    row = run_sql(sql)
    if not row:
        return {}
    # zip() stops at the shorter side, so missing trailing columns are ignored.
    columns = (part.strip() for part in row.split('|'))
    return {name: value for name, value in zip(fields, columns) if value}
 def build_update_sql(cultivar_id, data, current):
    """Build an UPDATE statement for the fields that are still empty.

    Args:
        cultivar_id: Value interpolated into the ``WHERE id = '...'`` clause.
        data: Mapping of column name to the newly scraped value.
        current: Mapping of column name to the value already stored; any
            truthy entry causes that column to be left untouched.

    Returns:
        ``(sql, fields)`` with the statement and the list of columns considered
        for update, or ``(None, [])`` when no assignment was generated.
    """
    def sql_literal(value):
        """Render a Python value as an SQL literal, or None if unsupported."""
        if isinstance(value, str):
            return "'" + value.replace("'", "''") + "'"
        # bool must be tested before int, since bool is an int subclass.
        if isinstance(value, bool):
            return 'true' if value else 'false'
        if isinstance(value, list):
            return "'{" + ','.join(map(str, value)) + "}'"
        if isinstance(value, (int, float)):
            return str(value)
        return None

    assignments = []
    updated_fields = []
    for field, value in data.items():
        if current.get(field):
            continue
        literal = sql_literal(value)
        if literal is not None:
            assignments.append(f"{field} = {literal}")
        updated_fields.append(field)
    if not assignments:
        return None, []
    return f"UPDATE cultivars SET {', '.join(assignments)} WHERE id = '{cultivar_id}';", updated_fields
 def main():
    """Fill in missing cultivar fields from Magic Garden Seeds product pages.

    Selects cultivars linked to the 'Magic Garden Seeds' supplier that have a
    product URL and lack row spacing or a description, scrapes each page and
    updates only the columns that are still empty. Progress is printed per
    cultivar, followed by a summary.
    """
    sql = """
    SELECT c.id, c.name, cs.product_url
    FROM cultivars c
    JOIN cultivar_suppliers cs ON c.id = cs.cultivar_id
    JOIN suppliers s ON cs.supplier_id = s.id
    WHERE s.name = 'Magic Garden Seeds'
    AND cs.product_url IS NOT NULL AND cs.product_url <> ''
    AND (c.row_spacing_cm IS NULL OR c.description IS NULL OR c.description = '')
    ORDER BY c.name;
    """
    rows = run_sql(sql)
    if not rows:
        print("No cultivars to process")
        return
    # One "id|name|url" line per cultivar; shorter lines are ignored.
    split_lines = (line.split('|') for line in rows.strip().split('\n'))
    cultivars = [
        {'id': cols[0], 'name': cols[1], 'url': cols[2]}
        for cols in split_lines
        if len(cols) >= 3
    ]
    total = len(cultivars)
    print(f"Processing {total} MGS cultivars...")
    sys.stdout.flush()
    updated = skipped = failed = 0
    fields_updated = {}
    for position, cv in enumerate(cultivars, start=1):
        print(f"[{position}/{total}] {cv['name']}...", end=' ', flush=True)
        try:
            html = fetch_page(cv['url'])
            if not html or len(html) < 1000:
                print("FAILED (empty page)")
                failed += 1
            else:
                data = extract_data(html)
                if not data:
                    print("NO DATA")
                    skipped += 1
                else:
                    current = get_current_values(cv['id'])
                    sql_stmt, upd_fields = build_update_sql(cv['id'], data, current)
                    if sql_stmt:
                        run_sql(sql_stmt)
                        for name in upd_fields:
                            fields_updated[name] = fields_updated.get(name, 0) + 1
                        print(f"OK ({len(upd_fields)} fields: {', '.join(upd_fields)})")
                        updated += 1
                    else:
                        print("SKIP (all fields populated)")
                        skipped += 1
        except Exception as e:
            print(f"ERROR: {e}")
            failed += 1
        # Exactly one pause per cultivar, whatever the outcome.
        time.sleep(0.5)
    print("\n=== MGS Summary ===")
    print(f"Total processed: {total}")
    print(f"Updated: {updated}")
    print(f"Skipped (all fields already populated): {skipped}")
    print(f"Failed: {failed}")
    print("\nFields updated:")
    for field, count in sorted(fields_updated.items(), key=lambda item: -item[1]):
        print(f"  {field}: {count}")
 if __name__ == '__main__':
    main()
@@ -0,0 +1,330 @@
 #!/usr/bin/env python3
 """
 Scrape NaturaDB wildlife interaction data and enrich HerbAPI species.
 """
 import json
 import re
 import time
 import urllib.request
 import urllib.error
 import sys
 # Base URL of the HerbAPI REST API; request paths are appended to it.
 HERBAPI_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
 # Bearer token sent in the Authorization header of every HerbAPI request.
 # SECURITY NOTE(review): this credential is committed in plain text; consider
 # loading it from the environment and rotating it.
 HERBAPI_TOKEN = "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
 # Plant pages are fetched from "<NATURADB_BASE>/<slug>/".
 NATURADB_BASE = "https://www.naturadb.de/pflanzen"
 # User-Agent header sent with NaturaDB requests.
 USER_AGENT = "Mozilla/5.0 (compatible; HerbAPI-Enrichment/1.0; +https://sub-net.at)"
 # Pause in seconds after each NaturaDB fetch.
 DELAY = 0.5
 def api_get(path, timeout=30):
    """GET ``path`` from HerbAPI and return the decoded JSON body.

    Args:
        path: Request path appended to HERBAPI_BASE (including any query).
        timeout: Socket timeout in seconds. Without one, urlopen can block
            indefinitely on a stalled connection.

    Raises:
        urllib.error.URLError: on HTTP or connection errors (includes
            HTTPError); a timeout may also surface as TimeoutError.
    """
    url = f"{HERBAPI_BASE}{path}"
    req = urllib.request.Request(url)
    req.add_header("Authorization", f"Bearer {HERBAPI_TOKEN}")
    req.add_header("Accept", "application/json")
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read().decode())
 def api_put(path, data, timeout=30):
    """PUT ``data`` as JSON to HerbAPI and return the decoded JSON response.

    Args:
        path: Request path appended to HERBAPI_BASE.
        data: JSON-serialisable request body.
        timeout: Socket timeout in seconds. Without one, urlopen can block
            indefinitely on a stalled connection.

    Raises:
        urllib.error.URLError: on HTTP or connection errors (includes
            HTTPError); a timeout may also surface as TimeoutError.
    """
    url = f"{HERBAPI_BASE}{path}"
    body = json.dumps(data).encode()
    req = urllib.request.Request(url, data=body, method="PUT")
    req.add_header("Authorization", f"Bearer {HERBAPI_TOKEN}")
    req.add_header("Content-Type", "application/json")
    req.add_header("Accept", "application/json")
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read().decode())
 def fetch_naturadb(latin_name):
    """Download the NaturaDB page for a Latin plant name.

    The page slug is the lower-cased name with spaces replaced by hyphens.
    Returns the page as text (undecodable bytes are replaced), or None on any
    error. A 404 is silent; other failures are printed.
    """
    slug = latin_name.lower().replace(" ", "-")
    url = f"{NATURADB_BASE}/{slug}/"
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        with urllib.request.urlopen(request, timeout=15) as response:
            return response.read().decode("utf-8", errors="replace")
    except urllib.error.HTTPError as err:
        if err.code != 404:
            print(f"  HTTP {err.code} for {url}")
    except Exception as err:
        print(f"  Error fetching {url}: {err}")
    return None
 def extract_td_value(html, label):
    """Return the text of the table cell that follows a ``<td>label</td>`` cell.

    The label cell may end with an optional colon. Tags inside the value cell
    are removed and the result is stripped. Returns None if no such pair exists.
    """
    cell = re.search(
        rf"<td>{re.escape(label)}:?</td>\s*<td[^>]*>(.*?)</td>", html, re.DOTALL
    )
    if cell is None:
        return None
    return re.sub(r"<[^>]+>", "", cell.group(1)).strip()
 def extract_native_status(html):
    """Return the native-status labels found in the large coloured chips.

    Only chips whose text is one of the known status labels are kept, in
    document order.
    """
    known_statuses = {
        "heimische Wildform",
        "Archäophyt",
        "Neophyt",
        "nicht heimisch (Neophyt)",
    }
    chip_texts = re.findall(
        r'chip--large\s+chip--no-border\s+clr-white\s+bg-\w+"[^>]*>([^<]+)', html
    )
    stripped = (text.strip() for text in chip_texts)
    return [label for label in stripped if label in known_statuses]
 def extract_badge_tags(html):
    """Return the text of the large plain-text chips, in document order.

    Empty chips and the "winterhart" chip are left out.
    """
    chip_texts = re.findall(r'chip--large\s+clr-text"[^>]*>([^<]+)', html)
    stripped = (text.strip() for text in chip_texts)
    return [tag for tag in stripped if tag and tag != "winterhart"]
 def parse_count(text):
    """Return the integer that *text* starts with, e.g. 82 for '82 (Nektar ...)'.

    Leading whitespace is ignored. Returns None for empty input or when the
    text does not begin with digits.
    """
    if not text:
        return None
    leading = re.match(r"(\d+)", text.strip())
    if leading is None:
        return None
    return int(leading.group(1))
 def parse_specialist_count(text):
    """Return N from a 'davon N spezialisiert' phrase, e.g. 5 for '39 (davon 5 spezialisiert)'.

    Returns None for empty input or when the phrase is absent.
    """
    if not text:
        return None
    phrase = re.search(r"davon\s+(\d+)\s+spezialisiert", text)
    if phrase is None:
        return None
    return int(phrase.group(1))
 def parse_nectar_pollen(text):
    """Return the score N from text starting with 'N/4', e.g. 2 for '2/4 - mäßig'.

    Returns None for empty input or when the text does not start that way.
    """
    if not text:
        return None
    score = re.match(r"(\d+)/4", text.strip())
    if score is None:
        return None
    return int(score.group(1))
 def build_wildlife_value(data):
    """Render scraped wildlife data as a single English summary string.

    Each available figure becomes one sentence, in a fixed order: nectar and
    pollen scores, wild bees, butterflies/moths (with caterpillar hosts), then
    hoverflies, beetles, birds, mammals, native status and selected badges.
    Returns None when nothing could be rendered.
    """
    sentences = []

    # Nectar and pollen share one sentence.
    scores = [
        f"{label}: {data[key]}/4"
        for label, key in (("Nectar", "nectar"), ("Pollen", "pollen"))
        if data.get(key) is not None
    ]
    if scores:
        sentences.append(", ".join(scores) + ".")

    # Wild bees, optionally with the number of specialists.
    bees = data.get("wildbienen_count")
    if bees is not None:
        sentence = f"Supports {bees} wild bee species"
        bee_specialists = data.get("wildbienen_specialists")
        if bee_specialists is not None:
            sentence += f" ({bee_specialists} specialists)"
        sentences.append(sentence + ".")

    # Butterflies / moths; caterpillar figures are only reported alongside them.
    butterflies = data.get("schmetterlinge_count")
    if butterflies is not None:
        sentence = f"{butterflies} butterfly/moth species"
        caterpillars = data.get("raupen_count")
        if caterpillars is not None:
            specialised = data.get("raupen_specialists")
            suffix = f" ({specialised} specialized)" if specialised is not None else ""
            sentence += f", {caterpillars} as caterpillar host{suffix}"
        sentences.append(sentence + ".")

    # Plain "<count> <group> species." sentences.
    simple_counts = (
        ("schwebfliegen_count", "hoverfly"),
        ("kaefer_count", "beetle"),
        ("vogelarten_count", "bird"),
        ("saeugetier_count", "mammal"),
    )
    for key, group in simple_counts:
        count = data.get(key)
        if count is not None:
            sentences.append(f"{count} {group} species.")

    # Native status
    if data.get("native_status"):
        sentences.append(" ".join(data["native_status"]) + ".")

    # Notable badges
    keywords = (
        "insektenpflanze",
        "raupenfutter",
        "vogelschutz",
        "vogelnähr",
        "bienenweide",
    )
    notable = []
    for badge in data.get("badges", []):
        lowered = badge.lower()
        if any(keyword in lowered for keyword in keywords):
            notable.append(badge)
    if notable:
        sentences.append("Tags: " + ", ".join(notable) + ".")

    if not sentences:
        return None
    return " ".join(sentences)
 def scrape_species(html):
    """Parse NaturaDB HTML and return structured wildlife data dict.

    Table values are looked up by their German row labels and converted to
    integers (or None when missing); native status and badges are lists.
    """
    def cell(label):
        """Text of the table row with this label, or None."""
        return extract_td_value(html, label)

    # Rows that provide both a total and a specialist count.
    bees_raw = cell("Wildbienen")
    caterpillars_raw = cell("Raupen")
    return {
        "nectar": parse_nectar_pollen(cell("Nektarwert")),
        "pollen": parse_nectar_pollen(cell("Pollenwert")),
        "wildbienen_count": parse_count(bees_raw),
        "wildbienen_specialists": parse_specialist_count(bees_raw),
        "schmetterlinge_count": parse_count(cell("Schmetterlinge")),
        "raupen_count": parse_count(caterpillars_raw),
        "raupen_specialists": parse_specialist_count(caterpillars_raw),
        "schwebfliegen_count": parse_count(cell("Schwebfliegen")),
        "kaefer_count": parse_count(cell("Käfer")),
        "vogelarten_count": parse_count(cell("fressende Vogelarten")),
        "saeugetier_count": parse_count(cell("fressende Säugetierarten")),
        "native_status": extract_native_status(html),
        "badges": extract_badge_tags(html),
    }
 def has_any_data(data):
    """Tell whether the scraped dict holds at least one usable value.

    The list-valued fields count only when non-empty; every other field
    counts as soon as it is not None (so 0 is a value).
    """
    list_fields = ("native_status", "badges")
    return any(
        bool(value) if key in list_fields else value is not None
        for key, value in data.items()
    )
 def main():
    """Enrich HerbAPI species that have no ``wildlife_value`` using NaturaDB.

    Loads every page of the species list, skips species that already have a
    value, scrapes the NaturaDB page for the rest and writes the generated
    summary back with a GET-merge-PUT. Progress and a final tally are printed.
    """
    print("Fetching species list from HerbAPI...")
    # Walk all result pages: a single per_page=200 request silently drops any
    # species beyond the first 200. Responses without a "pagination" object
    # are treated as a single page.
    species_list = []
    page = 1
    while True:
        listing = api_get(f"/species?per_page=200&page={page}")
        species_list.extend(listing["data"])
        pagination = listing.get("pagination") or {}
        if page >= (pagination.get("total_pages") or 1):
            break
        page += 1
    print(f"Found {len(species_list)} species.\n")
    enriched = 0
    skipped_has_data = 0
    skipped_not_found = 0
    skipped_no_data = 0
    errors = 0
    for i, sp in enumerate(species_list):
        slug = sp["slug"]
        name = sp["name_scientific"]
        existing_wv = sp.get("wildlife_value")
        # Only enrich if wildlife_value is empty/null
        if existing_wv:
            print(f"[{i+1:3d}/{len(species_list)}] {slug:40s} SKIP (already has data)")
            skipped_has_data += 1
            continue
        print(f"[{i+1:3d}/{len(species_list)}] {slug:40s} ", end="", flush=True)
        # Fetch NaturaDB page
        html = fetch_naturadb(name)
        time.sleep(DELAY)
        if html is None:
            print("NOT FOUND on NaturaDB")
            skipped_not_found += 1
            continue
        # Parse wildlife data
        data = scrape_species(html)
        if not has_any_data(data):
            print("no wildlife data on page")
            skipped_no_data += 1
            continue
        # Build wildlife_value string
        wildlife_value = build_wildlife_value(data)
        if not wildlife_value:
            print("no wildlife data extracted")
            skipped_no_data += 1
            continue
        # GET full species, merge, PUT back
        try:
            full = api_get(f"/species/{slug}")
            full["wildlife_value"] = wildlife_value
            # Remove read-only / computed fields that the PUT endpoint might reject
            for key in ("created_at", "updated_at", "family"):
                full.pop(key, None)
            api_put(f"/species/{full['id']}", full)
            print(f"ENRICHED -> {wildlife_value[:80]}...")
            enriched += 1
        except Exception as e:
            print(f"API ERROR: {e}")
            errors += 1
    print("\n" + "=" * 70)
    print(f"DONE. Results:")
    print(f"  Enriched:           {enriched}")
    print(f"  Already had data:   {skipped_has_data}")
    print(f"  Not on NaturaDB:    {skipped_not_found}")
    print(f"  No wildlife data:   {skipped_no_data}")
    print(f"  Errors:             {errors}")
    print(f"  Total:              {len(species_list)}")
 if __name__ == "__main__":
    main()
@@ -0,0 +1,560 @@
 #!/usr/bin/env python3
 """
 Scrape cultivar data from Reinsaat (reinsaat.at) and push into HerbAPI.
 Strategy:
 1. Fetch category pages, recursively discover product pages via JSON-LD detection
 2. Extract structured data from JSON-LD Product schema + HTML text for growing data
 3. Match Latin names to existing species in the API
 4. Create cultivar records and link them to Reinsaat supplier
 """
 import json
 import re
 import ssl
 import time
 import urllib.request
 import urllib.error
 import urllib.parse
 from html.parser import HTMLParser
 from dataclasses import dataclass
 from typing import Optional
 # ── Config ──────────────────────────────────────────────────────────────────
 # Base URL of the HerbAPI REST API; request paths are appended to it.
 API_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
 # Bearer token sent in the Authorization header of every HerbAPI request.
 # SECURITY NOTE(review): this credential is committed in plain text; consider
 # loading it from the environment and rotating it.
 AUTH_TOKEN = "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
 # Presumably the HerbAPI id of the Reinsaat supplier record — confirm against the API.
 REINSAAT_SUPPLIER_ID = "019ced24-1702-72d1-9acc-90435441a5c4"
 DELAY = 0.5  # seconds between requests
 # User-Agent header sent when fetching pages (see fetch_url).
 USER_AGENT = "HerbAPI-Scraper/1.0 (florian.berthold@sub-net.at)"
 # ── Categories to scrape ────────────────────────────────────────────────────
 # (category_url, default_species_hint for leaf pages in this category)
 CATEGORIES = [
    ("https://www.reinsaat.at/shop/DE/tomaten_paradeiser/", "Solanum lycopersicum"),
    ("https://www.reinsaat.at/shop/DE/kuechen-_und_gewuerzkraeuter/", None),
    ("https://www.reinsaat.at/shop/DE/kuerbis/", None),
    ("https://www.reinsaat.at/shop/DE/zucchini/", "Cucurbita pepo"),
    ("https://www.reinsaat.at/shop/DE/bohnen/", None),
    ("https://www.reinsaat.at/shop/DE/karotten_moehren_1/", "Daucus carota"),
    ("https://www.reinsaat.at/shop/DE/rote_ruebe/", "Beta vulgaris"),
    ("https://www.reinsaat.at/shop/DE/blumen_und_heilkraeuter/", None),
 ]
 # ── Known Latin name genera we can match ────────────────────────────────────
 # "|"-separated regex alternation; only names starting with one of these
 # genera are recognised by LATIN_PATTERN.
 KNOWN_GENERA = (
    "Solanum|Cucurbita|Vicia|Phaseolus|Glycine|Daucus|Beta|Borago|Lavandula|"
    "Salvia|Melissa|Thymus|Calendula|Allium|Ocimum|Satureja|Origanum|Anethum|"
    "Foeniculum|Carum|Nigella|Levisticum|Rumex|Majorana|Hyssopus|Coriandrum|"
    "Petroselinum|Eruca|Tropaeolum|Lupinus|Helianthus|Tagetes|Zinnia|Cosmos|"
    "Papaver|Centaurea|Matricaria|Chrysanthemum|Antirrhinum|Lathyrus|Ipomoea|"
    "Phacelia|Trifolium|Symphytum|Urtica|Fragaria|Sambucus"
 )
 # Captures "<Genus> <epithet>" (epithet in lower case), optionally followed by
 # the author abbreviation "L"/"L." and by an infraspecific rank
 # ("ssp.", "var." or "subsp.") with its lower-case name. Matching is case-sensitive.
 LATIN_PATTERN = re.compile(
    rf'((?:{KNOWN_GENERA})\s+[a-z]+(?:\s+L\.?)?(?:\s+(?:ssp|var|subsp)\.\s+[a-z]+)?)'
 )
 # ── HTML helpers ────────────────────────────────────────────────────────────
 class TextExtractor(HTMLParser):
    """Collect the visible text of an HTML document.

    Text inside <script>, <style> and <noscript> elements is ignored. Every
    other non-blank text node is stripped and appended to ``parts`` in
    document order.
    """
    # Elements whose content is not visible text.
    _SKIPPED_TAGS = ("script", "style", "noscript")

    def __init__(self):
        super().__init__()
        self.parts = []
        # Number of currently open skipped elements.
        self._skip = 0

    def handle_starttag(self, tag, attrs):
        if tag in self._SKIPPED_TAGS:
            self._skip += 1

    def handle_endtag(self, tag):
        # Never let a stray closing tag push the counter below zero.
        if self._skip > 0 and tag in self._SKIPPED_TAGS:
            self._skip -= 1

    def handle_data(self, data):
        if self._skip:
            return
        text = data.strip()
        if text:
            self.parts.append(text)
 def extract_links(html: str, base_url: str) -> list[str]:
    """Return the distinct link targets of all ``<a href="...">`` tags.

    Relative URLs are resolved against *base_url*. Empty hrefs, fragment-only
    links and ``javascript:`` links are skipped. The first-seen order of the
    resolved URLs is preserved.
    """
    hrefs = re.findall(r'<a\s[^>]*href="([^"]*)"', html, re.IGNORECASE)
    resolved = (
        urllib.parse.urljoin(base_url, href)
        for href in hrefs
        if href and not href.startswith(("#", "javascript:"))
    )
    # dict keys keep insertion order, which de-duplicates without reordering.
    return list(dict.fromkeys(resolved))
 def extract_jsonld_product(html: str) -> Optional[dict]:
    """Extract the JSON-LD Product object from HTML, if present.

    Every ``<script type="application/ld+json">`` block is parsed. Besides a
    top-level Product object, this also finds Products inside a top-level
    array or an ``@graph`` container, and accepts a list-valued ``@type``
    that contains "Product". Blocks with invalid JSON are skipped.

    Returns:
        The first Product object found (a dict), or None.
    """
    def iter_nodes(node):
        """Yield every dict reachable through lists and ``@graph`` members."""
        if isinstance(node, list):
            for item in node:
                yield from iter_nodes(item)
        elif isinstance(node, dict):
            yield node
            yield from iter_nodes(node.get("@graph"))

    def is_product(node):
        kind = node.get("@type")
        return kind == "Product" or (isinstance(kind, list) and "Product" in kind)

    for m in re.finditer(
        r'<script[^>]*type="application/ld\+json"[^>]*>(.*?)</script>',
        html, re.DOTALL | re.IGNORECASE
    ):
        try:
            data = json.loads(m.group(1))
        except (json.JSONDecodeError, ValueError):
            continue
        for node in iter_nodes(data):
            if is_product(node):
                return node
    return None
 # ── HTTP helpers ────────────────────────────────────────────────────────────
 _ssl_ctx = ssl.create_default_context()
 def fetch_url(url: str, retries: int = 2) -> str:
    """Fetch a URL with retries and return the decoded body.

    Transient failures (network errors, timeouts, 5xx, 408, 429) are retried
    up to *retries* times with a 2-second pause. Other 4xx responses such as
    404 are permanent, so they are raised immediately instead of being
    retried.

    Args:
        url: Absolute URL to download.
        retries: Number of additional attempts after the first one.

    Returns:
        The response body decoded with the charset from the response headers
        (UTF-8 when none is declared).

    Raises:
        urllib.error.URLError: (including HTTPError) or TimeoutError from the
            last failed attempt.
    """
    req = urllib.request.Request(url, headers={
        "User-Agent": USER_AGENT,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "de-AT,de;q=0.9,en;q=0.5",
    })
    for attempt in range(retries + 1):
        try:
            with urllib.request.urlopen(req, timeout=30, context=_ssl_ctx) as resp:
                charset = resp.headers.get_content_charset() or "utf-8"
                return resp.read().decode(charset)
        except (urllib.error.URLError, urllib.error.HTTPError, TimeoutError) as exc:
            permanent = (
                isinstance(exc, urllib.error.HTTPError)
                and 400 <= exc.code < 500
                and exc.code not in (408, 429)
            )
            if permanent or attempt >= retries:
                raise
            time.sleep(2)
    # Only reached when retries is negative and no attempt was made.
    return ""
 def api_get(path: str):
    """GET ``path`` from HerbAPI and return the parsed JSON response.

    The request carries the bearer token and uses a 15-second timeout.
    """
    headers = {"Authorization": f"Bearer {AUTH_TOKEN}", "Accept": "application/json"}
    request = urllib.request.Request(f"{API_BASE}{path}", headers=headers)
    with urllib.request.urlopen(request, timeout=15) as response:
        raw = response.read()
    return json.loads(raw)
 def api_post(path: str, data: dict):
    """POST ``data`` as JSON to HerbAPI and return the parsed JSON response.

    On an HTTP error the status code and the first 500 characters of the
    error body are printed, then the HTTPError is re-raised.
    """
    payload = json.dumps(data).encode("utf-8")
    headers = {
        "Authorization": f"Bearer {AUTH_TOKEN}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    }
    request = urllib.request.Request(
        f"{API_BASE}{path}", data=payload, headers=headers, method="POST"
    )
    try:
        with urllib.request.urlopen(request, timeout=15) as response:
            return json.loads(response.read())
    except urllib.error.HTTPError as err:
        detail = err.read().decode("utf-8", errors="replace")
        print(f"    API ERROR {err.code}: {detail[:500]}")
        raise
 # ── Species matching ────────────────────────────────────────────────────────
 def load_species() -> dict:
    """Load all species from the API, keyed by lower-cased scientific name.

    Pages of 100 are requested until the response's ``pagination.total_pages``
    is reached; a response without a ``pagination`` object ends the loop
    after the first page.
    """
    by_name = {}
    page = 1
    while True:
        payload = api_get(f"/species?per_page=100&page={page}")
        is_mapping = isinstance(payload, dict)
        # The list is either wrapped in a "data" key or is the payload itself.
        records = payload.get("data", payload) if is_mapping else payload
        for record in records:
            by_name[record["name_scientific"].lower().strip()] = record
        if not (is_mapping and "pagination" in payload):
            break
        if page >= payload["pagination"].get("total_pages", 1):
            break
        page += 1
    return by_name
 def match_species(latin_name: str, species_map: dict) -> Optional[dict]:
    """Match a Latin name to an existing species. Returns species dict or None.

    The name is cleaned of a trailing author abbreviation and of
    ``ssp./subsp./var.`` parts, then looked up in three steps: the full
    cleaned name, its first two words, and finally the first species in
    *species_map* that shares the genus.
    """
    if not latin_name:
        return None
    cleaned = latin_name.strip()
    cleanup_patterns = (
        r'\s+L\.\s*$',                      # trailing "L."
        r'\s+[A-Z][a-z]*\.\s*$',            # other trailing author abbreviation
        r'\s+(?:ssp|subsp|var)\.\s+\S+',    # infraspecific rank and its name
    )
    for pattern in cleanup_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    key = cleaned.lower().strip()
    words = key.split()

    candidates = [key]
    if len(words) >= 2:
        candidates.append(" ".join(words[:2]))
    for candidate in candidates:
        if candidate in species_map:
            return species_map[candidate]

    # Genus-only fallback (less reliable, but useful for Borago, etc.)
    if not words:
        return None
    genus_prefix = words[0] + " "
    return next(
        (species for name, species in species_map.items() if name.startswith(genus_prefix)),
        None,
    )
 # ── Product data extraction ─────────────────────────────────────────────────
@dataclass
 class ProductData:
    """Data scraped from a single product page by ``parse_product``."""
    name: str = ""  # JSON-LD "name"
    latin_name: str = ""  # botanical name found on the page, else the category's default species
    description: str = ""  # JSON-LD "description"
    sku: str = ""  # JSON-LD "model"; sent to the API as the supplier article number
    url: str = ""  # URL the page was fetched from
    is_organic: bool = True  # parse_product never changes this default
    sowing_depth_cm: Optional[float] = None  # mean of the matched depth value(s), in cm
    row_spacing_cm: Optional[float] = None  # row part of a "ROW x PLANT cm" match (range midpoint)
    plant_spacing_cm: Optional[float] = None  # plant part of a "ROW x PLANT cm" match (range midpoint)
    germination_temp_c: Optional[float] = None  # mean of the matched temperature value(s), in °C
    perennial: bool = False  # True when a perennial keyword occurs in the page text
 def parse_product(html: str, url: str, default_species: Optional[str] = None) -> Optional[ProductData]:
    """Parse a product page into a ProductData record.

    A page counts as a product page only when ``extract_jsonld_product``
    finds a JSON-LD Product object in it; otherwise None is returned so the
    caller can treat the page as a category.

    Args:
        html: raw HTML of the page.
        url: URL the page was fetched from (stored on the result).
        default_species: Latin name to fall back on when none is found in
            the page text or in ``<i>``/``<em>`` tags.

    Returns:
        A populated ProductData, or None if the page is not a product page.
        Numeric fields stay None when no pattern matches.
    """
    jsonld = extract_jsonld_product(html)
    if not jsonld:
        return None  # Not a product page
    product = ProductData(url=url)
    # ── From JSON-LD ──
    # ``or ""`` guards against an explicit JSON null, which would otherwise
    # raise AttributeError on .strip() and abort the whole crawl.
    product.name = (jsonld.get("name") or "").strip()
    product.description = (jsonld.get("description") or "").strip()
    product.sku = (jsonld.get("model") or "").strip()
    # ── Extract full text for pattern matching ──
    extractor = TextExtractor()
    extractor.feed(html)
    full_text = " ".join(extractor.parts)
    # ── Latin name ──
    m = LATIN_PATTERN.search(full_text)
    if m:
        product.latin_name = m.group(1).strip()
    # Also check <i>/<em> tags in HTML
    if not product.latin_name:
        for italic in re.finditer(r'<(?:i|em)[^>]*>(.*?)</(?:i|em)>', html, re.IGNORECASE | re.DOTALL):
            clean = re.sub(r'<[^>]+>', '', italic.group(1)).strip()
            im = LATIN_PATTERN.search(clean)
            if im:
                product.latin_name = im.group(1).strip()
                break
    if not product.latin_name and default_species:
        product.latin_name = default_species
    # ── Sowing depth ──
    # Patterns are tried in order; a range is averaged to its midpoint.
    depth_pats = [
        r'(?:Saattiefe|Aussaattiefe|Ablagetiefe)[:\s]*(?:ca\.?\s*)?(\d+(?:[.,]\d+)?)\s*[-–]\s*(\d+(?:[.,]\d+)?)\s*cm',
        r'(?:Saattiefe|Aussaattiefe|Ablagetiefe)[:\s]*(?:ca\.?\s*)?(\d+(?:[.,]\d+)?)\s*cm',
        r'(\d+(?:[.,]\d+)?)\s*[-–]\s*(\d+(?:[.,]\d+)?)\s*cm\s+(?:tief|Tiefe)',
    ]
    for pat in depth_pats:
        dm = re.search(pat, full_text, re.IGNORECASE)
        if dm:
            vals = [float(dm.group(i).replace(",", ".")) for i in range(1, dm.lastindex + 1)]
            product.sowing_depth_cm = sum(vals) / len(vals)
            break
    # Fallback: look in raw HTML for common depth patterns like "0,5–1 cm" near depth keywords
    if product.sowing_depth_cm is None:
        dm = re.search(
            r'(?:Saattiefe|Ablagetiefe|Aussaattiefe|Saatgutablage)\D{0,30}?(\d+(?:[.,]\d+)?)\s*[-–]\s*(\d+(?:[.,]\d+)?)\s*cm',
            html, re.IGNORECASE
        )
        if dm:
            d1 = float(dm.group(1).replace(",", "."))
            d2 = float(dm.group(2).replace(",", "."))
            product.sowing_depth_cm = (d1 + d2) / 2
    # ── Spacing ──
    # Look for "ROW x PLANT cm" patterns
    spacing_pats = [
        # "30–40 x 2–4 cm" (range x range)
        r'(\d+)\s*[-–]\s*(\d+)\s*[x×]\s*(\d+)\s*[-–]\s*(\d+)\s*cm',
        # "100 x 50 cm" (simple)
        r'(\d+(?:[.,]\d+)?)\s*[x×]\s*(\d+(?:[.,]\d+)?)\s*cm',
    ]
    for pat in spacing_pats:
        matches = re.findall(pat, full_text, re.IGNORECASE)
        if matches:
            # Prefer the last match (often the more relevant outdoor spacing)
            m = matches[-1]
            if len(m) == 4:
                product.row_spacing_cm = (float(m[0]) + float(m[1])) / 2
                product.plant_spacing_cm = (float(m[2]) + float(m[3])) / 2
            elif len(m) == 2:
                v1 = float(m[0].replace(",", "."))
                v2 = float(m[1].replace(",", "."))
                product.row_spacing_cm = v1
                product.plant_spacing_cm = v2
            break
    # ── Germination temperature ──
    temp_pats = [
        r'(?:Keimtemperatur|Keimtemp)[.:\s]*(?:ca\.?\s*)?(\d+)\s*[-–]\s*(\d+)\s*°?\s*C',
        # The separator must be a real dash or the word "und".  The former
        # character class ``[-–und ]*`` also matched *nothing*, so a lone
        # "28 °C" was split into 2 and 8 and stored as 5.0 °C.
        r'(\d+)\s*(?:[-–]|und)\s*(\d+)\s*°\s*C',
        r'(?:mindestens|mind\.)\s*(\d+)\s*°\s*C',
    ]
    for pat in temp_pats:
        tm = re.search(pat, full_text, re.IGNORECASE)
        if tm:
            vals = [float(tm.group(i)) for i in range(1, tm.lastindex + 1)]
            # Sanity check: only accept an average between 5 and 40 °C;
            # otherwise fall through to the next pattern.
            avg = sum(vals) / len(vals)
            if 5 <= avg <= 40:
                product.germination_temp_c = avg
                break
    # ── Perennial ──
    perennial_pats = [r'mehrj[aä]hrig', r'winterhart', r'ausdauernd', r'Halbstrauch', r'Staude']
    for pat in perennial_pats:
        if re.search(pat, full_text, re.IGNORECASE):
            product.perennial = True
            break
    return product
 # ── Recursive product discovery ─────────────────────────────────────────────
 def discover_products(
    category_url: str,
    default_species: Optional[str],
    max_depth: int = 3,
    _depth: int = 0,
    _visited: set = None,
 ) -> list[ProductData]:
    """Crawl a category URL depth-first and return every product found below it.

    A page that ``parse_product`` recognises is returned as a single-item
    list.  Any other page is treated as a category: links that sit exactly
    one path segment below it on www.reinsaat.at are followed recursively.

    ``_depth`` and ``_visited`` are recursion bookkeeping; callers may pass a
    shared ``_visited`` set to avoid re-fetching URLs across categories.
    Fetch errors are printed and yield an empty list.
    """
    seen = set() if _visited is None else _visited
    if _depth > max_depth or category_url in seen:
        return []
    seen.add(category_url)

    pad = "  " * (_depth + 1)
    print(f"{pad}Fetching: {category_url}")
    try:
        page = fetch_url(category_url)
        time.sleep(DELAY)
    except Exception as exc:
        print(f"{pad}  ERROR: {exc}")
        return []

    # A product page ends the recursion on this branch.
    leaf = parse_product(page, category_url, default_species)
    if leaf:
        return [leaf]

    # Category page: gather direct children, keeping first-seen order.
    prefix = urllib.parse.urlparse(category_url).path.rstrip("/") + "/"
    children = {}
    for href in extract_links(page, category_url):
        target = urllib.parse.urlparse(href)
        if target.netloc not in ("", "www.reinsaat.at"):
            continue
        path = target.path.rstrip("/")
        # Remainder below the category path; empty when not underneath it.
        tail = path[len(prefix):] if path.startswith(prefix) else ""
        if not tail or "/" in tail:
            continue
        url = f"https://www.reinsaat.at{path}/"
        if url not in seen:
            children[url] = None
    print(f"{pad}  Found {len(children)} child links")

    found = []
    for url in children:
        found.extend(discover_products(url, default_species, max_depth, _depth + 1, seen))
    return found
 # ── Main ────────────────────────────────────────────────────────────────────
 def main():
    """Scrape the Reinsaat shop and mirror its products into the API.

    Steps:
      1. load all species from the API;
      2. load all existing cultivars, keyed by (species_id, lowercase name);
      3. crawl every entry of CATEGORIES for product pages;
      4. for each unique product, match a species, create the cultivar if it
         is missing, and link it to the Reinsaat supplier.

    Progress and a final summary are printed to stdout.  Failures on
    individual products are reported and counted; they never abort the run.
    """
    print("=" * 70)
    print("Reinsaat Scraper -> HerbAPI")
    print("=" * 70)
    # Load species
    print("\n[1] Loading species from API...")
    species_map = load_species()
    sci_names = [k for k in species_map if " " in k]
    print(f"    {len(sci_names)} species loaded:")
    for k in sorted(sci_names):
        s = species_map[k]
        print(f"      {s['name_scientific']:40s} {s['id'][:12]}...")
    # Load existing cultivars
    print("\n[2] Loading existing cultivars...")
    existing_cultivars = {}  # (species_id, name_lower) -> cultivar_id
    page = 1
    while True:
        data = api_get(f"/cultivars?per_page=100&page={page}")
        clist = data.get("data", data) if isinstance(data, dict) else data
        if not clist:
            break
        for c in clist:
            existing_cultivars[(c["species_id"], c["name"].lower())] = c["id"]
        # Check pagination - API uses {data, total, page, per_page} format
        if isinstance(data, dict):
            total = data.get("total", len(clist))
            per_page = data.get("per_page", 100)
            if page * per_page >= total:
                break
        else:
            break
        page += 1
    print(f"    {len(existing_cultivars)} existing cultivars")
    # Discover products from all categories
    print("\n[3] Discovering products from Reinsaat categories...")
    all_products: list[ProductData] = []
    visited: set[str] = set()  # shared across categories so no URL is fetched twice
    for cat_url, species_hint in CATEGORIES:
        print(f"\n  Category: {cat_url}")
        products = discover_products(cat_url, species_hint, max_depth=3, _visited=visited)
        all_products.extend(products)
        print(f"  -> {len(products)} products from this category")
    print(f"\n  Total products discovered: {len(all_products)}")
    # Deduplicate by URL
    seen_urls = set()
    unique_products = []
    for p in all_products:
        if p.url not in seen_urls:
            seen_urls.add(p.url)
            unique_products.append(p)
    all_products = unique_products
    print(f"  Unique products: {len(all_products)}")
    # Process products
    print("\n[4] Creating cultivars in API...")
    stats = {"created": 0, "skipped_no_species": 0, "skipped_exists": 0, "errors": 0, "linked": 0}
    for i, product in enumerate(all_products):
        pct = (i + 1) / len(all_products) * 100
        print(f"\n  [{i+1}/{len(all_products)}] ({pct:.0f}%) {product.name}")
        # Match species
        species = match_species(product.latin_name, species_map)
        if not species:
            print(f"    Skip: no species match for '{product.latin_name}'")
            stats["skipped_no_species"] += 1
            continue
        species_id = species["id"]
        print(f"    Species: {species['name_scientific']}")
        print(f"    SKU: {product.sku}, Depth: {product.sowing_depth_cm}, "
              f"Spacing: {product.row_spacing_cm}x{product.plant_spacing_cm}, "
              f"Temp: {product.germination_temp_c}, Perennial: {product.perennial}")
        # The same supplier-link body is used for existing and new cultivars.
        link_payload = {
            "supplier_id": REINSAAT_SUPPLIER_ID,
            "product_url": product.url,
            "article_number": product.sku,
        }
        # Check duplicates
        key = (species_id, product.name.lower())
        if key in existing_cultivars:
            # Still try to link supplier if cultivar exists
            cultivar_id = existing_cultivars[key]
            print(f"    Exists: {cultivar_id[:12]}... - checking supplier link")
            try:
                api_post(f"/cultivars/{cultivar_id}/suppliers", link_payload)
                print(f"    Linked to Reinsaat (SKU: {product.sku})")
                stats["linked"] += 1
            except Exception as e:
                # Best effort: the link may already exist.  Report the reason
                # instead of silently discarding it, so real failures
                # (timeouts, auth errors) remain visible in the log.
                print(f"    Supplier link not created (already linked or error): {e}")
            stats["skipped_exists"] += 1
            continue
        # Build payload
        payload = {
            "species_id": species_id,
            "name": product.name,
            "name_de": product.name,
            "name_en": "",
            "description": product.description,
            "is_organic": product.is_organic,
            "perennial": product.perennial,
        }
        # Optional measurements are only sent when they were actually parsed.
        if product.sowing_depth_cm is not None:
            payload["planting_depth_cm"] = round(product.sowing_depth_cm, 2)
        if product.row_spacing_cm is not None:
            payload["row_spacing_cm"] = round(product.row_spacing_cm, 1)
        if product.plant_spacing_cm is not None:
            payload["plant_spacing_cm"] = round(product.plant_spacing_cm, 1)
        if product.germination_temp_c is not None:
            payload["germination_temp_c"] = round(product.germination_temp_c, 1)
        # Create cultivar
        try:
            result = api_post("/cultivars", payload)
            cultivar_id = result["id"]
            print(f"    Created: {cultivar_id}")
            stats["created"] += 1
            # Remember it so a later product with the same name is not re-created.
            existing_cultivars[key] = cultivar_id
        except Exception as e:
            print(f"    FAILED to create: {e}")
            stats["errors"] += 1
            continue
        # Link to supplier
        try:
            api_post(f"/cultivars/{cultivar_id}/suppliers", link_payload)
            print(f"    Linked to Reinsaat (SKU: {product.sku})")
            stats["linked"] += 1
        except Exception as e:
            print(f"    FAILED to link supplier: {e}")
    # Summary
    print("\n" + "=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"  Created:              {stats['created']}")
    print(f"  Linked to supplier:   {stats['linked']}")
    print(f"  Skipped (no species): {stats['skipped_no_species']}")
    print(f"  Skipped (exists):     {stats['skipped_exists']}")
    print(f"  Errors:               {stats['errors']}")
    print("=" * 70)
 if __name__ == "__main__":
    main()
@@ -0,0 +1,770 @@
 #!/usr/bin/env python3
 """
 Reinsaat Scraper v2 — scrape ALL Reinsaat categories, match species by extracting
 genus+species from extended botanical names, create/enrich cultivars, link supplier.
 Uses direct PostgreSQL access (psycopg2) for speed and reliability.
 """
 import json
 import re
 import ssl
 import sys
 import time
 import uuid
 import html as html_mod
 import urllib.request
 import urllib.error
 import urllib.parse
 from dataclasses import dataclass, field
 from typing import Optional
 # Unbuffered output
 sys.stdout.reconfigure(line_buffering=True)
 sys.stderr.reconfigure(line_buffering=True)
 import psycopg2
 import psycopg2.extras
 # ── Config ──────────────────────────────────────────────────────────────────
 # NOTE(review): the database host, user and password are hard-coded below and
 # therefore stored in version control. Consider loading them from the
 # environment or a secrets store, and rotating this password.
 DB_HOST = "10.31.3.90"
 DB_NAME = "herbapi"
 DB_USER = "herbapi"
 DB_PASS = "_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj"
 # Supplier id used when reading and writing cultivar/supplier links.
 REINSAAT_SUPPLIER_ID = "019ced24-1702-72d1-9acc-90435441a5c4"
 # Pause in seconds after each successful page fetch (passed to time.sleep).
 DELAY = 0.3
 # Sent as the User-Agent header of every request made by fetch_url().
 USER_AGENT = "HerbAPI-Scraper/2.0 (florian.berthold@sub-net.at)"
 # ── All Reinsaat categories ────────────────────────────────────────────────
 # Root category URLs; main() passes each one to discover_products() as a
 # starting point for the crawl.
 CATEGORIES = [
    "https://www.reinsaat.at/shop/DE/bohnen/",
    "https://www.reinsaat.at/shop/DE/erbsen/",
    "https://www.reinsaat.at/shop/DE/gurken/",
    "https://www.reinsaat.at/shop/DE/karotten_moehren_1/",
    "https://www.reinsaat.at/shop/DE/knollenfenchel/",
    "https://www.reinsaat.at/shop/DE/kohlgewaechse/",
    "https://www.reinsaat.at/shop/DE/kuerbis/",
    "https://www.reinsaat.at/shop/DE/mais/",
    "https://www.reinsaat.at/shop/DE/mangold/",
    "https://www.reinsaat.at/shop/DE/melanzani_1/",
    "https://www.reinsaat.at/shop/DE/melone/",
    "https://www.reinsaat.at/shop/DE/paprika/",
    "https://www.reinsaat.at/shop/DE/pastinaken_1/",
    "https://www.reinsaat.at/shop/DE/petersilie/",
    "https://www.reinsaat.at/shop/DE/pfefferoni_chili/",
    "https://www.reinsaat.at/shop/DE/porree/",
    "https://www.reinsaat.at/shop/DE/radies_rettich/",
    "https://www.reinsaat.at/shop/DE/rote_ruebe/",
    "https://www.reinsaat.at/shop/DE/salate/",
    "https://www.reinsaat.at/shop/DE/schwarzwurzeln/",
    "https://www.reinsaat.at/shop/DE/sellerie/",
    "https://www.reinsaat.at/shop/DE/spinat/",
    "https://www.reinsaat.at/shop/DE/tomaten_paradeiser/",
    "https://www.reinsaat.at/shop/DE/wurzelpetersilie_1/",
    "https://www.reinsaat.at/shop/DE/zucchini/",
    "https://www.reinsaat.at/shop/DE/zwiebel_knoblauch/",
    "https://www.reinsaat.at/shop/DE/kuechen-_und_gewuerzkraeuter/",
    "https://www.reinsaat.at/shop/DE/blumen_und_heilkraeuter/",
    "https://www.reinsaat.at/shop/DE/gruenduengung/",
 ]
 # ── HTTP ────────────────────────────────────────────────────────────────────
 _ssl_ctx = ssl.create_default_context()
 def fetch_url(url: str, retries: int = 2) -> str:
    """Download ``url`` and return the response body decoded to text.

    The body is decoded with the charset announced in the response headers,
    falling back to UTF-8.  On a URL/HTTP error or timeout the request is
    repeated after a 2-second pause, up to ``retries`` additional times; the
    error from the final attempt is re-raised.
    """
    request_headers = {
        "User-Agent": USER_AGENT,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "de-AT,de;q=0.9,en;q=0.5",
    }
    req = urllib.request.Request(url, headers=request_headers)

    # Count down the retries that are still available after each attempt.
    for retries_left in range(retries, -1, -1):
        try:
            with urllib.request.urlopen(req, timeout=30, context=_ssl_ctx) as resp:
                encoding = resp.headers.get_content_charset() or "utf-8"
                return resp.read().decode(encoding)
        except (urllib.error.URLError, TimeoutError):
            # HTTPError is a subclass of URLError, so it is covered as well.
            if retries_left == 0:
                raise
            time.sleep(2)
    # Only reached when ``retries`` is negative and no attempt was made.
    return ""
 # ── HTML parsing helpers ────────────────────────────────────────────────────
 def extract_links(html_text: str, base_url: str) -> list[str]:
    """Return the absolute URLs of all ``<a href="...">`` links, without duplicates.

    Empty hrefs, fragment-only links and ``javascript:`` links are ignored.
    Relative hrefs are resolved against ``base_url``.  The order of first
    appearance is preserved.
    """
    hrefs = (
        match.group(1)
        for match in re.finditer(r'<a\s[^>]*href="([^"]*)"', html_text, re.IGNORECASE)
    )
    absolute = (
        urllib.parse.urljoin(base_url, href)
        for href in hrefs
        if href and not href.startswith(("#", "javascript:"))
    )
    # dict.fromkeys drops duplicates while keeping insertion order.
    return list(dict.fromkeys(absolute))
 def extract_jsonld_product(html_text: str) -> Optional[dict]:
    """Return the first JSON-LD object of ``@type`` "Product" embedded in the page.

    Every ``<script type="application/ld+json">`` block is inspected in
    document order.  Blocks that are not valid JSON, are not a JSON object,
    or have a different ``@type`` are skipped.  Returns None if no Product
    object is found.
    """
    script_bodies = re.findall(
        r'<script[^>]*type="application/ld\+json"[^>]*>(.*?)</script>',
        html_text, re.DOTALL | re.IGNORECASE
    )
    for body in script_bodies:
        try:
            parsed = json.loads(body)
        except (json.JSONDecodeError, ValueError):
            continue
        if isinstance(parsed, dict) and parsed.get("@type") == "Product":
            return parsed
    return None
 def html_to_text(html_text: str) -> str:
    """Reduce an HTML fragment to plain text.

    Tags are replaced by spaces, character references are decoded, and runs
    of whitespace are collapsed to single spaces with the ends trimmed.
    """
    tagless = re.sub(r'<[^>]+>', ' ', html_text)
    decoded = html_mod.unescape(tagless)
    return re.sub(r'\s+', ' ', decoded).strip()
 def extract_botanical_name(html_text: str) -> str:
    """
    Locate the botanical (Latin) name on a product page.

    Sources are tried in this order and the first hit wins:
      1. the text of the ``fce_shop_kurztext`` element;
      2. the first ``<em>`` inside the ``growingInfos`` element;
      3. any ``<em>``/``<i>`` whose text starts with a Latin-looking binomial
         and is shorter than 100 characters.

    The text is returned unprocessed (author citations, infraspecific ranks
    and the like are kept).  Returns "" when nothing suitable is found.
    """
    flags = re.DOTALL | re.IGNORECASE

    # 1) Short-description element
    kurztext = re.search(r'class="fce_shop_kurztext"[^>]*>(.*?)</div>', html_text, flags)
    if kurztext:
        candidate = html_to_text(kurztext.group(1)).strip()
        if candidate and re.search(r'[A-Z][a-z]+\s+[a-z]', candidate):
            return candidate

    # 2) Emphasised text inside the growing information
    growing = re.search(r'class="growingInfos"[^>]*>(.*?)</div>', html_text, flags)
    if growing:
        for emphasised in re.finditer(r'<em>(.*?)</em>', growing.group(1), re.DOTALL):
            candidate = html_to_text(emphasised.group(1)).strip()
            if candidate and re.search(r'[A-Z][a-z]+\s+[a-z]', candidate):
                return candidate

    # 3) Any emphasised/italic text anywhere on the page
    for tag in re.finditer(r'<(?:em|i)>(.*?)</(?:em|i)>', html_text, flags):
        candidate = html_to_text(tag.group(1)).strip()
        if candidate and len(candidate) < 100 and re.search(r'^[A-Z][a-z]+\s+[a-z]+', candidate):
            return candidate

    return ""
 def normalize_latin_name(raw: str) -> str:
    """
    Reduce an extended botanical name to "Genus species".

    Examples:
      "Pisum sativum L. convar. sat." -> "Pisum sativum"
      "Brassica oleracea L. convar. botrytis" -> "Brassica oleracea"
      "Cucumis sativus" -> "Cucumis sativus"
      "Mentha × piperita" -> "Mentha x piperita"

    Surrounding whitespace and leading/trailing ``.,;:`` are removed first.
    A name with fewer than two words, or whose first word is not
    capitalised / second word does not start lowercase, is returned in that
    trimmed form without further changes.
    """
    if not raw:
        return ""

    trimmed = raw.strip().strip(".,;:")
    tokens = trimmed.split()
    if len(tokens) < 2:
        return trimmed

    genus, epithet, *rest = tokens

    # Hybrid marker between genus and epithet; always written back as "x".
    if rest and epithet in ("x", "×"):
        return f"{genus} x {rest[0]}"

    if genus[0].isupper() and epithet[0].islower():
        return f"{genus} {epithet}"
    return trimmed  # does not look like a binomial; leave it alone
 # ── Calendar parsing ────────────────────────────────────────────────────────
 # Maps a lower-case fragment of a calendar row label to the cultivar field
 # that row feeds.  Several fragments overlap (e.g. "pflanzung" is contained
 # in "aussaat/ pflanzung gewächshaus"); parse_calendar() resolves this by
 # using the longest fragment found in the label.
 CALENDAR_ROW_TYPES = {
    "voranzucht": "indoor_sowing_months",
    "vorzucht": "indoor_sowing_months",
    "vorkultur": "indoor_sowing_months",
    "aussaat/ pflanzung freiland": "direct_sowing_months",
    "aussaat/pflanzung freiland": "direct_sowing_months",
    "aussaat freiland": "direct_sowing_months",
    "direktsaat": "direct_sowing_months",
    "pflanzung freiland": "transplanting_months",
    "pflanzung": "transplanting_months",
    "aussaat/ pflanzung gewächshaus": "glasshouse_months",
    "aussaat/pflanzung gewächshaus": "glasshouse_months",
    "gewächshaus": "glasshouse_months",
    "ernte": "harvesting_months",
 }
 def parse_calendar(html_text: str) -> dict:
    """
    Parse the Reinsaat growing calendar table.

    Returns a dict with keys like 'direct_sowing_months' or
    'harvesting_months'.  Each value is a sorted list of month integers
    (1-12).  Rows whose label matches no entry of CALENDAR_ROW_TYPES, and
    rows without any coloured cell, are omitted.  An empty dict is returned
    when the page has no calendar table.
    """
    result = {}
    cal_match = re.search(r'class="rs-growing-time[^"]*"(.*?)</table>', html_text, re.DOTALL)
    if not cal_match:
        return result
    cal = cal_match.group(1)
    rows = re.findall(r'<tr>(.*?)</tr>', cal, re.DOTALL)
    for row in rows:
        # Get label
        label_m = re.search(r'class="type-lable"[^>]*>(.*?)</td>', row, re.DOTALL)
        if not label_m:
            continue
        label = html_to_text(label_m.group(1)).strip().lower()
        # Map label to our field.  Use the longest (most specific) fragment
        # contained in the label: taking the first hit in dict order sent
        # "aussaat/ pflanzung gewächshaus" to transplanting_months, because
        # the shorter "pflanzung" entry is listed earlier.
        candidates = [pattern for pattern in CALENDAR_ROW_TYPES if pattern in label]
        if not candidates:
            continue
        field_name = CALENDAR_ROW_TYPES[max(candidates, key=len)]
        # Extract background colors for each cell (24 cells = 12 months x 2 halves)
        colors = re.findall(r'background-color:\s*([^;"]+)', row)
        # Convert to months: cell i maps to month (i // 2) + 1
        active_months = set()
        for i, color in enumerate(colors):
            color = color.strip().lower()
            if color != "none" and color != "transparent" and color != "":
                month = (i // 2) + 1
                if 1 <= month <= 12:
                    active_months.add(month)
        if active_months:
            # Merge if same field already found (e.g. two sowing rows)
            if field_name in result:
                result[field_name] = sorted(set(result[field_name]) | active_months)
            else:
                result[field_name] = sorted(active_months)
    return result
 # ── Growing data extraction ─────────────────────────────────────────────────
 def extract_growing_data(html_text: str) -> dict:
    """Extract sowing depth, spacing, germination temperature and perennial flag.

    Only the text of the ``growingInfos`` element is examined.  The returned
    dict contains a key only when the corresponding value was found:

    * ``planting_depth_cm``  -- mean of the matched depth value(s), 2 decimals
    * ``row_spacing_cm`` / ``plant_spacing_cm`` -- from the last
      "ROW x PLANT cm" occurrence; ranges are reduced to their midpoint
    * ``germination_temp_c`` -- mean of the matched temperature(s), accepted
      only when it lies between 5 and 40
    * ``perennial``          -- True when a perennial keyword is present

    Returns an empty dict when the page has no ``growingInfos`` element.
    """
    data = {}
    # Get the growingInfos block
    gi = re.search(r'class="growingInfos"[^>]*>(.*?)</div>', html_text, re.DOTALL | re.IGNORECASE)
    if not gi:
        return data
    # Strip tags, decode entities (e.g. &ndash;, &deg;) and collapse
    # whitespace so the patterns below see plain text.
    raw_text = html_mod.unescape(re.sub(r'<[^>]+>', ' ', gi.group(1)))
    raw_text = re.sub(r'\s+', ' ', raw_text)
    # ── Sowing depth ──
    depth_pats = [
        r'(?:Saattiefe|Aussaattiefe|Ablagetiefe|Saatgutablage)[:\s]*(?:ca\.?\s*)?(\d+(?:[.,]\d+)?)\s*[-–]\s*(\d+(?:[.,]\d+)?)\s*cm',
        r'(?:Saattiefe|Aussaattiefe|Ablagetiefe|Saatgutablage)[:\s]*(?:ca\.?\s*)?(\d+(?:[.,]\d+)?)\s*cm',
    ]
    for pat in depth_pats:
        dm = re.search(pat, raw_text, re.IGNORECASE)
        if dm:
            vals = [float(dm.group(i).replace(",", ".")) for i in range(1, dm.lastindex + 1)]
            data["planting_depth_cm"] = round(sum(vals) / len(vals), 2)
            break
    # ── Spacing: "ROW x PLANT cm" ──
    spacing_pats = [
        # "30–45 x 3–5 cm" (range x range)
        r'(\d+)\s*[-–]\s*(\d+)\s*[x×]\s*(\d+)\s*[-–]\s*(\d+)\s*cm',
        # "100 x 50 cm" (simple)
        r'(\d+(?:[.,]\d+)?)\s*[x×]\s*(\d+(?:[.,]\d+)?)\s*cm',
    ]
    for pat in spacing_pats:
        matches = re.findall(pat, raw_text, re.IGNORECASE)
        if matches:
            m = matches[-1]  # prefer last match
            if len(m) == 4:
                data["row_spacing_cm"] = round((float(m[0]) + float(m[1])) / 2, 1)
                data["plant_spacing_cm"] = round((float(m[2]) + float(m[3])) / 2, 1)
            elif len(m) == 2:
                v1 = float(m[0].replace(",", "."))
                v2 = float(m[1].replace(",", "."))
                data["row_spacing_cm"] = round(v1, 1)
                data["plant_spacing_cm"] = round(v2, 1)
            break
    # ── Germination temperature ──
    temp_pats = [
        r'(?:Keimtemperatur|Keimtemp)[.:\s]*(?:ca\.?\s*)?(\d+)\s*[-–]\s*(\d+)\s*[°]?\s*C',
        r'(?:mindestens|mind\.)\s*(\d+)\s*°\s*C',
    ]
    for pat in temp_pats:
        tm = re.search(pat, raw_text, re.IGNORECASE)
        if tm:
            vals = [float(tm.group(i)) for i in range(1, tm.lastindex + 1)]
            avg = sum(vals) / len(vals)
            # Implausible averages are rejected and the next pattern is tried.
            if 5 <= avg <= 40:
                data["germination_temp_c"] = round(avg, 1)
                break
    # ── Perennial ──
    perennial_pats = [r'mehrj[aä]hrig', r'winterhart', r'ausdauernd', r'Halbstrauch', r'Staude']
    for pat in perennial_pats:
        if re.search(pat, raw_text, re.IGNORECASE):
            data["perennial"] = True
            break
    return data
 # ── Product data ────────────────────────────────────────────────────────────
@dataclass
 class ProductData:
    """Data scraped from a single product page by ``parse_product``."""
    name: str = ""  # JSON-LD "name"
    raw_latin_name: str = ""  # botanical name as returned by extract_botanical_name()
    normalized_latin: str = ""  # normalize_latin_name(raw_latin_name), used for species lookup
    description: str = ""  # JSON-LD "description"
    sku: str = ""  # JSON-LD "model"
    url: str = ""  # URL the page was fetched from
    is_organic: bool = True  # parse_product always sets this to True
    growing_data: dict = field(default_factory=dict)  # result of extract_growing_data()
    calendar: dict = field(default_factory=dict)  # result of parse_calendar()
 def parse_product(html_text: str, url: str) -> Optional[ProductData]:
    """Build a ProductData from a product page, or return None for any other page.

    A page is a product page when it embeds a JSON-LD Product object; name,
    description and SKU come from that object.  The botanical name, growing
    data and sowing calendar are extracted from the HTML itself.
    """
    jsonld = extract_jsonld_product(html_text)
    if not jsonld:
        return None

    name = jsonld.get("name", "").strip()
    description = jsonld.get("description", "").strip()
    sku = jsonld.get("model", "").strip()

    # Keep the raw botanical name alongside its genus+species form.
    raw_latin = extract_botanical_name(html_text)
    normalized = normalize_latin_name(raw_latin)

    growing = extract_growing_data(html_text)
    calendar = parse_calendar(html_text)

    return ProductData(
        name=name,
        raw_latin_name=raw_latin,
        normalized_latin=normalized,
        description=description,
        sku=sku,
        url=url,
        # Every product is recorded as organic; no per-page check is done.
        is_organic=True,
        growing_data=growing,
        calendar=calendar,
    )
 # ── Recursive discovery ─────────────────────────────────────────────────────
 def discover_products(
    category_url: str,
    max_depth: int = 4,
    _depth: int = 0,
    _visited: set = None,
 ) -> list[ProductData]:
    """Crawl a category URL depth-first and return every product found below it.

    A page that ``parse_product`` recognises is returned as a single-item
    list.  Any other page is treated as a category whose links exactly one
    path segment deeper (on www.reinsaat.at) are followed recursively.

    ``_depth`` and ``_visited`` are recursion bookkeeping; a caller may pass
    a shared ``_visited`` set so URLs are fetched only once across several
    categories.  Fetch errors are printed and yield an empty list.
    """
    seen = set() if _visited is None else _visited
    if _depth > max_depth or category_url in seen:
        return []
    seen.add(category_url)

    pad = "  " * (_depth + 1)
    try:
        page = fetch_url(category_url)
        time.sleep(DELAY)
    except Exception as exc:
        print(f"{pad}ERROR fetching {category_url}: {exc}")
        return []

    # A product page is a leaf of the crawl.
    leaf = parse_product(page, category_url)
    if leaf:
        return [leaf]

    # Category page: collect direct children in first-seen order.
    prefix = urllib.parse.urlparse(category_url).path.rstrip("/") + "/"
    children = {}
    for href in extract_links(page, category_url):
        target = urllib.parse.urlparse(href)
        if target.netloc not in ("", "www.reinsaat.at"):
            continue
        path = target.path.rstrip("/")
        # Remainder below the category path; empty when not underneath it.
        tail = path[len(prefix):] if path.startswith(prefix) else ""
        if not tail or "/" in tail:
            continue
        url = f"https://www.reinsaat.at{path}/"
        if url not in seen:
            children[url] = None
    print(f"{pad}Category {category_url} -> {len(children)} children")

    found = []
    for url in children:
        found.extend(discover_products(url, max_depth, _depth + 1, seen))
    return found
 # ── Slug generation ─────────────────────────────────────────────────────────
 def make_slug(species_name: str, cultivar_name: str) -> str:
    """Build a lowercase, hyphen-separated slug from species and cultivar name.

    German umlauts, ß and common accented letters are transliterated to
    ASCII.  Every other run of characters outside ``a-z0-9`` becomes a single
    hyphen, and leading/trailing hyphens are removed.
    """
    transliteration = str.maketrans({
        'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'ß': 'ss',
        'é': 'e', 'è': 'e', 'ê': 'e', 'ë': 'e',
        'á': 'a', 'à': 'a', 'â': 'a',
        'í': 'i', 'ì': 'i', 'î': 'i',
        'ó': 'o', 'ò': 'o', 'ô': 'o',
        'ú': 'u', 'ù': 'u', 'û': 'u',
        'ñ': 'n', 'ç': 'c',
    })
    combined = f"{species_name}-{cultivar_name}".lower().translate(transliteration)
    # Each maximal run of non-alphanumerics collapses to exactly one hyphen,
    # so no further hyphen squeezing is needed afterwards.
    return re.sub(r'[^a-z0-9]+', '-', combined).strip('-')
 # ── Main ────────────────────────────────────────────────────────────────────
 def db_connect():
    """Open and return a new PostgreSQL connection with autocommit disabled.

    Connection settings come from the module-level DB_* constants.
    """
    settings = {
        "host": DB_HOST,
        "dbname": DB_NAME,
        "user": DB_USER,
        "password": DB_PASS,
    }
    connection = psycopg2.connect(**settings)
    connection.autocommit = False
    return connection
 def main():
    print("=" * 70)
    print("Reinsaat Scraper v2")
    print("=" * 70)
    # ── Phase 1: Discover all products (no DB needed) ──
    print("\n[1] Discovering products from Reinsaat categories...")
    all_products: list[ProductData] = []
    visited: set[str] = set()
    for cat_url in CATEGORIES:
        print(f"\n  Category: {cat_url}")
        products = discover_products(cat_url, max_depth=4, _visited=visited)
        all_products.extend(products)
        print(f"  -> {len(products)} products")
    # Deduplicate by URL
    seen_urls = set()
    unique_products = []
    for p in all_products:
        if p.url not in seen_urls:
            seen_urls.add(p.url)
            unique_products.append(p)
    all_products = unique_products
    print(f"\n  Total unique products: {len(all_products)}")
    # ── Phase 2: Connect to DB and load existing data ──
    print("\n[2] Connecting to DB and loading existing data...")
    conn = db_connect()
    cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
    # Load species
    cur.execute("SELECT id, name_scientific FROM species ORDER BY name_scientific")
    species_rows = cur.fetchall()
    species_map = {}
    for row in species_rows:
        key = row["name_scientific"].lower().strip()
        species_map[key] = row
    print(f"    {len(species_map)} species loaded")
    # Load existing cultivars
    cur.execute("""
        SELECT id, species_id, name, slug, description,
               row_spacing_cm, plant_spacing_cm, planting_depth_cm,
               germination_temp_c, perennial,
               indoor_sowing_months, direct_sowing_months,
               transplanting_months, glasshouse_months, harvesting_months
        FROM cultivars
    """)
    cultivar_rows = cur.fetchall()
    existing_cultivars = {}
    existing_slugs = set()
    for row in cultivar_rows:
        sid = str(row["species_id"])
        name_lower = row["name"].lower()
        existing_cultivars[(sid, name_lower)] = dict(row)
        existing_slugs.add(row["slug"])
    print(f"    {len(existing_cultivars)} cultivars loaded")
    # Load existing Reinsaat supplier links
    cur.execute("""
        SELECT cultivar_id, product_url, article_number
        FROM cultivar_suppliers
        WHERE supplier_id = %s
    """, (REINSAAT_SUPPLIER_ID,))
    existing_links = {}
    for row in cur.fetchall():
        cid = str(row["cultivar_id"])
        url = row["product_url"] or ""
        sku = row["article_number"] or ""
        existing_links.setdefault(cid, []).append((url, sku))
    print(f"    {sum(len(v) for v in existing_links.values())} existing links for {len(existing_links)} cultivars")
    # ── Phase 3: Process products ──
    print("\n[3] Processing products...")
    stats = {
        "created": 0,
        "linked": 0,
        "enriched": 0,
        "skipped_no_species": 0,
        "skipped_no_name": 0,
        "link_exists": 0,
        "errors": 0,
    }
    unmatched = []
    for i, product in enumerate(all_products):
        pct = (i + 1) / len(all_products) * 100
        prefix = f"  [{i+1}/{len(all_products)}] ({pct:.0f}%)"
        if not product.name:
            stats["skipped_no_name"] += 1
            continue
        # Match species
        normalized = product.normalized_latin.lower().strip()
        species = species_map.get(normalized)
        if not species:
            # Try exact match on raw name (first two words)
            raw_words = product.raw_latin_name.split()
            if len(raw_words) >= 2:
                attempt = f"{raw_words[0].lower()} {raw_words[1].lower()}"
                species = species_map.get(attempt)
        if not species:
            stats["skipped_no_species"] += 1
            unmatched.append((product.name, product.raw_latin_name, product.normalized_latin, product.url))
            continue
        species_id = str(species["id"])
        species_name = species["name_scientific"]
        # Check if cultivar exists
        ckey = (species_id, product.name.lower())
        existing = existing_cultivars.get(ckey)
        if existing:
            cultivar_id = str(existing["id"])
            # ── Enrich existing cultivar with missing data ──
            updates = {}
            # Growing data from page
            gd = product.growing_data
            if gd.get("planting_depth_cm") and not existing.get("planting_depth_cm"):
                updates["planting_depth_cm"] = gd["planting_depth_cm"]
            if gd.get("row_spacing_cm") and not existing.get("row_spacing_cm"):
                updates["row_spacing_cm"] = gd["row_spacing_cm"]
            if gd.get("plant_spacing_cm") and not existing.get("plant_spacing_cm"):
                updates["plant_spacing_cm"] = gd["plant_spacing_cm"]
            if gd.get("germination_temp_c") and not existing.get("germination_temp_c"):
                updates["germination_temp_c"] = gd["germination_temp_c"]
            if gd.get("perennial") and not existing.get("perennial"):
                updates["perennial"] = True
            # Calendar data
            cal = product.calendar
            if cal.get("indoor_sowing_months") and not existing.get("indoor_sowing_months"):
                updates["indoor_sowing_months"] = cal["indoor_sowing_months"]
            if cal.get("direct_sowing_months") and not existing.get("direct_sowing_months"):
                updates["direct_sowing_months"] = cal["direct_sowing_months"]
            if cal.get("transplanting_months") and not existing.get("transplanting_months"):
                updates["transplanting_months"] = cal["transplanting_months"]
            if cal.get("glasshouse_months") and not existing.get("glasshouse_months"):
                updates["glasshouse_months"] = cal["glasshouse_months"]
            if cal.get("harvesting_months") and not existing.get("harvesting_months"):
                updates["harvesting_months"] = cal["harvesting_months"]
            # Description
            if product.description and not existing.get("description"):
                updates["description"] = product.description
            if updates:
                set_clauses = []
                values = []
                for col, val in updates.items():
                    set_clauses.append(f"{col} = %s")
                    values.append(val)
                set_clauses.append("updated_at = NOW()")
                values.append(cultivar_id)
                cur.execute(
                    f"UPDATE cultivars SET {', '.join(set_clauses)} WHERE id = %s::uuid",
                    values
                )
                stats["enriched"] += 1
                print(f"{prefix} {product.name} -> ENRICHED ({', '.join(updates.keys())})")
            # ── Add supplier link if missing ──
            link_exists = False
            if cultivar_id in existing_links:
                for lurl, lsku in existing_links[cultivar_id]:
                    if lurl == product.url or (lsku and lsku == product.sku):
                        link_exists = True
                        break
            if link_exists:
                stats["link_exists"] += 1
            else:
                try:
                    cur.execute("SAVEPOINT link_sp")
                    cur.execute("""
                        INSERT INTO cultivar_suppliers (cultivar_id, supplier_id, product_url, article_number, last_checked_at)
                        VALUES (%s::uuid, %s::uuid, %s, %s, NOW())
                        ON CONFLICT (cultivar_id, supplier_id, article_number) DO UPDATE
                        SET product_url = EXCLUDED.product_url, last_checked_at = NOW()
                    """, (cultivar_id, REINSAAT_SUPPLIER_ID, product.url, product.sku))
                    cur.execute("RELEASE SAVEPOINT link_sp")
                    stats["linked"] += 1
                    existing_links.setdefault(cultivar_id, []).append((product.url, product.sku))
                    print(f"{prefix} {product.name} -> LINKED ({product.sku})")
                except Exception as e:
                    print(f"{prefix} {product.name} -> LINK ERROR: {e}")
                    cur.execute("ROLLBACK TO SAVEPOINT link_sp")
                    stats["errors"] += 1
        else:
            # ── Create new cultivar ──
            slug = make_slug(species_name, product.name)
            # Ensure unique slug
            base_slug = slug
            counter = 2
            while slug in existing_slugs:
                slug = f"{base_slug}-{counter}"
                counter += 1
            gd = product.growing_data
            cal = product.calendar
            try:
                cur.execute("SAVEPOINT create_sp")
                cur.execute("""
                    INSERT INTO cultivars (
                        species_id, name, name_de, slug, description,
                        is_organic, perennial,
                        planting_depth_cm, row_spacing_cm, plant_spacing_cm,
                        germination_temp_c,
                        indoor_sowing_months, direct_sowing_months,
                        transplanting_months, glasshouse_months, harvesting_months
                    ) VALUES (
                        %s::uuid, %s, %s, %s, %s,
                        %s, %s,
                        %s, %s, %s,
                        %s,
                        %s, %s,
                        %s, %s, %s
                    )
                    RETURNING id
                """, (
                    species_id,
                    product.name,
                    product.name,
                    slug,
                    product.description,
                    product.is_organic,
                    gd.get("perennial", False),
                    gd.get("planting_depth_cm"),
                    gd.get("row_spacing_cm"),
                    gd.get("plant_spacing_cm"),
                    gd.get("germination_temp_c"),
                    cal.get("indoor_sowing_months"),
                    cal.get("direct_sowing_months"),
                    cal.get("transplanting_months"),
                    cal.get("glasshouse_months"),
                    cal.get("harvesting_months"),
                ))
                new_id = str(cur.fetchone()["id"])
                existing_slugs.add(slug)
                existing_cultivars[ckey] = {"id": new_id}
                stats["created"] += 1
                # Link to supplier
                cur.execute("""
                    INSERT INTO cultivar_suppliers (cultivar_id, supplier_id, product_url, article_number, last_checked_at)
                    VALUES (%s::uuid, %s::uuid, %s, %s, NOW())
                """, (new_id, REINSAAT_SUPPLIER_ID, product.url, product.sku))
                stats["linked"] += 1
                existing_links.setdefault(new_id, []).append((product.url, product.sku))
                print(f"{prefix} {product.name} -> CREATED ({species_name}, {slug})")
                cur.execute("RELEASE SAVEPOINT create_sp")
            except Exception as e:
                print(f"{prefix} {product.name} -> CREATE ERROR: {e}")
                cur.execute("ROLLBACK TO SAVEPOINT create_sp")
                stats["errors"] += 1
    # ── Commit ──
    conn.commit()
    # ── Summary ──
    print("\n" + "=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"  Total products discovered: {len(all_products)}")
    print(f"  New cultivars created:     {stats['created']}")
    print(f"  New supplier links added:  {stats['linked']}")
    print(f"  Cultivars enriched:        {stats['enriched']}")
    print(f"  Links already existed:     {stats['link_exists']}")
    print(f"  Skipped (no species):      {stats['skipped_no_species']}")
    print(f"  Skipped (no name):         {stats['skipped_no_name']}")
    print(f"  Errors:                    {stats['errors']}")
    print("=" * 70)
    if unmatched:
        print(f"\n  UNMATCHED PRODUCTS ({len(unmatched)}):")
        for name, raw_latin, normalized, url in sorted(unmatched, key=lambda x: x[2]):
            print(f"    {normalized:30s} (raw: {raw_latin:40s}) {name:30s} {url}")
    cur.close()
    conn.close()
 # Script entry point: main() runs only when this file is executed directly,
 # not when it is imported as a module.
 if __name__ == "__main__":
    main()
@@ -0,0 +1,635 @@
 #!/usr/bin/env python3
 """Reinsaat v3 scraper - uses HerbAPI REST API, robust botanical name matching."""
 import json
 import os
 import re
 import sys
 import time
 import urllib.error
 import urllib.parse
 import urllib.request
 from html import unescape
 # --- Config ---
 # HerbAPI endpoint and bearer token. Both can be overridden through the
 # environment (HERBAPI_BASE / HERBAPI_TOKEN) so that credentials do not have
 # to live in the source tree; the literals are kept only as
 # backward-compatible fallbacks for existing setups.
 # NOTE(review): the fallback token is a credential committed to version
 # control - rotate it and drop the default once HERBAPI_TOKEN is deployed.
 API_BASE = os.environ.get("HERBAPI_BASE") or "http://herbapi01.corp.sub-net.at:8080/api/v1"
 API_TOKEN = os.environ.get("HERBAPI_TOKEN") or "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
 # Root URL of the Reinsaat web shop that is scraped.
 REINSAAT_BASE = "https://www.reinsaat.at"
 # Pause, in seconds, inserted between consecutive requests to the shop.
 DELAY = 0.3
 # Shop category slugs to crawl, in crawl order. Only seed products are
 # covered; books, bulbs, peonies, potatoes, gift items and seed tapes are
 # deliberately left out.
 CATEGORIES = [
    "beans",
    "peas",
    "florence_fennel",
    "cucumbers",
    "brassica",
    "garden_cress",
    "pumpkins_squash",
    "corn",
    "swiss_chard",
    "aubergine_eggplants",
    "melons",
    "carrots",
    "sweet_pepper",
    "chilli_peppers_chill",
    "parsnips",
    "parsley",
    "parsley_root",
    "leeks",
    "radish",
    "beetroot",
    "lettuce",
    "black_salsify",
    "celery",
    "spinach",
    "tomatoes",
    "zucchini_courgette",
    "onion_garlic",
    "culinary_and_aromatic_herbs",
    "conservation_varieties",
    "flowers_and_herbs",
    "wild_flowers_seeds",
    "green_manure",
 ]
 # Lowercase tokens that may trail a botanical binomial and carry no
 # genus/species information (authority names, infraspecific ranks).
 # NOTE(review): nothing in this script references STRIP_SUFFIXES;
 # normalise_botanical() simply keeps the first two tokens. Confirm whether
 # this table is still needed.
 STRIP_SUFFIXES = {
    # infraspecific rank markers, with and without the trailing period
    "convar.", "convar",
    "var.", "var",
    "subsp.", "subsp",
    "ssp.", "ssp",
    "f.",
    # author citations and related qualifiers
    "l.", "l",
    "mill.", "dc.", "em.", "auct.", "hort.",
    "medik.", "medikus", "moench", "pers.", "salisb.", "thunb.",
    "crantz", "gaertn.", "lam.", "link", "siebold", "zucc.",
    # trailing epithet fragments (e.g. the "sat." in "convar. sat.")
    "sat.", "sat", "axillare",
 }
 def api_get(path, params=None, timeout=30):
    """GET a JSON resource from HerbAPI.

    Args:
        path: Endpoint path appended to API_BASE (e.g. "/species").
        params: Optional mapping of query parameters; URL-encoded and
            appended to the URL when truthy.
        timeout: Socket timeout in seconds. Without one, urlopen blocks
            indefinitely on a stalled connection and the whole scrape hangs.

    Returns:
        The decoded JSON response body.

    Raises:
        urllib.error.URLError: On connection or HTTP errors (HTTPError is a
            subclass). A timeout may also surface as TimeoutError.
    """
    url = f"{API_BASE}{path}"
    if params:
        url = f"{url}?{urllib.parse.urlencode(params)}"
    req = urllib.request.Request(url, headers={"Authorization": f"Bearer {API_TOKEN}"})
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())
 def api_post(path, data, timeout=30):
    """POST a JSON document to HerbAPI.

    Args:
        path: Endpoint path appended to API_BASE (e.g. "/cultivars").
        data: JSON-serialisable payload sent as the request body.
        timeout: Socket timeout in seconds. Without one, urlopen blocks
            indefinitely on a stalled connection and the whole scrape hangs.

    Returns:
        The decoded JSON response body.

    Raises:
        urllib.error.URLError: On connection or HTTP errors (HTTPError is a
            subclass). A timeout may also surface as TimeoutError.
    """
    req = urllib.request.Request(
        f"{API_BASE}{path}",
        data=json.dumps(data).encode(),
        method="POST",
        headers={
            "Authorization": f"Bearer {API_TOKEN}",
            "Content-Type": "application/json",
        },
    )
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())
 def fetch_page(url):
    """Download *url* and return the response body as text.

    The request identifies itself with the scraper's User-Agent and gives up
    after 15 seconds. The body is always decoded as UTF-8; undecodable bytes
    are replaced rather than raising.
    """
    request = urllib.request.Request(
        url, headers={"User-Agent": "Mozilla/5.0 (HerbAPI Scraper)"}
    )
    with urllib.request.urlopen(request, timeout=15) as response:
        payload = response.read()
    return payload.decode("utf-8", errors="replace")
 # Misspellings seen in scraped botanical names. A key is either a lowercase
 # genus token or a full lowercase "genus species" pair; the value is the
 # corrected spelling of the same shape.
 BOTANICAL_TYPOS = {
    "capscicum": "capsicum",
    "capsicum frutenscens": "capsicum frutescens",
    "tropaelum": "tropaeolum",
    "lact.": "lactuca",
 }
 # Abbreviated binomials and their expansions. Keys are matched as lowercase
 # prefixes of the cleaned name, trailing period included.
 ABBREVIATED_NAMES = {
    "origanum vulg.": "origanum vulgare",
    "helichrysum bract.": "helichrysum bracteatum",
    "campanula lat.": "campanula latifolia",
    "cosmos bip.": "cosmos bipinnatus",
    "papaver somnif.": "papaver somniferum",
 }
 def normalise_botanical(raw):
    """Strip botanical name to genus + species only.

    'Pisum sativum L. convar. sat.' -> 'pisum sativum'
    'Solanum lycopersicum L.'       -> 'solanum lycopersicum'
    'Beta vulgaris L. ssp. vulgaris' -> 'beta vulgaris'
    'Origanum vulg.'                -> 'origanum vulgare'

    Args:
        raw: Botanical name as scraped; may contain HTML entities,
            non-breaking spaces, parenthesised remarks, authority names and
            infraspecific ranks.

    Returns:
        The lowercase "genus species" string, or None when *raw* is empty,
        has fewer than two usable tokens, or its second token looks like an
        authority (capitalised) rather than a species epithet.
    """
    if not raw:
        return None
    # Decode HTML entities and turn non-breaking spaces into plain ones.
    text = unescape(raw).replace("\xa0", " ")
    # Drop parenthesised remarks before any other clean-up so punctuation
    # that ends up at the tail afterwards is still stripped below.
    text = re.sub(r"\([^)]*\)", "", text).strip()
    # Abbreviations end in a period, so they must be matched BEFORE trailing
    # punctuation is stripped - otherwise "Origanum vulg." loses its period
    # and can never match the "origanum vulg." key.
    lowered = text.lower()
    for abbrev, full in ABBREVIATED_NAMES.items():
        if lowered.startswith(abbrev):
            return full
    parts = text.rstrip(",. ").split()
    if len(parts) < 2:
        return None
    # Genus (capitalised) + species (lowercase)
    genus = parts[0].lower().rstrip(",")
    species = parts[1].lower().rstrip(",")
    # A token made only of commas collapses to "" and must not be indexed.
    if not genus or not species:
        return None
    # Fix known typos: first on the genus alone, then on the full binomial.
    if genus in BOTANICAL_TYPOS:
        genus = BOTANICAL_TYPOS[genus]
    full_name = f"{genus} {species}"
    if full_name in BOTANICAL_TYPOS:
        full_name = BOTANICAL_TYPOS[full_name]
        genus, species = full_name.split()
    # Validate: both tokens have to start with a letter.
    if not genus[0].isalpha() or not species[0].isalpha():
        return None
    # Skip if species looks like an authority (starts with uppercase in original)
    if parts[1][0].isupper():
        return None
    return f"{genus} {species}"
 def _parse_price(text):
    """Return the first number found in *text* as a float, or None.

    Accepts a decimal comma or a decimal point ("2,90", "2.90"). When both
    separators occur ("1.234,50") the last one is taken as the decimal mark
    and the others as thousands separators. Never raises on malformed text.
    """
    m = re.search(r"[.,]?\d[\d.,]*", text)
    if not m:
        return None
    token = m.group().rstrip(".,")
    if "," in token and "." in token:
        cut = max(token.rfind(","), token.rfind("."))
        token = re.sub(r"[.,]", "", token[:cut]) + "." + token[cut + 1:]
    else:
        token = token.replace(",", ".")
    try:
        return float(token)
    except ValueError:
        return None
 def _first_jsonld_product(html):
    """Return the first JSON-LD object of @type "Product" in *html*, or None.

    Script blocks that are not valid JSON are skipped. A block may hold a
    single object or an array of objects; entries that are not objects are
    ignored instead of raising.
    """
    for block in re.finditer(
        r'<script type="application/ld\+json">(.*?)</script>', html, re.S
    ):
        try:
            doc = json.loads(block.group(1))
        except json.JSONDecodeError:
            continue
        for node in (doc if isinstance(doc, list) else [doc]):
            if isinstance(node, dict) and node.get("@type") == "Product":
                return node
    return None
 def extract_product_data(html, url):
    """Extract product info from a Reinsaat product page.

    Args:
        html: Full HTML source of the page.
        url: Address the page was fetched from; copied into the result.

    Returns:
        A dict that always contains "url" and, depending on what the page
        offers, any of: "name", "botanical_raw", "botanical_norm",
        "article_number", "price_eur", "port_price", "pack_sizes",
        "planting_depth_cm", "row_spacing_cm", "plant_spacing_cm",
        "germination_temp_c", "pack_size", "pack_unit".
    """
    result = {}
    # H1 = variety name
    m = re.search(r'<h1[^>]*>([^<]+)</h1>', html)
    if m:
        name = unescape(m.group(1)).strip()
        # Clean up names like "RS-To-01.26 (Alda)" -> "Alda"
        paren = re.search(r"\(([^)]+)\)", name)
        if paren and re.match(r"RS-", name):
            name = paren.group(1).strip()
        result["name"] = name
    # Botanical name from fce_shop_kurztext
    m = re.search(
        r'fce_shop_kurztext[^>]*>\s*(?:<em[^>]*>)?\s*([^<]+?)\s*(?:</em>)?\s*</div>',
        html,
    )
    if m:
        result["botanical_raw"] = unescape(m.group(1)).replace("\xa0", " ").strip()
        result["botanical_norm"] = normalise_botanical(result["botanical_raw"])
    # Article number and cheapest offer from the first JSON-LD Product
    product = _first_jsonld_product(html)
    if product is not None:
        if "model" in product:
            result["article_number"] = str(product["model"])
        # Get smallest pack price (usually the Portion)
        offers = product.get("offers", {})
        if isinstance(offers, dict):
            offer_list = offers.get("offers", [])
        elif isinstance(offers, list):
            offer_list = offers
        else:
            offer_list = []
        if not isinstance(offer_list, list):
            offer_list = []
        # Only positive numeric prices count; malformed entries are skipped.
        prices = [
            o["price"]
            for o in offer_list
            if isinstance(o, dict)
            and isinstance(o.get("price"), (int, float))
            and o["price"] > 0
        ]
        if prices:
            result["price_eur"] = min(prices)
    # Price table - first table mentioning a euro price with >= 2 rows:
    # row 0 holds the pack sizes, row 1 the matching prices.
    for tbl in re.findall(r"<table[^>]*>(.*?)</table>", html, re.S):
        if "€" not in tbl:
            continue
        rows = re.findall(r"<tr[^>]*>(.*?)</tr>", tbl, re.S)
        if len(rows) < 2:
            continue
        size_cells = re.findall(r"<td[^>]*>(.*?)</td>", rows[0], re.S)
        size_texts = [re.sub(r"<[^>]+>", "", c).strip() for c in size_cells]
        price_cells = re.findall(r"<td[^>]*>(.*?)</td>", rows[1], re.S)
        price_texts = [re.sub(r"<[^>]+>", "", c).strip() for c in price_cells]
        # Find the "Port." entry and take the price in the same column
        for i, st in enumerate(size_texts):
            if "Port" in st:
                if i < len(price_texts):
                    port_price = _parse_price(price_texts[i])
                    if port_price is not None:
                        result["port_price"] = port_price
                break
        result["pack_sizes"] = size_texts
        break
    # Sowing depth: a single value or the midpoint of a "a - b" range
    m = re.search(r"(?:sowing|seed)\s*depth[:\s]*(?:approx\.?\s*)?(\d+[\.,]?\d*)\s*(?:-\s*(\d+[\.,]?\d*)\s*)?cm", html, re.I)
    if m:
        d1 = float(m.group(1).replace(",", "."))
        d2 = float(m.group(2).replace(",", ".")) if m.group(2) else d1
        result["planting_depth_cm"] = round((d1 + d2) / 2, 2)
    # Spacing "NN x NN cm": prefer an outdoor/field figure, then an explicit
    # "row spacing", then any such pair anywhere on the page.
    m = re.search(r"(?:outdoors?|field)[^.]*?(\d+)\s*(?:x|×)\s*(\d+)\s*cm", html, re.I)
    if not m:
        m = re.search(r"row\s*spacing\s*(\d+)\s*(?:x|×)\s*(\d+)\s*cm", html, re.I)
    if not m:
        m = re.search(r"(\d+)\s*(?:x|×)\s*(\d+)\s*cm", html, re.I)
    if m:
        result["row_spacing_cm"] = float(m.group(1))
        result["plant_spacing_cm"] = float(m.group(2))
    # Row spacing without plant spacing (e.g. "row spacing 30-45 cm").
    # A range is reduced to its midpoint, rounded down to a whole number.
    if "row_spacing_cm" not in result:
        m = re.search(r"row\s*spacing\s*(\d+)(?:\s*-\s*(\d+))?\s*cm", html, re.I)
        if m:
            r1 = int(m.group(1))
            r2 = int(m.group(2)) if m.group(2) else r1
            result["row_spacing_cm"] = float((r1 + r2) // 2)
    # Germination temperature (midpoint of a range, rounded down)
    m = re.search(r"germination\s*temp[^:]*:\s*(\d+)\s*(?:-\s*(\d+))?\s*°?\s*C", html, re.I)
    if m:
        t1 = int(m.group(1))
        t2 = int(m.group(2)) if m.group(2) else t1
        result["germination_temp_c"] = float((t1 + t2) // 2)
    # Pack unit from portion info - "20 seeds" or "25 g" etc
    portion_m = re.search(r"[Pp]ortion\s*(?:contents?)?[:\s]*(\d+[\.,]?\d*)\s*(seeds?|Korn|g|kg)", html)
    if not portion_m:
        # Try "Port. (20 seeds)" format
        portion_m = re.search(r"Port[.\w]*\s*\(?\s*(\d+[\.,]?\d*)\s*(seeds?|Korn|g|kg)", html)
    if portion_m:
        result["pack_size"] = float(portion_m.group(1).replace(",", "."))
        unit = portion_m.group(2).lower()
        # Seed counts are stored under the unit name "Korn"
        if unit in ("seed", "seeds", "korn"):
            result["pack_unit"] = "Korn"
        else:
            result["pack_unit"] = unit
    result["url"] = url
    return result
 def get_all_species():
    """Page through /species and index every entry by its normalised name.

    Requests pages of 100 until a page comes back with fewer than 100 rows.
    Species whose scientific name cannot be normalised are left out.

    Returns:
        dict mapping the normalised "genus species" string to a dict with
        the species' "id", "slug" and "name" (the original scientific name).
    """
    lookup = {}
    page = 0
    more_pages = True
    while more_pages:
        page += 1
        rows = api_get("/species", {"per_page": 100, "page": page}).get("data", [])
        for entry in rows:
            key = normalise_botanical(entry["name_scientific"])
            if not key:
                continue
            lookup[key] = {
                "id": entry["id"],
                "slug": entry["slug"],
                "name": entry["name_scientific"],
            }
        print(f"    page {page}: {len(rows)} species (total so far: {len(lookup)})")
        # A short page marks the end of the listing.
        more_pages = len(rows) >= 100
    return lookup
 def get_all_cultivars():
    """Page through /cultivars and index every entry by species and name.

    Requests pages of 100 until a page comes back with fewer than 100 rows.

    Returns:
        dict mapping (species_id, lowercased and stripped cultivar name) to
        the cultivar record as returned by the API.
    """
    lookup = {}
    page = 0
    more_pages = True
    while more_pages:
        page += 1
        rows = api_get("/cultivars", {"per_page": 100, "page": page}).get("data", [])
        for record in rows:
            lookup[(record["species_id"], record["name"].lower().strip())] = record
        print(f"    page {page}: {len(rows)} cultivars (total so far: {len(lookup)})")
        # A short page marks the end of the listing.
        more_pages = len(rows) >= 100
    return lookup
 def get_reinsaat_supplier():
    """Return the supplier record whose slug is "reinsaat".

    The /suppliers response is iterated directly and the first matching
    entry is returned.

    Raises:
        RuntimeError: If no supplier with that slug is listed.
    """
    match = next(
        (entry for entry in api_get("/suppliers") if entry["slug"] == "reinsaat"),
        None,
    )
    if match is None:
        raise RuntimeError("Reinsaat supplier not found in API")
    return match
 def get_cultivar_suppliers(cultivar_id):
    """Fetch the supplier links the API holds for the given cultivar id."""
    endpoint = f"/cultivars/{cultivar_id}/suppliers"
    return api_get(endpoint)
 def get_product_urls_from_category(cat_slug):
    """Collect every shop link found beneath a category page.

    Fetches /shop/EN/<cat_slug>/ and gathers each distinct href that lies
    below that category path, at any depth. Whether a link is a product or
    a subcategory is NOT decided here - callers have to inspect the page.

    Args:
        cat_slug: Category slug as used in the shop URL.

    Returns:
        Sorted list of absolute URLs; an empty list when the category page
        cannot be fetched (a warning is printed in that case).
    """
    cat_url = f"{REINSAAT_BASE}/shop/EN/{cat_slug}/"
    try:
        html = fetch_page(cat_url)
    except Exception as e:
        print(f"  WARN: Failed to fetch category {cat_slug}: {e}")
        return []
    time.sleep(DELAY)
    links = re.findall(rf'href="(/shop/EN/{re.escape(cat_slug)}/[^"]+/)"', html)
    return [REINSAAT_BASE + link for link in sorted(set(links))]
 def is_product_page(html):
    """Tell whether *html* looks like a product page.

    A page qualifies when it contains the botanical-name marker
    (fce_shop_kurztext) or a JSON-LD "@type": "Product" declaration.
    """
    markers = (r'fce_shop_kurztext', r'"@type":\s*"Product"')
    return any(re.search(marker, html) for marker in markers)
 def _deeper_links(html, cat, page_url):
    """Return absolute URLs linked from *html* that sit below *page_url*'s depth.

    Only hrefs under /shop/EN/<cat>/ are considered. Depth is measured on
    the URL *path* for both sides; comparing a path against a full URL would
    count the two slashes of "https://" and drop every direct child.
    """
    page_depth = urllib.parse.urlparse(page_url).path.rstrip("/").count("/")
    prefix = f"/shop/EN/{cat}/"
    hrefs = set(re.findall(r'href="(/shop/EN/[^"]+/)"', html))
    return [
        REINSAAT_BASE + href
        for href in sorted(hrefs)
        if href.startswith(prefix) and href.rstrip("/").count("/") > page_depth
    ]
 def _print_report(stats, new_cultivars, new_links, unmatched_species):
    """Print the end-of-run counters and the lists of new/unmatched items."""
    print("\n" + "=" * 60)
    print("RESULTS")
    print("=" * 60)
    print(f"Products found:        {stats['products_found']}")
    print(f"Botanical extracted:   {stats['botanical_extracted']}")
    print(f"Species matched:       {stats['species_matched']}")
    print(f"Species NOT matched:   {stats['species_not_matched']}")
    print(f"Cultivars existed:     {stats['cultivar_existed']}")
    print(f"Cultivars created:     {stats['cultivar_created']}")
    print(f"Links existed:         {stats['link_existed']}")
    print(f"Links created:         {stats['link_created']}")
    print(f"Errors:                {stats['errors']}")
    if new_cultivars:
        print(f"\n--- New cultivars ({len(new_cultivars)}) ---")
        for cv in new_cultivars:
            print(f"  + {cv['name']} ({cv.get('species', '?')})")
    if new_links:
        print(f"\n--- New supplier links ({len(new_links)}) ---")
        for lk in new_links:
            print(f"  + {lk['cultivar']} -> {lk.get('article', '?')}")
    if unmatched_species:
        print(f"\n--- Unmatched species ({len(unmatched_species)}) ---")
        # Most frequent unmatched names first
        for name, count in sorted(unmatched_species.items(), key=lambda x: -x[1]):
            print(f"  ? {name} (x{count})")
    print("\nDone.")
 def main():
    """Run the scraper: load API state, crawl all categories, print a report.

    Steps: load species and cultivars from the API, look up the Reinsaat
    supplier, then fetch every URL found under each entry of CATEGORIES.
    Product pages are handed to process_product(); any other page is treated
    as a possible subcategory and the deeper links found on it are fetched
    and processed the same way (one level only).
    """
    print("=" * 60)
    print("Reinsaat v3 Scraper")
    print("=" * 60)
    # Step 1: Load all species
    print("\n[1/4] Loading species from API...")
    species_map = get_all_species()
    print(f"  Loaded {len(species_map)} species")
    # Step 2: Load all cultivars
    print("\n[2/4] Loading cultivars from API...")
    cultivar_map = get_all_cultivars()
    print(f"  Loaded {len(cultivar_map)} cultivars")
    # Step 3: Get Reinsaat supplier
    print("\n[3/4] Getting Reinsaat supplier...")
    supplier = get_reinsaat_supplier()
    supplier_id = supplier["id"]
    print(f"  Reinsaat ID: {supplier_id}")
    # Step 4: Scrape categories
    print(f"\n[4/4] Scraping {len(CATEGORIES)} categories...")
    stats = {
        "products_found": 0,
        "botanical_extracted": 0,
        "species_matched": 0,
        "species_not_matched": 0,
        "cultivar_existed": 0,
        "cultivar_created": 0,
        "link_existed": 0,
        "link_created": 0,
        "errors": 0,
    }
    unmatched_species = {}  # botanical_norm -> count
    new_cultivars = []
    new_links = []
    for cat_i, cat in enumerate(CATEGORIES):
        print(f"\n--- [{cat_i+1}/{len(CATEGORIES)}] {cat} ---")
        urls = get_product_urls_from_category(cat)
        print(f"  Found {len(urls)} URLs")
        known_urls = set(urls)  # O(1) "already queued" checks below
        for url in urls:
            time.sleep(DELAY)
            try:
                html = fetch_page(url)
            except Exception as e:
                print(f"  ERROR fetching {url}: {e}")
                stats["errors"] += 1
                continue
            if is_product_page(html):
                process_product(
                    html, url, species_map, cultivar_map,
                    supplier_id, stats, unmatched_species,
                    new_cultivars, new_links,
                )
                continue
            # Not a product: might be a subcategory - follow its deeper links
            for sub_url in _deeper_links(html, cat, url):
                if sub_url in known_urls:
                    continue  # already in list
                time.sleep(DELAY)
                try:
                    sub_html = fetch_page(sub_url)
                except Exception as e:
                    print(f"  ERROR fetching {sub_url}: {e}")
                    stats["errors"] += 1
                    continue
                if not is_product_page(sub_html):
                    continue
                process_product(
                    sub_html, sub_url, species_map, cultivar_map,
                    supplier_id, stats, unmatched_species,
                    new_cultivars, new_links,
                )
    _print_report(stats, new_cultivars, new_links, unmatched_species)
 def _find_existing_cultivar(cultivar_name, species_id):
    """Search the API for a cultivar matching *cultivar_name*; return it or None.

    Strategies, tried in order:
      1. exact case-insensitive name among the results of a full-name search;
      2. same species, and names equal or one containing the other once
         punctuation is removed (same result set as 1);
      3. same species and punctuation-insensitive equality among the results
         of a search for the last word longer than two characters.

    NOTE(review): strategy 1 does not compare species_id, so a cultivar of
    the same name under another species is accepted - confirm this is wanted.
    """
    cn_lower = cultivar_name.lower().strip()
    cn_clean = re.sub(r'[^\w\s]', '', cn_lower)
    candidates = api_get("/cultivars", {"search": cultivar_name, "per_page": 50}).get("data", [])
    for cv in candidates:
        if cv["name"].lower().strip() == cn_lower:
            return cv
    for cv in candidates:
        if cv["species_id"] != species_id:
            continue
        cv_clean = re.sub(r'[^\w\s]', '', cv["name"].lower())
        if cv_clean == cn_clean or cv_clean in cn_clean or cn_clean in cv_clean:
            return cv
    words = [w for w in cultivar_name.split() if len(w) > 2]
    if words:
        by_word = api_get("/cultivars", {"search": words[-1], "per_page": 50}).get("data", [])
        for cv in by_word:
            if cv["species_id"] != species_id:
                continue
            if re.sub(r'[^\w\s]', '', cv["name"].lower()) == cn_clean:
                return cv
    return None
 def process_product(html, url, species_map, cultivar_map, supplier_id,
                    stats, unmatched_species, new_cultivars, new_links):
    """Process a single product page.

    Extracts the product data, matches its botanical name to a species,
    finds or creates the cultivar through the API and makes sure the
    cultivar has a link to the Reinsaat supplier.

    Args:
        html: HTML source of the product page.
        url: Address of the product page.
        species_map: Normalised botanical name -> species dict ("id", "name").
        cultivar_map: (species_id, lowercase name) -> cultivar record;
            updated in place when a cultivar is created or found.
        supplier_id: Id of the Reinsaat supplier record.
        stats: Counter dict, incremented in place.
        unmatched_species: Botanical name -> count of pages that could not be
            matched; updated in place.
        new_cultivars: List receiving a summary dict per created cultivar.
        new_links: List receiving a summary dict per created supplier link.

    Returns:
        None. Failures are printed and counted in stats["errors"].
    """
    stats["products_found"] += 1
    prod = extract_product_data(html, url)
    if not prod.get("name"):
        return
    bot_norm = prod.get("botanical_norm")
    if not bot_norm:
        # No botanical name found on page
        stats["species_not_matched"] += 1
        unmatched_species["(no botanical name)"] = unmatched_species.get("(no botanical name)", 0) + 1
        return
    stats["botanical_extracted"] += 1
    # Match species
    species = species_map.get(bot_norm)
    if not species:
        stats["species_not_matched"] += 1
        unmatched_species[bot_norm] = unmatched_species.get(bot_norm, 0) + 1
        return
    stats["species_matched"] += 1
    species_id = species["id"]
    cultivar_name = prod["name"]
    # Check if cultivar exists
    cv_key = (species_id, cultivar_name.lower().strip())
    existing_cv = cultivar_map.get(cv_key)
    if existing_cv:
        stats["cultivar_existed"] += 1
        cultivar_id = existing_cv["id"]
    else:
        # Create cultivar
        create_data = {
            "species_id": species_id,
            "name": cultivar_name,
            "is_organic": True,
            "source_urls": [url],
        }
        # Add growing data if we extracted any
        for field in ("planting_depth_cm", "row_spacing_cm",
                      "plant_spacing_cm", "germination_temp_c"):
            if field in prod:
                create_data[field] = prod[field]
        try:
            new_cv = api_post("/cultivars", create_data)
            cultivar_id = new_cv["id"]
            stats["cultivar_created"] += 1
            new_cultivars.append({
                "name": cultivar_name,
                "species": species["name"],
                "id": cultivar_id,
            })
            # Add to local cache
            cultivar_map[cv_key] = new_cv
            print(f"  + Created cultivar: {cultivar_name} ({species['name']})")
        except urllib.error.HTTPError as e:
            body = e.read().decode() if hasattr(e, 'read') else str(e)
            if not (e.code == 500 and "Database error" in body):
                print(f"  ERROR creating cultivar '{cultivar_name}': {e.code} {body}")
                stats["errors"] += 1
                return
            # Likely slug collision - search for existing cultivar
            try:
                found = _find_existing_cultivar(cultivar_name, species_id)
                if not found:
                    print(f"  WARN: could not create or find cultivar '{cultivar_name}' (DB error + no search match)")
                    stats["errors"] += 1
                    return
                cultivar_id = found["id"]
                cultivar_map[cv_key] = found
                stats["cultivar_existed"] += 1
            except Exception as e2:
                print(f"  ERROR searching for '{cultivar_name}' after collision: {e2}")
                stats["errors"] += 1
                return
    # Check if Reinsaat supplier link exists
    try:
        existing_links = get_cultivar_suppliers(cultivar_id)
    except Exception as e:
        # Still best-effort (treated as "no links yet"), but no longer
        # silent: a failed lookup can lead to a duplicate link below.
        print(f"  WARN: could not load supplier links for '{cultivar_name}': {e} (assuming none)")
        existing_links = []
    has_reinsaat = any(l["supplier_id"] == supplier_id for l in existing_links)
    if has_reinsaat:
        stats["link_existed"] += 1
        return
    # Create supplier link
    link_data = {
        "supplier_id": supplier_id,
        "product_url": url,
    }
    if "article_number" in prod:
        link_data["article_number"] = prod["article_number"]
    # Prefer the price-table "Port." price over the JSON-LD minimum
    if "port_price" in prod:
        link_data["price_eur"] = prod["port_price"]
    elif "price_eur" in prod:
        link_data["price_eur"] = prod["price_eur"]
    if "pack_size" in prod:
        link_data["pack_size"] = prod["pack_size"]
    if "pack_unit" in prod:
        link_data["pack_unit"] = prod["pack_unit"]
    try:
        api_post(f"/cultivars/{cultivar_id}/suppliers", link_data)
        stats["link_created"] += 1
        new_links.append({
            "cultivar": cultivar_name,
            "article": prod.get("article_number", "?"),
            "url": url,
        })
    except urllib.error.HTTPError as e:
        body = e.read().decode() if hasattr(e, 'read') else str(e)
        print(f"  ERROR linking '{cultivar_name}': {e.code} {body}")
        stats["errors"] += 1
 # Script entry point: the scrape starts only when this file is executed
 # directly, not when it is imported as a module.
 if __name__ == "__main__":
    main()