diff --git a/tools/enrichment/enrich_wikidata.py b/tools/enrichment/enrich_wikidata.py
new file mode 100644
index 0000000..46cad90
--- /dev/null
+++ b/tools/enrichment/enrich_wikidata.py
@@ -0,0 +1,156 @@
+#!/usr/bin/env python3
+"""Enrich HerbAPI species with Wikidata QID, GBIF ID, and EPPO code."""
+
+import json
+import time
+import urllib.parse
+import urllib.request
+
+import os  # kept next to the config it serves; only used for the overrides below
+
+# The endpoint defaults to the internal deployment but can be overridden per
+# environment without editing the script.
+HERBAPI_BASE = os.environ.get(
+    "HERBAPI_BASE", "http://herbapi01.corp.sub-net.at:8080/api/v1"
+)
+# The bearer token is a credential and must not be committed; it is read from
+# the HERBAPI_TOKEN environment variable.  When unset the token is empty and
+# requests are presumably rejected by the API (expected: HTTP 401).
+# NOTE(review): the token that was previously committed here should be rotated.
+HERBAPI_TOKEN = os.environ.get("HERBAPI_TOKEN", "")
+WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"
+
+# Headers sent with every Wikidata SPARQL request.
+HEADERS_WD = {
+    "User-Agent": "HerbAPI-Enrichment/1.0 (florian.berthold@sub-net.at)",
+    "Accept": "application/json",
+}
+
+
+def herbapi_request(path, method="GET", data=None, timeout=30):
+    """Send an authenticated JSON request to HerbAPI and decode the reply.
+
+    Args:
+        path: Path (and optional query string) appended to HERBAPI_BASE.
+        method: HTTP verb, e.g. "GET" or "PUT".
+        data: JSON-serialisable payload; ``None`` sends no request body.
+        timeout: Seconds to wait on the connection before giving up.
+
+    Returns:
+        The decoded JSON response, or ``None`` when the response body is empty.
+
+    Raises:
+        urllib.error.URLError: On connection or HTTP-status failures.
+        json.JSONDecodeError: When a non-empty body is not valid JSON.
+    """
+    url = f"{HERBAPI_BASE}{path}"
+    # `is not None` (not truthiness) so falsy payloads such as {} are still sent.
+    body = json.dumps(data).encode() if data is not None else None
+    req = urllib.request.Request(url, data=body, method=method, headers={
+        "Authorization": f"Bearer {HERBAPI_TOKEN}",
+        "Content-Type": "application/json",
+    })
+    # A timeout keeps one stalled connection from hanging the whole run.
+    with urllib.request.urlopen(req, timeout=timeout) as resp:
+        raw = resp.read()
+    # Some endpoints may answer a write with an empty body; treat that as
+    # "no content" instead of failing in json.loads.
+    return json.loads(raw) if raw.strip() else None
+
+
+def query_wikidata_batch(names):
+    """Query Wikidata for a batch of scientific names.
+
+    Args:
+        names: Iterable of taxon names, matched exactly against ``wdt:P225``.
+
+    Returns:
+        Dict mapping each matched name to
+        ``{"qid": ..., "gbif_id": ..., "eppo_code": ...}``.  The two optional
+        identifiers are ``None`` when the item lacks them; names without a
+        match are absent.  If several result rows share a name, the last row
+        in the response wins.
+    """
+    names = list(names)
+    if not names:
+        # Nothing to look up: skip the network round trip entirely.
+        return {}
+
+    def _literal(text):
+        # Escape backslash, quote and line breaks so a name can never end the
+        # SPARQL string literal early or smuggle in extra query syntax.
+        escaped = (text.replace("\\", "\\\\").replace('"', '\\"')
+                   .replace("\n", "\\n").replace("\r", "\\r"))
+        return f'"{escaped}"'
+
+    values = " ".join(_literal(n) for n in names)
+    sparql = f"""SELECT ?name ?item ?gbifId ?eppoCode WHERE {{
+ VALUES ?name {{ {values} }}
+ ?item wdt:P225 ?name .
+ OPTIONAL {{ ?item wdt:P846 ?gbifId }}
+ OPTIONAL {{ ?item wdt:P3031 ?eppoCode }}
+}}"""
+    url = f"{WIKIDATA_SPARQL}?" + urllib.parse.urlencode(
+        {"query": sparql, "format": "json"}
+    )
+    req = urllib.request.Request(url, headers=HEADERS_WD)
+    with urllib.request.urlopen(req, timeout=60) as resp:
+        data = json.loads(resp.read())
+
+    results = {}
+    for binding in data.get("results", {}).get("bindings", []):
+        name = binding["name"]["value"]
+        # The item is returned as an entity URL; the QID is its last segment.
+        qid = binding["item"]["value"].rsplit("/", 1)[-1]
+        results[name] = {
+            "qid": qid,
+            # OPTIONAL variables are simply missing from the binding.
+            "gbif_id": binding.get("gbifId", {}).get("value"),
+            "eppo_code": binding.get("eppoCode", {}).get("value"),
+        }
+    return results
+
+
+def main():
+    """Fill in missing Wikidata QID, GBIF ID and EPPO code on HerbAPI species.
+
+    Fetches the species list, queries Wikidata in batches for every species
+    lacking at least one identifier, then GETs, merges and PUTs each species
+    for which Wikidata supplied something new.  Values already present in the
+    species list are never overwritten.  Progress and a summary are printed.
+    """
+    # 1. Fetch all species
+    # NOTE(review): only one page of 200 entries is requested; confirm whether
+    # the API paginates beyond that.
+    resp = herbapi_request("/species?per_page=200")
+    species_list = resp["data"]
+    print(f"Fetched {len(species_list)} species from HerbAPI\n")
+
+    # 2. Collect species needing enrichment.  `.get` treats an omitted key like
+    # a null value instead of aborting the whole run with KeyError.
+    id_fields = ("wikidata_qid", "gbif_id", "eppo_code")
+    to_enrich = [sp for sp in species_list
+                 if not all(sp.get(field) for field in id_fields)]
+
+    if not to_enrich:
+        print("All species already enriched.")
+        return
+
+    print(f"{len(to_enrich)} species need enrichment\n")
+
+    # 3. Batch query Wikidata
+    BATCH_SIZE = 20
+    wikidata_results = {}
+    names = [sp["name_scientific"] for sp in to_enrich]
+
+    for i in range(0, len(names), BATCH_SIZE):
+        batch = names[i:i + BATCH_SIZE]
+        print(f"Querying Wikidata batch {i // BATCH_SIZE + 1}: {len(batch)} species...")
+        try:
+            results = query_wikidata_batch(batch)
+            wikidata_results.update(results)
+            print(f" Got {len(results)} matches")
+        except Exception as e:
+            # Best effort: a failed batch is reported and its species end up
+            # counted as "not found" below.
+            print(f" ERROR: {e}")
+        # Pause between batches, but not after the last one.
+        if i + BATCH_SIZE < len(names):
+            time.sleep(2)
+
+    print(f"\nWikidata returned data for {len(wikidata_results)} / {len(names)} species\n")
+
+    # 4. Update HerbAPI - GET full object by slug, merge, PUT by UUID
+    updated = 0
+    skipped = 0
+    not_found = 0
+    errors = 0
+    # Short labels used in the UPDATED log line, keyed by API field.
+    labels = {"wikidata_qid": "qid", "gbif_id": "gbif", "eppo_code": "eppo"}
+
+    for sp in to_enrich:
+        name = sp["name_scientific"]
+        wd = wikidata_results.get(name)
+        if not wd:
+            print(f" SKIP (no Wikidata match): {name}")
+            not_found += 1
+            continue
+
+        # Collect only values that are empty locally and present on Wikidata.
+        new_values = {}
+        if not sp.get("wikidata_qid") and wd["qid"]:
+            new_values["wikidata_qid"] = wd["qid"]
+        if not sp.get("gbif_id") and wd["gbif_id"]:
+            new_values["gbif_id"] = str(wd["gbif_id"])  # API expects string
+        if not sp.get("eppo_code") and wd["eppo_code"]:
+            new_values["eppo_code"] = wd["eppo_code"]
+
+        if not new_values:
+            print(f" SKIP (nothing new): {name}")
+            skipped += 1
+            continue
+
+        try:
+            # GET full species by slug for the complete object
+            full_sp = herbapi_request(f"/species/{sp['slug']}")
+
+            # Remove read-only fields
+            species_id = full_sp.pop("id")
+            for read_only in ("slug", "created_at", "updated_at"):
+                full_sp.pop(read_only, None)
+
+            # Merge new data (only null fields)
+            full_sp.update(new_values)
+
+            # PUT by UUID
+            herbapi_request(f"/species/{species_id}", method="PUT", data=full_sp)
+
+            fields = [f"{labels[key]}={value}" for key, value in new_values.items()]
+            print(f" UPDATED: {name} -> {', '.join(fields)}")
+            updated += 1
+        except Exception as e:
+            # One failing species must not stop the remaining updates.
+            print(f" ERROR updating {name}: {e}")
+            errors += 1
+
+    print(f"\n{'=' * 60}")
+    print("RESULTS:")
+    print(f" Updated: {updated}")
+    print(f" Skipped (no new data): {skipped}")
+    print(f" Not found on Wikidata: {not_found}")
+    print(f" Errors: {errors}")
+    print(f" Total species: {len(species_list)}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/enrichment/expand_species.py b/tools/enrichment/expand_species.py
new file mode 100644
index 0000000..4351d0a
--- /dev/null
+++ b/tools/enrichment/expand_species.py
@@ -0,0 +1,305 @@
+#!/usr/bin/env python3
+"""Expand HerbAPI species database with common permaculture/garden species."""
+
+import json
+import time
+import urllib.request
+import urllib.parse
+import urllib.error
+import ssl
+
+import os  # kept next to the config it serves; only used for the overrides below
+
+# The endpoint defaults to the internal deployment but can be overridden.
+BASE_URL = os.environ.get(
+    "HERBAPI_BASE", "http://herbapi01.corp.sub-net.at:8080/api/v1"
+)
+# The bearer token is a credential and must not be committed; it is read from
+# the HERBAPI_TOKEN environment variable.  When unset the header carries an
+# empty token and requests are presumably rejected by the API.
+# NOTE(review): the token that was previously committed here should be rotated.
+AUTH = f"Bearer {os.environ.get('HERBAPI_TOKEN', '')}"
+DELAY = 0.15  # seconds slept after each POST to HerbAPI
+
+# SSL context for GBIF (https)
+ssl_ctx = ssl.create_default_context()
+
+
+def api_get(path, timeout=30):
+    """GET ``BASE_URL + path`` with the auth header and return the decoded JSON.
+
+    Args:
+        path: Path (and optional query string) appended to BASE_URL.
+        timeout: Seconds to wait before giving up; without it a stalled
+            connection would block the script indefinitely.
+
+    Raises:
+        urllib.error.URLError: On connection or HTTP-status failures.
+    """
+    req = urllib.request.Request(f"{BASE_URL}{path}", headers={"Authorization": AUTH})
+    with urllib.request.urlopen(req, timeout=timeout) as resp:
+        return json.loads(resp.read())
+
+
+def api_post(path, data, timeout=30):
+    """POST *data* as JSON to ``BASE_URL + path``.
+
+    Failures are reported on stdout rather than raised, so a caller can keep
+    processing the remaining items.
+
+    Args:
+        path: Path appended to BASE_URL.
+        data: JSON-serialisable request payload.
+        timeout: Seconds to wait before giving up.
+
+    Returns:
+        ``(decoded_json, http_status)`` on success, ``(None, http_status)``
+        when the server answers with an HTTP error, and ``(None, None)`` when
+        the request never produced a response (connection error, timeout).
+    """
+    body = json.dumps(data).encode()
+    req = urllib.request.Request(
+        f"{BASE_URL}{path}",
+        data=body,
+        headers={"Authorization": AUTH, "Content-Type": "application/json"},
+        method="POST",
+    )
+    try:
+        with urllib.request.urlopen(req, timeout=timeout) as resp:
+            return json.loads(resp.read()), resp.status
+    except urllib.error.HTTPError as e:
+        # errors="replace": an undecodable error page must not raise here.
+        err_body = e.read().decode(errors="replace")
+        print(f" ERROR {e.code}: {err_body}")
+        return None, e.code
+    except OSError as e:
+        # URLError and socket timeouts are OSError subclasses; handle them the
+        # same best-effort way as HTTP errors instead of aborting the run.
+        print(f" ERROR: {e}")
+        return None, None
+
+
+def gbif_get_german_name(scientific_name):
+    """Return the first German vernacular name GBIF lists for a species.
+
+    Resolves the name through GBIF's species match endpoint, then scans the
+    vernacular names of the matched usage for an entry whose language is
+    ``"deu"``.  Returns ``None`` when there is no match, no German name, or
+    any lookup error (which is printed, not raised).
+    """
+    def _get_json(url):
+        # Both GBIF calls share the same request setup and 10 s timeout.
+        request = urllib.request.Request(url)
+        with urllib.request.urlopen(request, context=ssl_ctx, timeout=10) as resp:
+            return json.loads(resp.read())
+
+    try:
+        quoted = urllib.parse.quote(scientific_name)
+        match = _get_json(f"https://api.gbif.org/v1/species/match?name={quoted}")
+
+        usage_key = match.get("usageKey")
+        if not usage_key:
+            return None
+
+        names = _get_json(
+            f"https://api.gbif.org/v1/species/{usage_key}/vernacularNames?limit=100"
+        )
+        german = (
+            entry["vernacularName"]
+            for entry in names.get("results", [])
+            if entry.get("language") == "deu"
+        )
+        return next(german, None)
+    except Exception as e:
+        print(f" GBIF lookup failed for {scientific_name}: {e}")
+        return None
+
+
+# ── Families to ensure exist ─────────────────────────────────────────
+# Maps a family's scientific name to its English and German display names.
+# main() POSTs an entry to /families only when no existing family has the same
+# scientific name; the resulting ids are then used to resolve the family
+# column of SPECIES.
+FAMILIES_NEEDED = {
+    "Fabaceae": {"name_en": "Legumes", "name_de": "Hülsenfrüchtler"},
+    "Solanaceae": {"name_en": "Nightshade family", "name_de": "Nachtschattengewächse"},
+    "Cucurbitaceae": {"name_en": "Gourd family", "name_de": "Kürbisgewächse"},
+    "Asteraceae": {"name_en": "Daisy family", "name_de": "Korbblütler"},
+    "Chenopodiaceae": {"name_en": "Goosefoot family", "name_de": "Gänsefußgewächse"},
+    "Brassicaceae": {"name_en": "Cabbage family", "name_de": "Kreuzblütler"},
+    "Amaryllidaceae": {"name_en": "Amaryllis family", "name_de": "Amaryllisgewächse"},
+    "Apiaceae": {"name_en": "Carrot family", "name_de": "Doldenblütler"},
+    "Poaceae": {"name_en": "Grass family", "name_de": "Süßgräser"},
+    "Lamiaceae": {"name_en": "Mint family", "name_de": "Lippenblütler"},
+    "Caprifoliaceae": {"name_en": "Honeysuckle family", "name_de": "Geißblattgewächse"},
+    "Rosaceae": {"name_en": "Rose family", "name_de": "Rosengewächse"},
+    "Grossulariaceae": {"name_en": "Gooseberry family", "name_de": "Stachelbeergewächse"},
+    "Ericaceae": {"name_en": "Heath family", "name_de": "Heidekrautgewächse"},
+    "Moraceae": {"name_en": "Mulberry family", "name_de": "Maulbeergewächse"},
+    # New families not yet in the DB:
+    "Hypericaceae": {"name_en": "St John's wort family", "name_de": "Johanniskrautgewächse"},
+    "Tropaeolaceae": {"name_en": "Nasturtium family", "name_de": "Kapuzinerkressengewächse"},
+    "Elaeagnaceae": {"name_en": "Oleaster family", "name_de": "Ölweidengewächse"},
+}
+
+# ── Species to add ───────────────────────────────────────────────────
+# Format: (scientific_name, family, name_en, name_de, plant_layer, extra_fields)
+# main() skips an entry whose scientific_name already exists in HerbAPI and
+# fails it when `family` cannot be resolved to a family id.  Every key of
+# extra_fields is copied verbatim into the POST /species payload.
+SPECIES = [
+    # Vegetables
+    ("Phaseolus vulgaris", "Fabaceae", "common bean", "Gartenbohne", "herbaceous",
+     {"nitrogen_fixer": True, "food_uses": "Beans (pods, seeds)"}),
+    ("Phaseolus coccineus", "Fabaceae", "runner bean", "Feuerbohne", "herbaceous",
+     {"nitrogen_fixer": True, "food_uses": "Beans (pods, seeds), flowers", "attracts_pollinators": True}),
+    ("Pisum sativum", "Fabaceae", "pea", "Erbse", "herbaceous",
+     {"nitrogen_fixer": True, "food_uses": "Peas, shoots"}),
+    ("Capsicum annuum", "Solanaceae", "pepper", "Paprika", "herbaceous",
+     {"food_uses": "Fruit"}),
+    ("Cucumis sativus", "Cucurbitaceae", "cucumber", "Gurke", "ground_cover",
+     {"food_uses": "Fruit"}),
+    ("Cucurbita maxima", "Cucurbitaceae", "winter squash", "Riesenkürbis", "ground_cover",
+     {"food_uses": "Fruit, seeds, flowers"}),
+    ("Cucurbita moschata", "Cucurbitaceae", "butternut squash", "Moschuskürbis", "ground_cover",
+     {"food_uses": "Fruit, seeds"}),
+    ("Lactuca sativa", "Asteraceae", "lettuce", "Salat", "herbaceous",
+     {"food_uses": "Leaves"}),
+    ("Spinacia oleracea", "Chenopodiaceae", "spinach", "Spinat", "herbaceous",
+     {"food_uses": "Leaves"}),
+    ("Brassica oleracea", "Brassicaceae", "cabbage / kale", "Kohl", "herbaceous",
+     {"food_uses": "Leaves, flower buds, stems"}),
+    ("Brassica rapa", "Brassicaceae", "turnip", "Rübe", "herbaceous",
+     {"food_uses": "Root, leaves"}),
+    ("Raphanus sativus", "Brassicaceae", "radish", "Rettich", "herbaceous",
+     {"food_uses": "Root, leaves, seed pods"}),
+    ("Allium cepa", "Amaryllidaceae", "onion", "Zwiebel", "herbaceous",
+     {"food_uses": "Bulb, leaves"}),
+    ("Allium sativum", "Amaryllidaceae", "garlic", "Knoblauch", "herbaceous",
+     {"food_uses": "Bulb, scapes", "medicinal_uses": "Antimicrobial, cardiovascular"}),
+    ("Allium schoenoprasum", "Amaryllidaceae", "chives", "Schnittlauch", "herbaceous",
+     {"food_uses": "Leaves, flowers", "attracts_pollinators": True}),
+    ("Petroselinum crispum", "Apiaceae", "parsley", "Petersilie", "herbaceous",
+     {"food_uses": "Leaves, root"}),
+    ("Apium graveolens", "Apiaceae", "celery", "Sellerie", "herbaceous",
+     {"food_uses": "Stalks, root, leaves"}),
+    ("Foeniculum vulgare", "Apiaceae", "fennel", "Fenchel", "herbaceous",
+     {"food_uses": "Bulb, fronds, seeds", "attracts_beneficial_insects": True}),
+    ("Pastinaca sativa", "Apiaceae", "parsnip", "Pastinake", "herbaceous",
+     {"food_uses": "Root"}),
+    ("Zea mays", "Poaceae", "corn", "Mais", "herbaceous",
+     {"food_uses": "Kernels, cobs"}),
+    ("Solanum melongena", "Solanaceae", "eggplant", "Melanzani", "herbaceous",
+     {"food_uses": "Fruit"}),
+
+    # Herbs
+    ("Ocimum basilicum", "Lamiaceae", "basil", "Basilikum", "herbaceous",
+     {"food_uses": "Leaves", "attracts_pollinators": True}),
+    ("Origanum vulgare", "Lamiaceae", "oregano", "Oregano", "herbaceous",
+     {"food_uses": "Leaves", "attracts_pollinators": True, "attracts_beneficial_insects": True}),
+    # NOTE(review): the hybrid name uses an ASCII "x"; other data sources may
+    # spell it with the multiplication sign, so confirm that exact-name
+    # lookups against them still match.
+    ("Mentha x piperita", "Lamiaceae", "peppermint", "Pfefferminze", "herbaceous",
+     {"food_uses": "Leaves (tea, culinary)", "medicinal_uses": "Digestive, headache relief", "invasiveness": "spreading"}),
+    ("Rosmarinus officinalis", "Lamiaceae", "rosemary", "Rosmarin", "herbaceous",
+     {"food_uses": "Leaves", "attracts_pollinators": True}),
+    ("Anethum graveolens", "Apiaceae", "dill", "Dill", "herbaceous",
+     {"food_uses": "Leaves, seeds", "attracts_beneficial_insects": True}),
+    ("Coriandrum sativum", "Apiaceae", "coriander", "Koriander", "herbaceous",
+     {"food_uses": "Leaves, seeds", "attracts_beneficial_insects": True}),
+    ("Artemisia absinthium", "Asteraceae", "wormwood", "Wermut", "herbaceous",
+     {"medicinal_uses": "Digestive, anti-parasitic", "other_uses": "Companion plant pest deterrent", "allelopathic": True}),
+    ("Achillea millefolium", "Asteraceae", "yarrow", "Schafgarbe", "herbaceous",
+     {"food_uses": "Young leaves (salad)", "medicinal_uses": "Wound healing, anti-inflammatory",
+      "dynamic_accumulator": True, "dynamic_accumulator_nutrients": "K, P, Cu",
+      "attracts_beneficial_insects": True, "attracts_pollinators": True}),
+    ("Hypericum perforatum", "Hypericaceae", "St John's wort", "Johanniskraut", "herbaceous",
+     {"medicinal_uses": "Antidepressant, wound healing", "attracts_pollinators": True}),
+    ("Echinacea purpurea", "Asteraceae", "echinacea", "Sonnenhut", "herbaceous",
+     {"medicinal_uses": "Immune stimulant", "attracts_pollinators": True, "wildlife_value": "Seeds for birds"}),
+    ("Valeriana officinalis", "Caprifoliaceae", "valerian", "Baldrian", "herbaceous",
+     {"medicinal_uses": "Sedative, sleep aid", "attracts_pollinators": True,
+      "other_uses": "Earthworm attractant (biodynamic)"}),
+
+    # Flowers & cover crops
+    ("Tagetes patula", "Asteraceae", "French marigold", "Studentenblume", "herbaceous",
+     {"other_uses": "Nematode suppression, companion plant", "attracts_pollinators": True}),
+    ("Helianthus annuus", "Asteraceae", "sunflower", "Sonnenblume", "herbaceous",
+     {"food_uses": "Seeds, oil", "attracts_pollinators": True, "wildlife_value": "Seeds for birds"}),
+    ("Tropaeolum majus", "Tropaeolaceae", "nasturtium", "Kapuzinerkresse", "ground_cover",
+     {"food_uses": "Leaves, flowers, seeds (capers)", "other_uses": "Trap crop for aphids"}),
+    ("Centaurea cyanus", "Asteraceae", "cornflower", "Kornblume", "herbaceous",
+     {"food_uses": "Flowers (edible garnish)", "attracts_pollinators": True, "attracts_beneficial_insects": True}),
+    ("Sinapis alba", "Brassicaceae", "white mustard", "Weißer Senf", "herbaceous",
+     {"food_uses": "Seeds, young leaves", "other_uses": "Green manure, biofumigant"}),
+    ("Trifolium repens", "Fabaceae", "white clover", "Weißklee", "ground_cover",
+     {"nitrogen_fixer": True, "food_uses": "Flowers (tea), young leaves",
+      "ground_cover_quality": "excellent", "attracts_pollinators": True}),
+    ("Medicago sativa", "Fabaceae", "alfalfa", "Luzerne", "herbaceous",
+     {"nitrogen_fixer": True, "food_uses": "Sprouts",
+      "dynamic_accumulator": True, "dynamic_accumulator_nutrients": "N, K, Ca, Mg, Fe",
+      "other_uses": "Green manure, deep-rooting soil improver"}),
+
+    # Fruit / Trees
+    ("Prunus avium", "Rosaceae", "sweet cherry", "Süßkirsche", "canopy",
+     {"food_uses": "Fruit", "attracts_pollinators": True, "wildlife_value": "Fruit for birds"}),
+    ("Prunus cerasus", "Rosaceae", "sour cherry", "Sauerkirsche", "understory",
+     {"food_uses": "Fruit (cooking, preserves)", "attracts_pollinators": True}),
+    ("Pyrus communis", "Rosaceae", "pear", "Birne", "canopy",
+     {"food_uses": "Fruit", "attracts_pollinators": True}),
+    ("Ribes uva-crispa", "Grossulariaceae", "gooseberry", "Stachelbeere", "shrub",
+     {"food_uses": "Berries"}),
+    ("Rubus fruticosus", "Rosaceae", "blackberry", "Brombeere", "shrub",
+     {"food_uses": "Berries, leaves (tea)", "attracts_pollinators": True,
+      "wildlife_value": "Berries for birds, nesting habitat", "invasiveness": "spreading"}),
+    ("Vaccinium myrtillus", "Ericaceae", "bilberry", "Heidelbeere", "shrub",
+     {"food_uses": "Berries", "medicinal_uses": "Antioxidant, eye health"}),
+    ("Hippophae rhamnoides", "Elaeagnaceae", "sea buckthorn", "Sanddorn", "shrub",
+     {"nitrogen_fixer": True, "food_uses": "Berries (juice, oil)",
+      "medicinal_uses": "High vitamin C, skin care",
+      "other_uses": "Erosion control, windbreak"}),
+    ("Morus alba", "Moraceae", "white mulberry", "Weiße Maulbeere", "canopy",
+     {"food_uses": "Fruit, young leaves", "wildlife_value": "Fruit for birds"}),
+]
+
+
+def main():
+    """Create the FAMILIES_NEEDED and SPECIES entries that HerbAPI lacks.
+
+    Existing families and species are detected by scientific name and left
+    untouched.  Each created record is followed by a short pause, and a
+    summary of created / skipped / failed counts is printed at the end.
+    """
+    # 1. Load existing families
+    print("=== Loading existing families ===")
+    fam_resp = api_get("/families?per_page=100")
+    # name_scientific -> id
+    family_map = {fam["name_scientific"]: fam["id"] for fam in fam_resp["data"]}
+    print(f" Found {len(family_map)} existing families")
+
+    # 2. Create missing families
+    print("\n=== Creating missing families ===")
+    families_created = 0
+    for fam_name, fam_info in FAMILIES_NEEDED.items():
+        if fam_name in family_map:
+            print(f" SKIP (exists): {fam_name}")
+            continue
+        print(f" CREATE: {fam_name} ...", end=" ")
+        result, status = api_post("/families", {
+            "name_scientific": fam_name,
+            "name_en": fam_info["name_en"],
+            "name_de": fam_info["name_de"],
+        })
+        if result and "id" in result:
+            # Remember the new id so species of this family can reference it.
+            family_map[fam_name] = result["id"]
+            print(f"OK ({result['id']})")
+            families_created += 1
+        else:
+            print(f"FAILED (status={status})")
+        time.sleep(DELAY)
+
+    print(f"\n Families created: {families_created}")
+
+    # 3. Load existing species
+    print("\n=== Loading existing species ===")
+    sp_resp = api_get("/species?per_page=200")
+    existing_species = {entry["name_scientific"] for entry in sp_resp["data"]}
+    print(f" Found {len(existing_species)} existing species")
+
+    # 4. Add new species
+    print("\n=== Adding new species ===")
+    created = skipped = failed = 0
+
+    for sci_name, family, name_en, name_de, plant_layer, extras in SPECIES:
+        if sci_name in existing_species:
+            print(f" SKIP (exists): {sci_name}")
+            skipped += 1
+            continue
+
+        # A species whose family could not be resolved counts as failed.
+        fam_id = family_map.get(family)
+        if not fam_id:
+            print(f" SKIP (no family '{family}'): {sci_name}")
+            failed += 1
+            continue
+
+        # The GBIF German name is only logged for comparison; the curated
+        # name_de from SPECIES is what gets stored.
+        gbif_de = gbif_get_german_name(sci_name)
+        if gbif_de:
+            print(f" GBIF name for {sci_name}: {gbif_de}")
+
+        # Extra fields are merged last, exactly as given in SPECIES.
+        payload = {
+            "name_scientific": sci_name,
+            "family_id": fam_id,
+            "name_en": name_en,
+            "name_de": name_de,
+            "plant_layer": plant_layer,
+            **extras,
+        }
+
+        print(f" CREATE: {sci_name} ({name_de}) ...", end=" ")
+        result, status = api_post("/species", payload)
+        if result and "id" in result:
+            print(f"OK ({result['id']})")
+            created += 1
+        else:
+            print(f"FAILED (status={status})")
+            failed += 1
+        time.sleep(DELAY)
+
+    print(f"\n{'='*50}")
+    print(f"SUMMARY")
+    print(f" Families created: {families_created}")
+    print(f" Species created: {created}")
+    print(f" Species skipped: {skipped}")
+    print(f" Species failed: {failed}")
+    print(f" Total species now: {len(existing_species) + created}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/enrichment/import_images.py b/tools/enrichment/import_images.py
new file mode 100644
index 0000000..628afaa
--- /dev/null
+++ b/tools/enrichment/import_images.py
@@ -0,0 +1,362 @@
+#!/usr/bin/env python3
+"""Import CC-licensed plant images from Wikimedia Commons via Wikidata into HerbAPI."""
+
+import json
+import os
+import re
+import subprocess
+import sys
+import time
+import urllib.parse
+import urllib.request
+
+# Flush stdout/stderr after every line so progress is visible immediately,
+# even when the output is piped or redirected.
+sys.stdout.reconfigure(line_buffering=True)
+sys.stderr.reconfigure(line_buffering=True)
+
+# --- Configuration ---
+# Credentials are read from the environment and must not be committed.  The
+# variable names follow the AWS CLI and libpq conventions.  Unset credentials
+# become empty strings, which makes the external tools fail instead of using
+# a shared secret.
+# NOTE(review): the S3 key pair and DB password that were previously committed
+# here should be rotated.
+S3_ENDPOINT = os.environ.get("S3_ENDPOINT", "http://garage.sub-net.at:3900")
+S3_BUCKET = os.environ.get("S3_BUCKET", "herbapi")
+S3_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY_ID", "")
+S3_SECRET_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "")
+S3_REGION = os.environ.get("AWS_DEFAULT_REGION", "garage")
+
+DB_HOST = os.environ.get("PGHOST", "10.31.3.90")
+DB_USER = os.environ.get("PGUSER", "herbapi")
+DB_PASS = os.environ.get("PGPASSWORD", "")
+DB_NAME = os.environ.get("PGDATABASE", "herbapi")
+
+USER_AGENT = "HerbAPI/1.0 (https://herbapi.naturalised.at; florian.berthold@sub-net.at)"
+THUMB_WIDTH = 800  # requested thumbnail width, sent to Commons as iiurlwidth
+REQUEST_DELAY = 0.3  # seconds slept before each outgoing request in main()
+
+# Lower-case license labels accepted without further pattern matching.
+# is_license_allowed() lower-cases and strips its input before testing
+# membership here, so every entry must stay lower-case.
+ALLOWED_LICENSES = {
+    # CC0 / public-domain spellings
+    "cc0", "cc-zero", "cc0 1.0", "cc-zero 1.0",
+    "public domain", "pd", "pd-self", "pd-old", "pd-old-auto", "pd-old-100",
+    "pd-us", "pd-usgov", "pd-author",
+    # CC BY, written with spaces and with hyphens
+    "cc by 1.0", "cc by 2.0", "cc by 2.5", "cc by 3.0", "cc by 4.0",
+    "cc-by-1.0", "cc-by-2.0", "cc-by-2.5", "cc-by-3.0", "cc-by-4.0",
+    # CC BY-SA, written with spaces and with hyphens
+    "cc by-sa 1.0", "cc by-sa 2.0", "cc by-sa 2.5", "cc by-sa 3.0", "cc by-sa 4.0",
+    "cc-by-sa-1.0", "cc-by-sa-2.0", "cc-by-sa-2.5", "cc-by-sa-3.0", "cc-by-sa-4.0",
+}
+
+
+def slugify(name: str) -> str:
+    """Turn a scientific name into a lower-case, hyphen-separated slug.
+
+    Every run of characters other than ``a-z`` / ``0-9`` (after lower-casing)
+    becomes a single hyphen; leading and trailing hyphens are dropped.
+    """
+    pieces = re.split(r'[^a-z0-9]+', name.lower())
+    # Empty pieces only occur at the ends, where a separator run was found.
+    return "-".join(piece for piece in pieces if piece)
+
+
+def psql(query: str, *, check: bool = False) -> str:
+    """Run one SQL command through the ``psql`` CLI and return its stdout.
+
+    Args:
+        query: SQL text passed to ``psql -c``.
+        check: When true, raise instead of only logging if psql exits non-zero.
+
+    Returns:
+        psql's standard output with surrounding whitespace stripped
+        (tuples-only, unaligned format).
+
+    Raises:
+        RuntimeError: If ``check`` is true and psql exits with a non-zero status.
+    """
+    env = {**os.environ, "PGPASSWORD": DB_PASS}
+    result = subprocess.run(
+        # -X skips ~/.psqlrc so per-user settings cannot change the output
+        # format that callers parse.
+        ["psql", "-X", "-h", DB_HOST, "-U", DB_USER, DB_NAME, "-t", "-A", "-c", query],
+        capture_output=True, text=True, env=env,
+    )
+    if result.returncode != 0:
+        message = result.stderr.strip()
+        print(f" psql error: {message}", file=sys.stderr)
+        if check:
+            raise RuntimeError(f"psql failed: {message}")
+    return result.stdout.strip()
+
+
+def fetch_json(url: str) -> dict | None:
+    """GET *url* with the project User-Agent and decode the body as JSON.
+
+    Any failure (network, HTTP status, invalid JSON) is printed and reported
+    to the caller as ``None``.
+    """
+    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
+    try:
+        response = urllib.request.urlopen(request, timeout=30)
+        with response:
+            payload = response.read()
+        return json.loads(payload)
+    except Exception as exc:
+        print(f" HTTP error fetching {url}: {exc}")
+        return None
+
+
+def get_wikidata_image(qid: str) -> str | None:
+    """Query Wikidata SPARQL for the P18 image filename of an item.
+
+    Args:
+        qid: Wikidata item id such as ``"Q12345"``.
+
+    Returns:
+        The URL-decoded last path segment of the image URL, or ``None`` when
+        the id is malformed, the request fails, or the item has no P18 value.
+    """
+    # The id is interpolated into the query, so accept nothing but a plain
+    # QID; anything else could alter the SPARQL that is sent.
+    if not re.fullmatch(r"Q[1-9]\d*", qid or ""):
+        print(f" Invalid Wikidata QID: {qid!r}")
+        return None
+
+    sparql = f"SELECT ?image WHERE {{ wd:{qid} wdt:P18 ?image }} LIMIT 1"
+    url = "https://query.wikidata.org/sparql?" + urllib.parse.urlencode({
+        "query": sparql, "format": "json"
+    })
+    data = fetch_json(url)
+    if not data:
+        return None
+    bindings = data.get("results", {}).get("bindings", [])
+    if not bindings:
+        return None
+    image_url = bindings[0]["image"]["value"]
+    # URL like http://commons.wikimedia.org/wiki/Special:FilePath/Filename.jpg
+    return urllib.parse.unquote(image_url.rsplit("/", 1)[-1])
+
+
+def get_commons_info(filename: str) -> dict | None:
+    """Get thumbnail URL, license and artist for a Wikimedia Commons file.
+
+    Args:
+        filename: Commons file name without the ``File:`` prefix.
+
+    Returns:
+        Dict with the keys ``thumb_url``, ``description_url``, ``license``,
+        ``artist`` and ``filename``, or ``None`` when the request fails, the
+        page is missing (page id ``"-1"``) or it carries no image info.
+    """
+    import html  # local import: not among this file's top-level imports
+
+    url = "https://commons.wikimedia.org/w/api.php?" + urllib.parse.urlencode({
+        "action": "query",
+        "titles": f"File:{filename}",
+        "prop": "imageinfo",
+        "iiprop": "url|extmetadata",
+        "iiurlwidth": str(THUMB_WIDTH),
+        "format": "json",
+    })
+    data = fetch_json(url)
+    if not data:
+        return None
+
+    pages = data.get("query", {}).get("pages", {})
+    if not pages:
+        return None
+    # Only one title is requested, so only the first page entry is relevant.
+    page_id, page = next(iter(pages.items()))
+    if page_id == "-1":
+        return None
+    imageinfo = page.get("imageinfo", [])
+    if not imageinfo:
+        return None
+
+    info = imageinfo[0]
+    meta = info.get("extmetadata", {})
+
+    artist_html = meta.get("Artist", {}).get("value", "")
+    # Strip HTML tags first, then decode entities such as &amp; so they do
+    # not end up verbatim in captions, and finally collapse whitespace.
+    artist = re.sub(r'<[^>]+>', '', artist_html)
+    artist = re.sub(r'\s+', ' ', html.unescape(artist)).strip()
+
+    return {
+        # Fall back to the full-size URL when no thumbnail URL is returned.
+        "thumb_url": info.get("thumburl") or info.get("url"),
+        "description_url": info.get("descriptionurl", ""),
+        "license": meta.get("LicenseShortName", {}).get("value", ""),
+        "artist": artist,
+        "filename": filename,
+    }
+
+
+def is_license_allowed(license_str: str) -> bool:
+    """Check whether a license label is acceptable for import.
+
+    Accepted are exact members of ALLOWED_LICENSES, labels starting with
+    "public domain" or "pd", CC BY / CC BY-SA labels followed by a version
+    digit, and CC0.  Labels carrying an NC or ND component are rejected.
+
+    Args:
+        license_str: License short name as reported by the image source.
+
+    Returns:
+        True if the license is allowed, False otherwise.
+    """
+    normalized = license_str.lower().strip()
+    # Direct match
+    if normalized in ALLOWED_LICENSES:
+        return True
+    # Check for NC or ND as whole tokens.  A plain substring test would also
+    # reject harmless labels that merely contain the letters, such as
+    # "pd-finland" or "pd india".
+    tokens = set(re.split(r'[^a-z0-9]+', normalized))
+    if tokens & {"nc", "nd"}:
+        return False
+    # Check patterns
+    if normalized.startswith(("public domain", "pd")):
+        return True
+    # CC BY and CC BY-SA, any version, with hyphen or space separators.
+    if re.match(r'^cc[- ]?by[- ]?(?:sa[- ]?)?\d', normalized):
+        return True
+    if re.match(r'^cc[- ]?0', normalized) or normalized == "cc zero":
+        return True
+    return False
+
+
+def normalize_license(license_str: str) -> str:
+    """Map a license label to the canonical spelling used for storage.
+
+    Public-domain variants become "Public domain", CC0 variants "CC0 1.0",
+    and CC BY / CC BY-SA labels "CC BY <version>" / "CC BY-SA <version>".
+    Anything unrecognised is returned unchanged.
+    """
+    low = license_str.lower().strip()
+
+    if low.startswith("pd") or "public domain" in low:
+        return "Public domain"
+
+    looks_like_cc0 = bool(re.match(r'^cc[- ]?0', low)) or any(
+        marker in low for marker in ("cc-zero", "cc zero")
+    )
+    if looks_like_cc0:
+        return "CC0 1.0"
+
+    # BY-SA must be tried before plain BY.
+    versioned = (
+        (r'^cc[- ]?by[- ]?sa[- ]?(\d+\.?\d*)', "CC BY-SA"),
+        (r'^cc[- ]?by[- ]?(\d+\.?\d*)', "CC BY"),
+    )
+    for pattern, label in versioned:
+        found = re.match(pattern, low)
+        if found:
+            return f"{label} {found.group(1)}"
+
+    return license_str
+
+
+def s3_upload(s3_key: str, data: bytes, content_type: str = "image/jpeg"):
+    """Upload to S3 Garage using AWS CLI.
+
+    Args:
+        s3_key: Object key inside S3_BUCKET.
+        data: Raw bytes to store.
+        content_type: MIME type recorded on the object.
+
+    Raises:
+        RuntimeError: If the temporary file cannot be written, the ``aws``
+            command cannot be started, or it exits with a non-zero status.
+    """
+    import tempfile  # local import: not among this file's top-level imports
+
+    env = {
+        **os.environ,
+        "AWS_ACCESS_KEY_ID": S3_ACCESS_KEY,
+        "AWS_SECRET_ACCESS_KEY": S3_SECRET_KEY,
+        "AWS_DEFAULT_REGION": S3_REGION,
+    }
+
+    try:
+        # A unique, privately created temp file: concurrent runs cannot
+        # clobber each other and the path cannot be pre-planted by another
+        # user, unlike a fixed name in /tmp.
+        fd, tmp_path = tempfile.mkstemp(prefix="herbapi_upload_")
+        try:
+            with os.fdopen(fd, "wb") as f:
+                f.write(data)
+            result = subprocess.run(
+                [
+                    "aws", "s3", "cp", tmp_path,
+                    f"s3://{S3_BUCKET}/{s3_key}",
+                    "--endpoint-url", S3_ENDPOINT,
+                    "--content-type", content_type,
+                ],
+                capture_output=True, text=True, env=env
+            )
+        finally:
+            # Remove the temp file even if writing or the CLI call raised.
+            os.unlink(tmp_path)
+    except OSError as e:
+        # E.g. the aws binary is missing; surface it as the documented error.
+        raise RuntimeError(f"S3 upload failed: {e}") from e
+
+    if result.returncode != 0:
+        raise RuntimeError(f"S3 upload failed: {result.stderr.strip()}")
+
+
+def download_image(url: str) -> bytes | None:
+    """Fetch the raw bytes behind *url* using the project User-Agent.
+
+    Returns ``None`` (after printing the error) when the download fails.
+    """
+    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
+    try:
+        response = urllib.request.urlopen(request, timeout=60)
+        with response:
+            payload = response.read()
+    except Exception as exc:
+        print(f" Download error: {exc}")
+        return None
+    return payload
+
+
+# Content type for each stored extension; anything else is stored as JPEG.
+_CONTENT_TYPES = {
+    "jpg": "image/jpeg",
+    "png": "image/png",
+    "svg": "image/svg+xml",
+    "gif": "image/gif",
+}
+
+
+def _image_extension(url: str) -> str:
+    """Return the storage extension for an image URL, defaulting to ``jpg``."""
+    # Look only at the real suffix of the URL path; a substring search would
+    # misclassify names such as "Scan.png.jpg".
+    path = urllib.parse.urlparse(url).path
+    suffix = os.path.splitext(path)[1].lower().lstrip(".")
+    return suffix if suffix in _CONTENT_TYPES else "jpg"
+
+
+def _sql_literal(value: str) -> str:
+    """Quote *value* as a SQL string literal by doubling single quotes."""
+    return "'" + value.replace("'", "''") + "'"
+
+
+def main():
+    """Import one CC-licensed primary image per species into HerbAPI.
+
+    For every species with a Wikidata QID and no image yet: look up the P18
+    image, fetch its Commons metadata, check the license, download the
+    thumbnail, upload it to S3 and insert a row into ``images``.  A summary
+    of imported / skipped / failed counts is printed at the end.
+    """
+    # 1. Get species
+    rows = psql(
+        "SELECT id, name_scientific, wikidata_qid FROM species "
+        "WHERE wikidata_qid IS NOT NULL AND wikidata_qid <> '' "
+        "ORDER BY name_scientific"
+    )
+    if not rows:
+        print("No species with wikidata_qid found.")
+        return
+
+    # psql prints unaligned rows as "id|name|qid"; other lines are ignored.
+    species_list = []
+    for line in rows.split("\n"):
+        parts = line.split("|")
+        if len(parts) == 3:
+            species_list.append({"id": parts[0], "name": parts[1], "qid": parts[2]})
+
+    print(f"Found {len(species_list)} species with Wikidata QIDs.")
+
+    # 2. Get existing images
+    existing_rows = psql("SELECT entity_id FROM images WHERE entity_type = 'species'")
+    existing = {line.strip() for line in existing_rows.split("\n") if line.strip()}
+
+    print(f"Found {len(existing)} species that already have images.")
+
+    imported = 0
+    skipped_existing = 0
+    skipped_no_image = 0
+    skipped_license = 0
+    skipped_download = 0
+    errors = 0
+
+    for index, sp in enumerate(species_list, start=1):
+        name = sp["name"]
+        qid = sp["qid"]
+        sp_id = sp["id"]
+        slug = slugify(name)
+
+        print(f"\n[{index}/{len(species_list)}] {name} ({qid})")
+
+        if sp_id in existing:
+            print(" Already has image, skipping.")
+            skipped_existing += 1
+            continue
+
+        # Query Wikidata for image
+        time.sleep(REQUEST_DELAY)
+        filename = get_wikidata_image(qid)
+        if not filename:
+            print(" No image on Wikidata.")
+            skipped_no_image += 1
+            continue
+
+        # Get Commons info
+        time.sleep(REQUEST_DELAY)
+        info = get_commons_info(filename)
+        # The metadata may lack both a thumbnail and a full-size URL; without
+        # one there is nothing to download.
+        if not info or not info["thumb_url"]:
+            print(f" Could not get Commons info for {filename}")
+            skipped_no_image += 1
+            continue
+
+        # Check license
+        raw_license = info["license"]
+        if not is_license_allowed(raw_license):
+            print(f" License not allowed: {raw_license}")
+            skipped_license += 1
+            continue
+
+        norm_license = normalize_license(raw_license)
+        artist = info["artist"]
+        thumb_url = info["thumb_url"]
+        desc_url = info["description_url"]
+
+        print(f" License: {raw_license} -> {norm_license}")
+        print(f" Artist: {artist[:80]}")
+        print(f" Thumbnail: {thumb_url[:100]}...")
+
+        # Download image
+        time.sleep(REQUEST_DELAY)
+        image_data = download_image(thumb_url)
+        if not image_data:
+            print(" Failed to download image.")
+            skipped_download += 1
+            continue
+
+        print(f" Downloaded {len(image_data)} bytes")
+
+        # Determine file extension from URL
+        ext = _image_extension(thumb_url)
+        s3_key = f"species/{slug}.{ext}"
+        content_type = _CONTENT_TYPES.get(ext, "image/jpeg")
+
+        # Upload to S3
+        try:
+            s3_upload(s3_key, image_data, content_type)
+            print(f" Uploaded to s3://{S3_BUCKET}/{s3_key}")
+        except RuntimeError as e:
+            print(f" S3 upload failed: {e}")
+            errors += 1
+            continue
+
+        # Insert into database.  Every interpolated value, including the id,
+        # is quoted the same way.
+        caption = f"Photo: {artist}" if artist else "Wikimedia Commons"
+        insert_sql = (
+            "INSERT INTO images (id, entity_type, entity_id, s3_key, caption, source_url, license, is_primary) "
+            f"VALUES (gen_random_uuid(), 'species', {_sql_literal(sp_id)}, {_sql_literal(s3_key)}, "
+            f"{_sql_literal(caption)}, {_sql_literal(desc_url)}, {_sql_literal(norm_license)}, true)"
+        )
+
+        result = psql(insert_sql)
+        # On success psql prints the command tag ("INSERT 0 1"); on failure
+        # stdout is empty, so a missing tag means the row was not stored.
+        if not result.startswith("INSERT"):
+            print(" Failed to insert into images table.")
+            errors += 1
+            continue
+        print(f" Inserted into images table.")
+        imported += 1
+
+    print(f"\n{'='*60}")
+    print(f"DONE!")
+    print(f" Imported: {imported}")
+    print(f" Skipped (existing):{skipped_existing}")
+    print(f" Skipped (no image):{skipped_no_image}")
+    print(f" Skipped (license): {skipped_license}")
+    print(f" Skipped (download):{skipped_download}")
+    print(f" Errors: {errors}")
+    print(f" Total processed: {len(species_list)}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/enrichment/import_images_v2.py b/tools/enrichment/import_images_v2.py
new file mode 100644
index 0000000..034c123
--- /dev/null
+++ b/tools/enrichment/import_images_v2.py
@@ -0,0 +1,290 @@
+#!/usr/bin/env python3
+"""Import CC-licensed plant images from Wikimedia Commons into HerbAPI."""
+
+import hashlib
+import json
+import os
+import re
+import subprocess
+import sys
+import time
+import urllib.parse
+import urllib.request
+
+# Config
+# SECURITY NOTE: the database password and the S3 access keys below are
+# committed in plain text. Every value can be overridden through the
+# HERBAPI_* environment variable named next to it; the literals are kept only
+# as backward-compatible fallbacks and should be rotated and removed.
+DB_HOST = os.environ.get("HERBAPI_DB_HOST", "10.31.3.90")
+DB_USER = os.environ.get("HERBAPI_DB_USER", "herbapi")
+DB_PASS = os.environ.get("HERBAPI_DB_PASS", "_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj")
+DB_NAME = os.environ.get("HERBAPI_DB_NAME", "herbapi")
+S3_BUCKET = os.environ.get("HERBAPI_S3_BUCKET", "herbapi")
+S3_ENDPOINT = os.environ.get("HERBAPI_S3_ENDPOINT", "http://10.31.3.170:3900")
+USER_AGENT = "HerbAPI/1.0 (https://herbapi.naturalised.at; florian.berthold@sub-net.at)"
+# Pause (seconds) inserted after each remote request.
+REQUEST_DELAY = 0.3
+
+# AWS env for subprocess calls
+# The AWS_* keys are set explicitly so they always win over whatever AWS
+# credentials happen to be present in the caller's environment.
+AWS_ENV = {
+    **os.environ,
+    "AWS_ACCESS_KEY_ID": os.environ.get(
+        "HERBAPI_S3_ACCESS_KEY", "GK1a89859373a6ac56bf11958f"
+    ),
+    "AWS_SECRET_ACCESS_KEY": os.environ.get(
+        "HERBAPI_S3_SECRET_KEY",
+        "bea45a333b5c7b1efdd7466bdbcac54d8642fa19f0c617ca2fd64bd07951b899",
+    ),
+    "AWS_DEFAULT_REGION": "garage",
+}
+
+# Stats
+# Run-wide counters, mutated by process_species() and printed by main().
+stats = {"total": 0, "imported": 0, "no_p18": 0, "bad_license": 0, "download_fail": 0, "upload_fail": 0, "errors": 0}
+
+
+def fetch_url(url):
+    """Download *url* and return the raw response body as bytes.
+
+    The request carries the module-level ``USER_AGENT`` header and times out
+    after 30 seconds.
+    """
+    request = urllib.request.Request(url)
+    request.add_header("User-Agent", USER_AGENT)
+    with urllib.request.urlopen(request, timeout=30) as response:
+        payload = response.read()
+    return payload
+
+
+def fetch_json(url):
+    """Download *url* with ``fetch_url`` and decode the body as JSON."""
+    raw_body = fetch_url(url)
+    return json.loads(raw_body)
+
+
+def psql(sql):
+    """Run one SQL statement through the ``psql`` CLI and return its output.
+
+    The statement is executed with ``-t -A`` (tuples only, unaligned), so the
+    result is one line per row with columns separated by ``|``. The password is
+    handed to psql through the ``PGPASSWORD`` environment variable.
+
+    Args:
+        sql: the SQL text passed to ``psql -c``.
+
+    Returns:
+        psql's standard output with surrounding whitespace stripped.
+
+    Raises:
+        RuntimeError: if psql exits with a non-zero status. Without this check
+            a failed statement is indistinguishable from one that simply
+            returned no rows, and callers' ``try/except`` never fires.
+    """
+    result = subprocess.run(
+        [
+            "psql", "-h", DB_HOST, "-U", DB_USER, DB_NAME,
+            # Make SQL errors surface as a non-zero exit status.
+            "-v", "ON_ERROR_STOP=1",
+            "-t", "-A", "-c", sql,
+        ],
+        capture_output=True, text=True,
+        env={**os.environ, "PGPASSWORD": DB_PASS},
+    )
+    if result.returncode != 0:
+        raise RuntimeError(
+            f"psql exited with status {result.returncode}: {result.stderr.strip()}"
+        )
+    return result.stdout.strip()
+
+
+def is_license_allowed(license_str):
+    """Return True when *license_str* names a license we may re-host.
+
+    Wikimedia reports short names such as 'CC BY-SA 3.0', 'CC BY 4.0', 'CC0'
+    or 'Public domain'. Accepted: CC0, Public Domain, CC BY and CC BY-SA in
+    any version. Anything carrying an NC or ND clause is rejected, and every
+    name that matches none of the accepted forms (GFDL, FAL, 'Copyrighted
+    free use', ...) is rejected as well. Empty or missing input is rejected.
+    """
+    if not license_str:
+        return False
+
+    normalized = license_str.lower().strip()
+    words = normalized.split()
+
+    # NC / ND may appear as a separate word or as a hyphenated suffix.
+    restricted = any(
+        clause in words or f"-{clause}" in normalized
+        for clause in ("nc", "nd")
+    )
+    if restricted:
+        return False
+
+    is_cc0 = normalized in ("cc0", "cc-zero", "cc0 1.0", "cc0 1.0 universal")
+    is_public_domain = "public domain" in normalized or normalized.startswith("pd")
+    # Any version / jurisdiction suffix is accepted after the base name.
+    is_by_sa = re.match(r"cc\s+by-sa\b", normalized) is not None
+    is_by = re.match(r"cc\s+by\b", normalized) is not None
+
+    return is_cc0 or is_public_domain or is_by_sa or is_by
+
+
+def get_wikidata_image(qid):
+    """Return the Commons file name of the P18 image of *qid*, or None.
+
+    Runs a one-row SPARQL query against query.wikidata.org and takes the last
+    path segment of the returned image URL, percent-decoded.
+    """
+    query = f"SELECT ?image WHERE {{ wd:{qid} wdt:P18 ?image }} LIMIT 1"
+    endpoint = f"https://query.wikidata.org/sparql?query={urllib.parse.quote(query)}&format=json"
+    response = fetch_json(endpoint)
+    rows = response.get("results", {}).get("bindings", [])
+    if rows:
+        commons_url = rows[0]["image"]["value"]
+        # Extract filename from commons URL
+        return urllib.parse.unquote(commons_url.split("/")[-1])
+    return None
+
+
+def get_commons_info(filename):
+    """Get image info from Commons API: license, artist, thumbnail URL.
+
+    Queries the ``imageinfo`` property of ``File:<filename>`` and asks for an
+    800px-wide thumbnail. Only the first page of the response is inspected.
+
+    Args:
+        filename: Commons file name without the ``File:`` prefix.
+
+    Returns:
+        A dict with the keys ``license``, ``artist``, ``thumb_url``,
+        ``desc_url`` and ``filename``; or None when the response contains no
+        pages or the first page has the id ``"-1"``. Missing metadata fields
+        come back as empty strings.
+    """
+    title = f"File:{filename}"
+    url = (
+        f"https://commons.wikimedia.org/w/api.php?action=query"
+        f"&titles={urllib.parse.quote(title)}"
+        f"&prop=imageinfo&iiprop=url|extmetadata"
+        f"&iiurlwidth=800&format=json"
+    )
+    data = fetch_json(url)
+    pages = data.get("query", {}).get("pages", {})
+    for page_id, page in pages.items():
+        if page_id == "-1":
+            return None
+        # `or` also covers an explicitly empty/null value, which previously
+        # raised IndexError / AttributeError instead of yielding blank fields.
+        imageinfo = (page.get("imageinfo") or [{}])[0]
+        meta = imageinfo.get("extmetadata") or {}
+
+        license_short = meta.get("LicenseShortName", {}).get("value", "").strip()
+        artist_html = meta.get("Artist", {}).get("value", "")
+
+        # Clean up artist: strip HTML tags
+        artist = re.sub(r"<[^>]+>", "", artist_html).strip()
+        # Collapse whitespace
+        artist = re.sub(r"\s+", " ", artist)
+        # Keep captions short: at most 120 characters including the ellipsis.
+        if len(artist) > 120:
+            artist = artist[:117] + "..."
+
+        return {
+            "license": license_short,
+            "artist": artist,
+            # Use the API-provided thumbnail URL (iiurlwidth=800)
+            "thumb_url": imageinfo.get("thumburl", ""),
+            # Also get the description URL
+            "desc_url": imageinfo.get("descriptionurl", ""),
+            "filename": filename,
+        }
+    return None
+
+
+def _sql_quote(value):
+    """Return *value* as text with single quotes doubled for use inside a SQL string literal."""
+    return str(value).replace("'", "''")
+
+
+def _thumbnail_extension(thumb_url):
+    """Pick the stored file extension (png, gif, else jpg) from the suffix of the thumbnail URL's file name."""
+    basename = thumb_url.lower().split("?")[0].split("/")[-1]
+    for ext in ("png", "gif"):
+        if basename.endswith(f".{ext}"):
+            return ext
+    return "jpg"
+
+
+def process_species(species_id, slug, name_sci, qid):
+    """Process a single species: fetch image from Wikidata/Commons, upload to S3, insert to DB.
+
+    Every outcome is recorded in the module-level ``stats`` counters.
+
+    Args:
+        species_id: value written to ``images.entity_id``.
+        slug: species slug; used to build the S3 key ``species/<slug>.<ext>``.
+        name_sci: scientific name; not used by this function.
+        qid: Wikidata item id whose P18 image is imported.
+
+    Returns:
+        True when the image was uploaded and the INSERT was issued without an
+        exception, False for every skip or failure.
+    """
+    # Function-scope import: tempfile is not among this module's top-level imports.
+    import tempfile
+
+    stats["total"] += 1
+
+    # Step 1: Get image filename from Wikidata
+    try:
+        filename = get_wikidata_image(qid)
+    except Exception as e:
+        print(f" ERROR querying Wikidata for {qid}: {e}")
+        stats["errors"] += 1
+        return False
+    time.sleep(REQUEST_DELAY)
+
+    if not filename:
+        print(f" No P18 image for {qid}")
+        stats["no_p18"] += 1
+        return False
+
+    # Step 2: Get Commons info (license, artist, thumb URL)
+    try:
+        info = get_commons_info(filename)
+    except Exception as e:
+        print(f" ERROR querying Commons for {filename}: {e}")
+        stats["errors"] += 1
+        return False
+    time.sleep(REQUEST_DELAY)
+
+    if not info:
+        print(f" No Commons info for {filename}")
+        stats["errors"] += 1
+        return False
+
+    # Step 3: Check license
+    if not is_license_allowed(info["license"]):
+        print(f" Bad license: {info['license']} for {filename}")
+        stats["bad_license"] += 1
+        return False
+
+    # Step 4: Download thumbnail using API-provided URL
+    thumb_url = info["thumb_url"]
+    if not thumb_url:
+        print(f" No thumbnail URL available for {filename}")
+        stats["download_fail"] += 1
+        return False
+
+    ext = _thumbnail_extension(thumb_url)
+    s3_key = f"species/{slug}.{ext}"
+
+    tmp_path = None
+    try:
+        try:
+            img_data = fetch_url(thumb_url)
+            # A uniquely named temp file avoids the predictable /tmp path
+            # (symlink / collision risk); the suffix keeps the extension so
+            # the upload tool can still infer the content type from it.
+            with tempfile.NamedTemporaryFile(
+                prefix="herbapi_img_", suffix=f".{ext}", delete=False
+            ) as tmp:
+                tmp_path = tmp.name
+                tmp.write(img_data)
+        except Exception as e:
+            print(f" ERROR downloading {thumb_url}: {e}")
+            stats["download_fail"] += 1
+            return False
+        time.sleep(REQUEST_DELAY)
+
+        # Step 5: Upload to S3
+        try:
+            result = subprocess.run(
+                ["aws", "s3", "cp", tmp_path, f"s3://{S3_BUCKET}/{s3_key}", "--endpoint-url", S3_ENDPOINT],
+                capture_output=True, text=True, env=AWS_ENV, timeout=60,
+            )
+        except Exception as e:
+            print(f" ERROR uploading to S3: {e}")
+            stats["upload_fail"] += 1
+            return False
+        if result.returncode != 0:
+            print(f" S3 upload failed: {result.stderr}")
+            stats["upload_fail"] += 1
+            return False
+    finally:
+        # The temp file is removed on every path, including failed writes.
+        if tmp_path is not None:
+            try:
+                os.unlink(tmp_path)
+            except OSError:
+                pass
+
+    # Step 6: Insert into DB
+    caption = f"Photo: {info['artist']}" if info["artist"] else ""
+    source_url = info["desc_url"] or f"https://commons.wikimedia.org/wiki/File:{urllib.parse.quote(filename)}"
+
+    # Every interpolated value is quote-escaped, including the id and the S3
+    # key (which is built from the slug), so a stray apostrophe cannot break
+    # or alter the statement.
+    sql = (
+        f"INSERT INTO images (entity_type, entity_id, s3_key, caption, source_url, license, is_primary) "
+        f"VALUES ('species', '{_sql_quote(species_id)}', '{_sql_quote(s3_key)}', "
+        f"'{_sql_quote(caption)}', '{_sql_quote(source_url)}', '{_sql_quote(info['license'])}', true);"
+    )
+    try:
+        # NOTE(review): this handler only fires if psql() raises on failure — confirm it does.
+        psql(sql)
+    except Exception as e:
+        print(f" ERROR inserting to DB: {e}")
+        stats["errors"] += 1
+        return False
+
+    stats["imported"] += 1
+    return True
+
+
+def main():
+    """Import a primary image for every species that has a QID but no image row.
+
+    Selects the candidate species through psql, runs process_species() on each
+    and prints the accumulated ``stats`` counters at the end.
+    """
+    # Get species without images
+    query = (
+        "SELECT s.id, s.slug, s.name_scientific, s.wikidata_qid "
+        "FROM species s "
+        "LEFT JOIN images i ON i.entity_type = 'species' AND i.entity_id = s.id "
+        "WHERE s.wikidata_qid IS NOT NULL AND s.wikidata_qid != '' AND i.id IS NULL "
+        "ORDER BY s.name_scientific;"
+    )
+    rows = psql(query)
+    if not rows:
+        print("No species need images.")
+        return
+
+    # One output line per row, columns separated by "|"; lines that do not
+    # split into exactly four fields are ignored.
+    species_list = [
+        fields
+        for fields in (line.strip().split("|") for line in rows.split("\n"))
+        if len(fields) == 4
+    ]
+    total = len(species_list)
+
+    print(f"Processing {total} species...\n")
+
+    for i, (sid, slug, name_sci, qid) in enumerate(species_list, 1):
+        print(f"[{i}/{total}] {name_sci} ({qid})")
+        if process_species(sid, slug, name_sci, qid):
+            print(f" OK - imported")
+
+    print(f"\n{'='*50}")
+    print(f"RESULTS:")
+    summary = (
+        f" Total species processed: {stats['total']}",
+        f" Successfully imported: {stats['imported']}",
+        f" No P18 image: {stats['no_p18']}",
+        f" Bad license (NC/ND/GFDL):{stats['bad_license']}",
+        f" Download failures: {stats['download_fail']}",
+        f" Upload failures: {stats['upload_fail']}",
+        f" Other errors: {stats['errors']}",
+    )
+    for line in summary:
+        print(line)
+
+
+# Script entry point: call main() only when this file is executed directly.
+if __name__ == "__main__":
+    main()
diff --git a/tools/enrichment/seed_herbapi.py b/tools/enrichment/seed_herbapi.py
new file mode 100644
index 0000000..85f6564
--- /dev/null
+++ b/tools/enrichment/seed_herbapi.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python3
+"""Seed HerbAPI with common permaculture plant families and species via GBIF + API."""
+import json, urllib.request, urllib.parse, time, sys
+
+# Endpoints used by this script: the HerbAPI instance that is seeded and the
+# public GBIF API that is queried for German vernacular names.
+# SECURITY NOTE(review): TOKEN is a bearer credential committed in plain text;
+# it should be rotated and loaded from the environment instead.
+API = "http://herbapi01.corp.sub-net.at:8080/api/v1"
+TOKEN = "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
+GBIF = "https://api.gbif.org/v1"
+
+def api_post(path, data):
+    """POST *data* as JSON to the HerbAPI endpoint *path*.
+
+    Args:
+        path: endpoint path appended to ``API`` (e.g. ``"/families"``).
+        data: JSON-serialisable request payload.
+
+    Returns:
+        The decoded JSON response, or None when the server answers with an
+        HTTP error status; in that case the status code and the first 120
+        characters of the error body are written to stderr. Connection-level
+        failures (including the 30 s timeout) are not caught here.
+    """
+    req = urllib.request.Request(
+        f"{API}{path}",
+        data=json.dumps(data).encode(),
+        headers={"Content-Type": "application/json", "Authorization": f"Bearer {TOKEN}"},
+    )
+    try:
+        # The context manager closes the connection; the timeout keeps a
+        # stalled server from hanging the whole seeding run.
+        with urllib.request.urlopen(req, timeout=30) as resp:
+            return json.loads(resp.read())
+    except urllib.error.HTTPError as e:
+        print(f" ERR {e.code}: {e.read().decode()[:120]}", file=sys.stderr)
+        return None
+
+def gbif_de_name(name):
+    """Get German common name from GBIF.
+
+    Matches *name* against the GBIF backbone, then returns the first
+    vernacular name (out of at most 100) whose ``language`` is ``"deu"``.
+
+    Returns:
+        The vernacular name, or None when GBIF has no match, lists no German
+        name, or the lookup fails. The lookup is best-effort: failures are
+        reported on stderr and never abort the caller.
+    """
+    match_url = f"{GBIF}/species/match?name={urllib.parse.quote(name)}"
+    try:
+        with urllib.request.urlopen(match_url, timeout=30) as resp:
+            match = json.loads(resp.read())
+        usage_key = match.get("usageKey")
+        if not usage_key:
+            return None
+        names_url = f"{GBIF}/species/{usage_key}/vernacularNames?limit=100"
+        with urllib.request.urlopen(names_url, timeout=30) as resp:
+            data = json.loads(resp.read())
+        for r in data.get("results", []):
+            if r.get("language") == "deu":
+                return r["vernacularName"]
+    except Exception as exc:
+        # Deliberately broad, but unlike a bare `except:` it lets
+        # KeyboardInterrupt / SystemExit through, and the failure is reported.
+        print(f" GBIF lookup failed for {name}: {exc}", file=sys.stderr)
+    return None
+
+# Plant families to create: (scientific name, German name, English name).
+# The scientific name is also the key that SPECIES entries refer to.
+FAMILIES = [
+    ("Fabaceae", "Hülsenfrüchtler", "Legumes"),
+    ("Rosaceae", "Rosengewächse", "Rose family"),
+    ("Brassicaceae", "Kreuzblütler", "Cabbage family"),
+    ("Apiaceae", "Doldenblütler", "Carrot family"),
+    ("Lamiaceae", "Lippenblütler", "Mint family"),
+    ("Asteraceae", "Korbblütler", "Daisy family"),
+    ("Solanaceae", "Nachtschattengewächse", "Nightshade family"),
+    ("Cucurbitaceae", "Kürbisgewächse", "Gourd family"),
+    ("Poaceae", "Süßgräser", "Grass family"),
+    ("Amaryllidaceae", "Amaryllisgewächse", "Amaryllis family"),
+    ("Boraginaceae", "Raublattgewächse", "Borage family"),
+    ("Adoxaceae", "Moschuskrautgewächse", "Moschatel family"),
+    ("Betulaceae", "Birkengewächse", "Birch family"),
+    ("Fagaceae", "Buchengewächse", "Beech family"),
+    ("Juglandaceae", "Walnussgewächse", "Walnut family"),
+    ("Caprifoliaceae", "Geißblattgewächse", "Honeysuckle family"),
+    ("Grossulariaceae", "Stachelbeergewächse", "Gooseberry family"),
+    ("Ericaceae", "Heidekrautgewächse", "Heath family"),
+    ("Moraceae", "Maulbeergewächse", "Mulberry family"),
+    ("Urticaceae", "Brennnesselgewächse", "Nettle family"),
+    ("Malvaceae", "Malvengewächse", "Mallow family"),
+    ("Polygonaceae", "Knöterichgewächse", "Buckwheat family"),
+    ("Chenopodiaceae", "Gänsefußgewächse", "Goosefoot family"),
+    ("Asparagaceae", "Spargelgewächse", "Asparagus family"),
+    ("Plantaginaceae", "Wegerichgewächse", "Plantain family"),
+]
+
+# Species to create: (scientific name, family scientific name, extra fields).
+# The family name is looked up among the families created from FAMILIES; the
+# extra dict is merged verbatim into the payload posted to /species.
+SPECIES = [
+    ("Sambucus nigra", "Adoxaceae", {"plant_layer": "understory", "nitrogen_fixer": False, "food_uses": "Flowers (cordial, fritters), berries (cooked — syrup, wine)", "medicinal_uses": "Cold/flu remedy, immune support, diaphoretic", "succession_stage": "secondary"}),
+    ("Symphytum officinale", "Boraginaceae", {"plant_layer": "herbaceous", "dynamic_accumulator": True, "food_uses": "Young leaves (limited, contains pyrrolizidine alkaloids)", "medicinal_uses": "Wound healing, bone knitting (external only)", "other_uses": "Dynamic accumulator, mulch/compost activator, animal fodder"}),
+    ("Trifolium pratense", "Fabaceae", {"plant_layer": "ground_cover", "nitrogen_fixer": True, "food_uses": "Flowers, young leaves", "medicinal_uses": "Respiratory, menopausal symptoms", "other_uses": "Green manure, nitrogen fixer, bee forage"}),
+    ("Corylus avellana", "Betulaceae", {"plant_layer": "shrub", "food_uses": "Nuts", "other_uses": "Coppice wood, hedging, wildlife habitat", "succession_stage": "secondary"}),
+    ("Ribes nigrum", "Grossulariaceae", {"plant_layer": "shrub", "food_uses": "Berries, leaves (tea)", "medicinal_uses": "High vitamin C, anti-inflammatory"}),
+    ("Rubus idaeus", "Rosaceae", {"plant_layer": "shrub", "food_uses": "Berries, leaves (tea)", "medicinal_uses": "Leaf tea for pregnancy/digestion", "succession_stage": "pioneer"}),
+    ("Urtica dioica", "Urticaceae", {"plant_layer": "herbaceous", "dynamic_accumulator": True, "food_uses": "Young leaves, seeds", "medicinal_uses": "Anti-inflammatory, prostate, allergies", "other_uses": "Compost activator, fibre, liquid fertiliser"}),
+    ("Borago officinalis", "Boraginaceae", {"plant_layer": "herbaceous", "food_uses": "Flowers, young leaves", "other_uses": "Bee forage, companion plant", "attracts_pollinators": True}),
+    ("Lavandula angustifolia", "Lamiaceae", {"plant_layer": "herbaceous", "food_uses": "Flowers", "medicinal_uses": "Calming, antiseptic, sleep aid", "other_uses": "Bee forage, pest repellent, fragrance", "attracts_pollinators": True}),
+    ("Malus domestica", "Rosaceae", {"plant_layer": "canopy", "food_uses": "Fruit", "pollination_type": "Insect-pollinated"}),
+    ("Prunus domestica", "Rosaceae", {"plant_layer": "canopy", "food_uses": "Fruit", "pollination_type": "Insect-pollinated"}),
+    ("Juglans regia", "Juglandaceae", {"plant_layer": "canopy", "food_uses": "Nuts", "other_uses": "Timber, dye", "allelopathic": True}),
+    ("Fragaria vesca", "Rosaceae", {"plant_layer": "ground_cover", "food_uses": "Berries, leaves (tea)", "ground_cover_quality": "Good"}),
+    ("Allium ursinum", "Amaryllidaceae", {"plant_layer": "ground_cover", "food_uses": "Leaves, flowers, bulbs", "medicinal_uses": "Antimicrobial, blood pressure"}),
+    ("Phacelia tanacetifolia", "Boraginaceae", {"plant_layer": "herbaceous", "other_uses": "Green manure, bee forage, cover crop", "attracts_pollinators": True}),
+    ("Lupinus polyphyllus", "Fabaceae", {"plant_layer": "herbaceous", "nitrogen_fixer": True, "other_uses": "Nitrogen fixer, green manure, ornamental"}),
+    ("Vicia faba", "Fabaceae", {"plant_layer": "herbaceous", "nitrogen_fixer": True, "food_uses": "Beans", "other_uses": "Nitrogen fixer, green manure"}),
+    ("Solanum lycopersicum", "Solanaceae", {"plant_layer": "herbaceous", "food_uses": "Fruit"}),
+    ("Cucurbita pepo", "Cucurbitaceae", {"plant_layer": "ground_cover", "food_uses": "Fruit, seeds, flowers"}),
+    ("Beta vulgaris", "Chenopodiaceae", {"plant_layer": "herbaceous", "food_uses": "Roots, leaves"}),
+    ("Daucus carota", "Apiaceae", {"plant_layer": "herbaceous", "food_uses": "Root"}),
+    ("Calendula officinalis", "Asteraceae", {"plant_layer": "herbaceous", "food_uses": "Flowers", "medicinal_uses": "Wound healing, anti-inflammatory, skin care", "other_uses": "Companion plant, pest deterrent", "attracts_pollinators": True}),
+    ("Melissa officinalis", "Lamiaceae", {"plant_layer": "herbaceous", "food_uses": "Leaves", "medicinal_uses": "Calming, antiviral, digestive", "attracts_pollinators": True}),
+    ("Salvia officinalis", "Lamiaceae", {"plant_layer": "herbaceous", "food_uses": "Leaves", "medicinal_uses": "Sore throat, digestive, antimicrobial"}),
+    ("Thymus vulgaris", "Lamiaceae", {"plant_layer": "ground_cover", "food_uses": "Leaves", "medicinal_uses": "Respiratory, antimicrobial, cough"}),
+]
+
+# Create families
+# Each created family's id is remembered under its scientific name so the
+# species below can reference it.
+print("=== Creating families ===")
+family_map = {}
+for sci, de, en in FAMILIES:
+    family_payload = {"name_scientific": sci, "name_de": de, "name_en": en}
+    r = api_post("/families", family_payload)
+    if r:
+        family_map[sci] = r["id"]
+        print(f" ✓ {sci}")
+    time.sleep(0.05)
+print(f"Created {len(family_map)} families\n")
+
+# Create species
+# Species whose family was not created above are reported and skipped.
+print("=== Creating species (with GBIF German names) ===")
+created = 0
+for sci_name, family_sci, extra in SPECIES:
+    fam_id = family_map.get(family_sci)
+    if not fam_id:
+        print(f" ✗ {sci_name} — family {family_sci} missing")
+        continue
+    de_name = gbif_de_name(sci_name)
+    data = {
+        "name_scientific": sci_name,
+        "name_de": de_name or "",
+        "name_en": "",
+        "family_id": fam_id,
+    }
+    # Extra fields are applied last, so they win on a key collision.
+    data.update(extra)
+    r = api_post("/species", data)
+    if r:
+        created += 1
+        print(f" ✓ {sci_name} → {de_name or '(no DE name)'}")
+    time.sleep(0.15)
+print(f"Created {created} species\n")
+
+# Create suppliers
+print("=== Creating suppliers ===")
+supplier_fields = ("name", "url", "country", "is_organic", "is_demeter", "notes")
+supplier_rows = [
+    ("Reinsaat", "https://www.reinsaat.at", "AT", True, True, "Austrian biodynamic seed producer, open-pollinated varieties"),
+    ("Magic Garden Seeds", "https://www.magicgardenseeds.com", "DE", False, False, "Specialist seed shop with rare and heritage varieties"),
+]
+for supplier_row in supplier_rows:
+    r = api_post("/suppliers", dict(zip(supplier_fields, supplier_row)))
+    if r:
+        print(f" ✓ {supplier_row[0]}")
+print("\nDone!")
diff --git a/tools/scrapers/scrape_arche_noah.py b/tools/scrapers/scrape_arche_noah.py
new file mode 100644
index 0000000..0b58f3a
--- /dev/null
+++ b/tools/scrapers/scrape_arche_noah.py
@@ -0,0 +1,514 @@
+#!/usr/bin/env python3
+"""
+Scrape Arche Noah seed catalog and import cultivars into HerbAPI.
+
+Uses the shop.arche-noah.at Angular SPA's backend API (ACM) to fetch
+product listings and details, then creates cultivars in HerbAPI matched
+to existing species.
+"""
+
+import json
+import re
+import time
+import urllib.request
+import urllib.error
+import urllib.parse
+import sys
+from datetime import datetime, timezone
+
+# --- Configuration -----------------------------------------------------------
+
+# SECURITY NOTE(review): HERBAPI_TOKEN is a bearer credential committed in
+# plain text; it should be rotated and loaded from the environment instead.
+HERBAPI_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
+HERBAPI_TOKEN = "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
+
+# Base URL of the shop's backend API and the browser User-Agent sent with it.
+SHOP_BASE = "https://shop.arche-noah.at/ACM/api/"
+SHOP_UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+
+REQUEST_DELAY = 0.5 # seconds between requests
+
+# Only import products from these Arche Noah article lines (their own seeds)
+ARCHE_NOAH_LINES = {
+    "Bio-Saatgut von ARCHE NOAH",
+    "Kostbarkeiten aus dem ARCHE NOAH Samenarchiv",
+}
+
+# Search terms to discover all seed products across the shop
+# A few terms appear twice ("Bohne", "Mangold"); the product fetcher skips
+# terms it has already searched, so the repeats cost no extra requests.
+SEARCH_TERMS = [
+    "Tomate", "Paradeiser", "Paprika", "Chili", "Gurke", "Kürbis", "Zucchini",
+    "Bohne", "Erbse", "Fisole", "Salat", "Kohl", "Kraut", "Melanzani", "Aubergine",
+    "Mais", "Zwiebel", "Lauch", "Karotte", "Rübe", "Basilikum", "Kräuter",
+    "Blume", "Sonnenblume", "Dill", "Petersilie", "Spinat", "Mangold",
+    "Melone", "Fenchel", "Sellerie", "Rettich", "Radieschen",
+    "Koriander", "Oregano", "Thymian", "Salbei", "Rosmarin", "Minze",
+    "Ringelblume", "Kornblume", "Kapuzinerkresse", "Senf",
+    "Erdbeere", "Lupine", "Luzerne", "Klee", "Bohne", "Mohn",
+    "Radicchio", "Rucola", "Endivie", "Artischocke", "Pastinake",
+    "Schnittlauch", "Knoblauch", "Bärlauch", "Wermut",
+    "Baldrian", "Johanniskraut", "Sonnenhut", "Beinwell",
+    "Studentenblume", "Tagetes", "Phacelia", "Buchweizen",
+    "Rote Bete", "Rote Rübe", "Mangold", "Melde",
+    "Kohlrabi", "Brokkoli", "Blumenkohl", "Rosenkohl", "Wirsing",
+    "Pflücksalat", "Kopfsalat", "Feldsalat", "Asiasalat",
+    "Zuckermais", "Popcorn",
+]
+
+# --- Helpers -----------------------------------------------------------------
+
+def herbapi_request(method, path, data=None):
+    """Make a request to HerbAPI.
+
+    Args:
+        method: HTTP method, e.g. ``"GET"`` or ``"POST"``.
+        path: path relative to ``HERBAPI_BASE`` (without a leading slash).
+        data: optional JSON-serialisable payload. Any value other than None is
+            sent, including empty containers such as ``{}``.
+
+    Returns:
+        The decoded JSON response, or None when the response body is blank.
+
+    Raises:
+        urllib.error.HTTPError: re-raised after the status and the first 200
+            characters of the error body were written to stderr.
+    """
+    url = f"{HERBAPI_BASE}/{path}"
+    payload = json.dumps(data).encode() if data is not None else None
+    req = urllib.request.Request(url, data=payload, method=method, headers={
+        "Authorization": f"Bearer {HERBAPI_TOKEN}",
+        "Content-Type": "application/json",
+        "Accept": "application/json",
+    })
+    try:
+        # The context manager closes the connection once the body is read.
+        with urllib.request.urlopen(req, timeout=30) as resp:
+            raw = resp.read().decode("utf-8")
+    except urllib.error.HTTPError as e:
+        error_text = e.read().decode("utf-8", errors="replace")
+        print(f" HerbAPI {method} {path}: HTTP {e.code} - {error_text[:200]}", file=sys.stderr)
+        raise
+    return json.loads(raw) if raw.strip() else None
+
+
+def shop_create_session():
+    """Create an anonymous session on the Arche Noah shop.
+
+    Posts an empty JSON object to ``webshop/createanonymoususer`` and reads the
+    ``JSESSIONID`` cookie from the response.
+
+    Returns:
+        The session id string.
+
+    Raises:
+        RuntimeError: if no ``Set-Cookie`` header carries a non-empty JSESSIONID.
+    """
+    req = urllib.request.Request(
+        SHOP_BASE + "webshop/createanonymoususer",
+        data=json.dumps({}).encode(),
+        headers={
+            "User-Agent": SHOP_UA,
+            "Content-Type": "application/json",
+            "Origin": "https://shop.arche-noah.at",
+            "Referer": "https://shop.arche-noah.at/",
+        },
+    )
+    with urllib.request.urlopen(req, timeout=15) as resp:
+        # A response may carry several Set-Cookie headers; look at all of
+        # them, not just the first one.
+        cookie_headers = resp.headers.get_all("Set-Cookie") or []
+
+    for cookie in cookie_headers:
+        if "JSESSIONID=" in cookie:
+            session = cookie.split("JSESSIONID=")[1].split(";")[0]
+            if session:
+                return session
+    raise RuntimeError("Failed to get shop session")
+
+
+def shop_request(session, endpoint, payload):
+    """Make a POST request to the shop API.
+
+    Args:
+        session: JSESSIONID value sent as cookie.
+        endpoint: path appended to ``SHOP_BASE``.
+        payload: JSON-serialisable request body.
+
+    Returns:
+        The decoded JSON response, or None when the response body is blank.
+        HTTP and connection errors raised by urllib are not caught here.
+    """
+    req = urllib.request.Request(
+        SHOP_BASE + endpoint,
+        data=json.dumps(payload).encode(),
+        headers={
+            "User-Agent": SHOP_UA,
+            "Content-Type": "application/json",
+            "Accept": "application/json",
+            "Cookie": f"JSESSIONID={session}",
+            "Origin": "https://shop.arche-noah.at",
+            "Referer": "https://shop.arche-noah.at/",
+        },
+    )
+    # The context manager closes the connection once the body is read.
+    with urllib.request.urlopen(req, timeout=30) as resp:
+        raw = resp.read().decode("utf-8")
+    return json.loads(raw) if raw.strip() else None
+
+
+def extract_latin_name(detail_headline3):
+    """Return the botanical name found in a product's headline3 field, or None.
+
+    HTML tags and everything from "Hier geht" onwards are dropped. The rest is
+    returned only if it starts like a binomial: a capitalised word, a space,
+    then a lowercase letter (e.g. "Solanum lycopersicum").
+    """
+    if not detail_headline3:
+        return None
+
+    without_tags = re.sub(r"<[^>]+>", "", detail_headline3).strip()
+    candidate = without_tags.split("Hier geht")[0].strip()
+
+    if not candidate:
+        return None
+    if re.match(r"^[A-Z][a-z]+ [a-z]", candidate) is None:
+        return None
+    return candidate
+
+
+def match_species(latin_name, species_by_scientific):
+    """Look up the species record for a Latin name, tolerating rank suffixes.
+
+    The lowercased, stripped name is tried first. If that misses, only the
+    leading "genus species" pair is tried, so "Phaseolus vulgaris var. nanus"
+    (or a "subsp.", "convar.", "f." form) resolves to "Phaseolus vulgaris".
+
+    Args:
+        latin_name: botanical name, may be None or empty.
+        species_by_scientific: dict keyed by lowercase scientific name.
+
+    Returns:
+        The matching dict value, or None when nothing matches.
+    """
+    if not latin_name:
+        return None
+
+    normalized = latin_name.strip().lower()
+    candidates = [normalized]
+
+    # Genus + species epithet only, with any qualifier after it cut off.
+    binomial = re.match(r"^([A-Za-z]+ [a-z]+)", normalized)
+    if binomial:
+        candidates.append(binomial.group(1).strip())
+
+    for key in candidates:
+        hit = species_by_scientific.get(key)
+        if hit:
+            return hit
+    return None
+
+
+def extract_cultivar_name(product_name):
+    """Derive the cultivar name from a shop product name.
+
+    A quoted part wins (straight, curly, backtick or acute-accent quotes):
+        "Salatparadeiser 'Naama' HG026" -> "Naama"
+        "Cocktailparadeiser 'Golden Perfection' TO019" -> "Golden Perfection"
+        "Buschbohne 'Marmorierter Mond' HG055" -> "Marmorierter Mond"
+    Without quotes, the name is returned with only a trailing article number
+    (1-3 capitals followed by 2-4 digits) removed; the type prefix stays.
+    """
+    quoted = re.search(r"['\u2018\u2019`\u00b4]+([^'\u2018\u2019`\u00b4]+)['\u2018\u2019`\u00b4]+", product_name)
+    if quoted is None:
+        # No quoted part: strip the article number such as HG026 or TO019.
+        return re.sub(r"\s+[A-Z]{1,3}\d{2,4}\s*$", "", product_name).strip()
+    return quoted.group(1).strip()
+
+
+def parse_pack_info(unit_desc):
+    """
+    Parse pack size info from unitDesc like '20-30 Korn' or '2g'.
+
+    The quantity may use a decimal comma ('0,5 g') and may be a range, with or
+    without spaces around the dash ('20-30 Korn', '20 - 30 Korn'); for a range
+    the lower bound is used. The unit must start with a letter, so input
+    without a unit ('100') or with an unsupported number format is rejected
+    rather than being split in the middle of the number.
+
+    Returns (pack_size, pack_unit) or (None, None).
+    """
+    if not unit_desc:
+        return None, None
+    number = r"\d+(?:,\d+)?"
+    m = re.match(
+        rf"\s*({number})(?:\s*[-\u2013]\s*{number})?\s*([^\W\d_]\w*)",
+        unit_desc,
+    )
+    if m:
+        # "20-30 Korn" -> take the lower bound; decimal comma -> float syntax
+        return float(m.group(1).replace(",", ".")), m.group(2)
+    return None, None
+
+
+# --- Main scraping logic -----------------------------------------------------
+
+def fetch_all_arche_noah_products(session):
+    """Collect Arche Noah's own seed products by running every search term.
+
+    Each distinct term in SEARCH_TERMS (compared case-insensitively) is
+    searched page by page, 200 products at a time, until a page comes back
+    empty or short, or a request fails. Products are de-duplicated by their
+    ``sid``; the first occurrence is kept.
+
+    Returns:
+        Dict mapping ``sid`` to product dict, limited to products whose
+        ``articleLineDesc`` is one of ARCHE_NOAH_LINES.
+    """
+    page_size = 200
+    by_sid = {}
+    searched = set()
+
+    for term in SEARCH_TERMS:
+        term_key = term.lower()
+        if term_key in searched:
+            continue
+        searched.add(term_key)
+
+        start = 0
+        while True:
+            query = {
+                "searchCriteria": term,
+                "startIndex": start,
+                "numDataSets": page_size,
+                "allowAllProducts": False,
+            }
+            try:
+                batch = shop_request(session, "webshop/getproducts", query)
+            except Exception as e:
+                print(f" Search '{term}' offset={start} failed: {e}", file=sys.stderr)
+                break
+
+            if not batch:
+                break
+
+            for item in batch:
+                by_sid.setdefault(item["sid"], item)
+
+            # A short page is the last page for this term.
+            if len(batch) < page_size:
+                break
+            start += len(batch)
+            time.sleep(REQUEST_DELAY)
+
+        time.sleep(REQUEST_DELAY)
+
+    # Filter to Arche Noah's own seed products only
+    own_products = {}
+    for sid, item in by_sid.items():
+        if (item.get("articleLineDesc") or "") in ARCHE_NOAH_LINES:
+            own_products[sid] = item
+
+    print(f"Found {len(by_sid)} total products, {len(own_products)} Arche Noah seed products")
+    return own_products
+
+
+def fetch_product_details(session, products):
+    """Fetch the detail record of every product, keyed by ``sid``.
+
+    Products whose request fails or returns nothing are left out of the
+    result; failures are reported on stderr. Progress is printed after every
+    20th product, and REQUEST_DELAY is slept after each request.
+    """
+    details = {}
+    total = len(products)
+
+    for done, sid in enumerate(products, start=1):
+        try:
+            detail = shop_request(session, "webshop/getproductdetail", {"productSid": sid})
+        except Exception as e:
+            print(f" Detail for {sid} failed: {e}", file=sys.stderr)
+        else:
+            if detail:
+                details[sid] = detail
+
+        if done % 20 == 0:
+            print(f" Fetched details: {done}/{total}")
+        time.sleep(REQUEST_DELAY)
+
+    print(f"Fetched {len(details)} product details")
+    return details
+
+
+def load_herbapi_species():
+    """Load all species from HerbAPI and index them by scientific name.
+
+    Pages of 100 are requested until the number collected reaches the
+    response's ``total`` (treated as 0 when absent) or a page is empty. A bare
+    list response is taken as the complete result.
+
+    Returns:
+        (species_list, by_scientific), where by_scientific maps the stripped,
+        lowercased ``name_scientific`` to the species dict.
+    """
+    species_list = []
+    page = 1
+    while True:
+        result = herbapi_request("GET", f"species?per_page=100&page={page}")
+        if isinstance(result, dict) and "data" in result:
+            batch, expected = result["data"], result.get("total", 0)
+        elif isinstance(result, list):
+            batch, expected = result, len(result)
+        else:
+            break
+
+        species_list.extend(batch)
+        finished = len(species_list) >= expected
+        if finished or not batch:
+            break
+        page += 1
+
+    # Build lookup by scientific name (normalized lowercase)
+    by_scientific = {
+        entry["name_scientific"].strip().lower(): entry
+        for entry in species_list
+    }
+    return species_list, by_scientific
+
+
+def load_herbapi_cultivars():
+    """Load all existing cultivars from HerbAPI and index them for de-duplication.
+
+    Pages of 100 are requested until the number collected reaches the
+    response's ``total`` (treated as 0 when absent) or a page is empty. A bare
+    list response is taken as the complete result.
+
+    Returns:
+        (all_cultivars, by_key), where by_key maps
+        ``(species_id, stripped lowercase name)`` to the cultivar dict.
+    """
+    all_cultivars = []
+    page = 1
+    while True:
+        result = herbapi_request("GET", f"cultivars?per_page=100&page={page}")
+        if isinstance(result, dict) and "data" in result:
+            batch, expected = result["data"], result.get("total", 0)
+        elif isinstance(result, list):
+            batch, expected = result, len(result)
+        else:
+            break
+
+        all_cultivars.extend(batch)
+        finished = len(all_cultivars) >= expected
+        if finished or not batch:
+            break
+        page += 1
+
+    # Build lookup by (species_id, normalized cultivar name)
+    by_key = {
+        (entry["species_id"], entry["name"].strip().lower()): entry
+        for entry in all_cultivars
+    }
+    return all_cultivars, by_key
+
+
+def ensure_supplier():
+    """Return the id of the Arche Noah supplier, creating the record if needed.
+
+    An existing supplier counts as a match when its name contains both
+    "arche" and "noah" (case-insensitive).
+    """
+    response = herbapi_request("GET", "suppliers")
+    wrapped = isinstance(response, dict) and "data" in response
+    suppliers = response["data"] if wrapped else response
+
+    for s in suppliers:
+        lowered = s["name"].lower()
+        if "arche" in lowered and "noah" in lowered:
+            print(f"Supplier 'Arche Noah' already exists: {s['id']}")
+            return s["id"]
+
+    print("Creating supplier 'Arche Noah'...")
+    new_supplier = {
+        "name": "Arche Noah",
+        "url": "https://www.arche-noah.at",
+        "country": "AT",
+        "is_organic": True,
+        "is_demeter": False,
+        "notes": "Austrian society for heritage seed preservation and biodiversity",
+    }
+    result = herbapi_request("POST", "suppliers", new_supplier)
+    supplier_id = result["id"]
+    print(f"Created supplier: {supplier_id}")
+    return supplier_id
+
+
+def load_existing_supplier_links(cultivar_id):
+    """Return the supplier links of a cultivar, or [] if they cannot be loaded.
+
+    Accepts either a bare list response or a dict with a "data" entry. Any
+    request failure is treated as "no links".
+    """
+    try:
+        result = herbapi_request("GET", f"cultivars/{cultivar_id}/suppliers")
+    except Exception:
+        return []
+
+    if isinstance(result, list):
+        return result
+    if isinstance(result, dict) and "data" in result:
+        return result["data"]
+    return []
+
+
+def main():
+    """Scrape the Arche Noah catalog and import its cultivars into HerbAPI.
+
+    Steps: ensure the supplier record exists, load HerbAPI species and
+    existing cultivars, scrape the shop's product list and product details,
+    then for every product create the cultivar (unless one with the same
+    species and name already exists) and link it to the supplier (unless the
+    link already exists). A summary of the counters is printed at the end.
+    """
+    now_str = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+    print(f"=== Arche Noah Seed Catalog Scraper ===")
+    print(f"Started at {now_str}\n")
+
+    # Step 1: Create Arche Noah supplier in HerbAPI
+    print("[1/6] Ensuring Arche Noah supplier exists...")
+    supplier_id = ensure_supplier()
+    print()
+
+    # Step 2: Load HerbAPI species for matching
+    print("[2/6] Loading HerbAPI species...")
+    species_list, species_by_scientific = load_herbapi_species()
+    print(f"Loaded {len(species_list)} species")
+    print()
+
+    # Step 3: Load existing cultivars for idempotency
+    print("[3/6] Loading existing cultivars...")
+    existing_cultivars, cultivars_by_key = load_herbapi_cultivars()
+    print(f"Loaded {len(existing_cultivars)} existing cultivars")
+    print()
+
+    # Step 4: Scrape Arche Noah shop
+    print("[4/6] Scraping Arche Noah shop catalog...")
+    session = shop_create_session()
+    print(f"Got shop session")
+    products = fetch_all_arche_noah_products(session)
+    print()
+
+    # Step 5: Fetch product details (to get Latin names)
+    print("[5/6] Fetching product details for Latin name matching...")
+    details = fetch_product_details(session, products)
+    print()
+
+    # Step 6: Create cultivars in HerbAPI
+    print("[6/6] Creating cultivars in HerbAPI...")
+    stats = {
+        "created": 0,
+        "skipped_existing": 0,
+        "skipped_no_species": 0,
+        "supplier_linked": 0,
+        "supplier_link_existed": 0,
+        "errors": 0,
+    }
+
+    for sid, product in sorted(products.items()):
+        detail = details.get(sid, {})
+
+        # Extract Latin name from detail (None when it cannot be determined)
+        latin_name = extract_latin_name(detail.get("detailHeadline3", ""))
+
+        # Match to HerbAPI species (handles subspecies/variety suffixes)
+        species = match_species(latin_name, species_by_scientific)
+
+        if not species:
+            print(f" SKIP (no species match): {product['name']} | latin={latin_name}")
+            stats["skipped_no_species"] += 1
+            continue
+
+        # Extract cultivar name
+        cultivar_name = extract_cultivar_name(product["name"])
+        if not cultivar_name:
+            print(f" SKIP (no cultivar name): {product['name']}")
+            # Counted together with the species-match skips in the summary.
+            stats["skipped_no_species"] += 1
+            continue
+
+        # Build product URL once; both the cultivar and the supplier link use it
+        alias = product.get("alias") or detail.get("alias", "")
+        product_url = f"https://shop.arche-noah.at/produkt/{alias}" if alias else None
+
+        # Check if cultivar already exists (idempotency)
+        lookup_key = (species["id"], cultivar_name.strip().lower())
+        existing = cultivars_by_key.get(lookup_key)
+
+        if existing:
+            cultivar_id = existing["id"]
+            stats["skipped_existing"] += 1
+        else:
+            # Determine if this is organic
+            is_organic = product.get("articleLineDesc") == "Bio-Saatgut von ARCHE NOAH"
+
+            # Create cultivar
+            cultivar_data = {
+                "species_id": species["id"],
+                "name": cultivar_name,
+                "name_de": cultivar_name,
+                "is_organic": is_organic,
+                "source_urls": [product_url] if product_url else None,
+            }
+
+            try:
+                result = herbapi_request("POST", "cultivars", cultivar_data)
+                cultivar_id = result["id"]
+                stats["created"] += 1
+                # Add to lookup for idempotency within this run
+                cultivars_by_key[lookup_key] = result
+                print(f" CREATED: {cultivar_name} ({species['name_scientific']})")
+            except Exception as e:
+                print(f" ERROR creating '{cultivar_name}': {e}", file=sys.stderr)
+                stats["errors"] += 1
+                continue
+
+        # Link cultivar to supplier
+        existing_links = load_existing_supplier_links(cultivar_id)
+        already_linked = any(
+            link["supplier_id"] == supplier_id for link in existing_links
+        )
+
+        if already_linked:
+            stats["supplier_link_existed"] += 1
+        else:
+            # Parse pack info
+            unit_desc = product.get("unitDesc") or detail.get("unitDesc", "")
+            pack_size, pack_unit = parse_pack_info(unit_desc)
+
+            # Get price (first price-list position, if any)
+            price = None
+            price_list = product.get("priceListPos") or detail.get("priceListPos", [])
+            if price_list:
+                price = price_list[0].get("singleUnitPrice")
+
+            link_data = {
+                "supplier_id": supplier_id,
+                "article_number": str(product.get("articleNr", "")),
+                "product_url": product_url,
+                "price_eur": price,
+                "pack_size": pack_size,
+                "pack_unit": pack_unit,
+            }
+
+            try:
+                herbapi_request("POST", f"cultivars/{cultivar_id}/suppliers", link_data)
+                stats["supplier_linked"] += 1
+            except Exception as e:
+                print(f" ERROR linking supplier for '{cultivar_name}': {e}", file=sys.stderr)
+                stats["errors"] += 1
+
+        time.sleep(0.1) # small delay between HerbAPI calls
+
+    # Summary
+    print(f"\n{'='*60}")
+    print(f"Scraping complete!")
+    print(f" Cultivars created: {stats['created']}")
+    print(f" Cultivars already existed: {stats['skipped_existing']}")
+    print(f" Skipped (no species match): {stats['skipped_no_species']}")
+    print(f" Supplier links created: {stats['supplier_linked']}")
+    print(f" Supplier links existed: {stats['supplier_link_existed']}")
+    print(f" Errors: {stats['errors']}")
+
+# Script entry point: call main() only when this file is executed directly.
+if __name__ == "__main__":
+    main()
diff --git a/tools/scrapers/scrape_bingenheimer.py b/tools/scrapers/scrape_bingenheimer.py
new file mode 100644
index 0000000..b94ee16
--- /dev/null
+++ b/tools/scrapers/scrape_bingenheimer.py
@@ -0,0 +1,843 @@
+#!/usr/bin/env python3
+"""
+Scraper for Bingenheimer Saatgut (https://www.bingenheimersaatgut.de/)
+Extracts cultivar data and imports into HerbAPI.
+
+Categories scraped: Gemüse (vegetables), Kräuter (herbs), Gründüngung (green manure).
+"""
+
+import json
+import os
+import re
+import sys
+import time
+import urllib.error
+import urllib.parse
+import urllib.request
+from html.parser import HTMLParser
+from typing import Optional
+
+# ── Configuration ─────────────────────────────────────────────────────────
+# The HerbAPI endpoint and bearer token can be overridden through the
+# HERBAPI_BASE / HERBAPI_TOKEN environment variables; the literals below are
+# only fallbacks that keep existing invocations working.
+# NOTE(review): the fallback token is a credential committed in plain text.
+# It should be rotated and the default removed once the environment variable
+# is provisioned wherever this script runs.
+API_BASE = os.environ.get("HERBAPI_BASE", "http://herbapi01.corp.sub-net.at:8080/api/v1")
+API_TOKEN = os.environ.get("HERBAPI_TOKEN", "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk")
+SITE_BASE = "https://www.bingenheimersaatgut.de"
+DELAY = 0.5  # seconds slept around each request to the shop
+USER_AGENT = "HerbAPI-Scraper/1.0 (+https://sub-net.at)"
+
+# ── Category URLs to scrape ───────────────────────────────────────────────
+# Each entry is (url_path, default_species_scientific_name).
+# url_path is inserted into "<SITE_BASE>/de/bio-saatgut/<url_path>.html" by
+# scrape_category(). The default species is used for a product only when its
+# detail page yields no Latin name; None means such a product is skipped.
+
+# Vegetable ("Gemüse") categories.
+VEGETABLE_CATEGORIES = [
+    ("gemuese/tomaten", "Solanum lycopersicum"),
+    ("gemuese/gurken/gewuerzgurke", "Cucumis sativus"),
+    ("gemuese/gurken/salatgurken", "Cucumis sativus"),
+    ("gemuese/aubergine", "Solanum melongena"),
+    ("gemuese/bohnen/buschbohne", "Phaseolus vulgaris"),
+    ("gemuese/bohnen/stangenbohne", "Phaseolus vulgaris"),
+    ("gemuese/bohnen/dicke-bohne", "Vicia faba"),
+    ("gemuese/bohnen/feuerbohne", "Phaseolus coccineus"),
+    ("gemuese/bohnen/edamame-sojabohne", "Glycine max"),
+    ("gemuese/bohnen/spaghettibohne", "Vigna unguiculata"),
+    ("gemuese/erbsen/markerbse", "Pisum sativum"),
+    ("gemuese/erbsen/schalerbse", "Pisum sativum"),
+    ("gemuese/erbsen/zuckererbse", "Pisum sativum"),
+    ("gemuese/feldsalat", "Valerianella locusta"),
+    ("gemuese/knollenfenchel", "Foeniculum vulgare"),
+    ("gemuese/kohl/blumenkohl", "Brassica oleracea"),
+    ("gemuese/kohl/brokkoli", "Brassica oleracea"),
+    ("gemuese/kohl/chinakohlpak-choi", "Brassica rapa"),
+    ("gemuese/kohl/gruenkohl", "Brassica oleracea"),
+    ("gemuese/kohl/kohlrabi", "Brassica oleracea"),
+    ("gemuese/kohl/rotkohl", "Brassica oleracea"),
+    ("gemuese/kohl/weisskohl", "Brassica oleracea"),
+    ("gemuese/kohl/wirsing", "Brassica oleracea"),
+    ("gemuese/kohl/rosenkohl", "Brassica oleracea"),
+    ("gemuese/kresse", "Lepidium sativum"),
+    ("gemuese/kuerbis", "Cucurbita maxima"),
+    ("gemuese/zuckermais", "Zea mays"),
+    ("gemuese/mangold", "Beta vulgaris"),
+    ("gemuese/melone", "Cucumis melo"),
+    ("gemuese/moehren", "Daucus carota"),
+    ("gemuese/paprika/gemuesepaprika", "Capsicum annuum"),
+    ("gemuese/paprika/chili", "Capsicum annuum"),
+    ("gemuese/pastinaken", "Pastinaca sativa"),
+    ("gemuese/petersilienwurzel", "Petroselinum crispum"),
+    ("gemuese/physalis", "Physalis peruviana"),
+    ("gemuese/porreelauch", "Allium porrum"),
+    ("gemuese/radies", "Raphanus sativus"),
+    ("gemuese/rettich", "Raphanus sativus"),
+    ("gemuese/rote-bete", "Beta vulgaris"),
+    ("gemuese/rueben/mai-herbstruebennavets", "Brassica rapa"),
+    ("gemuese/rueben/kohlruebe", "Brassica napus"),
+    ("gemuese/rucola", "Eruca vesicaria"),
+    ("gemuese/salat/bataviasalat", "Lactuca sativa"),
+    ("gemuese/salat/eichblattsalat", "Lactuca sativa"),
+    ("gemuese/salat/eissalat", "Lactuca sativa"),
+    ("gemuese/salat/endivien", "Cichorium endivia"),
+    ("gemuese/salat/hirschhornwegerich", "Plantago coronopus"),
+    ("gemuese/salat/kopfsalat", "Lactuca sativa"),
+    ("gemuese/salat/lollosalat", "Lactuca sativa"),
+    ("gemuese/salat/romanasalat", "Lactuca sativa"),
+    ("gemuese/salat/baby-leaf", "Lactuca sativa"),
+    ("gemuese/sellerie/knollensellerie", "Apium graveolens"),
+    ("gemuese/sellerie/stangen--bleichsellerie", "Apium graveolens"),
+    ("gemuese/spinatspinat-aehnliche/spinat", "Spinacia oleracea"),
+    ("gemuese/spinatspinat-aehnliche/neuseelaender-spinat", "Tetragonia tetragonioides"),
+    ("gemuese/blattstielgemuese", "Beta vulgaris"),
+    ("gemuese/zwiebeln", "Allium cepa"),
+    ("gemuese/lauchzwiebeln", "Allium fistulosum"),
+    ("gemuese/artischocke", "Cynara cardunculus"),
+    ("gemuese/asia-salate", "Brassica juncea"),
+    ("gemuese/chicoree", "Cichorium intybus"),
+    ("gemuese/schwarz-haferwurzel", "Scorzonera hispanica"),
+    ("gemuese/winterpostelein", "Claytonia perfoliata"),
+    ("gemuese/zucchini", "Cucurbita pepo"),
+    ("gemuese/catalogna", "Cichorium intybus"),
+    ("gemuese/zichoriensalate", "Cichorium intybus"),
+]
+
+# Herb ("Kräuter") categories.
+HERB_CATEGORIES = [
+    ("kraeuter/basilikum", "Ocimum basilicum"),
+    ("kraeuter/bohnenkraut", "Satureja hortensis"),
+    ("kraeuter/borretsch", "Borago officinalis"),
+    ("kraeuter/dill", "Anethum graveolens"),
+    ("kraeuter/kuemmel", "Carum carvi"),
+    ("kraeuter/kerbel", "Anthriscus cerefolium"),
+    ("kraeuter/koriander", "Coriandrum sativum"),
+    ("kraeuter/gewuerzfenchel", "Foeniculum vulgare"),
+    ("kraeuter/kultursauerampfer", "Rumex acetosa"),
+    ("kraeuter/lavendel", "Lavandula angustifolia"),
+    ("kraeuter/liebstock", "Levisticum officinale"),
+    ("kraeuter/majoran", "Origanum majorana"),
+    ("kraeuter/oregano", "Origanum vulgare"),
+    ("kraeuter/pimpinelle", "Sanguisorba minor"),
+    ("kraeuter/estragon", "Artemisia dracunculus"),
+    ("kraeuter/salbei", "Salvia officinalis"),
+    ("kraeuter/schnittlauch", "Allium schoenoprasum"),
+    ("kraeuter/schnittknoblauch", "Allium tuberosum"),
+    ("kraeuter/schwarzkuemmel", "Nigella sativa"),
+    ("kraeuter/speisechrysantheme", "Glebionis coronaria"),
+    ("kraeuter/thymian", "Thymus vulgaris"),
+    ("kraeuter/ysop", "Hyssopus officinalis"),
+    ("kraeuter/winterkresse", "Barbarea vulgaris"),
+    ("kraeuter/brunnenkresse", "Nasturtium officinale"),
+    ("kraeuter/melisse", "Melissa officinalis"),
+    ("kraeuter/petersilie", "Petroselinum crispum"),
+    ("kraeuter/schnittsellerie", "Apium graveolens"),
+    ("kraeuter/beifuss", "Artemisia vulgaris"),
+]
+
+# Green manure ("Gründüngung"): a single listing with no default species, so
+# each product needs a Latin name on its detail page to be imported.
+GREEN_MANURE_CATEGORIES = [
+    ("gruenduengung", None),
+]
+
+# Everything main() iterates over, in scrape order.
+ALL_CATEGORIES = VEGETABLE_CATEGORIES + HERB_CATEGORIES + GREEN_MANURE_CATEGORIES
+
+# ── Stats ─────────────────────────────────────────────────────────────────
+# Run-wide bookkeeping that main() prints in its summary. All values are
+# integer counters except the last two, which collect strings.
+stats = {
+    "categories_scraped": 0,
+    "products_found": 0,
+    "detail_pages_fetched": 0,
+    "cultivars_created": 0,
+    "cultivars_existed": 0,
+    "supplier_links_created": 0,
+    "supplier_links_existed": 0,
+    "species_created": 0,
+    "families_created": 0,
+    "species_not_matched": [],  # Latin names / product names with no usable species
+    "errors": [],  # human-readable error messages; non-empty => exit code 1
+}
+
+
+# ── HTTP helpers ──────────────────────────────────────────────────────────
+def fetch_page(url: str) -> str:
+    """Download ``url`` and return its body as text.
+
+    The request carries the scraper's User-Agent and uses a 30 second
+    timeout. The body is decoded as UTF-8 with undecodable bytes replaced.
+    An HTTP 404 yields an empty string; any other HTTP error is re-raised.
+    """
+    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
+    try:
+        with urllib.request.urlopen(request, timeout=30) as response:
+            payload = response.read()
+        return payload.decode("utf-8", errors="replace")
+    except urllib.error.HTTPError as http_err:
+        if http_err.code != 404:
+            raise
+        return ""
+
+
+def api_get(path: str, params: dict = None) -> dict:
+    """Send an authenticated GET to HerbAPI and return the decoded JSON body.
+
+    ``path`` is appended to API_BASE. When ``params`` is non-empty it is
+    URL-encoded into the query string. HTTP errors are not handled here and
+    propagate to the caller.
+    """
+    query = f"?{urllib.parse.urlencode(params)}" if params else ""
+    request = urllib.request.Request(
+        f"{API_BASE}{path}{query}",
+        headers={
+            "Authorization": f"Bearer {API_TOKEN}",
+            "Accept": "application/json",
+        },
+    )
+    with urllib.request.urlopen(request, timeout=30) as response:
+        payload = response.read()
+    return json.loads(payload)
+
+
+def api_post(path: str, data: dict) -> tuple:
+    """POST ``data`` as JSON to HerbAPI and return ``(body, status)``.
+
+    On success ``body`` is the decoded JSON response and ``status`` the HTTP
+    status. An HTTP error is not raised: ``body`` is then
+    ``{"error": <response text>, "_status": <code>}`` and ``status`` is the
+    error code, so callers branch on the returned values.
+    """
+    request = urllib.request.Request(
+        f"{API_BASE}{path}",
+        data=json.dumps(data).encode("utf-8"),
+        method="POST",
+        headers={
+            "Authorization": f"Bearer {API_TOKEN}",
+            "Content-Type": "application/json",
+            "Accept": "application/json",
+        },
+    )
+    try:
+        with urllib.request.urlopen(request, timeout=30) as response:
+            return json.loads(response.read()), response.status
+    except urllib.error.HTTPError as http_err:
+        failure_text = http_err.read().decode("utf-8", errors="replace")
+        return {"error": failure_text, "_status": http_err.code}, http_err.code
+
+
+# ── HTML parsing helpers ──────────────────────────────────────────────────
+def parse_product_links(html: str) -> list:
+    """Extract ``(url, name)`` pairs for the products on a listing page.
+
+    Anchors carrying the ``product-item-link`` class whose href contains
+    ``/de/bio-saatgut/`` are tried first. Only when none of those are found
+    does a broader fallback run: any href below a gemuese / kraeuter /
+    gruenduengung category that is directly followed by at least three
+    characters of plain text. Relative URLs are prefixed with SITE_BASE and
+    each URL is returned once, in first-seen order.
+    """
+    links = []
+    # Magento product-item-link pattern. The match is bounded by the anchor's
+    # opening and closing tags so that group 2 is the whole link content,
+    # including any nested markup (stripped below).
+    pattern = re.compile(
+        r'<a\s[^>]*href="([^"]*?/de/bio-saatgut/[^"]+?)"[^>]*class="[^"]*product-item-link[^"]*"[^>]*>\s*(.*?)\s*</a>',
+        re.DOTALL | re.IGNORECASE
+    )
+    for match in pattern.finditer(html):
+        url = match.group(1)
+        name = re.sub(r'<[^>]+>', '', match.group(2)).strip()
+        if name:
+            if not url.startswith("http"):
+                url = SITE_BASE + url
+            links.append((url, name))
+
+    if not links:
+        # Broader pattern for product detail links
+        pattern2 = re.compile(
+            r'href="([^"]*?/de/bio-saatgut/(?:gemuese|kraeuter|gruenduengung)/[^"]+?/[^"/.]+)"[^>]*>\s*([^<]{3,})',
+            re.IGNORECASE
+        )
+        for match in pattern2.finditer(html):
+            url = match.group(1).strip()
+            name = match.group(2).strip()
+            # Category pages end in ".html"; only extension-less detail URLs count.
+            if name and not url.endswith(".html"):
+                if not url.startswith("http"):
+                    url = SITE_BASE + url
+                links.append((url, name))
+
+    # Deduplicate by URL, keeping the first name seen for each.
+    seen_urls = set()
+    unique = []
+    for url, name in links:
+        if url not in seen_urls:
+            seen_urls.add(url)
+            unique.append((url, name))
+    return unique
+
+
+def extract_latin_from_detail(html: str) -> Optional[str]:
+    """Return the botanical (Latin) name found on a product detail page.
+
+    Three patterns are tried in order: a binomial wrapped in ``<em>``/``<i>``
+    (optionally followed by ``var.``/``subsp.`` and an epithet), a binomial
+    inside an element whose class mentions botanical/latin/species, and a
+    binomial after a "Botanischer Name" / "Lateinischer Name" / "Art" label.
+    Matching is case-insensitive, so every candidate is additionally required
+    to start with an upper-case genus and a lower-case epithet. All matches
+    of a pattern are examined before moving on to the next pattern.
+
+    Returns ``None`` when no candidate passes.
+    """
+    patterns = [
+        # \b keeps <img>, <input>, ... from being taken for <i>; the closing
+        # tag anchors the name to the end of the emphasised text.
+        r'<(?:em|i)\b[^>]*>\s*([A-Z][a-z]+\s+[a-z]{2,}(?:\s+(?:var\.|subsp\.)\s+[a-z]+)?)\s*</(?:em|i)>',
+        r'class="[^"]*(?:botanical|latin|species)[^"]*"[^>]*>\s*([A-Z][a-z]+\s+[a-z]{2,})',
+        r'(?:Botanischer?\s+Name|Lateinischer?\s+Name|Art)\s*:?\s*(?:<[^>]+>)*\s*([A-Z][a-z]+\s+[a-z]{2,})',
+    ]
+    for pat in patterns:
+        for m in re.finditer(pat, html, re.IGNORECASE):
+            name = m.group(1).strip()
+            parts = name.split()
+            # IGNORECASE disables the [A-Z]/[a-z] distinction, so re-check it.
+            if len(parts) >= 2 and parts[0][0].isupper() and parts[1][0].islower():
+                return name
+    return None
+
+
+def extract_description_from_detail(html: str) -> str:
+    """Return the plain-text product description from a detail page.
+
+    Each pattern captures the inner HTML of a candidate description
+    container. Tags are replaced by spaces, whitespace is collapsed, and the
+    first candidate longer than 20 characters is returned, truncated to 2000
+    characters. Returns an empty string when nothing suitable is found.
+    """
+    # NOTE(review): the element name in these patterns was reconstructed as
+    # <div> ... </div>; confirm against the shop's actual markup.
+    desc_patterns = [
+        r'<div[^>]*class="[^"]*product[- ]description[^"]*"[^>]*>(.*?)</div>',
+        r'<div[^>]*class="[^"]*beschreibung[^"]*"[^>]*>(.*?)</div>',
+        r'data-content-type="description"[^>]*>(.*?)</div>',
+    ]
+    for pat in desc_patterns:
+        m = re.search(pat, html, re.DOTALL | re.IGNORECASE)
+        if m:
+            raw = m.group(1)
+            text = re.sub(r'<[^>]+>', ' ', raw)
+            text = re.sub(r'\s+', ' ', text).strip()
+            # Very short captures are treated as noise, not descriptions.
+            if len(text) > 20:
+                return text[:2000]
+    return ""
+
+
+def extract_article_number(product_name: str, url: str) -> Optional[str]:
+    """Return the shop's article number for a product, or ``None``.
+
+    A parenthesised code in the product name such as ``(G802)`` or
+    ``(G 802)`` wins and is returned with spaces removed. Otherwise the last
+    path segment of ``url`` is inspected for a trailing ``-g802`` style
+    suffix, which is returned upper-cased.
+    """
+    in_name = re.search(r'\(([A-Z]\s*\d+[A-Z]?)\)', product_name)
+    if in_name is not None:
+        return in_name.group(1).replace(" ", "")
+
+    last_segment = url.rstrip("/").rsplit("/", 1)[-1]
+    in_slug = re.search(r'-([a-z]\d+[a-z]?)$', last_segment, re.IGNORECASE)
+    return in_slug.group(1).upper() if in_slug is not None else None
+
+
+def extract_variety_name(product_name: str) -> str:
+    """Reduce a full product name to the bare variety/cultivar name.
+
+    A trailing article number such as ``(G802)`` is dropped, then every known
+    German crop-type prefix (e.g. "Buschtomate ", "Rote Bete ") is removed
+    from the start of the name, case-insensitively and in list order, and
+    finally surrounding whitespace and quote characters are stripped.
+    """
+    # Remove article number suffix like (G802)
+    variety = re.sub(r'\s*\([A-Z]\s*\d+[A-Z]?\)\s*$', '', product_name.strip())
+
+    # Common German vegetable/herb type prefixes to strip
+    crop_type_prefixes = (
+        # Tomatoes
+        r'(?:Normal(?:früchtige)?|Fleisch|Cherry|Balkon|Wild|Freiland|Roma|Ochsenherz|'
+        r'Cocktail|Dattel|Mini|Snack|Stab|Busch|Salat|Zwerg)[\s-]*[Tt]omate\s+',
+        # Beans
+        r'(?:Busch|Stangen|Dicke|Feuer|Spaghetti)[\s-]*[Bb]ohne\s+',
+        r'Edamame(?:-Sojabohne)?\s+',
+        # Peas
+        r'(?:Mark|Schal|Zucker|Pal)[\s-]*[Ee]rbse\s+',
+        # Cucurbits
+        r'(?:Salat|Einlege|Gewürz|Freiland|Schlangen)[\s-]*[Gg]urke\s+',
+        r'Zucchini\s+',
+        r'Kürbis\s+',
+        r'(?:Wasser)?[Mm]elone\s+',
+        # Brassicas
+        r'(?:Blumen|Grün|Rot|Weiß|Rosen)[\s-]*[Kk]ohl\s+',
+        r'Kohlrabi\s+',
+        r'Wirsing\s+',
+        r'Brokkoli\s+',
+        r'Chinakohl\s+',
+        r'Pak\s+Choi\s+',
+        r'Kohlrübe\s+',
+        r'Mai-/Herbstrüben?(?:/Navets)?\s+',
+        # Root vegetables
+        r'Möhre\s+',
+        r'Karotten?(?:\s*-?\s*Mix)?\s+',
+        r'Pastinake\s+',
+        r'Radies(?:chen)?\s+',
+        r'Rettich\s+',
+        r'Schwarzwurzel\s+',
+        r'Haferwurzel\s+',
+        r'Petersilienwurzel\s+',
+        # Beets
+        r'(?:Rote|Gelbe|Weiße)\s+Bete?\s+',
+        r'Mangold\s+',
+        # Lettuce & leafy
+        r'(?:Kopf|Eichblatt|Batavia|Eis|Lollo|Romana|Baby-Leaf)[\s-]*[Ss]alat\s+',
+        r'Feldsalat\s+',
+        r'Endivie\s+',
+        r'Asia[\s-]*Salat\s+',
+        r'Spinat\s+',
+        # Alliums
+        r'Zwiebel\s+',
+        r'Lauchzwiebel\s+',
+        r'Porree(?:/Lauch)?\s+',
+        r'Schnittlauch\s+',
+        r'Schnittknoblauch\s+',
+        # Peppers
+        r'(?:Gemüse|Block|Spitz|Papier)[\s-]*[Pp]aprika\s+',
+        r'Chili\s+',
+        # Celery
+        r'(?:Knollen|Stangen|Bleich|Schnitt)[\s-]*[Ss]ellerie\s+',
+        # Herbs
+        r'Basilikum\s+',
+        r'Koriander\s+',
+        r'Dill\s+',
+        r'Petersilie\s+',
+        r'(?:Knollen|Gewürz)[\s-]*[Ff]enchel\s+',
+        r'Salbei\s+',
+        r'Thymian\s+',
+        r'Oregano\s+',
+        r'Lavendel\s+',
+        r'Melisse\s+',
+        r'Majoran\s+',
+        r'Estragon\s+',
+        r'Kresse\s+',
+        r'Bohnenkraut\s+',
+        r'Borretsch\s+',
+        r'Kümmel\s+',
+        r'Kerbel\s+',
+        r'Liebstock\s+',
+        r'Ysop\s+',
+        r'Pimpinelle\s+',
+        r'Beifuß\s+',
+        r'Schwarzkümmel\s+',
+        # Other
+        r'Zuckermais\s+',
+        r'Artischocke\s+',
+        r'Physalis\s+',
+        r'Aubergine\s+',
+        r'Catalogna\s+',
+    )
+    # Each prefix is anchored at the start and applied to the result of the
+    # previous one, so several stacked prefixes are peeled off in list order.
+    for crop_prefix in crop_type_prefixes:
+        variety = re.compile('^' + crop_prefix, re.IGNORECASE).sub('', variety)
+
+    return variety.strip().strip("'\"")
+
+
+# ── API data caches ───────────────────────────────────────────────────────
+# Module-level lookup tables filled by load_api_data() and extended whenever
+# this run creates a new record.
+species_cache = {}  # scientific_name_lower -> species record as returned by the API
+family_cache = {}  # name_scientific_lower -> family record as returned by the API
+cultivar_cache = {}  # slug -> {"id", "name", "species_id"}
+supplier_id = None  # id of the Bingenheimer supplier; set by load_api_data()
+
+
+def load_api_data():
+    """Fill the module caches from HerbAPI and resolve the supplier id.
+
+    Pages through /families, /species and /cultivars (100 records per
+    request, stopping at the first page shorter than that) and stores the
+    records in family_cache, species_cache and cultivar_cache. Then looks
+    for a supplier whose name contains "bingenheimer"; when there is none a
+    supplier is created, and the script exits with status 1 if that fails.
+    """
+    global supplier_id
+
+    def iter_records(endpoint):
+        # Yield every record of a paginated collection, one page at a time.
+        page_no = 1
+        while True:
+            batch = api_get(endpoint, {"per_page": 100, "page": page_no})["data"]
+            yield from batch
+            if len(batch) < 100:
+                return
+            page_no += 1
+
+    print("Loading existing HerbAPI data...")
+
+    # Load families
+    for family in iter_records("/families"):
+        family_cache[family["name_scientific"].lower()] = family
+    print(f" Loaded {len(family_cache)} families")
+
+    # Load species
+    for species in iter_records("/species"):
+        species_cache[species["name_scientific"].lower()] = species
+    print(f" Loaded {len(species_cache)} species")
+
+    # Load ALL cultivars, keeping only the fields needed for matching
+    for cultivar in iter_records("/cultivars"):
+        cultivar_cache[cultivar["slug"]] = {
+            "id": cultivar["id"],
+            "name": cultivar["name"],
+            "species_id": cultivar["species_id"],
+        }
+    print(f" Loaded {len(cultivar_cache)} cultivars")
+
+    # Find the Bingenheimer supplier ...
+    for supplier in api_get("/suppliers"):
+        if "bingenheimer" in supplier["name"].lower():
+            supplier_id = supplier["id"]
+            print(f" Found existing supplier: {supplier['name']} ({supplier['id']})")
+            break
+
+    if supplier_id:
+        return
+
+    # ... or create it when it does not exist yet.
+    print(" Creating Bingenheimer Saatgut supplier...")
+    created, _status = api_post("/suppliers", {
+        "name": "Bingenheimer Saatgut",
+        "url": "https://www.bingenheimersaatgut.de",
+        "country": "DE",
+        "is_organic": True,
+        "is_demeter": True,
+        "notes": "German biodynamic seed company, Demeter certified, open-pollinated varieties"
+    })
+    if "id" not in created:
+        print(f" ERROR creating supplier: {created}")
+        sys.exit(1)
+    supplier_id = created["id"]
+    print(f" Created supplier: {created['id']}")
+
+
+# Names the shop may print (lower-cased) -> name the species is looked up
+# under. Only genuine synonyms and infraspecific ranks belong here: leek
+# (Allium porrum / A. ampeloprasum) is not onion and marjoram is not oregano,
+# so those are no longer folded into Allium cepa / Origanum vulgare.
+_SPECIES_SYNONYMS = {
+    "lycopersicon esculentum": "solanum lycopersicum",
+    "capsicum annuum var. annuum": "capsicum annuum",
+    "brassica oleracea var. botrytis": "brassica oleracea",
+    "brassica oleracea var. italica": "brassica oleracea",
+    "brassica oleracea var. gemmifera": "brassica oleracea",
+    "brassica oleracea var. gongylodes": "brassica oleracea",
+    "brassica oleracea var. capitata": "brassica oleracea",
+    "brassica oleracea var. sabauda": "brassica oleracea",
+    "brassica oleracea var. sabellica": "brassica oleracea",
+    "brassica rapa var. rapa": "brassica rapa",
+    "brassica rapa subsp. pekinensis": "brassica rapa",
+    "brassica rapa subsp. chinensis": "brassica rapa",
+    "beta vulgaris var. conditiva": "beta vulgaris",
+    "beta vulgaris subsp. vulgaris": "beta vulgaris",
+    "beta vulgaris var. vulgaris": "beta vulgaris",
+    "allium porrum": "allium ampeloprasum",
+    "allium ampeloprasum": "allium porrum",
+    "cichorium intybus var. foliosum": "cichorium intybus",
+    "petroselinum crispum var. tuberosum": "petroselinum crispum",
+    "apium graveolens var. rapaceum": "apium graveolens",
+    "apium graveolens var. dulce": "apium graveolens",
+    "lactuca sativa var. capitata": "lactuca sativa",
+    "lactuca sativa var. crispa": "lactuca sativa",
+    "lactuca sativa var. longifolia": "lactuca sativa",
+}
+
+# Genus -> plant family, needed when a missing species has to be created.
+_GENUS_TO_FAMILY = {
+    "Solanum": "Solanaceae", "Capsicum": "Solanaceae", "Physalis": "Solanaceae",
+    "Nicandra": "Solanaceae",
+    "Cucumis": "Cucurbitaceae", "Cucurbita": "Cucurbitaceae", "Citrullus": "Cucurbitaceae",
+    "Phaseolus": "Fabaceae", "Pisum": "Fabaceae", "Vicia": "Fabaceae",
+    "Glycine": "Fabaceae", "Lens": "Fabaceae", "Lupinus": "Fabaceae",
+    "Trifolium": "Fabaceae", "Medicago": "Fabaceae", "Vigna": "Fabaceae",
+    "Brassica": "Brassicaceae", "Raphanus": "Brassicaceae", "Eruca": "Brassicaceae",
+    "Lepidium": "Brassicaceae", "Nasturtium": "Brassicaceae", "Barbarea": "Brassicaceae",
+    "Sinapis": "Brassicaceae", "Crambe": "Brassicaceae", "Diplotaxis": "Brassicaceae",
+    "Allium": "Amaryllidaceae",
+    "Daucus": "Apiaceae", "Petroselinum": "Apiaceae", "Apium": "Apiaceae",
+    "Foeniculum": "Apiaceae", "Pastinaca": "Apiaceae", "Coriandrum": "Apiaceae",
+    "Anethum": "Apiaceae", "Levisticum": "Apiaceae", "Anthriscus": "Apiaceae",
+    "Carum": "Apiaceae", "Myrrhis": "Apiaceae", "Pimpinella": "Apiaceae",
+    "Sanguisorba": "Rosaceae",
+    "Lactuca": "Asteraceae", "Cichorium": "Asteraceae", "Cynara": "Asteraceae",
+    "Helianthus": "Asteraceae", "Calendula": "Asteraceae", "Tagetes": "Asteraceae",
+    "Scorzonera": "Asteraceae", "Tragopogon": "Asteraceae", "Glebionis": "Asteraceae",
+    "Artemisia": "Asteraceae",
+    "Beta": "Chenopodiaceae", "Spinacia": "Chenopodiaceae",
+    "Atriplex": "Chenopodiaceae", "Chenopodium": "Chenopodiaceae",
+    "Ocimum": "Lamiaceae", "Origanum": "Lamiaceae", "Thymus": "Lamiaceae",
+    "Salvia": "Lamiaceae", "Melissa": "Lamiaceae", "Lavandula": "Lamiaceae",
+    "Satureja": "Lamiaceae", "Hyssopus": "Lamiaceae", "Rosmarinus": "Lamiaceae",
+    "Mentha": "Lamiaceae",
+    "Zea": "Poaceae",
+    "Borago": "Boraginaceae", "Phacelia": "Boraginaceae",
+    "Valerianella": "Caprifoliaceae",
+    "Tropaeolum": "Tropaeolaceae",
+    "Rumex": "Polygonaceae",
+    "Nigella": "Ranunculaceae",
+    "Claytonia": "Montiaceae",
+    "Tetragonia": "Aizoaceae",
+    "Basella": "Basellaceae",
+    "Plantago": "Plantaginaceae",
+}
+
+
+def find_or_create_species(latin_name: str) -> Optional[str]:
+    """Return the HerbAPI id of the species called ``latin_name``.
+
+    Lookup order against ``species_cache``: the exact name, the bare binomial
+    (first two words), then a known synonym. If none hits, the species is
+    created under the family derived from its genus. When creation fails the
+    species list is reloaded once in case the record already exists.
+
+    Returns ``None`` for an empty or whitespace-only name, for a genus with
+    no known family, when the family cannot be resolved, or when creation
+    fails and the species is still absent after the reload. Unknown genera
+    are recorded in ``stats["species_not_matched"]``, failed creations in
+    ``stats["errors"]``.
+    """
+    name = latin_name.strip() if latin_name else ""
+    if not name:
+        return None
+
+    key = name.lower()
+
+    # Direct match
+    if key in species_cache:
+        return species_cache[key]["id"]
+
+    # Try without subspecies/variety
+    base = " ".join(key.split()[:2])
+    if base in species_cache:
+        return species_cache[base]["id"]
+
+    # Handle synonyms
+    syn_key = _SPECIES_SYNONYMS.get(key)
+    if syn_key in species_cache:
+        return species_cache[syn_key]["id"]
+
+    # Try to create the species
+    genus = name.split()[0]
+    family_name = _GENUS_TO_FAMILY.get(genus)
+    if not family_name:
+        print(f" WARNING: Unknown genus '{genus}' for species '{name}'")
+        stats["species_not_matched"].append(name)
+        return None
+
+    family_id = find_or_create_family(family_name)
+    if not family_id:
+        return None
+
+    print(f" Creating species: {name}")
+    resp, code = api_post("/species", {
+        "name_scientific": name,
+        "family_id": family_id,
+    })
+    if "id" in resp:
+        # Cache under the same normalised key the lookups above use.
+        species_cache[key] = resp
+        stats["species_created"] += 1
+        return resp["id"]
+
+    # Might already exist, reload
+    print(f" Species creation returned {code}: {str(resp.get('error', ''))[:100]}")
+    page = 1
+    while True:
+        r = api_get("/species", {"per_page": 100, "page": page})
+        for s in r["data"]:
+            species_cache[s["name_scientific"].lower()] = s
+        if len(r["data"]) < 100:
+            break
+        page += 1
+    if key in species_cache:
+        return species_cache[key]["id"]
+    stats["errors"].append(f"Species creation failed: {name}")
+    return None
+
+
+def find_or_create_family(family_name: str) -> Optional[str]:
+    """Return the HerbAPI id of the plant family ``family_name``.
+
+    The family is looked up in ``family_cache`` (keyed by lower-cased name)
+    and created through the API when missing. If creation does not return an
+    id, all families are reloaded page by page in case the record already
+    exists. Returns ``None`` and records an entry in ``stats["errors"]`` when
+    the family is still unknown afterwards.
+    """
+    key = family_name.lower()
+    if key in family_cache:
+        return family_cache[key]["id"]
+
+    print(f" Creating family: {family_name}")
+    resp, _code = api_post("/families", {"name_scientific": family_name})
+    if "id" in resp:
+        family_cache[key] = resp
+        stats["families_created"] += 1
+        return resp["id"]
+
+    # Reload, paging at 100 records like every other loader in this file; a
+    # single larger request would miss families beyond the first page.
+    page = 1
+    while True:
+        r = api_get("/families", {"per_page": 100, "page": page})
+        for ff in r["data"]:
+            family_cache[ff["name_scientific"].lower()] = ff
+        if len(r["data"]) < 100:
+            break
+        page += 1
+    if key in family_cache:
+        return family_cache[key]["id"]
+    stats["errors"].append(f"Family creation failed: {family_name}")
+    return None
+
+
+def slugify(text: str) -> str:
+    """Turn ``text`` into a lower-case, hyphen-separated ASCII slug.
+
+    Common accented Latin letters are transliterated (e.g. "ü" -> "u",
+    "ß" -> "ss"), every character other than a-z, 0-9, whitespace and "-" is
+    dropped, whitespace runs become a single hyphen, repeated hyphens are
+    collapsed, and leading/trailing hyphens are removed.
+    """
+    transliterations = (
+        ("ä", "a"), ("ö", "o"), ("ü", "u"), ("ß", "ss"),
+        ("é", "e"), ("è", "e"), ("ê", "e"), ("ë", "e"),
+        ("à", "a"), ("â", "a"), ("á", "a"),
+        ("ô", "o"), ("ù", "u"), ("û", "u"), ("ú", "u"),
+        ("ï", "i"), ("î", "i"), ("í", "i"),
+        ("ç", "c"), ("ñ", "n"), ("ó", "o"),
+        ("œ", "oe"), ("æ", "ae"),
+    )
+    slug = text.lower()
+    for accented, plain in transliterations:
+        slug = slug.replace(accented, plain)
+
+    slug = re.sub(r'[^a-z0-9\s-]', '', slug)
+    slug = re.sub(r'[\s]+', '-', slug.strip())
+    return re.sub(r'-+', '-', slug).strip('-')
+
+
+def find_existing_cultivar(species_name: str, variety_name: str, species_id: str) -> Optional[str]:
+    """Return the id of an already known cultivar, or ``None``.
+
+    The cache is first probed with the slug built from
+    "<species_name> <variety_name>". Failing that, the first cached cultivar
+    of the same species whose name equals ``variety_name`` case-insensitively
+    is used.
+    """
+    # Direct slug match
+    slug_hit = cultivar_cache.get(slugify(f"{species_name} {variety_name}"))
+    if slug_hit is not None:
+        return slug_hit["id"]
+
+    # Name match within the same species
+    wanted = variety_name.lower()
+    return next(
+        (
+            entry["id"]
+            for entry in cultivar_cache.values()
+            if entry["species_id"] == species_id and entry["name"].lower() == wanted
+        ),
+        None,
+    )
+
+
+def scrape_category(cat_path: str, default_species: Optional[str]):
+    """Fetch one category listing and process every product found on it.
+
+    A listing that returns an empty page (HTTP 404) is reported and skipped
+    without touching the counters. Otherwise the product and category
+    counters in ``stats`` are updated and each product is handed to
+    process_product() together with the category's default species.
+    """
+    listing_url = f"{SITE_BASE}/de/bio-saatgut/{cat_path}.html"
+    print(f"\n{'='*60}")
+    print(f"Category: {cat_path}")
+
+    listing_html = fetch_page(listing_url)
+    if listing_html:
+        time.sleep(DELAY)
+
+        found = parse_product_links(listing_html)
+        print(f" Found {len(found)} products")
+        stats["products_found"] += len(found)
+        stats["categories_scraped"] += 1
+
+        for product_url, product_name in found:
+            process_product(product_url, product_name, default_species)
+    else:
+        print(" SKIP: Page not found (404)")
+
+
+def process_product(prod_url: str, prod_name: str, default_species: Optional[str]):
+    """Import one shop product into HerbAPI as a cultivar linked to the supplier.
+
+    Steps: derive the article number and variety name from the listing name,
+    skip mixes, fetch the detail page for a Latin name and description,
+    resolve (or create) the species, reuse or create the cultivar, and POST
+    the supplier link. Progress is printed; counters and error messages are
+    recorded in the module-level ``stats``.
+
+    Args:
+        prod_url: Absolute URL of the product detail page.
+        prod_name: Product name as shown on the listing page.
+        default_species: Scientific name used when the detail page yields no
+            Latin name; with ``None`` such a product is skipped.
+    """
+    article_number = extract_article_number(prod_name, prod_url)
+    variety_name = extract_variety_name(prod_name)
+
+    if not variety_name:
+        print(f" SKIP (no variety): {prod_name}")
+        return
+
+    # Skip mixes, sets, bundles
+    # NOTE(review): only "mischung", "mix" and "trio" actually cause a skip
+    # below; the remaining keywords (gutschein, buch, saatband, ...) have no
+    # effect on their own. Confirm whether they were meant to skip as well.
+    skip_keywords = ["mischung", "saatscheibe", "saatband", "saatplatte",
+                     "saat-set", " mix ", "trio ", "quartett", "gutschein",
+                     "buch ", "düngung", "erde ", "-garten"]
+    name_lower = prod_name.lower()
+    # Exception: if the variety name itself is the whole thing, keep it
+    if any(kw in name_lower for kw in skip_keywords) and variety_name.lower() != prod_name.lower():
+        # Only skip if it really seems like a mix
+        if "mischung" in name_lower or "mix" in name_lower or "trio" in name_lower:
+            print(f" SKIP (mix/set): {prod_name}")
+            return
+
+    print(f"\n Product: {prod_name}")
+    print(f" Variety: {variety_name}, SKU: {article_number}")
+
+    # Fetch detail page. Failures here are deliberately non-fatal: the
+    # category's default species can still be used.
+    latin_name = None
+    description = ""
+    time.sleep(DELAY)
+    try:
+        detail_html = fetch_page(prod_url)
+        stats["detail_pages_fetched"] += 1
+        if detail_html:
+            latin_name = extract_latin_from_detail(detail_html)
+            description = extract_description_from_detail(detail_html)
+    except Exception as e:
+        print(f" WARNING: Detail page error: {e}")
+
+    species_name = latin_name or default_species
+    if not species_name:
+        print(f" SKIP: No species for '{prod_name}'")
+        stats["species_not_matched"].append(prod_name)
+        return
+
+    print(f" Species: {species_name}")
+
+    species_id = find_or_create_species(species_name)
+    if not species_id:
+        print(f" SKIP: Could not resolve species '{species_name}'")
+        return
+
+    # Check if cultivar already exists
+    existing_id = find_existing_cultivar(species_name, variety_name, species_id)
+
+    cultivar_id = None
+
+    if existing_id:
+        cultivar_id = existing_id
+        print(f" EXISTS: cultivar already in DB")
+        stats["cultivars_existed"] += 1
+    else:
+        # Create cultivar
+        data = {
+            "species_id": species_id,
+            "name": variety_name,
+            "name_de": variety_name,
+            "is_organic": True,
+        }
+        if description:
+            data["description"] = description
+
+        resp, code = api_post("/cultivars", data)
+
+        if "id" in resp:
+            cultivar_id = resp["id"]
+            cultivar_cache[resp["slug"]] = {
+                "id": resp["id"],
+                "name": variety_name,
+                "species_id": species_id,
+            }
+            stats["cultivars_created"] += 1
+            print(f" CREATED: {resp['slug']}")
+        elif code == 500 and "Database error" in str(resp.get("error", "")):
+            # Likely slug conflict - try to find existing
+            print(f" DB conflict - searching for existing cultivar...")
+            # Reload cultivars page by page, refreshing the cache, until the
+            # matching one turns up or the listing is exhausted.
+            page = 1
+            while True:
+                r = api_get("/cultivars", {"per_page": 100, "page": page})
+                for c in r["data"]:
+                    cultivar_cache[c["slug"]] = {
+                        "id": c["id"],
+                        "name": c["name"],
+                        "species_id": c["species_id"],
+                    }
+                    if c["species_id"] == species_id and c["name"].lower() == variety_name.lower():
+                        cultivar_id = c["id"]
+                if cultivar_id or len(r["data"]) < 100:
+                    break
+                page += 1
+
+            if cultivar_id:
+                print(f" Found existing after conflict: {cultivar_id}")
+                stats["cultivars_existed"] += 1
+            else:
+                print(f" ERROR: DB error and could not find existing cultivar")
+                stats["errors"].append(f"DB error + not found: {species_name} / {variety_name}")
+                return
+        else:
+            # str() guards against a non-string error payload, which cannot
+            # be sliced directly.
+            print(f" ERROR ({code}): {str(resp.get('error',''))[:100]}")
+            stats["errors"].append(f"Create failed: {variety_name}: {str(resp.get('error',''))[:80]}")
+            return
+
+    # Link to supplier
+    if cultivar_id and supplier_id:
+        link_data = {
+            "supplier_id": supplier_id,
+            "product_url": prod_url,
+        }
+        if article_number:
+            link_data["article_number"] = article_number
+
+        resp, code = api_post(f"/cultivars/{cultivar_id}/suppliers", link_data)
+
+        if "id" in resp:
+            stats["supplier_links_created"] += 1
+            print(f" LINKED (SKU: {article_number})")
+        # NOTE(review): every HTTP 500 is counted as "link already exists",
+        # which also hides genuine server errors - confirm this is intended.
+        elif code == 500 or "already" in str(resp.get("error", "")).lower():
+            stats["supplier_links_existed"] += 1
+            print(f" LINK EXISTS")
+        else:
+            print(f" LINK ERROR ({code}): {str(resp.get('error',''))[:80]}")
+            stats["errors"].append(f"Link failed: {variety_name}: {str(resp.get('error',''))[:60]}")
+
+
+def main():
+    """Run the complete scrape and print a summary.
+
+    Loads the HerbAPI caches, processes every entry of ALL_CATEGORIES (an
+    exception inside one category is printed and recorded, then the run
+    continues with the next), and prints the counters followed by up to 30
+    unmatched species and up to 30 errors.
+
+    Returns 0 when no errors were recorded, otherwise 1.
+    """
+    banner = "=" * 60
+    print(banner)
+    print("Bingenheimer Saatgut Scraper for HerbAPI")
+    print(banner)
+
+    load_api_data()
+
+    print(f"\nScraping {len(ALL_CATEGORIES)} categories...")
+
+    for cat_path, default_species in ALL_CATEGORIES:
+        try:
+            scrape_category(cat_path, default_species)
+        except Exception as exc:
+            print(f" ERROR in category {cat_path}: {exc}")
+            stats["errors"].append(f"Category error: {cat_path}: {exc}")
+
+    # Summary
+    print("\n" + banner)
+    print("SCRAPING COMPLETE - SUMMARY")
+    print(banner)
+    summary_rows = (
+        ("Categories scraped:", "categories_scraped"),
+        ("Products found:", "products_found"),
+        ("Detail pages fetched:", "detail_pages_fetched"),
+        ("Cultivars created:", "cultivars_created"),
+        ("Cultivars existed:", "cultivars_existed"),
+        ("Supplier links created:", "supplier_links_created"),
+        ("Supplier links existed:", "supplier_links_existed"),
+        ("Species created:", "species_created"),
+        ("Families created:", "families_created"),
+    )
+    for label, counter in summary_rows:
+        print(f"{label} {stats[counter]}")
+    print(f"Errors: {len(stats['errors'])}")
+
+    # Detail listings, each capped at 30 entries.
+    detail_sections = (
+        ("Unmatched species", stats["species_not_matched"]),
+        ("Errors", stats["errors"]),
+    )
+    for heading, entries in detail_sections:
+        if entries:
+            print(f"\n{heading} ({len(entries)}):")
+            for entry in entries[:30]:
+                print(f" - {entry}")
+
+    return 1 if stats["errors"] else 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tools/scrapers/scrape_dreschflegel.py b/tools/scrapers/scrape_dreschflegel.py
new file mode 100644
index 0000000..015baf6
--- /dev/null
+++ b/tools/scrapers/scrape_dreschflegel.py
@@ -0,0 +1,760 @@
+#!/usr/bin/env python3
+"""
+Scraper for Dreschflegel organic seed catalog (dreschflegel-saatgut.de).
+Extracts cultivar data and imports into HerbAPI.
+
+Run 2 - fixes pagination (API caps at 100/page), better species matching,
+caches scraped products, handles duplicates gracefully.
+"""
+
+import urllib.request
+import urllib.parse
+import urllib.error
+import gzip
+import json
+import re
+import time
+import sys
+import os
+import html as html_mod
+from collections import defaultdict
+
+# --- Configuration ---
+# Endpoint and credentials can be overridden through the environment so the
+# script can target another HerbAPI instance without editing the source.
+API_BASE = os.environ.get("HERBAPI_BASE", "http://herbapi01.corp.sub-net.at:8080/api/v1")
+# NOTE(review): a bearer token committed to the repository should be treated
+# as leaked - rotate it and drop this fallback once HERBAPI_TOKEN is set
+# wherever the scraper runs.
+API_TOKEN = os.environ.get("HERBAPI_TOKEN", "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk")
+SITE_BASE = "https://www.dreschflegel-saatgut.de"
+DELAY = 0.5  # seconds slept before each product page request
+USER_AGENT = "Mozilla/5.0 (compatible; HerbAPI-Scraper/1.0)"
+CACHE_FILE = "/tmp/dreschflegel_products_cache.json"
+
+# Line-buffered output so progress shows up promptly when piped to a file.
+# Guarded because a replaced stream (test harness, IDE console) may not be a
+# TextIOWrapper and then has no reconfigure().
+for _stream in (sys.stdout, sys.stderr):
+    if hasattr(_stream, "reconfigure"):
+        _stream.reconfigure(line_buffering=True)
+del _stream
+
+# Run counters; unknown keys start at 0.
+stats = defaultdict(int)
+
+
+def api_request(method, path, data=None):
+    """Send one JSON request to HerbAPI and return the decoded response.
+
+    Args:
+        method: HTTP verb, e.g. "GET" or "POST".
+        path: Path appended to API_BASE, including any query string.
+        data: Optional JSON-serialisable payload; None sends no body.
+
+    Returns:
+        The decoded JSON body, or None for any HTTP error status. Responses
+        that look like duplicates (409, "already exists"/"duplicate" in the
+        body, or a 500 mentioning "database error") are swallowed silently;
+        every other HTTP error is printed first. Callers cannot tell the two
+        cases apart.
+
+    Raises:
+        urllib.error.URLError, TimeoutError: connection failures and
+            timeouts are deliberately not handled here.
+    """
+    url = f"{API_BASE}{path}"
+    # "is not None" so that an empty payload ({} or []) is still sent as JSON.
+    body = json.dumps(data).encode("utf-8") if data is not None else None
+    req = urllib.request.Request(url, data=body, method=method)
+    req.add_header("Authorization", f"Bearer {API_TOKEN}")
+    req.add_header("Content-Type", "application/json")
+    req.add_header("Accept", "application/json")
+    try:
+        # The context manager closes the connection; the timeout keeps a hung
+        # server from stalling the whole import.
+        with urllib.request.urlopen(req, timeout=60) as resp:
+            return json.loads(resp.read().decode("utf-8"))
+    except urllib.error.HTTPError as e:
+        body_text = e.read().decode("utf-8", errors="replace")
+        lowered = body_text.lower()
+        looks_like_duplicate = (
+            e.code == 409
+            or "already exists" in lowered
+            or "duplicate" in lowered
+            # Likely a unique constraint violation = duplicate
+            or (e.code == 500 and "database error" in lowered)
+        )
+        if not looks_like_duplicate:
+            print(f" API error {e.code} {method} {path}: {body_text[:200]}")
+        return None
+
+
+def fetch_page(url):
+    """Download *url* and return its body as text, or None on any failure.
+
+    No delay is applied here; callers that need rate limiting sleep before
+    calling. Bytes that are not valid UTF-8 are replaced instead of raising.
+    """
+    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
+    try:
+        # Context manager so the connection is released even if read() fails.
+        with urllib.request.urlopen(req, timeout=30) as resp:
+            return resp.read().decode("utf-8", errors="replace")
+    except Exception as e:
+        # Best effort by design: callers treat None as "could not fetch".
+        print(f" Fetch error {url}: {e}")
+        return None
+
+
+def get_sitemap_urls():
+    """Return every URL listed in the site's gzip-compressed sitemaps.
+
+    Reads the sitemap index at SITE_BASE/sitemap.xml, follows each child
+    sitemap whose URL ends in ".xml.gz" and collects its <loc> entries.
+    A child sitemap that cannot be downloaded or decompressed is reported
+    and skipped.
+
+    Returns:
+        A list of URL strings; empty when the index cannot be fetched.
+    """
+    # Sitemap files list their targets in <loc> elements. An unanchored
+    # "(.*?)" only ever matches empty strings, so no child sitemap would be
+    # recognised and the result would always be empty.
+    loc_pattern = re.compile(r"<loc>\s*(.*?)\s*</loc>", re.DOTALL)
+
+    print("Fetching sitemap index...")
+    index_xml = fetch_page(f"{SITE_BASE}/sitemap.xml")
+    if not index_xml:
+        return []
+
+    all_urls = []
+    for smap_url in loc_pattern.findall(index_xml):
+        if not smap_url.endswith(".xml.gz"):
+            continue
+        print(" Fetching compressed sitemap...")
+        req = urllib.request.Request(smap_url, headers={"User-Agent": USER_AGENT})
+        try:
+            with urllib.request.urlopen(req, timeout=30) as resp:
+                data = gzip.decompress(resp.read()).decode("utf-8")
+            urls = loc_pattern.findall(data)
+            all_urls.extend(urls)
+            print(f" Found {len(urls)} URLs")
+        except Exception as e:
+            print(f" Error: {e}")
+
+    return all_urls
+
+
+def classify_urls(urls):
+    """Keep only the URLs that look like product pages.
+
+    A candidate is a URL whose path, once the shop's host prefix is removed,
+    is a single non-empty segment that does not begin with one of the known
+    non-product prefixes. Returned URLs have trailing slashes removed.
+    """
+    non_product_prefixes = (
+        "impressum", "agb", "datenschutz", "kontakt", "widerrufs",
+        "versand", "abkuerz", "zertifikat", "wichtige-hinweise",
+        "muster-", "gutscheine", "kalender", "flyer", "katalog",
+        "sommer-herbst", "unsere-hoefe", "bestellschein",
+        "dreschflegel-news", "termine", "rezepte", "anbautipps",
+        "tipps-zur", "gartentelefon", "gartenfreude", "buecher",
+        "navigation", "vielfalt", "sut20", "saatgut",
+        "neuheiten", "kennenlernangebote", "sut25", "vielfalt25",
+        "saatgut-vielfalt", "saat",
+    )
+    site_roots = (
+        "https://dreschflegel-saatgut.de/",
+        "https://www.dreschflegel-saatgut.de/",
+    )
+
+    def looks_like_product(trimmed_url):
+        path = trimmed_url
+        for root in site_roots:
+            path = path.replace(root, "")
+        # Anything still containing "/" is nested (or on a foreign host).
+        if not path or "/" in path:
+            return False
+        return not path.startswith(non_product_prefixes)
+
+    trimmed_urls = (url.rstrip("/") for url in urls)
+    return [url for url in trimmed_urls if looks_like_product(url)]
+
+
+def parse_product_page(html_content):
+ """Extract product data from a Dreschflegel product page.
+
+ Returns a dict that always has "name" and "botanical_name" and may have
+ "article_number" (digits as str), "price" (float), "description" (tags
+ removed, entities unescaped) and "pack_info". Returns None when the input
+ is empty, does not contain class="botname", or when the name or the
+ botanical name cannot be found.
+
+ NOTE(review): the name, botanical-name, description and pack-info patterns
+ below carry no HTML tag anchors, and two of the literals are split across
+ lines. This looks like markup was stripped from this copy of the file;
+ restore the patterns from the upstream source before relying on them.
+ """
+ # Cheap pre-check: pages without the botanical-name element are not products.
+ if not html_content or 'class="botname"' not in html_content:
+ return None
+
+ result = {}
+
+ # Product name
+ m = re.search(r"(.*?)
", html_content)
+ if m:
+ result["name"] = html_mod.unescape(m.group(1).strip())
+
+ # Botanical name
+ m = re.search(r'\s*(.*?)\s*
', html_content, re.DOTALL)
+ if m:
+ result["botanical_name"] = html_mod.unescape(m.group(1).strip())
+
+ # Article number: first run of digits inside the order-number element
+ m = re.search(
+ r'class="product-detail-ordernumber"[^>]*>\s*(\d+)',
+ html_content,
+ re.DOTALL,
+ )
+ if m:
+ result["article_number"] = m.group(1)
+
+ # Price from the itemprop="price" content attribute; skipped if not numeric
+ m = re.search(r'itemprop="price"[^>]*content="([^"]+)"', html_content)
+ if m:
+ try:
+ result["price"] = float(m.group(1))
+ except ValueError:
+ pass
+
+ # Description text following the description container
+ m = re.search(
+ r"product-detail-description-text.*?(.*?)
",
+ html_content,
+ re.DOTALL,
+ )
+ if m:
+ # Drop any inline tags, then decode HTML entities
+ desc = re.sub(r"<[^>]+>", "", m.group(1).strip())
+ desc = html_mod.unescape(desc).strip()
+ if desc:
+ result["description"] = desc
+
+ # Pack info: the text after "Inhalt reicht für:" ("contents sufficient for")
+ m = re.search(r"Inhalt reicht f[üu]r:\s*(.*?)\s* | ", html_content)
+ if m:
+ result["pack_info"] = html_mod.unescape(m.group(1).strip())
+
+ # Name and botanical name are mandatory; everything else is optional
+ return result if "name" in result and "botanical_name" in result else None
+
+
+def scrape_all_products(candidate_urls):
+    """Scrape product pages, reusing results cached by earlier runs.
+
+    The cache (CACHE_FILE, JSON) maps URL -> product dict, or None for a page
+    that was fetched but is not a product page. Pages that could not be
+    fetched are left out of the cache so that a later run retries them.
+
+    Args:
+        candidate_urls: URLs to consider, e.g. the result of classify_urls().
+
+    Returns:
+        List of product dicts (cached ones first, then newly scraped ones);
+        each carries the page it came from under "url".
+    """
+    # Load cache
+    cache = {}
+    if os.path.exists(CACHE_FILE):
+        try:
+            with open(CACHE_FILE, "r", encoding="utf-8") as f:
+                cache = json.load(f)
+        except (OSError, ValueError) as e:
+            # A truncated or unreadable cache only costs a re-scrape.
+            print(f" Ignoring unreadable cache {CACHE_FILE}: {e}")
+            cache = {}
+        else:
+            print(f" Loaded {len(cache)} cached products")
+
+    def save_cache():
+        with open(CACHE_FILE, "w", encoding="utf-8") as f:
+            json.dump(cache, f)
+
+    # Split the candidates in one pass: cached products, cached non-products
+    # (stored as None) and URLs that still have to be fetched.
+    products = []
+    to_fetch = []
+    cached_non_products = 0
+    for url in candidate_urls:
+        if url not in cache:
+            to_fetch.append(url)
+        elif cache[url]:
+            products.append(cache[url])
+        else:
+            cached_non_products += 1
+
+    cached_products = len(products)
+    print(f" {cached_products} products from cache, "
+          f"{cached_non_products} non-products cached, "
+          f"{len(to_fetch)} to fetch")
+
+    for i, url in enumerate(to_fetch):
+        if (i + 1) % 50 == 0 or i == 0:
+            print(f" Fetching {i + 1}/{len(to_fetch)}...")
+
+        time.sleep(DELAY)
+        html_content = fetch_page(url)
+        if not html_content:
+            # Possibly transient: do not record it as "not a product page".
+            stats["fetch_errors"] += 1
+        else:
+            product = parse_product_page(html_content)
+            if product:
+                product["url"] = url
+                products.append(product)
+                cache[url] = product
+                stats["products_scraped"] += 1
+            else:
+                cache[url] = None
+                stats["not_product_pages"] += 1
+
+        # Save cache periodically, whatever happened to this URL
+        if (i + 1) % 100 == 0:
+            save_cache()
+
+    # Final cache save
+    save_cache()
+
+    print(f" Total: {len(products)} products ({stats['products_scraped']} newly scraped)")
+    return products
+
+
+def paginated_get(path, per_page=100):
+    """Collect the items of every page of a paginated HerbAPI list endpoint.
+
+    Args:
+        path: Endpoint path; may already carry a query string.
+        per_page: Page size to request. The module docstring notes that the
+            API caps pages at 100, so a larger value would make the first
+            page look short and end pagination early.
+
+    Returns:
+        All items from the responses' "data" arrays, in page order. Paging
+        stops at the first empty, short or failed page, so an API error
+        part-way through yields a partial list.
+    """
+    separator = "&" if "?" in path else "?"
+    all_items = []
+    page = 1
+    while True:
+        resp = api_request("GET", f"{path}{separator}per_page={per_page}&page={page}")
+        items = resp.get("data") if isinstance(resp, dict) else None
+        if not items:
+            break
+        all_items.extend(items)
+        if len(items) < per_page:
+            # Short page: this was the last one.
+            break
+        page += 1
+    return all_items
+
+
+def load_api_data():
+    """Fetch the families, species and cultivars currently stored in HerbAPI.
+
+    Returns:
+        (families, species, cultivars) lookup dicts. Families are keyed by
+        lower-cased scientific name, species by lower-cased and stripped
+        scientific name, cultivars by (species_id, lower-cased stripped name).
+    """
+    print("Loading HerbAPI data...")
+
+    families = {
+        row["name_scientific"].lower(): row
+        for row in paginated_get("/families")
+    }
+    print(f" {len(families)} families")
+
+    species = {
+        row["name_scientific"].lower().strip(): row
+        for row in paginated_get("/species")
+    }
+    print(f" {len(species)} species")
+
+    cultivars = {
+        (row["species_id"], row["name"].lower().strip()): row
+        for row in paginated_get("/cultivars")
+    }
+    print(f" {len(cultivars)} cultivars")
+
+    return families, species, cultivars
+
+
+def ensure_supplier():
+    """Return the Dreschflegel supplier record, creating it when missing.
+
+    Returns:
+        The supplier dict from HerbAPI, or the empty result of the create
+        request when the supplier is neither found nor created.
+    """
+    existing = api_request("GET", "/suppliers")
+    # NOTE(review): the response is iterated directly, i.e. a bare list is
+    # assumed here, unlike the {"data": [...]} envelope that paginated_get
+    # reads - confirm against the API.
+    for candidate in existing or ():
+        if "dreschflegel" in candidate["name"].lower():
+            print(f" Supplier exists: {candidate['name']} ({candidate['id']})")
+            return candidate
+
+    payload = {
+        "name": "Dreschflegel",
+        "url": "https://www.dreschflegel-saatgut.de",
+        "country": "DE",
+        "is_organic": True,
+        "is_demeter": False,
+        "notes": "German organic seed cooperative, open-pollinated heritage varieties",
+    }
+    created = api_request("POST", "/suppliers", payload)
+    if created:
+        print(f" Created supplier: {created['name']} ({created['id']})")
+    return created
+
+
+# Genus → family mapping for species creation.
+# Every genus appears exactly once; a repeated key in a dict literal silently
+# overrides the earlier entry and hides typos.
+GENUS_TO_FAMILY = {
+    # Asteraceae
+    "Achillea": "Asteraceae", "Artemisia": "Asteraceae", "Aster": "Asteraceae",
+    "Calendula": "Asteraceae", "Carthamus": "Asteraceae", "Centaurea": "Asteraceae",
+    "Chamomilla": "Asteraceae", "Chrysanthemum": "Asteraceae", "Cichorium": "Asteraceae",
+    "Cnicus": "Asteraceae", "Cosmos": "Asteraceae", "Cynara": "Asteraceae",
+    "Dahlia": "Asteraceae", "Dimorphotheca": "Asteraceae", "Echinacea": "Asteraceae",
+    "Echinops": "Asteraceae", "Erigeron": "Asteraceae", "Eupatorium": "Asteraceae",
+    "Gaillardia": "Asteraceae", "Helenium": "Asteraceae", "Helianthus": "Asteraceae",
+    "Helichrysum": "Asteraceae", "Inula": "Asteraceae", "Lactuca": "Asteraceae",
+    "Leontodon": "Asteraceae", "Matricaria": "Asteraceae", "Onopordum": "Asteraceae",
+    "Petasites": "Asteraceae", "Rudbeckia": "Asteraceae", "Scorzonera": "Asteraceae",
+    "Silphium": "Asteraceae", "Solidago": "Asteraceae", "Tagetes": "Asteraceae",
+    "Tanacetum": "Asteraceae", "Taraxacum": "Asteraceae", "Telekia": "Asteraceae",
+    "Tragopogon": "Asteraceae", "Tussilago": "Asteraceae", "Zinnia": "Asteraceae",
+    "Xerochrysum": "Asteraceae", "Coreopsis": "Asteraceae",
+    # Solanaceae
+    "Capsicum": "Solanaceae", "Lycium": "Solanaceae", "Nicotiana": "Solanaceae",
+    "Physalis": "Solanaceae", "Solanum": "Solanaceae", "Atropa": "Solanaceae",
+    # Cucurbitaceae
+    "Citrullus": "Cucurbitaceae", "Cucumis": "Cucurbitaceae", "Cucurbita": "Cucurbitaceae",
+    "Luffa": "Cucurbitaceae", "Momordica": "Cucurbitaceae",
+    # Fabaceae
+    "Cicer": "Fabaceae", "Glycine": "Fabaceae", "Lathyrus": "Fabaceae",
+    "Lens": "Fabaceae", "Lupinus": "Fabaceae", "Medicago": "Fabaceae",
+    "Phaseolus": "Fabaceae", "Pisum": "Fabaceae", "Trifolium": "Fabaceae",
+    "Trigonella": "Fabaceae", "Vicia": "Fabaceae", "Vigna": "Fabaceae",
+    "Caragana": "Fabaceae", "Cytisus": "Fabaceae", "Robinia": "Fabaceae",
+    # Brassicaceae
+    "Armoracia": "Brassicaceae", "Barbarea": "Brassicaceae", "Brassica": "Brassicaceae",
+    "Crambe": "Brassicaceae", "Eruca": "Brassicaceae", "Hesperis": "Brassicaceae",
+    "Iberis": "Brassicaceae", "Isatis": "Brassicaceae", "Lepidium": "Brassicaceae",
+    "Lunaria": "Brassicaceae", "Raphanus": "Brassicaceae", "Sinapis": "Brassicaceae",
+    "Nasturtium": "Brassicaceae", "Diplotaxis": "Brassicaceae",
+    # Apiaceae
+    "Anethum": "Apiaceae", "Anthriscus": "Apiaceae", "Apium": "Apiaceae",
+    "Carum": "Apiaceae", "Chaerophyllum": "Apiaceae", "Coriandrum": "Apiaceae",
+    "Daucus": "Apiaceae", "Foeniculum": "Apiaceae", "Levisticum": "Apiaceae",
+    "Myrrhis": "Apiaceae", "Pastinaca": "Apiaceae", "Petroselinum": "Apiaceae",
+    "Pimpinella": "Apiaceae", "Angelica": "Apiaceae", "Aegopodium": "Apiaceae",
+    # Lamiaceae
+    "Agastache": "Lamiaceae", "Ajuga": "Lamiaceae", "Dracocephalum": "Lamiaceae",
+    "Elsholtzia": "Lamiaceae", "Hyssopus": "Lamiaceae", "Lavandula": "Lamiaceae",
+    "Melissa": "Lamiaceae", "Mentha": "Lamiaceae", "Monarda": "Lamiaceae",
+    "Nepeta": "Lamiaceae", "Ocimum": "Lamiaceae", "Origanum": "Lamiaceae",
+    "Perilla": "Lamiaceae", "Rosmarinus": "Lamiaceae", "Salvia": "Lamiaceae",
+    "Satureja": "Lamiaceae", "Stachys": "Lamiaceae", "Thymus": "Lamiaceae",
+    # Amaryllidaceae / Alliaceae
+    "Allium": "Amaryllidaceae",
+    # Poaceae
+    "Avena": "Poaceae", "Hordeum": "Poaceae", "Panicum": "Poaceae",
+    "Secale": "Poaceae", "Sorghum": "Poaceae", "Triticum": "Poaceae",
+    "Zea": "Poaceae", "Setaria": "Poaceae",
+    # Chenopodiaceae
+    "Atriplex": "Chenopodiaceae", "Beta": "Chenopodiaceae",
+    "Chenopodium": "Chenopodiaceae", "Spinacia": "Chenopodiaceae",
+    # Rosaceae
+    "Filipendula": "Rosaceae", "Fragaria": "Rosaceae", "Malus": "Rosaceae",
+    "Prunus": "Rosaceae", "Pyrus": "Rosaceae", "Rosa": "Rosaceae",
+    "Rubus": "Rosaceae", "Sanguisorba": "Rosaceae", "Sorbus": "Rosaceae",
+    "Waldsteinia": "Rosaceae",
+    # Boraginaceae
+    "Borago": "Boraginaceae", "Phacelia": "Boraginaceae", "Symphytum": "Boraginaceae",
+    "Pulmonaria": "Boraginaceae", "Myosotis": "Boraginaceae",
+    # Malvaceae
+    "Alcea": "Malvaceae", "Althaea": "Malvaceae", "Malva": "Malvaceae",
+    "Hibiscus": "Malvaceae", "Lavatera": "Malvaceae", "Abelmoschus": "Malvaceae",
+    # Polygonaceae
+    "Fagopyrum": "Polygonaceae", "Rheum": "Polygonaceae", "Rumex": "Polygonaceae",
+    # Caryophyllaceae
+    "Agrostemma": "Caryophyllaceae", "Dianthus": "Caryophyllaceae",
+    "Gypsophila": "Caryophyllaceae", "Lychnis": "Caryophyllaceae",
+    "Saponaria": "Caryophyllaceae", "Silene": "Caryophyllaceae",
+    # Tropaeolaceae
+    "Tropaeolum": "Tropaeolaceae",
+    # Papaveraceae
+    "Eschscholzia": "Papaveraceae", "Papaver": "Papaveraceae",
+    "Meconopsis": "Papaveraceae",
+    # Caprifoliaceae
+    "Valerianella": "Caprifoliaceae", "Valeriana": "Caprifoliaceae",
+    "Lonicera": "Caprifoliaceae",
+    # Plantaginaceae
+    "Digitalis": "Plantaginaceae", "Plantago": "Plantaginaceae",
+    "Antirrhinum": "Plantaginaceae", "Linaria": "Plantaginaceae",
+    # Violaceae
+    "Viola": "Violaceae",
+    # Ranunculaceae
+    "Aquilegia": "Ranunculaceae", "Consolida": "Ranunculaceae",
+    "Delphinium": "Ranunculaceae", "Nigella": "Ranunculaceae",
+    # Linaceae
+    "Linum": "Linaceae",
+    # Convolvulaceae
+    "Ipomoea": "Convolvulaceae", "Convolvulus": "Convolvulaceae",
+    # Portulacaceae / Montiaceae
+    "Claytonia": "Montiaceae", "Portulaca": "Portulacaceae",
+    # Amaranthaceae
+    "Amaranthus": "Amaranthaceae", "Celosia": "Amaranthaceae",
+    "Gomphrena": "Amaranthaceae",
+    # Asparagaceae
+    "Asparagus": "Asparagaceae",
+    # Resedaceae
+    "Reseda": "Resedaceae",
+    # Balsaminaceae
+    "Impatiens": "Balsaminaceae",
+    # Hydrangeaceae
+    "Hydrangea": "Hydrangeaceae",
+    # Campanulaceae
+    "Campanula": "Campanulaceae", "Phyteuma": "Campanulaceae",
+    # Scrophulariaceae
+    "Verbascum": "Scrophulariaceae",
+    # Verbenaceae
+    "Verbena": "Verbenaceae",
+    # Onagraceae
+    "Oenothera": "Onagraceae", "Clarkia": "Onagraceae",
+    # Cucurbitaceae extras
+    "Benincasa": "Cucurbitaceae", "Lagenaria": "Cucurbitaceae",
+    # Hypericaceae
+    "Hypericum": "Hypericaceae",
+    # Adoxaceae
+    "Sambucus": "Adoxaceae",
+    # Others
+    "Dipsacus": "Caprifoliaceae",
+    "Knautia": "Caprifoliaceae",
+    "Scabiosa": "Caprifoliaceae",
+    "Succisa": "Caprifoliaceae",
+    "Asclepias": "Apocynaceae",
+    "Cynoglossum": "Boraginaceae",
+    "Echium": "Boraginaceae",
+    "Anchusa": "Boraginaceae",
+    "Lithospermum": "Boraginaceae",
+    "Onobrychis": "Fabaceae",
+    "Ornithopus": "Fabaceae",
+    "Lotus": "Fabaceae",
+    "Anthyllis": "Fabaceae",
+    "Melilotus": "Fabaceae",
+    "Galega": "Fabaceae",
+    "Lespedeza": "Fabaceae",
+    "Arachis": "Fabaceae",
+    "Senna": "Fabaceae",
+    # Additional genera found in Dreschflegel catalog
+    # NOTE(review): "Nicotinia" below looks like a misspelling of Nicotiana,
+    # presumably kept to match a catalog typo - confirm before removing.
+    "Acmella": "Asteraceae", "Adonis": "Ranunculaceae", "Ageratum": "Asteraceae",
+    "Amethystia": "Lamiaceae", "Anacyclus": "Asteraceae", "Anthemis": "Asteraceae",
+    "Asphodeline": "Asphodelaceae", "Brachyscome": "Asteraceae", "Bupleurum": "Apiaceae",
+    "Callistephus": "Asteraceae", "Camelina": "Brassicaceae", "Cardaria": "Brassicaceae",
+    "Cardiospermum": "Sapindaceae", "Cerinthe": "Boraginaceae",
+    "Chamaemelum": "Asteraceae", "Cistanthe": "Montiaceae", "Cleome": "Cleomaceae",
+    "Cochlearia": "Brassicaceae", "Codonopsis": "Campanulaceae", "Coix": "Poaceae",
+    "Cyperus": "Cyperaceae", "Digitaria": "Poaceae", "Dorotheanthus": "Aizoaceae",
+    "Emilia": "Asteraceae", "Eragrostis": "Poaceae", "Erysimum": "Brassicaceae",
+    "Euphorbia": "Euphorbiaceae", "Gentiana": "Gentianaceae", "Geum": "Rosaceae",
+    "Gilia": "Polemoniaceae", "Godetia": "Onagraceae", "Helipterum": "Asteraceae",
+    "Lallemantia": "Lamiaceae", "Leonurus": "Lamiaceae", "Leuzea": "Asteraceae",
+    "Liatris": "Asteraceae", "Malope": "Malvaceae", "Marrubium": "Lamiaceae",
+    "Matthiola": "Brassicaceae", "Maurandya": "Plantaginaceae",
+    "Melothria": "Cucurbitaceae", "Meum": "Apiaceae", "Nemesia": "Scrophulariaceae",
+    "Nicandra": "Solanaceae", "Nicotinia": "Solanaceae", "Oenanthe": "Apiaceae",
+    "Oxalis": "Oxalidaceae", "Pennisetum": "Poaceae", "Penstemon": "Plantaginaceae",
+    "Phlox": "Polemoniaceae", "Polemonium": "Polemoniaceae",
+    "Porophyllum": "Asteraceae", "Primula": "Primulaceae", "Psyllium": "Plantaginaceae",
+    "Quamoclit": "Convolvulaceae", "Ruta": "Rutaceae", "Salpiglossis": "Solanaceae",
+    "Sanvitalia": "Asteraceae", "Sideritis": "Lamiaceae", "Silybum": "Asteraceae",
+    "Talinum": "Talinaceae", "Thelesperma": "Asteraceae", "Vaccaria": "Caryophyllaceae",
+    "Veronica": "Plantaginaceae", "Xeranthemum": "Asteraceae",
+}
+
+
+def normalize_species_name(botanical_name):
+    """Reduce a botanical name to its genus and species epithet.
+
+    Everything after the epithet (var., subsp., author, ...) is dropped.
+    Hybrids written "Genus x epithet", "Genus × epithet" or "Genus ×epithet"
+    all yield the epithet "x epithet".
+
+    Returns:
+        (genus, epithet). (genus, None) when only the genus is usable, e.g.
+        "Cucurbita spec." or a hybrid marker with nothing after it.
+        (None, None) when the name has fewer than two words.
+    """
+    parts = botanical_name.split()
+    if len(parts) < 2:
+        return None, None
+
+    genus, second = parts[0], parts[1]
+
+    if second in ("x", "×"):
+        # Stand-alone hybrid marker: the epithet is the next word, if any.
+        # Without one there is no epithet - the marker itself is not a name.
+        if len(parts) >= 3:
+            return genus, f"x {parts[2]}"
+        return genus, None
+
+    if second.startswith("×") and len(second) > 1:
+        # Hybrid sign attached to the epithet ("×piperita").
+        return genus, f"x {second[1:]}"
+
+    if second in ("var.", "subsp.", "ssp.", "spec.", "sp."):
+        # Only genus level - can't match to species
+        return genus, None
+
+    return genus, second
+
+
+def find_species(botanical_name, species_cache):
+    """Look up the cached species record for a botanical name.
+
+    Matching order: the exact lower-cased "genus epithet" key; failing that,
+    the only cached species of the same genus, if there is exactly one.
+
+    Returns:
+        The species dict, or None when nothing matches.
+    """
+    genus, epithet = normalize_species_name(botanical_name)
+    if not genus:
+        return None
+
+    if epithet:
+        exact_key = f"{genus} {epithet}".lower()
+        if exact_key in species_cache:
+            return species_cache[exact_key]
+
+    # NOTE(review): this fallback also applies when an epithet was given but
+    # not found, so a new species of a genus with a single cached species is
+    # mapped onto that species - confirm this is intended.
+    genus_prefix = genus.lower() + " "
+    same_genus = [
+        record
+        for key, record in species_cache.items()
+        if key.startswith(genus_prefix)
+    ]
+    return same_genus[0] if len(same_genus) == 1 else None
+
+
+def find_or_create_species(botanical_name, families, species_cache):
+    """Return the species record for a botanical name, creating it if needed.
+
+    A missing species is created under the family that GENUS_TO_FAMILY
+    assigns to its genus; a missing family is created first. When a create
+    request returns nothing, the list is re-read in case the record already
+    exists. Both caches are updated in place.
+
+    Args:
+        botanical_name: Name as scraped from the product page.
+        families: Family lookup keyed by lower-cased scientific name.
+        species_cache: Species lookup keyed by lower-cased scientific name.
+
+    Returns:
+        The species dict, or None when the name has no usable epithet, the
+        genus has no family mapping, or the record can be neither created
+        nor found.
+    """
+    existing = find_species(botanical_name, species_cache)
+    if existing:
+        return existing
+
+    genus, species_epithet = normalize_species_name(botanical_name)
+    if not (genus and species_epithet):
+        stats["species_no_epithet"] += 1
+        return None
+
+    sci_name = f"{genus} {species_epithet}"
+    species_key = sci_name.lower()
+
+    # The normalized name may be cached even though the lookup above missed.
+    if species_key in species_cache:
+        return species_cache[species_key]
+
+    family_name = GENUS_TO_FAMILY.get(genus)
+    if not family_name:
+        stats["species_no_family"] += 1
+        print(f" [SKIP] No family mapping for genus: {genus} ({botanical_name})")
+        return None
+
+    family_key = family_name.lower()
+    family = families.get(family_key)
+    if not family:
+        print(f" Creating family: {family_name}")
+        family = api_request("POST", "/families", {"name_scientific": family_name})
+        if family:
+            families[family_key] = family
+            stats["families_created"] += 1
+        else:
+            # May already exist (duplicate from a previous run) - reload.
+            family = next(
+                (
+                    candidate
+                    for candidate in paginated_get("/families")
+                    if candidate["name_scientific"].lower() == family_key
+                ),
+                None,
+            )
+            if family:
+                families[family_key] = family
+        if not family:
+            print(f" [SKIP] Cannot create family: {family_name}")
+            return None
+
+    print(f" Creating species: {sci_name} (family: {family_name})")
+    created = api_request("POST", "/species", {
+        "name_scientific": sci_name,
+        "family_id": family["id"],
+    })
+    if created:
+        species_cache[species_key] = created
+        stats["species_created"] += 1
+        return created
+
+    # May already exist - look for it in a fresh listing.
+    time.sleep(0.1)
+    for candidate in paginated_get("/species"):
+        if candidate["name_scientific"].lower() == species_key:
+            species_cache[species_key] = candidate
+            return candidate
+    return None
+
+
+# German crop-type words that precede the variety name in product titles.
+# Sorted once, longest first, so that e.g. "Winterkopfsalat" is tried before
+# "Kopfsalat" and "Rote Beete" before "Rote Bete".
+_CROP_TYPE_PREFIXES = tuple(sorted(
+    [
+        # Tomatoes
+        "Salattomate", "Stabtomate", "Buschtomate", "Cocktailtomate",
+        "Cherrytomate", "Fleischtomate", "Wildtomate", "Balkontomate",
+        "Flaschentomate", "Eitomate", "Datteltomate", "Tomate",
+        # Lettuce
+        "Winterkopfsalat", "Kopfsalat", "Bataviasalat", "Eissalat",
+        "Blattsalat", "Schnittsalat", "Pflücksalat", "Römersalat",
+        "Spargelsalat", "Romanasalat",
+        # Beans
+        "Buschbohne", "Stangenbohne", "Feuerbohne", "Puffbohne",
+        "Prunkbohne",
+        # Peas
+        "Markerbse", "Zuckererbse", "Palerbse", "Schalerbse",
+        "Knackerbse", "Kapuzinererbse",
+        # Cucumbers
+        "Einlegegurke", "Salatgurke", "Schälgurke", "Landgurke",
+        "Freilandgurke",
+        # Squash
+        "Hokkaidokürbis", "Butternutkürbis", "Speisekürbis",
+        "Riesenkürbis", "Zierkürbis", "Muskatkürbis", "Ölkürbis",
+        # Melon
+        "Wassermelone", "Zuckermelone",
+        # Peppers
+        "Gemüsepaprika", "Blockpaprika", "Spitzpaprika", "Tomatenpaprika",
+        "Snackpaprika", "Peperoni", "Chili",
+        # Brassicas
+        "Kohlrabi", "Brokkoli", "Blumenkohl", "Grünkohl", "Rosenkohl",
+        "Wirsing", "Rotkohl", "Weißkohl", "Spitzkohl", "Palmkohl",
+        "Chinakohl", "Pak Choi", "Markstammkohl",
+        # Root veg
+        "Möhre", "Karotte", "Pastinake", "Rote Bete", "Rote Beete",
+        "Herbstrübe", "Mairübe", "Stoppelrübe", "Schwarzer Rettich",
+        "Steckrübe", "Knollensellerie", "Petersilienwurzel",
+        "Rettich", "Radieschen",
+        # Onions
+        "Winterheckenzwiebel", "Lauchzwiebel", "Speisezwiebel",
+        "Schalotte", "Wintersteckzwiebel", "Zwiebel",
+        # Herbs
+        "Rotes Basilikum", "Buschbasilikum", "Zitronen-Basilikum",
+        "Thai-Basilikum", "Wildes Basilikum", "Zimtbasilikum",
+        "Basilikum", "Schnittknoblauch",
+        # Grains
+        "Sommerweizen", "Winterweizen", "Sommerroggen", "Winterroggen",
+        "Nackthafer", "Nacktgerste", "Dinkel", "Emmer", "Einkorn",
+        # Misc
+        "Zuckermais", "Popcornmais", "Zucchini",
+    ],
+    key=len,
+    reverse=True,
+))
+
+
+def extract_cultivar_name(product_name):
+    """Strip a leading crop-type word from a product name.
+
+    "Salattomate Ruthje" becomes "Ruthje". The prefix is only removed when it
+    is followed by a space, so a name that consists of nothing but the crop
+    type, or that starts with an unknown word, is returned unchanged (apart
+    from surrounding whitespace).
+    """
+    name = product_name.strip()
+    for prefix in _CROP_TYPE_PREFIXES:
+        if name.startswith(prefix + " "):
+            return name[len(prefix):].strip()
+    return name
+
+
+def get_existing_supplier_links(cultivar_id, supplier_id):
+    """Tell whether the cultivar is already linked to the given supplier.
+
+    Returns:
+        True when the cultivar's supplier list contains supplier_id; False
+        otherwise, including when the lookup returns nothing.
+    """
+    links = api_request("GET", f"/cultivars/{cultivar_id}/suppliers")
+    if not links:
+        return False
+    return any(link["supplier_id"] == supplier_id for link in links)
+
+
+def main():
+ """Scrape the Dreschflegel catalog and import it into HerbAPI.
+
+ Steps: ensure the supplier record exists, load the existing families,
+ species and cultivars, collect product URLs from the sitemap, scrape the
+ product pages (with caching), then create missing species/cultivars and
+ link each cultivar to the supplier. Exits with status 1 when the supplier
+ or the sitemap is unavailable. Counters are collected in ``stats`` and
+ printed at the end.
+ """
+ print("=" * 60)
+ print("Dreschflegel Seed Catalog Scraper for HerbAPI (v2)")
+ print("=" * 60)
+
+ # Step 1: Supplier
+ print("\n[1] Setting up supplier...")
+ supplier = ensure_supplier()
+ if not supplier:
+ print("FATAL: Could not create/find supplier")
+ sys.exit(1)
+ supplier_id = supplier["id"]
+
+ # Step 2: Load API data
+ print("\n[2] Loading existing HerbAPI data...")
+ families, species_cache, cultivar_cache = load_api_data()
+
+ # Step 3: Get product URLs
+ print("\n[3] Fetching sitemap...")
+ all_urls = get_sitemap_urls()
+ if not all_urls:
+ print("FATAL: Could not fetch sitemap")
+ sys.exit(1)
+ candidate_urls = classify_urls(all_urls)
+ print(f" {len(all_urls)} total URLs, {len(candidate_urls)} product candidates")
+
+ # Step 4: Scrape
+ print(f"\n[4] Scraping product pages...")
+ products = scrape_all_products(candidate_urls)
+
+ # Step 5: Import
+ print(f"\n[5] Importing {len(products)} products into HerbAPI...")
+
+ for i, product in enumerate(products):
+ # Progress line every 50 products
+ if (i + 1) % 50 == 0:
+ print(f" Processing {i + 1}/{len(products)}...")
+
+ botanical = product.get("botanical_name", "")
+ if not botanical:
+ stats["no_botanical"] += 1
+ continue
+
+ # Find or create species
+ sp = find_or_create_species(botanical, families, species_cache)
+ if not sp:
+ stats["species_not_matched"] += 1
+ continue
+
+ species_id = sp["id"]
+ cultivar_name = extract_cultivar_name(product["name"])
+
+ # Check if cultivar already exists
+ # (same key shape as load_api_data: species id + lower-cased stripped name)
+ cv_key = (species_id, cultivar_name.lower().strip())
+ if cv_key in cultivar_cache:
+ cv = cultivar_cache[cv_key]
+ stats["cultivars_existing"] += 1
+ else:
+ cv_data = {
+ "species_id": species_id,
+ "name": cultivar_name,
+ "is_organic": True,
+ }
+ if product.get("description"):
+ cv_data["description"] = product["description"]
+
+ cv = api_request("POST", "/cultivars", cv_data)
+ if cv:
+ cultivar_cache[cv_key] = cv
+ stats["cultivars_created"] += 1
+ else:
+ # Might already exist from previous run - try to find it
+ found = False
+ for c in paginated_get(f"/cultivars?species_id={species_id}"):
+ if c["name"].lower().strip() == cultivar_name.lower().strip():
+ cultivar_cache[cv_key] = c
+ cv = c
+ stats["cultivars_existing"] += 1
+ found = True
+ break
+ if not found:
+ stats["cultivar_create_errors"] += 1
+ continue
+
+ # Link to supplier (check first for idempotency)
+ if get_existing_supplier_links(cv["id"], supplier_id):
+ stats["supplier_links_existing"] += 1
+ continue
+
+ link_data = {
+ "supplier_id": supplier_id,
+ "article_number": product.get("article_number", ""),
+ "product_url": product.get("url", ""),
+ "price_eur": product.get("price"),
+ }
+ # Optional pack size, parsed from text such as "ca. 50 Pfl"
+ pack_info = product.get("pack_info", "")
+ if pack_info:
+ m = re.search(r"ca\.?\s*(\d+)\s*(Pfl|Korn|Samen|g|kg|ml)", pack_info)
+ if m:
+ link_data["pack_size"] = float(m.group(1))
+ # Pfl -> Pflanzen, Samen -> Korn; other units are passed through as matched
+ unit_map = {"Pfl": "Pflanzen", "Korn": "Korn", "Samen": "Korn"}
+ link_data["pack_unit"] = unit_map.get(m.group(2), m.group(2))
+
+ resp = api_request("POST", f"/cultivars/{cv['id']}/suppliers", link_data)
+ if resp:
+ stats["supplier_links_created"] += 1
+ else:
+ stats["supplier_link_errors"] += 1
+
+ # Summary: every counter in alphabetical order, then the cache sizes
+ print("\n" + "=" * 60)
+ print("RESULTS")
+ print("=" * 60)
+ for key, val in sorted(stats.items()):
+ print(f" {key}: {val}")
+ print(f"\n Total species in DB: {len(species_cache)}")
+ print(f" Total cultivars tracked: {len(cultivar_cache)}")
+
+
+# Script entry point; main() exits with status 1 itself on fatal setup errors.
+if __name__ == "__main__":
+ main()
diff --git a/tools/scrapers/scrape_mgs.py b/tools/scrapers/scrape_mgs.py
new file mode 100644
index 0000000..9a71847
--- /dev/null
+++ b/tools/scrapers/scrape_mgs.py
@@ -0,0 +1,380 @@
+#!/usr/bin/env python3
+"""Scrape Magic Garden Seeds product pages and update herbapi database."""
+
+import subprocess
+import re
+import time
+import os
+import sys
+
+# psql invocation: tuples only (-t), unaligned output (-A), "|" between
+# fields (-F|).
+DB_CMD = [
+    'psql', '-h', '10.31.3.90', '-U', 'herbapi', 'herbapi',
+    '-t', '-A', '-F|'
+]
+# NOTE(review): a database password committed to the repository should be
+# treated as leaked - rotate it and remove this fallback. A PGPASSWORD that is
+# already set in the environment takes precedence over the built-in value.
+DB_ENV = {'PGPASSWORD': '_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj', **os.environ}
+
+# English month name -> month number
+MONTH_MAP = {
+    'january': 1, 'february': 2, 'march': 3, 'april': 4,
+    'may': 5, 'june': 6, 'july': 7, 'august': 8,
+    'september': 9, 'october': 10, 'november': 11, 'december': 12,
+}
+
+
+def run_sql(sql):
+    """Run one SQL statement through psql and return its stripped stdout.
+
+    A psql call that exits with a non-zero status is reported on stderr
+    instead of passing unnoticed; whatever was written to stdout (possibly
+    nothing) is still returned, so callers behave as before.
+    """
+    result = subprocess.run(
+        DB_CMD + ['-c', sql],
+        capture_output=True, text=True, env=DB_ENV
+    )
+    if result.returncode != 0:
+        print(
+            f"psql failed (exit {result.returncode}): {result.stderr.strip()}",
+            file=sys.stderr,
+        )
+    return result.stdout.strip()
+
+
+def fetch_page(url):
+    """Download *url* with curl and return the response body as text.
+
+    curl runs silently, follows redirects and gives up after 15 seconds.
+    Returns an empty string when curl produces no output.
+    """
+    result = subprocess.run(
+        ['curl', '-sL', '--max-time', '15', url],
+        capture_output=True,
+        # Decode explicitly: plain text=True uses the locale's encoding and
+        # raises UnicodeDecodeError on bytes that are invalid in it.
+        encoding='utf-8',
+        errors='replace',
+    )
+    return result.stdout
+
+
+def parse_months(text):
+    """Return the sorted month numbers whose English names occur in *text*.
+
+    Matching is case-insensitive and by substring. Returns None for empty
+    input or when no month name is found.
+    """
+    if not text:
+        return None
+
+    remaining = text.lower().strip()
+    found = set()
+    # Longest names first; each matched name is removed before the next test.
+    for month_name in sorted(MONTH_MAP, key=len, reverse=True):
+        if month_name not in remaining:
+            continue
+        found.add(MONTH_MAP[month_name])
+        remaining = remaining.replace(month_name, '')
+    return sorted(found) or None
+
+
+def parse_depth(text):
+    """Parse a depth in centimetres from free text.
+
+    "A - B cm" yields the midpoint rounded to one decimal, a single "A cm"
+    yields A. Both decimal point and decimal comma are accepted. Returns None
+    for empty input or when no cm value is present.
+    """
+    if not text:
+        return None
+
+    def to_number(raw):
+        return float(raw.replace(',', '.'))
+
+    span = re.search(r'(\d+(?:[.,]\d+)?)\s*-\s*(\d+(?:[.,]\d+)?)\s*cm', text)
+    if span is not None:
+        low, high = map(to_number, span.groups())
+        return round((low + high) / 2, 1)
+
+    single = re.search(r'(\d+(?:[.,]\d+)?)\s*cm', text)
+    return to_number(single.group(1)) if single is not None else None
+
+
+def parse_spacing(text):
+    """Parse planting distance. Returns (row_spacing, plant_spacing).
+
+    Recognised forms, tried in this order:
+      "X x Y cm" (or "×") -> (Y, X)
+      "X - Y cm"          -> (None, midpoint rounded to one decimal)
+      "X cm"              -> (None, X)
+    Numbers may use a decimal point or a decimal comma, as in parse_depth.
+    Returns (None, None) for empty input or when nothing matches.
+    """
+    if not text:
+        return None, None
+    text = text.lower().strip()
+
+    # Accept "2.5" and "2,5"; without the comma "2,5 x 30 cm" reads as 5 x 30.
+    num = r'(\d+(?:[.,]\d+)?)'
+
+    def to_cm(raw):
+        return float(raw.replace(',', '.'))
+
+    # "X x Y cm"
+    match = re.search(rf'{num}\s*(?:x|×)\s*{num}\s*cm', text)
+    if match:
+        return to_cm(match.group(2)), to_cm(match.group(1))
+    # "X - Y cm" range -> average as plant spacing
+    match = re.search(rf'{num}\s*-\s*{num}\s*cm', text)
+    if match:
+        return None, round((to_cm(match.group(1)) + to_cm(match.group(2))) / 2, 1)
+    # Single value
+    match = re.search(rf'{num}\s*cm', text)
+    if match:
+        return None, to_cm(match.group(1))
+    return None, None
+
+
+def parse_germination_days(text):
+    """Parse a germination time and return it as a whole number of days.
+
+    Week values are tried before day values and ranges before single values;
+    a range yields its rounded midpoint. Returns None for empty input or
+    when no "days"/"weeks" value is found.
+    """
+    if not text:
+        return None
+    lowered = text.lower()
+
+    # (pattern, days per unit), in the order they have to be tried
+    rules = (
+        (r'(\d+)\s*-\s*(\d+)\s*weeks?', 7),
+        (r'(\d+)\s*weeks?', 7),
+        (r'(\d+)\s*-\s*(\d+)\s*days?', 1),
+        (r'(\d+)\s*days?', 1),
+    )
+    for pattern, days_per_unit in rules:
+        hit = re.search(pattern, lowered)
+        if hit is None:
+            continue
+        values = [int(group) for group in hit.groups()]
+        return int(round(sum(values) / len(values) * days_per_unit))
+    return None
+
+
+def parse_germ_temp(text):
+    """Parse a germination temperature in degrees from free text.
+
+    "A - B °" yields the midpoint rounded to one decimal, a single "A °"
+    yields A as a float. Only whole numbers are recognised. Returns None for
+    empty input or when no degree value is found.
+    """
+    if not text:
+        return None
+
+    span = re.search(r'(\d+)\s*-\s*(\d+)\s*°', text)
+    if span:
+        low, high = (float(value) for value in span.groups())
+        return round((low + high) / 2, 1)
+
+    single = re.search(r'(\d+)\s*°', text)
+    return float(single.group(1)) if single else None
+
+
+def parse_lifecycle(text):
+    """Map a lifecycle description to a "perennial" flag.
+
+    Returns True when the text mentions "perennial", False when it mentions
+    "annual" or "biennial" (and not "perennial"), and None for empty or
+    unrecognised input.
+    """
+    if not text:
+        return None
+    lowered = text.lower().strip()
+    if 'perennial' in lowered:
+        return True
+    short_lived = any(word in lowered for word in ('annual', 'biennial'))
+    return False if short_lived else None
+
+
+def parse_light(text):
+    """Normalise a sunlight description to one of a few fixed labels.
+
+    Returns 'full sun to partial shade', 'full sun', 'partial shade' or
+    'shade' when the text matches; otherwise the lower-cased, stripped input
+    itself. Returns None for empty input.
+    """
+    if not text:
+        return None
+    normalized = text.lower().strip()
+
+    mentions_partial = 'partial' in normalized
+    if 'full sun' in normalized:
+        return 'full sun to partial shade' if mentions_partial else 'full sun'
+    if mentions_partial or 'semi' in normalized or 'half' in normalized:
+        return 'partial shade'
+    # Order matters: "shade" is tested before the bare word "sun".
+    for keyword, label in (('shade', 'shade'), ('sun', 'full sun')):
+        if keyword in normalized:
+            return label
+    return normalized
+
+
+def extract_data(html):
+ """Extract cultivar attributes from a product page.
+
+ Reads key/value pairs from the page's table cells plus a description and
+ returns a dict holding only the values that could be parsed. Possible
+ keys: description, plant_spacing_cm, row_spacing_cm, planting_depth_cm,
+ harvesting_months, direct_sowing_months, indoor_sowing_months, perennial,
+ light_requirement, days_to_germination, germination_temp_c.
+
+ NOTE(review): the table-cell, description, the two re.sub() clean-up and
+ the meta-description patterns below have no HTML tag anchors (two of them
+ are empty). This looks like markup was stripped from this copy of the
+ file; restore the patterns from the upstream source before relying on them.
+ """
+ data = {}
+
+ # Extract table cell pairs
+ cells = re.findall(r']*>(.*?) | ', html, re.DOTALL)
+ clean_cells = []
+ for c in cells:
+ # Replace tags with spaces, then collapse whitespace runs
+ clean = re.sub(r'<[^>]+>', ' ', c).strip()
+ clean = re.sub(r'\s+', ' ', clean)
+ clean_cells.append(clean)
+
+ # Consecutive cells are read as (label, value) pairs; labels are stored
+ # lower-cased without a trailing colon. Labels made up only of digits,
+ # punctuation, spaces, "€" or "*" are ignored.
+ specs = {}
+ i = 0
+ while i < len(clean_cells) - 1:
+ key = clean_cells[i].rstrip(':').strip()
+ val = clean_cells[i + 1].strip()
+ if key and val and not re.match(r'^[\d,.\s€*]+$', key):
+ specs[key.lower()] = val
+ i += 2
+
+ # Extract description from itemprop="description"
+ desc_match = re.search(r'itemprop="description">(.*?)\s*\s*', html, re.DOTALL)
+ if desc_match:
+ content = desc_match.group(1)
+ content = re.sub(r'', '', content, flags=re.DOTALL)
+ content = re.sub(r'', '', content, flags=re.DOTALL)
+ content = re.sub(r'<[^>]+>', ' ', content)
+ content = re.sub(r'\s+', ' ', content).strip()
+ # Cut the text at the first occurrence of each trailing boilerplate marker
+ for marker in ['Other names', 'Additional contact mail', 'Question about']:
+ idx = content.find(marker)
+ if idx > 0:
+ content = content[:idx].strip()
+ # Descriptions of 20 characters or fewer are discarded
+ if len(content) > 20:
+ data['description'] = content
+
+ # Fallback: the page's meta description, under the same length rule
+ if 'description' not in data:
+ meta_match = re.search(r']*name="description"[^>]*content="([^"]*)"', html)
+ if meta_match and len(meta_match.group(1)) > 20:
+ data['description'] = meta_match.group(1)
+
+ # Parse specs
+ if 'planting distance' in specs:
+ row_sp, plant_sp = parse_spacing(specs['planting distance'])
+ if plant_sp:
+ data['plant_spacing_cm'] = plant_sp
+ if row_sp:
+ data['row_spacing_cm'] = row_sp
+
+ # An explicit "row spacing" entry overrides the value from "planting distance"
+ if 'row spacing' in specs:
+ match = re.search(r'(\d+(?:\.\d+)?)\s*cm', specs['row spacing'])
+ if match:
+ data['row_spacing_cm'] = float(match.group(1))
+
+ if 'sowing depth' in specs:
+ depth = parse_depth(specs['sowing depth'])
+ if depth is not None:
+ data['planting_depth_cm'] = depth
+
+ # Harvesting months - prefer explicit harvest time over flowering
+ if 'harvest time' in specs:
+ months = parse_months(specs['harvest time'])
+ if months:
+ data['harvesting_months'] = months
+ elif 'harvesting months' in specs:
+ months = parse_months(specs['harvesting months'])
+ if months:
+ data['harvesting_months'] = months
+ elif 'flowering months' in specs:
+ months = parse_months(specs['flowering months'])
+ if months:
+ data['harvesting_months'] = months
+
+ if 'when to sow outdoors' in specs:
+ months = parse_months(specs['when to sow outdoors'])
+ if months:
+ data['direct_sowing_months'] = months
+
+ # Indoor sowing months can appear under either of two labels
+ for indoor_key in ['when to sow indoors', 'pre-cultivation indoors']:
+ if indoor_key in specs:
+ months = parse_months(specs[indoor_key])
+ if months:
+ data['indoor_sowing_months'] = months
+ break
+
+ if 'lifecycle' in specs:
+ perennial = parse_lifecycle(specs['lifecycle'])
+ if perennial is not None:
+ data['perennial'] = perennial
+
+ if 'sunlight' in specs:
+ light = parse_light(specs['sunlight'])
+ if light:
+ data['light_requirement'] = light
+
+ if 'germination time' in specs:
+ days = parse_germination_days(specs['germination time'])
+ if days:
+ data['days_to_germination'] = days
+
+ if 'germination temperature' in specs:
+ temp = parse_germ_temp(specs['germination temperature'])
+ if temp:
+ data['germination_temp_c'] = temp
+
+ return data
+
+
def get_current_values(cultivar_id):
    """Return the enrichable columns already stored for one cultivar.

    Issues a single-row SELECT through ``run_sql`` and splits its
    pipe-delimited output into a dict keyed by column name.  A column is
    included only when its stripped value is non-empty; values are the raw
    strings produced by ``run_sql``.  Returns ``{}`` when the query yields
    nothing.
    """
    columns = (
        'description', 'row_spacing_cm', 'plant_spacing_cm', 'planting_depth_cm',
        'perennial', 'harvesting_months', 'direct_sowing_months', 'light_requirement',
        'days_to_germination', 'germination_temp_c', 'indoor_sowing_months',
    )
    query = f"""SELECT description, row_spacing_cm, plant_spacing_cm, planting_depth_cm,
               perennial, harvesting_months, direct_sowing_months, light_requirement,
               days_to_germination, germination_temp_c, indoor_sowing_months
        FROM cultivars WHERE id = '{cultivar_id}'"""
    output = run_sql(query)
    if not output:
        return {}

    # NOTE(review): a literal '|' inside the description would shift every
    # following column; confirm how run_sql delimits its output.
    # zip() stops at the shorter sequence, so missing trailing cells are skipped.
    stripped = ((name, cell.strip()) for name, cell in zip(columns, output.split('|')))
    return {name: cell for name, cell in stripped if cell}
+
+
def _sql_literal(value):
    """Render ``value`` as a SQL literal, or return None for unsupported types."""
    if isinstance(value, str):
        return "'" + value.replace("'", "''") + "'"
    if isinstance(value, bool):
        # Must be tested before int: bool is a subclass of int.
        return 'true' if value else 'false'
    if isinstance(value, list):
        # PostgreSQL array literal, e.g. '{6,7,8}'.
        items = ','.join(str(x).replace("'", "''") for x in value)
        return "'{" + items + "}'"
    if isinstance(value, (int, float)):
        return str(value)
    return None


def build_update_sql(cultivar_id, data, current):
    """Build an UPDATE statement that fills only the empty fields of a cultivar.

    Args:
        cultivar_id: Primary key of the ``cultivars`` row to update.
        data: Mapping of column name -> scraped value.  Supported value
            types are ``str``, ``bool``, ``list`` and ``int``/``float``;
            values of any other type are ignored.
        current: Mapping of column name -> value already stored.  A column
            with a truthy entry here is never overwritten.

    Returns:
        A ``(sql, updated_fields)`` tuple.  ``updated_fields`` lists exactly
        the columns present in the SET clause.  ``(None, [])`` is returned
        when there is nothing to update.
    """
    assignments = []
    updated_fields = []
    for field, value in data.items():
        if current.get(field):
            continue  # already populated in the database

        literal = _sql_literal(value)
        if literal is None:
            # Unsupported type: no SET clause, so do not report it as updated.
            continue
        assignments.append(f"{field} = {literal}")
        updated_fields.append(field)

    if not assignments:
        return None, []

    safe_id = str(cultivar_id).replace("'", "''")
    return f"UPDATE cultivars SET {', '.join(assignments)} WHERE id = '{safe_id}';", updated_fields
+
+
def main():
    """Enrich Magic Garden Seeds cultivars from their product pages.

    Selects MGS-linked cultivars that lack a row spacing or a description,
    fetches each product page, extracts growing data and writes only the
    fields that are still empty.  Prints one status line per cultivar and a
    summary at the end.  Exactly one 0.5 s pause follows every cultivar,
    whatever its outcome.
    """
    query = """
        SELECT c.id, c.name, cs.product_url
        FROM cultivars c
        JOIN cultivar_suppliers cs ON c.id = cs.cultivar_id
        JOIN suppliers s ON cs.supplier_id = s.id
        WHERE s.name = 'Magic Garden Seeds'
          AND cs.product_url IS NOT NULL AND cs.product_url <> ''
          AND (c.row_spacing_cm IS NULL OR c.description IS NULL OR c.description = '')
        ORDER BY c.name;
    """
    listing = run_sql(query)
    if not listing:
        print("No cultivars to process")
        return

    # Each output line is "id|name|url"; lines with fewer cells are ignored.
    cultivars = []
    for line in listing.strip().split('\n'):
        cells = line.split('|')
        if len(cells) >= 3:
            cultivars.append({'id': cells[0], 'name': cells[1], 'url': cells[2]})

    total = len(cultivars)
    print(f"Processing {total} MGS cultivars...")
    sys.stdout.flush()

    updated = skipped = failed = 0
    fields_updated = {}

    for position, cultivar in enumerate(cultivars, start=1):
        print(f"[{position}/{total}] {cultivar['name']}...", end=' ', flush=True)

        try:
            page = fetch_page(cultivar['url'])
            if not page or len(page) < 1000:
                print("FAILED (empty page)")
                failed += 1
            else:
                scraped = extract_data(page)
                if not scraped:
                    print("NO DATA")
                    skipped += 1
                else:
                    stored = get_current_values(cultivar['id'])
                    statement, touched = build_update_sql(cultivar['id'], scraped, stored)
                    if not statement:
                        print("SKIP (all fields populated)")
                        skipped += 1
                    else:
                        run_sql(statement)
                        for column in touched:
                            fields_updated[column] = fields_updated.get(column, 0) + 1
                        print(f"OK ({len(touched)} fields: {', '.join(touched)})")
                        updated += 1
        except Exception as exc:
            # Per-cultivar boundary: report and carry on with the next one.
            print(f"ERROR: {exc}")
            failed += 1

        time.sleep(0.5)

    print("\n=== MGS Summary ===")
    print(f"Total processed: {total}")
    print(f"Updated: {updated}")
    print(f"Skipped (all fields already populated): {skipped}")
    print(f"Failed: {failed}")
    print("\nFields updated:")
    by_count = sorted(fields_updated.items(), key=lambda pair: pair[1], reverse=True)
    for column, count in by_count:
        print(f" {column}: {count}")


if __name__ == '__main__':
    main()
diff --git a/tools/scrapers/scrape_naturadb.py b/tools/scrapers/scrape_naturadb.py
new file mode 100644
index 0000000..e6963e3
--- /dev/null
+++ b/tools/scrapers/scrape_naturadb.py
@@ -0,0 +1,330 @@
+#!/usr/bin/env python3
+"""
+Scrape NaturaDB wildlife interaction data and enrich HerbAPI species.
+"""
+
+import json
+import re
+import time
+import urllib.request
+import urllib.error
+import sys
+
# Base URL that api_get()/api_put() prepend to every HerbAPI path.
HERBAPI_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
# NOTE(review): bearer token is hard-coded in source control; consider
# loading it from the environment or a secrets store and rotating this value.
HERBAPI_TOKEN = "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
# Plant pages are fetched from f"{NATURADB_BASE}/{slug}/".
NATURADB_BASE = "https://www.naturadb.de/pflanzen"
# Sent as the User-Agent header on NaturaDB requests.
USER_AGENT = "Mozilla/5.0 (compatible; HerbAPI-Enrichment/1.0; +https://sub-net.at)"
# Pause in seconds passed to time.sleep() after each NaturaDB fetch.
DELAY = 0.5
+
+
def api_get(path, timeout=15):
    """GET ``path`` from HerbAPI and return the decoded JSON body.

    Args:
        path: Path (including any query string) appended to ``HERBAPI_BASE``.
        timeout: Socket timeout in seconds.  Without one, a stalled
            connection would block the whole run indefinitely.

    Raises:
        urllib.error.URLError / HTTPError on transport or HTTP failures,
        and a timeout error when the server does not answer in time.
    """
    url = f"{HERBAPI_BASE}{path}"
    req = urllib.request.Request(url)
    req.add_header("Authorization", f"Bearer {HERBAPI_TOKEN}")
    req.add_header("Accept", "application/json")
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read().decode())
+
+
def api_put(path, data, timeout=15):
    """PUT ``data`` as JSON to HerbAPI and return the decoded JSON response.

    Args:
        path: Path appended to ``HERBAPI_BASE``.
        data: JSON-serialisable payload.
        timeout: Socket timeout in seconds.  Without one, a stalled
            connection would block the whole run indefinitely.

    Raises:
        urllib.error.URLError / HTTPError on transport or HTTP failures,
        and a timeout error when the server does not answer in time.
    """
    url = f"{HERBAPI_BASE}{path}"
    body = json.dumps(data).encode()
    req = urllib.request.Request(url, data=body, method="PUT")
    req.add_header("Authorization", f"Bearer {HERBAPI_TOKEN}")
    req.add_header("Content-Type", "application/json")
    req.add_header("Accept", "application/json")
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read().decode())
+
+
def fetch_naturadb(latin_name):
    """Download the NaturaDB page for a species.

    The page slug is the lower-cased Latin name with spaces turned into
    hyphens.  Returns the page HTML (UTF-8, undecodable bytes replaced) or
    ``None`` on any failure.  A 404 is treated as "not listed" and stays
    silent; every other error is printed.
    """
    slug = latin_name.lower().replace(" ", "-")
    page_url = f"{NATURADB_BASE}/{slug}/"
    request = urllib.request.Request(page_url, headers={"User-Agent": USER_AGENT})
    try:
        with urllib.request.urlopen(request, timeout=15) as response:
            return response.read().decode("utf-8", errors="replace")
    except urllib.error.HTTPError as err:
        if err.code != 404:
            print(f" HTTP {err.code} for {page_url}")
    except Exception as err:
        print(f" Error fetching {page_url}: {err}")
    return None
+
+
def extract_td_value(html, label):
    """Return the text of the table cell that follows a ``label`` cell.

    Matches markup of the form ``<td>label:</td><td>value</td>`` (the colon
    is optional and the value cell may carry attributes) and returns the
    value with nested tags removed and surrounding whitespace stripped.

    Returns ``None`` when no such label/value pair exists in ``html``.
    """
    pattern = rf"{re.escape(label)}:?\s*</td>\s*<td[^>]*>(.*?)</td>"
    found = re.search(pattern, html, re.DOTALL)
    if found is None:
        return None
    # Strip inline markup (<b>, <span>, ...) from the captured cell content.
    return re.sub(r"<[^>]+>", "", found.group(1)).strip()
+
+
def extract_native_status(html):
    """Collect the native-status labels shown as large coloured chips.

    Only chip texts that are one of the known status labels are kept, in
    page order; duplicates are not removed.
    """
    known = (
        "heimische Wildform",
        "Archäophyt",
        "Neophyt",
        "nicht heimisch (Neophyt)",
    )
    chip_texts = (
        hit.group(1).strip()
        for hit in re.finditer(
            r'chip--large\s+chip--no-border\s+clr-white\s+bg-\w+"[^>]*>([^<]+)', html
        )
    )
    return [text for text in chip_texts if text in known]
+
+
def extract_badge_tags(html):
    """Collect the text of the large plain-text badge chips.

    Blank chips and the "winterhart" badge are left out; everything else is
    returned stripped, in page order.
    """
    chip_texts = (
        hit.group(1).strip()
        for hit in re.finditer(r'chip--large\s+clr-text"[^>]*>([^<]+)', html)
    )
    return [text for text in chip_texts if text not in ("", "winterhart")]
+
+
def parse_count(text):
    """Return the integer a text starts with, e.g. '82 (Nektar ...)' -> 82.

    Leading/trailing whitespace is ignored.  Returns ``None`` for empty
    input or when the text does not begin with digits.
    """
    leading = re.match(r"(\d+)", text.strip()) if text else None
    if leading is None:
        return None
    return int(leading.group(1))
+
+
def parse_specialist_count(text):
    """Return N from a '... davon N spezialisiert ...' phrase, else ``None``.

    Example: '39 (davon 5 spezialisiert)' -> 5.
    """
    phrase = re.search(r"davon\s+(\d+)\s+spezialisiert", text) if text else None
    if phrase is None:
        return None
    return int(phrase.group(1))
+
+
def parse_nectar_pollen(text):
    """Return the numerator of a leading 'N/4' rating, e.g. '2/4 - ...' -> 2.

    Returns ``None`` for empty input or when the text does not start with
    an 'N/4' rating.
    """
    rating = re.match(r"(\d+)/4", text.strip()) if text else None
    if rating is None:
        return None
    return int(rating.group(1))
+
+
def build_wildlife_value(data):
    """Build the human-readable ``wildlife_value`` summary from scraped data.

    Args:
        data: Dict as produced by ``scrape_species``.  Keys read here:
            ``nectar``, ``pollen``, ``wildbienen_count``,
            ``wildbienen_specialists``, ``schmetterlinge_count``,
            ``raupen_count``, ``raupen_specialists``,
            ``schwebfliegen_count``, ``kaefer_count``, ``vogelarten_count``,
            ``saeugetier_count``, ``native_status`` (list) and ``badges``
            (list).  Missing or ``None`` entries are skipped.

    Returns:
        The sentences joined by single spaces, or ``None`` when nothing
        could be said.
    """
    def present(key):
        return data.get(key) is not None

    parts = []

    # Nectar and pollen ratings share one sentence.
    ratings = [
        f"{label}: {data[key]}/4"
        for label, key in (("Nectar", "nectar"), ("Pollen", "pollen"))
        if present(key)
    ]
    if ratings:
        parts.append(", ".join(ratings) + ".")

    # Wild bees
    if present("wildbienen_count"):
        sentence = f"Supports {data['wildbienen_count']} wild bee species"
        if present("wildbienen_specialists"):
            sentence += f" ({data['wildbienen_specialists']} specialists)"
        parts.append(sentence + ".")

    # Butterflies / moths and caterpillar hosts.  The host count is normally
    # appended to the butterfly sentence, but is still reported on its own
    # when the page lists no butterfly count (previously it was dropped).
    host_spec = ""
    if present("raupen_specialists"):
        host_spec = f" ({data['raupen_specialists']} specialized)"
    if present("schmetterlinge_count"):
        sentence = f"{data['schmetterlinge_count']} butterfly/moth species"
        if present("raupen_count"):
            sentence += f", {data['raupen_count']} as caterpillar host{host_spec}"
        parts.append(sentence + ".")
    elif present("raupen_count"):
        parts.append(f"{data['raupen_count']} species as caterpillar host{host_spec}.")

    # Plain "<count> <group> species." sentences, in output order.
    for key, group in (
        ("schwebfliegen_count", "hoverfly"),
        ("kaefer_count", "beetle"),
        ("vogelarten_count", "bird"),
        ("saeugetier_count", "mammal"),
    ):
        if present(key):
            parts.append(f"{data[key]} {group} species.")

    # Native status
    if data.get("native_status"):
        parts.append(" ".join(data["native_status"]) + ".")

    # Only badges mentioning one of these ecological keywords are listed.
    keywords = (
        "insektenpflanze",
        "raupenfutter",
        "vogelschutz",
        "vogelnähr",
        "bienenweide",
    )
    notable = [
        tag for tag in data.get("badges", [])
        if any(kw in tag.lower() for kw in keywords)
    ]
    if notable:
        parts.append("Tags: " + ", ".join(notable) + ".")

    return " ".join(parts) if parts else None
+
+
def scrape_species(html):
    """Parse a NaturaDB plant page into a dict of wildlife figures.

    Each table value is looked up by its German label and reduced to an
    integer (or ``None`` when absent).  ``native_status`` and ``badges``
    are lists taken from the page's chip elements.
    """
    def cell(label):
        return extract_td_value(html, label)

    # These two cells feed both a total and a specialist count.
    bees = cell("Wildbienen")
    caterpillars = cell("Raupen")

    return {
        # Nectar and pollen ratings
        "nectar": parse_nectar_pollen(cell("Nektarwert")),
        "pollen": parse_nectar_pollen(cell("Pollenwert")),
        # Wild bees
        "wildbienen_count": parse_count(bees),
        "wildbienen_specialists": parse_specialist_count(bees),
        # Butterflies / moths
        "schmetterlinge_count": parse_count(cell("Schmetterlinge")),
        # Caterpillar hosts
        "raupen_count": parse_count(caterpillars),
        "raupen_specialists": parse_specialist_count(caterpillars),
        # Hoverflies, beetles, birds, mammals
        "schwebfliegen_count": parse_count(cell("Schwebfliegen")),
        "kaefer_count": parse_count(cell("Käfer")),
        "vogelarten_count": parse_count(cell("fressende Vogelarten")),
        "saeugetier_count": parse_count(cell("fressende Säugetierarten")),
        # Chip-based data
        "native_status": extract_native_status(html),
        "badges": extract_badge_tags(html),
    }
+
+
def has_any_data(data):
    """Tell whether a scraped dict holds at least one meaningful value.

    The list fields ``native_status`` and ``badges`` count when non-empty;
    every other field counts when it is not ``None`` (so 0 counts).
    """
    list_fields = ("native_status", "badges")
    return any(
        bool(value) if key in list_fields else value is not None
        for key, value in data.items()
    )
+
+
def _fetch_all_species(per_page=200):
    """Return every species record from HerbAPI, following pagination.

    Understands both ``{"pagination": {"total_pages": N}}`` and
    ``{"total": N, "per_page": M}`` response shapes.  When neither is
    present only the first page is used, which is what the script did
    before pagination was followed.
    """
    species = []
    page = 1
    while True:
        resp = api_get(f"/species?per_page={per_page}&page={page}")
        batch = resp["data"]
        if not batch:
            break  # also guards against looping on a misreported total
        species.extend(batch)

        pagination = resp.get("pagination")
        if isinstance(pagination, dict) and "total_pages" in pagination:
            if page >= pagination["total_pages"]:
                break
        elif "total" in resp:
            if page * resp.get("per_page", per_page) >= resp["total"]:
                break
        else:
            break
        page += 1
    return species


def main():
    """Enrich HerbAPI species with wildlife data scraped from NaturaDB.

    For every species without a ``wildlife_value`` the NaturaDB page is
    fetched and parsed; if anything useful is found, the full species
    record is read from HerbAPI, updated and PUT back.  Prints one status
    line per species and a summary at the end.
    """
    print("Fetching species list from HerbAPI...")
    species_list = _fetch_all_species()
    print(f"Found {len(species_list)} species.\n")

    enriched = 0
    skipped_has_data = 0
    skipped_not_found = 0
    skipped_no_data = 0
    errors = 0

    for i, sp in enumerate(species_list):
        slug = sp["slug"]
        name = sp["name_scientific"]
        existing_wv = sp.get("wildlife_value")

        # Only enrich if wildlife_value is empty/null
        if existing_wv:
            print(f"[{i+1:3d}/{len(species_list)}] {slug:40s} SKIP (already has data)")
            skipped_has_data += 1
            continue

        print(f"[{i+1:3d}/{len(species_list)}] {slug:40s} ", end="", flush=True)

        # Fetch NaturaDB page; pause after every request to stay polite.
        html = fetch_naturadb(name)
        time.sleep(DELAY)

        if html is None:
            print("NOT FOUND on NaturaDB")
            skipped_not_found += 1
            continue

        # Parse wildlife data
        data = scrape_species(html)

        if not has_any_data(data):
            print("no wildlife data on page")
            skipped_no_data += 1
            continue

        # Build wildlife_value string
        wildlife_value = build_wildlife_value(data)
        if not wildlife_value:
            print("no wildlife data extracted")
            skipped_no_data += 1
            continue

        # GET full species, merge, PUT back
        try:
            full = api_get(f"/species/{slug}")
            full["wildlife_value"] = wildlife_value

            # Remove read-only / computed fields that the PUT endpoint might reject
            for key in ("created_at", "updated_at", "family"):
                full.pop(key, None)

            api_put(f"/species/{full['id']}", full)
            print(f"ENRICHED -> {wildlife_value[:80]}...")
            enriched += 1
        except Exception as e:
            # Per-species boundary: report and continue with the next one.
            print(f"API ERROR: {e}")
            errors += 1

    print("\n" + "=" * 70)
    print("DONE. Results:")
    print(f" Enriched: {enriched}")
    print(f" Already had data: {skipped_has_data}")
    print(f" Not on NaturaDB: {skipped_not_found}")
    print(f" No wildlife data: {skipped_no_data}")
    print(f" Errors: {errors}")
    print(f" Total: {len(species_list)}")


if __name__ == "__main__":
    main()
diff --git a/tools/scrapers/scrape_reinsaat.py b/tools/scrapers/scrape_reinsaat.py
new file mode 100644
index 0000000..60ff189
--- /dev/null
+++ b/tools/scrapers/scrape_reinsaat.py
@@ -0,0 +1,560 @@
+#!/usr/bin/env python3
+"""
+Scrape cultivar data from Reinsaat (reinsaat.at) and push into HerbAPI.
+
+Strategy:
+1. Fetch category pages, recursively discover product pages via JSON-LD detection
+2. Extract structured data from JSON-LD Product schema + HTML text for growing data
+3. Match Latin names to existing species in the API
+4. Create cultivar records and link them to Reinsaat supplier
+"""
+
+import json
+import re
+import ssl
+import time
+import urllib.request
+import urllib.error
+import urllib.parse
+from html.parser import HTMLParser
+from dataclasses import dataclass
+from typing import Optional
+
+# ── Config ──────────────────────────────────────────────────────────────────
# Base URL that api_get()/api_post() prepend to every HerbAPI path.
API_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
# NOTE(review): bearer token is hard-coded in source control; consider
# loading it from the environment or a secrets store and rotating this value.
AUTH_TOKEN = "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
# Supplier record every scraped cultivar is linked to.
REINSAAT_SUPPLIER_ID = "019ced24-1702-72d1-9acc-90435441a5c4"
DELAY = 0.5 # seconds between requests
# Sent as the User-Agent header on reinsaat.at requests.
USER_AGENT = "HerbAPI-Scraper/1.0 (florian.berthold@sub-net.at)"
+
+# ── Categories to scrape ────────────────────────────────────────────────────
+# (category_url, default_species_hint for leaf pages in this category)
# Each entry is (category_url, default_species): the second element is the
# Latin name assigned to a product when none is found on its page, or None
# when the category mixes species and no default is safe.
CATEGORIES = [
    ("https://www.reinsaat.at/shop/DE/tomaten_paradeiser/", "Solanum lycopersicum"),
    ("https://www.reinsaat.at/shop/DE/kuechen-_und_gewuerzkraeuter/", None),
    ("https://www.reinsaat.at/shop/DE/kuerbis/", None),
    ("https://www.reinsaat.at/shop/DE/zucchini/", "Cucurbita pepo"),
    ("https://www.reinsaat.at/shop/DE/bohnen/", None),
    ("https://www.reinsaat.at/shop/DE/karotten_moehren_1/", "Daucus carota"),
    ("https://www.reinsaat.at/shop/DE/rote_ruebe/", "Beta vulgaris"),
    ("https://www.reinsaat.at/shop/DE/blumen_und_heilkraeuter/", None),
]

# ── Known Latin name genera we can match ────────────────────────────────────
# Regex alternation of genus names; interpolated into LATIN_PATTERN below.
KNOWN_GENERA = (
    "Solanum|Cucurbita|Vicia|Phaseolus|Glycine|Daucus|Beta|Borago|Lavandula|"
    "Salvia|Melissa|Thymus|Calendula|Allium|Ocimum|Satureja|Origanum|Anethum|"
    "Foeniculum|Carum|Nigella|Levisticum|Rumex|Majorana|Hyssopus|Coriandrum|"
    "Petroselinum|Eruca|Tropaeolum|Lupinus|Helianthus|Tagetes|Zinnia|Cosmos|"
    "Papaver|Centaurea|Matricaria|Chrysanthemum|Antirrhinum|Lathyrus|Ipomoea|"
    "Phacelia|Trifolium|Symphytum|Urtica|Fragaria|Sambucus"
)

# Matches "<Genus> <epithet>" where the genus is one of KNOWN_GENERA and the
# epithet is lowercase ASCII, optionally followed by the author "L"/"L." and
# by an infraspecific rank ("ssp.", "var." or "subsp.") plus its epithet.
# Compiled without flags, so the match is case-sensitive.
LATIN_PATTERN = re.compile(
    rf'((?:{KNOWN_GENERA})\s+[a-z]+(?:\s+L\.?)?(?:\s+(?:ssp|var|subsp)\.\s+[a-z]+)?)'
)
+
+
+# ── HTML helpers ────────────────────────────────────────────────────────────
class TextExtractor(HTMLParser):
    """Collect the visible text chunks of an HTML document.

    After ``feed()``, ``parts`` holds every non-empty, stripped text node in
    document order, excluding anything nested inside ``<script>``,
    ``<style>`` or ``<noscript>``.
    """

    _HIDDEN_TAGS = ("script", "style", "noscript")

    def __init__(self):
        super().__init__()
        self.parts = []
        # Number of hidden tags currently open; text is kept only at 0.
        self._skip = 0

    def handle_starttag(self, tag, attrs):
        if tag in self._HIDDEN_TAGS:
            self._skip += 1

    def handle_endtag(self, tag):
        # Never drop below zero on a stray closing tag.
        if self._skip and tag in self._HIDDEN_TAGS:
            self._skip -= 1

    def handle_data(self, data):
        if self._skip:
            return
        text = data.strip()
        if text:
            self.parts.append(text)
+
+
def extract_links(html: str, base_url: str) -> list[str]:
    """Return the absolute targets of all ``<a href="...">`` links in ``html``.

    Relative hrefs are resolved against ``base_url``.  Empty hrefs,
    fragment-only links and ``javascript:`` links are skipped.  Each URL
    appears once, in order of first appearance.  Only anchor tags are
    considered, so ``<link href>`` stylesheets and similar are ignored.
    """
    links = []
    seen = set()
    # "<a" followed by whitespace, so <abbr>/<area>/<article> do not match.
    for m in re.finditer(r'<a\s[^>]*?href="([^"]*)"', html, re.IGNORECASE):
        href = m.group(1)
        if not href or href.startswith(("#", "javascript:")):
            continue
        full = urllib.parse.urljoin(base_url, href)
        if full not in seen:
            seen.add(full)
            links.append(full)
    return links
+
+
def _jsonld_is_product(node) -> bool:
    """Tell whether a JSON-LD node is a dict typed as ``Product``."""
    if not isinstance(node, dict):
        return False
    node_type = node.get("@type")
    if isinstance(node_type, list):
        return "Product" in node_type
    return node_type == "Product"


def extract_jsonld_product(html: str) -> Optional[dict]:
    """Extract the JSON-LD Product object from HTML, if present.

    Scans every ``<script type="application/ld+json">`` block.  A block may
    hold a single object, an array of objects, or an object with an
    ``@graph`` array; the first node typed ``Product`` is returned.  Blocks
    that are not valid JSON are skipped.

    Returns ``None`` when the page carries no Product node.
    """
    pattern = r'<script[^>]*type=["\']application/ld\+json["\'][^>]*>(.*?)</script>'
    for m in re.finditer(pattern, html, re.DOTALL | re.IGNORECASE):
        try:
            data = json.loads(m.group(1))
        except (json.JSONDecodeError, ValueError):
            continue

        if isinstance(data, list):
            candidates = data
        elif isinstance(data, dict):
            graph = data.get("@graph")
            candidates = [data] + (graph if isinstance(graph, list) else [])
        else:
            candidates = []

        for node in candidates:
            if _jsonld_is_product(node):
                return node
    return None
+
+
+# ── HTTP helpers ────────────────────────────────────────────────────────────
_ssl_ctx = ssl.create_default_context()


def fetch_url(url: str, retries: int = 2) -> str:
    """Fetch ``url`` and return the decoded body, retrying transient failures.

    Args:
        url: Page to download.
        retries: Number of extra attempts after the first one.  A 2 s pause
            separates attempts.

    Returns:
        The response body decoded with the charset from the Content-Type
        header (UTF-8 when absent or unknown); undecodable bytes are
        replaced rather than aborting the page.  An empty string is
        returned only when ``retries`` is negative, i.e. no attempt is made.

    Raises:
        urllib.error.HTTPError: immediately for client errors (4xx other
            than 408/429), since repeating the request cannot succeed; for
            other statuses once all attempts are used up.
        urllib.error.URLError, TimeoutError, ConnectionError: once all
            attempts are used up.
    """
    req = urllib.request.Request(url, headers={
        "User-Agent": USER_AGENT,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "de-AT,de;q=0.9,en;q=0.5",
    })
    for attempt in range(retries + 1):
        is_last = attempt >= retries
        try:
            with urllib.request.urlopen(req, timeout=30, context=_ssl_ctx) as resp:
                charset = resp.headers.get_content_charset() or "utf-8"
                body = resp.read()
        except urllib.error.HTTPError as err:
            # HTTPError is a URLError subclass, so it must be handled first.
            permanent = 400 <= err.code < 500 and err.code not in (408, 429)
            if is_last or permanent:
                raise
        except (urllib.error.URLError, TimeoutError, ConnectionError):
            if is_last:
                raise
        else:
            try:
                return body.decode(charset, errors="replace")
            except LookupError:
                # Server announced a charset Python does not know.
                return body.decode("utf-8", errors="replace")
        time.sleep(2)
    return ""
+
+
def api_get(path: str):
    """Send an authenticated GET to HerbAPI and return the parsed JSON body.

    ``path`` (including any query string) is appended to ``API_BASE``.  The
    request times out after 15 seconds; urllib errors propagate to the
    caller.
    """
    request_headers = {
        "Authorization": f"Bearer {AUTH_TOKEN}",
        "Accept": "application/json",
    }
    request = urllib.request.Request(f"{API_BASE}{path}", headers=request_headers)
    with urllib.request.urlopen(request, timeout=15) as response:
        payload = response.read()
    return json.loads(payload)
+
+
def api_post(path: str, data: dict):
    """Send ``data`` as a JSON POST to HerbAPI and return the parsed response.

    ``path`` is appended to ``API_BASE``; the request times out after 15
    seconds.  On an HTTP error status the first 500 characters of the error
    body are printed and the ``HTTPError`` is re-raised for the caller.
    """
    request = urllib.request.Request(
        f"{API_BASE}{path}",
        data=json.dumps(data).encode("utf-8"),
        method="POST",
        headers={
            "Authorization": f"Bearer {AUTH_TOKEN}",
            "Content-Type": "application/json",
            "Accept": "application/json",
        },
    )
    try:
        with urllib.request.urlopen(request, timeout=15) as response:
            return json.loads(response.read())
    except urllib.error.HTTPError as err:
        # Surface the server's explanation before handing the error on.
        details = err.read().decode("utf-8", errors="replace")
        print(f" API ERROR {err.code}: {details[:500]}")
        raise
+
+
+# ── Species matching ────────────────────────────────────────────────────────
def load_species() -> dict:
    """Load all species from the API.

    Pages through ``/species`` 100 records at a time.  Both pagination
    shapes seen in this script are understood: a ``pagination`` object with
    ``total_pages``, and the flat ``{data, total, page, per_page}`` form
    that ``main`` relies on for cultivars.  When the response carries
    neither, only the first page is used.

    Returns:
        Dict mapping the lower-cased, stripped scientific name to the
        species dict.
    """
    result = {}
    page_size = 100
    page = 1
    while True:
        data = api_get(f"/species?per_page={page_size}&page={page}")
        species_list = data.get("data", data) if isinstance(data, dict) else data
        if not species_list:
            break  # also guards against looping on a misreported total
        for s in species_list:
            key = s["name_scientific"].lower().strip()
            result[key] = s

        if not isinstance(data, dict):
            break  # bare list response: no pagination information
        if "pagination" in data:
            if page >= data["pagination"].get("total_pages", 1):
                break
        elif "total" in data:
            if page * data.get("per_page", page_size) >= data["total"]:
                break
        else:
            break
        page += 1
    return result
+
+
def match_species(latin_name: str, species_map: dict) -> Optional[dict]:
    """Find the species record that best fits a scraped Latin name.

    ``species_map`` maps lower-cased scientific names to species dicts.
    Lookup order: the cleaned full name, then its first two words, then the
    first species in ``species_map`` sharing the same genus (a loose
    fallback).  Returns ``None`` when nothing fits or the name is empty.
    """
    if not latin_name:
        return None

    cleaned = latin_name.strip()
    for noise in (
        r'\s+L\.\s*$',                    # trailing "L." author citation
        r'\s+[A-Z][a-z]*\.\s*$',          # other abbreviated trailing author
        r'\s+(?:ssp|subsp|var)\.\s+\S+',  # infraspecific rank and its epithet
    ):
        cleaned = re.sub(noise, '', cleaned)

    key = cleaned.lower().strip()
    words = key.split()

    # Exact name first, then plain "genus species".
    candidates = [key]
    if len(words) >= 2:
        candidates.append(f"{words[0]} {words[1]}")
    for candidate in candidates:
        if candidate in species_map:
            return species_map[candidate]

    if not words:
        return None

    # Genus-only match (less reliable, but useful for Borago, etc.)
    genus_prefix = words[0] + " "
    return next(
        (record for name, record in species_map.items() if name.startswith(genus_prefix)),
        None,
    )
+
+
+# ── Product data extraction ─────────────────────────────────────────────────
@dataclass
class ProductData:
    """Scraped attributes of a single Reinsaat product page.

    Every field has a default, so ``parse_product`` creates an instance
    with just ``url`` and then fills in whatever the page provides.
    """
    name: str = ""  # JSON-LD "name"
    latin_name: str = ""  # botanical name found on the page, else the category default
    description: str = ""  # JSON-LD "description"
    sku: str = ""  # JSON-LD "model"; sent to the API as the supplier article number
    url: str = ""  # product page URL
    # NOTE(review): never changed by the visible code; presumably every
    # Reinsaat product is organic. Confirm.
    is_organic: bool = True
    sowing_depth_cm: Optional[float] = None  # mean of the range when a range is given
    row_spacing_cm: Optional[float] = None  # first value of a "ROW x PLANT cm" pattern
    plant_spacing_cm: Optional[float] = None  # second value of a "ROW x PLANT cm" pattern
    germination_temp_c: Optional[float] = None  # mean of the matched temperature value(s)
    perennial: bool = False  # set when a perennial keyword appears in the page text
+
+
def _de_float(text: str) -> float:
    """Convert a number written with a decimal comma or point to float."""
    return float(text.replace(",", "."))


def parse_product(html: str, url: str, default_species: Optional[str] = None) -> Optional[ProductData]:
    """Parse a product page. Returns ProductData or None if not a product page.

    A page counts as a product page when it carries a JSON-LD ``Product``
    object.  Name, description and SKU come from that object; the Latin
    name and growing data are pattern-matched in the page's visible text.

    Args:
        html: Raw page HTML.
        url: URL the page was fetched from (stored on the result).
        default_species: Latin name to use when none is found on the page.
    """
    jsonld = extract_jsonld_product(html)
    if not jsonld:
        return None  # Not a product page

    product = ProductData(url=url)

    # ── From JSON-LD ──
    # `or ""` tolerates explicit nulls; str() tolerates numeric models.
    product.name = str(jsonld.get("name") or "").strip()
    product.description = str(jsonld.get("description") or "").strip()
    product.sku = str(jsonld.get("model") or "").strip()

    # ── Extract full text for pattern matching ──
    extractor = TextExtractor()
    extractor.feed(html)
    full_text = " ".join(extractor.parts)

    # ── Latin name ──
    latin = LATIN_PATTERN.search(full_text)
    if latin:
        product.latin_name = latin.group(1).strip()
    # Also check <i>/<em> tags in HTML.  \b keeps <img>/<input>/<iframe>
    # from being mistaken for <i>.
    if not product.latin_name:
        for italic in re.finditer(
            r'<(?:i|em)\b[^>]*>(.*?)</(?:i|em)>', html, re.IGNORECASE | re.DOTALL
        ):
            inner = re.sub(r'<[^>]+>', '', italic.group(1)).strip()
            im = LATIN_PATTERN.search(inner)
            if im:
                product.latin_name = im.group(1).strip()
                break
    if not product.latin_name and default_species:
        product.latin_name = default_species

    # ── Sowing depth ──
    # First matching pattern wins; a range is stored as its mean.
    depth_pats = [
        r'(?:Saattiefe|Aussaattiefe|Ablagetiefe)[:\s]*(?:ca\.?\s*)?(\d+(?:[.,]\d+)?)\s*[-–]\s*(\d+(?:[.,]\d+)?)\s*cm',
        r'(?:Saattiefe|Aussaattiefe|Ablagetiefe)[:\s]*(?:ca\.?\s*)?(\d+(?:[.,]\d+)?)\s*cm',
        r'(\d+(?:[.,]\d+)?)\s*[-–]\s*(\d+(?:[.,]\d+)?)\s*cm\s+(?:tief|Tiefe)',
    ]
    for pat in depth_pats:
        dm = re.search(pat, full_text, re.IGNORECASE)
        if dm:
            vals = [_de_float(dm.group(i)) for i in range(1, dm.lastindex + 1)]
            product.sowing_depth_cm = sum(vals) / len(vals)
            break

    # Fallback: look in raw HTML for common depth patterns like "0,5–1 cm" near depth keywords
    if product.sowing_depth_cm is None:
        dm = re.search(
            r'(?:Saattiefe|Ablagetiefe|Aussaattiefe|Saatgutablage)\D{0,30}?(\d+(?:[.,]\d+)?)\s*[-–]\s*(\d+(?:[.,]\d+)?)\s*cm',
            html, re.IGNORECASE
        )
        if dm:
            product.sowing_depth_cm = (_de_float(dm.group(1)) + _de_float(dm.group(2))) / 2

    # ── Spacing ──
    # Look for "ROW x PLANT cm" patterns
    spacing_pats = [
        # "30–40 x 2–4 cm" (range x range)
        r'(\d+)\s*[-–]\s*(\d+)\s*[x×]\s*(\d+)\s*[-–]\s*(\d+)\s*cm',
        # "100 x 50 cm" (simple)
        r'(\d+(?:[.,]\d+)?)\s*[x×]\s*(\d+(?:[.,]\d+)?)\s*cm',
    ]
    for pat in spacing_pats:
        matches = re.findall(pat, full_text, re.IGNORECASE)
        if matches:
            # Prefer the last match (often the more relevant outdoor spacing)
            groups = matches[-1]
            if len(groups) == 4:
                product.row_spacing_cm = (float(groups[0]) + float(groups[1])) / 2
                product.plant_spacing_cm = (float(groups[2]) + float(groups[3])) / 2
            elif len(groups) == 2:
                product.row_spacing_cm = _de_float(groups[0])
                product.plant_spacing_cm = _de_float(groups[1])
            break

    # ── Germination temperature ──
    temp_pats = [
        r'(?:Keimtemperatur|Keimtemp)[.:\s]*(?:ca\.?\s*)?(\d+)\s*[-–]\s*(\d+)\s*°?\s*C',
        # A real separator is required between the two numbers; with an
        # optional one, "28°C" was split into 2 and 8 and stored as 5.0.
        r'(?<!\d)(\d+)\s*(?:[-–]|bis|und)\s*(\d+)\s*°\s*C',
        r'(?:mindestens|mind\.)\s*(\d+)\s*°\s*C',
    ]
    for pat in temp_pats:
        tm = re.search(pat, full_text, re.IGNORECASE)
        if tm:
            vals = [float(tm.group(i)) for i in range(1, tm.lastindex + 1)]
            # Sanity check: germination temps are typically 5-35°C.  An
            # implausible hit does not stop the search; the next pattern
            # still gets a chance.
            avg = sum(vals) / len(vals)
            if 5 <= avg <= 40:
                product.germination_temp_c = avg
                break

    # ── Perennial ──
    perennial_pats = [r'mehrj[aä]hrig', r'winterhart', r'ausdauernd', r'Halbstrauch', r'Staude']
    if any(re.search(pat, full_text, re.IGNORECASE) for pat in perennial_pats):
        product.perennial = True

    return product
+
+
+# ── Recursive product discovery ─────────────────────────────────────────────
def discover_products(
    category_url: str,
    default_species: Optional[str],
    max_depth: int = 3,
    _depth: int = 0,
    _visited: set = None,
) -> list[ProductData]:
    """Walk a category tree and return every product page found beneath it.

    A page that parses as a product is returned as a one-element list.  Any
    other page is treated as a (sub)category: links exactly one path level
    below it on www.reinsaat.at are followed recursively, up to
    ``max_depth`` levels.  ``_depth`` and ``_visited`` are recursion state;
    pass a shared ``_visited`` set to avoid refetching across calls.
    Fetch errors are printed and yield an empty list.
    """
    if _visited is None:
        _visited = set()
    if _depth > max_depth or category_url in _visited:
        return []
    _visited.add(category_url)

    pad = " " * (_depth + 1)
    print(f"{pad}Fetching: {category_url}")

    try:
        html = fetch_url(category_url)
        time.sleep(DELAY)
    except Exception as exc:
        print(f"{pad} ERROR: {exc}")
        return []

    # A product page ends the descent.
    product = parse_product(html, category_url, default_species)
    if product:
        return [product]

    # Category page: keep only direct children of this category's path.
    parent_prefix = urllib.parse.urlparse(category_url).path.rstrip("/") + "/"
    children = {}  # used as an ordered set: first-seen order, no duplicates
    for link in extract_links(html, category_url):
        target = urllib.parse.urlparse(link)
        if target.netloc not in ("", "www.reinsaat.at"):
            continue
        path = target.path.rstrip("/")
        if not path.startswith(parent_prefix):
            continue
        tail = path[len(parent_prefix):]
        if not tail or "/" in tail:
            continue  # same page, or more than one level deeper
        child = f"https://www.reinsaat.at{path}/"
        if child not in _visited:
            children[child] = None

    child_links = list(children)
    print(f"{pad} Found {len(child_links)} child links")

    found = []
    for child in child_links:
        found.extend(
            discover_products(child, default_species, max_depth, _depth + 1, _visited)
        )
    return found
+
+
+# ── Main ────────────────────────────────────────────────────────────────────
def _load_existing_cultivars() -> dict:
    """Return ``{(species_id, lower-cased name): cultivar_id}`` for all cultivars."""
    existing = {}
    page = 1
    while True:
        data = api_get(f"/cultivars?per_page=100&page={page}")
        clist = data.get("data", data) if isinstance(data, dict) else data
        if not clist:
            break
        for c in clist:
            existing[(c["species_id"], c["name"].lower())] = c["id"]
        # Check pagination - API uses {data, total, page, per_page} format
        if not isinstance(data, dict):
            break
        total = data.get("total", len(clist))
        per_page = data.get("per_page", 100)
        if page * per_page >= total:
            break
        page += 1
    return existing


def _link_reinsaat_supplier(cultivar_id: str, product: ProductData) -> None:
    """Attach the Reinsaat supplier (URL and SKU) to a cultivar; raises on failure."""
    api_post(f"/cultivars/{cultivar_id}/suppliers", {
        "supplier_id": REINSAAT_SUPPLIER_ID,
        "product_url": product.url,
        "article_number": product.sku,
    })


def _build_cultivar_payload(species_id: str, product: ProductData) -> dict:
    """Build the POST /cultivars payload; optional figures only when known."""
    payload = {
        "species_id": species_id,
        "name": product.name,
        "name_de": product.name,
        "name_en": "",
        "description": product.description,
        "is_organic": product.is_organic,
        "perennial": product.perennial,
    }
    if product.sowing_depth_cm is not None:
        payload["planting_depth_cm"] = round(product.sowing_depth_cm, 2)
    if product.row_spacing_cm is not None:
        payload["row_spacing_cm"] = round(product.row_spacing_cm, 1)
    if product.plant_spacing_cm is not None:
        payload["plant_spacing_cm"] = round(product.plant_spacing_cm, 1)
    if product.germination_temp_c is not None:
        payload["germination_temp_c"] = round(product.germination_temp_c, 1)
    return payload


def main():
    """Scrape Reinsaat products and create/link cultivars in HerbAPI.

    Steps: load species and existing cultivars from the API, discover
    product pages under every entry of ``CATEGORIES``, then for each
    product either link the existing cultivar to the Reinsaat supplier or
    create a new cultivar and link it.  Progress and a final summary are
    printed.
    """
    print("=" * 70)
    print("Reinsaat Scraper -> HerbAPI")
    print("=" * 70)

    # Load species
    print("\n[1] Loading species from API...")
    species_map = load_species()
    sci_names = [k for k in species_map if " " in k]
    print(f" {len(sci_names)} species loaded:")
    for k in sorted(sci_names):
        s = species_map[k]
        print(f" {s['name_scientific']:40s} {s['id'][:12]}...")

    # Load existing cultivars
    print("\n[2] Loading existing cultivars...")
    existing_cultivars = _load_existing_cultivars()
    print(f" {len(existing_cultivars)} existing cultivars")

    # Discover products from all categories
    print("\n[3] Discovering products from Reinsaat categories...")
    all_products: list[ProductData] = []
    visited: set[str] = set()

    for cat_url, species_hint in CATEGORIES:
        print(f"\n Category: {cat_url}")
        products = discover_products(cat_url, species_hint, max_depth=3, _visited=visited)
        all_products.extend(products)
        print(f" -> {len(products)} products from this category")

    print(f"\n Total products discovered: {len(all_products)}")

    # Deduplicate by URL, keeping the first occurrence
    seen_urls = set()
    unique_products = []
    for p in all_products:
        if p.url not in seen_urls:
            seen_urls.add(p.url)
            unique_products.append(p)
    all_products = unique_products
    print(f" Unique products: {len(all_products)}")

    # Process products
    print("\n[4] Creating cultivars in API...")
    stats = {"created": 0, "skipped_no_species": 0, "skipped_exists": 0, "errors": 0, "linked": 0}

    for i, product in enumerate(all_products):
        pct = (i + 1) / len(all_products) * 100
        print(f"\n [{i+1}/{len(all_products)}] ({pct:.0f}%) {product.name}")

        # Match species
        species = match_species(product.latin_name, species_map)
        if not species:
            print(f" Skip: no species match for '{product.latin_name}'")
            stats["skipped_no_species"] += 1
            continue

        species_id = species["id"]
        print(f" Species: {species['name_scientific']}")
        print(f" SKU: {product.sku}, Depth: {product.sowing_depth_cm}, "
              f"Spacing: {product.row_spacing_cm}x{product.plant_spacing_cm}, "
              f"Temp: {product.germination_temp_c}, Perennial: {product.perennial}")

        # Check duplicates
        key = (species_id, product.name.lower())
        if key in existing_cultivars:
            # Still try to link supplier if cultivar exists
            cultivar_id = existing_cultivars[key]
            print(f" Exists: {cultivar_id[:12]}... - checking supplier link")
            try:
                _link_reinsaat_supplier(cultivar_id, product)
            except Exception as e:
                # Usually "already linked"; report it instead of hiding
                # network or server failures behind a silent pass.
                print(f" Not linked (already linked or API error): {e}")
            else:
                print(f" Linked to Reinsaat (SKU: {product.sku})")
                stats["linked"] += 1
            stats["skipped_exists"] += 1
            continue

        # Create cultivar
        try:
            result = api_post("/cultivars", _build_cultivar_payload(species_id, product))
            cultivar_id = result["id"]
            print(f" Created: {cultivar_id}")
            stats["created"] += 1
            existing_cultivars[key] = cultivar_id
        except Exception as e:
            print(f" FAILED to create: {e}")
            stats["errors"] += 1
            continue

        # Link to supplier
        try:
            _link_reinsaat_supplier(cultivar_id, product)
            print(f" Linked to Reinsaat (SKU: {product.sku})")
            stats["linked"] += 1
        except Exception as e:
            print(f" FAILED to link supplier: {e}")

    # Summary
    print("\n" + "=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f" Created: {stats['created']}")
    print(f" Linked to supplier: {stats['linked']}")
    print(f" Skipped (no species): {stats['skipped_no_species']}")
    print(f" Skipped (exists): {stats['skipped_exists']}")
    print(f" Errors: {stats['errors']}")
    print("=" * 70)


if __name__ == "__main__":
    main()
diff --git a/tools/scrapers/scrape_reinsaat_v2.py b/tools/scrapers/scrape_reinsaat_v2.py
new file mode 100644
index 0000000..8b9f2f6
--- /dev/null
+++ b/tools/scrapers/scrape_reinsaat_v2.py
@@ -0,0 +1,770 @@
+#!/usr/bin/env python3
+"""
+Reinsaat Scraper v2 — scrape ALL Reinsaat categories, match species by extracting
+genus+species from extended botanical names, create/enrich cultivars, link supplier.
+
+Uses direct PostgreSQL access (psycopg2) for speed and reliability.
+"""
+
+import json
+import re
+import ssl
+import sys
+import time
+import uuid
+import html as html_mod
+import urllib.request
+import urllib.error
+import urllib.parse
+from dataclasses import dataclass, field
+from typing import Optional
+
+# Unbuffered output
+sys.stdout.reconfigure(line_buffering=True)
+sys.stderr.reconfigure(line_buffering=True)
+
+import psycopg2
+import psycopg2.extras
+
+# ── Config ──────────────────────────────────────────────────────────────────
+# PostgreSQL connection settings; db_connect() passes them to psycopg2.connect().
+DB_HOST = "10.31.3.90"
+DB_NAME = "herbapi"
+DB_USER = "herbapi"
+# NOTE(review): this database password is committed in plain text. It should be
+# rotated and loaded from the environment or a secrets store — TODO confirm owner.
+DB_PASS = "_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj"
+
+# Supplier id (cast to uuid in the SQL) used for every cultivar_suppliers
+# lookup and insert performed by main().
+REINSAAT_SUPPLIER_ID = "019ced24-1702-72d1-9acc-90435441a5c4"
+# Seconds discover_products() sleeps after each successful page fetch.
+DELAY = 0.3
+# User-Agent header value sent by fetch_url().
+USER_AGENT = "HerbAPI-Scraper/2.0 (florian.berthold@sub-net.at)"
+
+# ── All Reinsaat categories ────────────────────────────────────────────────
+# Seed URLs for the crawl: main() passes each entry to discover_products(),
+# which follows only links located directly below the given category path.
+CATEGORIES = [
+ "https://www.reinsaat.at/shop/DE/bohnen/",
+ "https://www.reinsaat.at/shop/DE/erbsen/",
+ "https://www.reinsaat.at/shop/DE/gurken/",
+ "https://www.reinsaat.at/shop/DE/karotten_moehren_1/",
+ "https://www.reinsaat.at/shop/DE/knollenfenchel/",
+ "https://www.reinsaat.at/shop/DE/kohlgewaechse/",
+ "https://www.reinsaat.at/shop/DE/kuerbis/",
+ "https://www.reinsaat.at/shop/DE/mais/",
+ "https://www.reinsaat.at/shop/DE/mangold/",
+ "https://www.reinsaat.at/shop/DE/melanzani_1/",
+ "https://www.reinsaat.at/shop/DE/melone/",
+ "https://www.reinsaat.at/shop/DE/paprika/",
+ "https://www.reinsaat.at/shop/DE/pastinaken_1/",
+ "https://www.reinsaat.at/shop/DE/petersilie/",
+ "https://www.reinsaat.at/shop/DE/pfefferoni_chili/",
+ "https://www.reinsaat.at/shop/DE/porree/",
+ "https://www.reinsaat.at/shop/DE/radies_rettich/",
+ "https://www.reinsaat.at/shop/DE/rote_ruebe/",
+ "https://www.reinsaat.at/shop/DE/salate/",
+ "https://www.reinsaat.at/shop/DE/schwarzwurzeln/",
+ "https://www.reinsaat.at/shop/DE/sellerie/",
+ "https://www.reinsaat.at/shop/DE/spinat/",
+ "https://www.reinsaat.at/shop/DE/tomaten_paradeiser/",
+ "https://www.reinsaat.at/shop/DE/wurzelpetersilie_1/",
+ "https://www.reinsaat.at/shop/DE/zucchini/",
+ "https://www.reinsaat.at/shop/DE/zwiebel_knoblauch/",
+ "https://www.reinsaat.at/shop/DE/kuechen-_und_gewuerzkraeuter/",
+ "https://www.reinsaat.at/shop/DE/blumen_und_heilkraeuter/",
+ "https://www.reinsaat.at/shop/DE/gruenduengung/",
+]
+
+# ── HTTP ────────────────────────────────────────────────────────────────────
+# One TLS context with the ssl module's default settings, created once at import
+# time and handed to urlopen() by fetch_url().
+_ssl_ctx = ssl.create_default_context()
+
+
+def fetch_url(url: str, retries: int = 2) -> str:
+    """Fetch ``url`` and return the response body decoded to text.
+
+    Transient failures (network errors, timeouts, HTTP 408/429 and 5xx) are
+    retried up to ``retries`` times with a 2-second pause between attempts.
+    Other HTTP 4xx responses are permanent (e.g. a dead link), so they are
+    raised immediately instead of costing extra sleeps.
+
+    Args:
+        url: Absolute URL to download.
+        retries: Number of additional attempts after the first one.
+
+    Returns:
+        The decoded body. Bytes that are invalid in the declared charset are
+        replaced rather than failing the whole page. An empty string is
+        returned only when ``retries`` is negative (no attempt is made).
+
+    Raises:
+        urllib.error.HTTPError, urllib.error.URLError, TimeoutError: when the
+            final attempt fails or the error is not worth retrying.
+    """
+    req = urllib.request.Request(url, headers={
+        "User-Agent": USER_AGENT,
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+        "Accept-Language": "de-AT,de;q=0.9,en;q=0.5",
+    })
+    for attempt in range(retries + 1):
+        try:
+            with urllib.request.urlopen(req, timeout=30, context=_ssl_ctx) as resp:
+                charset = resp.headers.get_content_charset() or "utf-8"
+                body = resp.read()
+        except urllib.error.HTTPError as err:
+            # HTTPError subclasses URLError, so it has to be handled first.
+            permanent = 400 <= err.code < 500 and err.code not in (408, 429)
+            if permanent or attempt >= retries:
+                raise
+        except (urllib.error.URLError, TimeoutError):
+            if attempt >= retries:
+                raise
+        else:
+            try:
+                return body.decode(charset, errors="replace")
+            except LookupError:
+                # The server announced a charset Python does not know.
+                return body.decode("utf-8", errors="replace")
+        time.sleep(2)
+    return ""
+
+
+# ── HTML parsing helpers ────────────────────────────────────────────────────
+def extract_links(html_text: str, base_url: str) -> list[str]:
+    """Return the absolute targets of all ``<a href="...">`` links, in page order.
+
+    The pattern is anchored on the ``<a`` tag so that ``href`` attributes of
+    other elements (``<link>``, ``<base>``) and ``data-href`` attributes are
+    not mistaken for navigation links.
+
+    Args:
+        html_text: Raw HTML of the page.
+        base_url: URL of the page, used to resolve relative hrefs.
+
+    Returns:
+        De-duplicated absolute URLs. Empty hrefs, fragment-only links and
+        ``javascript:`` pseudo-links are skipped.
+    """
+    # Reconstructed: the original tag prefix of this pattern was lost.
+    anchor_href = r'<a\s+(?:[^>]*?\s)?href="([^"]*)"'
+    # A dict keeps first-seen order while de-duplicating.
+    links: dict[str, None] = {}
+    for m in re.finditer(anchor_href, html_text, re.IGNORECASE):
+        href = m.group(1)
+        if not href or href.startswith("#") or href.startswith("javascript:"):
+            continue
+        links.setdefault(urllib.parse.urljoin(base_url, href), None)
+    return list(links)
+
+
+def extract_jsonld_product(html_text: str) -> Optional[dict]:
+    """Return the first JSON-LD object of ``@type`` ``Product`` found in the page.
+
+    Every ``<script type="application/ld+json">`` block is parsed in document
+    order. A block may hold a single object or an array of objects; blocks
+    that are not valid JSON are skipped.
+
+    Args:
+        html_text: Raw HTML of the page.
+
+    Returns:
+        The Product dict, or ``None`` when the page carries no Product data
+        (callers treat that as "not a product page").
+    """
+    # Reconstructed: the original <script> pattern was lost, leaving an empty
+    # regex whose group(1) lookup raised IndexError for every page.
+    script_block = r'<script[^>]*type="application/ld\+json"[^>]*>(.*?)</script>'
+    for m in re.finditer(script_block, html_text, re.DOTALL | re.IGNORECASE):
+        try:
+            data = json.loads(m.group(1))
+        except ValueError:  # json.JSONDecodeError is a ValueError subclass
+            continue
+        candidates = data if isinstance(data, list) else [data]
+        for item in candidates:
+            if isinstance(item, dict) and item.get("@type") == "Product":
+                return item
+    return None
+
+
+def html_to_text(html_text: str) -> str:
+    """Return the plain text of an HTML fragment.
+
+    Tags are replaced by a space, character references are decoded, and every
+    run of whitespace is collapsed to a single space with the ends trimmed.
+    """
+    without_tags = re.sub(r'<[^>]+>', ' ', html_text)
+    decoded = html_mod.unescape(without_tags)
+    # split() with no argument both collapses whitespace runs and trims the ends.
+    return ' '.join(decoded.split())
+
+
+def extract_botanical_name(html_text: str) -> str:
+    """
+    Extract the botanical/Latin name from the page.
+
+    Sources, tried in order:
+      1. the content of the ``fce_shop_kurztext`` element,
+      2. the first ``<em>`` inside the ``growingInfos`` element that looks
+         like a Latin name,
+      3. any ``<em>``/``<i>`` tag on the page that starts with a Latin-looking
+         name and is shorter than 100 characters.
+
+    Returns the raw text (may include authority names, infraspecific ranks,
+    etc.), or "" when nothing suitable is found.
+
+    NOTE(review): the tag names in the patterns below were lost from the
+    original source and are reconstructed; both containers are assumed to be
+    <div> elements — confirm against a live product page.
+    """
+    # "Genus epithet": a capitalised word followed by a lowercase word.
+    latin_like = r'[A-Z][a-z]+\s+[a-z]'
+
+    # Primary: kurztext div
+    m = re.search(
+        r'class="fce_shop_kurztext"[^>]*>(.*?)</div>',
+        html_text, re.DOTALL | re.IGNORECASE,
+    )
+    if m:
+        text = html_to_text(m.group(1)).strip()
+        if text and re.search(latin_like, text):
+            return text
+
+    # Fallback: first <em> in growingInfos that looks like a Latin name
+    gi = re.search(
+        r'class="growingInfos"[^>]*>(.*?)</div>',
+        html_text, re.DOTALL | re.IGNORECASE,
+    )
+    if gi:
+        for em in re.finditer(r'<em[^>]*>(.*?)</em>', gi.group(1), re.DOTALL):
+            text = html_to_text(em.group(1)).strip()
+            if text and re.search(latin_like, text):
+                return text
+
+    # Last resort: any <em>/<i> tag with a Latin-looking name
+    for tag in re.finditer(r'<(?:em|i)>(.*?)</(?:em|i)>', html_text, re.DOTALL | re.IGNORECASE):
+        text = html_to_text(tag.group(1)).strip()
+        if text and re.search(r'^[A-Z][a-z]+\s+[a-z]+', text) and len(text) < 100:
+            return text
+
+    return ""
+
+
+def normalize_latin_name(raw: str) -> str:
+    """
+    Reduce an extended botanical name to "Genus species".
+
+    Authority abbreviations and infraspecific ranks after the second word are
+    dropped; a hybrid marker ("x" or "×") is kept and written as "x".
+
+    Examples:
+        "Pisum sativum L. convar. sat."  -> "Pisum sativum"
+        "Capsicum annuum L."             -> "Capsicum annuum"
+        "Cucumis sativus"                -> "Cucumis sativus"
+        "Mentha × piperita"              -> "Mentha x piperita"
+
+    Input that does not look like a binomial (fewer than two words, or wrong
+    capitalisation) is returned trimmed but otherwise unchanged.
+    """
+    if not raw:
+        return ""
+
+    # Trim whitespace first, then stray punctuation at either end.
+    cleaned = raw.strip().strip(".,;:")
+    tokens = cleaned.split()
+    if len(tokens) < 2:
+        return cleaned
+
+    genus, second = tokens[0], tokens[1]
+
+    # Hybrid notation: the epithet is the word after the marker.
+    if second in ("x", "×") and len(tokens) >= 3:
+        return f"{genus} x {tokens[2]}"
+
+    looks_binomial = genus[0].isupper() and second[0].islower()
+    return f"{genus} {second}" if looks_binomial else cleaned
+
+
+# ── Calendar parsing ────────────────────────────────────────────────────────
+# Maps a (lower-cased) row label fragment of the growing calendar to the
+# cultivar column that stores the active months for that row.
+CALENDAR_ROW_TYPES = {
+    "voranzucht": "indoor_sowing_months",
+    "vorzucht": "indoor_sowing_months",
+    "vorkultur": "indoor_sowing_months",
+    "aussaat/ pflanzung freiland": "direct_sowing_months",
+    "aussaat/pflanzung freiland": "direct_sowing_months",
+    "aussaat freiland": "direct_sowing_months",
+    "direktsaat": "direct_sowing_months",
+    "pflanzung freiland": "transplanting_months",
+    "pflanzung": "transplanting_months",
+    "aussaat/ pflanzung gewächshaus": "glasshouse_months",
+    "aussaat/pflanzung gewächshaus": "glasshouse_months",
+    "gewächshaus": "glasshouse_months",
+    "ernte": "harvesting_months",
+}
+
+
+def parse_calendar(html_text: str) -> dict:
+    """
+    Parse the Reinsaat growing calendar table.
+
+    Returns a dict with keys like 'direct_sowing_months', 'harvesting_months'
+    etc. Each value is a sorted list of month integers (1-12). Rows whose
+    label is unknown, and rows without any coloured cell, are omitted.
+
+    Labels are matched by substring, longest fragment first, so that a
+    specific label such as "aussaat/ pflanzung gewächshaus" is not claimed by
+    the generic "pflanzung" entry.
+
+    NOTE(review): the closing tags in the patterns below were lost from the
+    original source; the calendar is assumed to be a <table> with <tr> rows —
+    confirm against a live product page.
+    """
+    result = {}
+
+    cal_match = re.search(r'class="rs-growing-time[^"]*"(.*?)</table>', html_text, re.DOTALL)
+    if not cal_match:
+        return result
+
+    label_patterns = sorted(
+        CALENDAR_ROW_TYPES.items(), key=lambda item: len(item[0]), reverse=True
+    )
+
+    for row in re.findall(r'<tr[^>]*>(.*?)</tr>', cal_match.group(1), re.DOTALL):
+        # "type-lable" (sic) is the class name as it appears in the page markup.
+        # The capture stops at the first closing tag, whatever element it is.
+        label_m = re.search(r'class="type-lable"[^>]*>(.*?)</', row, re.DOTALL)
+        if not label_m:
+            continue
+        label = html_to_text(label_m.group(1)).strip().lower()
+
+        field_name = next(
+            (fname for pattern, fname in label_patterns if pattern in label), None
+        )
+        if not field_name:
+            continue
+
+        # One background colour per cell (24 cells = 12 months x 2 halves);
+        # cell i belongs to month (i // 2) + 1.
+        colors = re.findall(r'background-color:\s*([^;"]+)', row)
+        active_months = set()
+        for i, color in enumerate(colors):
+            if color.strip().lower() in ("none", "transparent", ""):
+                continue
+            month = (i // 2) + 1
+            if 1 <= month <= 12:
+                active_months.add(month)
+
+        if active_months:
+            # Merge when several rows map to the same field (e.g. two sowing rows).
+            result[field_name] = sorted(set(result.get(field_name, ())) | active_months)
+
+    return result
+
+
+# ── Growing data extraction ─────────────────────────────────────────────────
+def extract_growing_data(html_text: str) -> dict:
+    """Extract spacing, sowing depth, germination temperature and perennial
+    status from the German growing notes of a product page.
+
+    Only the content of the ``growingInfos`` element is searched. Ranges such
+    as "2-3 cm" are stored as their mean.
+
+    Returns:
+        A dict holding any of ``planting_depth_cm``, ``row_spacing_cm``,
+        ``plant_spacing_cm``, ``germination_temp_c`` (floats) and
+        ``perennial`` (True). Keys are absent when the page gives no value;
+        the dict is empty when the page has no growing notes.
+
+    NOTE(review): the closing tag of the section pattern was lost from the
+    original source, which left the capture group always empty; </div> is a
+    reconstruction — confirm against a live product page.
+    """
+    data = {}
+
+    gi = re.search(r'class="growingInfos"[^>]*>(.*?)</div>', html_text, re.DOTALL | re.IGNORECASE)
+    if not gi:
+        return data
+
+    # Plain text of the section: tags become spaces, entities are decoded,
+    # whitespace runs are collapsed.
+    raw_text = html_mod.unescape(re.sub(r'<[^>]+>', ' ', gi.group(1)))
+    raw_text = re.sub(r'\s+', ' ', raw_text)
+
+    # ── Sowing depth (range pattern first, single value second) ──
+    depth_pats = [
+        r'(?:Saattiefe|Aussaattiefe|Ablagetiefe|Saatgutablage)[:\s]*(?:ca\.?\s*)?(\d+(?:[.,]\d+)?)\s*[-–]\s*(\d+(?:[.,]\d+)?)\s*cm',
+        r'(?:Saattiefe|Aussaattiefe|Ablagetiefe|Saatgutablage)[:\s]*(?:ca\.?\s*)?(\d+(?:[.,]\d+)?)\s*cm',
+    ]
+    for pat in depth_pats:
+        dm = re.search(pat, raw_text, re.IGNORECASE)
+        if dm:
+            vals = [float(g.replace(",", ".")) for g in dm.groups()]
+            data["planting_depth_cm"] = round(sum(vals) / len(vals), 2)
+            break
+
+    # ── Spacing: "ROW x PLANT cm" ──
+    spacing_pats = [
+        # "30–45 x 3–5 cm" (range x range)
+        r'(\d+)\s*[-–]\s*(\d+)\s*[x×]\s*(\d+)\s*[-–]\s*(\d+)\s*cm',
+        # "100 x 50 cm" (simple)
+        r'(\d+(?:[.,]\d+)?)\s*[x×]\s*(\d+(?:[.,]\d+)?)\s*cm',
+    ]
+    for pat in spacing_pats:
+        matches = re.findall(pat, raw_text, re.IGNORECASE)
+        if matches:
+            m = matches[-1]  # prefer last match
+            if len(m) == 4:
+                data["row_spacing_cm"] = round((float(m[0]) + float(m[1])) / 2, 1)
+                data["plant_spacing_cm"] = round((float(m[2]) + float(m[3])) / 2, 1)
+            elif len(m) == 2:
+                data["row_spacing_cm"] = round(float(m[0].replace(",", ".")), 1)
+                data["plant_spacing_cm"] = round(float(m[1].replace(",", ".")), 1)
+            break
+
+    # ── Germination temperature ──
+    temp_pats = [
+        r'(?:Keimtemperatur|Keimtemp)[.:\s]*(?:ca\.?\s*)?(\d+)\s*[-–]\s*(\d+)\s*[°]?\s*C',
+        r'(?:mindestens|mind\.)\s*(\d+)\s*°\s*C',
+    ]
+    for pat in temp_pats:
+        tm = re.search(pat, raw_text, re.IGNORECASE)
+        if tm:
+            vals = [float(g) for g in tm.groups()]
+            avg = sum(vals) / len(vals)
+            # Ignore implausible values and let the next pattern try.
+            if 5 <= avg <= 40:
+                data["germination_temp_c"] = round(avg, 1)
+                break
+
+    # ── Perennial (keyword heuristic) ──
+    perennial_pats = [r'mehrj[aä]hrig', r'winterhart', r'ausdauernd', r'Halbstrauch', r'Staude']
+    if any(re.search(pat, raw_text, re.IGNORECASE) for pat in perennial_pats):
+        data["perennial"] = True
+
+    return data
+
+
+# ── Product data ────────────────────────────────────────────────────────────
+# Container for everything parse_product() extracts from one product page.
+@dataclass
+class ProductData:
+ # Variety name, taken from the JSON-LD "name" field.
+ name: str = ""
+ # Botanical name exactly as returned by extract_botanical_name().
+ raw_latin_name: str = ""
+ # raw_latin_name reduced by normalize_latin_name(); used for species matching.
+ normalized_latin: str = ""
+ # JSON-LD "description" field.
+ description: str = ""
+ # JSON-LD "model" field; stored as the supplier's article number.
+ sku: str = ""
+ # URL the page was fetched from.
+ url: str = ""
+ # parse_product() always sets this to True.
+ is_organic: bool = True
+ # Result of extract_growing_data() for the page.
+ growing_data: dict = field(default_factory=dict)
+ # Result of parse_calendar() for the page.
+ calendar: dict = field(default_factory=dict)
+
+
+def parse_product(html_text: str, url: str) -> Optional[ProductData]:
+    """Parse a product page.
+
+    A page counts as a product page when it embeds a JSON-LD ``Product``
+    object; name, description and article number come from that object, the
+    botanical name, growing data and calendar from the HTML itself.
+
+    Args:
+        html_text: Raw HTML of the page.
+        url: URL the page was fetched from (stored on the result).
+
+    Returns:
+        A populated ProductData, or None if this is not a product page.
+    """
+    jsonld = extract_jsonld_product(html_text)
+    if not jsonld:
+        return None
+
+    def _text(key: str) -> str:
+        # JSON-LD values may be null or numeric (e.g. a numeric "model");
+        # coerce instead of crashing on .strip().
+        value = jsonld.get(key)
+        return "" if value is None else str(value).strip()
+
+    product = ProductData(url=url)
+    product.name = _text("name")
+    product.description = _text("description")
+    product.sku = _text("model")
+
+    # Extract and normalize botanical name
+    product.raw_latin_name = extract_botanical_name(html_text)
+    product.normalized_latin = normalize_latin_name(product.raw_latin_name)
+
+    # Extract growing data
+    product.growing_data = extract_growing_data(html_text)
+
+    # Parse calendar
+    product.calendar = parse_calendar(html_text)
+
+    # Every product is recorded as organic; the page is not inspected for this.
+    product.is_organic = True
+
+    return product
+
+
+# ── Recursive discovery ─────────────────────────────────────────────────────
+def discover_products(
+    category_url: str,
+    max_depth: int = 4,
+    _depth: int = 0,
+    _visited: set = None,
+) -> list[ProductData]:
+    """Crawl ``category_url`` depth-first and return every product found below it.
+
+    A fetched page that parses as a product is returned as a one-element
+    list. Any other page is treated as a category: links that sit exactly
+    one path segment below it on www.reinsaat.at are followed recursively.
+
+    Args:
+        category_url: Page to start from.
+        max_depth: Deepest recursion level that is still fetched.
+        _depth: Current recursion level (internal).
+        _visited: URLs already handled; shared across calls so that no page
+            is fetched twice (internal, mutated in place).
+
+    Returns:
+        The products found; an empty list for visited, too deep or
+        unfetchable pages.
+    """
+    seen = set() if _visited is None else _visited
+    if _depth > max_depth or category_url in seen:
+        return []
+    seen.add(category_url)
+
+    pad = " " * (_depth + 1)
+
+    try:
+        page = fetch_url(category_url)
+        time.sleep(DELAY)
+    except Exception as exc:
+        print(f"{pad}ERROR fetching {category_url}: {exc}")
+        return []
+
+    # A product page ends the descent.
+    found = parse_product(page, category_url)
+    if found:
+        return [found]
+
+    # Category page: keep only direct children of this category's path.
+    prefix = urllib.parse.urlparse(category_url).path.rstrip("/") + "/"
+    children = {}
+    for href in extract_links(page, category_url):
+        parts = urllib.parse.urlparse(href)
+        if parts.netloc not in ("", "www.reinsaat.at"):
+            continue
+        path = parts.path.rstrip("/")
+        if not path.startswith(prefix):
+            continue
+        tail = path[len(prefix):]
+        if not tail or "/" in tail:
+            continue
+        candidate = f"https://www.reinsaat.at{path}/"
+        if candidate not in seen:
+            children[candidate] = None
+
+    child_links = list(children)
+    print(f"{pad}Category {category_url} -> {len(child_links)} children")
+
+    collected = []
+    for child_url in child_links:
+        collected.extend(discover_products(child_url, max_depth, _depth + 1, seen))
+
+    return collected
+
+
+# ── Slug generation ─────────────────────────────────────────────────────────
+def make_slug(species_name: str, cultivar_name: str) -> str:
+    """Build a lowercase, hyphen-separated slug from species and cultivar name.
+
+    German umlauts and a handful of accented Latin letters are transliterated
+    to ASCII; every other run of non-alphanumeric characters becomes a single
+    hyphen, and hyphens at either end are removed.
+    """
+    ascii_map = str.maketrans({
+        'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'ß': 'ss',
+        'é': 'e', 'è': 'e', 'ê': 'e', 'ë': 'e',
+        'á': 'a', 'à': 'a', 'â': 'a',
+        'í': 'i', 'ì': 'i', 'î': 'i',
+        'ó': 'o', 'ò': 'o', 'ô': 'o',
+        'ú': 'u', 'ù': 'u', 'û': 'u',
+        'ñ': 'n', 'ç': 'c',
+    })
+    lowered = f"{species_name}-{cultivar_name}".lower()
+    transliterated = lowered.translate(ascii_map)
+    hyphenated = re.sub(r'[^a-z0-9]+', '-', transliterated).strip('-')
+    return re.sub(r'-+', '-', hyphenated)
+
+
+# ── Main ────────────────────────────────────────────────────────────────────
+def db_connect():
+    """Open and return a new psycopg2 connection to the configured database.
+
+    Autocommit is switched off, so the caller decides when to commit.
+    """
+    params = {
+        "host": DB_HOST,
+        "dbname": DB_NAME,
+        "user": DB_USER,
+        "password": DB_PASS,
+    }
+    connection = psycopg2.connect(**params)
+    connection.autocommit = False
+    return connection
+
+
+def main():
+    """Scrape all Reinsaat categories and sync the products into the database.
+
+    Phase 1 crawls every entry of CATEGORIES (no database needed), phase 2
+    loads the existing species, cultivars and supplier links, phase 3
+    creates, enriches and links cultivars. All database work happens in one
+    transaction that is committed at the end; on any unexpected error it is
+    rolled back, and the connection is always closed.
+    """
+    print("=" * 70)
+    print("Reinsaat Scraper v2")
+    print("=" * 70)
+
+    # ── Phase 1: Discover all products (no DB needed) ──
+    print("\n[1] Discovering products from Reinsaat categories...")
+    all_products: list[ProductData] = []
+    visited: set[str] = set()
+
+    for cat_url in CATEGORIES:
+        print(f"\n Category: {cat_url}")
+        products = discover_products(cat_url, max_depth=4, _visited=visited)
+        all_products.extend(products)
+        print(f" -> {len(products)} products")
+
+    # Deduplicate by URL, keeping the first occurrence.
+    by_url: dict[str, ProductData] = {}
+    for p in all_products:
+        by_url.setdefault(p.url, p)
+    all_products = list(by_url.values())
+    print(f"\n Total unique products: {len(all_products)}")
+
+    # ── Phases 2 + 3: one transaction, always cleaned up ──
+    print("\n[2] Connecting to DB and loading existing data...")
+    conn = db_connect()
+    try:
+        stats, unmatched = _sync_products(conn, all_products)
+        conn.commit()
+    except Exception:
+        conn.rollback()
+        raise
+    finally:
+        conn.close()
+
+    # ── Summary ──
+    print("\n" + "=" * 70)
+    print("SUMMARY")
+    print("=" * 70)
+    print(f" Total products discovered: {len(all_products)}")
+    print(f" New cultivars created: {stats['created']}")
+    print(f" New supplier links added: {stats['linked']}")
+    print(f" Cultivars enriched: {stats['enriched']}")
+    print(f" Links already existed: {stats['link_exists']}")
+    print(f" Skipped (no species): {stats['skipped_no_species']}")
+    print(f" Skipped (no name): {stats['skipped_no_name']}")
+    print(f" Errors: {stats['errors']}")
+    print("=" * 70)
+
+    if unmatched:
+        print(f"\n UNMATCHED PRODUCTS ({len(unmatched)}):")
+        for name, raw_latin, normalized, url in sorted(unmatched, key=lambda x: x[2]):
+            print(f" {normalized:30s} (raw: {raw_latin:40s}) {name:30s} {url}")
+
+
+def _collect_updates(product, existing):
+    """Return {column: value} for scraped values the existing row still lacks."""
+    updates = {}
+    gd = product.growing_data
+    for col in ("planting_depth_cm", "row_spacing_cm", "plant_spacing_cm", "germination_temp_c"):
+        if gd.get(col) and not existing.get(col):
+            updates[col] = gd[col]
+    if gd.get("perennial") and not existing.get("perennial"):
+        updates["perennial"] = True
+
+    cal = product.calendar
+    for col in ("indoor_sowing_months", "direct_sowing_months", "transplanting_months",
+                "glasshouse_months", "harvesting_months"):
+        if cal.get(col) and not existing.get(col):
+            updates[col] = cal[col]
+
+    if product.description and not existing.get("description"):
+        updates["description"] = product.description
+    return updates
+
+
+def _sync_products(conn, all_products):
+    """Load the current DB state and create/enrich/link every product.
+
+    Runs inside the caller's transaction (no commit here). Returns
+    ``(stats, unmatched)`` where ``unmatched`` lists products without species.
+    """
+    stats = {
+        "created": 0,
+        "linked": 0,
+        "enriched": 0,
+        "skipped_no_species": 0,
+        "skipped_no_name": 0,
+        "link_exists": 0,
+        "errors": 0,
+    }
+    unmatched = []
+
+    with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+        # Load species, keyed by lower-cased scientific name
+        cur.execute("SELECT id, name_scientific FROM species ORDER BY name_scientific")
+        species_map = {row["name_scientific"].lower().strip(): row for row in cur.fetchall()}
+        print(f" {len(species_map)} species loaded")
+
+        # Load existing cultivars, keyed by (species id, lower-cased name)
+        cur.execute("""
+            SELECT id, species_id, name, slug, description,
+                   row_spacing_cm, plant_spacing_cm, planting_depth_cm,
+                   germination_temp_c, perennial,
+                   indoor_sowing_months, direct_sowing_months,
+                   transplanting_months, glasshouse_months, harvesting_months
+            FROM cultivars
+        """)
+        existing_cultivars = {}
+        existing_slugs = set()
+        for row in cur.fetchall():
+            existing_cultivars[(str(row["species_id"]), row["name"].lower())] = dict(row)
+            existing_slugs.add(row["slug"])
+        print(f" {len(existing_cultivars)} cultivars loaded")
+
+        # Load existing Reinsaat supplier links
+        cur.execute("""
+            SELECT cultivar_id, product_url, article_number
+            FROM cultivar_suppliers
+            WHERE supplier_id = %s
+        """, (REINSAAT_SUPPLIER_ID,))
+        existing_links = {}
+        for row in cur.fetchall():
+            existing_links.setdefault(str(row["cultivar_id"]), []).append(
+                (row["product_url"] or "", row["article_number"] or "")
+            )
+        print(f" {sum(len(v) for v in existing_links.values())} existing links for {len(existing_links)} cultivars")
+
+        # ── Phase 3: Process products ──
+        print("\n[3] Processing products...")
+        total = len(all_products)
+        for i, product in enumerate(all_products):
+            pct = (i + 1) / total * 100
+            prefix = f" [{i+1}/{total}] ({pct:.0f}%)"
+
+            if not product.name:
+                stats["skipped_no_name"] += 1
+                continue
+
+            # Match species: normalized name first, then the first two raw words
+            species = species_map.get(product.normalized_latin.lower().strip())
+            if not species:
+                raw_words = product.raw_latin_name.split()
+                if len(raw_words) >= 2:
+                    species = species_map.get(f"{raw_words[0].lower()} {raw_words[1].lower()}")
+            if not species:
+                stats["skipped_no_species"] += 1
+                unmatched.append((product.name, product.raw_latin_name, product.normalized_latin, product.url))
+                continue
+
+            species_id = str(species["id"])
+            species_name = species["name_scientific"]
+            ckey = (species_id, product.name.lower())
+            existing = existing_cultivars.get(ckey)
+
+            if existing:
+                cultivar_id = str(existing["id"])
+
+                # ── Enrich existing cultivar with missing data ──
+                updates = _collect_updates(product, existing)
+                if updates:
+                    # Column names come from the fixed lists in _collect_updates,
+                    # never from scraped text; values are bound as parameters.
+                    assignments = [f"{col} = %s" for col in updates]
+                    assignments.append("updated_at = NOW()")
+                    try:
+                        cur.execute("SAVEPOINT enrich_sp")
+                        cur.execute(
+                            f"UPDATE cultivars SET {', '.join(assignments)} WHERE id = %s::uuid",
+                            [*updates.values(), cultivar_id],
+                        )
+                        cur.execute("RELEASE SAVEPOINT enrich_sp")
+                    except Exception as e:
+                        print(f"{prefix} {product.name} -> ENRICH ERROR: {e}")
+                        cur.execute("ROLLBACK TO SAVEPOINT enrich_sp")
+                        stats["errors"] += 1
+                    else:
+                        # Keep the cache in step so a later duplicate product
+                        # does not overwrite what was just filled in.
+                        existing.update(updates)
+                        stats["enriched"] += 1
+                        print(f"{prefix} {product.name} -> ENRICHED ({', '.join(updates.keys())})")
+
+                # ── Add supplier link if missing ──
+                link_exists = any(
+                    lurl == product.url or (lsku and lsku == product.sku)
+                    for lurl, lsku in existing_links.get(cultivar_id, ())
+                )
+                if link_exists:
+                    stats["link_exists"] += 1
+                    continue
+
+                try:
+                    cur.execute("SAVEPOINT link_sp")
+                    cur.execute("""
+                        INSERT INTO cultivar_suppliers (cultivar_id, supplier_id, product_url, article_number, last_checked_at)
+                        VALUES (%s::uuid, %s::uuid, %s, %s, NOW())
+                        ON CONFLICT (cultivar_id, supplier_id, article_number) DO UPDATE
+                        SET product_url = EXCLUDED.product_url, last_checked_at = NOW()
+                    """, (cultivar_id, REINSAAT_SUPPLIER_ID, product.url, product.sku))
+                    cur.execute("RELEASE SAVEPOINT link_sp")
+                except Exception as e:
+                    print(f"{prefix} {product.name} -> LINK ERROR: {e}")
+                    cur.execute("ROLLBACK TO SAVEPOINT link_sp")
+                    stats["errors"] += 1
+                else:
+                    stats["linked"] += 1
+                    existing_links.setdefault(cultivar_id, []).append((product.url, product.sku))
+                    print(f"{prefix} {product.name} -> LINKED ({product.sku})")
+                continue
+
+            # ── Create new cultivar ──
+            base_slug = make_slug(species_name, product.name)
+            slug = base_slug
+            counter = 2
+            while slug in existing_slugs:  # ensure unique slug
+                slug = f"{base_slug}-{counter}"
+                counter += 1
+
+            gd = product.growing_data
+            cal = product.calendar
+
+            try:
+                cur.execute("SAVEPOINT create_sp")
+                cur.execute("""
+                    INSERT INTO cultivars (
+                        species_id, name, name_de, slug, description,
+                        is_organic, perennial,
+                        planting_depth_cm, row_spacing_cm, plant_spacing_cm,
+                        germination_temp_c,
+                        indoor_sowing_months, direct_sowing_months,
+                        transplanting_months, glasshouse_months, harvesting_months
+                    ) VALUES (
+                        %s::uuid, %s, %s, %s, %s,
+                        %s, %s,
+                        %s, %s, %s,
+                        %s,
+                        %s, %s,
+                        %s, %s, %s
+                    )
+                    RETURNING id
+                """, (
+                    species_id,
+                    product.name,
+                    product.name,
+                    slug,
+                    product.description,
+                    product.is_organic,
+                    gd.get("perennial", False),
+                    gd.get("planting_depth_cm"),
+                    gd.get("row_spacing_cm"),
+                    gd.get("plant_spacing_cm"),
+                    gd.get("germination_temp_c"),
+                    cal.get("indoor_sowing_months"),
+                    cal.get("direct_sowing_months"),
+                    cal.get("transplanting_months"),
+                    cal.get("glasshouse_months"),
+                    cal.get("harvesting_months"),
+                ))
+                new_id = str(cur.fetchone()["id"])
+
+                # Link to supplier (same savepoint: both rows or neither)
+                cur.execute("""
+                    INSERT INTO cultivar_suppliers (cultivar_id, supplier_id, product_url, article_number, last_checked_at)
+                    VALUES (%s::uuid, %s::uuid, %s, %s, NOW())
+                """, (new_id, REINSAAT_SUPPLIER_ID, product.url, product.sku))
+                cur.execute("RELEASE SAVEPOINT create_sp")
+            except Exception as e:
+                print(f"{prefix} {product.name} -> CREATE ERROR: {e}")
+                cur.execute("ROLLBACK TO SAVEPOINT create_sp")
+                stats["errors"] += 1
+            else:
+                # Record the new rows only once they can no longer be rolled
+                # back by this savepoint; otherwise the caches would point at
+                # a cultivar id that does not exist.
+                existing_slugs.add(slug)
+                existing_cultivars[ckey] = {"id": new_id}
+                existing_links.setdefault(new_id, []).append((product.url, product.sku))
+                stats["created"] += 1
+                stats["linked"] += 1
+                print(f"{prefix} {product.name} -> CREATED ({species_name}, {slug})")
+
+    return stats, unmatched
+
+
+# Script entry point: run the scraper only when this file is executed directly,
+# not when it is imported as a module.
+if __name__ == "__main__":
+ main()
diff --git a/tools/scrapers/scrape_reinsaat_v3.py b/tools/scrapers/scrape_reinsaat_v3.py
new file mode 100644
index 0000000..c013b2e
--- /dev/null
+++ b/tools/scrapers/scrape_reinsaat_v3.py
@@ -0,0 +1,635 @@
+#!/usr/bin/env python3
+"""Reinsaat v3 scraper - uses HerbAPI REST API, robust botanical name matching."""
+
+import json
+import re
+import sys
+import time
+import urllib.request
+import urllib.error
+import urllib.parse
+from html import unescape
+
+# --- Config ---
+# Base URL that api_get()/api_post() prepend to every request path.
+API_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
+# NOTE(review): this bearer token is committed in plain text and sent over
+# plain http (see API_BASE). It should be rotated and loaded from the
+# environment or a secrets store — TODO confirm owner.
+API_TOKEN = "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
+REINSAAT_BASE = "https://www.reinsaat.at"
+# presumably the pause in seconds between page fetches; its use is outside
+# the visible part of the file — verify.
+DELAY = 0.3
+
+# Categories to scrape (seed products only, skip books/bulbs/peonies/potatoes/gift/seed_tapes)
+CATEGORIES = [
+ "beans", "peas", "florence_fennel", "cucumbers", "brassica", "garden_cress",
+ "pumpkins_squash", "corn", "swiss_chard", "aubergine_eggplants", "melons",
+ "carrots", "sweet_pepper", "chilli_peppers_chill", "parsnips", "parsley",
+ "parsley_root", "leeks", "radish", "beetroot", "lettuce", "black_salsify",
+ "celery", "spinach", "tomatoes", "zucchini_courgette", "onion_garlic",
+ "culinary_and_aromatic_herbs", "conservation_varieties", "flowers_and_herbs",
+ "wild_flowers_seeds", "green_manure",
+]
+
+# Suffixes to strip from botanical names (authority names, infraspecific ranks)
+# NOTE(review): normalise_botanical() does not reference this set; check
+# whether anything else in the file still uses it.
+STRIP_SUFFIXES = {
+ "l.", "mill.", "dc.", "l", "convar.", "convar", "var.", "var",
+ "subsp.", "subsp", "ssp.", "ssp", "f.", "em.", "auct.",
+ "hort.", "medik.", "moench", "pers.", "salisb.", "thunb.",
+ "crantz", "gaertn.", "lam.", "link", "siebold", "zucc.",
+ "sat.", "sat", "axillare", "medikus",
+}
+
+
+def api_get(path, params=None, timeout=30):
+    """GET ``path`` from HerbAPI and return the decoded JSON response.
+
+    Args:
+        path: Path below API_BASE, starting with "/".
+        params: Optional mapping that is URL-encoded into the query string.
+        timeout: Seconds to wait on the socket before giving up, so that a
+            stalled connection cannot hang the whole run.
+
+    Raises:
+        urllib.error.HTTPError / urllib.error.URLError: on a non-2xx status
+            or a connection failure.
+        TimeoutError: when the server does not answer within ``timeout``.
+        json.JSONDecodeError: when the body is not valid JSON.
+    """
+    url = f"{API_BASE}{path}"
+    if params:
+        url += "?" + urllib.parse.urlencode(params)
+    req = urllib.request.Request(url)
+    req.add_header("Authorization", f"Bearer {API_TOKEN}")
+    with urllib.request.urlopen(req, timeout=timeout) as resp:
+        return json.loads(resp.read())
+
+
+def api_post(path, data, timeout=30):
+    """POST ``data`` as JSON to HerbAPI and return the decoded JSON response.
+
+    Args:
+        path: Path below API_BASE, starting with "/".
+        data: JSON-serialisable request body.
+        timeout: Seconds to wait on the socket before giving up, so that a
+            stalled connection cannot hang the whole run.
+
+    Raises:
+        urllib.error.HTTPError / urllib.error.URLError: on a non-2xx status
+            or a connection failure.
+        TimeoutError: when the server does not answer within ``timeout``.
+        json.JSONDecodeError: when the body is not valid JSON.
+    """
+    url = f"{API_BASE}{path}"
+    body = json.dumps(data).encode()
+    req = urllib.request.Request(url, data=body, method="POST")
+    req.add_header("Authorization", f"Bearer {API_TOKEN}")
+    req.add_header("Content-Type", "application/json")
+    with urllib.request.urlopen(req, timeout=timeout) as resp:
+        return json.loads(resp.read())
+
+
+def fetch_page(url):
+    """Download ``url`` and return its body as text.
+
+    The body is decoded as UTF-8 with undecodable bytes replaced; the request
+    gives up after 15 seconds.
+    """
+    request = urllib.request.Request(
+        url, headers={"User-Agent": "Mozilla/5.0 (HerbAPI Scraper)"}
+    )
+    with urllib.request.urlopen(request, timeout=15) as response:
+        payload = response.read()
+    return payload.decode("utf-8", errors="replace")
+
+
+# Known misspellings seen in source data. Keys are either a lower-cased genus
+# or a lower-cased "genus species" pair; values are the corrected spelling.
+BOTANICAL_TYPOS = {
+    "capscicum": "capsicum",
+    "capsicum frutenscens": "capsicum frutescens",
+    "tropaelum": "tropaeolum",
+    "lact.": "lactuca",
+}
+
+# Abbreviated binomials (lower-cased prefix) and their full form.
+ABBREVIATED_NAMES = {
+    "origanum vulg.": "origanum vulgare",
+    "helichrysum bract.": "helichrysum bracteatum",
+    "campanula lat.": "campanula latifolia",
+    "cosmos bip.": "cosmos bipinnatus",
+    "papaver somnif.": "papaver somniferum",
+}
+
+
+def normalise_botanical(raw):
+    """Strip a botanical name to lower-case genus + species only.
+
+    'Pisum sativum L. convar. sat.' -> 'pisum sativum'
+    'Solanum lycopersicum L.'       -> 'solanum lycopersicum'
+    'Beta vulgaris L. ssp. vulgaris' -> 'beta vulgaris'
+    'Origanum vulg.'                -> 'origanum vulgare'
+    'Mentha × piperita'             -> 'mentha x piperita'
+
+    Hybrids keep their epithet (marker written as "x"), so different hybrids
+    of one genus do not collapse onto the same key.
+
+    Returns None when the input is empty, has fewer than two words, or the
+    second word looks like an authority rather than a species epithet.
+    """
+    if not raw:
+        return None
+    # Clean HTML entities and non-breaking spaces, drop parenthesised notes.
+    text = unescape(raw).replace("\xa0", " ")
+    text = re.sub(r"\([^)]*\)", "", text).strip()
+
+    # Abbreviations are checked before trailing punctuation is removed,
+    # because their keys end with the abbreviation's period.
+    lowered = text.lower()
+    for abbrev, full in ABBREVIATED_NAMES.items():
+        if lowered.startswith(abbrev):
+            return full
+
+    parts = text.rstrip(",. ").split()
+    if len(parts) < 2:
+        return None
+
+    genus = parts[0].lower().rstrip(",")
+    genus = BOTANICAL_TYPOS.get(genus, genus)
+
+    # Locate the species epithet, skipping a hybrid marker if present.
+    second = parts[1]
+    is_hybrid = False
+    if second in ("x", "×"):
+        if len(parts) < 3:
+            return None
+        is_hybrid, epithet = True, parts[2]
+    elif second.startswith("×") and len(second) > 1:
+        is_hybrid, epithet = True, second[1:]
+    else:
+        epithet = second
+    species = epithet.lower().rstrip(",")
+
+    # Fix known typos that involve the whole binomial.
+    if not is_hybrid:
+        full_name = f"{genus} {species}"
+        if full_name in BOTANICAL_TYPOS:
+            genus, species = BOTANICAL_TYPOS[full_name].split()
+
+    # Validate: both words must exist and start with a letter.
+    if not genus or not species or not genus[0].isalpha() or not species[0].isalpha():
+        return None
+    # A capitalised second word is an authority (e.g. "Brassica L."), not a species.
+    if epithet[0].isupper():
+        return None
+    return f"{genus} x {species}" if is_hybrid else f"{genus} {species}"
+
+
+def extract_product_data(html, url):
+    """Extract product info from a Reinsaat product page.
+
+    Args:
+        html: Raw HTML of the (English) product page.
+        url: URL of the page; stored in the result.
+
+    Returns:
+        A dict that always contains ``url`` and, when found on the page, any
+        of: ``name``, ``botanical_raw``, ``botanical_norm``,
+        ``article_number``, ``price_eur``, ``port_price``, ``pack_sizes``,
+        ``planting_depth_cm``, ``row_spacing_cm``, ``plant_spacing_cm``,
+        ``germination_temp_c``, ``pack_size``, ``pack_unit``.
+
+    NOTE(review): the HTML tag names in several patterns were lost from the
+    original source and are reconstructed here (h1, script, table/tr/td) —
+    confirm against a live product page.
+    """
+    result = {}
+
+    # H1 = variety name
+    m = re.search(r'<h1[^>]*>([^<]+)</h1>', html)
+    if m:
+        name = unescape(m.group(1)).strip()
+        # Clean up names like "RS-To-01.26 (Alda)" -> "Alda"
+        paren = re.search(r"\(([^)]+)\)", name)
+        if paren and re.match(r"RS-", name):
+            name = paren.group(1).strip()
+        result["name"] = name
+
+    # Botanical name: first text node inside the fce_shop_kurztext element,
+    # whatever inline tags wrap it.
+    m = re.search(r'fce_shop_kurztext[^>]*>\s*(?:<\w+[^>]*>\s*)*([^<]+?)\s*<', html)
+    if m:
+        botanical = unescape(m.group(1)).replace("\xa0", " ").strip()
+        if botanical:
+            result["botanical_raw"] = botanical
+            result["botanical_norm"] = normalise_botanical(botanical)
+
+    # Article number and price from the first JSON-LD Product object
+    for jm in re.finditer(
+        r'<script[^>]*type="application/ld\+json"[^>]*>(.*?)</script>', html, re.S
+    ):
+        try:
+            jd = json.loads(jm.group(1))
+        except ValueError:  # json.JSONDecodeError is a ValueError subclass
+            continue
+        if not isinstance(jd, dict) or jd.get("@type") != "Product":
+            continue
+        if "model" in jd:
+            result["article_number"] = str(jd["model"])
+        # Get smallest pack price (usually the Portion). "offers" may be an
+        # aggregate holding a list, a plain list, or a single offer object.
+        offers = jd.get("offers", {})
+        if isinstance(offers, dict):
+            offer_list = offers.get("offers") or ([offers] if "price" in offers else [])
+        elif isinstance(offers, list):
+            offer_list = offers
+        else:
+            offer_list = []
+        prices = []
+        for offer in offer_list:
+            if not isinstance(offer, dict):
+                continue
+            try:
+                price = float(offer.get("price"))  # JSON-LD prices are often strings
+            except (TypeError, ValueError):
+                continue
+            if price > 0:
+                prices.append(price)
+        if prices:
+            result["price_eur"] = min(prices)
+        break
+
+    # Price table - get pack sizes (first table that mentions "€" and has
+    # a size row followed by a price row)
+    for tbl in re.findall(r"<table[^>]*>(.*?)</table>", html, re.S):
+        if "€" not in tbl:
+            continue
+        rows = re.findall(r"<tr[^>]*>(.*?)</tr>", tbl, re.S)
+        if len(rows) < 2:
+            continue
+        cell = r"<t[dh][^>]*>(.*?)</t[dh]>"
+        size_texts = [re.sub(r"<[^>]+>", "", c).strip() for c in re.findall(cell, rows[0], re.S)]
+        price_texts = [re.sub(r"<[^>]+>", "", c).strip() for c in re.findall(cell, rows[1], re.S)]
+        # Find the "Port." entry and read the price in the same column
+        for i, st in enumerate(size_texts):
+            if "Port" in st:
+                if i < len(price_texts):
+                    pm = re.search(r"\d+(?:\.\d+)?", price_texts[i].replace(",", "."))
+                    if pm:
+                        result["port_price"] = float(pm.group())
+                break
+        result["pack_sizes"] = size_texts
+        break
+
+    # Sowing depth (a range is stored as its mean)
+    m = re.search(r"(?:sowing|seed)\s*depth[:\s]*(?:approx\.?\s*)?(\d+[\.,]?\d*)\s*(?:-\s*(\d+[\.,]?\d*)\s*)?cm", html, re.I)
+    if m:
+        d1 = float(m.group(1).replace(",", "."))
+        d2 = float(m.group(2).replace(",", ".")) if m.group(2) else d1
+        result["planting_depth_cm"] = round((d1 + d2) / 2, 2)
+
+    # Spacing: "row spacing NNxNN cm" or "NN x NN cm"; outdoor spacing first
+    m = re.search(r"(?:outdoors?|field)[^.]*?(\d+)\s*(?:x|×)\s*(\d+)\s*cm", html, re.I)
+    if not m:
+        m = re.search(r"row\s*spacing\s*(\d+)\s*(?:x|×)\s*(\d+)\s*cm", html, re.I)
+    if not m:
+        m = re.search(r"(\d+)\s*(?:x|×)\s*(\d+)\s*cm", html, re.I)
+    if m:
+        result["row_spacing_cm"] = float(m.group(1))
+        result["plant_spacing_cm"] = float(m.group(2))
+
+    # Row spacing without plant spacing (e.g. "row spacing 30-45 cm")
+    if "row_spacing_cm" not in result:
+        m = re.search(r"row\s*spacing\s*(\d+)(?:\s*-\s*(\d+))?\s*cm", html, re.I)
+        if m:
+            r1 = int(m.group(1))
+            r2 = int(m.group(2)) if m.group(2) else r1
+            # True mean: floor division would turn 30-45 into 37 instead of 37.5.
+            result["row_spacing_cm"] = (r1 + r2) / 2
+
+    # Germination temperature
+    m = re.search(r"germination\s*temp[^:]*:\s*(\d+)\s*(?:-\s*(\d+))?\s*°?\s*C", html, re.I)
+    if m:
+        t1 = int(m.group(1))
+        t2 = int(m.group(2)) if m.group(2) else t1
+        result["germination_temp_c"] = (t1 + t2) / 2
+
+    # Pack unit from portion info - "20 seeds" or "25 g" etc
+    portion_m = re.search(r"[Pp]ortion\s*(?:contents?)?[:\s]*(\d+[\.,]?\d*)\s*(seeds?|Korn|g|kg)", html)
+    if not portion_m:
+        # Try "Port. (20 seeds)" format
+        portion_m = re.search(r"Port[.\w]*\s*\(?\s*(\d+[\.,]?\d*)\s*(seeds?|Korn|g|kg)", html)
+    if portion_m:
+        result["pack_size"] = float(portion_m.group(1).replace(",", "."))
+        unit = portion_m.group(2).lower()
+        # Seed counts are stored under the unit name "Korn"
+        result["pack_unit"] = "Korn" if unit in ("seed", "seeds", "korn") else unit
+
+    result["url"] = url
+    return result
+
+
+def get_all_species():
+ """Fetch all species from API, build lookup by normalised name."""
+ species_map = {}
+ page = 1
+ while True:
+ data = api_get("/species", {"per_page": 100, "page": page})
+ batch = data.get("data", [])
+ for sp in batch:
+ norm = normalise_botanical(sp["name_scientific"])
+ if norm:
+ species_map[norm] = {"id": sp["id"], "slug": sp["slug"], "name": sp["name_scientific"]}
+ print(f" page {page}: {len(batch)} species (total so far: {len(species_map)})")
+ if len(batch) < 100:
+ break
+ page += 1
+ return species_map
+
+
+def get_all_cultivars():
+ """Fetch all cultivars, build lookup by (species_id, normalised name)."""
+ cultivar_map = {} # (species_id, lower_name) -> cultivar
+ page = 1
+ while True:
+ data = api_get("/cultivars", {"per_page": 100, "page": page})
+ batch = data.get("data", [])
+ for cv in batch:
+ key = (cv["species_id"], cv["name"].lower().strip())
+ cultivar_map[key] = cv
+ print(f" page {page}: {len(batch)} cultivars (total so far: {len(cultivar_map)})")
+ if len(batch) < 100:
+ break
+ page += 1
+ return cultivar_map
+
+
+def get_reinsaat_supplier():
+ """Get Reinsaat supplier record."""
+ suppliers = api_get("/suppliers")
+ for s in suppliers:
+ if s["slug"] == "reinsaat":
+ return s
+ raise RuntimeError("Reinsaat supplier not found in API")
+
+
+def get_cultivar_suppliers(cultivar_id):
+ """Get existing supplier links for a cultivar."""
+ return api_get(f"/cultivars/{cultivar_id}/suppliers")
+
+
+def get_product_urls_from_category(cat_slug):
+ """Fetch product URLs from a category page. Handles one level of subcategories."""
+ cat_url = f"{REINSAAT_BASE}/shop/EN/{cat_slug}/"
+ try:
+ html = fetch_page(cat_url)
+ except Exception as e:
+ print(f" WARN: Failed to fetch category {cat_slug}: {e}")
+ return []
+
+ time.sleep(DELAY)
+
+ # Get all internal links under this category
+ pattern = rf'/shop/EN/{re.escape(cat_slug)}/([^"]+)/'
+ raw_links = re.findall(rf'href="({pattern})"', html)
+ # raw_links is list of (full_path, slug_part) but re gives us captured groups
+ # Let me redo this
+ raw_links = re.findall(rf'href="(/shop/EN/{re.escape(cat_slug)}/[^"]+/)"', html)
+ unique_links = sorted(set(raw_links))
+
+ product_urls = []
+ subcategory_urls = []
+
+ for link in unique_links:
+ full_url = REINSAAT_BASE + link
+ # Determine depth relative to category
+ parts = link.rstrip("/").split("/")
+ # /shop/EN/cat_slug/item -> 4 parts = product or subcategory
+ # /shop/EN/cat_slug/subcat/item -> 5 parts = nested product
+ if len(parts) == 4:
+ # Could be product or subcategory - we'll check later
+ product_urls.append(full_url)
+ elif len(parts) >= 5:
+ product_urls.append(full_url)
+
+ return product_urls
+
+
+def is_product_page(html):
+ """Check if HTML is a product page (has botanical name or JSON-LD Product)."""
+ return bool(
+ re.search(r'fce_shop_kurztext', html)
+ or re.search(r'"@type":\s*"Product"', html)
+ )
+
+
+def main():
+ print("=" * 60)
+ print("Reinsaat v3 Scraper")
+ print("=" * 60)
+
+ # Step 1: Load all species
+ print("\n[1/4] Loading species from API...")
+ species_map = get_all_species()
+ print(f" Loaded {len(species_map)} species")
+
+ # Step 2: Load all cultivars
+ print("\n[2/4] Loading cultivars from API...")
+ cultivar_map = get_all_cultivars()
+ print(f" Loaded {len(cultivar_map)} cultivars")
+
+ # Step 3: Get Reinsaat supplier
+ print("\n[3/4] Getting Reinsaat supplier...")
+ supplier = get_reinsaat_supplier()
+ supplier_id = supplier["id"]
+ print(f" Reinsaat ID: {supplier_id}")
+
+ # Step 4: Scrape categories
+ print(f"\n[4/4] Scraping {len(CATEGORIES)} categories...")
+
+ stats = {
+ "products_found": 0,
+ "botanical_extracted": 0,
+ "species_matched": 0,
+ "species_not_matched": 0,
+ "cultivar_existed": 0,
+ "cultivar_created": 0,
+ "link_existed": 0,
+ "link_created": 0,
+ "errors": 0,
+ }
+ unmatched_species = {} # botanical_norm -> count
+ new_cultivars = []
+ new_links = []
+
+ for cat_i, cat in enumerate(CATEGORIES):
+ print(f"\n--- [{cat_i+1}/{len(CATEGORIES)}] {cat} ---")
+ urls = get_product_urls_from_category(cat)
+ print(f" Found {len(urls)} URLs")
+
+ for url in urls:
+ time.sleep(DELAY)
+ try:
+ html = fetch_page(url)
+ except Exception as e:
+ print(f" ERROR fetching {url}: {e}")
+ stats["errors"] += 1
+ continue
+
+ # Check if this is actually a product page
+ if not is_product_page(html):
+ # Might be a subcategory - get links from it
+ sub_links = re.findall(rf'href="(/shop/EN/[^"]+/)"', html)
+ sub_links = [
+ REINSAAT_BASE + l
+ for l in sorted(set(sub_links))
+ if l.startswith(f"/shop/EN/{cat}/")
+ and l.count("/") > url.rstrip("/").count("/")
+ ]
+ if sub_links:
+ # It's a subcategory, process its product links
+ for sub_url in sub_links:
+ if sub_url in urls:
+ continue # already in list
+ time.sleep(DELAY)
+ try:
+ sub_html = fetch_page(sub_url)
+ except Exception as e:
+ print(f" ERROR fetching {sub_url}: {e}")
+ stats["errors"] += 1
+ continue
+ if not is_product_page(sub_html):
+ continue
+ process_product(
+ sub_html, sub_url, species_map, cultivar_map,
+ supplier_id, stats, unmatched_species,
+ new_cultivars, new_links,
+ )
+ continue
+
+ process_product(
+ html, url, species_map, cultivar_map,
+ supplier_id, stats, unmatched_species,
+ new_cultivars, new_links,
+ )
+
+ # Report
+ print("\n" + "=" * 60)
+ print("RESULTS")
+ print("=" * 60)
+ print(f"Products found: {stats['products_found']}")
+ print(f"Botanical extracted: {stats['botanical_extracted']}")
+ print(f"Species matched: {stats['species_matched']}")
+ print(f"Species NOT matched: {stats['species_not_matched']}")
+ print(f"Cultivars existed: {stats['cultivar_existed']}")
+ print(f"Cultivars created: {stats['cultivar_created']}")
+ print(f"Links existed: {stats['link_existed']}")
+ print(f"Links created: {stats['link_created']}")
+ print(f"Errors: {stats['errors']}")
+
+ if new_cultivars:
+ print(f"\n--- New cultivars ({len(new_cultivars)}) ---")
+ for cv in new_cultivars:
+ print(f" + {cv['name']} ({cv.get('species', '?')})")
+
+ if new_links:
+ print(f"\n--- New supplier links ({len(new_links)}) ---")
+ for lk in new_links:
+ print(f" + {lk['cultivar']} -> {lk.get('article', '?')}")
+
+ if unmatched_species:
+ print(f"\n--- Unmatched species ({len(unmatched_species)}) ---")
+ for name, count in sorted(unmatched_species.items(), key=lambda x: -x[1]):
+ print(f" ? {name} (x{count})")
+
+ print("\nDone.")
+
+
+def process_product(html, url, species_map, cultivar_map, supplier_id,
+ stats, unmatched_species, new_cultivars, new_links):
+ """Process a single product page."""
+ stats["products_found"] += 1
+ prod = extract_product_data(html, url)
+
+ if not prod.get("name"):
+ return
+
+ bot_norm = prod.get("botanical_norm")
+ if not bot_norm:
+ # No botanical name found on page
+ stats["species_not_matched"] += 1
+ unmatched_species["(no botanical name)"] = unmatched_species.get("(no botanical name)", 0) + 1
+ return
+
+ stats["botanical_extracted"] += 1
+
+ # Match species
+ species = species_map.get(bot_norm)
+ if not species:
+ stats["species_not_matched"] += 1
+ unmatched_species[bot_norm] = unmatched_species.get(bot_norm, 0) + 1
+ return
+
+ stats["species_matched"] += 1
+ species_id = species["id"]
+ cultivar_name = prod["name"]
+
+ # Check if cultivar exists
+ cv_key = (species_id, cultivar_name.lower().strip())
+ existing_cv = cultivar_map.get(cv_key)
+
+ if existing_cv:
+ stats["cultivar_existed"] += 1
+ cultivar_id = existing_cv["id"]
+ else:
+ # Create cultivar
+ create_data = {
+ "species_id": species_id,
+ "name": cultivar_name,
+ "is_organic": True,
+ "source_urls": [url],
+ }
+ # Add growing data if we extracted any
+ if "planting_depth_cm" in prod:
+ create_data["planting_depth_cm"] = prod["planting_depth_cm"]
+ if "row_spacing_cm" in prod:
+ create_data["row_spacing_cm"] = prod["row_spacing_cm"]
+ if "plant_spacing_cm" in prod:
+ create_data["plant_spacing_cm"] = prod["plant_spacing_cm"]
+ if "germination_temp_c" in prod:
+ create_data["germination_temp_c"] = prod["germination_temp_c"]
+
+ try:
+ new_cv = api_post("/cultivars", create_data)
+ cultivar_id = new_cv["id"]
+ stats["cultivar_created"] += 1
+ new_cultivars.append({
+ "name": cultivar_name,
+ "species": species["name"],
+ "id": cultivar_id,
+ })
+ # Add to local cache
+ cultivar_map[cv_key] = new_cv
+ print(f" + Created cultivar: {cultivar_name} ({species['name']})")
+ except urllib.error.HTTPError as e:
+ body = e.read().decode() if hasattr(e, 'read') else str(e)
+ if e.code == 500 and "Database error" in body:
+ # Likely slug collision - search for existing cultivar
+ try:
+ # Try multiple search strategies
+ found = None
+ cn_lower = cultivar_name.lower().strip()
+
+ # Strategy 1: search by full name
+ search_data = api_get("/cultivars", {"search": cultivar_name, "per_page": 50})
+ for cv in search_data.get("data", []):
+ if cv["name"].lower().strip() == cn_lower:
+ found = cv
+ break
+ # Strategy 2: match by species_id + partial name
+ if not found:
+ for cv in search_data.get("data", []):
+ if cv["species_id"] == species_id:
+ # Match if names are similar (ignoring punctuation)
+ cv_clean = re.sub(r'[^\w\s]', '', cv["name"].lower())
+ cn_clean = re.sub(r'[^\w\s]', '', cn_lower)
+ if cv_clean == cn_clean or cv_clean in cn_clean or cn_clean in cv_clean:
+ found = cv
+ break
+ # Strategy 3: search by last significant word
+ if not found:
+ words = [w for w in cultivar_name.split() if len(w) > 2]
+ if words:
+ search2 = api_get("/cultivars", {"search": words[-1], "per_page": 50})
+ for cv in search2.get("data", []):
+ if cv["species_id"] == species_id:
+ cv_clean = re.sub(r'[^\w\s]', '', cv["name"].lower())
+ cn_clean = re.sub(r'[^\w\s]', '', cn_lower)
+ if cv_clean == cn_clean:
+ found = cv
+ break
+
+ if found:
+ cultivar_id = found["id"]
+ cultivar_map[cv_key] = found
+ stats["cultivar_existed"] += 1
+ else:
+ print(f" WARN: could not create or find cultivar '{cultivar_name}' (DB error + no search match)")
+ stats["errors"] += 1
+ return
+ except Exception as e2:
+ print(f" ERROR searching for '{cultivar_name}' after collision: {e2}")
+ stats["errors"] += 1
+ return
+ else:
+ print(f" ERROR creating cultivar '{cultivar_name}': {e.code} {body}")
+ stats["errors"] += 1
+ return
+
+ # Check if Reinsaat supplier link exists
+ try:
+ existing_links = get_cultivar_suppliers(cultivar_id)
+ except Exception:
+ existing_links = []
+
+ has_reinsaat = any(l["supplier_id"] == supplier_id for l in existing_links)
+
+ if has_reinsaat:
+ stats["link_existed"] += 1
+ else:
+ # Create supplier link
+ link_data = {
+ "supplier_id": supplier_id,
+ "product_url": url,
+ }
+ if "article_number" in prod:
+ link_data["article_number"] = prod["article_number"]
+ if "port_price" in prod:
+ link_data["price_eur"] = prod["port_price"]
+ elif "price_eur" in prod:
+ link_data["price_eur"] = prod["price_eur"]
+ if "pack_size" in prod:
+ link_data["pack_size"] = prod["pack_size"]
+ if "pack_unit" in prod:
+ link_data["pack_unit"] = prod["pack_unit"]
+
+ try:
+ api_post(f"/cultivars/{cultivar_id}/suppliers", link_data)
+ stats["link_created"] += 1
+ new_links.append({
+ "cultivar": cultivar_name,
+ "article": prod.get("article_number", "?"),
+ "url": url,
+ })
+ except urllib.error.HTTPError as e:
+ body = e.read().decode() if hasattr(e, 'read') else str(e)
+ print(f" ERROR linking '{cultivar_name}': {e.code} {body}")
+ stats["errors"] += 1
+
+
+# Run the scraper only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+ main()