def _sparql_literal(text):
    """Return *text* as a double-quoted SPARQL string literal.

    Backslashes, double quotes and line breaks are escaped so that a name
    can never terminate the literal early and change the query.
    """
    escaped = (
        str(text)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )
    return f'"{escaped}"'


def herbapi_request(path, method="GET", data=None, timeout=30):
    """Send a JSON request to HerbAPI and return the decoded response.

    Args:
        path: Path below ``HERBAPI_BASE``, including any query string.
        method: HTTP method name.
        data: Optional JSON-serialisable payload. ``None`` sends no body;
            any other value (including an empty dict) is serialised.
        timeout: Socket timeout in seconds, so a stalled server cannot
            block the run forever.

    Returns:
        The parsed JSON response body.

    Raises:
        urllib.error.URLError: On connection failure or HTTP error status.
        ValueError: If the response body is not valid JSON.
    """
    url = f"{HERBAPI_BASE}{path}"
    # Test against None rather than truthiness: {} and [] are valid payloads.
    body = json.dumps(data).encode() if data is not None else None
    req = urllib.request.Request(url, data=body, method=method, headers={
        "Authorization": f"Bearer {HERBAPI_TOKEN}",
        "Content-Type": "application/json",
    })
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())


def query_wikidata_batch(names):
    """Query Wikidata for a batch of scientific names.

    Args:
        names: Iterable of taxon names matched against ``wdt:P225``.

    Returns:
        Dict mapping each matched name to
        ``{"qid": ..., "gbif_id": ..., "eppo_code": ...}``; ``gbif_id`` and
        ``eppo_code`` are ``None`` when the item lacks that statement.
        When several result rows share a name, the last row wins.

    Raises:
        urllib.error.URLError: If the SPARQL endpoint cannot be reached.
    """
    names = list(names)
    if not names:
        # Nothing to look up; skip the network round trip entirely.
        return {}

    values = " ".join(_sparql_literal(n) for n in names)
    sparql = (
        "SELECT ?name ?item ?gbifId ?eppoCode WHERE { "
        f"VALUES ?name {{ {values} }} "
        "?item wdt:P225 ?name . "
        "OPTIONAL { ?item wdt:P846 ?gbifId } "
        "OPTIONAL { ?item wdt:P3031 ?eppoCode } "
        "}"
    )
    encoded = urllib.parse.quote(sparql)
    url = f"{WIKIDATA_SPARQL}?query={encoded}&format=json"
    req = urllib.request.Request(url, headers=HEADERS_WD)
    with urllib.request.urlopen(req, timeout=60) as resp:
        data = json.loads(resp.read())

    results = {}
    for binding in data.get("results", {}).get("bindings", []):
        name = binding["name"]["value"]
        # The item value is an entity URI; the QID is its last path segment.
        qid = binding["item"]["value"].rsplit("/", 1)[-1]
        gbif = binding.get("gbifId", {}).get("value")
        eppo = binding.get("eppoCode", {}).get("value")
        results[name] = {"qid": qid, "gbif_id": gbif, "eppo_code": eppo}
    return results


def main():
    """Fill missing Wikidata QID, GBIF ID and EPPO code on HerbAPI species.

    Fetches the species list, looks up the incomplete ones on Wikidata in
    batches, then for each match GETs the full record, fills only the
    empty identifier fields and PUTs the record back. Progress and a final
    summary are printed to stdout.
    """
    # 1. Fetch all species
    resp = herbapi_request("/species?per_page=200")
    species_list = resp["data"]
    print(f"Fetched {len(species_list)} species from HerbAPI\n")

    # 2. Collect species needing enrichment. .get() treats a field the API
    #    omitted the same as one it returned as null.
    id_fields = ("wikidata_qid", "gbif_id", "eppo_code")
    to_enrich = [sp for sp in species_list
                 if not all(sp.get(field) for field in id_fields)]

    if not to_enrich:
        print("All species already enriched.")
        return

    print(f"{len(to_enrich)} species need enrichment\n")

    # 3. Batch query Wikidata
    BATCH_SIZE = 20
    wikidata_results = {}
    names = [sp["name_scientific"] for sp in to_enrich]

    for i in range(0, len(names), BATCH_SIZE):
        batch = names[i:i + BATCH_SIZE]
        print(f"Querying Wikidata batch {i // BATCH_SIZE + 1}: {len(batch)} species...")
        try:
            results = query_wikidata_batch(batch)
            wikidata_results.update(results)
            print(f" Got {len(results)} matches")
        except Exception as e:
            # Best effort: a failed batch must not abort the other batches.
            print(f" ERROR: {e}")
        if i + BATCH_SIZE < len(names):
            # Pause between batches to stay polite to the public endpoint.
            time.sleep(2)

    print(f"\nWikidata returned data for {len(wikidata_results)} / {len(names)} species\n")

    # 4. Update HerbAPI - GET full object by slug, merge, PUT by UUID
    updated = 0
    skipped = 0
    not_found = 0
    errors = 0

    for sp in to_enrich:
        name = sp["name_scientific"]
        wd = wikidata_results.get(name)
        if not wd:
            print(f" SKIP (no Wikidata match): {name}")
            not_found += 1
            continue

        # Only fields that are empty locally AND present on Wikidata count.
        needs_qid = not sp.get("wikidata_qid") and wd["qid"]
        needs_gbif = not sp.get("gbif_id") and wd["gbif_id"]
        needs_eppo = not sp.get("eppo_code") and wd["eppo_code"]

        if not (needs_qid or needs_gbif or needs_eppo):
            print(f" SKIP (nothing new): {name}")
            skipped += 1
            continue

        try:
            # GET full species by slug for the complete object
            full_sp = herbapi_request(f"/species/{sp['slug']}")

            # Remove read-only fields
            species_id = full_sp.pop("id")
            for read_only in ("slug", "created_at", "updated_at"):
                full_sp.pop(read_only, None)

            # Merge new data (only null fields)
            fields = []
            if needs_qid:
                full_sp["wikidata_qid"] = wd["qid"]
                fields.append(f"qid={wd['qid']}")
            if needs_gbif:
                full_sp["gbif_id"] = str(wd["gbif_id"])  # API expects string
                fields.append(f"gbif={wd['gbif_id']}")
            if needs_eppo:
                full_sp["eppo_code"] = wd["eppo_code"]
                fields.append(f"eppo={wd['eppo_code']}")

            # PUT by UUID
            herbapi_request(f"/species/{species_id}", method="PUT", data=full_sp)

            print(f" UPDATED: {name} -> {', '.join(fields)}")
            updated += 1
        except Exception as e:
            # Keep going: one bad record should not stop the whole run.
            print(f" ERROR updating {name}: {e}")
            errors += 1

    print(f"\n{'=' * 60}")
    print(f"RESULTS:")
    print(f" Updated: {updated}")
    print(f" Skipped (no new data): {skipped}")
    print(f" Not found on Wikidata: {not_found}")
    print(f" Errors: {errors}")
    print(f" Total species: {len(species_list)}")


if __name__ == "__main__":
    main()
def api_get(path):
    """GET a HerbAPI path with the bearer token and return the parsed JSON."""
    request = urllib.request.Request(
        f"{BASE_URL}{path}",
        headers={"Authorization": AUTH},
    )
    with urllib.request.urlopen(request) as response:
        payload = response.read()
        return json.loads(payload)


def api_post(path, data):
    """POST *data* as JSON to a HerbAPI path.

    Returns:
        ``(parsed_json, status)`` on success. On an HTTP error status the
        response body is printed and ``(None, status_code)`` is returned.
    """
    request = urllib.request.Request(
        f"{BASE_URL}{path}",
        data=json.dumps(data).encode(),
        headers={"Authorization": AUTH, "Content-Type": "application/json"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(request) as response:
            parsed = json.loads(response.read())
            status = response.status
            return parsed, status
    except urllib.error.HTTPError as http_err:
        detail = http_err.read().decode()
        print(f" ERROR {http_err.code}: {detail}")
        return None, http_err.code


def gbif_get_german_name(scientific_name):
    """Look up the German vernacular name of a taxon on GBIF.

    First resolves the name to a usage key via the species match endpoint,
    then scans that usage's vernacular names for language ``"deu"``.

    Returns:
        The first German vernacular name, or ``None`` when there is no
        match, no German name, or any error occurred (which is printed).
    """

    def _get_json(url):
        # Both GBIF calls share the same context, timeout and decoding.
        with urllib.request.urlopen(
            urllib.request.Request(url), context=ssl_ctx, timeout=10
        ) as response:
            return json.loads(response.read())

    try:
        quoted_name = urllib.parse.quote(scientific_name)
        match = _get_json(
            f"https://api.gbif.org/v1/species/match?name={quoted_name}"
        )

        usage_key = match.get("usageKey")
        if not usage_key:
            return None

        names = _get_json(
            f"https://api.gbif.org/v1/species/{usage_key}/vernacularNames?limit=100"
        )
        return next(
            (
                entry["vernacularName"]
                for entry in names.get("results", [])
                if entry.get("language") == "deu"
            ),
            None,
        )
    except Exception as exc:
        print(f" GBIF lookup failed for {scientific_name}: {exc}")
        return None
───────────────────────────────────────── +FAMILIES_NEEDED = { + "Fabaceae": {"name_en": "Legumes", "name_de": "Hülsenfrüchtler"}, + "Solanaceae": {"name_en": "Nightshade family", "name_de": "Nachtschattengewächse"}, + "Cucurbitaceae": {"name_en": "Gourd family", "name_de": "Kürbisgewächse"}, + "Asteraceae": {"name_en": "Daisy family", "name_de": "Korbblütler"}, + "Chenopodiaceae": {"name_en": "Goosefoot family", "name_de": "Gänsefußgewächse"}, + "Brassicaceae": {"name_en": "Cabbage family", "name_de": "Kreuzblütler"}, + "Amaryllidaceae": {"name_en": "Amaryllis family", "name_de": "Amaryllisgewächse"}, + "Apiaceae": {"name_en": "Carrot family", "name_de": "Doldenblütler"}, + "Poaceae": {"name_en": "Grass family", "name_de": "Süßgräser"}, + "Lamiaceae": {"name_en": "Mint family", "name_de": "Lippenblütler"}, + "Caprifoliaceae": {"name_en": "Honeysuckle family", "name_de": "Geißblattgewächse"}, + "Rosaceae": {"name_en": "Rose family", "name_de": "Rosengewächse"}, + "Grossulariaceae": {"name_en": "Gooseberry family", "name_de": "Stachelbeergewächse"}, + "Ericaceae": {"name_en": "Heath family", "name_de": "Heidekrautgewächse"}, + "Moraceae": {"name_en": "Mulberry family", "name_de": "Maulbeergewächse"}, + # New families not yet in the DB: + "Hypericaceae": {"name_en": "St John's wort family", "name_de": "Johanniskrautgewächse"}, + "Tropaeolaceae": {"name_en": "Nasturtium family", "name_de": "Kapuzinerkressengewächse"}, + "Elaeagnaceae": {"name_en": "Oleaster family", "name_de": "Ölweidengewächse"}, +} + +# ── Species to add ─────────────────────────────────────────────────── +# Format: (scientific_name, family, name_en, name_de, plant_layer, extra_fields) +SPECIES = [ + # Vegetables + ("Phaseolus vulgaris", "Fabaceae", "common bean", "Gartenbohne", "herbaceous", + {"nitrogen_fixer": True, "food_uses": "Beans (pods, seeds)"}), + ("Phaseolus coccineus", "Fabaceae", "runner bean", "Feuerbohne", "herbaceous", + {"nitrogen_fixer": True, "food_uses": "Beans (pods, seeds), 
flowers", "attracts_pollinators": True}), + ("Pisum sativum", "Fabaceae", "pea", "Erbse", "herbaceous", + {"nitrogen_fixer": True, "food_uses": "Peas, shoots"}), + ("Capsicum annuum", "Solanaceae", "pepper", "Paprika", "herbaceous", + {"food_uses": "Fruit"}), + ("Cucumis sativus", "Cucurbitaceae", "cucumber", "Gurke", "ground_cover", + {"food_uses": "Fruit"}), + ("Cucurbita maxima", "Cucurbitaceae", "winter squash", "Riesenkürbis", "ground_cover", + {"food_uses": "Fruit, seeds, flowers"}), + ("Cucurbita moschata", "Cucurbitaceae", "butternut squash", "Moschuskürbis", "ground_cover", + {"food_uses": "Fruit, seeds"}), + ("Lactuca sativa", "Asteraceae", "lettuce", "Salat", "herbaceous", + {"food_uses": "Leaves"}), + ("Spinacia oleracea", "Chenopodiaceae", "spinach", "Spinat", "herbaceous", + {"food_uses": "Leaves"}), + ("Brassica oleracea", "Brassicaceae", "cabbage / kale", "Kohl", "herbaceous", + {"food_uses": "Leaves, flower buds, stems"}), + ("Brassica rapa", "Brassicaceae", "turnip", "Rübe", "herbaceous", + {"food_uses": "Root, leaves"}), + ("Raphanus sativus", "Brassicaceae", "radish", "Rettich", "herbaceous", + {"food_uses": "Root, leaves, seed pods"}), + ("Allium cepa", "Amaryllidaceae", "onion", "Zwiebel", "herbaceous", + {"food_uses": "Bulb, leaves"}), + ("Allium sativum", "Amaryllidaceae", "garlic", "Knoblauch", "herbaceous", + {"food_uses": "Bulb, scapes", "medicinal_uses": "Antimicrobial, cardiovascular"}), + ("Allium schoenoprasum", "Amaryllidaceae", "chives", "Schnittlauch", "herbaceous", + {"food_uses": "Leaves, flowers", "attracts_pollinators": True}), + ("Petroselinum crispum", "Apiaceae", "parsley", "Petersilie", "herbaceous", + {"food_uses": "Leaves, root"}), + ("Apium graveolens", "Apiaceae", "celery", "Sellerie", "herbaceous", + {"food_uses": "Stalks, root, leaves"}), + ("Foeniculum vulgare", "Apiaceae", "fennel", "Fenchel", "herbaceous", + {"food_uses": "Bulb, fronds, seeds", "attracts_beneficial_insects": True}), + ("Pastinaca sativa", 
"Apiaceae", "parsnip", "Pastinake", "herbaceous", + {"food_uses": "Root"}), + ("Zea mays", "Poaceae", "corn", "Mais", "herbaceous", + {"food_uses": "Kernels, cobs"}), + ("Solanum melongena", "Solanaceae", "eggplant", "Melanzani", "herbaceous", + {"food_uses": "Fruit"}), + + # Herbs + ("Ocimum basilicum", "Lamiaceae", "basil", "Basilikum", "herbaceous", + {"food_uses": "Leaves", "attracts_pollinators": True}), + ("Origanum vulgare", "Lamiaceae", "oregano", "Oregano", "herbaceous", + {"food_uses": "Leaves", "attracts_pollinators": True, "attracts_beneficial_insects": True}), + ("Mentha x piperita", "Lamiaceae", "peppermint", "Pfefferminze", "herbaceous", + {"food_uses": "Leaves (tea, culinary)", "medicinal_uses": "Digestive, headache relief", "invasiveness": "spreading"}), + ("Rosmarinus officinalis", "Lamiaceae", "rosemary", "Rosmarin", "herbaceous", + {"food_uses": "Leaves", "attracts_pollinators": True}), + ("Anethum graveolens", "Apiaceae", "dill", "Dill", "herbaceous", + {"food_uses": "Leaves, seeds", "attracts_beneficial_insects": True}), + ("Coriandrum sativum", "Apiaceae", "coriander", "Koriander", "herbaceous", + {"food_uses": "Leaves, seeds", "attracts_beneficial_insects": True}), + ("Artemisia absinthium", "Asteraceae", "wormwood", "Wermut", "herbaceous", + {"medicinal_uses": "Digestive, anti-parasitic", "other_uses": "Companion plant pest deterrent", "allelopathic": True}), + ("Achillea millefolium", "Asteraceae", "yarrow", "Schafgarbe", "herbaceous", + {"food_uses": "Young leaves (salad)", "medicinal_uses": "Wound healing, anti-inflammatory", + "dynamic_accumulator": True, "dynamic_accumulator_nutrients": "K, P, Cu", + "attracts_beneficial_insects": True, "attracts_pollinators": True}), + ("Hypericum perforatum", "Hypericaceae", "St John's wort", "Johanniskraut", "herbaceous", + {"medicinal_uses": "Antidepressant, wound healing", "attracts_pollinators": True}), + ("Echinacea purpurea", "Asteraceae", "echinacea", "Sonnenhut", "herbaceous", + 
{"medicinal_uses": "Immune stimulant", "attracts_pollinators": True, "wildlife_value": "Seeds for birds"}), + ("Valeriana officinalis", "Caprifoliaceae", "valerian", "Baldrian", "herbaceous", + {"medicinal_uses": "Sedative, sleep aid", "attracts_pollinators": True, + "other_uses": "Earthworm attractant (biodynamic)"}), + + # Flowers & cover crops + ("Tagetes patula", "Asteraceae", "French marigold", "Studentenblume", "herbaceous", + {"other_uses": "Nematode suppression, companion plant", "attracts_pollinators": True}), + ("Helianthus annuus", "Asteraceae", "sunflower", "Sonnenblume", "herbaceous", + {"food_uses": "Seeds, oil", "attracts_pollinators": True, "wildlife_value": "Seeds for birds"}), + ("Tropaeolum majus", "Tropaeolaceae", "nasturtium", "Kapuzinerkresse", "ground_cover", + {"food_uses": "Leaves, flowers, seeds (capers)", "other_uses": "Trap crop for aphids"}), + ("Centaurea cyanus", "Asteraceae", "cornflower", "Kornblume", "herbaceous", + {"food_uses": "Flowers (edible garnish)", "attracts_pollinators": True, "attracts_beneficial_insects": True}), + ("Sinapis alba", "Brassicaceae", "white mustard", "Weißer Senf", "herbaceous", + {"food_uses": "Seeds, young leaves", "other_uses": "Green manure, biofumigant"}), + ("Trifolium repens", "Fabaceae", "white clover", "Weißklee", "ground_cover", + {"nitrogen_fixer": True, "food_uses": "Flowers (tea), young leaves", + "ground_cover_quality": "excellent", "attracts_pollinators": True}), + ("Medicago sativa", "Fabaceae", "alfalfa", "Luzerne", "herbaceous", + {"nitrogen_fixer": True, "food_uses": "Sprouts", + "dynamic_accumulator": True, "dynamic_accumulator_nutrients": "N, K, Ca, Mg, Fe", + "other_uses": "Green manure, deep-rooting soil improver"}), + + # Fruit / Trees + ("Prunus avium", "Rosaceae", "sweet cherry", "Süßkirsche", "canopy", + {"food_uses": "Fruit", "attracts_pollinators": True, "wildlife_value": "Fruit for birds"}), + ("Prunus cerasus", "Rosaceae", "sour cherry", "Sauerkirsche", "understory", + 
def main():
    """Create missing plant families and species in HerbAPI.

    Loads the families and species already known to the API, creates every
    entry of ``FAMILIES_NEEDED`` that is missing, then posts each entry of
    ``SPECIES`` that does not exist yet. The GBIF German name is fetched
    only to be logged; the curated ``name_de`` is what gets stored.
    """
    # 1. Load existing families
    print("=== Loading existing families ===")
    family_rows = api_get("/families?per_page=100")["data"]
    # name_scientific -> id
    family_map = {}
    for row in family_rows:
        family_map[row["name_scientific"]] = row["id"]
    print(f" Found {len(family_map)} existing families")

    # 2. Create missing families
    print("\n=== Creating missing families ===")
    families_created = 0
    for fam_name in FAMILIES_NEEDED:
        if fam_name in family_map:
            print(f" SKIP (exists): {fam_name}")
            continue

        translations = FAMILIES_NEEDED[fam_name]
        family_payload = {
            "name_scientific": fam_name,
            "name_en": translations["name_en"],
            "name_de": translations["name_de"],
        }
        print(f" CREATE: {fam_name} ...", end=" ")
        response, status = api_post("/families", family_payload)
        if response and "id" in response:
            new_id = response["id"]
            family_map[fam_name] = new_id
            print(f"OK ({new_id})")
            families_created += 1
        else:
            print(f"FAILED (status={status})")
        time.sleep(DELAY)

    print(f"\n Families created: {families_created}")

    # 3. Load existing species
    print("\n=== Loading existing species ===")
    species_rows = api_get("/species?per_page=200")["data"]
    existing_species = set()
    for row in species_rows:
        existing_species.add(row["name_scientific"])
    print(f" Found {len(existing_species)} existing species")

    # 4. Add new species
    print("\n=== Adding new species ===")
    tally = {"created": 0, "skipped": 0, "failed": 0}

    for entry in SPECIES:
        sci_name, family, name_en, name_de, plant_layer, extras = entry

        if sci_name in existing_species:
            print(f" SKIP (exists): {sci_name}")
            tally["skipped"] += 1
            continue

        # Look up family ID
        fam_id = family_map.get(family)
        if not fam_id:
            print(f" SKIP (no family '{family}'): {sci_name}")
            tally["failed"] += 1
            continue

        # GBIF's German name is logged for comparison only; the curated
        # name_de from the catalogue stays the stored value.
        gbif_de = gbif_get_german_name(sci_name)
        if gbif_de:
            print(f" GBIF name for {sci_name}: {gbif_de}")

        species_payload = {
            "name_scientific": sci_name,
            "family_id": fam_id,
            "name_en": name_en,
            "name_de": name_de,
            "plant_layer": plant_layer,
        }
        # Extra fields are applied last so they can override the base ones.
        species_payload.update(extras)

        print(f" CREATE: {sci_name} ({name_de}) ...", end=" ")
        response, status = api_post("/species", species_payload)
        if response and "id" in response:
            print(f"OK ({response['id']})")
            tally["created"] += 1
        else:
            print(f"FAILED (status={status})")
            tally["failed"] += 1
        time.sleep(DELAY)

    created = tally["created"]
    print(f"\n{'='*50}")
    print(f"SUMMARY")
    print(f" Families created: {families_created}")
    print(f" Species created: {created}")
    print(f" Species skipped: {tally['skipped']}")
    print(f" Species failed: {tally['failed']}")
    print(f" Total species now: {len(existing_species) + created}")


if __name__ == "__main__":
    main()
"cc-by-sa-4.0", +} + + +def slugify(name: str) -> str: + """Convert scientific name to a URL-safe slug.""" + return re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-') + + +def psql(query: str) -> str: + """Run a psql query and return output.""" + env = os.environ.copy() + env["PGPASSWORD"] = DB_PASS + result = subprocess.run( + ["psql", "-h", DB_HOST, "-U", DB_USER, DB_NAME, "-t", "-A", "-c", query], + capture_output=True, text=True, env=env + ) + if result.returncode != 0: + print(f" psql error: {result.stderr.strip()}", file=sys.stderr) + return result.stdout.strip() + + +def fetch_json(url: str) -> dict | None: + """Fetch JSON from a URL with proper User-Agent.""" + req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT}) + try: + with urllib.request.urlopen(req, timeout=30) as resp: + return json.loads(resp.read()) + except Exception as e: + print(f" HTTP error fetching {url}: {e}") + return None + + +def get_wikidata_image(qid: str) -> str | None: + """Query Wikidata SPARQL for P18 image filename.""" + sparql = f"SELECT ?image WHERE {{ wd:{qid} wdt:P18 ?image }} LIMIT 1" + url = "https://query.wikidata.org/sparql?" + urllib.parse.urlencode({ + "query": sparql, "format": "json" + }) + data = fetch_json(url) + if not data: + return None + bindings = data.get("results", {}).get("bindings", []) + if not bindings: + return None + image_url = bindings[0]["image"]["value"] + # URL like http://commons.wikimedia.org/wiki/Special:FilePath/Filename.jpg + filename = urllib.parse.unquote(image_url.rsplit("/", 1)[-1]) + return filename + + +def get_commons_info(filename: str) -> dict | None: + """Get image info from Wikimedia Commons API.""" + url = "https://commons.wikimedia.org/w/api.php?" 
+ urllib.parse.urlencode({ + "action": "query", + "titles": f"File:{filename}", + "prop": "imageinfo", + "iiprop": "url|extmetadata", + "iiurlwidth": str(THUMB_WIDTH), + "format": "json", + }) + data = fetch_json(url) + if not data: + return None + pages = data.get("query", {}).get("pages", {}) + for page_id, page in pages.items(): + if page_id == "-1": + return None + imageinfo = page.get("imageinfo", []) + if not imageinfo: + return None + info = imageinfo[0] + meta = info.get("extmetadata", {}) + + thumb_url = info.get("thumburl") or info.get("url") + desc_url = info.get("descriptionurl", "") + + license_short = meta.get("LicenseShortName", {}).get("value", "") + artist_html = meta.get("Artist", {}).get("value", "") + # Strip HTML tags from artist + artist = re.sub(r'<[^>]+>', '', artist_html).strip() + # Clean up whitespace + artist = re.sub(r'\s+', ' ', artist) + + return { + "thumb_url": thumb_url, + "description_url": desc_url, + "license": license_short, + "artist": artist, + "filename": filename, + } + return None + + +def is_license_allowed(license_str: str) -> bool: + """Check if a license is in our allowed list.""" + normalized = license_str.lower().strip() + # Direct match + if normalized in ALLOWED_LICENSES: + return True + # Check for NC or ND + if "nc" in normalized or "nd" in normalized: + return False + # Check patterns + if normalized.startswith("public domain") or normalized.startswith("pd"): + return True + if re.match(r'^cc[- ]?by[- ]?sa[- ]?\d', normalized): + return True + if re.match(r'^cc[- ]?by[- ]?\d', normalized): + return True + if re.match(r'^cc[- ]?0', normalized) or normalized == "cc zero": + return True + return False + + +def normalize_license(license_str: str) -> str: + """Normalize license string for storage.""" + low = license_str.lower().strip() + if "public domain" in low or low.startswith("pd"): + return "Public domain" + if re.match(r'^cc[- ]?0', low) or "cc-zero" in low or "cc zero" in low: + return "CC0 1.0" + # CC BY-SA 
X.0 + m = re.match(r'^cc[- ]?by[- ]?sa[- ]?(\d+\.?\d*)', low) + if m: + return f"CC BY-SA {m.group(1)}" + # CC BY X.0 + m = re.match(r'^cc[- ]?by[- ]?(\d+\.?\d*)', low) + if m: + return f"CC BY {m.group(1)}" + return license_str + + +def s3_upload(s3_key: str, data: bytes, content_type: str = "image/jpeg"): + """Upload to S3 Garage using AWS CLI.""" + tmp_path = "/tmp/_herbapi_upload_tmp_file_file" + with open(tmp_path, "wb") as f: + f.write(data) + + env = os.environ.copy() + env["AWS_ACCESS_KEY_ID"] = S3_ACCESS_KEY + env["AWS_SECRET_ACCESS_KEY"] = S3_SECRET_KEY + env["AWS_DEFAULT_REGION"] = S3_REGION + + result = subprocess.run( + [ + "aws", "s3", "cp", tmp_path, + f"s3://{S3_BUCKET}/{s3_key}", + "--endpoint-url", S3_ENDPOINT, + "--content-type", content_type, + ], + capture_output=True, text=True, env=env + ) + os.unlink(tmp_path) + if result.returncode != 0: + raise RuntimeError(f"S3 upload failed: {result.stderr.strip()}") + + +def download_image(url: str) -> bytes | None: + """Download image data from URL.""" + req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT}) + try: + with urllib.request.urlopen(req, timeout=60) as resp: + return resp.read() + except Exception as e: + print(f" Download error: {e}") + return None + + +def main(): + # 1. Get species + rows = psql( + "SELECT id, name_scientific, wikidata_qid FROM species " + "WHERE wikidata_qid IS NOT NULL AND wikidata_qid <> '' " + "ORDER BY name_scientific" + ) + if not rows: + print("No species with wikidata_qid found.") + return + + species_list = [] + for line in rows.split("\n"): + parts = line.split("|") + if len(parts) == 3: + species_list.append({ + "id": parts[0], + "name": parts[1], + "qid": parts[2], + }) + + print(f"Found {len(species_list)} species with Wikidata QIDs.") + + # 2. 
Get existing images + existing = set() + existing_rows = psql("SELECT entity_id FROM images WHERE entity_type = 'species'") + if existing_rows: + for line in existing_rows.split("\n"): + line = line.strip() + if line: + existing.add(line) + + print(f"Found {len(existing)} species that already have images.") + + imported = 0 + skipped_existing = 0 + skipped_no_image = 0 + skipped_license = 0 + skipped_download = 0 + errors = 0 + + for i, sp in enumerate(species_list): + name = sp["name"] + qid = sp["qid"] + sp_id = sp["id"] + slug = slugify(name) + + print(f"\n[{i+1}/{len(species_list)}] {name} ({qid})") + + if sp_id in existing: + print(" Already has image, skipping.") + skipped_existing += 1 + continue + + # Query Wikidata for image + time.sleep(REQUEST_DELAY) + filename = get_wikidata_image(qid) + if not filename: + print(" No image on Wikidata.") + skipped_no_image += 1 + continue + + # Get Commons info + time.sleep(REQUEST_DELAY) + info = get_commons_info(filename) + if not info: + print(f" Could not get Commons info for {filename}") + skipped_no_image += 1 + continue + + # Check license + raw_license = info["license"] + if not is_license_allowed(raw_license): + print(f" License not allowed: {raw_license}") + skipped_license += 1 + continue + + norm_license = normalize_license(raw_license) + artist = info["artist"] + thumb_url = info["thumb_url"] + desc_url = info["description_url"] + + print(f" License: {raw_license} -> {norm_license}") + print(f" Artist: {artist[:80]}") + print(f" Thumbnail: {thumb_url[:100]}...") + + # Download image + time.sleep(REQUEST_DELAY) + image_data = download_image(thumb_url) + if not image_data: + print(" Failed to download image.") + skipped_download += 1 + continue + + print(f" Downloaded {len(image_data)} bytes") + + # Determine file extension from URL + ext = "jpg" + if ".png" in thumb_url.lower(): + ext = "png" + elif ".svg" in thumb_url.lower(): + ext = "svg" + elif ".gif" in thumb_url.lower(): + ext = "gif" + + s3_key = 
f"species/{slug}.{ext}" + content_type = { + "jpg": "image/jpeg", + "png": "image/png", + "svg": "image/svg+xml", + "gif": "image/gif", + }.get(ext, "image/jpeg") + + # Upload to S3 + try: + s3_upload(s3_key, image_data, content_type) + print(f" Uploaded to s3://{S3_BUCKET}/{s3_key}") + except RuntimeError as e: + print(f" S3 upload failed: {e}") + errors += 1 + continue + + # Insert into database + caption = f"Photo: {artist}" if artist else "Wikimedia Commons" + # Escape single quotes for SQL + caption_esc = caption.replace("'", "''") + desc_url_esc = desc_url.replace("'", "''") + norm_license_esc = norm_license.replace("'", "''") + s3_key_esc = s3_key.replace("'", "''") + + insert_sql = ( + f"INSERT INTO images (id, entity_type, entity_id, s3_key, caption, source_url, license, is_primary) " + f"VALUES (gen_random_uuid(), 'species', '{sp_id}', '{s3_key_esc}', " + f"'{caption_esc}', '{desc_url_esc}', '{norm_license_esc}', true)" + ) + + result = psql(insert_sql) + # psql returns empty on success for INSERT + print(f" Inserted into images table.") + imported += 1 + + print(f"\n{'='*60}") + print(f"DONE!") + print(f" Imported: {imported}") + print(f" Skipped (existing):{skipped_existing}") + print(f" Skipped (no image):{skipped_no_image}") + print(f" Skipped (license): {skipped_license}") + print(f" Skipped (download):{skipped_download}") + print(f" Errors: {errors}") + print(f" Total processed: {len(species_list)}") + + +if __name__ == "__main__": + main() diff --git a/tools/enrichment/import_images_v2.py b/tools/enrichment/import_images_v2.py new file mode 100644 index 0000000..034c123 --- /dev/null +++ b/tools/enrichment/import_images_v2.py @@ -0,0 +1,290 @@ +#!/usr/bin/env python3 +"""Import CC-licensed plant images from Wikimedia Commons into HerbAPI.""" + +import hashlib +import json +import os +import re +import subprocess +import sys +import time +import urllib.parse +import urllib.request + +# Config +DB_HOST = "10.31.3.90" +DB_USER = "herbapi" +DB_PASS = 
"_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj" +DB_NAME = "herbapi" +S3_BUCKET = "herbapi" +S3_ENDPOINT = "http://10.31.3.170:3900" +USER_AGENT = "HerbAPI/1.0 (https://herbapi.naturalised.at; florian.berthold@sub-net.at)" +REQUEST_DELAY = 0.3 + +# AWS env for subprocess calls +AWS_ENV = { + **os.environ, + "AWS_ACCESS_KEY_ID": "GK1a89859373a6ac56bf11958f", + "AWS_SECRET_ACCESS_KEY": "bea45a333b5c7b1efdd7466bdbcac54d8642fa19f0c617ca2fd64bd07951b899", + "AWS_DEFAULT_REGION": "garage", +} + +# Stats +stats = {"total": 0, "imported": 0, "no_p18": 0, "bad_license": 0, "download_fail": 0, "upload_fail": 0, "errors": 0} + + +def fetch_url(url): + """Fetch URL with custom User-Agent.""" + req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT}) + with urllib.request.urlopen(req, timeout=30) as resp: + return resp.read() + + +def fetch_json(url): + """Fetch URL and parse JSON.""" + return json.loads(fetch_url(url)) + + +def psql(sql): + """Run psql command and return output.""" + result = subprocess.run( + ["psql", "-h", DB_HOST, "-U", DB_USER, DB_NAME, "-t", "-A", "-c", sql], + capture_output=True, text=True, + env={**os.environ, "PGPASSWORD": DB_PASS}, + ) + return result.stdout.strip() + + +def is_license_allowed(license_str): + """Check if license is CC0/CC-BY/CC-BY-SA or Public Domain. + Wikimedia returns things like 'CC BY-SA 3.0', 'CC BY 4.0', 'CC0', 'Public domain'. + We allow CC0, Public Domain, CC BY (any version), CC BY-SA (any version). + We reject: GFDL, CC BY-NC, CC BY-ND, CC BY-NC-SA, CC BY-NC-ND, FAL, Copyrighted free use. 
+ """ + if not license_str: + return False + ls = license_str.lower().strip() + + # Reject NC and ND explicitly first + if "nc" in ls.split() or "-nc" in ls or "nd" in ls.split() or "-nd" in ls: + return False + + # Public domain / CC0 + if ls in ("cc0", "cc-zero", "cc0 1.0", "cc0 1.0 universal"): + return True + if "public domain" in ls or ls.startswith("pd"): + return True + + # CC BY-SA (any version, any jurisdiction) + if re.match(r"cc\s+by-sa\b", ls): + return True + + # CC BY (any version, any jurisdiction) -- but NOT CC BY-NC or CC BY-ND + if re.match(r"cc\s+by\b", ls): + return True + + return False + + +def get_wikidata_image(qid): + """Query Wikidata SPARQL for P18 image filename.""" + sparql = f"SELECT ?image WHERE {{ wd:{qid} wdt:P18 ?image }} LIMIT 1" + url = f"https://query.wikidata.org/sparql?query={urllib.parse.quote(sparql)}&format=json" + data = fetch_json(url) + bindings = data.get("results", {}).get("bindings", []) + if not bindings: + return None + image_url = bindings[0]["image"]["value"] + # Extract filename from commons URL + filename = urllib.parse.unquote(image_url.split("/")[-1]) + return filename + + +def get_commons_info(filename): + """Get image info from Commons API: license, artist, thumbnail URL.""" + title = f"File:{filename}" + url = ( + f"https://commons.wikimedia.org/w/api.php?action=query" + f"&titles={urllib.parse.quote(title)}" + f"&prop=imageinfo&iiprop=url|extmetadata" + f"&iiurlwidth=800&format=json" + ) + data = fetch_json(url) + pages = data.get("query", {}).get("pages", {}) + for page_id, page in pages.items(): + if page_id == "-1": + return None + imageinfo = page.get("imageinfo", [{}])[0] + meta = imageinfo.get("extmetadata", {}) + + license_short = meta.get("LicenseShortName", {}).get("value", "").strip() + artist_html = meta.get("Artist", {}).get("value", "") + + # Clean up artist: strip HTML tags + artist = re.sub(r"<[^>]+>", "", artist_html).strip() + # Collapse whitespace + artist = re.sub(r"\s+", " ", artist) + 
def process_species(species_id, slug, name_sci, qid):
    """Import the Wikidata P18 image of a single species.

    Pipeline: look up the P18 filename on Wikidata, read license / artist /
    thumbnail metadata from Commons, reject disallowed licenses, download the
    thumbnail, upload it to S3 and insert a row into the ``images`` table.
    Every outcome is tallied in the module-level ``stats`` dict.

    Args:
        species_id: id of the species row (stored as ``images.entity_id``).
        slug: species slug; becomes part of the S3 key ``species/<slug>.<ext>``.
        name_sci: scientific name (not used inside this function).
        qid: Wikidata QID of the species.

    Returns:
        True when the image was uploaded and recorded, False otherwise.
    """
    import tempfile  # function-scope import: only the download step needs it

    def sql_quote(value):
        # psql() receives a finished SQL string, so every interpolated value is
        # quoted the same way (single quotes doubled).
        # NOTE(review): switch to bound parameters if psql() ever supports them.
        return str(value).replace("'", "''")

    stats["total"] += 1

    # Step 1: Get image filename from Wikidata
    try:
        filename = get_wikidata_image(qid)
    except Exception as e:
        print(f" ERROR querying Wikidata for {qid}: {e}")
        stats["errors"] += 1
        return False
    time.sleep(REQUEST_DELAY)

    if not filename:
        print(f" No P18 image for {qid}")
        stats["no_p18"] += 1
        return False

    # Step 2: Get Commons info (license, artist, thumb URL)
    try:
        info = get_commons_info(filename)
    except Exception as e:
        print(f" ERROR querying Commons for {filename}: {e}")
        stats["errors"] += 1
        return False
    time.sleep(REQUEST_DELAY)

    if not info:
        print(f" No Commons info for {filename}")
        stats["errors"] += 1
        return False

    # Step 3: Check license
    if not is_license_allowed(info["license"]):
        print(f" Bad license: {info['license']} for {filename}")
        stats["bad_license"] += 1
        return False

    # Step 4: Download thumbnail using API-provided URL
    thumb_url = info["thumb_url"]
    if not thumb_url:
        print(f" No thumbnail URL available for {filename}")
        stats["download_fail"] += 1
        return False

    # Determine the file extension from the last path segment of the thumbnail
    # URL (query string removed); anything that is not PNG/GIF is stored as JPG.
    thumb_name = thumb_url.lower().split("?")[0].split("/")[-1]
    if ".png" in thumb_name:
        ext = "png"
    elif ".gif" in thumb_name:
        ext = "gif"
    else:
        ext = "jpg"

    s3_key = f"species/{slug}.{ext}"
    tmp_path = None
    try:
        try:
            img_data = fetch_url(thumb_url)
            # Securely created, uniquely named temp file instead of a
            # predictable /tmp/herbapi_img_<slug> path.
            with tempfile.NamedTemporaryFile(
                prefix="herbapi_img_", suffix=f".{ext}", delete=False
            ) as tmp:
                tmp_path = tmp.name
                tmp.write(img_data)
        except Exception as e:
            print(f" ERROR downloading {thumb_url}: {e}")
            stats["download_fail"] += 1
            return False
        time.sleep(REQUEST_DELAY)

        # Step 5: Upload to S3
        try:
            result = subprocess.run(
                ["aws", "s3", "cp", tmp_path, f"s3://{S3_BUCKET}/{s3_key}",
                 "--endpoint-url", S3_ENDPOINT],
                capture_output=True, text=True, env=AWS_ENV, timeout=60,
            )
        except Exception as e:
            print(f" ERROR uploading to S3: {e}")
            stats["upload_fail"] += 1
            return False
        if result.returncode != 0:
            print(f" S3 upload failed: {result.stderr}")
            stats["upload_fail"] += 1
            return False
    finally:
        # The temp file is removed on every path, including failed writes.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass

    # Step 6: Insert into DB
    caption = f"Photo: {info['artist']}" if info["artist"] else ""
    source_url = info["desc_url"] or f"https://commons.wikimedia.org/wiki/File:{urllib.parse.quote(filename)}"

    sql = (
        "INSERT INTO images (entity_type, entity_id, s3_key, caption, source_url, license, is_primary) "
        f"VALUES ('species', '{sql_quote(species_id)}', '{sql_quote(s3_key)}', "
        f"'{sql_quote(caption)}', '{sql_quote(source_url)}', '{sql_quote(info['license'])}', true);"
    )
    try:
        psql(sql)
    except Exception as e:
        print(f" ERROR inserting to DB: {e}")
        stats["errors"] += 1
        return False

    stats["imported"] += 1
    return True
def gbif_de_name(name):
    """Look up a German vernacular name for a scientific name on GBIF.

    Matches ``name`` via ``/species/match`` and then scans the first 100
    vernacular names of the matched usage for an entry whose language is
    ``"deu"``.

    Args:
        name: scientific name to look up.

    Returns:
        The first non-empty German vernacular name, or None when GBIF has no
        match, no German name, or the lookup fails. The lookup is best-effort:
        failures are reported on stderr and never raised.
    """
    match_url = f"{GBIF}/species/match?name={urllib.parse.quote(name)}"
    try:
        with urllib.request.urlopen(match_url, timeout=30) as resp:
            match = json.loads(resp.read())
        usage_key = match.get("usageKey")
        if not usage_key:
            return None

        names_url = f"{GBIF}/species/{usage_key}/vernacularNames?limit=100"
        with urllib.request.urlopen(names_url, timeout=30) as resp:
            data = json.loads(resp.read())

        for record in data.get("results", []):
            # Skip German records without a usable name instead of giving up.
            if record.get("language") == "deu" and record.get("vernacularName"):
                return record["vernacularName"]
    except Exception as e:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        print(f" GBIF lookup failed for {name}: {e}", file=sys.stderr)
    return None
"medicinal_uses": "Wound healing, bone knitting (external only)", "other_uses": "Dynamic accumulator, mulch/compost activator, animal fodder"}), + ("Trifolium pratense", "Fabaceae", {"plant_layer": "ground_cover", "nitrogen_fixer": True, "food_uses": "Flowers, young leaves", "medicinal_uses": "Respiratory, menopausal symptoms", "other_uses": "Green manure, nitrogen fixer, bee forage"}), + ("Corylus avellana", "Betulaceae", {"plant_layer": "shrub", "food_uses": "Nuts", "other_uses": "Coppice wood, hedging, wildlife habitat", "succession_stage": "secondary"}), + ("Ribes nigrum", "Grossulariaceae", {"plant_layer": "shrub", "food_uses": "Berries, leaves (tea)", "medicinal_uses": "High vitamin C, anti-inflammatory"}), + ("Rubus idaeus", "Rosaceae", {"plant_layer": "shrub", "food_uses": "Berries, leaves (tea)", "medicinal_uses": "Leaf tea for pregnancy/digestion", "succession_stage": "pioneer"}), + ("Urtica dioica", "Urticaceae", {"plant_layer": "herbaceous", "dynamic_accumulator": True, "food_uses": "Young leaves, seeds", "medicinal_uses": "Anti-inflammatory, prostate, allergies", "other_uses": "Compost activator, fibre, liquid fertiliser"}), + ("Borago officinalis", "Boraginaceae", {"plant_layer": "herbaceous", "food_uses": "Flowers, young leaves", "other_uses": "Bee forage, companion plant", "attracts_pollinators": True}), + ("Lavandula angustifolia", "Lamiaceae", {"plant_layer": "herbaceous", "food_uses": "Flowers", "medicinal_uses": "Calming, antiseptic, sleep aid", "other_uses": "Bee forage, pest repellent, fragrance", "attracts_pollinators": True}), + ("Malus domestica", "Rosaceae", {"plant_layer": "canopy", "food_uses": "Fruit", "pollination_type": "Insect-pollinated"}), + ("Prunus domestica", "Rosaceae", {"plant_layer": "canopy", "food_uses": "Fruit", "pollination_type": "Insect-pollinated"}), + ("Juglans regia", "Juglandaceae", {"plant_layer": "canopy", "food_uses": "Nuts", "other_uses": "Timber, dye", "allelopathic": True}), + ("Fragaria vesca", "Rosaceae", 
{"plant_layer": "ground_cover", "food_uses": "Berries, leaves (tea)", "ground_cover_quality": "Good"}), + ("Allium ursinum", "Amaryllidaceae", {"plant_layer": "ground_cover", "food_uses": "Leaves, flowers, bulbs", "medicinal_uses": "Antimicrobial, blood pressure"}), + ("Phacelia tanacetifolia", "Boraginaceae", {"plant_layer": "herbaceous", "other_uses": "Green manure, bee forage, cover crop", "attracts_pollinators": True}), + ("Lupinus polyphyllus", "Fabaceae", {"plant_layer": "herbaceous", "nitrogen_fixer": True, "other_uses": "Nitrogen fixer, green manure, ornamental"}), + ("Vicia faba", "Fabaceae", {"plant_layer": "herbaceous", "nitrogen_fixer": True, "food_uses": "Beans", "other_uses": "Nitrogen fixer, green manure"}), + ("Solanum lycopersicum", "Solanaceae", {"plant_layer": "herbaceous", "food_uses": "Fruit"}), + ("Cucurbita pepo", "Cucurbitaceae", {"plant_layer": "ground_cover", "food_uses": "Fruit, seeds, flowers"}), + ("Beta vulgaris", "Chenopodiaceae", {"plant_layer": "herbaceous", "food_uses": "Roots, leaves"}), + ("Daucus carota", "Apiaceae", {"plant_layer": "herbaceous", "food_uses": "Root"}), + ("Calendula officinalis", "Asteraceae", {"plant_layer": "herbaceous", "food_uses": "Flowers", "medicinal_uses": "Wound healing, anti-inflammatory, skin care", "other_uses": "Companion plant, pest deterrent", "attracts_pollinators": True}), + ("Melissa officinalis", "Lamiaceae", {"plant_layer": "herbaceous", "food_uses": "Leaves", "medicinal_uses": "Calming, antiviral, digestive", "attracts_pollinators": True}), + ("Salvia officinalis", "Lamiaceae", {"plant_layer": "herbaceous", "food_uses": "Leaves", "medicinal_uses": "Sore throat, digestive, antimicrobial"}), + ("Thymus vulgaris", "Lamiaceae", {"plant_layer": "ground_cover", "food_uses": "Leaves", "medicinal_uses": "Respiratory, antimicrobial, cough"}), +] + +# Create families +print("=== Creating families ===") +family_map = {} +for sci, de, en in FAMILIES: + r = api_post("/families", {"name_scientific": sci, 
"name_de": de, "name_en": en}) + if r: + family_map[sci] = r["id"] + print(f" ✓ {sci}") + time.sleep(0.05) +print(f"Created {len(family_map)} families\n") + +# Create species +print("=== Creating species (with GBIF German names) ===") +created = 0 +for sci_name, family_sci, extra in SPECIES: + fam_id = family_map.get(family_sci) + if not fam_id: + print(f" ✗ {sci_name} — family {family_sci} missing") + continue + de_name = gbif_de_name(sci_name) + data = {"name_scientific": sci_name, "name_de": de_name or "", "name_en": "", "family_id": fam_id, **extra} + r = api_post("/species", data) + if r: + created += 1 + print(f" ✓ {sci_name} → {de_name or '(no DE name)'}") + time.sleep(0.15) +print(f"Created {created} species\n") + +# Create suppliers +print("=== Creating suppliers ===") +for name, url, country, organic, demeter, notes in [ + ("Reinsaat", "https://www.reinsaat.at", "AT", True, True, "Austrian biodynamic seed producer, open-pollinated varieties"), + ("Magic Garden Seeds", "https://www.magicgardenseeds.com", "DE", False, False, "Specialist seed shop with rare and heritage varieties"), +]: + r = api_post("/suppliers", {"name": name, "url": url, "country": country, "is_organic": organic, "is_demeter": demeter, "notes": notes}) + if r: print(f" ✓ {name}") +print("\nDone!") diff --git a/tools/scrapers/scrape_arche_noah.py b/tools/scrapers/scrape_arche_noah.py new file mode 100644 index 0000000..0b58f3a --- /dev/null +++ b/tools/scrapers/scrape_arche_noah.py @@ -0,0 +1,514 @@ +#!/usr/bin/env python3 +""" +Scrape Arche Noah seed catalog and import cultivars into HerbAPI. + +Uses the shop.arche-noah.at Angular SPA's backend API (ACM) to fetch +product listings and details, then creates cultivars in HerbAPI matched +to existing species. 
+""" + +import json +import re +import time +import urllib.request +import urllib.error +import urllib.parse +import sys +from datetime import datetime, timezone + +# --- Configuration ----------------------------------------------------------- + +HERBAPI_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1" +HERBAPI_TOKEN = "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk" + +SHOP_BASE = "https://shop.arche-noah.at/ACM/api/" +SHOP_UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" + +REQUEST_DELAY = 0.5 # seconds between requests + +# Only import products from these Arche Noah article lines (their own seeds) +ARCHE_NOAH_LINES = { + "Bio-Saatgut von ARCHE NOAH", + "Kostbarkeiten aus dem ARCHE NOAH Samenarchiv", +} + +# Search terms to discover all seed products across the shop +SEARCH_TERMS = [ + "Tomate", "Paradeiser", "Paprika", "Chili", "Gurke", "Kürbis", "Zucchini", + "Bohne", "Erbse", "Fisole", "Salat", "Kohl", "Kraut", "Melanzani", "Aubergine", + "Mais", "Zwiebel", "Lauch", "Karotte", "Rübe", "Basilikum", "Kräuter", + "Blume", "Sonnenblume", "Dill", "Petersilie", "Spinat", "Mangold", + "Melone", "Fenchel", "Sellerie", "Rettich", "Radieschen", + "Koriander", "Oregano", "Thymian", "Salbei", "Rosmarin", "Minze", + "Ringelblume", "Kornblume", "Kapuzinerkresse", "Senf", + "Erdbeere", "Lupine", "Luzerne", "Klee", "Bohne", "Mohn", + "Radicchio", "Rucola", "Endivie", "Artischocke", "Pastinake", + "Schnittlauch", "Knoblauch", "Bärlauch", "Wermut", + "Baldrian", "Johanniskraut", "Sonnenhut", "Beinwell", + "Studentenblume", "Tagetes", "Phacelia", "Buchweizen", + "Rote Bete", "Rote Rübe", "Mangold", "Melde", + "Kohlrabi", "Brokkoli", "Blumenkohl", "Rosenkohl", "Wirsing", + "Pflücksalat", "Kopfsalat", "Feldsalat", "Asiasalat", + "Zuckermais", "Popcorn", +] + +# --- Helpers ----------------------------------------------------------------- + +def herbapi_request(method, path, data=None): + """Make a request to 
def shop_create_session():
    """Create an anonymous session on the Arche Noah shop.

    POSTs an empty JSON object to ``webshop/createanonymoususer`` and reads
    the ``JSESSIONID`` cookie from the response headers.

    Returns:
        The JSESSIONID value as a string.

    Raises:
        RuntimeError: if no Set-Cookie header carries a non-empty JSESSIONID.
        urllib.error.URLError: propagated from ``urlopen`` on network or HTTP
            failure.
    """
    req = urllib.request.Request(
        SHOP_BASE + "webshop/createanonymoususer",
        data=json.dumps({}).encode(),
        headers={
            "User-Agent": SHOP_UA,
            "Content-Type": "application/json",
            "Origin": "https://shop.arche-noah.at",
            "Referer": "https://shop.arche-noah.at/",
        },
    )
    with urllib.request.urlopen(req, timeout=15) as resp:
        # A response may carry several Set-Cookie headers; headers.get() would
        # only return the first one, so collect them all.
        cookies = resp.headers.get_all("Set-Cookie") or []

    for cookie in cookies:
        for part in cookie.split(";"):
            key, _, value = part.strip().partition("=")
            # Exact name match, so a cookie such as "XJSESSIONID" is not
            # mistaken for the session cookie.
            if key == "JSESSIONID" and value:
                return value

    raise RuntimeError("Failed to get shop session")
def match_species(latin_name, species_by_scientific):
    """Match a Latin name to a species record.

    The name is lower-cased and all whitespace runs (including non-breaking
    spaces left over from HTML) are collapsed to a single space before the
    lookup. If the full name is unknown, everything after "genus species" is
    dropped, so "Phaseolus vulgaris var. nanus" matches "Phaseolus vulgaris";
    this covers "var.", "subsp.", "ssp.", "convar." and "f." qualifiers.

    Args:
        latin_name: botanical name, or None / empty.
        species_by_scientific: mapping of lower-cased scientific name to
            species record.

    Returns:
        The matching species record, or None when nothing matches.
    """
    if not latin_name:
        return None

    # str.split() without arguments splits on any Unicode whitespace, so this
    # also trims the ends and normalises "Beta\xa0vulgaris" or "Beta  vulgaris".
    normalized = " ".join(latin_name.split()).lower()
    if not normalized:
        return None

    # Direct match on the full (normalised) name
    species = species_by_scientific.get(normalized)
    if species:
        return species

    # Fall back to "genus species", ignoring infraspecific qualifiers
    m = re.match(r"[a-z]+ [a-z]+", normalized)
    if m:
        return species_by_scientific.get(m.group(0)) or None

    return None
def parse_pack_info(unit_desc):
    """Parse a pack size and unit from a shop ``unitDesc`` string.

    Examples:
        "20-30 Korn" -> (20.0, "Korn")   (lower bound of a range)
        "2g"         -> (2.0, "g")
        "0,5 g"      -> (0.5, "g")       (decimal comma)

    Args:
        unit_desc: description text, or None / empty.

    Returns:
        ``(pack_size, pack_unit)`` with ``pack_size`` as float, or
        ``(None, None)`` when the text does not start with a number followed
        by a unit.
    """
    if not unit_desc:
        return None, None

    m = re.match(
        r"\s*(\d+(?:,\d+)?)"                   # size: integer or decimal comma
        r"(?:\s*[-\u2013]\s*\d+(?:,\d+)?)?"    # optional upper bound (hyphen / en dash)
        # The unit must start with a letter. With a bare \w+ the regex engine
        # backtracked and split "25" into size 2 and unit "5".
        r"\s*([^\W\d_]\w*)",
        unit_desc,
    )
    if not m:
        return None, None
    return float(m.group(1).replace(",", ".")), m.group(2)
def _fetch_all_pages(resource, per_page=100, request=None):
    """Collect every record of a paginated HerbAPI collection.

    Args:
        resource: collection path such as "species" or "cultivars".
        per_page: page size sent to the API.
        request: callable ``(method, path) -> parsed JSON``; defaults to the
            module's ``herbapi_request``.

    Returns:
        A list with the records of all pages.
    """
    if request is None:
        request = herbapi_request

    records = []
    previous = None
    page = 1
    while True:
        result = request("GET", f"{resource}?per_page={per_page}&page={page}")
        if isinstance(result, list):
            # A bare list is treated as a complete, unpaginated answer.
            records.extend(result)
            break
        if not (isinstance(result, dict) and "data" in result):
            break

        data = result["data"]
        # An identical page twice in a row means the server ignores "page";
        # stop instead of looping forever.
        if not data or data == previous:
            break
        records.extend(data)

        total = result.get("total")
        if total is not None:
            if len(records) >= total:
                break
        elif len(data) < per_page:
            # Without a reported total, a short page marks the end. Previously
            # a missing total was read as 0 and ended the loop after page one.
            break

        previous = data
        page += 1
    return records


def load_herbapi_species(*, request=None):
    """Load all species from HerbAPI and build a lookup by scientific name.

    Args:
        request: optional replacement for ``herbapi_request`` with the same
            ``(method, path)`` call shape.

    Returns:
        ``(species_list, by_scientific)`` where ``by_scientific`` maps the
        stripped, lower-cased ``name_scientific`` to the species record; for
        duplicate names the last record wins.
    """
    species_list = _fetch_all_pages("species", request=request)
    by_scientific = {s["name_scientific"].strip().lower(): s for s in species_list}
    return species_list, by_scientific


def load_herbapi_cultivars(*, request=None):
    """Load all existing cultivars from HerbAPI and build a lookup.

    Args:
        request: optional replacement for ``herbapi_request`` with the same
            ``(method, path)`` call shape.

    Returns:
        ``(all_cultivars, by_key)`` where ``by_key`` maps
        ``(species_id, stripped lower-cased name)`` to the cultivar record; for
        duplicate keys the last record wins.
    """
    all_cultivars = _fetch_all_pages("cultivars", request=request)
    by_key = {(c["species_id"], c["name"].strip().lower()): c for c in all_cultivars}
    return all_cultivars, by_key
def main():
    """Run the Arche Noah import end to end.

    Steps: ensure the supplier exists, load HerbAPI species and cultivars,
    scrape the shop catalog and the product details, then create missing
    cultivars and supplier links. Products whose Latin name cannot be matched
    to a HerbAPI species, or that yield no cultivar name, are skipped. A
    summary of all counters is printed at the end.
    """
    now_str = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    print("=== Arche Noah Seed Catalog Scraper ===")
    print(f"Started at {now_str}\n")

    # Step 1: Create Arche Noah supplier in HerbAPI
    print("[1/6] Ensuring Arche Noah supplier exists...")
    supplier_id = ensure_supplier()
    print()

    # Step 2: Load HerbAPI species for matching
    print("[2/6] Loading HerbAPI species...")
    species_list, species_by_scientific = load_herbapi_species()
    print(f"Loaded {len(species_list)} species")
    print()

    # Step 3: Load existing cultivars for idempotency
    print("[3/6] Loading existing cultivars...")
    existing_cultivars, cultivars_by_key = load_herbapi_cultivars()
    print(f"Loaded {len(existing_cultivars)} existing cultivars")
    print()

    # Step 4: Scrape Arche Noah shop
    print("[4/6] Scraping Arche Noah shop catalog...")
    session = shop_create_session()
    print("Got shop session")
    products = fetch_all_arche_noah_products(session)
    print()

    # Step 5: Fetch product details (to get Latin names)
    print("[5/6] Fetching product details for Latin name matching...")
    details = fetch_product_details(session, products)
    print()

    # Step 6: Create cultivars in HerbAPI
    print("[6/6] Creating cultivars in HerbAPI...")
    stats = {
        "created": 0,
        "skipped_existing": 0,
        "skipped_no_species": 0,
        "skipped_no_name": 0,
        "supplier_linked": 0,
        "supplier_link_existed": 0,
        "errors": 0,
    }

    for sid, product in sorted(products.items()):
        detail = details.get(sid, {})

        # The Latin name only comes from the detail headline; there is no
        # other source, so an unmatched product is skipped.
        latin_name = extract_latin_name(detail.get("detailHeadline3", ""))
        species = match_species(latin_name, species_by_scientific)
        if not species:
            print(f" SKIP (no species match): {product['name']} | latin={latin_name}")
            stats["skipped_no_species"] += 1
            continue

        cultivar_name = extract_cultivar_name(product["name"])
        if not cultivar_name:
            print(f" SKIP (no cultivar name): {product['name']}")
            # Counted separately so the "no species match" figure stays accurate.
            stats["skipped_no_name"] += 1
            continue

        # Product URL, shared by the cultivar record and the supplier link
        alias = product.get("alias") or detail.get("alias", "")
        product_url = f"https://shop.arche-noah.at/produkt/{alias}" if alias else None

        # Check if cultivar already exists (idempotency)
        lookup_key = (species["id"], cultivar_name.strip().lower())
        existing = cultivars_by_key.get(lookup_key)

        if existing:
            cultivar_id = existing["id"]
            stats["skipped_existing"] += 1
        else:
            cultivar_data = {
                "species_id": species["id"],
                "name": cultivar_name,
                "name_de": cultivar_name,
                "is_organic": product.get("articleLineDesc") == "Bio-Saatgut von ARCHE NOAH",
                "source_urls": [product_url] if product_url else None,
            }
            try:
                result = herbapi_request("POST", "cultivars", cultivar_data)
                cultivar_id = result["id"]
            except Exception as e:
                print(f" ERROR creating '{cultivar_name}': {e}", file=sys.stderr)
                stats["errors"] += 1
                continue
            stats["created"] += 1
            # Add to lookup for idempotency within this run
            cultivars_by_key[lookup_key] = result
            print(f" CREATED: {cultivar_name} ({species['name_scientific']})")

        # Link cultivar to supplier
        existing_links = load_existing_supplier_links(cultivar_id)
        # .get(): a link record without "supplier_id" counts as "not linked"
        # instead of aborting the whole run with a KeyError.
        if any(link.get("supplier_id") == supplier_id for link in existing_links):
            stats["supplier_link_existed"] += 1
        else:
            unit_desc = product.get("unitDesc") or detail.get("unitDesc", "")
            pack_size, pack_unit = parse_pack_info(unit_desc)

            price_list = product.get("priceListPos") or detail.get("priceListPos", [])
            price = price_list[0].get("singleUnitPrice") if price_list else None

            link_data = {
                "supplier_id": supplier_id,
                "article_number": str(product.get("articleNr", "")),
                "product_url": product_url,
                "price_eur": price,
                "pack_size": pack_size,
                "pack_unit": pack_unit,
            }

            try:
                herbapi_request("POST", f"cultivars/{cultivar_id}/suppliers", link_data)
                stats["supplier_linked"] += 1
            except Exception as e:
                print(f" ERROR linking supplier for '{cultivar_name}': {e}", file=sys.stderr)
                stats["errors"] += 1

        time.sleep(0.1)  # small delay between HerbAPI calls

    # Summary
    print(f"\n{'='*60}")
    print("Scraping complete!")
    print(f" Cultivars created: {stats['created']}")
    print(f" Cultivars already existed: {stats['skipped_existing']}")
    print(f" Skipped (no species match): {stats['skipped_no_species']}")
    print(f" Skipped (no cultivar name): {stats['skipped_no_name']}")
    print(f" Supplier links created: {stats['supplier_linked']}")
    print(f" Supplier links existed: {stats['supplier_link_existed']}")
    print(f" Errors: {stats['errors']}")
index 0000000..b94ee16 --- /dev/null +++ b/tools/scrapers/scrape_bingenheimer.py @@ -0,0 +1,843 @@ +#!/usr/bin/env python3 +""" +Scraper for Bingenheimer Saatgut (https://www.bingenheimersaatgut.de/) +Extracts cultivar data and imports into HerbAPI. + +Categories scraped: Gemüse (vegetables), Kräuter (herbs), Gründüngung (green manure). +""" + +import json +import re +import sys +import time +import urllib.request +import urllib.error +import urllib.parse +from html.parser import HTMLParser +from typing import Optional + +# ── Configuration ───────────────────────────────────────────────────────── +API_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1" +API_TOKEN = "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk" +SITE_BASE = "https://www.bingenheimersaatgut.de" +DELAY = 0.5 +USER_AGENT = "HerbAPI-Scraper/1.0 (+https://sub-net.at)" + +# ── Category URLs to scrape ─────────────────────────────────────────────── +# (url_path, default_species_scientific_name) + +VEGETABLE_CATEGORIES = [ + ("gemuese/tomaten", "Solanum lycopersicum"), + ("gemuese/gurken/gewuerzgurke", "Cucumis sativus"), + ("gemuese/gurken/salatgurken", "Cucumis sativus"), + ("gemuese/aubergine", "Solanum melongena"), + ("gemuese/bohnen/buschbohne", "Phaseolus vulgaris"), + ("gemuese/bohnen/stangenbohne", "Phaseolus vulgaris"), + ("gemuese/bohnen/dicke-bohne", "Vicia faba"), + ("gemuese/bohnen/feuerbohne", "Phaseolus coccineus"), + ("gemuese/bohnen/edamame-sojabohne", "Glycine max"), + ("gemuese/bohnen/spaghettibohne", "Vigna unguiculata"), + ("gemuese/erbsen/markerbse", "Pisum sativum"), + ("gemuese/erbsen/schalerbse", "Pisum sativum"), + ("gemuese/erbsen/zuckererbse", "Pisum sativum"), + ("gemuese/feldsalat", "Valerianella locusta"), + ("gemuese/knollenfenchel", "Foeniculum vulgare"), + ("gemuese/kohl/blumenkohl", "Brassica oleracea"), + ("gemuese/kohl/brokkoli", "Brassica oleracea"), + ("gemuese/kohl/chinakohlpak-choi", "Brassica rapa"), + ("gemuese/kohl/gruenkohl", "Brassica oleracea"), + 
("gemuese/kohl/kohlrabi", "Brassica oleracea"), + ("gemuese/kohl/rotkohl", "Brassica oleracea"), + ("gemuese/kohl/weisskohl", "Brassica oleracea"), + ("gemuese/kohl/wirsing", "Brassica oleracea"), + ("gemuese/kohl/rosenkohl", "Brassica oleracea"), + ("gemuese/kresse", "Lepidium sativum"), + ("gemuese/kuerbis", "Cucurbita maxima"), + ("gemuese/zuckermais", "Zea mays"), + ("gemuese/mangold", "Beta vulgaris"), + ("gemuese/melone", "Cucumis melo"), + ("gemuese/moehren", "Daucus carota"), + ("gemuese/paprika/gemuesepaprika", "Capsicum annuum"), + ("gemuese/paprika/chili", "Capsicum annuum"), + ("gemuese/pastinaken", "Pastinaca sativa"), + ("gemuese/petersilienwurzel", "Petroselinum crispum"), + ("gemuese/physalis", "Physalis peruviana"), + ("gemuese/porreelauch", "Allium porrum"), + ("gemuese/radies", "Raphanus sativus"), + ("gemuese/rettich", "Raphanus sativus"), + ("gemuese/rote-bete", "Beta vulgaris"), + ("gemuese/rueben/mai-herbstruebennavets", "Brassica rapa"), + ("gemuese/rueben/kohlruebe", "Brassica napus"), + ("gemuese/rucola", "Eruca vesicaria"), + ("gemuese/salat/bataviasalat", "Lactuca sativa"), + ("gemuese/salat/eichblattsalat", "Lactuca sativa"), + ("gemuese/salat/eissalat", "Lactuca sativa"), + ("gemuese/salat/endivien", "Cichorium endivia"), + ("gemuese/salat/hirschhornwegerich", "Plantago coronopus"), + ("gemuese/salat/kopfsalat", "Lactuca sativa"), + ("gemuese/salat/lollosalat", "Lactuca sativa"), + ("gemuese/salat/romanasalat", "Lactuca sativa"), + ("gemuese/salat/baby-leaf", "Lactuca sativa"), + ("gemuese/sellerie/knollensellerie", "Apium graveolens"), + ("gemuese/sellerie/stangen--bleichsellerie", "Apium graveolens"), + ("gemuese/spinatspinat-aehnliche/spinat", "Spinacia oleracea"), + ("gemuese/spinatspinat-aehnliche/neuseelaender-spinat", "Tetragonia tetragonioides"), + ("gemuese/blattstielgemuese", "Beta vulgaris"), + ("gemuese/zwiebeln", "Allium cepa"), + ("gemuese/lauchzwiebeln", "Allium fistulosum"), + ("gemuese/artischocke", "Cynara 
cardunculus"), + ("gemuese/asia-salate", "Brassica juncea"), + ("gemuese/chicoree", "Cichorium intybus"), + ("gemuese/schwarz-haferwurzel", "Scorzonera hispanica"), + ("gemuese/winterpostelein", "Claytonia perfoliata"), + ("gemuese/zucchini", "Cucurbita pepo"), + ("gemuese/catalogna", "Cichorium intybus"), + ("gemuese/zichoriensalate", "Cichorium intybus"), +] + +HERB_CATEGORIES = [ + ("kraeuter/basilikum", "Ocimum basilicum"), + ("kraeuter/bohnenkraut", "Satureja hortensis"), + ("kraeuter/borretsch", "Borago officinalis"), + ("kraeuter/dill", "Anethum graveolens"), + ("kraeuter/kuemmel", "Carum carvi"), + ("kraeuter/kerbel", "Anthriscus cerefolium"), + ("kraeuter/koriander", "Coriandrum sativum"), + ("kraeuter/gewuerzfenchel", "Foeniculum vulgare"), + ("kraeuter/kultursauerampfer", "Rumex acetosa"), + ("kraeuter/lavendel", "Lavandula angustifolia"), + ("kraeuter/liebstock", "Levisticum officinale"), + ("kraeuter/majoran", "Origanum majorana"), + ("kraeuter/oregano", "Origanum vulgare"), + ("kraeuter/pimpinelle", "Sanguisorba minor"), + ("kraeuter/estragon", "Artemisia dracunculus"), + ("kraeuter/salbei", "Salvia officinalis"), + ("kraeuter/schnittlauch", "Allium schoenoprasum"), + ("kraeuter/schnittknoblauch", "Allium tuberosum"), + ("kraeuter/schwarzkuemmel", "Nigella sativa"), + ("kraeuter/speisechrysantheme", "Glebionis coronaria"), + ("kraeuter/thymian", "Thymus vulgaris"), + ("kraeuter/ysop", "Hyssopus officinalis"), + ("kraeuter/winterkresse", "Barbarea vulgaris"), + ("kraeuter/brunnenkresse", "Nasturtium officinale"), + ("kraeuter/melisse", "Melissa officinalis"), + ("kraeuter/petersilie", "Petroselinum crispum"), + ("kraeuter/schnittsellerie", "Apium graveolens"), + ("kraeuter/beifuss", "Artemisia vulgaris"), +] + +GREEN_MANURE_CATEGORIES = [ + ("gruenduengung", None), +] + +ALL_CATEGORIES = VEGETABLE_CATEGORIES + HERB_CATEGORIES + GREEN_MANURE_CATEGORIES + +# ── Stats ───────────────────────────────────────────────────────────────── +stats = { + 
def api_post(path: str, data: dict) -> tuple:
    """POST a JSON payload to HerbAPI.

    Args:
        path: Endpoint path appended to ``API_BASE`` (e.g. ``"/species"``).
        data: JSON-serialisable request payload.

    Returns:
        ``(response, status_code)``.

        * 2xx with a JSON body: the decoded body.
        * 2xx with an empty body: ``{}`` (previously this raised inside
          ``json.loads``).
        * 2xx with a body that is not valid JSON, or any HTTP error status:
          ``{"error": <response text>, "_status": <status code>}``.

    Connection-level failures (``urllib.error.URLError`` that is not an
    ``HTTPError``, timeouts) are not handled here and propagate to the caller.
    """
    url = f"{API_BASE}{path}"
    payload = json.dumps(data).encode("utf-8")
    req = urllib.request.Request(url, data=payload, method="POST", headers={
        "Authorization": f"Bearer {API_TOKEN}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    })
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            status = resp.status
            raw = resp.read()
    except urllib.error.HTTPError as e:
        try:
            err_body = e.read().decode("utf-8", errors="replace")
        finally:
            # HTTPError wraps the open response; release the connection.
            e.close()
        return {"error": err_body, "_status": e.code}, e.code

    if not raw.strip():
        # e.g. "201 Created" / "204 No Content" without a body.
        return {}, status
    try:
        return json.loads(raw), status
    except ValueError:
        # Callers test for `"id" in response`, so hand back the same error
        # shape they already handle for HTTP error statuses.
        return {"error": raw.decode("utf-8", errors="replace"), "_status": status}, status
# Trailing article number in parentheses, e.g. "(G802)" or "(G 802)".
_ARTICLE_SUFFIX_RE = re.compile(r'\s*\([A-Z]\s*\d+[A-Z]?\)\s*$')

# Common German vegetable/herb type prefixes that precede the variety name.
# Compiled once at import time; each pattern is anchored at the start of the
# name and applied case-insensitively, in exactly this order.
_TYPE_PREFIX_RES = tuple(
    re.compile(r'^' + _prefix, re.IGNORECASE)
    for _prefix in (
        # Tomatoes
        r'(?:Normal(?:früchtige)?|Fleisch|Cherry|Balkon|Wild|Freiland|Roma|Ochsenherz|'
        r'Cocktail|Dattel|Mini|Snack|Stab|Busch|Salat|Zwerg)[\s-]*[Tt]omate\s+',
        # Beans
        r'(?:Busch|Stangen|Dicke|Feuer|Spaghetti)[\s-]*[Bb]ohne\s+',
        r'Edamame(?:-Sojabohne)?\s+',
        # Peas
        r'(?:Mark|Schal|Zucker|Pal)[\s-]*[Ee]rbse\s+',
        # Cucurbits
        r'(?:Salat|Einlege|Gewürz|Freiland|Schlangen)[\s-]*[Gg]urke\s+',
        r'Zucchini\s+',
        r'Kürbis\s+',
        r'(?:Wasser)?[Mm]elone\s+',
        # Brassicas
        r'(?:Blumen|Grün|Rot|Weiß|Rosen)[\s-]*[Kk]ohl\s+',
        r'Kohlrabi\s+',
        r'Wirsing\s+',
        r'Brokkoli\s+',
        r'Chinakohl\s+',
        r'Pak\s+Choi\s+',
        r'Kohlrübe\s+',
        r'Mai-/Herbstrüben?(?:/Navets)?\s+',
        # Root vegetables
        r'Möhre\s+',
        r'Karotten?(?:\s*-?\s*Mix)?\s+',
        r'Pastinake\s+',
        r'Radies(?:chen)?\s+',
        r'Rettich\s+',
        r'Schwarzwurzel\s+',
        r'Haferwurzel\s+',
        r'Petersilienwurzel\s+',
        # Beets
        r'(?:Rote|Gelbe|Weiße)\s+Bete?\s+',
        r'Mangold\s+',
        # Lettuce & leafy
        r'(?:Kopf|Eichblatt|Batavia|Eis|Lollo|Romana|Baby-Leaf)[\s-]*[Ss]alat\s+',
        r'Feldsalat\s+',
        r'Endivie\s+',
        r'Asia[\s-]*Salat\s+',
        r'Spinat\s+',
        # Alliums
        r'Zwiebel\s+',
        r'Lauchzwiebel\s+',
        r'Porree(?:/Lauch)?\s+',
        r'Schnittlauch\s+',
        r'Schnittknoblauch\s+',
        # Peppers
        r'(?:Gemüse|Block|Spitz|Papier)[\s-]*[Pp]aprika\s+',
        r'Chili\s+',
        # Celery
        r'(?:Knollen|Stangen|Bleich|Schnitt)[\s-]*[Ss]ellerie\s+',
        # Herbs
        r'Basilikum\s+',
        r'Koriander\s+',
        r'Dill\s+',
        r'Petersilie\s+',
        r'(?:Knollen|Gewürz)[\s-]*[Ff]enchel\s+',
        r'Salbei\s+',
        r'Thymian\s+',
        r'Oregano\s+',
        r'Lavendel\s+',
        r'Melisse\s+',
        r'Majoran\s+',
        r'Estragon\s+',
        r'Kresse\s+',
        r'Bohnenkraut\s+',
        r'Borretsch\s+',
        r'Kümmel\s+',
        r'Kerbel\s+',
        r'Liebstock\s+',
        r'Ysop\s+',
        r'Pimpinelle\s+',
        r'Beifuß\s+',
        r'Schwarzkümmel\s+',
        # Other
        r'Zuckermais\s+',
        r'Artischocke\s+',
        r'Physalis\s+',
        r'Aubergine\s+',
        r'Catalogna\s+',
    )
)


def extract_variety_name(product_name: str) -> str:
    """Extract the variety/cultivar name from the full product name.

    Processing steps, in order:

    1. Trim surrounding whitespace and drop a trailing parenthesised article
       number such as ``(G802)``.
    2. Apply every crop-type prefix pattern once, in table order. Because the
       patterns run sequentially on the progressively shortened name, a
       prefix is only removed if it is at the start of the name when its
       turn comes (``"Kürbis Zucchini X"`` becomes ``"Zucchini X"``).
    3. Trim whitespace again and strip leading/trailing ASCII quotes.

    Args:
        product_name: Product title as shown on the shop listing.

    Returns:
        The variety name; may be empty if nothing but whitespace/quotes remains.
    """
    name = _ARTICLE_SUFFIX_RE.sub('', product_name.strip())
    for prefix_re in _TYPE_PREFIX_RES:
        name = prefix_re.sub('', name)
    return name.strip().strip("'\"")
+species_cache = {} # scientific_name_lower -> {id, name_scientific, ...} +family_cache = {} # name_scientific_lower -> {id, name_scientific} +cultivar_cache = {} # slug -> {id, name, species_id, ...} +supplier_id = None + + +def load_api_data(): + """Load all existing data from HerbAPI for matching.""" + global supplier_id + + print("Loading existing HerbAPI data...") + + # Load families + page = 1 + while True: + resp = api_get("/families", {"per_page": 100, "page": page}) + for f in resp["data"]: + family_cache[f["name_scientific"].lower()] = f + if len(resp["data"]) < 100: + break + page += 1 + print(f" Loaded {len(family_cache)} families") + + # Load species + page = 1 + while True: + resp = api_get("/species", {"per_page": 100, "page": page}) + for s in resp["data"]: + species_cache[s["name_scientific"].lower()] = s + if len(resp["data"]) < 100: + break + page += 1 + print(f" Loaded {len(species_cache)} species") + + # Load ALL cultivars (slug + id + name + species_id) + page = 1 + while True: + resp = api_get("/cultivars", {"per_page": 100, "page": page}) + for c in resp["data"]: + cultivar_cache[c["slug"]] = { + "id": c["id"], + "name": c["name"], + "species_id": c["species_id"], + } + if len(resp["data"]) < 100: + break + page += 1 + print(f" Loaded {len(cultivar_cache)} cultivars") + + # Create or find Bingenheimer supplier + resp = api_get("/suppliers") + for s in resp: + if "bingenheimer" in s["name"].lower(): + supplier_id = s["id"] + print(f" Found existing supplier: {s['name']} ({s['id']})") + break + + if not supplier_id: + print(" Creating Bingenheimer Saatgut supplier...") + s, code = api_post("/suppliers", { + "name": "Bingenheimer Saatgut", + "url": "https://www.bingenheimersaatgut.de", + "country": "DE", + "is_organic": True, + "is_demeter": True, + "notes": "German biodynamic seed company, Demeter certified, open-pollinated varieties" + }) + if "id" in s: + supplier_id = s["id"] + print(f" Created supplier: {s['id']}") + else: + print(f" ERROR 
# True nomenclatural synonyms only (lower-case binomial -> lower-case binomial).
# Infraspecific names ("... var. x", "... subsp. y") need no entry: they are
# already resolved by the binomial fallback in find_or_create_species().
# Leek is deliberately NOT mapped to Allium cepa (onion), and marjoram is NOT
# mapped to Origanum vulgare (oregano): those are distinct species.
_SPECIES_SYNONYMS = {
    "lycopersicon esculentum": "solanum lycopersicum",
    "allium porrum": "allium ampeloprasum",
    "allium ampeloprasum": "allium porrum",
}

# Genus -> family, used only when a species has to be created.
_GENUS_TO_FAMILY_MAP = {
    "Solanum": "Solanaceae", "Capsicum": "Solanaceae", "Physalis": "Solanaceae",
    "Nicandra": "Solanaceae",
    "Cucumis": "Cucurbitaceae", "Cucurbita": "Cucurbitaceae", "Citrullus": "Cucurbitaceae",
    "Phaseolus": "Fabaceae", "Pisum": "Fabaceae", "Vicia": "Fabaceae",
    "Glycine": "Fabaceae", "Lens": "Fabaceae", "Lupinus": "Fabaceae",
    "Trifolium": "Fabaceae", "Medicago": "Fabaceae", "Vigna": "Fabaceae",
    "Brassica": "Brassicaceae", "Raphanus": "Brassicaceae", "Eruca": "Brassicaceae",
    "Lepidium": "Brassicaceae", "Nasturtium": "Brassicaceae", "Barbarea": "Brassicaceae",
    "Sinapis": "Brassicaceae", "Crambe": "Brassicaceae", "Diplotaxis": "Brassicaceae",
    "Allium": "Amaryllidaceae",
    "Daucus": "Apiaceae", "Petroselinum": "Apiaceae", "Apium": "Apiaceae",
    "Foeniculum": "Apiaceae", "Pastinaca": "Apiaceae", "Coriandrum": "Apiaceae",
    "Anethum": "Apiaceae", "Levisticum": "Apiaceae", "Anthriscus": "Apiaceae",
    "Carum": "Apiaceae", "Myrrhis": "Apiaceae", "Pimpinella": "Apiaceae",
    "Sanguisorba": "Rosaceae",
    "Lactuca": "Asteraceae", "Cichorium": "Asteraceae", "Cynara": "Asteraceae",
    "Helianthus": "Asteraceae", "Calendula": "Asteraceae", "Tagetes": "Asteraceae",
    "Scorzonera": "Asteraceae", "Tragopogon": "Asteraceae", "Glebionis": "Asteraceae",
    "Artemisia": "Asteraceae",
    "Beta": "Chenopodiaceae", "Spinacia": "Chenopodiaceae",
    "Atriplex": "Chenopodiaceae", "Chenopodium": "Chenopodiaceae",
    "Ocimum": "Lamiaceae", "Origanum": "Lamiaceae", "Thymus": "Lamiaceae",
    "Salvia": "Lamiaceae", "Melissa": "Lamiaceae", "Lavandula": "Lamiaceae",
    "Satureja": "Lamiaceae", "Hyssopus": "Lamiaceae", "Rosmarinus": "Lamiaceae",
    "Mentha": "Lamiaceae",
    "Zea": "Poaceae",
    "Borago": "Boraginaceae", "Phacelia": "Boraginaceae",
    "Valerianella": "Caprifoliaceae",
    "Tropaeolum": "Tropaeolaceae",
    "Rumex": "Polygonaceae",
    "Nigella": "Ranunculaceae",
    "Claytonia": "Montiaceae",
    "Tetragonia": "Aizoaceae",
    "Basella": "Basellaceae",
    "Plantago": "Plantaginaceae",
}


def _reload_species_cache():
    """Re-read every species page from HerbAPI into the module species cache."""
    page = 1
    while True:
        r = api_get("/species", {"per_page": 100, "page": page})
        for s in r["data"]:
            species_cache[s["name_scientific"].lower()] = s
        if len(r["data"]) < 100:
            break
        page += 1


def find_or_create_species(latin_name: str) -> Optional[str]:
    """Find a species by Latin name, creating it in HerbAPI if necessary.

    Lookup order against ``species_cache`` (keys are lower-case names):

    1. the full name as given,
    2. its binomial (first two words, i.e. without ``var.``/``subsp.``),
    3. a known synonym of the full name, then of the binomial.

    If nothing matches, the genus is mapped to a family, the family is
    resolved through ``find_or_create_family`` and the species is POSTed.
    When the POST does not return an ``id`` the species list is reloaded once,
    in case the record already exists on the server.

    Args:
        latin_name: Scientific name; empty, ``None`` or whitespace-only
            values are rejected.

    Returns:
        The species ID, or ``None`` when the name is empty, the genus is
        unknown, or the family/species could not be created. Unknown genera
        are recorded in ``stats["species_not_matched"]``, creation failures
        in ``stats["errors"]``.
    """
    if not latin_name or not latin_name.strip():
        return None

    # Collapse internal whitespace so "Beta  vulgaris" and "Beta vulgaris"
    # share one cache key.
    words = latin_name.split()
    latin_name = " ".join(words)
    key = latin_name.lower()
    binomial = " ".join(key.split()[:2])

    candidates = [key, binomial]
    for name in (key, binomial):
        synonym = _SPECIES_SYNONYMS.get(name)
        if synonym:
            candidates.append(synonym)
    for candidate in candidates:
        cached = species_cache.get(candidate)
        if cached:
            return cached["id"]

    # Nothing cached: create the species under its genus' family.
    genus = words[0]
    family_name = _GENUS_TO_FAMILY_MAP.get(genus) or _GENUS_TO_FAMILY_MAP.get(genus.capitalize())
    if not family_name:
        print(f" WARNING: Unknown genus '{genus}' for species '{latin_name}'")
        stats["species_not_matched"].append(latin_name)
        return None

    family_id = find_or_create_family(family_name)
    if not family_id:
        return None

    print(f" Creating species: {latin_name}")
    resp, code = api_post("/species", {
        "name_scientific": latin_name,
        "family_id": family_id,
    })
    if "id" in resp:
        species_cache[key] = resp
        stats["species_created"] += 1
        return resp["id"]

    # The species might already exist on the server; refresh and look again.
    print(f" Species creation returned {code}: {str(resp.get('error', ''))[:100]}")
    _reload_species_cache()
    cached = species_cache.get(key)
    if cached:
        return cached["id"]
    stats["errors"].append(f"Species creation failed: {latin_name}")
    return None
# Accented/special characters that are transliterated instead of dropped.
# str.maketrans accepts multi-character replacements ("ß" -> "ss").
_SLUG_TRANSLITERATION = str.maketrans({
    "ä": "a", "ö": "o", "ü": "u", "ß": "ss",
    "é": "e", "è": "e", "ê": "e", "ë": "e",
    "à": "a", "â": "a", "á": "a",
    "ô": "o", "ù": "u", "û": "u", "ú": "u",
    "ï": "i", "î": "i", "í": "i",
    "ç": "c", "ñ": "n", "ó": "o",
    "œ": "oe", "æ": "ae",
})
_SLUG_DISALLOWED_RE = re.compile(r'[^a-z0-9\s-]')
_SLUG_WHITESPACE_RE = re.compile(r'[\s]+')
_SLUG_DASHES_RE = re.compile(r'-+')


def slugify(text: str) -> str:
    """Generate a URL-safe slug.

    The text is lower-cased, the characters listed in the transliteration
    table are replaced by their ASCII form, every remaining character other
    than ``a-z``, ``0-9``, whitespace and ``-`` is removed, whitespace runs
    become a single ``-``, repeated dashes are collapsed, and leading/trailing
    dashes are stripped.

    Args:
        text: Arbitrary display text, e.g. ``"<species> <variety>"``.

    Returns:
        The slug; an empty string if no slug-safe characters remain.
    """
    # One C-level pass instead of one str.replace() per table entry.
    text = text.lower().translate(_SLUG_TRANSLITERATION)
    text = _SLUG_DISALLOWED_RE.sub('', text)
    text = _SLUG_WHITESPACE_RE.sub('-', text.strip())
    text = _SLUG_DASHES_RE.sub('-', text)
    return text.strip('-')
Returns cultivar ID or None.""" + expected_slug = slugify(f"{species_name} {variety_name}") + + # Direct slug match + if expected_slug in cultivar_cache: + return cultivar_cache[expected_slug]["id"] + + # Check for name match in same species + variety_lower = variety_name.lower() + for slug, data in cultivar_cache.items(): + if data["species_id"] == species_id and data["name"].lower() == variety_lower: + return data["id"] + + return None + + +def scrape_category(cat_path: str, default_species: Optional[str]): + """Scrape a single category page and all its products.""" + url = f"{SITE_BASE}/de/bio-saatgut/{cat_path}.html" + print(f"\n{'='*60}") + print(f"Category: {cat_path}") + + html = fetch_page(url) + if not html: + print(" SKIP: Page not found (404)") + return + + time.sleep(DELAY) + + products = parse_product_links(html) + print(f" Found {len(products)} products") + stats["products_found"] += len(products) + stats["categories_scraped"] += 1 + + for prod_url, prod_name in products: + process_product(prod_url, prod_name, default_species) + + +def process_product(prod_url: str, prod_name: str, default_species: Optional[str]): + """Process a single product: fetch detail, extract data, create cultivar.""" + article_number = extract_article_number(prod_name, prod_url) + variety_name = extract_variety_name(prod_name) + + if not variety_name: + print(f" SKIP (no variety): {prod_name}") + return + + # Skip mixes, sets, bundles + skip_keywords = ["mischung", "saatscheibe", "saatband", "saatplatte", + "saat-set", " mix ", "trio ", "quartett", "gutschein", + "buch ", "düngung", "erde ", "-garten"] + name_lower = prod_name.lower() + # Exception: if the variety name itself is the whole thing, keep it + if any(kw in name_lower for kw in skip_keywords) and variety_name.lower() != prod_name.lower(): + # Only skip if it really seems like a mix + if "mischung" in name_lower or "mix" in name_lower or "trio" in name_lower: + print(f" SKIP (mix/set): {prod_name}") + return + + 
print(f"\n Product: {prod_name}") + print(f" Variety: {variety_name}, SKU: {article_number}") + + # Fetch detail page + latin_name = None + description = "" + time.sleep(DELAY) + try: + detail_html = fetch_page(prod_url) + stats["detail_pages_fetched"] += 1 + if detail_html: + latin_name = extract_latin_from_detail(detail_html) + description = extract_description_from_detail(detail_html) + except Exception as e: + print(f" WARNING: Detail page error: {e}") + + species_name = latin_name or default_species + if not species_name: + print(f" SKIP: No species for '{prod_name}'") + stats["species_not_matched"].append(prod_name) + return + + print(f" Species: {species_name}") + + species_id = find_or_create_species(species_name) + if not species_id: + print(f" SKIP: Could not resolve species '{species_name}'") + return + + # Check if cultivar already exists + existing_id = find_existing_cultivar(species_name, variety_name, species_id) + + cultivar_id = None + + if existing_id: + cultivar_id = existing_id + print(f" EXISTS: cultivar already in DB") + stats["cultivars_existed"] += 1 + else: + # Create cultivar + data = { + "species_id": species_id, + "name": variety_name, + "name_de": variety_name, + "is_organic": True, + } + if description: + data["description"] = description + + resp, code = api_post("/cultivars", data) + + if "id" in resp: + cultivar_id = resp["id"] + cultivar_cache[resp["slug"]] = { + "id": resp["id"], + "name": variety_name, + "species_id": species_id, + } + stats["cultivars_created"] += 1 + print(f" CREATED: {resp['slug']}") + elif code == 500 and "Database error" in str(resp.get("error", "")): + # Likely slug conflict - try to find existing + print(f" DB conflict - searching for existing cultivar...") + # Reload cultivars for this species + page = 1 + while True: + r = api_get("/cultivars", {"per_page": 100, "page": page}) + for c in r["data"]: + cultivar_cache[c["slug"]] = { + "id": c["id"], + "name": c["name"], + "species_id": c["species_id"], + } 
+ if c["species_id"] == species_id and c["name"].lower() == variety_name.lower(): + cultivar_id = c["id"] + if cultivar_id or len(r["data"]) < 100: + break + page += 1 + + if cultivar_id: + print(f" Found existing after conflict: {cultivar_id}") + stats["cultivars_existed"] += 1 + else: + print(f" ERROR: DB error and could not find existing cultivar") + stats["errors"].append(f"DB error + not found: {species_name} / {variety_name}") + return + else: + print(f" ERROR ({code}): {str(resp.get('error',''))[:100]}") + stats["errors"].append(f"Create failed: {variety_name}: {resp.get('error','')[:80]}") + return + + # Link to supplier + if cultivar_id and supplier_id: + link_data = { + "supplier_id": supplier_id, + "product_url": prod_url, + } + if article_number: + link_data["article_number"] = article_number + + resp, code = api_post(f"/cultivars/{cultivar_id}/suppliers", link_data) + + if "id" in resp: + stats["supplier_links_created"] += 1 + print(f" LINKED (SKU: {article_number})") + elif code == 500 or "already" in str(resp.get("error", "")).lower(): + stats["supplier_links_existed"] += 1 + print(f" LINK EXISTS") + else: + print(f" LINK ERROR ({code}): {str(resp.get('error',''))[:80]}") + stats["errors"].append(f"Link failed: {variety_name}: {resp.get('error','')[:60]}") + + +def main(): + print("=" * 60) + print("Bingenheimer Saatgut Scraper for HerbAPI") + print("=" * 60) + + load_api_data() + + print(f"\nScraping {len(ALL_CATEGORIES)} categories...") + + for cat_path, default_species in ALL_CATEGORIES: + try: + scrape_category(cat_path, default_species) + except Exception as e: + print(f" ERROR in category {cat_path}: {e}") + stats["errors"].append(f"Category error: {cat_path}: {e}") + + # Summary + print("\n" + "=" * 60) + print("SCRAPING COMPLETE - SUMMARY") + print("=" * 60) + print(f"Categories scraped: {stats['categories_scraped']}") + print(f"Products found: {stats['products_found']}") + print(f"Detail pages fetched: {stats['detail_pages_fetched']}") + 
print(f"Cultivars created: {stats['cultivars_created']}") + print(f"Cultivars existed: {stats['cultivars_existed']}") + print(f"Supplier links created: {stats['supplier_links_created']}") + print(f"Supplier links existed: {stats['supplier_links_existed']}") + print(f"Species created: {stats['species_created']}") + print(f"Families created: {stats['families_created']}") + print(f"Errors: {len(stats['errors'])}") + + if stats["species_not_matched"]: + print(f"\nUnmatched species ({len(stats['species_not_matched'])}):") + for s in stats["species_not_matched"][:30]: + print(f" - {s}") + + if stats["errors"]: + print(f"\nErrors ({len(stats['errors'])}):") + for e in stats["errors"][:30]: + print(f" - {e}") + + return 0 if not stats["errors"] else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tools/scrapers/scrape_dreschflegel.py b/tools/scrapers/scrape_dreschflegel.py new file mode 100644 index 0000000..015baf6 --- /dev/null +++ b/tools/scrapers/scrape_dreschflegel.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python3 +""" +Scraper for Dreschflegel organic seed catalog (dreschflegel-saatgut.de). +Extracts cultivar data and imports into HerbAPI. + +Run 2 - fixes pagination (API caps at 100/page), better species matching, +caches scraped products, handles duplicates gracefully. 
def api_request(method, path, data=None):
    """Make an API request to HerbAPI and return the decoded JSON response.

    Args:
        method: HTTP method, e.g. ``"GET"`` or ``"POST"``.
        path: Endpoint path appended to ``API_BASE``.
        data: Optional JSON-serialisable payload. Any value other than
            ``None`` is sent, including an empty dict.

    Returns:
        The decoded JSON body, or ``None`` when

        * the server reports a duplicate (HTTP 409, or an error body
          mentioning "already exists"/"duplicate"),
        * the server answers HTTP 500 with "database error" (treated as a
          unique-constraint violation, i.e. a duplicate),
        * any other HTTP error occurs (logged to stdout),
        * the response body is empty or not valid JSON.

    Connection-level failures and timeouts are not caught and propagate.
    """
    url = f"{API_BASE}{path}"
    body = json.dumps(data).encode("utf-8") if data is not None else None
    req = urllib.request.Request(url, data=body, method=method)
    req.add_header("Authorization", f"Bearer {API_TOKEN}")
    req.add_header("Content-Type", "application/json")
    req.add_header("Accept", "application/json")
    try:
        # Bounded wait, and the context manager releases the connection.
        with urllib.request.urlopen(req, timeout=30) as resp:
            raw = resp.read()
    except urllib.error.HTTPError as e:
        try:
            body_text = e.read().decode("utf-8", errors="replace")
        finally:
            e.close()
        lowered = body_text.lower()
        if e.code == 409 or "already exists" in lowered or "duplicate" in lowered:
            return None  # Duplicate, handled silently
        if e.code == 500 and "database error" in lowered:
            # Likely a unique constraint violation = duplicate
            return None
        print(f" API error {e.code} {method} {path}: {body_text[:200]}")
        return None

    if not raw.strip():
        return None
    try:
        return json.loads(raw.decode("utf-8"))
    except ValueError as e:
        print(f" API error {method} {path}: invalid JSON response: {e}")
        return None
def classify_urls(urls):
    """Filter URLs to likely product pages (single-segment paths).

    A URL is kept when it points at the shop host (with or without ``www.``),
    its path consists of exactly one segment, and that segment does not start
    with one of the known non-product prefixes (legal pages, news, catalogue
    sections, ...). The scheme is not significant, so ``http://`` entries are
    classified like ``https://`` ones.

    Args:
        urls: Iterable of absolute URLs, e.g. taken from the sitemap.

    Returns:
        List of candidate URLs in input order, each with surrounding
        whitespace and trailing slashes removed.
    """
    shop_hosts = ("dreschflegel-saatgut.de", "www.dreschflegel-saatgut.de")
    # str.startswith() accepts a tuple, so one call tests every prefix.
    skip_prefixes = (
        "impressum", "agb", "datenschutz", "kontakt", "widerrufs",
        "versand", "abkuerz", "zertifikat", "wichtige-hinweise",
        "muster-", "gutscheine", "kalender", "flyer", "katalog",
        "sommer-herbst", "unsere-hoefe", "bestellschein",
        "dreschflegel-news", "termine", "rezepte", "anbautipps",
        "tipps-zur", "gartentelefon", "gartenfreude", "buecher",
        "navigation", "vielfalt", "sut20", "saatgut",
        "neuheiten", "kennenlernangebote", "sut25", "vielfalt25",
        "saatgut-vielfalt", "saat",
    )
    candidates = []
    for url in urls:
        url = url.strip().rstrip("/")
        parts = urllib.parse.urlsplit(url)
        if parts.netloc.lower() not in shop_hosts:
            continue
        path = parts.path.strip("/")
        # Product pages live directly under the site root.
        if not path or "/" in path:
            continue
        if path.startswith(skip_prefixes):
            continue
        candidates.append(url)
    return candidates

(.*?)

", html_content) + if m: + result["name"] = html_mod.unescape(m.group(1).strip()) + + m = re.search(r'
\s*(.*?)\s*
', html_content, re.DOTALL) + if m: + result["botanical_name"] = html_mod.unescape(m.group(1).strip()) + + m = re.search( + r'class="product-detail-ordernumber"[^>]*>\s*(\d+)', + html_content, + re.DOTALL, + ) + if m: + result["article_number"] = m.group(1) + + m = re.search(r'itemprop="price"[^>]*content="([^"]+)"', html_content) + if m: + try: + result["price"] = float(m.group(1)) + except ValueError: + pass + + m = re.search( + r"product-detail-description-text.*?

(.*?)

def _load_product_cache():
    """Return the URL -> product mapping stored in CACHE_FILE ({} if unusable)."""
    if not os.path.exists(CACHE_FILE):
        return {}
    try:
        with open(CACHE_FILE, "r", encoding="utf-8") as f:
            cache = json.load(f)
    except (OSError, ValueError) as e:
        # A run killed in the middle of a write can leave a truncated file;
        # start over instead of crashing.
        print(f" Ignoring unreadable cache {CACHE_FILE}: {e}")
        return {}
    if not isinstance(cache, dict):
        return {}
    print(f" Loaded {len(cache)} cached products")
    return cache


def _save_product_cache(cache):
    """Write the cache to CACHE_FILE atomically (temp file, then rename)."""
    tmp_path = CACHE_FILE + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(cache, f)
    os.replace(tmp_path, CACHE_FILE)


def scrape_all_products(candidate_urls):
    """Scrape product pages, using cache for already-scraped URLs.

    The cache maps each URL to the parsed product dict, or to ``None`` when
    the page was fetched successfully but is not a product page. URLs whose
    fetch failed are NOT cached, so a later run retries them. (Caches written
    by earlier versions may still hold ``None`` for failed fetches.)

    Args:
        candidate_urls: URLs to consider, typically from ``classify_urls``.

    Returns:
        List of product dicts (cached ones first, then newly scraped ones);
        each newly scraped product carries its source under ``"url"``.

    Side effects:
        Sleeps ``DELAY`` seconds before every fetch, updates ``stats``
        (``fetch_errors``, ``products_scraped``, ``not_product_pages``) and
        rewrites ``CACHE_FILE`` after every 100 fetches and at the end.
    """
    cache = _load_product_cache()

    to_fetch = [u for u in candidate_urls if u not in cache]
    already_cached = [u for u in candidate_urls if u in cache]

    # A cached None means "not a product page".
    products = [cache[u] for u in already_cached if cache[u]]

    cached_products = len(products)
    cached_non_products = len(already_cached) - cached_products
    print(f" {cached_products} products from cache, "
          f"{cached_non_products} non-products cached, "
          f"{len(to_fetch)} to fetch")

    for count, url in enumerate(to_fetch, start=1):
        if count == 1 or count % 50 == 0:
            print(f" Fetching {count}/{len(to_fetch)}...")

        time.sleep(DELAY)
        html_content = fetch_page(url)
        if not html_content:
            # Possibly transient (timeout, 5xx): leave the URL uncached so
            # the next run tries again.
            stats["fetch_errors"] += 1
        else:
            product = parse_product_page(html_content)
            if product:
                product["url"] = url
                products.append(product)
                cache[url] = product
                stats["products_scraped"] += 1
            else:
                cache[url] = None
                stats["not_product_pages"] += 1

        # Save cache periodically so an interrupted run keeps its progress.
        if count % 100 == 0:
            _save_product_cache(cache)

    # Final cache save
    _save_product_cache(cache)

    print(f" Total: {len(products)} products ({stats['products_scraped']} newly scraped)")
    return products
"""Fetch all pages from a paginated API endpoint.""" + all_items = [] + page = 1 + while True: + resp = api_request("GET", f"{path}{'&' if '?' in path else '?'}per_page=100&page={page}") + if not resp or "data" not in resp or not resp["data"]: + break + all_items.extend(resp["data"]) + if len(resp["data"]) < 100: + break + page += 1 + return all_items + + +def load_api_data(): + """Load all species, families, cultivars from HerbAPI.""" + print("Loading HerbAPI data...") + + families = {} + for f in paginated_get("/families"): + families[f["name_scientific"].lower()] = f + print(f" {len(families)} families") + + species = {} + for s in paginated_get("/species"): + species[s["name_scientific"].lower().strip()] = s + print(f" {len(species)} species") + + cultivars = {} + for c in paginated_get("/cultivars"): + key = (c["species_id"], c["name"].lower().strip()) + cultivars[key] = c + print(f" {len(cultivars)} cultivars") + + return families, species, cultivars + + +def ensure_supplier(): + """Create or find the Dreschflegel supplier.""" + resp = api_request("GET", "/suppliers") + if resp: + for s in resp: + if "dreschflegel" in s["name"].lower(): + print(f" Supplier exists: {s['name']} ({s['id']})") + return s + data = { + "name": "Dreschflegel", + "url": "https://www.dreschflegel-saatgut.de", + "country": "DE", + "is_organic": True, + "is_demeter": False, + "notes": "German organic seed cooperative, open-pollinated heritage varieties", + } + resp = api_request("POST", "/suppliers", data) + if resp: + print(f" Created supplier: {resp['name']} ({resp['id']})") + return resp + + +# Genus → family mapping for species creation +GENUS_TO_FAMILY = { + # Asteraceae + "Achillea": "Asteraceae", "Artemisia": "Asteraceae", "Aster": "Asteraceae", + "Calendula": "Asteraceae", "Carthamus": "Asteraceae", "Centaurea": "Asteraceae", + "Chamomilla": "Asteraceae", "Chrysanthemum": "Asteraceae", "Cichorium": "Asteraceae", + "Cnicus": "Asteraceae", "Cosmos": "Asteraceae", "Cynara": 
"Asteraceae", + "Dahlia": "Asteraceae", "Dimorphotheca": "Asteraceae", "Echinacea": "Asteraceae", + "Echinops": "Asteraceae", "Erigeron": "Asteraceae", "Eupatorium": "Asteraceae", + "Gaillardia": "Asteraceae", "Helenium": "Asteraceae", "Helianthus": "Asteraceae", + "Helichrysum": "Asteraceae", "Inula": "Asteraceae", "Lactuca": "Asteraceae", + "Leontodon": "Asteraceae", "Matricaria": "Asteraceae", "Onopordum": "Asteraceae", + "Petasites": "Asteraceae", "Rudbeckia": "Asteraceae", "Scorzonera": "Asteraceae", + "Silphium": "Asteraceae", "Solidago": "Asteraceae", "Tagetes": "Asteraceae", + "Tanacetum": "Asteraceae", "Taraxacum": "Asteraceae", "Telekia": "Asteraceae", + "Tragopogon": "Asteraceae", "Tussilago": "Asteraceae", "Zinnia": "Asteraceae", + "Xerochrysum": "Asteraceae", "Coreopsis": "Asteraceae", + # Solanaceae + "Capsicum": "Solanaceae", "Lycium": "Solanaceae", "Nicotiana": "Solanaceae", + "Physalis": "Solanaceae", "Solanum": "Solanaceae", "Atropa": "Solanaceae", + # Cucurbitaceae + "Citrullus": "Cucurbitaceae", "Cucumis": "Cucurbitaceae", "Cucurbita": "Cucurbitaceae", + "Luffa": "Cucurbitaceae", "Momordica": "Cucurbitaceae", + # Fabaceae + "Cicer": "Fabaceae", "Glycine": "Fabaceae", "Lathyrus": "Fabaceae", + "Lens": "Fabaceae", "Lupinus": "Fabaceae", "Medicago": "Fabaceae", + "Phaseolus": "Fabaceae", "Pisum": "Fabaceae", "Trifolium": "Fabaceae", + "Trigonella": "Fabaceae", "Vicia": "Fabaceae", "Vigna": "Fabaceae", + "Caragana": "Fabaceae", "Cytisus": "Fabaceae", "Robinia": "Fabaceae", + # Brassicaceae + "Armoracia": "Brassicaceae", "Barbarea": "Brassicaceae", "Brassica": "Brassicaceae", + "Crambe": "Brassicaceae", "Eruca": "Brassicaceae", "Hesperis": "Brassicaceae", + "Iberis": "Brassicaceae", "Isatis": "Brassicaceae", "Lepidium": "Brassicaceae", + "Lunaria": "Brassicaceae", "Raphanus": "Brassicaceae", "Sinapis": "Brassicaceae", + "Nasturtium": "Brassicaceae", "Diplotaxis": "Brassicaceae", + # Apiaceae + "Anethum": "Apiaceae", "Anthriscus": "Apiaceae", "Apium": 
"Apiaceae", + "Carum": "Apiaceae", "Chaerophyllum": "Apiaceae", "Coriandrum": "Apiaceae", + "Daucus": "Apiaceae", "Foeniculum": "Apiaceae", "Levisticum": "Apiaceae", + "Myrrhis": "Apiaceae", "Pastinaca": "Apiaceae", "Petroselinum": "Apiaceae", + "Pimpinella": "Apiaceae", "Angelica": "Apiaceae", "Aegopodium": "Apiaceae", + # Lamiaceae + "Agastache": "Lamiaceae", "Ajuga": "Lamiaceae", "Dracocephalum": "Lamiaceae", + "Elsholtzia": "Lamiaceae", "Hyssopus": "Lamiaceae", "Lavandula": "Lamiaceae", + "Melissa": "Lamiaceae", "Mentha": "Lamiaceae", "Monarda": "Lamiaceae", + "Nepeta": "Lamiaceae", "Ocimum": "Lamiaceae", "Origanum": "Lamiaceae", + "Perilla": "Lamiaceae", "Rosmarinus": "Lamiaceae", "Salvia": "Lamiaceae", + "Satureja": "Lamiaceae", "Stachys": "Lamiaceae", "Thymus": "Lamiaceae", + # Amaryllidaceae / Alliaceae + "Allium": "Amaryllidaceae", + # Poaceae + "Avena": "Poaceae", "Hordeum": "Poaceae", "Panicum": "Poaceae", + "Secale": "Poaceae", "Sorghum": "Poaceae", "Triticum": "Poaceae", + "Zea": "Poaceae", "Setaria": "Poaceae", + # Chenopodiaceae + "Atriplex": "Chenopodiaceae", "Beta": "Chenopodiaceae", + "Chenopodium": "Chenopodiaceae", "Spinacia": "Chenopodiaceae", + # Rosaceae + "Filipendula": "Rosaceae", "Fragaria": "Rosaceae", "Malus": "Rosaceae", + "Prunus": "Rosaceae", "Pyrus": "Rosaceae", "Rosa": "Rosaceae", + "Rubus": "Rosaceae", "Sanguisorba": "Rosaceae", "Sorbus": "Rosaceae", + "Waldsteinia": "Rosaceae", + # Boraginaceae + "Borago": "Boraginaceae", "Phacelia": "Boraginaceae", "Symphytum": "Boraginaceae", + "Pulmonaria": "Boraginaceae", "Myosotis": "Boraginaceae", + # Malvaceae + "Alcea": "Malvaceae", "Althaea": "Malvaceae", "Malva": "Malvaceae", + "Hibiscus": "Malvaceae", "Lavatera": "Malvaceae", "Abelmoschus": "Malvaceae", + # Polygonaceae + "Fagopyrum": "Polygonaceae", "Rheum": "Polygonaceae", "Rumex": "Polygonaceae", + # Caryophyllaceae + "Agrostemma": "Caryophyllaceae", "Dianthus": "Caryophyllaceae", + "Gypsophila": "Caryophyllaceae", "Lychnis": 
"Caryophyllaceae", + "Saponaria": "Caryophyllaceae", "Silene": "Caryophyllaceae", + # Tropaeolaceae + "Tropaeolum": "Tropaeolaceae", + # Papaveraceae + "Eschscholzia": "Papaveraceae", "Papaver": "Papaveraceae", + "Meconopsis": "Papaveraceae", + # Caprifoliaceae + "Valerianella": "Caprifoliaceae", "Valeriana": "Caprifoliaceae", + "Lonicera": "Caprifoliaceae", "Sambucus": "Adoxaceae", + # Plantaginaceae + "Digitalis": "Plantaginaceae", "Plantago": "Plantaginaceae", + "Antirrhinum": "Plantaginaceae", "Linaria": "Plantaginaceae", + # Violaceae + "Viola": "Violaceae", + # Ranunculaceae + "Aquilegia": "Ranunculaceae", "Consolida": "Ranunculaceae", + "Delphinium": "Ranunculaceae", "Nigella": "Ranunculaceae", + # Linaceae + "Linum": "Linaceae", + # Convolvulaceae + "Ipomoea": "Convolvulaceae", "Convolvulus": "Convolvulaceae", + # Portulacaceae / Montiaceae + "Claytonia": "Montiaceae", "Portulaca": "Portulacaceae", + # Amaranthaceae + "Amaranthus": "Amaranthaceae", "Celosia": "Amaranthaceae", + "Gomphrena": "Amaranthaceae", + # Asparagaceae + "Asparagus": "Asparagaceae", + # Resedaceae + "Reseda": "Resedaceae", + # Balsaminaceae + "Impatiens": "Balsaminaceae", + # Hydrangeaceae + "Hydrangea": "Hydrangeaceae", + # Campanulaceae + "Campanula": "Campanulaceae", "Phyteuma": "Campanulaceae", + # Scrophulariaceae + "Verbascum": "Scrophulariaceae", + # Verbenaceae + "Verbena": "Verbenaceae", + # Onagraceae + "Oenothera": "Onagraceae", "Clarkia": "Onagraceae", + # Cucurbitaceae extras + "Benincasa": "Cucurbitaceae", "Lagenaria": "Cucurbitaceae", + # Hypericaceae + "Hypericum": "Hypericaceae", + # Adoxaceae + "Sambucus": "Adoxaceae", + # Others + "Nigella": "Ranunculaceae", + "Dipsacus": "Caprifoliaceae", + "Knautia": "Caprifoliaceae", + "Scabiosa": "Caprifoliaceae", + "Succisa": "Caprifoliaceae", + "Asclepias": "Apocynaceae", + "Cynoglossum": "Boraginaceae", + "Echium": "Boraginaceae", + "Anchusa": "Boraginaceae", + "Lithospermum": "Boraginaceae", + "Tanacetum": "Asteraceae", + 
"Onobrychis": "Fabaceae", + "Ornithopus": "Fabaceae", + "Lotus": "Fabaceae", + "Anthyllis": "Fabaceae", + "Melilotus": "Fabaceae", + "Galega": "Fabaceae", + "Lespedeza": "Fabaceae", + "Arachis": "Fabaceae", + "Senna": "Fabaceae", + # Additional genera found in Dreschflegel catalog + "Acmella": "Asteraceae", "Adonis": "Ranunculaceae", "Ageratum": "Asteraceae", + "Amethystia": "Lamiaceae", "Anacyclus": "Asteraceae", "Anthemis": "Asteraceae", + "Asphodeline": "Asphodelaceae", "Brachyscome": "Asteraceae", "Bupleurum": "Apiaceae", + "Callistephus": "Asteraceae", "Camelina": "Brassicaceae", "Cardaria": "Brassicaceae", + "Cardiospermum": "Sapindaceae", "Cerinthe": "Boraginaceae", + "Chamaemelum": "Asteraceae", "Cistanthe": "Montiaceae", "Cleome": "Cleomaceae", + "Cochlearia": "Brassicaceae", "Codonopsis": "Campanulaceae", "Coix": "Poaceae", + "Cyperus": "Cyperaceae", "Digitaria": "Poaceae", "Dorotheanthus": "Aizoaceae", + "Emilia": "Asteraceae", "Eragrostis": "Poaceae", "Erysimum": "Brassicaceae", + "Euphorbia": "Euphorbiaceae", "Gentiana": "Gentianaceae", "Geum": "Rosaceae", + "Gilia": "Polemoniaceae", "Godetia": "Onagraceae", "Helipterum": "Asteraceae", + "Lallemantia": "Lamiaceae", "Leonurus": "Lamiaceae", "Leuzea": "Asteraceae", + "Liatris": "Asteraceae", "Malope": "Malvaceae", "Marrubium": "Lamiaceae", + "Matthiola": "Brassicaceae", "Maurandya": "Plantaginaceae", + "Melothria": "Cucurbitaceae", "Meum": "Apiaceae", "Nemesia": "Scrophulariaceae", + "Nicandra": "Solanaceae", "Nicotinia": "Solanaceae", "Oenanthe": "Apiaceae", + "Oxalis": "Oxalidaceae", "Pennisetum": "Poaceae", "Penstemon": "Plantaginaceae", + "Phlox": "Polemoniaceae", "Polemonium": "Polemoniaceae", + "Porophyllum": "Asteraceae", "Primula": "Primulaceae", "Psyllium": "Plantaginaceae", + "Quamoclit": "Convolvulaceae", "Ruta": "Rutaceae", "Salpiglossis": "Solanaceae", + "Sanvitalia": "Asteraceae", "Sideritis": "Lamiaceae", "Silybum": "Asteraceae", + "Talinum": "Talinaceae", "Thelesperma": "Asteraceae", 
"Vaccaria": "Caryophyllaceae", + "Veronica": "Plantaginaceae", "Xeranthemum": "Asteraceae", +} + + +def normalize_species_name(botanical_name): + """Normalize botanical name to 'Genus species' for matching. + Handles var., subsp., ssp., hybrids etc. + """ + name = botanical_name.strip() + parts = name.split() + if len(parts) < 2: + return None, None + + genus = parts[0] + # Handle 'Genus x species' (hybrid notation) + if parts[1] == "x" and len(parts) >= 3: + species = f"x {parts[2]}" + elif parts[1] in ("var.", "subsp.", "ssp.", "spec.", "sp."): + # Only genus level - can't match to species + return genus, None + else: + species = parts[1] + + return genus, species + + +def find_species(botanical_name, species_cache): + """Find existing species matching a botanical name. + Tries exact match, then genus+species without var/subsp. + """ + genus, sp = normalize_species_name(botanical_name) + if not genus: + return None + + if sp: + # Try exact genus+species + search_key = f"{genus} {sp}".lower() + if search_key in species_cache: + return species_cache[search_key] + + # Try all species with same genus + genus_lower = genus.lower() + matches = {k: v for k, v in species_cache.items() if k.startswith(genus_lower + " ")} + if len(matches) == 1: + # Only one species in this genus - use it + return list(matches.values())[0] + + return None + + +def find_or_create_species(botanical_name, families, species_cache): + """Find or create a species from a botanical name.""" + # Try to find existing + sp = find_species(botanical_name, species_cache) + if sp: + return sp + + genus, species_epithet = normalize_species_name(botanical_name) + if not genus or not species_epithet: + stats["species_no_epithet"] += 1 + return None + + sci_name = f"{genus} {species_epithet}" + + # Check cache again with normalized name + if sci_name.lower() in species_cache: + return species_cache[sci_name.lower()] + + # Need to create - find the family + family_name = GENUS_TO_FAMILY.get(genus) + if not 
# Common German crop-type prefixes that precede the cultivar name in a
# product title.  De-duplicated and sorted longest-first once at import
# time so that e.g. "Winterkopfsalat" is tried before "Kopfsalat" and
# "Salattomate" before "Tomate".
_CROP_TYPE_PREFIXES = tuple(sorted(dict.fromkeys([
    # Tomatoes
    "Salattomate", "Stabtomate", "Buschtomate", "Cocktailtomate",
    "Cherrytomate", "Fleischtomate", "Wildtomate", "Balkontomate",
    "Flaschentomate", "Eitomate", "Datteltomate", "Tomate",
    # Lettuce
    "Winterkopfsalat", "Kopfsalat", "Bataviasalat", "Eissalat",
    "Blattsalat", "Schnittsalat", "Pflücksalat", "Römersalat",
    "Spargelsalat", "Romanasalat",
    # Beans
    "Buschbohne", "Stangenbohne", "Feuerbohne", "Puffbohne",
    "Prunkbohne",
    # Peas
    "Markerbse", "Zuckererbse", "Palerbse", "Schalerbse",
    "Knackerbse", "Kapuzinererbse",
    # Cucumbers
    "Einlegegurke", "Salatgurke", "Schälgurke", "Landgurke",
    "Freilandgurke",
    # Squash
    "Hokkaidokürbis", "Butternutkürbis", "Speisekürbis",
    "Riesenkürbis", "Zierkürbis", "Muskatkürbis", "Ölkürbis",
    # Melon
    "Wassermelone", "Zuckermelone",
    # Peppers
    "Gemüsepaprika", "Blockpaprika", "Spitzpaprika", "Tomatenpaprika",
    "Snackpaprika", "Peperoni", "Chili",
    # Brassicas
    "Kohlrabi", "Brokkoli", "Blumenkohl", "Grünkohl", "Rosenkohl",
    "Wirsing", "Rotkohl", "Weißkohl", "Spitzkohl", "Palmkohl",
    "Chinakohl", "Pak Choi", "Markstammkohl",
    # Root veg
    "Möhre", "Karotte", "Pastinake", "Rote Bete", "Rote Beete",
    "Herbstrübe", "Mairübe", "Stoppelrübe", "Schwarzer Rettich",
    "Steckrübe", "Knollensellerie", "Petersilienwurzel",
    "Rettich", "Radieschen",
    # Onions
    "Winterheckenzwiebel", "Lauchzwiebel", "Speisezwiebel",
    "Schalotte", "Wintersteckzwiebel", "Zwiebel",
    # Herbs
    "Rotes Basilikum", "Buschbasilikum", "Zitronen-Basilikum",
    "Thai-Basilikum", "Wildes Basilikum", "Zimtbasilikum",
    "Basilikum", "Schnittknoblauch",
    # Grains
    "Sommerweizen", "Winterweizen", "Sommerroggen", "Winterroggen",
    "Nackthafer", "Nacktgerste", "Dinkel", "Emmer", "Einkorn",
    # Misc
    "Zuckermais", "Popcornmais", "Zucchini",
]), key=len, reverse=True))


def extract_cultivar_name(product_name):
    """Extract the cultivar/variety name from a full product name.

    Strips a leading German crop-type word (e.g. ``"Stabtomate Matina"``
    -> ``"Matina"``).  A prefix only counts when it is followed by a
    space, so a product that consists solely of the crop word, or whose
    first word is not a known crop type, is returned unchanged (apart
    from surrounding whitespace).

    Args:
        product_name: product title as scraped from the shop page.

    Returns:
        The cultivar name with surrounding whitespace removed.
    """
    name = product_name.strip()

    for prefix in _CROP_TYPE_PREFIXES:
        if name.startswith(prefix + " "):
            return name[len(prefix):].strip()

    return name
supplier: + print("FATAL: Could not create/find supplier") + sys.exit(1) + supplier_id = supplier["id"] + + # Step 2: Load API data + print("\n[2] Loading existing HerbAPI data...") + families, species_cache, cultivar_cache = load_api_data() + + # Step 3: Get product URLs + print("\n[3] Fetching sitemap...") + all_urls = get_sitemap_urls() + if not all_urls: + print("FATAL: Could not fetch sitemap") + sys.exit(1) + candidate_urls = classify_urls(all_urls) + print(f" {len(all_urls)} total URLs, {len(candidate_urls)} product candidates") + + # Step 4: Scrape + print(f"\n[4] Scraping product pages...") + products = scrape_all_products(candidate_urls) + + # Step 5: Import + print(f"\n[5] Importing {len(products)} products into HerbAPI...") + + for i, product in enumerate(products): + if (i + 1) % 50 == 0: + print(f" Processing {i + 1}/{len(products)}...") + + botanical = product.get("botanical_name", "") + if not botanical: + stats["no_botanical"] += 1 + continue + + # Find or create species + sp = find_or_create_species(botanical, families, species_cache) + if not sp: + stats["species_not_matched"] += 1 + continue + + species_id = sp["id"] + cultivar_name = extract_cultivar_name(product["name"]) + + # Check if cultivar already exists + cv_key = (species_id, cultivar_name.lower().strip()) + if cv_key in cultivar_cache: + cv = cultivar_cache[cv_key] + stats["cultivars_existing"] += 1 + else: + cv_data = { + "species_id": species_id, + "name": cultivar_name, + "is_organic": True, + } + if product.get("description"): + cv_data["description"] = product["description"] + + cv = api_request("POST", "/cultivars", cv_data) + if cv: + cultivar_cache[cv_key] = cv + stats["cultivars_created"] += 1 + else: + # Might already exist from previous run - try to find it + found = False + for c in paginated_get(f"/cultivars?species_id={species_id}"): + if c["name"].lower().strip() == cultivar_name.lower().strip(): + cultivar_cache[cv_key] = c + cv = c + stats["cultivars_existing"] += 1 + 
def run_sql(sql):
    """Run *sql* through the ``psql`` CLI and return its stripped stdout.

    The command line is ``DB_CMD`` plus ``-c sql``; ``ON_ERROR_STOP`` is
    set so that a failing statement produces a non-zero exit status.

    Args:
        sql: SQL text passed to psql via ``-c``.

    Returns:
        psql's standard output with surrounding whitespace removed (an
        empty string when the statement produced no rows).

    Raises:
        RuntimeError: if psql exits with a non-zero status (connection
            failure, SQL error, ...).  Previously such failures were
            silently reported as an empty result, so a failed UPDATE
            looked like a success to the caller.
    """
    result = subprocess.run(
        DB_CMD + ['-v', 'ON_ERROR_STOP=1', '-c', sql],
        capture_output=True, text=True, env=DB_ENV
    )
    if result.returncode != 0:
        raise RuntimeError(
            f"psql exited with status {result.returncode}: "
            f"{result.stderr.strip()}"
        )
    return result.stdout.strip()
def parse_germination_days(text):
    """Parse a germination-time phrase into a whole number of days.

    Recognised forms, tried in this order (week forms take precedence
    over day forms):

    * ``"A-B weeks"``  -> midpoint of the range, times 7, rounded
    * ``"N weeks"``    -> N * 7
    * ``"A-B days"``   -> midpoint of the range, rounded
    * ``"N days"``     -> N

    A range may be written with a hyphen, an en/em dash or the word
    ``to``.  Without the dash variants a phrase such as ``"7–14 days"``
    would fall through to the single-value pattern and yield the upper
    bound (14) instead of the midpoint.

    Args:
        text: free-text value, or None/empty.

    Returns:
        Number of days as ``int``, or ``None`` if nothing could be parsed.
    """
    if not text:
        return None
    text = text.lower()

    # Separator between the two numbers of a range.
    sep = r'(?:[-\u2013\u2014]|to)'

    match = re.search(rf'(\d+)\s*{sep}\s*(\d+)\s*weeks?', text)
    if match:
        return int(round((int(match.group(1)) + int(match.group(2))) / 2 * 7))
    match = re.search(r'(\d+)\s*weeks?', text)
    if match:
        return int(match.group(1)) * 7
    match = re.search(rf'(\d+)\s*{sep}\s*(\d+)\s*days?', text)
    if match:
        return int(round((int(match.group(1)) + int(match.group(2))) / 2))
    match = re.search(r'(\d+)\s*days?', text)
    if match:
        return int(match.group(1))
    return None
return 'shade' + if 'sun' in text: + return 'full sun' + return text + + +def extract_data(html): + data = {} + + # Extract table cell pairs + cells = re.findall(r']*>(.*?)', html, re.DOTALL) + clean_cells = [] + for c in cells: + clean = re.sub(r'<[^>]+>', ' ', c).strip() + clean = re.sub(r'\s+', ' ', clean) + clean_cells.append(clean) + + specs = {} + i = 0 + while i < len(clean_cells) - 1: + key = clean_cells[i].rstrip(':').strip() + val = clean_cells[i + 1].strip() + if key and val and not re.match(r'^[\d,.\s€*]+$', key): + specs[key.lower()] = val + i += 2 + + # Extract description from itemprop="description" + desc_match = re.search(r'itemprop="description">(.*?)\s*\s*', html, re.DOTALL) + if desc_match: + content = desc_match.group(1) + content = re.sub(r']*>.*?', '', content, flags=re.DOTALL) + content = re.sub(r']*>.*?', '', content, flags=re.DOTALL) + content = re.sub(r'<[^>]+>', ' ', content) + content = re.sub(r'\s+', ' ', content).strip() + for marker in ['Other names', 'Additional contact mail', 'Question about']: + idx = content.find(marker) + if idx > 0: + content = content[:idx].strip() + if len(content) > 20: + data['description'] = content + + if 'description' not in data: + meta_match = re.search(r']*name="description"[^>]*content="([^"]*)"', html) + if meta_match and len(meta_match.group(1)) > 20: + data['description'] = meta_match.group(1) + + # Parse specs + if 'planting distance' in specs: + row_sp, plant_sp = parse_spacing(specs['planting distance']) + if plant_sp: + data['plant_spacing_cm'] = plant_sp + if row_sp: + data['row_spacing_cm'] = row_sp + + if 'row spacing' in specs: + match = re.search(r'(\d+(?:\.\d+)?)\s*cm', specs['row spacing']) + if match: + data['row_spacing_cm'] = float(match.group(1)) + + if 'sowing depth' in specs: + depth = parse_depth(specs['sowing depth']) + if depth is not None: + data['planting_depth_cm'] = depth + + # Harvesting months - prefer explicit harvest time over flowering + if 'harvest time' in specs: + 
months = parse_months(specs['harvest time']) + if months: + data['harvesting_months'] = months + elif 'harvesting months' in specs: + months = parse_months(specs['harvesting months']) + if months: + data['harvesting_months'] = months + elif 'flowering months' in specs: + months = parse_months(specs['flowering months']) + if months: + data['harvesting_months'] = months + + if 'when to sow outdoors' in specs: + months = parse_months(specs['when to sow outdoors']) + if months: + data['direct_sowing_months'] = months + + for indoor_key in ['when to sow indoors', 'pre-cultivation indoors']: + if indoor_key in specs: + months = parse_months(specs[indoor_key]) + if months: + data['indoor_sowing_months'] = months + break + + if 'lifecycle' in specs: + perennial = parse_lifecycle(specs['lifecycle']) + if perennial is not None: + data['perennial'] = perennial + + if 'sunlight' in specs: + light = parse_light(specs['sunlight']) + if light: + data['light_requirement'] = light + + if 'germination time' in specs: + days = parse_germination_days(specs['germination time']) + if days: + data['days_to_germination'] = days + + if 'germination temperature' in specs: + temp = parse_germ_temp(specs['germination temperature']) + if temp: + data['germination_temp_c'] = temp + + return data + + +def get_current_values(cultivar_id): + sql = f"""SELECT description, row_spacing_cm, plant_spacing_cm, planting_depth_cm, + perennial, harvesting_months, direct_sowing_months, light_requirement, + days_to_germination, germination_temp_c, indoor_sowing_months + FROM cultivars WHERE id = '{cultivar_id}'""" + row = run_sql(sql) + if not row: + return {} + parts = row.split('|') + fields = ['description', 'row_spacing_cm', 'plant_spacing_cm', 'planting_depth_cm', + 'perennial', 'harvesting_months', 'direct_sowing_months', 'light_requirement', + 'days_to_germination', 'germination_temp_c', 'indoor_sowing_months'] + current = {} + for i, f in enumerate(fields): + if i < len(parts): + val = 
def build_update_sql(cultivar_id, data, current):
    """Build an ``UPDATE cultivars`` statement that fills only empty fields.

    A field from *data* is skipped when *current* already holds a truthy
    value for it, so existing database content is never overwritten.

    Values are rendered as SQL literals by type: ``str`` as a quoted
    string with single quotes doubled, ``bool`` as ``true``/``false``,
    ``list`` as a quoted array literal ``'{a,b,c}'``, ``int``/``float``
    verbatim.  Values of any other type are ignored (and not reported as
    updated).  The cultivar id is quote-escaped as well.

    Args:
        cultivar_id: id of the row to update.
        data: mapping of column name -> newly scraped value.
        current: mapping of column name -> value currently stored.

    Returns:
        ``(sql, updated_fields)``; ``(None, [])`` when nothing needs to be
        written.
    """
    sets = []
    updated_fields = []
    for field, value in data.items():
        if field in current and current[field]:
            continue

        # NOTE: bool must be tested before int/float because bool is a
        # subclass of int.
        if isinstance(value, str):
            escaped = value.replace("'", "''")
            literal = f"'{escaped}'"
        elif isinstance(value, bool):
            literal = 'true' if value else 'false'
        elif isinstance(value, list):
            arr_str = '{' + ','.join(str(x) for x in value) + '}'
            literal = f"'{arr_str}'"
        elif isinstance(value, (int, float)):
            literal = f"{value}"
        else:
            # Unsupported type: no SET clause, so do not claim an update.
            continue

        sets.append(f"{field} = {literal}")
        updated_fields.append(field)

    if not sets:
        return None, []

    safe_id = str(cultivar_id).replace("'", "''")
    return f"UPDATE cultivars SET {', '.join(sets)} WHERE id = '{safe_id}';", updated_fields
current = get_current_values(cv['id']) + sql_stmt, upd_fields = build_update_sql(cv['id'], data, current) + + if not sql_stmt: + print(f"SKIP (all fields populated)") + skipped += 1 + else: + run_sql(sql_stmt) + for f in upd_fields: + fields_updated[f] = fields_updated.get(f, 0) + 1 + print(f"OK ({len(upd_fields)} fields: {', '.join(upd_fields)})") + updated += 1 + + except Exception as e: + print(f"ERROR: {e}") + failed += 1 + + time.sleep(0.5) + + print(f"\n=== MGS Summary ===") + print(f"Total processed: {len(cultivars)}") + print(f"Updated: {updated}") + print(f"Skipped (all fields already populated): {skipped}") + print(f"Failed: {failed}") + print(f"\nFields updated:") + for field, count in sorted(fields_updated.items(), key=lambda x: -x[1]): + print(f" {field}: {count}") + + +if __name__ == '__main__': + main() diff --git a/tools/scrapers/scrape_naturadb.py b/tools/scrapers/scrape_naturadb.py new file mode 100644 index 0000000..e6963e3 --- /dev/null +++ b/tools/scrapers/scrape_naturadb.py @@ -0,0 +1,330 @@ +#!/usr/bin/env python3 +""" +Scrape NaturaDB wildlife interaction data and enrich HerbAPI species. 
+""" + +import json +import re +import time +import urllib.request +import urllib.error +import sys + +HERBAPI_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1" +HERBAPI_TOKEN = "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk" +NATURADB_BASE = "https://www.naturadb.de/pflanzen" +USER_AGENT = "Mozilla/5.0 (compatible; HerbAPI-Enrichment/1.0; +https://sub-net.at)" +DELAY = 0.5 + + +def api_get(path): + """GET from HerbAPI.""" + url = f"{HERBAPI_BASE}{path}" + req = urllib.request.Request(url) + req.add_header("Authorization", f"Bearer {HERBAPI_TOKEN}") + req.add_header("Accept", "application/json") + with urllib.request.urlopen(req) as resp: + return json.loads(resp.read().decode()) + + +def api_put(path, data): + """PUT to HerbAPI.""" + url = f"{HERBAPI_BASE}{path}" + body = json.dumps(data).encode() + req = urllib.request.Request(url, data=body, method="PUT") + req.add_header("Authorization", f"Bearer {HERBAPI_TOKEN}") + req.add_header("Content-Type", "application/json") + req.add_header("Accept", "application/json") + with urllib.request.urlopen(req) as resp: + return json.loads(resp.read().decode()) + + +def fetch_naturadb(latin_name): + """Fetch a NaturaDB plant page. 
def extract_native_status(html):
    """Return the native-status labels found in the page's status chips.

    Scans the large, white-on-colour chips and keeps, in document order,
    only those whose stripped text is one of the known status labels.
    """
    known_labels = (
        "heimische Wildform",
        "Archäophyt",
        "Neophyt",
        "nicht heimisch (Neophyt)",
    )
    chip_texts = re.findall(
        r'chip--large\s+chip--no-border\s+clr-white\s+bg-\w+"[^>]*>([^<]+)', html
    )
    stripped = (raw.strip() for raw in chip_texts)
    return [label for label in stripped if label in known_labels]
def parse_nectar_pollen(text):
    """Return the score from an 'N/4 ...' rating, e.g. '2/4 - mäßig' -> 2.

    Gives ``None`` for empty input or when the (stripped) text does not
    begin with digits followed by ``/4``.
    """
    if not text:
        return None
    rating = re.match(r"(\d+)/4", text.strip())
    if rating is None:
        return None
    return int(rating.group(1))
"vogelnähr", + "bienenweide", + ] + ) + ] + if notable: + parts.append("Tags: " + ", ".join(notable) + ".") + + return " ".join(parts) if parts else None + + +def scrape_species(html): + """Parse NaturaDB HTML and return structured wildlife data dict.""" + data = {} + + # Nectar and pollen values + nectar_raw = extract_td_value(html, "Nektarwert") + pollen_raw = extract_td_value(html, "Pollenwert") + data["nectar"] = parse_nectar_pollen(nectar_raw) + data["pollen"] = parse_nectar_pollen(pollen_raw) + + # Wild bees + bees_raw = extract_td_value(html, "Wildbienen") + data["wildbienen_count"] = parse_count(bees_raw) + data["wildbienen_specialists"] = parse_specialist_count(bees_raw) + + # Butterflies/moths + schmett_raw = extract_td_value(html, "Schmetterlinge") + data["schmetterlinge_count"] = parse_count(schmett_raw) + + # Caterpillar hosts + raupen_raw = extract_td_value(html, "Raupen") + data["raupen_count"] = parse_count(raupen_raw) + data["raupen_specialists"] = parse_specialist_count(raupen_raw) + + # Hoverflies + schweb_raw = extract_td_value(html, "Schwebfliegen") + data["schwebfliegen_count"] = parse_count(schweb_raw) + + # Beetles + kaefer_raw = extract_td_value(html, "Käfer") + data["kaefer_count"] = parse_count(kaefer_raw) + + # Birds + vogel_raw = extract_td_value(html, "fressende Vogelarten") + data["vogelarten_count"] = parse_count(vogel_raw) + + # Mammals + saeuget_raw = extract_td_value(html, "fressende Säugetierarten") + data["saeugetier_count"] = parse_count(saeuget_raw) + + # Native status + data["native_status"] = extract_native_status(html) + + # Badge tags + data["badges"] = extract_badge_tags(html) + + return data + + +def has_any_data(data): + """Check if we scraped anything meaningful.""" + for k, v in data.items(): + if k in ("native_status", "badges"): + if v: + return True + elif v is not None: + return True + return False + + +def main(): + print("Fetching species list from HerbAPI...") + species_list = 
api_get("/species?per_page=200")["data"] + print(f"Found {len(species_list)} species.\n") + + enriched = 0 + skipped_has_data = 0 + skipped_not_found = 0 + skipped_no_data = 0 + errors = 0 + + for i, sp in enumerate(species_list): + slug = sp["slug"] + name = sp["name_scientific"] + existing_wv = sp.get("wildlife_value") + + # Only enrich if wildlife_value is empty/null + if existing_wv: + print(f"[{i+1:3d}/{len(species_list)}] {slug:40s} SKIP (already has data)") + skipped_has_data += 1 + continue + + print(f"[{i+1:3d}/{len(species_list)}] {slug:40s} ", end="", flush=True) + + # Fetch NaturaDB page + html = fetch_naturadb(name) + time.sleep(DELAY) + + if html is None: + print("NOT FOUND on NaturaDB") + skipped_not_found += 1 + continue + + # Parse wildlife data + data = scrape_species(html) + + if not has_any_data(data): + print("no wildlife data on page") + skipped_no_data += 1 + continue + + # Build wildlife_value string + wildlife_value = build_wildlife_value(data) + if not wildlife_value: + print("no wildlife data extracted") + skipped_no_data += 1 + continue + + # GET full species, merge, PUT back + try: + full = api_get(f"/species/{slug}") + full["wildlife_value"] = wildlife_value + + # Remove read-only / computed fields that the PUT endpoint might reject + for key in ("created_at", "updated_at", "family"): + full.pop(key, None) + + api_put(f"/species/{full['id']}", full) + print(f"ENRICHED -> {wildlife_value[:80]}...") + enriched += 1 + except Exception as e: + print(f"API ERROR: {e}") + errors += 1 + + print("\n" + "=" * 70) + print(f"DONE. 
# ── Config ──────────────────────────────────────────────────────────────────
import os  # kept next to its only use; move into the stdlib import group when convenient

API_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
# SECURITY: the bearer token used to be hard-coded here. A secret committed to
# version control must be treated as leaked: rotate it, and supply the new one
# through the HERBAPI_TOKEN environment variable. When the variable is unset
# the token is empty, so requests are sent with an empty bearer token.
AUTH_TOKEN = os.environ.get("HERBAPI_TOKEN", "")
REINSAAT_SUPPLIER_ID = "019ced24-1702-72d1-9acc-90435441a5c4"
DELAY = 0.5  # seconds between requests
USER_AGENT = "HerbAPI-Scraper/1.0 (florian.berthold@sub-net.at)"
class TextExtractor(HTMLParser):
    """Collect the visible text chunks of an HTML document.

    After ``feed()``, ``parts`` holds every non-blank text node (stripped of
    surrounding whitespace) that was not nested inside a ``script``,
    ``style`` or ``noscript`` element.
    """

    # Elements whose text content is never shown to the reader.
    _HIDDEN_TAGS = ("script", "style", "noscript")

    def __init__(self):
        super().__init__()
        self.parts = []
        self._skip = 0  # nesting depth of currently open hidden elements

    def handle_starttag(self, tag, attrs):
        if tag not in self._HIDDEN_TAGS:
            return
        self._skip += 1

    def handle_endtag(self, tag):
        if tag not in self._HIDDEN_TAGS:
            return
        # Never go below zero on a stray closing tag.
        if self._skip > 0:
            self._skip -= 1

    def handle_data(self, data):
        if self._skip != 0:
            return
        chunk = data.strip()
        if chunk:
            self.parts.append(chunk)
def fetch_url(url: str, retries: int = 2) -> str:
    """Fetch ``url`` and return the decoded response body.

    Transient failures (network errors, timeouts, HTTP 5xx and 429) are
    retried up to ``retries`` times with a 2-second pause between attempts.
    Other HTTP errors (e.g. 404) are raised immediately, because repeating
    the request cannot succeed and only slows the crawl down.

    Args:
        url: Absolute URL to request.
        retries: Number of additional attempts after the first one.

    Returns:
        The body decoded with the charset announced by the server (UTF-8
        when none is announced). If that charset is unknown or the bytes do
        not match it, the body is decoded as UTF-8 with replacement
        characters instead of failing the whole page.

    Raises:
        urllib.error.HTTPError, urllib.error.URLError, TimeoutError: the
            last error once the attempts are exhausted, or a non-retryable
            HTTP error straight away.
    """
    req = urllib.request.Request(url, headers={
        "User-Agent": USER_AGENT,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "de-AT,de;q=0.9,en;q=0.5",
    })
    for attempt in range(retries + 1):
        try:
            with urllib.request.urlopen(req, timeout=30, context=_ssl_ctx) as resp:
                charset = resp.headers.get_content_charset() or "utf-8"
                body = resp.read()
        except urllib.error.HTTPError as err:
            # HTTPError is a URLError subclass, so it has to be caught first.
            retryable = err.code >= 500 or err.code == 429
            if not retryable or attempt >= retries:
                raise
        except (urllib.error.URLError, TimeoutError):
            if attempt >= retries:
                raise
        else:
            try:
                return body.decode(charset)
            except (LookupError, UnicodeDecodeError):
                return body.decode("utf-8", errors="replace")
        time.sleep(2)
    # Only reachable when ``retries`` is negative (no attempt is made).
    return ""
def match_species(latin_name: str, species_map: dict) -> Optional[dict]:
    """Match a Latin name to an existing species.

    Matching is attempted in decreasing order of reliability:

    1. the cleaned name (trailing author abbreviation and any
       ``ssp.``/``subsp.``/``var.`` rank removed), lower-cased;
    2. the first two words of that name (genus + epithet);
    3. the genus alone, but only when exactly one species of that genus is
       present in ``species_map``.

    Step 3 used to return whichever same-genus species came first in the
    map, which silently attached cultivars to the wrong species as soon as
    two species of one genus existed. An ambiguous genus now yields ``None``.

    Args:
        latin_name: Botanical name as scraped; may be empty.
        species_map: Mapping of lower-case scientific name -> species dict.

    Returns:
        The matching species dict, or ``None`` when there is no
        unambiguous match.
    """
    if not latin_name:
        return None

    # Clean the name: remove author citations, subspecies
    clean = latin_name.strip()
    clean = re.sub(r'\s+L\.\s*$', '', clean)
    clean = re.sub(r'\s+[A-Z][a-z]*\.\s*$', '', clean)
    clean = re.sub(r'\s+(?:ssp|subsp|var)\.\s+\S+', '', clean)

    key = clean.lower().strip()
    if key in species_map:
        return species_map[key]

    parts = key.split()
    if not parts:
        return None

    # Try genus + species (first two words)
    if len(parts) >= 2:
        two = f"{parts[0]} {parts[1]}"
        if two in species_map:
            return species_map[two]

    # Genus-only fallback, accepted only when the genus is unambiguous.
    genus_prefix = parts[0] + " "
    same_genus = [
        species for name, species in species_map.items()
        if name.startswith(genus_prefix)
    ]
    if len(same_genus) == 1:
        return same_genus[0]

    return None
r'(\d+(?:[.,]\d+)?)\s*[-–]\s*(\d+(?:[.,]\d+)?)\s*cm\s+(?:tief|Tiefe)', + ] + for pat in depth_pats: + dm = re.search(pat, full_text, re.IGNORECASE) + if dm: + vals = [float(dm.group(i).replace(",", ".")) for i in range(1, dm.lastindex + 1)] + product.sowing_depth_cm = sum(vals) / len(vals) + break + + # Fallback: look in raw HTML for common depth patterns like "0,5–1 cm" near depth keywords + if product.sowing_depth_cm is None: + dm = re.search( + r'(?:Saattiefe|Ablagetiefe|Aussaattiefe|Saatgutablage)\D{0,30}?(\d+(?:[.,]\d+)?)\s*[-–]\s*(\d+(?:[.,]\d+)?)\s*cm', + html, re.IGNORECASE + ) + if dm: + d1 = float(dm.group(1).replace(",", ".")) + d2 = float(dm.group(2).replace(",", ".")) + product.sowing_depth_cm = (d1 + d2) / 2 + + # ── Spacing ── + # Look for "ROW x PLANT cm" patterns + spacing_pats = [ + # "30–40 x 2–4 cm" (range x range) + r'(\d+)\s*[-–]\s*(\d+)\s*[x×]\s*(\d+)\s*[-–]\s*(\d+)\s*cm', + # "100 x 50 cm" (simple) + r'(\d+(?:[.,]\d+)?)\s*[x×]\s*(\d+(?:[.,]\d+)?)\s*cm', + ] + for pat in spacing_pats: + matches = re.findall(pat, full_text, re.IGNORECASE) + if matches: + # Prefer the last match (often the more relevant outdoor spacing) + m = matches[-1] + if len(m) == 4: + product.row_spacing_cm = (float(m[0]) + float(m[1])) / 2 + product.plant_spacing_cm = (float(m[2]) + float(m[3])) / 2 + elif len(m) == 2: + v1 = float(m[0].replace(",", ".")) + v2 = float(m[1].replace(",", ".")) + product.row_spacing_cm = v1 + product.plant_spacing_cm = v2 + break + + # ── Germination temperature ── + temp_pats = [ + r'(?:Keimtemperatur|Keimtemp)[.:\s]*(?:ca\.?\s*)?(\d+)\s*[-–]\s*(\d+)\s*°?\s*C', + r'(\d+)\s*[-–und ]*\s*(\d+)\s*°\s*C', + r'(?:mindestens|mind\.)\s*(\d+)\s*°\s*C', + ] + for pat in temp_pats: + tm = re.search(pat, full_text, re.IGNORECASE) + if tm: + vals = [float(tm.group(i)) for i in range(1, tm.lastindex + 1)] + # Sanity check: germination temps are typically 5-35°C + avg = sum(vals) / len(vals) + if 5 <= avg <= 40: + product.germination_temp_c = avg + break 
+ + # ── Perennial ── + perennial_pats = [r'mehrj[aä]hrig', r'winterhart', r'ausdauernd', r'Halbstrauch', r'Staude'] + for pat in perennial_pats: + if re.search(pat, full_text, re.IGNORECASE): + product.perennial = True + break + + return product + + +# ── Recursive product discovery ───────────────────────────────────────────── +def discover_products( + category_url: str, + default_species: Optional[str], + max_depth: int = 3, + _depth: int = 0, + _visited: set = None, +) -> list[ProductData]: + """Recursively discover and parse product pages under a category URL.""" + if _visited is None: + _visited = set() + if category_url in _visited or _depth > max_depth: + return [] + _visited.add(category_url) + + indent = " " * (_depth + 1) + print(f"{indent}Fetching: {category_url}") + + try: + html = fetch_url(category_url) + time.sleep(DELAY) + except Exception as e: + print(f"{indent} ERROR: {e}") + return [] + + # Check if this IS a product page + product = parse_product(html, category_url, default_species) + if product: + return [product] + + # It's a category/subcategory page: extract child links + cat_path = urllib.parse.urlparse(category_url).path.rstrip("/") + child_links = [] + for link in extract_links(html, category_url): + parsed = urllib.parse.urlparse(link) + if parsed.netloc and parsed.netloc != "www.reinsaat.at": + continue + child_path = parsed.path.rstrip("/") + # Must be a direct child of the category path + if not child_path.startswith(cat_path + "/"): + continue + relative = child_path[len(cat_path) + 1:] + # Must be exactly one level deeper (no further slashes) + if "/" in relative: + continue + # Skip empty or same-path + if not relative: + continue + # Build clean URL + clean_url = f"https://www.reinsaat.at{child_path}/" + if clean_url not in _visited: + child_links.append(clean_url) + + # Deduplicate + child_links = list(dict.fromkeys(child_links)) + print(f"{indent} Found {len(child_links)} child links") + + products = [] + for child_url in 
child_links: + results = discover_products(child_url, default_species, max_depth, _depth + 1, _visited) + products.extend(results) + + return products + + +# ── Main ──────────────────────────────────────────────────────────────────── +def main(): + print("=" * 70) + print("Reinsaat Scraper -> HerbAPI") + print("=" * 70) + + # Load species + print("\n[1] Loading species from API...") + species_map = load_species() + sci_names = [k for k in species_map if " " in k] + print(f" {len(sci_names)} species loaded:") + for k in sorted(sci_names): + s = species_map[k] + print(f" {s['name_scientific']:40s} {s['id'][:12]}...") + + # Load existing cultivars + print("\n[2] Loading existing cultivars...") + existing_cultivars = {} # (species_id, name_lower) -> cultivar_id + page = 1 + while True: + data = api_get(f"/cultivars?per_page=100&page={page}") + clist = data.get("data", data) if isinstance(data, dict) else data + if not clist: + break + for c in clist: + existing_cultivars[(c["species_id"], c["name"].lower())] = c["id"] + # Check pagination - API uses {data, total, page, per_page} format + if isinstance(data, dict): + total = data.get("total", len(clist)) + per_page = data.get("per_page", 100) + if page * per_page >= total: + break + else: + break + page += 1 + print(f" {len(existing_cultivars)} existing cultivars") + + # Discover products from all categories + print("\n[3] Discovering products from Reinsaat categories...") + all_products: list[ProductData] = [] + visited: set[str] = set() + + for cat_url, species_hint in CATEGORIES: + print(f"\n Category: {cat_url}") + products = discover_products(cat_url, species_hint, max_depth=3, _visited=visited) + all_products.extend(products) + print(f" -> {len(products)} products from this category") + + print(f"\n Total products discovered: {len(all_products)}") + + # Deduplicate by URL + seen_urls = set() + unique_products = [] + for p in all_products: + if p.url not in seen_urls: + seen_urls.add(p.url) + 
unique_products.append(p) + all_products = unique_products + print(f" Unique products: {len(all_products)}") + + # Process products + print("\n[4] Creating cultivars in API...") + stats = {"created": 0, "skipped_no_species": 0, "skipped_exists": 0, "errors": 0, "linked": 0} + + for i, product in enumerate(all_products): + pct = (i + 1) / len(all_products) * 100 + print(f"\n [{i+1}/{len(all_products)}] ({pct:.0f}%) {product.name}") + + # Match species + species = match_species(product.latin_name, species_map) + if not species: + print(f" Skip: no species match for '{product.latin_name}'") + stats["skipped_no_species"] += 1 + continue + + species_id = species["id"] + print(f" Species: {species['name_scientific']}") + print(f" SKU: {product.sku}, Depth: {product.sowing_depth_cm}, " + f"Spacing: {product.row_spacing_cm}x{product.plant_spacing_cm}, " + f"Temp: {product.germination_temp_c}, Perennial: {product.perennial}") + + # Check duplicates + key = (species_id, product.name.lower()) + if key in existing_cultivars: + # Still try to link supplier if cultivar exists + cultivar_id = existing_cultivars[key] + print(f" Exists: {cultivar_id[:12]}... 
- checking supplier link") + try: + api_post(f"/cultivars/{cultivar_id}/suppliers", { + "supplier_id": REINSAAT_SUPPLIER_ID, + "product_url": product.url, + "article_number": product.sku, + }) + print(f" Linked to Reinsaat (SKU: {product.sku})") + stats["linked"] += 1 + except Exception: + pass # Already linked or other error + stats["skipped_exists"] += 1 + continue + + # Build payload + payload = { + "species_id": species_id, + "name": product.name, + "name_de": product.name, + "name_en": "", + "description": product.description, + "is_organic": product.is_organic, + "perennial": product.perennial, + } + if product.sowing_depth_cm is not None: + payload["planting_depth_cm"] = round(product.sowing_depth_cm, 2) + if product.row_spacing_cm is not None: + payload["row_spacing_cm"] = round(product.row_spacing_cm, 1) + if product.plant_spacing_cm is not None: + payload["plant_spacing_cm"] = round(product.plant_spacing_cm, 1) + if product.germination_temp_c is not None: + payload["germination_temp_c"] = round(product.germination_temp_c, 1) + + # Create cultivar + try: + result = api_post("/cultivars", payload) + cultivar_id = result["id"] + print(f" Created: {cultivar_id}") + stats["created"] += 1 + existing_cultivars[key] = cultivar_id + except Exception as e: + print(f" FAILED to create: {e}") + stats["errors"] += 1 + continue + + # Link to supplier + try: + api_post(f"/cultivars/{cultivar_id}/suppliers", { + "supplier_id": REINSAAT_SUPPLIER_ID, + "product_url": product.url, + "article_number": product.sku, + }) + print(f" Linked to Reinsaat (SKU: {product.sku})") + stats["linked"] += 1 + except Exception as e: + print(f" FAILED to link supplier: {e}") + + # Summary + print("\n" + "=" * 70) + print("SUMMARY") + print("=" * 70) + print(f" Created: {stats['created']}") + print(f" Linked to supplier: {stats['linked']}") + print(f" Skipped (no species): {stats['skipped_no_species']}") + print(f" Skipped (exists): {stats['skipped_exists']}") + print(f" Errors: 
# ── Config ──────────────────────────────────────────────────────────────────
import os  # kept next to its only use; move into the stdlib import group when convenient

# Connection settings can be overridden per environment. Host, database and
# user keep their previous values as defaults.
DB_HOST = os.environ.get("HERBAPI_DB_HOST", "10.31.3.90")
DB_NAME = os.environ.get("HERBAPI_DB_NAME", "herbapi")
DB_USER = os.environ.get("HERBAPI_DB_USER", "herbapi")
# SECURITY: the database password used to be hard-coded here. A secret
# committed to version control must be treated as leaked: rotate it, and
# supply the new one through HERBAPI_DB_PASS. There is deliberately no
# default value.
DB_PASS = os.environ.get("HERBAPI_DB_PASS", "")

REINSAAT_SUPPLIER_ID = "019ced24-1702-72d1-9acc-90435441a5c4"
DELAY = 0.3
USER_AGENT = "HerbAPI-Scraper/2.0 (florian.berthold@sub-net.at)"
"https://www.reinsaat.at/shop/DE/pastinaken_1/", + "https://www.reinsaat.at/shop/DE/petersilie/", + "https://www.reinsaat.at/shop/DE/pfefferoni_chili/", + "https://www.reinsaat.at/shop/DE/porree/", + "https://www.reinsaat.at/shop/DE/radies_rettich/", + "https://www.reinsaat.at/shop/DE/rote_ruebe/", + "https://www.reinsaat.at/shop/DE/salate/", + "https://www.reinsaat.at/shop/DE/schwarzwurzeln/", + "https://www.reinsaat.at/shop/DE/sellerie/", + "https://www.reinsaat.at/shop/DE/spinat/", + "https://www.reinsaat.at/shop/DE/tomaten_paradeiser/", + "https://www.reinsaat.at/shop/DE/wurzelpetersilie_1/", + "https://www.reinsaat.at/shop/DE/zucchini/", + "https://www.reinsaat.at/shop/DE/zwiebel_knoblauch/", + "https://www.reinsaat.at/shop/DE/kuechen-_und_gewuerzkraeuter/", + "https://www.reinsaat.at/shop/DE/blumen_und_heilkraeuter/", + "https://www.reinsaat.at/shop/DE/gruenduengung/", +] + +# ── HTTP ──────────────────────────────────────────────────────────────────── +_ssl_ctx = ssl.create_default_context() + + +def fetch_url(url: str, retries: int = 2) -> str: + req = urllib.request.Request(url, headers={ + "User-Agent": USER_AGENT, + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Language": "de-AT,de;q=0.9,en;q=0.5", + }) + for attempt in range(retries + 1): + try: + with urllib.request.urlopen(req, timeout=30, context=_ssl_ctx) as resp: + charset = resp.headers.get_content_charset() or "utf-8" + return resp.read().decode(charset) + except (urllib.error.URLError, urllib.error.HTTPError, TimeoutError) as e: + if attempt < retries: + time.sleep(2) + continue + raise + return "" + + +# ── HTML parsing helpers ──────────────────────────────────────────────────── +def extract_links(html_text: str, base_url: str) -> list[str]: + links = [] + seen = set() + for m in re.finditer(r']*href="([^"]*)"', html_text, re.IGNORECASE): + href = m.group(1) + if not href or href.startswith("#") or href.startswith("javascript:"): + continue + full = 
def html_to_text(html_text: str) -> str:
    """Return the plain text of an HTML fragment.

    Tags are replaced by a space first, then character references are
    decoded, and finally every run of whitespace is collapsed to a single
    space with leading/trailing whitespace removed. Because decoding
    happens after tag removal, escaped markup such as ``&lt;b&gt;`` comes
    out as literal ``<b>`` text.
    """
    without_tags = re.sub(r'<[^>]+>', ' ', html_text)
    decoded = html_mod.unescape(without_tags)
    return re.sub(r'\s+', ' ', decoded).strip()
content. + Fallback: tags in growing infos. + + Returns the raw text (may include authority names, infraspecific ranks, etc.) + """ + # Primary: kurztext div + m = re.search(r'class="fce_shop_kurztext"[^>]*>(.*?)
def normalize_latin_name(raw: str) -> str:
    """
    Extract genus + species from an extended botanical name.

    Examples:
        "Pisum sativum L. convar. sat." -> "Pisum sativum"
        "Capsicum annuum L." -> "Capsicum annuum"
        "Brassica oleracea L. convar. botrytis" -> "Brassica oleracea"
        "Solanum lycopersicum L." -> "Solanum lycopersicum"
        "Cucumis sativus" -> "Cucumis sativus"
        "Mentha x piperita" -> "Mentha x piperita"
        "Mentha ×piperita" -> "Mentha x piperita"
        "Pisum sativum, Markerbse" -> "Pisum sativum"

    Hybrids are always written with a spaced ASCII "x", whether the input
    uses "x" or "×" and whether or not the sign is attached to the epithet.
    Punctuation glued to the epithet is removed. Input that does not look
    like "Genus epithet" (fewer than two words, lower-case genus, or an
    epithet not starting with a lower-case letter) is returned unchanged
    apart from the outer trimming.

    Returns "" for empty input.
    """
    if not raw:
        return ""

    punctuation = ".,;:"

    # Clean up: outer whitespace, then leading/trailing punctuation.
    name = raw.strip().strip(punctuation).strip()

    words = name.split()
    if len(words) < 2:
        return name

    genus, epithet = words[0], words[1]

    # Hybrid with a separate sign: "Mentha x piperita" / "Mentha × piperita"
    if epithet in ("x", "×") and len(words) >= 3:
        hybrid = words[2].strip(punctuation)
        return f"{genus} x {hybrid}" if hybrid else name

    # Hybrid with the sign attached to the epithet: "Mentha ×piperita"
    if epithet.startswith("×") and len(epithet) > 1:
        hybrid = epithet[1:].strip(punctuation)
        return f"{genus} x {hybrid}" if hybrid else name

    # "sativum," -> "sativum"
    epithet = epithet.strip(punctuation)

    # Validate: genus should start uppercase, species lowercase
    if not epithet or not genus[0].isupper() or not epithet[0].islower():
        return name  # Can't parse, return as-is

    return f"{genus} {epithet}"
+ """ + result = {} + + cal_match = re.search(r'class="rs-growing-time[^"]*"(.*?)', html_text, re.DOTALL) + if not cal_match: + return result + + cal = cal_match.group(1) + rows = re.findall(r'(.*?)', cal, re.DOTALL) + + for row in rows: + # Get label + label_m = re.search(r'class="type-lable"[^>]*>(.*?)', row, re.DOTALL) + if not label_m: + continue + label = html_to_text(label_m.group(1)).strip().lower() + + # Map label to our field + field_name = None + for pattern, fname in CALENDAR_ROW_TYPES.items(): + if pattern in label: + field_name = fname + break + if not field_name: + continue + + # Extract background colors for each cell (24 cells = 12 months x 2 halves) + colors = re.findall(r'background-color:\s*([^;"]+)', row) + + # Convert to months: cell i maps to month (i // 2) + 1 + active_months = set() + for i, color in enumerate(colors): + color = color.strip().lower() + if color != "none" and color != "transparent" and color != "": + month = (i // 2) + 1 + if 1 <= month <= 12: + active_months.add(month) + + if active_months: + # Merge if same field already found (e.g. 
# ── Growing data extraction ─────────────────────────────────────────────────
def extract_growing_data(html_text: str) -> dict:
    """Extract sowing depth, spacing, germination temperature and a perennial flag.

    Locates the ``growingInfos`` element in *html_text*, flattens it to
    whitespace-normalised plain text and scans that text with
    German-language patterns.

    Returns a dict containing only the keys that were found:
    ``planting_depth_cm``, ``row_spacing_cm``, ``plant_spacing_cm``,
    ``germination_temp_c`` (floats) and ``perennial`` (``True``).
    An empty dict is returned when the element is not present.
    """
    data = {}

    # Get the growingInfos text
    # NOTE(review): as written, the lazy group is the last item in the pattern,
    # so it can only ever capture an empty string. The closing delimiter looks
    # like it was lost — confirm the pattern against the real file.
    gi = re.search(r'class="growingInfos"[^>]*>(.*?)', html_text, re.DOTALL | re.IGNORECASE)
    if not gi:
        return data

    # full_text is not read again anywhere in this function.
    full_text = html_to_text(gi.group(1))
    # Also get the raw HTML for better entity handling
    raw_html = gi.group(1)
    # Convert HTML entities for pattern matching
    raw_text = html_mod.unescape(re.sub(r'<[^>]+>', ' ', raw_html))
    raw_text = re.sub(r'\s+', ' ', raw_text)

    # ── Sowing depth ──
    # Range form is tried first; a range is stored as the mean of both ends.
    depth_pats = [
        r'(?:Saattiefe|Aussaattiefe|Ablagetiefe|Saatgutablage)[:\s]*(?:ca\.?\s*)?(\d+(?:[.,]\d+)?)\s*[-–]\s*(\d+(?:[.,]\d+)?)\s*cm',
        r'(?:Saattiefe|Aussaattiefe|Ablagetiefe|Saatgutablage)[:\s]*(?:ca\.?\s*)?(\d+(?:[.,]\d+)?)\s*cm',
    ]
    for pat in depth_pats:
        dm = re.search(pat, raw_text, re.IGNORECASE)
        if dm:
            # Decimal commas are converted to points before parsing.
            vals = [float(dm.group(i).replace(",", ".")) for i in range(1, dm.lastindex + 1)]
            data["planting_depth_cm"] = round(sum(vals) / len(vals), 2)
            break

    # ── Spacing: "ROW x PLANT cm" ──
    spacing_pats = [
        # "30–45 x 3–5 cm" (range x range)
        r'(\d+)\s*[-–]\s*(\d+)\s*[x×]\s*(\d+)\s*[-–]\s*(\d+)\s*cm',
        # "100 x 50 cm" (simple)
        r'(\d+(?:[.,]\d+)?)\s*[x×]\s*(\d+(?:[.,]\d+)?)\s*cm',
    ]
    for pat in spacing_pats:
        matches = re.findall(pat, raw_text, re.IGNORECASE)
        if matches:
            m = matches[-1]  # prefer last match
            if len(m) == 4:
                # Four groups: each dimension is the midpoint of its range.
                data["row_spacing_cm"] = round((float(m[0]) + float(m[1])) / 2, 1)
                data["plant_spacing_cm"] = round((float(m[2]) + float(m[3])) / 2, 1)
            elif len(m) == 2:
                v1 = float(m[0].replace(",", "."))
                v2 = float(m[1].replace(",", "."))
                data["row_spacing_cm"] = round(v1, 1)
                data["plant_spacing_cm"] = round(v2, 1)
            break

    # ── Germination temperature ──
    temp_pats = [
        r'(?:Keimtemperatur|Keimtemp)[.:\s]*(?:ca\.?\s*)?(\d+)\s*[-–]\s*(\d+)\s*[°]?\s*C',
        r'(?:mindestens|mind\.)\s*(\d+)\s*°\s*C',
    ]
    for pat in temp_pats:
        tm = re.search(pat, raw_text, re.IGNORECASE)
        if tm:
            vals = [float(tm.group(i)) for i in range(1, tm.lastindex + 1)]
            avg = sum(vals) / len(vals)
            # Only averages within 5..40 are stored; the first matching
            # pattern ends the search either way.
            if 5 <= avg <= 40:
                data["germination_temp_c"] = round(avg, 1)
            break

    # ── Perennial ──
    # Any one of these keywords sets the flag; the key is absent otherwise.
    perennial_pats = [r'mehrj[aä]hrig', r'winterhart', r'ausdauernd', r'Halbstrauch', r'Staude']
    for pat in perennial_pats:
        if re.search(pat, raw_text, re.IGNORECASE):
            data["perennial"] = True
            break

    return data


# ── Product data ────────────────────────────────────────────────────────────
@dataclass
class ProductData:
    # Container for everything scraped from one product page.
    name: str = ""
    raw_latin_name: str = ""
    normalized_latin: str = ""
    description: str = ""
    sku: str = ""
    url: str = ""
    is_organic: bool = True  # defaults to organic
    # default_factory gives every instance its own dict.
    growing_data: dict = field(default_factory=dict)
    calendar: dict = field(default_factory=dict)
def _jsonld_text(jsonld: dict, key: str) -> str:
    """Return ``jsonld[key]`` as stripped text, or ``""`` when missing or null."""
    value = jsonld.get(key)
    return "" if value is None else str(value).strip()


def parse_product(html_text: str, url: str) -> Optional[ProductData]:
    """Parse a product page.

    Args:
        html_text: Full HTML of the fetched page.
        url: URL the page was fetched from; stored on the result.

    Returns:
        A populated ``ProductData``, or ``None`` when the page carries no
        JSON-LD product record (i.e. it is not a product page).
    """
    jsonld = extract_jsonld_product(html_text)
    if not jsonld:
        return None

    product = ProductData(url=url)
    # JSON-LD values may be null or numeric (e.g. a numeric "model"), so they
    # are coerced to text before stripping instead of calling .strip() blindly.
    product.name = _jsonld_text(jsonld, "name")
    product.description = _jsonld_text(jsonld, "description")
    product.sku = _jsonld_text(jsonld, "model")

    # Extract and normalize botanical name
    product.raw_latin_name = extract_botanical_name(html_text)
    product.normalized_latin = normalize_latin_name(product.raw_latin_name)

    # Extract growing data
    product.growing_data = extract_growing_data(html_text)

    # Parse calendar
    product.calendar = parse_calendar(html_text)

    # Every product is recorded as organic; no per-page check is performed.
    product.is_organic = True

    return product


# ── Recursive discovery ─────────────────────────────────────────────────────
def discover_products(
    category_url: str,
    max_depth: int = 4,
    _depth: int = 0,
    _visited: Optional[set] = None,
) -> list[ProductData]:
    """Walk a category tree depth-first and collect every product page.

    Args:
        category_url: Page to start from (category or product).
        max_depth: Deepest recursion level that is still fetched.
        _depth: Current recursion level (internal).
        _visited: URLs already handled, shared across the whole walk so no
            page is fetched twice (internal; created on first call).

    Returns:
        Products found at or below *category_url*. A page that fails to
        download is reported on stdout and contributes nothing.
    """
    if _visited is None:
        _visited = set()
    if category_url in _visited or _depth > max_depth:
        return []
    _visited.add(category_url)

    indent = " " * (_depth + 1)

    try:
        html_text = fetch_url(category_url)
        time.sleep(DELAY)  # throttle between requests
    except Exception as e:
        print(f"{indent}ERROR fetching {category_url}: {e}")
        return []

    # A product page is a leaf: return it without looking for links.
    product = parse_product(html_text, category_url)
    if product:
        return [product]

    # Category page: keep only links exactly one path segment below it.
    cat_path = urllib.parse.urlparse(category_url).path.rstrip("/")
    child_links = []
    for link in extract_links(html_text, category_url):
        parsed = urllib.parse.urlparse(link)
        if parsed.netloc and parsed.netloc != "www.reinsaat.at":
            continue  # external site
        child_path = parsed.path.rstrip("/")
        if not child_path.startswith(cat_path + "/"):
            continue  # not underneath this category
        relative = child_path[len(cat_path) + 1:]
        if "/" in relative or not relative:
            continue  # deeper descendants are reached via recursion
        clean_url = f"https://www.reinsaat.at{child_path}/"
        if clean_url not in _visited:
            child_links.append(clean_url)

    # Drop duplicates while keeping document order.
    child_links = list(dict.fromkeys(child_links))
    print(f"{indent}Category {category_url} -> {len(child_links)} children")

    products = []
    for child_url in child_links:
        products.extend(discover_products(child_url, max_depth, _depth + 1, _visited))

    return products
# ── Slug generation ─────────────────────────────────────────────────────────
# German transliterations that must win over plain accent stripping
# (NFKD alone would turn "ü" into "u" rather than "ue").
_SLUG_TRANSLITERATION = str.maketrans({"ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss"})


def make_slug(species_name: str, cultivar_name: str) -> str:
    """Generate a URL-friendly slug from a species and cultivar name.

    The two names are joined with a hyphen and lower-cased. German umlauts
    and ``ß`` are transliterated (``ü`` -> ``ue``), every other accented
    letter is reduced to its base letter (``é`` -> ``e``, ``č`` -> ``c``),
    and each remaining run of non ``[a-z0-9]`` characters becomes a single
    hyphen. Leading and trailing hyphens are removed.

    Returns:
        The slug; an empty string when nothing sluggable remains.
    """
    import unicodedata  # local import keeps this block self-contained

    # NFC first so decomposed input ("u" + combining diaeresis) still hits
    # the German table.
    raw = unicodedata.normalize("NFC", f"{species_name}-{cultivar_name}".lower())
    raw = raw.translate(_SLUG_TRANSLITERATION)

    # Fold remaining diacritics: decompose, then drop the combining marks so
    # the base letter survives instead of being replaced by a hyphen.
    decomposed = unicodedata.normalize("NFKD", raw)
    folded = "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    # "+" already collapses runs, so no second hyphen-squeezing pass is needed.
    return re.sub(r"[^a-z0-9]+", "-", folded).strip("-")


# ── Main ────────────────────────────────────────────────────────────────────
def db_connect():
    """Create a fresh DB connection.

    Returns:
        A psycopg2 connection with autocommit disabled, so the caller
        decides when to commit.
    """
    conn = psycopg2.connect(
        host=DB_HOST, dbname=DB_NAME, user=DB_USER, password=DB_PASS
    )
    conn.autocommit = False
    return conn
# Columns filled in on an existing cultivar when its stored value is empty.
# Tuple order fixes the order of the generated SET clause and of the log line.
_GROWING_COLUMNS = (
    "planting_depth_cm", "row_spacing_cm", "plant_spacing_cm", "germination_temp_c",
)
_CALENDAR_COLUMNS = (
    "indoor_sowing_months", "direct_sowing_months", "transplanting_months",
    "glasshouse_months", "harvesting_months",
)


def _load_existing(cur):
    """Read species, cultivars and Reinsaat supplier links into lookup tables."""
    cur.execute("SELECT id, name_scientific FROM species ORDER BY name_scientific")
    species_map = {row["name_scientific"].lower().strip(): row for row in cur.fetchall()}
    print(f" {len(species_map)} species loaded")

    cur.execute("""
        SELECT id, species_id, name, slug, description,
               row_spacing_cm, plant_spacing_cm, planting_depth_cm,
               germination_temp_c, perennial,
               indoor_sowing_months, direct_sowing_months,
               transplanting_months, glasshouse_months, harvesting_months
        FROM cultivars
    """)
    existing_cultivars = {}
    existing_slugs = set()
    for row in cur.fetchall():
        existing_cultivars[(str(row["species_id"]), row["name"].lower())] = dict(row)
        existing_slugs.add(row["slug"])
    print(f" {len(existing_cultivars)} cultivars loaded")

    cur.execute("""
        SELECT cultivar_id, product_url, article_number
        FROM cultivar_suppliers
        WHERE supplier_id = %s
    """, (REINSAAT_SUPPLIER_ID,))
    existing_links = {}
    for row in cur.fetchall():
        existing_links.setdefault(str(row["cultivar_id"]), []).append(
            (row["product_url"] or "", row["article_number"] or "")
        )
    print(f" {sum(len(v) for v in existing_links.values())} existing links for {len(existing_links)} cultivars")
    return species_map, existing_cultivars, existing_slugs, existing_links


def _match_species(product, species_map):
    """Find the species row for a product, by normalized name then by raw genus + epithet."""
    species = species_map.get(product.normalized_latin.lower().strip())
    if species:
        return species
    raw_words = product.raw_latin_name.split()
    if len(raw_words) >= 2:
        return species_map.get(f"{raw_words[0].lower()} {raw_words[1].lower()}")
    return None


def _collect_missing_updates(existing, product):
    """Return {column: value} for scraped values the stored cultivar still lacks."""
    updates = {}
    gd = product.growing_data
    for col in _GROWING_COLUMNS:
        if gd.get(col) and not existing.get(col):
            updates[col] = gd[col]
    if gd.get("perennial") and not existing.get("perennial"):
        updates["perennial"] = True
    cal = product.calendar
    for col in _CALENDAR_COLUMNS:
        if cal.get(col) and not existing.get(col):
            updates[col] = cal[col]
    if product.description and not existing.get("description"):
        updates["description"] = product.description
    return updates


def _enrich_cultivar(cur, cultivar_id, updates, label, stats):
    """Apply *updates* to one cultivar inside its own savepoint."""
    # Column names come from the fixed tuples above, never from scraped text,
    # so interpolating them is safe; all values stay bound parameters.
    assignments = [f"{col} = %s" for col in updates] + ["updated_at = NOW()"]
    params = [*updates.values(), cultivar_id]
    try:
        # The savepoint keeps one failed UPDATE from aborting the whole
        # transaction and thereby every later statement of the run.
        cur.execute("SAVEPOINT enrich_sp")
        cur.execute(
            f"UPDATE cultivars SET {', '.join(assignments)} WHERE id = %s::uuid",
            params,
        )
        cur.execute("RELEASE SAVEPOINT enrich_sp")
    except Exception as e:
        print(f"{label} -> ENRICH ERROR: {e}")
        cur.execute("ROLLBACK TO SAVEPOINT enrich_sp")
        stats["errors"] += 1
        return
    stats["enriched"] += 1
    print(f"{label} -> ENRICHED ({', '.join(updates.keys())})")


def _link_supplier(cur, cultivar_id, product, label, existing_links, stats):
    """Add the Reinsaat supplier link for a cultivar unless it is already known."""
    known = existing_links.get(cultivar_id, [])
    if any(lurl == product.url or (lsku and lsku == product.sku) for lurl, lsku in known):
        stats["link_exists"] += 1
        return
    try:
        cur.execute("SAVEPOINT link_sp")
        cur.execute("""
            INSERT INTO cultivar_suppliers (cultivar_id, supplier_id, product_url, article_number, last_checked_at)
            VALUES (%s::uuid, %s::uuid, %s, %s, NOW())
            ON CONFLICT (cultivar_id, supplier_id, article_number) DO UPDATE
            SET product_url = EXCLUDED.product_url, last_checked_at = NOW()
        """, (cultivar_id, REINSAAT_SUPPLIER_ID, product.url, product.sku))
        cur.execute("RELEASE SAVEPOINT link_sp")
    except Exception as e:
        print(f"{label} -> LINK ERROR: {e}")
        cur.execute("ROLLBACK TO SAVEPOINT link_sp")
        stats["errors"] += 1
        return
    stats["linked"] += 1
    existing_links.setdefault(cultivar_id, []).append((product.url, product.sku))
    print(f"{label} -> LINKED ({product.sku})")


def _create_cultivar(cur, species, product, ckey, label,
                     existing_cultivars, existing_slugs, existing_links, stats):
    """Insert a new cultivar plus its supplier link as one atomic unit."""
    species_id = str(species["id"])
    species_name = species["name_scientific"]

    # Ensure unique slug: append -2, -3, ... until it is unused.
    base_slug = make_slug(species_name, product.name)
    slug = base_slug
    counter = 2
    while slug in existing_slugs:
        slug = f"{base_slug}-{counter}"
        counter += 1

    gd = product.growing_data
    cal = product.calendar
    try:
        cur.execute("SAVEPOINT create_sp")
        cur.execute("""
            INSERT INTO cultivars (
                species_id, name, name_de, slug, description,
                is_organic, perennial,
                planting_depth_cm, row_spacing_cm, plant_spacing_cm,
                germination_temp_c,
                indoor_sowing_months, direct_sowing_months,
                transplanting_months, glasshouse_months, harvesting_months
            ) VALUES (
                %s::uuid, %s, %s, %s, %s,
                %s, %s,
                %s, %s, %s,
                %s,
                %s, %s,
                %s, %s, %s
            )
            RETURNING id
        """, (
            species_id,
            product.name,
            product.name,
            slug,
            product.description,
            product.is_organic,
            gd.get("perennial", False),
            gd.get("planting_depth_cm"),
            gd.get("row_spacing_cm"),
            gd.get("plant_spacing_cm"),
            gd.get("germination_temp_c"),
            cal.get("indoor_sowing_months"),
            cal.get("direct_sowing_months"),
            cal.get("transplanting_months"),
            cal.get("glasshouse_months"),
            cal.get("harvesting_months"),
        ))
        new_id = str(cur.fetchone()["id"])

        # Link to supplier
        cur.execute("""
            INSERT INTO cultivar_suppliers (cultivar_id, supplier_id, product_url, article_number, last_checked_at)
            VALUES (%s::uuid, %s::uuid, %s, %s, NOW())
        """, (new_id, REINSAAT_SUPPLIER_ID, product.url, product.sku))
        cur.execute("RELEASE SAVEPOINT create_sp")
    except Exception as e:
        print(f"{label} -> CREATE ERROR: {e}")
        cur.execute("ROLLBACK TO SAVEPOINT create_sp")
        stats["errors"] += 1
        return

    # Caches and counters are touched only after the savepoint is released, so
    # a rolled-back insert can never leave a phantom id behind for a later
    # product with the same name to reference.
    existing_slugs.add(slug)
    existing_cultivars[ckey] = {"id": new_id}
    existing_links.setdefault(new_id, []).append((product.url, product.sku))
    stats["created"] += 1
    stats["linked"] += 1
    print(f"{label} -> CREATED ({species_name}, {slug})")


def _print_summary(all_products, stats, unmatched):
    """Print the run statistics and the list of products without a species match."""
    print("\n" + "=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f" Total products discovered: {len(all_products)}")
    print(f" New cultivars created: {stats['created']}")
    print(f" New supplier links added: {stats['linked']}")
    print(f" Cultivars enriched: {stats['enriched']}")
    print(f" Links already existed: {stats['link_exists']}")
    print(f" Skipped (no species): {stats['skipped_no_species']}")
    print(f" Skipped (no name): {stats['skipped_no_name']}")
    print(f" Errors: {stats['errors']}")
    print("=" * 70)

    if unmatched:
        print(f"\n UNMATCHED PRODUCTS ({len(unmatched)}):")
        for name, raw_latin, normalized, url in sorted(unmatched, key=lambda x: x[2]):
            print(f" {normalized:30s} (raw: {raw_latin:40s}) {name:30s} {url}")


def main():
    """Scrape all Reinsaat categories and sync the products into the database.

    Phase 1 discovers products over HTTP (no DB needed). Phase 2 loads the
    existing species, cultivars and supplier links. Phase 3 enriches or
    creates cultivars and adds supplier links. Everything is committed once
    at the end; on any unexpected exception the transaction is rolled back
    and the connection is closed before the error propagates.
    """
    print("=" * 70)
    print("Reinsaat Scraper v2")
    print("=" * 70)

    # ── Phase 1: Discover all products (no DB needed) ──
    print("\n[1] Discovering products from Reinsaat categories...")
    all_products: list[ProductData] = []
    visited: set[str] = set()

    for cat_url in CATEGORIES:
        print(f"\n Category: {cat_url}")
        products = discover_products(cat_url, max_depth=4, _visited=visited)
        all_products.extend(products)
        print(f" -> {len(products)} products")

    # Deduplicate by URL, keeping the first occurrence of each.
    by_url = {}
    for p in all_products:
        by_url.setdefault(p.url, p)
    all_products = list(by_url.values())
    print(f"\n Total unique products: {len(all_products)}")

    stats = {
        "created": 0,
        "linked": 0,
        "enriched": 0,
        "skipped_no_species": 0,
        "skipped_no_name": 0,
        "link_exists": 0,
        "errors": 0,
    }
    unmatched = []

    # ── Phase 2: Connect to DB and load existing data ──
    print("\n[2] Connecting to DB and loading existing data...")
    conn = db_connect()
    try:
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            species_map, existing_cultivars, existing_slugs, existing_links = _load_existing(cur)

            # ── Phase 3: Process products ──
            print("\n[3] Processing products...")
            total = len(all_products)
            for i, product in enumerate(all_products):
                if not product.name:
                    stats["skipped_no_name"] += 1
                    continue

                species = _match_species(product, species_map)
                if not species:
                    stats["skipped_no_species"] += 1
                    unmatched.append((product.name, product.raw_latin_name,
                                      product.normalized_latin, product.url))
                    continue

                pct = (i + 1) / total * 100
                label = f" [{i+1}/{total}] ({pct:.0f}%) {product.name}"
                ckey = (str(species["id"]), product.name.lower())
                existing = existing_cultivars.get(ckey)

                if existing:
                    cultivar_id = str(existing["id"])
                    updates = _collect_missing_updates(existing, product)
                    if updates:
                        _enrich_cultivar(cur, cultivar_id, updates, label, stats)
                    _link_supplier(cur, cultivar_id, product, label, existing_links, stats)
                else:
                    _create_cultivar(cur, species, product, ckey, label,
                                     existing_cultivars, existing_slugs,
                                     existing_links, stats)

        # ── Commit ──
        conn.commit()
    except Exception:
        # Leave nothing half-written behind if the run dies part-way.
        conn.rollback()
        raise
    finally:
        conn.close()

    # ── Summary ──
    _print_summary(all_products, stats, unmatched)
{product.name} -> CREATE ERROR: {e}") + cur.execute("ROLLBACK TO SAVEPOINT create_sp") + stats["errors"] += 1 + + # ── Commit ── + conn.commit() + + # ── Summary ── + print("\n" + "=" * 70) + print("SUMMARY") + print("=" * 70) + print(f" Total products discovered: {len(all_products)}") + print(f" New cultivars created: {stats['created']}") + print(f" New supplier links added: {stats['linked']}") + print(f" Cultivars enriched: {stats['enriched']}") + print(f" Links already existed: {stats['link_exists']}") + print(f" Skipped (no species): {stats['skipped_no_species']}") + print(f" Skipped (no name): {stats['skipped_no_name']}") + print(f" Errors: {stats['errors']}") + print("=" * 70) + + if unmatched: + print(f"\n UNMATCHED PRODUCTS ({len(unmatched)}):") + for name, raw_latin, normalized, url in sorted(unmatched, key=lambda x: x[2]): + print(f" {normalized:30s} (raw: {raw_latin:40s}) {name:30s} {url}") + + cur.close() + conn.close() + + +if __name__ == "__main__": + main() diff --git a/tools/scrapers/scrape_reinsaat_v3.py b/tools/scrapers/scrape_reinsaat_v3.py new file mode 100644 index 0000000..c013b2e --- /dev/null +++ b/tools/scrapers/scrape_reinsaat_v3.py @@ -0,0 +1,635 @@ +#!/usr/bin/env python3 +"""Reinsaat v3 scraper - uses HerbAPI REST API, robust botanical name matching.""" + +import json +import re +import sys +import time +import urllib.request +import urllib.error +import urllib.parse +from html import unescape + +# --- Config --- +API_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1" +API_TOKEN = "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk" +REINSAAT_BASE = "https://www.reinsaat.at" +DELAY = 0.3 + +# Categories to scrape (seed products only, skip books/bulbs/peonies/potatoes/gift/seed_tapes) +CATEGORIES = [ + "beans", "peas", "florence_fennel", "cucumbers", "brassica", "garden_cress", + "pumpkins_squash", "corn", "swiss_chard", "aubergine_eggplants", "melons", + "carrots", "sweet_pepper", "chilli_peppers_chill", "parsnips", "parsley", + 
# Suffixes to strip from botanical names (authority names, infraspecific ranks)
STRIP_SUFFIXES = {
    "l.", "mill.", "dc.", "l", "convar.", "convar", "var.", "var",
    "subsp.", "subsp", "ssp.", "ssp", "f.", "em.", "auct.",
    "hort.", "medik.", "moench", "pers.", "salisb.", "thunb.",
    "crantz", "gaertn.", "lam.", "link", "siebold", "zucc.",
    "sat.", "sat", "axillare", "medikus",
}


def api_get(path, params=None, timeout=30):
    """GET from HerbAPI and return the decoded JSON body.

    Args:
        path: Path below ``API_BASE``, starting with ``/``.
        params: Optional mapping encoded into the query string.
        timeout: Seconds to wait before giving up, so a stalled server
            cannot hang the scraper indefinitely.

    Raises:
        urllib.error.HTTPError: On a non-2xx response.
        urllib.error.URLError: On connection failure.
    """
    url = f"{API_BASE}{path}"
    if params:
        url += "?" + urllib.parse.urlencode(params)
    req = urllib.request.Request(url)
    req.add_header("Authorization", f"Bearer {API_TOKEN}")
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())


def api_post(path, data, timeout=30):
    """POST *data* as JSON to HerbAPI and return the decoded JSON body.

    Args:
        path: Path below ``API_BASE``, starting with ``/``.
        data: JSON-serialisable request payload.
        timeout: Seconds to wait before giving up.

    Raises:
        urllib.error.HTTPError: On a non-2xx response.
        urllib.error.URLError: On connection failure.
    """
    url = f"{API_BASE}{path}"
    body = json.dumps(data).encode()
    req = urllib.request.Request(url, data=body, method="POST")
    req.add_header("Authorization", f"Bearer {API_TOKEN}")
    req.add_header("Content-Type", "application/json")
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())


def fetch_page(url):
    """Fetch a web page, return HTML string.

    Bytes are decoded as UTF-8; undecodable sequences are replaced rather
    than raising. The request times out after 15 seconds.
    """
    req = urllib.request.Request(url)
    req.add_header("User-Agent", "Mozilla/5.0 (HerbAPI Scraper)")
    with urllib.request.urlopen(req, timeout=15) as resp:
        return resp.read().decode("utf-8", errors="replace")


# Known misspellings: either a genus alone or a full "genus species" pair.
BOTANICAL_TYPOS = {
    "capscicum": "capsicum",
    "capsicum frutenscens": "capsicum frutescens",
    "tropaelum": "tropaeolum",
    "lact.": "lactuca",
}

# Abbreviated epithets, matched as a prefix of the lower-cased name.
ABBREVIATED_NAMES = {
    "origanum vulg.": "origanum vulgare",
    "helichrysum bract.": "helichrysum bracteatum",
    "campanula lat.": "campanula latifolia",
    "cosmos bip.": "cosmos bipinnatus",
    "papaver somnif.": "papaver somniferum",
}


def normalise_botanical(raw):
    """Strip botanical name to genus + species only.

    'Pisum sativum L. convar. sat.' -> 'pisum sativum'
    'Solanum lycopersicum L.' -> 'solanum lycopersicum'
    'Beta vulgaris L. ssp. vulgaris' -> 'beta vulgaris'
    'Origanum vulg.' -> 'origanum vulgare'

    Returns:
        The lower-cased ``"genus species"`` string, or ``None`` when *raw*
        is empty, has fewer than two words, or its second word looks like an
        authority (capitalised) rather than a species epithet.
    """
    if not raw:
        return None

    # Clean HTML entities and drop parenthesised remarks.
    text = unescape(raw).replace("\xa0", " ").strip()
    text = re.sub(r"\([^)]*\)", "", text)

    # Abbreviations end in a period, so they must be checked BEFORE trailing
    # punctuation is stripped; otherwise "Origanum vulg." never matches.
    lowered = text.lower().strip()
    for abbrev, full in ABBREVIATED_NAMES.items():
        if lowered.startswith(abbrev):
            return full

    # Remove trailing commas/periods
    text = text.rstrip(",. ")

    parts = text.split()
    if len(parts) < 2:
        return None
    genus = parts[0].lower().rstrip(",")
    species = parts[1].lower().rstrip(",")
    if not genus or not species:
        # A token that was only punctuation (e.g. ",") leaves nothing to use.
        return None

    # Fix known typos: first the genus alone, then the full pair.
    genus = BOTANICAL_TYPOS.get(genus, genus)
    full_name = f"{genus} {species}"
    if full_name in BOTANICAL_TYPOS:
        genus, species = BOTANICAL_TYPOS[full_name].split()

    # Validate: both words must start with a letter.
    if not genus[0].isalpha() or not species[0].isalpha():
        return None
    # A capitalised second word is an authority ("Cucurbita L."), not a species.
    if parts[1][0].isupper():
        return None
    return f"{genus} {species}"
def extract_product_data(html, url):
    """Extract product info from a Reinsaat product page.

    Scans the raw *html* with regular expressions and returns a dict that
    always contains ``url`` and, when found, any of: ``name``,
    ``botanical_raw``, ``botanical_norm``, ``article_number``, ``price_eur``,
    ``port_price``, ``pack_sizes``, ``planting_depth_cm``,
    ``row_spacing_cm``, ``plant_spacing_cm``, ``germination_temp_c``,
    ``pack_size`` and ``pack_unit``.

    NOTE(review): several tag-matching patterns below look truncated (they
    start with ``]*>`` or are empty), as if the markup inside them had been
    stripped. Confirm each against the real file before relying on them.
    """
    result = {}

    # H1 = variety name
    # NOTE(review): pattern appears to have lost its opening/closing tag.
    m = re.search(r']*>([^<]+)', html)
    if m:
        name = unescape(m.group(1)).strip()
        # Clean up names like "RS-To-01.26 (Alda)" -> "Alda"
        paren = re.search(r"\(([^)]+)\)", name)
        if paren and re.match(r"RS-", name):
            name = paren.group(1).strip()
        result["name"] = name

    # Botanical name from fce_shop_kurztext
    # NOTE(review): the optional groups look like stripped inline tags.
    m = re.search(
        r'fce_shop_kurztext[^>]*>\s*(?:]*>)?\s*([^<]+?)\s*(?:)?\s*',
        html,
    )
    if m:
        result["botanical_raw"] = unescape(m.group(1)).replace("\xa0", " ").strip()
        result["botanical_norm"] = normalise_botanical(result["botanical_raw"])

    # Article number from JSON-LD
    # NOTE(review): the pattern is empty here, so it defines no group 1 and
    # jm.group(1) cannot succeed as written — the JSON-LD script-tag pattern
    # is presumably missing; confirm.
    for jm in re.finditer(
        r'', html, re.S
    ):
        try:
            jd = json.loads(jm.group(1))
        except json.JSONDecodeError:
            continue
        if jd.get("@type") == "Product":
            if "model" in jd:
                result["article_number"] = str(jd["model"])
            # Get smallest pack price (usually the Portion)
            offers = jd.get("offers", {})
            # "offers" may be a wrapper object holding a nested list, or a list.
            if isinstance(offers, dict):
                offer_list = offers.get("offers", [])
            elif isinstance(offers, list):
                offer_list = offers
            else:
                offer_list = []
            if offer_list:
                # Only positive numeric prices are considered.
                prices = [
                    o["price"]
                    for o in offer_list
                    if isinstance(o.get("price"), (int, float)) and o["price"] > 0
                ]
                if prices:
                    result["price_eur"] = min(prices)
            break

    # Price table - get pack sizes
    # NOTE(review): the three patterns below are identical as shown; they
    # presumably matched table / row / cell tags respectively — confirm.
    tables = re.findall(r"]*>(.*?)", html, re.S)
    for tbl in tables:
        if "€" not in tbl:
            continue
        rows = re.findall(r"]*>(.*?)", tbl, re.S)
        if len(rows) >= 2:
            # First row is read as pack sizes, second row as prices.
            size_cells = re.findall(r"]*>(.*?)", rows[0], re.S)
            size_texts = [re.sub(r"<[^>]+>", "", c).strip() for c in size_cells]
            price_cells = re.findall(r"]*>(.*?)", rows[1], re.S)
            price_texts = [re.sub(r"<[^>]+>", "", c).strip() for c in price_cells]
            # Find the "Port." entry
            for i, st in enumerate(size_texts):
                if "Port" in st:
                    if i < len(price_texts):
                        # Decimal comma is converted to a point before parsing.
                        pm = re.search(r"[\d,\.]+", price_texts[i].replace(",", "."))
                        if pm:
                            result["port_price"] = float(pm.group())
                    break
            # Get portion content info
            result["pack_sizes"] = size_texts
            break

    # Sowing depth
    m = re.search(r"(?:sowing|seed)\s*depth[:\s]*(?:approx\.?\s*)?(\d+[\.,]?\d*)\s*(?:-\s*(\d+[\.,]?\d*)\s*)?cm", html, re.I)
    if m:
        d1 = float(m.group(1).replace(",", "."))
        d2 = float(m.group(2).replace(",", ".")) if m.group(2) else d1
        # A range is stored as its midpoint; a single value is stored as-is.
        result["planting_depth_cm"] = round((d1 + d2) / 2, 2)

    # Spacing: "row spacing NNxNN cm" or "NN x NN cm"
    # Try outdoor spacing first
    m = re.search(r"(?:outdoors?|field)[^.]*?(\d+)\s*(?:x|×)\s*(\d+)\s*cm", html, re.I)
    if not m:
        m = re.search(r"row\s*spacing\s*(\d+)\s*(?:x|×)\s*(\d+)\s*cm", html, re.I)
    if not m:
        # Last resort: any "NN x NN cm" anywhere in the page source.
        m = re.search(r"(\d+)\s*(?:x|×)\s*(\d+)\s*cm", html, re.I)
    if m:
        result["row_spacing_cm"] = float(m.group(1))
        result["plant_spacing_cm"] = float(m.group(2))

    # Row spacing without plant spacing (e.g. "row spacing 30-45 cm")
    if "row_spacing_cm" not in result:
        m = re.search(r"row\s*spacing\s*(\d+)(?:\s*-\s*(\d+))?\s*cm", html, re.I)
        if m:
            r1 = int(m.group(1))
            r2 = int(m.group(2)) if m.group(2) else r1
            # Floor division: 30-45 is stored as 37.0, not 37.5.
            result["row_spacing_cm"] = float((r1 + r2) // 2)

    # Germination temperature
    m = re.search(r"germination\s*temp[^:]*:\s*(\d+)\s*(?:-\s*(\d+))?\s*°?\s*C", html, re.I)
    if m:
        t1 = int(m.group(1))
        t2 = int(m.group(2)) if m.group(2) else t1
        # Floor division again: 18-25 is stored as 21.0.
        result["germination_temp_c"] = float((t1 + t2) // 2)

    # Pack unit from portion info - "20 seeds" or "25 g" etc
    portion_m = re.search(r"[Pp]ortion\s*(?:contents?)?[:\s]*(\d+[\.,]?\d*)\s*(seeds?|Korn|g|kg)", html)
    if not portion_m:
        # Try "Port. (20 seeds)" format
        portion_m = re.search(r"Port[.\w]*\s*\(?\s*(\d+[\.,]?\d*)\s*(seeds?|Korn|g|kg)", html)
    if portion_m:
        result["pack_size"] = float(portion_m.group(1).replace(",", "."))
        unit = portion_m.group(2).lower()
        # Seed counts are normalised to the unit label "Korn".
        if unit in ("seed", "seeds", "korn"):
            result["pack_unit"] = "Korn"
        else:
            result["pack_unit"] = unit

    result["url"] = url
    return result
def _iter_api_pages(path, page_size=100):
    """Yield ``(page_number, rows)`` for a paginated endpoint until a short page arrives."""
    page_no = 1
    while True:
        rows = api_get(path, {"per_page": page_size, "page": page_no}).get("data", [])
        yield page_no, rows
        if len(rows) < page_size:
            return
        page_no += 1


def get_all_species():
    """Fetch all species from API, build lookup by normalised name.

    Returns:
        Dict mapping the normalised ``"genus species"`` string to
        ``{"id", "slug", "name"}``. Species whose scientific name cannot be
        normalised are left out.
    """
    lookup = {}
    for page_no, rows in _iter_api_pages("/species"):
        for entry in rows:
            key = normalise_botanical(entry["name_scientific"])
            if not key:
                continue
            lookup[key] = {
                "id": entry["id"],
                "slug": entry["slug"],
                "name": entry["name_scientific"],
            }
        print(f" page {page_no}: {len(rows)} species (total so far: {len(lookup)})")
    return lookup


def get_all_cultivars():
    """Fetch all cultivars, build lookup by (species_id, normalised name).

    Returns:
        Dict mapping ``(species_id, lower-cased stripped name)`` to the
        cultivar record as returned by the API.
    """
    lookup = {}
    for page_no, rows in _iter_api_pages("/cultivars"):
        for entry in rows:
            lookup[(entry["species_id"], entry["name"].lower().strip())] = entry
        print(f" page {page_no}: {len(rows)} cultivars (total so far: {len(lookup)})")
    return lookup


def get_reinsaat_supplier():
    """Get Reinsaat supplier record.

    Raises:
        RuntimeError: When no supplier with slug ``"reinsaat"`` is listed.
    """
    match = next(
        (entry for entry in api_get("/suppliers") if entry["slug"] == "reinsaat"),
        None,
    )
    if match is None:
        raise RuntimeError("Reinsaat supplier not found in API")
    return match


def get_cultivar_suppliers(cultivar_id):
    """Get existing supplier links for a cultivar."""
    endpoint = f"/cultivars/{cultivar_id}/suppliers"
    return api_get(endpoint)
def get_product_urls_from_category(cat_slug):
    """Return the absolute URL of every link below a category page.

    The category page is fetched once and every ``href`` that points
    underneath ``/shop/EN/<cat_slug>/`` is collected, whatever its depth.
    The result therefore mixes product pages and subcategory pages; the
    caller tells them apart after fetching each one.

    Args:
        cat_slug: Category path segment, e.g. ``"tomatoes"``.

    Returns:
        Sorted list of unique absolute URLs; empty when the category page
        cannot be fetched (a warning is printed).
    """
    cat_url = f"{REINSAAT_BASE}/shop/EN/{cat_slug}/"
    try:
        html = fetch_page(cat_url)
    except Exception as e:
        print(f" WARN: Failed to fetch category {cat_slug}: {e}")
        return []

    time.sleep(DELAY)  # throttle before the caller's next request

    # re.escape keeps slugs containing regex metacharacters literal.
    links = re.findall(rf'href="(/shop/EN/{re.escape(cat_slug)}/[^"]+/)"', html)
    return [REINSAAT_BASE + link for link in sorted(set(links))]


def is_product_page(html):
    """Check if HTML is a product page (has botanical name or JSON-LD Product)."""
    return bool(
        re.search(r'fce_shop_kurztext', html)
        or re.search(r'"@type":\s*"Product"', html)
    )
def _deeper_links(html, cat, page_url):
    """Return absolute URLs of links in *html* that sit below *page_url*'s depth within *cat*."""
    # Compare path against path: counting slashes of the absolute URL would
    # include the two from "https://" and hide the page's direct children.
    page_depth = urllib.parse.urlparse(page_url).path.rstrip("/").count("/")
    prefix = f"/shop/EN/{cat}/"
    hrefs = set(re.findall(r'href="(/shop/EN/[^"]+/)"', html))
    return [
        REINSAAT_BASE + href
        for href in sorted(hrefs)
        if href.startswith(prefix) and href.rstrip("/").count("/") > page_depth
    ]


def main():
    """Scrape every configured Reinsaat category and sync it through HerbAPI.

    Loads species, cultivars and the Reinsaat supplier record from the API,
    then fetches each category's links. Product pages are handed to
    ``process_product``; any other page is treated as a subcategory and its
    deeper links are fetched once each. A summary is printed at the end.
    """
    print("=" * 60)
    print("Reinsaat v3 Scraper")
    print("=" * 60)

    # Step 1: Load all species
    print("\n[1/4] Loading species from API...")
    species_map = get_all_species()
    print(f" Loaded {len(species_map)} species")

    # Step 2: Load all cultivars
    print("\n[2/4] Loading cultivars from API...")
    cultivar_map = get_all_cultivars()
    print(f" Loaded {len(cultivar_map)} cultivars")

    # Step 3: Get Reinsaat supplier
    print("\n[3/4] Getting Reinsaat supplier...")
    supplier = get_reinsaat_supplier()
    supplier_id = supplier["id"]
    print(f" Reinsaat ID: {supplier_id}")

    # Step 4: Scrape categories
    print(f"\n[4/4] Scraping {len(CATEGORIES)} categories...")

    stats = {
        "products_found": 0,
        "botanical_extracted": 0,
        "species_matched": 0,
        "species_not_matched": 0,
        "cultivar_existed": 0,
        "cultivar_created": 0,
        "link_existed": 0,
        "link_created": 0,
        "errors": 0,
    }
    unmatched_species = {}  # botanical_norm -> count
    new_cultivars = []
    new_links = []

    def handle(page_html, page_url):
        # Single call site for the long, shared argument list.
        process_product(
            page_html, page_url, species_map, cultivar_map,
            supplier_id, stats, unmatched_species,
            new_cultivars, new_links,
        )

    for cat_i, cat in enumerate(CATEGORIES):
        print(f"\n--- [{cat_i+1}/{len(CATEGORIES)}] {cat} ---")
        urls = get_product_urls_from_category(cat)
        print(f" Found {len(urls)} URLs")

        # URLs already queued or fetched for this category. A set gives O(1)
        # lookups and stops a page linked from several subcategories from
        # being processed more than once.
        seen = set(urls)

        for url in urls:
            time.sleep(DELAY)
            try:
                html = fetch_page(url)
            except Exception as e:
                print(f" ERROR fetching {url}: {e}")
                stats["errors"] += 1
                continue

            if is_product_page(html):
                handle(html, url)
                continue

            # Not a product: treat it as a subcategory and follow its deeper
            # links. Non-product pages are never handed to process_product.
            for sub_url in _deeper_links(html, cat, url):
                if sub_url in seen:
                    continue  # queued in the outer loop or already handled
                seen.add(sub_url)
                time.sleep(DELAY)
                try:
                    sub_html = fetch_page(sub_url)
                except Exception as e:
                    print(f" ERROR fetching {sub_url}: {e}")
                    stats["errors"] += 1
                    continue
                if is_product_page(sub_html):
                    handle(sub_html, sub_url)

    # Report
    print("\n" + "=" * 60)
    print("RESULTS")
    print("=" * 60)
    print(f"Products found: {stats['products_found']}")
    print(f"Botanical extracted: {stats['botanical_extracted']}")
    print(f"Species matched: {stats['species_matched']}")
    print(f"Species NOT matched: {stats['species_not_matched']}")
    print(f"Cultivars existed: {stats['cultivar_existed']}")
    print(f"Cultivars created: {stats['cultivar_created']}")
    print(f"Links existed: {stats['link_existed']}")
    print(f"Links created: {stats['link_created']}")
    print(f"Errors: {stats['errors']}")

    if new_cultivars:
        print(f"\n--- New cultivars ({len(new_cultivars)}) ---")
        for cv in new_cultivars:
            print(f" + {cv['name']} ({cv.get('species', '?')})")

    if new_links:
        print(f"\n--- New supplier links ({len(new_links)}) ---")
        for lk in new_links:
            print(f" + {lk['cultivar']} -> {lk.get('article', '?')}")

    if unmatched_species:
        print(f"\n--- Unmatched species ({len(unmatched_species)}) ---")
        # Most frequent first.
        for name, count in sorted(unmatched_species.items(), key=lambda x: -x[1]):
            print(f" ? {name} (x{count})")

    print("\nDone.")
def _clean_name(name):
    """Lower-case *name* and drop punctuation so near-identical names compare equal."""
    return re.sub(r"[^\w\s]", "", name.lower())


def _find_existing_cultivar(cultivar_name, species_id):
    """Search the API for the cultivar a failed create most likely collided with.

    Returns the matching cultivar record, or ``None`` when nothing fits.
    """
    wanted = cultivar_name.lower().strip()
    wanted_clean = _clean_name(wanted)

    candidates = api_get(
        "/cultivars", {"search": cultivar_name, "per_page": 50}
    ).get("data", [])

    # Strategy 1: exact (case-insensitive) name. Prefer a hit in the same
    # species so a same-named cultivar of another species is only a fallback.
    exact = [cv for cv in candidates if cv["name"].lower().strip() == wanted]
    for cv in exact:
        if cv["species_id"] == species_id:
            return cv
    if exact:
        return exact[0]

    # Strategy 2: same species, names equal or contained ignoring punctuation.
    for cv in candidates:
        if cv["species_id"] != species_id:
            continue
        cv_clean = _clean_name(cv["name"])
        if cv_clean == wanted_clean or cv_clean in wanted_clean or wanted_clean in cv_clean:
            return cv

    # Strategy 3: search again by the last significant word.
    words = [w for w in cultivar_name.split() if len(w) > 2]
    if words:
        retry = api_get("/cultivars", {"search": words[-1], "per_page": 50})
        for cv in retry.get("data", []):
            if cv["species_id"] == species_id and _clean_name(cv["name"]) == wanted_clean:
                return cv
    return None


def _get_or_create_cultivar(prod, url, species, cultivar_map, stats, new_cultivars):
    """Return the id of the cultivar for *prod*, creating it if needed; ``None`` on failure."""
    species_id = species["id"]
    cultivar_name = prod["name"]
    cv_key = (species_id, cultivar_name.lower().strip())

    existing_cv = cultivar_map.get(cv_key)
    if existing_cv:
        stats["cultivar_existed"] += 1
        return existing_cv["id"]

    create_data = {
        "species_id": species_id,
        "name": cultivar_name,
        "is_organic": True,
        "source_urls": [url],
    }
    # Add growing data if we extracted any
    for field_name in ("planting_depth_cm", "row_spacing_cm",
                       "plant_spacing_cm", "germination_temp_c"):
        if field_name in prod:
            create_data[field_name] = prod[field_name]

    try:
        new_cv = api_post("/cultivars", create_data)
    except urllib.error.HTTPError as e:
        body = e.read().decode(errors="replace")
        if not (e.code == 500 and "Database error" in body):
            print(f" ERROR creating cultivar '{cultivar_name}': {e.code} {body}")
            stats["errors"] += 1
            return None
        # Likely slug collision - search for existing cultivar
        try:
            found = _find_existing_cultivar(cultivar_name, species_id)
        except Exception as e2:
            print(f" ERROR searching for '{cultivar_name}' after collision: {e2}")
            stats["errors"] += 1
            return None
        if not found:
            print(f" WARN: could not create or find cultivar '{cultivar_name}' (DB error + no search match)")
            stats["errors"] += 1
            return None
        cultivar_map[cv_key] = found
        stats["cultivar_existed"] += 1
        return found["id"]
    except (OSError, ValueError) as e:
        # Network failure/timeout or unreadable response: skip this product
        # instead of aborting the whole scrape.
        print(f" ERROR creating cultivar '{cultivar_name}': {e}")
        stats["errors"] += 1
        return None

    cultivar_id = new_cv["id"]
    stats["cultivar_created"] += 1
    new_cultivars.append({
        "name": cultivar_name,
        "species": species["name"],
        "id": cultivar_id,
    })
    # Add to local cache
    cultivar_map[cv_key] = new_cv
    print(f" + Created cultivar: {cultivar_name} ({species['name']})")
    return cultivar_id


def process_product(html, url, species_map, cultivar_map, supplier_id,
                    stats, unmatched_species, new_cultivars, new_links):
    """Process a single product page.

    Extracts the product data, matches its species, finds or creates the
    cultivar through the API and adds the Reinsaat supplier link when the
    cultivar has none yet.

    Args:
        html: HTML of the product page.
        url: URL the page was fetched from.
        species_map: Normalised botanical name -> species record.
        cultivar_map: ``(species_id, lower name)`` -> cultivar; updated in place.
        supplier_id: Id of the Reinsaat supplier.
        stats: Counter dict, updated in place.
        unmatched_species: Botanical name -> miss count, updated in place.
        new_cultivars: List that receives a record per created cultivar.
        new_links: List that receives a record per created supplier link.
    """
    stats["products_found"] += 1
    prod = extract_product_data(html, url)

    if not prod.get("name"):
        return

    bot_norm = prod.get("botanical_norm")
    if not bot_norm:
        # No botanical name found on page
        stats["species_not_matched"] += 1
        unmatched_species["(no botanical name)"] = unmatched_species.get("(no botanical name)", 0) + 1
        return

    stats["botanical_extracted"] += 1

    # Match species
    species = species_map.get(bot_norm)
    if not species:
        stats["species_not_matched"] += 1
        unmatched_species[bot_norm] = unmatched_species.get(bot_norm, 0) + 1
        return

    stats["species_matched"] += 1
    cultivar_name = prod["name"]

    cultivar_id = _get_or_create_cultivar(
        prod, url, species, cultivar_map, stats, new_cultivars
    )
    if cultivar_id is None:
        return

    # Check if Reinsaat supplier link exists
    try:
        existing_links = get_cultivar_suppliers(cultivar_id)
    except (OSError, ValueError) as e:
        # Best effort: carry on as if no link is known, but say so rather
        # than hiding the failed lookup.
        print(f" WARN: could not read supplier links for '{cultivar_name}': {e}")
        existing_links = []

    if any(link["supplier_id"] == supplier_id for link in existing_links):
        stats["link_existed"] += 1
        return

    # Create supplier link
    link_data = {
        "supplier_id": supplier_id,
        "product_url": url,
    }
    if "article_number" in prod:
        link_data["article_number"] = prod["article_number"]
    # The price-table "Port." price wins over the JSON-LD minimum price.
    if "port_price" in prod:
        link_data["price_eur"] = prod["port_price"]
    elif "price_eur" in prod:
        link_data["price_eur"] = prod["price_eur"]
    if "pack_size" in prod:
        link_data["pack_size"] = prod["pack_size"]
    if "pack_unit" in prod:
        link_data["pack_unit"] = prod["pack_unit"]

    try:
        api_post(f"/cultivars/{cultivar_id}/suppliers", link_data)
    except urllib.error.HTTPError as e:
        body = e.read().decode(errors="replace")
        print(f" ERROR linking '{cultivar_name}': {e.code} {body}")
        stats["errors"] += 1
        return
    except (OSError, ValueError) as e:
        print(f" ERROR linking '{cultivar_name}': {e}")
        stats["errors"] += 1
        return

    stats["link_created"] += 1
    new_links.append({
        "cultivar": cultivar_name,
        "article": prod.get("article_number", "?"),
        "url": url,
    })


if __name__ == "__main__":
    main()