#!/usr/bin/env python3
"""
Scraper for Bingenheimer Saatgut (https://www.bingenheimersaatgut.de/)
Extracts cultivar data and imports into HerbAPI.

Categories scraped: Gemüse (vegetables), Kräuter (herbs), Gründüngung (green manure).
"""

import json
import os
import re
import sys
import time
import urllib.request
import urllib.error
import urllib.parse
from html.parser import HTMLParser
from typing import Iterator, Optional

# ── Configuration ─────────────────────────────────────────────────────────
API_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
# NOTE(review): a bearer token is committed in source. Set HERBAPI_TOKEN in the
# environment to override it; the literal is kept only as a backward-compatible
# fallback and should be rotated/removed.
API_TOKEN = os.environ.get(
    "HERBAPI_TOKEN", "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
)
SITE_BASE = "https://www.bingenheimersaatgut.de"
DELAY = 0.5  # seconds slept before each detail-page fetch / after each listing fetch
USER_AGENT = "HerbAPI-Scraper/1.0 (+https://sub-net.at)"
PAGE_SIZE = 100  # page size used for every paginated HerbAPI listing

# ── Category URLs to scrape ───────────────────────────────────────────────
# (url_path, default_species_scientific_name)
VEGETABLE_CATEGORIES = [
    ("gemuese/tomaten", "Solanum lycopersicum"),
    ("gemuese/gurken/gewuerzgurke", "Cucumis sativus"),
    ("gemuese/gurken/salatgurken", "Cucumis sativus"),
    ("gemuese/aubergine", "Solanum melongena"),
    ("gemuese/bohnen/buschbohne", "Phaseolus vulgaris"),
    ("gemuese/bohnen/stangenbohne", "Phaseolus vulgaris"),
    ("gemuese/bohnen/dicke-bohne", "Vicia faba"),
    ("gemuese/bohnen/feuerbohne", "Phaseolus coccineus"),
    ("gemuese/bohnen/edamame-sojabohne", "Glycine max"),
    ("gemuese/bohnen/spaghettibohne", "Vigna unguiculata"),
    ("gemuese/erbsen/markerbse", "Pisum sativum"),
    ("gemuese/erbsen/schalerbse", "Pisum sativum"),
    ("gemuese/erbsen/zuckererbse", "Pisum sativum"),
    ("gemuese/feldsalat", "Valerianella locusta"),
    ("gemuese/knollenfenchel", "Foeniculum vulgare"),
    ("gemuese/kohl/blumenkohl", "Brassica oleracea"),
    ("gemuese/kohl/brokkoli", "Brassica oleracea"),
    ("gemuese/kohl/chinakohlpak-choi", "Brassica rapa"),
    ("gemuese/kohl/gruenkohl", "Brassica oleracea"),
    ("gemuese/kohl/kohlrabi", "Brassica oleracea"),
    ("gemuese/kohl/rotkohl", "Brassica oleracea"),
    ("gemuese/kohl/weisskohl", "Brassica oleracea"),
    ("gemuese/kohl/wirsing", "Brassica oleracea"),
    ("gemuese/kohl/rosenkohl", "Brassica oleracea"),
    ("gemuese/kresse", "Lepidium sativum"),
    ("gemuese/kuerbis", "Cucurbita maxima"),
    ("gemuese/zuckermais", "Zea mays"),
    ("gemuese/mangold", "Beta vulgaris"),
    ("gemuese/melone", "Cucumis melo"),
    ("gemuese/moehren", "Daucus carota"),
    ("gemuese/paprika/gemuesepaprika", "Capsicum annuum"),
    ("gemuese/paprika/chili", "Capsicum annuum"),
    ("gemuese/pastinaken", "Pastinaca sativa"),
    ("gemuese/petersilienwurzel", "Petroselinum crispum"),
    ("gemuese/physalis", "Physalis peruviana"),
    ("gemuese/porreelauch", "Allium porrum"),
    ("gemuese/radies", "Raphanus sativus"),
    ("gemuese/rettich", "Raphanus sativus"),
    ("gemuese/rote-bete", "Beta vulgaris"),
    ("gemuese/rueben/mai-herbstruebennavets", "Brassica rapa"),
    ("gemuese/rueben/kohlruebe", "Brassica napus"),
    ("gemuese/rucola", "Eruca vesicaria"),
    ("gemuese/salat/bataviasalat", "Lactuca sativa"),
    ("gemuese/salat/eichblattsalat", "Lactuca sativa"),
    ("gemuese/salat/eissalat", "Lactuca sativa"),
    ("gemuese/salat/endivien", "Cichorium endivia"),
    ("gemuese/salat/hirschhornwegerich", "Plantago coronopus"),
    ("gemuese/salat/kopfsalat", "Lactuca sativa"),
    ("gemuese/salat/lollosalat", "Lactuca sativa"),
    ("gemuese/salat/romanasalat", "Lactuca sativa"),
    ("gemuese/salat/baby-leaf", "Lactuca sativa"),
    ("gemuese/sellerie/knollensellerie", "Apium graveolens"),
    ("gemuese/sellerie/stangen--bleichsellerie", "Apium graveolens"),
    ("gemuese/spinatspinat-aehnliche/spinat", "Spinacia oleracea"),
    ("gemuese/spinatspinat-aehnliche/neuseelaender-spinat", "Tetragonia tetragonioides"),
    ("gemuese/blattstielgemuese", "Beta vulgaris"),
    ("gemuese/zwiebeln", "Allium cepa"),
    ("gemuese/lauchzwiebeln", "Allium fistulosum"),
    ("gemuese/artischocke", "Cynara cardunculus"),
    ("gemuese/asia-salate", "Brassica juncea"),
    ("gemuese/chicoree", "Cichorium intybus"),
    ("gemuese/schwarz-haferwurzel", "Scorzonera hispanica"),
    ("gemuese/winterpostelein", "Claytonia perfoliata"),
    ("gemuese/zucchini", "Cucurbita pepo"),
    ("gemuese/catalogna", "Cichorium intybus"),
    ("gemuese/zichoriensalate", "Cichorium intybus"),
]

HERB_CATEGORIES = [
    ("kraeuter/basilikum", "Ocimum basilicum"),
    ("kraeuter/bohnenkraut", "Satureja hortensis"),
    ("kraeuter/borretsch", "Borago officinalis"),
    ("kraeuter/dill", "Anethum graveolens"),
    ("kraeuter/kuemmel", "Carum carvi"),
    ("kraeuter/kerbel", "Anthriscus cerefolium"),
    ("kraeuter/koriander", "Coriandrum sativum"),
    ("kraeuter/gewuerzfenchel", "Foeniculum vulgare"),
    ("kraeuter/kultursauerampfer", "Rumex acetosa"),
    ("kraeuter/lavendel", "Lavandula angustifolia"),
    ("kraeuter/liebstock", "Levisticum officinale"),
    ("kraeuter/majoran", "Origanum majorana"),
    ("kraeuter/oregano", "Origanum vulgare"),
    ("kraeuter/pimpinelle", "Sanguisorba minor"),
    ("kraeuter/estragon", "Artemisia dracunculus"),
    ("kraeuter/salbei", "Salvia officinalis"),
    ("kraeuter/schnittlauch", "Allium schoenoprasum"),
    ("kraeuter/schnittknoblauch", "Allium tuberosum"),
    ("kraeuter/schwarzkuemmel", "Nigella sativa"),
    ("kraeuter/speisechrysantheme", "Glebionis coronaria"),
    ("kraeuter/thymian", "Thymus vulgaris"),
    ("kraeuter/ysop", "Hyssopus officinalis"),
    ("kraeuter/winterkresse", "Barbarea vulgaris"),
    ("kraeuter/brunnenkresse", "Nasturtium officinale"),
    ("kraeuter/melisse", "Melissa officinalis"),
    ("kraeuter/petersilie", "Petroselinum crispum"),
    ("kraeuter/schnittsellerie", "Apium graveolens"),
    ("kraeuter/beifuss", "Artemisia vulgaris"),
]

# Green manure has no default species: every product must yield a botanical
# name from its detail page or it is skipped.
GREEN_MANURE_CATEGORIES = [
    ("gruenduengung", None),
]

ALL_CATEGORIES = VEGETABLE_CATEGORIES + HERB_CATEGORIES + GREEN_MANURE_CATEGORIES

# ── Stats ─────────────────────────────────────────────────────────────────
# Run-wide counters and message lists, printed by main() at the end.
stats = {
    "categories_scraped": 0,
    "products_found": 0,
    "detail_pages_fetched": 0,
    "cultivars_created": 0,
    "cultivars_existed": 0,
    "supplier_links_created": 0,
    "supplier_links_existed": 0,
    "species_created": 0,
    "families_created": 0,
    "species_not_matched": [],
    "errors": [],
}


# ── HTTP helpers ──────────────────────────────────────────────────────────

def fetch_page(url: str) -> str:
    """Fetch a web page with the scraper's User-Agent header.

    Returns:
        The body decoded as UTF-8 (undecodable bytes replaced), or "" when the
        server answers 404.

    Raises:
        urllib.error.HTTPError: for any HTTP error status other than 404.
    """
    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            return resp.read().decode("utf-8", errors="replace")
    except urllib.error.HTTPError as e:
        if e.code == 404:
            return ""
        raise


def api_get(path: str, params: Optional[dict] = None) -> dict:
    """GET ``API_BASE + path`` from HerbAPI and return the decoded JSON body.

    Args:
        path: Path below ``API_BASE``, starting with "/".
        params: Optional query parameters, URL-encoded onto the request.

    Raises:
        urllib.error.HTTPError: on any HTTP error status (not caught here).
    """
    url = f"{API_BASE}{path}"
    if params:
        url += "?" + urllib.parse.urlencode(params)
    req = urllib.request.Request(url, headers={
        "Authorization": f"Bearer {API_TOKEN}",
        "Accept": "application/json",
    })
    with urllib.request.urlopen(req, timeout=30) as resp:
        return json.loads(resp.read())


def api_post(path: str, data: dict) -> tuple:
    """POST ``data`` as JSON to HerbAPI.

    Returns:
        ``(response_dict, status_code)``. On an HTTP error status nothing is
        raised; instead the dict is ``{"error": <body text>, "_status": <code>}``.
    """
    url = f"{API_BASE}{path}"
    body = json.dumps(data).encode("utf-8")
    req = urllib.request.Request(url, data=body, method="POST", headers={
        "Authorization": f"Bearer {API_TOKEN}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    })
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            return json.loads(resp.read()), resp.status
    except urllib.error.HTTPError as e:
        err_body = e.read().decode("utf-8", errors="replace")
        return {"error": err_body, "_status": e.code}, e.code


def _iter_pages(path: str, per_page: int = PAGE_SIZE) -> Iterator[list]:
    """Yield the ``"data"`` list of each page of a paginated HerbAPI listing.

    Stops after the first page that holds fewer than ``per_page`` rows. The
    caller may stop iterating early; no further requests are made then.
    """
    page = 1
    while True:
        rows = api_get(path, {"per_page": per_page, "page": page})["data"]
        yield rows
        if len(rows) < per_page:
            return
        page += 1


# ── HTML parsing helpers ──────────────────────────────────────────────────

# Magento listing markup: <a href=".../de/bio-saatgut/..." class="... product-item-link ...">Name</a>
_PRODUCT_LINK_RE = re.compile(
    r'<a[^>]+href="([^"]*?/de/bio-saatgut/[^"]+?)"[^>]*class="[^"]*product-item-link[^"]*"[^>]*>\s*(.*?)\s*</a>',
    re.DOTALL | re.IGNORECASE
)

# Broader fallback: any link that points at least two path levels below a
# scraped top-level category and has 3+ characters of link text.
_PRODUCT_LINK_FALLBACK_RE = re.compile(
    r'href="([^"]*?/de/bio-saatgut/(?:gemuese|kraeuter|gruenduengung)/[^"]+?/[^"/.]+)"[^>]*>\s*([^<]{3,})',
    re.IGNORECASE
)


def _absolute_url(url: str) -> str:
    """Prefix site-relative URLs with SITE_BASE; leave http(s) URLs untouched."""
    return url if url.startswith("http") else SITE_BASE + url


def parse_product_links(html: str) -> list:
    """Parse product links from a category listing page using regexes.

    The Magento ``product-item-link`` anchor pattern is tried first; only when
    it finds nothing is the broader fallback pattern used.

    Returns:
        List of ``(absolute_url, product_name)`` tuples in page order with
        duplicate URLs removed (first occurrence wins).
    """
    links = []
    for match in _PRODUCT_LINK_RE.finditer(html):
        # The anchor body may contain nested markup; keep only its text.
        name = re.sub(r'<[^>]+>', '', match.group(2)).strip()
        if name:
            links.append((_absolute_url(match.group(1)), name))

    if not links:
        for match in _PRODUCT_LINK_FALLBACK_RE.finditer(html):
            url = match.group(1).strip()
            name = match.group(2).strip()
            if name and not url.endswith(".html"):
                links.append((_absolute_url(url), name))

    # Deduplicate by URL, preserving order.
    seen_urls = set()
    unique = []
    for url, name in links:
        if url not in seen_urls:
            seen_urls.add(url)
            unique.append((url, name))
    return unique


def extract_latin_from_detail(html: str) -> Optional[str]:
    """Extract the Latin/botanical name from a product detail page.

    Patterns are tried in priority order. Because they are matched
    case-insensitively, every hit is re-validated (capitalised genus,
    lower-case epithet); all hits of a pattern are examined so that an early
    false positive does not hide a valid name further down the page.

    Returns:
        The first valid binomial (optionally with ``var.``/``subsp.``), or None.
    """
    patterns = [
        r'<(?:em|i)[^>]*>\s*([A-Z][a-z]+\s+[a-z]{2,}(?:\s+(?:var\.|subsp\.)\s+[a-z]+)?)\s*',
        r'class="[^"]*(?:botanical|latin|species)[^"]*"[^>]*>\s*([A-Z][a-z]+\s+[a-z]{2,})',
        r'(?:Botanischer?\s+Name|Lateinischer?\s+Name|Art)\s*:?\s*(?:<[^>]+>)*\s*([A-Z][a-z]+\s+[a-z]{2,})',
    ]
    for pat in patterns:
        for m in re.finditer(pat, html, re.IGNORECASE):
            name = m.group(1).strip()
            parts = name.split()
            if len(parts) >= 2 and parts[0][0].isupper() and parts[1][0].islower():
                return name
    return None


def extract_description_from_detail(html: str) -> str:
    """Extract the product description text from a detail page.

    Each pattern captures up to the first closing ``</div>``; the capture is
    stripped of tags and has whitespace collapsed. Candidates of 20 characters
    or fewer are rejected as noise.

    Returns:
        The description truncated to 2000 characters, or "" if none was found.
    """
    # NOTE(review): the element name `div` is reconstructed; the patterns had
    # lost their opening/closing tags and could only ever capture "".
    desc_patterns = [
        r'<div[^>]*class="[^"]*product[- ]description[^"]*"[^>]*>(.*?)</div>',
        r'<div[^>]*class="[^"]*beschreibung[^"]*"[^>]*>(.*?)</div>',
        r'data-content-type="description"[^>]*>(.*?)</div>',
    ]
    for pat in desc_patterns:
        m = re.search(pat, html, re.DOTALL | re.IGNORECASE)
        if m:
            text = re.sub(r'<[^>]+>', ' ', m.group(1))
            text = re.sub(r'\s+', ' ', text).strip()
            if len(text) > 20:
                return text[:2000]
    return ""


def extract_article_number(product_name: str, url: str) -> Optional[str]:
    """Extract the article number from the product name or, failing that, the URL.

    Looks for a parenthesised code such as ``(G802)`` or ``(G 802)`` in the
    name, then for a trailing ``-g802`` on the last URL path segment.

    Returns:
        The code without spaces (upper-cased when taken from the URL), or None.
    """
    m = re.search(r'\(([A-Z]\s*\d+[A-Z]?)\)', product_name)
    if m:
        return m.group(1).replace(" ", "")
    slug = url.rstrip("/").split("/")[-1]
    m = re.search(r'-([a-z]\d+[a-z]?)$', slug, re.IGNORECASE)
    if m:
        return m.group(1).upper()
    return None


# Common German vegetable/herb type prefixes stripped from product names.
_VARIETY_PREFIXES = [
    # Tomatoes
    r'(?:Normal(?:früchtige)?|Fleisch|Cherry|Balkon|Wild|Freiland|Roma|Ochsenherz|'
    r'Cocktail|Dattel|Mini|Snack|Stab|Busch|Salat|Zwerg)[\s-]*[Tt]omate\s+',
    # Beans
    r'(?:Busch|Stangen|Dicke|Feuer|Spaghetti)[\s-]*[Bb]ohne\s+',
    r'Edamame(?:-Sojabohne)?\s+',
    # Peas
    r'(?:Mark|Schal|Zucker|Pal)[\s-]*[Ee]rbse\s+',
    # Cucurbits
    r'(?:Salat|Einlege|Gewürz|Freiland|Schlangen)[\s-]*[Gg]urke\s+',
    r'Zucchini\s+',
    r'Kürbis\s+',
    r'(?:Wasser)?[Mm]elone\s+',
    # Brassicas
    r'(?:Blumen|Grün|Rot|Weiß|Rosen)[\s-]*[Kk]ohl\s+',
    r'Kohlrabi\s+',
    r'Wirsing\s+',
    r'Brokkoli\s+',
    r'Chinakohl\s+',
    r'Pak\s+Choi\s+',
    r'Kohlrübe\s+',
    r'Mai-/Herbstrüben?(?:/Navets)?\s+',
    # Root vegetables
    r'Möhre\s+',
    r'Karotten?(?:\s*-?\s*Mix)?\s+',
    r'Pastinake\s+',
    r'Radies(?:chen)?\s+',
    r'Rettich\s+',
    r'Schwarzwurzel\s+',
    r'Haferwurzel\s+',
    r'Petersilienwurzel\s+',
    # Beets
    r'(?:Rote|Gelbe|Weiße)\s+Bete?\s+',
    r'Mangold\s+',
    # Lettuce & leafy
    r'(?:Kopf|Eichblatt|Batavia|Eis|Lollo|Romana|Baby-Leaf)[\s-]*[Ss]alat\s+',
    r'Feldsalat\s+',
    r'Endivie\s+',
    r'Asia[\s-]*Salat\s+',
    r'Spinat\s+',
    # Alliums
    r'Zwiebel\s+',
    r'Lauchzwiebel\s+',
    r'Porree(?:/Lauch)?\s+',
    r'Schnittlauch\s+',
    r'Schnittknoblauch\s+',
    # Peppers
    r'(?:Gemüse|Block|Spitz|Papier)[\s-]*[Pp]aprika\s+',
    r'Chili\s+',
    # Celery
    r'(?:Knollen|Stangen|Bleich|Schnitt)[\s-]*[Ss]ellerie\s+',
    # Herbs
    r'Basilikum\s+',
    r'Koriander\s+',
    r'Dill\s+',
    r'Petersilie\s+',
    r'(?:Knollen|Gewürz)[\s-]*[Ff]enchel\s+',
    r'Salbei\s+',
    r'Thymian\s+',
    r'Oregano\s+',
    r'Lavendel\s+',
    r'Melisse\s+',
    r'Majoran\s+',
    r'Estragon\s+',
    r'Kresse\s+',
    r'Bohnenkraut\s+',
    r'Borretsch\s+',
    r'Kümmel\s+',
    r'Kerbel\s+',
    r'Liebstock\s+',
    r'Ysop\s+',
    r'Pimpinelle\s+',
    r'Beifuß\s+',
    r'Schwarzkümmel\s+',
    # Other
    r'Zuckermais\s+',
    r'Artischocke\s+',
    r'Physalis\s+',
    r'Aubergine\s+',
    r'Catalogna\s+',
]

# Compiled once at import; each is anchored to the start of the name.
_VARIETY_PREFIX_RES = [
    re.compile(r'^' + prefix, re.IGNORECASE) for prefix in _VARIETY_PREFIXES
]

# Trailing article number such as " (G802)".
_ARTICLE_SUFFIX_RE = re.compile(r'\s*\([A-Z]\s*\d+[A-Z]?\)\s*$')


def extract_variety_name(product_name: str) -> str:
    """Extract the variety/cultivar name from the full product name.

    Removes a trailing article number like ``(G802)``, then applies every crop
    type prefix in list order (so more than one prefix can be removed), and
    finally strips surrounding whitespace and quote characters.
    """
    name = _ARTICLE_SUFFIX_RE.sub('', product_name.strip())
    for prefix_re in _VARIETY_PREFIX_RES:
        name = prefix_re.sub('', name)
    return name.strip().strip("'\"")


# ── API data caches ───────────────────────────────────────────────────────
species_cache = {}   # scientific_name_lower -> {id, name_scientific, ...}
family_cache = {}    # name_scientific_lower -> {id, name_scientific}
cultivar_cache = {}  # slug -> {id, name, species_id, ...}
supplier_id = None


def _refresh_family_cache():
    """(Re)load every family from HerbAPI into family_cache."""
    for rows in _iter_pages("/families"):
        for fam in rows:
            family_cache[fam["name_scientific"].lower()] = fam


def _refresh_species_cache():
    """(Re)load every species from HerbAPI into species_cache."""
    for rows in _iter_pages("/species"):
        for sp in rows:
            species_cache[sp["name_scientific"].lower()] = sp


def _cache_cultivar(record: dict):
    """Store the id, name and species_id of an API cultivar record under its slug."""
    cultivar_cache[record["slug"]] = {
        "id": record["id"],
        "name": record["name"],
        "species_id": record["species_id"],
    }


def load_api_data():
    """Load all existing data from HerbAPI for matching.

    Fills the family, species and cultivar caches and sets the module-level
    ``supplier_id``, creating the Bingenheimer supplier when no supplier whose
    name contains "bingenheimer" exists. Exits the process with status 1 if
    the supplier cannot be created.
    """
    global supplier_id
    print("Loading existing HerbAPI data...")

    _refresh_family_cache()
    print(f" Loaded {len(family_cache)} families")

    _refresh_species_cache()
    print(f" Loaded {len(species_cache)} species")

    # Load ALL cultivars (slug + id + name + species_id)
    for rows in _iter_pages("/cultivars"):
        for record in rows:
            _cache_cultivar(record)
    print(f" Loaded {len(cultivar_cache)} cultivars")

    # Create or find Bingenheimer supplier
    for s in api_get("/suppliers"):
        if "bingenheimer" in s["name"].lower():
            supplier_id = s["id"]
            print(f" Found existing supplier: {s['name']} ({s['id']})")
            break

    if not supplier_id:
        print(" Creating Bingenheimer Saatgut supplier...")
        s, code = api_post("/suppliers", {
            "name": "Bingenheimer Saatgut",
            "url": "https://www.bingenheimersaatgut.de",
            "country": "DE",
            "is_organic": True,
            "is_demeter": True,
            "notes": "German biodynamic seed company, Demeter certified, open-pollinated varieties"
        })
        if "id" in s:
            supplier_id = s["id"]
            print(f" Created supplier: {s['id']}")
        else:
            print(f" ERROR creating supplier: {s}")
            sys.exit(1)


# Lower-cased scraped name -> lower-cased name to look up in species_cache.
# NOTE(review): "allium porrum"/"allium ampeloprasum" -> "allium cepa" and
# "origanum majorana" -> "origanum vulgare" map distinct species onto each
# other (leek onto onion, marjoram onto oregano). Kept as-is because changing
# them alters where cultivars are filed; confirm whether this is intended.
SPECIES_SYNONYMS = {
    "lycopersicon esculentum": "solanum lycopersicum",
    "capsicum annuum var. annuum": "capsicum annuum",
    "brassica oleracea var. botrytis": "brassica oleracea",
    "brassica oleracea var. italica": "brassica oleracea",
    "brassica oleracea var. gemmifera": "brassica oleracea",
    "brassica oleracea var. gongylodes": "brassica oleracea",
    "brassica oleracea var. capitata": "brassica oleracea",
    "brassica oleracea var. sabauda": "brassica oleracea",
    "brassica oleracea var. sabellica": "brassica oleracea",
    "brassica rapa var. rapa": "brassica rapa",
    "brassica rapa subsp. pekinensis": "brassica rapa",
    "brassica rapa subsp. chinensis": "brassica rapa",
    "beta vulgaris var. conditiva": "beta vulgaris",
    "beta vulgaris subsp. vulgaris": "beta vulgaris",
    "beta vulgaris var. vulgaris": "beta vulgaris",
    "allium porrum": "allium cepa",
    "allium ampeloprasum": "allium cepa",
    "origanum majorana": "origanum vulgare",
    "cichorium intybus var. foliosum": "cichorium intybus",
    "petroselinum crispum var. tuberosum": "petroselinum crispum",
    "apium graveolens var. rapaceum": "apium graveolens",
    "apium graveolens var. dulce": "apium graveolens",
    "lactuca sativa var. capitata": "lactuca sativa",
    "lactuca sativa var. crispa": "lactuca sativa",
    "lactuca sativa var. longifolia": "lactuca sativa",
}

# Genus -> plant family, used when a species has to be created.
GENUS_TO_FAMILY = {
    "Solanum": "Solanaceae",
    "Capsicum": "Solanaceae",
    "Physalis": "Solanaceae",
    "Nicandra": "Solanaceae",
    "Cucumis": "Cucurbitaceae",
    "Cucurbita": "Cucurbitaceae",
    "Citrullus": "Cucurbitaceae",
    "Phaseolus": "Fabaceae",
    "Pisum": "Fabaceae",
    "Vicia": "Fabaceae",
    "Glycine": "Fabaceae",
    "Lens": "Fabaceae",
    "Lupinus": "Fabaceae",
    "Trifolium": "Fabaceae",
    "Medicago": "Fabaceae",
    "Vigna": "Fabaceae",
    "Brassica": "Brassicaceae",
    "Raphanus": "Brassicaceae",
    "Eruca": "Brassicaceae",
    "Lepidium": "Brassicaceae",
    "Nasturtium": "Brassicaceae",
    "Barbarea": "Brassicaceae",
    "Sinapis": "Brassicaceae",
    "Crambe": "Brassicaceae",
    "Diplotaxis": "Brassicaceae",
    "Allium": "Amaryllidaceae",
    "Daucus": "Apiaceae",
    "Petroselinum": "Apiaceae",
    "Apium": "Apiaceae",
    "Foeniculum": "Apiaceae",
    "Pastinaca": "Apiaceae",
    "Coriandrum": "Apiaceae",
    "Anethum": "Apiaceae",
    "Levisticum": "Apiaceae",
    "Anthriscus": "Apiaceae",
    "Carum": "Apiaceae",
    "Myrrhis": "Apiaceae",
    "Pimpinella": "Apiaceae",
    "Sanguisorba": "Rosaceae",
    "Lactuca": "Asteraceae",
    "Cichorium": "Asteraceae",
    "Cynara": "Asteraceae",
    "Helianthus": "Asteraceae",
    "Calendula": "Asteraceae",
    "Tagetes": "Asteraceae",
    "Scorzonera": "Asteraceae",
    "Tragopogon": "Asteraceae",
    "Glebionis": "Asteraceae",
    "Artemisia": "Asteraceae",
    "Beta": "Chenopodiaceae",
    "Spinacia": "Chenopodiaceae",
    "Atriplex": "Chenopodiaceae",
    "Chenopodium": "Chenopodiaceae",
    "Ocimum": "Lamiaceae",
    "Origanum": "Lamiaceae",
    "Thymus": "Lamiaceae",
    "Salvia": "Lamiaceae",
    "Melissa": "Lamiaceae",
    "Lavandula": "Lamiaceae",
    "Satureja": "Lamiaceae",
    "Hyssopus": "Lamiaceae",
    "Rosmarinus": "Lamiaceae",
    "Mentha": "Lamiaceae",
    "Zea": "Poaceae",
    "Borago": "Boraginaceae",
    "Phacelia": "Boraginaceae",
    "Valerianella": "Caprifoliaceae",
    "Tropaeolum": "Tropaeolaceae",
    "Rumex": "Polygonaceae",
    "Nigella": "Ranunculaceae",
    "Claytonia": "Montiaceae",
    "Tetragonia": "Aizoaceae",
    "Basella": "Basellaceae",
    "Plantago": "Plantaginaceae",
}


def find_or_create_species(latin_name: str) -> Optional[str]:
    """Find a species by Latin name or create it. Returns the species ID.

    Lookup order: exact cached name, cached binomial without var./subsp.,
    synonym table. If nothing matches, the species is created under the family
    given by GENUS_TO_FAMILY.

    Returns:
        The species ID, or None when the name is empty/blank, the genus is
        unknown, the family cannot be resolved, or creation fails.
    """
    # Normalise once so the cache key and the name sent to the API agree and a
    # whitespace-only name cannot reach split()[0].
    latin_name = latin_name.strip() if latin_name else ""
    if not latin_name:
        return None
    key = latin_name.lower()

    # Direct match
    if key in species_cache:
        return species_cache[key]["id"]

    # Try without subspecies/variety
    base = " ".join(key.split()[:2])
    if base in species_cache:
        return species_cache[base]["id"]

    # Handle synonyms
    syn_key = SPECIES_SYNONYMS.get(key)
    if syn_key in species_cache:
        return species_cache[syn_key]["id"]

    # Try to create the species
    genus = latin_name.split()[0]
    family_name = GENUS_TO_FAMILY.get(genus)
    if not family_name:
        print(f" WARNING: Unknown genus '{genus}' for species '{latin_name}'")
        stats["species_not_matched"].append(latin_name)
        return None

    family_id = find_or_create_family(family_name)
    if not family_id:
        return None

    print(f" Creating species: {latin_name}")
    resp, code = api_post("/species", {
        "name_scientific": latin_name,
        "family_id": family_id,
    })
    if "id" in resp:
        species_cache[key] = resp
        stats["species_created"] += 1
        return resp["id"]

    # Might already exist, reload
    print(f" Species creation returned {code}: {str(resp.get('error', ''))[:100]}")
    _refresh_species_cache()
    if key in species_cache:
        return species_cache[key]["id"]
    stats["errors"].append(f"Species creation failed: {latin_name}")
    return None


def find_or_create_family(family_name: str) -> Optional[str]:
    """Find or create a plant family. Returns the family ID.

    If the create call does not return an ID, the family list is reloaded (all
    pages) in case the family already exists.

    Returns:
        The family ID, or None when it can neither be found nor created.
    """
    key = family_name.lower()
    if key in family_cache:
        return family_cache[key]["id"]

    print(f" Creating family: {family_name}")
    resp, code = api_post("/families", {"name_scientific": family_name})
    if "id" in resp:
        family_cache[key] = resp
        stats["families_created"] += 1
        return resp["id"]

    # Reload
    _refresh_family_cache()
    if key in family_cache:
        return family_cache[key]["id"]
    stats["errors"].append(f"Family creation failed: {family_name}")
    return None


# Accented/special characters folded to ASCII before slugging.
_SLUG_TRANSLATION = str.maketrans({
    "ä": "a", "ö": "o", "ü": "u", "ß": "ss",
    "é": "e", "è": "e", "ê": "e", "ë": "e",
    "à": "a", "â": "a", "á": "a",
    "ô": "o", "ù": "u", "û": "u", "ú": "u",
    "ï": "i", "î": "i", "í": "i",
    "ç": "c", "ñ": "n", "ó": "o",
    "œ": "oe", "æ": "ae",
})


def slugify(text: str) -> str:
    """Generate a URL-safe slug.

    Lower-cases, folds the characters in _SLUG_TRANSLATION to ASCII, drops
    everything except ``a-z``, ``0-9``, whitespace and ``-``, then joins words
    with single hyphens and trims hyphens from both ends.
    """
    text = text.lower().translate(_SLUG_TRANSLATION)
    text = re.sub(r'[^a-z0-9\s-]', '', text)
    text = re.sub(r'[\s]+', '-', text.strip())
    text = re.sub(r'-+', '-', text)
    return text.strip('-')


def find_existing_cultivar(species_name: str, variety_name: str, species_id: str) -> Optional[str]:
    """Check if a cultivar already exists. Returns the cultivar ID or None.

    Matches first on the slug of "<species_name> <variety_name>", then on a
    case-insensitive name match among cached cultivars of ``species_id``.
    """
    expected_slug = slugify(f"{species_name} {variety_name}")

    # Direct slug match
    if expected_slug in cultivar_cache:
        return cultivar_cache[expected_slug]["id"]

    # Check for name match in same species
    variety_lower = variety_name.lower()
    for data in cultivar_cache.values():
        if data["species_id"] == species_id and data["name"].lower() == variety_lower:
            return data["id"]
    return None


def scrape_category(cat_path: str, default_species: Optional[str]):
    """Scrape a single category page and process all its products.

    Args:
        cat_path: Category path below ``/de/bio-saatgut/`` without ``.html``.
        default_species: Species used for products whose detail page yields no
            botanical name; None means such products are skipped.
    """
    url = f"{SITE_BASE}/de/bio-saatgut/{cat_path}.html"
    print(f"\n{'='*60}")
    print(f"Category: {cat_path}")

    html = fetch_page(url)
    if not html:
        print(" SKIP: Page not found (404)")
        return
    time.sleep(DELAY)

    products = parse_product_links(html)
    print(f" Found {len(products)} products")
    stats["products_found"] += len(products)
    stats["categories_scraped"] += 1

    for prod_url, prod_name in products:
        process_product(prod_url, prod_name, default_species)


def process_product(prod_url: str, prod_name: str, default_species: Optional[str]):
    """Process a single product: fetch detail, extract data, create cultivar.

    Resolves the species (name scraped from the detail page, else
    ``default_species``), creates the cultivar unless it already exists, and
    links it to the Bingenheimer supplier. Problems are printed and recorded in
    ``stats``; a failed detail-page fetch is not fatal.
    """
    article_number = extract_article_number(prod_name, prod_url)
    variety_name = extract_variety_name(prod_name)
    if not variety_name:
        print(f" SKIP (no variety): {prod_name}")
        return

    # Skip mixes, sets, bundles
    skip_keywords = ["mischung", "saatscheibe", "saatband", "saatplatte", "saat-set",
                     " mix ", "trio ", "quartett", "gutschein", "buch ", "düngung",
                     "erde ", "-garten"]
    name_lower = prod_name.lower()
    # Exception: if the variety name itself is the whole thing, keep it
    if any(kw in name_lower for kw in skip_keywords) and variety_name.lower() != prod_name.lower():
        # Only skip if it really seems like a mix
        if "mischung" in name_lower or "mix" in name_lower or "trio" in name_lower:
            print(f" SKIP (mix/set): {prod_name}")
            return

    print(f"\n Product: {prod_name}")
    print(f" Variety: {variety_name}, SKU: {article_number}")

    # Fetch detail page; on failure fall through with default_species only.
    latin_name = None
    description = ""
    time.sleep(DELAY)
    try:
        detail_html = fetch_page(prod_url)
        stats["detail_pages_fetched"] += 1
        if detail_html:
            latin_name = extract_latin_from_detail(detail_html)
            description = extract_description_from_detail(detail_html)
    except Exception as e:
        print(f" WARNING: Detail page error: {e}")

    species_name = latin_name or default_species
    if not species_name:
        print(f" SKIP: No species for '{prod_name}'")
        stats["species_not_matched"].append(prod_name)
        return
    print(f" Species: {species_name}")

    species_id = find_or_create_species(species_name)
    if not species_id:
        print(f" SKIP: Could not resolve species '{species_name}'")
        return

    # Check if cultivar already exists
    cultivar_id = find_existing_cultivar(species_name, variety_name, species_id)
    if cultivar_id:
        print(" EXISTS: cultivar already in DB")
        stats["cultivars_existed"] += 1
    else:
        # Create cultivar
        data = {
            "species_id": species_id,
            "name": variety_name,
            "name_de": variety_name,
            "is_organic": True,
        }
        if description:
            data["description"] = description

        resp, code = api_post("/cultivars", data)
        if "id" in resp:
            cultivar_id = resp["id"]
            cultivar_cache[resp["slug"]] = {
                "id": resp["id"],
                "name": variety_name,
                "species_id": species_id,
            }
            stats["cultivars_created"] += 1
            print(f" CREATED: {resp['slug']}")
        elif code == 500 and "Database error" in str(resp.get("error", "")):
            # Likely slug conflict - try to find existing
            print(" DB conflict - searching for existing cultivar...")
            # Re-read cultivars page by page, refreshing the cache, and stop
            # after the page on which a same-species name match is found.
            for rows in _iter_pages("/cultivars"):
                for record in rows:
                    _cache_cultivar(record)
                    if (record["species_id"] == species_id
                            and record["name"].lower() == variety_name.lower()):
                        cultivar_id = record["id"]
                if cultivar_id:
                    break
            if cultivar_id:
                print(f" Found existing after conflict: {cultivar_id}")
                stats["cultivars_existed"] += 1
            else:
                print(" ERROR: DB error and could not find existing cultivar")
                stats["errors"].append(f"DB error + not found: {species_name} / {variety_name}")
                return
        else:
            print(f" ERROR ({code}): {str(resp.get('error', ''))[:100]}")
            stats["errors"].append(
                f"Create failed: {variety_name}: {str(resp.get('error', ''))[:80]}"
            )
            return

    # Link to supplier
    if cultivar_id and supplier_id:
        link_data = {
            "supplier_id": supplier_id,
            "product_url": prod_url,
        }
        if article_number:
            link_data["article_number"] = article_number

        resp, code = api_post(f"/cultivars/{cultivar_id}/suppliers", link_data)
        if "id" in resp:
            stats["supplier_links_created"] += 1
            print(f" LINKED (SKU: {article_number})")
        # NOTE(review): every HTTP 500 is counted as "link already exists";
        # presumably the API reports duplicate links that way - confirm.
        elif code == 500 or "already" in str(resp.get("error", "")).lower():
            stats["supplier_links_existed"] += 1
            print(" LINK EXISTS")
        else:
            print(f" LINK ERROR ({code}): {str(resp.get('error', ''))[:80]}")
            stats["errors"].append(
                f"Link failed: {variety_name}: {str(resp.get('error', ''))[:60]}"
            )


def main():
    """Run the full scrape/import and print a summary.

    Returns:
        0 when no errors were recorded in ``stats["errors"]``, otherwise 1.
    """
    print("=" * 60)
    print("Bingenheimer Saatgut Scraper for HerbAPI")
    print("=" * 60)

    load_api_data()

    print(f"\nScraping {len(ALL_CATEGORIES)} categories...")
    for cat_path, default_species in ALL_CATEGORIES:
        # Top-level boundary: one failing category must not abort the run.
        try:
            scrape_category(cat_path, default_species)
        except Exception as e:
            print(f" ERROR in category {cat_path}: {e}")
            stats["errors"].append(f"Category error: {cat_path}: {e}")

    # Summary
    print("\n" + "=" * 60)
    print("SCRAPING COMPLETE - SUMMARY")
    print("=" * 60)
    print(f"Categories scraped: {stats['categories_scraped']}")
    print(f"Products found: {stats['products_found']}")
    print(f"Detail pages fetched: {stats['detail_pages_fetched']}")
    print(f"Cultivars created: {stats['cultivars_created']}")
    print(f"Cultivars existed: {stats['cultivars_existed']}")
    print(f"Supplier links created: {stats['supplier_links_created']}")
    print(f"Supplier links existed: {stats['supplier_links_existed']}")
    print(f"Species created: {stats['species_created']}")
    print(f"Families created: {stats['families_created']}")
    print(f"Errors: {len(stats['errors'])}")

    if stats["species_not_matched"]:
        print(f"\nUnmatched species ({len(stats['species_not_matched'])}):")
        for s in stats["species_not_matched"][:30]:
            print(f" - {s}")

    if stats["errors"]:
        print(f"\nErrors ({len(stats['errors'])}):")
        for e in stats["errors"][:30]:
            print(f" - {e}")

    return 0 if not stats["errors"] else 1


if __name__ == "__main__":
    sys.exit(main())