#!/usr/bin/env python3
"""
Scraper for Bingenheimer Saatgut (https://www.bingenheimersaatgut.de/)
Extracts cultivar data and imports into HerbAPI.
Categories scraped: Gemüse (vegetables), Kräuter (herbs), Gründüngung (green manure).
"""
import json
import os
import re
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from html.parser import HTMLParser
from typing import Optional
# ── Configuration ─────────────────────────────────────────────────────────
# The HerbAPI endpoint and bearer token can be overridden through the
# HERBAPI_BASE / HERBAPI_TOKEN environment variables so that credentials do
# not have to be edited into (or committed with) this file. The defaults keep
# the previous hard-coded values, so existing invocations behave the same.
# NOTE(review): the default token below is a live-looking secret in source
# control -- consider rotating it and dropping the default.
API_BASE = os.environ.get(
    "HERBAPI_BASE", "http://herbapi01.corp.sub-net.at:8080/api/v1"
)
API_TOKEN = os.environ.get(
    "HERBAPI_TOKEN", "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
)
# Root of the shop that is scraped; relative product links are joined to it.
SITE_BASE = "https://www.bingenheimersaatgut.de"
# Pause in seconds inserted between page fetches.
DELAY = 0.5
# Sent as the User-Agent header on every request to the shop.
USER_AGENT = "HerbAPI-Scraper/1.0 (+https://sub-net.at)"
# ── Category URLs to scrape ───────────────────────────────────────────────
# (url_path, default_species_scientific_name)
# Each entry pairs a category path (relative to /de/bio-saatgut/, without the
# ".html" suffix) with the scientific name used as the species when a product
# detail page does not yield one. None means "no default species".
VEGETABLE_CATEGORIES = [
    # Fruiting vegetables
    ("gemuese/tomaten", "Solanum lycopersicum"),
    ("gemuese/gurken/gewuerzgurke", "Cucumis sativus"),
    ("gemuese/gurken/salatgurken", "Cucumis sativus"),
    ("gemuese/aubergine", "Solanum melongena"),
    # Beans
    ("gemuese/bohnen/buschbohne", "Phaseolus vulgaris"),
    ("gemuese/bohnen/stangenbohne", "Phaseolus vulgaris"),
    ("gemuese/bohnen/dicke-bohne", "Vicia faba"),
    ("gemuese/bohnen/feuerbohne", "Phaseolus coccineus"),
    ("gemuese/bohnen/edamame-sojabohne", "Glycine max"),
    ("gemuese/bohnen/spaghettibohne", "Vigna unguiculata"),
    # Peas
    ("gemuese/erbsen/markerbse", "Pisum sativum"),
    ("gemuese/erbsen/schalerbse", "Pisum sativum"),
    ("gemuese/erbsen/zuckererbse", "Pisum sativum"),
    ("gemuese/feldsalat", "Valerianella locusta"),
    ("gemuese/knollenfenchel", "Foeniculum vulgare"),
    # Cabbages
    ("gemuese/kohl/blumenkohl", "Brassica oleracea"),
    ("gemuese/kohl/brokkoli", "Brassica oleracea"),
    ("gemuese/kohl/chinakohlpak-choi", "Brassica rapa"),
    ("gemuese/kohl/gruenkohl", "Brassica oleracea"),
    ("gemuese/kohl/kohlrabi", "Brassica oleracea"),
    ("gemuese/kohl/rotkohl", "Brassica oleracea"),
    ("gemuese/kohl/weisskohl", "Brassica oleracea"),
    ("gemuese/kohl/wirsing", "Brassica oleracea"),
    ("gemuese/kohl/rosenkohl", "Brassica oleracea"),
    ("gemuese/kresse", "Lepidium sativum"),
    ("gemuese/kuerbis", "Cucurbita maxima"),
    ("gemuese/zuckermais", "Zea mays"),
    ("gemuese/mangold", "Beta vulgaris"),
    ("gemuese/melone", "Cucumis melo"),
    ("gemuese/moehren", "Daucus carota"),
    # Peppers
    ("gemuese/paprika/gemuesepaprika", "Capsicum annuum"),
    ("gemuese/paprika/chili", "Capsicum annuum"),
    ("gemuese/pastinaken", "Pastinaca sativa"),
    ("gemuese/petersilienwurzel", "Petroselinum crispum"),
    ("gemuese/physalis", "Physalis peruviana"),
    ("gemuese/porreelauch", "Allium porrum"),
    ("gemuese/radies", "Raphanus sativus"),
    ("gemuese/rettich", "Raphanus sativus"),
    ("gemuese/rote-bete", "Beta vulgaris"),
    ("gemuese/rueben/mai-herbstruebennavets", "Brassica rapa"),
    ("gemuese/rueben/kohlruebe", "Brassica napus"),
    ("gemuese/rucola", "Eruca vesicaria"),
    # Lettuces and salads
    ("gemuese/salat/bataviasalat", "Lactuca sativa"),
    ("gemuese/salat/eichblattsalat", "Lactuca sativa"),
    ("gemuese/salat/eissalat", "Lactuca sativa"),
    ("gemuese/salat/endivien", "Cichorium endivia"),
    ("gemuese/salat/hirschhornwegerich", "Plantago coronopus"),
    ("gemuese/salat/kopfsalat", "Lactuca sativa"),
    ("gemuese/salat/lollosalat", "Lactuca sativa"),
    ("gemuese/salat/romanasalat", "Lactuca sativa"),
    ("gemuese/salat/baby-leaf", "Lactuca sativa"),
    # Celery
    ("gemuese/sellerie/knollensellerie", "Apium graveolens"),
    ("gemuese/sellerie/stangen--bleichsellerie", "Apium graveolens"),
    # Spinach and spinach-like
    ("gemuese/spinatspinat-aehnliche/spinat", "Spinacia oleracea"),
    ("gemuese/spinatspinat-aehnliche/neuseelaender-spinat", "Tetragonia tetragonioides"),
    ("gemuese/blattstielgemuese", "Beta vulgaris"),
    ("gemuese/zwiebeln", "Allium cepa"),
    ("gemuese/lauchzwiebeln", "Allium fistulosum"),
    ("gemuese/artischocke", "Cynara cardunculus"),
    ("gemuese/asia-salate", "Brassica juncea"),
    ("gemuese/chicoree", "Cichorium intybus"),
    ("gemuese/schwarz-haferwurzel", "Scorzonera hispanica"),
    ("gemuese/winterpostelein", "Claytonia perfoliata"),
    ("gemuese/zucchini", "Cucurbita pepo"),
    ("gemuese/catalogna", "Cichorium intybus"),
    ("gemuese/zichoriensalate", "Cichorium intybus"),
]

HERB_CATEGORIES = [
    ("kraeuter/basilikum", "Ocimum basilicum"),
    ("kraeuter/bohnenkraut", "Satureja hortensis"),
    ("kraeuter/borretsch", "Borago officinalis"),
    ("kraeuter/dill", "Anethum graveolens"),
    ("kraeuter/kuemmel", "Carum carvi"),
    ("kraeuter/kerbel", "Anthriscus cerefolium"),
    ("kraeuter/koriander", "Coriandrum sativum"),
    ("kraeuter/gewuerzfenchel", "Foeniculum vulgare"),
    ("kraeuter/kultursauerampfer", "Rumex acetosa"),
    ("kraeuter/lavendel", "Lavandula angustifolia"),
    ("kraeuter/liebstock", "Levisticum officinale"),
    ("kraeuter/majoran", "Origanum majorana"),
    ("kraeuter/oregano", "Origanum vulgare"),
    ("kraeuter/pimpinelle", "Sanguisorba minor"),
    ("kraeuter/estragon", "Artemisia dracunculus"),
    ("kraeuter/salbei", "Salvia officinalis"),
    ("kraeuter/schnittlauch", "Allium schoenoprasum"),
    ("kraeuter/schnittknoblauch", "Allium tuberosum"),
    ("kraeuter/schwarzkuemmel", "Nigella sativa"),
    ("kraeuter/speisechrysantheme", "Glebionis coronaria"),
    ("kraeuter/thymian", "Thymus vulgaris"),
    ("kraeuter/ysop", "Hyssopus officinalis"),
    ("kraeuter/winterkresse", "Barbarea vulgaris"),
    ("kraeuter/brunnenkresse", "Nasturtium officinale"),
    ("kraeuter/melisse", "Melissa officinalis"),
    ("kraeuter/petersilie", "Petroselinum crispum"),
    ("kraeuter/schnittsellerie", "Apium graveolens"),
    ("kraeuter/beifuss", "Artemisia vulgaris"),
]

# Green manure spans many species, so there is no default to fall back on.
GREEN_MANURE_CATEGORIES = [
    ("gruenduengung", None),
]

# Scrape order: vegetables, then herbs, then green manure.
ALL_CATEGORIES = [
    *VEGETABLE_CATEGORIES,
    *HERB_CATEGORIES,
    *GREEN_MANURE_CATEGORIES,
]
# ── Stats ─────────────────────────────────────────────────────────────────
# Run-wide counters, all starting at zero; they are incremented by the
# scraping functions and printed by main().
stats = dict.fromkeys(
    (
        "categories_scraped",
        "products_found",
        "detail_pages_fetched",
        "cultivars_created",
        "cultivars_existed",
        "supplier_links_created",
        "supplier_links_existed",
        "species_created",
        "families_created",
    ),
    0,
)
# Free-text collections: names that could not be resolved to a species, and
# human-readable error messages.
stats.update(species_not_matched=[], errors=[])
# ── HTTP helpers ──────────────────────────────────────────────────────────
def fetch_page(url: str) -> str:
    """Fetch a web page and return its body as text.

    The request carries the scraper's User-Agent header. The body is decoded
    with the charset announced in the response's Content-Type header, falling
    back to UTF-8 when none is given or the announced codec is unknown;
    undecodable bytes are replaced rather than raising.

    Returns:
        The decoded page, or "" when the server answers 404.

    Raises:
        urllib.error.HTTPError: for any HTTP error status other than 404.
        urllib.error.URLError: on connection-level failures (propagated
            from urlopen).
    """
    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            raw = resp.read()
            charset = resp.headers.get_content_charset() or "utf-8"
    except urllib.error.HTTPError as e:
        if e.code == 404:
            # A missing page is an expected outcome; callers test for "".
            return ""
        raise
    try:
        return raw.decode(charset, errors="replace")
    except LookupError:
        # The server announced a codec Python does not know.
        return raw.decode("utf-8", errors="replace")
def api_get(path: str, params: dict = None) -> dict:
    """Issue an authenticated GET against HerbAPI and return the parsed JSON.

    Args:
        path: Endpoint path appended to API_BASE (e.g. "/species").
        params: Optional query parameters; URL-encoded when non-empty.

    Raises:
        urllib.error.HTTPError / URLError: propagated from urlopen.
    """
    query = "?" + urllib.parse.urlencode(params) if params else ""
    request = urllib.request.Request(
        f"{API_BASE}{path}" + query,
        headers={
            "Authorization": f"Bearer {API_TOKEN}",
            "Accept": "application/json",
        },
    )
    with urllib.request.urlopen(request, timeout=30) as response:
        return json.loads(response.read())
def api_post(path: str, data: dict) -> tuple:
    """Send *data* as a JSON POST to HerbAPI.

    Returns:
        A ``(payload, status_code)`` tuple. On success the payload is the
        parsed JSON response. On an HTTP error status the payload is
        ``{"error": <response body text>, "_status": <code>}`` instead of an
        exception being raised.
    """
    headers = {
        "Authorization": f"Bearer {API_TOKEN}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    }
    request = urllib.request.Request(
        f"{API_BASE}{path}",
        data=json.dumps(data).encode("utf-8"),
        method="POST",
        headers=headers,
    )
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            return json.loads(response.read()), response.status
    except urllib.error.HTTPError as exc:
        # Hand the error body back to the caller, which inspects it.
        message = exc.read().decode("utf-8", errors="replace")
        return {"error": message, "_status": exc.code}, exc.code
# ── HTML parsing helpers ──────────────────────────────────────────────────
def parse_product_links(html: str) -> list:
    """Extract ``(url, name)`` pairs for products from a category listing page.

    Two regex strategies are tried in order:

    1. Anchors carrying a ``product-item-link`` class whose ``href`` points
       below ``/de/bio-saatgut/`` (the anchor text, with nested tags removed,
       is the product name).
    2. Only if (1) found nothing: any ``href`` below the gemuese / kraeuter /
       gruenduengung sections that is followed by at least three characters
       of plain text.

    Relative URLs are prefixed with SITE_BASE. The result is de-duplicated by
    URL, keeping the first name seen, and preserves document order.
    """
    def absolute(url: str) -> str:
        return url if url.startswith("http") else SITE_BASE + url

    links = []

    # The anchor opener and closer delimit the name group; without the
    # closing </a> the lazy group would always match the empty string.
    # NOTE(review): this requires href to appear before class inside the tag.
    primary = re.compile(
        r'<a[^>]+href="([^"]*?/de/bio-saatgut/[^"]+?)"[^>]*'
        r'class="[^"]*product-item-link[^"]*"[^>]*>\s*(.*?)\s*</a>',
        re.DOTALL | re.IGNORECASE,
    )
    for match in primary.finditer(html):
        name = re.sub(r'<[^>]+>', '', match.group(2)).strip()
        if name:
            links.append((absolute(match.group(1)), name))

    if not links:
        # Broader pattern for product detail links.
        fallback = re.compile(
            r'href="([^"]*?/de/bio-saatgut/(?:gemuese|kraeuter|gruenduengung)/[^"]+?/[^"/.]+)"[^>]*>\s*([^<]{3,})',
            re.IGNORECASE,
        )
        for match in fallback.finditer(html):
            url = match.group(1).strip()
            name = match.group(2).strip()
            # Category pages end in ".html"; only product pages are wanted.
            if name and not url.endswith(".html"):
                links.append((absolute(url), name))

    # De-duplicate by URL; dicts keep insertion order and setdefault keeps
    # the first name encountered for each URL.
    unique = {}
    for url, name in links:
        unique.setdefault(url, name)
    return list(unique.items())
def extract_latin_from_detail(html: str) -> Optional[str]:
    """Extract the botanical (Latin) name from a product detail page.

    Three patterns are tried in order; for each, only the first match in the
    document is considered:

    1. A binomial (optionally with ``var.``/``subsp.`` epithet) that is the
       entire content of an ``<em>`` or ``<i>`` element.
    2. A binomial directly following an element whose class mentions
       botanical / latin / species.
    3. A binomial following a "Botanischer Name" / "Lateinischer Name" /
       "Art" label.

    Matching is case-insensitive, so a candidate is only accepted when its
    first word starts with an upper-case letter and its second word with a
    lower-case letter.

    Returns:
        The name as found in the page, or None when no pattern yields an
        acceptable candidate.
    """
    patterns = [
        # \b keeps <img ...> / <input ...> from being read as an <i> tag, and
        # the explicit closing tag anchors the name to the whole element.
        r'<(?:em|i)\b[^>]*>\s*([A-Z][a-z]+\s+[a-z]{2,}(?:\s+(?:var\.|subsp\.)\s+[a-z]+)?)\s*</(?:em|i)>',
        r'class="[^"]*(?:botanical|latin|species)[^"]*"[^>]*>\s*([A-Z][a-z]+\s+[a-z]{2,})',
        r'(?:Botanischer?\s+Name|Lateinischer?\s+Name|Art)\s*:?\s*(?:<[^>]+>)*\s*([A-Z][a-z]+\s+[a-z]{2,})',
    ]
    for pat in patterns:
        m = re.search(pat, html, re.IGNORECASE)
        if not m:
            continue
        name = m.group(1).strip()
        parts = name.split()
        # IGNORECASE disables the case distinction inside the regex, so the
        # "Genus species" capitalisation is checked here instead.
        if len(parts) >= 2 and parts[0][0].isupper() and parts[1][0].islower():
            return name
    return None
def extract_description_from_detail(html: str) -> str:
    """Extract the product description text from a detail page.

    Three container patterns are tried in order: a ``<div>`` whose class
    contains "product-description" / "product description", a ``<div>`` whose
    class contains "beschreibung", and an element carrying
    ``data-content-type="description"``. The captured markup is reduced to
    plain text (tags replaced by spaces, whitespace collapsed).

    A candidate is accepted only if its text is longer than 20 characters;
    otherwise the next pattern is tried.

    Returns:
        The description truncated to 2000 characters, or "" if none is found.
    """
    # NOTE(review): the container tag is assumed to be <div>; each capture
    # stops at the first closing </div>, so a description that itself
    # contains nested <div> elements is cut short at the first inner one.
    desc_patterns = [
        r'<div[^>]*class="[^"]*product[- ]description[^"]*"[^>]*>(.*?)</div>',
        r'<div[^>]*class="[^"]*beschreibung[^"]*"[^>]*>(.*?)</div>',
        r'data-content-type="description"[^>]*>(.*?)</div>',
    ]
    for pat in desc_patterns:
        m = re.search(pat, html, re.DOTALL | re.IGNORECASE)
        if not m:
            continue
        text = re.sub(r'<[^>]+>', ' ', m.group(1))
        text = re.sub(r'\s+', ' ', text).strip()
        if len(text) > 20:
            return text[:2000]
    return ""
def extract_article_number(product_name: str, url: str) -> Optional[str]:
    """Derive the supplier's article number for a product.

    The product name is checked first for a parenthesised code such as
    "(G802)" or "(G 802)"; spaces inside the code are removed. Failing that,
    the last path segment of *url* is checked for a trailing "-g802"-style
    suffix, which is returned upper-cased.

    Returns:
        The article number, or None if neither source contains one.
    """
    in_name = re.search(r'\(([A-Z]\s*\d+[A-Z]?)\)', product_name)
    if in_name:
        return in_name.group(1).replace(" ", "")

    last_segment = url.rstrip("/").rsplit("/", 1)[-1]
    in_slug = re.search(r'-([a-z]\d+[a-z]?)$', last_segment, re.IGNORECASE)
    return in_slug.group(1).upper() if in_slug else None
# Trailing article number in parentheses, e.g. " (G802)".
_ARTICLE_SUFFIX = re.compile(r'\s*\([A-Z]\s*\d+[A-Z]?\)\s*$')

# German crop-type prefixes that precede the cultivar name in product titles.
# Each is anchored at the start of the string and matched case-insensitively.
_VARIETY_PREFIXES = tuple(
    re.compile(r'^' + prefix, re.IGNORECASE)
    for prefix in (
        # Tomatoes
        r'(?:Normal(?:früchtige)?|Fleisch|Cherry|Balkon|Wild|Freiland|Roma|Ochsenherz|'
        r'Cocktail|Dattel|Mini|Snack|Stab|Busch|Salat|Zwerg)[\s-]*[Tt]omate\s+',
        # Beans
        r'(?:Busch|Stangen|Dicke|Feuer|Spaghetti)[\s-]*[Bb]ohne\s+',
        r'Edamame(?:-Sojabohne)?\s+',
        # Peas
        r'(?:Mark|Schal|Zucker|Pal)[\s-]*[Ee]rbse\s+',
        # Cucurbits
        r'(?:Salat|Einlege|Gewürz|Freiland|Schlangen)[\s-]*[Gg]urke\s+',
        r'Zucchini\s+',
        r'Kürbis\s+',
        r'(?:Wasser)?[Mm]elone\s+',
        # Brassicas
        r'(?:Blumen|Grün|Rot|Weiß|Rosen)[\s-]*[Kk]ohl\s+',
        r'Kohlrabi\s+',
        r'Wirsing\s+',
        r'Brokkoli\s+',
        r'Chinakohl\s+',
        r'Pak\s+Choi\s+',
        r'Kohlrübe\s+',
        r'Mai-/Herbstrüben?(?:/Navets)?\s+',
        # Root vegetables
        r'Möhre\s+',
        r'Karotten?(?:\s*-?\s*Mix)?\s+',
        r'Pastinake\s+',
        r'Radies(?:chen)?\s+',
        r'Rettich\s+',
        r'Schwarzwurzel\s+',
        r'Haferwurzel\s+',
        r'Petersilienwurzel\s+',
        # Beets
        r'(?:Rote|Gelbe|Weiße)\s+Bete?\s+',
        r'Mangold\s+',
        # Lettuce & leafy
        r'(?:Kopf|Eichblatt|Batavia|Eis|Lollo|Romana|Baby-Leaf)[\s-]*[Ss]alat\s+',
        r'Feldsalat\s+',
        r'Endivie\s+',
        r'Asia[\s-]*Salat\s+',
        r'Spinat\s+',
        # Alliums
        r'Zwiebel\s+',
        r'Lauchzwiebel\s+',
        r'Porree(?:/Lauch)?\s+',
        r'Schnittlauch\s+',
        r'Schnittknoblauch\s+',
        # Peppers
        r'(?:Gemüse|Block|Spitz|Papier)[\s-]*[Pp]aprika\s+',
        r'Chili\s+',
        # Celery
        r'(?:Knollen|Stangen|Bleich|Schnitt)[\s-]*[Ss]ellerie\s+',
        # Herbs
        r'Basilikum\s+',
        r'Koriander\s+',
        r'Dill\s+',
        r'Petersilie\s+',
        r'(?:Knollen|Gewürz)[\s-]*[Ff]enchel\s+',
        r'Salbei\s+',
        r'Thymian\s+',
        r'Oregano\s+',
        r'Lavendel\s+',
        r'Melisse\s+',
        r'Majoran\s+',
        r'Estragon\s+',
        r'Kresse\s+',
        r'Bohnenkraut\s+',
        r'Borretsch\s+',
        r'Kümmel\s+',
        r'Kerbel\s+',
        r'Liebstock\s+',
        r'Ysop\s+',
        r'Pimpinelle\s+',
        r'Beifuß\s+',
        r'Schwarzkümmel\s+',
        # Other
        r'Zuckermais\s+',
        r'Artischocke\s+',
        r'Physalis\s+',
        r'Aubergine\s+',
        r'Catalogna\s+',
    )
)


def extract_variety_name(product_name: str) -> str:
    """Reduce a shop product title to the bare cultivar name.

    The title is trimmed, a trailing article number such as "(G802)" is
    dropped, and every known crop-type prefix is removed from the front. The
    prefixes are applied one after another in table order, so more than one
    may be stripped from the same title. Surrounding whitespace and quote
    characters are removed from the result.

    A prefix is only removed when it is followed by whitespace, so a title
    consisting of the crop name alone is returned unchanged.
    """
    cleaned = _ARTICLE_SUFFIX.sub('', product_name.strip())
    for prefix in _VARIETY_PREFIXES:
        cleaned = prefix.sub('', cleaned)
    return cleaned.strip().strip("'\"")
# ── API data caches ───────────────────────────────────────────────────────
# In-memory mirrors of HerbAPI records, filled by load_api_data() and kept
# up to date as new records are created during the run.
# scientific_name_lower -> {id, name_scientific, ...}
species_cache: dict = {}
# name_scientific_lower -> {id, name_scientific}
family_cache: dict = {}
# slug -> {id, name, species_id, ...}
cultivar_cache: dict = {}
# HerbAPI id of the Bingenheimer supplier record; set by load_api_data().
supplier_id = None
def _iter_api_records(path: str):
    """Yield every record of a paginated HerbAPI collection.

    Pages of 100 are requested starting at page 1; a page holding fewer than
    100 records is treated as the last one.
    """
    page = 1
    while True:
        batch = api_get(path, {"per_page": 100, "page": page})["data"]
        yield from batch
        if len(batch) < 100:
            return
        page += 1


def load_api_data():
    """Prime the module caches from HerbAPI and resolve the supplier record.

    Fills family_cache, species_cache and cultivar_cache, then looks for a
    supplier whose name contains "bingenheimer" (case-insensitive). If none
    exists, one is created; if that creation fails the process exits with
    status 1. Sets the module-level ``supplier_id``.
    """
    global supplier_id
    print("Loading existing HerbAPI data...")

    # Families, keyed by lower-cased scientific name.
    for family in _iter_api_records("/families"):
        family_cache[family["name_scientific"].lower()] = family
    print(f" Loaded {len(family_cache)} families")

    # Species, keyed by lower-cased scientific name.
    for species in _iter_api_records("/species"):
        species_cache[species["name_scientific"].lower()] = species
    print(f" Loaded {len(species_cache)} species")

    # Cultivars, keyed by slug; only the fields needed for matching are kept.
    for cultivar in _iter_api_records("/cultivars"):
        cultivar_cache[cultivar["slug"]] = {
            "id": cultivar["id"],
            "name": cultivar["name"],
            "species_id": cultivar["species_id"],
        }
    print(f" Loaded {len(cultivar_cache)} cultivars")

    # Reuse an existing Bingenheimer supplier if there is one.
    for supplier in api_get("/suppliers"):
        if "bingenheimer" in supplier["name"].lower():
            supplier_id = supplier["id"]
            print(f" Found existing supplier: {supplier['name']} ({supplier['id']})")
            break
    if supplier_id:
        return

    print(" Creating Bingenheimer Saatgut supplier...")
    created, _status = api_post("/suppliers", {
        "name": "Bingenheimer Saatgut",
        "url": "https://www.bingenheimersaatgut.de",
        "country": "DE",
        "is_organic": True,
        "is_demeter": True,
        "notes": "German biodynamic seed company, Demeter certified, open-pollinated varieties"
    })
    if "id" not in created:
        # Without a supplier no product can be linked, so give up.
        print(f" ERROR creating supplier: {created}")
        sys.exit(1)
    supplier_id = created["id"]
    print(f" Created supplier: {created['id']}")
# Alternative scientific names (lower-case) -> the name to look up instead.
# Only used for lookups in species_cache; a species that has to be created
# is created under the name that was passed in.
_SPECIES_SYNONYMS = {
    "lycopersicon esculentum": "solanum lycopersicum",
    "capsicum annuum var. annuum": "capsicum annuum",
    "brassica oleracea var. botrytis": "brassica oleracea",
    "brassica oleracea var. italica": "brassica oleracea",
    "brassica oleracea var. gemmifera": "brassica oleracea",
    "brassica oleracea var. gongylodes": "brassica oleracea",
    "brassica oleracea var. capitata": "brassica oleracea",
    "brassica oleracea var. sabauda": "brassica oleracea",
    "brassica oleracea var. sabellica": "brassica oleracea",
    "brassica rapa var. rapa": "brassica rapa",
    "brassica rapa subsp. pekinensis": "brassica rapa",
    "brassica rapa subsp. chinensis": "brassica rapa",
    "beta vulgaris var. conditiva": "beta vulgaris",
    "beta vulgaris subsp. vulgaris": "beta vulgaris",
    "beta vulgaris var. vulgaris": "beta vulgaris",
    # Leek is filed under either of these two names. It used to be mapped to
    # "allium cepa" (onion), which attached leek cultivars to the wrong
    # species whenever no leek species existed yet.
    "allium porrum": "allium ampeloprasum",
    "allium ampeloprasum": "allium porrum",
    # Marjoram is a species of its own and must not be folded into oregano
    # ("origanum vulgare"); only its older genus name is treated as a synonym.
    "majorana hortensis": "origanum majorana",
    "cichorium intybus var. foliosum": "cichorium intybus",
    "petroselinum crispum var. tuberosum": "petroselinum crispum",
    "apium graveolens var. rapaceum": "apium graveolens",
    "apium graveolens var. dulce": "apium graveolens",
    "lactuca sativa var. capitata": "lactuca sativa",
    "lactuca sativa var. crispa": "lactuca sativa",
    "lactuca sativa var. longifolia": "lactuca sativa",
}

# Genus -> plant family used when a species has to be created.
_GENUS_TO_FAMILY = {
    "Solanum": "Solanaceae", "Capsicum": "Solanaceae", "Physalis": "Solanaceae",
    "Nicandra": "Solanaceae",
    "Cucumis": "Cucurbitaceae", "Cucurbita": "Cucurbitaceae", "Citrullus": "Cucurbitaceae",
    "Phaseolus": "Fabaceae", "Pisum": "Fabaceae", "Vicia": "Fabaceae",
    "Glycine": "Fabaceae", "Lens": "Fabaceae", "Lupinus": "Fabaceae",
    "Trifolium": "Fabaceae", "Medicago": "Fabaceae", "Vigna": "Fabaceae",
    "Brassica": "Brassicaceae", "Raphanus": "Brassicaceae", "Eruca": "Brassicaceae",
    "Lepidium": "Brassicaceae", "Nasturtium": "Brassicaceae", "Barbarea": "Brassicaceae",
    "Sinapis": "Brassicaceae", "Crambe": "Brassicaceae", "Diplotaxis": "Brassicaceae",
    "Allium": "Amaryllidaceae",
    "Daucus": "Apiaceae", "Petroselinum": "Apiaceae", "Apium": "Apiaceae",
    "Foeniculum": "Apiaceae", "Pastinaca": "Apiaceae", "Coriandrum": "Apiaceae",
    "Anethum": "Apiaceae", "Levisticum": "Apiaceae", "Anthriscus": "Apiaceae",
    "Carum": "Apiaceae", "Myrrhis": "Apiaceae", "Pimpinella": "Apiaceae",
    "Sanguisorba": "Rosaceae",
    "Lactuca": "Asteraceae", "Cichorium": "Asteraceae", "Cynara": "Asteraceae",
    "Helianthus": "Asteraceae", "Calendula": "Asteraceae", "Tagetes": "Asteraceae",
    "Scorzonera": "Asteraceae", "Tragopogon": "Asteraceae", "Glebionis": "Asteraceae",
    "Artemisia": "Asteraceae",
    "Beta": "Chenopodiaceae", "Spinacia": "Chenopodiaceae",
    "Atriplex": "Chenopodiaceae", "Chenopodium": "Chenopodiaceae",
    "Ocimum": "Lamiaceae", "Origanum": "Lamiaceae", "Thymus": "Lamiaceae",
    "Salvia": "Lamiaceae", "Melissa": "Lamiaceae", "Lavandula": "Lamiaceae",
    "Satureja": "Lamiaceae", "Hyssopus": "Lamiaceae", "Rosmarinus": "Lamiaceae",
    "Mentha": "Lamiaceae",
    "Zea": "Poaceae",
    "Borago": "Boraginaceae", "Phacelia": "Boraginaceae",
    "Valerianella": "Caprifoliaceae",
    "Tropaeolum": "Tropaeolaceae",
    "Rumex": "Polygonaceae",
    "Nigella": "Ranunculaceae",
    "Claytonia": "Montiaceae",
    "Tetragonia": "Aizoaceae",
    "Basella": "Basellaceae",
    "Plantago": "Plantaginaceae",
}


def find_or_create_species(latin_name: str) -> Optional[str]:
    """Resolve a scientific name to a HerbAPI species id, creating it if needed.

    Lookup order against species_cache (all lower-cased): the full name, the
    first two words (dropping any var./subsp. epithet), then the synonym of
    either. If nothing matches, the species is created under the family that
    _GENUS_TO_FAMILY assigns to its genus.

    Args:
        latin_name: Scientific name such as "Beta vulgaris var. conditiva".

    Returns:
        The species id, or None when the name is empty, the genus is unknown,
        the family cannot be resolved, or creation fails. Unknown genera are
        recorded in stats["species_not_matched"]; failed creations in
        stats["errors"].
    """
    # Guard against None, "" and whitespace-only names (the latter would
    # otherwise fail on split()[0] below).
    if not latin_name or not latin_name.strip():
        return None
    latin_name = latin_name.strip()
    key = latin_name.lower()
    base = " ".join(key.split()[:2])

    candidates = (
        key,
        base,
        _SPECIES_SYNONYMS.get(key),
        _SPECIES_SYNONYMS.get(base),
    )
    for candidate in candidates:
        if candidate and candidate in species_cache:
            return species_cache[candidate]["id"]

    # Nothing known under that name: create the species.
    genus = latin_name.split()[0]
    family_name = _GENUS_TO_FAMILY.get(genus)
    if not family_name:
        print(f" WARNING: Unknown genus '{genus}' for species '{latin_name}'")
        stats["species_not_matched"].append(latin_name)
        return None
    family_id = find_or_create_family(family_name)
    if not family_id:
        return None

    print(f" Creating species: {latin_name}")
    resp, code = api_post("/species", {
        "name_scientific": latin_name,
        "family_id": family_id,
    })
    if "id" in resp:
        species_cache[key] = resp
        stats["species_created"] += 1
        return resp["id"]

    # Creation was refused; the species might already exist, so refresh the
    # cache and look again.
    print(f" Species creation returned {code}: {str(resp.get('error', ''))[:100]}")
    page = 1
    while True:
        batch = api_get("/species", {"per_page": 100, "page": page})["data"]
        for s in batch:
            species_cache[s["name_scientific"].lower()] = s
        if len(batch) < 100:
            break
        page += 1
    if key in species_cache:
        return species_cache[key]["id"]
    stats["errors"].append(f"Species creation failed: {latin_name}")
    return None
def find_or_create_family(family_name: str) -> Optional[str]:
    """Resolve a plant family name to its HerbAPI id, creating it if needed.

    The family is looked up in family_cache by lower-cased name. On a miss a
    create request is sent; if that does not return an id, all families are
    re-read from the API (page by page) in case the family already exists,
    and the lookup is retried.

    Returns:
        The family id, or None if the family can neither be created nor
        found; in that case a message is appended to stats["errors"].
    """
    key = family_name.lower()
    if key in family_cache:
        return family_cache[key]["id"]

    print(f" Creating family: {family_name}")
    resp, _status = api_post("/families", {"name_scientific": family_name})
    if "id" in resp:
        family_cache[key] = resp
        stats["families_created"] += 1
        return resp["id"]

    # Refresh the cache with the same 100-per-page walk used at start-up,
    # rather than a single oversized page that may be capped by the server.
    page = 1
    while True:
        batch = api_get("/families", {"per_page": 100, "page": page})["data"]
        for family in batch:
            family_cache[family["name_scientific"].lower()] = family
        if len(batch) < 100:
            break
        page += 1

    if key in family_cache:
        return family_cache[key]["id"]
    stats["errors"].append(f"Family creation failed: {family_name}")
    return None
# Accented / special letters folded to plain ASCII before slugging. Note that
# German umlauts lose their dots ("ä" -> "a") rather than gaining an "e".
_SLUG_TRANSLATION = str.maketrans({
    "ä": "a", "ö": "o", "ü": "u", "ß": "ss",
    "é": "e", "è": "e", "ê": "e", "ë": "e",
    "à": "a", "â": "a", "á": "a",
    "ô": "o", "ù": "u", "û": "u", "ú": "u",
    "ï": "i", "î": "i", "í": "i",
    "ç": "c", "ñ": "n", "ó": "o",
    "œ": "oe", "æ": "ae",
})


def slugify(text: str) -> str:
    """Turn *text* into a lower-case, hyphen-separated, ASCII-only slug.

    Letters listed in the translation table are folded to ASCII; every other
    character outside ``a-z``, ``0-9``, whitespace and "-" is dropped. Runs of
    whitespace become a single hyphen, repeated hyphens are collapsed, and
    leading/trailing hyphens are removed.
    """
    folded = text.lower().translate(_SLUG_TRANSLATION)
    kept = re.sub(r'[^a-z0-9\s-]', '', folded)
    hyphenated = re.sub(r'[\s]+', '-', kept.strip())
    return re.sub(r'-+', '-', hyphenated).strip('-')
def find_existing_cultivar(species_name: str, variety_name: str, species_id: str) -> Optional[str]:
    """Look up a cultivar in cultivar_cache.

    A cultivar matches if its slug equals the slug of
    "<species_name> <variety_name>", or failing that, if it belongs to
    *species_id* and its name equals *variety_name* ignoring case.

    Returns:
        The cultivar id, or None if no cached cultivar matches.
    """
    by_slug = cultivar_cache.get(slugify(f"{species_name} {variety_name}"))
    if by_slug is not None:
        return by_slug["id"]

    wanted = variety_name.lower()
    return next(
        (
            entry["id"]
            for entry in cultivar_cache.values()
            if entry["species_id"] == species_id and entry["name"].lower() == wanted
        ),
        None,
    )
def scrape_category(cat_path: str, default_species: Optional[str]):
    """Fetch one category listing and process every product linked from it.

    Args:
        cat_path: Category path below /de/bio-saatgut/, without ".html".
        default_species: Scientific name handed to process_product() as the
            fallback species for this category; may be None.

    A category whose page is missing (empty fetch result) is skipped and not
    counted in stats["categories_scraped"].
    """
    listing_url = f"{SITE_BASE}/de/bio-saatgut/{cat_path}.html"
    print(f"\n{'='*60}")
    print(f"Category: {cat_path}")

    listing_html = fetch_page(listing_url)
    if not listing_html:
        print(" SKIP: Page not found (404)")
        return
    # Pause after the listing request before any further fetches.
    time.sleep(DELAY)

    found = parse_product_links(listing_html)
    print(f" Found {len(found)} products")
    stats["products_found"] += len(found)
    stats["categories_scraped"] += 1

    for product in found:
        # Each entry is a (url, name) pair.
        process_product(*product, default_species)
def _is_mix_product(prod_name: str, variety_name: str) -> bool:
    """Tell whether a product title denotes a seed mix or set to be skipped."""
    skip_keywords = ["mischung", "saatscheibe", "saatband", "saatplatte",
                     "saat-set", " mix ", "trio ", "quartett", "gutschein",
                     "buch ", "düngung", "erde ", "-garten"]
    name_lower = prod_name.lower()
    # Exception: if the variety name itself is the whole title, keep it.
    if variety_name.lower() == prod_name.lower():
        return False
    if not any(kw in name_lower for kw in skip_keywords):
        return False
    # Only skip if it really seems like a mix; the other keywords alone do
    # not cause a skip.
    return "mischung" in name_lower or "mix" in name_lower or "trio" in name_lower


def _create_cultivar(species_id: str, species_name: str, variety_name: str,
                     description: str) -> Optional[str]:
    """Create a cultivar via the API; return its id, or None after logging."""
    data = {
        "species_id": species_id,
        "name": variety_name,
        "name_de": variety_name,
        "is_organic": True,
    }
    if description:
        data["description"] = description
    resp, code = api_post("/cultivars", data)

    if "id" in resp:
        # Fall back to a locally computed slug if the response carries none,
        # so that the cache entry can always be written.
        slug = resp.get("slug") or slugify(f"{species_name} {variety_name}")
        cultivar_cache[slug] = {
            "id": resp["id"],
            "name": variety_name,
            "species_id": species_id,
        }
        stats["cultivars_created"] += 1
        print(f" CREATED: {slug}")
        return resp["id"]

    # The error payload is not guaranteed to be a string; normalise it once
    # so that the slicing below cannot raise.
    error_text = str(resp.get("error", ""))

    if code == 500 and "Database error" in error_text:
        # Likely slug conflict - try to find the existing cultivar by
        # re-reading the cultivar list (stops at the page where it is found).
        print(f" DB conflict - searching for existing cultivar...")
        cultivar_id = None
        page = 1
        while True:
            batch = api_get("/cultivars", {"per_page": 100, "page": page})["data"]
            for c in batch:
                cultivar_cache[c["slug"]] = {
                    "id": c["id"],
                    "name": c["name"],
                    "species_id": c["species_id"],
                }
                if c["species_id"] == species_id and c["name"].lower() == variety_name.lower():
                    cultivar_id = c["id"]
            if cultivar_id or len(batch) < 100:
                break
            page += 1
        if cultivar_id:
            print(f" Found existing after conflict: {cultivar_id}")
            stats["cultivars_existed"] += 1
            return cultivar_id
        print(f" ERROR: DB error and could not find existing cultivar")
        stats["errors"].append(f"DB error + not found: {species_name} / {variety_name}")
        return None

    print(f" ERROR ({code}): {error_text[:100]}")
    stats["errors"].append(f"Create failed: {variety_name}: {error_text[:80]}")
    return None


def _link_cultivar_to_supplier(cultivar_id: str, prod_url: str,
                               article_number: Optional[str], variety_name: str) -> None:
    """Attach the cultivar to the supplier record and update the link stats."""
    link_data = {
        "supplier_id": supplier_id,
        "product_url": prod_url,
    }
    if article_number:
        link_data["article_number"] = article_number
    resp, code = api_post(f"/cultivars/{cultivar_id}/suppliers", link_data)
    error_text = str(resp.get("error", ""))

    if "id" in resp:
        stats["supplier_links_created"] += 1
        print(f" LINKED (SKU: {article_number})")
    elif code == 500 or "already" in error_text.lower():
        # NOTE(review): every 500 is counted as "link already exists" --
        # confirm that this is how the API reports duplicates.
        stats["supplier_links_existed"] += 1
        print(f" LINK EXISTS")
    else:
        print(f" LINK ERROR ({code}): {error_text[:80]}")
        stats["errors"].append(f"Link failed: {variety_name}: {error_text[:60]}")


def process_product(prod_url: str, prod_name: str, default_species: Optional[str]):
    """Import one shop product as a cultivar and link it to the supplier.

    Steps: derive article number and variety name from the title, skip mixes,
    fetch the detail page for the botanical name and description, resolve the
    species (detail page first, then *default_species*), reuse or create the
    cultivar, and finally create the supplier link.

    Args:
        prod_url: Absolute URL of the product detail page.
        prod_name: Product title as shown on the listing page.
        default_species: Scientific name to use when the detail page gives
            none; may be None, in which case such products are skipped.

    Nothing is returned; outcomes are printed and recorded in ``stats``.
    """
    article_number = extract_article_number(prod_name, prod_url)
    variety_name = extract_variety_name(prod_name)
    if not variety_name:
        print(f" SKIP (no variety): {prod_name}")
        return

    # Skip mixes, sets, bundles.
    if _is_mix_product(prod_name, variety_name):
        print(f" SKIP (mix/set): {prod_name}")
        return

    print(f"\n Product: {prod_name}")
    print(f" Variety: {variety_name}, SKU: {article_number}")

    # Fetch detail page. This is best effort: on any failure the product is
    # still processed using the category's default species.
    latin_name = None
    description = ""
    time.sleep(DELAY)
    try:
        detail_html = fetch_page(prod_url)
        stats["detail_pages_fetched"] += 1
        if detail_html:
            latin_name = extract_latin_from_detail(detail_html)
            description = extract_description_from_detail(detail_html)
    except Exception as e:
        print(f" WARNING: Detail page error: {e}")

    species_name = latin_name or default_species
    if not species_name:
        print(f" SKIP: No species for '{prod_name}'")
        stats["species_not_matched"].append(prod_name)
        return
    print(f" Species: {species_name}")

    species_id = find_or_create_species(species_name)
    if not species_id:
        print(f" SKIP: Could not resolve species '{species_name}'")
        return

    # Reuse the cultivar if it is already known, otherwise create it.
    cultivar_id = find_existing_cultivar(species_name, variety_name, species_id)
    if cultivar_id:
        print(f" EXISTS: cultivar already in DB")
        stats["cultivars_existed"] += 1
    else:
        cultivar_id = _create_cultivar(species_id, species_name, variety_name, description)
        if not cultivar_id:
            return

    # Link to supplier.
    if supplier_id:
        _link_cultivar_to_supplier(cultivar_id, prod_url, article_number, variety_name)
def _print_summary():
    """Print the final counters plus up to 30 unmatched names and errors."""
    print("\n" + "=" * 60)
    print("SCRAPING COMPLETE - SUMMARY")
    print("=" * 60)
    print(f"Categories scraped: {stats['categories_scraped']}")
    print(f"Products found: {stats['products_found']}")
    print(f"Detail pages fetched: {stats['detail_pages_fetched']}")
    print(f"Cultivars created: {stats['cultivars_created']}")
    print(f"Cultivars existed: {stats['cultivars_existed']}")
    print(f"Supplier links created: {stats['supplier_links_created']}")
    print(f"Supplier links existed: {stats['supplier_links_existed']}")
    print(f"Species created: {stats['species_created']}")
    print(f"Families created: {stats['families_created']}")
    print(f"Errors: {len(stats['errors'])}")

    unmatched = stats["species_not_matched"]
    if unmatched:
        print(f"\nUnmatched species ({len(stats['species_not_matched'])}):")
        for entry in unmatched[:30]:
            print(f" - {entry}")

    failures = stats["errors"]
    if failures:
        print(f"\nErrors ({len(stats['errors'])}):")
        for entry in failures[:30]:
            print(f" - {entry}")


def main():
    """Run the full scrape and return the process exit code.

    Loads the existing API data, scrapes every category in ALL_CATEGORIES
    (a failure in one category is recorded and does not stop the run), and
    prints a summary.

    Returns:
        0 if no errors were recorded in stats["errors"], otherwise 1.
    """
    banner = "=" * 60
    print(banner)
    print("Bingenheimer Saatgut Scraper for HerbAPI")
    print(banner)

    load_api_data()

    print(f"\nScraping {len(ALL_CATEGORIES)} categories...")
    for cat_path, default_species in ALL_CATEGORIES:
        try:
            scrape_category(cat_path, default_species)
        except Exception as e:
            # Top-level boundary: log, remember, and move on to the next one.
            print(f" ERROR in category {cat_path}: {e}")
            stats["errors"].append(f"Category error: {cat_path}: {e}")

    _print_summary()
    return 1 if stats["errors"] else 0


if __name__ == "__main__":
    sys.exit(main())