844 lines
32 KiB
Python
844 lines
32 KiB
Python
#!/usr/bin/env python3
"""
Scraper for Bingenheimer Saatgut (https://www.bingenheimersaatgut.de/)
Extracts cultivar data and imports into HerbAPI.

Categories scraped: Gemüse (vegetables), Kräuter (herbs), Gründüngung (green manure).
"""
|
|
|
|
import json
import os
import re
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from html.parser import HTMLParser
from typing import Optional
|
|
|
|
# ── Configuration ─────────────────────────────────────────────────────────
|
|
# HerbAPI endpoint and bearer token.  Both may be overridden through the
# environment (HERBAPI_BASE / HERBAPI_TOKEN); the literals are fallbacks so
# that existing invocations keep working unchanged.
# NOTE(review): a token is committed in this file -- rotate it and drop the
# fallback once deployments provide HERBAPI_TOKEN.
API_BASE = os.environ.get("HERBAPI_BASE", "http://herbapi01.corp.sub-net.at:8080/api/v1")
API_TOKEN = os.environ.get("HERBAPI_TOKEN", "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk")

# Root of the shop that is scraped; relative product links are joined to it.
SITE_BASE = "https://www.bingenheimersaatgut.de"

# Pause in seconds passed to time.sleep() between shop requests.
DELAY = 0.5

# Sent as the User-Agent header on every shop request.
USER_AGENT = "HerbAPI-Scraper/1.0 (+https://sub-net.at)"
|
|
|
|
# ── Category URLs to scrape ───────────────────────────────────────────────
|
|
# (url_path, default_species_scientific_name)
|
|
|
|
# Each entry pairs a category path (relative to /de/bio-saatgut/, without the
# ".html" suffix) with the scientific name used when a product's detail page
# yields no botanical name.
VEGETABLE_CATEGORIES = [
    # Fruiting vegetables
    ("gemuese/tomaten", "Solanum lycopersicum"),
    ("gemuese/gurken/gewuerzgurke", "Cucumis sativus"),
    ("gemuese/gurken/salatgurken", "Cucumis sativus"),
    ("gemuese/aubergine", "Solanum melongena"),
    # Beans
    ("gemuese/bohnen/buschbohne", "Phaseolus vulgaris"),
    ("gemuese/bohnen/stangenbohne", "Phaseolus vulgaris"),
    ("gemuese/bohnen/dicke-bohne", "Vicia faba"),
    ("gemuese/bohnen/feuerbohne", "Phaseolus coccineus"),
    ("gemuese/bohnen/edamame-sojabohne", "Glycine max"),
    ("gemuese/bohnen/spaghettibohne", "Vigna unguiculata"),
    # Peas
    ("gemuese/erbsen/markerbse", "Pisum sativum"),
    ("gemuese/erbsen/schalerbse", "Pisum sativum"),
    ("gemuese/erbsen/zuckererbse", "Pisum sativum"),
    ("gemuese/feldsalat", "Valerianella locusta"),
    ("gemuese/knollenfenchel", "Foeniculum vulgare"),
    # Cabbages
    ("gemuese/kohl/blumenkohl", "Brassica oleracea"),
    ("gemuese/kohl/brokkoli", "Brassica oleracea"),
    ("gemuese/kohl/chinakohlpak-choi", "Brassica rapa"),
    ("gemuese/kohl/gruenkohl", "Brassica oleracea"),
    ("gemuese/kohl/kohlrabi", "Brassica oleracea"),
    ("gemuese/kohl/rotkohl", "Brassica oleracea"),
    ("gemuese/kohl/weisskohl", "Brassica oleracea"),
    ("gemuese/kohl/wirsing", "Brassica oleracea"),
    ("gemuese/kohl/rosenkohl", "Brassica oleracea"),
    ("gemuese/kresse", "Lepidium sativum"),
    ("gemuese/kuerbis", "Cucurbita maxima"),
    ("gemuese/zuckermais", "Zea mays"),
    ("gemuese/mangold", "Beta vulgaris"),
    ("gemuese/melone", "Cucumis melo"),
    ("gemuese/moehren", "Daucus carota"),
    # Peppers
    ("gemuese/paprika/gemuesepaprika", "Capsicum annuum"),
    ("gemuese/paprika/chili", "Capsicum annuum"),
    ("gemuese/pastinaken", "Pastinaca sativa"),
    ("gemuese/petersilienwurzel", "Petroselinum crispum"),
    ("gemuese/physalis", "Physalis peruviana"),
    ("gemuese/porreelauch", "Allium porrum"),
    ("gemuese/radies", "Raphanus sativus"),
    ("gemuese/rettich", "Raphanus sativus"),
    ("gemuese/rote-bete", "Beta vulgaris"),
    ("gemuese/rueben/mai-herbstruebennavets", "Brassica rapa"),
    ("gemuese/rueben/kohlruebe", "Brassica napus"),
    ("gemuese/rucola", "Eruca vesicaria"),
    # Lettuces and salad leaves
    ("gemuese/salat/bataviasalat", "Lactuca sativa"),
    ("gemuese/salat/eichblattsalat", "Lactuca sativa"),
    ("gemuese/salat/eissalat", "Lactuca sativa"),
    ("gemuese/salat/endivien", "Cichorium endivia"),
    ("gemuese/salat/hirschhornwegerich", "Plantago coronopus"),
    ("gemuese/salat/kopfsalat", "Lactuca sativa"),
    ("gemuese/salat/lollosalat", "Lactuca sativa"),
    ("gemuese/salat/romanasalat", "Lactuca sativa"),
    ("gemuese/salat/baby-leaf", "Lactuca sativa"),
    # Celery and spinach
    ("gemuese/sellerie/knollensellerie", "Apium graveolens"),
    ("gemuese/sellerie/stangen--bleichsellerie", "Apium graveolens"),
    ("gemuese/spinatspinat-aehnliche/spinat", "Spinacia oleracea"),
    ("gemuese/spinatspinat-aehnliche/neuseelaender-spinat", "Tetragonia tetragonioides"),
    ("gemuese/blattstielgemuese", "Beta vulgaris"),
    # Onions
    ("gemuese/zwiebeln", "Allium cepa"),
    ("gemuese/lauchzwiebeln", "Allium fistulosum"),
    # Remaining categories
    ("gemuese/artischocke", "Cynara cardunculus"),
    ("gemuese/asia-salate", "Brassica juncea"),
    ("gemuese/chicoree", "Cichorium intybus"),
    ("gemuese/schwarz-haferwurzel", "Scorzonera hispanica"),
    ("gemuese/winterpostelein", "Claytonia perfoliata"),
    ("gemuese/zucchini", "Cucurbita pepo"),
    ("gemuese/catalogna", "Cichorium intybus"),
    ("gemuese/zichoriensalate", "Cichorium intybus"),
]

HERB_CATEGORIES = [
    ("kraeuter/basilikum", "Ocimum basilicum"),
    ("kraeuter/bohnenkraut", "Satureja hortensis"),
    ("kraeuter/borretsch", "Borago officinalis"),
    ("kraeuter/dill", "Anethum graveolens"),
    ("kraeuter/kuemmel", "Carum carvi"),
    ("kraeuter/kerbel", "Anthriscus cerefolium"),
    ("kraeuter/koriander", "Coriandrum sativum"),
    ("kraeuter/gewuerzfenchel", "Foeniculum vulgare"),
    ("kraeuter/kultursauerampfer", "Rumex acetosa"),
    ("kraeuter/lavendel", "Lavandula angustifolia"),
    ("kraeuter/liebstock", "Levisticum officinale"),
    ("kraeuter/majoran", "Origanum majorana"),
    ("kraeuter/oregano", "Origanum vulgare"),
    ("kraeuter/pimpinelle", "Sanguisorba minor"),
    ("kraeuter/estragon", "Artemisia dracunculus"),
    ("kraeuter/salbei", "Salvia officinalis"),
    ("kraeuter/schnittlauch", "Allium schoenoprasum"),
    ("kraeuter/schnittknoblauch", "Allium tuberosum"),
    ("kraeuter/schwarzkuemmel", "Nigella sativa"),
    ("kraeuter/speisechrysantheme", "Glebionis coronaria"),
    ("kraeuter/thymian", "Thymus vulgaris"),
    ("kraeuter/ysop", "Hyssopus officinalis"),
    ("kraeuter/winterkresse", "Barbarea vulgaris"),
    ("kraeuter/brunnenkresse", "Nasturtium officinale"),
    ("kraeuter/melisse", "Melissa officinalis"),
    ("kraeuter/petersilie", "Petroselinum crispum"),
    ("kraeuter/schnittsellerie", "Apium graveolens"),
    ("kraeuter/beifuss", "Artemisia vulgaris"),
]

# Green manure has no default species: products without a detectable
# botanical name are skipped by process_product().
GREEN_MANURE_CATEGORIES = [
    ("gruenduengung", None),
]

# Scrape order: vegetables, then herbs, then green manure.
ALL_CATEGORIES = [
    *VEGETABLE_CATEGORIES,
    *HERB_CATEGORIES,
    *GREEN_MANURE_CATEGORIES,
]
|
|
|
|
# ── Stats ─────────────────────────────────────────────────────────────────
|
|
# Run statistics, mutated in place by the scraping functions and printed by
# main().  Counters start at zero; the two trailing entries collect messages.
stats = dict.fromkeys(
    (
        "categories_scraped",
        "products_found",
        "detail_pages_fetched",
        "cultivars_created",
        "cultivars_existed",
        "supplier_links_created",
        "supplier_links_existed",
        "species_created",
        "families_created",
    ),
    0,
)
stats["species_not_matched"] = []
stats["errors"] = []
|
|
|
|
|
|
# ── HTTP helpers ──────────────────────────────────────────────────────────
|
|
def fetch_page(url: str) -> str:
    """Fetch a web page and return its body as text.

    The request carries the scraper's ``USER_AGENT`` and times out after 30
    seconds.  The body is decoded with the charset announced in the
    ``Content-Type`` header, falling back to UTF-8 when none (or an unknown
    one) is declared; undecodable bytes are replaced instead of raising.

    Args:
        url: Absolute URL to fetch.

    Returns:
        The decoded body, or ``""`` when the server answers 404.

    Raises:
        urllib.error.HTTPError: For any HTTP error status other than 404.
    """
    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            raw = resp.read()
            charset = resp.headers.get_content_charset() or "utf-8"
    except urllib.error.HTTPError as e:
        # A missing page is an expected outcome; callers test for "".
        if e.code == 404:
            return ""
        raise

    try:
        return raw.decode(charset, errors="replace")
    except LookupError:
        # The server declared a codec Python does not know.
        return raw.decode("utf-8", errors="replace")
|
|
|
|
|
|
def api_get(path: str, params: Optional[dict] = None) -> dict:
    """GET a resource from HerbAPI and return the decoded JSON body.

    Args:
        path: Path below ``API_BASE``, starting with ``/``.
        params: Optional query parameters; URL-encoded and appended when
            non-empty.

    Returns:
        Whatever the JSON body decodes to.  The callers in this file read a
        ``"data"`` list from paginated endpoints.

    Raises:
        urllib.error.HTTPError: On any HTTP error status (not caught here).
    """
    url = f"{API_BASE}{path}"
    if params:
        url += "?" + urllib.parse.urlencode(params)
    req = urllib.request.Request(url, headers={
        "Authorization": f"Bearer {API_TOKEN}",
        "Accept": "application/json",
    })
    with urllib.request.urlopen(req, timeout=30) as resp:
        return json.loads(resp.read())
|
|
|
|
|
|
def api_post(path: str, data: dict) -> tuple:
    """POST a JSON document to HerbAPI.

    Args:
        path: Path below ``API_BASE``, starting with ``/``.
        data: Payload, serialised as UTF-8 JSON.

    Returns:
        ``(response_dict, status_code)``.  On an HTTP error status the dict is
        ``{"error": <body text>, "_status": <code>}``.  A successful response
        with an empty body yields ``{}`` instead of failing in the JSON
        decoder.

    Raises:
        json.JSONDecodeError: If a successful response has a non-empty body
            that is not valid JSON.
    """
    url = f"{API_BASE}{path}"
    body = json.dumps(data).encode("utf-8")
    req = urllib.request.Request(url, data=body, method="POST", headers={
        "Authorization": f"Bearer {API_TOKEN}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    })
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            raw = resp.read()
            status = resp.status
    except urllib.error.HTTPError as e:
        err_body = e.read().decode("utf-8", errors="replace")
        return {"error": err_body, "_status": e.code}, e.code

    if not raw.strip():
        # e.g. 204 No Content: nothing to decode.
        return {}, status
    return json.loads(raw), status
|
|
|
|
|
|
# ── HTML parsing helpers ──────────────────────────────────────────────────
|
|
# Magento listing markup: an anchor whose href lies below /de/bio-saatgut/ and
# whose class list contains "product-item-link".
_PRODUCT_ITEM_LINK_RE = re.compile(
    r'<a[^>]+href="([^"]*?/de/bio-saatgut/[^"]+?)"[^>]*class="[^"]*product-item-link[^"]*"[^>]*>\s*(.*?)\s*</a>',
    re.DOTALL | re.IGNORECASE
)

# Fallback: any href at least two segments below one of the scraped top-level
# categories.  The last segment may not contain "/" or ".", so ".html" pages
# can never match.
_PRODUCT_DETAIL_LINK_RE = re.compile(
    r'href="([^"]*?/de/bio-saatgut/(?:gemuese|kraeuter|gruenduengung)/[^"]+?/[^"/.]+)"[^>]*>\s*([^<]{3,})',
    re.IGNORECASE
)


def parse_product_links(html: str) -> list:
    """Extract product links from a category listing page.

    Product-item anchors are tried first; only when none yields a named link
    is the broader detail-link pattern used.  HTML entities in link text and
    hrefs are decoded, relative hrefs are prefixed with ``SITE_BASE``, and
    links are de-duplicated by URL (first occurrence wins).

    Args:
        html: Raw HTML of the listing page.

    Returns:
        List of ``(absolute_url, product_name)`` tuples in page order.
    """
    # Imported locally because the parameter name shadows the stdlib module.
    from html import unescape

    def _normalise(href, label):
        """Return a cleaned (url, name) pair, or None if the name is empty."""
        label = unescape(label).strip()
        if not label:
            return None
        href = unescape(href.strip())
        if not href.startswith("http"):
            href = SITE_BASE + href
        return href, label

    links = []
    for match in _PRODUCT_ITEM_LINK_RE.finditer(html):
        # The anchor body may contain nested markup; keep only its text.
        text = re.sub(r'<[^>]+>', '', match.group(2))
        link = _normalise(match.group(1), text)
        if link:
            links.append(link)

    if not links:
        for match in _PRODUCT_DETAIL_LINK_RE.finditer(html):
            link = _normalise(match.group(1), match.group(2))
            if link:
                links.append(link)

    # Dicts keep insertion order, so setdefault keeps the first name per URL.
    unique = {}
    for url, name in links:
        unique.setdefault(url, name)
    return list(unique.items())
|
|
|
|
|
|
# Candidate locations of a botanical name, in priority order: emphasised
# text, an element with a botanical/latin/species class, or a German label.
_LATIN_NAME_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'<(?:em|i)[^>]*>\s*([A-Z][a-z]+\s+[a-z]{2,}(?:\s+(?:var\.|subsp\.)\s+[a-z]+)?)\s*</(?:em|i)>',
        r'class="[^"]*(?:botanical|latin|species)[^"]*"[^>]*>\s*([A-Z][a-z]+\s+[a-z]{2,})',
        r'(?:Botanischer?\s+Name|Lateinischer?\s+Name|Art)\s*:?\s*(?:<[^>]+>)*\s*([A-Z][a-z]+\s+[a-z]{2,})',
    )
)


def extract_latin_from_detail(html: str) -> Optional[str]:
    """Extract the Latin/botanical name from a product detail page.

    The patterns are matched case-insensitively, so they also hit ordinary
    lowercase phrases.  Every match of a pattern is therefore validated
    (capitalised first word, lowercase second word) and the first valid one is
    returned.  Checking all matches, not only the first, keeps an earlier
    phrase such as ``<em>sehr lecker</em>`` from hiding a real name further
    down the page.

    Args:
        html: Raw HTML of the detail page.

    Returns:
        The botanical name as written on the page, or ``None``.
    """
    for pattern in _LATIN_NAME_PATTERNS:
        for match in pattern.finditer(html):
            name = match.group(1).strip()
            parts = name.split()
            if len(parts) >= 2 and parts[0][0].isupper() and parts[1][0].islower():
                return name
    return None
|
|
|
|
|
|
# Containers that may hold the product description, in priority order.  Each
# capture stops at the first closing </div>.
_DESCRIPTION_PATTERNS = tuple(
    re.compile(pattern, re.DOTALL | re.IGNORECASE)
    for pattern in (
        r'<div[^>]*class="[^"]*product[- ]description[^"]*"[^>]*>(.*?)</div>',
        r'<div[^>]*class="[^"]*beschreibung[^"]*"[^>]*>(.*?)</div>',
        r'data-content-type="description"[^>]*>(.*?)</div>',
    )
)


def extract_description_from_detail(html: str) -> str:
    """Extract the plain-text product description from a detail page.

    For the first container whose text is longer than 20 characters, tags are
    removed, HTML entities (``&amp;``, ``&nbsp;``, ``&uuml;`` ...) are decoded
    and runs of whitespace are collapsed to single spaces.

    Args:
        html: Raw HTML of the detail page.

    Returns:
        The description truncated to 2000 characters, or ``""`` when no
        sufficiently long description is found.
    """
    # Imported locally because the parameter name shadows the stdlib module.
    from html import unescape

    for pattern in _DESCRIPTION_PATTERNS:
        match = pattern.search(html)
        if not match:
            continue
        text = re.sub(r'<[^>]+>', ' ', match.group(1))
        # Decode after tag stripping so an escaped "&lt;b&gt;" stays text.
        text = unescape(text)
        text = re.sub(r'\s+', ' ', text).strip()
        if len(text) > 20:
            return text[:2000]
    return ""
|
|
|
|
|
|
def extract_article_number(product_name: str, url: str) -> Optional[str]:
    """Extract the supplier's article number from a product name or URL.

    A parenthesised code in the name (``"... (G268)"`` or ``"(G 268A)"``) takes
    precedence.  Otherwise the last path segment of the URL is inspected for a
    trailing ``-<letter><digits>[letter]`` code.  Query string, fragment and a
    ``.htm``/``.html`` extension are ignored, so
    ``.../ruthje-g268.html?x=1`` still yields ``G268``.

    Args:
        product_name: Product name as shown in the listing.
        url: Product detail URL.

    Returns:
        The article number in upper case without whitespace, or ``None``.
    """
    match = re.search(r'\(([A-Z]\s*\d+[A-Z]?)\)', product_name)
    if match:
        return re.sub(r'\s+', '', match.group(1))

    try:
        path = urllib.parse.urlsplit(url).path
    except ValueError:
        # Malformed URL: fall back to the raw string.
        path = url
    slug = path.rstrip("/").rsplit("/", 1)[-1]
    slug = re.sub(r'\.html?$', '', slug, flags=re.IGNORECASE)

    match = re.search(r'-([a-z]\d+[a-z]?)$', slug, re.IGNORECASE)
    if match:
        return match.group(1).upper()
    return None
|
|
|
|
|
|
# Trailing article number such as "(G802)".
_ARTICLE_SUFFIX_RE = re.compile(r'\s*\([A-Z]\s*\d+[A-Z]?\)\s*$')

# German vegetable/herb type words that precede the variety name.  Compiled
# once at import time; each pattern is anchored at the start of the name and
# they are applied in this order.
_VARIETY_PREFIX_PATTERNS = tuple(
    re.compile(r'^' + prefix, re.IGNORECASE)
    for prefix in (
        # Tomatoes
        r'(?:Normal(?:früchtige)?|Fleisch|Cherry|Balkon|Wild|Freiland|Roma|Ochsenherz|'
        r'Cocktail|Dattel|Mini|Snack|Stab|Busch|Salat|Zwerg)[\s-]*[Tt]omate\s+',
        # Beans
        r'(?:Busch|Stangen|Dicke|Feuer|Spaghetti)[\s-]*[Bb]ohne\s+',
        r'Edamame(?:-Sojabohne)?\s+',
        # Peas
        r'(?:Mark|Schal|Zucker|Pal)[\s-]*[Ee]rbse\s+',
        # Cucurbits
        r'(?:Salat|Einlege|Gewürz|Freiland|Schlangen)[\s-]*[Gg]urke\s+',
        r'Zucchini\s+',
        r'Kürbis\s+',
        r'(?:Wasser)?[Mm]elone\s+',
        # Brassicas
        r'(?:Blumen|Grün|Rot|Weiß|Rosen)[\s-]*[Kk]ohl\s+',
        r'Kohlrabi\s+',
        r'Wirsing\s+',
        r'Brokkoli\s+',
        r'Chinakohl\s+',
        r'Pak\s+Choi\s+',
        r'Kohlrübe\s+',
        r'Mai-/Herbstrüben?(?:/Navets)?\s+',
        # Root vegetables
        r'Möhre\s+',
        r'Karotten?(?:\s*-?\s*Mix)?\s+',
        r'Pastinake\s+',
        r'Radies(?:chen)?\s+',
        r'Rettich\s+',
        r'Schwarzwurzel\s+',
        r'Haferwurzel\s+',
        r'Petersilienwurzel\s+',
        # Beets
        r'(?:Rote|Gelbe|Weiße)\s+Bete?\s+',
        r'Mangold\s+',
        # Lettuce & leafy
        r'(?:Kopf|Eichblatt|Batavia|Eis|Lollo|Romana|Baby-Leaf)[\s-]*[Ss]alat\s+',
        r'Feldsalat\s+',
        r'Endivie\s+',
        r'Asia[\s-]*Salat\s+',
        r'Spinat\s+',
        # Alliums
        r'Zwiebel\s+',
        r'Lauchzwiebel\s+',
        r'Porree(?:/Lauch)?\s+',
        r'Schnittlauch\s+',
        r'Schnittknoblauch\s+',
        # Peppers
        r'(?:Gemüse|Block|Spitz|Papier)[\s-]*[Pp]aprika\s+',
        r'Chili\s+',
        # Celery
        r'(?:Knollen|Stangen|Bleich|Schnitt)[\s-]*[Ss]ellerie\s+',
        # Herbs
        r'Basilikum\s+',
        r'Koriander\s+',
        r'Dill\s+',
        r'Petersilie\s+',
        r'(?:Knollen|Gewürz)[\s-]*[Ff]enchel\s+',
        r'Salbei\s+',
        r'Thymian\s+',
        r'Oregano\s+',
        r'Lavendel\s+',
        r'Melisse\s+',
        r'Majoran\s+',
        r'Estragon\s+',
        r'Kresse\s+',
        r'Bohnenkraut\s+',
        r'Borretsch\s+',
        r'Kümmel\s+',
        r'Kerbel\s+',
        r'Liebstock\s+',
        r'Ysop\s+',
        r'Pimpinelle\s+',
        r'Beifuß\s+',
        r'Schwarzkümmel\s+',
        # Other
        r'Zuckermais\s+',
        r'Artischocke\s+',
        r'Physalis\s+',
        r'Aubergine\s+',
        r'Catalogna\s+',
    )
)


def extract_variety_name(product_name: str) -> str:
    """Extract the variety/cultivar name from a full product name.

    Steps: trim whitespace, drop a trailing article number like ``(G802)``,
    strip each known type prefix (case-insensitive, in table order), then trim
    whitespace and surrounding straight quotes.

    Note that the compound patterns require their qualifier: ``Salat-Tomate``
    is stripped, a bare ``Tomate`` is not.

    Args:
        product_name: Name as shown in the shop listing.

    Returns:
        The variety name; may equal the trimmed input when nothing matched.
    """
    name = _ARTICLE_SUFFIX_RE.sub('', product_name.strip())
    for pattern in _VARIETY_PREFIX_PATTERNS:
        name = pattern.sub('', name)
    return name.strip().strip("'\"")
|
|
|
|
|
|
# ── API data caches ───────────────────────────────────────────────────────
|
|
# In-memory copies of HerbAPI records, filled by load_api_data() and extended
# whenever this script creates a record.
#   species_cache:  lower-cased scientific name -> species record
#   family_cache:   lower-cased scientific name -> family record
#   cultivar_cache: slug -> {"id", "name", "species_id"}
species_cache, family_cache, cultivar_cache = {}, {}, {}

# HerbAPI id of the Bingenheimer supplier; set by load_api_data().
supplier_id = None
|
|
|
|
|
|
def _iter_api_pages(path: str, per_page: int = 100):
    """Yield every record of a paginated HerbAPI collection.

    Pages are requested until one comes back with fewer than ``per_page``
    records.
    """
    page = 1
    while True:
        records = api_get(path, {"per_page": per_page, "page": page})["data"]
        yield from records
        if len(records) < per_page:
            break
        page += 1


def load_api_data():
    """Load existing HerbAPI data into the module caches.

    Fills ``family_cache``, ``species_cache`` and ``cultivar_cache``, then
    resolves ``supplier_id``: the first supplier whose name contains
    "bingenheimer" is reused, otherwise one is created.  The supplier listing
    is accepted either as a bare list or wrapped in ``{"data": [...]}`` like
    the other collections.

    Exits the process with status 1 when the supplier cannot be created.
    """
    global supplier_id

    print("Loading existing HerbAPI data...")

    for family in _iter_api_pages("/families"):
        family_cache[family["name_scientific"].lower()] = family
    print(f" Loaded {len(family_cache)} families")

    for species in _iter_api_pages("/species"):
        species_cache[species["name_scientific"].lower()] = species
    print(f" Loaded {len(species_cache)} species")

    # Only the fields needed for duplicate detection are kept per cultivar.
    for cultivar in _iter_api_pages("/cultivars"):
        cultivar_cache[cultivar["slug"]] = {
            "id": cultivar["id"],
            "name": cultivar["name"],
            "species_id": cultivar["species_id"],
        }
    print(f" Loaded {len(cultivar_cache)} cultivars")

    # Create or find Bingenheimer supplier
    suppliers = api_get("/suppliers")
    if isinstance(suppliers, dict):
        # Enveloped response; a dict without "data" fails loudly here instead
        # of being iterated as keys.
        suppliers = suppliers["data"]
    for supplier in suppliers:
        if "bingenheimer" in supplier["name"].lower():
            supplier_id = supplier["id"]
            print(f" Found existing supplier: {supplier['name']} ({supplier['id']})")
            break

    if not supplier_id:
        print(" Creating Bingenheimer Saatgut supplier...")
        created, _status = api_post("/suppliers", {
            "name": "Bingenheimer Saatgut",
            "url": "https://www.bingenheimersaatgut.de",
            "country": "DE",
            "is_organic": True,
            "is_demeter": True,
            "notes": "German biodynamic seed company, Demeter certified, open-pollinated varieties"
        })
        if "id" in created:
            supplier_id = created["id"]
            print(f" Created supplier: {created['id']}")
        else:
            print(f" ERROR creating supplier: {created}")
            sys.exit(1)
|
|
|
|
|
|
# Genuine nomenclatural synonyms (lower case).  Infraspecific names
# ("... var. x", "... subsp. y") need no entry: the binomial fallback in
# find_or_create_species() already reduces them to their first two words.
# Distinct species are deliberately NOT mapped onto each other: leek
# (Allium porrum / A. ampeloprasum) is not onion (A. cepa), and marjoram
# (Origanum majorana) is not oregano (O. vulgare).
_SPECIES_SYNONYMS = {
    "lycopersicon esculentum": "solanum lycopersicum",
    "allium porrum": "allium ampeloprasum",
    "allium ampeloprasum": "allium porrum",
}

# Genus -> plant family, used when a species has to be created.
_GENUS_FAMILIES = {
    "Solanum": "Solanaceae", "Capsicum": "Solanaceae", "Physalis": "Solanaceae",
    "Nicandra": "Solanaceae",
    "Cucumis": "Cucurbitaceae", "Cucurbita": "Cucurbitaceae", "Citrullus": "Cucurbitaceae",
    "Phaseolus": "Fabaceae", "Pisum": "Fabaceae", "Vicia": "Fabaceae",
    "Glycine": "Fabaceae", "Lens": "Fabaceae", "Lupinus": "Fabaceae",
    "Trifolium": "Fabaceae", "Medicago": "Fabaceae", "Vigna": "Fabaceae",
    "Brassica": "Brassicaceae", "Raphanus": "Brassicaceae", "Eruca": "Brassicaceae",
    "Lepidium": "Brassicaceae", "Nasturtium": "Brassicaceae", "Barbarea": "Brassicaceae",
    "Sinapis": "Brassicaceae", "Crambe": "Brassicaceae", "Diplotaxis": "Brassicaceae",
    "Allium": "Amaryllidaceae",
    "Daucus": "Apiaceae", "Petroselinum": "Apiaceae", "Apium": "Apiaceae",
    "Foeniculum": "Apiaceae", "Pastinaca": "Apiaceae", "Coriandrum": "Apiaceae",
    "Anethum": "Apiaceae", "Levisticum": "Apiaceae", "Anthriscus": "Apiaceae",
    "Carum": "Apiaceae", "Myrrhis": "Apiaceae", "Pimpinella": "Apiaceae",
    "Sanguisorba": "Rosaceae",
    "Lactuca": "Asteraceae", "Cichorium": "Asteraceae", "Cynara": "Asteraceae",
    "Helianthus": "Asteraceae", "Calendula": "Asteraceae", "Tagetes": "Asteraceae",
    "Scorzonera": "Asteraceae", "Tragopogon": "Asteraceae", "Glebionis": "Asteraceae",
    "Artemisia": "Asteraceae",
    "Beta": "Chenopodiaceae", "Spinacia": "Chenopodiaceae",
    "Atriplex": "Chenopodiaceae", "Chenopodium": "Chenopodiaceae",
    "Ocimum": "Lamiaceae", "Origanum": "Lamiaceae", "Thymus": "Lamiaceae",
    "Salvia": "Lamiaceae", "Melissa": "Lamiaceae", "Lavandula": "Lamiaceae",
    "Satureja": "Lamiaceae", "Hyssopus": "Lamiaceae", "Rosmarinus": "Lamiaceae",
    "Mentha": "Lamiaceae",
    "Zea": "Poaceae",
    "Borago": "Boraginaceae", "Phacelia": "Boraginaceae",
    "Valerianella": "Caprifoliaceae",
    "Tropaeolum": "Tropaeolaceae",
    "Rumex": "Polygonaceae",
    "Nigella": "Ranunculaceae",
    "Claytonia": "Montiaceae",
    "Tetragonia": "Aizoaceae",
    "Basella": "Basellaceae",
    "Plantago": "Plantaginaceae",
}


def find_or_create_species(latin_name: str) -> Optional[str]:
    """Find a species by Latin name, creating it when unknown.

    Lookup order against ``species_cache`` (keys are lower-cased names):
    exact name, then the binomial (first two words), then a known synonym.
    If nothing matches, the species is created under the family derived from
    its genus.  When creation fails the species list is reloaded once, in case
    the record already exists.

    Args:
        latin_name: Scientific name; surrounding whitespace is ignored.

    Returns:
        The species id, or ``None`` when the name is empty, the genus is
        unknown, the family cannot be resolved, or creation failed.
    """
    name = (latin_name or "").strip()
    if not name:
        # Covers None, "" and whitespace-only input.
        return None

    key = name.lower()

    # Direct match
    if key in species_cache:
        return species_cache[key]["id"]

    # Try without subspecies/variety
    base = " ".join(key.split()[:2])
    if base in species_cache:
        return species_cache[base]["id"]

    # Handle synonyms
    syn_key = _SPECIES_SYNONYMS.get(key)
    if syn_key in species_cache:
        return species_cache[syn_key]["id"]

    # Try to create the species
    genus = name.split()[0].capitalize()
    family_name = _GENUS_FAMILIES.get(genus)
    if not family_name:
        print(f" WARNING: Unknown genus '{genus}' for species '{name}'")
        stats["species_not_matched"].append(name)
        return None

    family_id = find_or_create_family(family_name)
    if not family_id:
        return None

    print(f" Creating species: {name}")
    resp, code = api_post("/species", {
        "name_scientific": name,
        "family_id": family_id,
    })
    if "id" in resp:
        # Store under the same normalised key the lookups above use.
        species_cache[key] = resp
        stats["species_created"] += 1
        return resp["id"]

    # Might already exist, reload
    print(f" Species creation returned {code}: {str(resp.get('error', ''))[:100]}")
    page = 1
    while True:
        batch = api_get("/species", {"per_page": 100, "page": page})["data"]
        for record in batch:
            species_cache[record["name_scientific"].lower()] = record
        if len(batch) < 100:
            break
        page += 1

    if key in species_cache:
        return species_cache[key]["id"]
    stats["errors"].append(f"Species creation failed: {name}")
    return None
|
|
|
|
|
|
def find_or_create_family(family_name: str) -> Optional[str]:
    """Find a plant family by scientific name, creating it when unknown.

    ``family_cache`` (keyed by lower-cased name) is consulted first.  If the
    create request returns no id, the family list is reloaded page by page
    (100 per page, like the initial load) in case the family already exists.

    Args:
        family_name: Scientific family name, e.g. ``"Solanaceae"``.

    Returns:
        The family id, or ``None`` when it could neither be created nor found;
        in that case an entry is appended to ``stats["errors"]``.
    """
    key = family_name.lower()
    if key in family_cache:
        return family_cache[key]["id"]

    print(f" Creating family: {family_name}")
    resp, _status = api_post("/families", {"name_scientific": family_name})
    if "id" in resp:
        family_cache[key] = resp
        stats["families_created"] += 1
        return resp["id"]

    # Reload
    page = 1
    while True:
        batch = api_get("/families", {"per_page": 100, "page": page})["data"]
        for record in batch:
            family_cache[record["name_scientific"].lower()] = record
        if len(batch) < 100:
            break
        page += 1

    if key in family_cache:
        return family_cache[key]["id"]
    stats["errors"].append(f"Family creation failed: {family_name}")
    return None
|
|
|
|
|
|
# Accented letters -> ASCII replacements, applied in a single translate() pass.
_SLUG_TRANSLATION = str.maketrans({
    "ä": "a", "ö": "o", "ü": "u", "ß": "ss",
    "é": "e", "è": "e", "ê": "e", "ë": "e",
    "à": "a", "â": "a", "á": "a",
    "ô": "o", "ù": "u", "û": "u", "ú": "u",
    "ï": "i", "î": "i", "í": "i",
    "ç": "c", "ñ": "n", "ó": "o",
    "œ": "oe", "æ": "ae",
})


def slugify(text: str) -> str:
    """Generate a URL-safe slug.

    The text is lower-cased and known accented letters are transliterated.
    Every character other than ``a-z``, ``0-9``, whitespace and ``-`` is then
    dropped, whitespace runs become a single ``-``, repeated hyphens are
    collapsed and leading/trailing hyphens removed.

    Args:
        text: Arbitrary text, e.g. ``"<species> <variety>"``.

    Returns:
        The slug; empty when no usable characters remain.
    """
    text = text.lower().translate(_SLUG_TRANSLATION)
    text = re.sub(r'[^a-z0-9\s-]', '', text)
    text = re.sub(r'[\s]+', '-', text.strip())
    text = re.sub(r'-+', '-', text)
    return text.strip('-')
|
|
|
|
|
|
def find_existing_cultivar(species_name: str, variety_name: str, species_id: str) -> Optional[str]:
    """Return the id of an already known cultivar, or None.

    A cultivar counts as known when ``cultivar_cache`` holds the slug built
    from "<species_name> <variety_name>", or holds an entry of the same
    species whose name equals the variety name ignoring case.
    """
    by_slug = cultivar_cache.get(slugify(f"{species_name} {variety_name}"))
    if by_slug is not None:
        return by_slug["id"]

    wanted = variety_name.lower()
    return next(
        (
            entry["id"]
            for entry in cultivar_cache.values()
            if entry["species_id"] == species_id and entry["name"].lower() == wanted
        ),
        None,
    )
|
|
|
|
|
|
def scrape_category(cat_path: str, default_species: Optional[str]):
    """Scrape one category listing and process every product found on it.

    Fetches ``<SITE_BASE>/de/bio-saatgut/<cat_path>.html``.  An empty page
    (fetch_page's 404 result) is reported and skipped.  Otherwise the product
    and category counters in ``stats`` are updated and each product is handed
    to process_product() together with ``default_species``.
    """
    print(f"\n{'='*60}")
    print(f"Category: {cat_path}")

    listing_html = fetch_page(f"{SITE_BASE}/de/bio-saatgut/{cat_path}.html")
    if not listing_html:
        print(" SKIP: Page not found (404)")
        return

    # Pause before the detail-page requests that follow.
    time.sleep(DELAY)

    found = parse_product_links(listing_html)
    count = len(found)
    print(f" Found {count} products")
    stats["products_found"] += count
    stats["categories_scraped"] += 1

    for link, title in found:
        process_product(link, title, default_species)
|
|
|
|
|
|
def process_product(prod_url: str, prod_name: str, default_species: Optional[str]):
    """Process a single product: fetch detail, extract data, create cultivar.

    Steps:
      1. Derive article number and variety name from the listing name; skip
         products without a variety name and products that look like mixes.
      2. Fetch the detail page (best effort) for botanical name and
         description, falling back to ``default_species``.
      3. Resolve the species, then reuse or create the cultivar.  A 500
         "Database error" on creation is treated as a likely slug conflict
         and the cultivar list is re-read to find the existing record.
      4. Link the cultivar to the Bingenheimer supplier.

    Outcomes are counted in ``stats``; failures are appended to
    ``stats["errors"]`` instead of being raised.

    Args:
        prod_url: Absolute URL of the product detail page.
        prod_name: Product name as shown in the category listing.
        default_species: Scientific name to use when the detail page yields
            none; may be ``None``.
    """
    article_number = extract_article_number(prod_name, prod_url)
    variety_name = extract_variety_name(prod_name)

    if not variety_name:
        print(f" SKIP (no variety): {prod_name}")
        return

    # Skip mixes, sets, bundles
    skip_keywords = ["mischung", "saatscheibe", "saatband", "saatplatte",
                     "saat-set", " mix ", "trio ", "quartett", "gutschein",
                     "buch ", "düngung", "erde ", "-garten"]
    name_lower = prod_name.lower()
    # Exception: if the variety name itself is the whole thing, keep it
    if any(kw in name_lower for kw in skip_keywords) and variety_name.lower() != prod_name.lower():
        # Only skip if it really seems like a mix
        if "mischung" in name_lower or "mix" in name_lower or "trio" in name_lower:
            print(f" SKIP (mix/set): {prod_name}")
            return

    print(f"\n Product: {prod_name}")
    print(f" Variety: {variety_name}, SKU: {article_number}")

    # Fetch detail page.  Deliberately best effort: on any failure the
    # product is still imported using the category's default species.
    latin_name = None
    description = ""
    time.sleep(DELAY)
    try:
        detail_html = fetch_page(prod_url)
        stats["detail_pages_fetched"] += 1
        if detail_html:
            latin_name = extract_latin_from_detail(detail_html)
            description = extract_description_from_detail(detail_html)
    except Exception as e:
        print(f" WARNING: Detail page error: {e}")

    species_name = latin_name or default_species
    if not species_name:
        print(f" SKIP: No species for '{prod_name}'")
        stats["species_not_matched"].append(prod_name)
        return

    print(f" Species: {species_name}")

    species_id = find_or_create_species(species_name)
    if not species_id:
        print(f" SKIP: Could not resolve species '{species_name}'")
        return

    # Check if cultivar already exists
    cultivar_id = find_existing_cultivar(species_name, variety_name, species_id)

    if cultivar_id:
        print(" EXISTS: cultivar already in DB")
        stats["cultivars_existed"] += 1
    else:
        # Create cultivar
        data = {
            "species_id": species_id,
            "name": variety_name,
            "name_de": variety_name,
            "is_organic": True,
        }
        if description:
            data["description"] = description

        resp, code = api_post("/cultivars", data)
        # Coerce once so slicing below is safe whatever type "error" has.
        error_text = str(resp.get("error", ""))

        if "id" in resp:
            cultivar_id = resp["id"]
            # Fall back to the locally computed slug if the response has none,
            # so a successfully created cultivar is never lost to a KeyError.
            slug = resp.get("slug") or slugify(f"{species_name} {variety_name}")
            cultivar_cache[slug] = {
                "id": resp["id"],
                "name": variety_name,
                "species_id": species_id,
            }
            stats["cultivars_created"] += 1
            print(f" CREATED: {slug}")
        elif code == 500 and "Database error" in error_text:
            # Likely slug conflict - try to find existing
            print(" DB conflict - searching for existing cultivar...")
            wanted = variety_name.lower()
            page = 1
            while True:
                batch = api_get("/cultivars", {"per_page": 100, "page": page})["data"]
                for c in batch:
                    cultivar_cache[c["slug"]] = {
                        "id": c["id"],
                        "name": c["name"],
                        "species_id": c["species_id"],
                    }
                    if c["species_id"] == species_id and c["name"].lower() == wanted:
                        cultivar_id = c["id"]
                if cultivar_id or len(batch) < 100:
                    break
                page += 1

            if cultivar_id:
                print(f" Found existing after conflict: {cultivar_id}")
                stats["cultivars_existed"] += 1
            else:
                print(" ERROR: DB error and could not find existing cultivar")
                stats["errors"].append(f"DB error + not found: {species_name} / {variety_name}")
                return
        else:
            print(f" ERROR ({code}): {error_text[:100]}")
            stats["errors"].append(f"Create failed: {variety_name}: {error_text[:80]}")
            return

    # Link to supplier
    if cultivar_id and supplier_id:
        link_data = {
            "supplier_id": supplier_id,
            "product_url": prod_url,
        }
        if article_number:
            link_data["article_number"] = article_number

        resp, code = api_post(f"/cultivars/{cultivar_id}/suppliers", link_data)
        error_text = str(resp.get("error", ""))

        if "id" in resp:
            stats["supplier_links_created"] += 1
            print(f" LINKED (SKU: {article_number})")
        elif code == 500 or "already" in error_text.lower():
            # NOTE(review): every 500 is counted as "link exists", which can
            # also hide genuine server errors.
            stats["supplier_links_existed"] += 1
            print(" LINK EXISTS")
        else:
            print(f" LINK ERROR ({code}): {error_text[:80]}")
            stats["errors"].append(f"Link failed: {variety_name}: {error_text[:60]}")
|
|
|
|
|
|
def main():
    """Run the scraper: load API state, scrape every category, print a summary.

    An exception raised while scraping one category is recorded in
    ``stats["errors"]`` and the run continues with the next category.

    Returns:
        Process exit status: 0 when no errors were recorded, otherwise 1.
    """
    banner = "=" * 60
    print(banner)
    print("Bingenheimer Saatgut Scraper for HerbAPI")
    print(banner)

    load_api_data()

    print(f"\nScraping {len(ALL_CATEGORIES)} categories...")

    for category in ALL_CATEGORIES:
        cat_path, default_species = category
        try:
            scrape_category(cat_path, default_species)
        except Exception as e:
            print(f" ERROR in category {cat_path}: {e}")
            stats["errors"].append(f"Category error: {cat_path}: {e}")

    # Summary
    print("\n" + "=" * 60)
    print("SCRAPING COMPLETE - SUMMARY")
    print(banner)
    print(f"Categories scraped: {stats['categories_scraped']}")
    print(f"Products found: {stats['products_found']}")
    print(f"Detail pages fetched: {stats['detail_pages_fetched']}")
    print(f"Cultivars created: {stats['cultivars_created']}")
    print(f"Cultivars existed: {stats['cultivars_existed']}")
    print(f"Supplier links created: {stats['supplier_links_created']}")
    print(f"Supplier links existed: {stats['supplier_links_existed']}")
    print(f"Species created: {stats['species_created']}")
    print(f"Families created: {stats['families_created']}")
    print(f"Errors: {len(stats['errors'])}")

    # Both detail lists are capped at 30 lines of output.
    unmatched = stats["species_not_matched"]
    if unmatched:
        print(f"\nUnmatched species ({len(stats['species_not_matched'])}):")
        for entry in unmatched[:30]:
            print(f" - {entry}")

    failures = stats["errors"]
    if failures:
        print(f"\nErrors ({len(stats['errors'])}):")
        for entry in failures[:30]:
            print(f" - {entry}")

    return 1 if failures else 0


if __name__ == "__main__":
    sys.exit(main())
|