Files
herbapi/tools/scrapers/scrape_dreschflegel.py

761 lines
28 KiB
Python

#!/usr/bin/env python3
"""
Scraper for Dreschflegel organic seed catalog (dreschflegel-saatgut.de).
Extracts cultivar data and imports into HerbAPI.
Run 2 - fixes pagination (API caps at 100/page), better species matching,
caches scraped products, handles duplicates gracefully.
"""
import urllib.request
import urllib.parse
import urllib.error
import gzip
import json
import re
import time
import sys
import os
import html as html_mod
from collections import defaultdict
# --- Configuration ---
# Endpoint, credential and cache location can be overridden through the
# environment.  The literals are the values that used to be hard-coded and
# remain the defaults, so existing invocations keep working unchanged.
API_BASE = os.environ.get(
    "HERBAPI_BASE", "http://herbapi01.corp.sub-net.at:8080/api/v1"
)
# SECURITY: a bearer token committed to source must be treated as leaked.
# Rotate it and pass the replacement via HERBAPI_TOKEN instead of editing
# this default.
API_TOKEN = os.environ.get(
    "HERBAPI_TOKEN", "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
)
SITE_BASE = "https://www.dreschflegel-saatgut.de"
DELAY = 0.5  # seconds
USER_AGENT = "Mozilla/5.0 (compatible; HerbAPI-Scraper/1.0)"
CACHE_FILE = os.environ.get(
    "DRESCHFLEGEL_CACHE_FILE", "/tmp/dreschflegel_products_cache.json"
)

# Line-buffered output so progress shows up promptly when piped to a file.
# Streams that were swapped for objects without ``reconfigure`` are skipped
# rather than crashing at import time.
for _stream in (sys.stdout, sys.stderr):
    if hasattr(_stream, "reconfigure"):
        _stream.reconfigure(line_buffering=True)

# Run-wide counters, keyed by event name; missing keys start at 0.
stats = defaultdict(int)
def api_request(method, path, data=None, timeout=30):
    """Send a JSON request to HerbAPI and return the decoded response.

    Args:
        method: HTTP verb, e.g. "GET" or "POST".
        path: Endpoint path appended to API_BASE (may carry a query string).
        data: Optional JSON-serialisable payload.  It is sent as the request
            body whenever it is not None, so an empty dict is still sent.
        timeout: Socket timeout in seconds; without one a stalled server
            would block the whole import forever.

    Returns:
        The decoded JSON body, or None when the server answered with an HTTP
        error status.  Duplicate-style errors are swallowed silently, every
        other HTTP error is printed first.

    Raises:
        urllib.error.URLError: Connection-level failures are not caught here.
        ValueError: The response body was not valid JSON.
    """
    url = f"{API_BASE}{path}"
    body = json.dumps(data).encode("utf-8") if data is not None else None
    req = urllib.request.Request(url, data=body, method=method)
    req.add_header("Authorization", f"Bearer {API_TOKEN}")
    req.add_header("Content-Type", "application/json")
    req.add_header("Accept", "application/json")
    try:
        # The context manager closes the connection even if decoding fails.
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return json.loads(resp.read().decode("utf-8"))
    except urllib.error.HTTPError as e:
        body_text = e.read().decode("utf-8", errors="replace")
        lowered = body_text.lower()
        if e.code == 409 or "already exists" in lowered or "duplicate" in lowered:
            return None  # Duplicate, handled silently
        if e.code == 500 and "database error" in lowered:
            # Likely a unique constraint violation = duplicate
            return None
        print(f" API error {e.code} {method} {path}: {body_text[:200]}")
        return None
def fetch_page(url):
    """Fetch a web page using the scraper's User-Agent.

    No delay is applied here; callers that need rate limiting must sleep
    themselves.

    Args:
        url: Absolute URL to download.

    Returns:
        The body decoded as UTF-8 (undecodable bytes replaced), or None if
        the request failed for any reason.  The failure is printed.
    """
    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        # ``with`` makes sure the socket is released on every path.
        with urllib.request.urlopen(req, timeout=30) as resp:
            return resp.read().decode("utf-8", errors="replace")
    except Exception as e:  # best-effort: one bad page must not stop the run
        print(f" Fetch error {url}: {e}")
        return None
def get_sitemap_urls():
    """Download the sitemap index and collect the URLs of its child sitemaps.

    Only child sitemaps whose URL ends in ".xml.gz" are followed; other
    entries of the index are ignored.

    Returns:
        A flat list of every <loc> value found in the compressed child
        sitemaps, or an empty list if the index could not be fetched.
        A child sitemap that fails to download or decompress is reported
        and skipped.
    """
    print("Fetching sitemap index...")
    index_xml = fetch_page(f"{SITE_BASE}/sitemap.xml")
    if not index_xml:
        return []

    loc_pattern = re.compile(r"<loc>(.*?)</loc>")

    def extract_locs(xml_text):
        # Sitemap values are XML-escaped (e.g. "&amp;"), so decode entities
        # to obtain URLs that can actually be requested.
        return [html_mod.unescape(loc.strip()) for loc in loc_pattern.findall(xml_text)]

    all_urls = []
    for smap_url in extract_locs(index_xml):
        if not smap_url.endswith(".xml.gz"):
            continue
        print(" Fetching compressed sitemap...")
        req = urllib.request.Request(smap_url, headers={"User-Agent": USER_AGENT})
        try:
            # ``with`` closes the connection even when decompression fails.
            with urllib.request.urlopen(req, timeout=30) as resp:
                data = gzip.decompress(resp.read()).decode("utf-8")
            urls = extract_locs(data)
            all_urls.extend(urls)
            print(f" Found {len(urls)} URLs")
        except Exception as e:  # best-effort: skip the broken sitemap
            print(f" Error: {e}")
    return all_urls
def classify_urls(urls):
    """Keep only URLs that look like product pages.

    A URL qualifies when, after dropping trailing slashes and the site root,
    the remaining path is a single non-empty segment that does not begin
    with one of the known non-product prefixes.

    Args:
        urls: Iterable of absolute URLs.

    Returns:
        The qualifying URLs (trailing slashes removed), in input order.
    """
    non_product_prefixes = (
        "impressum", "agb", "datenschutz", "kontakt", "widerrufs",
        "versand", "abkuerz", "zertifikat", "wichtige-hinweise",
        "muster-", "gutscheine", "kalender", "flyer", "katalog",
        "sommer-herbst", "unsere-hoefe", "bestellschein",
        "dreschflegel-news", "termine", "rezepte", "anbautipps",
        "tipps-zur", "gartentelefon", "gartenfreude", "buecher",
        "navigation", "vielfalt", "sut20", "saatgut",
        "neuheiten", "kennenlernangebote", "sut25", "vielfalt25",
        "saatgut-vielfalt", "saat",
    )
    site_roots = (
        "https://dreschflegel-saatgut.de/",
        "https://www.dreschflegel-saatgut.de/",
    )

    product_urls = []
    for raw_url in urls:
        trimmed = raw_url.rstrip("/")
        slug = trimmed
        for root in site_roots:
            slug = slug.replace(root, "")
        # An empty slug is the home page; a slash means a nested page or a
        # foreign host whose root was not stripped.
        is_single_segment = bool(slug) and "/" not in slug
        if is_single_segment and not slug.startswith(non_product_prefixes):
            product_urls.append(trimmed)
    return product_urls
def parse_product_page(html_content):
    """Pull product fields out of the HTML of a Dreschflegel product page.

    Args:
        html_content: Page source as a string (may be None or empty).

    Returns:
        A dict that always has "name" and "botanical_name" and may have
        "article_number", "price", "description" and "pack_info"; or None
        when the page has no botname marker or lacks either required field.
    """
    if not html_content:
        return None
    if 'class="botname"' not in html_content:
        return None

    def first_group(pattern, flags=0):
        # Text of the first capture group, or None when nothing matches.
        found = re.search(pattern, html_content, flags)
        return found.group(1) if found else None

    product = {}

    heading = first_group(r"<h1>(.*?)</h1>")
    if heading is not None:
        product["name"] = html_mod.unescape(heading.strip())

    botanical = first_group(r'<div class="botname">\s*(.*?)\s*</div>', re.DOTALL)
    if botanical is not None:
        product["botanical_name"] = html_mod.unescape(botanical.strip())

    order_number = first_group(
        r'class="product-detail-ordernumber"[^>]*>\s*(\d+)', re.DOTALL
    )
    if order_number is not None:
        product["article_number"] = order_number

    raw_price = first_group(r'itemprop="price"[^>]*content="([^"]+)"')
    if raw_price is not None:
        try:
            product["price"] = float(raw_price)
        except ValueError:
            # A non-numeric price is simply omitted.
            pass

    first_paragraph = first_group(
        r"product-detail-description-text.*?<p>(.*?)</p>", re.DOTALL
    )
    if first_paragraph is not None:
        without_tags = re.sub(r"<[^>]+>", "", first_paragraph.strip())
        description = html_mod.unescape(without_tags).strip()
        if description:
            product["description"] = description

    pack = first_group(r"Inhalt reicht f[üu]r:</th><td>\s*(.*?)\s*</td>")
    if pack is not None:
        product["pack_info"] = html_mod.unescape(pack.strip())

    if "name" in product and "botanical_name" in product:
        return product
    return None
def _load_product_cache():
    """Return the on-disk URL -> product cache, or {} if missing or unreadable."""
    if not os.path.exists(CACHE_FILE):
        return {}
    try:
        with open(CACHE_FILE, "r", encoding="utf-8") as f:
            cache = json.load(f)
    except (OSError, ValueError) as e:
        # A truncated or corrupt cache should cost a re-scrape, not a crash.
        print(f" Ignoring unreadable cache {CACHE_FILE}: {e}")
        return {}
    if not isinstance(cache, dict):
        print(f" Ignoring unexpected cache content in {CACHE_FILE}")
        return {}
    print(f" Loaded {len(cache)} cached products")
    return cache


def _save_product_cache(cache):
    """Write the cache atomically so an interrupted run cannot corrupt it."""
    tmp_path = CACHE_FILE + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(cache, f)
    os.replace(tmp_path, CACHE_FILE)


def scrape_all_products(candidate_urls):
    """Scrape product pages, reusing cached results for known URLs.

    The cache maps URL -> parsed product dict, or None for pages that were
    fetched successfully but are not product pages.  URLs whose download
    failed are not cached, so a transient network error does not mark them
    as non-products on later runs.

    Args:
        candidate_urls: URLs to scrape.

    Returns:
        List of product dicts (cached ones first, then newly scraped ones).
        Each newly scraped product also gets a "url" key.
    """
    cache = _load_product_cache()

    to_fetch = [u for u in candidate_urls if u not in cache]
    already_cached = [u for u in candidate_urls if u in cache]

    # A cached None means "not a product page" and contributes nothing.
    products = [cache[u] for u in already_cached if cache[u]]
    cached_products = len(products)
    cached_non_products = len(already_cached) - cached_products
    print(f" {cached_products} products from cache, "
          f"{cached_non_products} non-products cached, "
          f"{len(to_fetch)} to fetch")

    for i, url in enumerate(to_fetch):
        if (i + 1) % 50 == 0 or i == 0:
            print(f" Fetching {i + 1}/{len(to_fetch)}...")
        time.sleep(DELAY)
        html_content = fetch_page(url)
        if not html_content:
            # Leave the URL uncached so the next run retries it.
            stats["fetch_errors"] += 1
            continue
        product = parse_product_page(html_content)
        if product:
            product["url"] = url
            products.append(product)
            cache[url] = product
            stats["products_scraped"] += 1
        else:
            cache[url] = None
            stats["not_product_pages"] += 1
        # Save cache periodically so an aborted run keeps most of its work.
        if (i + 1) % 100 == 0:
            _save_product_cache(cache)

    # Final cache save
    _save_product_cache(cache)
    print(f" Total: {len(products)} products ({stats['products_scraped']} newly scraped)")
    return products
def paginated_get(path):
    """Collect the "data" items of every page of a paginated endpoint.

    Pages of 100 items are requested starting at page 1.  Paging stops at
    the first response that is falsy, has no "data" key, has empty "data",
    or holds fewer than 100 items.

    Args:
        path: Endpoint path, with or without an existing query string.

    Returns:
        List of all items gathered so far (possibly empty).
    """
    joiner = "&" if "?" in path else "?"
    collected = []
    page_no = 1
    while True:
        resp = api_request("GET", f"{path}{joiner}per_page=100&page={page_no}")
        batch = resp["data"] if resp and "data" in resp else None
        if not batch:
            return collected
        collected.extend(batch)
        if len(batch) < 100:
            # A short page is the last one.
            return collected
        page_no += 1
def load_api_data():
    """Download families, species and cultivars from HerbAPI into lookups.

    Returns:
        A tuple (families, species, cultivars) of dicts:
        families keyed by lower-cased scientific name, species keyed by
        lower-cased and stripped scientific name, cultivars keyed by
        (species_id, lower-cased and stripped name).  Later records win
        when keys collide.
    """
    print("Loading HerbAPI data...")

    families = {
        record["name_scientific"].lower(): record
        for record in paginated_get("/families")
    }
    print(f" {len(families)} families")

    species = {
        record["name_scientific"].lower().strip(): record
        for record in paginated_get("/species")
    }
    print(f" {len(species)} species")

    cultivars = {
        (record["species_id"], record["name"].lower().strip()): record
        for record in paginated_get("/cultivars")
    }
    print(f" {len(cultivars)} cultivars")

    return families, species, cultivars
def ensure_supplier():
    """Return the Dreschflegel supplier record, creating it if necessary.

    The supplier list is searched for the first entry whose name contains
    "dreschflegel" (case-insensitive).  If none is found a new supplier is
    posted.

    Returns:
        The existing or newly created supplier record, or None when the
        creation request yielded no response.
    """
    listing = api_request("GET", "/suppliers")
    existing = next(
        (entry for entry in (listing or []) if "dreschflegel" in entry["name"].lower()),
        None,
    )
    if existing is not None:
        print(f" Supplier exists: {existing['name']} ({existing['id']})")
        return existing

    created = api_request("POST", "/suppliers", {
        "name": "Dreschflegel",
        "url": "https://www.dreschflegel-saatgut.de",
        "country": "DE",
        "is_organic": True,
        "is_demeter": False,
        "notes": "German organic seed cooperative, open-pollinated heritage varieties",
    })
    if created:
        print(f" Created supplier: {created['name']} ({created['id']})")
    return created
# Genus → family mapping for species creation.
# Keys are case-sensitive genus names exactly as they appear as the first
# word of a botanical name; values are the family used when a species has to
# be created.  Every genus is listed once: duplicate keys in a dict literal
# are silently collapsed by Python and only hide later edits.
GENUS_TO_FAMILY = {
    # Asteraceae
    "Achillea": "Asteraceae", "Artemisia": "Asteraceae", "Aster": "Asteraceae",
    "Calendula": "Asteraceae", "Carthamus": "Asteraceae", "Centaurea": "Asteraceae",
    "Chamomilla": "Asteraceae", "Chrysanthemum": "Asteraceae", "Cichorium": "Asteraceae",
    "Cnicus": "Asteraceae", "Cosmos": "Asteraceae", "Cynara": "Asteraceae",
    "Dahlia": "Asteraceae", "Dimorphotheca": "Asteraceae", "Echinacea": "Asteraceae",
    "Echinops": "Asteraceae", "Erigeron": "Asteraceae", "Eupatorium": "Asteraceae",
    "Gaillardia": "Asteraceae", "Helenium": "Asteraceae", "Helianthus": "Asteraceae",
    "Helichrysum": "Asteraceae", "Inula": "Asteraceae", "Lactuca": "Asteraceae",
    "Leontodon": "Asteraceae", "Matricaria": "Asteraceae", "Onopordum": "Asteraceae",
    "Petasites": "Asteraceae", "Rudbeckia": "Asteraceae", "Scorzonera": "Asteraceae",
    "Silphium": "Asteraceae", "Solidago": "Asteraceae", "Tagetes": "Asteraceae",
    "Tanacetum": "Asteraceae", "Taraxacum": "Asteraceae", "Telekia": "Asteraceae",
    "Tragopogon": "Asteraceae", "Tussilago": "Asteraceae", "Zinnia": "Asteraceae",
    "Xerochrysum": "Asteraceae", "Coreopsis": "Asteraceae",
    # Solanaceae
    "Capsicum": "Solanaceae", "Lycium": "Solanaceae", "Nicotiana": "Solanaceae",
    "Physalis": "Solanaceae", "Solanum": "Solanaceae", "Atropa": "Solanaceae",
    # Cucurbitaceae
    "Citrullus": "Cucurbitaceae", "Cucumis": "Cucurbitaceae", "Cucurbita": "Cucurbitaceae",
    "Luffa": "Cucurbitaceae", "Momordica": "Cucurbitaceae",
    # Fabaceae
    "Cicer": "Fabaceae", "Glycine": "Fabaceae", "Lathyrus": "Fabaceae",
    "Lens": "Fabaceae", "Lupinus": "Fabaceae", "Medicago": "Fabaceae",
    "Phaseolus": "Fabaceae", "Pisum": "Fabaceae", "Trifolium": "Fabaceae",
    "Trigonella": "Fabaceae", "Vicia": "Fabaceae", "Vigna": "Fabaceae",
    "Caragana": "Fabaceae", "Cytisus": "Fabaceae", "Robinia": "Fabaceae",
    # Brassicaceae
    "Armoracia": "Brassicaceae", "Barbarea": "Brassicaceae", "Brassica": "Brassicaceae",
    "Crambe": "Brassicaceae", "Eruca": "Brassicaceae", "Hesperis": "Brassicaceae",
    "Iberis": "Brassicaceae", "Isatis": "Brassicaceae", "Lepidium": "Brassicaceae",
    "Lunaria": "Brassicaceae", "Raphanus": "Brassicaceae", "Sinapis": "Brassicaceae",
    "Nasturtium": "Brassicaceae", "Diplotaxis": "Brassicaceae",
    # Apiaceae
    "Anethum": "Apiaceae", "Anthriscus": "Apiaceae", "Apium": "Apiaceae",
    "Carum": "Apiaceae", "Chaerophyllum": "Apiaceae", "Coriandrum": "Apiaceae",
    "Daucus": "Apiaceae", "Foeniculum": "Apiaceae", "Levisticum": "Apiaceae",
    "Myrrhis": "Apiaceae", "Pastinaca": "Apiaceae", "Petroselinum": "Apiaceae",
    "Pimpinella": "Apiaceae", "Angelica": "Apiaceae", "Aegopodium": "Apiaceae",
    # Lamiaceae
    "Agastache": "Lamiaceae", "Ajuga": "Lamiaceae", "Dracocephalum": "Lamiaceae",
    "Elsholtzia": "Lamiaceae", "Hyssopus": "Lamiaceae", "Lavandula": "Lamiaceae",
    "Melissa": "Lamiaceae", "Mentha": "Lamiaceae", "Monarda": "Lamiaceae",
    "Nepeta": "Lamiaceae", "Ocimum": "Lamiaceae", "Origanum": "Lamiaceae",
    "Perilla": "Lamiaceae", "Rosmarinus": "Lamiaceae", "Salvia": "Lamiaceae",
    "Satureja": "Lamiaceae", "Stachys": "Lamiaceae", "Thymus": "Lamiaceae",
    # Amaryllidaceae / Alliaceae
    "Allium": "Amaryllidaceae",
    # Poaceae
    "Avena": "Poaceae", "Hordeum": "Poaceae", "Panicum": "Poaceae",
    "Secale": "Poaceae", "Sorghum": "Poaceae", "Triticum": "Poaceae",
    "Zea": "Poaceae", "Setaria": "Poaceae",
    # Chenopodiaceae
    "Atriplex": "Chenopodiaceae", "Beta": "Chenopodiaceae",
    "Chenopodium": "Chenopodiaceae", "Spinacia": "Chenopodiaceae",
    # Rosaceae
    "Filipendula": "Rosaceae", "Fragaria": "Rosaceae", "Malus": "Rosaceae",
    "Prunus": "Rosaceae", "Pyrus": "Rosaceae", "Rosa": "Rosaceae",
    "Rubus": "Rosaceae", "Sanguisorba": "Rosaceae", "Sorbus": "Rosaceae",
    "Waldsteinia": "Rosaceae",
    # Boraginaceae
    "Borago": "Boraginaceae", "Phacelia": "Boraginaceae", "Symphytum": "Boraginaceae",
    "Pulmonaria": "Boraginaceae", "Myosotis": "Boraginaceae",
    # Malvaceae
    "Alcea": "Malvaceae", "Althaea": "Malvaceae", "Malva": "Malvaceae",
    "Hibiscus": "Malvaceae", "Lavatera": "Malvaceae", "Abelmoschus": "Malvaceae",
    # Polygonaceae
    "Fagopyrum": "Polygonaceae", "Rheum": "Polygonaceae", "Rumex": "Polygonaceae",
    # Caryophyllaceae
    "Agrostemma": "Caryophyllaceae", "Dianthus": "Caryophyllaceae",
    "Gypsophila": "Caryophyllaceae", "Lychnis": "Caryophyllaceae",
    "Saponaria": "Caryophyllaceae", "Silene": "Caryophyllaceae",
    # Tropaeolaceae
    "Tropaeolum": "Tropaeolaceae",
    # Papaveraceae
    "Eschscholzia": "Papaveraceae", "Papaver": "Papaveraceae",
    "Meconopsis": "Papaveraceae",
    # Caprifoliaceae
    "Valerianella": "Caprifoliaceae", "Valeriana": "Caprifoliaceae",
    "Lonicera": "Caprifoliaceae",
    # Plantaginaceae
    "Digitalis": "Plantaginaceae", "Plantago": "Plantaginaceae",
    "Antirrhinum": "Plantaginaceae", "Linaria": "Plantaginaceae",
    # Violaceae
    "Viola": "Violaceae",
    # Ranunculaceae
    "Aquilegia": "Ranunculaceae", "Consolida": "Ranunculaceae",
    "Delphinium": "Ranunculaceae", "Nigella": "Ranunculaceae",
    # Linaceae
    "Linum": "Linaceae",
    # Convolvulaceae
    "Ipomoea": "Convolvulaceae", "Convolvulus": "Convolvulaceae",
    # Portulacaceae / Montiaceae
    "Claytonia": "Montiaceae", "Portulaca": "Portulacaceae",
    # Amaranthaceae
    "Amaranthus": "Amaranthaceae", "Celosia": "Amaranthaceae",
    "Gomphrena": "Amaranthaceae",
    # Asparagaceae
    "Asparagus": "Asparagaceae",
    # Resedaceae
    "Reseda": "Resedaceae",
    # Balsaminaceae
    "Impatiens": "Balsaminaceae",
    # Hydrangeaceae
    "Hydrangea": "Hydrangeaceae",
    # Campanulaceae
    "Campanula": "Campanulaceae", "Phyteuma": "Campanulaceae",
    # Scrophulariaceae
    "Verbascum": "Scrophulariaceae",
    # Verbenaceae
    "Verbena": "Verbenaceae",
    # Onagraceae
    "Oenothera": "Onagraceae", "Clarkia": "Onagraceae",
    # Cucurbitaceae extras
    "Benincasa": "Cucurbitaceae", "Lagenaria": "Cucurbitaceae",
    # Hypericaceae
    "Hypericum": "Hypericaceae",
    # Adoxaceae
    "Sambucus": "Adoxaceae",
    # Others
    "Dipsacus": "Caprifoliaceae",
    "Knautia": "Caprifoliaceae",
    "Scabiosa": "Caprifoliaceae",
    "Succisa": "Caprifoliaceae",
    "Asclepias": "Apocynaceae",
    "Cynoglossum": "Boraginaceae",
    "Echium": "Boraginaceae",
    "Anchusa": "Boraginaceae",
    "Lithospermum": "Boraginaceae",
    "Onobrychis": "Fabaceae",
    "Ornithopus": "Fabaceae",
    "Lotus": "Fabaceae",
    "Anthyllis": "Fabaceae",
    "Melilotus": "Fabaceae",
    "Galega": "Fabaceae",
    "Lespedeza": "Fabaceae",
    "Arachis": "Fabaceae",
    "Senna": "Fabaceae",
    # Additional genera found in Dreschflegel catalog
    # NOTE(review): "Nicotinia" and "Amethystia" look like misspellings of
    # Nicotiana / Amethystea; presumably they mirror the catalog's own
    # spelling - confirm before correcting.
    "Acmella": "Asteraceae", "Adonis": "Ranunculaceae", "Ageratum": "Asteraceae",
    "Amethystia": "Lamiaceae", "Anacyclus": "Asteraceae", "Anthemis": "Asteraceae",
    "Asphodeline": "Asphodelaceae", "Brachyscome": "Asteraceae", "Bupleurum": "Apiaceae",
    "Callistephus": "Asteraceae", "Camelina": "Brassicaceae", "Cardaria": "Brassicaceae",
    "Cardiospermum": "Sapindaceae", "Cerinthe": "Boraginaceae",
    "Chamaemelum": "Asteraceae", "Cistanthe": "Montiaceae", "Cleome": "Cleomaceae",
    "Cochlearia": "Brassicaceae", "Codonopsis": "Campanulaceae", "Coix": "Poaceae",
    "Cyperus": "Cyperaceae", "Digitaria": "Poaceae", "Dorotheanthus": "Aizoaceae",
    "Emilia": "Asteraceae", "Eragrostis": "Poaceae", "Erysimum": "Brassicaceae",
    "Euphorbia": "Euphorbiaceae", "Gentiana": "Gentianaceae", "Geum": "Rosaceae",
    "Gilia": "Polemoniaceae", "Godetia": "Onagraceae", "Helipterum": "Asteraceae",
    "Lallemantia": "Lamiaceae", "Leonurus": "Lamiaceae", "Leuzea": "Asteraceae",
    "Liatris": "Asteraceae", "Malope": "Malvaceae", "Marrubium": "Lamiaceae",
    "Matthiola": "Brassicaceae", "Maurandya": "Plantaginaceae",
    "Melothria": "Cucurbitaceae", "Meum": "Apiaceae", "Nemesia": "Scrophulariaceae",
    "Nicandra": "Solanaceae", "Nicotinia": "Solanaceae", "Oenanthe": "Apiaceae",
    "Oxalis": "Oxalidaceae", "Pennisetum": "Poaceae", "Penstemon": "Plantaginaceae",
    "Phlox": "Polemoniaceae", "Polemonium": "Polemoniaceae",
    "Porophyllum": "Asteraceae", "Primula": "Primulaceae", "Psyllium": "Plantaginaceae",
    "Quamoclit": "Convolvulaceae", "Ruta": "Rutaceae", "Salpiglossis": "Solanaceae",
    "Sanvitalia": "Asteraceae", "Sideritis": "Lamiaceae", "Silybum": "Asteraceae",
    "Talinum": "Talinaceae", "Thelesperma": "Asteraceae", "Vaccaria": "Caryophyllaceae",
    "Veronica": "Plantaginaceae", "Xeranthemum": "Asteraceae",
}
def normalize_species_name(botanical_name):
    """Split a botanical name into (genus, species epithet) for matching.

    Only the first two or three words are looked at, so anything after the
    epithet (var., subsp., cultivar names ...) is dropped.

    Args:
        botanical_name: Name such as "Daucus carota subsp. sativus".

    Returns:
        (None, None) when fewer than two words are present;
        (genus, None) when the second word is a rank marker ("var.",
        "subsp.", "ssp.", "spec.", "sp.") or a hybrid marker with nothing
        after it, i.e. the name identifies a genus only;
        (genus, "x epithet") for hybrids written as "Genus x epithet",
        "Genus × epithet" or "Genus ×epithet";
        (genus, epithet) otherwise.
    """
    parts = botanical_name.split()
    if len(parts) < 2:
        return None, None

    genus, second = parts[0], parts[1]

    if second in ("var.", "subsp.", "ssp.", "spec.", "sp."):
        # Only genus level - can't match to species
        return genus, None

    # Hybrids are normalised to the ASCII "x" form whichever sign was typed.
    if second in ("x", "\u00d7"):
        if len(parts) < 3:
            # A bare hybrid sign is not an epithet.
            return genus, None
        return genus, f"x {parts[2]}"
    if second.startswith("\u00d7") and len(second) > 1:
        # Hybrid sign written directly against the epithet.
        return genus, f"x {second[1:]}"

    return genus, second
def find_species(botanical_name, species_cache):
    """Look up an existing species record for a botanical name.

    Lookup order:
      1. exact "genus epithet" key (lower-cased) in the cache;
      2. otherwise, if exactly one cached key starts with "genus ", that
         single record is returned - even when its epithet differs from
         the requested one.

    Args:
        botanical_name: Name as scraped from the product page.
        species_cache: Dict of lower-cased scientific name -> species record.

    Returns:
        The matching species record, or None.
    """
    genus, epithet = normalize_species_name(botanical_name)
    if not genus:
        return None

    if epithet:
        exact_key = f"{genus} {epithet}".lower()
        if exact_key in species_cache:
            return species_cache[exact_key]

    # Genus-level fallback: accepted only when it is unambiguous.
    genus_prefix = genus.lower() + " "
    same_genus = [
        record for key, record in species_cache.items()
        if key.startswith(genus_prefix)
    ]
    return same_genus[0] if len(same_genus) == 1 else None
def find_or_create_species(botanical_name, families, species_cache):
    """Return the species record for a botanical name, creating it if needed.

    When no existing species matches, the family is taken from
    GENUS_TO_FAMILY (creating the family through the API if it is not in
    `families`) and the species is posted.  If a creation request yields no
    response, the corresponding endpoint is re-read to pick up a record that
    already exists.

    Args:
        botanical_name: Name as scraped from the product page.
        families: Dict of lower-cased family name -> record; updated in place.
        species_cache: Dict of lower-cased species name -> record; updated
            in place.

    Returns:
        The species record, or None when the name has no usable epithet,
        the genus has no family mapping, or family/species could be neither
        created nor found.
    """
    existing = find_species(botanical_name, species_cache)
    if existing:
        return existing

    genus, species_epithet = normalize_species_name(botanical_name)
    if not (genus and species_epithet):
        stats["species_no_epithet"] += 1
        return None

    sci_name = f"{genus} {species_epithet}"
    sci_key = sci_name.lower()
    # Check cache again with normalized name
    if sci_key in species_cache:
        return species_cache[sci_key]

    family_name = GENUS_TO_FAMILY.get(genus)
    if not family_name:
        stats["species_no_family"] += 1
        print(f" [SKIP] No family mapping for genus: {genus} ({botanical_name})")
        return None

    family_key = family_name.lower()
    family = families.get(family_key)
    if not family:
        print(f" Creating family: {family_name}")
        new_family = api_request("POST", "/families", {"name_scientific": family_name})
        if new_family:
            family = new_family
            families[family_key] = new_family
            stats["families_created"] += 1
        else:
            # May already exist (duplicate from previous run) - reload
            reloaded = next(
                (
                    record for record in paginated_get("/families")
                    if record["name_scientific"].lower() == family_key
                ),
                None,
            )
            if reloaded is not None:
                family = reloaded
                families[family_key] = reloaded
        if not family:
            print(f" [SKIP] Cannot create family: {family_name}")
            return None

    print(f" Creating species: {sci_name} (family: {family_name})")
    new_species = api_request("POST", "/species", {
        "name_scientific": sci_name,
        "family_id": family["id"],
    })
    if new_species:
        species_cache[sci_key] = new_species
        stats["species_created"] += 1
        return new_species

    # May already exist - try to find it
    time.sleep(0.1)
    for record in paginated_get("/species"):
        if record["name_scientific"].lower() == sci_key:
            species_cache[sci_key] = record
            return record
    return None
# Common German crop type prefixes that precede the cultivar name in product
# titles.  Order is irrelevant: the longest matching prefix is chosen.
_CROP_TYPE_PREFIXES = (
    # Tomatoes
    "Salattomate", "Stabtomate", "Buschtomate", "Cocktailtomate",
    "Cherrytomate", "Fleischtomate", "Wildtomate", "Balkontomate",
    "Flaschentomate", "Eitomate", "Datteltomate", "Tomate",
    # Lettuce
    "Winterkopfsalat", "Kopfsalat", "Bataviasalat", "Eissalat",
    "Blattsalat", "Schnittsalat", "Pflücksalat", "Römersalat",
    "Spargelsalat", "Romanasalat",
    # Beans
    "Buschbohne", "Stangenbohne", "Feuerbohne", "Puffbohne",
    "Prunkbohne",
    # Peas
    "Markerbse", "Zuckererbse", "Palerbse", "Schalerbse",
    "Knackerbse", "Kapuzinererbse",
    # Cucumbers
    "Einlegegurke", "Salatgurke", "Schälgurke", "Landgurke",
    "Freilandgurke",
    # Squash
    "Hokkaidokürbis", "Butternutkürbis", "Speisekürbis",
    "Riesenkürbis", "Zierkürbis", "Muskatkürbis", "Ölkürbis",
    # Melon
    "Wassermelone", "Zuckermelone",
    # Peppers
    "Gemüsepaprika", "Blockpaprika", "Spitzpaprika", "Tomatenpaprika",
    "Snackpaprika", "Peperoni", "Chili",
    # Brassicas
    "Kohlrabi", "Brokkoli", "Blumenkohl", "Grünkohl", "Rosenkohl",
    "Wirsing", "Rotkohl", "Weißkohl", "Spitzkohl", "Palmkohl",
    "Chinakohl", "Pak Choi", "Markstammkohl",
    # Root veg
    "Möhre", "Karotte", "Pastinake", "Rote Bete", "Rote Beete",
    "Herbstrübe", "Mairübe", "Stoppelrübe", "Schwarzer Rettich",
    "Steckrübe", "Knollensellerie", "Petersilienwurzel",
    "Rettich", "Radieschen",
    # Onions
    "Winterheckenzwiebel", "Lauchzwiebel", "Speisezwiebel",
    "Schalotte", "Wintersteckzwiebel", "Zwiebel",
    # Herbs
    "Rotes Basilikum", "Buschbasilikum", "Zitronen-Basilikum",
    "Thai-Basilikum", "Wildes Basilikum", "Zimtbasilikum",
    "Basilikum", "Schnittknoblauch",
    # Grains
    "Sommerweizen", "Winterweizen", "Sommerroggen", "Winterroggen",
    "Nackthafer", "Nacktgerste", "Dinkel", "Emmer", "Einkorn",
    # Misc
    "Zuckermais", "Popcornmais",
    "Zucchini",
)


def extract_cultivar_name(product_name):
    """Strip a leading crop-type word from a product name.

    Args:
        product_name: Full product title, e.g. "Salattomate Rote Murmel".

    Returns:
        The text after the longest known crop-type prefix that is followed
        by a space, with surrounding whitespace removed; or the stripped
        product name unchanged when no prefix applies.
    """
    title = product_name.strip()
    # A prefix only counts when a space follows it, so a title that consists
    # of the crop word alone is returned as-is.
    longest = max(
        (prefix for prefix in _CROP_TYPE_PREFIXES if title.startswith(prefix + " ")),
        key=len,
        default=None,
    )
    if longest is None:
        return title
    return title[len(longest):].strip()
def get_existing_supplier_links(cultivar_id, supplier_id):
    """Tell whether a cultivar is already linked to a supplier.

    Args:
        cultivar_id: Cultivar whose supplier links are fetched.
        supplier_id: Supplier to look for among those links.

    Returns:
        True if any returned link carries this supplier_id, otherwise False
        (including when the request yields no response).
    """
    links = api_request("GET", f"/cultivars/{cultivar_id}/suppliers")
    if not links:
        return False
    return any(link["supplier_id"] == supplier_id for link in links)
def _resolve_cultivar(species_id, cultivar_name, product, cultivar_cache):
    """Return the cultivar record for a product, creating it if unknown.

    Updates `cultivar_cache` and the cultivar counters in `stats`.
    Returns None when the cultivar could be neither created nor found.
    """
    wanted = cultivar_name.lower().strip()
    cv_key = (species_id, wanted)
    if cv_key in cultivar_cache:
        stats["cultivars_existing"] += 1
        return cultivar_cache[cv_key]

    cv_data = {
        "species_id": species_id,
        "name": cultivar_name,
        "is_organic": True,
    }
    if product.get("description"):
        cv_data["description"] = product["description"]

    created = api_request("POST", "/cultivars", cv_data)
    if created:
        cultivar_cache[cv_key] = created
        stats["cultivars_created"] += 1
        return created

    # Might already exist from previous run - try to find it
    for candidate in paginated_get(f"/cultivars?species_id={species_id}"):
        if candidate["name"].lower().strip() == wanted:
            cultivar_cache[cv_key] = candidate
            stats["cultivars_existing"] += 1
            return candidate

    stats["cultivar_create_errors"] += 1
    return None


def _build_supplier_link(product, supplier_id):
    """Build the cultivar-supplier link payload from a scraped product."""
    payload = {
        "supplier_id": supplier_id,
        "article_number": product.get("article_number", ""),
        "product_url": product.get("url", ""),
        "price_eur": product.get("price"),
    }
    pack_info = product.get("pack_info", "")
    if pack_info:
        found = re.search(r"ca\.?\s*(\d+)\s*(Pfl|Korn|Samen|g|kg|ml)", pack_info)
        if found:
            amount, unit = found.groups()
            payload["pack_size"] = float(amount)
            # Units without a mapping are passed through unchanged.
            unit_names = {"Pfl": "Pflanzen", "Korn": "Korn", "Samen": "Korn"}
            payload["pack_unit"] = unit_names.get(unit, unit)
    return payload


def main():
    """Run the full pipeline: supplier, API data, sitemap, scrape, import.

    Exits with status 1 when the supplier cannot be obtained or the sitemap
    yields no URLs.  Prints the collected counters at the end.
    """
    banner = "=" * 60
    print(banner)
    print("Dreschflegel Seed Catalog Scraper for HerbAPI (v2)")
    print(banner)

    # Step 1: Supplier
    print("\n[1] Setting up supplier...")
    supplier = ensure_supplier()
    if not supplier:
        print("FATAL: Could not create/find supplier")
        sys.exit(1)
    supplier_id = supplier["id"]

    # Step 2: Load API data
    print("\n[2] Loading existing HerbAPI data...")
    families, species_cache, cultivar_cache = load_api_data()

    # Step 3: Get product URLs
    print("\n[3] Fetching sitemap...")
    all_urls = get_sitemap_urls()
    if not all_urls:
        print("FATAL: Could not fetch sitemap")
        sys.exit(1)
    candidate_urls = classify_urls(all_urls)
    print(f" {len(all_urls)} total URLs, {len(candidate_urls)} product candidates")

    # Step 4: Scrape
    print(f"\n[4] Scraping product pages...")
    products = scrape_all_products(candidate_urls)

    # Step 5: Import
    total = len(products)
    print(f"\n[5] Importing {total} products into HerbAPI...")
    for position, product in enumerate(products, start=1):
        if position % 50 == 0:
            print(f" Processing {position}/{len(products)}...")

        botanical = product.get("botanical_name", "")
        if not botanical:
            stats["no_botanical"] += 1
            continue

        species = find_or_create_species(botanical, families, species_cache)
        if not species:
            stats["species_not_matched"] += 1
            continue

        cultivar_name = extract_cultivar_name(product["name"])
        cultivar = _resolve_cultivar(
            species["id"], cultivar_name, product, cultivar_cache
        )
        if cultivar is None:
            continue

        # Link to supplier (check first for idempotency)
        if get_existing_supplier_links(cultivar["id"], supplier_id):
            stats["supplier_links_existing"] += 1
            continue

        link_data = _build_supplier_link(product, supplier_id)
        linked = api_request("POST", f"/cultivars/{cultivar['id']}/suppliers", link_data)
        outcome = "supplier_links_created" if linked else "supplier_link_errors"
        stats[outcome] += 1

    # Summary
    print("\n" + banner)
    print("RESULTS")
    print(banner)
    for key, val in sorted(stats.items()):
        print(f" {key}: {val}")
    print(f"\n Total species in DB: {len(species_cache)}")
    print(f" Total cultivars tracked: {len(cultivar_cache)}")


if __name__ == "__main__":
    main()