#!/usr/bin/env python3
"""
Scraper for Dreschflegel organic seed catalog (dreschflegel-saatgut.de).
Extracts cultivar data and imports into HerbAPI.

Run 2 - fixes pagination (API caps at 100/page), better species matching,
caches scraped products, handles duplicates gracefully.
"""

import urllib.request
import urllib.parse
import urllib.error
import gzip
import json
import re
import time
import sys
import os
import html as html_mod
from collections import defaultdict

# --- Configuration ---
API_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
# NOTE(review): a bearer token is committed in source. It is kept as the
# default so existing invocations keep working, but it can (and should) be
# supplied through the HERBAPI_TOKEN environment variable instead.
API_TOKEN = os.environ.get(
    "HERBAPI_TOKEN", "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
)
SITE_BASE = "https://www.dreschflegel-saatgut.de"
DELAY = 0.5
USER_AGENT = "Mozilla/5.0 (compatible; HerbAPI-Scraper/1.0)"
CACHE_FILE = "/tmp/dreschflegel_products_cache.json"
HTTP_TIMEOUT = 30  # seconds, applied to both site and API requests
PAGE_SIZE = 100  # the API caps per_page at 100

# Line-buffered output so progress is visible when piped. Guarded because a
# replaced stream (test harness, wrapper) may not offer reconfigure().
for _stream in (sys.stdout, sys.stderr):
    if hasattr(_stream, "reconfigure"):
        _stream.reconfigure(line_buffering=True)

# Run-wide counters, printed by main() at the end.
stats = defaultdict(int)


def api_request(method, path, data=None):
    """Make a JSON request to HerbAPI.

    Args:
        method: HTTP method, e.g. "GET" or "POST".
        path: Path (and optional query string) appended to API_BASE.
        data: Optional JSON-serialisable request payload.

    Returns:
        The decoded JSON response, or None on any failure. Responses that look
        like a duplicate (HTTP 409, "already exists"/"duplicate" in the body,
        or HTTP 500 with "database error") return None without logging; other
        HTTP, network and JSON-decoding failures are printed first.
    """
    url = f"{API_BASE}{path}"
    # "is not None" so that an explicitly empty payload is still sent.
    body = json.dumps(data).encode("utf-8") if data is not None else None
    req = urllib.request.Request(url, data=body, method=method)
    req.add_header("Authorization", f"Bearer {API_TOKEN}")
    req.add_header("Content-Type", "application/json")
    req.add_header("Accept", "application/json")
    try:
        with urllib.request.urlopen(req, timeout=HTTP_TIMEOUT) as resp:
            return json.loads(resp.read().decode("utf-8"))
    except urllib.error.HTTPError as e:
        body_text = e.read().decode("utf-8", errors="replace")
        lowered = body_text.lower()
        if e.code == 409 or "already exists" in lowered or "duplicate" in lowered:
            return None  # Duplicate, handled silently
        if e.code == 500 and "database error" in lowered:
            # Likely a unique constraint violation = duplicate
            return None
        print(f" API error {e.code} {method} {path}: {body_text[:200]}")
        return None
    except (OSError, ValueError) as e:
        # OSError covers URLError/timeouts; ValueError covers invalid JSON.
        print(f" API request failed {method} {path}: {e}")
        return None


def fetch_page(url):
    """Fetch a web page with the scraper's user-agent.

    Returns the body decoded as UTF-8 (undecodable bytes replaced), or None
    if the request fails for any reason (the error is printed).
    """
    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        with urllib.request.urlopen(req, timeout=HTTP_TIMEOUT) as resp:
            return resp.read().decode("utf-8", errors="replace")
    except Exception as e:
        print(f" Fetch error {url}: {e}")
        return None


# Sitemap and sitemap-index files both list their targets in <loc> elements.
_LOC_RE = re.compile(r"<loc>\s*(.*?)\s*</loc>", re.DOTALL)


def get_sitemap_urls():
    """Download the sitemap index and extract all URLs from its child sitemaps.

    Only gzip-compressed child sitemaps (``*.xml.gz``) are followed. Returns a
    list of URL strings; empty if the index could not be fetched.
    """
    print("Fetching sitemap index...")
    index_xml = fetch_page(f"{SITE_BASE}/sitemap.xml")
    if not index_xml:
        return []

    all_urls = []
    for smap_url in _LOC_RE.findall(index_xml):
        if not smap_url.endswith(".xml.gz"):
            continue
        print(" Fetching compressed sitemap...")
        req = urllib.request.Request(smap_url, headers={"User-Agent": USER_AGENT})
        try:
            with urllib.request.urlopen(req, timeout=HTTP_TIMEOUT) as resp:
                data = gzip.decompress(resp.read()).decode("utf-8")
            urls = _LOC_RE.findall(data)
            all_urls.extend(urls)
            print(f" Found {len(urls)} URLs")
        except Exception as e:
            print(f" Error: {e}")
    return all_urls


# Both host spellings that may appear in the sitemap.
_SITE_PREFIXES = (
    "https://www.dreschflegel-saatgut.de/",
    "https://dreschflegel-saatgut.de/",
)

# Single-segment paths starting with any of these are shop/info pages.
_SKIP_PREFIXES = (
    "impressum", "agb", "datenschutz", "kontakt", "widerrufs",
    "versand", "abkuerz", "zertifikat", "wichtige-hinweise",
    "muster-", "gutscheine", "kalender", "flyer", "katalog",
    "sommer-herbst", "unsere-hoefe", "bestellschein",
    "dreschflegel-news", "termine", "rezepte", "anbautipps",
    "tipps-zur", "gartentelefon", "gartenfreude", "buecher",
    "navigation", "vielfalt", "sut20", "saatgut", "neuheiten",
    "kennenlernangebote", "sut25", "vielfalt25", "saatgut-vielfalt",
    "saat",
)


def classify_urls(urls):
    """Filter URLs to likely product pages (single-segment paths).

    A URL is kept when, after removing a trailing slash and the site prefix,
    the remaining path is non-empty, has no further "/" and does not start
    with one of the known non-product prefixes. Duplicates are dropped while
    preserving first-seen order.
    """
    candidates = []
    seen = set()
    for url in urls:
        url = url.rstrip("/")
        path = url
        for base in _SITE_PREFIXES:
            if url.startswith(base):
                path = url[len(base):]
                break
        # URLs on other hosts (or the bare site root) still contain "/".
        if not path or "/" in path:
            continue
        if path.startswith(_SKIP_PREFIXES):
            continue
        if url in seen:
            continue
        seen.add(url)
        candidates.append(url)
    return candidates


# NOTE(review): the tag portions of the name, botanical-name, description and
# pack-info patterns are reconstructions that accept any attributes on the
# element - confirm against a live product page.
_TAG_RE = re.compile(r"<[^>]+>")
_NAME_RE = re.compile(r"<h1\b[^>]*>(.*?)</h1>", re.DOTALL)
_BOTNAME_RE = re.compile(
    r'<(\w+)\b[^>]*class="botname"[^>]*>(.*?)</\1\s*>', re.DOTALL
)
_ORDERNUMBER_RE = re.compile(
    r'class="product-detail-ordernumber"[^>]*>\s*(\d+)', re.DOTALL
)
_PRICE_RE = re.compile(r'itemprop="price"[^>]*content="([^"]+)"')
_DESCRIPTION_RE = re.compile(
    r"product-detail-description-text.*?<p\b[^>]*>(.*?)</p>", re.DOTALL
)
_PACK_INFO_RE = re.compile(r"Inhalt reicht f(?:[üu]|&uuml;)r:\s*(.*?)\s*<")


def _plain_text(fragment):
    """Strip tags and HTML entities from an HTML fragment."""
    return html_mod.unescape(_TAG_RE.sub("", fragment)).strip()


def parse_product_page(html_content):
    """Extract product data from a Dreschflegel product page.

    Returns a dict with "name" and "botanical_name" and, when present on the
    page, "article_number", "price" (float), "description" and "pack_info".
    Returns None if the page has no ``class="botname"`` element or if either
    the name or the botanical name cannot be extracted.
    """
    if not html_content or 'class="botname"' not in html_content:
        return None

    result = {}

    m = _NAME_RE.search(html_content)
    if m:
        name = " ".join(_plain_text(m.group(1)).split())
        if name:
            result["name"] = name

    m = _BOTNAME_RE.search(html_content)
    if m:
        botanical = " ".join(_plain_text(m.group(2)).split())
        if botanical:
            result["botanical_name"] = botanical

    m = _ORDERNUMBER_RE.search(html_content)
    if m:
        result["article_number"] = m.group(1)

    m = _PRICE_RE.search(html_content)
    if m:
        try:
            result["price"] = float(m.group(1))
        except ValueError:
            pass  # non-numeric price attribute: leave the price unset

    m = _DESCRIPTION_RE.search(html_content)
    if m:
        desc = _plain_text(m.group(1))
        if desc:
            result["description"] = desc

    m = _PACK_INFO_RE.search(html_content)
    if m:
        pack_info = html_mod.unescape(m.group(1).strip())
        if pack_info:
            result["pack_info"] = pack_info

    return result if "name" in result and "botanical_name" in result else None


def _load_cache():
    """Read the URL -> product cache; an unreadable cache is treated as empty."""
    if not os.path.exists(CACHE_FILE):
        return {}
    try:
        with open(CACHE_FILE, "r", encoding="utf-8") as f:
            cache = json.load(f)
    except (OSError, ValueError) as e:
        print(f" Ignoring unreadable cache {CACHE_FILE}: {e}")
        return {}
    if not isinstance(cache, dict):
        return {}
    print(f" Loaded {len(cache)} cached products")
    return cache


def _save_cache(cache):
    """Write the cache atomically so an interrupted run cannot corrupt it."""
    tmp_path = CACHE_FILE + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(cache, f)
    os.replace(tmp_path, CACHE_FILE)


def scrape_all_products(candidate_urls):
    """Scrape product pages, using cache for already-scraped URLs.

    The cache maps URL -> product dict, or None for pages that were fetched
    but are not product pages. URLs whose fetch failed are NOT cached, so a
    transient network error is retried on the next run.

    Returns the list of product dicts (cached and newly scraped).
    """
    cache = _load_cache()

    products = []
    to_fetch = [u for u in candidate_urls if u not in cache]
    already_cached = [u for u in candidate_urls if u in cache]

    # Add cached products
    for u in already_cached:
        if cache[u]:  # None means "not a product page"
            products.append(cache[u])

    cached_products = len(products)
    cached_non_products = len(already_cached) - cached_products
    print(f" {cached_products} products from cache, "
          f"{cached_non_products} non-products cached, "
          f"{len(to_fetch)} to fetch")

    for i, url in enumerate(to_fetch):
        if (i + 1) % 50 == 0 or i == 0:
            print(f" Fetching {i + 1}/{len(to_fetch)}...")

        time.sleep(DELAY)
        html_content = fetch_page(url)
        if not html_content:
            # Deliberately left out of the cache: the failure may be transient.
            stats["fetch_errors"] += 1
            continue

        product = parse_product_page(html_content)
        if product:
            product["url"] = url
            products.append(product)
            cache[url] = product
            stats["products_scraped"] += 1
        else:
            cache[url] = None
            stats["not_product_pages"] += 1

        # Save cache periodically
        if (i + 1) % 100 == 0:
            _save_cache(cache)

    # Final cache save
    _save_cache(cache)

    print(f" Total: {len(products)} products ({stats['products_scraped']} newly scraped)")
    return products


def paginated_get(path):
    """Fetch all pages from a paginated API endpoint.

    Requests ``per_page=100`` pages starting at 1 and concatenates each
    response's "data" list until a page is missing, empty or short.
    """
    all_items = []
    separator = "&" if "?" in path else "?"
    page = 1
    while True:
        resp = api_request(
            "GET", f"{path}{separator}per_page={PAGE_SIZE}&page={page}"
        )
        if not resp or "data" not in resp or not resp["data"]:
            break
        all_items.extend(resp["data"])
        if len(resp["data"]) < PAGE_SIZE:
            break
        page += 1
    return all_items


def _as_items(resp):
    """Return the item list of a response given as a list or {"data": [...]}."""
    if isinstance(resp, dict):
        resp = resp.get("data")
    return resp if isinstance(resp, list) else []


def load_api_data():
    """Load all species, families, cultivars from HerbAPI.

    Returns:
        (families, species, cultivars) where families and species are keyed by
        lower-cased scientific name and cultivars by
        (species_id, lower-cased name).
    """
    print("Loading HerbAPI data...")

    families = {}
    for f in paginated_get("/families"):
        families[f["name_scientific"].lower()] = f
    print(f" {len(families)} families")

    species = {}
    for s in paginated_get("/species"):
        species[s["name_scientific"].lower().strip()] = s
    print(f" {len(species)} species")

    cultivars = {}
    for c in paginated_get("/cultivars"):
        key = (c["species_id"], c["name"].lower().strip())
        cultivars[key] = c
    print(f" {len(cultivars)} cultivars")

    return families, species, cultivars


def ensure_supplier():
    """Create or find the Dreschflegel supplier.

    Returns the supplier record, or None if it could be neither found nor
    created.
    """
    for s in _as_items(api_request("GET", "/suppliers")):
        if "dreschflegel" in s["name"].lower():
            print(f" Supplier exists: {s['name']} ({s['id']})")
            return s

    data = {
        "name": "Dreschflegel",
        "url": "https://www.dreschflegel-saatgut.de",
        "country": "DE",
        "is_organic": True,
        "is_demeter": False,
        "notes": "German organic seed cooperative, open-pollinated heritage varieties",
    }
    resp = api_request("POST", "/suppliers", data)
    if resp:
        print(f" Created supplier: {resp['name']} ({resp['id']})")
    return resp


# Genus → family mapping for species creation
GENUS_TO_FAMILY = {
    # Asteraceae
    "Achillea": "Asteraceae", "Artemisia": "Asteraceae", "Aster": "Asteraceae",
    "Calendula": "Asteraceae", "Carthamus": "Asteraceae", "Centaurea": "Asteraceae",
    "Chamomilla": "Asteraceae", "Chrysanthemum": "Asteraceae", "Cichorium": "Asteraceae",
    "Cnicus": "Asteraceae", "Cosmos": "Asteraceae", "Cynara": "Asteraceae",
    "Dahlia": "Asteraceae", "Dimorphotheca": "Asteraceae", "Echinacea": "Asteraceae",
    "Echinops": "Asteraceae", "Erigeron": "Asteraceae", "Eupatorium": "Asteraceae",
    "Gaillardia": "Asteraceae", "Helenium": "Asteraceae", "Helianthus": "Asteraceae",
    "Helichrysum": "Asteraceae", "Inula": "Asteraceae", "Lactuca": "Asteraceae",
    "Leontodon": "Asteraceae", "Matricaria": "Asteraceae", "Onopordum": "Asteraceae",
    "Petasites": "Asteraceae", "Rudbeckia": "Asteraceae", "Scorzonera": "Asteraceae",
    "Silphium": "Asteraceae", "Solidago": "Asteraceae", "Tagetes": "Asteraceae",
    "Tanacetum": "Asteraceae", "Taraxacum": "Asteraceae", "Telekia": "Asteraceae",
    "Tragopogon": "Asteraceae", "Tussilago": "Asteraceae", "Zinnia": "Asteraceae",
    "Xerochrysum": "Asteraceae", "Coreopsis": "Asteraceae",
    # Solanaceae
    "Capsicum": "Solanaceae", "Lycium": "Solanaceae", "Nicotiana": "Solanaceae",
    "Physalis": "Solanaceae", "Solanum": "Solanaceae", "Atropa": "Solanaceae",
    # Cucurbitaceae
    "Citrullus": "Cucurbitaceae", "Cucumis": "Cucurbitaceae", "Cucurbita": "Cucurbitaceae",
    "Luffa": "Cucurbitaceae", "Momordica": "Cucurbitaceae",
    # Fabaceae
    "Cicer": "Fabaceae", "Glycine": "Fabaceae", "Lathyrus": "Fabaceae",
    "Lens": "Fabaceae", "Lupinus": "Fabaceae", "Medicago": "Fabaceae",
    "Phaseolus": "Fabaceae", "Pisum": "Fabaceae", "Trifolium": "Fabaceae",
    "Trigonella": "Fabaceae", "Vicia": "Fabaceae", "Vigna": "Fabaceae",
    "Caragana": "Fabaceae", "Cytisus": "Fabaceae", "Robinia": "Fabaceae",
    # Brassicaceae
    "Armoracia": "Brassicaceae", "Barbarea": "Brassicaceae", "Brassica": "Brassicaceae",
    "Crambe": "Brassicaceae", "Eruca": "Brassicaceae", "Hesperis": "Brassicaceae",
    "Iberis": "Brassicaceae", "Isatis": "Brassicaceae", "Lepidium": "Brassicaceae",
    "Lunaria": "Brassicaceae", "Raphanus": "Brassicaceae", "Sinapis": "Brassicaceae",
    "Nasturtium": "Brassicaceae", "Diplotaxis": "Brassicaceae",
    # Apiaceae
    "Anethum": "Apiaceae", "Anthriscus": "Apiaceae", "Apium": "Apiaceae",
    "Carum": "Apiaceae", "Chaerophyllum": "Apiaceae", "Coriandrum": "Apiaceae",
    "Daucus": "Apiaceae", "Foeniculum": "Apiaceae", "Levisticum": "Apiaceae",
    "Myrrhis": "Apiaceae", "Pastinaca": "Apiaceae", "Petroselinum": "Apiaceae",
    "Pimpinella": "Apiaceae", "Angelica": "Apiaceae", "Aegopodium": "Apiaceae",
    # Lamiaceae
    "Agastache": "Lamiaceae", "Ajuga": "Lamiaceae", "Dracocephalum": "Lamiaceae",
    "Elsholtzia": "Lamiaceae", "Hyssopus": "Lamiaceae", "Lavandula": "Lamiaceae",
    "Melissa": "Lamiaceae", "Mentha": "Lamiaceae", "Monarda": "Lamiaceae",
    "Nepeta": "Lamiaceae", "Ocimum": "Lamiaceae", "Origanum": "Lamiaceae",
    "Perilla": "Lamiaceae", "Rosmarinus": "Lamiaceae", "Salvia": "Lamiaceae",
    "Satureja": "Lamiaceae", "Stachys": "Lamiaceae", "Thymus": "Lamiaceae",
    # Amaryllidaceae / Alliaceae
    "Allium": "Amaryllidaceae",
    # Poaceae
    "Avena": "Poaceae", "Hordeum": "Poaceae", "Panicum": "Poaceae",
    "Secale": "Poaceae", "Sorghum": "Poaceae", "Triticum": "Poaceae",
    "Zea": "Poaceae", "Setaria": "Poaceae",
    # Chenopodiaceae
    "Atriplex": "Chenopodiaceae", "Beta": "Chenopodiaceae",
    "Chenopodium": "Chenopodiaceae", "Spinacia": "Chenopodiaceae",
    # Rosaceae
    "Filipendula": "Rosaceae", "Fragaria": "Rosaceae", "Malus": "Rosaceae",
    "Prunus": "Rosaceae", "Pyrus": "Rosaceae", "Rosa": "Rosaceae",
    "Rubus": "Rosaceae", "Sanguisorba": "Rosaceae", "Sorbus": "Rosaceae",
    "Waldsteinia": "Rosaceae",
    # Boraginaceae
    "Borago": "Boraginaceae", "Phacelia": "Boraginaceae", "Symphytum": "Boraginaceae",
    "Pulmonaria": "Boraginaceae", "Myosotis": "Boraginaceae",
    # Malvaceae
    "Alcea": "Malvaceae", "Althaea": "Malvaceae", "Malva": "Malvaceae",
    "Hibiscus": "Malvaceae", "Lavatera": "Malvaceae", "Abelmoschus": "Malvaceae",
    # Polygonaceae
    "Fagopyrum": "Polygonaceae", "Rheum": "Polygonaceae", "Rumex": "Polygonaceae",
    # Caryophyllaceae
    "Agrostemma": "Caryophyllaceae", "Dianthus": "Caryophyllaceae",
    "Gypsophila": "Caryophyllaceae", "Lychnis": "Caryophyllaceae",
    "Saponaria": "Caryophyllaceae", "Silene": "Caryophyllaceae",
    # Tropaeolaceae
    "Tropaeolum": "Tropaeolaceae",
    # Papaveraceae
    "Eschscholzia": "Papaveraceae", "Papaver": "Papaveraceae",
    "Meconopsis": "Papaveraceae",
    # Caprifoliaceae
    "Valerianella": "Caprifoliaceae", "Valeriana": "Caprifoliaceae",
    "Lonicera": "Caprifoliaceae",
    # Adoxaceae
    "Sambucus": "Adoxaceae",
    # Plantaginaceae
    "Digitalis": "Plantaginaceae", "Plantago": "Plantaginaceae",
    "Antirrhinum": "Plantaginaceae", "Linaria": "Plantaginaceae",
    # Violaceae
    "Viola": "Violaceae",
    # Ranunculaceae
    "Aquilegia": "Ranunculaceae", "Consolida": "Ranunculaceae",
    "Delphinium": "Ranunculaceae", "Nigella": "Ranunculaceae",
    # Linaceae
    "Linum": "Linaceae",
    # Convolvulaceae
    "Ipomoea": "Convolvulaceae", "Convolvulus": "Convolvulaceae",
    # Portulacaceae / Montiaceae
    "Claytonia": "Montiaceae", "Portulaca": "Portulacaceae",
    # Amaranthaceae
    "Amaranthus": "Amaranthaceae", "Celosia": "Amaranthaceae",
    "Gomphrena": "Amaranthaceae",
    # Asparagaceae
    "Asparagus": "Asparagaceae",
    # Resedaceae
    "Reseda": "Resedaceae",
    # Balsaminaceae
    "Impatiens": "Balsaminaceae",
    # Hydrangeaceae
    "Hydrangea": "Hydrangeaceae",
    # Campanulaceae
    "Campanula": "Campanulaceae", "Phyteuma": "Campanulaceae",
    # Scrophulariaceae
    "Verbascum": "Scrophulariaceae",
    # Verbenaceae
    "Verbena": "Verbenaceae",
    # Onagraceae
    "Oenothera": "Onagraceae", "Clarkia": "Onagraceae",
    # Cucurbitaceae extras
    "Benincasa": "Cucurbitaceae", "Lagenaria": "Cucurbitaceae",
    # Hypericaceae
    "Hypericum": "Hypericaceae",
    # Others
    "Dipsacus": "Caprifoliaceae", "Knautia": "Caprifoliaceae",
    "Scabiosa": "Caprifoliaceae", "Succisa": "Caprifoliaceae",
    "Asclepias": "Apocynaceae",
    "Cynoglossum": "Boraginaceae", "Echium": "Boraginaceae",
    "Anchusa": "Boraginaceae", "Lithospermum": "Boraginaceae",
    "Onobrychis": "Fabaceae", "Ornithopus": "Fabaceae", "Lotus": "Fabaceae",
    "Anthyllis": "Fabaceae", "Melilotus": "Fabaceae", "Galega": "Fabaceae",
    "Lespedeza": "Fabaceae", "Arachis": "Fabaceae", "Senna": "Fabaceae",
    # Additional genera found in Dreschflegel catalog
    "Acmella": "Asteraceae", "Adonis": "Ranunculaceae", "Ageratum": "Asteraceae",
    "Amethystia": "Lamiaceae", "Anacyclus": "Asteraceae", "Anthemis": "Asteraceae",
    "Asphodeline": "Asphodelaceae", "Brachyscome": "Asteraceae",
    "Bupleurum": "Apiaceae", "Callistephus": "Asteraceae",
    "Camelina": "Brassicaceae", "Cardaria": "Brassicaceae",
    "Cardiospermum": "Sapindaceae", "Cerinthe": "Boraginaceae",
    "Chamaemelum": "Asteraceae", "Cistanthe": "Montiaceae",
    "Cleome": "Cleomaceae", "Cochlearia": "Brassicaceae",
    "Codonopsis": "Campanulaceae", "Coix": "Poaceae", "Cyperus": "Cyperaceae",
    "Digitaria": "Poaceae", "Dorotheanthus": "Aizoaceae", "Emilia": "Asteraceae",
    "Eragrostis": "Poaceae", "Erysimum": "Brassicaceae",
    "Euphorbia": "Euphorbiaceae", "Gentiana": "Gentianaceae", "Geum": "Rosaceae",
    "Gilia": "Polemoniaceae", "Godetia": "Onagraceae", "Helipterum": "Asteraceae",
    "Lallemantia": "Lamiaceae", "Leonurus": "Lamiaceae", "Leuzea": "Asteraceae",
    "Liatris": "Asteraceae", "Malope": "Malvaceae", "Marrubium": "Lamiaceae",
    "Matthiola": "Brassicaceae", "Maurandya": "Plantaginaceae",
    "Melothria": "Cucurbitaceae", "Meum": "Apiaceae",
    "Nemesia": "Scrophulariaceae", "Nicandra": "Solanaceae",
    # NOTE(review): "Nicotinia" looks like a misspelling of Nicotiana; kept
    # because the key may mirror a spelling used on the scraped site.
    "Nicotinia": "Solanaceae",
    "Oenanthe": "Apiaceae", "Oxalis": "Oxalidaceae", "Pennisetum": "Poaceae",
    "Penstemon": "Plantaginaceae", "Phlox": "Polemoniaceae",
    "Polemonium": "Polemoniaceae", "Porophyllum": "Asteraceae",
    "Primula": "Primulaceae", "Psyllium": "Plantaginaceae",
    "Quamoclit": "Convolvulaceae", "Ruta": "Rutaceae",
    "Salpiglossis": "Solanaceae", "Sanvitalia": "Asteraceae",
    "Sideritis": "Lamiaceae", "Silybum": "Asteraceae", "Talinum": "Talinaceae",
    "Thelesperma": "Asteraceae", "Vaccaria": "Caryophyllaceae",
    "Veronica": "Plantaginaceae", "Xeranthemum": "Asteraceae",
}

# Second tokens that mean "no usable species epithet follows the genus".
_RANK_MARKERS = ("var.", "subsp.", "ssp.", "spec.", "sp.")
_HYBRID_SIGNS = ("x", "×")


def normalize_species_name(botanical_name):
    """Normalize botanical name to 'Genus species' for matching.
    Handles var., subsp., ssp., hybrids etc.

    Returns:
        (genus, species): species is None when only the genus is usable, and
        both are None when the name has fewer than two tokens. Hybrids are
        returned as "x epithet" whether written "x epithet", "× epithet" or
        "×epithet".
    """
    parts = botanical_name.strip().split()
    if len(parts) < 2:
        return None, None

    genus, second = parts[0], parts[1]

    # Handle 'Genus x species' (hybrid notation)
    if second in _HYBRID_SIGNS:
        if len(parts) >= 3:
            return genus, f"x {parts[2]}"
        return genus, None  # dangling hybrid sign: no epithet available
    if second.startswith("×") and len(second) > 1:
        return genus, f"x {second[1:]}"
    if second in _RANK_MARKERS:
        # Only genus level - can't match to species
        return genus, None
    return genus, second


def find_species(botanical_name, species_cache):
    """Find existing species matching a botanical name.

    With a species epithet, only an exact 'genus epithet' match (ignoring
    var./subsp. suffixes and case) is accepted. For genus-only names the
    genus' species is returned when the cache holds exactly one species of
    that genus. Returns None otherwise.
    """
    genus, sp = normalize_species_name(botanical_name)
    if not genus:
        return None

    if sp:
        # A known epithet must match exactly; falling back to "the only
        # species of this genus" here would attach cultivars to the wrong
        # species.
        return species_cache.get(f"{genus} {sp}".lower())

    genus_prefix = genus.lower() + " "
    matches = [v for k, v in species_cache.items() if k.startswith(genus_prefix)]
    if len(matches) == 1:
        # Only one species in this genus - use it
        return matches[0]
    return None


def find_or_create_species(botanical_name, families, species_cache):
    """Find or create a species from a botanical name.

    Creates the family first when needed (via GENUS_TO_FAMILY). Updates
    ``families`` and ``species_cache`` in place. Returns the species record,
    or None if the name has no epithet, the genus has no family mapping, or
    the API calls fail.
    """
    # Try to find existing
    sp = find_species(botanical_name, species_cache)
    if sp:
        return sp

    genus, species_epithet = normalize_species_name(botanical_name)
    if not genus or not species_epithet:
        stats["species_no_epithet"] += 1
        return None

    sci_name = f"{genus} {species_epithet}"
    sci_key = sci_name.lower()

    # Need to create - find the family
    family_name = GENUS_TO_FAMILY.get(genus)
    if not family_name:
        stats["species_no_family"] += 1
        print(f" [SKIP] No family mapping for genus: {genus} ({botanical_name})")
        return None

    # Find or create the family
    family_key = family_name.lower()
    family = families.get(family_key)
    if not family:
        print(f" Creating family: {family_name}")
        resp = api_request("POST", "/families", {"name_scientific": family_name})
        if resp:
            families[family_key] = resp
            family = resp
            stats["families_created"] += 1
        else:
            # May already exist (duplicate from previous run) - reload
            for f in paginated_get("/families"):
                if f["name_scientific"].lower() == family_key:
                    families[family_key] = f
                    family = f
                    break
        if not family:
            print(f" [SKIP] Cannot create family: {family_name}")
            return None

    # Create species
    print(f" Creating species: {sci_name} (family: {family_name})")
    resp = api_request("POST", "/species", {
        "name_scientific": sci_name,
        "family_id": family["id"],
    })
    if resp:
        species_cache[sci_key] = resp
        stats["species_created"] += 1
        return resp

    # May already exist - try to find it
    time.sleep(0.1)
    for s in paginated_get("/species"):
        if s["name_scientific"].lower() == sci_key:
            species_cache[sci_key] = s
            return s
    return None


# Common German crop type prefixes to strip, sorted longest first (once, at
# import time) so that e.g. "Stabtomate" wins over "Tomate".
_CROP_PREFIXES = tuple(sorted({
    # Tomatoes
    "Salattomate", "Stabtomate", "Buschtomate", "Cocktailtomate",
    "Cherrytomate", "Fleischtomate", "Wildtomate", "Balkontomate",
    "Flaschentomate", "Eitomate", "Datteltomate", "Tomate",
    # Lettuce
    "Winterkopfsalat", "Kopfsalat", "Bataviasalat", "Eissalat",
    "Blattsalat", "Schnittsalat", "Pflücksalat", "Römersalat",
    "Spargelsalat", "Romanasalat",
    # Beans
    "Buschbohne", "Stangenbohne", "Feuerbohne", "Puffbohne", "Prunkbohne",
    # Peas
    "Markerbse", "Zuckererbse", "Palerbse", "Schalerbse", "Knackerbse",
    "Kapuzinererbse",
    # Cucumbers
    "Einlegegurke", "Salatgurke", "Schälgurke", "Landgurke", "Freilandgurke",
    # Squash
    "Hokkaidokürbis", "Butternutkürbis", "Speisekürbis", "Riesenkürbis",
    "Zierkürbis", "Muskatkürbis", "Ölkürbis",
    # Melon
    "Wassermelone", "Zuckermelone",
    # Peppers
    "Gemüsepaprika", "Blockpaprika", "Spitzpaprika", "Tomatenpaprika",
    "Snackpaprika", "Peperoni", "Chili",
    # Brassicas
    "Kohlrabi", "Brokkoli", "Blumenkohl", "Grünkohl", "Rosenkohl",
    "Wirsing", "Rotkohl", "Weißkohl", "Spitzkohl", "Palmkohl",
    "Chinakohl", "Pak Choi", "Markstammkohl",
    # Root veg
    "Möhre", "Karotte", "Pastinake", "Rote Bete", "Rote Beete",
    "Herbstrübe", "Mairübe", "Stoppelrübe", "Schwarzer Rettich",
    "Steckrübe", "Knollensellerie", "Petersilienwurzel",
    "Rettich", "Radieschen",
    # Onions
    "Winterheckenzwiebel", "Lauchzwiebel", "Speisezwiebel", "Schalotte",
    "Wintersteckzwiebel", "Zwiebel",
    # Herbs
    "Rotes Basilikum", "Buschbasilikum", "Zitronen-Basilikum",
    "Thai-Basilikum", "Wildes Basilikum", "Zimtbasilikum", "Basilikum",
    "Schnittknoblauch",
    # Grains
    "Sommerweizen", "Winterweizen", "Sommerroggen", "Winterroggen",
    "Nackthafer", "Nacktgerste", "Dinkel", "Emmer", "Einkorn",
    # Misc
    "Zuckermais", "Popcornmais", "Zucchini",
}, key=len, reverse=True))


def extract_cultivar_name(product_name):
    """Extract the cultivar/variety name from the full product name.

    Strips the longest known crop-type prefix that is followed by a space;
    names without such a prefix are returned stripped but otherwise unchanged.
    """
    name = product_name.strip()
    for prefix in _CROP_PREFIXES:
        if name.startswith(prefix + " "):
            return name[len(prefix):].strip()
    return name


def get_existing_supplier_links(cultivar_id, supplier_id):
    """Check if a cultivar-supplier link already exists."""
    resp = api_request("GET", f"/cultivars/{cultivar_id}/suppliers")
    return any(link["supplier_id"] == supplier_id for link in _as_items(resp))


_PACK_RE = re.compile(r"ca\.?\s*(\d+)\s*(Pfl|Korn|Samen|g|kg|ml)")
_PACK_UNITS = {"Pfl": "Pflanzen", "Korn": "Korn", "Samen": "Korn"}


def _find_or_create_cultivar(product, species_id, cultivar_cache):
    """Return the cultivar record for a product, creating it if necessary."""
    cultivar_name = extract_cultivar_name(product["name"])
    name_key = cultivar_name.lower().strip()
    cv_key = (species_id, name_key)

    # Check if cultivar already exists
    if cv_key in cultivar_cache:
        stats["cultivars_existing"] += 1
        return cultivar_cache[cv_key]

    cv_data = {
        "species_id": species_id,
        "name": cultivar_name,
        "is_organic": True,
    }
    if product.get("description"):
        cv_data["description"] = product["description"]

    cv = api_request("POST", "/cultivars", cv_data)
    if cv:
        cultivar_cache[cv_key] = cv
        stats["cultivars_created"] += 1
        return cv

    # Might already exist from previous run - try to find it
    for c in paginated_get(f"/cultivars?species_id={species_id}"):
        if c["name"].lower().strip() == name_key:
            cultivar_cache[cv_key] = c
            stats["cultivars_existing"] += 1
            return c

    stats["cultivar_create_errors"] += 1
    return None


def _build_supplier_link(product, supplier_id):
    """Build the payload linking a cultivar to the supplier for one product."""
    link_data = {
        "supplier_id": supplier_id,
        "article_number": product.get("article_number", ""),
        "product_url": product.get("url", ""),
        "price_eur": product.get("price"),
    }
    m = _PACK_RE.search(product.get("pack_info", ""))
    if m:
        link_data["pack_size"] = float(m.group(1))
        link_data["pack_unit"] = _PACK_UNITS.get(m.group(2), m.group(2))
    return link_data


def _import_product(product, supplier_id, families, species_cache, cultivar_cache):
    """Import one scraped product: species, cultivar, then supplier link."""
    botanical = product.get("botanical_name", "")
    if not botanical:
        stats["no_botanical"] += 1
        return

    # Find or create species
    sp = find_or_create_species(botanical, families, species_cache)
    if not sp:
        stats["species_not_matched"] += 1
        return

    cv = _find_or_create_cultivar(product, sp["id"], cultivar_cache)
    if not cv:
        return

    # Link to supplier (check first for idempotency)
    if get_existing_supplier_links(cv["id"], supplier_id):
        stats["supplier_links_existing"] += 1
        return

    link_data = _build_supplier_link(product, supplier_id)
    resp = api_request("POST", f"/cultivars/{cv['id']}/suppliers", link_data)
    if resp:
        stats["supplier_links_created"] += 1
    else:
        stats["supplier_link_errors"] += 1


def main():
    """Run the full pipeline: supplier, API data, sitemap, scrape, import."""
    print("=" * 60)
    print("Dreschflegel Seed Catalog Scraper for HerbAPI (v2)")
    print("=" * 60)

    # Step 1: Supplier
    print("\n[1] Setting up supplier...")
    supplier = ensure_supplier()
    if not supplier:
        print("FATAL: Could not create/find supplier")
        sys.exit(1)
    supplier_id = supplier["id"]

    # Step 2: Load API data
    print("\n[2] Loading existing HerbAPI data...")
    families, species_cache, cultivar_cache = load_api_data()

    # Step 3: Get product URLs
    print("\n[3] Fetching sitemap...")
    all_urls = get_sitemap_urls()
    if not all_urls:
        print("FATAL: Could not fetch sitemap")
        sys.exit(1)

    candidate_urls = classify_urls(all_urls)
    print(f" {len(all_urls)} total URLs, {len(candidate_urls)} product candidates")

    # Step 4: Scrape
    print("\n[4] Scraping product pages...")
    products = scrape_all_products(candidate_urls)

    # Step 5: Import
    print(f"\n[5] Importing {len(products)} products into HerbAPI...")
    for i, product in enumerate(products):
        if (i + 1) % 50 == 0:
            print(f" Processing {i + 1}/{len(products)}...")
        _import_product(product, supplier_id, families, species_cache, cultivar_cache)

    # Summary
    print("\n" + "=" * 60)
    print("RESULTS")
    print("=" * 60)
    for key, val in sorted(stats.items()):
        print(f" {key}: {val}")
    print(f"\n Total species in DB: {len(species_cache)}")
    print(f" Total cultivars tracked: {len(cultivar_cache)}")


if __name__ == "__main__":
    main()