herbapi/tools/scrapers/scrape_dreschflegel.py

#!/usr/bin/env python3
"""
Scraper for Dreschflegel organic seed catalog (dreschflegel-saatgut.de).
Extracts cultivar data and imports into HerbAPI.

Run 2 - fixes pagination (API caps at 100/page), better species matching,
caches scraped products, handles duplicates gracefully.
"""

import urllib.request
import urllib.parse
import urllib.error
import gzip
import json
import re
import time
import sys
import os
import html as html_mod
from collections import defaultdict

# --- Configuration ---
# Deployment-specific values can be overridden through environment variables;
# the literals are the defaults this script has always used, so running it
# without any of these variables set behaves exactly as before.
API_BASE = os.environ.get(
    "HERBAPI_BASE", "http://herbapi01.corp.sub-net.at:8080/api/v1"
)
# NOTE(review): a bearer token is committed here in plain text. Prefer supplying
# it via HERBAPI_TOKEN and rotating/removing this default.
API_TOKEN = os.environ.get(
    "HERBAPI_TOKEN", "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
)
SITE_BASE = "https://www.dreschflegel-saatgut.de"
DELAY = 0.5  # seconds slept before each product-page fetch
USER_AGENT = "Mozilla/5.0 (compatible; HerbAPI-Scraper/1.0)"
CACHE_FILE = os.environ.get(
    "DRESCHFLEGEL_CACHE_FILE", "/tmp/dreschflegel_products_cache.json"
)

# Line-buffered output so progress shows up immediately when piped/logged.
# Guarded because a replaced stream object may not offer reconfigure().
for _stream in (sys.stdout, sys.stderr):
    if hasattr(_stream, "reconfigure"):
        _stream.reconfigure(line_buffering=True)

# Run-wide counters (name -> count), printed in the final summary.
stats = defaultdict(int)


def api_request(method, path, data=None, timeout=30):
    """Send one JSON request to HerbAPI and return the decoded response.

    Args:
        method: HTTP method, e.g. "GET" or "POST".
        path: Path (and optional query string) appended to ``API_BASE``.
        data: Optional JSON-serialisable payload. ``None`` sends no body;
            an empty dict/list is still sent as ``{}`` / ``[]``.
        timeout: Socket timeout in seconds for the request.

    Returns:
        The decoded JSON document, or ``None`` when the request failed, the
        server reported a duplicate, or the response had no usable JSON body.
        Callers treat ``None`` as "no result".
    """
    url = f"{API_BASE}{path}"
    # `is not None` rather than truthiness, so an empty payload is not dropped.
    body = json.dumps(data).encode("utf-8") if data is not None else None
    req = urllib.request.Request(url, data=body, method=method)
    req.add_header("Authorization", f"Bearer {API_TOKEN}")
    req.add_header("Content-Type", "application/json")
    req.add_header("Accept", "application/json")
    try:
        # Context manager closes the connection even if read() fails.
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            raw = resp.read()
    except urllib.error.HTTPError as e:
        body_text = e.read().decode("utf-8", errors="replace")
        lowered = body_text.lower()
        if e.code == 409 or "already exists" in lowered or "duplicate" in lowered:
            return None  # Duplicate, handled silently
        if e.code == 500 and "database error" in lowered:
            # Likely a unique constraint violation = duplicate
            return None
        print(f"  API error {e.code} {method} {path}: {body_text[:200]}")
        return None
    except OSError as e:
        # URLError and socket timeouts are OSError subclasses: connection
        # refused, DNS failure, timeout. Report and signal failure like any
        # other API error instead of aborting the whole run.
        print(f"  API connection error {method} {path}: {e}")
        return None

    if not raw.strip():
        # e.g. 204 No Content - nothing to decode.
        return None
    try:
        return json.loads(raw.decode("utf-8"))
    except ValueError as e:  # covers JSONDecodeError and UnicodeDecodeError
        print(f"  API returned invalid JSON for {method} {path}: {e}")
        return None


def fetch_page(url):
    """Fetch ``url`` and return its body as text, or ``None`` on any failure.

    The request carries the configured ``USER_AGENT`` and a 30 second timeout.
    The body is decoded as UTF-8 with undecodable bytes replaced. No delay is
    applied here; callers that need throttling sleep before calling.
    """
    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        # Context manager releases the connection once the body is read.
        with urllib.request.urlopen(req, timeout=30) as resp:
            return resp.read().decode("utf-8", errors="replace")
    except Exception as e:
        # Deliberately broad: scraping is best-effort, one bad URL must not
        # abort the run. The failure is reported and signalled via None.
        print(f"  Fetch error {url}: {e}")
        return None


def get_sitemap_urls():
    """Download the site's sitemap and return every page URL it lists.

    ``{SITE_BASE}/sitemap.xml`` is fetched first. Each ``<loc>`` in it is
    handled by suffix:

    * ``.xml.gz`` - downloaded, gunzipped and scanned for ``<loc>`` entries;
    * ``.xml``    - fetched as plain XML and scanned the same way;
    * other       - taken as a page URL itself (sitemap.xml is a flat URL set).

    XML entities in ``<loc>`` values (e.g. ``&amp;``) are unescaped.

    Returns:
        List of URLs in document order; empty if the top-level sitemap could
        not be fetched. A child sitemap that fails is reported and skipped.
    """
    print("Fetching sitemap index...")
    index_xml = fetch_page(f"{SITE_BASE}/sitemap.xml")
    if not index_xml:
        return []

    # Tolerate whitespace/newlines inside <loc> elements.
    loc_pattern = re.compile(r"<loc>\s*(.*?)\s*</loc>", re.DOTALL)
    all_urls = []

    for raw_loc in loc_pattern.findall(index_xml):
        loc = html_mod.unescape(raw_loc)

        if loc.endswith(".xml.gz"):
            print("  Fetching compressed sitemap...")
            req = urllib.request.Request(loc, headers={"User-Agent": USER_AGENT})
            try:
                with urllib.request.urlopen(req, timeout=30) as resp:
                    data = gzip.decompress(resp.read()).decode("utf-8")
            except Exception as e:
                # Best-effort: skip this child sitemap, keep the others.
                print(f"    Error: {e}")
                continue
        elif loc.endswith(".xml"):
            print("  Fetching sitemap...")
            data = fetch_page(loc)
            if not data:
                continue
        else:
            # Not a nested sitemap: the entry is already a page URL.
            all_urls.append(loc)
            continue

        urls = [html_mod.unescape(u) for u in loc_pattern.findall(data)]
        all_urls.extend(urls)
        print(f"    Found {len(urls)} URLs")

    return all_urls


def classify_urls(urls):
    """Return the URLs that look like product pages.

    A URL qualifies when, after dropping trailing slashes and the shop's host
    prefix, what remains is a single non-empty path segment that does not
    start with one of the known non-product prefixes. Qualifying URLs are
    returned without their trailing slash, in input order.
    """
    # Passed to str.startswith as a tuple: one call tests every prefix.
    skip_prefixes = (
        "impressum", "agb", "datenschutz", "kontakt", "widerrufs",
        "versand", "abkuerz", "zertifikat", "wichtige-hinweise",
        "muster-", "gutscheine", "kalender", "flyer", "katalog",
        "sommer-herbst", "unsere-hoefe", "bestellschein",
        "dreschflegel-news", "termine", "rezepte", "anbautipps",
        "tipps-zur", "gartentelefon", "gartenfreude", "buecher",
        "navigation", "vielfalt", "sut20", "saatgut",
        "neuheiten", "kennenlernangebote", "sut25", "vielfalt25",
        "saatgut-vielfalt", "saat",
    )
    host_prefixes = (
        "https://dreschflegel-saatgut.de/",
        "https://www.dreschflegel-saatgut.de/",
    )

    candidates = []
    for raw_url in urls:
        trimmed = raw_url.rstrip("/")

        path = trimmed
        for host in host_prefixes:
            path = path.replace(host, "")

        # Anything still containing "/" is a nested path (or a foreign host).
        single_segment = bool(path) and "/" not in path
        if single_segment and not path.startswith(skip_prefixes):
            candidates.append(trimmed)
    return candidates


def parse_product_page(html_content):
    """Pull product fields out of the HTML of a Dreschflegel product page.

    Returns a dict with ``name`` and ``botanical_name`` plus, when present on
    the page, ``article_number``, ``price`` (float), ``description`` and
    ``pack_info``. Returns ``None`` for empty input, for pages without a
    ``class="botname"`` marker, or when name/botanical name cannot be found.
    """
    if not html_content:
        return None
    if 'class="botname"' not in html_content:
        return None

    def capture(pattern, flags=0):
        """Return group 1 of the first match of ``pattern``, else None."""
        found = re.search(pattern, html_content, flags)
        return None if found is None else found.group(1)

    product = {}

    heading = capture(r"<h1>(.*?)</h1>")
    if heading is not None:
        product["name"] = html_mod.unescape(heading.strip())

    botanical = capture(r'<div class="botname">\s*(.*?)\s*</div>', re.DOTALL)
    if botanical is not None:
        product["botanical_name"] = html_mod.unescape(botanical.strip())

    order_number = capture(
        r'class="product-detail-ordernumber"[^>]*>\s*(\d+)', re.DOTALL
    )
    if order_number is not None:
        product["article_number"] = order_number

    raw_price = capture(r'itemprop="price"[^>]*content="([^"]+)"')
    if raw_price is not None:
        try:
            product["price"] = float(raw_price)
        except ValueError:
            # Unparseable price: leave the field out.
            pass

    paragraph = capture(
        r"product-detail-description-text.*?<p>(.*?)</p>", re.DOTALL
    )
    if paragraph is not None:
        # Strip inline markup first, then decode entities.
        without_tags = re.sub(r"<[^>]+>", "", paragraph.strip())
        text = html_mod.unescape(without_tags).strip()
        if text:
            product["description"] = text

    pack = capture(r"Inhalt reicht f[üu]r:</th><td>\s*(.*?)\s*</td>")
    if pack is not None:
        product["pack_info"] = html_mod.unescape(pack.strip())

    if "name" in product and "botanical_name" in product:
        return product
    return None


def _save_product_cache(cache):
    """Write ``cache`` to CACHE_FILE via a temp file and an atomic rename."""
    tmp_path = f"{CACHE_FILE}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(cache, f)
    # os.replace swaps the file in one step, so an interrupted run never
    # leaves a half-written cache behind.
    os.replace(tmp_path, CACHE_FILE)


def scrape_all_products(candidate_urls):
    """Scrape product pages, reusing cached results for known URLs.

    The cache (``CACHE_FILE``) maps URL -> parsed product dict, or ``None``
    for a page that was fetched but is not a product page. URLs that fail to
    download are NOT cached, so a transient error is retried on the next run.
    (Entries written as ``None`` by earlier versions after a fetch error
    cannot be told apart and stay cached.)

    Args:
        candidate_urls: URLs to consider, typically from ``classify_urls``.

    Returns:
        List of product dicts (cached ones first, then newly scraped), each
        newly scraped one carrying its source under ``"url"``.
    """
    # Load cache; an unreadable or malformed file is ignored, not fatal.
    cache = {}
    if os.path.exists(CACHE_FILE):
        try:
            with open(CACHE_FILE, "r", encoding="utf-8") as f:
                cache = json.load(f)
        except (OSError, ValueError) as e:
            print(f"  Ignoring unreadable cache {CACHE_FILE}: {e}")
            cache = {}
        if not isinstance(cache, dict):
            cache = {}
        print(f"  Loaded {len(cache)} cached products")

    to_fetch = [u for u in candidate_urls if u not in cache]
    # Truthy cache entries are products; None means "not a product page".
    products = [cache[u] for u in candidate_urls if cache.get(u)]

    cached_products = len(products)
    cached_non_products = (len(candidate_urls) - len(to_fetch)) - cached_products
    print(f"  {cached_products} products from cache, "
          f"{cached_non_products} non-products cached, "
          f"{len(to_fetch)} to fetch")

    for i, url in enumerate(to_fetch):
        if (i + 1) % 50 == 0 or i == 0:
            print(f"  Fetching {i + 1}/{len(to_fetch)}...")

        time.sleep(DELAY)  # throttle requests to the shop
        html_content = fetch_page(url)
        if not html_content:
            # Download failed: count it, but leave the URL uncached so the
            # next run retries instead of treating it as a non-product.
            stats["fetch_errors"] += 1
            continue

        product = parse_product_page(html_content)
        if product:
            product["url"] = url
            products.append(product)
            cache[url] = product
            stats["products_scraped"] += 1
        else:
            cache[url] = None
            stats["not_product_pages"] += 1

        # Save cache periodically so an aborted run keeps its progress
        if (i + 1) % 100 == 0:
            _save_product_cache(cache)

    # Final cache save
    _save_product_cache(cache)

    print(f"  Total: {len(products)} products ({stats['products_scraped']} newly scraped)")
    return products


def paginated_get(path):
    """Collect the ``data`` items of every page of a paginated GET endpoint.

    Pages of 100 items are requested starting at page 1. Collection stops at
    the first response that is empty, lacks a non-empty ``"data"`` entry, or
    holds fewer than 100 items (the last page).
    """
    # Append with "&" when the path already carries a query string.
    joiner = "&" if "?" in path else "?"
    collected = []
    page = 0
    while True:
        page += 1
        resp = api_request("GET", f"{path}{joiner}per_page=100&page={page}")
        if not resp or "data" not in resp or not resp["data"]:
            return collected
        batch = resp["data"]
        collected.extend(batch)
        if len(batch) < 100:
            return collected


def load_api_data():
    """Fetch families, species and cultivars from HerbAPI as lookup dicts.

    Returns:
        ``(families, species, cultivars)`` where families are keyed by
        lower-cased scientific name, species by lower-cased, stripped
        scientific name, and cultivars by ``(species_id, lower-cased stripped
        name)``. When keys collide, the record fetched last wins.
    """
    print("Loading HerbAPI data...")

    families = {
        fam["name_scientific"].lower(): fam
        for fam in paginated_get("/families")
    }
    print(f"  {len(families)} families")

    species = {
        entry["name_scientific"].lower().strip(): entry
        for entry in paginated_get("/species")
    }
    print(f"  {len(species)} species")

    cultivars = {
        (cv["species_id"], cv["name"].lower().strip()): cv
        for cv in paginated_get("/cultivars")
    }
    print(f"  {len(cultivars)} cultivars")

    return families, species, cultivars


def ensure_supplier():
    """Return the Dreschflegel supplier record, creating it if none exists.

    The first supplier whose name contains "dreschflegel" (case-insensitive)
    is reused. Otherwise a new supplier is posted. Returns the record, or the
    falsy API result when creation failed.
    """
    listing = api_request("GET", "/suppliers") or []
    match = next(
        (entry for entry in listing if "dreschflegel" in entry["name"].lower()),
        None,
    )
    if match is not None:
        print(f"  Supplier exists: {match['name']} ({match['id']})")
        return match

    payload = {
        "name": "Dreschflegel",
        "url": "https://www.dreschflegel-saatgut.de",
        "country": "DE",
        "is_organic": True,
        "is_demeter": False,
        "notes": "German organic seed cooperative, open-pollinated heritage varieties",
    }
    created = api_request("POST", "/suppliers", payload)
    if created:
        print(f"  Created supplier: {created['name']} ({created['id']})")
    return created


# Genus → family mapping for species creation.
# Keys are genus names exactly as they appear as the first word of a botanical
# name (lookup is case-sensitive); values are the scientific family name used
# to find or create the family record. Each genus is listed exactly once.
GENUS_TO_FAMILY = {
    # Asteraceae
    "Achillea": "Asteraceae", "Artemisia": "Asteraceae", "Aster": "Asteraceae",
    "Calendula": "Asteraceae", "Carthamus": "Asteraceae", "Centaurea": "Asteraceae",
    "Chamomilla": "Asteraceae", "Chrysanthemum": "Asteraceae", "Cichorium": "Asteraceae",
    "Cnicus": "Asteraceae", "Cosmos": "Asteraceae", "Cynara": "Asteraceae",
    "Dahlia": "Asteraceae", "Dimorphotheca": "Asteraceae", "Echinacea": "Asteraceae",
    "Echinops": "Asteraceae", "Erigeron": "Asteraceae", "Eupatorium": "Asteraceae",
    "Gaillardia": "Asteraceae", "Helenium": "Asteraceae", "Helianthus": "Asteraceae",
    "Helichrysum": "Asteraceae", "Inula": "Asteraceae", "Lactuca": "Asteraceae",
    "Leontodon": "Asteraceae", "Matricaria": "Asteraceae", "Onopordum": "Asteraceae",
    "Petasites": "Asteraceae", "Rudbeckia": "Asteraceae", "Scorzonera": "Asteraceae",
    "Silphium": "Asteraceae", "Solidago": "Asteraceae", "Tagetes": "Asteraceae",
    "Tanacetum": "Asteraceae", "Taraxacum": "Asteraceae", "Telekia": "Asteraceae",
    "Tragopogon": "Asteraceae", "Tussilago": "Asteraceae", "Zinnia": "Asteraceae",
    "Xerochrysum": "Asteraceae", "Coreopsis": "Asteraceae",
    # Solanaceae
    "Capsicum": "Solanaceae", "Lycium": "Solanaceae", "Nicotiana": "Solanaceae",
    "Physalis": "Solanaceae", "Solanum": "Solanaceae", "Atropa": "Solanaceae",
    # Cucurbitaceae
    "Citrullus": "Cucurbitaceae", "Cucumis": "Cucurbitaceae", "Cucurbita": "Cucurbitaceae",
    "Luffa": "Cucurbitaceae", "Momordica": "Cucurbitaceae",
    # Fabaceae
    "Cicer": "Fabaceae", "Glycine": "Fabaceae", "Lathyrus": "Fabaceae",
    "Lens": "Fabaceae", "Lupinus": "Fabaceae", "Medicago": "Fabaceae",
    "Phaseolus": "Fabaceae", "Pisum": "Fabaceae", "Trifolium": "Fabaceae",
    "Trigonella": "Fabaceae", "Vicia": "Fabaceae", "Vigna": "Fabaceae",
    "Caragana": "Fabaceae", "Cytisus": "Fabaceae", "Robinia": "Fabaceae",
    # Brassicaceae
    "Armoracia": "Brassicaceae", "Barbarea": "Brassicaceae", "Brassica": "Brassicaceae",
    "Crambe": "Brassicaceae", "Eruca": "Brassicaceae", "Hesperis": "Brassicaceae",
    "Iberis": "Brassicaceae", "Isatis": "Brassicaceae", "Lepidium": "Brassicaceae",
    "Lunaria": "Brassicaceae", "Raphanus": "Brassicaceae", "Sinapis": "Brassicaceae",
    "Nasturtium": "Brassicaceae", "Diplotaxis": "Brassicaceae",
    # Apiaceae
    "Anethum": "Apiaceae", "Anthriscus": "Apiaceae", "Apium": "Apiaceae",
    "Carum": "Apiaceae", "Chaerophyllum": "Apiaceae", "Coriandrum": "Apiaceae",
    "Daucus": "Apiaceae", "Foeniculum": "Apiaceae", "Levisticum": "Apiaceae",
    "Myrrhis": "Apiaceae", "Pastinaca": "Apiaceae", "Petroselinum": "Apiaceae",
    "Pimpinella": "Apiaceae", "Angelica": "Apiaceae", "Aegopodium": "Apiaceae",
    # Lamiaceae
    "Agastache": "Lamiaceae", "Ajuga": "Lamiaceae", "Dracocephalum": "Lamiaceae",
    "Elsholtzia": "Lamiaceae", "Hyssopus": "Lamiaceae", "Lavandula": "Lamiaceae",
    "Melissa": "Lamiaceae", "Mentha": "Lamiaceae", "Monarda": "Lamiaceae",
    "Nepeta": "Lamiaceae", "Ocimum": "Lamiaceae", "Origanum": "Lamiaceae",
    "Perilla": "Lamiaceae", "Rosmarinus": "Lamiaceae", "Salvia": "Lamiaceae",
    "Satureja": "Lamiaceae", "Stachys": "Lamiaceae", "Thymus": "Lamiaceae",
    # Amaryllidaceae / Alliaceae
    "Allium": "Amaryllidaceae",
    # Poaceae
    "Avena": "Poaceae", "Hordeum": "Poaceae", "Panicum": "Poaceae",
    "Secale": "Poaceae", "Sorghum": "Poaceae", "Triticum": "Poaceae",
    "Zea": "Poaceae", "Setaria": "Poaceae",
    # Chenopodiaceae
    "Atriplex": "Chenopodiaceae", "Beta": "Chenopodiaceae",
    "Chenopodium": "Chenopodiaceae", "Spinacia": "Chenopodiaceae",
    # Rosaceae
    "Filipendula": "Rosaceae", "Fragaria": "Rosaceae", "Malus": "Rosaceae",
    "Prunus": "Rosaceae", "Pyrus": "Rosaceae", "Rosa": "Rosaceae",
    "Rubus": "Rosaceae", "Sanguisorba": "Rosaceae", "Sorbus": "Rosaceae",
    "Waldsteinia": "Rosaceae",
    # Boraginaceae
    "Borago": "Boraginaceae", "Phacelia": "Boraginaceae", "Symphytum": "Boraginaceae",
    "Pulmonaria": "Boraginaceae", "Myosotis": "Boraginaceae",
    # Malvaceae
    "Alcea": "Malvaceae", "Althaea": "Malvaceae", "Malva": "Malvaceae",
    "Hibiscus": "Malvaceae", "Lavatera": "Malvaceae", "Abelmoschus": "Malvaceae",
    # Polygonaceae
    "Fagopyrum": "Polygonaceae", "Rheum": "Polygonaceae", "Rumex": "Polygonaceae",
    # Caryophyllaceae
    "Agrostemma": "Caryophyllaceae", "Dianthus": "Caryophyllaceae",
    "Gypsophila": "Caryophyllaceae", "Lychnis": "Caryophyllaceae",
    "Saponaria": "Caryophyllaceae", "Silene": "Caryophyllaceae",
    # Tropaeolaceae
    "Tropaeolum": "Tropaeolaceae",
    # Papaveraceae
    "Eschscholzia": "Papaveraceae", "Papaver": "Papaveraceae",
    "Meconopsis": "Papaveraceae",
    # Caprifoliaceae
    "Valerianella": "Caprifoliaceae", "Valeriana": "Caprifoliaceae",
    "Lonicera": "Caprifoliaceae",
    # Plantaginaceae
    "Digitalis": "Plantaginaceae", "Plantago": "Plantaginaceae",
    "Antirrhinum": "Plantaginaceae", "Linaria": "Plantaginaceae",
    # Violaceae
    "Viola": "Violaceae",
    # Ranunculaceae
    "Aquilegia": "Ranunculaceae", "Consolida": "Ranunculaceae",
    "Delphinium": "Ranunculaceae", "Nigella": "Ranunculaceae",
    # Linaceae
    "Linum": "Linaceae",
    # Convolvulaceae
    "Ipomoea": "Convolvulaceae", "Convolvulus": "Convolvulaceae",
    # Portulacaceae / Montiaceae
    "Claytonia": "Montiaceae", "Portulaca": "Portulacaceae",
    # Amaranthaceae
    "Amaranthus": "Amaranthaceae", "Celosia": "Amaranthaceae",
    "Gomphrena": "Amaranthaceae",
    # Asparagaceae
    "Asparagus": "Asparagaceae",
    # Resedaceae
    "Reseda": "Resedaceae",
    # Balsaminaceae
    "Impatiens": "Balsaminaceae",
    # Hydrangeaceae
    "Hydrangea": "Hydrangeaceae",
    # Campanulaceae
    "Campanula": "Campanulaceae", "Phyteuma": "Campanulaceae",
    # Scrophulariaceae
    "Verbascum": "Scrophulariaceae",
    # Verbenaceae
    "Verbena": "Verbenaceae",
    # Onagraceae
    "Oenothera": "Onagraceae", "Clarkia": "Onagraceae",
    # Cucurbitaceae extras
    "Benincasa": "Cucurbitaceae", "Lagenaria": "Cucurbitaceae",
    # Hypericaceae
    "Hypericum": "Hypericaceae",
    # Adoxaceae
    "Sambucus": "Adoxaceae",
    # Others
    "Dipsacus": "Caprifoliaceae",
    "Knautia": "Caprifoliaceae",
    "Scabiosa": "Caprifoliaceae",
    "Succisa": "Caprifoliaceae",
    "Asclepias": "Apocynaceae",
    "Cynoglossum": "Boraginaceae",
    "Echium": "Boraginaceae",
    "Anchusa": "Boraginaceae",
    "Lithospermum": "Boraginaceae",
    "Onobrychis": "Fabaceae",
    "Ornithopus": "Fabaceae",
    "Lotus": "Fabaceae",
    "Anthyllis": "Fabaceae",
    "Melilotus": "Fabaceae",
    "Galega": "Fabaceae",
    "Lespedeza": "Fabaceae",
    "Arachis": "Fabaceae",
    "Senna": "Fabaceae",
    # Additional genera found in Dreschflegel catalog
    # NOTE(review): "Nicotinia" below looks like a misspelling of "Nicotiana"
    # (already mapped above); presumably kept to match the catalog - confirm.
    "Acmella": "Asteraceae", "Adonis": "Ranunculaceae", "Ageratum": "Asteraceae",
    "Amethystia": "Lamiaceae", "Anacyclus": "Asteraceae", "Anthemis": "Asteraceae",
    "Asphodeline": "Asphodelaceae", "Brachyscome": "Asteraceae", "Bupleurum": "Apiaceae",
    "Callistephus": "Asteraceae", "Camelina": "Brassicaceae", "Cardaria": "Brassicaceae",
    "Cardiospermum": "Sapindaceae", "Cerinthe": "Boraginaceae",
    "Chamaemelum": "Asteraceae", "Cistanthe": "Montiaceae", "Cleome": "Cleomaceae",
    "Cochlearia": "Brassicaceae", "Codonopsis": "Campanulaceae", "Coix": "Poaceae",
    "Cyperus": "Cyperaceae", "Digitaria": "Poaceae", "Dorotheanthus": "Aizoaceae",
    "Emilia": "Asteraceae", "Eragrostis": "Poaceae", "Erysimum": "Brassicaceae",
    "Euphorbia": "Euphorbiaceae", "Gentiana": "Gentianaceae", "Geum": "Rosaceae",
    "Gilia": "Polemoniaceae", "Godetia": "Onagraceae", "Helipterum": "Asteraceae",
    "Lallemantia": "Lamiaceae", "Leonurus": "Lamiaceae", "Leuzea": "Asteraceae",
    "Liatris": "Asteraceae", "Malope": "Malvaceae", "Marrubium": "Lamiaceae",
    "Matthiola": "Brassicaceae", "Maurandya": "Plantaginaceae",
    "Melothria": "Cucurbitaceae", "Meum": "Apiaceae", "Nemesia": "Scrophulariaceae",
    "Nicandra": "Solanaceae", "Nicotinia": "Solanaceae", "Oenanthe": "Apiaceae",
    "Oxalis": "Oxalidaceae", "Pennisetum": "Poaceae", "Penstemon": "Plantaginaceae",
    "Phlox": "Polemoniaceae", "Polemonium": "Polemoniaceae",
    "Porophyllum": "Asteraceae", "Primula": "Primulaceae", "Psyllium": "Plantaginaceae",
    "Quamoclit": "Convolvulaceae", "Ruta": "Rutaceae", "Salpiglossis": "Solanaceae",
    "Sanvitalia": "Asteraceae", "Sideritis": "Lamiaceae", "Silybum": "Asteraceae",
    "Talinum": "Talinaceae", "Thelesperma": "Asteraceae", "Vaccaria": "Caryophyllaceae",
    "Veronica": "Plantaginaceae", "Xeranthemum": "Asteraceae",
}


def normalize_species_name(botanical_name):
    """Split a botanical name into ``(genus, species_epithet)`` for matching.

    Only the first two meaningful words are used; anything after the epithet
    (``var.``, ``subsp.``, cultivar or author text) is ignored.

    Returns:
        * ``(None, None)`` when the name has fewer than two words.
        * ``(genus, None)`` when the second word is a rank/placeholder marker
          (``var.``, ``subsp.``, ``ssp.``, ``spec.``, ``sp.``, ``spp.``) or a
          hybrid marker with nothing after it - only the genus is known.
        * ``(genus, "x epithet")`` for hybrids written ``Genus x epithet``,
          ``Genus × epithet`` or ``Genus ×epithet``; the multiplication sign
          is normalised to the letter ``x``.
        * ``(genus, epithet)`` otherwise.
    """
    parts = botanical_name.strip().split()
    if len(parts) < 2:
        return None, None

    genus, second = parts[0], parts[1]
    multiplication_sign = "\u00d7"  # "×", the typographic hybrid marker
    hybrid_markers = ("x", "X", multiplication_sign)
    rank_markers = ("var.", "subsp.", "ssp.", "spec.", "sp.", "spp.")

    if second in hybrid_markers:
        if len(parts) >= 3:
            return genus, f"x {parts[2]}"
        # Dangling hybrid marker: no epithet to work with.
        return genus, None

    # Hybrid sign attached directly to the epithet, e.g. "×piperita".
    if second.startswith(multiplication_sign) and len(second) > 1:
        return genus, f"x {second[1:]}"

    if second in rank_markers:
        # Only genus level - can't match to species
        return genus, None

    return genus, second


def find_species(botanical_name, species_cache):
    """Look up an existing species record for a botanical name.

    Args:
        botanical_name: Name as scraped, e.g. "Brassica oleracea var. capitata".
        species_cache: Dict of known species keyed by lower-cased
            "genus epithet".

    Returns:
        The matching species record, or ``None``.

        * With a species epithet, only an exact "genus epithet" match counts.
          A different species of the same genus is never substituted, since
          that would attach the cultivar to the wrong species and stop the
          right one from being created.
        * With only a genus (e.g. "Allium sp."), the record is returned if
          the cache holds exactly one species of that genus.
    """
    genus, sp = normalize_species_name(botanical_name)
    if not genus:
        return None

    if sp:
        # Epithet known: exact match or nothing.
        return species_cache.get(f"{genus} {sp}".lower())

    # Genus-only name: accept the genus' sole known species, if unambiguous.
    genus_prefix = genus.lower() + " "
    matches = [
        record for key, record in species_cache.items()
        if key.startswith(genus_prefix)
    ]
    if len(matches) == 1:
        return matches[0]

    return None


def _resolve_family(family_name, families):
    """Return the family record for ``family_name``, creating it if needed.

    Updates ``families`` (keyed by lower-cased name) and returns ``None`` when
    the family can neither be created nor found via the API.
    """
    key = family_name.lower()
    known = families.get(key)
    if known:
        return known

    print(f"    Creating family: {family_name}")
    created = api_request("POST", "/families", {"name_scientific": family_name})
    if created:
        families[key] = created
        stats["families_created"] += 1
        return created

    # Creation failed - it may already exist from a previous run, so reload.
    for candidate in paginated_get("/families"):
        if candidate["name_scientific"].lower() == key:
            families[key] = candidate
            return candidate

    print(f"    [SKIP] Cannot create family: {family_name}")
    return None


def find_or_create_species(botanical_name, families, species_cache):
    """Return the species record for a botanical name, creating it if missing.

    Existing records are looked up first. Otherwise the genus is mapped to a
    family through ``GENUS_TO_FAMILY``, the family is created if necessary,
    and the species is posted. Both caches are updated in place. Returns
    ``None`` when the name has no usable epithet, the genus has no family
    mapping, or the API calls fail.
    """
    existing = find_species(botanical_name, species_cache)
    if existing:
        return existing

    genus, species_epithet = normalize_species_name(botanical_name)
    if not (genus and species_epithet):
        stats["species_no_epithet"] += 1
        return None

    sci_name = f"{genus} {species_epithet}"
    cache_key = sci_name.lower()

    # Second look-up under the normalized name.
    if cache_key in species_cache:
        return species_cache[cache_key]

    family_name = GENUS_TO_FAMILY.get(genus)
    if not family_name:
        stats["species_no_family"] += 1
        print(f"    [SKIP] No family mapping for genus: {genus} ({botanical_name})")
        return None

    family = _resolve_family(family_name, families)
    if not family:
        return None

    print(f"    Creating species: {sci_name} (family: {family_name})")
    created = api_request("POST", "/species", {
        "name_scientific": sci_name,
        "family_id": family["id"],
    })
    if created:
        species_cache[cache_key] = created
        stats["species_created"] += 1
        return created

    # Creation failed - the species may already exist; search the full list.
    time.sleep(0.1)
    for candidate in paginated_get("/species"):
        if candidate["name_scientific"].lower() == cache_key:
            species_cache[cache_key] = candidate
            return candidate
    return None


def extract_cultivar_name(product_name):
    """Strip a leading German crop-type word from a product name.

    If the whitespace-trimmed name starts with one of the known crop prefixes
    followed by a space, the longest such prefix is removed and the trimmed
    remainder returned. Otherwise the trimmed name is returned unchanged.
    """
    crop_prefixes = (
        # Tomatoes
        "Salattomate", "Stabtomate", "Buschtomate", "Cocktailtomate",
        "Cherrytomate", "Fleischtomate", "Wildtomate", "Balkontomate",
        "Flaschentomate", "Eitomate", "Datteltomate", "Tomate",
        # Lettuce
        "Winterkopfsalat", "Kopfsalat", "Bataviasalat", "Eissalat",
        "Blattsalat", "Schnittsalat", "Pflücksalat", "Römersalat",
        "Spargelsalat", "Romanasalat",
        # Beans
        "Buschbohne", "Stangenbohne", "Feuerbohne", "Puffbohne",
        "Prunkbohne",
        # Peas
        "Markerbse", "Zuckererbse", "Palerbse", "Schalerbse",
        "Knackerbse", "Kapuzinererbse",
        # Cucumbers
        "Einlegegurke", "Salatgurke", "Schälgurke", "Landgurke",
        "Freilandgurke",
        # Squash
        "Hokkaidokürbis", "Butternutkürbis", "Speisekürbis",
        "Riesenkürbis", "Zierkürbis", "Muskatkürbis", "Ölkürbis",
        # Melon
        "Wassermelone", "Zuckermelone",
        # Peppers
        "Gemüsepaprika", "Blockpaprika", "Spitzpaprika", "Tomatenpaprika",
        "Snackpaprika", "Peperoni", "Chili",
        # Brassicas
        "Kohlrabi", "Brokkoli", "Blumenkohl", "Grünkohl", "Rosenkohl",
        "Wirsing", "Rotkohl", "Weißkohl", "Spitzkohl", "Palmkohl",
        "Chinakohl", "Pak Choi", "Markstammkohl",
        # Root veg
        "Möhre", "Karotte", "Pastinake", "Rote Bete", "Rote Beete",
        "Herbstrübe", "Mairübe", "Stoppelrübe", "Schwarzer Rettich",
        "Steckrübe", "Knollensellerie", "Petersilienwurzel",
        "Rettich", "Radieschen",
        # Onions
        "Winterheckenzwiebel", "Lauchzwiebel", "Speisezwiebel",
        "Schalotte", "Wintersteckzwiebel", "Zwiebel",
        # Herbs
        "Rotes Basilikum", "Buschbasilikum", "Zitronen-Basilikum",
        "Thai-Basilikum", "Wildes Basilikum", "Zimtbasilikum",
        "Basilikum", "Schnittknoblauch",
        # Grains
        "Sommerweizen", "Winterweizen", "Sommerroggen", "Winterroggen",
        "Nackthafer", "Nacktgerste", "Dinkel", "Emmer", "Einkorn",
        # Misc
        "Zuckermais", "Popcornmais", "Zucchini",
    )

    trimmed = product_name.strip()

    # Every prefix the name starts with (must be followed by a space).
    leading = [word for word in crop_prefixes if trimmed.startswith(word + " ")]
    if not leading:
        return trimmed

    # Prefer the longest, so "Schwarzer Rettich X" is not cut at "Rettich".
    longest = max(leading, key=len)
    return trimmed[len(longest):].strip()


def get_existing_supplier_links(cultivar_id, supplier_id):
    """Return True if the cultivar already has a link to the given supplier.

    Fetches the cultivar's supplier links from the API. An empty or failed
    response counts as "no link".
    """
    links = api_request("GET", f"/cultivars/{cultivar_id}/suppliers")
    if not links:
        return False
    return any(entry["supplier_id"] == supplier_id for entry in links)


def main():
    """Run the full scrape-and-import pipeline.

    Steps: (1) find or create the supplier record, (2) load existing
    families/species/cultivars from HerbAPI, (3) collect candidate product
    URLs from the sitemap, (4) scrape them (with caching), (5) for every
    product find or create its species and cultivar and link the cultivar to
    the supplier. Exits with status 1 if the supplier or the sitemap is
    unavailable. Prints the collected counters at the end.
    """
    print("=" * 60)
    print("Dreschflegel Seed Catalog Scraper for HerbAPI (v2)")
    print("=" * 60)

    # Step 1: Supplier
    print("\n[1] Setting up supplier...")
    supplier = ensure_supplier()
    if not supplier:
        print("FATAL: Could not create/find supplier")
        sys.exit(1)
    supplier_id = supplier["id"]

    # Step 2: Load API data
    print("\n[2] Loading existing HerbAPI data...")
    families, species_cache, cultivar_cache = load_api_data()

    # Step 3: Get product URLs
    print("\n[3] Fetching sitemap...")
    all_urls = get_sitemap_urls()
    if not all_urls:
        print("FATAL: Could not fetch sitemap")
        sys.exit(1)
    candidate_urls = classify_urls(all_urls)
    print(f"  {len(all_urls)} total URLs, {len(candidate_urls)} product candidates")

    # Step 4: Scrape
    print(f"\n[4] Scraping product pages...")
    products = scrape_all_products(candidate_urls)

    # Step 5: Import
    print(f"\n[5] Importing {len(products)} products into HerbAPI...")

    for i, product in enumerate(products):
        if (i + 1) % 50 == 0:
            print(f"  Processing {i + 1}/{len(products)}...")

        botanical = product.get("botanical_name", "")
        if not botanical:
            stats["no_botanical"] += 1
            continue

        # Find or create species
        sp = find_or_create_species(botanical, families, species_cache)
        if not sp:
            # Counted per product, in addition to the more specific counters
            # incremented inside find_or_create_species.
            stats["species_not_matched"] += 1
            continue

        species_id = sp["id"]
        cultivar_name = extract_cultivar_name(product["name"])

        # Check if cultivar already exists
        # (same key shape as built in load_api_data: species id + lower-cased name)
        cv_key = (species_id, cultivar_name.lower().strip())
        if cv_key in cultivar_cache:
            cv = cultivar_cache[cv_key]
            stats["cultivars_existing"] += 1
        else:
            cv_data = {
                "species_id": species_id,
                "name": cultivar_name,
                "is_organic": True,
            }
            if product.get("description"):
                cv_data["description"] = product["description"]

            cv = api_request("POST", "/cultivars", cv_data)
            if cv:
                cultivar_cache[cv_key] = cv
                stats["cultivars_created"] += 1
            else:
                # Might already exist from previous run - try to find it
                found = False
                for c in paginated_get(f"/cultivars?species_id={species_id}"):
                    if c["name"].lower().strip() == cultivar_name.lower().strip():
                        cultivar_cache[cv_key] = c
                        cv = c
                        stats["cultivars_existing"] += 1
                        found = True
                        break
                if not found:
                    stats["cultivar_create_errors"] += 1
                    continue

        # Link to supplier (check first for idempotency)
        if get_existing_supplier_links(cv["id"], supplier_id):
            stats["supplier_links_existing"] += 1
            continue

        link_data = {
            "supplier_id": supplier_id,
            "article_number": product.get("article_number", ""),
            "product_url": product.get("url", ""),
            "price_eur": product.get("price"),
        }
        pack_info = product.get("pack_info", "")
        if pack_info:
            # Pull "ca. <integer> <unit>" out of the pack text; anything that
            # does not fit this pattern leaves pack_size/pack_unit unset.
            m = re.search(r"ca\.?\s*(\d+)\s*(Pfl|Korn|Samen|g|kg|ml)", pack_info)
            if m:
                link_data["pack_size"] = float(m.group(1))
                # Expand/normalize unit spellings; other units pass through.
                unit_map = {"Pfl": "Pflanzen", "Korn": "Korn", "Samen": "Korn"}
                link_data["pack_unit"] = unit_map.get(m.group(2), m.group(2))

        resp = api_request("POST", f"/cultivars/{cv['id']}/suppliers", link_data)
        if resp:
            stats["supplier_links_created"] += 1
        else:
            stats["supplier_link_errors"] += 1

    # Summary
    print("\n" + "=" * 60)
    print("RESULTS")
    print("=" * 60)
    for key, val in sorted(stats.items()):
        print(f"  {key}: {val}")
    # Sizes of the in-memory caches (loaded at start plus entries added during
    # this run), not a fresh count from the API.
    print(f"\n  Total species in DB: {len(species_cache)}")
    print(f"  Total cultivars tracked: {len(cultivar_cache)}")


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()