herbapi/tools/scrapers/scrape_reinsaat_v2.py

#!/usr/bin/env python3
"""
Reinsaat Scraper v2 — scrape ALL Reinsaat categories, match species by extracting
genus+species from extended botanical names, create/enrich cultivars, link supplier.

Uses direct PostgreSQL access (psycopg2) for speed and reliability.
"""

import json
import re
import ssl
import sys
import time
import uuid
import html as html_mod
import urllib.request
import urllib.error
import urllib.parse
from dataclasses import dataclass, field
from typing import Optional

# Unbuffered output
sys.stdout.reconfigure(line_buffering=True)
sys.stderr.reconfigure(line_buffering=True)

import psycopg2
import psycopg2.extras

# ── Config ──────────────────────────────────────────────────────────────────
DB_HOST = "10.31.3.90"
DB_NAME = "herbapi"
DB_USER = "herbapi"
DB_PASS = "_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj"

REINSAAT_SUPPLIER_ID = "019ced24-1702-72d1-9acc-90435441a5c4"
DELAY = 0.3
USER_AGENT = "HerbAPI-Scraper/2.0 (florian.berthold@sub-net.at)"

# ── All Reinsaat categories ────────────────────────────────────────────────
CATEGORIES = [
    "https://www.reinsaat.at/shop/DE/bohnen/",
    "https://www.reinsaat.at/shop/DE/erbsen/",
    "https://www.reinsaat.at/shop/DE/gurken/",
    "https://www.reinsaat.at/shop/DE/karotten_moehren_1/",
    "https://www.reinsaat.at/shop/DE/knollenfenchel/",
    "https://www.reinsaat.at/shop/DE/kohlgewaechse/",
    "https://www.reinsaat.at/shop/DE/kuerbis/",
    "https://www.reinsaat.at/shop/DE/mais/",
    "https://www.reinsaat.at/shop/DE/mangold/",
    "https://www.reinsaat.at/shop/DE/melanzani_1/",
    "https://www.reinsaat.at/shop/DE/melone/",
    "https://www.reinsaat.at/shop/DE/paprika/",
    "https://www.reinsaat.at/shop/DE/pastinaken_1/",
    "https://www.reinsaat.at/shop/DE/petersilie/",
    "https://www.reinsaat.at/shop/DE/pfefferoni_chili/",
    "https://www.reinsaat.at/shop/DE/porree/",
    "https://www.reinsaat.at/shop/DE/radies_rettich/",
    "https://www.reinsaat.at/shop/DE/rote_ruebe/",
    "https://www.reinsaat.at/shop/DE/salate/",
    "https://www.reinsaat.at/shop/DE/schwarzwurzeln/",
    "https://www.reinsaat.at/shop/DE/sellerie/",
    "https://www.reinsaat.at/shop/DE/spinat/",
    "https://www.reinsaat.at/shop/DE/tomaten_paradeiser/",
    "https://www.reinsaat.at/shop/DE/wurzelpetersilie_1/",
    "https://www.reinsaat.at/shop/DE/zucchini/",
    "https://www.reinsaat.at/shop/DE/zwiebel_knoblauch/",
    "https://www.reinsaat.at/shop/DE/kuechen-_und_gewuerzkraeuter/",
    "https://www.reinsaat.at/shop/DE/blumen_und_heilkraeuter/",
    "https://www.reinsaat.at/shop/DE/gruenduengung/",
]

# ── HTTP ────────────────────────────────────────────────────────────────────
_ssl_ctx = ssl.create_default_context()


def fetch_url(url: str, retries: int = 2) -> str:
    req = urllib.request.Request(url, headers={
        "User-Agent": USER_AGENT,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "de-AT,de;q=0.9,en;q=0.5",
    })
    for attempt in range(retries + 1):
        try:
            with urllib.request.urlopen(req, timeout=30, context=_ssl_ctx) as resp:
                charset = resp.headers.get_content_charset() or "utf-8"
                return resp.read().decode(charset)
        except (urllib.error.URLError, urllib.error.HTTPError, TimeoutError) as e:
            if attempt < retries:
                time.sleep(2)
                continue
            raise
    return ""


# ── HTML parsing helpers ────────────────────────────────────────────────────
def extract_links(html_text: str, base_url: str) -> list[str]:
    links = []
    seen = set()
    for m in re.finditer(r'<a\s[^>]*href="([^"]*)"', html_text, re.IGNORECASE):
        href = m.group(1)
        if not href or href.startswith("#") or href.startswith("javascript:"):
            continue
        full = urllib.parse.urljoin(base_url, href)
        if full not in seen:
            seen.add(full)
            links.append(full)
    return links


def extract_jsonld_product(html_text: str) -> Optional[dict]:
    for m in re.finditer(
        r'<script[^>]*type="application/ld\+json"[^>]*>(.*?)</script>',
        html_text, re.DOTALL | re.IGNORECASE
    ):
        try:
            data = json.loads(m.group(1))
            if isinstance(data, dict) and data.get("@type") == "Product":
                return data
        except (json.JSONDecodeError, ValueError):
            continue
    return None


def html_to_text(html_text: str) -> str:
    """Strip HTML tags and decode entities."""
    text = re.sub(r'<[^>]+>', ' ', html_text)
    text = html_mod.unescape(text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


def extract_botanical_name(html_text: str) -> str:
    """
    Extract the botanical/Latin name from the page.
    Primary source: <div class="fce_shop_kurztext"> content.
    Fallback: <em> tags in growing infos.

    Returns the raw text (may include authority names, infraspecific ranks, etc.)
    """
    # Primary: kurztext div
    m = re.search(r'class="fce_shop_kurztext"[^>]*>(.*?)</div>', html_text, re.DOTALL | re.IGNORECASE)
    if m:
        text = html_to_text(m.group(1)).strip()
        if text and re.search(r'[A-Z][a-z]+\s+[a-z]', text):
            return text

    # Fallback: first <em> in growingInfos that looks like a Latin name
    gi = re.search(r'class="growingInfos"[^>]*>(.*?)</div>', html_text, re.DOTALL | re.IGNORECASE)
    if gi:
        for em in re.finditer(r'<em>(.*?)</em>', gi.group(1), re.DOTALL):
            text = html_to_text(em.group(1)).strip()
            if text and re.search(r'[A-Z][a-z]+\s+[a-z]', text):
                return text

    # Last resort: any <em>/<i> tag with a Latin-looking name
    for tag in re.finditer(r'<(?:em|i)>(.*?)</(?:em|i)>', html_text, re.DOTALL | re.IGNORECASE):
        text = html_to_text(tag.group(1)).strip()
        if text and re.search(r'^[A-Z][a-z]+\s+[a-z]+', text) and len(text) < 100:
            return text

    return ""


def normalize_latin_name(raw: str) -> str:
    """
    Extract genus + species from an extended botanical name.

    Examples:
      "Pisum sativum L. convar. sat." -> "Pisum sativum"
      "Capsicum annuum L." -> "Capsicum annuum"
      "Brassica oleracea L. convar. botrytis" -> "Brassica oleracea"
      "Solanum lycopersicum L." -> "Solanum lycopersicum"
      "Cucumis sativus" -> "Cucumis sativus"
      "Mentha x piperita" -> "Mentha x piperita"
    """
    if not raw:
        return ""

    # Clean up
    name = raw.strip()
    # Remove leading/trailing punctuation
    name = name.strip(".,;:")

    words = name.split()
    if len(words) < 2:
        return name

    genus = words[0]

    # Handle hybrid notation: "Mentha x piperita" or "Mentha × piperita"
    if len(words) >= 3 and words[1] in ("x", "×"):
        return f"{genus} x {words[2]}"

    species = words[1]

    # Validate: genus should start uppercase, species lowercase
    if not genus[0].isupper() or not species[0].islower():
        return name  # Can't parse, return as-is

    return f"{genus} {species}"


# ── Calendar parsing ────────────────────────────────────────────────────────
CALENDAR_ROW_TYPES = {
    "voranzucht": "indoor_sowing_months",
    "vorzucht": "indoor_sowing_months",
    "vorkultur": "indoor_sowing_months",
    "aussaat/ pflanzung freiland": "direct_sowing_months",
    "aussaat/pflanzung freiland": "direct_sowing_months",
    "aussaat freiland": "direct_sowing_months",
    "direktsaat": "direct_sowing_months",
    "pflanzung freiland": "transplanting_months",
    "pflanzung": "transplanting_months",
    "aussaat/ pflanzung gewächshaus": "glasshouse_months",
    "aussaat/pflanzung gewächshaus": "glasshouse_months",
    "gewächshaus": "glasshouse_months",
    "ernte": "harvesting_months",
}


def parse_calendar(html_text: str) -> dict:
    """
    Parse the Reinsaat growing calendar table.
    Returns dict with keys like 'direct_sowing_months', 'harvesting_months' etc.
    Each value is a sorted list of month integers (1-12).
    """
    result = {}

    cal_match = re.search(r'class="rs-growing-time[^"]*"(.*?)</table>', html_text, re.DOTALL)
    if not cal_match:
        return result

    cal = cal_match.group(1)
    rows = re.findall(r'<tr>(.*?)</tr>', cal, re.DOTALL)

    for row in rows:
        # Get label
        label_m = re.search(r'class="type-lable"[^>]*>(.*?)</td>', row, re.DOTALL)
        if not label_m:
            continue
        label = html_to_text(label_m.group(1)).strip().lower()

        # Map label to our field
        field_name = None
        for pattern, fname in CALENDAR_ROW_TYPES.items():
            if pattern in label:
                field_name = fname
                break
        if not field_name:
            continue

        # Extract background colors for each cell (24 cells = 12 months x 2 halves)
        colors = re.findall(r'background-color:\s*([^;"]+)', row)

        # Convert to months: cell i maps to month (i // 2) + 1
        active_months = set()
        for i, color in enumerate(colors):
            color = color.strip().lower()
            if color != "none" and color != "transparent" and color != "":
                month = (i // 2) + 1
                if 1 <= month <= 12:
                    active_months.add(month)

        if active_months:
            # Merge if same field already found (e.g. two sowing rows)
            if field_name in result:
                result[field_name] = sorted(set(result[field_name]) | active_months)
            else:
                result[field_name] = sorted(active_months)

    return result


# ── Growing data extraction ─────────────────────────────────────────────────
def extract_growing_data(html_text: str) -> dict:
    """Extract spacing, depth, germination temp from the growing text."""
    data = {}

    # Get the growingInfos text
    gi = re.search(r'class="growingInfos"[^>]*>(.*?)</div>', html_text, re.DOTALL | re.IGNORECASE)
    if not gi:
        return data

    full_text = html_to_text(gi.group(1))
    # Also get the raw HTML for better entity handling
    raw_html = gi.group(1)
    # Convert HTML entities for pattern matching
    raw_text = html_mod.unescape(re.sub(r'<[^>]+>', ' ', raw_html))
    raw_text = re.sub(r'\s+', ' ', raw_text)

    # ── Sowing depth ──
    depth_pats = [
        r'(?:Saattiefe|Aussaattiefe|Ablagetiefe|Saatgutablage)[:\s]*(?:ca\.?\s*)?(\d+(?:[.,]\d+)?)\s*[-–]\s*(\d+(?:[.,]\d+)?)\s*cm',
        r'(?:Saattiefe|Aussaattiefe|Ablagetiefe|Saatgutablage)[:\s]*(?:ca\.?\s*)?(\d+(?:[.,]\d+)?)\s*cm',
    ]
    for pat in depth_pats:
        dm = re.search(pat, raw_text, re.IGNORECASE)
        if dm:
            vals = [float(dm.group(i).replace(",", ".")) for i in range(1, dm.lastindex + 1)]
            data["planting_depth_cm"] = round(sum(vals) / len(vals), 2)
            break

    # ── Spacing: "ROW x PLANT cm" ──
    spacing_pats = [
        # "30–45 x 3–5 cm" (range x range)
        r'(\d+)\s*[-–]\s*(\d+)\s*[x×]\s*(\d+)\s*[-–]\s*(\d+)\s*cm',
        # "100 x 50 cm" (simple)
        r'(\d+(?:[.,]\d+)?)\s*[x×]\s*(\d+(?:[.,]\d+)?)\s*cm',
    ]
    for pat in spacing_pats:
        matches = re.findall(pat, raw_text, re.IGNORECASE)
        if matches:
            m = matches[-1]  # prefer last match
            if len(m) == 4:
                data["row_spacing_cm"] = round((float(m[0]) + float(m[1])) / 2, 1)
                data["plant_spacing_cm"] = round((float(m[2]) + float(m[3])) / 2, 1)
            elif len(m) == 2:
                v1 = float(m[0].replace(",", "."))
                v2 = float(m[1].replace(",", "."))
                data["row_spacing_cm"] = round(v1, 1)
                data["plant_spacing_cm"] = round(v2, 1)
            break

    # ── Germination temperature ──
    temp_pats = [
        r'(?:Keimtemperatur|Keimtemp)[.:\s]*(?:ca\.?\s*)?(\d+)\s*[-–]\s*(\d+)\s*[°]?\s*C',
        r'(?:mindestens|mind\.)\s*(\d+)\s*°\s*C',
    ]
    for pat in temp_pats:
        tm = re.search(pat, raw_text, re.IGNORECASE)
        if tm:
            vals = [float(tm.group(i)) for i in range(1, tm.lastindex + 1)]
            avg = sum(vals) / len(vals)
            if 5 <= avg <= 40:
                data["germination_temp_c"] = round(avg, 1)
                break

    # ── Perennial ──
    perennial_pats = [r'mehrj[aä]hrig', r'winterhart', r'ausdauernd', r'Halbstrauch', r'Staude']
    for pat in perennial_pats:
        if re.search(pat, raw_text, re.IGNORECASE):
            data["perennial"] = True
            break

    return data


# ── Product data ────────────────────────────────────────────────────────────
@dataclass
class ProductData:
    name: str = ""
    raw_latin_name: str = ""
    normalized_latin: str = ""
    description: str = ""
    sku: str = ""
    url: str = ""
    is_organic: bool = True
    growing_data: dict = field(default_factory=dict)
    calendar: dict = field(default_factory=dict)


def parse_product(html_text: str, url: str) -> Optional[ProductData]:
    """Parse a product page. Returns ProductData or None if not a product page."""
    jsonld = extract_jsonld_product(html_text)
    if not jsonld:
        return None

    product = ProductData(url=url)
    product.name = jsonld.get("name", "").strip()
    product.description = jsonld.get("description", "").strip()
    product.sku = jsonld.get("model", "").strip()

    # Extract and normalize botanical name
    product.raw_latin_name = extract_botanical_name(html_text)
    product.normalized_latin = normalize_latin_name(product.raw_latin_name)

    # Extract growing data
    product.growing_data = extract_growing_data(html_text)

    # Parse calendar
    product.calendar = parse_calendar(html_text)

    # Check organic status (Reinsaat is all organic, but check for "demeter" too)
    product.is_organic = True

    return product


# ── Recursive discovery ─────────────────────────────────────────────────────
def discover_products(
    category_url: str,
    max_depth: int = 4,
    _depth: int = 0,
    _visited: set = None,
) -> list[ProductData]:
    if _visited is None:
        _visited = set()
    if category_url in _visited or _depth > max_depth:
        return []
    _visited.add(category_url)

    indent = "  " * (_depth + 1)

    try:
        html_text = fetch_url(category_url)
        time.sleep(DELAY)
    except Exception as e:
        print(f"{indent}ERROR fetching {category_url}: {e}")
        return []

    # Check if this is a product page
    product = parse_product(html_text, category_url)
    if product:
        return [product]

    # Category page: find child links
    cat_path = urllib.parse.urlparse(category_url).path.rstrip("/")
    child_links = []
    for link in extract_links(html_text, category_url):
        parsed = urllib.parse.urlparse(link)
        if parsed.netloc and parsed.netloc != "www.reinsaat.at":
            continue
        child_path = parsed.path.rstrip("/")
        if not child_path.startswith(cat_path + "/"):
            continue
        relative = child_path[len(cat_path) + 1:]
        if "/" in relative or not relative:
            continue
        clean_url = f"https://www.reinsaat.at{child_path}/"
        if clean_url not in _visited:
            child_links.append(clean_url)

    child_links = list(dict.fromkeys(child_links))
    print(f"{indent}Category {category_url} -> {len(child_links)} children")

    products = []
    for child_url in child_links:
        results = discover_products(child_url, max_depth, _depth + 1, _visited)
        products.extend(results)

    return products


# ── Slug generation ─────────────────────────────────────────────────────────
def make_slug(species_name: str, cultivar_name: str) -> str:
    """Generate a URL-friendly slug."""
    raw = f"{species_name}-{cultivar_name}".lower()
    # Replace umlauts and special chars
    replacements = {
        'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'ß': 'ss',
        'é': 'e', 'è': 'e', 'ê': 'e', 'ë': 'e',
        'á': 'a', 'à': 'a', 'â': 'a',
        'í': 'i', 'ì': 'i', 'î': 'i',
        'ó': 'o', 'ò': 'o', 'ô': 'o',
        'ú': 'u', 'ù': 'u', 'û': 'u',
        'ñ': 'n', 'ç': 'c',
    }
    for old, new in replacements.items():
        raw = raw.replace(old, new)
    # Keep only alphanumeric and hyphens
    slug = re.sub(r'[^a-z0-9]+', '-', raw)
    slug = slug.strip('-')
    # Collapse multiple hyphens
    slug = re.sub(r'-+', '-', slug)
    return slug


# ── Main ────────────────────────────────────────────────────────────────────
def db_connect():
    """Create a fresh DB connection."""
    conn = psycopg2.connect(
        host=DB_HOST, dbname=DB_NAME, user=DB_USER, password=DB_PASS
    )
    conn.autocommit = False
    return conn


def main():
    print("=" * 70)
    print("Reinsaat Scraper v2")
    print("=" * 70)

    # ── Phase 1: Discover all products (no DB needed) ──
    print("\n[1] Discovering products from Reinsaat categories...")
    all_products: list[ProductData] = []
    visited: set[str] = set()

    for cat_url in CATEGORIES:
        print(f"\n  Category: {cat_url}")
        products = discover_products(cat_url, max_depth=4, _visited=visited)
        all_products.extend(products)
        print(f"  -> {len(products)} products")

    # Deduplicate by URL
    seen_urls = set()
    unique_products = []
    for p in all_products:
        if p.url not in seen_urls:
            seen_urls.add(p.url)
            unique_products.append(p)
    all_products = unique_products
    print(f"\n  Total unique products: {len(all_products)}")

    # ── Phase 2: Connect to DB and load existing data ──
    print("\n[2] Connecting to DB and loading existing data...")
    conn = db_connect()
    cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)

    # Load species
    cur.execute("SELECT id, name_scientific FROM species ORDER BY name_scientific")
    species_rows = cur.fetchall()
    species_map = {}
    for row in species_rows:
        key = row["name_scientific"].lower().strip()
        species_map[key] = row
    print(f"    {len(species_map)} species loaded")

    # Load existing cultivars
    cur.execute("""
        SELECT id, species_id, name, slug, description,
               row_spacing_cm, plant_spacing_cm, planting_depth_cm,
               germination_temp_c, perennial,
               indoor_sowing_months, direct_sowing_months,
               transplanting_months, glasshouse_months, harvesting_months
        FROM cultivars
    """)
    cultivar_rows = cur.fetchall()
    existing_cultivars = {}
    existing_slugs = set()
    for row in cultivar_rows:
        sid = str(row["species_id"])
        name_lower = row["name"].lower()
        existing_cultivars[(sid, name_lower)] = dict(row)
        existing_slugs.add(row["slug"])
    print(f"    {len(existing_cultivars)} cultivars loaded")

    # Load existing Reinsaat supplier links
    cur.execute("""
        SELECT cultivar_id, product_url, article_number
        FROM cultivar_suppliers
        WHERE supplier_id = %s
    """, (REINSAAT_SUPPLIER_ID,))
    existing_links = {}
    for row in cur.fetchall():
        cid = str(row["cultivar_id"])
        url = row["product_url"] or ""
        sku = row["article_number"] or ""
        existing_links.setdefault(cid, []).append((url, sku))
    print(f"    {sum(len(v) for v in existing_links.values())} existing links for {len(existing_links)} cultivars")

    # ── Phase 3: Process products ──
    print("\n[3] Processing products...")
    stats = {
        "created": 0,
        "linked": 0,
        "enriched": 0,
        "skipped_no_species": 0,
        "skipped_no_name": 0,
        "link_exists": 0,
        "errors": 0,
    }
    unmatched = []

    for i, product in enumerate(all_products):
        pct = (i + 1) / len(all_products) * 100
        prefix = f"  [{i+1}/{len(all_products)}] ({pct:.0f}%)"

        if not product.name:
            stats["skipped_no_name"] += 1
            continue

        # Match species
        normalized = product.normalized_latin.lower().strip()
        species = species_map.get(normalized)

        if not species:
            # Try exact match on raw name (first two words)
            raw_words = product.raw_latin_name.split()
            if len(raw_words) >= 2:
                attempt = f"{raw_words[0].lower()} {raw_words[1].lower()}"
                species = species_map.get(attempt)

        if not species:
            stats["skipped_no_species"] += 1
            unmatched.append((product.name, product.raw_latin_name, product.normalized_latin, product.url))
            continue

        species_id = str(species["id"])
        species_name = species["name_scientific"]

        # Check if cultivar exists
        ckey = (species_id, product.name.lower())
        existing = existing_cultivars.get(ckey)

        if existing:
            cultivar_id = str(existing["id"])

            # ── Enrich existing cultivar with missing data ──
            updates = {}

            # Growing data from page
            gd = product.growing_data
            if gd.get("planting_depth_cm") and not existing.get("planting_depth_cm"):
                updates["planting_depth_cm"] = gd["planting_depth_cm"]
            if gd.get("row_spacing_cm") and not existing.get("row_spacing_cm"):
                updates["row_spacing_cm"] = gd["row_spacing_cm"]
            if gd.get("plant_spacing_cm") and not existing.get("plant_spacing_cm"):
                updates["plant_spacing_cm"] = gd["plant_spacing_cm"]
            if gd.get("germination_temp_c") and not existing.get("germination_temp_c"):
                updates["germination_temp_c"] = gd["germination_temp_c"]
            if gd.get("perennial") and not existing.get("perennial"):
                updates["perennial"] = True

            # Calendar data
            cal = product.calendar
            if cal.get("indoor_sowing_months") and not existing.get("indoor_sowing_months"):
                updates["indoor_sowing_months"] = cal["indoor_sowing_months"]
            if cal.get("direct_sowing_months") and not existing.get("direct_sowing_months"):
                updates["direct_sowing_months"] = cal["direct_sowing_months"]
            if cal.get("transplanting_months") and not existing.get("transplanting_months"):
                updates["transplanting_months"] = cal["transplanting_months"]
            if cal.get("glasshouse_months") and not existing.get("glasshouse_months"):
                updates["glasshouse_months"] = cal["glasshouse_months"]
            if cal.get("harvesting_months") and not existing.get("harvesting_months"):
                updates["harvesting_months"] = cal["harvesting_months"]

            # Description
            if product.description and not existing.get("description"):
                updates["description"] = product.description

            if updates:
                set_clauses = []
                values = []
                for col, val in updates.items():
                    set_clauses.append(f"{col} = %s")
                    values.append(val)
                set_clauses.append("updated_at = NOW()")
                values.append(cultivar_id)
                cur.execute(
                    f"UPDATE cultivars SET {', '.join(set_clauses)} WHERE id = %s::uuid",
                    values
                )
                stats["enriched"] += 1
                print(f"{prefix} {product.name} -> ENRICHED ({', '.join(updates.keys())})")

            # ── Add supplier link if missing ──
            link_exists = False
            if cultivar_id in existing_links:
                for lurl, lsku in existing_links[cultivar_id]:
                    if lurl == product.url or (lsku and lsku == product.sku):
                        link_exists = True
                        break

            if link_exists:
                stats["link_exists"] += 1
            else:
                try:
                    cur.execute("SAVEPOINT link_sp")
                    cur.execute("""
                        INSERT INTO cultivar_suppliers (cultivar_id, supplier_id, product_url, article_number, last_checked_at)
                        VALUES (%s::uuid, %s::uuid, %s, %s, NOW())
                        ON CONFLICT (cultivar_id, supplier_id, article_number) DO UPDATE
                        SET product_url = EXCLUDED.product_url, last_checked_at = NOW()
                    """, (cultivar_id, REINSAAT_SUPPLIER_ID, product.url, product.sku))
                    cur.execute("RELEASE SAVEPOINT link_sp")
                    stats["linked"] += 1
                    existing_links.setdefault(cultivar_id, []).append((product.url, product.sku))
                    print(f"{prefix} {product.name} -> LINKED ({product.sku})")
                except Exception as e:
                    print(f"{prefix} {product.name} -> LINK ERROR: {e}")
                    cur.execute("ROLLBACK TO SAVEPOINT link_sp")
                    stats["errors"] += 1
        else:
            # ── Create new cultivar ──
            slug = make_slug(species_name, product.name)
            # Ensure unique slug
            base_slug = slug
            counter = 2
            while slug in existing_slugs:
                slug = f"{base_slug}-{counter}"
                counter += 1

            gd = product.growing_data
            cal = product.calendar

            try:
                cur.execute("SAVEPOINT create_sp")
                cur.execute("""
                    INSERT INTO cultivars (
                        species_id, name, name_de, slug, description,
                        is_organic, perennial,
                        planting_depth_cm, row_spacing_cm, plant_spacing_cm,
                        germination_temp_c,
                        indoor_sowing_months, direct_sowing_months,
                        transplanting_months, glasshouse_months, harvesting_months
                    ) VALUES (
                        %s::uuid, %s, %s, %s, %s,
                        %s, %s,
                        %s, %s, %s,
                        %s,
                        %s, %s,
                        %s, %s, %s
                    )
                    RETURNING id
                """, (
                    species_id,
                    product.name,
                    product.name,
                    slug,
                    product.description,
                    product.is_organic,
                    gd.get("perennial", False),
                    gd.get("planting_depth_cm"),
                    gd.get("row_spacing_cm"),
                    gd.get("plant_spacing_cm"),
                    gd.get("germination_temp_c"),
                    cal.get("indoor_sowing_months"),
                    cal.get("direct_sowing_months"),
                    cal.get("transplanting_months"),
                    cal.get("glasshouse_months"),
                    cal.get("harvesting_months"),
                ))
                new_id = str(cur.fetchone()["id"])
                existing_slugs.add(slug)
                existing_cultivars[ckey] = {"id": new_id}
                stats["created"] += 1

                # Link to supplier
                cur.execute("""
                    INSERT INTO cultivar_suppliers (cultivar_id, supplier_id, product_url, article_number, last_checked_at)
                    VALUES (%s::uuid, %s::uuid, %s, %s, NOW())
                """, (new_id, REINSAAT_SUPPLIER_ID, product.url, product.sku))
                stats["linked"] += 1
                existing_links.setdefault(new_id, []).append((product.url, product.sku))

                print(f"{prefix} {product.name} -> CREATED ({species_name}, {slug})")
                cur.execute("RELEASE SAVEPOINT create_sp")
            except Exception as e:
                print(f"{prefix} {product.name} -> CREATE ERROR: {e}")
                cur.execute("ROLLBACK TO SAVEPOINT create_sp")
                stats["errors"] += 1

    # ── Commit ──
    conn.commit()

    # ── Summary ──
    print("\n" + "=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"  Total products discovered: {len(all_products)}")
    print(f"  New cultivars created:     {stats['created']}")
    print(f"  New supplier links added:  {stats['linked']}")
    print(f"  Cultivars enriched:        {stats['enriched']}")
    print(f"  Links already existed:     {stats['link_exists']}")
    print(f"  Skipped (no species):      {stats['skipped_no_species']}")
    print(f"  Skipped (no name):         {stats['skipped_no_name']}")
    print(f"  Errors:                    {stats['errors']}")
    print("=" * 70)

    if unmatched:
        print(f"\n  UNMATCHED PRODUCTS ({len(unmatched)}):")
        for name, raw_latin, normalized, url in sorted(unmatched, key=lambda x: x[2]):
            print(f"    {normalized:30s} (raw: {raw_latin:40s}) {name:30s} {url}")

    cur.close()
    conn.close()


if __name__ == "__main__":
    main()