herbapi/tools/scrapers/scrape_reinsaat_v3.py

#!/usr/bin/env python3
"""Reinsaat v3 scraper - uses HerbAPI REST API, robust botanical name matching."""

import json
import re
import sys
import time
import urllib.request
import urllib.error
import urllib.parse
from html import unescape

# --- Config ---
# Base URL of the HerbAPI REST service; endpoint paths are appended to it.
API_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
# Bearer token sent in the Authorization header of every HerbAPI request.
# NOTE(review): this credential is hard-coded in source control - consider
# loading it from the environment or a secrets store and rotating it.
API_TOKEN = "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
# Root of the Reinsaat web shop; category and product paths are appended to it.
REINSAAT_BASE = "https://www.reinsaat.at"
# Pause in seconds (passed to time.sleep) between consecutive page fetches.
DELAY = 0.3

# Categories to scrape (seed products only, skip books/bulbs/peonies/potatoes/gift/seed_tapes)
# Each entry is a URL slug: the category page is fetched from
# {REINSAAT_BASE}/shop/EN/<slug>/ and the list is processed in this order.
CATEGORIES = [
    "beans", "peas", "florence_fennel", "cucumbers", "brassica", "garden_cress",
    "pumpkins_squash", "corn", "swiss_chard", "aubergine_eggplants", "melons",
    "carrots", "sweet_pepper", "chilli_peppers_chill", "parsnips", "parsley",
    "parsley_root", "leeks", "radish", "beetroot", "lettuce", "black_salsify",
    "celery", "spinach", "tomatoes", "zucchini_courgette", "onion_garlic",
    "culinary_and_aromatic_herbs", "conservation_varieties", "flowers_and_herbs",
    "wild_flowers_seeds", "green_manure",
]

# Suffixes to strip from botanical names (authority names, infraspecific ranks)
# All entries are lowercase.
# NOTE(review): nothing in the visible part of this file references this set;
# normalise_botanical() keeps only the first two tokens instead. Confirm
# whether it is still needed.
STRIP_SUFFIXES = {
    "l.", "mill.", "dc.", "l", "convar.", "convar", "var.", "var",
    "subsp.", "subsp", "ssp.", "ssp", "f.", "em.", "auct.",
    "hort.", "medik.", "moench", "pers.", "salisb.", "thunb.",
    "crantz", "gaertn.", "lam.", "link", "siebold", "zucc.",
    "sat.", "sat", "axillare", "medikus",
}


def api_get(path, params=None, timeout=30):
    """GET a JSON resource from HerbAPI.

    Args:
        path: Endpoint path appended to ``API_BASE`` (e.g. ``"/species"``).
        params: Optional mapping of query parameters; URL-encoded when given.
        timeout: Socket timeout in seconds. Without one, a stalled API
            connection would block the whole scraper indefinitely.

    Returns:
        The decoded JSON response body.

    Raises:
        urllib.error.HTTPError: If the API answers with an error status.
        urllib.error.URLError: If the connection cannot be established.
        TimeoutError: If the API stops responding for ``timeout`` seconds.
    """
    url = f"{API_BASE}{path}"
    if params:
        url += "?" + urllib.parse.urlencode(params)
    req = urllib.request.Request(url)
    req.add_header("Authorization", f"Bearer {API_TOKEN}")
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())


def api_post(path, data, timeout=30):
    """POST a JSON document to HerbAPI.

    Args:
        path: Endpoint path appended to ``API_BASE`` (e.g. ``"/cultivars"``).
        data: JSON-serialisable payload sent as the request body.
        timeout: Socket timeout in seconds. Without one, a stalled API
            connection would block the whole scraper indefinitely.

    Returns:
        The decoded JSON response body.

    Raises:
        urllib.error.HTTPError: If the API answers with an error status.
        urllib.error.URLError: If the connection cannot be established.
        TimeoutError: If the API stops responding for ``timeout`` seconds.
    """
    url = f"{API_BASE}{path}"
    body = json.dumps(data).encode()
    req = urllib.request.Request(url, data=body, method="POST")
    req.add_header("Authorization", f"Bearer {API_TOKEN}")
    req.add_header("Content-Type", "application/json")
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())


def fetch_page(url):
    """Download *url* and return its body as text.

    The request identifies itself with the scraper's User-Agent and gives up
    after 15 seconds. The body is decoded as UTF-8; bytes that cannot be
    decoded are replaced instead of raising.
    """
    request = urllib.request.Request(
        url,
        headers={"User-Agent": "Mozilla/5.0 (HerbAPI Scraper)"},
    )
    with urllib.request.urlopen(request, timeout=15) as response:
        payload = response.read()
    return payload.decode("utf-8", errors="replace")


# Known misspellings in source botanical names. A key is either a lowercased
# genus token or a lowercased "genus species" pair; the value is the corrected
# spelling of the same shape.
BOTANICAL_TYPOS = {
    "capscicum": "capsicum",
    "capsicum frutenscens": "capsicum frutescens",
    "tropaelum": "tropaeolum",
    "lact.": "lactuca",
}

# Abbreviated botanical names, keyed by the lowercased abbreviation as it
# appears at the start of the raw name; the value is the full "genus species".
ABBREVIATED_NAMES = {
    "origanum vulg.": "origanum vulgare",
    "helichrysum bract.": "helichrysum bracteatum",
    "campanula lat.": "campanula latifolia",
    "cosmos bip.": "cosmos bipinnatus",
    "papaver somnif.": "papaver somniferum",
}


def normalise_botanical(raw):
    """Strip botanical name to genus + species only.

    'Pisum sativum L. convar. sat.' -> 'pisum sativum'
    'Solanum lycopersicum L.'       -> 'solanum lycopersicum'
    'Beta vulgaris L. ssp. vulgaris' -> 'beta vulgaris'
    'Origanum vulg.'                -> 'origanum vulgare'

    Args:
        raw: Botanical name as scraped or as stored in the API; may contain
            HTML entities, non-breaking spaces, authorities and
            parenthesised remarks.

    Returns:
        Lowercase ``"genus species"``, or ``None`` when *raw* is empty, has
        fewer than two usable tokens, or its second token looks like an
        authority rather than a species epithet.
    """
    if not raw:
        return None
    # Clean HTML entities
    raw = unescape(raw).replace("\xa0", " ").strip()
    # Remove trailing commas/periods
    raw = raw.rstrip(",. ")
    # Remove content in parentheses
    raw = re.sub(r"\([^)]*\)", "", raw)

    # Check abbreviated names first (before splitting)
    raw_lower = raw.lower().strip()
    for abbrev, full in ABBREVIATED_NAMES.items():
        # Trailing periods were stripped above, so a name consisting of just
        # the abbreviation has lost its final "." and needs the second test.
        if raw_lower.startswith(abbrev) or raw_lower == abbrev.rstrip("."):
            return full

    parts = raw.split()
    if len(parts) < 2:
        return None
    # Genus (capitalised) + species (lowercase)
    genus = parts[0].lower().rstrip(",")
    species = parts[1].lower().rstrip(",")
    # A token made only of commas leaves nothing to validate.
    if not genus or not species:
        return None

    # Fix known typos
    if genus in BOTANICAL_TYPOS:
        genus = BOTANICAL_TYPOS[genus]
    full_name = f"{genus} {species}"
    if full_name in BOTANICAL_TYPOS:
        full_name = BOTANICAL_TYPOS[full_name]
        genus, species = full_name.split()

    # Validate: both tokens must start with a letter
    if not genus[0].isalpha() or not species[0].isalpha():
        return None
    # Skip if species looks like an authority (starts with uppercase in original)
    if parts[1][0].isupper():
        return None
    return f"{genus} {species}"


def _positive_price(value):
    """Return *value* as a positive price, or None when it is not one.

    Real numbers are returned unchanged; numeric strings ("2.90" / "2,90")
    are converted to float because JSON-LD allows ``price`` to be text.
    Booleans are rejected even though they are ints in Python.
    """
    if isinstance(value, bool):
        return None
    if isinstance(value, str):
        try:
            value = float(value.strip().replace(",", "."))
        except ValueError:
            return None
    if isinstance(value, (int, float)) and value > 0:
        return value
    return None


def _jsonld_nodes(html):
    """Yield each dict found at the top level of the page's JSON-LD blocks."""
    for block in re.finditer(
        r'<script type="application/ld\+json">(.*?)</script>', html, re.S
    ):
        try:
            doc = json.loads(block.group(1))
        except json.JSONDecodeError:
            continue
        # A JSON-LD script may hold a single object or a list of objects.
        for node in doc if isinstance(doc, list) else [doc]:
            if isinstance(node, dict):
                yield node


def extract_product_data(html, url):
    """Extract product info from a Reinsaat product page.

    Args:
        html: Raw HTML of the product page.
        url: URL the page was fetched from; stored unchanged under ``"url"``.

    Returns:
        A dict that always contains ``"url"`` and, when the corresponding
        markup is found, any of ``name``, ``botanical_raw``,
        ``botanical_norm``, ``article_number``, ``price_eur``,
        ``port_price``, ``pack_sizes``, ``planting_depth_cm``,
        ``row_spacing_cm``, ``plant_spacing_cm``, ``germination_temp_c``,
        ``pack_size`` and ``pack_unit``.
    """
    result = {}

    # H1 = variety name
    m = re.search(r'<h1[^>]*>([^<]+)</h1>', html)
    if m:
        name = unescape(m.group(1)).strip()
        # Clean up names like "RS-To-01.26 (Alda)" -> "Alda"
        paren = re.search(r"\(([^)]+)\)", name)
        if paren and re.match(r"RS-", name):
            name = paren.group(1).strip()
        result["name"] = name

    # Botanical name from fce_shop_kurztext
    m = re.search(
        r'fce_shop_kurztext[^>]*>\s*(?:<em[^>]*>)?\s*([^<]+?)\s*(?:</em>)?\s*</div>',
        html,
    )
    if m:
        result["botanical_raw"] = unescape(m.group(1)).replace("\xa0", " ").strip()
        result["botanical_norm"] = normalise_botanical(result["botanical_raw"])

    # Article number and cheapest offer from the first JSON-LD Product node
    for node in _jsonld_nodes(html):
        if node.get("@type") != "Product":
            continue
        if "model" in node:
            result["article_number"] = str(node["model"])
        # Get smallest pack price (usually the Portion)
        offers = node.get("offers", {})
        if isinstance(offers, dict):
            offer_list = offers.get("offers", [])
        else:
            offer_list = offers
        if not isinstance(offer_list, list):
            offer_list = []
        prices = [
            price
            for price in (
                _positive_price(o.get("price"))
                for o in offer_list
                if isinstance(o, dict)
            )
            if price is not None
        ]
        if prices:
            result["price_eur"] = min(prices)
        break

    # Price table - get pack sizes (first table mentioning a euro price
    # that has at least a size row and a price row)
    for tbl in re.findall(r"<table[^>]*>(.*?)</table>", html, re.S):
        if "€" not in tbl:
            continue
        rows = re.findall(r"<tr[^>]*>(.*?)</tr>", tbl, re.S)
        if len(rows) < 2:
            continue
        size_cells = re.findall(r"<td[^>]*>(.*?)</td>", rows[0], re.S)
        size_texts = [re.sub(r"<[^>]+>", "", c).strip() for c in size_cells]
        price_cells = re.findall(r"<td[^>]*>(.*?)</td>", rows[1], re.S)
        price_texts = [re.sub(r"<[^>]+>", "", c).strip() for c in price_cells]
        # Find the "Port." entry
        for i, st in enumerate(size_texts):
            if "Port" in st:
                if i < len(price_texts):
                    # Require digits so stray punctuation ("ca. € 2,90")
                    # can never be handed to float().
                    pm = re.search(r"\d+(?:[.,]\d+)?", price_texts[i])
                    if pm:
                        result["port_price"] = float(pm.group().replace(",", "."))
                break
        # Get portion content info
        result["pack_sizes"] = size_texts
        break

    # Sowing depth: a single value or the midpoint of a range
    m = re.search(r"(?:sowing|seed)\s*depth[:\s]*(?:approx\.?\s*)?(\d+[\.,]?\d*)\s*(?:-\s*(\d+[\.,]?\d*)\s*)?cm", html, re.I)
    if m:
        d1 = float(m.group(1).replace(",", "."))
        d2 = float(m.group(2).replace(",", ".")) if m.group(2) else d1
        result["planting_depth_cm"] = round((d1 + d2) / 2, 2)

    # Spacing: "row spacing NNxNN cm" or "NN x NN cm"
    # Try outdoor spacing first
    m = re.search(r"(?:outdoors?|field)[^.]*?(\d+)\s*(?:x|×)\s*(\d+)\s*cm", html, re.I)
    if not m:
        m = re.search(r"row\s*spacing\s*(\d+)\s*(?:x|×)\s*(\d+)\s*cm", html, re.I)
    if not m:
        m = re.search(r"(\d+)\s*(?:x|×)\s*(\d+)\s*cm", html, re.I)
    if m:
        result["row_spacing_cm"] = float(m.group(1))
        result["plant_spacing_cm"] = float(m.group(2))

    # Row spacing without plant spacing (e.g. "row spacing 30-45 cm");
    # the midpoint is floored to a whole centimetre.
    if "row_spacing_cm" not in result:
        m = re.search(r"row\s*spacing\s*(\d+)(?:\s*-\s*(\d+))?\s*cm", html, re.I)
        if m:
            r1 = int(m.group(1))
            r2 = int(m.group(2)) if m.group(2) else r1
            result["row_spacing_cm"] = float((r1 + r2) // 2)

    # Germination temperature; the midpoint is floored to a whole degree.
    m = re.search(r"germination\s*temp[^:]*:\s*(\d+)\s*(?:-\s*(\d+))?\s*°?\s*C", html, re.I)
    if m:
        t1 = int(m.group(1))
        t2 = int(m.group(2)) if m.group(2) else t1
        result["germination_temp_c"] = float((t1 + t2) // 2)

    # Pack unit from portion info - "20 seeds" or "25 g" etc
    portion_m = re.search(r"[Pp]ortion\s*(?:contents?)?[:\s]*(\d+[\.,]?\d*)\s*(seeds?|Korn|g|kg)", html)
    if not portion_m:
        # Try "Port. (20 seeds)" format
        portion_m = re.search(r"Port[.\w]*\s*\(?\s*(\d+[\.,]?\d*)\s*(seeds?|Korn|g|kg)", html)
    if portion_m:
        result["pack_size"] = float(portion_m.group(1).replace(",", "."))
        unit = portion_m.group(2).lower()
        # Seed counts are stored under the unit name "Korn".
        if unit in ("seed", "seeds", "korn"):
            result["pack_unit"] = "Korn"
        else:
            result["pack_unit"] = unit

    result["url"] = url
    return result


def get_all_species():
    """Page through the API's species list and index it by normalised name.

    Requests pages of 100 records until a short page signals the end. Each
    species whose scientific name normalises successfully is stored under
    that normalised name as a dict with ``id``, ``slug`` and ``name``.
    """
    lookup = {}
    page_no = 0
    more_pages = True
    while more_pages:
        page_no += 1
        payload = api_get("/species", {"per_page": 100, "page": page_no})
        records = payload.get("data", [])
        for record in records:
            scientific = record["name_scientific"]
            key = normalise_botanical(scientific)
            if not key:
                continue
            lookup[key] = {
                "id": record["id"],
                "slug": record["slug"],
                "name": scientific,
            }
        print(f"    page {page_no}: {len(records)} species (total so far: {len(lookup)})")
        # A page shorter than the requested size is the last one.
        more_pages = len(records) >= 100
    return lookup


def get_all_cultivars():
    """Page through the API's cultivar list and index it for lookups.

    Requests pages of 100 records until a short page signals the end. The
    returned dict maps ``(species_id, lowercased stripped name)`` to the
    cultivar record as delivered by the API.
    """
    by_key = {}
    page_no = 0
    more_pages = True
    while more_pages:
        page_no += 1
        payload = api_get("/cultivars", {"per_page": 100, "page": page_no})
        records = payload.get("data", [])
        for record in records:
            name_key = record["name"].lower().strip()
            by_key[(record["species_id"], name_key)] = record
        print(f"    page {page_no}: {len(records)} cultivars (total so far: {len(by_key)})")
        # A page shorter than the requested size is the last one.
        more_pages = len(records) >= 100
    return by_key


def get_reinsaat_supplier():
    """Return the supplier record whose slug is ``"reinsaat"``.

    Raises:
        RuntimeError: If the API's supplier list has no such entry.
    """
    match = next(
        (entry for entry in api_get("/suppliers") if entry["slug"] == "reinsaat"),
        None,
    )
    if match is None:
        raise RuntimeError("Reinsaat supplier not found in API")
    return match


def get_cultivar_suppliers(cultivar_id):
    """Fetch the supplier links the API currently holds for *cultivar_id*."""
    endpoint = "/cultivars/{}/suppliers".format(cultivar_id)
    return api_get(endpoint)


def get_product_urls_from_category(cat_slug):
    """Collect every link below a category from that category's page.

    The category page ``{REINSAAT_BASE}/shop/EN/<cat_slug>/`` is fetched and
    all ``href`` values pointing underneath it (any depth, ending in "/")
    are returned. The result can contain subcategory pages as well as
    product pages; this function does not tell them apart, so callers have
    to check each page themselves (see ``is_product_page``).

    Args:
        cat_slug: Category slug as used in the shop URL.

    Returns:
        Sorted, de-duplicated list of absolute URLs; empty if the category
        page could not be fetched (a warning is printed in that case).
    """
    cat_url = f"{REINSAAT_BASE}/shop/EN/{cat_slug}/"
    try:
        html = fetch_page(cat_url)
    except Exception as e:
        print(f"  WARN: Failed to fetch category {cat_slug}: {e}")
        return []

    time.sleep(DELAY)

    links = re.findall(rf'href="(/shop/EN/{re.escape(cat_slug)}/[^"]+/)"', html)
    return [REINSAAT_BASE + link for link in sorted(set(links))]


def is_product_page(html):
    """Tell whether *html* looks like a product page.

    A page counts as a product page when it mentions the
    ``fce_shop_kurztext`` marker or contains a JSON-LD ``"@type": "Product"``
    declaration.
    """
    if "fce_shop_kurztext" in html:
        return True
    return re.search(r'"@type":\s*"Product"', html) is not None


def _deeper_category_links(html, page_url, cat):
    """Return absolute URLs linked from *html* that lie deeper than *page_url*.

    Only links inside category *cat* are considered. Depth is compared
    path-to-path: counting slashes of the full URL would include the two of
    the scheme separator and hide the page's direct children.
    """
    page_path = urllib.parse.urlsplit(page_url).path
    # Normalise to "path with trailing slash", the form the hrefs have.
    page_depth = page_path.rstrip("/").count("/") + 1
    prefix = f"/shop/EN/{cat}/"
    hrefs = set(re.findall(r'href="(/shop/EN/[^"]+/)"', html))
    return [
        REINSAAT_BASE + href
        for href in sorted(hrefs)
        if href.startswith(prefix) and href.count("/") > page_depth
    ]


def _print_report(stats, new_cultivars, new_links, unmatched_species):
    """Print the end-of-run summary: counters, created records, unmatched names."""
    print("\n" + "=" * 60)
    print("RESULTS")
    print("=" * 60)
    print(f"Products found:        {stats['products_found']}")
    print(f"Botanical extracted:   {stats['botanical_extracted']}")
    print(f"Species matched:       {stats['species_matched']}")
    print(f"Species NOT matched:   {stats['species_not_matched']}")
    print(f"Cultivars existed:     {stats['cultivar_existed']}")
    print(f"Cultivars created:     {stats['cultivar_created']}")
    print(f"Links existed:         {stats['link_existed']}")
    print(f"Links created:         {stats['link_created']}")
    print(f"Errors:                {stats['errors']}")

    if new_cultivars:
        print(f"\n--- New cultivars ({len(new_cultivars)}) ---")
        for cv in new_cultivars:
            print(f"  + {cv['name']} ({cv.get('species', '?')})")

    if new_links:
        print(f"\n--- New supplier links ({len(new_links)}) ---")
        for lk in new_links:
            print(f"  + {lk['cultivar']} -> {lk.get('article', '?')}")

    if unmatched_species:
        print(f"\n--- Unmatched species ({len(unmatched_species)}) ---")
        # Most frequent unmatched names first
        for name, count in sorted(unmatched_species.items(), key=lambda x: -x[1]):
            print(f"  ? {name} (x{count})")


def main():
    """Run the scrape: load API lookups, crawl all categories, print a report.

    Species, cultivars and the Reinsaat supplier are loaded from HerbAPI
    first. Then every URL found for each entry of ``CATEGORIES`` is fetched;
    product pages go to ``process_product``, any other page is treated as a
    possible subcategory whose deeper links are fetched and processed once.
    """
    print("=" * 60)
    print("Reinsaat v3 Scraper")
    print("=" * 60)

    # Step 1: Load all species
    print("\n[1/4] Loading species from API...")
    species_map = get_all_species()
    print(f"  Loaded {len(species_map)} species")

    # Step 2: Load all cultivars
    print("\n[2/4] Loading cultivars from API...")
    cultivar_map = get_all_cultivars()
    print(f"  Loaded {len(cultivar_map)} cultivars")

    # Step 3: Get Reinsaat supplier
    print("\n[3/4] Getting Reinsaat supplier...")
    supplier = get_reinsaat_supplier()
    supplier_id = supplier["id"]
    print(f"  Reinsaat ID: {supplier_id}")

    # Step 4: Scrape categories
    print(f"\n[4/4] Scraping {len(CATEGORIES)} categories...")

    stats = {
        "products_found": 0,
        "botanical_extracted": 0,
        "species_matched": 0,
        "species_not_matched": 0,
        "cultivar_existed": 0,
        "cultivar_created": 0,
        "link_existed": 0,
        "link_created": 0,
        "errors": 0,
    }
    unmatched_species = {}  # botanical_norm -> count
    new_cultivars = []
    new_links = []

    # Arguments every process_product call shares, in signature order.
    shared = (
        species_map, cultivar_map, supplier_id, stats,
        unmatched_species, new_cultivars, new_links,
    )

    for cat_i, cat in enumerate(CATEGORIES):
        print(f"\n--- [{cat_i+1}/{len(CATEGORIES)}] {cat} ---")
        urls = get_product_urls_from_category(cat)
        print(f"  Found {len(urls)} URLs")
        # URLs already scheduled or handled in this category, so a product
        # linked from several subcategory pages is processed only once.
        visited = set(urls)

        for url in urls:
            time.sleep(DELAY)
            try:
                html = fetch_page(url)
            except Exception as e:
                print(f"  ERROR fetching {url}: {e}")
                stats["errors"] += 1
                continue

            if is_product_page(html):
                process_product(html, url, *shared)
                continue

            # Not a product page: might be a subcategory, follow its links
            for sub_url in _deeper_category_links(html, url, cat):
                if sub_url in visited:
                    continue
                visited.add(sub_url)
                time.sleep(DELAY)
                try:
                    sub_html = fetch_page(sub_url)
                except Exception as e:
                    print(f"  ERROR fetching {sub_url}: {e}")
                    stats["errors"] += 1
                    continue
                if not is_product_page(sub_html):
                    continue
                process_product(sub_html, sub_url, *shared)

    _print_report(stats, new_cultivars, new_links, unmatched_species)

    print("\nDone.")


def _find_existing_cultivar(cultivar_name, species_id):
    """Search the API for a cultivar matching *cultivar_name*.

    Used after a failed create that looks like a slug collision. Returns the
    first matching cultivar record or ``None``; API errors propagate.
    """
    cn_lower = cultivar_name.lower().strip()
    cn_clean = re.sub(r'[^\w\s]', '', cn_lower)

    search_data = api_get("/cultivars", {"search": cultivar_name, "per_page": 50})
    candidates = search_data.get("data", [])

    # Strategy 1: exact case-insensitive name among the search results
    for cv in candidates:
        if cv["name"].lower().strip() == cn_lower:
            return cv

    # Strategy 2: same species and similar name (ignoring punctuation)
    for cv in candidates:
        if cv["species_id"] != species_id:
            continue
        cv_clean = re.sub(r'[^\w\s]', '', cv["name"].lower())
        if cv_clean == cn_clean:
            return cv
        # An empty string is contained in every string, so only compare
        # by containment when both cleaned names have content.
        if cv_clean and cn_clean and (cv_clean in cn_clean or cn_clean in cv_clean):
            return cv

    # Strategy 3: search by last significant word, then require the same
    # species and an equal punctuation-free name
    words = [w for w in cultivar_name.split() if len(w) > 2]
    if words:
        search2 = api_get("/cultivars", {"search": words[-1], "per_page": 50})
        for cv in search2.get("data", []):
            if cv["species_id"] != species_id:
                continue
            if re.sub(r'[^\w\s]', '', cv["name"].lower()) == cn_clean:
                return cv

    return None


def _supplier_link_payload(prod, supplier_id, url):
    """Build the request body for a new supplier link from extracted data."""
    link_data = {
        "supplier_id": supplier_id,
        "product_url": url,
    }
    if "article_number" in prod:
        link_data["article_number"] = prod["article_number"]
    # The portion price from the price table wins over the JSON-LD minimum.
    if "port_price" in prod:
        link_data["price_eur"] = prod["port_price"]
    elif "price_eur" in prod:
        link_data["price_eur"] = prod["price_eur"]
    if "pack_size" in prod:
        link_data["pack_size"] = prod["pack_size"]
    if "pack_unit" in prod:
        link_data["pack_unit"] = prod["pack_unit"]
    return link_data


def process_product(html, url, species_map, cultivar_map, supplier_id,
                    stats, unmatched_species, new_cultivars, new_links):
    """Process a single product page.

    Extracts the product data, matches its botanical name to a species,
    finds or creates the cultivar and makes sure it has a supplier link for
    *supplier_id*.

    Args:
        html: HTML of the product page.
        url: URL of the product page.
        species_map: Normalised botanical name -> species dict (needs
            ``id`` and ``name``).
        cultivar_map: ``(species_id, lowercased name)`` -> cultivar record;
            updated with cultivars created or found here.
        supplier_id: ID of the supplier to link.
        stats: Counter dict, updated in place.
        unmatched_species: Botanical name -> count of products without a
            species match, updated in place.
        new_cultivars: List receiving a summary of each created cultivar.
        new_links: List receiving a summary of each created supplier link.
    """
    stats["products_found"] += 1
    prod = extract_product_data(html, url)

    if not prod.get("name"):
        return

    bot_norm = prod.get("botanical_norm")
    if not bot_norm:
        # No botanical name found on page
        stats["species_not_matched"] += 1
        unmatched_species["(no botanical name)"] = unmatched_species.get("(no botanical name)", 0) + 1
        return

    stats["botanical_extracted"] += 1

    # Match species
    species = species_map.get(bot_norm)
    if not species:
        stats["species_not_matched"] += 1
        unmatched_species[bot_norm] = unmatched_species.get(bot_norm, 0) + 1
        return

    stats["species_matched"] += 1
    species_id = species["id"]
    cultivar_name = prod["name"]

    # Check if cultivar exists
    cv_key = (species_id, cultivar_name.lower().strip())
    existing_cv = cultivar_map.get(cv_key)

    if existing_cv:
        stats["cultivar_existed"] += 1
        cultivar_id = existing_cv["id"]
    else:
        # Create cultivar
        create_data = {
            "species_id": species_id,
            "name": cultivar_name,
            "is_organic": True,
            "source_urls": [url],
        }
        # Add growing data if we extracted any
        for field in (
            "planting_depth_cm",
            "row_spacing_cm",
            "plant_spacing_cm",
            "germination_temp_c",
        ):
            if field in prod:
                create_data[field] = prod[field]

        try:
            new_cv = api_post("/cultivars", create_data)
            cultivar_id = new_cv["id"]
            stats["cultivar_created"] += 1
            new_cultivars.append({
                "name": cultivar_name,
                "species": species["name"],
                "id": cultivar_id,
            })
            # Add to local cache
            cultivar_map[cv_key] = new_cv
            print(f"  + Created cultivar: {cultivar_name} ({species['name']})")
        except urllib.error.HTTPError as e:
            body = e.read().decode() if hasattr(e, 'read') else str(e)
            if not (e.code == 500 and "Database error" in body):
                print(f"  ERROR creating cultivar '{cultivar_name}': {e.code} {body}")
                stats["errors"] += 1
                return
            # Likely slug collision - search for existing cultivar
            try:
                found = _find_existing_cultivar(cultivar_name, species_id)
                if found:
                    cultivar_id = found["id"]
                    cultivar_map[cv_key] = found
                    stats["cultivar_existed"] += 1
                else:
                    print(f"  WARN: could not create or find cultivar '{cultivar_name}' (DB error + no search match)")
                    stats["errors"] += 1
                    return
            except Exception as e2:
                print(f"  ERROR searching for '{cultivar_name}' after collision: {e2}")
                stats["errors"] += 1
                return

    # Check if Reinsaat supplier link exists
    try:
        existing_links = get_cultivar_suppliers(cultivar_id)
    except Exception as e:
        # Best effort: carry on as if there were no links, but say so -
        # the link created below may duplicate an existing one.
        print(f"  WARN: could not load supplier links for '{cultivar_name}': {e}")
        existing_links = []

    has_reinsaat = any(l["supplier_id"] == supplier_id for l in existing_links)

    if has_reinsaat:
        stats["link_existed"] += 1
        return

    # Create supplier link
    link_data = _supplier_link_payload(prod, supplier_id, url)
    try:
        api_post(f"/cultivars/{cultivar_id}/suppliers", link_data)
        stats["link_created"] += 1
        new_links.append({
            "cultivar": cultivar_name,
            "article": prod.get("article_number", "?"),
            "url": url,
        })
    except urllib.error.HTTPError as e:
        body = e.read().decode() if hasattr(e, 'read') else str(e)
        print(f"  ERROR linking '{cultivar_name}': {e.code} {body}")
        stats["errors"] += 1


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()