# herbapi/tools/scrapers/scrape_arche_noah.py

#!/usr/bin/env python3
"""
Scrape Arche Noah seed catalog and import cultivars into HerbAPI.

Uses the shop.arche-noah.at Angular SPA's backend API (ACM) to fetch
product listings and details, then creates cultivars in HerbAPI matched
to existing species.
"""

import json
import re
import time
import urllib.request
import urllib.error
import urllib.parse
import sys
from datetime import datetime, timezone

# --- Configuration -----------------------------------------------------------

# Base URL of the HerbAPI REST API; resource paths are appended after a "/".
HERBAPI_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
# Bearer token sent in the Authorization header of every HerbAPI request.
# NOTE(review): this credential is committed in plain text and the base URL
# above uses plain HTTP -- consider loading the token from the environment or
# a secrets store and rotating the current value.
HERBAPI_TOKEN = "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"

# Base URL of the shop backend API; endpoint paths are appended directly.
SHOP_BASE = "https://shop.arche-noah.at/ACM/api/"
# Browser-like User-Agent header sent with every shop request.
SHOP_UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

REQUEST_DELAY = 0.5  # seconds to sleep between consecutive shop API requests

# Only import products from these Arche Noah article lines (their own seeds).
# Values are compared for exact equality with a product's "articleLineDesc".
ARCHE_NOAH_LINES = {
    "Bio-Saatgut von ARCHE NOAH",
    "Kostbarkeiten aus dem ARCHE NOAH Samenarchiv",
}

# Search terms used to discover seed products across the shop. The shop is
# queried once per term and the results are merged by product id, so terms may
# overlap in what they return, but each term should be listed only once.
SEARCH_TERMS = [
    "Tomate", "Paradeiser", "Paprika", "Chili", "Gurke", "Kürbis", "Zucchini",
    "Bohne", "Erbse", "Fisole", "Salat", "Kohl", "Kraut", "Melanzani", "Aubergine",
    "Mais", "Zwiebel", "Lauch", "Karotte", "Rübe", "Basilikum", "Kräuter",
    "Blume", "Sonnenblume", "Dill", "Petersilie", "Spinat", "Mangold",
    "Melone", "Fenchel", "Sellerie", "Rettich", "Radieschen",
    "Koriander", "Oregano", "Thymian", "Salbei", "Rosmarin", "Minze",
    "Ringelblume", "Kornblume", "Kapuzinerkresse", "Senf",
    "Erdbeere", "Lupine", "Luzerne", "Klee", "Mohn",
    "Radicchio", "Rucola", "Endivie", "Artischocke", "Pastinake",
    "Schnittlauch", "Knoblauch", "Bärlauch", "Wermut",
    "Baldrian", "Johanniskraut", "Sonnenhut", "Beinwell",
    "Studentenblume", "Tagetes", "Phacelia", "Buchweizen",
    "Rote Bete", "Rote Rübe", "Melde",
    "Kohlrabi", "Brokkoli", "Blumenkohl", "Rosenkohl", "Wirsing",
    "Pflücksalat", "Kopfsalat", "Feldsalat", "Asiasalat",
    "Zuckermais", "Popcorn",
]

# --- Helpers -----------------------------------------------------------------

def herbapi_request(method, path, data=None):
    """Send a JSON request to HerbAPI and return the decoded response.

    Args:
        method: HTTP method name, e.g. "GET" or "POST".
        path: Resource path appended to HERBAPI_BASE after a "/".
        data: Optional JSON-serialisable payload. Any value other than None
            is sent as the request body, including empty containers.

    Returns:
        The parsed JSON response, or None when the response body is empty.

    Raises:
        urllib.error.HTTPError: Re-raised after the status code and the first
            200 characters of the error body have been written to stderr.
    """
    url = f"{HERBAPI_BASE}/{path}"
    # Compare against None so that falsy payloads such as {} are still sent.
    payload = json.dumps(data).encode() if data is not None else None
    req = urllib.request.Request(url, data=payload, method=method, headers={
        "Authorization": f"Bearer {HERBAPI_TOKEN}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    })
    try:
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(req, timeout=30) as resp:
            raw = resp.read().decode("utf-8")
    except urllib.error.HTTPError as e:
        error_body = e.read().decode("utf-8", errors="replace")
        print(f"  HerbAPI {method} {path}: HTTP {e.code} - {error_body[:200]}", file=sys.stderr)
        raise
    return json.loads(raw) if raw.strip() else None


def shop_create_session():
    """Create an anonymous session on the Arche Noah shop.

    Posts an empty JSON object to the "createanonymoususer" endpoint and reads
    the JSESSIONID cookie from the response.

    Returns:
        The JSESSIONID cookie value as a string.

    Raises:
        RuntimeError: If no Set-Cookie header carries a non-empty JSESSIONID.
    """
    req = urllib.request.Request(
        SHOP_BASE + "webshop/createanonymoususer",
        data=json.dumps({}).encode(),
        headers={
            "User-Agent": SHOP_UA,
            "Content-Type": "application/json",
            "Origin": "https://shop.arche-noah.at",
            "Referer": "https://shop.arche-noah.at/",
        },
    )
    with urllib.request.urlopen(req, timeout=15) as resp:
        # A response may carry several Set-Cookie headers; headers.get() would
        # only return the first one, so inspect all of them.
        cookie_headers = resp.headers.get_all("Set-Cookie") or []

    for header in cookie_headers:
        found = re.search(r"JSESSIONID=([^;]+)", header)
        if found:
            return found.group(1)
    raise RuntimeError("Failed to get shop session")


def shop_request(session, endpoint, payload):
    """Send a JSON request with a body to the shop API and return the decoded response.

    Args:
        session: JSESSIONID value, sent as the session cookie.
        endpoint: Endpoint path appended directly to SHOP_BASE.
        payload: JSON-serialisable request body.

    Returns:
        The parsed JSON response, or None when the response body is empty.
    """
    req = urllib.request.Request(
        SHOP_BASE + endpoint,
        data=json.dumps(payload).encode(),
        headers={
            "User-Agent": SHOP_UA,
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Cookie": f"JSESSIONID={session}",
            "Origin": "https://shop.arche-noah.at",
            "Referer": "https://shop.arche-noah.at/",
        },
    )
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(req, timeout=30) as resp:
        raw = resp.read().decode("utf-8")
    return json.loads(raw) if raw.strip() else None


def extract_latin_name(detail_headline3):
    """Return the botanical name found in a product's headline3 field.

    HTML tags are stripped and everything from the first "Hier geht" onwards
    is discarded. The remainder is returned only if it starts like a binomial
    ("Genus species ..."); otherwise None is returned.
    """
    if detail_headline3:
        without_tags = re.sub(r"<[^>]+>", "", detail_headline3).strip()
        # Drop the trailing "Hier geht es zu unseren..." link text, if present
        candidate, _, _ = without_tags.partition("Hier geht")
        candidate = candidate.strip()
        # Expect e.g. "Solanum lycopersicum" or "Capsicum annuum"
        looks_botanical = re.match(r"^[A-Z][a-z]+ [a-z]", candidate) is not None
        if candidate and looks_botanical:
            return candidate
    return None


def match_species(latin_name, species_by_scientific):
    """
    Look up the species entry for a Latin name.

    The name is trimmed and lower-cased, then tried in full first. If that
    misses, only the leading "genus species" pair is tried, so that
    "Phaseolus vulgaris var. nanus" still resolves to "Phaseolus vulgaris"
    (likewise for "subsp.", "convar.", "f." and "ssp." suffixes).

    Returns the matching entry of species_by_scientific, or None.
    """
    if not latin_name:
        return None

    key = latin_name.strip().lower()

    # Candidate keys in priority order: the full name, then the bare binomial
    candidates = [key]
    binomial = re.match(r"^([A-Za-z]+ [a-z]+)", key)
    if binomial:
        candidates.append(binomial.group(1).strip())

    for candidate in candidates:
        hit = species_by_scientific.get(candidate)
        if hit:
            return hit
    return None


def extract_cultivar_name(product_name):
    """
    Derive the cultivar/variety name from a shop product name.

    A quoted section wins when one is present:
      "Salatparadeiser 'Naama' HG026" -> "Naama"
      "Cocktailparadeiser 'Golden Perfection' TO019" -> "Golden Perfection"
      "Buschbohne 'Marmorierter Mond' HG055" -> "Marmorierter Mond"
    Without quotes, the product name is returned with only the trailing
    article number (e.g. "HG026") removed; a type prefix is left in place.
    """
    # Straight, curly, backtick and acute-accent quote characters are accepted
    quoted = re.search(r"['\u2018\u2019`\u00b4]+([^'\u2018\u2019`\u00b4]+)['\u2018\u2019`\u00b4]+", product_name)
    if quoted is not None:
        return quoted.group(1).strip()

    # No quotes: strip a trailing article number such as HG026 or TO019
    return re.sub(r"\s+[A-Z]{1,3}\d{2,4}\s*$", "", product_name).strip()


def parse_pack_info(unit_desc):
    """
    Parse pack size info from unitDesc like '20-30 Korn', '2g' or '0,5 g'.

    For a range the lower bound is used. The amount may use a decimal comma,
    the range separator may be a hyphen or an en dash, and leading whitespace
    is ignored. The unit must start with a letter, so a bare number such as
    "100" is not split into an amount and a bogus one-digit "unit".

    Returns (pack_size, pack_unit) with pack_size as float, or (None, None)
    when unit_desc is empty or does not start with "<amount> <unit>".
    """
    if not unit_desc:
        return None, None
    m = re.match(
        r"\s*(\d+(?:,\d+)?)"                 # amount, optional decimal comma
        r"(?:\s*[-\u2013]\s*\d+(?:,\d+)?)?"  # optional upper bound of a range
        r"\s*([^\W\d_]\w*)",                 # unit, must start with a letter
        unit_desc,
    )
    if m:
        return float(m.group(1).replace(",", ".")), m.group(2)
    return None, None


# --- Main scraping logic -----------------------------------------------------

def fetch_all_arche_noah_products(session):
    """Search the shop API to find all Arche Noah seed products.

    Runs one paginated search per entry in SEARCH_TERMS (case-insensitive
    duplicates are skipped) and merges the results by product "sid". A failed
    request ends the search for that term only. Finally the merged set is
    filtered down to products whose "articleLineDesc" is in ARCHE_NOAH_LINES.

    Args:
        session: Shop session id as returned by shop_create_session().

    Returns:
        Dict mapping product sid to the product dict from the search result.
    """
    page_size = 200  # requested page length; a shorter page ends the paging
    all_products = {}
    seen_terms = set()

    for term in SEARCH_TERMS:
        term_key = term.lower()
        if term_key in seen_terms:
            continue
        seen_terms.add(term_key)

        offset = 0
        while True:
            payload = {
                "searchCriteria": term,
                "startIndex": offset,
                "numDataSets": page_size,
                "allowAllProducts": False,
            }
            try:
                data = shop_request(session, "webshop/getproducts", payload)
            except Exception as e:
                print(f"  Search '{term}' offset={offset} failed: {e}", file=sys.stderr)
                break

            if not data:
                break

            for p in data:
                # Skip malformed entries instead of aborting the whole scrape
                if not isinstance(p, dict) or "sid" not in p:
                    continue
                # Keep the first occurrence of each product across all terms
                all_products.setdefault(p["sid"], p)

            if len(data) < page_size:
                break
            offset += len(data)
            time.sleep(REQUEST_DELAY)

        time.sleep(REQUEST_DELAY)

    # Filter to Arche Noah's own seed products only
    an_products = {
        sid: p for sid, p in all_products.items()
        if (p.get("articleLineDesc") or "") in ARCHE_NOAH_LINES
    }

    print(f"Found {len(all_products)} total products, {len(an_products)} Arche Noah seed products")
    return an_products


def fetch_product_details(session, products):
    """Fetch the detail record (which carries the Latin name) for each product.

    One "getproductdetail" request is made per product sid, with a pause of
    REQUEST_DELAY seconds after each. Failed requests are reported on stderr
    and skipped; empty responses are skipped silently.

    Returns a dict mapping sid to the detail record.
    """
    fetched = {}
    expected = len(products)
    for done, sid in enumerate(products, start=1):
        try:
            info = shop_request(session, "webshop/getproductdetail", {"productSid": sid})
        except Exception as e:
            print(f"  Detail for {sid} failed: {e}", file=sys.stderr)
        else:
            if info:
                fetched[sid] = info

        # Progress line after every 20th product
        if done % 20 == 0:
            print(f"  Fetched details: {done}/{expected}")
        time.sleep(REQUEST_DELAY)

    print(f"Fetched {len(fetched)} product details")
    return fetched


def _load_all_pages(resource, per_page=100):
    """Collect every item of a paginated HerbAPI list resource.

    Accepts two response shapes: a dict with a "data" list (and a "total"
    count, taken as 0 when absent) or a bare list. Paging stops once at least
    "total" items were collected, a page comes back empty, or the response
    has neither shape. A bare-list response is treated as complete after the
    first page.
    """
    items = []
    page = 1
    while True:
        result = herbapi_request("GET", f"{resource}?per_page={per_page}&page={page}")
        if isinstance(result, dict) and "data" in result:
            data = result["data"] or []  # tolerate "data": null
            total = result.get("total", 0)
        elif isinstance(result, list):
            data = result
            total = len(data)
        else:
            break
        items.extend(data)
        if len(items) >= total or not data:
            break
        page += 1
    return items


def load_herbapi_species():
    """Load all species from HerbAPI and build a lookup by scientific name.

    Returns:
        (species_list, by_scientific): the raw list, and a dict keyed by the
        trimmed, lower-cased "name_scientific" value. Species without a
        scientific name stay in the list but are left out of the lookup.
    """
    species_list = _load_all_pages("species")

    by_scientific = {}
    for s in species_list:
        name = s.get("name_scientific")
        if not name:
            continue  # cannot be matched by Latin name
        by_scientific[name.strip().lower()] = s
    return species_list, by_scientific


def load_herbapi_cultivars():
    """Load all existing cultivars from HerbAPI (max 100 per page).

    Returns:
        (all_cultivars, by_key): the raw list, and a dict keyed by
        (species_id, trimmed lower-cased name). Cultivars without a name stay
        in the list but are left out of the lookup.
    """
    all_cultivars = _load_all_pages("cultivars")

    by_key = {}
    for c in all_cultivars:
        name = c.get("name")
        if not name:
            continue
        by_key[(c.get("species_id"), name.strip().lower())] = c

    return all_cultivars, by_key


def ensure_supplier():
    """Return the id of the Arche Noah supplier, creating it when missing.

    An existing supplier counts as a match when its name contains both
    "arche" and "noah", ignoring case.
    """
    listing = herbapi_request("GET", "suppliers")
    # The endpoint may wrap the list in a {"data": [...]} envelope
    if isinstance(listing, dict) and "data" in listing:
        listing = listing["data"]

    for entry in listing:
        lowered = entry["name"].lower()
        if "arche" in lowered and "noah" in lowered:
            print(f"Supplier 'Arche Noah' already exists: {entry['id']}")
            return entry["id"]

    print("Creating supplier 'Arche Noah'...")
    new_supplier = {
        "name": "Arche Noah",
        "url": "https://www.arche-noah.at",
        "country": "AT",
        "is_organic": True,
        "is_demeter": False,
        "notes": "Austrian society for heritage seed preservation and biodiversity",
    }
    created = herbapi_request("POST", "suppliers", new_supplier)
    new_id = created["id"]
    print(f"Created supplier: {new_id}")
    return new_id


def load_existing_supplier_links(cultivar_id):
    """Load existing supplier links for a cultivar (best effort).

    Args:
        cultivar_id: HerbAPI id of the cultivar.

    Returns:
        A list of link records. An empty list is returned when the response
        has an unexpected shape or when the request fails. A failure is
        reported on stderr, because the caller cannot tell "no links" from
        "lookup failed" and may go on to create a duplicate link.
    """
    try:
        result = herbapi_request("GET", f"cultivars/{cultivar_id}/suppliers")
    except Exception as e:
        print(
            f"  WARNING: could not load supplier links for cultivar {cultivar_id}: {e}",
            file=sys.stderr,
        )
        return []

    if isinstance(result, list):
        return result
    if isinstance(result, dict) and "data" in result:
        return result["data"] or []  # tolerate "data": null
    return []


def _product_url(product, detail):
    """Return the shop URL built from the product's alias, or None without one."""
    alias = product.get("alias") or detail.get("alias", "")
    return f"https://shop.arche-noah.at/produkt/{alias}" if alias else None


def _first_unit_price(product, detail):
    """Return "singleUnitPrice" of the first price-list position, or None."""
    price_list = product.get("priceListPos") or detail.get("priceListPos", [])
    if price_list:
        return price_list[0].get("singleUnitPrice")
    return None


def main():
    """Run the full import: scrape the shop and mirror it into HerbAPI.

    Steps:
      1. Ensure the Arche Noah supplier exists in HerbAPI.
      2. Load HerbAPI species (for Latin-name matching).
      3. Load existing cultivars (so re-runs do not create duplicates).
      4. Scrape the shop product list.
      5. Fetch per-product details, which carry the Latin name.
      6. Create missing cultivars and link each cultivar to the supplier.

    Products that match no species, or that yield no cultivar name, are
    skipped and counted separately. Per-product HerbAPI errors are reported
    on stderr and counted without aborting the run. A summary is printed at
    the end.
    """
    now_str = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    print("=== Arche Noah Seed Catalog Scraper ===")
    print(f"Started at {now_str}\n")

    # Step 1: Create Arche Noah supplier in HerbAPI
    print("[1/6] Ensuring Arche Noah supplier exists...")
    supplier_id = ensure_supplier()
    print()

    # Step 2: Load HerbAPI species for matching
    print("[2/6] Loading HerbAPI species...")
    species_list, species_by_scientific = load_herbapi_species()
    print(f"Loaded {len(species_list)} species")
    print()

    # Step 3: Load existing cultivars for idempotency
    print("[3/6] Loading existing cultivars...")
    existing_cultivars, cultivars_by_key = load_herbapi_cultivars()
    print(f"Loaded {len(existing_cultivars)} existing cultivars")
    print()

    # Step 4: Scrape Arche Noah shop
    print("[4/6] Scraping Arche Noah shop catalog...")
    session = shop_create_session()
    print("Got shop session")
    products = fetch_all_arche_noah_products(session)
    print()

    # Step 5: Fetch product details (to get Latin names)
    print("[5/6] Fetching product details for Latin name matching...")
    details = fetch_product_details(session, products)
    print()

    # Step 6: Create cultivars in HerbAPI
    print("[6/6] Creating cultivars in HerbAPI...")
    stats = {
        "created": 0,
        "skipped_existing": 0,
        "skipped_no_species": 0,
        "skipped_no_name": 0,
        "supplier_linked": 0,
        "supplier_link_existed": 0,
        "errors": 0,
    }

    # Sorted by sid so that repeated runs process products in the same order
    for sid, product in sorted(products.items()):
        # Products whose detail fetch failed get an empty detail record and
        # therefore end up in the "no species match" bucket below.
        detail = details.get(sid, {})

        # Match the Latin name from the detail record to a HerbAPI species
        # (match_species also handles subspecies/variety suffixes)
        latin_name = extract_latin_name(detail.get("detailHeadline3", ""))
        species = match_species(latin_name, species_by_scientific)
        if not species:
            print(f"  SKIP (no species match): {product['name']} | latin={latin_name}")
            stats["skipped_no_species"] += 1
            continue

        cultivar_name = extract_cultivar_name(product["name"])
        if not cultivar_name:
            print(f"  SKIP (no cultivar name): {product['name']}")
            stats["skipped_no_name"] += 1
            continue

        # Used for both the cultivar's source URL and the supplier link
        product_url = _product_url(product, detail)

        # Check if cultivar already exists (idempotency)
        lookup_key = (species["id"], cultivar_name.strip().lower())
        existing = cultivars_by_key.get(lookup_key)

        if existing:
            cultivar_id = existing["id"]
            stats["skipped_existing"] += 1
        else:
            cultivar_data = {
                "species_id": species["id"],
                "name": cultivar_name,
                "name_de": cultivar_name,
                # Only the "Bio-Saatgut" article line is flagged as organic
                "is_organic": product.get("articleLineDesc") == "Bio-Saatgut von ARCHE NOAH",
                "source_urls": [product_url] if product_url else None,
            }

            try:
                result = herbapi_request("POST", "cultivars", cultivar_data)
                cultivar_id = result["id"]
                stats["created"] += 1
                # Add to lookup for idempotency within this run
                cultivars_by_key[lookup_key] = result
                print(f"  CREATED: {cultivar_name} ({species['name_scientific']})")
            except Exception as e:
                print(f"  ERROR creating '{cultivar_name}': {e}", file=sys.stderr)
                stats["errors"] += 1
                continue

        # Link cultivar to supplier unless a link to this supplier exists
        existing_links = load_existing_supplier_links(cultivar_id)
        already_linked = any(
            link.get("supplier_id") == supplier_id for link in existing_links
        )

        if already_linked:
            stats["supplier_link_existed"] += 1
        else:
            unit_desc = product.get("unitDesc") or detail.get("unitDesc", "")
            pack_size, pack_unit = parse_pack_info(unit_desc)

            link_data = {
                "supplier_id": supplier_id,
                "article_number": str(product.get("articleNr", "")),
                "product_url": product_url,
                "price_eur": _first_unit_price(product, detail),
                "pack_size": pack_size,
                "pack_unit": pack_unit,
            }

            try:
                herbapi_request("POST", f"cultivars/{cultivar_id}/suppliers", link_data)
                stats["supplier_linked"] += 1
            except Exception as e:
                print(f"  ERROR linking supplier for '{cultivar_name}': {e}", file=sys.stderr)
                stats["errors"] += 1

        time.sleep(0.1)  # small delay between HerbAPI calls

    # Summary
    print(f"\n{'='*60}")
    print("Scraping complete!")
    print(f"  Cultivars created:          {stats['created']}")
    print(f"  Cultivars already existed:   {stats['skipped_existing']}")
    print(f"  Skipped (no species match):  {stats['skipped_no_species']}")
    print(f"  Skipped (no cultivar name):  {stats['skipped_no_name']}")
    print(f"  Supplier links created:      {stats['supplier_linked']}")
    print(f"  Supplier links existed:      {stats['supplier_link_existed']}")
    print(f"  Errors:                      {stats['errors']}")

if __name__ == "__main__":
    main()