herbapi/tools/scrapers/scrape_reinsaat.py

#!/usr/bin/env python3
"""
Scrape cultivar data from Reinsaat (reinsaat.at) and push into HerbAPI.

Strategy:
1. Fetch category pages, recursively discover product pages via JSON-LD detection
2. Extract structured data from JSON-LD Product schema + HTML text for growing data
3. Match Latin names to existing species in the API
4. Create cultivar records and link them to Reinsaat supplier
"""

import json
import re
import ssl
import time
import urllib.request
import urllib.error
import urllib.parse
from html.parser import HTMLParser
from dataclasses import dataclass
from typing import Optional

# ── Config ──────────────────────────────────────────────────────────────────
# Base URL that api_get()/api_post() prepend to every request path.
API_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
# Bearer token sent in the Authorization header of every API request.
# NOTE(review): this credential is committed in source and is sent over plain
# http:// (see API_BASE) — consider loading it from the environment instead.
AUTH_TOKEN = "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
# Supplier record that created/existing cultivars are linked to.
REINSAAT_SUPPLIER_ID = "019ced24-1702-72d1-9acc-90435441a5c4"
DELAY = 0.5  # seconds between requests
# Sent as the User-Agent header when fetching shop pages.
USER_AGENT = "HerbAPI-Scraper/1.0 (florian.berthold@sub-net.at)"

# ── Categories to scrape ────────────────────────────────────────────────────
# (category_url, default_species_hint for leaf pages in this category)
# A hint of None means products without a detectable Latin name get no
# species and are later skipped by main() ("no species match").
CATEGORIES = [
    ("https://www.reinsaat.at/shop/DE/tomaten_paradeiser/", "Solanum lycopersicum"),
    ("https://www.reinsaat.at/shop/DE/kuechen-_und_gewuerzkraeuter/", None),
    ("https://www.reinsaat.at/shop/DE/kuerbis/", None),
    ("https://www.reinsaat.at/shop/DE/zucchini/", "Cucurbita pepo"),
    ("https://www.reinsaat.at/shop/DE/bohnen/", None),
    ("https://www.reinsaat.at/shop/DE/karotten_moehren_1/", "Daucus carota"),
    ("https://www.reinsaat.at/shop/DE/rote_ruebe/", "Beta vulgaris"),
    ("https://www.reinsaat.at/shop/DE/blumen_und_heilkraeuter/", None),
]

# ── Known Latin name genera we can match ────────────────────────────────────
# Regex alternation ("A|B|C") that is interpolated verbatim into LATIN_PATTERN.
KNOWN_GENERA = (
    "Solanum|Cucurbita|Vicia|Phaseolus|Glycine|Daucus|Beta|Borago|Lavandula|"
    "Salvia|Melissa|Thymus|Calendula|Allium|Ocimum|Satureja|Origanum|Anethum|"
    "Foeniculum|Carum|Nigella|Levisticum|Rumex|Majorana|Hyssopus|Coriandrum|"
    "Petroselinum|Eruca|Tropaeolum|Lupinus|Helianthus|Tagetes|Zinnia|Cosmos|"
    "Papaver|Centaurea|Matricaria|Chrysanthemum|Antirrhinum|Lathyrus|Ipomoea|"
    "Phacelia|Trifolium|Symphytum|Urtica|Fragaria|Sambucus"
)

# Matches "<Genus> <lowercase word>", optionally followed by " L" / " L." and
# by a "ssp."/"var."/"subsp." rank plus one lowercase word. The match is
# case-sensitive and has no word-boundary anchors, so the second word is any
# run of lowercase ASCII letters after the genus — not necessarily a real
# epithet (e.g. ordinary prose following a genus name also matches).
LATIN_PATTERN = re.compile(
    rf'((?:{KNOWN_GENERA})\s+[a-z]+(?:\s+L\.?)?(?:\s+(?:ssp|var|subsp)\.\s+[a-z]+)?)'
)


# ── HTML helpers ────────────────────────────────────────────────────────────
class TextExtractor(HTMLParser):
    """Collect the visible text fragments of an HTML document.

    After ``feed()``, ``parts`` holds every non-blank text node (stripped of
    surrounding whitespace) that is not nested inside a ``<script>``,
    ``<style>`` or ``<noscript>`` element, in document order.
    """

    # Elements whose content is never shown to the reader.
    _HIDDEN_TAGS = frozenset({"script", "style", "noscript"})

    def __init__(self):
        super().__init__()
        self.parts = []
        # Nesting depth of currently open hidden elements.
        self._skip = 0

    def handle_starttag(self, tag, attrs):
        if tag not in self._HIDDEN_TAGS:
            return
        self._skip += 1

    def handle_endtag(self, tag):
        if tag not in self._HIDDEN_TAGS:
            return
        # A stray closing tag must not push the depth below zero.
        if self._skip:
            self._skip -= 1

    def handle_data(self, data):
        if self._skip:
            return
        text = data.strip()
        if text:
            self.parts.append(text)


def extract_links(html: str, base_url: str) -> list[str]:
    """Return the absolute targets of all double-quoted ``<a href="...">`` links.

    Relative hrefs are resolved against ``base_url``. Empty hrefs, pure
    fragments (``#...``) and ``javascript:`` pseudo-links are ignored.
    Duplicates are dropped; the first occurrence determines the order.
    """
    # dict keys give us de-duplication while preserving insertion order.
    resolved = {}
    for href in re.findall(r'<a\s[^>]*href="([^"]*)"', html, re.IGNORECASE):
        if not href or href.startswith(("#", "javascript:")):
            continue
        resolved.setdefault(urllib.parse.urljoin(base_url, href), None)
    return list(resolved)


# <script type="application/ld+json"> blocks; the type attribute may be
# double-quoted, single-quoted or unquoted.
_JSONLD_SCRIPT_RE = re.compile(
    r'''<script[^>]*type=["']?application/ld\+json["']?[^>]*>(.*?)</script>''',
    re.DOTALL | re.IGNORECASE,
)


def _find_jsonld_product(node) -> Optional[dict]:
    """Return the first Product object inside a decoded JSON-LD value.

    Looks at the node itself, at the items of a top-level array, and at the
    members of an ``@graph`` container. ``@type`` may be a string or a list.
    """
    if isinstance(node, list):
        for item in node:
            found = _find_jsonld_product(item)
            if found is not None:
                return found
        return None
    if not isinstance(node, dict):
        return None
    node_type = node.get("@type")
    if node_type == "Product" or (isinstance(node_type, list) and "Product" in node_type):
        return node
    if "@graph" in node:
        return _find_jsonld_product(node["@graph"])
    return None


def extract_jsonld_product(html: str) -> Optional[dict]:
    """Extract the JSON-LD Product object from HTML, if present.

    Every ``application/ld+json`` script block is decoded in document order;
    blocks that are not valid JSON are skipped. Besides a bare Product object,
    a Product inside a top-level array or an ``@graph`` container is also
    recognised, as is an ``@type`` list that contains "Product".

    Returns:
        The first Product dict found, or None if the page has none.
    """
    for m in _JSONLD_SCRIPT_RE.finditer(html):
        try:
            data = json.loads(m.group(1))
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        product = _find_jsonld_product(data)
        if product is not None:
            return product
    return None


# ── HTTP helpers ────────────────────────────────────────────────────────────
_ssl_ctx = ssl.create_default_context()

# 4xx statuses that are worth retrying; every other 4xx is treated as permanent.
_RETRYABLE_CLIENT_STATUS = frozenset({408, 425, 429})


def fetch_url(url: str, retries: int = 2) -> str:
    """Fetch a URL and return its body as text, retrying transient failures.

    Args:
        url: Absolute URL to fetch.
        retries: Number of additional attempts after the first one. Network
            errors, timeouts, 5xx responses and 408/425/429 are retried after
            a 2 second pause; other HTTP errors (e.g. 404) are raised at once
            because repeating the request cannot succeed.

    Returns:
        The decoded response body. Bytes that are invalid in the declared
        charset are replaced rather than raising, and an unknown charset
        label falls back to UTF-8. A negative ``retries`` performs no request
        and returns "".

    Raises:
        urllib.error.HTTPError, urllib.error.URLError, TimeoutError: when the
        last permitted attempt fails.
    """
    req = urllib.request.Request(url, headers={
        "User-Agent": USER_AGENT,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "de-AT,de;q=0.9,en;q=0.5",
    })
    for attempt in range(retries + 1):
        try:
            with urllib.request.urlopen(req, timeout=30, context=_ssl_ctx) as resp:
                charset = resp.headers.get_content_charset() or "utf-8"
                raw = resp.read()
        except urllib.error.HTTPError as e:
            # HTTPError is a URLError subclass, so it must be handled first.
            transient = e.code >= 500 or e.code in _RETRYABLE_CLIENT_STATUS
            if not transient or attempt >= retries:
                raise
        except (urllib.error.URLError, TimeoutError):
            if attempt >= retries:
                raise
        else:
            try:
                return raw.decode(charset, errors="replace")
            except LookupError:
                # The server announced a charset Python does not know.
                return raw.decode("utf-8", errors="replace")
        time.sleep(2)
    # Only reachable when retries < 0 (the loop body never ran).
    return ""


def api_get(path: str):
    """Send an authenticated GET to ``API_BASE + path`` and return the decoded JSON.

    HTTP and network errors from ``urlopen`` propagate to the caller, as does
    a JSON decoding error for a malformed body.
    """
    request_headers = {
        "Authorization": f"Bearer {AUTH_TOKEN}",
        "Accept": "application/json",
    }
    request = urllib.request.Request(f"{API_BASE}{path}", headers=request_headers)
    with urllib.request.urlopen(request, timeout=15) as response:
        body = response.read()
    return json.loads(body)


def api_post(path: str, data: dict):
    """POST ``data`` as JSON to ``API_BASE + path`` and return the decoded reply.

    Args:
        path: Request path appended to API_BASE.
        data: JSON-serialisable request body.

    Returns:
        The decoded JSON response, or None when the server answers
        successfully with an empty body (e.g. 204 No Content) — previously
        that case raised a JSON decoding error and was reported by callers
        as a failed request.

    Raises:
        urllib.error.HTTPError: on a non-2xx status; the first 500 characters
            of the error body are printed before re-raising.
        urllib.error.URLError: on connection problems.
    """
    body = json.dumps(data).encode("utf-8")
    req = urllib.request.Request(
        f"{API_BASE}{path}",
        data=body,
        headers={
            "Authorization": f"Bearer {AUTH_TOKEN}",
            "Content-Type": "application/json",
            "Accept": "application/json",
        },
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=15) as resp:
            raw = resp.read()
    except urllib.error.HTTPError as e:
        error_body = e.read().decode("utf-8", errors="replace")
        print(f"    API ERROR {e.code}: {error_body[:500]}")
        raise
    if not raw.strip():
        return None
    return json.loads(raw)


# ── Species matching ────────────────────────────────────────────────────────
def load_species() -> dict:
    """Load all species from the API, following pagination.

    Two response envelopes are understood:

    * ``{"data": [...], "pagination": {"total_pages": N}}``
    * ``{"data": [...], "total": T, "per_page": P}`` — the format the cultivar
      loader in ``main`` is written against. Previously this shape stopped
      after the first page, so at most 100 species were loaded.

    A bare list response, or a dict without either pagination hint, is
    treated as a single page. An empty page always ends the loop.

    Returns:
        Dict mapping the lowercased, stripped ``name_scientific`` to the
        species dict returned by the API.
    """
    result = {}
    page = 1
    while True:
        data = api_get(f"/species?per_page=100&page={page}")
        species_list = data.get("data", data) if isinstance(data, dict) else data
        if not species_list:
            break
        for s in species_list:
            key = s["name_scientific"].lower().strip()
            result[key] = s
        if not isinstance(data, dict):
            break
        if "pagination" in data:
            if page >= data["pagination"].get("total_pages", 1):
                break
        elif "total" in data:
            # "or 100" guards against a zero/None per_page looping forever.
            per_page = data.get("per_page") or 100
            if page * per_page >= data["total"]:
                break
        else:
            break
        page += 1
    return result


def match_species(latin_name: str, species_map: dict) -> Optional[dict]:
    """Match a Latin name to an existing species.

    Matching is attempted in decreasing order of confidence:

    1. the cleaned name (trailing author abbreviation such as "L." and any
       "ssp."/"subsp."/"var." rank removed), lowercased;
    2. its first two words (genus + epithet);
    3. the genus alone — but only when exactly one species of that genus is
       present in ``species_map``. With several candidates the choice would
       depend on dict order and could file a cultivar under the wrong
       species, so an ambiguous genus yields no match.

    Args:
        latin_name: Scraped Latin name; may be empty.
        species_map: Lowercase scientific name -> species dict.

    Returns:
        The matching species dict, or None.
    """
    if not latin_name:
        return None

    # Clean the name: remove author citations, subspecies
    clean = latin_name.strip()
    clean = re.sub(r'\s+L\.\s*$', '', clean)
    clean = re.sub(r'\s+[A-Z][a-z]*\.\s*$', '', clean)
    clean = re.sub(r'\s+(?:ssp|subsp|var)\.\s+\S+', '', clean)

    key = clean.lower().strip()
    if key in species_map:
        return species_map[key]

    # Try genus + species (first two words)
    parts = key.split()
    if len(parts) >= 2:
        two = f"{parts[0]} {parts[1]}"
        if two in species_map:
            return species_map[two]

    # Genus-only fallback, restricted to an unambiguous genus.
    if parts:
        genus_prefix = parts[0] + " "
        candidates = [
            species for name, species in species_map.items()
            if name.startswith(genus_prefix)
        ]
        if len(candidates) == 1:
            return candidates[0]

    return None


# ── Product data extraction ─────────────────────────────────────────────────
@dataclass
class ProductData:
    """Data scraped from one product page, as filled in by parse_product()."""
    # Product name, read from the JSON-LD "name" field.
    name: str = ""
    # Latin name found in the page text, or the category's species hint;
    # empty when neither is available.
    latin_name: str = ""
    # Read from the JSON-LD "description" field.
    description: str = ""
    # Article number, read from the JSON-LD "model" field.
    sku: str = ""
    # URL of the product page the data came from.
    url: str = ""
    # Defaults to True; parse_product() never changes it.
    # NOTE(review): presumably because the supplier sells only organic seed — confirm.
    is_organic: bool = True
    # Sowing depth in cm; the mean when the page gives a range. None if not found.
    sowing_depth_cm: Optional[float] = None
    # First value (or range mean) of a "ROW x PLANT cm" spacing. None if not found.
    row_spacing_cm: Optional[float] = None
    # Second value (or range mean) of a "ROW x PLANT cm" spacing. None if not found.
    plant_spacing_cm: Optional[float] = None
    # Germination temperature in °C (mean of a range). None if not found or
    # rejected by the plausibility check in parse_product().
    germination_temp_c: Optional[float] = None
    # True when the page text matches one of the perennial keywords.
    perennial: bool = False

# Sowing-depth phrases, tried in order against the visible page text.
_DEPTH_PATTERNS = (
    r'(?:Saattiefe|Aussaattiefe|Ablagetiefe)[:\s]*(?:ca\.?\s*)?(\d+(?:[.,]\d+)?)\s*[-–]\s*(\d+(?:[.,]\d+)?)\s*cm',
    r'(?:Saattiefe|Aussaattiefe|Ablagetiefe)[:\s]*(?:ca\.?\s*)?(\d+(?:[.,]\d+)?)\s*cm',
    r'(\d+(?:[.,]\d+)?)\s*[-–]\s*(\d+(?:[.,]\d+)?)\s*cm\s+(?:tief|Tiefe)',
)

# Fallback applied to the raw HTML: a depth range within 30 non-digit
# characters of a depth keyword (e.g. "0,5–1 cm").
_DEPTH_HTML_FALLBACK = (
    r'(?:Saattiefe|Ablagetiefe|Aussaattiefe|Saatgutablage)\D{0,30}?(\d+(?:[.,]\d+)?)\s*[-–]\s*(\d+(?:[.,]\d+)?)\s*cm'
)

# "ROW x PLANT cm" spacing phrases, tried in order.
_SPACING_PATTERNS = (
    # "30–40 x 2–4 cm" (range x range)
    r'(\d+)\s*[-–]\s*(\d+)\s*[x×]\s*(\d+)\s*[-–]\s*(\d+)\s*cm',
    # "100 x 50 cm" (simple)
    r'(\d+(?:[.,]\d+)?)\s*[x×]\s*(\d+(?:[.,]\d+)?)\s*cm',
)

# Germination-temperature phrases, tried in order.
_TEMP_PATTERNS = (
    # "Keimtemperatur: 18–22 °C"
    r'(?:Keimtemperatur|Keimtemp)[.:\s]*(?:ca\.?\s*)?(\d+)\s*[-–]\s*(\d+)\s*°?\s*C',
    # "Keimtemperatur: ca. 20 °C" (single value)
    r'(?:Keimtemperatur|Keimtemp)[.:\s]*(?:ca\.?\s*)?(\d+)\s*°\s*C',
    # "18-22 °C", "18 bis 22 °C", "18 und 22 °C". The separator is mandatory:
    # when it was optional, a single value such as "19 °C" was split into the
    # "range" 1..9 and stored as 5 °C.
    r'(?<!\d)(\d+)\s*(?:[-–]|bis|und)\s*(\d+)\s*°\s*C',
    # "mindestens 12 °C"
    r'(?:mindestens|mind\.)\s*(\d+)\s*°\s*C',
)

# Perennial keywords; group 1 captures a directly preceding "nicht" so that
# phrases like "nicht winterhart" do not count as evidence.
_PERENNIAL_RE = re.compile(
    r'(\bnicht\s+)?(?:mehrj[aä]hrig|winterhart|ausdauernd|Halbstrauch|Staude)',
    re.IGNORECASE,
)


def _to_float(text: str) -> float:
    """Convert a number written with a decimal comma or point to float."""
    return float(text.replace(",", "."))


def _jsonld_text(jsonld: dict, key: str) -> str:
    """Return a JSON-LD field as stripped text; non-scalar or missing values give ''."""
    value = jsonld.get(key, "")
    if isinstance(value, str):
        return value.strip()
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return str(value)
    return ""


def _extract_latin_name(html: str, full_text: str, default_species: Optional[str]) -> str:
    """Find a Latin name in the page text, then in <i>/<em> tags, else use the hint."""
    m = LATIN_PATTERN.search(full_text)
    if m:
        return m.group(1).strip()
    # \b keeps <img>, <input>, <iframe>, ... from being mistaken for <i>.
    for italic in re.finditer(r'<(?:i|em)\b[^>]*>(.*?)</(?:i|em)>', html, re.IGNORECASE | re.DOTALL):
        inner = re.sub(r'<[^>]+>', '', italic.group(1)).strip()
        m = LATIN_PATTERN.search(inner)
        if m:
            return m.group(1).strip()
    return default_species or ""


def _extract_sowing_depth(html: str, full_text: str) -> Optional[float]:
    """Return the sowing depth in cm (mean of a range), or None if not stated."""
    for pat in _DEPTH_PATTERNS:
        dm = re.search(pat, full_text, re.IGNORECASE)
        if dm:
            vals = [_to_float(g) for g in dm.groups()]
            return sum(vals) / len(vals)
    dm = re.search(_DEPTH_HTML_FALLBACK, html, re.IGNORECASE)
    if dm:
        return (_to_float(dm.group(1)) + _to_float(dm.group(2))) / 2
    return None


def _extract_spacing(full_text: str) -> tuple[Optional[float], Optional[float]]:
    """Return (row_spacing_cm, plant_spacing_cm); (None, None) if not stated."""
    for pat in _SPACING_PATTERNS:
        matches = re.findall(pat, full_text, re.IGNORECASE)
        if not matches:
            continue
        # Prefer the last match (often the more relevant outdoor spacing)
        nums = [_to_float(v) for v in matches[-1]]
        if len(nums) == 4:
            return (nums[0] + nums[1]) / 2, (nums[2] + nums[3]) / 2
        return nums[0], nums[1]
    return None, None


def _extract_germination_temp(full_text: str) -> Optional[float]:
    """Return the germination temperature in °C (mean of a range), or None."""
    for pat in _TEMP_PATTERNS:
        tm = re.search(pat, full_text, re.IGNORECASE)
        if not tm:
            continue
        vals = [float(g) for g in tm.groups()]
        avg = sum(vals) / len(vals)
        # Plausibility window: values outside 5-40 °C are treated as a
        # false match and the next pattern is tried.
        if 5 <= avg <= 40:
            return avg
    return None


def parse_product(html: str, url: str, default_species: Optional[str] = None) -> Optional[ProductData]:
    """Parse a product page.

    A page counts as a product page when it carries a JSON-LD Product object.
    Name, description and article number ("model") come from that object;
    Latin name, sowing depth, spacing, germination temperature and the
    perennial flag are pattern-matched from the page's visible text.

    Args:
        html: Raw HTML of the page.
        url: URL the page was fetched from; stored on the result.
        default_species: Latin name to use when none is found on the page.

    Returns:
        A populated ProductData, or None if the page is not a product page.
        Growing-data fields stay None when the page does not state them.
    """
    jsonld = extract_jsonld_product(html)
    if not jsonld:
        return None  # Not a product page

    product = ProductData(url=url)

    # ── From JSON-LD ──
    # Coerced defensively: a null or nested value must not abort the crawl.
    product.name = _jsonld_text(jsonld, "name")
    product.description = _jsonld_text(jsonld, "description")
    product.sku = _jsonld_text(jsonld, "model")

    # ── Extract full text for pattern matching ──
    extractor = TextExtractor()
    extractor.feed(html)
    full_text = " ".join(extractor.parts)

    product.latin_name = _extract_latin_name(html, full_text, default_species)
    product.sowing_depth_cm = _extract_sowing_depth(html, full_text)
    product.row_spacing_cm, product.plant_spacing_cm = _extract_spacing(full_text)
    product.germination_temp_c = _extract_germination_temp(full_text)

    # ── Perennial ──
    # Any keyword occurrence that is not directly negated counts.
    product.perennial = any(
        pm.group(1) is None for pm in _PERENNIAL_RE.finditer(full_text)
    )

    return product


# ── Recursive product discovery ─────────────────────────────────────────────
def discover_products(
    category_url: str,
    default_species: Optional[str],
    max_depth: int = 3,
    _depth: int = 0,
    _visited: set = None,
) -> list[ProductData]:
    """Walk a category tree depth-first and return every product page found.

    A fetched page that parses as a product is returned as a single-element
    list. Otherwise its links are filtered down to direct children of the
    current path (exactly one path segment deeper, on www.reinsaat.at) and
    each child is visited recursively.

    Args:
        category_url: Page to start from.
        default_species: Latin-name hint passed on to parse_product().
        max_depth: Deepest recursion level that is still fetched.
        _depth: Current recursion level (internal).
        _visited: URLs already fetched; shared across calls so no page is
            requested twice (internal, created on first call).

    Returns:
        Parsed products in discovery order; empty on fetch errors, on an
        already-visited URL, or beyond ``max_depth``.
    """
    if _visited is None:
        _visited = set()
    if _depth > max_depth or category_url in _visited:
        return []
    _visited.add(category_url)

    pad = "  " * (_depth + 1)
    print(f"{pad}Fetching: {category_url}")

    try:
        page_html = fetch_url(category_url)
        time.sleep(DELAY)
    except Exception as exc:
        print(f"{pad}  ERROR: {exc}")
        return []

    # A page carrying product data is a leaf: do not follow its links.
    leaf = parse_product(page_html, category_url, default_species)
    if leaf:
        return [leaf]

    # Category page: keep only links that sit directly below this path.
    path_prefix = urllib.parse.urlparse(category_url).path.rstrip("/") + "/"
    # dict keys de-duplicate while keeping first-seen order.
    children = {}
    for href in extract_links(page_html, category_url):
        target = urllib.parse.urlparse(href)
        if target.netloc and target.netloc != "www.reinsaat.at":
            continue
        target_path = target.path.rstrip("/")
        if not target_path.startswith(path_prefix):
            continue
        segment = target_path[len(path_prefix):]
        # Exactly one additional, non-empty path segment.
        if not segment or "/" in segment:
            continue
        normalized = f"https://www.reinsaat.at{target_path}/"
        if normalized not in _visited:
            children[normalized] = None

    child_links = list(children)
    print(f"{pad}  Found {len(child_links)} child links")

    products = []
    for child_url in child_links:
        products.extend(
            discover_products(child_url, default_species, max_depth, _depth + 1, _visited)
        )
    return products


# ── Main ────────────────────────────────────────────────────────────────────
def _load_existing_cultivars() -> dict:
    """Load all cultivars from the API as {(species_id, lowercase name): cultivar_id}."""
    existing = {}
    page = 1
    while True:
        data = api_get(f"/cultivars?per_page=100&page={page}")
        clist = data.get("data", data) if isinstance(data, dict) else data
        if not clist:
            break
        for c in clist:
            existing[(c["species_id"], c["name"].lower())] = c["id"]
        # Check pagination - API uses {data, total, page, per_page} format
        if not isinstance(data, dict):
            break
        total = data.get("total", len(clist))
        per_page = data.get("per_page", 100)
        if page * per_page >= total:
            break
        page += 1
    return existing


def _link_supplier(cultivar_id: str, product: ProductData) -> None:
    """Attach the Reinsaat supplier record (product URL + article number) to a cultivar."""
    api_post(f"/cultivars/{cultivar_id}/suppliers", {
        "supplier_id": REINSAAT_SUPPLIER_ID,
        "product_url": product.url,
        "article_number": product.sku,
    })


def _build_cultivar_payload(species_id: str, product: ProductData) -> dict:
    """Build the POST /cultivars body; growing data is included only when known."""
    payload = {
        "species_id": species_id,
        "name": product.name,
        "name_de": product.name,
        "name_en": "",
        "description": product.description,
        "is_organic": product.is_organic,
        "perennial": product.perennial,
    }
    if product.sowing_depth_cm is not None:
        payload["planting_depth_cm"] = round(product.sowing_depth_cm, 2)
    if product.row_spacing_cm is not None:
        payload["row_spacing_cm"] = round(product.row_spacing_cm, 1)
    if product.plant_spacing_cm is not None:
        payload["plant_spacing_cm"] = round(product.plant_spacing_cm, 1)
    if product.germination_temp_c is not None:
        payload["germination_temp_c"] = round(product.germination_temp_c, 1)
    return payload


def main():
    """Run the scraper end to end.

    Steps: load species and existing cultivars from the API, crawl every
    entry in CATEGORIES, then for each unique product either link the
    existing cultivar to the supplier or create a new cultivar and link it.
    Progress and a final summary are printed to stdout. Per-product API
    failures are reported and counted but do not stop the run; failures
    while loading species/cultivars propagate.
    """
    print("=" * 70)
    print("Reinsaat Scraper -> HerbAPI")
    print("=" * 70)

    # Load species
    print("\n[1] Loading species from API...")
    species_map = load_species()
    sci_names = [k for k in species_map if " " in k]
    print(f"    {len(sci_names)} species loaded:")
    for k in sorted(sci_names):
        s = species_map[k]
        print(f"      {s['name_scientific']:40s} {s['id'][:12]}...")

    # Load existing cultivars
    print("\n[2] Loading existing cultivars...")
    existing_cultivars = _load_existing_cultivars()
    print(f"    {len(existing_cultivars)} existing cultivars")

    # Discover products from all categories
    print("\n[3] Discovering products from Reinsaat categories...")
    all_products: list[ProductData] = []
    # Shared across categories so a page reachable from two of them is fetched once.
    visited: set[str] = set()

    for cat_url, species_hint in CATEGORIES:
        print(f"\n  Category: {cat_url}")
        products = discover_products(cat_url, species_hint, max_depth=3, _visited=visited)
        all_products.extend(products)
        print(f"  -> {len(products)} products from this category")

    print(f"\n  Total products discovered: {len(all_products)}")

    # Deduplicate by URL, keeping the first occurrence
    by_url = {}
    for p in all_products:
        by_url.setdefault(p.url, p)
    all_products = list(by_url.values())
    print(f"  Unique products: {len(all_products)}")

    # Process products
    print("\n[4] Creating cultivars in API...")
    stats = {"created": 0, "skipped_no_species": 0, "skipped_exists": 0, "errors": 0, "linked": 0}

    for i, product in enumerate(all_products):
        pct = (i + 1) / len(all_products) * 100
        print(f"\n  [{i+1}/{len(all_products)}] ({pct:.0f}%) {product.name}")

        # Match species
        species = match_species(product.latin_name, species_map)
        if not species:
            print(f"    Skip: no species match for '{product.latin_name}'")
            stats["skipped_no_species"] += 1
            continue

        species_id = species["id"]
        print(f"    Species: {species['name_scientific']}")
        print(f"    SKU: {product.sku}, Depth: {product.sowing_depth_cm}, "
              f"Spacing: {product.row_spacing_cm}x{product.plant_spacing_cm}, "
              f"Temp: {product.germination_temp_c}, Perennial: {product.perennial}")

        # Check duplicates
        key = (species_id, product.name.lower())
        if key in existing_cultivars:
            # Still try to link supplier if cultivar exists
            cultivar_id = existing_cultivars[key]
            print(f"    Exists: {cultivar_id[:12]}... - checking supplier link")
            try:
                _link_supplier(cultivar_id, product)
            except Exception as e:
                # Best-effort: the link may already exist. Report instead of
                # hiding the reason so real failures stay visible.
                print(f"    Supplier link not created: {e}")
            else:
                print(f"    Linked to Reinsaat (SKU: {product.sku})")
                stats["linked"] += 1
            stats["skipped_exists"] += 1
            continue

        # Create cultivar
        try:
            result = api_post("/cultivars", _build_cultivar_payload(species_id, product))
            cultivar_id = result["id"]
        except Exception as e:
            print(f"    FAILED to create: {e}")
            stats["errors"] += 1
            continue
        print(f"    Created: {cultivar_id}")
        stats["created"] += 1
        # Remember it so a second product with the same name is not re-created.
        existing_cultivars[key] = cultivar_id

        # Link to supplier
        try:
            _link_supplier(cultivar_id, product)
        except Exception as e:
            print(f"    FAILED to link supplier: {e}")
        else:
            print(f"    Linked to Reinsaat (SKU: {product.sku})")
            stats["linked"] += 1

    # Summary
    print("\n" + "=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"  Created:              {stats['created']}")
    print(f"  Linked to supplier:   {stats['linked']}")
    print(f"  Skipped (no species): {stats['skipped_no_species']}")
    print(f"  Skipped (exists):     {stats['skipped_exists']}")
    print(f"  Errors:               {stats['errors']}")
    print("=" * 70)


if __name__ == "__main__":
    main()