#!/usr/bin/env python3
"""
Scrape cultivar data from Reinsaat (reinsaat.at) and push into HerbAPI.

Strategy:
1. Fetch category pages, recursively discover product pages via JSON-LD detection
2. Extract structured data from JSON-LD Product schema + HTML text for growing data
3. Match Latin names to existing species in the API
4. Create cultivar records and link them to Reinsaat supplier
"""

import json
import os
import re
import ssl
import time
import urllib.error
import urllib.parse
import urllib.request
from dataclasses import dataclass
from html import unescape
from html.parser import HTMLParser
from typing import Optional

# ── Config ──────────────────────────────────────────────────────────────────

API_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
# NOTE(security): a bearer token is committed in source. A value supplied via
# the HERBAPI_AUTH_TOKEN environment variable takes precedence; the literal is
# kept only as a backward-compatible fallback and should be rotated/removed.
AUTH_TOKEN = os.environ.get(
    "HERBAPI_AUTH_TOKEN", "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
)
REINSAAT_SUPPLIER_ID = "019ced24-1702-72d1-9acc-90435441a5c4"
DELAY = 0.5  # seconds between requests
USER_AGENT = "HerbAPI-Scraper/1.0 (florian.berthold@sub-net.at)"

# ── Categories to scrape ────────────────────────────────────────────────────
# (category_url, default_species_hint for leaf pages in this category)
CATEGORIES = [
    ("https://www.reinsaat.at/shop/DE/tomaten_paradeiser/", "Solanum lycopersicum"),
    ("https://www.reinsaat.at/shop/DE/kuechen-_und_gewuerzkraeuter/", None),
    ("https://www.reinsaat.at/shop/DE/kuerbis/", None),
    ("https://www.reinsaat.at/shop/DE/zucchini/", "Cucurbita pepo"),
    ("https://www.reinsaat.at/shop/DE/bohnen/", None),
    ("https://www.reinsaat.at/shop/DE/karotten_moehren_1/", "Daucus carota"),
    ("https://www.reinsaat.at/shop/DE/rote_ruebe/", "Beta vulgaris"),
    ("https://www.reinsaat.at/shop/DE/blumen_und_heilkraeuter/", None),
]

# ── Known Latin name genera we can match ────────────────────────────────────
KNOWN_GENERA = (
    "Solanum|Cucurbita|Vicia|Phaseolus|Glycine|Daucus|Beta|Borago|Lavandula|"
    "Salvia|Melissa|Thymus|Calendula|Allium|Ocimum|Satureja|Origanum|Anethum|"
    "Foeniculum|Carum|Nigella|Levisticum|Rumex|Majorana|Hyssopus|Coriandrum|"
    "Petroselinum|Eruca|Tropaeolum|Lupinus|Helianthus|Tagetes|Zinnia|Cosmos|"
    "Papaver|Centaurea|Matricaria|Chrysanthemum|Antirrhinum|Lathyrus|Ipomoea|"
    "Phacelia|Trifolium|Symphytum|Urtica|Fragaria|Sambucus"
)
# Genus + epithet, optionally followed by "L."/"L" and an infraspecific rank.
LATIN_PATTERN = re.compile(
    rf'((?:{KNOWN_GENERA})\s+[a-z]+(?:\s+L\.?)?(?:\s+(?:ssp|var|subsp)\.\s+[a-z]+)?)'
)


# ── HTML helpers ────────────────────────────────────────────────────────────

class TextExtractor(HTMLParser):
    """Collect the visible text of an HTML document.

    Non-empty, stripped text chunks are appended to ``self.parts``; anything
    inside <script>, <style> or <noscript> is ignored.
    """

    _HIDDEN_TAGS = ("script", "style", "noscript")

    def __init__(self):
        super().__init__()
        self.parts = []
        self._skip = 0  # nesting depth of currently open hidden tags

    def handle_starttag(self, tag, attrs):
        if tag in self._HIDDEN_TAGS:
            self._skip += 1

    def handle_endtag(self, tag):
        # Guard against stray closing tags driving the counter negative.
        if tag in self._HIDDEN_TAGS and self._skip > 0:
            self._skip -= 1

    def handle_data(self, data):
        if self._skip == 0:
            text = data.strip()
            if text:
                self.parts.append(text)


# href attribute of an <a> element, with either quote style.
_ANCHOR_HREF_RE = re.compile(
    r'<a\b[^>]*?\shref\s*=\s*(["\'])(.*?)\1', re.IGNORECASE | re.DOTALL
)
# href prefixes that never lead to a crawlable page.
_NON_PAGE_PREFIXES = ("#", "javascript:", "mailto:", "tel:")


def extract_links(html: str, base_url: str) -> list[str]:
    """Return the absolute targets of all <a href> links in *html*.

    Relative URLs are resolved against *base_url*, HTML entities in the
    attribute value are decoded, duplicates are dropped and first-seen order
    is preserved. Fragment-only, javascript:, mailto: and tel: links are
    skipped.
    """
    links = []
    seen = set()
    for match in _ANCHOR_HREF_RE.finditer(html):
        href = unescape(match.group(2)).strip()
        if not href or href.lower().startswith(_NON_PAGE_PREFIXES):
            continue
        absolute = urllib.parse.urljoin(base_url, href)
        if absolute not in seen:
            seen.add(absolute)
            links.append(absolute)
    return links


# Body of every <script type="application/ld+json"> element.
_JSONLD_SCRIPT_RE = re.compile(
    r'<script\b[^>]*\btype\s*=\s*["\']application/ld\+json["\'][^>]*>(.*?)</script\s*>',
    re.DOTALL | re.IGNORECASE,
)


def _iter_jsonld_nodes(data):
    """Yield every dict reachable through top-level lists and ``@graph``."""
    if isinstance(data, list):
        for item in data:
            yield from _iter_jsonld_nodes(item)
    elif isinstance(data, dict):
        yield data
        graph = data.get("@graph")
        if isinstance(graph, list):
            yield from _iter_jsonld_nodes(graph)


def _is_product_node(node: dict) -> bool:
    """Return True if the node's ``@type`` is (or includes) ``Product``."""
    node_type = node.get("@type")
    if isinstance(node_type, list):
        return "Product" in node_type
    return node_type == "Product"


def extract_jsonld_product(html: str) -> Optional[dict]:
    """Return the first JSON-LD ``Product`` object embedded in *html*.

    Every ``application/ld+json`` script block is parsed; blocks containing
    invalid JSON are skipped. Returns None when no Product object exists.
    """
    for match in _JSONLD_SCRIPT_RE.finditer(html):
        try:
            data = json.loads(match.group(1))
        except ValueError:  # json.JSONDecodeError is a ValueError
            continue
        for node in _iter_jsonld_nodes(data):
            if _is_product_node(node):
                return node
    return None


# ── HTTP helpers ────────────────────────────────────────────────────────────

_ssl_ctx = ssl.create_default_context()


def fetch_url(url: str, retries: int = 2) -> str:
    """Fetch *url* and return the decoded response body.

    Up to *retries* additional attempts are made, two seconds apart, after
    network errors, timeouts, HTTP 5xx, 408 and 429. Other HTTP 4xx responses
    are raised immediately because retrying cannot change the outcome.

    Raises:
        urllib.error.URLError (incl. HTTPError) or TimeoutError once the
        attempts are exhausted.
    """
    req = urllib.request.Request(url, headers={
        "User-Agent": USER_AGENT,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "de-AT,de;q=0.9,en;q=0.5",
    })
    for attempt in range(retries + 1):
        try:
            with urllib.request.urlopen(req, timeout=30, context=_ssl_ctx) as resp:
                charset = resp.headers.get_content_charset() or "utf-8"
                # A few undecodable bytes should not discard the whole page.
                return resp.read().decode(charset, errors="replace")
        except urllib.error.HTTPError as err:
            transient = err.code >= 500 or err.code in (408, 429)
            if not transient or attempt >= retries:
                raise
        except (urllib.error.URLError, TimeoutError):
            if attempt >= retries:
                raise
        time.sleep(2)
    # Only reachable when *retries* is negative (no attempt was made).
    return ""


def api_get(path: str):
    """GET ``API_BASE + path`` from HerbAPI and return the decoded JSON."""
    req = urllib.request.Request(
        f"{API_BASE}{path}",
        headers={"Authorization": f"Bearer {AUTH_TOKEN}", "Accept": "application/json"},
    )
    with urllib.request.urlopen(req, timeout=15) as resp:
        return json.loads(resp.read())


def api_post(path: str, data: dict):
    """POST *data* as JSON to ``API_BASE + path``.

    Returns the decoded JSON response, or None when the server answers with
    an empty body. On an HTTP error the response body is printed (truncated
    to 500 characters) and the HTTPError is re-raised.
    """
    body = json.dumps(data).encode("utf-8")
    req = urllib.request.Request(
        f"{API_BASE}{path}",
        data=body,
        headers={
            "Authorization": f"Bearer {AUTH_TOKEN}",
            "Content-Type": "application/json",
            "Accept": "application/json",
        },
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=15) as resp:
            raw = resp.read()
    except urllib.error.HTTPError as e:
        error_body = e.read().decode("utf-8", errors="replace")
        print(f" API ERROR {e.code}: {error_body[:500]}")
        raise
    return json.loads(raw) if raw.strip() else None


def _api_get_all(path: str, per_page: int = 100) -> list:
    """Fetch every page of a list endpoint and return the combined items.

    Handles the response shapes this script has to deal with: a bare list, a
    dict with ``data`` + ``pagination.total_pages``, and a dict with ``data``
    + ``total``/``per_page``. Stops on an empty page or when no paging
    information is present.
    """
    items = []
    page = 1
    while True:
        data = api_get(f"{path}?per_page={per_page}&page={page}")
        if isinstance(data, dict):
            batch = data.get("data") or []
        else:
            batch = data or []
        if not batch:
            break
        items.extend(batch)
        if not isinstance(data, dict):
            break
        pagination = data.get("pagination")
        if isinstance(pagination, dict):
            if page >= pagination.get("total_pages", 1):
                break
        elif "total" in data:
            if page * data.get("per_page", per_page) >= data["total"]:
                break
        else:
            break  # no paging metadata: assume a single page
        page += 1
    return items


# ── Species matching ────────────────────────────────────────────────────────

def load_species() -> dict:
    """Load species from API. Returns dict: lowercase scientific name -> species dict."""
    result = {}
    for species in _api_get_all("/species"):
        key = (species.get("name_scientific") or "").lower().strip()
        if key:
            result[key] = species
    return result


def match_species(latin_name: str, species_map: dict) -> Optional[dict]:
    """Match a Latin name to an existing species. Returns species dict or None.

    Tries, in order: the cleaned full name, the first two words
    (genus + epithet), and finally the first species in *species_map* that
    shares the genus. The genus-only fallback is deliberately permissive and
    may select a different species of the same genus.
    """
    if not latin_name:
        return None

    # Clean the name: remove author citations, subspecies
    clean = latin_name.strip()
    clean = re.sub(r'\s+L\.\s*$', '', clean)
    clean = re.sub(r'\s+[A-Z][a-z]*\.\s*$', '', clean)
    clean = re.sub(r'\s+(?:ssp|subsp|var)\.\s+\S+', '', clean)
    key = clean.lower().strip()

    if key in species_map:
        return species_map[key]

    parts = key.split()
    if not parts:
        return None

    # Try genus + species (first two words)
    if len(parts) >= 2:
        binomial = f"{parts[0]} {parts[1]}"
        if binomial in species_map:
            return species_map[binomial]

    # Try genus-only match (less reliable, but useful for Borago, etc.)
    genus_prefix = parts[0] + " "
    for name, species in species_map.items():
        if name.startswith(genus_prefix):
            return species
    return None


# ── Product data extraction ─────────────────────────────────────────────────

@dataclass
class ProductData:
    """Data scraped from a single product page."""

    name: str = ""
    latin_name: str = ""
    description: str = ""
    sku: str = ""
    url: str = ""
    is_organic: bool = True
    sowing_depth_cm: Optional[float] = None
    row_spacing_cm: Optional[float] = None
    plant_spacing_cm: Optional[float] = None
    germination_temp_c: Optional[float] = None
    perennial: bool = False


_NUM = r'\d+(?:[.,]\d+)?'  # integer or decimal with "." or "," separator

_ITALIC_RE = re.compile(r'<(i|em)\b[^>]*>(.*?)</\1\s*>', re.IGNORECASE | re.DOTALL)
_TAG_RE = re.compile(r'<[^>]+>')

_DEPTH_PATTERNS = tuple(re.compile(p, re.IGNORECASE) for p in (
    rf'(?:Saattiefe|Aussaattiefe|Ablagetiefe)[:\s]*(?:ca\.?\s*)?({_NUM})\s*[-–]\s*({_NUM})\s*cm',
    rf'(?:Saattiefe|Aussaattiefe|Ablagetiefe)[:\s]*(?:ca\.?\s*)?({_NUM})\s*cm',
    rf'({_NUM})\s*[-–]\s*({_NUM})\s*cm\s+(?:tief|Tiefe)',
))
# Applied to raw HTML: a range such as "0,5–1 cm" shortly after a depth keyword.
_DEPTH_HTML_FALLBACK = re.compile(
    rf'(?:Saattiefe|Ablagetiefe|Aussaattiefe|Saatgutablage)\D{{0,30}}?({_NUM})\s*[-–]\s*({_NUM})\s*cm',
    re.IGNORECASE,
)

_SPACING_PATTERNS = tuple(re.compile(p, re.IGNORECASE) for p in (
    # "30–40 x 2–4 cm" (range x range)
    r'(\d+)\s*[-–]\s*(\d+)\s*[x×]\s*(\d+)\s*[-–]\s*(\d+)\s*cm',
    # "100 x 50 cm" (simple)
    rf'({_NUM})\s*[x×]\s*({_NUM})\s*cm',
))

_TEMP_PATTERNS = tuple(re.compile(p, re.IGNORECASE) for p in (
    r'(?:Keimtemperatur|Keimtemp)[.:\s]*(?:ca\.?\s*)?(\d+)\s*[-–]\s*(\d+)\s*°?\s*C',
    r'(?:Keimtemperatur|Keimtemp)[.:\s]*(?:ca\.?\s*)?(\d+)\s*°\s*C',
    # Generic range. A separator is mandatory so that a single value such as
    # "29°C" is not split into the digits 2 and 9.
    r'(?<!\d)(\d+)\s*(?:[-–]|und|bis)\s*(\d+)\s*°\s*C',
    r'(?:mindestens|mind\.)\s*(\d+)\s*°\s*C',
))

_PERENNIAL_RE = re.compile(
    r'mehrj[aä]hrig|winterhart|ausdauernd|Halbstrauch|Staude', re.IGNORECASE
)


def _to_float(number: str) -> float:
    """Parse a number that may use a decimal comma."""
    return float(number.replace(",", "."))


def _extract_latin_name(text: str, html: str, default_species: Optional[str]) -> str:
    """Find a known Latin name in the visible text, then in <i>/<em> tags."""
    match = LATIN_PATTERN.search(text)
    if match:
        return match.group(1).strip()
    # Also check <i>/<em> tags in HTML
    for italic in _ITALIC_RE.finditer(html):
        inner = unescape(_TAG_RE.sub('', italic.group(2))).strip()
        match = LATIN_PATTERN.search(inner)
        if match:
            return match.group(1).strip()
    return default_species or ""


def _extract_sowing_depth(text: str, html: str) -> Optional[float]:
    """Return the sowing depth in cm (midpoint when a range is given)."""
    for pattern in _DEPTH_PATTERNS:
        match = pattern.search(text)
        if match:
            values = [_to_float(g) for g in match.groups()]
            return sum(values) / len(values)
    match = _DEPTH_HTML_FALLBACK.search(html)
    if match:
        return (_to_float(match.group(1)) + _to_float(match.group(2))) / 2
    return None


def _extract_spacing(text: str) -> tuple[Optional[float], Optional[float]]:
    """Return (row_spacing_cm, plant_spacing_cm) from a "ROW x PLANT cm" phrase."""
    for pattern in _SPACING_PATTERNS:
        matches = pattern.findall(text)
        if not matches:
            continue
        # Prefer the last match (often the more relevant outdoor spacing)
        values = [_to_float(v) for v in matches[-1]]
        if len(values) == 4:
            return (values[0] + values[1]) / 2, (values[2] + values[3]) / 2
        return values[0], values[1]
    return None, None


def _extract_germination_temp(text: str) -> Optional[float]:
    """Return the first plausible germination temperature in °C, if any."""
    for pattern in _TEMP_PATTERNS:
        for match in pattern.finditer(text):
            values = [float(g) for g in match.groups()]
            average = sum(values) / len(values)
            # Sanity check: only accept averages between 5 and 40 °C
            if 5 <= average <= 40:
                return average
    return None


def parse_product(html: str, url: str, default_species: Optional[str] = None) -> Optional[ProductData]:
    """Parse a product page. Returns ProductData or None if not a product page.

    A page counts as a product page when it embeds a JSON-LD ``Product``
    object. Name, description and SKU (``model``) come from JSON-LD; Latin
    name, sowing depth, spacing, germination temperature and the perennial
    flag are derived from the page text. *default_species* is used as the
    Latin name when none is found on the page.
    """
    jsonld = extract_jsonld_product(html)
    if not jsonld:
        return None  # Not a product page

    # ── Extract full text for pattern matching ──
    extractor = TextExtractor()
    extractor.feed(html)
    full_text = " ".join(extractor.parts)

    row_spacing, plant_spacing = _extract_spacing(full_text)
    return ProductData(
        # ``or ""`` guards against explicit JSON nulls.
        name=str(jsonld.get("name") or "").strip(),
        latin_name=_extract_latin_name(full_text, html, default_species),
        description=str(jsonld.get("description") or "").strip(),
        sku=str(jsonld.get("model") or "").strip(),
        url=url,
        sowing_depth_cm=_extract_sowing_depth(full_text, html),
        row_spacing_cm=row_spacing,
        plant_spacing_cm=plant_spacing,
        germination_temp_c=_extract_germination_temp(full_text),
        perennial=bool(_PERENNIAL_RE.search(full_text)),
    )


# ── Recursive product discovery ─────────────────────────────────────────────

def discover_products(
    category_url: str,
    default_species: Optional[str],
    max_depth: int = 3,
    _depth: int = 0,
    _visited: Optional[set] = None,
) -> list[ProductData]:
    """Recursively discover and parse product pages under a category URL.

    A fetched page that parses as a product is returned as a single-element
    list. Otherwise its links are filtered to direct children of the current
    path on www.reinsaat.at and each child is visited in turn, up to
    *max_depth* levels. *_visited* is shared across calls so that no URL is
    fetched twice; fetch errors are printed and yield an empty result.
    """
    if _visited is None:
        _visited = set()
    if category_url in _visited or _depth > max_depth:
        return []
    _visited.add(category_url)

    indent = " " * (_depth + 1)
    print(f"{indent}Fetching: {category_url}")
    try:
        html = fetch_url(category_url)
        time.sleep(DELAY)
    except Exception as e:
        print(f"{indent} ERROR: {e}")
        return []

    # Check if this IS a product page
    product = parse_product(html, category_url, default_species)
    if product:
        return [product]

    # It's a category/subcategory page: extract child links
    cat_prefix = urllib.parse.urlparse(category_url).path.rstrip("/") + "/"
    child_links = {}  # dict used as an insertion-ordered set
    for link in extract_links(html, category_url):
        parsed = urllib.parse.urlparse(link)
        if parsed.netloc and parsed.netloc != "www.reinsaat.at":
            continue
        child_path = parsed.path.rstrip("/")
        # Must be a direct child of the category path
        if not child_path.startswith(cat_prefix):
            continue
        relative = child_path[len(cat_prefix):]
        # Exactly one level deeper: non-empty and without further slashes
        if not relative or "/" in relative:
            continue
        # Build clean URL (drops query string and fragment)
        clean_url = f"https://www.reinsaat.at{child_path}/"
        if clean_url not in _visited:
            child_links[clean_url] = None

    print(f"{indent} Found {len(child_links)} child links")

    products = []
    for child_url in child_links:
        products.extend(
            discover_products(child_url, default_species, max_depth, _depth + 1, _visited)
        )
    return products


# ── Main ────────────────────────────────────────────────────────────────────

def _link_supplier(cultivar_id: str, product: ProductData) -> None:
    """Link a cultivar to the Reinsaat supplier; API errors propagate."""
    api_post(f"/cultivars/{cultivar_id}/suppliers", {
        "supplier_id": REINSAAT_SUPPLIER_ID,
        "product_url": product.url,
        "article_number": product.sku,
    })
    print(f" Linked to Reinsaat (SKU: {product.sku})")


def _build_cultivar_payload(species_id: str, product: ProductData) -> dict:
    """Build the POST /cultivars payload, omitting unknown measurements."""
    payload = {
        "species_id": species_id,
        "name": product.name,
        "name_de": product.name,
        "name_en": "",
        "description": product.description,
        "is_organic": product.is_organic,
        "perennial": product.perennial,
    }
    if product.sowing_depth_cm is not None:
        payload["planting_depth_cm"] = round(product.sowing_depth_cm, 2)
    if product.row_spacing_cm is not None:
        payload["row_spacing_cm"] = round(product.row_spacing_cm, 1)
    if product.plant_spacing_cm is not None:
        payload["plant_spacing_cm"] = round(product.plant_spacing_cm, 1)
    if product.germination_temp_c is not None:
        payload["germination_temp_c"] = round(product.germination_temp_c, 1)
    return payload


def main():
    """Run the full scrape: load API state, crawl categories, create cultivars."""
    print("=" * 70)
    print("Reinsaat Scraper -> HerbAPI")
    print("=" * 70)

    # Load species
    print("\n[1] Loading species from API...")
    species_map = load_species()
    sci_names = [k for k in species_map if " " in k]
    print(f" {len(sci_names)} species loaded:")
    for k in sorted(sci_names):
        s = species_map[k]
        print(f" {s['name_scientific']:40s} {s['id'][:12]}...")

    # Load existing cultivars
    print("\n[2] Loading existing cultivars...")
    # (species_id, name_lower) -> cultivar_id
    existing_cultivars = {
        (c["species_id"], c["name"].lower()): c["id"]
        for c in _api_get_all("/cultivars")
    }
    print(f" {len(existing_cultivars)} existing cultivars")

    # Discover products from all categories
    print("\n[3] Discovering products from Reinsaat categories...")
    all_products: list[ProductData] = []
    visited: set[str] = set()
    for cat_url, species_hint in CATEGORIES:
        print(f"\n Category: {cat_url}")
        products = discover_products(cat_url, species_hint, max_depth=3, _visited=visited)
        all_products.extend(products)
        print(f" -> {len(products)} products from this category")
    print(f"\n Total products discovered: {len(all_products)}")

    # Deduplicate by URL, keeping the first occurrence
    unique_by_url = {}
    for p in all_products:
        unique_by_url.setdefault(p.url, p)
    all_products = list(unique_by_url.values())
    print(f" Unique products: {len(all_products)}")

    # Process products
    print("\n[4] Creating cultivars in API...")
    stats = {"created": 0, "skipped_no_species": 0, "skipped_exists": 0, "errors": 0, "linked": 0}
    total = len(all_products)

    for i, product in enumerate(all_products, start=1):
        pct = i / total * 100
        print(f"\n [{i}/{total}] ({pct:.0f}%) {product.name}")

        # Match species
        species = match_species(product.latin_name, species_map)
        if not species:
            print(f" Skip: no species match for '{product.latin_name}'")
            stats["skipped_no_species"] += 1
            continue

        species_id = species["id"]
        print(f" Species: {species['name_scientific']}")
        print(f" SKU: {product.sku}, Depth: {product.sowing_depth_cm}, "
              f"Spacing: {product.row_spacing_cm}x{product.plant_spacing_cm}, "
              f"Temp: {product.germination_temp_c}, Perennial: {product.perennial}")

        # Check duplicates
        key = (species_id, product.name.lower())
        if key in existing_cultivars:
            # Still try to link supplier if cultivar exists
            cultivar_id = existing_cultivars[key]
            print(f" Exists: {cultivar_id[:12]}... - checking supplier link")
            try:
                _link_supplier(cultivar_id, product)
                stats["linked"] += 1
            except Exception as e:
                # Best effort: the link may already exist. Report, don't hide.
                print(f" Not linked (already linked or API error): {e}")
            stats["skipped_exists"] += 1
            continue

        # Create cultivar
        try:
            result = api_post("/cultivars", _build_cultivar_payload(species_id, product))
            cultivar_id = result["id"]
        except Exception as e:
            print(f" FAILED to create: {e}")
            stats["errors"] += 1
            continue
        print(f" Created: {cultivar_id}")
        stats["created"] += 1
        existing_cultivars[key] = cultivar_id

        # Link to supplier
        try:
            _link_supplier(cultivar_id, product)
            stats["linked"] += 1
        except Exception as e:
            print(f" FAILED to link supplier: {e}")

    # Summary
    print("\n" + "=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f" Created: {stats['created']}")
    print(f" Linked to supplier: {stats['linked']}")
    print(f" Skipped (no species): {stats['skipped_no_species']}")
    print(f" Skipped (exists): {stats['skipped_exists']}")
    print(f" Errors: {stats['errors']}")
    print("=" * 70)


if __name__ == "__main__":
    main()