#!/usr/bin/env python3
"""
Scrape Arche Noah seed catalog and import cultivars into HerbAPI.

Uses the shop.arche-noah.at Angular SPA's backend API (ACM) to fetch
product listings and details, then creates cultivars in HerbAPI matched
to existing species.
"""

import json
import os
import re
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from datetime import datetime, timezone

# --- Configuration -----------------------------------------------------------

HERBAPI_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
# NOTE(review): a bearer token is committed in source. The HERBAPI_TOKEN
# environment variable now takes precedence; the literal is kept only as a
# backward-compatible fallback and should be rotated and removed.
HERBAPI_TOKEN = (
    os.environ.get("HERBAPI_TOKEN")
    or "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
)
HERBAPI_PAGE_SIZE = 100  # per_page value sent to HerbAPI list endpoints

SHOP_BASE = "https://shop.arche-noah.at/ACM/api/"
SHOP_UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
SHOP_PAGE_SIZE = 200  # numDataSets requested per shop search call

REQUEST_DELAY = 0.5  # seconds between requests

# Only import products from these Arche Noah article lines (their own seeds)
ARCHE_NOAH_LINES = {
    "Bio-Saatgut von ARCHE NOAH",
    "Kostbarkeiten aus dem ARCHE NOAH Samenarchiv",
}

# Search terms to discover all seed products across the shop.
# Duplicates are harmless: the search loop de-duplicates case-insensitively.
SEARCH_TERMS = [
    "Tomate", "Paradeiser", "Paprika", "Chili", "Gurke", "Kürbis", "Zucchini",
    "Bohne", "Erbse", "Fisole", "Salat", "Kohl", "Kraut", "Melanzani",
    "Aubergine", "Mais", "Zwiebel", "Lauch", "Karotte", "Rübe", "Basilikum",
    "Kräuter", "Blume", "Sonnenblume", "Dill", "Petersilie", "Spinat",
    "Mangold", "Melone", "Fenchel", "Sellerie", "Rettich", "Radieschen",
    "Koriander", "Oregano", "Thymian", "Salbei", "Rosmarin", "Minze",
    "Ringelblume", "Kornblume", "Kapuzinerkresse", "Senf", "Erdbeere",
    "Lupine", "Luzerne", "Klee", "Bohne", "Mohn", "Radicchio", "Rucola",
    "Endivie", "Artischocke", "Pastinake", "Schnittlauch", "Knoblauch",
    "Bärlauch", "Wermut", "Baldrian", "Johanniskraut", "Sonnenhut",
    "Beinwell", "Studentenblume", "Tagetes", "Phacelia", "Buchweizen",
    "Rote Bete", "Rote Rübe", "Mangold", "Melde", "Kohlrabi", "Brokkoli",
    "Blumenkohl", "Rosenkohl", "Wirsing", "Pflücksalat", "Kopfsalat",
    "Feldsalat", "Asiasalat", "Zuckermais", "Popcorn",
]

# Pre-compiled patterns used by the parsing helpers below.
_HTML_TAG_RE = re.compile(r"<[^>]+>")
_LATIN_NAME_RE = re.compile(r"^[A-Z][a-z]+ [a-z]")
_BINOMIAL_RE = re.compile(r"^([A-Za-z]+ [a-z]+)")
_QUOTED_NAME_RE = re.compile(
    r"['\u2018\u2019`\u00b4]+([^'\u2018\u2019`\u00b4]+)['\u2018\u2019`\u00b4]+"
)
_ARTICLE_SUFFIX_RE = re.compile(r"\s+[A-Z]{1,3}\d{2,4}\s*$")
_SESSION_COOKIE_RE = re.compile(r"JSESSIONID=([^;]*)")
# <number>[-<number>] <unit>; the unit must START with a letter so that a bare
# number such as "100" cannot be split into size "10" and unit "0".
_PACK_INFO_RE = re.compile(
    r"\s*(\d+(?:[.,]\d+)?)\s*(?:[-\u2013]\s*\d+(?:[.,]\d+)?)?\s*([^\W\d_]\w*)"
)


# --- Helpers -----------------------------------------------------------------

def herbapi_request(method, path, data=None):
    """Make a request to HerbAPI and return the decoded JSON response.

    Args:
        method: HTTP method, e.g. "GET" or "POST".
        path: Path (plus optional query string) relative to HERBAPI_BASE.
        data: Optional JSON-serialisable payload. Anything other than None is
            sent as the request body (including empty containers).

    Returns:
        The parsed JSON body, or None when the response body is blank.

    Raises:
        urllib.error.HTTPError: on a non-2xx response; the status and the
            first 200 characters of the body are printed to stderr first.
    """
    url = f"{HERBAPI_BASE}/{path}"
    payload = json.dumps(data).encode() if data is not None else None
    req = urllib.request.Request(url, data=payload, method=method, headers={
        "Authorization": f"Bearer {HERBAPI_TOKEN}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    })
    try:
        # Context manager closes the connection even if read/decode fails.
        with urllib.request.urlopen(req, timeout=30) as resp:
            raw = resp.read().decode("utf-8")
    except urllib.error.HTTPError as e:
        error_text = e.read().decode("utf-8", errors="replace")
        print(f" HerbAPI {method} {path}: HTTP {e.code} - {error_text[:200]}", file=sys.stderr)
        raise
    return json.loads(raw) if raw.strip() else None


def shop_create_session():
    """Create an anonymous session on the Arche Noah shop.

    Returns:
        The JSESSIONID value taken from the response's Set-Cookie headers.

    Raises:
        RuntimeError: if no non-empty JSESSIONID cookie is present.
    """
    req = urllib.request.Request(
        SHOP_BASE + "webshop/createanonymoususer",
        data=json.dumps({}).encode(),
        headers={
            "User-Agent": SHOP_UA,
            "Content-Type": "application/json",
            "Origin": "https://shop.arche-noah.at",
            "Referer": "https://shop.arche-noah.at/",
        },
    )
    with urllib.request.urlopen(req, timeout=15) as resp:
        # A response may carry several Set-Cookie headers; inspect all of
        # them rather than only the first one.
        cookies = resp.headers.get_all("Set-Cookie") or []
    for cookie in cookies:
        m = _SESSION_COOKIE_RE.search(cookie)
        if m and m.group(1):
            return m.group(1)
    raise RuntimeError("Failed to get shop session")


def shop_request(session, endpoint, payload):
    """Make a POST request to the shop API.

    Args:
        session: JSESSIONID value, sent back as a cookie.
        endpoint: Path relative to SHOP_BASE.
        payload: JSON-serialisable request body.

    Returns:
        The parsed JSON body, or None when the response body is blank.
    """
    req = urllib.request.Request(
        SHOP_BASE + endpoint,
        data=json.dumps(payload).encode(),
        headers={
            "User-Agent": SHOP_UA,
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Cookie": f"JSESSIONID={session}",
            "Origin": "https://shop.arche-noah.at",
            "Referer": "https://shop.arche-noah.at/",
        },
    )
    with urllib.request.urlopen(req, timeout=30) as resp:
        raw = resp.read().decode("utf-8")
    return json.loads(raw) if raw.strip() else None


def extract_latin_name(detail_headline3):
    """Extract the Latin/botanical name from the product detail headline3 field.

    HTML tags are stripped and everything from "Hier geht" onwards is
    discarded. The remainder is returned only if it starts like a binomial
    (capitalised word, space, lowercase letter); otherwise None.
    """
    if not detail_headline3:
        return None
    # Remove HTML tags
    text = _HTML_TAG_RE.sub("", detail_headline3).strip()
    # Remove "Hier geht es zu unseren..." trailing text
    text = text.split("Hier geht")[0].strip()
    # Should be something like "Solanum lycopersicum" or "Capsicum annuum"
    if text and _LATIN_NAME_RE.match(text):
        return text
    return None


def match_species(latin_name, species_by_scientific):
    """Match a Latin name to a species, handling subspecies/variety suffixes.

    E.g., "Phaseolus vulgaris var. nanus" should match "Phaseolus vulgaris".
    Also handles "subsp.", "convar.", "f." qualifiers, because the fallback
    keeps only the first two words.

    Args:
        latin_name: Botanical name, or None/empty.
        species_by_scientific: Mapping of lowercase scientific name -> species.

    Returns:
        The matching species value, or None.
    """
    if not latin_name:
        return None

    normalized = latin_name.strip().lower()

    # Direct match
    species = species_by_scientific.get(normalized)
    if species:
        return species

    # Strip qualifiers and retry with "genus species" only.
    m = _BINOMIAL_RE.match(normalized)
    if m:
        species = species_by_scientific.get(m.group(1).strip())
        if species:
            return species

    return None


def extract_cultivar_name(product_name):
    """Extract the cultivar/variety name from the product name.

    Format examples:
        "Salatparadeiser 'Naama' HG026" -> "Naama"
        "Cocktailparadeiser 'Golden Perfection' TO019" -> "Golden Perfection"
        "Buschbohne 'Marmorierter Mond' HG055" -> "Marmorierter Mond"

    When no quoted name is present, the product name minus a trailing
    article number (e.g. HG026) is returned.
    """
    # Try to extract name in quotes (various quote styles)
    m = _QUOTED_NAME_RE.search(product_name)
    if m:
        return m.group(1).strip()

    # Fallback: drop the trailing article number like HG026, TO019, etc. and
    # return the rest unchanged (type prefixes are intentionally kept).
    return _ARTICLE_SUFFIX_RE.sub("", product_name).strip()


def parse_pack_info(unit_desc):
    """Parse pack size info from unitDesc like '20-30 Korn' or '2g'.

    For a range the lower bound is used. Decimal sizes with either "," or "."
    as separator are accepted ("0,5 g"). Input without a unit ("100",
    "20-30") yields (None, None) instead of a mis-split number.

    Returns:
        (pack_size, pack_unit) with pack_size as float, or (None, None).
    """
    if not unit_desc:
        return None, None
    m = _PACK_INFO_RE.match(unit_desc)
    if not m:
        return None, None
    return float(m.group(1).replace(",", ".")), m.group(2)


# --- Main scraping logic -----------------------------------------------------

def fetch_all_arche_noah_products(session):
    """Search the shop API to find all Arche Noah seed products.

    Runs one paginated search per distinct entry in SEARCH_TERMS, merges the
    results by product "sid" (first occurrence wins) and keeps only products
    whose "articleLineDesc" is in ARCHE_NOAH_LINES.

    Returns:
        dict mapping sid -> product dict.
    """
    all_products = {}
    seen_terms = set()

    for term in SEARCH_TERMS:
        term_key = term.lower()
        if term_key in seen_terms:
            continue
        seen_terms.add(term_key)

        offset = 0
        term_sids = set()
        while True:
            payload = {
                "searchCriteria": term,
                "startIndex": offset,
                "numDataSets": SHOP_PAGE_SIZE,
                "allowAllProducts": False,
            }
            try:
                data = shop_request(session, "webshop/getproducts", payload)
            except Exception as e:
                # Best effort: a failed search term must not abort the run.
                print(f" Search '{term}' offset={offset} failed: {e}", file=sys.stderr)
                break

            if not data:
                break

            for p in data:
                all_products.setdefault(p["sid"], p)

            if len(data) < SHOP_PAGE_SIZE:
                break

            # Guard against a backend that ignores startIndex and keeps
            # returning the same full page: stop when a page adds nothing
            # new for this term.
            page_sids = {p["sid"] for p in data}
            if page_sids <= term_sids:
                break
            term_sids |= page_sids

            offset += len(data)
            time.sleep(REQUEST_DELAY)

        time.sleep(REQUEST_DELAY)

    # Filter to Arche Noah's own seed products only
    an_products = {
        sid: p for sid, p in all_products.items()
        if (p.get("articleLineDesc") or "") in ARCHE_NOAH_LINES
    }

    print(f"Found {len(all_products)} total products, {len(an_products)} Arche Noah seed products")
    return an_products


def fetch_product_details(session, products):
    """Fetch detailed info (Latin names) for each product.

    Args:
        session: Shop session id.
        products: dict mapping sid -> product, as returned by
            fetch_all_arche_noah_products.

    Returns:
        dict mapping sid -> detail response; products whose request failed
        or returned nothing are omitted.
    """
    details = {}
    total = len(products)

    for count, sid in enumerate(products, start=1):
        try:
            detail = shop_request(session, "webshop/getproductdetail", {"productSid": sid})
            if detail:
                details[sid] = detail
        except Exception as e:
            # Best effort: the product is simply skipped later for lack of
            # a Latin name.
            print(f" Detail for {sid} failed: {e}", file=sys.stderr)

        if count % 20 == 0:
            print(f" Fetched details: {count}/{total}")

        time.sleep(REQUEST_DELAY)

    print(f"Fetched {len(details)} product details")
    return details


def _herbapi_fetch_all(resource, per_page=HERBAPI_PAGE_SIZE):
    """Collect every item of a paginated HerbAPI list endpoint.

    Accepts either a bare JSON list (treated as the complete result) or an
    object with a "data" list and an optional "total" count. Without a
    usable "total", paging continues until a short, empty or repeated page.
    """
    items = []
    previous_page = None
    page = 1
    while True:
        result = herbapi_request("GET", f"{resource}?per_page={per_page}&page={page}")
        if isinstance(result, list):
            items.extend(result)
            break
        if not (isinstance(result, dict) and "data" in result):
            break

        data = result["data"]
        # A repeated page means the server ignores "page"; stop rather than
        # loop forever.
        if not data or data == previous_page:
            break
        items.extend(data)

        total = result.get("total")
        if total is not None:
            if len(items) >= total:
                break
        elif len(data) < per_page:
            break

        previous_page = data
        page += 1
    return items


def load_herbapi_species():
    """Load all species from HerbAPI and build lookup maps (handles pagination).

    Returns:
        (species_list, by_scientific) where by_scientific maps the stripped,
        lowercased "name_scientific" to the species dict. Species without a
        scientific name are left out of the lookup.
    """
    species_list = _herbapi_fetch_all("species")

    by_scientific = {}
    for s in species_list:
        key = (s.get("name_scientific") or "").strip().lower()
        if key:
            by_scientific[key] = s

    return species_list, by_scientific


def load_herbapi_cultivars():
    """Load all existing cultivars from HerbAPI (handles pagination, max 100/page).

    Returns:
        (all_cultivars, by_key) where by_key maps
        (species_id, stripped lowercase name) to the cultivar dict.
    """
    all_cultivars = _herbapi_fetch_all("cultivars")

    by_key = {
        (c["species_id"], (c.get("name") or "").strip().lower()): c
        for c in all_cultivars
    }

    return all_cultivars, by_key


def ensure_supplier():
    """Create the Arche Noah supplier if it doesn't exist, return its ID.

    An existing supplier is recognised by its name containing both "arche"
    and "noah" (case-insensitive).
    """
    suppliers = herbapi_request("GET", "suppliers")
    if isinstance(suppliers, dict) and "data" in suppliers:
        suppliers = suppliers["data"]

    for s in suppliers or []:
        name = s["name"].lower()
        if "arche" in name and "noah" in name:
            print(f"Supplier 'Arche Noah' already exists: {s['id']}")
            return s["id"]

    print("Creating supplier 'Arche Noah'...")
    result = herbapi_request("POST", "suppliers", {
        "name": "Arche Noah",
        "url": "https://www.arche-noah.at",
        "country": "AT",
        "is_organic": True,
        "is_demeter": False,
        "notes": "Austrian society for heritage seed preservation and biodiversity",
    })
    print(f"Created supplier: {result['id']}")
    return result["id"]


def load_existing_supplier_links(cultivar_id):
    """Load existing supplier links for a cultivar.

    Best effort: any failure yields an empty list so the import can go on.
    HTTP errors are already reported by herbapi_request; other failures are
    reported here instead of being dropped silently.
    """
    try:
        result = herbapi_request("GET", f"cultivars/{cultivar_id}/suppliers")
    except urllib.error.HTTPError:
        return []
    except Exception as e:
        print(f" Loading supplier links for cultivar {cultivar_id} failed: {e}", file=sys.stderr)
        return []

    if isinstance(result, list):
        return result
    if isinstance(result, dict) and "data" in result:
        return result["data"]
    return []


def _product_url(product, detail):
    """Build the public shop URL from the product/detail alias, or None."""
    alias = product.get("alias") or detail.get("alias", "")
    return f"https://shop.arche-noah.at/produkt/{alias}" if alias else None


def _link_supplier(cultivar_id, supplier_id, product, detail):
    """Link a cultivar to the supplier unless such a link already exists.

    Returns True when a new link was created and False when one existed.
    Errors from the creating POST propagate to the caller.
    """
    existing_links = load_existing_supplier_links(cultivar_id)
    if any(link.get("supplier_id") == supplier_id for link in existing_links):
        return False

    # Parse pack info
    unit_desc = product.get("unitDesc") or detail.get("unitDesc", "")
    pack_size, pack_unit = parse_pack_info(unit_desc)

    # Price of the first price-list position, if there is one
    price = None
    price_list = product.get("priceListPos") or detail.get("priceListPos", [])
    if price_list:
        price = price_list[0].get("singleUnitPrice")

    link_data = {
        "supplier_id": supplier_id,
        "article_number": str(product.get("articleNr", "")),
        "product_url": _product_url(product, detail),
        "price_eur": price,
        "pack_size": pack_size,
        "pack_unit": pack_unit,
    }
    herbapi_request("POST", f"cultivars/{cultivar_id}/suppliers", link_data)
    return True


def main():
    """Run the full import: supplier, species, cultivars, scrape, create, link."""
    now_str = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    print("=== Arche Noah Seed Catalog Scraper ===")
    print(f"Started at {now_str}\n")

    # Step 1: Create Arche Noah supplier in HerbAPI
    print("[1/6] Ensuring Arche Noah supplier exists...")
    supplier_id = ensure_supplier()
    print()

    # Step 2: Load HerbAPI species for matching
    print("[2/6] Loading HerbAPI species...")
    species_list, species_by_scientific = load_herbapi_species()
    print(f"Loaded {len(species_list)} species")
    print()

    # Step 3: Load existing cultivars for idempotency
    print("[3/6] Loading existing cultivars...")
    existing_cultivars, cultivars_by_key = load_herbapi_cultivars()
    print(f"Loaded {len(existing_cultivars)} existing cultivars")
    print()

    # Step 4: Scrape Arche Noah shop
    print("[4/6] Scraping Arche Noah shop catalog...")
    session = shop_create_session()
    print("Got shop session")
    products = fetch_all_arche_noah_products(session)
    print()

    # Step 5: Fetch product details (to get Latin names)
    print("[5/6] Fetching product details for Latin name matching...")
    details = fetch_product_details(session, products)
    print()

    # Step 6: Create cultivars in HerbAPI
    print("[6/6] Creating cultivars in HerbAPI...")
    stats = {
        "created": 0,
        "skipped_existing": 0,
        "skipped_no_species": 0,
        "skipped_no_name": 0,
        "supplier_linked": 0,
        "supplier_link_existed": 0,
        "errors": 0,
    }

    for sid, product in sorted(products.items()):
        detail = details.get(sid, {})

        # Match to HerbAPI species (handles subspecies/variety suffixes)
        latin_name = extract_latin_name(detail.get("detailHeadline3", ""))
        species = match_species(latin_name, species_by_scientific)
        if not species:
            print(f" SKIP (no species match): {product['name']} | latin={latin_name}")
            stats["skipped_no_species"] += 1
            continue

        # Extract cultivar name
        cultivar_name = extract_cultivar_name(product["name"])
        if not cultivar_name:
            print(f" SKIP (no cultivar name): {product['name']}")
            # Counted separately so the "no species match" figure stays honest.
            stats["skipped_no_name"] += 1
            continue

        # Check if cultivar already exists (idempotency)
        lookup_key = (species["id"], cultivar_name.strip().lower())
        existing = cultivars_by_key.get(lookup_key)

        if existing:
            cultivar_id = existing["id"]
            stats["skipped_existing"] += 1
        else:
            product_url = _product_url(product, detail)
            cultivar_data = {
                "species_id": species["id"],
                "name": cultivar_name,
                "name_de": cultivar_name,
                "is_organic": product.get("articleLineDesc") == "Bio-Saatgut von ARCHE NOAH",
                "source_urls": [product_url] if product_url else None,
            }

            try:
                result = herbapi_request("POST", "cultivars", cultivar_data)
                cultivar_id = result["id"]
            except Exception as e:
                print(f" ERROR creating '{cultivar_name}': {e}", file=sys.stderr)
                stats["errors"] += 1
                continue

            stats["created"] += 1
            # Add to lookup for idempotency within this run
            cultivars_by_key[lookup_key] = result
            print(f" CREATED: {cultivar_name} ({species['name_scientific']})")

        # Link cultivar to supplier
        try:
            newly_linked = _link_supplier(cultivar_id, supplier_id, product, detail)
        except Exception as e:
            print(f" ERROR linking supplier for '{cultivar_name}': {e}", file=sys.stderr)
            stats["errors"] += 1
        else:
            stats["supplier_linked" if newly_linked else "supplier_link_existed"] += 1

        time.sleep(0.1)  # small delay between HerbAPI calls

    # Summary
    print(f"\n{'='*60}")
    print("Scraping complete!")
    print(f" Cultivars created: {stats['created']}")
    print(f" Cultivars already existed: {stats['skipped_existing']}")
    print(f" Skipped (no species match): {stats['skipped_no_species']}")
    print(f" Skipped (no cultivar name): {stats['skipped_no_name']}")
    print(f" Supplier links created: {stats['supplier_linked']}")
    print(f" Supplier links existed: {stats['supplier_link_existed']}")
    print(f" Errors: {stats['errors']}")


if __name__ == "__main__":
    main()