#!/usr/bin/env python3
"""Reinsaat v3 scraper - uses HerbAPI REST API, robust botanical name matching."""

import json
import os
import re
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from html import unescape

# --- Config ---
API_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
# NOTE(security): a bearer token should not live in source control. The
# HERBAPI_TOKEN environment variable takes precedence; the literal is kept only
# as a fallback so existing invocations keep working. Rotate it and drop the
# fallback once deployments export the variable.
API_TOKEN = os.environ.get("HERBAPI_TOKEN") or "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
REINSAAT_BASE = "https://www.reinsaat.at"
DELAY = 0.3  # seconds to wait around each request to the shop
API_TIMEOUT = 30  # seconds; without a timeout a stalled API call hangs the run
PAGE_TIMEOUT = 15  # seconds for shop page fetches
PER_PAGE = 100  # page size requested from the list endpoints

# Categories to scrape (seed products only, skip books/bulbs/peonies/potatoes/gift/seed_tapes)
CATEGORIES = [
    "beans",
    "peas",
    "florence_fennel",
    "cucumbers",
    "brassica",
    "garden_cress",
    "pumpkins_squash",
    "corn",
    "swiss_chard",
    "aubergine_eggplants",
    "melons",
    "carrots",
    "sweet_pepper",
    "chilli_peppers_chill",
    "parsnips",
    "parsley",
    "parsley_root",
    "leeks",
    "radish",
    "beetroot",
    "lettuce",
    "black_salsify",
    "celery",
    "spinach",
    "tomatoes",
    "zucchini_courgette",
    "onion_garlic",
    "culinary_and_aromatic_herbs",
    "conservation_varieties",
    "flowers_and_herbs",
    "wild_flowers_seeds",
    "green_manure",
]

# Suffixes to strip from botanical names (authority names, infraspecific ranks).
# Not referenced by the functions in this file; kept because other code may
# import it.
STRIP_SUFFIXES = {
    "l.", "mill.", "dc.", "l", "convar.", "convar", "var.", "var",
    "subsp.", "subsp", "ssp.", "ssp", "f.", "em.", "auct.", "hort.",
    "medik.", "moench", "pers.", "salisb.", "thunb.", "crantz", "gaertn.",
    "lam.", "link", "siebold", "zucc.", "sat.", "sat", "axillare", "medikus",
}

# Genus-level or "genus species"-level misspellings seen on the shop pages.
BOTANICAL_TYPOS = {
    "capscicum": "capsicum",
    "capsicum frutenscens": "capsicum frutescens",
    "tropaelum": "tropaeolum",
    "lact.": "lactuca",
}

# Abbreviated names (keys end with a period) and their expansions.
ABBREVIATED_NAMES = {
    "origanum vulg.": "origanum vulgare",
    "helichrysum bract.": "helichrysum bracteatum",
    "campanula lat.": "campanula latifolia",
    "cosmos bip.": "cosmos bipinnatus",
    "papaver somnif.": "papaver somniferum",
}

# Fields copied from the scraped product into a new cultivar record.
_GROWING_FIELDS = (
    "planting_depth_cm",
    "row_spacing_cm",
    "plant_spacing_cm",
    "germination_temp_c",
)

# NOTE(review): the HTML-matching patterns below were reconstructed from the
# surrounding code (tag names inferred from how the captures are used) - confirm
# them against the live shop markup.
_H1_RE = re.compile(r"<h1[^>]*>([^<]+)", re.I)
_KURZTEXT_RE = re.compile(
    r'fce_shop_kurztext[^>]*>\s*(?:<[a-zA-Z][^>]*>)?\s*([^<]+?)\s*<'
)
_JSONLD_RE = re.compile(
    r'<script[^>]*type="application/ld\+json"[^>]*>(.*?)</script>', re.S | re.I
)
_TABLE_RE = re.compile(r"<table[^>]*>(.*?)</table>", re.S | re.I)
_ROW_RE = re.compile(r"<tr[^>]*>(.*?)</tr>", re.S | re.I)
_CELL_RE = re.compile(r"<t[hd][^>]*>(.*?)</t[hd]>", re.S | re.I)
_TAG_RE = re.compile(r"<[^>]+>")


# --- HTTP helpers ---

def _api_call(req):
    """Send an authorised request to HerbAPI and return the decoded JSON body."""
    req.add_header("Authorization", f"Bearer {API_TOKEN}")
    with urllib.request.urlopen(req, timeout=API_TIMEOUT) as resp:
        return json.loads(resp.read())


def _as_list(payload):
    """Return the list carried by an API payload.

    Accepts either a bare JSON list or an envelope dict with a "data" list;
    anything else yields an empty list.
    """
    if isinstance(payload, dict):
        payload = payload.get("data", [])
    return payload if isinstance(payload, list) else []


def _error_body(err):
    """Return the response body of an HTTPError as text (never raises on bad bytes)."""
    try:
        return err.read().decode("utf-8", errors="replace")
    except Exception:  # body already consumed / no body: fall back to the message
        return str(err)


def api_get(path, params=None):
    """GET from HerbAPI.

    path is appended to API_BASE; params (optional mapping) is URL-encoded
    into the query string. Returns the decoded JSON response. Raises
    urllib.error.HTTPError / URLError (or a timeout OSError) on failure.
    """
    url = f"{API_BASE}{path}"
    if params:
        url += "?" + urllib.parse.urlencode(params)
    return _api_call(urllib.request.Request(url))


def api_post(path, data):
    """POST to HerbAPI.

    data is serialised as a JSON body. Returns the decoded JSON response.
    Raises urllib.error.HTTPError / URLError (or a timeout OSError) on failure.
    """
    url = f"{API_BASE}{path}"
    body = json.dumps(data).encode()
    req = urllib.request.Request(url, data=body, method="POST")
    req.add_header("Content-Type", "application/json")
    return _api_call(req)


def fetch_page(url):
    """Fetch a web page, return HTML string (undecodable bytes are replaced)."""
    req = urllib.request.Request(url)
    req.add_header("User-Agent", "Mozilla/5.0 (HerbAPI Scraper)")
    with urllib.request.urlopen(req, timeout=PAGE_TIMEOUT) as resp:
        return resp.read().decode("utf-8", errors="replace")


# --- Parsing ---

def normalise_botanical(raw):
    """Strip botanical name to genus + species only.

    'Pisum sativum L. convar. sat.' -> 'pisum sativum'
    'Solanum lycopersicum L.' -> 'solanum lycopersicum'
    'Beta vulgaris L. ssp. vulgaris' -> 'beta vulgaris'
    'Origanum vulg.' -> 'origanum vulgare'

    Returns None when no genus/species pair can be extracted (empty input,
    a single word, or a second word that looks like an authority).
    """
    if not raw:
        return None

    # Clean HTML entities and non-breaking spaces
    raw = unescape(raw).replace("\xa0", " ").strip()
    # Remove content in parentheses, then collapse the whitespace left behind
    raw = " ".join(re.sub(r"\([^)]*\)", "", raw).split())

    # Abbreviated names must be checked BEFORE trailing punctuation is removed:
    # every key ends with a period, so stripping first would make a name such
    # as "Origanum vulg." unmatchable.
    raw_lower = raw.lower()
    for abbrev, full in ABBREVIATED_NAMES.items():
        if raw_lower.startswith(abbrev):
            return full

    # Remove trailing commas/periods
    parts = raw.rstrip(",. ").split()
    if len(parts) < 2:
        return None

    genus = parts[0].lower().rstrip(",")
    species = parts[1].lower().rstrip(",")

    # Fix known typos: genus first, then the combined name
    genus = BOTANICAL_TYPOS.get(genus, genus)
    full_name = f"{genus} {species}"
    if full_name in BOTANICAL_TYPOS:
        genus, species = BOTANICAL_TYPOS[full_name].split()

    # Both words must exist and start with a letter
    if not genus or not species:
        return None
    if not genus[0].isalpha() or not species[0].isalpha():
        return None
    # A capitalised second word in the original is an authority, not a species
    if parts[1][0].isupper():
        return None

    return f"{genus} {species}"


def _as_price(value):
    """Return value as a positive number, or None if it is not a usable price."""
    if isinstance(value, bool):
        return None
    if isinstance(value, str):
        # JSON-LD prices are frequently serialised as strings
        try:
            value = float(value.strip().replace(",", "."))
        except ValueError:
            return None
    if isinstance(value, (int, float)) and value > 0:
        return value
    return None


def _extract_jsonld(html, result):
    """Fill article_number / price_eur from the first JSON-LD Product block."""
    for jm in _JSONLD_RE.finditer(html):
        try:
            jd = json.loads(jm.group(1))
        except json.JSONDecodeError:
            continue
        # JSON-LD may also be a list or scalar; only a Product dict is useful
        if not isinstance(jd, dict) or jd.get("@type") != "Product":
            continue

        if "model" in jd:
            result["article_number"] = str(jd["model"])

        # Get smallest pack price (usually the Portion)
        offers = jd.get("offers", {})
        if isinstance(offers, dict):
            offer_list = offers.get("offers")
            if offer_list is None and "price" in offers:
                offer_list = [offers]  # a single Offer rather than an aggregate
        else:
            offer_list = offers
        if isinstance(offer_list, dict):
            offer_list = [offer_list]
        if not isinstance(offer_list, list):
            offer_list = []

        prices = [
            price
            for price in (
                _as_price(o.get("price")) for o in offer_list if isinstance(o, dict)
            )
            if price is not None
        ]
        if prices:
            result["price_eur"] = min(prices)
        break  # only the first Product block is considered


def _extract_price_table(html, result):
    """Fill pack_sizes / port_price from the first table that mentions a euro price."""
    for tbl in _TABLE_RE.findall(html):
        if "€" not in tbl:
            continue
        rows = _ROW_RE.findall(tbl)
        if len(rows) < 2:
            continue

        # First row holds the pack sizes, second row the matching prices
        size_texts = [_TAG_RE.sub("", c).strip() for c in _CELL_RE.findall(rows[0])]
        price_texts = [_TAG_RE.sub("", c).strip() for c in _CELL_RE.findall(rows[1])]

        # Find the "Port." entry
        for size_text, price_text in zip(size_texts, price_texts):
            if "Port" not in size_text:
                continue
            # Strict number pattern so float() can never see "." or "1.2.3"
            pm = re.search(r"\d+(?:\.\d+)?", price_text.replace(",", "."))
            if pm:
                result["port_price"] = float(pm.group())
            break

        result["pack_sizes"] = size_texts
        break


def _extract_growing_data(html, result):
    """Fill sowing depth, spacing and germination temperature from page text."""
    # Sowing depth: single value or range, averaged
    m = re.search(
        r"(?:sowing|seed)\s*depth[:\s]*(?:approx\.?\s*)?(\d+[\.,]?\d*)\s*(?:-\s*(\d+[\.,]?\d*)\s*)?cm",
        html,
        re.I,
    )
    if m:
        d1 = float(m.group(1).replace(",", "."))
        d2 = float(m.group(2).replace(",", ".")) if m.group(2) else d1
        result["planting_depth_cm"] = round((d1 + d2) / 2, 2)

    # Spacing: "row spacing NNxNN cm" or "NN x NN cm"; outdoor spacing preferred
    spacing_patterns = (
        r"(?:outdoors?|field)[^.]*?(\d+)\s*(?:x|×)\s*(\d+)\s*cm",
        r"row\s*spacing\s*(\d+)\s*(?:x|×)\s*(\d+)\s*cm",
        r"(\d+)\s*(?:x|×)\s*(\d+)\s*cm",
    )
    for pattern in spacing_patterns:
        m = re.search(pattern, html, re.I)
        if m:
            result["row_spacing_cm"] = float(m.group(1))
            result["plant_spacing_cm"] = float(m.group(2))
            break

    # Row spacing without plant spacing (e.g. "row spacing 30-45 cm")
    if "row_spacing_cm" not in result:
        m = re.search(r"row\s*spacing\s*(\d+)(?:\s*-\s*(\d+))?\s*cm", html, re.I)
        if m:
            r1 = int(m.group(1))
            r2 = int(m.group(2)) if m.group(2) else r1
            # True mean (37.5 for 30-45), consistent with the depth average
            result["row_spacing_cm"] = (r1 + r2) / 2

    # Germination temperature
    m = re.search(
        r"germination\s*temp[^:]*:\s*(\d+)\s*(?:-\s*(\d+))?\s*°?\s*C", html, re.I
    )
    if m:
        t1 = int(m.group(1))
        t2 = int(m.group(2)) if m.group(2) else t1
        result["germination_temp_c"] = (t1 + t2) / 2


def _extract_pack_info(html, result):
    """Fill pack_size / pack_unit from portion info such as "20 seeds" or "25 g"."""
    portion_m = re.search(
        r"[Pp]ortion\s*(?:contents?)?[:\s]*(\d+[\.,]?\d*)\s*(seeds?|Korn|g|kg)", html
    )
    if not portion_m:
        # Try "Port. (20 seeds)" format
        portion_m = re.search(
            r"Port[.\w]*\s*\(?\s*(\d+[\.,]?\d*)\s*(seeds?|Korn|g|kg)", html
        )
    if not portion_m:
        return

    result["pack_size"] = float(portion_m.group(1).replace(",", "."))
    unit = portion_m.group(2).lower()
    # Seed counts are stored under the unit name "Korn"
    result["pack_unit"] = "Korn" if unit in ("seed", "seeds", "korn") else unit


def extract_product_data(html, url):
    """Extract product info from a Reinsaat product page.

    Returns a dict that always contains "url" and, when found on the page, any
    of: name, botanical_raw, botanical_norm, article_number, price_eur,
    port_price, pack_sizes, planting_depth_cm, row_spacing_cm,
    plant_spacing_cm, germination_temp_c, pack_size, pack_unit.
    """
    result = {}

    # H1 = variety name
    m = _H1_RE.search(html)
    if m:
        name = unescape(m.group(1)).strip()
        # Clean up names like "RS-To-01.26 (Alda)" -> "Alda"
        paren = re.search(r"\(([^)]+)\)", name)
        if paren and name.startswith("RS-"):
            name = paren.group(1).strip()
        result["name"] = name

    # Botanical name from fce_shop_kurztext
    m = _KURZTEXT_RE.search(html)
    if m:
        result["botanical_raw"] = unescape(m.group(1)).replace("\xa0", " ").strip()
        result["botanical_norm"] = normalise_botanical(result["botanical_raw"])

    _extract_jsonld(html, result)
    _extract_price_table(html, result)
    _extract_growing_data(html, result)
    _extract_pack_info(html, result)

    result["url"] = url
    return result


def is_product_page(html):
    """Check if HTML is a product page (has botanical name or JSON-LD Product)."""
    return bool(
        re.search(r'fce_shop_kurztext', html)
        or re.search(r'"@type":\s*"Product"', html)
    )


def subcategory_links(html, cat, url):
    """Return absolute URLs linked from html that lie below url inside category cat.

    A link qualifies when its path starts with /shop/EN/<cat>/ and has more
    path segments than url. Depth is compared path-to-path: counting slashes in
    the full URL (scheme included) against a bare path would reject the direct
    children of url.
    """
    base_depth = urllib.parse.urlsplit(url).path.rstrip("/").count("/")
    prefix = f"/shop/EN/{cat}/"
    paths = sorted(set(re.findall(r'href="(/shop/EN/[^"]+/)"', html)))
    return [
        REINSAAT_BASE + path
        for path in paths
        if path.startswith(prefix) and path.rstrip("/").count("/") > base_depth
    ]


# --- API lookups ---

def _iter_pages(path):
    """Yield (page_number, batch) for a paginated list endpoint until a short page."""
    page = 1
    while True:
        batch = _as_list(api_get(path, {"per_page": PER_PAGE, "page": page}))
        yield page, batch
        if len(batch) < PER_PAGE:
            return
        page += 1


def get_all_species():
    """Fetch all species from API, build lookup by normalised name.

    Returns {normalised_name: {"id", "slug", "name"}}; species whose scientific
    name cannot be normalised are left out.
    """
    species_map = {}
    for page, batch in _iter_pages("/species"):
        for sp in batch:
            norm = normalise_botanical(sp["name_scientific"])
            if norm:
                species_map[norm] = {
                    "id": sp["id"],
                    "slug": sp["slug"],
                    "name": sp["name_scientific"],
                }
        print(f" page {page}: {len(batch)} species (total so far: {len(species_map)})")
    return species_map


def get_all_cultivars():
    """Fetch all cultivars, build lookup by (species_id, normalised name)."""
    cultivar_map = {}  # (species_id, lower_name) -> cultivar
    for page, batch in _iter_pages("/cultivars"):
        for cv in batch:
            cultivar_map[(cv["species_id"], cv["name"].lower().strip())] = cv
        print(f" page {page}: {len(batch)} cultivars (total so far: {len(cultivar_map)})")
    return cultivar_map


def get_reinsaat_supplier():
    """Get Reinsaat supplier record.

    Raises RuntimeError when no supplier with slug "reinsaat" exists.
    """
    for supplier in _as_list(api_get("/suppliers")):
        if isinstance(supplier, dict) and supplier.get("slug") == "reinsaat":
            return supplier
    raise RuntimeError("Reinsaat supplier not found in API")


def get_cultivar_suppliers(cultivar_id):
    """Get existing supplier links for a cultivar (as a list)."""
    return _as_list(api_get(f"/cultivars/{cultivar_id}/suppliers"))


def get_product_urls_from_category(cat_slug):
    """Fetch the URLs linked below a category page.

    Returns every unique link under /shop/EN/<cat_slug>/ found on the category
    page, sorted, as absolute URLs. Product pages and subcategory pages are not
    told apart here; the caller checks each page with is_product_page().
    Returns [] when the category page cannot be fetched.
    """
    cat_url = f"{REINSAAT_BASE}/shop/EN/{cat_slug}/"
    try:
        html = fetch_page(cat_url)
    except Exception as e:  # best effort: one bad category must not stop the run
        print(f" WARN: Failed to fetch category {cat_slug}: {e}")
        return []
    time.sleep(DELAY)

    links = re.findall(rf'href="(/shop/EN/{re.escape(cat_slug)}/[^"]+/)"', html)
    return [REINSAAT_BASE + link for link in sorted(set(links))]


# --- Product processing ---

def _clean_name(name):
    """Lower-case a cultivar name and drop punctuation for fuzzy comparison."""
    return re.sub(r"[^\w\s]", "", name.lower())


def _find_existing_cultivar(cultivar_name, species_id):
    """Search the API for a cultivar that already carries this name; None if absent."""
    cn_lower = cultivar_name.lower().strip()
    cn_clean = _clean_name(cn_lower)

    # Strategy 1: search by full name, exact (case-insensitive) match
    candidates = _as_list(api_get("/cultivars", {"search": cultivar_name, "per_page": 50}))
    for cv in candidates:
        if cv["name"].lower().strip() == cn_lower:
            return cv

    # Strategy 2: same species + similar name (ignoring punctuation)
    for cv in candidates:
        if cv["species_id"] != species_id:
            continue
        cv_clean = _clean_name(cv["name"])
        # An empty cleaned name is a substring of everything - never match on it
        if cv_clean and cn_clean and (cv_clean in cn_clean or cn_clean in cv_clean):
            return cv

    # Strategy 3: search by last significant word
    words = [w for w in cultivar_name.split() if len(w) > 2]
    if words:
        for cv in _as_list(api_get("/cultivars", {"search": words[-1], "per_page": 50})):
            if cv["species_id"] == species_id and _clean_name(cv["name"]) == cn_clean:
                return cv
    return None


def _recover_from_collision(cultivar_name, species_id, cv_key, cultivar_map, stats):
    """After a failed create, look the cultivar up instead; return its id or None."""
    try:
        found = _find_existing_cultivar(cultivar_name, species_id)
        found_id = found["id"] if found else None
    except Exception as e2:  # best effort: count the failure, keep scraping
        print(f" ERROR searching for '{cultivar_name}' after collision: {e2}")
        stats["errors"] += 1
        return None

    if found is None:
        print(f" WARN: could not create or find cultivar '{cultivar_name}' (DB error + no search match)")
        stats["errors"] += 1
        return None

    cultivar_map[cv_key] = found
    stats["cultivar_existed"] += 1
    return found_id


def _resolve_cultivar(prod, url, species, cultivar_map, stats, new_cultivars):
    """Return the id of the cultivar for prod, creating it if needed; None on error."""
    species_id = species["id"]
    cultivar_name = prod["name"]
    cv_key = (species_id, cultivar_name.lower().strip())

    existing_cv = cultivar_map.get(cv_key)
    if existing_cv:
        stats["cultivar_existed"] += 1
        return existing_cv["id"]

    create_data = {
        "species_id": species_id,
        "name": cultivar_name,
        "is_organic": True,
        "source_urls": [url],
    }
    # Add growing data if we extracted any
    for field in _GROWING_FIELDS:
        if field in prod:
            create_data[field] = prod[field]

    try:
        new_cv = api_post("/cultivars", create_data)
    except urllib.error.HTTPError as e:
        body = _error_body(e)
        if e.code == 500 and "Database error" in body:
            # Likely slug collision - search for existing cultivar
            return _recover_from_collision(
                cultivar_name, species_id, cv_key, cultivar_map, stats
            )
        print(f" ERROR creating cultivar '{cultivar_name}': {e.code} {body}")
        stats["errors"] += 1
        return None
    except OSError as e:
        # URLError and socket timeouts: a network hiccup must not abort the run
        print(f" ERROR creating cultivar '{cultivar_name}': {e}")
        stats["errors"] += 1
        return None

    cultivar_id = new_cv["id"]
    stats["cultivar_created"] += 1
    new_cultivars.append({
        "name": cultivar_name,
        "species": species["name"],
        "id": cultivar_id,
    })
    cultivar_map[cv_key] = new_cv  # add to local cache
    print(f" + Created cultivar: {cultivar_name} ({species['name']})")
    return cultivar_id


def _ensure_supplier_link(cultivar_id, prod, url, supplier_id, stats, new_links):
    """Create the supplier link for the cultivar unless one already exists."""
    cultivar_name = prod["name"]

    try:
        existing_links = get_cultivar_suppliers(cultivar_id)
    except Exception as e:
        # Best effort as before (treat as "no links"), but no longer silent
        print(f" WARN: could not list suppliers for '{cultivar_name}': {e}")
        existing_links = []

    if any(
        isinstance(lk, dict) and lk.get("supplier_id") == supplier_id
        for lk in existing_links
    ):
        stats["link_existed"] += 1
        return

    link_data = {
        "supplier_id": supplier_id,
        "product_url": url,
    }
    if "article_number" in prod:
        link_data["article_number"] = prod["article_number"]
    # Portion price from the table wins over the JSON-LD minimum
    if "port_price" in prod:
        link_data["price_eur"] = prod["port_price"]
    elif "price_eur" in prod:
        link_data["price_eur"] = prod["price_eur"]
    if "pack_size" in prod:
        link_data["pack_size"] = prod["pack_size"]
    if "pack_unit" in prod:
        link_data["pack_unit"] = prod["pack_unit"]

    try:
        api_post(f"/cultivars/{cultivar_id}/suppliers", link_data)
    except urllib.error.HTTPError as e:
        print(f" ERROR linking '{cultivar_name}': {e.code} {_error_body(e)}")
        stats["errors"] += 1
        return
    except OSError as e:
        print(f" ERROR linking '{cultivar_name}': {e}")
        stats["errors"] += 1
        return

    stats["link_created"] += 1
    new_links.append({
        "cultivar": cultivar_name,
        "article": prod.get("article_number", "?"),
        "url": url,
    })


def process_product(html, url, species_map, cultivar_map, supplier_id, stats,
                    unmatched_species, new_cultivars, new_links):
    """Process a single product page.

    Extracts the product, matches its botanical name against species_map,
    finds or creates the cultivar and makes sure a supplier link exists.
    stats, unmatched_species, cultivar_map, new_cultivars and new_links are
    updated in place; nothing is returned. API failures are counted in
    stats["errors"] instead of being raised.
    """
    stats["products_found"] += 1
    prod = extract_product_data(html, url)

    if not prod.get("name"):
        return

    bot_norm = prod.get("botanical_norm")
    if not bot_norm:
        # No botanical name found on page
        stats["species_not_matched"] += 1
        unmatched_species["(no botanical name)"] = (
            unmatched_species.get("(no botanical name)", 0) + 1
        )
        return
    stats["botanical_extracted"] += 1

    # Match species
    species = species_map.get(bot_norm)
    if not species:
        stats["species_not_matched"] += 1
        unmatched_species[bot_norm] = unmatched_species.get(bot_norm, 0) + 1
        return
    stats["species_matched"] += 1

    cultivar_id = _resolve_cultivar(
        prod, url, species, cultivar_map, stats, new_cultivars
    )
    if cultivar_id is None:
        return

    _ensure_supplier_link(cultivar_id, prod, url, supplier_id, stats, new_links)


# --- Entry point ---

def _fetch_counted(url, stats):
    """Sleep DELAY, fetch url, and return its HTML; on failure log, count, return None."""
    time.sleep(DELAY)
    try:
        return fetch_page(url)
    except Exception as e:  # best effort: a dead link must not stop the run
        print(f" ERROR fetching {url}: {e}")
        stats["errors"] += 1
        return None


def _print_report(stats, new_cultivars, new_links, unmatched_species):
    """Print the end-of-run summary."""
    print("\n" + "=" * 60)
    print("RESULTS")
    print("=" * 60)
    print(f"Products found: {stats['products_found']}")
    print(f"Botanical extracted: {stats['botanical_extracted']}")
    print(f"Species matched: {stats['species_matched']}")
    print(f"Species NOT matched: {stats['species_not_matched']}")
    print(f"Cultivars existed: {stats['cultivar_existed']}")
    print(f"Cultivars created: {stats['cultivar_created']}")
    print(f"Links existed: {stats['link_existed']}")
    print(f"Links created: {stats['link_created']}")
    print(f"Errors: {stats['errors']}")

    if new_cultivars:
        print(f"\n--- New cultivars ({len(new_cultivars)}) ---")
        for cv in new_cultivars:
            print(f" + {cv['name']} ({cv.get('species', '?')})")

    if new_links:
        print(f"\n--- New supplier links ({len(new_links)}) ---")
        for lk in new_links:
            print(f" + {lk['cultivar']} -> {lk.get('article', '?')}")

    if unmatched_species:
        print(f"\n--- Unmatched species ({len(unmatched_species)}) ---")
        for name, count in sorted(unmatched_species.items(), key=lambda x: -x[1]):
            print(f" ? {name} (x{count})")

    print("\nDone.")


def main():
    """Load API lookups, scrape every category and print a summary."""
    print("=" * 60)
    print("Reinsaat v3 Scraper")
    print("=" * 60)

    # Step 1: Load all species
    print("\n[1/4] Loading species from API...")
    species_map = get_all_species()
    print(f" Loaded {len(species_map)} species")

    # Step 2: Load all cultivars
    print("\n[2/4] Loading cultivars from API...")
    cultivar_map = get_all_cultivars()
    print(f" Loaded {len(cultivar_map)} cultivars")

    # Step 3: Get Reinsaat supplier
    print("\n[3/4] Getting Reinsaat supplier...")
    supplier = get_reinsaat_supplier()
    supplier_id = supplier["id"]
    print(f" Reinsaat ID: {supplier_id}")

    # Step 4: Scrape categories
    print(f"\n[4/4] Scraping {len(CATEGORIES)} categories...")
    stats = {
        "products_found": 0,
        "botanical_extracted": 0,
        "species_matched": 0,
        "species_not_matched": 0,
        "cultivar_existed": 0,
        "cultivar_created": 0,
        "link_existed": 0,
        "link_created": 0,
        "errors": 0,
    }
    unmatched_species = {}  # botanical_norm -> count
    new_cultivars = []
    new_links = []
    seen_sub_urls = set()  # sub-URLs already handled, so none is fetched twice

    for cat_i, cat in enumerate(CATEGORIES, 1):
        print(f"\n--- [{cat_i}/{len(CATEGORIES)}] {cat} ---")
        urls = get_product_urls_from_category(cat)
        print(f" Found {len(urls)} URLs")
        listed = set(urls)

        for url in urls:
            html = _fetch_counted(url, stats)
            if html is None:
                continue

            if is_product_page(html):
                process_product(
                    html, url, species_map, cultivar_map, supplier_id,
                    stats, unmatched_species, new_cultivars, new_links,
                )
                continue

            # Not a product: might be a subcategory - follow its deeper links
            for sub_url in subcategory_links(html, cat, url):
                if sub_url in listed or sub_url in seen_sub_urls:
                    continue  # handled by the outer loop / already done
                seen_sub_urls.add(sub_url)
                sub_html = _fetch_counted(sub_url, stats)
                if sub_html is None or not is_product_page(sub_html):
                    continue
                process_product(
                    sub_html, sub_url, species_map, cultivar_map, supplier_id,
                    stats, unmatched_species, new_cultivars, new_links,
                )

    _print_report(stats, new_cultivars, new_links, unmatched_species)


if __name__ == "__main__":
    main()