#!/usr/bin/env python3 """ Reinsaat Scraper v2 — scrape ALL Reinsaat categories, match species by extracting genus+species from extended botanical names, create/enrich cultivars, link supplier. Uses direct PostgreSQL access (psycopg2) for speed and reliability. """ import json import re import ssl import sys import time import uuid import html as html_mod import urllib.request import urllib.error import urllib.parse from dataclasses import dataclass, field from typing import Optional # Unbuffered output sys.stdout.reconfigure(line_buffering=True) sys.stderr.reconfigure(line_buffering=True) import psycopg2 import psycopg2.extras # ── Config ────────────────────────────────────────────────────────────────── DB_HOST = "10.31.3.90" DB_NAME = "herbapi" DB_USER = "herbapi" DB_PASS = "_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj" REINSAAT_SUPPLIER_ID = "019ced24-1702-72d1-9acc-90435441a5c4" DELAY = 0.3 USER_AGENT = "HerbAPI-Scraper/2.0 (florian.berthold@sub-net.at)" # ── All Reinsaat categories ──────────────────────────────────────────────── CATEGORIES = [ "https://www.reinsaat.at/shop/DE/bohnen/", "https://www.reinsaat.at/shop/DE/erbsen/", "https://www.reinsaat.at/shop/DE/gurken/", "https://www.reinsaat.at/shop/DE/karotten_moehren_1/", "https://www.reinsaat.at/shop/DE/knollenfenchel/", "https://www.reinsaat.at/shop/DE/kohlgewaechse/", "https://www.reinsaat.at/shop/DE/kuerbis/", "https://www.reinsaat.at/shop/DE/mais/", "https://www.reinsaat.at/shop/DE/mangold/", "https://www.reinsaat.at/shop/DE/melanzani_1/", "https://www.reinsaat.at/shop/DE/melone/", "https://www.reinsaat.at/shop/DE/paprika/", "https://www.reinsaat.at/shop/DE/pastinaken_1/", "https://www.reinsaat.at/shop/DE/petersilie/", "https://www.reinsaat.at/shop/DE/pfefferoni_chili/", "https://www.reinsaat.at/shop/DE/porree/", "https://www.reinsaat.at/shop/DE/radies_rettich/", "https://www.reinsaat.at/shop/DE/rote_ruebe/", "https://www.reinsaat.at/shop/DE/salate/", "https://www.reinsaat.at/shop/DE/schwarzwurzeln/", "https://www.reinsaat.at/shop/DE/sellerie/", "https://www.reinsaat.at/shop/DE/spinat/", "https://www.reinsaat.at/shop/DE/tomaten_paradeiser/", "https://www.reinsaat.at/shop/DE/wurzelpetersilie_1/", "https://www.reinsaat.at/shop/DE/zucchini/", "https://www.reinsaat.at/shop/DE/zwiebel_knoblauch/", "https://www.reinsaat.at/shop/DE/kuechen-_und_gewuerzkraeuter/", "https://www.reinsaat.at/shop/DE/blumen_und_heilkraeuter/", "https://www.reinsaat.at/shop/DE/gruenduengung/", ] # ── HTTP ──────────────────────────────────────────────────────────────────── _ssl_ctx = ssl.create_default_context() def fetch_url(url: str, retries: int = 2) -> str: req = urllib.request.Request(url, headers={ "User-Agent": USER_AGENT, "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language": "de-AT,de;q=0.9,en;q=0.5", }) for attempt in range(retries + 1): try: with urllib.request.urlopen(req, timeout=30, context=_ssl_ctx) as resp: charset = resp.headers.get_content_charset() or "utf-8" return resp.read().decode(charset) except (urllib.error.URLError, urllib.error.HTTPError, TimeoutError) as e: if attempt < retries: time.sleep(2) continue raise return "" # ── HTML parsing helpers ──────────────────────────────────────────────────── def extract_links(html_text: str, base_url: str) -> list[str]: links = [] seen = set() for m in re.finditer(r']*href="([^"]*)"', html_text, re.IGNORECASE): href = m.group(1) if not href or href.startswith("#") or href.startswith("javascript:"): continue full = urllib.parse.urljoin(base_url, href) if full not in seen: seen.add(full) links.append(full) return links def extract_jsonld_product(html_text: str) -> Optional[dict]: for m in re.finditer( r']*type="application/ld\+json"[^>]*>(.*?)', html_text, re.DOTALL | re.IGNORECASE ): try: data = json.loads(m.group(1)) if isinstance(data, dict) and data.get("@type") == "Product": return data except (json.JSONDecodeError, ValueError): continue return None def html_to_text(html_text: str) -> str: """Strip HTML tags and decode entities.""" text = re.sub(r'<[^>]+>', ' ', html_text) text = html_mod.unescape(text) text = re.sub(r'\s+', ' ', text).strip() return text def extract_botanical_name(html_text: str) -> str: """ Extract the botanical/Latin name from the page. Primary source:
content. Fallback: tags in growing infos. Returns the raw text (may include authority names, infraspecific ranks, etc.) """ # Primary: kurztext div m = re.search(r'class="fce_shop_kurztext"[^>]*>(.*?)
', html_text, re.DOTALL | re.IGNORECASE) if m: text = html_to_text(m.group(1)).strip() if text and re.search(r'[A-Z][a-z]+\s+[a-z]', text): return text # Fallback: first in growingInfos that looks like a Latin name gi = re.search(r'class="growingInfos"[^>]*>(.*?)', html_text, re.DOTALL | re.IGNORECASE) if gi: for em in re.finditer(r'(.*?)', gi.group(1), re.DOTALL): text = html_to_text(em.group(1)).strip() if text and re.search(r'[A-Z][a-z]+\s+[a-z]', text): return text # Last resort: any / tag with a Latin-looking name for tag in re.finditer(r'<(?:em|i)>(.*?)', html_text, re.DOTALL | re.IGNORECASE): text = html_to_text(tag.group(1)).strip() if text and re.search(r'^[A-Z][a-z]+\s+[a-z]+', text) and len(text) < 100: return text return "" def normalize_latin_name(raw: str) -> str: """ Extract genus + species from an extended botanical name. Examples: "Pisum sativum L. convar. sat." -> "Pisum sativum" "Capsicum annuum L." -> "Capsicum annuum" "Brassica oleracea L. convar. botrytis" -> "Brassica oleracea" "Solanum lycopersicum L." -> "Solanum lycopersicum" "Cucumis sativus" -> "Cucumis sativus" "Mentha x piperita" -> "Mentha x piperita" """ if not raw: return "" # Clean up name = raw.strip() # Remove leading/trailing punctuation name = name.strip(".,;:") words = name.split() if len(words) < 2: return name genus = words[0] # Handle hybrid notation: "Mentha x piperita" or "Mentha × piperita" if len(words) >= 3 and words[1] in ("x", "×"): return f"{genus} x {words[2]}" species = words[1] # Validate: genus should start uppercase, species lowercase if not genus[0].isupper() or not species[0].islower(): return name # Can't parse, return as-is return f"{genus} {species}" # ── Calendar parsing ──────────────────────────────────────────────────────── CALENDAR_ROW_TYPES = { "voranzucht": "indoor_sowing_months", "vorzucht": "indoor_sowing_months", "vorkultur": "indoor_sowing_months", "aussaat/ pflanzung freiland": "direct_sowing_months", "aussaat/pflanzung freiland": "direct_sowing_months", "aussaat freiland": "direct_sowing_months", "direktsaat": "direct_sowing_months", "pflanzung freiland": "transplanting_months", "pflanzung": "transplanting_months", "aussaat/ pflanzung gewächshaus": "glasshouse_months", "aussaat/pflanzung gewächshaus": "glasshouse_months", "gewächshaus": "glasshouse_months", "ernte": "harvesting_months", } def parse_calendar(html_text: str) -> dict: """ Parse the Reinsaat growing calendar table. Returns dict with keys like 'direct_sowing_months', 'harvesting_months' etc. Each value is a sorted list of month integers (1-12). """ result = {} cal_match = re.search(r'class="rs-growing-time[^"]*"(.*?)', html_text, re.DOTALL) if not cal_match: return result cal = cal_match.group(1) rows = re.findall(r'(.*?)', cal, re.DOTALL) for row in rows: # Get label label_m = re.search(r'class="type-lable"[^>]*>(.*?)', row, re.DOTALL) if not label_m: continue label = html_to_text(label_m.group(1)).strip().lower() # Map label to our field field_name = None for pattern, fname in CALENDAR_ROW_TYPES.items(): if pattern in label: field_name = fname break if not field_name: continue # Extract background colors for each cell (24 cells = 12 months x 2 halves) colors = re.findall(r'background-color:\s*([^;"]+)', row) # Convert to months: cell i maps to month (i // 2) + 1 active_months = set() for i, color in enumerate(colors): color = color.strip().lower() if color != "none" and color != "transparent" and color != "": month = (i // 2) + 1 if 1 <= month <= 12: active_months.add(month) if active_months: # Merge if same field already found (e.g. two sowing rows) if field_name in result: result[field_name] = sorted(set(result[field_name]) | active_months) else: result[field_name] = sorted(active_months) return result # ── Growing data extraction ───────────────────────────────────────────────── def extract_growing_data(html_text: str) -> dict: """Extract spacing, depth, germination temp from the growing text.""" data = {} # Get the growingInfos text gi = re.search(r'class="growingInfos"[^>]*>(.*?)', html_text, re.DOTALL | re.IGNORECASE) if not gi: return data full_text = html_to_text(gi.group(1)) # Also get the raw HTML for better entity handling raw_html = gi.group(1) # Convert HTML entities for pattern matching raw_text = html_mod.unescape(re.sub(r'<[^>]+>', ' ', raw_html)) raw_text = re.sub(r'\s+', ' ', raw_text) # ── Sowing depth ── depth_pats = [ r'(?:Saattiefe|Aussaattiefe|Ablagetiefe|Saatgutablage)[:\s]*(?:ca\.?\s*)?(\d+(?:[.,]\d+)?)\s*[-–]\s*(\d+(?:[.,]\d+)?)\s*cm', r'(?:Saattiefe|Aussaattiefe|Ablagetiefe|Saatgutablage)[:\s]*(?:ca\.?\s*)?(\d+(?:[.,]\d+)?)\s*cm', ] for pat in depth_pats: dm = re.search(pat, raw_text, re.IGNORECASE) if dm: vals = [float(dm.group(i).replace(",", ".")) for i in range(1, dm.lastindex + 1)] data["planting_depth_cm"] = round(sum(vals) / len(vals), 2) break # ── Spacing: "ROW x PLANT cm" ── spacing_pats = [ # "30–45 x 3–5 cm" (range x range) r'(\d+)\s*[-–]\s*(\d+)\s*[x×]\s*(\d+)\s*[-–]\s*(\d+)\s*cm', # "100 x 50 cm" (simple) r'(\d+(?:[.,]\d+)?)\s*[x×]\s*(\d+(?:[.,]\d+)?)\s*cm', ] for pat in spacing_pats: matches = re.findall(pat, raw_text, re.IGNORECASE) if matches: m = matches[-1] # prefer last match if len(m) == 4: data["row_spacing_cm"] = round((float(m[0]) + float(m[1])) / 2, 1) data["plant_spacing_cm"] = round((float(m[2]) + float(m[3])) / 2, 1) elif len(m) == 2: v1 = float(m[0].replace(",", ".")) v2 = float(m[1].replace(",", ".")) data["row_spacing_cm"] = round(v1, 1) data["plant_spacing_cm"] = round(v2, 1) break # ── Germination temperature ── temp_pats = [ r'(?:Keimtemperatur|Keimtemp)[.:\s]*(?:ca\.?\s*)?(\d+)\s*[-–]\s*(\d+)\s*[°]?\s*C', r'(?:mindestens|mind\.)\s*(\d+)\s*°\s*C', ] for pat in temp_pats: tm = re.search(pat, raw_text, re.IGNORECASE) if tm: vals = [float(tm.group(i)) for i in range(1, tm.lastindex + 1)] avg = sum(vals) / len(vals) if 5 <= avg <= 40: data["germination_temp_c"] = round(avg, 1) break # ── Perennial ── perennial_pats = [r'mehrj[aä]hrig', r'winterhart', r'ausdauernd', r'Halbstrauch', r'Staude'] for pat in perennial_pats: if re.search(pat, raw_text, re.IGNORECASE): data["perennial"] = True break return data # ── Product data ──────────────────────────────────────────────────────────── @dataclass class ProductData: name: str = "" raw_latin_name: str = "" normalized_latin: str = "" description: str = "" sku: str = "" url: str = "" is_organic: bool = True growing_data: dict = field(default_factory=dict) calendar: dict = field(default_factory=dict) def parse_product(html_text: str, url: str) -> Optional[ProductData]: """Parse a product page. Returns ProductData or None if not a product page.""" jsonld = extract_jsonld_product(html_text) if not jsonld: return None product = ProductData(url=url) product.name = jsonld.get("name", "").strip() product.description = jsonld.get("description", "").strip() product.sku = jsonld.get("model", "").strip() # Extract and normalize botanical name product.raw_latin_name = extract_botanical_name(html_text) product.normalized_latin = normalize_latin_name(product.raw_latin_name) # Extract growing data product.growing_data = extract_growing_data(html_text) # Parse calendar product.calendar = parse_calendar(html_text) # Check organic status (Reinsaat is all organic, but check for "demeter" too) product.is_organic = True return product # ── Recursive discovery ───────────────────────────────────────────────────── def discover_products( category_url: str, max_depth: int = 4, _depth: int = 0, _visited: set = None, ) -> list[ProductData]: if _visited is None: _visited = set() if category_url in _visited or _depth > max_depth: return [] _visited.add(category_url) indent = " " * (_depth + 1) try: html_text = fetch_url(category_url) time.sleep(DELAY) except Exception as e: print(f"{indent}ERROR fetching {category_url}: {e}") return [] # Check if this is a product page product = parse_product(html_text, category_url) if product: return [product] # Category page: find child links cat_path = urllib.parse.urlparse(category_url).path.rstrip("/") child_links = [] for link in extract_links(html_text, category_url): parsed = urllib.parse.urlparse(link) if parsed.netloc and parsed.netloc != "www.reinsaat.at": continue child_path = parsed.path.rstrip("/") if not child_path.startswith(cat_path + "/"): continue relative = child_path[len(cat_path) + 1:] if "/" in relative or not relative: continue clean_url = f"https://www.reinsaat.at{child_path}/" if clean_url not in _visited: child_links.append(clean_url) child_links = list(dict.fromkeys(child_links)) print(f"{indent}Category {category_url} -> {len(child_links)} children") products = [] for child_url in child_links: results = discover_products(child_url, max_depth, _depth + 1, _visited) products.extend(results) return products # ── Slug generation ───────────────────────────────────────────────────────── def make_slug(species_name: str, cultivar_name: str) -> str: """Generate a URL-friendly slug.""" raw = f"{species_name}-{cultivar_name}".lower() # Replace umlauts and special chars replacements = { 'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'ß': 'ss', 'é': 'e', 'è': 'e', 'ê': 'e', 'ë': 'e', 'á': 'a', 'à': 'a', 'â': 'a', 'í': 'i', 'ì': 'i', 'î': 'i', 'ó': 'o', 'ò': 'o', 'ô': 'o', 'ú': 'u', 'ù': 'u', 'û': 'u', 'ñ': 'n', 'ç': 'c', } for old, new in replacements.items(): raw = raw.replace(old, new) # Keep only alphanumeric and hyphens slug = re.sub(r'[^a-z0-9]+', '-', raw) slug = slug.strip('-') # Collapse multiple hyphens slug = re.sub(r'-+', '-', slug) return slug # ── Main ──────────────────────────────────────────────────────────────────── def db_connect(): """Create a fresh DB connection.""" conn = psycopg2.connect( host=DB_HOST, dbname=DB_NAME, user=DB_USER, password=DB_PASS ) conn.autocommit = False return conn def main(): print("=" * 70) print("Reinsaat Scraper v2") print("=" * 70) # ── Phase 1: Discover all products (no DB needed) ── print("\n[1] Discovering products from Reinsaat categories...") all_products: list[ProductData] = [] visited: set[str] = set() for cat_url in CATEGORIES: print(f"\n Category: {cat_url}") products = discover_products(cat_url, max_depth=4, _visited=visited) all_products.extend(products) print(f" -> {len(products)} products") # Deduplicate by URL seen_urls = set() unique_products = [] for p in all_products: if p.url not in seen_urls: seen_urls.add(p.url) unique_products.append(p) all_products = unique_products print(f"\n Total unique products: {len(all_products)}") # ── Phase 2: Connect to DB and load existing data ── print("\n[2] Connecting to DB and loading existing data...") conn = db_connect() cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) # Load species cur.execute("SELECT id, name_scientific FROM species ORDER BY name_scientific") species_rows = cur.fetchall() species_map = {} for row in species_rows: key = row["name_scientific"].lower().strip() species_map[key] = row print(f" {len(species_map)} species loaded") # Load existing cultivars cur.execute(""" SELECT id, species_id, name, slug, description, row_spacing_cm, plant_spacing_cm, planting_depth_cm, germination_temp_c, perennial, indoor_sowing_months, direct_sowing_months, transplanting_months, glasshouse_months, harvesting_months FROM cultivars """) cultivar_rows = cur.fetchall() existing_cultivars = {} existing_slugs = set() for row in cultivar_rows: sid = str(row["species_id"]) name_lower = row["name"].lower() existing_cultivars[(sid, name_lower)] = dict(row) existing_slugs.add(row["slug"]) print(f" {len(existing_cultivars)} cultivars loaded") # Load existing Reinsaat supplier links cur.execute(""" SELECT cultivar_id, product_url, article_number FROM cultivar_suppliers WHERE supplier_id = %s """, (REINSAAT_SUPPLIER_ID,)) existing_links = {} for row in cur.fetchall(): cid = str(row["cultivar_id"]) url = row["product_url"] or "" sku = row["article_number"] or "" existing_links.setdefault(cid, []).append((url, sku)) print(f" {sum(len(v) for v in existing_links.values())} existing links for {len(existing_links)} cultivars") # ── Phase 3: Process products ── print("\n[3] Processing products...") stats = { "created": 0, "linked": 0, "enriched": 0, "skipped_no_species": 0, "skipped_no_name": 0, "link_exists": 0, "errors": 0, } unmatched = [] for i, product in enumerate(all_products): pct = (i + 1) / len(all_products) * 100 prefix = f" [{i+1}/{len(all_products)}] ({pct:.0f}%)" if not product.name: stats["skipped_no_name"] += 1 continue # Match species normalized = product.normalized_latin.lower().strip() species = species_map.get(normalized) if not species: # Try exact match on raw name (first two words) raw_words = product.raw_latin_name.split() if len(raw_words) >= 2: attempt = f"{raw_words[0].lower()} {raw_words[1].lower()}" species = species_map.get(attempt) if not species: stats["skipped_no_species"] += 1 unmatched.append((product.name, product.raw_latin_name, product.normalized_latin, product.url)) continue species_id = str(species["id"]) species_name = species["name_scientific"] # Check if cultivar exists ckey = (species_id, product.name.lower()) existing = existing_cultivars.get(ckey) if existing: cultivar_id = str(existing["id"]) # ── Enrich existing cultivar with missing data ── updates = {} # Growing data from page gd = product.growing_data if gd.get("planting_depth_cm") and not existing.get("planting_depth_cm"): updates["planting_depth_cm"] = gd["planting_depth_cm"] if gd.get("row_spacing_cm") and not existing.get("row_spacing_cm"): updates["row_spacing_cm"] = gd["row_spacing_cm"] if gd.get("plant_spacing_cm") and not existing.get("plant_spacing_cm"): updates["plant_spacing_cm"] = gd["plant_spacing_cm"] if gd.get("germination_temp_c") and not existing.get("germination_temp_c"): updates["germination_temp_c"] = gd["germination_temp_c"] if gd.get("perennial") and not existing.get("perennial"): updates["perennial"] = True # Calendar data cal = product.calendar if cal.get("indoor_sowing_months") and not existing.get("indoor_sowing_months"): updates["indoor_sowing_months"] = cal["indoor_sowing_months"] if cal.get("direct_sowing_months") and not existing.get("direct_sowing_months"): updates["direct_sowing_months"] = cal["direct_sowing_months"] if cal.get("transplanting_months") and not existing.get("transplanting_months"): updates["transplanting_months"] = cal["transplanting_months"] if cal.get("glasshouse_months") and not existing.get("glasshouse_months"): updates["glasshouse_months"] = cal["glasshouse_months"] if cal.get("harvesting_months") and not existing.get("harvesting_months"): updates["harvesting_months"] = cal["harvesting_months"] # Description if product.description and not existing.get("description"): updates["description"] = product.description if updates: set_clauses = [] values = [] for col, val in updates.items(): set_clauses.append(f"{col} = %s") values.append(val) set_clauses.append("updated_at = NOW()") values.append(cultivar_id) cur.execute( f"UPDATE cultivars SET {', '.join(set_clauses)} WHERE id = %s::uuid", values ) stats["enriched"] += 1 print(f"{prefix} {product.name} -> ENRICHED ({', '.join(updates.keys())})") # ── Add supplier link if missing ── link_exists = False if cultivar_id in existing_links: for lurl, lsku in existing_links[cultivar_id]: if lurl == product.url or (lsku and lsku == product.sku): link_exists = True break if link_exists: stats["link_exists"] += 1 else: try: cur.execute("SAVEPOINT link_sp") cur.execute(""" INSERT INTO cultivar_suppliers (cultivar_id, supplier_id, product_url, article_number, last_checked_at) VALUES (%s::uuid, %s::uuid, %s, %s, NOW()) ON CONFLICT (cultivar_id, supplier_id, article_number) DO UPDATE SET product_url = EXCLUDED.product_url, last_checked_at = NOW() """, (cultivar_id, REINSAAT_SUPPLIER_ID, product.url, product.sku)) cur.execute("RELEASE SAVEPOINT link_sp") stats["linked"] += 1 existing_links.setdefault(cultivar_id, []).append((product.url, product.sku)) print(f"{prefix} {product.name} -> LINKED ({product.sku})") except Exception as e: print(f"{prefix} {product.name} -> LINK ERROR: {e}") cur.execute("ROLLBACK TO SAVEPOINT link_sp") stats["errors"] += 1 else: # ── Create new cultivar ── slug = make_slug(species_name, product.name) # Ensure unique slug base_slug = slug counter = 2 while slug in existing_slugs: slug = f"{base_slug}-{counter}" counter += 1 gd = product.growing_data cal = product.calendar try: cur.execute("SAVEPOINT create_sp") cur.execute(""" INSERT INTO cultivars ( species_id, name, name_de, slug, description, is_organic, perennial, planting_depth_cm, row_spacing_cm, plant_spacing_cm, germination_temp_c, indoor_sowing_months, direct_sowing_months, transplanting_months, glasshouse_months, harvesting_months ) VALUES ( %s::uuid, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s ) RETURNING id """, ( species_id, product.name, product.name, slug, product.description, product.is_organic, gd.get("perennial", False), gd.get("planting_depth_cm"), gd.get("row_spacing_cm"), gd.get("plant_spacing_cm"), gd.get("germination_temp_c"), cal.get("indoor_sowing_months"), cal.get("direct_sowing_months"), cal.get("transplanting_months"), cal.get("glasshouse_months"), cal.get("harvesting_months"), )) new_id = str(cur.fetchone()["id"]) existing_slugs.add(slug) existing_cultivars[ckey] = {"id": new_id} stats["created"] += 1 # Link to supplier cur.execute(""" INSERT INTO cultivar_suppliers (cultivar_id, supplier_id, product_url, article_number, last_checked_at) VALUES (%s::uuid, %s::uuid, %s, %s, NOW()) """, (new_id, REINSAAT_SUPPLIER_ID, product.url, product.sku)) stats["linked"] += 1 existing_links.setdefault(new_id, []).append((product.url, product.sku)) print(f"{prefix} {product.name} -> CREATED ({species_name}, {slug})") cur.execute("RELEASE SAVEPOINT create_sp") except Exception as e: print(f"{prefix} {product.name} -> CREATE ERROR: {e}") cur.execute("ROLLBACK TO SAVEPOINT create_sp") stats["errors"] += 1 # ── Commit ── conn.commit() # ── Summary ── print("\n" + "=" * 70) print("SUMMARY") print("=" * 70) print(f" Total products discovered: {len(all_products)}") print(f" New cultivars created: {stats['created']}") print(f" New supplier links added: {stats['linked']}") print(f" Cultivars enriched: {stats['enriched']}") print(f" Links already existed: {stats['link_exists']}") print(f" Skipped (no species): {stats['skipped_no_species']}") print(f" Skipped (no name): {stats['skipped_no_name']}") print(f" Errors: {stats['errors']}") print("=" * 70) if unmatched: print(f"\n UNMATCHED PRODUCTS ({len(unmatched)}):") for name, raw_latin, normalized, url in sorted(unmatched, key=lambda x: x[2]): print(f" {normalized:30s} (raw: {raw_latin:40s}) {name:30s} {url}") cur.close() conn.close() if __name__ == "__main__": main()