Files
herbapi/tools/scrapers/scrape_reinsaat_v2.py

771 lines
30 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Reinsaat Scraper v2 — scrape ALL Reinsaat categories, match species by extracting
genus+species from extended botanical names, create/enrich cultivars, link supplier.
Uses direct PostgreSQL access (psycopg2) for speed and reliability.
"""
import json
import re
import ssl
import sys
import time
import uuid
import html as html_mod
import urllib.request
import urllib.error
import urllib.parse
from dataclasses import dataclass, field
from typing import Optional
# Unbuffered output
sys.stdout.reconfigure(line_buffering=True)
sys.stderr.reconfigure(line_buffering=True)
import psycopg2
import psycopg2.extras
# ── Config ──────────────────────────────────────────────────────────────────
DB_HOST = "10.31.3.90"
DB_NAME = "herbapi"
DB_USER = "herbapi"
DB_PASS = "_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj"
REINSAAT_SUPPLIER_ID = "019ced24-1702-72d1-9acc-90435441a5c4"
DELAY = 0.3
USER_AGENT = "HerbAPI-Scraper/2.0 (florian.berthold@sub-net.at)"
# ── All Reinsaat categories ────────────────────────────────────────────────
CATEGORIES = [
"https://www.reinsaat.at/shop/DE/bohnen/",
"https://www.reinsaat.at/shop/DE/erbsen/",
"https://www.reinsaat.at/shop/DE/gurken/",
"https://www.reinsaat.at/shop/DE/karotten_moehren_1/",
"https://www.reinsaat.at/shop/DE/knollenfenchel/",
"https://www.reinsaat.at/shop/DE/kohlgewaechse/",
"https://www.reinsaat.at/shop/DE/kuerbis/",
"https://www.reinsaat.at/shop/DE/mais/",
"https://www.reinsaat.at/shop/DE/mangold/",
"https://www.reinsaat.at/shop/DE/melanzani_1/",
"https://www.reinsaat.at/shop/DE/melone/",
"https://www.reinsaat.at/shop/DE/paprika/",
"https://www.reinsaat.at/shop/DE/pastinaken_1/",
"https://www.reinsaat.at/shop/DE/petersilie/",
"https://www.reinsaat.at/shop/DE/pfefferoni_chili/",
"https://www.reinsaat.at/shop/DE/porree/",
"https://www.reinsaat.at/shop/DE/radies_rettich/",
"https://www.reinsaat.at/shop/DE/rote_ruebe/",
"https://www.reinsaat.at/shop/DE/salate/",
"https://www.reinsaat.at/shop/DE/schwarzwurzeln/",
"https://www.reinsaat.at/shop/DE/sellerie/",
"https://www.reinsaat.at/shop/DE/spinat/",
"https://www.reinsaat.at/shop/DE/tomaten_paradeiser/",
"https://www.reinsaat.at/shop/DE/wurzelpetersilie_1/",
"https://www.reinsaat.at/shop/DE/zucchini/",
"https://www.reinsaat.at/shop/DE/zwiebel_knoblauch/",
"https://www.reinsaat.at/shop/DE/kuechen-_und_gewuerzkraeuter/",
"https://www.reinsaat.at/shop/DE/blumen_und_heilkraeuter/",
"https://www.reinsaat.at/shop/DE/gruenduengung/",
]
# ── HTTP ────────────────────────────────────────────────────────────────────
_ssl_ctx = ssl.create_default_context()
def fetch_url(url: str, retries: int = 2) -> str:
req = urllib.request.Request(url, headers={
"User-Agent": USER_AGENT,
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "de-AT,de;q=0.9,en;q=0.5",
})
for attempt in range(retries + 1):
try:
with urllib.request.urlopen(req, timeout=30, context=_ssl_ctx) as resp:
charset = resp.headers.get_content_charset() or "utf-8"
return resp.read().decode(charset)
except (urllib.error.URLError, urllib.error.HTTPError, TimeoutError) as e:
if attempt < retries:
time.sleep(2)
continue
raise
return ""
# ── HTML parsing helpers ────────────────────────────────────────────────────
def extract_links(html_text: str, base_url: str) -> list[str]:
links = []
seen = set()
for m in re.finditer(r'<a\s[^>]*href="([^"]*)"', html_text, re.IGNORECASE):
href = m.group(1)
if not href or href.startswith("#") or href.startswith("javascript:"):
continue
full = urllib.parse.urljoin(base_url, href)
if full not in seen:
seen.add(full)
links.append(full)
return links
def extract_jsonld_product(html_text: str) -> Optional[dict]:
for m in re.finditer(
r'<script[^>]*type="application/ld\+json"[^>]*>(.*?)</script>',
html_text, re.DOTALL | re.IGNORECASE
):
try:
data = json.loads(m.group(1))
if isinstance(data, dict) and data.get("@type") == "Product":
return data
except (json.JSONDecodeError, ValueError):
continue
return None
def html_to_text(html_text: str) -> str:
"""Strip HTML tags and decode entities."""
text = re.sub(r'<[^>]+>', ' ', html_text)
text = html_mod.unescape(text)
text = re.sub(r'\s+', ' ', text).strip()
return text
def extract_botanical_name(html_text: str) -> str:
"""
Extract the botanical/Latin name from the page.
Primary source: <div class="fce_shop_kurztext"> content.
Fallback: <em> tags in growing infos.
Returns the raw text (may include authority names, infraspecific ranks, etc.)
"""
# Primary: kurztext div
m = re.search(r'class="fce_shop_kurztext"[^>]*>(.*?)</div>', html_text, re.DOTALL | re.IGNORECASE)
if m:
text = html_to_text(m.group(1)).strip()
if text and re.search(r'[A-Z][a-z]+\s+[a-z]', text):
return text
# Fallback: first <em> in growingInfos that looks like a Latin name
gi = re.search(r'class="growingInfos"[^>]*>(.*?)</div>', html_text, re.DOTALL | re.IGNORECASE)
if gi:
for em in re.finditer(r'<em>(.*?)</em>', gi.group(1), re.DOTALL):
text = html_to_text(em.group(1)).strip()
if text and re.search(r'[A-Z][a-z]+\s+[a-z]', text):
return text
# Last resort: any <em>/<i> tag with a Latin-looking name
for tag in re.finditer(r'<(?:em|i)>(.*?)</(?:em|i)>', html_text, re.DOTALL | re.IGNORECASE):
text = html_to_text(tag.group(1)).strip()
if text and re.search(r'^[A-Z][a-z]+\s+[a-z]+', text) and len(text) < 100:
return text
return ""
def normalize_latin_name(raw: str) -> str:
"""
Extract genus + species from an extended botanical name.
Examples:
"Pisum sativum L. convar. sat." -> "Pisum sativum"
"Capsicum annuum L." -> "Capsicum annuum"
"Brassica oleracea L. convar. botrytis" -> "Brassica oleracea"
"Solanum lycopersicum L." -> "Solanum lycopersicum"
"Cucumis sativus" -> "Cucumis sativus"
"Mentha x piperita" -> "Mentha x piperita"
"""
if not raw:
return ""
# Clean up
name = raw.strip()
# Remove leading/trailing punctuation
name = name.strip(".,;:")
words = name.split()
if len(words) < 2:
return name
genus = words[0]
# Handle hybrid notation: "Mentha x piperita" or "Mentha × piperita"
if len(words) >= 3 and words[1] in ("x", "×"):
return f"{genus} x {words[2]}"
species = words[1]
# Validate: genus should start uppercase, species lowercase
if not genus[0].isupper() or not species[0].islower():
return name # Can't parse, return as-is
return f"{genus} {species}"
# ── Calendar parsing ────────────────────────────────────────────────────────
CALENDAR_ROW_TYPES = {
"voranzucht": "indoor_sowing_months",
"vorzucht": "indoor_sowing_months",
"vorkultur": "indoor_sowing_months",
"aussaat/ pflanzung freiland": "direct_sowing_months",
"aussaat/pflanzung freiland": "direct_sowing_months",
"aussaat freiland": "direct_sowing_months",
"direktsaat": "direct_sowing_months",
"pflanzung freiland": "transplanting_months",
"pflanzung": "transplanting_months",
"aussaat/ pflanzung gewächshaus": "glasshouse_months",
"aussaat/pflanzung gewächshaus": "glasshouse_months",
"gewächshaus": "glasshouse_months",
"ernte": "harvesting_months",
}
def parse_calendar(html_text: str) -> dict:
"""
Parse the Reinsaat growing calendar table.
Returns dict with keys like 'direct_sowing_months', 'harvesting_months' etc.
Each value is a sorted list of month integers (1-12).
"""
result = {}
cal_match = re.search(r'class="rs-growing-time[^"]*"(.*?)</table>', html_text, re.DOTALL)
if not cal_match:
return result
cal = cal_match.group(1)
rows = re.findall(r'<tr>(.*?)</tr>', cal, re.DOTALL)
for row in rows:
# Get label
label_m = re.search(r'class="type-lable"[^>]*>(.*?)</td>', row, re.DOTALL)
if not label_m:
continue
label = html_to_text(label_m.group(1)).strip().lower()
# Map label to our field
field_name = None
for pattern, fname in CALENDAR_ROW_TYPES.items():
if pattern in label:
field_name = fname
break
if not field_name:
continue
# Extract background colors for each cell (24 cells = 12 months x 2 halves)
colors = re.findall(r'background-color:\s*([^;"]+)', row)
# Convert to months: cell i maps to month (i // 2) + 1
active_months = set()
for i, color in enumerate(colors):
color = color.strip().lower()
if color != "none" and color != "transparent" and color != "":
month = (i // 2) + 1
if 1 <= month <= 12:
active_months.add(month)
if active_months:
# Merge if same field already found (e.g. two sowing rows)
if field_name in result:
result[field_name] = sorted(set(result[field_name]) | active_months)
else:
result[field_name] = sorted(active_months)
return result
# ── Growing data extraction ─────────────────────────────────────────────────
def extract_growing_data(html_text: str) -> dict:
"""Extract spacing, depth, germination temp from the growing text."""
data = {}
# Get the growingInfos text
gi = re.search(r'class="growingInfos"[^>]*>(.*?)</div>', html_text, re.DOTALL | re.IGNORECASE)
if not gi:
return data
full_text = html_to_text(gi.group(1))
# Also get the raw HTML for better entity handling
raw_html = gi.group(1)
# Convert HTML entities for pattern matching
raw_text = html_mod.unescape(re.sub(r'<[^>]+>', ' ', raw_html))
raw_text = re.sub(r'\s+', ' ', raw_text)
# ── Sowing depth ──
depth_pats = [
r'(?:Saattiefe|Aussaattiefe|Ablagetiefe|Saatgutablage)[:\s]*(?:ca\.?\s*)?(\d+(?:[.,]\d+)?)\s*[-]\s*(\d+(?:[.,]\d+)?)\s*cm',
r'(?:Saattiefe|Aussaattiefe|Ablagetiefe|Saatgutablage)[:\s]*(?:ca\.?\s*)?(\d+(?:[.,]\d+)?)\s*cm',
]
for pat in depth_pats:
dm = re.search(pat, raw_text, re.IGNORECASE)
if dm:
vals = [float(dm.group(i).replace(",", ".")) for i in range(1, dm.lastindex + 1)]
data["planting_depth_cm"] = round(sum(vals) / len(vals), 2)
break
# ── Spacing: "ROW x PLANT cm" ──
spacing_pats = [
# "3045 x 35 cm" (range x range)
r'(\d+)\s*[-]\s*(\d+)\s*[x×]\s*(\d+)\s*[-]\s*(\d+)\s*cm',
# "100 x 50 cm" (simple)
r'(\d+(?:[.,]\d+)?)\s*[x×]\s*(\d+(?:[.,]\d+)?)\s*cm',
]
for pat in spacing_pats:
matches = re.findall(pat, raw_text, re.IGNORECASE)
if matches:
m = matches[-1] # prefer last match
if len(m) == 4:
data["row_spacing_cm"] = round((float(m[0]) + float(m[1])) / 2, 1)
data["plant_spacing_cm"] = round((float(m[2]) + float(m[3])) / 2, 1)
elif len(m) == 2:
v1 = float(m[0].replace(",", "."))
v2 = float(m[1].replace(",", "."))
data["row_spacing_cm"] = round(v1, 1)
data["plant_spacing_cm"] = round(v2, 1)
break
# ── Germination temperature ──
temp_pats = [
r'(?:Keimtemperatur|Keimtemp)[.:\s]*(?:ca\.?\s*)?(\d+)\s*[-]\s*(\d+)\s*[°]?\s*C',
r'(?:mindestens|mind\.)\s*(\d+)\s*°\s*C',
]
for pat in temp_pats:
tm = re.search(pat, raw_text, re.IGNORECASE)
if tm:
vals = [float(tm.group(i)) for i in range(1, tm.lastindex + 1)]
avg = sum(vals) / len(vals)
if 5 <= avg <= 40:
data["germination_temp_c"] = round(avg, 1)
break
# ── Perennial ──
perennial_pats = [r'mehrj[aä]hrig', r'winterhart', r'ausdauernd', r'Halbstrauch', r'Staude']
for pat in perennial_pats:
if re.search(pat, raw_text, re.IGNORECASE):
data["perennial"] = True
break
return data
# ── Product data ────────────────────────────────────────────────────────────
@dataclass
class ProductData:
name: str = ""
raw_latin_name: str = ""
normalized_latin: str = ""
description: str = ""
sku: str = ""
url: str = ""
is_organic: bool = True
growing_data: dict = field(default_factory=dict)
calendar: dict = field(default_factory=dict)
def parse_product(html_text: str, url: str) -> Optional[ProductData]:
"""Parse a product page. Returns ProductData or None if not a product page."""
jsonld = extract_jsonld_product(html_text)
if not jsonld:
return None
product = ProductData(url=url)
product.name = jsonld.get("name", "").strip()
product.description = jsonld.get("description", "").strip()
product.sku = jsonld.get("model", "").strip()
# Extract and normalize botanical name
product.raw_latin_name = extract_botanical_name(html_text)
product.normalized_latin = normalize_latin_name(product.raw_latin_name)
# Extract growing data
product.growing_data = extract_growing_data(html_text)
# Parse calendar
product.calendar = parse_calendar(html_text)
# Check organic status (Reinsaat is all organic, but check for "demeter" too)
product.is_organic = True
return product
# ── Recursive discovery ─────────────────────────────────────────────────────
def discover_products(
category_url: str,
max_depth: int = 4,
_depth: int = 0,
_visited: set = None,
) -> list[ProductData]:
if _visited is None:
_visited = set()
if category_url in _visited or _depth > max_depth:
return []
_visited.add(category_url)
indent = " " * (_depth + 1)
try:
html_text = fetch_url(category_url)
time.sleep(DELAY)
except Exception as e:
print(f"{indent}ERROR fetching {category_url}: {e}")
return []
# Check if this is a product page
product = parse_product(html_text, category_url)
if product:
return [product]
# Category page: find child links
cat_path = urllib.parse.urlparse(category_url).path.rstrip("/")
child_links = []
for link in extract_links(html_text, category_url):
parsed = urllib.parse.urlparse(link)
if parsed.netloc and parsed.netloc != "www.reinsaat.at":
continue
child_path = parsed.path.rstrip("/")
if not child_path.startswith(cat_path + "/"):
continue
relative = child_path[len(cat_path) + 1:]
if "/" in relative or not relative:
continue
clean_url = f"https://www.reinsaat.at{child_path}/"
if clean_url not in _visited:
child_links.append(clean_url)
child_links = list(dict.fromkeys(child_links))
print(f"{indent}Category {category_url} -> {len(child_links)} children")
products = []
for child_url in child_links:
results = discover_products(child_url, max_depth, _depth + 1, _visited)
products.extend(results)
return products
# ── Slug generation ─────────────────────────────────────────────────────────
def make_slug(species_name: str, cultivar_name: str) -> str:
"""Generate a URL-friendly slug."""
raw = f"{species_name}-{cultivar_name}".lower()
# Replace umlauts and special chars
replacements = {
'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'ß': 'ss',
'é': 'e', 'è': 'e', 'ê': 'e', 'ë': 'e',
'á': 'a', 'à': 'a', 'â': 'a',
'í': 'i', 'ì': 'i', 'î': 'i',
'ó': 'o', 'ò': 'o', 'ô': 'o',
'ú': 'u', 'ù': 'u', 'û': 'u',
'ñ': 'n', 'ç': 'c',
}
for old, new in replacements.items():
raw = raw.replace(old, new)
# Keep only alphanumeric and hyphens
slug = re.sub(r'[^a-z0-9]+', '-', raw)
slug = slug.strip('-')
# Collapse multiple hyphens
slug = re.sub(r'-+', '-', slug)
return slug
# ── Main ────────────────────────────────────────────────────────────────────
def db_connect():
"""Create a fresh DB connection."""
conn = psycopg2.connect(
host=DB_HOST, dbname=DB_NAME, user=DB_USER, password=DB_PASS
)
conn.autocommit = False
return conn
def main():
print("=" * 70)
print("Reinsaat Scraper v2")
print("=" * 70)
# ── Phase 1: Discover all products (no DB needed) ──
print("\n[1] Discovering products from Reinsaat categories...")
all_products: list[ProductData] = []
visited: set[str] = set()
for cat_url in CATEGORIES:
print(f"\n Category: {cat_url}")
products = discover_products(cat_url, max_depth=4, _visited=visited)
all_products.extend(products)
print(f" -> {len(products)} products")
# Deduplicate by URL
seen_urls = set()
unique_products = []
for p in all_products:
if p.url not in seen_urls:
seen_urls.add(p.url)
unique_products.append(p)
all_products = unique_products
print(f"\n Total unique products: {len(all_products)}")
# ── Phase 2: Connect to DB and load existing data ──
print("\n[2] Connecting to DB and loading existing data...")
conn = db_connect()
cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
# Load species
cur.execute("SELECT id, name_scientific FROM species ORDER BY name_scientific")
species_rows = cur.fetchall()
species_map = {}
for row in species_rows:
key = row["name_scientific"].lower().strip()
species_map[key] = row
print(f" {len(species_map)} species loaded")
# Load existing cultivars
cur.execute("""
SELECT id, species_id, name, slug, description,
row_spacing_cm, plant_spacing_cm, planting_depth_cm,
germination_temp_c, perennial,
indoor_sowing_months, direct_sowing_months,
transplanting_months, glasshouse_months, harvesting_months
FROM cultivars
""")
cultivar_rows = cur.fetchall()
existing_cultivars = {}
existing_slugs = set()
for row in cultivar_rows:
sid = str(row["species_id"])
name_lower = row["name"].lower()
existing_cultivars[(sid, name_lower)] = dict(row)
existing_slugs.add(row["slug"])
print(f" {len(existing_cultivars)} cultivars loaded")
# Load existing Reinsaat supplier links
cur.execute("""
SELECT cultivar_id, product_url, article_number
FROM cultivar_suppliers
WHERE supplier_id = %s
""", (REINSAAT_SUPPLIER_ID,))
existing_links = {}
for row in cur.fetchall():
cid = str(row["cultivar_id"])
url = row["product_url"] or ""
sku = row["article_number"] or ""
existing_links.setdefault(cid, []).append((url, sku))
print(f" {sum(len(v) for v in existing_links.values())} existing links for {len(existing_links)} cultivars")
# ── Phase 3: Process products ──
print("\n[3] Processing products...")
stats = {
"created": 0,
"linked": 0,
"enriched": 0,
"skipped_no_species": 0,
"skipped_no_name": 0,
"link_exists": 0,
"errors": 0,
}
unmatched = []
for i, product in enumerate(all_products):
pct = (i + 1) / len(all_products) * 100
prefix = f" [{i+1}/{len(all_products)}] ({pct:.0f}%)"
if not product.name:
stats["skipped_no_name"] += 1
continue
# Match species
normalized = product.normalized_latin.lower().strip()
species = species_map.get(normalized)
if not species:
# Try exact match on raw name (first two words)
raw_words = product.raw_latin_name.split()
if len(raw_words) >= 2:
attempt = f"{raw_words[0].lower()} {raw_words[1].lower()}"
species = species_map.get(attempt)
if not species:
stats["skipped_no_species"] += 1
unmatched.append((product.name, product.raw_latin_name, product.normalized_latin, product.url))
continue
species_id = str(species["id"])
species_name = species["name_scientific"]
# Check if cultivar exists
ckey = (species_id, product.name.lower())
existing = existing_cultivars.get(ckey)
if existing:
cultivar_id = str(existing["id"])
# ── Enrich existing cultivar with missing data ──
updates = {}
# Growing data from page
gd = product.growing_data
if gd.get("planting_depth_cm") and not existing.get("planting_depth_cm"):
updates["planting_depth_cm"] = gd["planting_depth_cm"]
if gd.get("row_spacing_cm") and not existing.get("row_spacing_cm"):
updates["row_spacing_cm"] = gd["row_spacing_cm"]
if gd.get("plant_spacing_cm") and not existing.get("plant_spacing_cm"):
updates["plant_spacing_cm"] = gd["plant_spacing_cm"]
if gd.get("germination_temp_c") and not existing.get("germination_temp_c"):
updates["germination_temp_c"] = gd["germination_temp_c"]
if gd.get("perennial") and not existing.get("perennial"):
updates["perennial"] = True
# Calendar data
cal = product.calendar
if cal.get("indoor_sowing_months") and not existing.get("indoor_sowing_months"):
updates["indoor_sowing_months"] = cal["indoor_sowing_months"]
if cal.get("direct_sowing_months") and not existing.get("direct_sowing_months"):
updates["direct_sowing_months"] = cal["direct_sowing_months"]
if cal.get("transplanting_months") and not existing.get("transplanting_months"):
updates["transplanting_months"] = cal["transplanting_months"]
if cal.get("glasshouse_months") and not existing.get("glasshouse_months"):
updates["glasshouse_months"] = cal["glasshouse_months"]
if cal.get("harvesting_months") and not existing.get("harvesting_months"):
updates["harvesting_months"] = cal["harvesting_months"]
# Description
if product.description and not existing.get("description"):
updates["description"] = product.description
if updates:
set_clauses = []
values = []
for col, val in updates.items():
set_clauses.append(f"{col} = %s")
values.append(val)
set_clauses.append("updated_at = NOW()")
values.append(cultivar_id)
cur.execute(
f"UPDATE cultivars SET {', '.join(set_clauses)} WHERE id = %s::uuid",
values
)
stats["enriched"] += 1
print(f"{prefix} {product.name} -> ENRICHED ({', '.join(updates.keys())})")
# ── Add supplier link if missing ──
link_exists = False
if cultivar_id in existing_links:
for lurl, lsku in existing_links[cultivar_id]:
if lurl == product.url or (lsku and lsku == product.sku):
link_exists = True
break
if link_exists:
stats["link_exists"] += 1
else:
try:
cur.execute("SAVEPOINT link_sp")
cur.execute("""
INSERT INTO cultivar_suppliers (cultivar_id, supplier_id, product_url, article_number, last_checked_at)
VALUES (%s::uuid, %s::uuid, %s, %s, NOW())
ON CONFLICT (cultivar_id, supplier_id, article_number) DO UPDATE
SET product_url = EXCLUDED.product_url, last_checked_at = NOW()
""", (cultivar_id, REINSAAT_SUPPLIER_ID, product.url, product.sku))
cur.execute("RELEASE SAVEPOINT link_sp")
stats["linked"] += 1
existing_links.setdefault(cultivar_id, []).append((product.url, product.sku))
print(f"{prefix} {product.name} -> LINKED ({product.sku})")
except Exception as e:
print(f"{prefix} {product.name} -> LINK ERROR: {e}")
cur.execute("ROLLBACK TO SAVEPOINT link_sp")
stats["errors"] += 1
else:
# ── Create new cultivar ──
slug = make_slug(species_name, product.name)
# Ensure unique slug
base_slug = slug
counter = 2
while slug in existing_slugs:
slug = f"{base_slug}-{counter}"
counter += 1
gd = product.growing_data
cal = product.calendar
try:
cur.execute("SAVEPOINT create_sp")
cur.execute("""
INSERT INTO cultivars (
species_id, name, name_de, slug, description,
is_organic, perennial,
planting_depth_cm, row_spacing_cm, plant_spacing_cm,
germination_temp_c,
indoor_sowing_months, direct_sowing_months,
transplanting_months, glasshouse_months, harvesting_months
) VALUES (
%s::uuid, %s, %s, %s, %s,
%s, %s,
%s, %s, %s,
%s,
%s, %s,
%s, %s, %s
)
RETURNING id
""", (
species_id,
product.name,
product.name,
slug,
product.description,
product.is_organic,
gd.get("perennial", False),
gd.get("planting_depth_cm"),
gd.get("row_spacing_cm"),
gd.get("plant_spacing_cm"),
gd.get("germination_temp_c"),
cal.get("indoor_sowing_months"),
cal.get("direct_sowing_months"),
cal.get("transplanting_months"),
cal.get("glasshouse_months"),
cal.get("harvesting_months"),
))
new_id = str(cur.fetchone()["id"])
existing_slugs.add(slug)
existing_cultivars[ckey] = {"id": new_id}
stats["created"] += 1
# Link to supplier
cur.execute("""
INSERT INTO cultivar_suppliers (cultivar_id, supplier_id, product_url, article_number, last_checked_at)
VALUES (%s::uuid, %s::uuid, %s, %s, NOW())
""", (new_id, REINSAAT_SUPPLIER_ID, product.url, product.sku))
stats["linked"] += 1
existing_links.setdefault(new_id, []).append((product.url, product.sku))
print(f"{prefix} {product.name} -> CREATED ({species_name}, {slug})")
cur.execute("RELEASE SAVEPOINT create_sp")
except Exception as e:
print(f"{prefix} {product.name} -> CREATE ERROR: {e}")
cur.execute("ROLLBACK TO SAVEPOINT create_sp")
stats["errors"] += 1
# ── Commit ──
conn.commit()
# ── Summary ──
print("\n" + "=" * 70)
print("SUMMARY")
print("=" * 70)
print(f" Total products discovered: {len(all_products)}")
print(f" New cultivars created: {stats['created']}")
print(f" New supplier links added: {stats['linked']}")
print(f" Cultivars enriched: {stats['enriched']}")
print(f" Links already existed: {stats['link_exists']}")
print(f" Skipped (no species): {stats['skipped_no_species']}")
print(f" Skipped (no name): {stats['skipped_no_name']}")
print(f" Errors: {stats['errors']}")
print("=" * 70)
if unmatched:
print(f"\n UNMATCHED PRODUCTS ({len(unmatched)}):")
for name, raw_latin, normalized, url in sorted(unmatched, key=lambda x: x[2]):
print(f" {normalized:30s} (raw: {raw_latin:40s}) {name:30s} {url}")
cur.close()
conn.close()
if __name__ == "__main__":
main()