771 lines
30 KiB
Python
771 lines
30 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Reinsaat Scraper v2 — scrape ALL Reinsaat categories, match species by extracting
|
||
genus+species from extended botanical names, create/enrich cultivars, link supplier.
|
||
|
||
Uses direct PostgreSQL access (psycopg2) for speed and reliability.
|
||
"""
|
||
|
||
import json
|
||
import re
|
||
import ssl
|
||
import sys
|
||
import time
|
||
import uuid
|
||
import html as html_mod
|
||
import urllib.request
|
||
import urllib.error
|
||
import urllib.parse
|
||
from dataclasses import dataclass, field
|
||
from typing import Optional
|
||
|
||
# Unbuffered output
|
||
sys.stdout.reconfigure(line_buffering=True)
|
||
sys.stderr.reconfigure(line_buffering=True)
|
||
|
||
import psycopg2
|
||
import psycopg2.extras
|
||
|
||
# ── Config ──────────────────────────────────────────────────────────────────
|
||
DB_HOST = "10.31.3.90"
|
||
DB_NAME = "herbapi"
|
||
DB_USER = "herbapi"
|
||
DB_PASS = "_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj"
|
||
|
||
REINSAAT_SUPPLIER_ID = "019ced24-1702-72d1-9acc-90435441a5c4"
|
||
DELAY = 0.3
|
||
USER_AGENT = "HerbAPI-Scraper/2.0 (florian.berthold@sub-net.at)"
|
||
|
||
# ── All Reinsaat categories ────────────────────────────────────────────────
|
||
CATEGORIES = [
|
||
"https://www.reinsaat.at/shop/DE/bohnen/",
|
||
"https://www.reinsaat.at/shop/DE/erbsen/",
|
||
"https://www.reinsaat.at/shop/DE/gurken/",
|
||
"https://www.reinsaat.at/shop/DE/karotten_moehren_1/",
|
||
"https://www.reinsaat.at/shop/DE/knollenfenchel/",
|
||
"https://www.reinsaat.at/shop/DE/kohlgewaechse/",
|
||
"https://www.reinsaat.at/shop/DE/kuerbis/",
|
||
"https://www.reinsaat.at/shop/DE/mais/",
|
||
"https://www.reinsaat.at/shop/DE/mangold/",
|
||
"https://www.reinsaat.at/shop/DE/melanzani_1/",
|
||
"https://www.reinsaat.at/shop/DE/melone/",
|
||
"https://www.reinsaat.at/shop/DE/paprika/",
|
||
"https://www.reinsaat.at/shop/DE/pastinaken_1/",
|
||
"https://www.reinsaat.at/shop/DE/petersilie/",
|
||
"https://www.reinsaat.at/shop/DE/pfefferoni_chili/",
|
||
"https://www.reinsaat.at/shop/DE/porree/",
|
||
"https://www.reinsaat.at/shop/DE/radies_rettich/",
|
||
"https://www.reinsaat.at/shop/DE/rote_ruebe/",
|
||
"https://www.reinsaat.at/shop/DE/salate/",
|
||
"https://www.reinsaat.at/shop/DE/schwarzwurzeln/",
|
||
"https://www.reinsaat.at/shop/DE/sellerie/",
|
||
"https://www.reinsaat.at/shop/DE/spinat/",
|
||
"https://www.reinsaat.at/shop/DE/tomaten_paradeiser/",
|
||
"https://www.reinsaat.at/shop/DE/wurzelpetersilie_1/",
|
||
"https://www.reinsaat.at/shop/DE/zucchini/",
|
||
"https://www.reinsaat.at/shop/DE/zwiebel_knoblauch/",
|
||
"https://www.reinsaat.at/shop/DE/kuechen-_und_gewuerzkraeuter/",
|
||
"https://www.reinsaat.at/shop/DE/blumen_und_heilkraeuter/",
|
||
"https://www.reinsaat.at/shop/DE/gruenduengung/",
|
||
]
|
||
|
||
# ── HTTP ────────────────────────────────────────────────────────────────────
|
||
_ssl_ctx = ssl.create_default_context()
|
||
|
||
|
||
def fetch_url(url: str, retries: int = 2) -> str:
|
||
req = urllib.request.Request(url, headers={
|
||
"User-Agent": USER_AGENT,
|
||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||
"Accept-Language": "de-AT,de;q=0.9,en;q=0.5",
|
||
})
|
||
for attempt in range(retries + 1):
|
||
try:
|
||
with urllib.request.urlopen(req, timeout=30, context=_ssl_ctx) as resp:
|
||
charset = resp.headers.get_content_charset() or "utf-8"
|
||
return resp.read().decode(charset)
|
||
except (urllib.error.URLError, urllib.error.HTTPError, TimeoutError) as e:
|
||
if attempt < retries:
|
||
time.sleep(2)
|
||
continue
|
||
raise
|
||
return ""
|
||
|
||
|
||
# ── HTML parsing helpers ────────────────────────────────────────────────────
|
||
def extract_links(html_text: str, base_url: str) -> list[str]:
|
||
links = []
|
||
seen = set()
|
||
for m in re.finditer(r'<a\s[^>]*href="([^"]*)"', html_text, re.IGNORECASE):
|
||
href = m.group(1)
|
||
if not href or href.startswith("#") or href.startswith("javascript:"):
|
||
continue
|
||
full = urllib.parse.urljoin(base_url, href)
|
||
if full not in seen:
|
||
seen.add(full)
|
||
links.append(full)
|
||
return links
|
||
|
||
|
||
def extract_jsonld_product(html_text: str) -> Optional[dict]:
|
||
for m in re.finditer(
|
||
r'<script[^>]*type="application/ld\+json"[^>]*>(.*?)</script>',
|
||
html_text, re.DOTALL | re.IGNORECASE
|
||
):
|
||
try:
|
||
data = json.loads(m.group(1))
|
||
if isinstance(data, dict) and data.get("@type") == "Product":
|
||
return data
|
||
except (json.JSONDecodeError, ValueError):
|
||
continue
|
||
return None
|
||
|
||
|
||
def html_to_text(html_text: str) -> str:
|
||
"""Strip HTML tags and decode entities."""
|
||
text = re.sub(r'<[^>]+>', ' ', html_text)
|
||
text = html_mod.unescape(text)
|
||
text = re.sub(r'\s+', ' ', text).strip()
|
||
return text
|
||
|
||
|
||
def extract_botanical_name(html_text: str) -> str:
|
||
"""
|
||
Extract the botanical/Latin name from the page.
|
||
Primary source: <div class="fce_shop_kurztext"> content.
|
||
Fallback: <em> tags in growing infos.
|
||
|
||
Returns the raw text (may include authority names, infraspecific ranks, etc.)
|
||
"""
|
||
# Primary: kurztext div
|
||
m = re.search(r'class="fce_shop_kurztext"[^>]*>(.*?)</div>', html_text, re.DOTALL | re.IGNORECASE)
|
||
if m:
|
||
text = html_to_text(m.group(1)).strip()
|
||
if text and re.search(r'[A-Z][a-z]+\s+[a-z]', text):
|
||
return text
|
||
|
||
# Fallback: first <em> in growingInfos that looks like a Latin name
|
||
gi = re.search(r'class="growingInfos"[^>]*>(.*?)</div>', html_text, re.DOTALL | re.IGNORECASE)
|
||
if gi:
|
||
for em in re.finditer(r'<em>(.*?)</em>', gi.group(1), re.DOTALL):
|
||
text = html_to_text(em.group(1)).strip()
|
||
if text and re.search(r'[A-Z][a-z]+\s+[a-z]', text):
|
||
return text
|
||
|
||
# Last resort: any <em>/<i> tag with a Latin-looking name
|
||
for tag in re.finditer(r'<(?:em|i)>(.*?)</(?:em|i)>', html_text, re.DOTALL | re.IGNORECASE):
|
||
text = html_to_text(tag.group(1)).strip()
|
||
if text and re.search(r'^[A-Z][a-z]+\s+[a-z]+', text) and len(text) < 100:
|
||
return text
|
||
|
||
return ""
|
||
|
||
|
||
def normalize_latin_name(raw: str) -> str:
|
||
"""
|
||
Extract genus + species from an extended botanical name.
|
||
|
||
Examples:
|
||
"Pisum sativum L. convar. sat." -> "Pisum sativum"
|
||
"Capsicum annuum L." -> "Capsicum annuum"
|
||
"Brassica oleracea L. convar. botrytis" -> "Brassica oleracea"
|
||
"Solanum lycopersicum L." -> "Solanum lycopersicum"
|
||
"Cucumis sativus" -> "Cucumis sativus"
|
||
"Mentha x piperita" -> "Mentha x piperita"
|
||
"""
|
||
if not raw:
|
||
return ""
|
||
|
||
# Clean up
|
||
name = raw.strip()
|
||
# Remove leading/trailing punctuation
|
||
name = name.strip(".,;:")
|
||
|
||
words = name.split()
|
||
if len(words) < 2:
|
||
return name
|
||
|
||
genus = words[0]
|
||
|
||
# Handle hybrid notation: "Mentha x piperita" or "Mentha × piperita"
|
||
if len(words) >= 3 and words[1] in ("x", "×"):
|
||
return f"{genus} x {words[2]}"
|
||
|
||
species = words[1]
|
||
|
||
# Validate: genus should start uppercase, species lowercase
|
||
if not genus[0].isupper() or not species[0].islower():
|
||
return name # Can't parse, return as-is
|
||
|
||
return f"{genus} {species}"
|
||
|
||
|
||
# ── Calendar parsing ────────────────────────────────────────────────────────
|
||
CALENDAR_ROW_TYPES = {
|
||
"voranzucht": "indoor_sowing_months",
|
||
"vorzucht": "indoor_sowing_months",
|
||
"vorkultur": "indoor_sowing_months",
|
||
"aussaat/ pflanzung freiland": "direct_sowing_months",
|
||
"aussaat/pflanzung freiland": "direct_sowing_months",
|
||
"aussaat freiland": "direct_sowing_months",
|
||
"direktsaat": "direct_sowing_months",
|
||
"pflanzung freiland": "transplanting_months",
|
||
"pflanzung": "transplanting_months",
|
||
"aussaat/ pflanzung gewächshaus": "glasshouse_months",
|
||
"aussaat/pflanzung gewächshaus": "glasshouse_months",
|
||
"gewächshaus": "glasshouse_months",
|
||
"ernte": "harvesting_months",
|
||
}
|
||
|
||
|
||
def parse_calendar(html_text: str) -> dict:
|
||
"""
|
||
Parse the Reinsaat growing calendar table.
|
||
Returns dict with keys like 'direct_sowing_months', 'harvesting_months' etc.
|
||
Each value is a sorted list of month integers (1-12).
|
||
"""
|
||
result = {}
|
||
|
||
cal_match = re.search(r'class="rs-growing-time[^"]*"(.*?)</table>', html_text, re.DOTALL)
|
||
if not cal_match:
|
||
return result
|
||
|
||
cal = cal_match.group(1)
|
||
rows = re.findall(r'<tr>(.*?)</tr>', cal, re.DOTALL)
|
||
|
||
for row in rows:
|
||
# Get label
|
||
label_m = re.search(r'class="type-lable"[^>]*>(.*?)</td>', row, re.DOTALL)
|
||
if not label_m:
|
||
continue
|
||
label = html_to_text(label_m.group(1)).strip().lower()
|
||
|
||
# Map label to our field
|
||
field_name = None
|
||
for pattern, fname in CALENDAR_ROW_TYPES.items():
|
||
if pattern in label:
|
||
field_name = fname
|
||
break
|
||
if not field_name:
|
||
continue
|
||
|
||
# Extract background colors for each cell (24 cells = 12 months x 2 halves)
|
||
colors = re.findall(r'background-color:\s*([^;"]+)', row)
|
||
|
||
# Convert to months: cell i maps to month (i // 2) + 1
|
||
active_months = set()
|
||
for i, color in enumerate(colors):
|
||
color = color.strip().lower()
|
||
if color != "none" and color != "transparent" and color != "":
|
||
month = (i // 2) + 1
|
||
if 1 <= month <= 12:
|
||
active_months.add(month)
|
||
|
||
if active_months:
|
||
# Merge if same field already found (e.g. two sowing rows)
|
||
if field_name in result:
|
||
result[field_name] = sorted(set(result[field_name]) | active_months)
|
||
else:
|
||
result[field_name] = sorted(active_months)
|
||
|
||
return result
|
||
|
||
|
||
# ── Growing data extraction ─────────────────────────────────────────────────
|
||
def extract_growing_data(html_text: str) -> dict:
|
||
"""Extract spacing, depth, germination temp from the growing text."""
|
||
data = {}
|
||
|
||
# Get the growingInfos text
|
||
gi = re.search(r'class="growingInfos"[^>]*>(.*?)</div>', html_text, re.DOTALL | re.IGNORECASE)
|
||
if not gi:
|
||
return data
|
||
|
||
full_text = html_to_text(gi.group(1))
|
||
# Also get the raw HTML for better entity handling
|
||
raw_html = gi.group(1)
|
||
# Convert HTML entities for pattern matching
|
||
raw_text = html_mod.unescape(re.sub(r'<[^>]+>', ' ', raw_html))
|
||
raw_text = re.sub(r'\s+', ' ', raw_text)
|
||
|
||
# ── Sowing depth ──
|
||
depth_pats = [
|
||
r'(?:Saattiefe|Aussaattiefe|Ablagetiefe|Saatgutablage)[:\s]*(?:ca\.?\s*)?(\d+(?:[.,]\d+)?)\s*[-–]\s*(\d+(?:[.,]\d+)?)\s*cm',
|
||
r'(?:Saattiefe|Aussaattiefe|Ablagetiefe|Saatgutablage)[:\s]*(?:ca\.?\s*)?(\d+(?:[.,]\d+)?)\s*cm',
|
||
]
|
||
for pat in depth_pats:
|
||
dm = re.search(pat, raw_text, re.IGNORECASE)
|
||
if dm:
|
||
vals = [float(dm.group(i).replace(",", ".")) for i in range(1, dm.lastindex + 1)]
|
||
data["planting_depth_cm"] = round(sum(vals) / len(vals), 2)
|
||
break
|
||
|
||
# ── Spacing: "ROW x PLANT cm" ──
|
||
spacing_pats = [
|
||
# "30–45 x 3–5 cm" (range x range)
|
||
r'(\d+)\s*[-–]\s*(\d+)\s*[x×]\s*(\d+)\s*[-–]\s*(\d+)\s*cm',
|
||
# "100 x 50 cm" (simple)
|
||
r'(\d+(?:[.,]\d+)?)\s*[x×]\s*(\d+(?:[.,]\d+)?)\s*cm',
|
||
]
|
||
for pat in spacing_pats:
|
||
matches = re.findall(pat, raw_text, re.IGNORECASE)
|
||
if matches:
|
||
m = matches[-1] # prefer last match
|
||
if len(m) == 4:
|
||
data["row_spacing_cm"] = round((float(m[0]) + float(m[1])) / 2, 1)
|
||
data["plant_spacing_cm"] = round((float(m[2]) + float(m[3])) / 2, 1)
|
||
elif len(m) == 2:
|
||
v1 = float(m[0].replace(",", "."))
|
||
v2 = float(m[1].replace(",", "."))
|
||
data["row_spacing_cm"] = round(v1, 1)
|
||
data["plant_spacing_cm"] = round(v2, 1)
|
||
break
|
||
|
||
# ── Germination temperature ──
|
||
temp_pats = [
|
||
r'(?:Keimtemperatur|Keimtemp)[.:\s]*(?:ca\.?\s*)?(\d+)\s*[-–]\s*(\d+)\s*[°]?\s*C',
|
||
r'(?:mindestens|mind\.)\s*(\d+)\s*°\s*C',
|
||
]
|
||
for pat in temp_pats:
|
||
tm = re.search(pat, raw_text, re.IGNORECASE)
|
||
if tm:
|
||
vals = [float(tm.group(i)) for i in range(1, tm.lastindex + 1)]
|
||
avg = sum(vals) / len(vals)
|
||
if 5 <= avg <= 40:
|
||
data["germination_temp_c"] = round(avg, 1)
|
||
break
|
||
|
||
# ── Perennial ──
|
||
perennial_pats = [r'mehrj[aä]hrig', r'winterhart', r'ausdauernd', r'Halbstrauch', r'Staude']
|
||
for pat in perennial_pats:
|
||
if re.search(pat, raw_text, re.IGNORECASE):
|
||
data["perennial"] = True
|
||
break
|
||
|
||
return data
|
||
|
||
|
||
# ── Product data ────────────────────────────────────────────────────────────
|
||
@dataclass
|
||
class ProductData:
|
||
name: str = ""
|
||
raw_latin_name: str = ""
|
||
normalized_latin: str = ""
|
||
description: str = ""
|
||
sku: str = ""
|
||
url: str = ""
|
||
is_organic: bool = True
|
||
growing_data: dict = field(default_factory=dict)
|
||
calendar: dict = field(default_factory=dict)
|
||
|
||
|
||
def parse_product(html_text: str, url: str) -> Optional[ProductData]:
|
||
"""Parse a product page. Returns ProductData or None if not a product page."""
|
||
jsonld = extract_jsonld_product(html_text)
|
||
if not jsonld:
|
||
return None
|
||
|
||
product = ProductData(url=url)
|
||
product.name = jsonld.get("name", "").strip()
|
||
product.description = jsonld.get("description", "").strip()
|
||
product.sku = jsonld.get("model", "").strip()
|
||
|
||
# Extract and normalize botanical name
|
||
product.raw_latin_name = extract_botanical_name(html_text)
|
||
product.normalized_latin = normalize_latin_name(product.raw_latin_name)
|
||
|
||
# Extract growing data
|
||
product.growing_data = extract_growing_data(html_text)
|
||
|
||
# Parse calendar
|
||
product.calendar = parse_calendar(html_text)
|
||
|
||
# Check organic status (Reinsaat is all organic, but check for "demeter" too)
|
||
product.is_organic = True
|
||
|
||
return product
|
||
|
||
|
||
# ── Recursive discovery ─────────────────────────────────────────────────────
|
||
def discover_products(
|
||
category_url: str,
|
||
max_depth: int = 4,
|
||
_depth: int = 0,
|
||
_visited: set = None,
|
||
) -> list[ProductData]:
|
||
if _visited is None:
|
||
_visited = set()
|
||
if category_url in _visited or _depth > max_depth:
|
||
return []
|
||
_visited.add(category_url)
|
||
|
||
indent = " " * (_depth + 1)
|
||
|
||
try:
|
||
html_text = fetch_url(category_url)
|
||
time.sleep(DELAY)
|
||
except Exception as e:
|
||
print(f"{indent}ERROR fetching {category_url}: {e}")
|
||
return []
|
||
|
||
# Check if this is a product page
|
||
product = parse_product(html_text, category_url)
|
||
if product:
|
||
return [product]
|
||
|
||
# Category page: find child links
|
||
cat_path = urllib.parse.urlparse(category_url).path.rstrip("/")
|
||
child_links = []
|
||
for link in extract_links(html_text, category_url):
|
||
parsed = urllib.parse.urlparse(link)
|
||
if parsed.netloc and parsed.netloc != "www.reinsaat.at":
|
||
continue
|
||
child_path = parsed.path.rstrip("/")
|
||
if not child_path.startswith(cat_path + "/"):
|
||
continue
|
||
relative = child_path[len(cat_path) + 1:]
|
||
if "/" in relative or not relative:
|
||
continue
|
||
clean_url = f"https://www.reinsaat.at{child_path}/"
|
||
if clean_url not in _visited:
|
||
child_links.append(clean_url)
|
||
|
||
child_links = list(dict.fromkeys(child_links))
|
||
print(f"{indent}Category {category_url} -> {len(child_links)} children")
|
||
|
||
products = []
|
||
for child_url in child_links:
|
||
results = discover_products(child_url, max_depth, _depth + 1, _visited)
|
||
products.extend(results)
|
||
|
||
return products
|
||
|
||
|
||
# ── Slug generation ─────────────────────────────────────────────────────────
|
||
def make_slug(species_name: str, cultivar_name: str) -> str:
|
||
"""Generate a URL-friendly slug."""
|
||
raw = f"{species_name}-{cultivar_name}".lower()
|
||
# Replace umlauts and special chars
|
||
replacements = {
|
||
'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'ß': 'ss',
|
||
'é': 'e', 'è': 'e', 'ê': 'e', 'ë': 'e',
|
||
'á': 'a', 'à': 'a', 'â': 'a',
|
||
'í': 'i', 'ì': 'i', 'î': 'i',
|
||
'ó': 'o', 'ò': 'o', 'ô': 'o',
|
||
'ú': 'u', 'ù': 'u', 'û': 'u',
|
||
'ñ': 'n', 'ç': 'c',
|
||
}
|
||
for old, new in replacements.items():
|
||
raw = raw.replace(old, new)
|
||
# Keep only alphanumeric and hyphens
|
||
slug = re.sub(r'[^a-z0-9]+', '-', raw)
|
||
slug = slug.strip('-')
|
||
# Collapse multiple hyphens
|
||
slug = re.sub(r'-+', '-', slug)
|
||
return slug
|
||
|
||
|
||
# ── Main ────────────────────────────────────────────────────────────────────
|
||
def db_connect():
|
||
"""Create a fresh DB connection."""
|
||
conn = psycopg2.connect(
|
||
host=DB_HOST, dbname=DB_NAME, user=DB_USER, password=DB_PASS
|
||
)
|
||
conn.autocommit = False
|
||
return conn
|
||
|
||
|
||
def main():
|
||
print("=" * 70)
|
||
print("Reinsaat Scraper v2")
|
||
print("=" * 70)
|
||
|
||
# ── Phase 1: Discover all products (no DB needed) ──
|
||
print("\n[1] Discovering products from Reinsaat categories...")
|
||
all_products: list[ProductData] = []
|
||
visited: set[str] = set()
|
||
|
||
for cat_url in CATEGORIES:
|
||
print(f"\n Category: {cat_url}")
|
||
products = discover_products(cat_url, max_depth=4, _visited=visited)
|
||
all_products.extend(products)
|
||
print(f" -> {len(products)} products")
|
||
|
||
# Deduplicate by URL
|
||
seen_urls = set()
|
||
unique_products = []
|
||
for p in all_products:
|
||
if p.url not in seen_urls:
|
||
seen_urls.add(p.url)
|
||
unique_products.append(p)
|
||
all_products = unique_products
|
||
print(f"\n Total unique products: {len(all_products)}")
|
||
|
||
# ── Phase 2: Connect to DB and load existing data ──
|
||
print("\n[2] Connecting to DB and loading existing data...")
|
||
conn = db_connect()
|
||
cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
|
||
|
||
# Load species
|
||
cur.execute("SELECT id, name_scientific FROM species ORDER BY name_scientific")
|
||
species_rows = cur.fetchall()
|
||
species_map = {}
|
||
for row in species_rows:
|
||
key = row["name_scientific"].lower().strip()
|
||
species_map[key] = row
|
||
print(f" {len(species_map)} species loaded")
|
||
|
||
# Load existing cultivars
|
||
cur.execute("""
|
||
SELECT id, species_id, name, slug, description,
|
||
row_spacing_cm, plant_spacing_cm, planting_depth_cm,
|
||
germination_temp_c, perennial,
|
||
indoor_sowing_months, direct_sowing_months,
|
||
transplanting_months, glasshouse_months, harvesting_months
|
||
FROM cultivars
|
||
""")
|
||
cultivar_rows = cur.fetchall()
|
||
existing_cultivars = {}
|
||
existing_slugs = set()
|
||
for row in cultivar_rows:
|
||
sid = str(row["species_id"])
|
||
name_lower = row["name"].lower()
|
||
existing_cultivars[(sid, name_lower)] = dict(row)
|
||
existing_slugs.add(row["slug"])
|
||
print(f" {len(existing_cultivars)} cultivars loaded")
|
||
|
||
# Load existing Reinsaat supplier links
|
||
cur.execute("""
|
||
SELECT cultivar_id, product_url, article_number
|
||
FROM cultivar_suppliers
|
||
WHERE supplier_id = %s
|
||
""", (REINSAAT_SUPPLIER_ID,))
|
||
existing_links = {}
|
||
for row in cur.fetchall():
|
||
cid = str(row["cultivar_id"])
|
||
url = row["product_url"] or ""
|
||
sku = row["article_number"] or ""
|
||
existing_links.setdefault(cid, []).append((url, sku))
|
||
print(f" {sum(len(v) for v in existing_links.values())} existing links for {len(existing_links)} cultivars")
|
||
|
||
# ── Phase 3: Process products ──
|
||
print("\n[3] Processing products...")
|
||
stats = {
|
||
"created": 0,
|
||
"linked": 0,
|
||
"enriched": 0,
|
||
"skipped_no_species": 0,
|
||
"skipped_no_name": 0,
|
||
"link_exists": 0,
|
||
"errors": 0,
|
||
}
|
||
unmatched = []
|
||
|
||
for i, product in enumerate(all_products):
|
||
pct = (i + 1) / len(all_products) * 100
|
||
prefix = f" [{i+1}/{len(all_products)}] ({pct:.0f}%)"
|
||
|
||
if not product.name:
|
||
stats["skipped_no_name"] += 1
|
||
continue
|
||
|
||
# Match species
|
||
normalized = product.normalized_latin.lower().strip()
|
||
species = species_map.get(normalized)
|
||
|
||
if not species:
|
||
# Try exact match on raw name (first two words)
|
||
raw_words = product.raw_latin_name.split()
|
||
if len(raw_words) >= 2:
|
||
attempt = f"{raw_words[0].lower()} {raw_words[1].lower()}"
|
||
species = species_map.get(attempt)
|
||
|
||
if not species:
|
||
stats["skipped_no_species"] += 1
|
||
unmatched.append((product.name, product.raw_latin_name, product.normalized_latin, product.url))
|
||
continue
|
||
|
||
species_id = str(species["id"])
|
||
species_name = species["name_scientific"]
|
||
|
||
# Check if cultivar exists
|
||
ckey = (species_id, product.name.lower())
|
||
existing = existing_cultivars.get(ckey)
|
||
|
||
if existing:
|
||
cultivar_id = str(existing["id"])
|
||
|
||
# ── Enrich existing cultivar with missing data ──
|
||
updates = {}
|
||
|
||
# Growing data from page
|
||
gd = product.growing_data
|
||
if gd.get("planting_depth_cm") and not existing.get("planting_depth_cm"):
|
||
updates["planting_depth_cm"] = gd["planting_depth_cm"]
|
||
if gd.get("row_spacing_cm") and not existing.get("row_spacing_cm"):
|
||
updates["row_spacing_cm"] = gd["row_spacing_cm"]
|
||
if gd.get("plant_spacing_cm") and not existing.get("plant_spacing_cm"):
|
||
updates["plant_spacing_cm"] = gd["plant_spacing_cm"]
|
||
if gd.get("germination_temp_c") and not existing.get("germination_temp_c"):
|
||
updates["germination_temp_c"] = gd["germination_temp_c"]
|
||
if gd.get("perennial") and not existing.get("perennial"):
|
||
updates["perennial"] = True
|
||
|
||
# Calendar data
|
||
cal = product.calendar
|
||
if cal.get("indoor_sowing_months") and not existing.get("indoor_sowing_months"):
|
||
updates["indoor_sowing_months"] = cal["indoor_sowing_months"]
|
||
if cal.get("direct_sowing_months") and not existing.get("direct_sowing_months"):
|
||
updates["direct_sowing_months"] = cal["direct_sowing_months"]
|
||
if cal.get("transplanting_months") and not existing.get("transplanting_months"):
|
||
updates["transplanting_months"] = cal["transplanting_months"]
|
||
if cal.get("glasshouse_months") and not existing.get("glasshouse_months"):
|
||
updates["glasshouse_months"] = cal["glasshouse_months"]
|
||
if cal.get("harvesting_months") and not existing.get("harvesting_months"):
|
||
updates["harvesting_months"] = cal["harvesting_months"]
|
||
|
||
# Description
|
||
if product.description and not existing.get("description"):
|
||
updates["description"] = product.description
|
||
|
||
if updates:
|
||
set_clauses = []
|
||
values = []
|
||
for col, val in updates.items():
|
||
set_clauses.append(f"{col} = %s")
|
||
values.append(val)
|
||
set_clauses.append("updated_at = NOW()")
|
||
values.append(cultivar_id)
|
||
cur.execute(
|
||
f"UPDATE cultivars SET {', '.join(set_clauses)} WHERE id = %s::uuid",
|
||
values
|
||
)
|
||
stats["enriched"] += 1
|
||
print(f"{prefix} {product.name} -> ENRICHED ({', '.join(updates.keys())})")
|
||
|
||
# ── Add supplier link if missing ──
|
||
link_exists = False
|
||
if cultivar_id in existing_links:
|
||
for lurl, lsku in existing_links[cultivar_id]:
|
||
if lurl == product.url or (lsku and lsku == product.sku):
|
||
link_exists = True
|
||
break
|
||
|
||
if link_exists:
|
||
stats["link_exists"] += 1
|
||
else:
|
||
try:
|
||
cur.execute("SAVEPOINT link_sp")
|
||
cur.execute("""
|
||
INSERT INTO cultivar_suppliers (cultivar_id, supplier_id, product_url, article_number, last_checked_at)
|
||
VALUES (%s::uuid, %s::uuid, %s, %s, NOW())
|
||
ON CONFLICT (cultivar_id, supplier_id, article_number) DO UPDATE
|
||
SET product_url = EXCLUDED.product_url, last_checked_at = NOW()
|
||
""", (cultivar_id, REINSAAT_SUPPLIER_ID, product.url, product.sku))
|
||
cur.execute("RELEASE SAVEPOINT link_sp")
|
||
stats["linked"] += 1
|
||
existing_links.setdefault(cultivar_id, []).append((product.url, product.sku))
|
||
print(f"{prefix} {product.name} -> LINKED ({product.sku})")
|
||
except Exception as e:
|
||
print(f"{prefix} {product.name} -> LINK ERROR: {e}")
|
||
cur.execute("ROLLBACK TO SAVEPOINT link_sp")
|
||
stats["errors"] += 1
|
||
else:
|
||
# ── Create new cultivar ──
|
||
slug = make_slug(species_name, product.name)
|
||
# Ensure unique slug
|
||
base_slug = slug
|
||
counter = 2
|
||
while slug in existing_slugs:
|
||
slug = f"{base_slug}-{counter}"
|
||
counter += 1
|
||
|
||
gd = product.growing_data
|
||
cal = product.calendar
|
||
|
||
try:
|
||
cur.execute("SAVEPOINT create_sp")
|
||
cur.execute("""
|
||
INSERT INTO cultivars (
|
||
species_id, name, name_de, slug, description,
|
||
is_organic, perennial,
|
||
planting_depth_cm, row_spacing_cm, plant_spacing_cm,
|
||
germination_temp_c,
|
||
indoor_sowing_months, direct_sowing_months,
|
||
transplanting_months, glasshouse_months, harvesting_months
|
||
) VALUES (
|
||
%s::uuid, %s, %s, %s, %s,
|
||
%s, %s,
|
||
%s, %s, %s,
|
||
%s,
|
||
%s, %s,
|
||
%s, %s, %s
|
||
)
|
||
RETURNING id
|
||
""", (
|
||
species_id,
|
||
product.name,
|
||
product.name,
|
||
slug,
|
||
product.description,
|
||
product.is_organic,
|
||
gd.get("perennial", False),
|
||
gd.get("planting_depth_cm"),
|
||
gd.get("row_spacing_cm"),
|
||
gd.get("plant_spacing_cm"),
|
||
gd.get("germination_temp_c"),
|
||
cal.get("indoor_sowing_months"),
|
||
cal.get("direct_sowing_months"),
|
||
cal.get("transplanting_months"),
|
||
cal.get("glasshouse_months"),
|
||
cal.get("harvesting_months"),
|
||
))
|
||
new_id = str(cur.fetchone()["id"])
|
||
existing_slugs.add(slug)
|
||
existing_cultivars[ckey] = {"id": new_id}
|
||
stats["created"] += 1
|
||
|
||
# Link to supplier
|
||
cur.execute("""
|
||
INSERT INTO cultivar_suppliers (cultivar_id, supplier_id, product_url, article_number, last_checked_at)
|
||
VALUES (%s::uuid, %s::uuid, %s, %s, NOW())
|
||
""", (new_id, REINSAAT_SUPPLIER_ID, product.url, product.sku))
|
||
stats["linked"] += 1
|
||
existing_links.setdefault(new_id, []).append((product.url, product.sku))
|
||
|
||
print(f"{prefix} {product.name} -> CREATED ({species_name}, {slug})")
|
||
cur.execute("RELEASE SAVEPOINT create_sp")
|
||
except Exception as e:
|
||
print(f"{prefix} {product.name} -> CREATE ERROR: {e}")
|
||
cur.execute("ROLLBACK TO SAVEPOINT create_sp")
|
||
stats["errors"] += 1
|
||
|
||
# ── Commit ──
|
||
conn.commit()
|
||
|
||
# ── Summary ──
|
||
print("\n" + "=" * 70)
|
||
print("SUMMARY")
|
||
print("=" * 70)
|
||
print(f" Total products discovered: {len(all_products)}")
|
||
print(f" New cultivars created: {stats['created']}")
|
||
print(f" New supplier links added: {stats['linked']}")
|
||
print(f" Cultivars enriched: {stats['enriched']}")
|
||
print(f" Links already existed: {stats['link_exists']}")
|
||
print(f" Skipped (no species): {stats['skipped_no_species']}")
|
||
print(f" Skipped (no name): {stats['skipped_no_name']}")
|
||
print(f" Errors: {stats['errors']}")
|
||
print("=" * 70)
|
||
|
||
if unmatched:
|
||
print(f"\n UNMATCHED PRODUCTS ({len(unmatched)}):")
|
||
for name, raw_latin, normalized, url in sorted(unmatched, key=lambda x: x[2]):
|
||
print(f" {normalized:30s} (raw: {raw_latin:40s}) {name:30s} {url}")
|
||
|
||
cur.close()
|
||
conn.close()
|
||
|
||
|
||
if __name__ == "__main__":
|
||
main()
|