#!/usr/bin/env python3
"""Reinsaat v3 scraper - uses the HerbAPI REST API with robust botanical name matching."""
|
||
|
||
import json
import os
import re
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from html import unescape
|
||
|
||
# --- Config ---
# Base URL that every HerbAPI endpoint path is appended to.
API_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
# Bearer token sent with every HerbAPI request. It can be supplied through the
# HERBAPI_TOKEN environment variable; the literal fallback keeps existing
# invocations working unchanged.
# NOTE(review): a credential committed to source control should be rotated and
# the fallback removed once the environment variable is deployed.
API_TOKEN = os.environ.get(
    "HERBAPI_TOKEN", "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
)
# Root of the Reinsaat web shop; scraped paths are appended to it.
REINSAAT_BASE = "https://www.reinsaat.at"
# Seconds to sleep around page fetches so the shop is not hammered.
DELAY = 0.3
|
||
|
||
# Shop category slugs to crawl, in crawl order. Only seed products are listed;
# books, bulbs, peonies, potatoes, gift items and seed tapes are left out.
CATEGORIES = [
    "beans",
    "peas",
    "florence_fennel",
    "cucumbers",
    "brassica",
    "garden_cress",
    "pumpkins_squash",
    "corn",
    "swiss_chard",
    "aubergine_eggplants",
    "melons",
    "carrots",
    "sweet_pepper",
    "chilli_peppers_chill",
    "parsnips",
    "parsley",
    "parsley_root",
    "leeks",
    "radish",
    "beetroot",
    "lettuce",
    "black_salsify",
    "celery",
    "spinach",
    "tomatoes",
    "zucchini_courgette",
    "onion_garlic",
    "culinary_and_aromatic_herbs",
    "conservation_varieties",
    "flowers_and_herbs",
    "wild_flowers_seeds",
    "green_manure",
]

# Lower-cased tokens that follow genus + species in a botanical name:
# author citations and infraspecific rank markers.
# NOTE(review): no function visible in this file reads this set - confirm
# whether it is still needed.
STRIP_SUFFIXES = {
    # infraspecific ranks (with and without the trailing period)
    "convar.", "convar",
    "var.", "var",
    "subsp.", "subsp",
    "ssp.", "ssp",
    "f.",
    "sat.", "sat",
    "axillare",
    # author / authority abbreviations
    "l.", "l",
    "mill.",
    "dc.",
    "em.",
    "auct.",
    "hort.",
    "medik.", "medikus",
    "moench",
    "pers.",
    "salisb.",
    "thunb.",
    "crantz",
    "gaertn.",
    "lam.",
    "link",
    "siebold",
    "zucc.",
}
|
||
|
||
|
||
def api_get(path, params=None, timeout=30):
    """GET a HerbAPI endpoint and return the decoded JSON body.

    Args:
        path: Endpoint path appended to ``API_BASE`` (e.g. ``"/species"``).
        params: Optional mapping of query parameters; URL-encoded and
            appended when truthy.
        timeout: Socket timeout in seconds handed to ``urlopen`` so a stalled
            API call cannot block the scraper forever.

    Returns:
        The parsed JSON response.

    Raises:
        urllib.error.URLError: (including ``HTTPError``) propagated from
            ``urlopen`` on connection or HTTP failure.
    """
    url = f"{API_BASE}{path}"
    if params:
        url += "?" + urllib.parse.urlencode(params)
    req = urllib.request.Request(url)
    req.add_header("Authorization", f"Bearer {API_TOKEN}")
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())
|
||
|
||
|
||
def api_post(path, data, timeout=30):
    """POST a JSON payload to a HerbAPI endpoint and return the decoded reply.

    Args:
        path: Endpoint path appended to ``API_BASE``.
        data: JSON-serialisable object sent as the request body.
        timeout: Socket timeout in seconds handed to ``urlopen`` so a stalled
            API call cannot block the scraper forever.

    Returns:
        The parsed JSON response.

    Raises:
        urllib.error.URLError: (including ``HTTPError``) propagated from
            ``urlopen`` on connection or HTTP failure.
    """
    url = f"{API_BASE}{path}"
    body = json.dumps(data).encode()
    req = urllib.request.Request(url, data=body, method="POST")
    req.add_header("Authorization", f"Bearer {API_TOKEN}")
    req.add_header("Content-Type", "application/json")
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())
|
||
|
||
|
||
def fetch_page(url):
    """Download *url* and return its body decoded as UTF-8 text.

    The request carries a browser-like User-Agent and uses a 15 second
    timeout. Bytes that are not valid UTF-8 are replaced rather than raising.
    """
    request = urllib.request.Request(
        url,
        headers={"User-Agent": "Mozilla/5.0 (HerbAPI Scraper)"},
    )
    with urllib.request.urlopen(request, timeout=15) as response:
        payload = response.read()
    return payload.decode("utf-8", errors="replace")
|
||
|
||
|
||
# Known misspellings on the shop pages, keyed either by the lower-cased genus
# token alone or by the lower-cased "genus species" pair.
BOTANICAL_TYPOS = {
    "capscicum": "capsicum",
    "capsicum frutenscens": "capsicum frutescens",
    "tropaelum": "tropaeolum",
    "lact.": "lactuca",
}

# Abbreviated names mapped to the full "genus species" they stand for.
# Matched as a lower-case prefix of the cleaned input.
ABBREVIATED_NAMES = {
    "origanum vulg.": "origanum vulgare",
    "helichrysum bract.": "helichrysum bracteatum",
    "campanula lat.": "campanula latifolia",
    "cosmos bip.": "cosmos bipinnatus",
    "papaver somnif.": "papaver somniferum",
}


def normalise_botanical(raw):
    """Strip a botanical name down to lower-case ``"genus species"``.

    'Pisum sativum L. convar. sat.' -> 'pisum sativum'
    'Solanum lycopersicum L.' -> 'solanum lycopersicum'
    'Beta vulgaris L. ssp. vulgaris' -> 'beta vulgaris'

    Args:
        raw: Botanical name as scraped or as stored in the API; may contain
            HTML entities, non-breaking spaces, parenthesised remarks, author
            citations and infraspecific ranks.

    Returns:
        The normalised ``"genus species"`` string, or ``None`` when *raw* is
        empty, has fewer than two tokens, or its second token does not look
        like a species epithet.
    """
    if not raw:
        return None

    # Decode HTML entities, turn non-breaking spaces into plain ones and drop
    # parenthesised remarks.
    text = unescape(raw).replace("\xa0", " ")
    text = re.sub(r"\([^)]*\)", "", text).strip()

    # Resolve abbreviations BEFORE trimming trailing punctuation: every key
    # ends in a period, so trimming first would make a bare "Origanum vulg."
    # unmatchable.
    lowered = text.lower()
    for abbrev, full in ABBREVIATED_NAMES.items():
        if lowered.startswith(abbrev):
            return full

    parts = text.rstrip(",. ").split()
    if len(parts) < 2:
        return None

    genus = parts[0].lower().rstrip(",")
    species = parts[1].lower().rstrip(",")
    # A token consisting only of commas leaves an empty string; reject it
    # here instead of failing on the [0] index below.
    if not genus or not species:
        return None

    # Fix known typos: first on the genus alone, then on the full pair.
    genus = BOTANICAL_TYPOS.get(genus, genus)
    full_name = f"{genus} {species}"
    if full_name in BOTANICAL_TYPOS:
        genus, species = BOTANICAL_TYPOS[full_name].split()

    # Both tokens must start with a letter.
    if not genus[0].isalpha() or not species[0].isalpha():
        return None
    # A capitalised second token in the original is an author citation
    # (e.g. "L."), not a species epithet.
    if parts[1][0].isupper():
        return None
    return f"{genus} {species}"
|
||
|
||
|
||
def extract_product_data(html, url):
    """Extract product info from a Reinsaat product page.

    Args:
        html: Full HTML source of the product page.
        url: URL the page was fetched from; stored unchanged under ``"url"``.

    Returns:
        A dict that always contains ``"url"`` and, when the corresponding
        markup is found, any of: ``name``, ``botanical_raw``,
        ``botanical_norm``, ``article_number``, ``price_eur``, ``port_price``,
        ``pack_sizes``, ``planting_depth_cm``, ``row_spacing_cm``,
        ``plant_spacing_cm``, ``germination_temp_c``, ``pack_size`` and
        ``pack_unit``.
    """
    result = {}

    # H1 = variety name
    m = re.search(r'<h1[^>]*>([^<]+)</h1>', html)
    if m:
        name = unescape(m.group(1)).strip()
        # Clean up names like "RS-To-01.26 (Alda)" -> "Alda"
        paren = re.search(r"\(([^)]+)\)", name)
        if paren and re.match(r"RS-", name):
            name = paren.group(1).strip()
        result["name"] = name

    # Botanical name from fce_shop_kurztext
    m = re.search(
        r'fce_shop_kurztext[^>]*>\s*(?:<em[^>]*>)?\s*([^<]+?)\s*(?:</em>)?\s*</div>',
        html,
    )
    if m:
        result["botanical_raw"] = unescape(m.group(1)).replace("\xa0", " ").strip()
        result["botanical_norm"] = normalise_botanical(result["botanical_raw"])

    # Article number and cheapest offer from the first JSON-LD Product block
    for jm in re.finditer(
        r'<script type="application/ld\+json">(.*?)</script>', html, re.S
    ):
        try:
            jd = json.loads(jm.group(1))
        except json.JSONDecodeError:
            continue
        # JSON-LD may legally be a list or scalar; only an object can be the
        # Product record, and calling .get() on anything else would raise.
        if not isinstance(jd, dict) or jd.get("@type") != "Product":
            continue
        if "model" in jd:
            result["article_number"] = str(jd["model"])
        # Get smallest pack price (usually the Portion)
        offers = jd.get("offers", {})
        if isinstance(offers, dict):
            offer_list = offers.get("offers", [])
        elif isinstance(offers, list):
            offer_list = offers
        else:
            offer_list = []
        if offer_list:
            prices = [
                o["price"]
                for o in offer_list
                # skip malformed entries instead of crashing on them
                if isinstance(o, dict)
                and isinstance(o.get("price"), (int, float))
                and o["price"] > 0
            ]
            if prices:
                result["price_eur"] = min(prices)
        break

    # Price table - get pack sizes. The first row holds the sizes, the second
    # the prices; only the first table containing a euro sign is considered.
    tables = re.findall(r"<table[^>]*>(.*?)</table>", html, re.S)
    for tbl in tables:
        if "€" not in tbl:
            continue
        rows = re.findall(r"<tr[^>]*>(.*?)</tr>", tbl, re.S)
        if len(rows) >= 2:
            size_cells = re.findall(r"<td[^>]*>(.*?)</td>", rows[0], re.S)
            size_texts = [re.sub(r"<[^>]+>", "", c).strip() for c in size_cells]
            price_cells = re.findall(r"<td[^>]*>(.*?)</td>", rows[1], re.S)
            price_texts = [re.sub(r"<[^>]+>", "", c).strip() for c in price_cells]
            # Find the "Port." entry
            for i, st in enumerate(size_texts):
                if "Port" in st:
                    if i < len(price_texts):
                        # Require at least one digit so stray punctuation
                        # (e.g. the period in "ca. 2,90") is never handed to
                        # float().
                        pm = re.search(
                            r"\d+(?:\.\d+)?|\.\d+",
                            price_texts[i].replace(",", "."),
                        )
                        if pm:
                            result["port_price"] = float(pm.group())
                    break
            # Get portion content info
            result["pack_sizes"] = size_texts
            break

    # Sowing depth: single value or range, stored as the midpoint
    m = re.search(
        r"(?:sowing|seed)\s*depth[:\s]*(?:approx\.?\s*)?(\d+[\.,]?\d*)\s*(?:-\s*(\d+[\.,]?\d*)\s*)?cm",
        html,
        re.I,
    )
    if m:
        d1 = float(m.group(1).replace(",", "."))
        d2 = float(m.group(2).replace(",", ".")) if m.group(2) else d1
        result["planting_depth_cm"] = round((d1 + d2) / 2, 2)

    # Spacing: "row spacing NNxNN cm" or "NN x NN cm"
    # Try outdoor spacing first, then explicit row spacing, then any NNxNN cm
    m = re.search(r"(?:outdoors?|field)[^.]*?(\d+)\s*(?:x|×)\s*(\d+)\s*cm", html, re.I)
    if not m:
        m = re.search(r"row\s*spacing\s*(\d+)\s*(?:x|×)\s*(\d+)\s*cm", html, re.I)
    if not m:
        m = re.search(r"(\d+)\s*(?:x|×)\s*(\d+)\s*cm", html, re.I)
    if m:
        result["row_spacing_cm"] = float(m.group(1))
        result["plant_spacing_cm"] = float(m.group(2))

    # Row spacing without plant spacing (e.g. "row spacing 30-45 cm")
    if "row_spacing_cm" not in result:
        m = re.search(r"row\s*spacing\s*(\d+)(?:\s*-\s*(\d+))?\s*cm", html, re.I)
        if m:
            r1 = int(m.group(1))
            r2 = int(m.group(2)) if m.group(2) else r1
            # midpoint of the range, rounded down to a whole centimetre
            result["row_spacing_cm"] = float((r1 + r2) // 2)

    # Germination temperature
    m = re.search(r"germination\s*temp[^:]*:\s*(\d+)\s*(?:-\s*(\d+))?\s*°?\s*C", html, re.I)
    if m:
        t1 = int(m.group(1))
        t2 = int(m.group(2)) if m.group(2) else t1
        # midpoint of the range, rounded down to a whole degree
        result["germination_temp_c"] = float((t1 + t2) // 2)

    # Pack unit from portion info - "20 seeds" or "25 g" etc
    portion_m = re.search(r"[Pp]ortion\s*(?:contents?)?[:\s]*(\d+[\.,]?\d*)\s*(seeds?|Korn|g|kg)", html)
    if not portion_m:
        # Try "Port. (20 seeds)" format
        portion_m = re.search(r"Port[.\w]*\s*\(?\s*(\d+[\.,]?\d*)\s*(seeds?|Korn|g|kg)", html)
    if portion_m:
        result["pack_size"] = float(portion_m.group(1).replace(",", "."))
        unit = portion_m.group(2).lower()
        # Seed counts are stored under the unit name "Korn"; weights keep
        # their own unit.
        if unit in ("seed", "seeds", "korn"):
            result["pack_unit"] = "Korn"
        else:
            result["pack_unit"] = unit

    result["url"] = url
    return result
|
||
|
||
|
||
def get_all_species():
    """Page through the API's species list and index it by normalised name.

    Returns a dict mapping the normalised ``"genus species"`` string to a
    small record with the species ``id``, ``slug`` and original scientific
    ``name``. Species whose name cannot be normalised are left out.
    """
    lookup = {}
    page_no = 0
    last_count = 100
    # A page shorter than the requested 100 rows marks the end of the list.
    while last_count >= 100:
        page_no += 1
        payload = api_get("/species", {"per_page": 100, "page": page_no})
        rows = payload.get("data", [])
        for row in rows:
            key = normalise_botanical(row["name_scientific"])
            if not key:
                continue
            lookup[key] = {
                "id": row["id"],
                "slug": row["slug"],
                "name": row["name_scientific"],
            }
        print(f" page {page_no}: {len(rows)} species (total so far: {len(lookup)})")
        last_count = len(rows)
    return lookup
|
||
|
||
|
||
def get_all_cultivars():
    """Page through the API's cultivar list and index it for fast lookup.

    Returns a dict keyed by ``(species_id, lower-cased stripped name)`` whose
    values are the cultivar records as returned by the API.
    """
    index = {}
    page_no = 0
    last_count = 100
    # A page shorter than the requested 100 rows marks the end of the list.
    while last_count >= 100:
        page_no += 1
        payload = api_get("/cultivars", {"per_page": 100, "page": page_no})
        rows = payload.get("data", [])
        index.update(
            ((row["species_id"], row["name"].lower().strip()), row)
            for row in rows
        )
        print(f" page {page_no}: {len(rows)} cultivars (total so far: {len(index)})")
        last_count = len(rows)
    return index
|
||
|
||
|
||
def get_reinsaat_supplier():
    """Return the supplier record whose slug is ``"reinsaat"``.

    Raises:
        RuntimeError: if the API's supplier list has no such entry.
    """
    missing = object()
    record = next(
        (entry for entry in api_get("/suppliers") if entry["slug"] == "reinsaat"),
        missing,
    )
    if record is missing:
        raise RuntimeError("Reinsaat supplier not found in API")
    return record
|
||
|
||
|
||
def get_cultivar_suppliers(cultivar_id):
    """Fetch the supplier links currently stored for the given cultivar."""
    endpoint = f"/cultivars/{cultivar_id}/suppliers"
    links = api_get(endpoint)
    return links
|
||
|
||
|
||
def get_product_urls_from_category(cat_slug):
    """Collect the absolute URLs linked beneath a shop category page.

    Every ``href`` on the category page that points below
    ``/shop/EN/<cat_slug>/`` is returned, de-duplicated and sorted. The
    result can contain subcategory pages as well as product pages; this
    function does not tell them apart, so the caller has to inspect each
    fetched page.

    Args:
        cat_slug: Category slug as used in the shop URL.

    Returns:
        A list of absolute URLs; empty when the category page cannot be
        fetched (a warning is printed in that case).
    """
    cat_url = f"{REINSAAT_BASE}/shop/EN/{cat_slug}/"
    try:
        html = fetch_page(cat_url)
    except Exception as e:
        # Best effort: one unreachable category must not abort the whole run.
        print(f" WARN: Failed to fetch category {cat_slug}: {e}")
        return []

    time.sleep(DELAY)

    # The pattern requires at least one path segment after the category, so
    # the category page itself is never returned.
    link_pattern = rf'href="(/shop/EN/{re.escape(cat_slug)}/[^"]+/)"'
    unique_links = sorted(set(re.findall(link_pattern, html)))
    return [REINSAAT_BASE + link for link in unique_links]
|
||
|
||
|
||
def is_product_page(html):
    """Tell whether *html* is a product page.

    A page counts as a product page when it contains either the
    ``fce_shop_kurztext`` marker (the botanical-name block) or a JSON-LD
    object typed as ``Product``.
    """
    markers = (
        r'fce_shop_kurztext',
        r'"@type":\s*"Product"',
    )
    return any(re.search(marker, html) is not None for marker in markers)
|
||
|
||
|
||
def main():
    """Run the scraper end to end and print a summary report.

    Steps: load all species and cultivars from the API, look up the Reinsaat
    supplier record, then crawl every category in ``CATEGORIES``. Each fetched
    page that is a product page is handed to ``process_product``. A page that
    is not a product page is treated as a possible subcategory: links on it
    that lie deeper below the same category are fetched and processed, and the
    page itself is skipped.
    """
    print("=" * 60)
    print("Reinsaat v3 Scraper")
    print("=" * 60)

    # Step 1: Load all species
    print("\n[1/4] Loading species from API...")
    species_map = get_all_species()
    print(f" Loaded {len(species_map)} species")

    # Step 2: Load all cultivars
    print("\n[2/4] Loading cultivars from API...")
    cultivar_map = get_all_cultivars()
    print(f" Loaded {len(cultivar_map)} cultivars")

    # Step 3: Get Reinsaat supplier
    print("\n[3/4] Getting Reinsaat supplier...")
    supplier = get_reinsaat_supplier()
    supplier_id = supplier["id"]
    print(f" Reinsaat ID: {supplier_id}")

    # Step 4: Scrape categories
    print(f"\n[4/4] Scraping {len(CATEGORIES)} categories...")

    stats = {
        "products_found": 0,
        "botanical_extracted": 0,
        "species_matched": 0,
        "species_not_matched": 0,
        "cultivar_existed": 0,
        "cultivar_created": 0,
        "link_existed": 0,
        "link_created": 0,
        "errors": 0,
    }
    unmatched_species = {}  # botanical_norm -> count
    new_cultivars = []
    new_links = []

    for cat_i, cat in enumerate(CATEGORIES):
        print(f"\n--- [{cat_i+1}/{len(CATEGORIES)}] {cat} ---")
        urls = get_product_urls_from_category(cat)
        print(f" Found {len(urls)} URLs")

        cat_prefix = f"/shop/EN/{cat}/"
        # Every URL already queued or handled for this category. It gives
        # O(1) membership tests and stops a product that is linked from
        # several subcategory pages from being processed more than once.
        seen = set(urls)

        for url in urls:
            time.sleep(DELAY)
            try:
                html = fetch_page(url)
            except Exception as e:
                print(f" ERROR fetching {url}: {e}")
                stats["errors"] += 1
                continue

            if is_product_page(html):
                process_product(
                    html, url, species_map, cultivar_map,
                    supplier_id, stats, unmatched_species,
                    new_cultivars, new_links,
                )
                continue

            # Not a product page, so it might be a subcategory: follow the
            # links on it that sit deeper below this category. Depth is
            # compared path against path; counting slashes of the full URL
            # (scheme included) against a bare path would demand two extra
            # levels and miss the subcategory's direct children.
            page_depth = urllib.parse.urlsplit(url).path.rstrip("/").count("/")
            deeper_links = sorted({
                link
                for link in re.findall(r'href="(/shop/EN/[^"]+/)"', html)
                if link.startswith(cat_prefix)
                and link.rstrip("/").count("/") > page_depth
            })
            for link in deeper_links:
                sub_url = REINSAAT_BASE + link
                if sub_url in seen:
                    continue  # already queued or processed
                seen.add(sub_url)
                time.sleep(DELAY)
                try:
                    sub_html = fetch_page(sub_url)
                except Exception as e:
                    print(f" ERROR fetching {sub_url}: {e}")
                    stats["errors"] += 1
                    continue
                if not is_product_page(sub_html):
                    continue
                process_product(
                    sub_html, sub_url, species_map, cultivar_map,
                    supplier_id, stats, unmatched_species,
                    new_cultivars, new_links,
                )

    # Report
    print("\n" + "=" * 60)
    print("RESULTS")
    print("=" * 60)
    print(f"Products found: {stats['products_found']}")
    print(f"Botanical extracted: {stats['botanical_extracted']}")
    print(f"Species matched: {stats['species_matched']}")
    print(f"Species NOT matched: {stats['species_not_matched']}")
    print(f"Cultivars existed: {stats['cultivar_existed']}")
    print(f"Cultivars created: {stats['cultivar_created']}")
    print(f"Links existed: {stats['link_existed']}")
    print(f"Links created: {stats['link_created']}")
    print(f"Errors: {stats['errors']}")

    if new_cultivars:
        print(f"\n--- New cultivars ({len(new_cultivars)}) ---")
        for cv in new_cultivars:
            print(f" + {cv['name']} ({cv.get('species', '?')})")

    if new_links:
        print(f"\n--- New supplier links ({len(new_links)}) ---")
        for lk in new_links:
            print(f" + {lk['cultivar']} -> {lk.get('article', '?')}")

    if unmatched_species:
        print(f"\n--- Unmatched species ({len(unmatched_species)}) ---")
        # most frequent first
        for name, count in sorted(unmatched_species.items(), key=lambda x: -x[1]):
            print(f" ? {name} (x{count})")

    print("\nDone.")
|
||
|
||
|
||
def _find_colliding_cultivar(cultivar_name, species_id):
    """Search the API for an existing cultivar matching *cultivar_name*.

    Tries, in order: an exact case-insensitive name match among the search
    results (any species); a punctuation-insensitive equal-or-contained match
    restricted to *species_id*; and a punctuation-insensitive exact match
    among the results of a second search on the last word longer than two
    characters. Returns the first matching cultivar record, or ``None``.
    """
    cn_lower = cultivar_name.lower().strip()
    cn_clean = re.sub(r'[^\w\s]', '', cn_lower)

    # Strategy 1: search by full name
    search_data = api_get("/cultivars", {"search": cultivar_name, "per_page": 50})
    candidates = search_data.get("data", [])
    for cv in candidates:
        if cv["name"].lower().strip() == cn_lower:
            return cv

    # Strategy 2: match by species_id + partial name (ignoring punctuation)
    for cv in candidates:
        if cv["species_id"] != species_id:
            continue
        cv_clean = re.sub(r'[^\w\s]', '', cv["name"].lower())
        if cv_clean == cn_clean:
            return cv
        # An empty string is contained in every string, so containment is
        # only meaningful when both cleaned names are non-empty.
        if cv_clean and cn_clean and (cv_clean in cn_clean or cn_clean in cv_clean):
            return cv

    # Strategy 3: search by last significant word
    words = [w for w in cultivar_name.split() if len(w) > 2]
    if words:
        search2 = api_get("/cultivars", {"search": words[-1], "per_page": 50})
        for cv in search2.get("data", []):
            if cv["species_id"] != species_id:
                continue
            if re.sub(r'[^\w\s]', '', cv["name"].lower()) == cn_clean:
                return cv
    return None


def process_product(html, url, species_map, cultivar_map, supplier_id,
                    stats, unmatched_species, new_cultivars, new_links):
    """Process a single product page.

    Extracts the product data, matches its botanical name to a species,
    reuses or creates the cultivar, and creates the Reinsaat supplier link
    when the cultivar does not have one yet.

    Args:
        html: HTML source of the product page.
        url: URL of the product page.
        species_map: Normalised botanical name -> species record.
        cultivar_map: ``(species_id, lower-cased name)`` -> cultivar record;
            updated in place when a cultivar is created or found.
        supplier_id: ID of the Reinsaat supplier record.
        stats: Counter dict, updated in place.
        unmatched_species: Botanical name -> count of unmatched products,
            updated in place.
        new_cultivars: List receiving a record per created cultivar.
        new_links: List receiving a record per created supplier link.
    """
    stats["products_found"] += 1
    prod = extract_product_data(html, url)

    if not prod.get("name"):
        return

    bot_norm = prod.get("botanical_norm")
    if not bot_norm:
        # No botanical name found on page
        stats["species_not_matched"] += 1
        unmatched_species["(no botanical name)"] = unmatched_species.get("(no botanical name)", 0) + 1
        return

    stats["botanical_extracted"] += 1

    # Match species
    species = species_map.get(bot_norm)
    if not species:
        stats["species_not_matched"] += 1
        unmatched_species[bot_norm] = unmatched_species.get(bot_norm, 0) + 1
        return

    stats["species_matched"] += 1
    species_id = species["id"]
    cultivar_name = prod["name"]

    # Check if cultivar exists
    cv_key = (species_id, cultivar_name.lower().strip())
    existing_cv = cultivar_map.get(cv_key)

    if existing_cv:
        stats["cultivar_existed"] += 1
        cultivar_id = existing_cv["id"]
    else:
        # Create cultivar
        create_data = {
            "species_id": species_id,
            "name": cultivar_name,
            "is_organic": True,
            "source_urls": [url],
        }
        # Add growing data if we extracted any
        for field in ("planting_depth_cm", "row_spacing_cm",
                      "plant_spacing_cm", "germination_temp_c"):
            if field in prod:
                create_data[field] = prod[field]

        try:
            new_cv = api_post("/cultivars", create_data)
        except urllib.error.HTTPError as e:
            body = e.read().decode(errors="replace") if hasattr(e, 'read') else str(e)
            if not (e.code == 500 and "Database error" in body):
                print(f" ERROR creating cultivar '{cultivar_name}': {e.code} {body}")
                stats["errors"] += 1
                return
            # Likely slug collision - search for existing cultivar
            try:
                found = _find_colliding_cultivar(cultivar_name, species_id)
            except Exception as e2:
                print(f" ERROR searching for '{cultivar_name}' after collision: {e2}")
                stats["errors"] += 1
                return
            if not found:
                print(f" WARN: could not create or find cultivar '{cultivar_name}' (DB error + no search match)")
                stats["errors"] += 1
                return
            cultivar_id = found["id"]
            cultivar_map[cv_key] = found
            stats["cultivar_existed"] += 1
        else:
            cultivar_id = new_cv["id"]
            stats["cultivar_created"] += 1
            new_cultivars.append({
                "name": cultivar_name,
                "species": species["name"],
                "id": cultivar_id,
            })
            # Add to local cache
            cultivar_map[cv_key] = new_cv
            print(f" + Created cultivar: {cultivar_name} ({species['name']})")

    # Check if Reinsaat supplier link exists
    try:
        existing_links = get_cultivar_suppliers(cultivar_id)
    except Exception as e:
        # Best effort: fall through and attempt to create the link, but do
        # not hide that the lookup failed.
        print(f" WARN: could not load supplier links for '{cultivar_name}': {e}")
        existing_links = []

    has_reinsaat = any(link["supplier_id"] == supplier_id for link in existing_links)
    if has_reinsaat:
        stats["link_existed"] += 1
        return

    # Create supplier link
    link_data = {
        "supplier_id": supplier_id,
        "product_url": url,
    }
    if "article_number" in prod:
        link_data["article_number"] = prod["article_number"]
    # Prefer the portion price from the price table; fall back to the
    # cheapest JSON-LD offer.
    if "port_price" in prod:
        link_data["price_eur"] = prod["port_price"]
    elif "price_eur" in prod:
        link_data["price_eur"] = prod["price_eur"]
    if "pack_size" in prod:
        link_data["pack_size"] = prod["pack_size"]
    if "pack_unit" in prod:
        link_data["pack_unit"] = prod["pack_unit"]

    try:
        api_post(f"/cultivars/{cultivar_id}/suppliers", link_data)
    except urllib.error.HTTPError as e:
        body = e.read().decode(errors="replace") if hasattr(e, 'read') else str(e)
        print(f" ERROR linking '{cultivar_name}': {e.code} {body}")
        stats["errors"] += 1
    else:
        stats["link_created"] += 1
        new_links.append({
            "cultivar": cultivar_name,
            "article": prod.get("article_number", "?"),
            "url": url,
        })
||
|
||
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|