#!/usr/bin/env python3
"""Reinsaat v3 scraper - uses HerbAPI REST API, robust botanical name matching."""
import json
import os
import re
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from html import unescape
# --- Config ---
# HerbAPI endpoint and bearer token. Either can be overridden through the
# environment (HERBAPI_BASE / HERBAPI_TOKEN); an unset or empty variable falls
# back to the literal below, so existing deployments behave as before.
# NOTE(review): the fallback token is a credential committed to source and the
# endpoint is plain HTTP - rotate the token and drop the literal once
# HERBAPI_TOKEN is provisioned everywhere.
API_BASE = os.environ.get("HERBAPI_BASE") or "http://herbapi01.corp.sub-net.at:8080/api/v1"
API_TOKEN = os.environ.get("HERBAPI_TOKEN") or "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
# Root of the Reinsaat web shop; shop paths are appended to it.
REINSAAT_BASE = "https://www.reinsaat.at"
# Seconds to sleep between consecutive page fetches.
DELAY = 0.3
# Shop category slugs that are scraped, in visiting order. Only seed products
# are listed; books, bulbs, peonies, potatoes, gift items and seed tapes are
# deliberately left out.
CATEGORIES = [
    # vegetables
    "beans",
    "peas",
    "florence_fennel",
    "cucumbers",
    "brassica",
    "garden_cress",
    "pumpkins_squash",
    "corn",
    "swiss_chard",
    "aubergine_eggplants",
    "melons",
    "carrots",
    "sweet_pepper",
    "chilli_peppers_chill",
    "parsnips",
    "parsley",
    "parsley_root",
    "leeks",
    "radish",
    "beetroot",
    "lettuce",
    "black_salsify",
    "celery",
    "spinach",
    "tomatoes",
    "zucchini_courgette",
    "onion_garlic",
    # herbs, flowers and everything else
    "culinary_and_aromatic_herbs",
    "conservation_varieties",
    "flowers_and_herbs",
    "wild_flowers_seeds",
    "green_manure",
]
# Lower-case tokens that may follow "Genus species" in a botanical name and
# carry no species information (author citations, infraspecific rank markers).
# NOTE(review): normalise_botanical() keeps only the first two tokens and does
# not consult this set - confirm whether it is still needed.
STRIP_SUFFIXES = {
    # infraspecific rank markers
    "convar.", "convar",
    "var.", "var",
    "subsp.", "subsp",
    "ssp.", "ssp",
    "f.",
    # author citations and citation qualifiers
    "l.", "l", "mill.", "dc.", "em.", "auct.", "hort.",
    "medik.", "medikus", "moench", "pers.", "salisb.", "thunb.",
    "crantz", "gaertn.", "lam.", "link", "siebold", "zucc.",
    # presumably trailing epithet fragments seen in shop data
    "sat.", "sat", "axillare",
}
def api_get(path, params=None, timeout=30):
    """GET a HerbAPI resource and return the decoded JSON body.

    Args:
        path: Resource path appended to ``API_BASE`` (e.g. ``"/species"``).
        params: Optional mapping of query parameters; URL-encoded and
            appended when non-empty.
        timeout: Socket timeout in seconds, so a stalled API cannot hang the
            whole run.

    Returns:
        The parsed JSON document.

    Raises:
        urllib.error.HTTPError: The API answered with an error status.
        urllib.error.URLError: The connection could not be made.
        json.JSONDecodeError: The response body is not valid JSON.
    """
    url = f"{API_BASE}{path}"
    if params:
        url = f"{url}?{urllib.parse.urlencode(params)}"
    req = urllib.request.Request(
        url, headers={"Authorization": f"Bearer {API_TOKEN}"}
    )
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())
def api_post(path, data, timeout=30):
    """POST *data* as JSON to a HerbAPI resource and return the decoded reply.

    Args:
        path: Resource path appended to ``API_BASE`` (e.g. ``"/cultivars"``).
        data: JSON-serialisable payload sent as the request body.
        timeout: Socket timeout in seconds, so a stalled API cannot hang the
            whole run.

    Returns:
        The parsed JSON document from the response body.

    Raises:
        urllib.error.HTTPError: The API answered with an error status.
        urllib.error.URLError: The connection could not be made.
        json.JSONDecodeError: The response body is not valid JSON.
    """
    req = urllib.request.Request(
        f"{API_BASE}{path}",
        data=json.dumps(data).encode(),
        method="POST",
        headers={
            "Authorization": f"Bearer {API_TOKEN}",
            "Content-Type": "application/json",
        },
    )
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())
def fetch_page(url):
    """Download *url* and return the response body as text.

    The request carries a browser-like User-Agent and gives up after 15
    seconds. Bytes that are not valid UTF-8 are replaced rather than raising.
    """
    request = urllib.request.Request(
        url, headers={"User-Agent": "Mozilla/5.0 (HerbAPI Scraper)"}
    )
    with urllib.request.urlopen(request, timeout=15) as response:
        payload = response.read()
    return payload.decode("utf-8", errors="replace")
# Known misspellings on the shop pages. Keys are either a lower-case genus
# token or a full lower-case "genus species" pair; values are the corrections.
BOTANICAL_TYPOS = {
    "capscicum": "capsicum",
    "capsicum frutenscens": "capsicum frutescens",
    "tropaelum": "tropaeolum",
    "lact.": "lactuca",
}

# Abbreviated binomials (lower-case, ending in ".") and their expansions.
ABBREVIATED_NAMES = {
    "origanum vulg.": "origanum vulgare",
    "helichrysum bract.": "helichrysum bracteatum",
    "campanula lat.": "campanula latifolia",
    "cosmos bip.": "cosmos bipinnatus",
    "papaver somnif.": "papaver somniferum",
}


def normalise_botanical(raw):
    """Reduce a botanical name to lower-case ``"genus species"``.

    'Pisum sativum L. convar. sat.' -> 'pisum sativum'
    'Solanum lycopersicum L.' -> 'solanum lycopersicum'
    'Beta vulgaris L. ssp. vulgaris' -> 'beta vulgaris'
    'Origanum vulg.' -> 'origanum vulgare'

    Args:
        raw: Botanical name as scraped or as stored in the API; may contain
            HTML entities, non-breaking spaces and parenthesised remarks.

    Returns:
        The normalised two-word name, or ``None`` when *raw* is empty, has
        fewer than two tokens, or its second token looks like an author
        citation (starts with an upper-case letter) rather than a species.
    """
    if not raw:
        return None

    # Decode HTML entities and turn non-breaking spaces into plain ones.
    cleaned = unescape(raw).replace("\xa0", " ").strip()

    # Abbreviation keys end in ".", so they must be matched BEFORE trailing
    # punctuation is trimmed; otherwise a bare "Origanum vulg." never expands.
    probe = re.sub(r"\([^)]*\)", "", cleaned).lower().strip()
    for abbrev, full in ABBREVIATED_NAMES.items():
        if probe.startswith(abbrev):
            return full

    # Trim trailing commas/periods, then drop parenthesised remarks.
    cleaned = cleaned.rstrip(",. ")
    cleaned = re.sub(r"\([^)]*\)", "", cleaned)

    parts = cleaned.split()
    if len(parts) < 2:
        return None

    genus = parts[0].lower().rstrip(",")
    species = parts[1].lower().rstrip(",")
    # A token consisting only of commas leaves nothing to validate.
    if not genus or not species:
        return None

    # Fix known typos: first on the genus alone, then on the full binomial.
    genus = BOTANICAL_TYPOS.get(genus, genus)
    full_name = f"{genus} {species}"
    if full_name in BOTANICAL_TYPOS:
        full_name = BOTANICAL_TYPOS[full_name]
        genus, species = full_name.split()

    # Both words must start with a letter.
    if not genus[0].isalpha() or not species[0].isalpha():
        return None
    # An upper-case second token in the original is an author citation
    # ("Daucus L."), not a species epithet.
    if parts[1][0].isupper():
        return None
    return f"{genus} {species}"
def _to_number(text):
    """Parse a decimal that may use a comma separator; None if not numeric."""
    try:
        return float(str(text).strip().replace(",", "."))
    except ValueError:
        return None


def _jsonld_price(offer):
    """Return the positive price of a JSON-LD offer dict, else None."""
    if not isinstance(offer, dict):
        return None
    price = offer.get("price")
    if isinstance(price, str):
        # Prices may be serialised as strings ("2.90").
        price = _to_number(price)
    if isinstance(price, bool) or not isinstance(price, (int, float)):
        return None
    return price if price > 0 else None


def _cell_texts(row_html):
    """Return the tag-stripped text of every <td>/<th> cell in a table row."""
    cells = re.findall(r"<t[dh][^>]*>(.*?)</t[dh]>", row_html, re.S | re.I)
    return [re.sub(r"<[^>]+>", "", cell).strip() for cell in cells]


def extract_product_data(html, url):
    """Extract product info from a Reinsaat product page.

    NOTE(review): the tag-matching patterns (<h1>, fce_shop_kurztext,
    JSON-LD <script>, <table>/<tr>/<td>) are reconstructed - verify them
    against the live markup.

    Args:
        html: Page source as text.
        url: Page URL; stored unchanged under ``"url"``.

    Returns:
        A dict holding whichever of these keys could be extracted:
        ``name``, ``botanical_raw``, ``botanical_norm``, ``article_number``,
        ``price_eur``, ``port_price``, ``pack_sizes``, ``planting_depth_cm``,
        ``row_spacing_cm``, ``plant_spacing_cm``, ``germination_temp_c``,
        ``pack_size``, ``pack_unit``; ``url`` is always present.
    """
    result = {}

    # H1 = variety name
    m = re.search(r"<h1[^>]*>([^<]+)</h1>", html, re.I)
    if m:
        name = unescape(m.group(1)).strip()
        # Clean up names like "RS-To-01.26 (Alda)" -> "Alda"
        paren = re.search(r"\(([^)]+)\)", name)
        if paren and name.startswith("RS-"):
            name = paren.group(1).strip()
        result["name"] = name

    # Botanical name: first text inside the fce_shop_kurztext element,
    # skipping any opening tags that wrap it.
    m = re.search(
        r"fce_shop_kurztext[^>]*>\s*(?:<\w+[^>]*>\s*)*([^<]+?)\s*<",
        html,
    )
    if m:
        botanical_raw = unescape(m.group(1)).replace("\xa0", " ").strip()
        if botanical_raw:
            result["botanical_raw"] = botanical_raw
            result["botanical_norm"] = normalise_botanical(botanical_raw)

    # Article number and lowest price from the first JSON-LD Product node.
    for jm in re.finditer(
        r'<script[^>]*type=["\']application/ld\+json["\'][^>]*>(.*?)</script>',
        html,
        re.S | re.I,
    ):
        try:
            doc = json.loads(jm.group(1))
        except json.JSONDecodeError:
            continue
        # A JSON-LD block may hold one node or an array of nodes.
        nodes = doc if isinstance(doc, list) else [doc]
        product = next(
            (n for n in nodes if isinstance(n, dict) and n.get("@type") == "Product"),
            None,
        )
        if product is None:
            continue
        if "model" in product:
            result["article_number"] = str(product["model"])
        # Get smallest pack price (usually the Portion). "offers" may be an
        # aggregate with a nested "offers" list, a single offer, or a list.
        offers = product.get("offers", {})
        if isinstance(offers, dict):
            nested = offers.get("offers", offers)
            offer_list = nested if isinstance(nested, list) else [nested]
        elif isinstance(offers, list):
            offer_list = offers
        else:
            offer_list = []
        prices = [p for p in map(_jsonld_price, offer_list) if p is not None]
        if prices:
            result["price_eur"] = min(prices)
        break

    # Price table: first table mentioning euros with a size row and a price row.
    for tbl in re.findall(r"<table[^>]*>(.*?)</table>", html, re.S | re.I):
        if "€" not in tbl and "&euro;" not in tbl:
            continue
        rows = re.findall(r"<tr[^>]*>(.*?)</tr>", tbl, re.S | re.I)
        if len(rows) < 2:
            continue
        size_texts = _cell_texts(rows[0])
        price_texts = _cell_texts(rows[1])
        # Find the "Port." column and read the price below it.
        for i, label in enumerate(size_texts):
            if "Port" not in label:
                continue
            if i < len(price_texts):
                pm = re.search(r"\d+(?:\.\d+)?", price_texts[i].replace(",", "."))
                if pm:
                    result["port_price"] = float(pm.group())
            break
        result["pack_sizes"] = size_texts
        break

    # Sowing depth: single value or range, stored as the midpoint.
    m = re.search(r"(?:sowing|seed)\s*depth[:\s]*(?:approx\.?\s*)?(\d+[\.,]?\d*)\s*(?:-\s*(\d+[\.,]?\d*)\s*)?cm", html, re.I)
    if m:
        d1 = float(m.group(1).replace(",", "."))
        d2 = float(m.group(2).replace(",", ".")) if m.group(2) else d1
        result["planting_depth_cm"] = round((d1 + d2) / 2, 2)

    # Spacing "NN x NN cm": prefer an outdoor/field figure, then an explicit
    # "row spacing", then any "NN x NN cm" on the page.
    m = re.search(r"(?:outdoors?|field)[^.]*?(\d+)\s*(?:x|×)\s*(\d+)\s*cm", html, re.I)
    if not m:
        m = re.search(r"row\s*spacing\s*(\d+)\s*(?:x|×)\s*(\d+)\s*cm", html, re.I)
    if not m:
        m = re.search(r"(\d+)\s*(?:x|×)\s*(\d+)\s*cm", html, re.I)
    if m:
        result["row_spacing_cm"] = float(m.group(1))
        result["plant_spacing_cm"] = float(m.group(2))

    # Row spacing without plant spacing (e.g. "row spacing 30-45 cm");
    # a range is stored as its midpoint floored to whole centimetres.
    if "row_spacing_cm" not in result:
        m = re.search(r"row\s*spacing\s*(\d+)(?:\s*-\s*(\d+))?\s*cm", html, re.I)
        if m:
            r1 = int(m.group(1))
            r2 = int(m.group(2)) if m.group(2) else r1
            result["row_spacing_cm"] = float((r1 + r2) // 2)

    # Germination temperature; a range is stored as its floored midpoint.
    m = re.search(r"germination\s*temp[^:]*:\s*(\d+)\s*(?:-\s*(\d+))?\s*°?\s*C", html, re.I)
    if m:
        t1 = int(m.group(1))
        t2 = int(m.group(2)) if m.group(2) else t1
        result["germination_temp_c"] = float((t1 + t2) // 2)

    # Pack unit from portion info - "20 seeds" or "25 g" etc
    portion_m = re.search(r"[Pp]ortion\s*(?:contents?)?[:\s]*(\d+[\.,]?\d*)\s*(seeds?|Korn|g|kg)", html)
    if not portion_m:
        # Try "Port. (20 seeds)" format
        portion_m = re.search(r"Port[.\w]*\s*\(?\s*(\d+[\.,]?\d*)\s*(seeds?|Korn|g|kg)", html)
    if portion_m:
        result["pack_size"] = float(portion_m.group(1).replace(",", "."))
        unit = portion_m.group(2).lower()
        # Seed counts are stored under the unit name "Korn".
        result["pack_unit"] = "Korn" if unit in ("seed", "seeds", "korn") else unit

    result["url"] = url
    return result
def get_all_species():
    """Page through ``/species`` and index the records by normalised name.

    Returns:
        A dict mapping each normalised botanical name to a small record with
        the species ``id``, ``slug`` and original scientific ``name``. Records
        whose name cannot be normalised are left out; a later record with the
        same normalised name replaces an earlier one.
    """
    lookup = {}
    page_no = 1
    more_pages = True
    while more_pages:
        response = api_get("/species", {"per_page": 100, "page": page_no})
        rows = response.get("data", [])
        for row in rows:
            key = normalise_botanical(row["name_scientific"])
            if not key:
                continue
            lookup[key] = {
                "id": row["id"],
                "slug": row["slug"],
                "name": row["name_scientific"],
            }
        print(f" page {page_no}: {len(rows)} species (total so far: {len(lookup)})")
        # A short page means the listing is exhausted.
        more_pages = len(rows) >= 100
        page_no += 1
    return lookup
def get_all_cultivars():
    """Page through ``/cultivars`` and index the records for fast lookup.

    Returns:
        A dict keyed by ``(species_id, lower-cased stripped name)`` whose
        values are the cultivar records as returned by the API.
    """
    lookup = {}
    page_no = 1
    more_pages = True
    while more_pages:
        response = api_get("/cultivars", {"per_page": 100, "page": page_no})
        rows = response.get("data", [])
        for row in rows:
            lookup[(row["species_id"], row["name"].lower().strip())] = row
        print(f" page {page_no}: {len(rows)} cultivars (total so far: {len(lookup)})")
        # A short page means the listing is exhausted.
        more_pages = len(rows) >= 100
        page_no += 1
    return lookup
def get_reinsaat_supplier():
    """Return the supplier record whose slug is ``"reinsaat"``.

    Raises:
        RuntimeError: No such supplier is listed by ``/suppliers``.
    """
    match = next(
        (record for record in api_get("/suppliers") if record["slug"] == "reinsaat"),
        None,
    )
    if match is None:
        raise RuntimeError("Reinsaat supplier not found in API")
    return match
def get_cultivar_suppliers(cultivar_id):
    """Fetch the supplier links already recorded for one cultivar.

    Returns whatever ``/cultivars/<id>/suppliers`` answers with.
    """
    endpoint = f"/cultivars/{cultivar_id}/suppliers"
    return api_get(endpoint)
def get_product_urls_from_category(cat_slug):
    """Collect every shop link found below a category on its listing page.

    The category page is fetched once and all ``href`` targets of the form
    ``/shop/EN/<cat_slug>/.../`` are gathered. The result can contain both
    product pages and subcategory pages; telling them apart is left to the
    caller.

    Args:
        cat_slug: Category slug as used in the shop URL.

    Returns:
        Sorted, de-duplicated absolute URLs. An empty list when the category
        page cannot be fetched (a warning is printed in that case).
    """
    cat_url = f"{REINSAAT_BASE}/shop/EN/{cat_slug}/"
    try:
        html = fetch_page(cat_url)
    except Exception as e:
        # Best effort: one unreachable category must not stop the whole run.
        print(f" WARN: Failed to fetch category {cat_slug}: {e}")
        return []
    time.sleep(DELAY)

    links = re.findall(rf'href="(/shop/EN/{re.escape(cat_slug)}/[^"]+/)"', html)
    return [REINSAAT_BASE + link for link in sorted(set(links))]
def is_product_page(html):
    """Tell whether *html* looks like a product detail page.

    A page qualifies when it mentions the ``fce_shop_kurztext`` marker or
    embeds JSON-LD declaring ``"@type": "Product"``.
    """
    if "fce_shop_kurztext" in html:
        return True
    return re.search(r'"@type":\s*"Product"', html) is not None
def _deeper_category_links(html, cat, page_url):
    """Return absolute URLs linked from *html* that lie in *cat* below *page_url*."""
    # Compare path depths only: counting slashes of the full URL would also
    # count the scheme/host part and wrongly reject direct children.
    page_depth = urllib.parse.urlsplit(page_url).path.rstrip("/").count("/")
    prefix = f"/shop/EN/{cat}/"
    links = set(re.findall(r'href="(/shop/EN/[^"]+/)"', html))
    return [
        REINSAAT_BASE + link
        for link in sorted(links)
        if link.startswith(prefix) and link.rstrip("/").count("/") > page_depth
    ]


def _print_report(stats, new_cultivars, new_links, unmatched_species):
    """Print the end-of-run counters and the lists of created/unmatched items."""
    print("\n" + "=" * 60)
    print("RESULTS")
    print("=" * 60)
    for label, key in (
        ("Products found", "products_found"),
        ("Botanical extracted", "botanical_extracted"),
        ("Species matched", "species_matched"),
        ("Species NOT matched", "species_not_matched"),
        ("Cultivars existed", "cultivar_existed"),
        ("Cultivars created", "cultivar_created"),
        ("Links existed", "link_existed"),
        ("Links created", "link_created"),
        ("Errors", "errors"),
    ):
        print(f"{label}: {stats[key]}")
    if new_cultivars:
        print(f"\n--- New cultivars ({len(new_cultivars)}) ---")
        for cv in new_cultivars:
            print(f" + {cv['name']} ({cv.get('species', '?')})")
    if new_links:
        print(f"\n--- New supplier links ({len(new_links)}) ---")
        for lk in new_links:
            print(f" + {lk['cultivar']} -> {lk.get('article', '?')}")
    if unmatched_species:
        print(f"\n--- Unmatched species ({len(unmatched_species)}) ---")
        # Most frequent first.
        for name, count in sorted(unmatched_species.items(), key=lambda x: -x[1]):
            print(f" ? {name} (x{count})")


def main():
    """Run the scraper: load API lookups, crawl every category, print a report.

    Each URL found on a category page is fetched. Product pages are handed to
    ``process_product``; any other page is treated as a subcategory and the
    deeper links it contains are fetched and processed in turn (one level).
    Every URL is visited at most once per category.
    """
    print("=" * 60)
    print("Reinsaat v3 Scraper")
    print("=" * 60)

    # Step 1: Load all species
    print("\n[1/4] Loading species from API...")
    species_map = get_all_species()
    print(f" Loaded {len(species_map)} species")

    # Step 2: Load all cultivars
    print("\n[2/4] Loading cultivars from API...")
    cultivar_map = get_all_cultivars()
    print(f" Loaded {len(cultivar_map)} cultivars")

    # Step 3: Get Reinsaat supplier
    print("\n[3/4] Getting Reinsaat supplier...")
    supplier = get_reinsaat_supplier()
    supplier_id = supplier["id"]
    print(f" Reinsaat ID: {supplier_id}")

    # Step 4: Scrape categories
    print(f"\n[4/4] Scraping {len(CATEGORIES)} categories...")
    stats = {
        "products_found": 0,
        "botanical_extracted": 0,
        "species_matched": 0,
        "species_not_matched": 0,
        "cultivar_existed": 0,
        "cultivar_created": 0,
        "link_existed": 0,
        "link_created": 0,
        "errors": 0,
    }
    unmatched_species = {}  # botanical_norm -> count
    new_cultivars = []
    new_links = []

    def handle(page_html, page_url):
        # Single place that forwards the shared accumulators.
        process_product(
            page_html, page_url, species_map, cultivar_map,
            supplier_id, stats, unmatched_species,
            new_cultivars, new_links,
        )

    for cat_i, cat in enumerate(CATEGORIES):
        print(f"\n--- [{cat_i+1}/{len(CATEGORIES)}] {cat} ---")
        urls = get_product_urls_from_category(cat)
        print(f" Found {len(urls)} URLs")
        # URLs already scheduled or visited in this category.
        seen = set(urls)
        for url in urls:
            time.sleep(DELAY)
            try:
                html = fetch_page(url)
            except Exception as e:
                print(f" ERROR fetching {url}: {e}")
                stats["errors"] += 1
                continue

            if is_product_page(html):
                handle(html, url)
                continue

            # Not a product: treat it as a subcategory and follow the deeper
            # links it lists that were not on the category page itself.
            for sub_url in _deeper_category_links(html, cat, url):
                if sub_url in seen:
                    continue
                seen.add(sub_url)
                time.sleep(DELAY)
                try:
                    sub_html = fetch_page(sub_url)
                except Exception as e:
                    print(f" ERROR fetching {sub_url}: {e}")
                    stats["errors"] += 1
                    continue
                if is_product_page(sub_html):
                    handle(sub_html, sub_url)

    # Report
    _print_report(stats, new_cultivars, new_links, unmatched_species)
    print("\nDone.")
def _read_error_body(err):
    """Return the response body of an HTTPError as text, tolerating bad bytes."""
    return err.read().decode("utf-8", errors="replace")


def _loose_name(name):
    """Lower-case *name* and drop punctuation for tolerant comparison."""
    return re.sub(r"[^\w\s]", "", name.lower())


def _find_colliding_cultivar(cultivar_name, species_id):
    """Search the API for the cultivar a failed create most likely collided with.

    Returns the matching cultivar record, or None when nothing fits.
    """
    cn_lower = cultivar_name.lower().strip()
    cn_clean = _loose_name(cn_lower)

    # Strategy 1: exact (case-insensitive) name; prefer the same species.
    candidates = api_get(
        "/cultivars", {"search": cultivar_name, "per_page": 50}
    ).get("data", [])
    exact = [cv for cv in candidates if cv["name"].lower().strip() == cn_lower]
    for cv in exact:
        if cv["species_id"] == species_id:
            return cv
    if exact:
        return exact[0]

    # Strategy 2: same species, names equal or contained in each other once
    # punctuation is ignored. Empty names are excluded from the containment
    # test because the empty string is contained in everything.
    for cv in candidates:
        if cv["species_id"] != species_id:
            continue
        cv_clean = _loose_name(cv["name"])
        if cv_clean == cn_clean:
            return cv
        if cv_clean and cn_clean and (cv_clean in cn_clean or cn_clean in cv_clean):
            return cv

    # Strategy 3: search by the last significant word, same species, names
    # equal once punctuation is ignored.
    words = [w for w in cultivar_name.split() if len(w) > 2]
    if words:
        retry = api_get("/cultivars", {"search": words[-1], "per_page": 50})
        for cv in retry.get("data", []):
            if cv["species_id"] == species_id and _loose_name(cv["name"]) == cn_clean:
                return cv
    return None


def _get_or_create_cultivar(prod, url, species, cultivar_map, stats, new_cultivars):
    """Return the id of the cultivar for *prod*, creating it when missing.

    Updates *cultivar_map*, *stats* and *new_cultivars* in place. Returns None
    after printing a message when the cultivar can be neither created nor found.
    """
    species_id = species["id"]
    cultivar_name = prod["name"]
    cv_key = (species_id, cultivar_name.lower().strip())

    existing_cv = cultivar_map.get(cv_key)
    if existing_cv:
        stats["cultivar_existed"] += 1
        return existing_cv["id"]

    create_data = {
        "species_id": species_id,
        "name": cultivar_name,
        "is_organic": True,
        "source_urls": [url],
    }
    # Add growing data if we extracted any
    for field in ("planting_depth_cm", "row_spacing_cm",
                  "plant_spacing_cm", "germination_temp_c"):
        if field in prod:
            create_data[field] = prod[field]

    try:
        new_cv = api_post("/cultivars", create_data)
    except urllib.error.HTTPError as e:
        body = _read_error_body(e)
        if e.code == 500 and "Database error" in body:
            # Likely slug collision - search for existing cultivar
            try:
                found = _find_colliding_cultivar(cultivar_name, species_id)
                found_id = found["id"] if found else None
            except Exception as e2:
                print(f" ERROR searching for '{cultivar_name}' after collision: {e2}")
                stats["errors"] += 1
                return None
            if found:
                cultivar_map[cv_key] = found
                stats["cultivar_existed"] += 1
                return found_id
            print(f" WARN: could not create or find cultivar '{cultivar_name}' (DB error + no search match)")
            stats["errors"] += 1
            return None
        print(f" ERROR creating cultivar '{cultivar_name}': {e.code} {body}")
        stats["errors"] += 1
        return None
    except OSError as e:
        # Connection failure or timeout: skip this product, keep the run alive.
        print(f" ERROR creating cultivar '{cultivar_name}': {e}")
        stats["errors"] += 1
        return None

    cultivar_id = new_cv["id"]
    stats["cultivar_created"] += 1
    new_cultivars.append({
        "name": cultivar_name,
        "species": species["name"],
        "id": cultivar_id,
    })
    # Add to local cache
    cultivar_map[cv_key] = new_cv
    print(f" + Created cultivar: {cultivar_name} ({species['name']})")
    return cultivar_id


def _build_link_payload(prod, supplier_id, url):
    """Assemble the supplier-link payload from the extracted product data."""
    link_data = {
        "supplier_id": supplier_id,
        "product_url": url,
    }
    if "article_number" in prod:
        link_data["article_number"] = prod["article_number"]
    # Prefer the price read from the "Port." table column over JSON-LD.
    if "port_price" in prod:
        link_data["price_eur"] = prod["port_price"]
    elif "price_eur" in prod:
        link_data["price_eur"] = prod["price_eur"]
    for field in ("pack_size", "pack_unit"):
        if field in prod:
            link_data[field] = prod[field]
    return link_data


def process_product(html, url, species_map, cultivar_map, supplier_id,
                    stats, unmatched_species, new_cultivars, new_links):
    """Process a single product page.

    Extracts the product data, matches its botanical name against
    *species_map*, finds or creates the cultivar and makes sure it carries a
    supplier link for *supplier_id*.

    Args:
        html: Product page source.
        url: Product page URL.
        species_map: Normalised botanical name -> species record.
        cultivar_map: ``(species_id, lower-case name)`` -> cultivar record;
            extended in place with cultivars created or found here.
        supplier_id: Id of the supplier the link is created for.
        stats: Counter dict, updated in place.
        unmatched_species: Botanical name -> occurrence count of names with no
            species match, updated in place.
        new_cultivars: List receiving a summary of every cultivar created.
        new_links: List receiving a summary of every supplier link created.
    """
    stats["products_found"] += 1
    prod = extract_product_data(html, url)
    if not prod.get("name"):
        return

    bot_norm = prod.get("botanical_norm")
    if not bot_norm:
        # No botanical name found on page
        stats["species_not_matched"] += 1
        unmatched_species["(no botanical name)"] = unmatched_species.get("(no botanical name)", 0) + 1
        return
    stats["botanical_extracted"] += 1

    # Match species
    species = species_map.get(bot_norm)
    if not species:
        stats["species_not_matched"] += 1
        unmatched_species[bot_norm] = unmatched_species.get(bot_norm, 0) + 1
        return
    stats["species_matched"] += 1

    cultivar_name = prod["name"]
    cultivar_id = _get_or_create_cultivar(
        prod, url, species, cultivar_map, stats, new_cultivars
    )
    if cultivar_id is None:
        return

    # Check if Reinsaat supplier link exists
    try:
        existing_links = get_cultivar_suppliers(cultivar_id)
    except Exception as e:
        # Best effort: assume no link yet, but say so instead of hiding it.
        print(f" WARN: could not list supplier links for '{cultivar_name}': {e}")
        existing_links = []
    if any(l["supplier_id"] == supplier_id for l in existing_links):
        stats["link_existed"] += 1
        return

    # Create supplier link
    link_data = _build_link_payload(prod, supplier_id, url)
    try:
        api_post(f"/cultivars/{cultivar_id}/suppliers", link_data)
    except urllib.error.HTTPError as e:
        body = _read_error_body(e)
        print(f" ERROR linking '{cultivar_name}': {e.code} {body}")
        stats["errors"] += 1
    except OSError as e:
        # Connection failure or timeout: record it and move on.
        print(f" ERROR linking '{cultivar_name}': {e}")
        stats["errors"] += 1
    else:
        stats["link_created"] += 1
        new_links.append({
            "cultivar": cultivar_name,
            "article": prod.get("article_number", "?"),
            "url": url,
        })
# Run the scraper only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()