Files
herbapi/tools/scrapers/scrape_reinsaat_v3.py
T

636 lines
23 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""Reinsaat v3 scraper - uses HerbAPI REST API, robust botanical name matching."""
import json
import os
import re
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from html import unescape
# --- Config ---
API_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
# SECURITY: a bearer token committed to source control should be treated as
# leaked. Set HERBAPI_TOKEN in the environment to override it; the literal is
# kept only as a backward-compatible fallback and should be rotated.
API_TOKEN = os.environ.get("HERBAPI_TOKEN") or "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
REINSAAT_BASE = "https://www.reinsaat.at"
# Pause (seconds) between consecutive page fetches.
DELAY = 0.3

# Categories to scrape (seed products only, skip books/bulbs/peonies/potatoes/gift/seed_tapes)
CATEGORIES = [
    "beans",
    "peas",
    "florence_fennel",
    "cucumbers",
    "brassica",
    "garden_cress",
    "pumpkins_squash",
    "corn",
    "swiss_chard",
    "aubergine_eggplants",
    "melons",
    "carrots",
    "sweet_pepper",
    "chilli_peppers_chill",
    "parsnips",
    "parsley",
    "parsley_root",
    "leeks",
    "radish",
    "beetroot",
    "lettuce",
    "black_salsify",
    "celery",
    "spinach",
    "tomatoes",
    "zucchini_courgette",
    "onion_garlic",
    "culinary_and_aromatic_herbs",
    "conservation_varieties",
    "flowers_and_herbs",
    "wild_flowers_seeds",
    "green_manure",
]

# Suffixes to strip from botanical names (authority names, infraspecific ranks)
# NOTE(review): no function visible in this file reads STRIP_SUFFIXES; confirm
# whether it is still needed before removing it.
STRIP_SUFFIXES = {
    "l.", "mill.", "dc.", "l", "convar.", "convar", "var.", "var",
    "subsp.", "subsp", "ssp.", "ssp", "f.", "em.", "auct.",
    "hort.", "medik.", "moench", "pers.", "salisb.", "thunb.",
    "crantz", "gaertn.", "lam.", "link", "siebold", "zucc.",
    "sat.", "sat", "axillare", "medikus",
}
def api_get(path, params=None, timeout=30):
    """GET a HerbAPI resource and return its decoded JSON body.

    path: resource path appended to API_BASE, e.g. "/species".
    params: optional mapping of query parameters; URL-encoded onto the path.
    timeout: socket timeout in seconds. Without one, a stalled connection
        would block the whole scraper indefinitely.

    Errors raised by urlopen (urllib.error.HTTPError / URLError, timeouts)
    and by JSON decoding propagate to the caller.
    """
    url = f"{API_BASE}{path}"
    if params:
        url = f"{url}?{urllib.parse.urlencode(params)}"
    req = urllib.request.Request(url)
    req.add_header("Authorization", f"Bearer {API_TOKEN}")
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())
def api_post(path, data, timeout=30):
    """POST *data* as JSON to a HerbAPI resource and return the decoded JSON reply.

    path: resource path appended to API_BASE, e.g. "/cultivars".
    data: JSON-serialisable payload sent as the request body.
    timeout: socket timeout in seconds. Without one, a stalled connection
        would block the whole scraper indefinitely.

    Errors raised by urlopen (urllib.error.HTTPError / URLError, timeouts)
    and by JSON decoding propagate to the caller.
    """
    url = f"{API_BASE}{path}"
    body = json.dumps(data).encode()
    req = urllib.request.Request(url, data=body, method="POST")
    req.add_header("Authorization", f"Bearer {API_TOKEN}")
    req.add_header("Content-Type", "application/json")
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())
def fetch_page(url):
    """Download *url* and return the response body as text.

    The request carries the scraper's User-Agent and uses a 15 second
    timeout. The body is decoded as UTF-8; undecodable bytes are replaced
    rather than raising.
    """
    request = urllib.request.Request(
        url,
        headers={"User-Agent": "Mozilla/5.0 (HerbAPI Scraper)"},
    )
    with urllib.request.urlopen(request, timeout=15) as response:
        payload = response.read()
    return payload.decode("utf-8", errors="replace")
# Known misspellings, keyed either by lowercase genus or by lowercase
# "genus species", mapped to the corrected spelling.
BOTANICAL_TYPOS = {
    "capscicum": "capsicum",
    "capsicum frutenscens": "capsicum frutescens",
    "tropaelum": "tropaeolum",
    "lact.": "lactuca",
}

# Abbreviated binomials (lowercase, trailing period included) mapped to the
# full lowercase "genus species" name. Matched as a prefix of the cleaned input.
ABBREVIATED_NAMES = {
    "origanum vulg.": "origanum vulgare",
    "helichrysum bract.": "helichrysum bracteatum",
    "campanula lat.": "campanula latifolia",
    "cosmos bip.": "cosmos bipinnatus",
    "papaver somnif.": "papaver somniferum",
}


def normalise_botanical(raw):
    """Strip botanical name to genus + species only.

    'Pisum sativum L. convar. sat.' -> 'pisum sativum'
    'Solanum lycopersicum L.' -> 'solanum lycopersicum'
    'Beta vulgaris L. ssp. vulgaris' -> 'beta vulgaris'
    'Origanum vulg.' -> 'origanum vulgare'

    Returns the lowercase "genus species" string, or None when *raw* is
    empty, has fewer than two words, or its second word looks like an
    authority (capitalised) rather than a species epithet.
    """
    if not raw:
        return None

    # Decode HTML entities, normalise non-breaking spaces, drop "(...)" groups.
    cleaned = unescape(raw).replace("\xa0", " ")
    cleaned = re.sub(r"\([^)]*\)", "", cleaned).strip()

    # Abbreviations must be checked BEFORE trailing punctuation is removed:
    # every key ends with a period, so stripping first would make a bare
    # "Origanum vulg." impossible to match.
    lowered = cleaned.lower()
    for abbrev, full in ABBREVIATED_NAMES.items():
        if lowered.startswith(abbrev):
            return full

    # Remove trailing commas/periods, then split into words.
    parts = cleaned.rstrip(",. ").split()
    if len(parts) < 2:
        return None

    # Genus (capitalised) + species (lowercase)
    genus = parts[0].lower().rstrip(",")
    species = parts[1].lower().rstrip(",")
    if not genus or not species:
        # A token consisting only of commas leaves nothing to validate.
        return None

    # Fix known typos, first on the genus alone, then on the full binomial.
    genus = BOTANICAL_TYPOS.get(genus, genus)
    full_name = f"{genus} {species}"
    if full_name in BOTANICAL_TYPOS:
        genus, species = BOTANICAL_TYPOS[full_name].split()

    # Validate: both words must start with a letter.
    if not genus[0].isalpha() or not species[0].isalpha():
        return None
    # Skip if species looks like an authority (starts with uppercase in original)
    if parts[1][0].isupper():
        return None
    return f"{genus} {species}"
def _positive_price(value):
    """Return *value* as a positive number, or None if it is not one.

    Accepts ints/floats and numeric strings such as "2.95" or "2,95",
    because JSON-LD offers may carry the price as a number or as text.
    """
    if isinstance(value, bool):
        return None
    if isinstance(value, str):
        try:
            value = float(value.strip().replace(",", "."))
        except ValueError:
            return None
    if isinstance(value, (int, float)) and value > 0:
        return value
    return None


def _cell_texts(row_html):
    """Return the tag-stripped, trimmed text of every <td> in one table row."""
    cells = re.findall(r"<td[^>]*>(.*?)</td>", row_html, re.S)
    return [re.sub(r"<[^>]+>", "", cell).strip() for cell in cells]


def extract_product_data(html, url):
    """Extract product info from a Reinsaat product page.

    html: page source as text.
    url: page URL; stored unchanged under "url".

    Returns a dict that always contains "url" and, when the matching
    markup or text is present, any of: "name", "botanical_raw",
    "botanical_norm", "article_number", "price_eur", "port_price",
    "pack_sizes", "planting_depth_cm", "row_spacing_cm",
    "plant_spacing_cm", "germination_temp_c", "pack_size", "pack_unit".
    """
    result = {}

    # H1 = variety name
    m = re.search(r'<h1[^>]*>([^<]+)</h1>', html)
    if m:
        name = unescape(m.group(1)).strip()
        # Clean up names like "RS-To-01.26 (Alda)" -> "Alda"
        paren = re.search(r"\(([^)]+)\)", name)
        if paren and name.startswith("RS-"):
            name = paren.group(1).strip()
        result["name"] = name

    # Botanical name from fce_shop_kurztext
    m = re.search(
        r'fce_shop_kurztext[^>]*>\s*(?:<em[^>]*>)?\s*([^<]+?)\s*(?:</em>)?\s*</div>',
        html,
    )
    if m:
        botanical_raw = unescape(m.group(1)).replace("\xa0", " ").strip()
        result["botanical_raw"] = botanical_raw
        result["botanical_norm"] = normalise_botanical(botanical_raw)

    # Article number and price from the first JSON-LD block of type Product
    for jm in re.finditer(
        r'<script type="application/ld\+json">(.*?)</script>', html, re.S
    ):
        try:
            jd = json.loads(jm.group(1))
        except json.JSONDecodeError:
            continue
        # JSON-LD may also be a list or scalar; only objects can be a Product.
        if not isinstance(jd, dict) or jd.get("@type") != "Product":
            continue
        if "model" in jd:
            result["article_number"] = str(jd["model"])
        # Get smallest pack price (usually the Portion)
        offers = jd.get("offers", {})
        if isinstance(offers, dict):
            offer_list = offers.get("offers", [])
        else:
            offer_list = offers
        if not isinstance(offer_list, list):
            offer_list = []
        prices = []
        for offer in offer_list:
            if not isinstance(offer, dict):
                continue
            price = _positive_price(offer.get("price"))
            if price is not None:
                prices.append(price)
        if prices:
            result["price_eur"] = min(prices)
        break

    # Price table - get pack sizes from the first table with at least two rows
    for tbl in re.findall(r"<table[^>]*>(.*?)</table>", html, re.S):
        # NOTE(review): as written this guard never skips anything, because
        # the empty string is contained in every string. A marker character
        # may have been lost from the literal - confirm the intended filter.
        if "" not in tbl:
            continue
        rows = re.findall(r"<tr[^>]*>(.*?)</tr>", tbl, re.S)
        if len(rows) < 2:
            continue
        size_texts = _cell_texts(rows[0])
        price_texts = _cell_texts(rows[1])
        # Find the "Port." entry and read the price in the same column
        for i, size_text in enumerate(size_texts):
            if "Port" not in size_text:
                continue
            if i < len(price_texts):
                # Decode entities first so "&#8364; 2,95" does not yield 8364,
                # and require digits so a stray "." cannot reach float().
                price_text = unescape(price_texts[i]).replace(",", ".")
                pm = re.search(r"\d+(?:\.\d*)?|\.\d+", price_text)
                if pm:
                    result["port_price"] = float(pm.group())
            break
        # Get portion content info
        result["pack_sizes"] = size_texts
        break

    # Sowing depth: single value or range, stored as the midpoint
    m = re.search(r"(?:sowing|seed)\s*depth[:\s]*(?:approx\.?\s*)?(\d+[\.,]?\d*)\s*(?:-\s*(\d+[\.,]?\d*)\s*)?cm", html, re.I)
    if m:
        d1 = float(m.group(1).replace(",", "."))
        d2 = float(m.group(2).replace(",", ".")) if m.group(2) else d1
        result["planting_depth_cm"] = round((d1 + d2) / 2, 2)

    # Spacing: "row spacing NNxNN cm" or "NN x NN cm"; outdoor spacing is
    # tried first, then an explicit row spacing, then any NN x NN cm.
    spacing_patterns = (
        r"(?:outdoors?|field)[^.]*?(\d+)\s*(?:x|×)\s*(\d+)\s*cm",
        r"row\s*spacing\s*(\d+)\s*(?:x|×)\s*(\d+)\s*cm",
        r"(\d+)\s*(?:x|×)\s*(\d+)\s*cm",
    )
    for pattern in spacing_patterns:
        m = re.search(pattern, html, re.I)
        if m:
            result["row_spacing_cm"] = float(m.group(1))
            result["plant_spacing_cm"] = float(m.group(2))
            break

    # Row spacing without plant spacing (e.g. "row spacing 30-45 cm");
    # the midpoint of a range is floored to a whole centimetre.
    if "row_spacing_cm" not in result:
        m = re.search(r"row\s*spacing\s*(\d+)(?:\s*-\s*(\d+))?\s*cm", html, re.I)
        if m:
            r1 = int(m.group(1))
            r2 = int(m.group(2)) if m.group(2) else r1
            result["row_spacing_cm"] = float((r1 + r2) // 2)

    # Germination temperature; the midpoint of a range is floored to a whole degree
    m = re.search(r"germination\s*temp[^:]*:\s*(\d+)\s*(?:-\s*(\d+))?\s*°?\s*C", html, re.I)
    if m:
        t1 = int(m.group(1))
        t2 = int(m.group(2)) if m.group(2) else t1
        result["germination_temp_c"] = float((t1 + t2) // 2)

    # Pack unit from portion info - "20 seeds" or "25 g" etc
    portion_m = re.search(r"[Pp]ortion\s*(?:contents?)?[:\s]*(\d+[\.,]?\d*)\s*(seeds?|Korn|g|kg)", html)
    if not portion_m:
        # Try "Port. (20 seeds)" format
        portion_m = re.search(r"Port[.\w]*\s*\(?\s*(\d+[\.,]?\d*)\s*(seeds?|Korn|g|kg)", html)
    if portion_m:
        result["pack_size"] = float(portion_m.group(1).replace(",", "."))
        unit = portion_m.group(2).lower()
        # Seed counts are stored under the single unit name "Korn".
        result["pack_unit"] = "Korn" if unit in ("seed", "seeds", "korn") else unit

    result["url"] = url
    return result
def get_all_species():
    """Page through the API's /species listing and index it by normalised name.

    Returns a dict mapping each normalised botanical name to a small record
    with the species' "id", "slug" and original scientific "name". Species
    whose name cannot be normalised are left out. Paging stops after the
    first page that holds fewer than 100 entries.
    """
    page_size = 100
    lookup = {}
    page = 0
    received = page_size
    while received >= page_size:
        page += 1
        response = api_get("/species", {"per_page": 100, "page": page})
        records = response.get("data", [])
        for record in records:
            key = normalise_botanical(record["name_scientific"])
            if not key:
                continue
            lookup[key] = {
                "id": record["id"],
                "slug": record["slug"],
                "name": record["name_scientific"],
            }
        received = len(records)
        print(f" page {page}: {received} species (total so far: {len(lookup)})")
    return lookup
def get_all_cultivars():
    """Page through the API's /cultivars listing and index every record.

    Returns a dict keyed by (species_id, lowercased and stripped cultivar
    name) whose values are the cultivar records as returned by the API.
    Paging stops after the first page that holds fewer than 100 entries.
    """
    page_size = 100
    index = {}  # (species_id, lower_name) -> cultivar
    page = 0
    received = page_size
    while received >= page_size:
        page += 1
        response = api_get("/cultivars", {"per_page": 100, "page": page})
        records = response.get("data", [])
        for record in records:
            lowered_name = record["name"].lower().strip()
            index[(record["species_id"], lowered_name)] = record
        received = len(records)
        print(f" page {page}: {received} cultivars (total so far: {len(index)})")
    return index
def get_reinsaat_supplier():
    """Return the supplier record whose slug is "reinsaat".

    Raises RuntimeError when the /suppliers listing holds no such record.
    """
    match = next(
        (entry for entry in api_get("/suppliers") if entry["slug"] == "reinsaat"),
        None,
    )
    if match is None:
        raise RuntimeError("Reinsaat supplier not found in API")
    return match
def get_cultivar_suppliers(cultivar_id):
    """Fetch the supplier links the API currently holds for one cultivar."""
    endpoint = f"/cultivars/{cultivar_id}/suppliers"
    return api_get(endpoint)
def get_product_urls_from_category(cat_slug):
    """Return absolute URLs of all links found below a category page.

    cat_slug: category path segment, e.g. "tomatoes".

    Every distinct href of the form /shop/EN/<cat_slug>/.../ on the category
    page is returned, sorted, with REINSAAT_BASE prepended. No distinction
    is made here between product pages and subcategory pages; the caller
    has to inspect each page. Returns an empty list (after printing a
    warning) when the category page cannot be fetched.
    """
    cat_url = f"{REINSAAT_BASE}/shop/EN/{cat_slug}/"
    try:
        html = fetch_page(cat_url)
    except Exception as e:
        # Best effort: one unreachable category must not abort the whole run.
        print(f" WARN: Failed to fetch category {cat_slug}: {e}")
        return []
    time.sleep(DELAY)
    links = re.findall(rf'href="(/shop/EN/{re.escape(cat_slug)}/[^"]+/)"', html)
    return [REINSAAT_BASE + link for link in sorted(set(links))]
def is_product_page(html):
    """Tell whether *html* looks like a product page.

    True when the page contains the botanical-name container marker or a
    JSON-LD "@type" of "Product"; False otherwise.
    """
    markers = (
        r'fce_shop_kurztext',
        r'"@type":\s*"Product"',
    )
    return any(re.search(marker, html) for marker in markers)
def _subcategory_links(html, cat, url):
    """Return absolute URLs linked from *html* that lie deeper than page *url*.

    Only hrefs under /shop/EN/<cat>/ are considered. Depth is compared
    path-to-path with trailing slashes removed on both sides; comparing a
    site-relative href against the full URL (scheme slashes included) would
    reject the direct children of the page.
    """
    page_depth = urllib.parse.urlsplit(url).path.rstrip("/").count("/")
    prefix = f"/shop/EN/{cat}/"
    paths = set(re.findall(r'href="(/shop/EN/[^"]+/)"', html))
    return [
        REINSAAT_BASE + path
        for path in sorted(paths)
        if path.startswith(prefix) and path.rstrip("/").count("/") > page_depth
    ]


def _print_report(stats, new_cultivars, new_links, unmatched_species):
    """Print the end-of-run summary: counters, created records, unmatched names."""
    print("\n" + "=" * 60)
    print("RESULTS")
    print("=" * 60)
    print(f"Products found: {stats['products_found']}")
    print(f"Botanical extracted: {stats['botanical_extracted']}")
    print(f"Species matched: {stats['species_matched']}")
    print(f"Species NOT matched: {stats['species_not_matched']}")
    print(f"Cultivars existed: {stats['cultivar_existed']}")
    print(f"Cultivars created: {stats['cultivar_created']}")
    print(f"Links existed: {stats['link_existed']}")
    print(f"Links created: {stats['link_created']}")
    print(f"Errors: {stats['errors']}")
    if new_cultivars:
        print(f"\n--- New cultivars ({len(new_cultivars)}) ---")
        for cv in new_cultivars:
            print(f" + {cv['name']} ({cv.get('species', '?')})")
    if new_links:
        print(f"\n--- New supplier links ({len(new_links)}) ---")
        for lk in new_links:
            print(f" + {lk['cultivar']} -> {lk.get('article', '?')}")
    if unmatched_species:
        print(f"\n--- Unmatched species ({len(unmatched_species)}) ---")
        # Most frequent unmatched names first.
        for name, count in sorted(unmatched_species.items(), key=lambda x: -x[1]):
            print(f" ? {name} (x{count})")
    print("\nDone.")


def main():
    """Run the scraper end to end.

    Loads species and cultivars from the API, looks up the Reinsaat
    supplier, then walks every category in CATEGORIES: each linked page is
    fetched; product pages are handed to process_product, and pages that
    are not product pages are treated as subcategories whose deeper links
    are fetched and processed the same way. A summary is printed at the end.
    """
    print("=" * 60)
    print("Reinsaat v3 Scraper")
    print("=" * 60)

    # Step 1: Load all species
    print("\n[1/4] Loading species from API...")
    species_map = get_all_species()
    print(f" Loaded {len(species_map)} species")

    # Step 2: Load all cultivars
    print("\n[2/4] Loading cultivars from API...")
    cultivar_map = get_all_cultivars()
    print(f" Loaded {len(cultivar_map)} cultivars")

    # Step 3: Get Reinsaat supplier
    print("\n[3/4] Getting Reinsaat supplier...")
    supplier = get_reinsaat_supplier()
    supplier_id = supplier["id"]
    print(f" Reinsaat ID: {supplier_id}")

    # Step 4: Scrape categories
    print(f"\n[4/4] Scraping {len(CATEGORIES)} categories...")
    stats = {
        "products_found": 0,
        "botanical_extracted": 0,
        "species_matched": 0,
        "species_not_matched": 0,
        "cultivar_existed": 0,
        "cultivar_created": 0,
        "link_existed": 0,
        "link_created": 0,
        "errors": 0,
    }
    unmatched_species = {}  # botanical_norm -> count
    new_cultivars = []
    new_links = []
    # Arguments every process_product call shares.
    shared = (
        species_map, cultivar_map, supplier_id, stats,
        unmatched_species, new_cultivars, new_links,
    )

    for cat_i, cat in enumerate(CATEGORIES):
        print(f"\n--- [{cat_i+1}/{len(CATEGORIES)}] {cat} ---")
        urls = get_product_urls_from_category(cat)
        print(f" Found {len(urls)} URLs")
        # URLs already scheduled or visited in this category, so a page
        # reachable from several subcategories is only processed once.
        seen = set(urls)
        for url in urls:
            time.sleep(DELAY)
            try:
                html = fetch_page(url)
            except Exception as e:
                print(f" ERROR fetching {url}: {e}")
                stats["errors"] += 1
                continue

            if is_product_page(html):
                process_product(html, url, *shared)
                continue

            # Not a product page: might be a subcategory - follow its links.
            for sub_url in _subcategory_links(html, cat, url):
                if sub_url in seen:
                    continue  # already in list
                seen.add(sub_url)
                time.sleep(DELAY)
                try:
                    sub_html = fetch_page(sub_url)
                except Exception as e:
                    print(f" ERROR fetching {sub_url}: {e}")
                    stats["errors"] += 1
                    continue
                if not is_product_page(sub_html):
                    continue
                process_product(sub_html, sub_url, *shared)

    # Report
    _print_report(stats, new_cultivars, new_links, unmatched_species)
def _strip_punctuation(text):
    """Lowercase *text* and drop every character that is not a word character or whitespace."""
    return re.sub(r'[^\w\s]', '', text.lower())


def _find_existing_cultivar(cultivar_name, species_id):
    """Search the API for a stored cultivar matching *cultivar_name*.

    Used after a failed create that looks like a collision. Three
    strategies are tried in order; the first hit is returned, else None:
      1. search by full name, exact case-insensitive name match
         (a match in the same species is preferred over other species);
      2. same search results, same species, names equal or contained in
         one another once punctuation is ignored;
      3. search by the last word longer than two characters, same species,
         names equal once punctuation is ignored.
    """
    wanted = cultivar_name.lower().strip()
    wanted_clean = _strip_punctuation(wanted)

    # Strategy 1: search by full name
    search_data = api_get("/cultivars", {"search": cultivar_name, "per_page": 50})
    candidates = search_data.get("data", [])
    exact = [cv for cv in candidates if cv["name"].lower().strip() == wanted]
    for cv in exact:
        if cv.get("species_id") == species_id:
            return cv
    if exact:
        return exact[0]

    # Strategy 2: match by species_id + partial name
    for cv in candidates:
        if cv["species_id"] != species_id:
            continue
        cv_clean = _strip_punctuation(cv["name"])
        if cv_clean == wanted_clean:
            return cv
        # An empty string is a substring of everything, so only compare
        # containment when both cleaned names actually have content.
        if cv_clean and wanted_clean and (cv_clean in wanted_clean or wanted_clean in cv_clean):
            return cv

    # Strategy 3: search by last significant word
    words = [w for w in cultivar_name.split() if len(w) > 2]
    if words:
        search2 = api_get("/cultivars", {"search": words[-1], "per_page": 50})
        for cv in search2.get("data", []):
            if cv["species_id"] == species_id and _strip_punctuation(cv["name"]) == wanted_clean:
                return cv
    return None


def process_product(html, url, species_map, cultivar_map, supplier_id,
                    stats, unmatched_species, new_cultivars, new_links):
    """Process a single product page.

    Extracts the product data from *html*, matches its botanical name
    against *species_map*, finds or creates the cultivar through the API,
    and creates a supplier link for *supplier_id* unless one already exists.

    The mutable arguments are updated in place: *stats* counters are
    incremented, *unmatched_species* counts names with no species match,
    *cultivar_map* caches found/created cultivars, and *new_cultivars* /
    *new_links* collect summaries of created records. Returns None.
    HTTP errors from create calls are reported and counted, not raised.
    """
    stats["products_found"] += 1
    prod = extract_product_data(html, url)
    if not prod.get("name"):
        return

    bot_norm = prod.get("botanical_norm")
    if not bot_norm:
        # No botanical name found on page
        stats["species_not_matched"] += 1
        unmatched_species["(no botanical name)"] = unmatched_species.get("(no botanical name)", 0) + 1
        return
    stats["botanical_extracted"] += 1

    # Match species
    species = species_map.get(bot_norm)
    if not species:
        stats["species_not_matched"] += 1
        unmatched_species[bot_norm] = unmatched_species.get(bot_norm, 0) + 1
        return
    stats["species_matched"] += 1

    species_id = species["id"]
    cultivar_name = prod["name"]

    # Check if cultivar exists
    cv_key = (species_id, cultivar_name.lower().strip())
    existing_cv = cultivar_map.get(cv_key)
    if existing_cv:
        stats["cultivar_existed"] += 1
        cultivar_id = existing_cv["id"]
    else:
        # Create cultivar
        create_data = {
            "species_id": species_id,
            "name": cultivar_name,
            "is_organic": True,
            "source_urls": [url],
        }
        # Add growing data if we extracted any
        for field in ("planting_depth_cm", "row_spacing_cm",
                      "plant_spacing_cm", "germination_temp_c"):
            if field in prod:
                create_data[field] = prod[field]
        try:
            new_cv = api_post("/cultivars", create_data)
        except urllib.error.HTTPError as e:
            body = e.read().decode() if hasattr(e, 'read') else str(e)
            if not (e.code == 500 and "Database error" in body):
                print(f" ERROR creating cultivar '{cultivar_name}': {e.code} {body}")
                stats["errors"] += 1
                return
            # Likely slug collision - search for existing cultivar
            try:
                found = _find_existing_cultivar(cultivar_name, species_id)
                cultivar_id = found["id"] if found else None
            except Exception as e2:
                print(f" ERROR searching for '{cultivar_name}' after collision: {e2}")
                stats["errors"] += 1
                return
            if not found:
                print(f" WARN: could not create or find cultivar '{cultivar_name}' (DB error + no search match)")
                stats["errors"] += 1
                return
            cultivar_map[cv_key] = found
            stats["cultivar_existed"] += 1
        else:
            cultivar_id = new_cv["id"]
            stats["cultivar_created"] += 1
            new_cultivars.append({
                "name": cultivar_name,
                "species": species["name"],
                "id": cultivar_id,
            })
            # Add to local cache
            cultivar_map[cv_key] = new_cv
            print(f" + Created cultivar: {cultivar_name} ({species['name']})")

    # Check if Reinsaat supplier link exists
    try:
        existing_links = get_cultivar_suppliers(cultivar_id)
    except Exception as e:
        # Best effort: carry on as if there were no links, but say so -
        # a silent failure here can lead to a duplicate link being created.
        print(f" WARN: could not fetch supplier links for '{cultivar_name}': {e}")
        existing_links = []
    if any(link["supplier_id"] == supplier_id for link in existing_links):
        stats["link_existed"] += 1
        return

    # Create supplier link
    link_data = {
        "supplier_id": supplier_id,
        "product_url": url,
    }
    if "article_number" in prod:
        link_data["article_number"] = prod["article_number"]
    # The portion price from the price table wins over the JSON-LD minimum.
    if "port_price" in prod:
        link_data["price_eur"] = prod["port_price"]
    elif "price_eur" in prod:
        link_data["price_eur"] = prod["price_eur"]
    for field in ("pack_size", "pack_unit"):
        if field in prod:
            link_data[field] = prod[field]
    try:
        api_post(f"/cultivars/{cultivar_id}/suppliers", link_data)
    except urllib.error.HTTPError as e:
        body = e.read().decode() if hasattr(e, 'read') else str(e)
        print(f" ERROR linking '{cultivar_name}': {e.code} {body}")
        stats["errors"] += 1
        return
    stats["link_created"] += 1
    new_links.append({
        "cultivar": cultivar_name,
        "article": prod.get("article_number", "?"),
        "url": url,
    })
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()