Files
herbapi/tools/scrapers/scrape_reinsaat.py
T

561 lines
21 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Scrape cultivar data from Reinsaat (reinsaat.at) and push into HerbAPI.
Strategy:
1. Fetch category pages, recursively discover product pages via JSON-LD detection
2. Extract structured data from JSON-LD Product schema + HTML text for growing data
3. Match Latin names to existing species in the API
4. Create cultivar records and link them to Reinsaat supplier
"""
import json
import os
import re
import ssl
import time
import urllib.error
import urllib.parse
import urllib.request
from dataclasses import dataclass
from html.parser import HTMLParser
from typing import Optional
# ── Config ──────────────────────────────────────────────────────────────────
# Base URL that api_get()/api_post() prepend to every request path.
API_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
# SECURITY NOTE(review): a bearer token committed to source control must be
# considered leaked. Supply it through the HERBAPI_AUTH_TOKEN environment
# variable instead; the literal below is only kept as a backward-compatible
# fallback and should be rotated and removed.
AUTH_TOKEN = (
    os.environ.get("HERBAPI_AUTH_TOKEN")
    or "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
)
# Supplier record that every scraped cultivar gets linked to.
REINSAAT_SUPPLIER_ID = "019ced24-1702-72d1-9acc-90435441a5c4"
DELAY = 0.5  # seconds between requests
USER_AGENT = "HerbAPI-Scraper/1.0 (florian.berthold@sub-net.at)"

# ── Categories to scrape ────────────────────────────────────────────────────
# (category_url, default_species_hint for leaf pages in this category)
# A hint of None means the Latin name must be found on the product page itself.
CATEGORIES = [
    ("https://www.reinsaat.at/shop/DE/tomaten_paradeiser/", "Solanum lycopersicum"),
    ("https://www.reinsaat.at/shop/DE/kuechen-_und_gewuerzkraeuter/", None),
    ("https://www.reinsaat.at/shop/DE/kuerbis/", None),
    ("https://www.reinsaat.at/shop/DE/zucchini/", "Cucurbita pepo"),
    ("https://www.reinsaat.at/shop/DE/bohnen/", None),
    ("https://www.reinsaat.at/shop/DE/karotten_moehren_1/", "Daucus carota"),
    ("https://www.reinsaat.at/shop/DE/rote_ruebe/", "Beta vulgaris"),
    ("https://www.reinsaat.at/shop/DE/blumen_und_heilkraeuter/", None),
]

# ── Known Latin name genera we can match ────────────────────────────────────
# Regex alternation of genus names; only names starting with one of these are
# recognised by LATIN_PATTERN.
KNOWN_GENERA = (
    "Solanum|Cucurbita|Vicia|Phaseolus|Glycine|Daucus|Beta|Borago|Lavandula|"
    "Salvia|Melissa|Thymus|Calendula|Allium|Ocimum|Satureja|Origanum|Anethum|"
    "Foeniculum|Carum|Nigella|Levisticum|Rumex|Majorana|Hyssopus|Coriandrum|"
    "Petroselinum|Eruca|Tropaeolum|Lupinus|Helianthus|Tagetes|Zinnia|Cosmos|"
    "Papaver|Centaurea|Matricaria|Chrysanthemum|Antirrhinum|Lathyrus|Ipomoea|"
    "Phacelia|Trifolium|Symphytum|Urtica|Fragaria|Sambucus"
)
# Matches "<Genus> <epithet>", optionally followed by the author abbreviation
# "L." and/or an infraspecific rank ("ssp.", "var.", "subsp.") with its name.
# Group 1 is the whole matched name. The match is case-sensitive.
LATIN_PATTERN = re.compile(
    rf'((?:{KNOWN_GENERA})\s+[a-z]+(?:\s+L\.?)?(?:\s+(?:ssp|var|subsp)\.\s+[a-z]+)?)'
)
# ── HTML helpers ────────────────────────────────────────────────────────────
class TextExtractor(HTMLParser):
    """Collect the visible text fragments of an HTML document.

    After ``feed()``, ``parts`` holds every non-blank text node, stripped of
    surrounding whitespace and in document order. Text inside ``<script>``,
    ``<style>`` and ``<noscript>`` elements is left out.
    """

    _INVISIBLE_TAGS = frozenset({"script", "style", "noscript"})

    def __init__(self):
        super().__init__()
        self.parts = []
        # Number of invisible elements currently open; text is kept only at 0.
        self._skip = 0

    def handle_starttag(self, tag, attrs):
        if tag not in self._INVISIBLE_TAGS:
            return
        self._skip += 1

    def handle_endtag(self, tag):
        if tag not in self._INVISIBLE_TAGS:
            return
        # Never go below zero on a stray closing tag.
        if self._skip > 0:
            self._skip -= 1

    def handle_data(self, data):
        if self._skip:
            return
        text = data.strip()
        if text:
            self.parts.append(text)
def extract_links(html: str, base_url: str) -> list[str]:
    """Return the absolute targets of all ``<a href="...">`` links in *html*.

    Relative hrefs are resolved against *base_url*. Empty hrefs, pure
    fragment links ("#...") and "javascript:" pseudo-links are dropped.
    Each resolved URL appears once, at the position of its first occurrence.
    Only double-quoted href attributes are recognised.
    """
    raw_hrefs = (
        match.group(1)
        for match in re.finditer(r'<a\s[^>]*href="([^"]*)"', html, re.IGNORECASE)
    )
    resolved = (
        urllib.parse.urljoin(base_url, href)
        for href in raw_hrefs
        if href and not href.startswith(("#", "javascript:"))
    )
    # dict keys keep insertion order, which gives an order-preserving dedupe.
    return list(dict.fromkeys(resolved))
def extract_jsonld_product(html: str) -> Optional[dict]:
    """Return the first JSON-LD ``Product`` object embedded in *html*.

    Every ``<script type="application/ld+json">`` element is examined in
    document order. Besides a top-level Product object, this also finds a
    Product inside a top-level JSON array or inside an ``@graph`` list, and
    accepts ``@type`` given as a list that contains "Product".

    Script blocks whose content is not valid JSON are skipped.

    Returns:
        The Product object as a dict, or None when the page has none.
    """
    script_pattern = r"""<script[^>]*type=["']application/ld\+json["'][^>]*>(.*?)</script>"""
    for match in re.finditer(script_pattern, html, re.DOTALL | re.IGNORECASE):
        try:
            data = json.loads(match.group(1))
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        for node in _jsonld_nodes(data):
            if _is_jsonld_product(node):
                return node
    return None


def _jsonld_nodes(data):
    """Yield the candidate nodes of one parsed JSON-LD document."""
    if isinstance(data, list):
        yield from data
    elif isinstance(data, dict):
        # The object itself comes first so a top-level Product wins.
        yield data
        graph = data.get("@graph")
        if isinstance(graph, list):
            yield from graph


def _is_jsonld_product(node) -> bool:
    """Tell whether *node* is a JSON object typed as schema.org Product."""
    if not isinstance(node, dict):
        return False
    node_type = node.get("@type")
    if isinstance(node_type, list):
        return "Product" in node_type
    return node_type == "Product"
# ── HTTP helpers ────────────────────────────────────────────────────────────
# Shared TLS context (default certificate verification) for all page fetches.
_ssl_ctx = ssl.create_default_context()


def fetch_url(url: str, retries: int = 2) -> str:
    """Fetch *url* and return the response body as text.

    Transient failures (network errors, timeouts, HTTP 5xx, 408 and 429) are
    retried up to *retries* times with a 2 second pause in between. Other
    HTTP 4xx responses are permanent, so they are raised immediately instead
    of being retried.

    The body is decoded with the charset announced in the response headers
    (UTF-8 when none is given). Undecodable bytes are replaced rather than
    failing the whole page, and an unknown charset label falls back to UTF-8.

    Raises:
        urllib.error.HTTPError: on a non-retryable status, or when retries
            are exhausted on a retryable one.
        urllib.error.URLError, TimeoutError: when retries are exhausted.
    """
    req = urllib.request.Request(url, headers={
        "User-Agent": USER_AGENT,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "de-AT,de;q=0.9,en;q=0.5",
    })
    for attempt in range(retries + 1):
        try:
            with urllib.request.urlopen(req, timeout=30, context=_ssl_ctx) as resp:
                charset = resp.headers.get_content_charset() or "utf-8"
                raw = resp.read()
        except urllib.error.HTTPError as err:
            # HTTPError is a URLError subclass, so it must be handled first.
            retryable = err.code >= 500 or err.code in (408, 429)
            if not retryable or attempt >= retries:
                raise
        except (urllib.error.URLError, TimeoutError):
            if attempt >= retries:
                raise
        else:
            try:
                return raw.decode(charset, errors="replace")
            except LookupError:
                # The server announced a charset Python does not know.
                return raw.decode("utf-8", errors="replace")
        time.sleep(2)
    # Only reached when retries is negative and no attempt was made.
    return ""
def api_get(path: str):
    """Send an authenticated GET to HerbAPI and return the parsed JSON body.

    *path* is appended to API_BASE as-is, so it must start with "/" and may
    carry a query string. HTTP and network errors propagate to the caller.
    """
    endpoint = f"{API_BASE}{path}"
    request_headers = {
        "Authorization": f"Bearer {AUTH_TOKEN}",
        "Accept": "application/json",
    }
    request = urllib.request.Request(endpoint, headers=request_headers)
    with urllib.request.urlopen(request, timeout=15) as response:
        raw_body = response.read()
    return json.loads(raw_body)
def api_post(path: str, data: dict):
    """Send *data* as a JSON POST to HerbAPI and return the parsed JSON reply.

    *path* is appended to API_BASE as-is. When the API answers with an HTTP
    error status, the first 500 characters of the error body are printed and
    the HTTPError is re-raised for the caller to handle.
    """
    request_headers = {
        "Authorization": f"Bearer {AUTH_TOKEN}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    }
    request = urllib.request.Request(
        f"{API_BASE}{path}",
        data=json.dumps(data).encode("utf-8"),
        headers=request_headers,
        method="POST",
    )
    try:
        with urllib.request.urlopen(request, timeout=15) as response:
            raw_reply = response.read()
    except urllib.error.HTTPError as e:
        error_body = e.read().decode("utf-8", errors="replace")
        print(f" API ERROR {e.code}: {error_body[:500]}")
        raise
    return json.loads(raw_reply)
# ── Species matching ────────────────────────────────────────────────────────
def load_species() -> dict:
    """Fetch every species from the API, keyed by lower-cased scientific name.

    Pages of 100 entries are requested until the response's
    ``pagination.total_pages`` is reached. A response that is a bare list, or
    a dict without a "pagination" entry, is treated as the only page. When two
    species share a name, the one seen later wins.
    """
    species_by_name = {}
    page = 1
    while True:
        payload = api_get(f"/species?per_page=100&page={page}")
        is_envelope = isinstance(payload, dict)
        rows = payload.get("data", payload) if is_envelope else payload
        species_by_name.update(
            (row["name_scientific"].lower().strip(), row) for row in rows
        )
        if not (is_envelope and "pagination" in payload):
            break
        if page >= payload["pagination"].get("total_pages", 1):
            break
        page += 1
    return species_by_name
def match_species(latin_name: str, species_map: dict) -> Optional[dict]:
    """Look up the species record that best fits *latin_name*.

    *species_map* maps lower-cased scientific names to species dicts. The
    name is first stripped of a trailing author abbreviation (such as "L."
    or "Mill.") and of any "ssp."/"subsp."/"var." part. Lookups are then
    tried in this order: the cleaned name, its first two words, and finally
    the first species in the map that belongs to the same genus.

    Returns None for an empty name or when nothing matches.
    """
    if not latin_name:
        return None

    cleanup_patterns = (
        r'\s+L\.\s*$',                      # trailing Linnaeus citation
        r'\s+[A-Z][a-z]*\.\s*$',            # any other trailing author abbreviation
        r'\s+(?:ssp|subsp|var)\.\s+\S+',    # infraspecific rank and its name
    )
    cleaned = latin_name.strip()
    for pattern in cleanup_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    key = cleaned.lower().strip()
    words = key.split()

    candidates = [key]
    if len(words) >= 2:
        candidates.append(" ".join(words[:2]))
    for candidate in candidates:
        if candidate in species_map:
            return species_map[candidate]

    # Genus-only fallback: less reliable, but useful for Borago, etc.
    if not words:
        return None
    genus_prefix = words[0] + " "
    return next(
        (record for name, record in species_map.items() if name.startswith(genus_prefix)),
        None,
    )
# ── Product data extraction ─────────────────────────────────────────────────
@dataclass
class ProductData:
    """Data scraped from one product page.

    Instances are created and filled in by parse_product(). Numeric growing
    data stays None when the page text yields no usable value.
    """
    # Product name, taken from the JSON-LD "name" field.
    name: str = ""
    # Scientific name found on the page, or the category's default species.
    latin_name: str = ""
    # Product description, taken from the JSON-LD "description" field.
    description: str = ""
    # Article number, taken from the JSON-LD "model" field.
    sku: str = ""
    # URL of the product page this record was parsed from.
    url: str = ""
    # Defaults to True; the visible scraping code never sets it to False.
    is_organic: bool = True
    # Sowing depth in cm; the mean of both ends when the page gives a range.
    sowing_depth_cm: Optional[float] = None
    # Spacing between rows in cm (first value of a "ROW x PLANT cm" figure).
    row_spacing_cm: Optional[float] = None
    # Spacing between plants in cm (second value of a "ROW x PLANT cm" figure).
    plant_spacing_cm: Optional[float] = None
    # Germination temperature in degrees Celsius; the mean of a range.
    germination_temp_c: Optional[float] = None
    # True when the page text contains one of the perennial keywords.
    perennial: bool = False
def parse_product(html: str, url: str, default_species: Optional[str] = None) -> Optional[ProductData]:
    """Parse one page of the shop into a ProductData record.

    A page counts as a product page only when it embeds a JSON-LD Product
    object. Name, description and SKU come from that object; the Latin name
    and the growing data (sowing depth, spacing, germination temperature,
    perennial flag) are extracted heuristically from the page text.

    Args:
        html: Raw HTML of the page.
        url: URL the page was fetched from; stored on the result.
        default_species: Latin name to use when none is found on the page.

    Returns:
        The populated ProductData, or None if this is not a product page.
    """
    jsonld = extract_jsonld_product(html)
    if not jsonld:
        return None  # Not a product page

    product = ProductData(url=url)

    # ── From JSON-LD ──
    product.name = _jsonld_text(jsonld.get("name"))
    product.description = _jsonld_text(jsonld.get("description"))
    product.sku = _jsonld_text(jsonld.get("model"))

    # ── Extract full text for pattern matching ──
    extractor = TextExtractor()
    extractor.feed(html)
    full_text = " ".join(extractor.parts)

    product.latin_name = _find_latin_name(full_text, html) or default_species or ""
    product.sowing_depth_cm = _find_sowing_depth(full_text, html)
    product.row_spacing_cm, product.plant_spacing_cm = _find_spacing(full_text)
    product.germination_temp_c = _find_germination_temp(full_text)
    product.perennial = _PERENNIAL_PATTERN.search(full_text) is not None
    return product


# Separator between the two ends of a numeric range. German texts usually
# typeset ranges with an en dash ("0,5–1 cm"), so the ASCII hyphen alone is
# not enough. Written as escapes to keep ambiguous characters out of the file.
_RANGE_SEP = r'[-\u2013\u2014]'
# A number with an optional decimal part, using "." or "," as the separator.
_DECIMAL = r'(\d+(?:[.,]\d+)?)'
_DEPTH_LABEL = r'(?:Saattiefe|Aussaattiefe|Ablagetiefe)'

# Tried in order against the visible text; the first match wins.
_DEPTH_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        # labelled range, e.g. "Saattiefe: ca. 0,5-1 cm"
        rf'{_DEPTH_LABEL}[:\s]*(?:ca\.?\s*)?{_DECIMAL}\s*{_RANGE_SEP}\s*{_DECIMAL}\s*cm',
        # labelled single value, e.g. "Saattiefe: 2 cm"
        rf'{_DEPTH_LABEL}[:\s]*(?:ca\.?\s*)?{_DECIMAL}\s*cm',
        # unlabelled range followed by "tief"/"Tiefe", e.g. "2-3 cm tief"
        rf'{_DECIMAL}\s*{_RANGE_SEP}\s*{_DECIMAL}\s*cm\s+(?:tief|Tiefe)',
    )
)
# Fallback for the raw HTML: a depth keyword followed within 30 non-digit
# characters (which may be markup) by a range such as "0,5-1 cm".
_DEPTH_HTML_PATTERN = re.compile(
    r'(?:Saattiefe|Ablagetiefe|Aussaattiefe|Saatgutablage)\D{0,30}?'
    + _DECIMAL + r'\s*' + _RANGE_SEP + r'\s*' + _DECIMAL + r'\s*cm',
    re.IGNORECASE,
)

# "ROW x PLANT cm" figures; \u00d7 is the multiplication sign.
_SPACING_PATTERNS = (
    # "30-40 x 2-4 cm" (range x range)
    re.compile(
        rf'(\d+)\s*{_RANGE_SEP}\s*(\d+)\s*[x\u00d7]\s*(\d+)\s*{_RANGE_SEP}\s*(\d+)\s*cm',
        re.IGNORECASE,
    ),
    # "100 x 50 cm" (simple)
    re.compile(rf'{_DECIMAL}\s*[x\u00d7]\s*{_DECIMAL}\s*cm', re.IGNORECASE),
)

_TEMP_LABEL = r'(?:Keimtemperatur|Keimtemp)'
# Tried in order; a match outside the plausible range is discarded and the
# next pattern is tried. \u00b0 is the degree sign.
_TEMP_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        # labelled range, e.g. "Keimtemperatur: 20-25 °C"
        rf'{_TEMP_LABEL}[.:\s]*(?:ca\.?\s*)?(\d+)\s*{_RANGE_SEP}\s*(\d+)\s*\u00b0?\s*C',
        # labelled single value, e.g. "Keimtemperatur: 20 °C"
        rf'{_TEMP_LABEL}[.:\s]*(?:ca\.?\s*)?(\d+)\s*\u00b0\s*C',
        # unlabelled range, e.g. "18 bis 22 °C". A separator is mandatory:
        # without it "28 °C" would be split into the two "numbers" 2 and 8.
        rf'(\d+)\s*(?:{_RANGE_SEP}|bis|und)\s*(\d+)\s*\u00b0\s*C',
        # lower bound, e.g. "mindestens 15 °C"
        r'(?:mindestens|mind\.)\s*(\d+)\s*\u00b0\s*C',
    )
)

# Keywords taken as a sign that the plant is perennial (\u00e4 is a-umlaut).
# NOTE(review): a plain keyword search also fires on negated phrases such as
# "nicht winterhart" - confirm this is acceptable for the shop's texts.
_PERENNIAL_PATTERN = re.compile(
    r'mehrj[a\u00e4]hrig|winterhart|ausdauernd|Halbstrauch|Staude',
    re.IGNORECASE,
)


def _jsonld_text(value) -> str:
    """Return a JSON-LD field as stripped text; '' for missing or non-scalar values."""
    if isinstance(value, str):
        return value.strip()
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return str(value)  # e.g. a purely numeric article number
    return ""


def _to_float(number: str) -> float:
    """Convert a number written with '.' or ',' as decimal separator."""
    return float(number.replace(",", "."))


def _find_latin_name(full_text: str, html: str) -> str:
    """Return the first Latin name in the visible text or in <i>/<em> markup, else ''."""
    match = LATIN_PATTERN.search(full_text)
    if match:
        return match.group(1).strip()
    # Also check <i>/<em> tags in HTML
    for italic in re.finditer(r'<(?:i|em)[^>]*>(.*?)</(?:i|em)>', html, re.IGNORECASE | re.DOTALL):
        inner_text = re.sub(r'<[^>]+>', '', italic.group(1)).strip()
        match = LATIN_PATTERN.search(inner_text)
        if match:
            return match.group(1).strip()
    return ""


def _find_sowing_depth(full_text: str, html: str) -> Optional[float]:
    """Return the sowing depth in cm (mean of a range), or None if not stated."""
    for pattern in _DEPTH_PATTERNS:
        match = pattern.search(full_text)
        if match:
            values = [_to_float(group) for group in match.groups()]
            return sum(values) / len(values)
    match = _DEPTH_HTML_PATTERN.search(html)
    if match:
        return (_to_float(match.group(1)) + _to_float(match.group(2))) / 2
    return None


def _find_spacing(full_text: str):
    """Return (row_spacing_cm, plant_spacing_cm), or (None, None) if not stated."""
    for pattern in _SPACING_PATTERNS:
        found = pattern.findall(full_text)
        if not found:
            continue
        # Prefer the last match (often the more relevant outdoor spacing)
        values = [_to_float(group) for group in found[-1]]
        if len(values) == 4:
            return (values[0] + values[1]) / 2, (values[2] + values[3]) / 2
        return values[0], values[1]
    return None, None


def _find_germination_temp(full_text: str) -> Optional[float]:
    """Return the germination temperature in degrees Celsius (mean of a range), or None."""
    for pattern in _TEMP_PATTERNS:
        match = pattern.search(full_text)
        if not match:
            continue
        values = [float(group) for group in match.groups()]
        mean = sum(values) / len(values)
        # Sanity check: only values from 5 to 40 degrees are accepted.
        if 5 <= mean <= 40:
            return mean
    return None
# ── Recursive product discovery ─────────────────────────────────────────────
def discover_products(
    category_url: str,
    default_species: Optional[str],
    max_depth: int = 3,
    _depth: int = 0,
    _visited: set = None,
) -> list[ProductData]:
    """Walk a category tree and return the products found below it.

    The page at *category_url* is fetched. If it parses as a product page it
    is returned as a single-element list. Otherwise every link that points
    exactly one path level below the current URL (on www.reinsaat.at or
    host-relative) is followed recursively, up to *max_depth* levels.

    *default_species* is handed on to parse_product() for every page.
    *_depth* and *_visited* carry the recursion state; passing a shared
    *_visited* set keeps pages from being fetched twice across calls.
    Pages that fail to load are reported on stdout and yield no products.
    """
    visited = set() if _visited is None else _visited
    if _depth > max_depth or category_url in visited:
        return []
    visited.add(category_url)

    indent = " " * (_depth + 1)
    print(f"{indent}Fetching: {category_url}")
    try:
        html = fetch_url(category_url)
        time.sleep(DELAY)
    except Exception as e:
        print(f"{indent} ERROR: {e}")
        return []

    # A page carrying product data is a leaf: no need to look at its links.
    product = parse_product(html, category_url, default_species)
    if product:
        return [product]

    # Category/subcategory page: collect the links to its direct children.
    path_prefix = urllib.parse.urlparse(category_url).path.rstrip("/") + "/"
    ordered_children = {}  # dict keys double as an order-preserving set
    for link in extract_links(html, category_url):
        target = urllib.parse.urlparse(link)
        if target.netloc not in ("", "www.reinsaat.at"):
            continue
        target_path = target.path.rstrip("/")
        if not target_path.startswith(path_prefix):
            continue
        remainder = target_path[len(path_prefix):]
        # Exactly one level deeper: non-empty and without further slashes.
        if not remainder or "/" in remainder:
            continue
        child_url = f"https://www.reinsaat.at{target_path}/"
        if child_url not in visited:
            ordered_children[child_url] = None
    child_links = list(ordered_children)
    print(f"{indent} Found {len(child_links)} child links")

    products = []
    for child_url in child_links:
        products.extend(
            discover_products(child_url, default_species, max_depth, _depth + 1, visited)
        )
    return products
# ── Main ────────────────────────────────────────────────────────────────────
def main():
    """Run the full scrape: load API state, crawl Reinsaat, create cultivars.

    Steps:
    1. Load all species from the API.
    2. Load all existing cultivars, to avoid creating duplicates.
    3. Crawl every entry of CATEGORIES and collect the product pages.
    4. For each product with a matching species, create a cultivar (unless
       one with the same species and name exists) and link it to the
       Reinsaat supplier.

    Progress and a final summary are printed to stdout. A failure on one
    product is reported and does not stop the run.
    """
    print("=" * 70)
    print("Reinsaat Scraper -> HerbAPI")
    print("=" * 70)

    # Load species
    print("\n[1] Loading species from API...")
    species_map = load_species()
    sci_names = [k for k in species_map if " " in k]
    print(f" {len(sci_names)} species loaded:")
    for k in sorted(sci_names):
        s = species_map[k]
        print(f" {s['name_scientific']:40s} {s['id'][:12]}...")

    # Load existing cultivars
    print("\n[2] Loading existing cultivars...")
    existing_cultivars = _load_existing_cultivars()
    print(f" {len(existing_cultivars)} existing cultivars")

    # Discover products from all categories
    print("\n[3] Discovering products from Reinsaat categories...")
    all_products = []
    visited = set()  # shared across categories so no page is fetched twice
    for cat_url, species_hint in CATEGORIES:
        print(f"\n Category: {cat_url}")
        products = discover_products(cat_url, species_hint, max_depth=3, _visited=visited)
        all_products.extend(products)
        print(f" -> {len(products)} products from this category")
    print(f"\n Total products discovered: {len(all_products)}")

    # Deduplicate by URL, keeping the first occurrence
    seen_urls = set()
    unique_products = []
    for p in all_products:
        if p.url not in seen_urls:
            seen_urls.add(p.url)
            unique_products.append(p)
    all_products = unique_products
    print(f" Unique products: {len(all_products)}")

    # Process products
    print("\n[4] Creating cultivars in API...")
    stats = {"created": 0, "skipped_no_species": 0, "skipped_exists": 0, "errors": 0, "linked": 0}
    for i, product in enumerate(all_products):
        pct = (i + 1) / len(all_products) * 100
        print(f"\n [{i+1}/{len(all_products)}] ({pct:.0f}%) {product.name}")

        # Match species
        species = match_species(product.latin_name, species_map)
        if not species:
            print(f" Skip: no species match for '{product.latin_name}'")
            stats["skipped_no_species"] += 1
            continue
        species_id = species["id"]
        print(f" Species: {species['name_scientific']}")
        print(f" SKU: {product.sku}, Depth: {product.sowing_depth_cm}, "
              f"Spacing: {product.row_spacing_cm}x{product.plant_spacing_cm}, "
              f"Temp: {product.germination_temp_c}, Perennial: {product.perennial}")

        # Check duplicates
        key = (species_id, product.name.lower())
        if key in existing_cultivars:
            # Still try to link supplier if cultivar exists
            cultivar_id = existing_cultivars[key]
            print(f" Exists: {cultivar_id[:12]}... - checking supplier link")
            try:
                _link_supplier(cultivar_id, product)
                stats["linked"] += 1
            except Exception as e:
                # Best effort: the link may already exist. Report the reason
                # instead of hiding it, but do not count it as an error.
                print(f" Supplier link not created: {e}")
            stats["skipped_exists"] += 1
            continue

        # Create cultivar
        try:
            result = api_post("/cultivars", _build_cultivar_payload(species_id, product))
            cultivar_id = result["id"]
            print(f" Created: {cultivar_id}")
            stats["created"] += 1
            # Remember it so a later product with the same name is not re-created.
            existing_cultivars[key] = cultivar_id
        except Exception as e:
            print(f" FAILED to create: {e}")
            stats["errors"] += 1
            continue

        # Link to supplier
        try:
            _link_supplier(cultivar_id, product)
            stats["linked"] += 1
        except Exception as e:
            print(f" FAILED to link supplier: {e}")

    # Summary
    print("\n" + "=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f" Created: {stats['created']}")
    print(f" Linked to supplier: {stats['linked']}")
    print(f" Skipped (no species): {stats['skipped_no_species']}")
    print(f" Skipped (exists): {stats['skipped_exists']}")
    print(f" Errors: {stats['errors']}")
    print("=" * 70)


def _load_existing_cultivars() -> dict:
    """Fetch all cultivars; return {(species_id, lower-cased name): cultivar_id}."""
    existing = {}
    page = 1
    while True:
        data = api_get(f"/cultivars?per_page=100&page={page}")
        clist = data.get("data", data) if isinstance(data, dict) else data
        if not clist:
            break
        for c in clist:
            existing[(c["species_id"], c["name"].lower())] = c["id"]
        # Check pagination - API uses {data, total, page, per_page} format
        if not isinstance(data, dict):
            break
        total = data.get("total", len(clist))
        per_page = data.get("per_page", 100)
        if page * per_page >= total:
            break
        page += 1
    return existing


def _build_cultivar_payload(species_id, product: "ProductData") -> dict:
    """Build the POST /cultivars body; growing data is included only when known."""
    payload = {
        "species_id": species_id,
        "name": product.name,
        "name_de": product.name,
        "name_en": "",
        "description": product.description,
        "is_organic": product.is_organic,
        "perennial": product.perennial,
    }
    if product.sowing_depth_cm is not None:
        payload["planting_depth_cm"] = round(product.sowing_depth_cm, 2)
    if product.row_spacing_cm is not None:
        payload["row_spacing_cm"] = round(product.row_spacing_cm, 1)
    if product.plant_spacing_cm is not None:
        payload["plant_spacing_cm"] = round(product.plant_spacing_cm, 1)
    if product.germination_temp_c is not None:
        payload["germination_temp_c"] = round(product.germination_temp_c, 1)
    return payload


def _link_supplier(cultivar_id: str, product: "ProductData") -> None:
    """Link a cultivar to the Reinsaat supplier; raises whatever api_post raises."""
    api_post(f"/cultivars/{cultivar_id}/suppliers", {
        "supplier_id": REINSAAT_SUPPLIER_ID,
        "product_url": product.url,
        "article_number": product.sku,
    })
    print(f" Linked to Reinsaat (SKU: {product.sku})")


if __name__ == "__main__":
    main()