Files
herbapi/tools/scrapers/scrape_arche_noah.py

515 lines
18 KiB
Python

#!/usr/bin/env python3
"""
Scrape Arche Noah seed catalog and import cultivars into HerbAPI.
Uses the shop.arche-noah.at Angular SPA's backend API (ACM) to fetch
product listings and details, then creates cultivars in HerbAPI matched
to existing species.
"""
import json
import os
import re
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from datetime import datetime, timezone
# --- Configuration -----------------------------------------------------------
HERBAPI_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
# NOTE(security): a bearer token committed to source control should be treated
# as leaked. Provide HERBAPI_TOKEN through the environment; the literal below
# is kept only as a fallback so existing invocations keep working until the
# token has been rotated and the fallback can be removed.
HERBAPI_TOKEN = (
    os.environ.get("HERBAPI_TOKEN")
    or "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
)
SHOP_BASE = "https://shop.arche-noah.at/ACM/api/"
SHOP_UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
REQUEST_DELAY = 0.5  # seconds between requests
# Only import products from these Arche Noah article lines (their own seeds)
ARCHE_NOAH_LINES = {
    "Bio-Saatgut von ARCHE NOAH",
    "Kostbarkeiten aus dem ARCHE NOAH Samenarchiv",
}
# Search terms to discover all seed products across the shop.
# Each term is listed once; results from different terms are merged by the
# product "sid", so overlapping terms are harmless but duplicates are wasted.
SEARCH_TERMS = [
    "Tomate", "Paradeiser", "Paprika", "Chili", "Gurke", "Kürbis", "Zucchini",
    "Bohne", "Erbse", "Fisole", "Salat", "Kohl", "Kraut", "Melanzani", "Aubergine",
    "Mais", "Zwiebel", "Lauch", "Karotte", "Rübe", "Basilikum", "Kräuter",
    "Blume", "Sonnenblume", "Dill", "Petersilie", "Spinat", "Mangold",
    "Melone", "Fenchel", "Sellerie", "Rettich", "Radieschen",
    "Koriander", "Oregano", "Thymian", "Salbei", "Rosmarin", "Minze",
    "Ringelblume", "Kornblume", "Kapuzinerkresse", "Senf",
    "Erdbeere", "Lupine", "Luzerne", "Klee", "Mohn",
    "Radicchio", "Rucola", "Endivie", "Artischocke", "Pastinake",
    "Schnittlauch", "Knoblauch", "Bärlauch", "Wermut",
    "Baldrian", "Johanniskraut", "Sonnenhut", "Beinwell",
    "Studentenblume", "Tagetes", "Phacelia", "Buchweizen",
    "Rote Bete", "Rote Rübe", "Melde",
    "Kohlrabi", "Brokkoli", "Blumenkohl", "Rosenkohl", "Wirsing",
    "Pflücksalat", "Kopfsalat", "Feldsalat", "Asiasalat",
    "Zuckermais", "Popcorn",
]
# --- Helpers -----------------------------------------------------------------
def herbapi_request(method, path, data=None):
    """Send a JSON request to HerbAPI and return the decoded response.

    Args:
        method: HTTP verb, e.g. ``"GET"`` or ``"POST"``.
        path: Path appended to ``HERBAPI_BASE`` after a slash; may carry a
            query string.
        data: Optional JSON-serialisable payload. Any value other than
            ``None`` is sent as the request body, including empty containers.

    Returns:
        The parsed JSON document, or ``None`` when the response body is
        empty or whitespace only.

    Raises:
        urllib.error.HTTPError: Re-raised after the status code and the first
            200 characters of the error body have been written to stderr.
    """
    url = f"{HERBAPI_BASE}/{path}"
    # Compare against None so an explicitly empty payload ({} / []) is still sent.
    payload = json.dumps(data).encode() if data is not None else None
    req = urllib.request.Request(url, data=payload, method=method, headers={
        "Authorization": f"Bearer {HERBAPI_TOKEN}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    })
    try:
        # The context manager closes the connection even if reading fails.
        with urllib.request.urlopen(req, timeout=30) as resp:
            raw = resp.read().decode("utf-8")
    except urllib.error.HTTPError as e:
        error_body = e.read().decode("utf-8", errors="replace")
        print(f" HerbAPI {method} {path}: HTTP {e.code} - {error_body[:200]}", file=sys.stderr)
        raise
    return json.loads(raw) if raw.strip() else None
def shop_create_session():
    """Create an anonymous session on the Arche Noah shop.

    POSTs an empty JSON object to ``webshop/createanonymoususer`` and reads
    the ``JSESSIONID`` cookie from the response.

    Returns:
        The JSESSIONID value as a string.

    Raises:
        RuntimeError: If no Set-Cookie header carries a non-empty JSESSIONID.
    """
    req = urllib.request.Request(
        SHOP_BASE + "webshop/createanonymoususer",
        data=json.dumps({}).encode(),
        headers={
            "User-Agent": SHOP_UA,
            "Content-Type": "application/json",
            "Origin": "https://shop.arche-noah.at",
            "Referer": "https://shop.arche-noah.at/",
        },
    )
    with urllib.request.urlopen(req, timeout=15) as resp:
        # A response may carry several Set-Cookie headers; .get() would only
        # return the first one, so inspect all of them.
        set_cookie_headers = resp.headers.get_all("Set-Cookie") or []
    for header in set_cookie_headers:
        found = re.search(r"JSESSIONID=([^;]*)", header)
        if found and found.group(1):
            return found.group(1)
    raise RuntimeError("Failed to get shop session")
def shop_request(session, endpoint, payload):
    """POST a JSON payload to the shop API and return the decoded response.

    Args:
        session: JSESSIONID value sent back in the ``Cookie`` header.
        endpoint: Path appended to ``SHOP_BASE``.
        payload: JSON-serialisable request body.

    Returns:
        The parsed JSON document, or ``None`` when the response body is
        empty or whitespace only.

    Errors raised by ``urllib.request.urlopen`` or by JSON decoding are not
    caught here and propagate to the caller.
    """
    req = urllib.request.Request(
        SHOP_BASE + endpoint,
        data=json.dumps(payload).encode(),
        headers={
            "User-Agent": SHOP_UA,
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Cookie": f"JSESSIONID={session}",
            "Origin": "https://shop.arche-noah.at",
            "Referer": "https://shop.arche-noah.at/",
        },
    )
    # The context manager closes the connection; this function is called once
    # per search page and once per product, so leaked sockets would add up.
    with urllib.request.urlopen(req, timeout=30) as resp:
        raw = resp.read().decode("utf-8")
    return json.loads(raw) if raw.strip() else None
def extract_latin_name(detail_headline3):
    """Pull the botanical name out of a product's ``detailHeadline3`` value.

    HTML tags are stripped and everything from the phrase "Hier geht" onwards
    is discarded. The remainder is returned only if it starts like a binomial
    (capitalised word, space, lowercase letter), e.g. "Solanum lycopersicum";
    otherwise, and for empty input, ``None`` is returned.
    """
    if not detail_headline3:
        return None
    without_markup = re.sub(r"<[^>]+>", "", detail_headline3).strip()
    # Drop the trailing "Hier geht es zu unseren..." sentence, if present.
    candidate = without_markup.partition("Hier geht")[0].strip()
    looks_botanical = re.match(r"^[A-Z][a-z]+ [a-z]", candidate) is not None
    return candidate if looks_botanical else None
def match_species(latin_name, species_by_scientific):
    """Look up a species record for a Latin name, tolerating rank suffixes.

    The name is trimmed and lowercased and tried as-is first. If that finds
    nothing, only the leading "genus species" pair is tried, so that
    "Phaseolus vulgaris var. nanus" resolves to "Phaseolus vulgaris"
    (likewise for "subsp.", "convar.", "f.", "ssp." qualifiers).

    ``species_by_scientific`` maps lowercase scientific names to species
    records. Returns the first truthy record found, otherwise ``None``.
    """
    if not latin_name:
        return None
    wanted = latin_name.strip().lower()
    candidates = [wanted]
    # Second attempt: the bare binomial without any infraspecific qualifier.
    binomial = re.match(r"^([A-Za-z]+ [a-z]+)", wanted)
    if binomial:
        candidates.append(binomial.group(1).strip())
    for key in candidates:
        hit = species_by_scientific.get(key)
        if hit:
            return hit
    return None
def extract_cultivar_name(product_name):
    """Derive the cultivar (variety) name from a shop product name.

    Examples:
        "Salatparadeiser 'Naama' HG026"                 -> "Naama"
        "Cocktailparadeiser 'Golden Perfection' TO019"  -> "Golden Perfection"
        "Buschbohne 'Marmorierter Mond' HG055"          -> "Marmorierter Mond"

    When the name contains no quoted part, the whole product name is returned
    with only a trailing article number (such as HG026) removed; the crop-type
    prefix is kept.
    """
    # Preferred: the text between single quotes of any common style
    # (straight, typographic, backtick, acute accent).
    quoted = re.search(r"['\u2018\u2019`\u00b4]+([^'\u2018\u2019`\u00b4]+)['\u2018\u2019`\u00b4]+", product_name)
    if quoted is not None:
        return quoted.group(1).strip()
    # Fallback: strip a trailing article number like HG026 or TO019.
    return re.sub(r"\s+[A-Z]{1,3}\d{2,4}\s*$", "", product_name).strip()
# Amount (optionally a range, optionally with a decimal comma) followed by a
# unit that must start with a letter. Requiring a letter prevents the regex
# engine from backtracking into the number and reporting a digit as the unit.
_PACK_INFO_RE = re.compile(
    r"\s*(\d+(?:,\d+)?)"                 # amount / lower bound of a range
    r"(?:\s*[-\u2013]\s*\d+(?:,\d+)?)?"  # optional upper bound ("-" or en dash)
    r"\s*([^\W\d_]\w*)"                  # unit, e.g. "Korn" or "g"
)


def parse_pack_info(unit_desc):
    """Parse a pack size description such as '20-30 Korn' or '2g'.

    For a range the lower bound is used. A decimal comma is accepted
    ('0,5 g' -> 0.5). The description must begin with the amount and must
    name a unit.

    Args:
        unit_desc: The raw description string; may be empty or ``None``.

    Returns:
        A ``(pack_size, pack_unit)`` tuple with ``pack_size`` as float and
        ``pack_unit`` as string, or ``(None, None)`` when the description is
        missing or does not have the expected "amount unit" shape.
    """
    if not unit_desc:
        return None, None
    parsed = _PACK_INFO_RE.match(unit_desc)
    if parsed is None:
        return None, None
    amount, unit = parsed.groups()
    return float(amount.replace(",", ".")), unit
# --- Main scraping logic -----------------------------------------------------
def fetch_all_arche_noah_products(session):
    """Search the shop API to find all Arche Noah seed products.

    Every entry of ``SEARCH_TERMS`` (deduplicated case-insensitively) is sent
    to ``webshop/getproducts`` and paged through until a page comes back
    empty or shorter than the page size. Results are merged by product
    "sid"; the first occurrence of a sid wins. A failing request is reported
    on stderr and ends the paging for that term only.

    Args:
        session: JSESSIONID value of an anonymous shop session.

    Returns:
        Dict mapping sid to product record, restricted to products whose
        ``articleLineDesc`` is one of ``ARCHE_NOAH_LINES``.
    """
    # Single source of truth for both the request and the last-page test.
    page_size = 200
    all_products = {}
    seen_terms = set()
    for term in SEARCH_TERMS:
        term_key = term.lower()
        if term_key in seen_terms:
            continue
        seen_terms.add(term_key)
        offset = 0
        while True:
            payload = {
                "searchCriteria": term,
                "startIndex": offset,
                "numDataSets": page_size,
                "allowAllProducts": False,
            }
            try:
                data = shop_request(session, "webshop/getproducts", payload)
            except Exception as e:
                print(f" Search '{term}' offset={offset} failed: {e}", file=sys.stderr)
                break
            if not data:
                break
            for p in data:
                # Keep the record from the first search that returned this sid.
                all_products.setdefault(p["sid"], p)
            if len(data) < page_size:
                break
            offset += len(data)
            time.sleep(REQUEST_DELAY)
        time.sleep(REQUEST_DELAY)
    # Filter to Arche Noah's own seed products only
    an_products = {
        sid: p for sid, p in all_products.items()
        if (p.get("articleLineDesc") or "") in ARCHE_NOAH_LINES
    }
    print(f"Found {len(all_products)} total products, {len(an_products)} Arche Noah seed products")
    return an_products
def fetch_product_details(session, products):
    """Request the detail record of every product from the shop.

    Calls ``webshop/getproductdetail`` once per sid in ``products``, pausing
    ``REQUEST_DELAY`` seconds after each call. Failures are reported on
    stderr and the product is simply left out of the result. Progress is
    printed after every 20th product.

    Returns:
        Dict mapping sid to the (truthy) detail record returned by the shop.
    """
    details = {}
    total = len(products)
    for fetched, (sid, _product) in enumerate(products.items(), start=1):
        try:
            detail = shop_request(session, "webshop/getproductdetail", {"productSid": sid})
        except Exception as e:
            print(f" Detail for {sid} failed: {e}", file=sys.stderr)
        else:
            if detail:
                details[sid] = detail
        if fetched % 20 == 0:
            print(f" Fetched details: {fetched}/{total}")
        time.sleep(REQUEST_DELAY)
    print(f"Fetched {len(details)} product details")
    return details
def load_herbapi_species():
    """Fetch every species from HerbAPI, following pagination.

    Pages of 100 are requested until the number of collected records reaches
    the reported ``total`` or a page comes back empty. A bare list response
    is treated as the complete result; any other response shape stops the
    loop.

    Returns:
        ``(species_list, by_scientific)`` where ``by_scientific`` maps the
        trimmed, lowercased ``name_scientific`` to the species record.
    """
    species_list = []
    page = 1
    while True:
        result = herbapi_request("GET", f"species?per_page=100&page={page}")
        if isinstance(result, dict) and "data" in result:
            batch, total = result["data"], result.get("total", 0)
        elif isinstance(result, list):
            batch, total = result, len(result)
        else:
            break
        species_list.extend(batch)
        if len(species_list) >= total or not batch:
            break
        page += 1
    # Lookup keyed by normalized (trimmed, lowercase) scientific name;
    # a later record with the same name replaces an earlier one.
    by_scientific = {
        record["name_scientific"].strip().lower(): record
        for record in species_list
    }
    return species_list, by_scientific
def load_herbapi_cultivars():
    """Fetch every existing cultivar from HerbAPI, 100 records per page.

    Paging continues until the collected count reaches the reported
    ``total`` or a page is empty. A bare list response counts as the full
    result; an unexpected response shape ends the loop.

    Returns:
        ``(all_cultivars, by_key)`` where ``by_key`` maps
        ``(species_id, trimmed lowercase name)`` to the cultivar record.
    """
    all_cultivars = []
    page = 1
    while True:
        result = herbapi_request("GET", f"cultivars?per_page=100&page={page}")
        if isinstance(result, dict) and "data" in result:
            batch, total = result["data"], result.get("total", 0)
        elif isinstance(result, list):
            batch, total = result, len(result)
        else:
            break
        all_cultivars.extend(batch)
        if len(all_cultivars) >= total or not batch:
            break
        page += 1
    # Lookup keyed by (species_id, normalized cultivar name);
    # a later record with the same key replaces an earlier one.
    by_key = {
        (cultivar["species_id"], cultivar["name"].strip().lower()): cultivar
        for cultivar in all_cultivars
    }
    return all_cultivars, by_key
def ensure_supplier():
    """Return the ID of the Arche Noah supplier, creating it when absent.

    The supplier listing is searched for the first entry whose name contains
    both "arche" and "noah" (case-insensitive). If none is found, a new
    supplier record is POSTed and its ID returned.
    """
    listing = herbapi_request("GET", "suppliers")
    if isinstance(listing, dict) and "data" in listing:
        listing = listing["data"]
    match = next(
        (
            entry for entry in listing
            if "arche" in entry["name"].lower() and "noah" in entry["name"].lower()
        ),
        None,
    )
    if match is not None:
        print(f"Supplier 'Arche Noah' already exists: {match['id']}")
        return match["id"]
    print("Creating supplier 'Arche Noah'...")
    new_supplier = {
        "name": "Arche Noah",
        "url": "https://www.arche-noah.at",
        "country": "AT",
        "is_organic": True,
        "is_demeter": False,
        "notes": "Austrian society for heritage seed preservation and biodiversity",
    }
    created = herbapi_request("POST", "suppliers", new_supplier)
    print(f"Created supplier: {created['id']}")
    return created["id"]
def load_existing_supplier_links(cultivar_id):
    """Load the existing supplier links of a cultivar (best effort).

    Args:
        cultivar_id: HerbAPI ID of the cultivar.

    Returns:
        The list of link records. An empty list is returned when the
        response has an unexpected shape or when the request fails; a
        failure is reported on stderr instead of being silently discarded,
        because callers cannot tell "no links" from "lookup failed".
    """
    try:
        result = herbapi_request("GET", f"cultivars/{cultivar_id}/suppliers")
    except Exception as exc:
        # Deliberately non-fatal: one failed lookup must not abort the import.
        print(
            f" Could not load supplier links for cultivar {cultivar_id}: {exc}",
            file=sys.stderr,
        )
        return []
    if isinstance(result, list):
        return result
    if isinstance(result, dict) and "data" in result:
        # Guard against a null "data" field so callers can always iterate.
        return result["data"] or []
    return []
def main():
    """Run the complete import: scrape the shop and mirror it into HerbAPI.

    Steps, as announced on stdout:
      1. ensure the Arche Noah supplier exists,
      2. load HerbAPI species for Latin-name matching,
      3. load existing cultivars so re-runs do not create duplicates,
      4. search the shop for Arche Noah seed products,
      5. fetch the product details (source of the Latin names),
      6. create missing cultivars and link each cultivar to the supplier.

    Products without a species match or without a usable cultivar name are
    skipped. Errors while creating a cultivar or a supplier link are reported
    on stderr and counted; they do not stop the run. A summary of all
    counters is printed at the end.
    """
    now_str = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    print("=== Arche Noah Seed Catalog Scraper ===")
    print(f"Started at {now_str}\n")
    # Step 1: Create Arche Noah supplier in HerbAPI
    print("[1/6] Ensuring Arche Noah supplier exists...")
    supplier_id = ensure_supplier()
    print()
    # Step 2: Load HerbAPI species for matching
    print("[2/6] Loading HerbAPI species...")
    species_list, species_by_scientific = load_herbapi_species()
    print(f"Loaded {len(species_list)} species")
    print()
    # Step 3: Load existing cultivars for idempotency
    print("[3/6] Loading existing cultivars...")
    existing_cultivars, cultivars_by_key = load_herbapi_cultivars()
    print(f"Loaded {len(existing_cultivars)} existing cultivars")
    print()
    # Step 4: Scrape Arche Noah shop
    print("[4/6] Scraping Arche Noah shop catalog...")
    session = shop_create_session()
    print("Got shop session")
    products = fetch_all_arche_noah_products(session)
    print()
    # Step 5: Fetch product details (to get Latin names)
    print("[5/6] Fetching product details for Latin name matching...")
    details = fetch_product_details(session, products)
    print()
    # Step 6: Create cultivars in HerbAPI
    print("[6/6] Creating cultivars in HerbAPI...")
    stats = {
        "created": 0,
        "skipped_existing": 0,
        "skipped_no_species": 0,
        "skipped_no_name": 0,
        "supplier_linked": 0,
        "supplier_link_existed": 0,
        "errors": 0,
    }
    for sid, product in sorted(products.items()):
        detail = details.get(sid, {})
        # The Latin name is only available in the detail record.
        latin_name = extract_latin_name(detail.get("detailHeadline3", ""))
        # Match to HerbAPI species (handles subspecies/variety suffixes)
        species = match_species(latin_name, species_by_scientific)
        if not species:
            print(f" SKIP (no species match): {product['name']} | latin={latin_name}")
            stats["skipped_no_species"] += 1
            continue
        cultivar_name = extract_cultivar_name(product["name"])
        if not cultivar_name:
            print(f" SKIP (no cultivar name): {product['name']}")
            # Counted separately so the species-mismatch figure stays accurate.
            stats["skipped_no_name"] += 1
            continue
        # The product URL is needed for both the cultivar and the supplier link.
        alias = product.get("alias") or detail.get("alias", "")
        product_url = f"https://shop.arche-noah.at/produkt/{alias}" if alias else None
        # Check if cultivar already exists (idempotency)
        lookup_key = (species["id"], cultivar_name.strip().lower())
        existing = cultivars_by_key.get(lookup_key)
        if existing:
            cultivar_id = existing["id"]
            stats["skipped_existing"] += 1
        else:
            # Only the "Bio-Saatgut" article line is flagged as organic.
            is_organic = product.get("articleLineDesc") == "Bio-Saatgut von ARCHE NOAH"
            cultivar_data = {
                "species_id": species["id"],
                "name": cultivar_name,
                "name_de": cultivar_name,
                "is_organic": is_organic,
                "source_urls": [product_url] if product_url else None,
            }
            try:
                result = herbapi_request("POST", "cultivars", cultivar_data)
                cultivar_id = result["id"]
                stats["created"] += 1
                # Add to lookup for idempotency within this run
                cultivars_by_key[lookup_key] = result
                print(f" CREATED: {cultivar_name} ({species['name_scientific']})")
            except Exception as e:
                print(f" ERROR creating '{cultivar_name}': {e}", file=sys.stderr)
                stats["errors"] += 1
                continue
        # Link cultivar to supplier
        existing_links = load_existing_supplier_links(cultivar_id)
        already_linked = any(
            link["supplier_id"] == supplier_id for link in existing_links
        )
        if already_linked:
            stats["supplier_link_existed"] += 1
        else:
            unit_desc = product.get("unitDesc") or detail.get("unitDesc", "")
            pack_size, pack_unit = parse_pack_info(unit_desc)
            # Price comes from the first price list position, if there is one.
            price = None
            price_list = product.get("priceListPos") or detail.get("priceListPos", [])
            if price_list:
                price = price_list[0].get("singleUnitPrice")
            link_data = {
                "supplier_id": supplier_id,
                "article_number": str(product.get("articleNr", "")),
                "product_url": product_url,
                "price_eur": price,
                "pack_size": pack_size,
                "pack_unit": pack_unit,
            }
            try:
                herbapi_request("POST", f"cultivars/{cultivar_id}/suppliers", link_data)
                stats["supplier_linked"] += 1
            except Exception as e:
                print(f" ERROR linking supplier for '{cultivar_name}': {e}", file=sys.stderr)
                stats["errors"] += 1
        time.sleep(0.1)  # small delay between HerbAPI calls
    # Summary
    print(f"\n{'='*60}")
    print("Scraping complete!")
    print(f" Cultivars created: {stats['created']}")
    print(f" Cultivars already existed: {stats['skipped_existing']}")
    print(f" Skipped (no species match): {stats['skipped_no_species']}")
    print(f" Skipped (no cultivar name): {stats['skipped_no_name']}")
    print(f" Supplier links created: {stats['supplier_linked']}")
    print(f" Supplier links existed: {stats['supplier_link_existed']}")
    print(f" Errors: {stats['errors']}")
# Run the import only when this file is executed as a script, so that
# importing it as a module has no side effects beyond the definitions above.
if __name__ == "__main__":
    main()