#!/usr/bin/env python3
"""
Scrape the Arche Noah seed catalog and import its cultivars into HerbAPI.

Uses the backend API (ACM) of the shop.arche-noah.at Angular SPA to fetch
product listings and details, then creates cultivars in HerbAPI matched
to existing species.
"""
|
|
|
|
import json
import os
import re
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from datetime import datetime, timezone
|
|
|
|
# --- Configuration -----------------------------------------------------------

HERBAPI_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
# SECURITY: a bearer token committed to source control should be treated as
# leaked. Prefer supplying it through the HERBAPI_TOKEN environment variable;
# the literal below is kept only so existing invocations keep working and
# should be rotated and removed.
HERBAPI_TOKEN = os.environ.get(
    "HERBAPI_TOKEN", "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
)

SHOP_BASE = "https://shop.arche-noah.at/ACM/api/"
SHOP_UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

REQUEST_DELAY = 0.5  # seconds between requests

# Only import products from these Arche Noah article lines (their own seeds)
ARCHE_NOAH_LINES = {
    "Bio-Saatgut von ARCHE NOAH",
    "Kostbarkeiten aus dem ARCHE NOAH Samenarchiv",
}

# Search terms to discover all seed products across the shop.
# Each term is listed once; the search loop de-duplicates case-insensitively.
SEARCH_TERMS = [
    "Tomate", "Paradeiser", "Paprika", "Chili", "Gurke", "Kürbis", "Zucchini",
    "Bohne", "Erbse", "Fisole", "Salat", "Kohl", "Kraut", "Melanzani", "Aubergine",
    "Mais", "Zwiebel", "Lauch", "Karotte", "Rübe", "Basilikum", "Kräuter",
    "Blume", "Sonnenblume", "Dill", "Petersilie", "Spinat", "Mangold",
    "Melone", "Fenchel", "Sellerie", "Rettich", "Radieschen",
    "Koriander", "Oregano", "Thymian", "Salbei", "Rosmarin", "Minze",
    "Ringelblume", "Kornblume", "Kapuzinerkresse", "Senf",
    "Erdbeere", "Lupine", "Luzerne", "Klee", "Mohn",
    "Radicchio", "Rucola", "Endivie", "Artischocke", "Pastinake",
    "Schnittlauch", "Knoblauch", "Bärlauch", "Wermut",
    "Baldrian", "Johanniskraut", "Sonnenhut", "Beinwell",
    "Studentenblume", "Tagetes", "Phacelia", "Buchweizen",
    "Rote Bete", "Rote Rübe", "Melde",
    "Kohlrabi", "Brokkoli", "Blumenkohl", "Rosenkohl", "Wirsing",
    "Pflücksalat", "Kopfsalat", "Feldsalat", "Asiasalat",
    "Zuckermais", "Popcorn",
]
|
|
|
|
# --- Helpers -----------------------------------------------------------------
|
|
|
|
def herbapi_request(method, path, data=None):
    """Send a JSON request to HerbAPI and return the decoded response.

    Args:
        method: HTTP verb, e.g. "GET" or "POST".
        path: Resource path appended to HERBAPI_BASE (no leading slash).
        data: Optional JSON-serialisable payload. ``None`` sends no body.

    Returns:
        The parsed JSON response, or ``None`` when the response body is empty.

    Raises:
        urllib.error.HTTPError: On an HTTP error status, after the status
            code and the first 200 characters of the error body have been
            written to stderr.
    """
    url = f"{HERBAPI_BASE}/{path}"
    # Compare against None so that legitimately empty payloads ({} or [])
    # are still transmitted instead of being silently dropped.
    body = json.dumps(data).encode() if data is not None else None
    req = urllib.request.Request(url, data=body, method=method, headers={
        "Authorization": f"Bearer {HERBAPI_TOKEN}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    })
    try:
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(req, timeout=30) as resp:
            raw = resp.read().decode("utf-8")
    except urllib.error.HTTPError as e:
        err_body = e.read().decode("utf-8", errors="replace")
        print(f" HerbAPI {method} {path}: HTTP {e.code} - {err_body[:200]}", file=sys.stderr)
        raise
    return json.loads(raw) if raw.strip() else None
|
|
|
|
|
|
def shop_create_session():
    """Create an anonymous session on the Arche Noah shop.

    Returns:
        The JSESSIONID value taken from the response's Set-Cookie headers.

    Raises:
        RuntimeError: If no Set-Cookie header carries a JSESSIONID value.
    """
    req = urllib.request.Request(
        SHOP_BASE + "webshop/createanonymoususer",
        data=json.dumps({}).encode(),
        headers={
            "User-Agent": SHOP_UA,
            "Content-Type": "application/json",
            "Origin": "https://shop.arche-noah.at",
            "Referer": "https://shop.arche-noah.at/",
        },
    )
    with urllib.request.urlopen(req, timeout=15) as resp:
        # A response may carry several Set-Cookie headers; .get() would only
        # return the first one, so inspect all of them.
        cookies = resp.headers.get_all("Set-Cookie") or []

    for cookie in cookies:
        m = re.search(r"JSESSIONID=([^;]+)", cookie)
        if m:
            return m.group(1)
    raise RuntimeError("Failed to get shop session")
|
|
|
|
|
|
def shop_request(session, endpoint, payload):
    """POST a JSON payload to the shop API and return the decoded response.

    Args:
        session: JSESSIONID value sent as a cookie.
        endpoint: Path appended to SHOP_BASE.
        payload: JSON-serialisable request body.

    Returns:
        The parsed JSON response, or ``None`` when the response body is empty.
    """
    req = urllib.request.Request(
        SHOP_BASE + endpoint,
        data=json.dumps(payload).encode(),
        headers={
            "User-Agent": SHOP_UA,
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Cookie": f"JSESSIONID={session}",
            "Origin": "https://shop.arche-noah.at",
            "Referer": "https://shop.arche-noah.at/",
        },
    )
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(req, timeout=30) as resp:
        raw = resp.read().decode("utf-8")
    return json.loads(raw) if raw.strip() else None
|
|
|
|
|
|
def extract_latin_name(detail_headline3):
    """Pull the botanical name out of a product's ``detailHeadline3`` value.

    HTML tags are stripped and anything from "Hier geht" onwards is cut off.
    The remainder is returned only if it starts like a binomial
    ("Genus species"); otherwise, and for empty input, ``None`` is returned.
    """
    if not detail_headline3:
        return None

    without_tags = re.sub(r"<[^>]+>", "", detail_headline3).strip()
    # Drop the trailing "Hier geht es zu unseren..." teaser, if present
    candidate = without_tags.partition("Hier geht")[0].strip()

    # Expect something like "Solanum lycopersicum" or "Capsicum annuum"
    looks_botanical = re.match(r"^[A-Z][a-z]+ [a-z]", candidate) is not None
    return candidate if candidate and looks_botanical else None
|
|
|
|
|
|
def match_species(latin_name, species_by_scientific):
    """Look up a species record for a botanical name.

    The lowercased, trimmed name is tried first. If that misses, only the
    leading "genus species" pair is tried, so that names carrying
    qualifiers such as "var.", "subsp.", "convar." or "f." still resolve,
    e.g. "Phaseolus vulgaris var. nanus" -> "Phaseolus vulgaris".

    Returns the matching value from ``species_by_scientific`` or ``None``.
    """
    if not latin_name:
        return None

    wanted = latin_name.strip().lower()

    # Candidate keys in priority order: the full name, then the bare binomial
    candidates = [wanted]
    binomial = re.match(r"^([A-Za-z]+ [a-z]+)", wanted)
    if binomial:
        candidates.append(binomial.group(1).strip())

    for key in candidates:
        hit = species_by_scientific.get(key)
        if hit:
            return hit
    return None
|
|
|
|
|
|
def extract_cultivar_name(product_name):
    """Extract the cultivar/variety name from a shop product name.

    Examples:
        "Salatparadeiser 'Naama' HG026"                 -> "Naama"
        "Cocktailparadeiser 'Golden Perfection' TO019"  -> "Golden Perfection"
        "Buschbohne 'Marmorierter Mond' HG055"          -> "Marmorierter Mond"
        "Tomate 'Grandma's Best' TO001"                 -> "Grandma's Best"

    If the name contains no quoted part, the product name is returned with
    only a trailing article number (e.g. "HG026") removed.
    """
    # Quoted name, accepting several quote styles. A closing quote must not
    # be followed by a word character, so an apostrophe inside the name
    # ("Grandma's") does not cut the match short.
    m = re.search(
        r"['\u2018\u2019`\u00b4]+(.+?)['\u2018\u2019`\u00b4]+(?!\w)",
        product_name,
    )
    if m:
        return m.group(1).strip()

    # Fallback: strip a trailing article number like HG026 or TO019 and
    # return the rest unchanged (the product-type prefix is kept).
    return re.sub(r"\s+[A-Z]{1,3}\d{2,4}\s*$", "", product_name).strip()
|
|
|
|
|
|
def parse_pack_info(unit_desc):
    """Parse pack size info from a unitDesc such as '20-30 Korn' or '2g'.

    For a range the lower bound is used. A comma is accepted as the decimal
    mark ("0,5 g"). Ranges may be written with a hyphen or an en dash, with
    or without surrounding spaces. The unit must start with a letter, so a
    bare number is not split into a bogus size/unit pair.

    Returns:
        ``(pack_size, pack_unit)`` as ``(float, str)``, or ``(None, None)``
        when the text is empty or does not start with "<number> <unit>".
    """
    if not unit_desc:
        return None, None

    m = re.match(
        r"\s*(\d+(?:,\d+)?)"                   # size (lower bound of a range)
        r"\s*(?:[-\u2013]\s*\d+(?:,\d+)?\s*)?"  # optional "- upper bound"
        r"([^\W\d_]\w*)",                       # unit, must begin with a letter
        unit_desc,
    )
    if m:
        return float(m.group(1).replace(",", ".")), m.group(2)
    return None, None
|
|
|
|
|
|
# --- Main scraping logic -----------------------------------------------------
|
|
|
|
def fetch_all_arche_noah_products(session):
    """Search the shop API to find all Arche Noah seed products.

    Every distinct term in SEARCH_TERMS (compared case-insensitively) is
    queried page by page. Results are merged by product ``sid`` (first
    occurrence wins) and then filtered to products whose
    ``articleLineDesc`` is one of ARCHE_NOAH_LINES.

    A failed request is reported on stderr and ends the paging for that
    term only; the remaining terms are still searched.

    Args:
        session: JSESSIONID value passed through to shop_request.

    Returns:
        Dict mapping ``sid`` to the product record.
    """
    page_size = 200  # requested per page; a shorter page marks the last one
    all_products = {}
    seen_terms = set()

    for term in SEARCH_TERMS:
        term_key = term.lower()
        if term_key in seen_terms:
            continue
        seen_terms.add(term_key)

        offset = 0
        while True:
            payload = {
                "searchCriteria": term,
                "startIndex": offset,
                "numDataSets": page_size,
                "allowAllProducts": False,
            }
            try:
                data = shop_request(session, "webshop/getproducts", payload)
            except Exception as e:
                # Best effort: give up on this term and move on to the next.
                print(f" Search '{term}' offset={offset} failed: {e}", file=sys.stderr)
                break

            if not data:
                break

            for p in data:
                # Keep the first record seen for each product id
                all_products.setdefault(p["sid"], p)

            if len(data) < page_size:
                break
            offset += len(data)
            time.sleep(REQUEST_DELAY)

        time.sleep(REQUEST_DELAY)

    # Filter to Arche Noah's own seed products only
    an_products = {
        sid: p for sid, p in all_products.items()
        if (p.get("articleLineDesc") or "") in ARCHE_NOAH_LINES
    }

    print(f"Found {len(all_products)} total products, {len(an_products)} Arche Noah seed products")
    return an_products
|
|
|
|
|
|
def fetch_product_details(session, products):
    """Fetch the detail record (source of the Latin name) for each product.

    One ``webshop/getproductdetail`` request is made per product id, with a
    REQUEST_DELAY pause after each. Failures are reported on stderr and the
    product is simply left out of the result, as are empty responses.

    Returns:
        Dict mapping product ``sid`` to its detail record.
    """
    total = len(products)
    details = {}

    for done, sid in enumerate(products, start=1):
        try:
            detail = shop_request(session, "webshop/getproductdetail", {"productSid": sid})
        except Exception as e:
            print(f" Detail for {sid} failed: {e}", file=sys.stderr)
        else:
            if detail:
                details[sid] = detail

        # Progress report every 20 products
        if done % 20 == 0:
            print(f" Fetched details: {done}/{total}")
        time.sleep(REQUEST_DELAY)

    print(f"Fetched {len(details)} product details")
    return details
|
|
|
|
|
|
def _herbapi_fetch_all(resource, per_page=100):
    """Return every record of a HerbAPI collection, following pagination.

    Two response shapes are handled: a dict with a "data" list (and
    optionally "total"), or a bare list, which is taken as the complete
    collection. Any other shape ends the fetch.
    """
    records = []
    page = 1
    while True:
        result = herbapi_request("GET", f"{resource}?per_page={per_page}&page={page}")
        if isinstance(result, list):
            records.extend(result)
            break
        if not (isinstance(result, dict) and "data" in result):
            break

        batch = result["data"] or []
        records.extend(batch)
        if not batch:
            break

        total = result.get("total")
        if total is not None:
            if len(records) >= total:
                break
        elif len(batch) < per_page:
            # No total reported: a short page is the last page. Defaulting
            # the total to 0 here would stop after the first page.
            break
        page += 1
    return records


def load_herbapi_species():
    """Load all species from HerbAPI and build a lookup by scientific name.

    Returns:
        ``(species_list, by_scientific)`` where ``by_scientific`` maps the
        trimmed, lowercased ``name_scientific`` to the species record.
        Records without a scientific name are left out of the lookup.
    """
    species_list = _herbapi_fetch_all("species")

    by_scientific = {}
    for s in species_list:
        key = (s.get("name_scientific") or "").strip().lower()
        if key:
            by_scientific[key] = s
    return species_list, by_scientific


def load_herbapi_cultivars():
    """Load all existing cultivars from HerbAPI and build a lookup.

    Returns:
        ``(all_cultivars, by_key)`` where ``by_key`` maps
        ``(species_id, trimmed lowercased name)`` to the cultivar record.
    """
    all_cultivars = _herbapi_fetch_all("cultivars")

    by_key = {}
    for c in all_cultivars:
        key = (c["species_id"], (c.get("name") or "").strip().lower())
        by_key[key] = c

    return all_cultivars, by_key
|
|
|
|
|
|
def ensure_supplier():
    """Return the ID of the Arche Noah supplier, creating it if necessary.

    The existing suppliers are fetched from HerbAPI; the first one whose
    name contains both "arche" and "noah" (case-insensitive) is reused.
    Otherwise a new supplier record is POSTed.

    Returns:
        The supplier's ``id`` as reported by HerbAPI.
    """
    suppliers = herbapi_request("GET", "suppliers")
    if isinstance(suppliers, dict) and "data" in suppliers:
        suppliers = suppliers["data"]

    # An empty response body decodes to None; treat that as "no suppliers".
    for s in suppliers or []:
        name = (s.get("name") or "").lower()
        if "arche" in name and "noah" in name:
            print(f"Supplier 'Arche Noah' already exists: {s['id']}")
            return s["id"]

    print("Creating supplier 'Arche Noah'...")
    result = herbapi_request("POST", "suppliers", {
        "name": "Arche Noah",
        "url": "https://www.arche-noah.at",
        "country": "AT",
        "is_organic": True,
        "is_demeter": False,
        "notes": "Austrian society for heritage seed preservation and biodiversity",
    })
    print(f"Created supplier: {result['id']}")
    return result["id"]
|
|
|
|
|
|
def load_existing_supplier_links(cultivar_id):
    """Load the existing supplier links for a cultivar.

    This is a best-effort lookup: any failure is reported on stderr and an
    empty list is returned, so the caller proceeds as if no link exists.

    Returns:
        The list of link records (either the bare list response or the
        "data" member of a dict response), or ``[]``.
    """
    try:
        result = herbapi_request("GET", f"cultivars/{cultivar_id}/suppliers")
    except Exception as e:
        # Deliberately non-fatal, but no longer silent.
        print(f" WARN: could not load supplier links for cultivar {cultivar_id}: {e}", file=sys.stderr)
        return []

    if isinstance(result, list):
        return result
    if isinstance(result, dict) and "data" in result:
        return result["data"]
    return []
|
|
|
|
|
|
def _product_url(product, detail):
    """Return the public shop URL for a product, or None if it has no alias."""
    alias = product.get("alias") or detail.get("alias", "")
    return f"https://shop.arche-noah.at/produkt/{alias}" if alias else None


def main():
    """Run the import: scrape the shop and create cultivars in HerbAPI.

    Steps: ensure the supplier exists, load HerbAPI species and cultivars,
    scrape the shop's product list and details, then for each product
    create the cultivar (unless one with the same species and name already
    exists) and link it to the supplier (unless already linked). A summary
    of the counters is printed at the end.
    """
    now_str = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    print("=== Arche Noah Seed Catalog Scraper ===")
    print(f"Started at {now_str}\n")

    # Step 1: Create Arche Noah supplier in HerbAPI
    print("[1/6] Ensuring Arche Noah supplier exists...")
    supplier_id = ensure_supplier()
    print()

    # Step 2: Load HerbAPI species for matching
    print("[2/6] Loading HerbAPI species...")
    species_list, species_by_scientific = load_herbapi_species()
    print(f"Loaded {len(species_list)} species")
    print()

    # Step 3: Load existing cultivars for idempotency
    print("[3/6] Loading existing cultivars...")
    existing_cultivars, cultivars_by_key = load_herbapi_cultivars()
    print(f"Loaded {len(existing_cultivars)} existing cultivars")
    print()

    # Step 4: Scrape Arche Noah shop
    print("[4/6] Scraping Arche Noah shop catalog...")
    session = shop_create_session()
    print("Got shop session")
    products = fetch_all_arche_noah_products(session)
    print()

    # Step 5: Fetch product details (to get Latin names)
    print("[5/6] Fetching product details for Latin name matching...")
    details = fetch_product_details(session, products)
    print()

    # Step 6: Create cultivars in HerbAPI
    print("[6/6] Creating cultivars in HerbAPI...")
    stats = {
        "created": 0,
        "skipped_existing": 0,
        "skipped_no_species": 0,
        "skipped_no_name": 0,
        "supplier_linked": 0,
        "supplier_link_existed": 0,
        "errors": 0,
    }

    for sid, product in sorted(products.items()):
        detail = details.get(sid, {})

        # Match the product's Latin name to a HerbAPI species
        # (match_species handles subspecies/variety suffixes)
        latin_name = extract_latin_name(detail.get("detailHeadline3", ""))
        species = match_species(latin_name, species_by_scientific)
        if not species:
            print(f" SKIP (no species match): {product['name']} | latin={latin_name}")
            stats["skipped_no_species"] += 1
            continue

        cultivar_name = extract_cultivar_name(product["name"])
        if not cultivar_name:
            print(f" SKIP (no cultivar name): {product['name']}")
            # Counted separately so the "no species match" figure stays accurate
            stats["skipped_no_name"] += 1
            continue

        product_url = _product_url(product, detail)

        # Check if cultivar already exists (idempotency)
        lookup_key = (species["id"], cultivar_name.strip().lower())
        existing = cultivars_by_key.get(lookup_key)

        if existing:
            cultivar_id = existing["id"]
            stats["skipped_existing"] += 1
        else:
            cultivar_data = {
                "species_id": species["id"],
                "name": cultivar_name,
                "name_de": cultivar_name,
                "is_organic": product.get("articleLineDesc") == "Bio-Saatgut von ARCHE NOAH",
                "source_urls": [product_url] if product_url else None,
            }
            try:
                result = herbapi_request("POST", "cultivars", cultivar_data)
                cultivar_id = result["id"]
            except Exception as e:
                print(f" ERROR creating '{cultivar_name}': {e}", file=sys.stderr)
                stats["errors"] += 1
                continue
            stats["created"] += 1
            # Add to lookup for idempotency within this run
            cultivars_by_key[lookup_key] = result
            print(f" CREATED: {cultivar_name} ({species['name_scientific']})")

        # Link cultivar to supplier
        existing_links = load_existing_supplier_links(cultivar_id)
        already_linked = any(
            link.get("supplier_id") == supplier_id for link in existing_links
        )

        if already_linked:
            stats["supplier_link_existed"] += 1
        else:
            unit_desc = product.get("unitDesc") or detail.get("unitDesc", "")
            pack_size, pack_unit = parse_pack_info(unit_desc)

            # Price comes from the first price-list position, when present
            price = None
            price_list = product.get("priceListPos") or detail.get("priceListPos", [])
            if price_list:
                price = price_list[0].get("singleUnitPrice")

            link_data = {
                "supplier_id": supplier_id,
                "article_number": str(product.get("articleNr", "")),
                "product_url": product_url,
                "price_eur": price,
                "pack_size": pack_size,
                "pack_unit": pack_unit,
            }

            try:
                herbapi_request("POST", f"cultivars/{cultivar_id}/suppliers", link_data)
                stats["supplier_linked"] += 1
            except Exception as e:
                print(f" ERROR linking supplier for '{cultivar_name}': {e}", file=sys.stderr)
                stats["errors"] += 1

        time.sleep(0.1)  # small delay between HerbAPI calls

    # Summary
    print(f"\n{'='*60}")
    print("Scraping complete!")
    print(f" Cultivars created: {stats['created']}")
    print(f" Cultivars already existed: {stats['skipped_existing']}")
    print(f" Skipped (no species match): {stats['skipped_no_species']}")
    print(f" Skipped (no cultivar name): {stats['skipped_no_name']}")
    print(f" Supplier links created: {stats['supplier_linked']}")
    print(f" Supplier links existed: {stats['supplier_link_existed']}")
    print(f" Errors: {stats['errors']}")


if __name__ == "__main__":
    main()
|