761 lines
28 KiB
Python
761 lines
28 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Scraper for Dreschflegel organic seed catalog (dreschflegel-saatgut.de).
|
|
Extracts cultivar data and imports into HerbAPI.
|
|
|
|
Run 2 - fixes pagination (API caps at 100/page), better species matching,
|
|
caches scraped products, handles duplicates gracefully.
|
|
"""
|
|
|
|
import urllib.request
|
|
import urllib.parse
|
|
import urllib.error
|
|
import gzip
|
|
import json
|
|
import re
|
|
import time
|
|
import sys
|
|
import os
|
|
import html as html_mod
|
|
from collections import defaultdict
|
|
|
|
# --- Configuration ---
# API endpoint and bearer token may be overridden through the environment
# (HERBAPI_BASE / HERBAPI_TOKEN); the literals are kept as defaults so that
# existing invocations keep working.
# SECURITY NOTE(review): a bearer token is committed in source here. Prefer
# supplying it via HERBAPI_TOKEN and rotating the value below.
API_BASE = os.environ.get("HERBAPI_BASE", "http://herbapi01.corp.sub-net.at:8080/api/v1")
API_TOKEN = os.environ.get("HERBAPI_TOKEN", "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk")
SITE_BASE = "https://www.dreschflegel-saatgut.de"
DELAY = 0.5  # seconds slept before each product-page fetch
USER_AGENT = "Mozilla/5.0 (compatible; HerbAPI-Scraper/1.0)"
CACHE_FILE = "/tmp/dreschflegel_products_cache.json"

# Line-buffered output so progress shows up immediately when piped.
# Streams that were replaced by objects without ``reconfigure`` are left alone.
for _stream in (sys.stdout, sys.stderr):
    _reconfigure = getattr(_stream, "reconfigure", None)
    if _reconfigure is not None:
        _reconfigure(line_buffering=True)

# Run-wide counters, keyed by event name; missing keys start at 0.
stats = defaultdict(int)
|
|
|
|
|
|
def api_request(method, path, data=None, timeout=30):
    """Make a JSON request to HerbAPI.

    Args:
        method: HTTP method, e.g. "GET" or "POST".
        path: Path (and optional query string) appended to API_BASE.
        data: JSON-serialisable payload, or None for a request without body.
        timeout: Socket timeout in seconds.

    Returns:
        The decoded JSON response, or None on any failure. Duplicate-style
        errors (HTTP 409, "already exists"/"duplicate" bodies, and HTTP 500
        "database error") return None silently; other failures are printed.
    """
    url = f"{API_BASE}{path}"
    # ``is not None`` so that an empty dict/list payload is still sent.
    body = json.dumps(data).encode("utf-8") if data is not None else None
    req = urllib.request.Request(url, data=body, method=method)
    req.add_header("Authorization", f"Bearer {API_TOKEN}")
    req.add_header("Content-Type", "application/json")
    req.add_header("Accept", "application/json")
    try:
        # Context manager closes the connection even if read() fails.
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            raw = resp.read()
    except urllib.error.HTTPError as e:
        body_text = e.read().decode("utf-8", errors="replace")
        lowered = body_text.lower()
        if e.code == 409 or "already exists" in lowered or "duplicate" in lowered:
            return None  # Duplicate, handled silently
        if e.code == 500 and "database error" in lowered:
            # Likely a unique constraint violation = duplicate
            return None
        print(f"  API error {e.code} {method} {path}: {body_text[:200]}")
        return None
    except OSError as e:
        # URLError, timeouts and connection resets are all OSError subclasses.
        print(f"  API connection error {method} {path}: {e}")
        return None

    if not raw.strip():
        # Successful response without a body (e.g. 204): nothing to decode.
        return None
    try:
        return json.loads(raw.decode("utf-8"))
    except ValueError as e:
        print(f"  API returned invalid JSON {method} {path}: {e}")
        return None
|
|
|
|
|
|
def fetch_page(url):
    """Fetch a web page with the scraper's User-Agent.

    No delay is applied here; callers that need rate limiting sleep
    themselves before calling.

    Returns:
        The body decoded as UTF-8 (undecodable bytes replaced), or None if
        the request failed for any reason (the error is printed).
    """
    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        # ``with`` guarantees the HTTP response is closed.
        with urllib.request.urlopen(req, timeout=30) as resp:
            return resp.read().decode("utf-8", errors="replace")
    except Exception as e:
        # Deliberately broad: a single bad page must not abort the scrape.
        print(f"  Fetch error {url}: {e}")
        return None
|
|
|
|
|
|
def _fetch_sitemap_text(url):
    """Download one sitemap file and return its XML text, gunzipping if needed."""
    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    with urllib.request.urlopen(req, timeout=30) as resp:
        raw = resp.read()
    # Detect gzip by magic bytes rather than trusting the file extension.
    if raw[:2] == b"\x1f\x8b":
        raw = gzip.decompress(raw)
    return raw.decode("utf-8")


def get_sitemap_urls():
    """Download the sitemap and extract all page URLs.

    Fetches ``{SITE_BASE}/sitemap.xml``. Every ``<loc>`` ending in ``.xml`` or
    ``.xml.gz`` is treated as a child sitemap and downloaded; any other
    ``<loc>`` is taken to be a page URL (covers a flat, non-index sitemap).

    Returns:
        A list of URL strings; empty if the top-level sitemap could not be
        fetched. Failures on individual child sitemaps are printed and skipped.
    """
    print("Fetching sitemap index...")
    index_xml = fetch_page(f"{SITE_BASE}/sitemap.xml")
    if not index_xml:
        return []

    loc_pattern = re.compile(r"<loc>(.*?)</loc>")
    all_urls = []

    for loc in loc_pattern.findall(index_xml):
        if not loc.endswith((".xml.gz", ".xml")):
            # Flat sitemap: the entry is already a page URL.
            all_urls.append(loc)
            continue

        if loc.endswith(".xml.gz"):
            print("  Fetching compressed sitemap...")
        else:
            print("  Fetching sitemap...")
        try:
            urls = loc_pattern.findall(_fetch_sitemap_text(loc))
        except Exception as e:
            # Best effort: one broken child sitemap should not stop the run.
            print(f"    Error: {e}")
            continue
        all_urls.extend(urls)
        print(f"    Found {len(urls)} URLs")

    return all_urls
|
|
|
|
|
|
def classify_urls(urls):
    """Return the URLs that look like product pages.

    A candidate is a URL on the shop host (with or without ``www.``) whose
    path is a single segment and does not begin with one of the known
    non-product slugs. Trailing slashes are removed from returned URLs.
    """
    non_product_slugs = (
        "impressum", "agb", "datenschutz", "kontakt", "widerrufs",
        "versand", "abkuerz", "zertifikat", "wichtige-hinweise",
        "muster-", "gutscheine", "kalender", "flyer", "katalog",
        "sommer-herbst", "unsere-hoefe", "bestellschein",
        "dreschflegel-news", "termine", "rezepte", "anbautipps",
        "tipps-zur", "gartentelefon", "gartenfreude", "buecher",
        "navigation", "vielfalt", "sut20", "saatgut",
        "neuheiten", "kennenlernangebote", "sut25", "vielfalt25",
        "saatgut-vielfalt", "saat",
    )
    host_prefixes = (
        "https://dreschflegel-saatgut.de/",
        "https://www.dreschflegel-saatgut.de/",
    )

    def _is_candidate(trimmed):
        slug = trimmed
        for host in host_prefixes:
            slug = slug.replace(host, "")
        # Empty slug = home page; a remaining "/" = nested path or foreign host.
        if slug == "" or "/" in slug:
            return False
        return not slug.startswith(non_product_slugs)

    trimmed_urls = (raw.rstrip("/") for raw in urls)
    return [trimmed for trimmed in trimmed_urls if _is_candidate(trimmed)]
|
|
|
|
|
|
def parse_product_page(html_content):
    """Pull product fields out of the HTML of a Dreschflegel product page.

    Returns a dict with ``name`` and ``botanical_name`` plus, when present on
    the page, ``article_number``, ``price`` (float), ``description`` and
    ``pack_info``. Returns None when the page has no ``class="botname"``
    marker or when name / botanical name cannot be extracted.
    """
    if not html_content:
        return None
    if 'class="botname"' not in html_content:
        return None

    fields = {}

    heading = re.search(r"<h1>(.*?)</h1>", html_content)
    if heading is not None:
        fields["name"] = html_mod.unescape(heading.group(1).strip())

    botname = re.search(
        r'<div class="botname">\s*(.*?)\s*</div>', html_content, re.DOTALL
    )
    if botname is not None:
        fields["botanical_name"] = html_mod.unescape(botname.group(1).strip())

    order_no = re.search(
        r'class="product-detail-ordernumber"[^>]*>\s*(\d+)', html_content, re.DOTALL
    )
    if order_no is not None:
        fields["article_number"] = order_no.group(1)

    price = re.search(r'itemprop="price"[^>]*content="([^"]+)"', html_content)
    if price is not None:
        try:
            fields["price"] = float(price.group(1))
        except ValueError:
            # Non-numeric price attribute: leave the field out.
            pass

    paragraph = re.search(
        r"product-detail-description-text.*?<p>(.*?)</p>", html_content, re.DOTALL
    )
    if paragraph is not None:
        # Drop inline markup first, then decode entities.
        plain = re.sub(r"<[^>]+>", "", paragraph.group(1).strip())
        plain = html_mod.unescape(plain).strip()
        if plain:
            fields["description"] = plain

    pack = re.search(r"Inhalt reicht f[üu]r:</th><td>\s*(.*?)\s*</td>", html_content)
    if pack is not None:
        fields["pack_info"] = html_mod.unescape(pack.group(1).strip())

    if "name" not in fields or "botanical_name" not in fields:
        return None
    return fields
|
|
|
|
|
|
def _load_product_cache():
    """Read the URL -> product cache from CACHE_FILE; {} if missing or unreadable."""
    if not os.path.exists(CACHE_FILE):
        return {}
    try:
        with open(CACHE_FILE, "r", encoding="utf-8") as f:
            cache = json.load(f)
    except (OSError, ValueError) as e:
        # A truncated file from an interrupted run must not block a re-run.
        print(f"  Ignoring unreadable cache {CACHE_FILE}: {e}")
        return {}
    if not isinstance(cache, dict):
        print(f"  Ignoring malformed cache {CACHE_FILE}")
        return {}
    print(f"  Loaded {len(cache)} cached products")
    return cache


def _save_product_cache(cache):
    """Write the cache atomically (temp file + rename) to CACHE_FILE."""
    tmp_path = CACHE_FILE + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(cache, f)
    os.replace(tmp_path, CACHE_FILE)


def scrape_all_products(candidate_urls):
    """Scrape product pages, using the on-disk cache for already-seen URLs.

    Cache values are either the parsed product dict or None, where None means
    the page was fetched and is not a product page. Pages that could not be
    fetched are counted in ``stats["fetch_errors"]`` but are NOT cached, so
    they are retried on the next run.

    Args:
        candidate_urls: URLs that may be product pages.

    Returns:
        List of product dicts (cached ones first, then newly scraped), each
        newly scraped one carrying its source ``url``.
    """
    cache = _load_product_cache()

    products = []
    to_fetch = []
    cached_non_products = 0
    for url in candidate_urls:
        if url not in cache:
            to_fetch.append(url)
        elif cache[url]:
            products.append(cache[url])
        else:
            cached_non_products += 1

    print(f"  {len(products)} products from cache, "
          f"{cached_non_products} non-products cached, "
          f"{len(to_fetch)} to fetch")

    for i, url in enumerate(to_fetch):
        if (i + 1) % 50 == 0 or i == 0:
            print(f"  Fetching {i + 1}/{len(to_fetch)}...")

        time.sleep(DELAY)
        html_content = fetch_page(url)
        if not html_content:
            # Transient failure: leave the URL uncached so it is retried later.
            stats["fetch_errors"] += 1
        else:
            product = parse_product_page(html_content)
            if product:
                product["url"] = url
                products.append(product)
                cache[url] = product
                stats["products_scraped"] += 1
            else:
                cache[url] = None
                stats["not_product_pages"] += 1

        # Save cache periodically so an interrupted run keeps its progress.
        if (i + 1) % 100 == 0:
            _save_product_cache(cache)

    # Final cache save
    _save_product_cache(cache)

    print(f"  Total: {len(products)} products ({stats['products_scraped']} newly scraped)")
    return products
|
|
|
|
|
|
def paginated_get(path):
    """Collect the ``data`` items of every page of a paginated API endpoint.

    Requests pages of 100 items starting at page 1 and stops at the first
    failed or empty response, or at the first page with fewer than 100 items.
    """
    joiner = "&" if "?" in path else "?"
    collected = []
    page_no = 1
    while True:
        payload = api_request("GET", f"{path}{joiner}per_page=100&page={page_no}")
        if not payload or "data" not in payload:
            return collected
        batch = payload["data"]
        if not batch:
            return collected
        collected.extend(batch)
        if len(batch) < 100:
            # Short page: this was the last one.
            return collected
        page_no += 1
|
|
|
|
|
|
def load_api_data():
    """Fetch families, species and cultivars from HerbAPI as lookup dicts.

    Returns:
        ``(families, species, cultivars)`` where families are keyed by
        lower-cased scientific name, species by lower-cased stripped
        scientific name, and cultivars by ``(species_id, lower-cased stripped
        name)``.
    """
    print("Loading HerbAPI data...")

    families = {
        fam["name_scientific"].lower(): fam
        for fam in paginated_get("/families")
    }
    print(f"  {len(families)} families")

    species = {
        rec["name_scientific"].lower().strip(): rec
        for rec in paginated_get("/species")
    }
    print(f"  {len(species)} species")

    cultivars = {
        (rec["species_id"], rec["name"].lower().strip()): rec
        for rec in paginated_get("/cultivars")
    }
    print(f"  {len(cultivars)} cultivars")

    return families, species, cultivars
|
|
|
|
|
|
def ensure_supplier():
    """Return the Dreschflegel supplier record, creating it when absent.

    Looks for an existing supplier whose name contains "dreschflegel"
    (case-insensitive); otherwise POSTs a new one. Returns None if the
    supplier could be neither found nor created.
    """
    existing = api_request("GET", "/suppliers")
    if existing:
        match = next(
            (entry for entry in existing if "dreschflegel" in entry["name"].lower()),
            None,
        )
        if match is not None:
            print(f"  Supplier exists: {match['name']} ({match['id']})")
            return match

    payload = {
        "name": "Dreschflegel",
        "url": "https://www.dreschflegel-saatgut.de",
        "country": "DE",
        "is_organic": True,
        "is_demeter": False,
        "notes": "German organic seed cooperative, open-pollinated heritage varieties",
    }
    created = api_request("POST", "/suppliers", payload)
    if not created:
        return None
    print(f"  Created supplier: {created['name']} ({created['id']})")
    return created
|
|
|
|
|
|
# Genus → family mapping for species creation
|
|
# Keys are genus names exactly as capitalised in botanical names; values are
# the scientific family name used when a species has to be created.
# Each genus appears exactly once (earlier duplicate keys were removed; their
# values were identical, so lookups are unchanged).
GENUS_TO_FAMILY = {
    # Asteraceae
    "Achillea": "Asteraceae", "Artemisia": "Asteraceae", "Aster": "Asteraceae",
    "Calendula": "Asteraceae", "Carthamus": "Asteraceae", "Centaurea": "Asteraceae",
    "Chamomilla": "Asteraceae", "Chrysanthemum": "Asteraceae", "Cichorium": "Asteraceae",
    "Cnicus": "Asteraceae", "Cosmos": "Asteraceae", "Cynara": "Asteraceae",
    "Dahlia": "Asteraceae", "Dimorphotheca": "Asteraceae", "Echinacea": "Asteraceae",
    "Echinops": "Asteraceae", "Erigeron": "Asteraceae", "Eupatorium": "Asteraceae",
    "Gaillardia": "Asteraceae", "Helenium": "Asteraceae", "Helianthus": "Asteraceae",
    "Helichrysum": "Asteraceae", "Inula": "Asteraceae", "Lactuca": "Asteraceae",
    "Leontodon": "Asteraceae", "Matricaria": "Asteraceae", "Onopordum": "Asteraceae",
    "Petasites": "Asteraceae", "Rudbeckia": "Asteraceae", "Scorzonera": "Asteraceae",
    "Silphium": "Asteraceae", "Solidago": "Asteraceae", "Tagetes": "Asteraceae",
    "Tanacetum": "Asteraceae", "Taraxacum": "Asteraceae", "Telekia": "Asteraceae",
    "Tragopogon": "Asteraceae", "Tussilago": "Asteraceae", "Zinnia": "Asteraceae",
    "Xerochrysum": "Asteraceae", "Coreopsis": "Asteraceae",
    # Solanaceae
    "Capsicum": "Solanaceae", "Lycium": "Solanaceae", "Nicotiana": "Solanaceae",
    "Physalis": "Solanaceae", "Solanum": "Solanaceae", "Atropa": "Solanaceae",
    # Cucurbitaceae
    "Citrullus": "Cucurbitaceae", "Cucumis": "Cucurbitaceae", "Cucurbita": "Cucurbitaceae",
    "Luffa": "Cucurbitaceae", "Momordica": "Cucurbitaceae",
    # Fabaceae
    "Cicer": "Fabaceae", "Glycine": "Fabaceae", "Lathyrus": "Fabaceae",
    "Lens": "Fabaceae", "Lupinus": "Fabaceae", "Medicago": "Fabaceae",
    "Phaseolus": "Fabaceae", "Pisum": "Fabaceae", "Trifolium": "Fabaceae",
    "Trigonella": "Fabaceae", "Vicia": "Fabaceae", "Vigna": "Fabaceae",
    "Caragana": "Fabaceae", "Cytisus": "Fabaceae", "Robinia": "Fabaceae",
    # Brassicaceae
    "Armoracia": "Brassicaceae", "Barbarea": "Brassicaceae", "Brassica": "Brassicaceae",
    "Crambe": "Brassicaceae", "Eruca": "Brassicaceae", "Hesperis": "Brassicaceae",
    "Iberis": "Brassicaceae", "Isatis": "Brassicaceae", "Lepidium": "Brassicaceae",
    "Lunaria": "Brassicaceae", "Raphanus": "Brassicaceae", "Sinapis": "Brassicaceae",
    "Nasturtium": "Brassicaceae", "Diplotaxis": "Brassicaceae",
    # Apiaceae
    "Anethum": "Apiaceae", "Anthriscus": "Apiaceae", "Apium": "Apiaceae",
    "Carum": "Apiaceae", "Chaerophyllum": "Apiaceae", "Coriandrum": "Apiaceae",
    "Daucus": "Apiaceae", "Foeniculum": "Apiaceae", "Levisticum": "Apiaceae",
    "Myrrhis": "Apiaceae", "Pastinaca": "Apiaceae", "Petroselinum": "Apiaceae",
    "Pimpinella": "Apiaceae", "Angelica": "Apiaceae", "Aegopodium": "Apiaceae",
    # Lamiaceae
    "Agastache": "Lamiaceae", "Ajuga": "Lamiaceae", "Dracocephalum": "Lamiaceae",
    "Elsholtzia": "Lamiaceae", "Hyssopus": "Lamiaceae", "Lavandula": "Lamiaceae",
    "Melissa": "Lamiaceae", "Mentha": "Lamiaceae", "Monarda": "Lamiaceae",
    "Nepeta": "Lamiaceae", "Ocimum": "Lamiaceae", "Origanum": "Lamiaceae",
    "Perilla": "Lamiaceae", "Rosmarinus": "Lamiaceae", "Salvia": "Lamiaceae",
    "Satureja": "Lamiaceae", "Stachys": "Lamiaceae", "Thymus": "Lamiaceae",
    # Amaryllidaceae / Alliaceae
    "Allium": "Amaryllidaceae",
    # Poaceae
    "Avena": "Poaceae", "Hordeum": "Poaceae", "Panicum": "Poaceae",
    "Secale": "Poaceae", "Sorghum": "Poaceae", "Triticum": "Poaceae",
    "Zea": "Poaceae", "Setaria": "Poaceae",
    # Chenopodiaceae
    "Atriplex": "Chenopodiaceae", "Beta": "Chenopodiaceae",
    "Chenopodium": "Chenopodiaceae", "Spinacia": "Chenopodiaceae",
    # Rosaceae
    "Filipendula": "Rosaceae", "Fragaria": "Rosaceae", "Malus": "Rosaceae",
    "Prunus": "Rosaceae", "Pyrus": "Rosaceae", "Rosa": "Rosaceae",
    "Rubus": "Rosaceae", "Sanguisorba": "Rosaceae", "Sorbus": "Rosaceae",
    "Waldsteinia": "Rosaceae",
    # Boraginaceae
    "Borago": "Boraginaceae", "Phacelia": "Boraginaceae", "Symphytum": "Boraginaceae",
    "Pulmonaria": "Boraginaceae", "Myosotis": "Boraginaceae",
    # Malvaceae
    "Alcea": "Malvaceae", "Althaea": "Malvaceae", "Malva": "Malvaceae",
    "Hibiscus": "Malvaceae", "Lavatera": "Malvaceae", "Abelmoschus": "Malvaceae",
    # Polygonaceae
    "Fagopyrum": "Polygonaceae", "Rheum": "Polygonaceae", "Rumex": "Polygonaceae",
    # Caryophyllaceae
    "Agrostemma": "Caryophyllaceae", "Dianthus": "Caryophyllaceae",
    "Gypsophila": "Caryophyllaceae", "Lychnis": "Caryophyllaceae",
    "Saponaria": "Caryophyllaceae", "Silene": "Caryophyllaceae",
    # Tropaeolaceae
    "Tropaeolum": "Tropaeolaceae",
    # Papaveraceae
    "Eschscholzia": "Papaveraceae", "Papaver": "Papaveraceae",
    "Meconopsis": "Papaveraceae",
    # Caprifoliaceae
    "Valerianella": "Caprifoliaceae", "Valeriana": "Caprifoliaceae",
    "Lonicera": "Caprifoliaceae",
    # Plantaginaceae
    "Digitalis": "Plantaginaceae", "Plantago": "Plantaginaceae",
    "Antirrhinum": "Plantaginaceae", "Linaria": "Plantaginaceae",
    # Violaceae
    "Viola": "Violaceae",
    # Ranunculaceae
    "Aquilegia": "Ranunculaceae", "Consolida": "Ranunculaceae",
    "Delphinium": "Ranunculaceae", "Nigella": "Ranunculaceae",
    # Linaceae
    "Linum": "Linaceae",
    # Convolvulaceae
    "Ipomoea": "Convolvulaceae", "Convolvulus": "Convolvulaceae",
    # Portulacaceae / Montiaceae
    "Claytonia": "Montiaceae", "Portulaca": "Portulacaceae",
    # Amaranthaceae
    "Amaranthus": "Amaranthaceae", "Celosia": "Amaranthaceae",
    "Gomphrena": "Amaranthaceae",
    # Asparagaceae
    "Asparagus": "Asparagaceae",
    # Resedaceae
    "Reseda": "Resedaceae",
    # Balsaminaceae
    "Impatiens": "Balsaminaceae",
    # Hydrangeaceae
    "Hydrangea": "Hydrangeaceae",
    # Campanulaceae
    "Campanula": "Campanulaceae", "Phyteuma": "Campanulaceae",
    # Scrophulariaceae
    "Verbascum": "Scrophulariaceae",
    # Verbenaceae
    "Verbena": "Verbenaceae",
    # Onagraceae
    "Oenothera": "Onagraceae", "Clarkia": "Onagraceae",
    # Cucurbitaceae extras
    "Benincasa": "Cucurbitaceae", "Lagenaria": "Cucurbitaceae",
    # Hypericaceae
    "Hypericum": "Hypericaceae",
    # Adoxaceae
    "Sambucus": "Adoxaceae",
    # Others
    "Dipsacus": "Caprifoliaceae",
    "Knautia": "Caprifoliaceae",
    "Scabiosa": "Caprifoliaceae",
    "Succisa": "Caprifoliaceae",
    "Asclepias": "Apocynaceae",
    "Cynoglossum": "Boraginaceae",
    "Echium": "Boraginaceae",
    "Anchusa": "Boraginaceae",
    "Lithospermum": "Boraginaceae",
    "Onobrychis": "Fabaceae",
    "Ornithopus": "Fabaceae",
    "Lotus": "Fabaceae",
    "Anthyllis": "Fabaceae",
    "Melilotus": "Fabaceae",
    "Galega": "Fabaceae",
    "Lespedeza": "Fabaceae",
    "Arachis": "Fabaceae",
    "Senna": "Fabaceae",
    # Additional genera found in Dreschflegel catalog
    "Acmella": "Asteraceae", "Adonis": "Ranunculaceae", "Ageratum": "Asteraceae",
    "Amethystia": "Lamiaceae", "Anacyclus": "Asteraceae", "Anthemis": "Asteraceae",
    "Asphodeline": "Asphodelaceae", "Brachyscome": "Asteraceae", "Bupleurum": "Apiaceae",
    "Callistephus": "Asteraceae", "Camelina": "Brassicaceae", "Cardaria": "Brassicaceae",
    "Cardiospermum": "Sapindaceae", "Cerinthe": "Boraginaceae",
    "Chamaemelum": "Asteraceae", "Cistanthe": "Montiaceae", "Cleome": "Cleomaceae",
    "Cochlearia": "Brassicaceae", "Codonopsis": "Campanulaceae", "Coix": "Poaceae",
    "Cyperus": "Cyperaceae", "Digitaria": "Poaceae", "Dorotheanthus": "Aizoaceae",
    "Emilia": "Asteraceae", "Eragrostis": "Poaceae", "Erysimum": "Brassicaceae",
    "Euphorbia": "Euphorbiaceae", "Gentiana": "Gentianaceae", "Geum": "Rosaceae",
    "Gilia": "Polemoniaceae", "Godetia": "Onagraceae", "Helipterum": "Asteraceae",
    "Lallemantia": "Lamiaceae", "Leonurus": "Lamiaceae", "Leuzea": "Asteraceae",
    "Liatris": "Asteraceae", "Malope": "Malvaceae", "Marrubium": "Lamiaceae",
    "Matthiola": "Brassicaceae", "Maurandya": "Plantaginaceae",
    "Melothria": "Cucurbitaceae", "Meum": "Apiaceae", "Nemesia": "Scrophulariaceae",
    # NOTE(review): "Nicotinia" looks like a misspelling of "Nicotiana" (listed
    # above); kept because it may mirror a spelling used on the site - confirm.
    "Nicandra": "Solanaceae", "Nicotinia": "Solanaceae", "Oenanthe": "Apiaceae",
    "Oxalis": "Oxalidaceae", "Pennisetum": "Poaceae", "Penstemon": "Plantaginaceae",
    "Phlox": "Polemoniaceae", "Polemonium": "Polemoniaceae",
    "Porophyllum": "Asteraceae", "Primula": "Primulaceae", "Psyllium": "Plantaginaceae",
    "Quamoclit": "Convolvulaceae", "Ruta": "Rutaceae", "Salpiglossis": "Solanaceae",
    "Sanvitalia": "Asteraceae", "Sideritis": "Lamiaceae", "Silybum": "Asteraceae",
    "Talinum": "Talinaceae", "Thelesperma": "Asteraceae", "Vaccaria": "Caryophyllaceae",
    "Veronica": "Plantaginaceae", "Xeranthemum": "Asteraceae",
}
|
|
|
|
|
|
def normalize_species_name(botanical_name):
    """Split a botanical name into ``(genus, species_epithet)`` for matching.

    Anything after the epithet (var., subsp., cultivar groups, ...) is
    ignored. Hybrid names are normalised to the form ``"x epithet"`` whether
    written as ``Genus x epithet``, ``Genus × epithet`` or ``Genus ×epithet``.

    Returns:
        ``(None, None)`` when fewer than two words are given;
        ``(genus, None)`` when the second word is a rank/placeholder marker
        (var., subsp., ssp., spec., sp.) or a hybrid marker with nothing
        after it; otherwise ``(genus, epithet)``.
    """
    parts = botanical_name.strip().split()
    if len(parts) < 2:
        return None, None

    genus, second = parts[0], parts[1]

    # Stand-alone hybrid marker: ASCII "x" or the multiplication sign.
    if second in ("x", "\u00d7"):
        if len(parts) >= 3:
            return genus, f"x {parts[2]}"
        # Marker without an epithet: genus-level information only.
        return genus, None

    # Multiplication sign glued to the epithet, e.g. "×ananassa".
    if second.startswith("\u00d7") and len(second) > 1:
        return genus, f"x {second[1:]}"

    if second in ("var.", "subsp.", "ssp.", "spec.", "sp."):
        # Only genus level - can't match to species
        return genus, None

    return genus, second
|
|
|
|
|
|
def find_species(botanical_name, species_cache):
    """Find an existing species record matching a botanical name.

    Args:
        botanical_name: Name as scraped, e.g. "Solanum lycopersicum var. x".
        species_cache: Dict of species records keyed by lower-cased
            scientific name.

    Returns:
        The matching record or None. When the name carries a species epithet,
        only an exact "genus epithet" match counts. The "only one species of
        this genus is known" shortcut is used solely for genus-level names
        (no epithet); applying it to names with an epithet would file e.g.
        "Phaseolus coccineus" under a lone "Phaseolus vulgaris" and prevent
        the correct species from ever being created.
    """
    genus, sp = normalize_species_name(botanical_name)
    if not genus:
        return None

    if sp:
        # Exact genus+species match or nothing.
        return species_cache.get(f"{genus} {sp}".lower())

    # Genus-level name: accept the genus' species only if it is unambiguous.
    genus_prefix = genus.lower() + " "
    same_genus = [rec for key, rec in species_cache.items() if key.startswith(genus_prefix)]
    if len(same_genus) == 1:
        return same_genus[0]

    return None
|
|
|
|
|
|
def find_or_create_species(botanical_name, families, species_cache):
    """Return the species record for a botanical name, creating it if needed.

    Creation requires a species epithet and a GENUS_TO_FAMILY entry for the
    genus; the family is created first when it is not yet known. ``families``
    and ``species_cache`` are updated in place with anything created or
    re-discovered. Returns None when the species can be neither matched nor
    created (the reason is counted in ``stats`` and/or printed).
    """
    known = find_species(botanical_name, species_cache)
    if known:
        return known

    genus, species_epithet = normalize_species_name(botanical_name)
    if not (genus and species_epithet):
        stats["species_no_epithet"] += 1
        return None

    sci_name = f"{genus} {species_epithet}"
    sci_key = sci_name.lower()

    # Second look-up under the normalized name.
    if sci_key in species_cache:
        return species_cache[sci_key]

    family_name = GENUS_TO_FAMILY.get(genus)
    if not family_name:
        stats["species_no_family"] += 1
        print(f"    [SKIP] No family mapping for genus: {genus} ({botanical_name})")
        return None

    family_key = family_name.lower()
    family = families.get(family_key)
    if not family:
        print(f"    Creating family: {family_name}")
        created_family = api_request("POST", "/families", {"name_scientific": family_name})
        if created_family:
            families[family_key] = created_family
            family = created_family
            stats["families_created"] += 1
        else:
            # POST failed - possibly a duplicate from an earlier run, so
            # look the family up in a fresh listing.
            refetched = next(
                (
                    fam
                    for fam in paginated_get("/families")
                    if fam["name_scientific"].lower() == family_key
                ),
                None,
            )
            if refetched is not None:
                families[family_key] = refetched
                family = refetched
        if not family:
            print(f"    [SKIP] Cannot create family: {family_name}")
            return None

    print(f"    Creating species: {sci_name} (family: {family_name})")
    created_species = api_request(
        "POST",
        "/species",
        {"name_scientific": sci_name, "family_id": family["id"]},
    )
    if created_species:
        species_cache[sci_key] = created_species
        stats["species_created"] += 1
        return created_species

    # POST failed - the species may already exist; search a fresh listing.
    time.sleep(0.1)
    for candidate in paginated_get("/species"):
        if candidate["name_scientific"].lower() == sci_key:
            species_cache[sci_key] = candidate
            return candidate
    return None
|
|
|
|
|
|
def extract_cultivar_name(product_name):
    """Strip a leading German crop-type word from a product name.

    If the (whitespace-trimmed) name starts with one of the known crop-type
    prefixes followed by a space, the remainder is returned with surrounding
    whitespace removed; the longest matching prefix wins. Otherwise the
    trimmed name is returned unchanged.
    """
    crop_types = (
        # Tomatoes
        "Salattomate", "Stabtomate", "Buschtomate", "Cocktailtomate",
        "Cherrytomate", "Fleischtomate", "Wildtomate", "Balkontomate",
        "Flaschentomate", "Eitomate", "Datteltomate", "Tomate",
        # Lettuce
        "Winterkopfsalat", "Kopfsalat", "Bataviasalat", "Eissalat",
        "Blattsalat", "Schnittsalat", "Pflücksalat", "Römersalat",
        "Spargelsalat", "Romanasalat",
        # Beans
        "Buschbohne", "Stangenbohne", "Feuerbohne", "Puffbohne",
        "Prunkbohne",
        # Peas
        "Markerbse", "Zuckererbse", "Palerbse", "Schalerbse",
        "Knackerbse", "Kapuzinererbse",
        # Cucumbers
        "Einlegegurke", "Salatgurke", "Schälgurke", "Landgurke",
        "Freilandgurke",
        # Squash
        "Hokkaidokürbis", "Butternutkürbis", "Speisekürbis",
        "Riesenkürbis", "Zierkürbis", "Muskatkürbis", "Ölkürbis",
        # Melon
        "Wassermelone", "Zuckermelone",
        # Peppers
        "Gemüsepaprika", "Blockpaprika", "Spitzpaprika", "Tomatenpaprika",
        "Snackpaprika", "Peperoni", "Chili",
        # Brassicas
        "Kohlrabi", "Brokkoli", "Blumenkohl", "Grünkohl", "Rosenkohl",
        "Wirsing", "Rotkohl", "Weißkohl", "Spitzkohl", "Palmkohl",
        "Chinakohl", "Pak Choi", "Markstammkohl",
        # Root veg
        "Möhre", "Karotte", "Pastinake", "Rote Bete", "Rote Beete",
        "Herbstrübe", "Mairübe", "Stoppelrübe", "Schwarzer Rettich",
        "Steckrübe", "Knollensellerie", "Petersilienwurzel",
        "Rettich", "Radieschen",
        # Onions
        "Winterheckenzwiebel", "Lauchzwiebel", "Speisezwiebel",
        "Schalotte", "Wintersteckzwiebel", "Zwiebel",
        # Herbs
        "Rotes Basilikum", "Buschbasilikum", "Zitronen-Basilikum",
        "Thai-Basilikum", "Wildes Basilikum", "Zimtbasilikum",
        "Basilikum", "Schnittknoblauch",
        # Grains
        "Sommerweizen", "Winterweizen", "Sommerroggen", "Winterroggen",
        "Nackthafer", "Nacktgerste", "Dinkel", "Emmer", "Einkorn",
        # Misc
        "Zuckermais", "Popcornmais",
        "Zucchini",
    )

    trimmed = product_name.strip()
    applicable = [word for word in crop_types if trimmed.startswith(word + " ")]
    if not applicable:
        return trimmed
    # Longest prefix first, so the most specific crop word is removed.
    longest = max(applicable, key=len)
    return trimmed[len(longest):].strip()
|
|
|
|
|
|
def get_existing_supplier_links(cultivar_id, supplier_id):
    """Report whether the cultivar is already linked to the given supplier.

    Returns True if any entry returned by ``/cultivars/{id}/suppliers`` has a
    matching ``supplier_id``; False otherwise, including when the request
    fails or returns nothing.
    """
    links = api_request("GET", f"/cultivars/{cultivar_id}/suppliers")
    if not links:
        return False
    return any(entry["supplier_id"] == supplier_id for entry in links)
|
|
|
|
|
|
def _pack_fields(pack_info):
    """Parse 'ca. <n> <unit>' from pack info into pack_size/pack_unit fields."""
    if not pack_info:
        return {}
    hit = re.search(r"ca\.?\s*(\d+)\s*(Pfl|Korn|Samen|g|kg|ml)", pack_info)
    if not hit:
        return {}
    unit = hit.group(2)
    unit_names = {"Pfl": "Pflanzen", "Korn": "Korn", "Samen": "Korn"}
    return {
        "pack_size": float(hit.group(1)),
        "pack_unit": unit_names.get(unit, unit),
    }


def _resolve_cultivar(product, species_id, cultivar_cache):
    """Return the cultivar record for a product, creating it if needed; None on failure."""
    cultivar_name = extract_cultivar_name(product["name"])
    name_key = cultivar_name.lower().strip()
    cv_key = (species_id, name_key)

    if cv_key in cultivar_cache:
        stats["cultivars_existing"] += 1
        return cultivar_cache[cv_key]

    payload = {
        "species_id": species_id,
        "name": cultivar_name,
        "is_organic": True,
    }
    if product.get("description"):
        payload["description"] = product["description"]

    created = api_request("POST", "/cultivars", payload)
    if created:
        cultivar_cache[cv_key] = created
        stats["cultivars_created"] += 1
        return created

    # POST failed - it might already exist from a previous run, so search
    # the cultivars of this species.
    for candidate in paginated_get(f"/cultivars?species_id={species_id}"):
        if candidate["name"].lower().strip() == name_key:
            cultivar_cache[cv_key] = candidate
            stats["cultivars_existing"] += 1
            return candidate

    stats["cultivar_create_errors"] += 1
    return None


def _import_product(product, supplier_id, families, species_cache, cultivar_cache):
    """Import one scraped product: species, cultivar, then the supplier link."""
    botanical = product.get("botanical_name", "")
    if not botanical:
        stats["no_botanical"] += 1
        return

    species = find_or_create_species(botanical, families, species_cache)
    if not species:
        stats["species_not_matched"] += 1
        return

    cultivar = _resolve_cultivar(product, species["id"], cultivar_cache)
    if cultivar is None:
        return

    # Check for an existing link first so re-runs stay idempotent.
    if get_existing_supplier_links(cultivar["id"], supplier_id):
        stats["supplier_links_existing"] += 1
        return

    link_data = {
        "supplier_id": supplier_id,
        "article_number": product.get("article_number", ""),
        "product_url": product.get("url", ""),
        "price_eur": product.get("price"),
    }
    link_data.update(_pack_fields(product.get("pack_info", "")))

    outcome = api_request("POST", f"/cultivars/{cultivar['id']}/suppliers", link_data)
    if outcome:
        stats["supplier_links_created"] += 1
    else:
        stats["supplier_link_errors"] += 1


def main():
    """Run the full pipeline: supplier, API data, sitemap, scrape, import, summary.

    Exits with status 1 when the supplier cannot be found/created or the
    sitemap yields no URLs.
    """
    banner = "=" * 60
    print(banner)
    print("Dreschflegel Seed Catalog Scraper for HerbAPI (v2)")
    print(banner)

    # Step 1: Supplier
    print("\n[1] Setting up supplier...")
    supplier = ensure_supplier()
    if not supplier:
        print("FATAL: Could not create/find supplier")
        sys.exit(1)
    supplier_id = supplier["id"]

    # Step 2: Load API data
    print("\n[2] Loading existing HerbAPI data...")
    families, species_cache, cultivar_cache = load_api_data()

    # Step 3: Get product URLs
    print("\n[3] Fetching sitemap...")
    all_urls = get_sitemap_urls()
    if not all_urls:
        print("FATAL: Could not fetch sitemap")
        sys.exit(1)
    candidate_urls = classify_urls(all_urls)
    print(f"  {len(all_urls)} total URLs, {len(candidate_urls)} product candidates")

    # Step 4: Scrape
    print("\n[4] Scraping product pages...")
    products = scrape_all_products(candidate_urls)

    # Step 5: Import
    total = len(products)
    print(f"\n[5] Importing {total} products into HerbAPI...")
    for position, product in enumerate(products, start=1):
        if position % 50 == 0:
            print(f"  Processing {position}/{total}...")
        _import_product(product, supplier_id, families, species_cache, cultivar_cache)

    # Summary
    print("\n" + banner)
    print("RESULTS")
    print(banner)
    for key, val in sorted(stats.items()):
        print(f"  {key}: {val}")
    print(f"\n  Total species in DB: {len(species_cache)}")
    print(f"  Total cultivars tracked: {len(cultivar_cache)}")


if __name__ == "__main__":
    main()
|