Add scraper and enrichment scripts to tools/ directory

This commit is contained in:
2026-03-16 11:10:18 +01:00
parent 83ab8c4cf9
commit 0ef902cc91
13 changed files with 6031 additions and 0 deletions
+156
View File
@@ -0,0 +1,156 @@
#!/usr/bin/env python3
"""Enrich HerbAPI species with Wikidata QID, GBIF ID, and EPPO code."""
import json
import time
import urllib.parse
import urllib.request
# --- Configuration ---
# Base URL of the HerbAPI REST API; request paths are appended to it.
HERBAPI_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
# Bearer token sent in the Authorization header of every HerbAPI request.
# NOTE(review): this looks like a live credential committed to version
# control - consider rotating it and loading it from the environment.
HERBAPI_TOKEN = "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
# Wikidata SPARQL query endpoint.
WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"
# Headers for Wikidata requests: an identifying User-Agent and a JSON Accept.
HEADERS_WD = {
    "User-Agent": "HerbAPI-Enrichment/1.0 (florian.berthold@sub-net.at)",
    "Accept": "application/json",
}
def herbapi_request(path, method="GET", data=None, timeout=30):
    """Send an authenticated JSON request to HerbAPI and return the parsed reply.

    Args:
        path: Path (including any query string) appended to ``HERBAPI_BASE``.
        method: HTTP method name.
        data: JSON-serialisable payload, or ``None`` to send no request body.
        timeout: Socket timeout in seconds, so a stalled server cannot hang
            the script forever.

    Returns:
        The response body decoded from JSON.

    Raises:
        urllib.error.URLError: On connection failure or HTTP error status
            (``HTTPError`` is a subclass).
    """
    url = f"{HERBAPI_BASE}{path}"
    # Compare against None: an empty payload ({} or []) is still a payload
    # and must be sent rather than silently dropped.
    body = json.dumps(data).encode() if data is not None else None
    req = urllib.request.Request(url, data=body, method=method, headers={
        "Authorization": f"Bearer {HERBAPI_TOKEN}",
        "Content-Type": "application/json",
    })
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())
def _sparql_string(text):
    """Return ``text`` as a double-quoted SPARQL string literal, escaped."""
    escaped = (
        str(text)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )
    return f'"{escaped}"'


def query_wikidata_batch(names):
    """Query Wikidata for a batch of scientific names.

    Args:
        names: Iterable of taxon names to match against property P225.

    Returns:
        Dict mapping each matched name to
        ``{"qid": ..., "gbif_id": ..., "eppo_code": ...}``; ``gbif_id`` and
        ``eppo_code`` are ``None`` when Wikidata has no value. If several
        result rows share a name, the last row wins. Unmatched names are
        absent. An empty batch returns ``{}`` without contacting Wikidata.

    Raises:
        urllib.error.URLError: If the SPARQL endpoint cannot be queried.
    """
    names = list(names)
    if not names:
        return {}
    # Escape each name so a quote or backslash cannot break the query.
    values = " ".join(_sparql_string(n) for n in names)
    sparql = f"""SELECT ?name ?item ?gbifId ?eppoCode WHERE {{
VALUES ?name {{ {values} }}
?item wdt:P225 ?name .
OPTIONAL {{ ?item wdt:P846 ?gbifId }}
OPTIONAL {{ ?item wdt:P3031 ?eppoCode }}
}}"""
    encoded = urllib.parse.quote(sparql)
    url = f"{WIKIDATA_SPARQL}?query={encoded}&format=json"
    req = urllib.request.Request(url, headers=HEADERS_WD)
    with urllib.request.urlopen(req, timeout=60) as resp:
        data = json.loads(resp.read())
    results = {}
    for binding in data.get("results", {}).get("bindings", []):
        name = binding["name"]["value"]
        # The item value is an entity URL; the QID is its last path segment.
        qid = binding["item"]["value"].rsplit("/", 1)[-1]
        gbif = binding.get("gbifId", {}).get("value")
        eppo = binding.get("eppoCode", {}).get("value")
        results[name] = {"qid": qid, "gbif_id": gbif, "eppo_code": eppo}
    return results
def main():
    """Fill in missing Wikidata QID, GBIF ID and EPPO code on HerbAPI species.

    Loads the species list, looks up every species lacking at least one
    identifier on Wikidata in batches, writes newly found identifiers back
    through the API and prints a summary. Identifiers that are already set
    in HerbAPI are never overwritten.
    """
    identifier_fields = ("wikidata_qid", "gbif_id", "eppo_code")

    # 1. Fetch all species
    all_species = herbapi_request("/species?per_page=200")["data"]
    print(f"Fetched {len(all_species)} species from HerbAPI\n")

    # 2. Keep the species that have at least one empty identifier
    pending = [
        sp for sp in all_species
        if not all(sp[field] for field in identifier_fields)
    ]
    if not pending:
        print("All species already enriched.")
        return
    print(f"{len(pending)} species need enrichment\n")

    # 3. Query Wikidata in batches; a failed batch is reported and skipped
    batch_size = 20
    lookup = {}
    sci_names = [sp["name_scientific"] for sp in pending]
    starts = range(0, len(sci_names), batch_size)
    for batch_no, start in enumerate(starts, start=1):
        chunk = sci_names[start:start + batch_size]
        print(f"Querying Wikidata batch {batch_no}: {len(chunk)} species...")
        try:
            found = query_wikidata_batch(chunk)
            lookup.update(found)
            print(f" Got {len(found)} matches")
        except Exception as exc:
            print(f" ERROR: {exc}")
        # Pause between batches, but not after the last one.
        if start + batch_size < len(sci_names):
            time.sleep(2)
    print(f"\nWikidata returned data for {len(lookup)} / {len(sci_names)} species\n")

    # 4. Update HerbAPI - GET full object by slug, merge, PUT by UUID
    tally = {"updated": 0, "skipped": 0, "not_found": 0, "errors": 0}
    for sp in pending:
        sci_name = sp["name_scientific"]
        match = lookup.get(sci_name)
        if not match:
            print(f" SKIP (no Wikidata match): {sci_name}")
            tally["not_found"] += 1
            continue

        # (HerbAPI field, log label, Wikidata value) for each identifier;
        # keep those that are empty in HerbAPI and present on Wikidata.
        candidates = (
            ("wikidata_qid", "qid", match["qid"]),
            ("gbif_id", "gbif", match["gbif_id"]),
            ("eppo_code", "eppo", match["eppo_code"]),
        )
        changes = [
            (field, label, value)
            for field, label, value in candidates
            if not sp[field] and value
        ]
        if not changes:
            print(f" SKIP (nothing new): {sci_name}")
            tally["skipped"] += 1
            continue

        try:
            # GET full species by slug for the complete object
            record = herbapi_request(f"/species/{sp['slug']}")
            # Remove read-only fields before sending the object back
            species_id = record.pop("id")
            for read_only in ("slug", "created_at", "updated_at"):
                record.pop(read_only, None)
            for field, _label, value in changes:
                # API expects the GBIF id as a string
                record[field] = str(value) if field == "gbif_id" else value
            # PUT by UUID
            herbapi_request(f"/species/{species_id}", method="PUT", data=record)
            summary = ", ".join(f"{label}={value}" for _field, label, value in changes)
            print(f" UPDATED: {sci_name} -> {summary}")
            tally["updated"] += 1
        except Exception as exc:
            print(f" ERROR updating {sci_name}: {exc}")
            tally["errors"] += 1

    print(f"\n{'=' * 60}")
    print("RESULTS:")
    print(f" Updated: {tally['updated']}")
    print(f" Skipped (no new data): {tally['skipped']}")
    print(f" Not found on Wikidata: {tally['not_found']}")
    print(f" Errors: {tally['errors']}")
    print(f" Total species: {len(all_species)}")


if __name__ == "__main__":
    main()
+305
View File
@@ -0,0 +1,305 @@
#!/usr/bin/env python3
"""Expand HerbAPI species database with common permaculture/garden species."""
import json
import time
import urllib.request
import urllib.parse
import urllib.error
import ssl
# Base URL of the HerbAPI REST API; request paths are appended to it.
BASE_URL = "http://herbapi01.corp.sub-net.at:8080/api/v1"
# Authorization header value sent with every HerbAPI request.
# NOTE(review): this looks like a live credential committed to version
# control - consider rotating it and loading it from the environment.
AUTH = "Bearer km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
# Pause in seconds after each create (POST) request issued by main().
DELAY = 0.15
# SSL context for GBIF (https)
ssl_ctx = ssl.create_default_context()
def api_get(path):
    """GET ``path`` from HerbAPI and return the JSON-decoded response body."""
    request = urllib.request.Request(
        f"{BASE_URL}{path}",
        headers={"Authorization": AUTH},
    )
    with urllib.request.urlopen(request) as response:
        raw = response.read()
    return json.loads(raw)
def api_post(path, data):
    """POST ``data`` as JSON to HerbAPI.

    Args:
        path: Path appended to ``BASE_URL``.
        data: JSON-serialisable payload.

    Returns:
        ``(parsed_json, status)`` on success. On an HTTP error status the
        error body is printed and ``(None, error_code)`` is returned.
        Connection-level failures are not caught here.
    """
    body = json.dumps(data).encode()
    req = urllib.request.Request(
        f"{BASE_URL}{path}",
        data=body,
        headers={"Authorization": AUTH, "Content-Type": "application/json"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(req) as resp:
            return json.loads(resp.read()), resp.status
    except urllib.error.HTTPError as e:
        # Decode leniently: the error handler must not itself fail with
        # UnicodeDecodeError when the server sends a non-UTF-8 body.
        err_body = e.read().decode("utf-8", errors="replace")
        print(f" ERROR {e.code}: {err_body}")
        return None, e.code
def gbif_get_german_name(scientific_name):
    """Return the first German vernacular name GBIF lists for a species.

    The name is resolved through GBIF's ``species/match`` endpoint, then the
    vernacular names of the matched usage are scanned for language code
    ``deu``. Returns ``None`` when there is no match, no German name, or the
    lookup fails (failures are printed, not raised).
    """
    def _get_json(url):
        # Both GBIF calls share the same context and 10 s timeout.
        request = urllib.request.Request(url)
        with urllib.request.urlopen(request, context=ssl_ctx, timeout=10) as resp:
            return json.loads(resp.read())

    try:
        quoted_name = urllib.parse.quote(scientific_name)
        match = _get_json(f"https://api.gbif.org/v1/species/match?name={quoted_name}")
        usage_key = match.get("usageKey")
        if not usage_key:
            return None
        names = _get_json(
            f"https://api.gbif.org/v1/species/{usage_key}/vernacularNames?limit=100"
        )
        return next(
            (
                entry["vernacularName"]
                for entry in names.get("results", [])
                if entry.get("language") == "deu"
            ),
            None,
        )
    except Exception as exc:
        print(f" GBIF lookup failed for {scientific_name}: {exc}")
        return None
# ── Families to ensure exist ─────────────────────────────────────────
# Maps the scientific family name to its English and German common names.
# main() creates every family listed here that HerbAPI does not have yet.
FAMILIES_NEEDED = {
    "Fabaceae": {"name_en": "Legumes", "name_de": "Hülsenfrüchtler"},
    "Solanaceae": {"name_en": "Nightshade family", "name_de": "Nachtschattengewächse"},
    "Cucurbitaceae": {"name_en": "Gourd family", "name_de": "Kürbisgewächse"},
    "Asteraceae": {"name_en": "Daisy family", "name_de": "Korbblütler"},
    "Chenopodiaceae": {"name_en": "Goosefoot family", "name_de": "Gänsefußgewächse"},
    "Brassicaceae": {"name_en": "Cabbage family", "name_de": "Kreuzblütler"},
    "Amaryllidaceae": {"name_en": "Amaryllis family", "name_de": "Amaryllisgewächse"},
    "Apiaceae": {"name_en": "Carrot family", "name_de": "Doldenblütler"},
    "Poaceae": {"name_en": "Grass family", "name_de": "Süßgräser"},
    "Lamiaceae": {"name_en": "Mint family", "name_de": "Lippenblütler"},
    "Caprifoliaceae": {"name_en": "Honeysuckle family", "name_de": "Geißblattgewächse"},
    "Rosaceae": {"name_en": "Rose family", "name_de": "Rosengewächse"},
    "Grossulariaceae": {"name_en": "Gooseberry family", "name_de": "Stachelbeergewächse"},
    "Ericaceae": {"name_en": "Heath family", "name_de": "Heidekrautgewächse"},
    "Moraceae": {"name_en": "Mulberry family", "name_de": "Maulbeergewächse"},
    # New families not yet in the DB:
    "Hypericaceae": {"name_en": "St John's wort family", "name_de": "Johanniskrautgewächse"},
    "Tropaeolaceae": {"name_en": "Nasturtium family", "name_de": "Kapuzinerkressengewächse"},
    "Elaeagnaceae": {"name_en": "Oleaster family", "name_de": "Ölweidengewächse"},
}
# ── Species to add ───────────────────────────────────────────────────
# Format: (scientific_name, family, name_en, name_de, plant_layer, extra_fields)
# - family must be a key that main() can resolve to a family id (an existing
#   family or one listed in FAMILIES_NEEDED); otherwise the species is skipped.
# - extra_fields is a dict copied key-for-key into the POST payload.
SPECIES = [
    # Vegetables
    ("Phaseolus vulgaris", "Fabaceae", "common bean", "Gartenbohne", "herbaceous",
     {"nitrogen_fixer": True, "food_uses": "Beans (pods, seeds)"}),
    ("Phaseolus coccineus", "Fabaceae", "runner bean", "Feuerbohne", "herbaceous",
     {"nitrogen_fixer": True, "food_uses": "Beans (pods, seeds), flowers", "attracts_pollinators": True}),
    ("Pisum sativum", "Fabaceae", "pea", "Erbse", "herbaceous",
     {"nitrogen_fixer": True, "food_uses": "Peas, shoots"}),
    ("Capsicum annuum", "Solanaceae", "pepper", "Paprika", "herbaceous",
     {"food_uses": "Fruit"}),
    ("Cucumis sativus", "Cucurbitaceae", "cucumber", "Gurke", "ground_cover",
     {"food_uses": "Fruit"}),
    ("Cucurbita maxima", "Cucurbitaceae", "winter squash", "Riesenkürbis", "ground_cover",
     {"food_uses": "Fruit, seeds, flowers"}),
    ("Cucurbita moschata", "Cucurbitaceae", "butternut squash", "Moschuskürbis", "ground_cover",
     {"food_uses": "Fruit, seeds"}),
    ("Lactuca sativa", "Asteraceae", "lettuce", "Salat", "herbaceous",
     {"food_uses": "Leaves"}),
    ("Spinacia oleracea", "Chenopodiaceae", "spinach", "Spinat", "herbaceous",
     {"food_uses": "Leaves"}),
    ("Brassica oleracea", "Brassicaceae", "cabbage / kale", "Kohl", "herbaceous",
     {"food_uses": "Leaves, flower buds, stems"}),
    ("Brassica rapa", "Brassicaceae", "turnip", "Rübe", "herbaceous",
     {"food_uses": "Root, leaves"}),
    ("Raphanus sativus", "Brassicaceae", "radish", "Rettich", "herbaceous",
     {"food_uses": "Root, leaves, seed pods"}),
    ("Allium cepa", "Amaryllidaceae", "onion", "Zwiebel", "herbaceous",
     {"food_uses": "Bulb, leaves"}),
    ("Allium sativum", "Amaryllidaceae", "garlic", "Knoblauch", "herbaceous",
     {"food_uses": "Bulb, scapes", "medicinal_uses": "Antimicrobial, cardiovascular"}),
    ("Allium schoenoprasum", "Amaryllidaceae", "chives", "Schnittlauch", "herbaceous",
     {"food_uses": "Leaves, flowers", "attracts_pollinators": True}),
    ("Petroselinum crispum", "Apiaceae", "parsley", "Petersilie", "herbaceous",
     {"food_uses": "Leaves, root"}),
    ("Apium graveolens", "Apiaceae", "celery", "Sellerie", "herbaceous",
     {"food_uses": "Stalks, root, leaves"}),
    ("Foeniculum vulgare", "Apiaceae", "fennel", "Fenchel", "herbaceous",
     {"food_uses": "Bulb, fronds, seeds", "attracts_beneficial_insects": True}),
    ("Pastinaca sativa", "Apiaceae", "parsnip", "Pastinake", "herbaceous",
     {"food_uses": "Root"}),
    ("Zea mays", "Poaceae", "corn", "Mais", "herbaceous",
     {"food_uses": "Kernels, cobs"}),
    ("Solanum melongena", "Solanaceae", "eggplant", "Melanzani", "herbaceous",
     {"food_uses": "Fruit"}),
    # Herbs
    ("Ocimum basilicum", "Lamiaceae", "basil", "Basilikum", "herbaceous",
     {"food_uses": "Leaves", "attracts_pollinators": True}),
    ("Origanum vulgare", "Lamiaceae", "oregano", "Oregano", "herbaceous",
     {"food_uses": "Leaves", "attracts_pollinators": True, "attracts_beneficial_insects": True}),
    ("Mentha x piperita", "Lamiaceae", "peppermint", "Pfefferminze", "herbaceous",
     {"food_uses": "Leaves (tea, culinary)", "medicinal_uses": "Digestive, headache relief", "invasiveness": "spreading"}),
    ("Rosmarinus officinalis", "Lamiaceae", "rosemary", "Rosmarin", "herbaceous",
     {"food_uses": "Leaves", "attracts_pollinators": True}),
    ("Anethum graveolens", "Apiaceae", "dill", "Dill", "herbaceous",
     {"food_uses": "Leaves, seeds", "attracts_beneficial_insects": True}),
    ("Coriandrum sativum", "Apiaceae", "coriander", "Koriander", "herbaceous",
     {"food_uses": "Leaves, seeds", "attracts_beneficial_insects": True}),
    ("Artemisia absinthium", "Asteraceae", "wormwood", "Wermut", "herbaceous",
     {"medicinal_uses": "Digestive, anti-parasitic", "other_uses": "Companion plant pest deterrent", "allelopathic": True}),
    ("Achillea millefolium", "Asteraceae", "yarrow", "Schafgarbe", "herbaceous",
     {"food_uses": "Young leaves (salad)", "medicinal_uses": "Wound healing, anti-inflammatory",
      "dynamic_accumulator": True, "dynamic_accumulator_nutrients": "K, P, Cu",
      "attracts_beneficial_insects": True, "attracts_pollinators": True}),
    ("Hypericum perforatum", "Hypericaceae", "St John's wort", "Johanniskraut", "herbaceous",
     {"medicinal_uses": "Antidepressant, wound healing", "attracts_pollinators": True}),
    ("Echinacea purpurea", "Asteraceae", "echinacea", "Sonnenhut", "herbaceous",
     {"medicinal_uses": "Immune stimulant", "attracts_pollinators": True, "wildlife_value": "Seeds for birds"}),
    ("Valeriana officinalis", "Caprifoliaceae", "valerian", "Baldrian", "herbaceous",
     {"medicinal_uses": "Sedative, sleep aid", "attracts_pollinators": True,
      "other_uses": "Earthworm attractant (biodynamic)"}),
    # Flowers & cover crops
    ("Tagetes patula", "Asteraceae", "French marigold", "Studentenblume", "herbaceous",
     {"other_uses": "Nematode suppression, companion plant", "attracts_pollinators": True}),
    ("Helianthus annuus", "Asteraceae", "sunflower", "Sonnenblume", "herbaceous",
     {"food_uses": "Seeds, oil", "attracts_pollinators": True, "wildlife_value": "Seeds for birds"}),
    ("Tropaeolum majus", "Tropaeolaceae", "nasturtium", "Kapuzinerkresse", "ground_cover",
     {"food_uses": "Leaves, flowers, seeds (capers)", "other_uses": "Trap crop for aphids"}),
    ("Centaurea cyanus", "Asteraceae", "cornflower", "Kornblume", "herbaceous",
     {"food_uses": "Flowers (edible garnish)", "attracts_pollinators": True, "attracts_beneficial_insects": True}),
    ("Sinapis alba", "Brassicaceae", "white mustard", "Weißer Senf", "herbaceous",
     {"food_uses": "Seeds, young leaves", "other_uses": "Green manure, biofumigant"}),
    ("Trifolium repens", "Fabaceae", "white clover", "Weißklee", "ground_cover",
     {"nitrogen_fixer": True, "food_uses": "Flowers (tea), young leaves",
      "ground_cover_quality": "excellent", "attracts_pollinators": True}),
    ("Medicago sativa", "Fabaceae", "alfalfa", "Luzerne", "herbaceous",
     {"nitrogen_fixer": True, "food_uses": "Sprouts",
      "dynamic_accumulator": True, "dynamic_accumulator_nutrients": "N, K, Ca, Mg, Fe",
      "other_uses": "Green manure, deep-rooting soil improver"}),
    # Fruit / Trees
    ("Prunus avium", "Rosaceae", "sweet cherry", "Süßkirsche", "canopy",
     {"food_uses": "Fruit", "attracts_pollinators": True, "wildlife_value": "Fruit for birds"}),
    ("Prunus cerasus", "Rosaceae", "sour cherry", "Sauerkirsche", "understory",
     {"food_uses": "Fruit (cooking, preserves)", "attracts_pollinators": True}),
    ("Pyrus communis", "Rosaceae", "pear", "Birne", "canopy",
     {"food_uses": "Fruit", "attracts_pollinators": True}),
    ("Ribes uva-crispa", "Grossulariaceae", "gooseberry", "Stachelbeere", "shrub",
     {"food_uses": "Berries"}),
    ("Rubus fruticosus", "Rosaceae", "blackberry", "Brombeere", "shrub",
     {"food_uses": "Berries, leaves (tea)", "attracts_pollinators": True,
      "wildlife_value": "Berries for birds, nesting habitat", "invasiveness": "spreading"}),
    ("Vaccinium myrtillus", "Ericaceae", "bilberry", "Heidelbeere", "shrub",
     {"food_uses": "Berries", "medicinal_uses": "Antioxidant, eye health"}),
    ("Hippophae rhamnoides", "Elaeagnaceae", "sea buckthorn", "Sanddorn", "shrub",
     {"nitrogen_fixer": True, "food_uses": "Berries (juice, oil)",
      "medicinal_uses": "High vitamin C, skin care",
      "other_uses": "Erosion control, windbreak"}),
    ("Morus alba", "Moraceae", "white mulberry", "Weiße Maulbeere", "canopy",
     {"food_uses": "Fruit, young leaves", "wildlife_value": "Fruit for birds"}),
]
def main():
    """Create missing plant families, then add every curated species not yet present.

    Progress is printed for each family and species, followed by a summary
    of what was created, skipped or failed.
    """
    # 1. Load existing families (name_scientific -> id)
    print("=== Loading existing families ===")
    family_ids = {
        fam["name_scientific"]: fam["id"]
        for fam in api_get("/families?per_page=100")["data"]
    }
    print(f" Found {len(family_ids)} existing families")

    # 2. Create missing families
    print("\n=== Creating missing families ===")
    families_created = 0
    for fam_name, fam_info in FAMILIES_NEEDED.items():
        if fam_name in family_ids:
            print(f" SKIP (exists): {fam_name}")
            continue
        family_payload = {"name_scientific": fam_name}
        for key in ("name_en", "name_de"):
            family_payload[key] = fam_info[key]
        print(f" CREATE: {fam_name} ...", end=" ")
        reply, status = api_post("/families", family_payload)
        if not (reply and "id" in reply):
            print(f"FAILED (status={status})")
        else:
            # Remember the new id so species of this family can reference it.
            family_ids[fam_name] = reply["id"]
            print(f"OK ({reply['id']})")
            families_created += 1
        time.sleep(DELAY)
    print(f"\n Families created: {families_created}")

    # 3. Load existing species
    print("\n=== Loading existing species ===")
    known_species = {
        sp["name_scientific"] for sp in api_get("/species?per_page=200")["data"]
    }
    print(f" Found {len(known_species)} existing species")

    # 4. Add new species
    print("\n=== Adding new species ===")
    counts = {"created": 0, "skipped": 0, "failed": 0}
    for sci_name, family, name_en, name_de, plant_layer, extras in SPECIES:
        if sci_name in known_species:
            print(f" SKIP (exists): {sci_name}")
            counts["skipped"] += 1
            continue

        # Look up family ID; a species without a family counts as failed
        family_id = family_ids.get(family)
        if not family_id:
            print(f" SKIP (no family '{family}'): {sci_name}")
            counts["failed"] += 1
            continue

        # The GBIF German name is only logged for comparison; the curated
        # name_de is what gets stored.
        gbif_name = gbif_get_german_name(sci_name)
        if gbif_name:
            print(f" GBIF name for {sci_name}: {gbif_name}")

        species_payload = {
            "name_scientific": sci_name,
            "family_id": family_id,
            "name_en": name_en,
            "name_de": name_de,
            "plant_layer": plant_layer,
        }
        # Add extra fields
        species_payload.update(extras)

        print(f" CREATE: {sci_name} ({name_de}) ...", end=" ")
        reply, status = api_post("/species", species_payload)
        if reply and "id" in reply:
            print(f"OK ({reply['id']})")
            counts["created"] += 1
        else:
            print(f"FAILED (status={status})")
            counts["failed"] += 1
        time.sleep(DELAY)

    print(f"\n{'='*50}")
    print("SUMMARY")
    print(f" Families created: {families_created}")
    print(f" Species created: {counts['created']}")
    print(f" Species skipped: {counts['skipped']}")
    print(f" Species failed: {counts['failed']}")
    print(f" Total species now: {len(known_species) + counts['created']}")


if __name__ == "__main__":
    main()
+362
View File
@@ -0,0 +1,362 @@
#!/usr/bin/env python3
"""Import CC-licensed plant images from Wikimedia Commons via Wikidata into HerbAPI."""
import json
import os
import re
import subprocess
import sys
import time
import urllib.parse
import urllib.request
# Switch stdout/stderr to line buffering so each progress line is flushed
# as soon as it is printed.
sys.stdout.reconfigure(line_buffering=True)
sys.stderr.reconfigure(line_buffering=True)
# --- Configuration ---
# S3 target used by s3_upload() (passed to the AWS CLI).
# NOTE(review): the S3 keys and the database password below are committed
# in source - consider rotating them and loading them from the environment.
S3_ENDPOINT = "http://garage.sub-net.at:3900"
S3_BUCKET = "herbapi"
S3_ACCESS_KEY = "GK1a89859373a6ac56bf11958f"
S3_SECRET_KEY = "bea45a333b5c7b1efdd7466bdbcac54d8642fa19f0c617ca2fd64bd07951b899"
S3_REGION = "garage"
# PostgreSQL connection settings used by psql().
DB_HOST = "10.31.3.90"
DB_USER = "herbapi"
DB_PASS = "_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj"
DB_NAME = "herbapi"
# User-Agent header sent with every outgoing HTTP request.
USER_AGENT = "HerbAPI/1.0 (https://herbapi.naturalised.at; florian.berthold@sub-net.at)"
# Thumbnail width in pixels requested from the Commons API (iiurlwidth).
THUMB_WIDTH = 800
# Pause in seconds before each Wikidata/Commons/download request.
REQUEST_DELAY = 0.3
# Lower-cased licence short names accepted by exact match in
# is_license_allowed(): CC0, public domain, CC BY and CC BY-SA variants.
ALLOWED_LICENSES = {
    "cc0", "cc-zero", "cc0 1.0", "cc-zero 1.0",
    "public domain", "pd", "pd-self", "pd-old", "pd-old-auto", "pd-old-100",
    "pd-us", "pd-usgov", "pd-author",
    "cc by 1.0", "cc by 2.0", "cc by 2.5", "cc by 3.0", "cc by 4.0",
    "cc-by-1.0", "cc-by-2.0", "cc-by-2.5", "cc-by-3.0", "cc-by-4.0",
    "cc by-sa 1.0", "cc by-sa 2.0", "cc by-sa 2.5", "cc by-sa 3.0", "cc by-sa 4.0",
    "cc-by-sa-1.0", "cc-by-sa-2.0", "cc-by-sa-2.5", "cc-by-sa-3.0", "cc-by-sa-4.0",
}
def slugify(name: str) -> str:
    """Turn a scientific name into a lower-case, hyphen-separated slug.

    Every run of ASCII letters or digits becomes one slug segment; anything
    else acts as a separator, so no leading or trailing hyphen is produced.
    """
    segments = re.findall(r'[a-z0-9]+', name.lower())
    return '-'.join(segments)
def psql(query: str) -> str:
    """Execute ``query`` with the psql CLI and return its stripped stdout.

    The password is handed over through the PGPASSWORD environment variable.
    A non-zero exit status is reported on stderr; stdout is returned anyway.
    """
    command = ["psql", "-h", DB_HOST, "-U", DB_USER, DB_NAME, "-t", "-A", "-c", query]
    proc = subprocess.run(
        command,
        capture_output=True,
        text=True,
        env={**os.environ, "PGPASSWORD": DB_PASS},
    )
    if proc.returncode != 0:
        print(f" psql error: {proc.stderr.strip()}", file=sys.stderr)
    return proc.stdout.strip()
def fetch_json(url: str) -> dict | None:
    """Download ``url`` with the project User-Agent and decode it as JSON.

    Any failure (network, HTTP status, invalid JSON) is printed and turned
    into a ``None`` return value.
    """
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            raw = response.read()
        return json.loads(raw)
    except Exception as exc:
        print(f" HTTP error fetching {url}: {exc}")
        return None
def get_wikidata_image(qid: str) -> str | None:
"""Query Wikidata SPARQL for P18 image filename."""
sparql = f"SELECT ?image WHERE {{ wd:{qid} wdt:P18 ?image }} LIMIT 1"
url = "https://query.wikidata.org/sparql?" + urllib.parse.urlencode({
"query": sparql, "format": "json"
})
data = fetch_json(url)
if not data:
return None
bindings = data.get("results", {}).get("bindings", [])
if not bindings:
return None
image_url = bindings[0]["image"]["value"]
# URL like http://commons.wikimedia.org/wiki/Special:FilePath/Filename.jpg
filename = urllib.parse.unquote(image_url.rsplit("/", 1)[-1])
return filename
def get_commons_info(filename: str) -> dict | None:
    """Look up thumbnail URL, licence and artist of a Commons file.

    Returns a dict with the keys ``thumb_url``, ``description_url``,
    ``license``, ``artist`` and ``filename``, or ``None`` when the request
    fails, the file page does not exist, or it carries no image info.
    Only the first page of the API reply is inspected.
    """
    params = {
        "action": "query",
        "titles": f"File:{filename}",
        "prop": "imageinfo",
        "iiprop": "url|extmetadata",
        "iiurlwidth": str(THUMB_WIDTH),
        "format": "json",
    }
    data = fetch_json(
        "https://commons.wikimedia.org/w/api.php?" + urllib.parse.urlencode(params)
    )
    if not data:
        return None

    pages = data.get("query", {}).get("pages", {})
    if not pages:
        return None
    page_id, page = next(iter(pages.items()))
    # The API reports a missing title under the page id "-1".
    if page_id == "-1":
        return None
    revisions = page.get("imageinfo", [])
    if not revisions:
        return None

    info = revisions[0]
    meta = info.get("extmetadata", {})
    # Artist arrives as HTML: drop the tags, then collapse whitespace runs.
    artist_text = re.sub(r'<[^>]+>', '', meta.get("Artist", {}).get("value", "")).strip()
    return {
        # Fall back to the full-size URL when no thumbnail was generated.
        "thumb_url": info.get("thumburl") or info.get("url"),
        "description_url": info.get("descriptionurl", ""),
        "license": meta.get("LicenseShortName", {}).get("value", ""),
        "artist": re.sub(r'\s+', ' ', artist_text),
        "filename": filename,
    }
def is_license_allowed(license_str: str) -> bool:
    """Check whether a Commons licence short name is acceptable for import.

    Accepted: names listed in ``ALLOWED_LICENSES``, public-domain / ``pd``
    names, CC0, and CC BY / CC BY-SA with a version number. Anything with a
    NonCommercial or NoDerivatives element is rejected.

    Args:
        license_str: Licence short name; matching is case-insensitive.

    Returns:
        ``True`` if the licence is allowed, ``False`` otherwise.
    """
    normalized = license_str.lower().strip()
    # Reject NC / ND first. They are matched as whole tokens (not substrings)
    # so that words such as "england" or "pd-india" do not trigger a false
    # rejection; the spelled-out forms are rejected as well.
    if re.search(r'(?<![a-z])n[cd](?![a-z])', normalized):
        return False
    if re.search(r'non-?commercial|no-?deriv', normalized):
        return False
    # Direct match
    if normalized in ALLOWED_LICENSES:
        return True
    # Check patterns
    if normalized.startswith(("public domain", "pd")):
        return True
    if re.match(r'^cc[- ]?by[- ]?sa[- ]?\d', normalized):
        return True
    if re.match(r'^cc[- ]?by[- ]?\d', normalized):
        return True
    if re.match(r'^cc[- ]?0', normalized) or normalized == "cc zero":
        return True
    return False
def normalize_license(license_str: str) -> str:
    """Map a licence short name onto the canonical label stored in the DB.

    Public-domain variants become ``"Public domain"``, CC0 variants become
    ``"CC0 1.0"``, and CC BY / CC BY-SA names become ``"CC BY <version>"`` /
    ``"CC BY-SA <version>"``. Unrecognised names are returned unchanged.
    """
    low = license_str.lower().strip()

    if low.startswith("pd") or "public domain" in low:
        return "Public domain"

    if "cc-zero" in low or "cc zero" in low or re.match(r'^cc[- ]?0', low):
        return "CC0 1.0"

    # BY-SA must be tried before plain BY.
    versioned = (
        (r'^cc[- ]?by[- ]?sa[- ]?(\d+\.?\d*)', "CC BY-SA"),
        (r'^cc[- ]?by[- ]?(\d+\.?\d*)', "CC BY"),
    )
    for pattern, label in versioned:
        found = re.match(pattern, low)
        if found:
            return f"{label} {found.group(1)}"

    return license_str
def s3_upload(s3_key: str, data: bytes, content_type: str = "image/jpeg"):
    """Upload ``data`` to the S3 bucket using the AWS CLI.

    The bytes are staged in a uniquely named temporary file, which is always
    removed afterwards, and copied with ``aws s3 cp``.

    Args:
        s3_key: Object key inside ``S3_BUCKET``.
        data: Raw object content.
        content_type: MIME type stored with the object.

    Raises:
        RuntimeError: If the AWS CLI cannot be started or exits non-zero.
    """
    # Local import: keeps this function self-contained without touching the
    # file's import block.
    import tempfile

    env = {
        **os.environ,
        "AWS_ACCESS_KEY_ID": S3_ACCESS_KEY,
        "AWS_SECRET_ACCESS_KEY": S3_SECRET_KEY,
        "AWS_DEFAULT_REGION": S3_REGION,
    }
    # A unique temp file avoids the fixed, predictable /tmp path (collisions
    # between concurrent runs, symlink tricks); delete=False because the AWS
    # CLI reopens the file by name after we close it.
    tmp = tempfile.NamedTemporaryFile(prefix="herbapi_upload_", delete=False)
    try:
        with tmp:
            tmp.write(data)
        try:
            result = subprocess.run(
                [
                    "aws", "s3", "cp", tmp.name,
                    f"s3://{S3_BUCKET}/{s3_key}",
                    "--endpoint-url", S3_ENDPOINT,
                    "--content-type", content_type,
                ],
                capture_output=True, text=True, env=env
            )
        except OSError as exc:
            # E.g. the aws binary is missing: surface it as the same error
            # type callers already handle for failed uploads.
            raise RuntimeError(f"S3 upload failed: {exc}") from exc
    finally:
        os.unlink(tmp.name)
    if result.returncode != 0:
        raise RuntimeError(f"S3 upload failed: {result.stderr.strip()}")
def download_image(url: str) -> bytes | None:
    """Fetch the raw bytes behind ``url`` using the project User-Agent.

    A failed download is printed and reported as ``None``.
    """
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        with urllib.request.urlopen(request, timeout=60) as response:
            payload = response.read()
    except Exception as exc:
        print(f" Download error: {exc}")
        return None
    return payload
def main():
    """Import one Commons image for every species that has a QID but no image.

    For each such species: look up the P18 image on Wikidata, fetch licence
    and thumbnail information from Commons, skip disallowed licences,
    download the thumbnail, upload it to S3 and register it in the
    ``images`` table. A summary of all outcomes is printed at the end.
    """
    # 1. Get species
    rows = psql(
        "SELECT id, name_scientific, wikidata_qid FROM species "
        "WHERE wikidata_qid IS NOT NULL AND wikidata_qid <> '' "
        "ORDER BY name_scientific"
    )
    if not rows:
        print("No species with wikidata_qid found.")
        return
    species_list = []
    for line in rows.split("\n"):
        # psql -t -A emits one "id|name|qid" line per row.
        parts = line.split("|")
        if len(parts) == 3:
            species_list.append({"id": parts[0], "name": parts[1], "qid": parts[2]})
    print(f"Found {len(species_list)} species with Wikidata QIDs.")

    # 2. Get existing images
    existing_rows = psql("SELECT entity_id FROM images WHERE entity_type = 'species'")
    existing = {line.strip() for line in existing_rows.split("\n") if line.strip()}
    print(f"Found {len(existing)} species that already have images.")

    imported = 0
    skipped_existing = 0
    skipped_no_image = 0
    skipped_license = 0
    skipped_download = 0
    errors = 0
    content_types = {
        "jpg": "image/jpeg",
        "png": "image/png",
        "svg": "image/svg+xml",
        "gif": "image/gif",
    }

    for i, sp in enumerate(species_list):
        name = sp["name"]
        qid = sp["qid"]
        sp_id = sp["id"]
        slug = slugify(name)
        print(f"\n[{i+1}/{len(species_list)}] {name} ({qid})")
        if sp_id in existing:
            print(" Already has image, skipping.")
            skipped_existing += 1
            continue

        # Query Wikidata for image
        time.sleep(REQUEST_DELAY)
        filename = get_wikidata_image(qid)
        if not filename:
            print(" No image on Wikidata.")
            skipped_no_image += 1
            continue

        # Get Commons info
        time.sleep(REQUEST_DELAY)
        info = get_commons_info(filename)
        if not info:
            print(f" Could not get Commons info for {filename}")
            skipped_no_image += 1
            continue

        # Check license
        raw_license = info["license"]
        if not is_license_allowed(raw_license):
            print(f" License not allowed: {raw_license}")
            skipped_license += 1
            continue
        norm_license = normalize_license(raw_license)
        artist = info["artist"]
        thumb_url = info["thumb_url"]
        desc_url = info["description_url"]
        # Commons may report neither a thumbnail nor a file URL; slicing
        # None below would crash the whole run.
        if not thumb_url:
            print(" No thumbnail URL available.")
            skipped_download += 1
            continue
        print(f" License: {raw_license} -> {norm_license}")
        print(f" Artist: {artist[:80]}")
        print(f" Thumbnail: {thumb_url[:100]}...")

        # Download image
        time.sleep(REQUEST_DELAY)
        image_data = download_image(thumb_url)
        if not image_data:
            print(" Failed to download image.")
            skipped_download += 1
            continue
        print(f" Downloaded {len(image_data)} bytes")

        # Determine file extension from URL (first hit wins; default jpg)
        lowered_url = thumb_url.lower()
        ext = next(
            (cand for cand in ("png", "svg", "gif") if f".{cand}" in lowered_url),
            "jpg",
        )
        s3_key = f"species/{slug}.{ext}"
        content_type = content_types.get(ext, "image/jpeg")

        # Upload to S3
        try:
            s3_upload(s3_key, image_data, content_type)
            print(f" Uploaded to s3://{S3_BUCKET}/{s3_key}")
        except RuntimeError as e:
            print(f" S3 upload failed: {e}")
            errors += 1
            continue

        # Insert into database
        caption = f"Photo: {artist}" if artist else "Wikimedia Commons"

        def sql_quote(text):
            # Escape single quotes for SQL string literals
            return text.replace("'", "''")

        # RETURNING id makes a successful insert produce output, so an empty
        # result from psql() reliably signals that the insert failed.
        insert_sql = (
            f"INSERT INTO images (id, entity_type, entity_id, s3_key, caption, source_url, license, is_primary) "
            f"VALUES (gen_random_uuid(), 'species', '{sql_quote(sp_id)}', '{sql_quote(s3_key)}', "
            f"'{sql_quote(caption)}', '{sql_quote(desc_url)}', '{sql_quote(norm_license)}', true) "
            f"RETURNING id"
        )
        if not psql(insert_sql):
            print(" Database insert failed.")
            errors += 1
            continue
        print(" Inserted into images table.")
        imported += 1

    print(f"\n{'='*60}")
    print("DONE!")
    print(f" Imported: {imported}")
    print(f" Skipped (existing):{skipped_existing}")
    print(f" Skipped (no image):{skipped_no_image}")
    print(f" Skipped (license): {skipped_license}")
    print(f" Skipped (download):{skipped_download}")
    print(f" Errors: {errors}")
    print(f" Total processed: {len(species_list)}")


if __name__ == "__main__":
    main()
+290
View File
@@ -0,0 +1,290 @@
#!/usr/bin/env python3
"""Import CC-licensed plant images from Wikimedia Commons into HerbAPI."""
import hashlib
import json
import os
import re
import subprocess
import sys
import time
import urllib.parse
import urllib.request
# Config
# PostgreSQL connection settings used by psql().
# NOTE(review): the database password and the S3 keys below are committed
# in source - consider rotating them and loading them from the environment.
DB_HOST = "10.31.3.90"
DB_USER = "herbapi"
DB_PASS = "_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj"
DB_NAME = "herbapi"
# S3 bucket and endpoint passed to the AWS CLI when uploading images.
S3_BUCKET = "herbapi"
S3_ENDPOINT = "http://10.31.3.170:3900"
# User-Agent header sent with every outgoing HTTP request.
USER_AGENT = "HerbAPI/1.0 (https://herbapi.naturalised.at; florian.berthold@sub-net.at)"
# Pause in seconds inserted after each remote request in process_species().
REQUEST_DELAY = 0.3
# AWS env for subprocess calls: the current environment plus S3 credentials.
AWS_ENV = {
    **os.environ,
    "AWS_ACCESS_KEY_ID": "GK1a89859373a6ac56bf11958f",
    "AWS_SECRET_ACCESS_KEY": "bea45a333b5c7b1efdd7466bdbcac54d8642fa19f0c617ca2fd64bd07951b899",
    "AWS_DEFAULT_REGION": "garage",
}
# Stats: module-level outcome counters incremented by process_species().
stats = {"total": 0, "imported": 0, "no_p18": 0, "bad_license": 0, "download_fail": 0, "upload_fail": 0, "errors": 0}
def fetch_url(url):
    """Return the raw response body of ``url``, requested with the project User-Agent.

    Network and HTTP errors propagate to the caller.
    """
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    with urllib.request.urlopen(request, timeout=30) as response:
        payload = response.read()
    return payload
def fetch_json(url):
    """Download ``url`` via fetch_url() and decode the body as JSON."""
    raw = fetch_url(url)
    return json.loads(raw)
def psql(sql):
    """Run ``sql`` through the psql CLI and return its stripped stdout.

    The password is passed via the PGPASSWORD environment variable. When
    psql exits with a non-zero status its stderr is reported on our stderr;
    stdout is still returned.
    """
    result = subprocess.run(
        ["psql", "-h", DB_HOST, "-U", DB_USER, DB_NAME, "-t", "-A", "-c", sql],
        capture_output=True, text=True,
        env={**os.environ, "PGPASSWORD": DB_PASS},
    )
    if result.returncode != 0:
        # Failures used to be discarded silently, which made a failed
        # statement look like an empty result.
        print(f"  psql error: {result.stderr.strip()}", file=sys.stderr)
    return result.stdout.strip()
def is_license_allowed(license_str):
    """Check if license is CC0/CC-BY/CC-BY-SA or Public Domain.

    Wikimedia returns things like 'CC BY-SA 3.0', 'CC BY 4.0', 'CC0',
    'Public domain'; hyphen-separated spellings such as 'CC-BY-SA-4.0' are
    accepted as well.
    We allow CC0, Public Domain, CC BY (any version), CC BY-SA (any version).
    We reject: GFDL, CC BY-NC, CC BY-ND, CC BY-NC-SA, CC BY-NC-ND, FAL,
    Copyrighted free use, and empty or missing values.
    """
    if not license_str:
        return False
    ls = license_str.lower().strip()
    # Reject NC and ND explicitly first
    tokens = ls.split()
    if "nc" in tokens or "-nc" in ls or "nd" in tokens or "-nd" in ls:
        return False
    # Public domain / CC0
    if ls in ("cc0", "cc-zero", "cc0 1.0", "cc0 1.0 universal"):
        return True
    if "public domain" in ls or ls.startswith("pd"):
        return True
    # CC BY-SA (any version, any jurisdiction); "cc" may be followed by
    # whitespace or a hyphen.
    if re.match(r"cc[\s-]+by-sa\b", ls):
        return True
    # CC BY (any version, any jurisdiction) -- NC/ND were rejected above
    if re.match(r"cc[\s-]+by\b", ls):
        return True
    return False
def get_wikidata_image(qid):
    """Query Wikidata SPARQL for P18 image filename.

    Args:
        qid: Wikidata item id such as ``"Q12345"``; surrounding whitespace
            is ignored.

    Returns:
        The URL-decoded Commons filename, or ``None`` if the item has no
        P18 image.

    Raises:
        ValueError: If ``qid`` is not of the form ``Q<digits>``.
        Exception: Network/JSON errors from fetch_json() propagate.
    """
    # The id is interpolated into the query text, so accept only the strict
    # "Q<digits>" form; anything else would yield broken or injected SPARQL.
    if not isinstance(qid, str) or not re.fullmatch(r"Q\d+", qid.strip()):
        raise ValueError(f"invalid Wikidata QID: {qid!r}")
    qid = qid.strip()
    sparql = f"SELECT ?image WHERE {{ wd:{qid} wdt:P18 ?image }} LIMIT 1"
    url = f"https://query.wikidata.org/sparql?query={urllib.parse.quote(sparql)}&format=json"
    data = fetch_json(url)
    bindings = data.get("results", {}).get("bindings", [])
    if not bindings:
        return None
    image_url = bindings[0]["image"]["value"]
    # Extract filename from commons URL
    return urllib.parse.unquote(image_url.split("/")[-1])
def get_commons_info(filename):
    """Get image info from Commons API: license, artist, thumbnail URL.

    Args:
        filename: Commons file name without the ``File:`` prefix.

    Returns:
        Dict with keys ``license``, ``artist`` (HTML stripped, at most 120
        characters), ``thumb_url``, ``desc_url`` and ``filename``; or
        ``None`` when the page does not exist or has no image info. Only the
        first page of the reply is inspected.

    Raises:
        Exception: Network/JSON errors from fetch_json() propagate.
    """
    # urlencode escapes the title and the "|" separator in iiprop.
    query = urllib.parse.urlencode({
        "action": "query",
        "titles": f"File:{filename}",
        "prop": "imageinfo",
        "iiprop": "url|extmetadata",
        "iiurlwidth": "800",
        "format": "json",
    })
    data = fetch_json(f"https://commons.wikimedia.org/w/api.php?{query}")
    pages = data.get("query", {}).get("pages", {})
    for page_id, page in pages.items():
        if page_id == "-1":
            return None
        # A missing or empty imageinfo list means there is nothing to
        # import; indexing it blindly raised IndexError on an empty list.
        imageinfo_list = page.get("imageinfo") or []
        if not imageinfo_list:
            return None
        imageinfo = imageinfo_list[0]
        meta = imageinfo.get("extmetadata", {})
        license_short = meta.get("LicenseShortName", {}).get("value", "").strip()
        artist_html = meta.get("Artist", {}).get("value", "")
        # Clean up artist: strip HTML tags
        artist = re.sub(r"<[^>]+>", "", artist_html).strip()
        # Collapse whitespace
        artist = re.sub(r"\s+", " ", artist)
        if len(artist) > 120:
            artist = artist[:117] + "..."
        return {
            "license": license_short,
            "artist": artist,
            # Use the API-provided thumbnail URL (iiurlwidth=800)
            "thumb_url": imageinfo.get("thumburl", ""),
            # Also get the description URL
            "desc_url": imageinfo.get("descriptionurl", ""),
            "filename": filename,
        }
    return None
def _sql_quote(value):
    """Return ``value`` as text with single quotes doubled for a SQL string literal."""
    return str(value).replace("'", "''")


def process_species(species_id, slug, name_sci, qid):
    """Process a single species: fetch image from Wikidata/Commons, upload to S3, insert to DB.

    Args:
        species_id: Value written to ``images.entity_id``.
        slug: Used for the temp file name and the S3 key.
        name_sci: Scientific name; accepted for the caller's convenience, unused here.
        qid: Wikidata item id whose P18 image is imported.

    Returns:
        True when the image row was inserted, False on any skip or failure.
        Every outcome is also counted in the module-level ``stats`` dict.
    """
    stats["total"] += 1

    # Step 1: Get image filename from Wikidata
    try:
        filename = get_wikidata_image(qid)
    except Exception as e:
        print(f" ERROR querying Wikidata for {qid}: {e}")
        stats["errors"] += 1
        return False
    time.sleep(REQUEST_DELAY)
    if not filename:
        print(f" No P18 image for {qid}")
        stats["no_p18"] += 1
        return False

    # Step 2: Get Commons info (license, artist, thumb URL)
    try:
        info = get_commons_info(filename)
    except Exception as e:
        print(f" ERROR querying Commons for {filename}: {e}")
        stats["errors"] += 1
        return False
    time.sleep(REQUEST_DELAY)
    if not info:
        print(f" No Commons info for {filename}")
        stats["errors"] += 1
        return False

    # Step 3: Check license
    if not is_license_allowed(info["license"]):
        print(f" Bad license: {info['license']} for {filename}")
        stats["bad_license"] += 1
        return False

    # Step 4: Download thumbnail using API-provided URL
    thumb_url = info["thumb_url"]
    if not thumb_url:
        print(f" No thumbnail URL available for {filename}")
        stats["download_fail"] += 1
        return False

    # Determine the extension from the *suffix* of the last path segment, so a
    # name that merely contains ".png"/".gif" somewhere is not misclassified.
    thumb_name = thumb_url.lower().split("?")[0].split("/")[-1]
    if thumb_name.endswith(".png"):
        ext = "png"
    elif thumb_name.endswith(".gif"):
        ext = "gif"
    else:
        ext = "jpg"

    tmp_path = f"/tmp/herbapi_img_{slug}.{ext}"
    try:
        img_data = fetch_url(thumb_url)
        with open(tmp_path, "wb") as f:
            f.write(img_data)
    except Exception as e:
        print(f" ERROR downloading {thumb_url}: {e}")
        stats["download_fail"] += 1
        # Do not leave a partially written file behind.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
        return False
    time.sleep(REQUEST_DELAY)

    # Step 5: Upload to S3
    s3_key = f"species/{slug}.{ext}"
    try:
        result = subprocess.run(
            ["aws", "s3", "cp", tmp_path, f"s3://{S3_BUCKET}/{s3_key}", "--endpoint-url", S3_ENDPOINT],
            capture_output=True, text=True, env=AWS_ENV, timeout=60,
        )
        if result.returncode != 0:
            print(f" S3 upload failed: {result.stderr}")
            stats["upload_fail"] += 1
            return False
    except Exception as e:
        print(f" ERROR uploading to S3: {e}")
        stats["upload_fail"] += 1
        return False
    finally:
        # The temp file is no longer needed whether or not the upload worked.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass

    # Step 6: Insert into DB
    caption = f"Photo: {info['artist']}" if info["artist"] else ""
    source_url = info["desc_url"] or f"https://commons.wikimedia.org/wiki/File:{urllib.parse.quote(filename)}"
    # Every interpolated value is quote-escaped, including the id and the S3
    # key (which embeds the slug), not only the free-text fields.
    sql = (
        f"INSERT INTO images (entity_type, entity_id, s3_key, caption, source_url, license, is_primary) "
        f"VALUES ('species', '{_sql_quote(species_id)}', '{_sql_quote(s3_key)}', "
        f"'{_sql_quote(caption)}', '{_sql_quote(source_url)}', '{_sql_quote(info['license'])}', true);"
    )
    try:
        psql(sql)
    except Exception as e:
        print(f" ERROR inserting to DB: {e}")
        stats["errors"] += 1
        return False

    stats["imported"] += 1
    return True
def main():
    """Import a primary image for every species that has a QID but no image row.

    Reads the candidate species through ``psql``, runs ``process_species`` on
    each, and prints the counters collected in ``stats``.
    """
    # Get species without images
    rows = psql(
        "SELECT s.id, s.slug, s.name_scientific, s.wikidata_qid "
        "FROM species s "
        "LEFT JOIN images i ON i.entity_type = 'species' AND i.entity_id = s.id "
        "WHERE s.wikidata_qid IS NOT NULL AND s.wikidata_qid != '' AND i.id IS NULL "
        "ORDER BY s.name_scientific;"
    )
    if not rows:
        print("No species need images.")
        return

    # Keep only lines that split into exactly four "|"-separated fields.
    split_lines = (raw.strip().split("|") for raw in rows.split("\n"))
    species_list = [fields for fields in split_lines if len(fields) == 4]
    total = len(species_list)
    print(f"Processing {total} species...\n")

    for position, fields in enumerate(species_list, 1):
        sid, slug, name_sci, qid = fields
        print(f"[{position}/{total}] {name_sci} ({qid})")
        if process_species(sid, slug, name_sci, qid):
            print(" OK - imported")

    print(f"\n{'='*50}")
    print("RESULTS:")
    print(f" Total species processed: {stats['total']}")
    print(f" Successfully imported: {stats['imported']}")
    print(f" No P18 image: {stats['no_p18']}")
    print(f" Bad license (NC/ND/GFDL):{stats['bad_license']}")
    print(f" Download failures: {stats['download_fail']}")
    print(f" Upload failures: {stats['upload_fail']}")
    print(f" Other errors: {stats['errors']}")
# Run the import only when this file is executed as a script.
if __name__ == "__main__":
    main()
+126
View File
@@ -0,0 +1,126 @@
#!/usr/bin/env python3
"""Seed HerbAPI with common permaculture plant families and species via GBIF + API."""
import json, urllib.request, urllib.parse, time, sys
# Base URL of the HerbAPI instance that api_post() writes to.
API = "http://herbapi01.corp.sub-net.at:8080/api/v1"
# Bearer token sent with every HerbAPI request.
# NOTE(review): this credential is committed in plaintext; consider loading it
# from the environment and rotating the token.
TOKEN = "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
# Base URL of the GBIF API used by gbif_de_name().
GBIF = "https://api.gbif.org/v1"
def api_post(path, data):
    """POST ``data`` as JSON to ``API + path`` and return the decoded JSON reply.

    On an HTTP error the status code and the first 120 characters of the
    response body are printed to stderr and None is returned. Any other
    failure (e.g. a connection error) propagates to the caller.
    """
    # Imported explicitly: the file header only imports urllib.request and
    # urllib.parse, so urllib.error was reachable only as a side effect.
    from urllib.error import HTTPError

    req = urllib.request.Request(
        f"{API}{path}",
        data=json.dumps(data).encode(),
        headers={"Content-Type": "application/json", "Authorization": f"Bearer {TOKEN}"},
    )
    try:
        # The context manager closes the connection once the body is read.
        with urllib.request.urlopen(req) as resp:
            return json.loads(resp.read())
    except HTTPError as e:
        print(f" ERR {e.code}: {e.read().decode()[:120]}", file=sys.stderr)
        return None
def gbif_de_name(name):
    """Get German common name from GBIF.

    Matches ``name`` via ``/species/match`` and returns the first vernacular
    name whose ``language`` is ``"deu"``. This is a best-effort lookup: it
    returns None when there is no match, no German name, or the lookup fails
    for any reason (the failure is reported on stderr).
    """
    match_url = f"{GBIF}/species/match?name={urllib.parse.quote(name)}"
    try:
        # A timeout keeps one stalled request from hanging the whole seed run.
        with urllib.request.urlopen(match_url, timeout=30) as resp:
            match = json.loads(resp.read())
        usage_key = match.get("usageKey")
        if not usage_key:
            return None
        names_url = f"{GBIF}/species/{usage_key}/vernacularNames?limit=100"
        with urllib.request.urlopen(names_url, timeout=30) as resp:
            data = json.loads(resp.read())
        for r in data.get("results", []):
            if r.get("language") == "deu":
                return r["vernacularName"]
    except Exception as exc:
        # Unlike a bare "except:", this lets KeyboardInterrupt and SystemExit
        # through, so the script can still be aborted with Ctrl-C.
        print(f" GBIF lookup failed for {name}: {exc}", file=sys.stderr)
    return None
# Plant families to create, as (scientific name, German name, English name).
# The family-creation loop below posts them as name_scientific / name_de /
# name_en.
FAMILIES = [
    ("Fabaceae", "Hülsenfrüchtler", "Legumes"),
    ("Rosaceae", "Rosengewächse", "Rose family"),
    ("Brassicaceae", "Kreuzblütler", "Cabbage family"),
    ("Apiaceae", "Doldenblütler", "Carrot family"),
    ("Lamiaceae", "Lippenblütler", "Mint family"),
    ("Asteraceae", "Korbblütler", "Daisy family"),
    ("Solanaceae", "Nachtschattengewächse", "Nightshade family"),
    ("Cucurbitaceae", "Kürbisgewächse", "Gourd family"),
    ("Poaceae", "Süßgräser", "Grass family"),
    ("Amaryllidaceae", "Amaryllisgewächse", "Amaryllis family"),
    ("Boraginaceae", "Raublattgewächse", "Borage family"),
    ("Adoxaceae", "Moschuskrautgewächse", "Moschatel family"),
    ("Betulaceae", "Birkengewächse", "Birch family"),
    ("Fagaceae", "Buchengewächse", "Beech family"),
    ("Juglandaceae", "Walnussgewächse", "Walnut family"),
    ("Caprifoliaceae", "Geißblattgewächse", "Honeysuckle family"),
    ("Grossulariaceae", "Stachelbeergewächse", "Gooseberry family"),
    ("Ericaceae", "Heidekrautgewächse", "Heath family"),
    ("Moraceae", "Maulbeergewächse", "Mulberry family"),
    ("Urticaceae", "Brennnesselgewächse", "Nettle family"),
    ("Malvaceae", "Malvengewächse", "Mallow family"),
    ("Polygonaceae", "Knöterichgewächse", "Buckwheat family"),
    ("Chenopodiaceae", "Gänsefußgewächse", "Goosefoot family"),
    ("Asparagaceae", "Spargelgewächse", "Asparagus family"),
    ("Plantaginaceae", "Wegerichgewächse", "Plantain family"),
]
# Species to create, as (scientific name, family scientific name, extra fields).
# The family name is looked up in family_map by the species loop below, and the
# extra dict is merged unchanged into the POST payload for /species.
SPECIES = [
    ("Sambucus nigra", "Adoxaceae", {"plant_layer": "understory", "nitrogen_fixer": False, "food_uses": "Flowers (cordial, fritters), berries (cooked — syrup, wine)", "medicinal_uses": "Cold/flu remedy, immune support, diaphoretic", "succession_stage": "secondary"}),
    ("Symphytum officinale", "Boraginaceae", {"plant_layer": "herbaceous", "dynamic_accumulator": True, "food_uses": "Young leaves (limited, contains pyrrolizidine alkaloids)", "medicinal_uses": "Wound healing, bone knitting (external only)", "other_uses": "Dynamic accumulator, mulch/compost activator, animal fodder"}),
    ("Trifolium pratense", "Fabaceae", {"plant_layer": "ground_cover", "nitrogen_fixer": True, "food_uses": "Flowers, young leaves", "medicinal_uses": "Respiratory, menopausal symptoms", "other_uses": "Green manure, nitrogen fixer, bee forage"}),
    ("Corylus avellana", "Betulaceae", {"plant_layer": "shrub", "food_uses": "Nuts", "other_uses": "Coppice wood, hedging, wildlife habitat", "succession_stage": "secondary"}),
    ("Ribes nigrum", "Grossulariaceae", {"plant_layer": "shrub", "food_uses": "Berries, leaves (tea)", "medicinal_uses": "High vitamin C, anti-inflammatory"}),
    ("Rubus idaeus", "Rosaceae", {"plant_layer": "shrub", "food_uses": "Berries, leaves (tea)", "medicinal_uses": "Leaf tea for pregnancy/digestion", "succession_stage": "pioneer"}),
    ("Urtica dioica", "Urticaceae", {"plant_layer": "herbaceous", "dynamic_accumulator": True, "food_uses": "Young leaves, seeds", "medicinal_uses": "Anti-inflammatory, prostate, allergies", "other_uses": "Compost activator, fibre, liquid fertiliser"}),
    ("Borago officinalis", "Boraginaceae", {"plant_layer": "herbaceous", "food_uses": "Flowers, young leaves", "other_uses": "Bee forage, companion plant", "attracts_pollinators": True}),
    ("Lavandula angustifolia", "Lamiaceae", {"plant_layer": "herbaceous", "food_uses": "Flowers", "medicinal_uses": "Calming, antiseptic, sleep aid", "other_uses": "Bee forage, pest repellent, fragrance", "attracts_pollinators": True}),
    ("Malus domestica", "Rosaceae", {"plant_layer": "canopy", "food_uses": "Fruit", "pollination_type": "Insect-pollinated"}),
    ("Prunus domestica", "Rosaceae", {"plant_layer": "canopy", "food_uses": "Fruit", "pollination_type": "Insect-pollinated"}),
    ("Juglans regia", "Juglandaceae", {"plant_layer": "canopy", "food_uses": "Nuts", "other_uses": "Timber, dye", "allelopathic": True}),
    ("Fragaria vesca", "Rosaceae", {"plant_layer": "ground_cover", "food_uses": "Berries, leaves (tea)", "ground_cover_quality": "Good"}),
    ("Allium ursinum", "Amaryllidaceae", {"plant_layer": "ground_cover", "food_uses": "Leaves, flowers, bulbs", "medicinal_uses": "Antimicrobial, blood pressure"}),
    ("Phacelia tanacetifolia", "Boraginaceae", {"plant_layer": "herbaceous", "other_uses": "Green manure, bee forage, cover crop", "attracts_pollinators": True}),
    ("Lupinus polyphyllus", "Fabaceae", {"plant_layer": "herbaceous", "nitrogen_fixer": True, "other_uses": "Nitrogen fixer, green manure, ornamental"}),
    ("Vicia faba", "Fabaceae", {"plant_layer": "herbaceous", "nitrogen_fixer": True, "food_uses": "Beans", "other_uses": "Nitrogen fixer, green manure"}),
    ("Solanum lycopersicum", "Solanaceae", {"plant_layer": "herbaceous", "food_uses": "Fruit"}),
    ("Cucurbita pepo", "Cucurbitaceae", {"plant_layer": "ground_cover", "food_uses": "Fruit, seeds, flowers"}),
    ("Beta vulgaris", "Chenopodiaceae", {"plant_layer": "herbaceous", "food_uses": "Roots, leaves"}),
    ("Daucus carota", "Apiaceae", {"plant_layer": "herbaceous", "food_uses": "Root"}),
    ("Calendula officinalis", "Asteraceae", {"plant_layer": "herbaceous", "food_uses": "Flowers", "medicinal_uses": "Wound healing, anti-inflammatory, skin care", "other_uses": "Companion plant, pest deterrent", "attracts_pollinators": True}),
    ("Melissa officinalis", "Lamiaceae", {"plant_layer": "herbaceous", "food_uses": "Leaves", "medicinal_uses": "Calming, antiviral, digestive", "attracts_pollinators": True}),
    ("Salvia officinalis", "Lamiaceae", {"plant_layer": "herbaceous", "food_uses": "Leaves", "medicinal_uses": "Sore throat, digestive, antimicrobial"}),
    ("Thymus vulgaris", "Lamiaceae", {"plant_layer": "ground_cover", "food_uses": "Leaves", "medicinal_uses": "Respiratory, antimicrobial, cough"}),
]
# Create families
# NOTE(review): everything below runs at import time; there is no
# `if __name__ == "__main__"` guard in this script.
print("=== Creating families ===")
# Maps family scientific name -> id returned by the API; only families whose
# POST succeeded are present.
family_map = {}
for sci, de, en in FAMILIES:
    r = api_post("/families", {"name_scientific": sci, "name_de": de, "name_en": en})
    if r:
        family_map[sci] = r["id"]
        print(f"{sci}")
    time.sleep(0.05)
print(f"Created {len(family_map)} families\n")
# Create species
print("=== Creating species (with GBIF German names) ===")
created = 0
for sci_name, family_sci, extra in SPECIES:
    fam_id = family_map.get(family_sci)
    # Skip species whose family is not in family_map (its POST failed or it is
    # not listed in FAMILIES).
    if not fam_id:
        print(f"{sci_name} — family {family_sci} missing")
        continue
    de_name = gbif_de_name(sci_name)
    # name_en is always sent empty; name_de falls back to "" when GBIF has none.
    data = {"name_scientific": sci_name, "name_de": de_name or "", "name_en": "", "family_id": fam_id, **extra}
    r = api_post("/species", data)
    if r:
        created += 1
        print(f"{sci_name}{de_name or '(no DE name)'}")
    time.sleep(0.15)
print(f"Created {created} species\n")
# Create suppliers
print("=== Creating suppliers ===")
# Each tuple is (name, url, country, is_organic, is_demeter, notes).
for name, url, country, organic, demeter, notes in [
    ("Reinsaat", "https://www.reinsaat.at", "AT", True, True, "Austrian biodynamic seed producer, open-pollinated varieties"),
    ("Magic Garden Seeds", "https://www.magicgardenseeds.com", "DE", False, False, "Specialist seed shop with rare and heritage varieties"),
]:
    r = api_post("/suppliers", {"name": name, "url": url, "country": country, "is_organic": organic, "is_demeter": demeter, "notes": notes})
    if r: print(f"{name}")
print("\nDone!")
+514
View File
@@ -0,0 +1,514 @@
#!/usr/bin/env python3
"""
Scrape Arche Noah seed catalog and import cultivars into HerbAPI.
Uses the shop.arche-noah.at Angular SPA's backend API (ACM) to fetch
product listings and details, then creates cultivars in HerbAPI matched
to existing species.
"""
import json
import re
import time
import urllib.request
import urllib.error
import urllib.parse
import sys
from datetime import datetime, timezone
# --- Configuration -----------------------------------------------------------
# Base URL of the HerbAPI instance; herbapi_request() appends "/<path>".
HERBAPI_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
# Bearer token sent with every HerbAPI request.
# NOTE(review): this credential is committed in plaintext; consider loading it
# from the environment and rotating the token.
HERBAPI_TOKEN = "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
# Base URL of the shop backend; endpoint names are appended directly.
SHOP_BASE = "https://shop.arche-noah.at/ACM/api/"
# User-Agent header sent with every shop request.
SHOP_UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
REQUEST_DELAY = 0.5  # seconds between requests
# Only import products from these Arche Noah article lines (their own seeds).
# Compared against each product's "articleLineDesc" field.
ARCHE_NOAH_LINES = {
    "Bio-Saatgut von ARCHE NOAH",
    "Kostbarkeiten aus dem ARCHE NOAH Samenarchiv",
}
# Search terms to discover all seed products across the shop.
# Some terms appear twice (e.g. "Bohne", "Mangold"); fetch_all_arche_noah_products()
# skips repeats case-insensitively, so duplicates cost no extra requests.
SEARCH_TERMS = [
    "Tomate", "Paradeiser", "Paprika", "Chili", "Gurke", "Kürbis", "Zucchini",
    "Bohne", "Erbse", "Fisole", "Salat", "Kohl", "Kraut", "Melanzani", "Aubergine",
    "Mais", "Zwiebel", "Lauch", "Karotte", "Rübe", "Basilikum", "Kräuter",
    "Blume", "Sonnenblume", "Dill", "Petersilie", "Spinat", "Mangold",
    "Melone", "Fenchel", "Sellerie", "Rettich", "Radieschen",
    "Koriander", "Oregano", "Thymian", "Salbei", "Rosmarin", "Minze",
    "Ringelblume", "Kornblume", "Kapuzinerkresse", "Senf",
    "Erdbeere", "Lupine", "Luzerne", "Klee", "Bohne", "Mohn",
    "Radicchio", "Rucola", "Endivie", "Artischocke", "Pastinake",
    "Schnittlauch", "Knoblauch", "Bärlauch", "Wermut",
    "Baldrian", "Johanniskraut", "Sonnenhut", "Beinwell",
    "Studentenblume", "Tagetes", "Phacelia", "Buchweizen",
    "Rote Bete", "Rote Rübe", "Mangold", "Melde",
    "Kohlrabi", "Brokkoli", "Blumenkohl", "Rosenkohl", "Wirsing",
    "Pflücksalat", "Kopfsalat", "Feldsalat", "Asiasalat",
    "Zuckermais", "Popcorn",
]
# --- Helpers -----------------------------------------------------------------
def herbapi_request(method, path, data=None):
    """Make a request to HerbAPI and return the decoded JSON response.

    Args:
        method: HTTP method, e.g. "GET" or "POST".
        path: Path relative to ``HERBAPI_BASE`` (without a leading slash).
        data: Optional JSON-serialisable payload. Any value other than None is
            sent, including an empty dict or list.

    Returns:
        The parsed JSON body, or None when the body is empty or whitespace.

    Raises:
        urllib.error.HTTPError: re-raised after status and the first 200
            characters of the error body are printed to stderr.
    """
    url = f"{HERBAPI_BASE}/{path}"
    # "is not None" rather than truthiness, so an empty payload is still sent.
    payload = json.dumps(data).encode() if data is not None else None
    req = urllib.request.Request(url, data=payload, method=method, headers={
        "Authorization": f"Bearer {HERBAPI_TOKEN}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    })
    try:
        # The context manager closes the connection once the body is read.
        with urllib.request.urlopen(req, timeout=30) as resp:
            raw = resp.read().decode("utf-8")
    except urllib.error.HTTPError as e:
        err_body = e.read().decode("utf-8", errors="replace")
        print(f" HerbAPI {method} {path}: HTTP {e.code} - {err_body[:200]}", file=sys.stderr)
        raise
    return json.loads(raw) if raw.strip() else None
def shop_create_session():
    """Create an anonymous session on the Arche Noah shop.

    POSTs an empty JSON object to ``webshop/createanonymoususer`` and returns
    the JSESSIONID value from the response cookies.

    Raises:
        RuntimeError: if no non-empty JSESSIONID cookie is returned.
    """
    req = urllib.request.Request(
        SHOP_BASE + "webshop/createanonymoususer",
        data=json.dumps({}).encode(),
        headers={
            "User-Agent": SHOP_UA,
            "Content-Type": "application/json",
            "Origin": "https://shop.arche-noah.at",
            "Referer": "https://shop.arche-noah.at/",
        },
    )
    with urllib.request.urlopen(req, timeout=15) as resp:
        # A response may carry several Set-Cookie headers; headers.get() only
        # returns the first, which need not be the session cookie.
        cookies = resp.headers.get_all("Set-Cookie") or []
    for cookie in cookies:
        if "JSESSIONID=" in cookie:
            session = cookie.split("JSESSIONID=", 1)[1].split(";", 1)[0]
            if session:
                return session
    raise RuntimeError("Failed to get shop session")
def shop_request(session, endpoint, payload):
    """Make a POST request to the shop API.

    Args:
        session: JSESSIONID value, sent as a cookie.
        endpoint: Path appended to ``SHOP_BASE``.
        payload: JSON-serialisable request body.

    Returns:
        The parsed JSON body, or None when the body is empty or whitespace.
        HTTP and network errors propagate to the caller.
    """
    req = urllib.request.Request(
        SHOP_BASE + endpoint,
        data=json.dumps(payload).encode(),
        headers={
            "User-Agent": SHOP_UA,
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Cookie": f"JSESSIONID={session}",
            "Origin": "https://shop.arche-noah.at",
            "Referer": "https://shop.arche-noah.at/",
        },
    )
    # The context manager closes the connection once the body is read; this
    # function is called once per search page and once per product.
    with urllib.request.urlopen(req, timeout=30) as resp:
        raw = resp.read().decode("utf-8")
    return json.loads(raw) if raw.strip() else None
def extract_latin_name(detail_headline3):
    """Return the botanical name found in a product's headline3 field, or None.

    HTML tags are removed and anything from "Hier geht" onwards is cut off.
    The remainder is returned only if it starts like a binomial: a capitalised
    word, a space, then a lowercase letter.
    """
    if not detail_headline3:
        return None
    without_tags = re.sub(r"<[^>]+>", "", detail_headline3).strip()
    # Drop the trailing "Hier geht es zu unseren..." sentence, if present.
    candidate = without_tags.partition("Hier geht")[0].strip()
    looks_latin = bool(candidate) and re.match(r"^[A-Z][a-z]+ [a-z]", candidate) is not None
    return candidate if looks_latin else None
def match_species(latin_name, species_by_scientific):
    """Look up a species by Latin name, ignoring rank suffixes if needed.

    The stripped, lowercased name is tried first. If that misses, the lookup
    is retried with only the leading "genus species" pair, so that e.g.
    "Phaseolus vulgaris var. nanus" resolves to "Phaseolus vulgaris".

    Returns the first truthy lookup result, or None.
    """
    if not latin_name:
        return None
    lowered = latin_name.strip().lower()
    candidates = [lowered]
    # Leading "genus species" pair, dropping var./subsp./convar./f. suffixes.
    binomial = re.match(r"^([A-Za-z]+ [a-z]+)", lowered)
    if binomial:
        candidates.append(binomial.group(1).strip())
    for key in candidates:
        hit = species_by_scientific.get(key)
        if hit:
            return hit
    return None
def extract_cultivar_name(product_name):
    """Pull the variety name out of a shop product name.

    A name enclosed in quote characters wins:
        "Salatparadeiser 'Naama' HG026" -> "Naama"
        "Cocktailparadeiser 'Golden Perfection' TO019" -> "Golden Perfection"
        "Buschbohne 'Marmorierter Mond' HG055" -> "Marmorierter Mond"
    Without quotes, the product name is returned with a trailing article
    number (such as HG026 or TO019) removed; the type prefix is kept.
    """
    # Quoted name, accepting several quote styles.
    quoted = re.search(r"['\u2018\u2019`\u00b4]+([^'\u2018\u2019`\u00b4]+)['\u2018\u2019`\u00b4]+", product_name)
    if quoted is not None:
        return quoted.group(1).strip()
    # Fallback: strip only the trailing article number.
    return re.sub(r"\s+[A-Z]{1,3}\d{2,4}\s*$", "", product_name).strip()
def parse_pack_info(unit_desc):
    """Parse pack size info from unitDesc like '20-30 Korn' or '2g'.

    For a range the lower bound is used. A comma is accepted as the decimal
    separator ('0,5 g' -> 0.5). The unit must start with a non-digit, so a
    bare number such as '100' is not split into a size and a bogus unit.

    Returns:
        (pack_size, pack_unit) as (float, str), or (None, None) when the
        description is empty or does not start with "<number> <unit>".
    """
    if not unit_desc:
        return None, None
    m = re.match(
        r"(\d+(?:,\d+)?)"                # size, optionally with a comma decimal
        r"(?:\s*-\s*\d+(?:,\d+)?)?"      # optional upper bound of a range (ignored)
        r"\s*([^\W\d]\w*)",              # unit: word characters, first one not a digit
        unit_desc,
    )
    if m:
        return float(m.group(1).replace(",", ".")), m.group(2)
    return None, None
# --- Main scraping logic -----------------------------------------------------
def fetch_all_arche_noah_products(session):
    """Search the shop API to find all Arche Noah seed products.

    Runs every distinct term of ``SEARCH_TERMS`` (case-insensitive) through
    ``webshop/getproducts``, paging in steps of 200, and collects products by
    their ``sid``. A failed request ends the paging for that term only.

    Returns:
        dict mapping sid -> product for the products whose
        ``articleLineDesc`` is in ``ARCHE_NOAH_LINES``.
    """
    page_size = 200
    all_products = {}
    seen_terms = set()
    for term in SEARCH_TERMS:
        term_key = term.lower()
        if term_key in seen_terms:
            continue
        seen_terms.add(term_key)

        offset = 0
        term_sids = set()  # sids already returned for this term
        while True:
            payload = {
                "searchCriteria": term,
                "startIndex": offset,
                "numDataSets": page_size,
                "allowAllProducts": False,
            }
            try:
                data = shop_request(session, "webshop/getproducts", payload)
            except Exception as e:
                print(f" Search '{term}' offset={offset} failed: {e}", file=sys.stderr)
                break
            if not data:
                break
            page_sids = {p["sid"] for p in data}
            for p in data:
                # Keep the first copy of a product seen under several terms.
                all_products.setdefault(p["sid"], p)
            if len(data) < page_size:
                break
            # Stop when a full page brings nothing new for this term, which
            # would otherwise loop forever if the server ignores startIndex.
            if page_sids <= term_sids:
                break
            term_sids |= page_sids
            offset += len(data)
            time.sleep(REQUEST_DELAY)
        time.sleep(REQUEST_DELAY)

    # Filter to Arche Noah's own seed products only
    an_products = {
        sid: p for sid, p in all_products.items()
        if (p.get("articleLineDesc") or "") in ARCHE_NOAH_LINES
    }
    print(f"Found {len(all_products)} total products, {len(an_products)} Arche Noah seed products")
    return an_products
def fetch_product_details(session, products):
    """Fetch the detail record (which carries the Latin name) for each product.

    Returns a dict mapping sid -> detail for every product whose detail
    request succeeded with a non-empty result. Failures are reported on
    stderr and skipped.
    """
    details = {}
    total = len(products)
    for count, sid in enumerate(products, start=1):
        try:
            detail = shop_request(session, "webshop/getproductdetail", {"productSid": sid})
        except Exception as e:
            print(f" Detail for {sid} failed: {e}", file=sys.stderr)
        else:
            if detail:
                details[sid] = detail
        # Progress line after every 20th product.
        if count % 20 == 0:
            print(f" Fetched details: {count}/{total}")
        time.sleep(REQUEST_DELAY)
    print(f"Fetched {len(details)} product details")
    return details
def load_herbapi_species():
    """Fetch every species from HerbAPI, following pagination.

    Accepts either a paginated reply (a dict with "data" and "total") or a
    plain list. Paging stops when the collected count reaches the total, a
    page comes back empty, or the reply has neither shape.

    Returns:
        (species_list, by_scientific), where by_scientific maps the stripped,
        lowercased scientific name to its species record.
    """
    collected = []
    page_no = 1
    while True:
        reply = herbapi_request("GET", f"species?per_page=100&page={page_no}")
        if isinstance(reply, dict) and "data" in reply:
            batch, expected = reply["data"], reply.get("total", 0)
        elif isinstance(reply, list):
            batch, expected = reply, len(reply)
        else:
            break
        collected.extend(batch)
        if len(collected) >= expected or not batch:
            break
        page_no += 1

    # Later records win when two share the same normalised name.
    lookup = {entry["name_scientific"].strip().lower(): entry for entry in collected}
    return collected, lookup
def load_herbapi_cultivars():
    """Fetch every existing cultivar from HerbAPI, 100 per page.

    Accepts either a paginated reply (a dict with "data" and "total") or a
    plain list. Paging stops when the collected count reaches the total, a
    page comes back empty, or the reply has neither shape.

    Returns:
        (all_cultivars, by_key), where by_key maps
        (species_id, stripped lowercased name) to the cultivar record.
    """
    collected = []
    page_no = 1
    while True:
        reply = herbapi_request("GET", f"cultivars?per_page=100&page={page_no}")
        if isinstance(reply, dict) and "data" in reply:
            batch, expected = reply["data"], reply.get("total", 0)
        elif isinstance(reply, list):
            batch, expected = reply, len(reply)
        else:
            break
        collected.extend(batch)
        if len(collected) >= expected or not batch:
            break
        page_no += 1

    # Later records win when two share the same key.
    lookup = {
        (entry["species_id"], entry["name"].strip().lower()): entry
        for entry in collected
    }
    return collected, lookup
def ensure_supplier():
    """Return the id of the Arche Noah supplier, creating it when absent.

    An existing supplier counts as a match when its lowercased name contains
    both "arche" and "noah".
    """
    listing = herbapi_request("GET", "suppliers")
    # Unwrap a paginated reply of the form {"data": [...]}.
    if isinstance(listing, dict) and "data" in listing:
        listing = listing["data"]
    for entry in listing:
        lowered = entry["name"].lower()
        if "arche" in lowered and "noah" in lowered:
            print(f"Supplier 'Arche Noah' already exists: {entry['id']}")
            return entry["id"]

    print("Creating supplier 'Arche Noah'...")
    new_supplier = {
        "name": "Arche Noah",
        "url": "https://www.arche-noah.at",
        "country": "AT",
        "is_organic": True,
        "is_demeter": False,
        "notes": "Austrian society for heritage seed preservation and biodiversity",
    }
    created = herbapi_request("POST", "suppliers", new_supplier)
    print(f"Created supplier: {created['id']}")
    return created["id"]
def load_existing_supplier_links(cultivar_id):
    """Return the supplier links of a cultivar, or [] when they cannot be read.

    Accepts a plain list or a dict wrapping the list under "data". Any
    exception during the request yields an empty list.
    """
    try:
        reply = herbapi_request("GET", f"cultivars/{cultivar_id}/suppliers")
    except Exception:
        return []
    if isinstance(reply, list):
        return reply
    if isinstance(reply, dict) and "data" in reply:
        return reply["data"]
    return []
def main():
    """Scrape the Arche Noah shop and import its cultivars into HerbAPI.

    Steps: ensure the supplier exists, load species and existing cultivars
    from HerbAPI, scrape product listings and details from the shop, then
    create each missing cultivar and link it to the supplier. Progress and a
    final summary are printed to stdout, errors to stderr.
    """
    now_str = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    print("=== Arche Noah Seed Catalog Scraper ===")
    print(f"Started at {now_str}\n")

    # Step 1: Create Arche Noah supplier in HerbAPI
    print("[1/6] Ensuring Arche Noah supplier exists...")
    supplier_id = ensure_supplier()
    print()

    # Step 2: Load HerbAPI species for matching
    print("[2/6] Loading HerbAPI species...")
    species_list, species_by_scientific = load_herbapi_species()
    print(f"Loaded {len(species_list)} species")
    print()

    # Step 3: Load existing cultivars for idempotency
    print("[3/6] Loading existing cultivars...")
    existing_cultivars, cultivars_by_key = load_herbapi_cultivars()
    print(f"Loaded {len(existing_cultivars)} existing cultivars")
    print()

    # Step 4: Scrape Arche Noah shop
    print("[4/6] Scraping Arche Noah shop catalog...")
    session = shop_create_session()
    print("Got shop session")
    products = fetch_all_arche_noah_products(session)
    print()

    # Step 5: Fetch product details (to get Latin names)
    print("[5/6] Fetching product details for Latin name matching...")
    details = fetch_product_details(session, products)
    print()

    # Step 6: Create cultivars in HerbAPI
    print("[6/6] Creating cultivars in HerbAPI...")
    stats = {
        "created": 0,
        "skipped_existing": 0,
        "skipped_no_species": 0,
        "supplier_linked": 0,
        "supplier_link_existed": 0,
        "errors": 0,
    }
    for sid, product in sorted(products.items()):
        detail = details.get(sid, {})

        # Match the product's Latin name to a HerbAPI species
        # (match_species handles subspecies/variety suffixes and a None name).
        latin_name = extract_latin_name(detail.get("detailHeadline3", ""))
        species = match_species(latin_name, species_by_scientific)
        if not species:
            print(f" SKIP (no species match): {product['name']} | latin={latin_name}")
            stats["skipped_no_species"] += 1
            continue

        # Extract cultivar name
        cultivar_name = extract_cultivar_name(product["name"])
        if not cultivar_name:
            print(f" SKIP (no cultivar name): {product['name']}")
            # Counted in the same bucket as unmatched species; the summary
            # has no separate line for this case.
            stats["skipped_no_species"] += 1
            continue

        # Product URL, used for both the cultivar and the supplier link.
        alias = product.get("alias") or detail.get("alias", "")
        product_url = f"https://shop.arche-noah.at/produkt/{alias}" if alias else None

        # Check if cultivar already exists (idempotency)
        lookup_key = (species["id"], cultivar_name.strip().lower())
        existing = cultivars_by_key.get(lookup_key)
        if existing:
            cultivar_id = existing["id"]
            stats["skipped_existing"] += 1
        else:
            cultivar_data = {
                "species_id": species["id"],
                "name": cultivar_name,
                "name_de": cultivar_name,
                # Only the "Bio-Saatgut" article line is flagged as organic.
                "is_organic": product.get("articleLineDesc") == "Bio-Saatgut von ARCHE NOAH",
                "source_urls": [product_url] if product_url else None,
            }
            try:
                result = herbapi_request("POST", "cultivars", cultivar_data)
                cultivar_id = result["id"]
                stats["created"] += 1
                # Add to lookup for idempotency within this run
                cultivars_by_key[lookup_key] = result
                print(f" CREATED: {cultivar_name} ({species['name_scientific']})")
            except Exception as e:
                print(f" ERROR creating '{cultivar_name}': {e}", file=sys.stderr)
                stats["errors"] += 1
                continue

        # Link cultivar to supplier
        existing_links = load_existing_supplier_links(cultivar_id)
        already_linked = any(
            link["supplier_id"] == supplier_id for link in existing_links
        )
        if already_linked:
            stats["supplier_link_existed"] += 1
        else:
            # Parse pack info
            unit_desc = product.get("unitDesc") or detail.get("unitDesc", "")
            pack_size, pack_unit = parse_pack_info(unit_desc)
            # Price comes from the first price-list position, if any.
            price = None
            price_list = product.get("priceListPos") or detail.get("priceListPos", [])
            if price_list:
                price = price_list[0].get("singleUnitPrice")
            link_data = {
                "supplier_id": supplier_id,
                "article_number": str(product.get("articleNr", "")),
                "product_url": product_url,
                "price_eur": price,
                "pack_size": pack_size,
                "pack_unit": pack_unit,
            }
            try:
                herbapi_request("POST", f"cultivars/{cultivar_id}/suppliers", link_data)
                stats["supplier_linked"] += 1
            except Exception as e:
                print(f" ERROR linking supplier for '{cultivar_name}': {e}", file=sys.stderr)
                stats["errors"] += 1

        time.sleep(0.1)  # small delay between HerbAPI calls

    # Summary
    print(f"\n{'='*60}")
    print("Scraping complete!")
    print(f" Cultivars created: {stats['created']}")
    print(f" Cultivars already existed: {stats['skipped_existing']}")
    print(f" Skipped (no species match): {stats['skipped_no_species']}")
    print(f" Supplier links created: {stats['supplier_linked']}")
    print(f" Supplier links existed: {stats['supplier_link_existed']}")
    print(f" Errors: {stats['errors']}")
# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()
+843
View File
@@ -0,0 +1,843 @@
#!/usr/bin/env python3
"""
Scraper for Bingenheimer Saatgut (https://www.bingenheimersaatgut.de/)
Extracts cultivar data and imports into HerbAPI.
Categories scraped: Gemüse (vegetables), Kräuter (herbs), Gründüngung (green manure).
"""
import json
import re
import sys
import time
import urllib.request
import urllib.error
import urllib.parse
from html.parser import HTMLParser
from typing import Optional
# ── Configuration ─────────────────────────────────────────────────────────
# Base URL of the HerbAPI instance; api_get()/api_post() append the path.
API_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
# Bearer token sent with every HerbAPI request.
# NOTE(review): this credential is committed in plaintext; consider loading it
# from the environment and rotating the token.
API_TOKEN = "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
# Site root, prepended to product links that do not start with "http".
SITE_BASE = "https://www.bingenheimersaatgut.de"
# presumably the pause in seconds between requests — TODO confirm at the call sites
DELAY = 0.5
# User-Agent header sent by fetch_page().
USER_AGENT = "HerbAPI-Scraper/1.0 (+https://sub-net.at)"
# ── Category URLs to scrape ───────────────────────────────────────────────
# (url_path, default_species_scientific_name)
# Several category paths share one species (e.g. all cabbage types map to
# "Brassica oleracea").
VEGETABLE_CATEGORIES = [
    ("gemuese/tomaten", "Solanum lycopersicum"),
    ("gemuese/gurken/gewuerzgurke", "Cucumis sativus"),
    ("gemuese/gurken/salatgurken", "Cucumis sativus"),
    ("gemuese/aubergine", "Solanum melongena"),
    ("gemuese/bohnen/buschbohne", "Phaseolus vulgaris"),
    ("gemuese/bohnen/stangenbohne", "Phaseolus vulgaris"),
    ("gemuese/bohnen/dicke-bohne", "Vicia faba"),
    ("gemuese/bohnen/feuerbohne", "Phaseolus coccineus"),
    ("gemuese/bohnen/edamame-sojabohne", "Glycine max"),
    ("gemuese/bohnen/spaghettibohne", "Vigna unguiculata"),
    ("gemuese/erbsen/markerbse", "Pisum sativum"),
    ("gemuese/erbsen/schalerbse", "Pisum sativum"),
    ("gemuese/erbsen/zuckererbse", "Pisum sativum"),
    ("gemuese/feldsalat", "Valerianella locusta"),
    ("gemuese/knollenfenchel", "Foeniculum vulgare"),
    ("gemuese/kohl/blumenkohl", "Brassica oleracea"),
    ("gemuese/kohl/brokkoli", "Brassica oleracea"),
    ("gemuese/kohl/chinakohlpak-choi", "Brassica rapa"),
    ("gemuese/kohl/gruenkohl", "Brassica oleracea"),
    ("gemuese/kohl/kohlrabi", "Brassica oleracea"),
    ("gemuese/kohl/rotkohl", "Brassica oleracea"),
    ("gemuese/kohl/weisskohl", "Brassica oleracea"),
    ("gemuese/kohl/wirsing", "Brassica oleracea"),
    ("gemuese/kohl/rosenkohl", "Brassica oleracea"),
    ("gemuese/kresse", "Lepidium sativum"),
    ("gemuese/kuerbis", "Cucurbita maxima"),
    ("gemuese/zuckermais", "Zea mays"),
    ("gemuese/mangold", "Beta vulgaris"),
    ("gemuese/melone", "Cucumis melo"),
    ("gemuese/moehren", "Daucus carota"),
    ("gemuese/paprika/gemuesepaprika", "Capsicum annuum"),
    ("gemuese/paprika/chili", "Capsicum annuum"),
    ("gemuese/pastinaken", "Pastinaca sativa"),
    ("gemuese/petersilienwurzel", "Petroselinum crispum"),
    ("gemuese/physalis", "Physalis peruviana"),
    ("gemuese/porreelauch", "Allium porrum"),
    ("gemuese/radies", "Raphanus sativus"),
    ("gemuese/rettich", "Raphanus sativus"),
    ("gemuese/rote-bete", "Beta vulgaris"),
    ("gemuese/rueben/mai-herbstruebennavets", "Brassica rapa"),
    ("gemuese/rueben/kohlruebe", "Brassica napus"),
    ("gemuese/rucola", "Eruca vesicaria"),
    ("gemuese/salat/bataviasalat", "Lactuca sativa"),
    ("gemuese/salat/eichblattsalat", "Lactuca sativa"),
    ("gemuese/salat/eissalat", "Lactuca sativa"),
    ("gemuese/salat/endivien", "Cichorium endivia"),
    ("gemuese/salat/hirschhornwegerich", "Plantago coronopus"),
    ("gemuese/salat/kopfsalat", "Lactuca sativa"),
    ("gemuese/salat/lollosalat", "Lactuca sativa"),
    ("gemuese/salat/romanasalat", "Lactuca sativa"),
    ("gemuese/salat/baby-leaf", "Lactuca sativa"),
    ("gemuese/sellerie/knollensellerie", "Apium graveolens"),
    ("gemuese/sellerie/stangen--bleichsellerie", "Apium graveolens"),
    ("gemuese/spinatspinat-aehnliche/spinat", "Spinacia oleracea"),
    ("gemuese/spinatspinat-aehnliche/neuseelaender-spinat", "Tetragonia tetragonioides"),
    ("gemuese/blattstielgemuese", "Beta vulgaris"),
    ("gemuese/zwiebeln", "Allium cepa"),
    ("gemuese/lauchzwiebeln", "Allium fistulosum"),
    ("gemuese/artischocke", "Cynara cardunculus"),
    ("gemuese/asia-salate", "Brassica juncea"),
    ("gemuese/chicoree", "Cichorium intybus"),
    ("gemuese/schwarz-haferwurzel", "Scorzonera hispanica"),
    ("gemuese/winterpostelein", "Claytonia perfoliata"),
    ("gemuese/zucchini", "Cucurbita pepo"),
    ("gemuese/catalogna", "Cichorium intybus"),
    ("gemuese/zichoriensalate", "Cichorium intybus"),
]
# Herb categories, in the same (url_path, default_species_scientific_name)
# form as VEGETABLE_CATEGORIES.
HERB_CATEGORIES = [
    ("kraeuter/basilikum", "Ocimum basilicum"),
    ("kraeuter/bohnenkraut", "Satureja hortensis"),
    ("kraeuter/borretsch", "Borago officinalis"),
    ("kraeuter/dill", "Anethum graveolens"),
    ("kraeuter/kuemmel", "Carum carvi"),
    ("kraeuter/kerbel", "Anthriscus cerefolium"),
    ("kraeuter/koriander", "Coriandrum sativum"),
    ("kraeuter/gewuerzfenchel", "Foeniculum vulgare"),
    ("kraeuter/kultursauerampfer", "Rumex acetosa"),
    ("kraeuter/lavendel", "Lavandula angustifolia"),
    ("kraeuter/liebstock", "Levisticum officinale"),
    ("kraeuter/majoran", "Origanum majorana"),
    ("kraeuter/oregano", "Origanum vulgare"),
    ("kraeuter/pimpinelle", "Sanguisorba minor"),
    ("kraeuter/estragon", "Artemisia dracunculus"),
    ("kraeuter/salbei", "Salvia officinalis"),
    ("kraeuter/schnittlauch", "Allium schoenoprasum"),
    ("kraeuter/schnittknoblauch", "Allium tuberosum"),
    ("kraeuter/schwarzkuemmel", "Nigella sativa"),
    ("kraeuter/speisechrysantheme", "Glebionis coronaria"),
    ("kraeuter/thymian", "Thymus vulgaris"),
    ("kraeuter/ysop", "Hyssopus officinalis"),
    ("kraeuter/winterkresse", "Barbarea vulgaris"),
    ("kraeuter/brunnenkresse", "Nasturtium officinale"),
    ("kraeuter/melisse", "Melissa officinalis"),
    ("kraeuter/petersilie", "Petroselinum crispum"),
    ("kraeuter/schnittsellerie", "Apium graveolens"),
    ("kraeuter/beifuss", "Artemisia vulgaris"),
]
# The green manure category has no default species (None).
# presumably the species is resolved per product instead — TODO confirm in the scraping code
GREEN_MANURE_CATEGORIES = [
    ("gruenduengung", None),
]
# Every category, in order: vegetables, herbs, then green manure.
ALL_CATEGORIES = VEGETABLE_CATEGORIES + HERB_CATEGORIES + GREEN_MANURE_CATEGORIES
# ── Stats ─────────────────────────────────────────────────────────────────
# Module-level run statistics. All entries start as integer counters except
# "species_not_matched" and "errors", which start as empty lists.
stats = {
    "categories_scraped": 0,
    "products_found": 0,
    "detail_pages_fetched": 0,
    "cultivars_created": 0,
    "cultivars_existed": 0,
    "supplier_links_created": 0,
    "supplier_links_existed": 0,
    "species_created": 0,
    "families_created": 0,
    "species_not_matched": [],
    "errors": [],
}
# ── HTTP helpers ──────────────────────────────────────────────────────────
def fetch_page(url: str) -> str:
    """Download *url* and return its body as text.

    The request carries the scraper's User-Agent and times out after 30
    seconds. The body is decoded as UTF-8 with undecodable bytes replaced.
    An HTTP 404 yields an empty string; any other HTTP error propagates.
    """
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            payload = response.read()
    except urllib.error.HTTPError as exc:
        if exc.code != 404:
            raise
        return ""
    return payload.decode("utf-8", errors="replace")
def api_get(path: str, params: dict = None) -> dict:
    """Issue an authenticated GET against HerbAPI and return the parsed JSON.

    *path* is appended to ``API_BASE``; *params*, when non-empty, is
    URL-encoded into the query string. HTTP errors are not caught here.
    """
    query = ("?" + urllib.parse.urlencode(params)) if params else ""
    request = urllib.request.Request(
        f"{API_BASE}{path}{query}",
        headers={
            "Authorization": f"Bearer {API_TOKEN}",
            "Accept": "application/json",
        },
    )
    with urllib.request.urlopen(request, timeout=30) as response:
        return json.loads(response.read())
def api_post(path: str, data: dict) -> tuple:
    """Send *data* as JSON in an authenticated POST to HerbAPI.

    Returns ``(response_dict, status_code)``. On an HTTP error nothing is
    raised: the tuple holds ``{"error": <body text>, "_status": <code>}``
    and the error code, so callers branch on the presence of ``"id"``.
    """
    request = urllib.request.Request(
        f"{API_BASE}{path}",
        data=json.dumps(data).encode("utf-8"),
        method="POST",
        headers={
            "Authorization": f"Bearer {API_TOKEN}",
            "Content-Type": "application/json",
            "Accept": "application/json",
        },
    )
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            return json.loads(response.read()), response.status
    except urllib.error.HTTPError as exc:
        message = exc.read().decode("utf-8", errors="replace")
        return {"error": message, "_status": exc.code}, exc.code
# ── HTML parsing helpers ──────────────────────────────────────────────────
def parse_product_links(html: str) -> list:
    """Return the unique ``(url, name)`` product links found on a listing page.

    Links carrying the ``product-item-link`` class are preferred. Only when
    none are found is a looser pattern tried, which accepts any anchor whose
    href points below a gemuese/kraeuter/gruenduengung category. Relative
    hrefs are prefixed with ``SITE_BASE``. Order of first appearance is kept.
    """
    def absolute(href):
        # Hrefs that already start with "http" are taken as-is.
        return href if href.startswith("http") else SITE_BASE + href

    # Magento product-item-link pattern
    primary = re.compile(
        r'<a[^>]+href="([^"]*?/de/bio-saatgut/[^"]+?)"[^>]*class="[^"]*product-item-link[^"]*"[^>]*>\s*(.*?)\s*</a>',
        re.DOTALL | re.IGNORECASE
    )
    found = []
    for hit in primary.finditer(html):
        # The anchor text may contain nested markup; keep only the text.
        label = re.sub(r'<[^>]+>', '', hit.group(2)).strip()
        if label:
            found.append((absolute(hit.group(1)), label))

    if not found:
        # Broader pattern for product detail links
        fallback = re.compile(
            r'href="([^"]*?/de/bio-saatgut/(?:gemuese|kraeuter|gruenduengung)/[^"]+?/[^"/.]+)"[^>]*>\s*([^<]{3,})',
            re.IGNORECASE
        )
        raw_seen = set()
        for hit in fallback.finditer(html):
            href = hit.group(1).strip()
            label = hit.group(2).strip()
            if not label or href in raw_seen or href.endswith(".html"):
                continue
            raw_seen.add(href)
            found.append((absolute(href), label))

    # Deduplicate by final URL: the first name seen for a URL wins.
    unique = {}
    for href, label in found:
        unique.setdefault(href, label)
    return list(unique.items())
def extract_latin_from_detail(html: str) -> Optional[str]:
    """Extract the Latin/botanical name from a product detail page.

    Three patterns are tried in priority order: italic/emphasised text, an
    element whose class mentions botanical/latin/species, and a labelled
    field ("Botanischer Name", "Lateinischer Name", "Art").

    Returns the first candidate whose first word starts with an upper-case
    letter and whose second word starts with a lower-case letter, or None
    when no candidate qualifies.
    """
    patterns = [
        r'<(?:em|i)[^>]*>\s*([A-Z][a-z]+\s+[a-z]{2,}(?:\s+(?:var\.|subsp\.)\s+[a-z]+)?)\s*</(?:em|i)>',
        r'class="[^"]*(?:botanical|latin|species)[^"]*"[^>]*>\s*([A-Z][a-z]+\s+[a-z]{2,})',
        r'(?:Botanischer?\s+Name|Lateinischer?\s+Name|Art)\s*:?\s*(?:<[^>]+>)*\s*([A-Z][a-z]+\s+[a-z]{2,})',
    ]
    for pat in patterns:
        # IGNORECASE is needed for tag and label matching, but it also lets
        # the capture group match wrongly-cased text such as "Neue Sorte".
        # Validate every match of a pattern, not just the first, so an
        # earlier rejected candidate cannot hide a real name further down.
        for m in re.finditer(pat, html, re.IGNORECASE):
            name = m.group(1).strip()
            parts = name.split()
            if len(parts) >= 2 and parts[0][0].isupper() and parts[1][0].islower():
                return name
    return None
def extract_description_from_detail(html: str) -> str:
    """Return the plain-text product description from a detail page.

    Candidate containers are tried in order. The first one whose text,
    after tag removal and whitespace collapsing, is longer than 20
    characters wins and is truncated to 2000 characters. Returns an empty
    string when no container qualifies.
    """
    container_patterns = (
        r'<div[^>]*class="[^"]*product[- ]description[^"]*"[^>]*>(.*?)</div>',
        r'<div[^>]*class="[^"]*beschreibung[^"]*"[^>]*>(.*?)</div>',
        r'data-content-type="description"[^>]*>(.*?)</div>',
    )
    flags = re.DOTALL | re.IGNORECASE
    for container in container_patterns:
        found = re.search(container, html, flags)
        if found is None:
            continue
        # Tags become spaces so adjacent words do not fuse together.
        plain = re.sub(r'<[^>]+>', ' ', found.group(1))
        plain = re.sub(r'\s+', ' ', plain).strip()
        if len(plain) > 20:
            return plain[:2000]
    return ""
def extract_article_number(product_name: str, url: str) -> Optional[str]:
    """Return the supplier's article number for a product, or None.

    A parenthesised code in the product name such as ``(G802)`` or
    ``(G 802)`` takes precedence and is returned with spaces removed.
    Otherwise a trailing ``-<letter><digits>[letter]`` on the last URL path
    segment is used, upper-cased.
    """
    in_name = re.search(r'\(([A-Z]\s*\d+[A-Z]?)\)', product_name)
    if in_name:
        return in_name.group(1).replace(" ", "")
    last_segment = url.rstrip("/").rsplit("/", 1)[-1]
    in_slug = re.search(r'-([a-z]\d+[a-z]?)$', last_segment, re.IGNORECASE)
    return in_slug.group(1).upper() if in_slug else None
def extract_variety_name(product_name: str) -> str:
    """Reduce a full product name to the bare variety/cultivar name.

    A trailing article number such as ``(G802)`` is dropped, then each
    known German crop-type prefix is stripped from the start of the name
    (case-insensitively, in list order), and finally surrounding whitespace
    and quote characters are removed.
    """
    # Remove article number suffix like (G802)
    variety = re.sub(r'\s*\([A-Z]\s*\d+[A-Z]?\)\s*$', '', product_name.strip())

    # Common German vegetable/herb type prefixes to strip
    type_prefixes = (
        # Tomatoes
        r'(?:Normal(?:früchtige)?|Fleisch|Cherry|Balkon|Wild|Freiland|Roma|Ochsenherz|'
        r'Cocktail|Dattel|Mini|Snack|Stab|Busch|Salat|Zwerg)[\s-]*[Tt]omate\s+',
        # Beans
        r'(?:Busch|Stangen|Dicke|Feuer|Spaghetti)[\s-]*[Bb]ohne\s+',
        r'Edamame(?:-Sojabohne)?\s+',
        # Peas
        r'(?:Mark|Schal|Zucker|Pal)[\s-]*[Ee]rbse\s+',
        # Cucurbits
        r'(?:Salat|Einlege|Gewürz|Freiland|Schlangen)[\s-]*[Gg]urke\s+',
        r'Zucchini\s+',
        r'Kürbis\s+',
        r'(?:Wasser)?[Mm]elone\s+',
        # Brassicas
        r'(?:Blumen|Grün|Rot|Weiß|Rosen)[\s-]*[Kk]ohl\s+',
        r'Kohlrabi\s+',
        r'Wirsing\s+',
        r'Brokkoli\s+',
        r'Chinakohl\s+',
        r'Pak\s+Choi\s+',
        r'Kohlrübe\s+',
        r'Mai-/Herbstrüben?(?:/Navets)?\s+',
        # Root vegetables
        r'Möhre\s+',
        r'Karotten?(?:\s*-?\s*Mix)?\s+',
        r'Pastinake\s+',
        r'Radies(?:chen)?\s+',
        r'Rettich\s+',
        r'Schwarzwurzel\s+',
        r'Haferwurzel\s+',
        r'Petersilienwurzel\s+',
        # Beets
        r'(?:Rote|Gelbe|Weiße)\s+Bete?\s+',
        r'Mangold\s+',
        # Lettuce & leafy
        r'(?:Kopf|Eichblatt|Batavia|Eis|Lollo|Romana|Baby-Leaf)[\s-]*[Ss]alat\s+',
        r'Feldsalat\s+',
        r'Endivie\s+',
        r'Asia[\s-]*Salat\s+',
        r'Spinat\s+',
        # Alliums
        r'Zwiebel\s+',
        r'Lauchzwiebel\s+',
        r'Porree(?:/Lauch)?\s+',
        r'Schnittlauch\s+',
        r'Schnittknoblauch\s+',
        # Peppers
        r'(?:Gemüse|Block|Spitz|Papier)[\s-]*[Pp]aprika\s+',
        r'Chili\s+',
        # Celery
        r'(?:Knollen|Stangen|Bleich|Schnitt)[\s-]*[Ss]ellerie\s+',
        # Herbs
        r'Basilikum\s+',
        r'Koriander\s+',
        r'Dill\s+',
        r'Petersilie\s+',
        r'(?:Knollen|Gewürz)[\s-]*[Ff]enchel\s+',
        r'Salbei\s+',
        r'Thymian\s+',
        r'Oregano\s+',
        r'Lavendel\s+',
        r'Melisse\s+',
        r'Majoran\s+',
        r'Estragon\s+',
        r'Kresse\s+',
        r'Bohnenkraut\s+',
        r'Borretsch\s+',
        r'Kümmel\s+',
        r'Kerbel\s+',
        r'Liebstock\s+',
        r'Ysop\s+',
        r'Pimpinelle\s+',
        r'Beifuß\s+',
        r'Schwarzkümmel\s+',
        # Other
        r'Zuckermais\s+',
        r'Artischocke\s+',
        r'Physalis\s+',
        r'Aubergine\s+',
        r'Catalogna\s+',
    )
    # Prefixes are applied one after another, so a name that exposes a
    # second known prefix after the first is stripped can lose both.
    for type_prefix in type_prefixes:
        anchored = re.compile(r'^' + type_prefix, re.IGNORECASE)
        variety = anchored.sub('', variety)

    return variety.strip().strip("'\"")
# ── API data caches ───────────────────────────────────────────────────────
# In-memory mirrors of HerbAPI records. load_api_data() fills them at start-up
# and the find_or_create_* / process_product functions extend them as new
# records are created.
species_cache = {}   # lower-cased scientific name -> species record as returned by the API
family_cache = {}    # lower-cased scientific name -> family record as returned by the API
cultivar_cache = {}  # slug -> {"id", "name", "species_id"}
# ID of the Bingenheimer supplier record; assigned by load_api_data().
supplier_id = None
def load_api_data():
    """Populate the module caches from HerbAPI and resolve the supplier.

    Families, species and cultivars are read page by page (100 per page)
    into ``family_cache``, ``species_cache`` and ``cultivar_cache``. The
    Bingenheimer supplier is then looked up by name and created if absent.
    The process exits with status 1 when the supplier cannot be created.
    """
    global supplier_id

    def records(endpoint):
        # Walk the endpoint's pages; a page shorter than 100 is the last one.
        page_no = 1
        while True:
            batch = api_get(endpoint, {"per_page": 100, "page": page_no})["data"]
            yield from batch
            if len(batch) < 100:
                return
            page_no += 1

    print("Loading existing HerbAPI data...")

    # Load families
    for family in records("/families"):
        family_cache[family["name_scientific"].lower()] = family
    print(f" Loaded {len(family_cache)} families")

    # Load species
    for species in records("/species"):
        species_cache[species["name_scientific"].lower()] = species
    print(f" Loaded {len(species_cache)} species")

    # Load ALL cultivars, keeping only the fields used for matching
    for cultivar in records("/cultivars"):
        cultivar_cache[cultivar["slug"]] = {
            "id": cultivar["id"],
            "name": cultivar["name"],
            "species_id": cultivar["species_id"],
        }
    print(f" Loaded {len(cultivar_cache)} cultivars")

    # Create or find Bingenheimer supplier
    known = next(
        (s for s in api_get("/suppliers") if "bingenheimer" in s["name"].lower()),
        None,
    )
    if known is not None:
        supplier_id = known["id"]
        print(f" Found existing supplier: {known['name']} ({known['id']})")
    if supplier_id:
        return

    print(" Creating Bingenheimer Saatgut supplier...")
    created, _status = api_post("/suppliers", {
        "name": "Bingenheimer Saatgut",
        "url": "https://www.bingenheimersaatgut.de",
        "country": "DE",
        "is_organic": True,
        "is_demeter": True,
        "notes": "German biodynamic seed company, Demeter certified, open-pollinated varieties"
    })
    if "id" not in created:
        print(f" ERROR creating supplier: {created}")
        sys.exit(1)
    supplier_id = created["id"]
    print(f" Created supplier: {created['id']}")
# Outdated or infraspecific names mapped to the species name to look up.
# NOTE(review): "allium porrum" / "allium ampeloprasum" (leek) map to
# "allium cepa" (onion), and "origanum majorana" (marjoram) maps to
# "origanum vulgare" (oregano). These are different species; confirm the
# lumping is intentional before relying on it.
_SPECIES_SYNONYMS = {
    "lycopersicon esculentum": "solanum lycopersicum",
    "capsicum annuum var. annuum": "capsicum annuum",
    "brassica oleracea var. botrytis": "brassica oleracea",
    "brassica oleracea var. italica": "brassica oleracea",
    "brassica oleracea var. gemmifera": "brassica oleracea",
    "brassica oleracea var. gongylodes": "brassica oleracea",
    "brassica oleracea var. capitata": "brassica oleracea",
    "brassica oleracea var. sabauda": "brassica oleracea",
    "brassica oleracea var. sabellica": "brassica oleracea",
    "brassica rapa var. rapa": "brassica rapa",
    "brassica rapa subsp. pekinensis": "brassica rapa",
    "brassica rapa subsp. chinensis": "brassica rapa",
    "beta vulgaris var. conditiva": "beta vulgaris",
    "beta vulgaris subsp. vulgaris": "beta vulgaris",
    "beta vulgaris var. vulgaris": "beta vulgaris",
    "allium porrum": "allium cepa",
    "allium ampeloprasum": "allium cepa",
    "origanum majorana": "origanum vulgare",
    "cichorium intybus var. foliosum": "cichorium intybus",
    "petroselinum crispum var. tuberosum": "petroselinum crispum",
    "apium graveolens var. rapaceum": "apium graveolens",
    "apium graveolens var. dulce": "apium graveolens",
    "lactuca sativa var. capitata": "lactuca sativa",
    "lactuca sativa var. crispa": "lactuca sativa",
    "lactuca sativa var. longifolia": "lactuca sativa",
}

# Genus -> plant family, used to pick the family of a newly created species.
_GENUS_FAMILY_MAP = {
    "Solanum": "Solanaceae", "Capsicum": "Solanaceae", "Physalis": "Solanaceae",
    "Nicandra": "Solanaceae",
    "Cucumis": "Cucurbitaceae", "Cucurbita": "Cucurbitaceae", "Citrullus": "Cucurbitaceae",
    "Phaseolus": "Fabaceae", "Pisum": "Fabaceae", "Vicia": "Fabaceae",
    "Glycine": "Fabaceae", "Lens": "Fabaceae", "Lupinus": "Fabaceae",
    "Trifolium": "Fabaceae", "Medicago": "Fabaceae", "Vigna": "Fabaceae",
    "Brassica": "Brassicaceae", "Raphanus": "Brassicaceae", "Eruca": "Brassicaceae",
    "Lepidium": "Brassicaceae", "Nasturtium": "Brassicaceae", "Barbarea": "Brassicaceae",
    "Sinapis": "Brassicaceae", "Crambe": "Brassicaceae", "Diplotaxis": "Brassicaceae",
    "Allium": "Amaryllidaceae",
    "Daucus": "Apiaceae", "Petroselinum": "Apiaceae", "Apium": "Apiaceae",
    "Foeniculum": "Apiaceae", "Pastinaca": "Apiaceae", "Coriandrum": "Apiaceae",
    "Anethum": "Apiaceae", "Levisticum": "Apiaceae", "Anthriscus": "Apiaceae",
    "Carum": "Apiaceae", "Myrrhis": "Apiaceae", "Pimpinella": "Apiaceae",
    "Sanguisorba": "Rosaceae",
    "Lactuca": "Asteraceae", "Cichorium": "Asteraceae", "Cynara": "Asteraceae",
    "Helianthus": "Asteraceae", "Calendula": "Asteraceae", "Tagetes": "Asteraceae",
    "Scorzonera": "Asteraceae", "Tragopogon": "Asteraceae", "Glebionis": "Asteraceae",
    "Artemisia": "Asteraceae",
    "Beta": "Chenopodiaceae", "Spinacia": "Chenopodiaceae",
    "Atriplex": "Chenopodiaceae", "Chenopodium": "Chenopodiaceae",
    "Ocimum": "Lamiaceae", "Origanum": "Lamiaceae", "Thymus": "Lamiaceae",
    "Salvia": "Lamiaceae", "Melissa": "Lamiaceae", "Lavandula": "Lamiaceae",
    "Satureja": "Lamiaceae", "Hyssopus": "Lamiaceae", "Rosmarinus": "Lamiaceae",
    "Mentha": "Lamiaceae",
    "Zea": "Poaceae",
    "Borago": "Boraginaceae", "Phacelia": "Boraginaceae",
    "Valerianella": "Caprifoliaceae",
    "Tropaeolum": "Tropaeolaceae",
    "Rumex": "Polygonaceae",
    "Nigella": "Ranunculaceae",
    "Claytonia": "Montiaceae",
    "Tetragonia": "Aizoaceae",
    "Basella": "Basellaceae",
    "Plantago": "Plantaginaceae",
}


def _reload_species_cache():
    """Re-read every species page from HerbAPI into ``species_cache``."""
    page = 1
    while True:
        batch = api_get("/species", {"per_page": 100, "page": page})["data"]
        for record in batch:
            species_cache[record["name_scientific"].lower()] = record
        if len(batch) < 100:
            break
        page += 1


def find_or_create_species(latin_name: str) -> Optional[str]:
    """Find a species by Latin name or create it. Returns the species ID.

    Lookup order: exact (case-insensitive) name, then the first two words
    (dropping var./subsp. suffixes), then the synonym table. If nothing
    matches, the species is created under the family mapped from its genus.

    Returns None when the name is empty or blank, the genus is unknown, the
    family cannot be resolved, or creation fails and the species is still
    absent after reloading the cache. Failures are recorded in ``stats``.
    """
    if not latin_name:
        return None
    # Normalise once so the lookup key, the cache key and the name sent to
    # the API all agree. A blank name has no genus to split off.
    latin_name = latin_name.strip()
    if not latin_name:
        return None
    key = latin_name.lower()

    # Direct match
    if key in species_cache:
        return species_cache[key]["id"]

    # Try without subspecies/variety
    base = " ".join(key.split()[:2])
    if base in species_cache:
        return species_cache[base]["id"]

    # Handle synonyms
    syn_key = _SPECIES_SYNONYMS.get(key)
    if syn_key is not None and syn_key in species_cache:
        return species_cache[syn_key]["id"]

    # Try to create the species
    genus = latin_name.split()[0]
    family_name = _GENUS_FAMILY_MAP.get(genus)
    if not family_name:
        print(f" WARNING: Unknown genus '{genus}' for species '{latin_name}'")
        stats["species_not_matched"].append(latin_name)
        return None
    family_id = find_or_create_family(family_name)
    if not family_id:
        return None

    print(f" Creating species: {latin_name}")
    resp, code = api_post("/species", {
        "name_scientific": latin_name,
        "family_id": family_id,
    })
    if "id" in resp:
        species_cache[key] = resp
        stats["species_created"] += 1
        return resp["id"]

    # Creation failed. The species might already exist, so reload and retry.
    print(f" Species creation returned {code}: {resp.get('error','')[:100]}")
    _reload_species_cache()
    if key in species_cache:
        return species_cache[key]["id"]
    stats["errors"].append(f"Species creation failed: {latin_name}")
    return None
def find_or_create_family(family_name: str) -> Optional[str]:
    """Find or create a plant family. Returns the family ID.

    The cache is consulted first (case-insensitively). On a miss the family
    is created through the API. If creation fails, the family list is
    reloaded in case it already exists. Returns None, and records an error
    in ``stats``, when the family is still unknown after that.
    """
    key = family_name.lower()
    if key in family_cache:
        return family_cache[key]["id"]

    print(f" Creating family: {family_name}")
    resp, _status = api_post("/families", {"name_scientific": family_name})
    if "id" in resp:
        family_cache[key] = resp
        stats["families_created"] += 1
        return resp["id"]

    # Reload. Page through the endpoint 100 at a time, as load_api_data()
    # does; a single oversized request could stop short of the wanted family.
    page = 1
    while True:
        batch = api_get("/families", {"per_page": 100, "page": page})["data"]
        for ff in batch:
            family_cache[ff["name_scientific"].lower()] = ff
        if len(batch) < 100:
            break
        page += 1

    if key in family_cache:
        return family_cache[key]["id"]
    stats["errors"].append(f"Family creation failed: {family_name}")
    return None
def slugify(text: str) -> str:
    """Turn *text* into a lower-case, hyphen-separated, ASCII-only slug.

    Accented letters listed below are transliterated, every other character
    outside ``a-z``, ``0-9``, whitespace and ``-`` is dropped, whitespace
    runs become a single hyphen, and leading/trailing hyphens are removed.
    """
    transliteration = str.maketrans({
        "ä": "a", "ö": "o", "ü": "u", "ß": "ss",
        "é": "e", "è": "e", "ê": "e", "ë": "e",
        "à": "a", "â": "a", "á": "a",
        "ô": "o", "ù": "u", "û": "u", "ú": "u",
        "ï": "i", "î": "i", "í": "i",
        "ç": "c", "ñ": "n", "ó": "o",
        "œ": "oe", "æ": "ae",
    })
    slug = text.lower().translate(transliteration)
    slug = re.sub(r'[^a-z0-9\s-]', '', slug).strip()
    slug = re.sub(r'[\s]+', '-', slug)
    slug = re.sub(r'-+', '-', slug)
    return slug.strip('-')
def find_existing_cultivar(species_name: str, variety_name: str, species_id: str) -> Optional[str]:
    """Return the ID of an already-known cultivar, or None.

    The slug built from "<species> <variety>" is looked up first. Failing
    that, the cache is scanned for a cultivar of the same species whose
    name equals *variety_name* case-insensitively.
    """
    by_slug = cultivar_cache.get(slugify(f"{species_name} {variety_name}"))
    if by_slug is not None:
        return by_slug["id"]

    wanted = variety_name.lower()
    return next(
        (
            entry["id"]
            for entry in cultivar_cache.values()
            if entry["species_id"] == species_id and entry["name"].lower() == wanted
        ),
        None,
    )
def scrape_category(cat_path: str, default_species: Optional[str]):
    """Fetch one category listing and process every product linked from it.

    *default_species* is passed on to ``process_product`` as the fallback
    botanical name. A listing that cannot be found is reported and skipped
    without touching the counters.
    """
    listing_url = f"{SITE_BASE}/de/bio-saatgut/{cat_path}.html"
    print(f"\n{'='*60}")
    print(f"Category: {cat_path}")

    listing_html = fetch_page(listing_url)
    if not listing_html:
        print(" SKIP: Page not found (404)")
        return
    time.sleep(DELAY)

    entries = parse_product_links(listing_html)
    found = len(entries)
    print(f" Found {found} products")
    stats["products_found"] += found
    stats["categories_scraped"] += 1

    for product_url, product_name in entries:
        process_product(product_url, product_name, default_species)
def process_product(prod_url: str, prod_name: str, default_species: Optional[str]):
    """Import one product: resolve its species and cultivar, then link it.

    Steps: derive the article number and variety name from the listing
    data; skip obvious mixes; fetch the detail page for a Latin name and
    description; resolve the species (falling back to *default_species*);
    reuse or create the cultivar; finally attach the supplier link.
    Outcomes are reported on stdout and tallied in ``stats``.
    """
    sku = extract_article_number(prod_name, prod_url)
    variety = extract_variety_name(prod_name)
    if not variety:
        print(f" SKIP (no variety): {prod_name}")
        return

    # Skip mixes, sets, bundles
    skip_keywords = ("mischung", "saatscheibe", "saatband", "saatplatte",
                     "saat-set", " mix ", "trio ", "quartett", "gutschein",
                     "buch ", "düngung", "erde ", "-garten")
    lowered = prod_name.lower()
    flagged = any(kw in lowered for kw in skip_keywords)
    # Exception: if the variety name itself is the whole thing, keep it
    if flagged and variety.lower() != lowered:
        # Only skip if it really seems like a mix
        if any(tag in lowered for tag in ("mischung", "mix", "trio")):
            print(f" SKIP (mix/set): {prod_name}")
            return

    print(f"\n Product: {prod_name}")
    print(f" Variety: {variety}, SKU: {sku}")

    # Fetch detail page; any failure here is reported but not fatal, the
    # category's default species is used further down.
    latin = None
    blurb = ""
    time.sleep(DELAY)
    try:
        page = fetch_page(prod_url)
        stats["detail_pages_fetched"] += 1
        if page:
            latin = extract_latin_from_detail(page)
            blurb = extract_description_from_detail(page)
    except Exception as exc:
        print(f" WARNING: Detail page error: {exc}")

    species_name = latin or default_species
    if not species_name:
        print(f" SKIP: No species for '{prod_name}'")
        stats["species_not_matched"].append(prod_name)
        return
    print(f" Species: {species_name}")

    species_id = find_or_create_species(species_name)
    if not species_id:
        print(f" SKIP: Could not resolve species '{species_name}'")
        return

    # Check if cultivar already exists
    cultivar_id = find_existing_cultivar(species_name, variety, species_id)
    if cultivar_id:
        print(" EXISTS: cultivar already in DB")
        stats["cultivars_existed"] += 1
    else:
        cultivar_id = None
        # Create cultivar
        payload = {
            "species_id": species_id,
            "name": variety,
            "name_de": variety,
            "is_organic": True,
        }
        if blurb:
            payload["description"] = blurb
        resp, code = api_post("/cultivars", payload)

        if "id" in resp:
            cultivar_id = resp["id"]
            cultivar_cache[resp["slug"]] = {
                "id": resp["id"],
                "name": variety,
                "species_id": species_id,
            }
            stats["cultivars_created"] += 1
            print(f" CREATED: {resp['slug']}")
        elif code == 500 and "Database error" in str(resp.get("error", "")):
            # Likely slug conflict - try to find existing
            print(" DB conflict - searching for existing cultivar...")
            # Re-read cultivar pages, refreshing the cache along the way,
            # and stop at the first page that contains a match.
            wanted = variety.lower()
            page_no = 1
            while True:
                batch = api_get("/cultivars", {"per_page": 100, "page": page_no})["data"]
                for record in batch:
                    cultivar_cache[record["slug"]] = {
                        "id": record["id"],
                        "name": record["name"],
                        "species_id": record["species_id"],
                    }
                    if record["species_id"] == species_id and record["name"].lower() == wanted:
                        cultivar_id = record["id"]
                if cultivar_id or len(batch) < 100:
                    break
                page_no += 1

            if not cultivar_id:
                print(" ERROR: DB error and could not find existing cultivar")
                stats["errors"].append(f"DB error + not found: {species_name} / {variety}")
                return
            print(f" Found existing after conflict: {cultivar_id}")
            stats["cultivars_existed"] += 1
        else:
            print(f" ERROR ({code}): {str(resp.get('error',''))[:100]}")
            stats["errors"].append(f"Create failed: {variety}: {resp.get('error','')[:80]}")
            return

    # Link to supplier
    if not (cultivar_id and supplier_id):
        return
    link_data = {
        "supplier_id": supplier_id,
        "product_url": prod_url,
    }
    if sku:
        link_data["article_number"] = sku
    resp, code = api_post(f"/cultivars/{cultivar_id}/suppliers", link_data)
    if "id" in resp:
        stats["supplier_links_created"] += 1
        print(f" LINKED (SKU: {sku})")
    elif code == 500 or "already" in str(resp.get("error", "")).lower():
        # Any HTTP 500 is counted as "link already present" here.
        stats["supplier_links_existed"] += 1
        print(" LINK EXISTS")
    else:
        print(f" LINK ERROR ({code}): {str(resp.get('error',''))[:80]}")
        stats["errors"].append(f"Link failed: {variety}: {resp.get('error','')[:60]}")
def main():
    """Run the whole import and print a summary.

    Loads the existing HerbAPI data, scrapes every configured category
    (a failing category is recorded and does not stop the run), then prints
    the counters plus up to 30 unmatched species and 30 errors.
    Returns 0 when no errors were recorded, otherwise 1.
    """
    rule = "=" * 60
    print(rule)
    print("Bingenheimer Saatgut Scraper for HerbAPI")
    print(rule)

    load_api_data()

    print(f"\nScraping {len(ALL_CATEGORIES)} categories...")
    for cat_path, default_species in ALL_CATEGORIES:
        try:
            scrape_category(cat_path, default_species)
        except Exception as exc:
            print(f" ERROR in category {cat_path}: {exc}")
            stats["errors"].append(f"Category error: {cat_path}: {exc}")

    # Summary
    print("\n" + rule)
    print("SCRAPING COMPLETE - SUMMARY")
    print(rule)
    summary_rows = (
        ("Categories scraped", "categories_scraped"),
        ("Products found", "products_found"),
        ("Detail pages fetched", "detail_pages_fetched"),
        ("Cultivars created", "cultivars_created"),
        ("Cultivars existed", "cultivars_existed"),
        ("Supplier links created", "supplier_links_created"),
        ("Supplier links existed", "supplier_links_existed"),
        ("Species created", "species_created"),
        ("Families created", "families_created"),
    )
    for label, counter in summary_rows:
        print(f"{label}: {stats[counter]}")
    print(f"Errors: {len(stats['errors'])}")

    unmatched = stats["species_not_matched"]
    if unmatched:
        print(f"\nUnmatched species ({len(unmatched)}):")
        for entry in unmatched[:30]:
            print(f" - {entry}")

    failures = stats["errors"]
    if failures:
        print(f"\nErrors ({len(failures)}):")
        for entry in failures[:30]:
            print(f" - {entry}")

    return 1 if stats["errors"] else 0


if __name__ == "__main__":
    sys.exit(main())
+760
View File
@@ -0,0 +1,760 @@
#!/usr/bin/env python3
"""
Scraper for Dreschflegel organic seed catalog (dreschflegel-saatgut.de).
Extracts cultivar data and imports into HerbAPI.
Run 2 - fixes pagination (API caps at 100/page), better species matching,
caches scraped products, handles duplicates gracefully.
"""
import urllib.request
import urllib.parse
import urllib.error
import gzip
import json
import re
import time
import sys
import os
import html as html_mod
from collections import defaultdict
# --- Configuration ---
# Endpoint, token and cache location can be overridden through environment
# variables; the literals are the defaults this script shipped with.
API_BASE = os.environ.get("HERBAPI_BASE", "http://herbapi01.corp.sub-net.at:8080/api/v1")
# NOTE(review): this bearer token is committed to version control. Rotate it
# and supply the replacement via HERBAPI_TOKEN instead of the default below.
API_TOKEN = os.environ.get("HERBAPI_TOKEN", "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk")
SITE_BASE = "https://www.dreschflegel-saatgut.de"
# Seconds to sleep before each product page fetch.
DELAY = 0.5
USER_AGENT = "Mozilla/5.0 (compatible; HerbAPI-Scraper/1.0)"
# JSON file mapping scraped URLs to parsed product data (see scrape_all_products).
CACHE_FILE = os.environ.get("DRESCHFLEGEL_CACHE_FILE", "/tmp/dreschflegel_products_cache.json")
# Line-buffered output: each completed line is flushed immediately, so
# progress stays visible when stdout/stderr are redirected.
sys.stdout.reconfigure(line_buffering=True)
sys.stderr.reconfigure(line_buffering=True)
# Run-wide counters; a missing key reads as 0, so call sites can simply
# do ``stats["name"] += 1``.
stats = defaultdict(int)
def api_request(method, path, data=None):
    """Make an authenticated JSON request to HerbAPI.

    Returns the parsed JSON response, or None on any HTTP error. Responses
    that look like duplicates (409, "already exists"/"duplicate" in the
    body, or a 500 "database error") are swallowed silently; other HTTP
    errors are printed. Non-HTTP failures such as connection errors or
    timeouts propagate to the caller.
    """
    url = f"{API_BASE}{path}"
    body = json.dumps(data).encode("utf-8") if data else None
    req = urllib.request.Request(url, data=body, method=method)
    req.add_header("Authorization", f"Bearer {API_TOKEN}")
    req.add_header("Content-Type", "application/json")
    req.add_header("Accept", "application/json")
    try:
        # Bounded wait, and the response is closed deterministically.
        with urllib.request.urlopen(req, timeout=30) as resp:
            return json.loads(resp.read().decode("utf-8"))
    except urllib.error.HTTPError as e:
        body_text = e.read().decode("utf-8", errors="replace")
        lowered = body_text.lower()
        if e.code == 409 or "already exists" in lowered or "duplicate" in lowered:
            return None  # Duplicate, handled silently
        if e.code == 500 and "database error" in lowered:
            # Likely a unique constraint violation = duplicate
            return None
        print(f" API error {e.code} {method} {path}: {body_text[:200]}")
        return None
def fetch_page(url):
    """Fetch a web page with the scraper's User-Agent.

    Returns the body decoded as UTF-8 (undecodable bytes replaced), or None
    when the request fails for any reason; the failure is printed. Callers
    are responsible for rate limiting, this function does not sleep.
    """
    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        # Context manager so the connection is released even on read errors.
        with urllib.request.urlopen(req, timeout=30) as resp:
            return resp.read().decode("utf-8", errors="replace")
    except Exception as e:
        # Best effort by design: a single bad page must not abort the crawl.
        print(f" Fetch error {url}: {e}")
        return None
def get_sitemap_urls():
    """Download the sitemap index and return every URL in its child sitemaps.

    Child sitemaps ending in ``.xml.gz`` are decompressed; those ending in
    ``.xml`` are read as plain text. Index entries with any other ending are
    ignored. A child sitemap that cannot be fetched is reported and skipped.
    Returns an empty list when the index itself cannot be fetched.
    """
    print("Fetching sitemap index...")
    index_xml = fetch_page(f"{SITE_BASE}/sitemap.xml")
    if not index_xml:
        return []

    all_urls = []
    for smap_url in re.findall(r"<loc>(.*?)</loc>", index_xml):
        if smap_url.endswith(".xml.gz"):
            print(f" Fetching compressed sitemap...")
            req = urllib.request.Request(smap_url, headers={"User-Agent": USER_AGENT})
            try:
                with urllib.request.urlopen(req, timeout=30) as resp:
                    data = gzip.decompress(resp.read()).decode("utf-8")
            except Exception as e:
                print(f" Error: {e}")
                continue
        elif smap_url.endswith(".xml"):
            # Uncompressed child sitemap; fetch_page reports its own errors.
            print(" Fetching sitemap...")
            data = fetch_page(smap_url)
            if not data:
                continue
        else:
            continue
        urls = re.findall(r"<loc>(.*?)</loc>", data)
        all_urls.extend(urls)
        print(f" Found {len(urls)} URLs")
    return all_urls
def classify_urls(urls):
    """Return the URLs that look like product pages.

    A URL qualifies when, after removing a trailing slash and the site
    host, its path is a single non-empty segment that does not start with
    one of the known non-product prefixes. Returned URLs have the trailing
    slash removed.
    """
    non_product_prefixes = (
        "impressum", "agb", "datenschutz", "kontakt", "widerrufs",
        "versand", "abkuerz", "zertifikat", "wichtige-hinweise",
        "muster-", "gutscheine", "kalender", "flyer", "katalog",
        "sommer-herbst", "unsere-hoefe", "bestellschein",
        "dreschflegel-news", "termine", "rezepte", "anbautipps",
        "tipps-zur", "gartentelefon", "gartenfreude", "buecher",
        "navigation", "vielfalt", "sut20", "saatgut",
        "neuheiten", "kennenlernangebote", "sut25", "vielfalt25",
        "saatgut-vielfalt", "saat",
    )
    site_roots = (
        "https://dreschflegel-saatgut.de/",
        "https://www.dreschflegel-saatgut.de/",
    )
    candidates = []
    for raw in urls:
        trimmed = raw.rstrip("/")
        path = trimmed
        for root in site_roots:
            path = path.replace(root, "")
        # Empty path = site root; a slash means a nested (non-product) path
        # or a URL on a host/scheme that was not stripped above.
        if not path or "/" in path:
            continue
        if path.startswith(non_product_prefixes):
            continue
        candidates.append(trimmed)
    return candidates
def parse_product_page(html_content):
    """Parse a Dreschflegel product page into a dict, or return None.

    A page without a ``class="botname"`` marker is not a product page.
    The result always carries ``name`` and ``botanical_name`` (otherwise
    None is returned) and, when present on the page, ``article_number``,
    ``price`` (float), ``description`` and ``pack_info``.
    """
    if not html_content or 'class="botname"' not in html_content:
        return None

    def grab(pattern, flags=0):
        # First capture group of the first match, or None if no match.
        found = re.search(pattern, html_content, flags)
        return found.group(1) if found else None

    result = {}

    title = grab(r"<h1>(.*?)</h1>")
    if title is not None:
        result["name"] = html_mod.unescape(title.strip())

    botanical = grab(r'<div class="botname">\s*(.*?)\s*</div>', re.DOTALL)
    if botanical is not None:
        result["botanical_name"] = html_mod.unescape(botanical.strip())

    order_number = grab(r'class="product-detail-ordernumber"[^>]*>\s*(\d+)', re.DOTALL)
    if order_number is not None:
        result["article_number"] = order_number

    raw_price = grab(r'itemprop="price"[^>]*content="([^"]+)"')
    if raw_price is not None:
        try:
            result["price"] = float(raw_price)
        except ValueError:
            # A non-numeric price is simply left out of the result.
            pass

    paragraph = grab(r"product-detail-description-text.*?<p>(.*?)</p>", re.DOTALL)
    if paragraph is not None:
        text = re.sub(r"<[^>]+>", "", paragraph.strip())
        text = html_mod.unescape(text).strip()
        if text:
            result["description"] = text

    pack = grab(r"Inhalt reicht f[üu]r:</th><td>\s*(.*?)\s*</td>")
    if pack is not None:
        result["pack_info"] = html_mod.unescape(pack.strip())

    if "name" in result and "botanical_name" in result:
        return result
    return None
def scrape_all_products(candidate_urls):
    """Scrape product pages, using the on-disk cache for known URLs.

    The cache maps a URL to its parsed product dict, or to None when the
    page was fetched successfully but is not a product page. URLs whose
    fetch failed are NOT cached, so transient network errors are retried on
    the next run. The cache is written every 100 fetched URLs and once at
    the end. Returns the list of product dicts (cached plus newly scraped).
    """
    def save_cache():
        with open(CACHE_FILE, "w", encoding="utf-8") as f:
            json.dump(cache, f)

    # Load cache. A truncated or unreadable file (e.g. from an interrupted
    # run) is ignored and rebuilt, so it cannot block all future runs.
    cache = {}
    if os.path.exists(CACHE_FILE):
        try:
            with open(CACHE_FILE, "r", encoding="utf-8") as f:
                cache = json.load(f)
        except (OSError, ValueError) as e:
            print(f" Ignoring unreadable cache {CACHE_FILE}: {e}")
            cache = {}
        else:
            print(f" Loaded {len(cache)} cached products")

    products = []
    to_fetch = [u for u in candidate_urls if u not in cache]
    already_cached = [u for u in candidate_urls if u in cache]

    # Add cached products
    for u in already_cached:
        if cache[u]:  # None means "not a product page"
            products.append(cache[u])
    cached_products = len(products)
    cached_non_products = len(already_cached) - cached_products
    print(f" {cached_products} products from cache, "
          f"{cached_non_products} non-products cached, "
          f"{len(to_fetch)} to fetch")

    for i, url in enumerate(to_fetch):
        if (i + 1) % 50 == 0 or i == 0:
            print(f" Fetching {i + 1}/{len(to_fetch)}...")
        time.sleep(DELAY)
        html_content = fetch_page(url)
        if not html_content:
            # Fetch failure: count it but do not cache, otherwise the URL
            # would be treated as a non-product page forever.
            stats["fetch_errors"] += 1
            continue
        product = parse_product_page(html_content)
        if product:
            product["url"] = url
            products.append(product)
            cache[url] = product
            stats["products_scraped"] += 1
        else:
            cache[url] = None
            stats["not_product_pages"] += 1
        # Save cache periodically
        if (i + 1) % 100 == 0:
            save_cache()

    # Final cache save
    save_cache()
    print(f" Total: {len(products)} products ({stats['products_scraped']} newly scraped)")
    return products
def paginated_get(path):
    """Collect the ``data`` items of every page of a HerbAPI list endpoint.

    Pages of 100 are requested until a request fails, a response has no
    (or an empty) ``data`` list, or a page comes back shorter than 100.
    """
    separator = "&" if "?" in path else "?"
    collected = []
    page_no = 1
    while True:
        reply = api_request("GET", f"{path}{separator}per_page=100&page={page_no}")
        if not reply or "data" not in reply:
            break
        batch = reply["data"]
        if not batch:
            break
        collected.extend(batch)
        if len(batch) < 100:
            break
        page_no += 1
    return collected
def load_api_data():
    """Fetch all families, species and cultivars from HerbAPI.

    Returns ``(families, species, cultivars)``: families and species are
    keyed by lower-cased scientific name (species names also stripped),
    cultivars by ``(species_id, lower-cased stripped name)``.
    """
    print("Loading HerbAPI data...")

    families = {
        record["name_scientific"].lower(): record
        for record in paginated_get("/families")
    }
    print(f" {len(families)} families")

    species = {
        record["name_scientific"].lower().strip(): record
        for record in paginated_get("/species")
    }
    print(f" {len(species)} species")

    cultivars = {
        (record["species_id"], record["name"].lower().strip()): record
        for record in paginated_get("/cultivars")
    }
    print(f" {len(cultivars)} cultivars")

    return families, species, cultivars
def ensure_supplier():
    """Return the Dreschflegel supplier record, creating it when absent.

    An existing supplier is recognised by "dreschflegel" appearing in its
    lower-cased name. Returns whatever the create request returned when no
    existing supplier was found (falsy on failure).
    """
    known = api_request("GET", "/suppliers")
    existing = next(
        (s for s in (known or []) if "dreschflegel" in s["name"].lower()),
        None,
    )
    if existing is not None:
        print(f" Supplier exists: {existing['name']} ({existing['id']})")
        return existing

    payload = {
        "name": "Dreschflegel",
        "url": "https://www.dreschflegel-saatgut.de",
        "country": "DE",
        "is_organic": True,
        "is_demeter": False,
        "notes": "German organic seed cooperative, open-pollinated heritage varieties",
    }
    created = api_request("POST", "/suppliers", payload)
    if created:
        print(f" Created supplier: {created['name']} ({created['id']})")
    return created
# Genus → family mapping for species creation.
# Keys are genus names exactly as they appear in botanical names (capitalised);
# values are the scientific family name used when a species has to be created.
# Every genus appears exactly once: duplicate keys in a dict literal are
# silently overwritten, which hides conflicting edits.
GENUS_TO_FAMILY = {
    # Asteraceae
    "Achillea": "Asteraceae", "Artemisia": "Asteraceae", "Aster": "Asteraceae",
    "Calendula": "Asteraceae", "Carthamus": "Asteraceae", "Centaurea": "Asteraceae",
    "Chamomilla": "Asteraceae", "Chrysanthemum": "Asteraceae", "Cichorium": "Asteraceae",
    "Cnicus": "Asteraceae", "Cosmos": "Asteraceae", "Cynara": "Asteraceae",
    "Dahlia": "Asteraceae", "Dimorphotheca": "Asteraceae", "Echinacea": "Asteraceae",
    "Echinops": "Asteraceae", "Erigeron": "Asteraceae", "Eupatorium": "Asteraceae",
    "Gaillardia": "Asteraceae", "Helenium": "Asteraceae", "Helianthus": "Asteraceae",
    "Helichrysum": "Asteraceae", "Inula": "Asteraceae", "Lactuca": "Asteraceae",
    "Leontodon": "Asteraceae", "Matricaria": "Asteraceae", "Onopordum": "Asteraceae",
    "Petasites": "Asteraceae", "Rudbeckia": "Asteraceae", "Scorzonera": "Asteraceae",
    "Silphium": "Asteraceae", "Solidago": "Asteraceae", "Tagetes": "Asteraceae",
    "Tanacetum": "Asteraceae", "Taraxacum": "Asteraceae", "Telekia": "Asteraceae",
    "Tragopogon": "Asteraceae", "Tussilago": "Asteraceae", "Zinnia": "Asteraceae",
    "Xerochrysum": "Asteraceae", "Coreopsis": "Asteraceae",
    # Solanaceae
    "Capsicum": "Solanaceae", "Lycium": "Solanaceae", "Nicotiana": "Solanaceae",
    "Physalis": "Solanaceae", "Solanum": "Solanaceae", "Atropa": "Solanaceae",
    # Cucurbitaceae
    "Citrullus": "Cucurbitaceae", "Cucumis": "Cucurbitaceae", "Cucurbita": "Cucurbitaceae",
    "Luffa": "Cucurbitaceae", "Momordica": "Cucurbitaceae",
    # Fabaceae
    "Cicer": "Fabaceae", "Glycine": "Fabaceae", "Lathyrus": "Fabaceae",
    "Lens": "Fabaceae", "Lupinus": "Fabaceae", "Medicago": "Fabaceae",
    "Phaseolus": "Fabaceae", "Pisum": "Fabaceae", "Trifolium": "Fabaceae",
    "Trigonella": "Fabaceae", "Vicia": "Fabaceae", "Vigna": "Fabaceae",
    "Caragana": "Fabaceae", "Cytisus": "Fabaceae", "Robinia": "Fabaceae",
    # Brassicaceae
    "Armoracia": "Brassicaceae", "Barbarea": "Brassicaceae", "Brassica": "Brassicaceae",
    "Crambe": "Brassicaceae", "Eruca": "Brassicaceae", "Hesperis": "Brassicaceae",
    "Iberis": "Brassicaceae", "Isatis": "Brassicaceae", "Lepidium": "Brassicaceae",
    "Lunaria": "Brassicaceae", "Raphanus": "Brassicaceae", "Sinapis": "Brassicaceae",
    "Nasturtium": "Brassicaceae", "Diplotaxis": "Brassicaceae",
    # Apiaceae
    "Anethum": "Apiaceae", "Anthriscus": "Apiaceae", "Apium": "Apiaceae",
    "Carum": "Apiaceae", "Chaerophyllum": "Apiaceae", "Coriandrum": "Apiaceae",
    "Daucus": "Apiaceae", "Foeniculum": "Apiaceae", "Levisticum": "Apiaceae",
    "Myrrhis": "Apiaceae", "Pastinaca": "Apiaceae", "Petroselinum": "Apiaceae",
    "Pimpinella": "Apiaceae", "Angelica": "Apiaceae", "Aegopodium": "Apiaceae",
    # Lamiaceae
    "Agastache": "Lamiaceae", "Ajuga": "Lamiaceae", "Dracocephalum": "Lamiaceae",
    "Elsholtzia": "Lamiaceae", "Hyssopus": "Lamiaceae", "Lavandula": "Lamiaceae",
    "Melissa": "Lamiaceae", "Mentha": "Lamiaceae", "Monarda": "Lamiaceae",
    "Nepeta": "Lamiaceae", "Ocimum": "Lamiaceae", "Origanum": "Lamiaceae",
    "Perilla": "Lamiaceae", "Rosmarinus": "Lamiaceae", "Salvia": "Lamiaceae",
    "Satureja": "Lamiaceae", "Stachys": "Lamiaceae", "Thymus": "Lamiaceae",
    # Amaryllidaceae / Alliaceae
    "Allium": "Amaryllidaceae",
    # Poaceae
    "Avena": "Poaceae", "Hordeum": "Poaceae", "Panicum": "Poaceae",
    "Secale": "Poaceae", "Sorghum": "Poaceae", "Triticum": "Poaceae",
    "Zea": "Poaceae", "Setaria": "Poaceae",
    # Chenopodiaceae
    "Atriplex": "Chenopodiaceae", "Beta": "Chenopodiaceae",
    "Chenopodium": "Chenopodiaceae", "Spinacia": "Chenopodiaceae",
    # Rosaceae
    "Filipendula": "Rosaceae", "Fragaria": "Rosaceae", "Malus": "Rosaceae",
    "Prunus": "Rosaceae", "Pyrus": "Rosaceae", "Rosa": "Rosaceae",
    "Rubus": "Rosaceae", "Sanguisorba": "Rosaceae", "Sorbus": "Rosaceae",
    "Waldsteinia": "Rosaceae",
    # Boraginaceae
    "Borago": "Boraginaceae", "Phacelia": "Boraginaceae", "Symphytum": "Boraginaceae",
    "Pulmonaria": "Boraginaceae", "Myosotis": "Boraginaceae",
    # Malvaceae
    "Alcea": "Malvaceae", "Althaea": "Malvaceae", "Malva": "Malvaceae",
    "Hibiscus": "Malvaceae", "Lavatera": "Malvaceae", "Abelmoschus": "Malvaceae",
    # Polygonaceae
    "Fagopyrum": "Polygonaceae", "Rheum": "Polygonaceae", "Rumex": "Polygonaceae",
    # Caryophyllaceae
    "Agrostemma": "Caryophyllaceae", "Dianthus": "Caryophyllaceae",
    "Gypsophila": "Caryophyllaceae", "Lychnis": "Caryophyllaceae",
    "Saponaria": "Caryophyllaceae", "Silene": "Caryophyllaceae",
    # Tropaeolaceae
    "Tropaeolum": "Tropaeolaceae",
    # Papaveraceae
    "Eschscholzia": "Papaveraceae", "Papaver": "Papaveraceae",
    "Meconopsis": "Papaveraceae",
    # Caprifoliaceae
    "Valerianella": "Caprifoliaceae", "Valeriana": "Caprifoliaceae",
    "Lonicera": "Caprifoliaceae",
    # Plantaginaceae
    "Digitalis": "Plantaginaceae", "Plantago": "Plantaginaceae",
    "Antirrhinum": "Plantaginaceae", "Linaria": "Plantaginaceae",
    # Violaceae
    "Viola": "Violaceae",
    # Ranunculaceae
    "Aquilegia": "Ranunculaceae", "Consolida": "Ranunculaceae",
    "Delphinium": "Ranunculaceae", "Nigella": "Ranunculaceae",
    # Linaceae
    "Linum": "Linaceae",
    # Convolvulaceae
    "Ipomoea": "Convolvulaceae", "Convolvulus": "Convolvulaceae",
    # Portulacaceae / Montiaceae
    "Claytonia": "Montiaceae", "Portulaca": "Portulacaceae",
    # Amaranthaceae
    "Amaranthus": "Amaranthaceae", "Celosia": "Amaranthaceae",
    "Gomphrena": "Amaranthaceae",
    # Asparagaceae
    "Asparagus": "Asparagaceae",
    # Resedaceae
    "Reseda": "Resedaceae",
    # Balsaminaceae
    "Impatiens": "Balsaminaceae",
    # Hydrangeaceae
    "Hydrangea": "Hydrangeaceae",
    # Campanulaceae
    "Campanula": "Campanulaceae", "Phyteuma": "Campanulaceae",
    # Scrophulariaceae
    "Verbascum": "Scrophulariaceae",
    # Verbenaceae
    "Verbena": "Verbenaceae",
    # Onagraceae
    "Oenothera": "Onagraceae", "Clarkia": "Onagraceae",
    # Cucurbitaceae extras
    "Benincasa": "Cucurbitaceae", "Lagenaria": "Cucurbitaceae",
    # Hypericaceae
    "Hypericum": "Hypericaceae",
    # Adoxaceae
    "Sambucus": "Adoxaceae",
    # Others
    "Dipsacus": "Caprifoliaceae",
    "Knautia": "Caprifoliaceae",
    "Scabiosa": "Caprifoliaceae",
    "Succisa": "Caprifoliaceae",
    "Asclepias": "Apocynaceae",
    "Cynoglossum": "Boraginaceae",
    "Echium": "Boraginaceae",
    "Anchusa": "Boraginaceae",
    "Lithospermum": "Boraginaceae",
    "Onobrychis": "Fabaceae",
    "Ornithopus": "Fabaceae",
    "Lotus": "Fabaceae",
    "Anthyllis": "Fabaceae",
    "Melilotus": "Fabaceae",
    "Galega": "Fabaceae",
    "Lespedeza": "Fabaceae",
    "Arachis": "Fabaceae",
    "Senna": "Fabaceae",
    # Additional genera found in Dreschflegel catalog
    # NOTE(review): "Nicotinia" below looks like a catalog misspelling of
    # "Nicotiana" kept on purpose so such products still resolve — confirm.
    "Acmella": "Asteraceae", "Adonis": "Ranunculaceae", "Ageratum": "Asteraceae",
    "Amethystia": "Lamiaceae", "Anacyclus": "Asteraceae", "Anthemis": "Asteraceae",
    "Asphodeline": "Asphodelaceae", "Brachyscome": "Asteraceae", "Bupleurum": "Apiaceae",
    "Callistephus": "Asteraceae", "Camelina": "Brassicaceae", "Cardaria": "Brassicaceae",
    "Cardiospermum": "Sapindaceae", "Cerinthe": "Boraginaceae",
    "Chamaemelum": "Asteraceae", "Cistanthe": "Montiaceae", "Cleome": "Cleomaceae",
    "Cochlearia": "Brassicaceae", "Codonopsis": "Campanulaceae", "Coix": "Poaceae",
    "Cyperus": "Cyperaceae", "Digitaria": "Poaceae", "Dorotheanthus": "Aizoaceae",
    "Emilia": "Asteraceae", "Eragrostis": "Poaceae", "Erysimum": "Brassicaceae",
    "Euphorbia": "Euphorbiaceae", "Gentiana": "Gentianaceae", "Geum": "Rosaceae",
    "Gilia": "Polemoniaceae", "Godetia": "Onagraceae", "Helipterum": "Asteraceae",
    "Lallemantia": "Lamiaceae", "Leonurus": "Lamiaceae", "Leuzea": "Asteraceae",
    "Liatris": "Asteraceae", "Malope": "Malvaceae", "Marrubium": "Lamiaceae",
    "Matthiola": "Brassicaceae", "Maurandya": "Plantaginaceae",
    "Melothria": "Cucurbitaceae", "Meum": "Apiaceae", "Nemesia": "Scrophulariaceae",
    "Nicandra": "Solanaceae", "Nicotinia": "Solanaceae", "Oenanthe": "Apiaceae",
    "Oxalis": "Oxalidaceae", "Pennisetum": "Poaceae", "Penstemon": "Plantaginaceae",
    "Phlox": "Polemoniaceae", "Polemonium": "Polemoniaceae",
    "Porophyllum": "Asteraceae", "Primula": "Primulaceae", "Psyllium": "Plantaginaceae",
    "Quamoclit": "Convolvulaceae", "Ruta": "Rutaceae", "Salpiglossis": "Solanaceae",
    "Sanvitalia": "Asteraceae", "Sideritis": "Lamiaceae", "Silybum": "Asteraceae",
    "Talinum": "Talinaceae", "Thelesperma": "Asteraceae", "Vaccaria": "Caryophyllaceae",
    "Veronica": "Plantaginaceae", "Xeranthemum": "Asteraceae",
}
def normalize_species_name(botanical_name):
    """Split a botanical name into ``(genus, species_epithet)`` for matching.

    Only the first two name components are kept, so anything after the
    epithet (``var.``, ``subsp.``, cultivar text, ...) is dropped.

    Returns:
        ``(None, None)`` when the name has fewer than two words.
        ``(genus, None)`` when the second word is a rank/placeholder marker
        (``var.``, ``subsp.``, ``ssp.``, ``spec.``, ``sp.``) or a hybrid
        marker with nothing after it.
        ``(genus, "x <epithet>")`` for hybrids, whether written with a plain
        ``x`` or the multiplication sign ``×`` (spaced or fused to the
        epithet), so both spellings normalise to the same key.
        ``(genus, epithet)`` otherwise.
    """
    parts = botanical_name.strip().split()
    if len(parts) < 2:
        return None, None

    genus, second = parts[0], parts[1]

    # Hybrid notation with a separate marker: 'Genus x species' / 'Genus × species'.
    if second in ("x", "×"):
        if len(parts) >= 3:
            return genus, f"x {parts[2]}"
        # A dangling marker carries no epithet; do not treat "x" as one.
        return genus, None

    # Hybrid notation with the sign fused to the epithet: 'Genus ×species'.
    if second.startswith("×") and len(second) > 1:
        return genus, f"x {second[1:]}"

    if second in ("var.", "subsp.", "ssp.", "spec.", "sp."):
        # Only genus level - can't match to species
        return genus, None

    return genus, second
def find_species(botanical_name, species_cache):
    """Find an existing species record matching a botanical name.

    ``species_cache`` maps lower-cased scientific names to species records.

    * When the name yields a genus and an epithet, only an exact
      ``"genus epithet"`` key matches. A known epithet is never replaced by a
      different species of the same genus; otherwise the caller would attach
      cultivars to the wrong species and never create the right one.
    * When only the genus is usable (e.g. ``"Cucurbita spec."``), the single
      cached species of that genus is returned if there is exactly one.

    Returns the species record, or ``None`` when nothing matches.
    """
    genus, sp = normalize_species_name(botanical_name)
    if not genus:
        return None

    if sp:
        # Exact genus+species match only.
        return species_cache.get(f"{genus} {sp}".lower())

    # Genus-level name: fall back to the genus only when it is unambiguous.
    genus_prefix = genus.lower() + " "
    same_genus = [v for k, v in species_cache.items() if k.startswith(genus_prefix)]
    if len(same_genus) == 1:
        return same_genus[0]
    return None
def find_or_create_species(botanical_name, families, species_cache):
    """Return the species record for a botanical name, creating it if needed.

    Looks the name up in ``species_cache`` first. Otherwise the genus is
    mapped to a family through ``GENUS_TO_FAMILY``, the family is created or
    re-fetched when it is not in ``families``, and the species is posted to
    the API. Both caches are updated in place and the module-level ``stats``
    counters are incremented. Returns ``None`` when the name has no usable
    epithet, the genus has no family mapping, or creation fails and the
    record cannot be found afterwards.
    """
    existing = find_species(botanical_name, species_cache)
    if existing:
        return existing

    genus, species_epithet = normalize_species_name(botanical_name)
    if not (genus and species_epithet):
        stats["species_no_epithet"] += 1
        return None

    sci_name = f"{genus} {species_epithet}"
    cache_key = sci_name.lower()

    # The normalized name may already be cached even if the lookup above missed.
    if cache_key in species_cache:
        return species_cache[cache_key]

    family_name = GENUS_TO_FAMILY.get(genus)
    if not family_name:
        stats["species_no_family"] += 1
        print(f" [SKIP] No family mapping for genus: {genus} ({botanical_name})")
        return None

    family_key = family_name.lower()
    family = families.get(family_key)
    if not family:
        print(f" Creating family: {family_name}")
        created_family = api_request("POST", "/families", {"name_scientific": family_name})
        if created_family:
            families[family_key] = created_family
            family = created_family
            stats["families_created"] += 1
        else:
            # The POST may have failed because the family already exists
            # (e.g. from a previous run); re-read the list and look for it.
            for candidate in paginated_get("/families"):
                if candidate["name_scientific"].lower() == family_key:
                    families[family_key] = candidate
                    family = candidate
                    break
    if not family:
        print(f" [SKIP] Cannot create family: {family_name}")
        return None

    print(f" Creating species: {sci_name} (family: {family_name})")
    created_species = api_request("POST", "/species", {
        "name_scientific": sci_name,
        "family_id": family["id"],
    })
    if created_species:
        species_cache[cache_key] = created_species
        stats["species_created"] += 1
        return created_species

    # The POST may have failed because the species already exists; look for it.
    time.sleep(0.1)
    for candidate in paginated_get("/species"):
        if candidate["name_scientific"].lower() == cache_key:
            species_cache[cache_key] = candidate
            return candidate
    return None
# Common German crop type prefixes that precede the variety name in product
# titles. Built once at import time and ordered longest first, so a longer
# prefix is always tried before a shorter one.
_CROP_TYPE_PREFIXES = tuple(sorted(
    [
        # Tomatoes
        "Salattomate", "Stabtomate", "Buschtomate", "Cocktailtomate",
        "Cherrytomate", "Fleischtomate", "Wildtomate", "Balkontomate",
        "Flaschentomate", "Eitomate", "Datteltomate", "Tomate",
        # Lettuce
        "Winterkopfsalat", "Kopfsalat", "Bataviasalat", "Eissalat",
        "Blattsalat", "Schnittsalat", "Pflücksalat", "Römersalat",
        "Spargelsalat", "Romanasalat",
        # Beans
        "Buschbohne", "Stangenbohne", "Feuerbohne", "Puffbohne",
        "Prunkbohne",
        # Peas
        "Markerbse", "Zuckererbse", "Palerbse", "Schalerbse",
        "Knackerbse", "Kapuzinererbse",
        # Cucumbers
        "Einlegegurke", "Salatgurke", "Schälgurke", "Landgurke",
        "Freilandgurke",
        # Squash
        "Hokkaidokürbis", "Butternutkürbis", "Speisekürbis",
        "Riesenkürbis", "Zierkürbis", "Muskatkürbis", "Ölkürbis",
        # Melon
        "Wassermelone", "Zuckermelone",
        # Peppers
        "Gemüsepaprika", "Blockpaprika", "Spitzpaprika", "Tomatenpaprika",
        "Snackpaprika", "Peperoni", "Chili",
        # Brassicas
        "Kohlrabi", "Brokkoli", "Blumenkohl", "Grünkohl", "Rosenkohl",
        "Wirsing", "Rotkohl", "Weißkohl", "Spitzkohl", "Palmkohl",
        "Chinakohl", "Pak Choi", "Markstammkohl",
        # Root veg
        "Möhre", "Karotte", "Pastinake", "Rote Bete", "Rote Beete",
        "Herbstrübe", "Mairübe", "Stoppelrübe", "Schwarzer Rettich",
        "Steckrübe", "Knollensellerie", "Petersilienwurzel",
        "Rettich", "Radieschen",
        # Onions
        "Winterheckenzwiebel", "Lauchzwiebel", "Speisezwiebel",
        "Schalotte", "Wintersteckzwiebel", "Zwiebel",
        # Herbs
        "Rotes Basilikum", "Buschbasilikum", "Zitronen-Basilikum",
        "Thai-Basilikum", "Wildes Basilikum", "Zimtbasilikum",
        "Basilikum", "Schnittknoblauch",
        # Grains
        "Sommerweizen", "Winterweizen", "Sommerroggen", "Winterroggen",
        "Nackthafer", "Nacktgerste", "Dinkel", "Emmer", "Einkorn",
        # Misc
        "Zuckermais", "Popcornmais", "Zucchini",
    ],
    key=len,
    reverse=True,
))


def extract_cultivar_name(product_name):
    """Extract the cultivar/variety name from the full product name.

    Strips surrounding whitespace and, when the name starts with a known
    crop-type prefix followed by a space, removes that prefix. A name that
    consists only of a prefix, or matches no prefix, is returned stripped but
    otherwise unchanged.
    """
    name = product_name.strip()
    for prefix in _CROP_TYPE_PREFIXES:
        if name.startswith(prefix + " "):
            return name[len(prefix):].strip()
    return name
def get_existing_supplier_links(cultivar_id, supplier_id):
    """Return True when the cultivar is already linked to the supplier.

    Fetches the cultivar's supplier links and looks for one whose
    ``supplier_id`` equals the given id. A missing or empty response counts
    as "not linked".
    """
    links = api_request("GET", f"/cultivars/{cultivar_id}/suppliers")
    if not links:
        return False
    return any(link["supplier_id"] == supplier_id for link in links)
def main():
    """Run the Dreschflegel import from supplier setup to summary.

    Steps, in order: find or create the supplier, load existing families,
    species and cultivars from the API, collect candidate product URLs from
    the sitemap, scrape the product pages, then for every scraped product
    find or create its species and cultivar and link the cultivar to the
    supplier. Counters are accumulated in the module-level ``stats`` mapping
    and printed at the end. Exits with status 1 when the supplier or the
    sitemap cannot be obtained.
    """
    print("=" * 60)
    print("Dreschflegel Seed Catalog Scraper for HerbAPI (v2)")
    print("=" * 60)
    # Step 1: Supplier
    print("\n[1] Setting up supplier...")
    supplier = ensure_supplier()
    if not supplier:
        print("FATAL: Could not create/find supplier")
        sys.exit(1)
    supplier_id = supplier["id"]
    # Step 2: Load API data
    print("\n[2] Loading existing HerbAPI data...")
    families, species_cache, cultivar_cache = load_api_data()
    # Step 3: Get product URLs
    print("\n[3] Fetching sitemap...")
    all_urls = get_sitemap_urls()
    if not all_urls:
        print("FATAL: Could not fetch sitemap")
        sys.exit(1)
    candidate_urls = classify_urls(all_urls)
    print(f" {len(all_urls)} total URLs, {len(candidate_urls)} product candidates")
    # Step 4: Scrape
    print(f"\n[4] Scraping product pages...")
    products = scrape_all_products(candidate_urls)
    # Step 5: Import
    print(f"\n[5] Importing {len(products)} products into HerbAPI...")
    for i, product in enumerate(products):
        if (i + 1) % 50 == 0:
            print(f" Processing {i + 1}/{len(products)}...")
        # Products without a botanical name cannot be assigned to a species.
        botanical = product.get("botanical_name", "")
        if not botanical:
            stats["no_botanical"] += 1
            continue
        # Find or create species
        sp = find_or_create_species(botanical, families, species_cache)
        if not sp:
            stats["species_not_matched"] += 1
            continue
        species_id = sp["id"]
        cultivar_name = extract_cultivar_name(product["name"])
        # Check if cultivar already exists
        # (the key has the same shape as the keys built in load_api_data)
        cv_key = (species_id, cultivar_name.lower().strip())
        if cv_key in cultivar_cache:
            cv = cultivar_cache[cv_key]
            stats["cultivars_existing"] += 1
        else:
            cv_data = {
                "species_id": species_id,
                "name": cultivar_name,
                "is_organic": True,
            }
            if product.get("description"):
                cv_data["description"] = product["description"]
            cv = api_request("POST", "/cultivars", cv_data)
            if cv:
                cultivar_cache[cv_key] = cv
                stats["cultivars_created"] += 1
            else:
                # Might already exist from previous run - try to find it
                found = False
                for c in paginated_get(f"/cultivars?species_id={species_id}"):
                    if c["name"].lower().strip() == cultivar_name.lower().strip():
                        cultivar_cache[cv_key] = c
                        cv = c
                        stats["cultivars_existing"] += 1
                        found = True
                        break
                if not found:
                    stats["cultivar_create_errors"] += 1
                    continue
        # Link to supplier (check first for idempotency)
        if get_existing_supplier_links(cv["id"], supplier_id):
            stats["supplier_links_existing"] += 1
            continue
        link_data = {
            "supplier_id": supplier_id,
            "article_number": product.get("article_number", ""),
            "product_url": product.get("url", ""),
            "price_eur": product.get("price"),
        }
        # Pack size is only recorded for texts of the form "ca. <integer> <unit>".
        pack_info = product.get("pack_info", "")
        if pack_info:
            m = re.search(r"ca\.?\s*(\d+)\s*(Pfl|Korn|Samen|g|kg|ml)", pack_info)
            if m:
                link_data["pack_size"] = float(m.group(1))
                # Map abbreviated/synonymous units; other units are stored as matched.
                unit_map = {"Pfl": "Pflanzen", "Korn": "Korn", "Samen": "Korn"}
                link_data["pack_unit"] = unit_map.get(m.group(2), m.group(2))
        resp = api_request("POST", f"/cultivars/{cv['id']}/suppliers", link_data)
        if resp:
            stats["supplier_links_created"] += 1
        else:
            stats["supplier_link_errors"] += 1
    # Summary
    print("\n" + "=" * 60)
    print("RESULTS")
    print("=" * 60)
    for key, val in sorted(stats.items()):
        print(f" {key}: {val}")
    print(f"\n Total species in DB: {len(species_cache)}")
    print(f" Total cultivars tracked: {len(cultivar_cache)}")


# Run the import only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
+380
View File
@@ -0,0 +1,380 @@
#!/usr/bin/env python3
"""Scrape Magic Garden Seeds product pages and update herbapi database."""
import subprocess
import re
import time
import os
import sys
# Base psql invocation. The flags give tuples-only (-t), unaligned (-A)
# output with '|' as the field separator (-F|), which is the format
# run_sql's callers split on.
DB_CMD = ['psql', '-h', '10.31.3.90', '-U', 'herbapi', 'herbapi', '-t', '-A', '-F|']

# Environment for psql: the current environment plus the database password.
# NOTE(review): the password is committed in plain text here — consider
# reading it from the environment or a ~/.pgpass file instead.
DB_ENV = dict(os.environ, PGPASSWORD='_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj')

# Lower-case English month name -> month number (1-12).
MONTH_MAP = {
    month_name: month_number
    for month_number, month_name in enumerate(
        (
            'january', 'february', 'march', 'april', 'may', 'june',
            'july', 'august', 'september', 'october', 'november', 'december',
        ),
        start=1,
    )
}
def run_sql(sql):
    """Run one SQL command through psql and return its stripped stdout.

    The command is executed with ``DB_CMD`` and the ``DB_ENV`` environment.

    Raises:
        RuntimeError: when psql exits with a non-zero status (connection
            failure or SQL error). Without this check a failed statement was
            indistinguishable from an empty result, so failed UPDATEs were
            reported as successful.
    """
    result = subprocess.run(
        DB_CMD + ['-c', sql],
        capture_output=True, text=True, env=DB_ENV
    )
    if result.returncode != 0:
        raise RuntimeError(
            f"psql exited with status {result.returncode}: {result.stderr.strip()}"
        )
    return result.stdout.strip()
def fetch_page(url):
    """Download a URL with curl and return the response body as text.

    curl runs silently, follows redirects and gives up after 15 seconds.
    Its exit status is not inspected; on failure the returned text is
    whatever curl wrote to stdout (typically empty).
    """
    curl_cmd = ['curl', '-sL', '--max-time', '15', url]
    completed = subprocess.run(curl_cmd, capture_output=True, text=True)
    return completed.stdout
def parse_months(text):
    """Return the sorted month numbers whose English names occur in text.

    Matching is a case-insensitive substring search for the full month names
    in ``MONTH_MAP``. Returns ``None`` for empty input or when no month name
    is found.
    """
    if not text:
        return None

    remaining = text.lower().strip()
    found = []
    # Longest names first; each matched name is removed from the text before
    # shorter names are tried.
    longest_first = sorted(MONTH_MAP.items(), key=lambda item: len(item[0]), reverse=True)
    for name, number in longest_first:
        if name not in remaining:
            continue
        if number not in found:
            found.append(number)
        remaining = remaining.replace(name, '')

    if not found:
        return None
    return sorted(found)
def parse_depth(text):
    """Parse a sowing depth in centimetres from free text.

    A range such as ``"1 - 2 cm"`` yields the mean rounded to one decimal; a
    single value such as ``"0,5 cm"`` yields that value. Both ``.`` and ``,``
    are accepted as decimal separators. Returns ``None`` for empty input or
    when no ``cm`` value is found.
    """
    if not text:
        return None

    number = r'(\d+(?:[.,]\d+)?)'

    def as_float(raw):
        return float(raw.replace(',', '.'))

    span = re.search(number + r'\s*-\s*' + number + r'\s*cm', text)
    if span:
        low, high = as_float(span.group(1)), as_float(span.group(2))
        return round((low + high) / 2, 1)

    single = re.search(number + r'\s*cm', text)
    return as_float(single.group(1)) if single else None
def parse_spacing(text):
    """Parse planting distance. Returns (row_spacing, plant_spacing).

    Recognised forms, all in centimetres:

    * ``"X x Y cm"`` (also with ``×``): returns ``(Y, X)``.
    * ``"X - Y cm"``: a range; returns ``(None, mean)`` rounded to one decimal.
    * ``"X cm"``: returns ``(None, X)``.

    Both ``.`` and ``,`` are accepted as decimal separators, matching
    ``parse_depth``; previously ``"2,5 cm"`` was read as ``5.0`` because only
    the digits after the comma matched. Returns ``(None, None)`` for empty
    input or when nothing matches.
    """
    if not text:
        return None, None
    text = text.lower().strip()

    number = r'(\d+(?:[.,]\d+)?)'

    def as_float(raw):
        return float(raw.replace(',', '.'))

    # "X x Y cm"
    match = re.search(number + r'\s*(?:x|×)\s*' + number + r'\s*cm', text)
    if match:
        return as_float(match.group(2)), as_float(match.group(1))

    # "X - Y cm" range -> average as plant spacing
    match = re.search(number + r'\s*-\s*' + number + r'\s*cm', text)
    if match:
        return None, round((as_float(match.group(1)) + as_float(match.group(2))) / 2, 1)

    # Single value
    match = re.search(number + r'\s*cm', text)
    if match:
        return None, as_float(match.group(1))

    return None, None
def parse_germination_days(text):
    """Parse a germination time from free text and return it in whole days.

    Weeks are tried before days, and for each unit a range ("1-2 weeks",
    "7 - 14 days") is tried before a single value. A range yields the rounded
    mean; weeks are converted at 7 days each. Returns ``None`` for empty
    input or when nothing matches.
    """
    if not text:
        return None

    lowered = text.lower()
    for unit_pattern, days_per_unit in (('weeks?', 7), ('days?', 1)):
        span = re.search(r'(\d+)\s*-\s*(\d+)\s*' + unit_pattern, lowered)
        if span:
            mean = (int(span.group(1)) + int(span.group(2))) / 2
            return int(round(mean * days_per_unit))
        single = re.search(r'(\d+)\s*' + unit_pattern, lowered)
        if single:
            return int(single.group(1)) * days_per_unit
    return None
def parse_germ_temp(text):
    """Parse a germination temperature from text containing a degree sign.

    A range such as ``"18-22 °C"`` yields the mean rounded to one decimal; a
    single value such as ``"20°C"`` yields that value as a float. Only whole
    numbers directly followed by ``°`` are recognised. Returns ``None`` for
    empty input or when nothing matches.
    """
    if not text:
        return None

    span = re.search(r'(\d+)\s*-\s*(\d+)\s*°', text)
    if span:
        low, high = (float(group) for group in span.groups())
        return round((low + high) / 2, 1)

    single = re.search(r'(\d+)\s*°', text)
    return float(single.group(1)) if single else None
def parse_lifecycle(text):
    """Map a lifecycle description to a "perennial" flag.

    Returns ``True`` when the text mentions "perennial", ``False`` when it
    mentions "annual" or "biennial" (and not "perennial"), and ``None`` for
    empty or unrecognised text.
    """
    if not text:
        return None

    lowered = text.lower().strip()
    # "perennial" wins when several terms are present.
    if 'perennial' in lowered:
        return True
    if any(term in lowered for term in ('annual', 'biennial')):
        return False
    return None
def parse_light(text):
    """Normalise a sunlight description to one of a few fixed labels.

    Returns 'full sun to partial shade', 'full sun', 'partial shade' or
    'shade' depending on the keywords found, checked in that priority order.
    Text with no known keyword is returned lower-cased and stripped; empty
    input returns ``None``.
    """
    if not text:
        return None

    lowered = text.lower().strip()

    if 'full sun' in lowered:
        return 'full sun to partial shade' if 'partial' in lowered else 'full sun'

    if any(word in lowered for word in ('partial', 'semi', 'half')):
        return 'partial shade'

    for keyword, label in (('shade', 'shade'), ('sun', 'full sun')):
        if keyword in lowered:
            return label

    return lowered
def extract_data(html):
    """Extract cultivar attributes from a product page's HTML.

    Reads label/value pairs from consecutive table cells and a description
    from the ``itemprop="description"`` element (falling back to the meta
    description), then converts the recognised specs with the ``parse_*``
    helpers. Returns a dict that contains only the fields that could be
    parsed; possible keys are ``description``, ``plant_spacing_cm``,
    ``row_spacing_cm``, ``planting_depth_cm``, ``harvesting_months``,
    ``direct_sowing_months``, ``indoor_sowing_months``, ``perennial``,
    ``light_requirement``, ``days_to_germination`` and
    ``germination_temp_c``.
    """
    data = {}
    # Extract table cell pairs
    cells = re.findall(r'<td[^>]*>(.*?)</td>', html, re.DOTALL)
    clean_cells = []
    for c in cells:
        # Drop nested tags and collapse whitespace inside each cell.
        clean = re.sub(r'<[^>]+>', ' ', c).strip()
        clean = re.sub(r'\s+', ' ', clean)
        clean_cells.append(clean)
    specs = {}
    i = 0
    # Cells are consumed two at a time as (label, value).
    while i < len(clean_cells) - 1:
        key = clean_cells[i].rstrip(':').strip()
        val = clean_cells[i + 1].strip()
        # Skip pairs whose "label" consists only of digits, separators, '€' or '*'.
        if key and val and not re.match(r'^[\d,.\s€*]+$', key):
            specs[key.lower()] = val
        i += 2
    # Extract description from itemprop="description"
    desc_match = re.search(r'itemprop="description">(.*?)</div>\s*</div>\s*</div>', html, re.DOTALL)
    if desc_match:
        content = desc_match.group(1)
        content = re.sub(r'<style[^>]*>.*?</style>', '', content, flags=re.DOTALL)
        content = re.sub(r'<script[^>]*>.*?</script>', '', content, flags=re.DOTALL)
        content = re.sub(r'<[^>]+>', ' ', content)
        content = re.sub(r'\s+', ' ', content).strip()
        # Cut the text at the first occurrence of each trailing-section marker.
        for marker in ['Other names', 'Additional contact mail', 'Question about']:
            idx = content.find(marker)
            if idx > 0:
                content = content[:idx].strip()
        # Descriptions of 20 characters or fewer are ignored.
        if len(content) > 20:
            data['description'] = content
    # Fall back to the page's meta description.
    if 'description' not in data:
        meta_match = re.search(r'<meta[^>]*name="description"[^>]*content="([^"]*)"', html)
        if meta_match and len(meta_match.group(1)) > 20:
            data['description'] = meta_match.group(1)
    # Parse specs
    if 'planting distance' in specs:
        row_sp, plant_sp = parse_spacing(specs['planting distance'])
        if plant_sp:
            data['plant_spacing_cm'] = plant_sp
        if row_sp:
            data['row_spacing_cm'] = row_sp
    # An explicit "row spacing" spec overrides the value derived above.
    if 'row spacing' in specs:
        match = re.search(r'(\d+(?:\.\d+)?)\s*cm', specs['row spacing'])
        if match:
            data['row_spacing_cm'] = float(match.group(1))
    if 'sowing depth' in specs:
        depth = parse_depth(specs['sowing depth'])
        if depth is not None:
            data['planting_depth_cm'] = depth
    # Harvesting months - prefer explicit harvest time over flowering
    # (only the first spec that is present is consulted, even if it yields no months)
    if 'harvest time' in specs:
        months = parse_months(specs['harvest time'])
        if months:
            data['harvesting_months'] = months
    elif 'harvesting months' in specs:
        months = parse_months(specs['harvesting months'])
        if months:
            data['harvesting_months'] = months
    elif 'flowering months' in specs:
        months = parse_months(specs['flowering months'])
        if months:
            data['harvesting_months'] = months
    if 'when to sow outdoors' in specs:
        months = parse_months(specs['when to sow outdoors'])
        if months:
            data['direct_sowing_months'] = months
    # Use the first indoor-sowing spec that yields any months.
    for indoor_key in ['when to sow indoors', 'pre-cultivation indoors']:
        if indoor_key in specs:
            months = parse_months(specs[indoor_key])
            if months:
                data['indoor_sowing_months'] = months
                break
    if 'lifecycle' in specs:
        perennial = parse_lifecycle(specs['lifecycle'])
        if perennial is not None:
            data['perennial'] = perennial
    if 'sunlight' in specs:
        light = parse_light(specs['sunlight'])
        if light:
            data['light_requirement'] = light
    if 'germination time' in specs:
        days = parse_germination_days(specs['germination time'])
        if days:
            data['days_to_germination'] = days
    if 'germination temperature' in specs:
        temp = parse_germ_temp(specs['germination temperature'])
        if temp:
            data['germination_temp_c'] = temp
    return data
def get_current_values(cultivar_id):
    """Return the non-empty enrichable columns of one cultivar row.

    Queries the cultivar by id and splits the single output line on '|'.
    Returns a dict mapping column name to the raw text value, containing only
    columns whose text is non-empty; returns ``{}`` when the query yields no
    output.
    """
    sql = f"""SELECT description, row_spacing_cm, plant_spacing_cm, planting_depth_cm,
perennial, harvesting_months, direct_sowing_months, light_requirement,
days_to_germination, germination_temp_c, indoor_sowing_months
FROM cultivars WHERE id = '{cultivar_id}'"""
    row = run_sql(sql)
    if not row:
        return {}

    # Same order as the SELECT list above.
    columns = (
        'description', 'row_spacing_cm', 'plant_spacing_cm', 'planting_depth_cm',
        'perennial', 'harvesting_months', 'direct_sowing_months', 'light_requirement',
        'days_to_germination', 'germination_temp_c', 'indoor_sowing_months',
    )
    # NOTE(review): a description containing '|' or a newline would shift the
    # columns here — confirm whether stored descriptions can contain them.
    values = (cell.strip() for cell in row.split('|'))
    return {column: value for column, value in zip(columns, values) if value}
def build_update_sql(cultivar_id, data, current):
    """Build an UPDATE statement for the fields that are not yet populated.

    Args:
        cultivar_id: id of the cultivar row to update.
        data: scraped values keyed by column name.
        current: values already stored; a column with a truthy entry here is
            skipped.

    Returns:
        ``(sql, updated_fields)``, or ``(None, [])`` when there is nothing to
        set. Strings are single-quoted with embedded quotes doubled, booleans
        become ``true``/``false``, lists become a quoted ``{a,b,c}`` array
        literal and numbers are written as-is.
    """
    assignments = []
    touched = []
    for column, new_value in data.items():
        if current.get(column):
            continue
        if isinstance(new_value, str):
            quoted = new_value.replace("'", "''")
            assignments.append(f"{column} = '{quoted}'")
        elif isinstance(new_value, bool):
            # Checked before the numeric branch because bool is a subclass of int.
            literal = 'true' if new_value else 'false'
            assignments.append(f"{column} = {literal}")
        elif isinstance(new_value, list):
            members = ','.join(map(str, new_value))
            assignments.append(f"{column} = '{{{members}}}'")
        elif isinstance(new_value, (int, float)):
            assignments.append(f"{column} = {new_value}")
        touched.append(column)

    if not assignments:
        return None, []

    set_clause = ', '.join(assignments)
    return f"UPDATE cultivars SET {set_clause} WHERE id = '{cultivar_id}';", touched
def main():
    """Enrich Magic Garden Seeds cultivars from their product pages.

    Selects the cultivars linked to the supplier 'Magic Garden Seeds' that
    have a product URL and lack a row spacing or a description, fetches each
    product page, extracts data from it and writes only the fields that are
    still empty in the database. Progress is printed per cultivar and a
    summary with per-field update counts is printed at the end.
    """
    sql = """
SELECT c.id, c.name, cs.product_url
FROM cultivars c
JOIN cultivar_suppliers cs ON c.id = cs.cultivar_id
JOIN suppliers s ON cs.supplier_id = s.id
WHERE s.name = 'Magic Garden Seeds'
AND cs.product_url IS NOT NULL AND cs.product_url <> ''
AND (c.row_spacing_cm IS NULL OR c.description IS NULL OR c.description = '')
ORDER BY c.name;
"""
    rows = run_sql(sql)
    if not rows:
        print("No cultivars to process")
        return
    cultivars = []
    # One output line per cultivar: id|name|product_url.
    for line in rows.strip().split('\n'):
        parts = line.split('|')
        if len(parts) >= 3:
            cultivars.append({
                'id': parts[0],
                'name': parts[1],
                'url': parts[2]
            })
    print(f"Processing {len(cultivars)} MGS cultivars...")
    sys.stdout.flush()
    updated = 0
    skipped = 0
    failed = 0
    fields_updated = {}  # column name -> number of cultivars it was written for
    for i, cv in enumerate(cultivars):
        print(f"[{i+1}/{len(cultivars)}] {cv['name']}...", end=' ', flush=True)
        try:
            html = fetch_page(cv['url'])
            # Responses shorter than 1000 characters are treated as failed fetches.
            if not html or len(html) < 1000:
                print("FAILED (empty page)")
                failed += 1
                time.sleep(0.5)
                continue
            data = extract_data(html)
            if not data:
                print("NO DATA")
                skipped += 1
                time.sleep(0.5)
                continue
            current = get_current_values(cv['id'])
            sql_stmt, upd_fields = build_update_sql(cv['id'], data, current)
            if not sql_stmt:
                print(f"SKIP (all fields populated)")
                skipped += 1
            else:
                run_sql(sql_stmt)
                for f in upd_fields:
                    fields_updated[f] = fields_updated.get(f, 0) + 1
                print(f"OK ({len(upd_fields)} fields: {', '.join(upd_fields)})")
                updated += 1
        except Exception as e:
            # Any error for one cultivar is reported and the run continues.
            print(f"ERROR: {e}")
            failed += 1
        # Pause between cultivars (the early-exit branches above sleep themselves).
        time.sleep(0.5)
    print(f"\n=== MGS Summary ===")
    print(f"Total processed: {len(cultivars)}")
    print(f"Updated: {updated}")
    # Note: this counter also includes pages from which no data was extracted.
    print(f"Skipped (all fields already populated): {skipped}")
    print(f"Failed: {failed}")
    print(f"\nFields updated:")
    for field, count in sorted(fields_updated.items(), key=lambda x: -x[1]):
        print(f" {field}: {count}")


# Run the enrichment only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
+330
View File
@@ -0,0 +1,330 @@
#!/usr/bin/env python3
"""
Scrape NaturaDB wildlife interaction data and enrich HerbAPI species.
"""
import json
import re
import time
import urllib.request
import urllib.error
import sys
# Base URL that api_get/api_put prepend to every HerbAPI path.
HERBAPI_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
# Bearer token sent in the Authorization header of every HerbAPI request.
# NOTE(review): the token is committed in plain text — consider loading it
# from the environment instead.
HERBAPI_TOKEN = "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
# Base URL of NaturaDB plant pages; fetch_naturadb appends "/<slug>/".
NATURADB_BASE = "https://www.naturadb.de/pflanzen"
# User-Agent header sent with NaturaDB requests.
USER_AGENT = "Mozilla/5.0 (compatible; HerbAPI-Enrichment/1.0; +https://sub-net.at)"
# Presumably the pause in seconds between NaturaDB requests; its use is
# outside this excerpt — TODO confirm.
DELAY = 0.5
def api_get(path):
    """Send an authenticated GET to HerbAPI and return the decoded JSON body.

    ``path`` is appended to ``HERBAPI_BASE``. Errors raised by ``urlopen``
    are not caught here.
    """
    request = urllib.request.Request(
        f"{HERBAPI_BASE}{path}",
        headers={
            "Authorization": f"Bearer {HERBAPI_TOKEN}",
            "Accept": "application/json",
        },
    )
    with urllib.request.urlopen(request) as response:
        payload = response.read()
    return json.loads(payload.decode())
def api_put(path, data):
    """Send an authenticated JSON PUT to HerbAPI and return the decoded reply.

    ``data`` is serialised to JSON as the request body and ``path`` is
    appended to ``HERBAPI_BASE``. Errors raised by ``urlopen`` are not caught
    here.
    """
    request = urllib.request.Request(
        f"{HERBAPI_BASE}{path}",
        data=json.dumps(data).encode(),
        method="PUT",
        headers={
            "Authorization": f"Bearer {HERBAPI_TOKEN}",
            "Content-Type": "application/json",
            "Accept": "application/json",
        },
    )
    with urllib.request.urlopen(request) as response:
        payload = response.read()
    return json.loads(payload.decode())
def fetch_naturadb(latin_name):
    """Fetch the NaturaDB page for a Latin plant name.

    The page slug is the lower-cased name with spaces replaced by hyphens.
    Returns the page HTML as a string, or ``None`` on any fetch error. A 404
    is silent; other HTTP errors and all other exceptions are printed.
    """
    page_slug = latin_name.lower().replace(" ", "-")
    url = f"{NATURADB_BASE}/{page_slug}/"
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        with urllib.request.urlopen(request, timeout=15) as response:
            raw = response.read()
        return raw.decode("utf-8", errors="replace")
    except urllib.error.HTTPError as err:
        # A missing page just means NaturaDB has no entry for this plant.
        if err.code != 404:
            print(f" HTTP {err.code} for {url}")
    except Exception as err:
        print(f" Error fetching {url}: {err}")
    return None
def extract_td_value(html, label):
    """Return the text of the table cell that follows a label cell.

    Matches ``<td>label</td>`` (with an optional trailing colon) followed by
    the next ``<td ...>`` cell, and returns that cell's content with HTML
    tags removed and whitespace stripped. Returns ``None`` when the label
    cell is not found.
    """
    cell_pattern = re.compile(
        rf"<td>{re.escape(label)}:?</td>\s*<td[^>]*>(.*?)</td>", re.DOTALL
    )
    hit = cell_pattern.search(html)
    if hit is None:
        return None
    # Strip HTML tags from value
    return re.sub(r"<[^>]+>", "", hit.group(1)).strip()
def extract_native_status(html):
    """Return the native-status labels found in the page's status chips.

    Scans the large, borderless, white-on-colour chips and keeps, in document
    order, the chip texts that are one of the known status labels.
    """
    known_statuses = {
        "heimische Wildform",
        "Archäophyt",
        "Neophyt",
        "nicht heimisch (Neophyt)",
    }
    chip_texts = re.findall(
        r'chip--large\s+chip--no-border\s+clr-white\s+bg-\w+"[^>]*>([^<]+)', html
    )
    return [text.strip() for text in chip_texts if text.strip() in known_statuses]
def extract_badge_tags(html):
    """Return the texts of the large plain-text badge chips on the page.

    Chip texts are stripped; empty texts and the badge "winterhart" are left
    out. Order follows the document.
    """
    raw_texts = re.findall(r'chip--large\s+clr-text"[^>]*>([^<]+)', html)
    stripped = (raw.strip() for raw in raw_texts)
    return [tag for tag in stripped if tag and tag != "winterhart"]
def parse_count(text):
    """Return the integer at the start of text such as '82 (Nektar ...)'.

    Leading whitespace is ignored. Returns ``None`` for empty input or when
    the text does not start with a digit.
    """
    if not text:
        return None
    leading_digits = re.match(r"(\d+)", text.strip())
    if leading_digits is None:
        return None
    return int(leading_digits.group(1))
def parse_specialist_count(text):
    """Return N from text containing 'davon N spezialisiert'.

    Example: '39 (davon 5 spezialisiert)' -> 5. Returns ``None`` for empty
    input or when the phrase is absent.
    """
    if not text:
        return None
    phrase = re.search(r"davon\s+(\d+)\s+spezialisiert", text)
    if phrase is None:
        return None
    return int(phrase.group(1))
def parse_nectar_pollen(text):
    """Return the numerator of a leading 'N/4' rating, e.g. '2/4 - mäßig' -> 2.

    Leading whitespace is ignored. Returns ``None`` for empty input or when
    the text does not start with a rating out of 4.
    """
    if not text:
        return None
    rating = re.match(r"(\d+)/4", text.strip())
    if rating is None:
        return None
    return int(rating.group(1))
def build_wildlife_value(data):
    """Render scraped wildlife data as a single English summary string.

    One sentence is produced per available item, in this order: nectar and
    pollen ratings, wild bees, butterflies/moths (with caterpillar hosts),
    hoverflies, beetles, birds, mammals, native status, and notable badge
    tags. Sentences are joined with single spaces. Returns ``None`` when no
    sentence could be produced.
    """
    value_of = data.get
    sentences = []

    # Nectar and pollen share one sentence.
    ratings = [
        f"{label}: {value_of(key)}/4"
        for label, key in (("Nectar", "nectar"), ("Pollen", "pollen"))
        if value_of(key) is not None
    ]
    if ratings:
        sentences.append(", ".join(ratings) + ".")

    # Wild bees, with the specialist count when known.
    bees = value_of("wildbienen_count")
    if bees is not None:
        bee_specialists = value_of("wildbienen_specialists")
        detail = f" ({bee_specialists} specialists)" if bee_specialists is not None else ""
        sentences.append(f"Supports {bees} wild bee species{detail}.")

    # Butterflies / moths; caterpillar hosts are only reported alongside them.
    butterflies = value_of("schmetterlinge_count")
    if butterflies is not None:
        sentence = f"{butterflies} butterfly/moth species"
        caterpillars = value_of("raupen_count")
        if caterpillars is not None:
            host_specialists = value_of("raupen_specialists")
            spec = f" ({host_specialists} specialized)" if host_specialists is not None else ""
            sentence += f", {caterpillars} as caterpillar host{spec}"
        sentences.append(sentence + ".")

    # Plain counts: hoverflies, beetles, birds, mammals.
    simple_counts = (
        ("schwebfliegen_count", "hoverfly"),
        ("kaefer_count", "beetle"),
        ("vogelarten_count", "bird"),
        ("saeugetier_count", "mammal"),
    )
    for key, noun in simple_counts:
        count = value_of(key)
        if count is not None:
            sentences.append(f"{count} {noun} species.")

    # Native status
    if value_of("native_status"):
        sentences.append(" ".join(data["native_status"]) + ".")

    # Notable badges: only those containing one of these keywords.
    keywords = (
        "insektenpflanze",
        "raupenfutter",
        "vogelschutz",
        "vogelnähr",
        "bienenweide",
    )
    notable = [
        badge
        for badge in data.get("badges", [])
        if any(keyword in badge.lower() for keyword in keywords)
    ]
    if notable:
        sentences.append("Tags: " + ", ".join(notable) + ".")

    if not sentences:
        return None
    return " ".join(sentences)
def scrape_species(html):
    """Collect the wildlife-related table values of a NaturaDB page into a dict.

    Each table cell is looked up by its German row label and handed to the
    matching parser. The returned dict always contains the same keys; what a
    missing row yields is decided by the individual parser helpers.
    """
    nectar_cell = extract_td_value(html, "Nektarwert")
    pollen_cell = extract_td_value(html, "Pollenwert")
    result = {
        "nectar": parse_nectar_pollen(nectar_cell),
        "pollen": parse_nectar_pollen(pollen_cell),
    }

    # Wild bees: the same cell carries the total and the specialist count.
    bee_cell = extract_td_value(html, "Wildbienen")
    result["wildbienen_count"] = parse_count(bee_cell)
    result["wildbienen_specialists"] = parse_specialist_count(bee_cell)

    # Butterflies/moths.
    result["schmetterlinge_count"] = parse_count(
        extract_td_value(html, "Schmetterlinge")
    )

    # Caterpillar hosts: total and specialist count from one cell.
    caterpillar_cell = extract_td_value(html, "Raupen")
    result["raupen_count"] = parse_count(caterpillar_cell)
    result["raupen_specialists"] = parse_specialist_count(caterpillar_cell)

    # Hoverflies, beetles, birds and mammals are plain counts.
    plain_counts = (
        ("schwebfliegen_count", "Schwebfliegen"),
        ("kaefer_count", "Käfer"),
        ("vogelarten_count", "fressende Vogelarten"),
        ("saeugetier_count", "fressende Säugetierarten"),
    )
    for key, row_label in plain_counts:
        result[key] = parse_count(extract_td_value(html, row_label))

    # Page-level extras that are not table cells.
    result["native_status"] = extract_native_status(html)
    result["badges"] = extract_badge_tags(html)
    return result
def has_any_data(data):
    """Tell whether the scraped dict carries at least one usable value.

    ``native_status`` and ``badges`` count only when truthy (non-empty);
    every other key counts as soon as its value is not None, so 0 counts.
    """
    collection_keys = ("native_status", "badges")
    return any(
        bool(value) if key in collection_keys else value is not None
        for key, value in data.items()
    )
def main():
    """Fill empty ``wildlife_value`` fields of HerbAPI species from NaturaDB.

    For each species without a wildlife value: fetch its NaturaDB page, scrape
    it, build the summary string, then GET the full record, set the field and
    PUT it back. A per-outcome tally is printed at the end.
    """
    print("Fetching species list from HerbAPI...")
    species_list = api_get("/species?per_page=200")["data"]
    total = len(species_list)
    print(f"Found {total} species.\n")

    tally = {
        "enriched": 0,
        "has_data": 0,
        "not_found": 0,
        "no_data": 0,
        "errors": 0,
    }

    for position, sp in enumerate(species_list, start=1):
        slug, name = sp["slug"], sp["name_scientific"]
        progress = f"[{position:3d}/{total}] {slug:40s} "

        # Only enrich if wildlife_value is empty/null.
        if sp.get("wildlife_value"):
            print(progress + "SKIP (already has data)")
            tally["has_data"] += 1
            continue

        # The outcome is appended to this line by one of the prints below.
        print(progress, end="", flush=True)

        page = fetch_naturadb(name)
        time.sleep(DELAY)  # pause after every NaturaDB request
        if page is None:
            print("NOT FOUND on NaturaDB")
            tally["not_found"] += 1
            continue

        scraped = scrape_species(page)
        if not has_any_data(scraped):
            print("no wildlife data on page")
            tally["no_data"] += 1
            continue

        wildlife_value = build_wildlife_value(scraped)
        if not wildlife_value:
            print("no wildlife data extracted")
            tally["no_data"] += 1
            continue

        # GET the full record, merge the new value, PUT it back.
        try:
            record = api_get(f"/species/{slug}")
            record["wildlife_value"] = wildlife_value
            # Drop read-only / computed fields the PUT endpoint might reject.
            for read_only in ("created_at", "updated_at", "family"):
                record.pop(read_only, None)
            api_put(f"/species/{record['id']}", record)
            print(f"ENRICHED -> {wildlife_value[:80]}...")
            tally["enriched"] += 1
        except Exception as e:
            # Keep going with the next species; the failure is only counted.
            print(f"API ERROR: {e}")
            tally["errors"] += 1

    print("\n" + "=" * 70)
    print("DONE. Results:")
    print(f" Enriched: {tally['enriched']}")
    print(f" Already had data: {tally['has_data']}")
    print(f" Not on NaturaDB: {tally['not_found']}")
    print(f" No wildlife data: {tally['no_data']}")
    print(f" Errors: {tally['errors']}")
    print(f" Total: {total}")
# Run the enrichment only when this file is executed as a script.
if __name__ == "__main__":
    main()
+560
View File
@@ -0,0 +1,560 @@
#!/usr/bin/env python3
"""
Scrape cultivar data from Reinsaat (reinsaat.at) and push into HerbAPI.
Strategy:
1. Fetch category pages, recursively discover product pages via JSON-LD detection
2. Extract structured data from JSON-LD Product schema + HTML text for growing data
3. Match Latin names to existing species in the API
4. Create cultivar records and link them to Reinsaat supplier
"""
import json
import re
import ssl
import time
import urllib.request
import urllib.error
import urllib.parse
from html.parser import HTMLParser
from dataclasses import dataclass
from typing import Optional
# ── Config ──────────────────────────────────────────────────────────────────
# Base URL that api_get/api_post prepend to every request path.
API_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
# Bearer token sent in the Authorization header of every HerbAPI request.
# NOTE(review): this credential is committed in source — consider loading it
# from the environment or a secrets store and rotating the value.
AUTH_TOKEN = "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
# Supplier id every created/matched cultivar is linked to.
REINSAAT_SUPPLIER_ID = "019ced24-1702-72d1-9acc-90435441a5c4"
DELAY = 0.5 # seconds between requests
# User-Agent header sent with every page fetch.
USER_AGENT = "HerbAPI-Scraper/1.0 (florian.berthold@sub-net.at)"
# ── Categories to scrape ────────────────────────────────────────────────────
# (category_url, default_species_hint for leaf pages in this category)
# A hint of None means products without a detectable Latin name get none.
CATEGORIES = [
    ("https://www.reinsaat.at/shop/DE/tomaten_paradeiser/", "Solanum lycopersicum"),
    ("https://www.reinsaat.at/shop/DE/kuechen-_und_gewuerzkraeuter/", None),
    ("https://www.reinsaat.at/shop/DE/kuerbis/", None),
    ("https://www.reinsaat.at/shop/DE/zucchini/", "Cucurbita pepo"),
    ("https://www.reinsaat.at/shop/DE/bohnen/", None),
    ("https://www.reinsaat.at/shop/DE/karotten_moehren_1/", "Daucus carota"),
    ("https://www.reinsaat.at/shop/DE/rote_ruebe/", "Beta vulgaris"),
    ("https://www.reinsaat.at/shop/DE/blumen_und_heilkraeuter/", None),
]
# ── Known Latin name genera we can match ────────────────────────────────────
# Regex alternation of genus names; interpolated into LATIN_PATTERN below.
KNOWN_GENERA = (
    "Solanum|Cucurbita|Vicia|Phaseolus|Glycine|Daucus|Beta|Borago|Lavandula|"
    "Salvia|Melissa|Thymus|Calendula|Allium|Ocimum|Satureja|Origanum|Anethum|"
    "Foeniculum|Carum|Nigella|Levisticum|Rumex|Majorana|Hyssopus|Coriandrum|"
    "Petroselinum|Eruca|Tropaeolum|Lupinus|Helianthus|Tagetes|Zinnia|Cosmos|"
    "Papaver|Centaurea|Matricaria|Chrysanthemum|Antirrhinum|Lathyrus|Ipomoea|"
    "Phacelia|Trifolium|Symphytum|Urtica|Fragaria|Sambucus"
)
# Matches "<known genus> <lowercase epithet>", optionally followed by the
# author abbreviation "L." and an infraspecific rank (ssp./var./subsp.) with
# its epithet. The whole name is captured as group 1; matching is
# case-sensitive.
LATIN_PATTERN = re.compile(
    rf'((?:{KNOWN_GENERA})\s+[a-z]+(?:\s+L\.?)?(?:\s+(?:ssp|var|subsp)\.\s+[a-z]+)?)'
)
# ── HTML helpers ────────────────────────────────────────────────────────────
class TextExtractor(HTMLParser):
    """Collect the text chunks of an HTML document outside script/style/noscript.

    After ``feed()``, ``parts`` holds every non-empty text chunk (stripped of
    surrounding whitespace) in document order.
    """

    # Tags whose content must not end up in ``parts``.
    _HIDDEN_TAGS = ("script", "style", "noscript")

    def __init__(self):
        super().__init__()
        self.parts = []
        # Nesting depth of currently open hidden tags; 0 means text is kept.
        self._skip = 0

    def handle_starttag(self, tag, attrs):
        """Enter a hidden region when one of the hidden tags opens."""
        if tag in self._HIDDEN_TAGS:
            self._skip += 1

    def handle_endtag(self, tag):
        """Leave a hidden region; a stray end tag never drives the depth below 0."""
        if self._skip > 0 and tag in self._HIDDEN_TAGS:
            self._skip -= 1

    def handle_data(self, data):
        """Record stripped, non-empty text while outside hidden regions."""
        if self._skip != 0:
            return
        chunk = data.strip()
        if chunk:
            self.parts.append(chunk)
def extract_links(html: str, base_url: str) -> list[str]:
    """Return the absolute targets of all double-quoted ``<a href>`` attributes.

    Empty hrefs, fragment-only links and ``javascript:`` links are ignored.
    Relative URLs are resolved against ``base_url``. Each URL appears once, in
    order of first occurrence.
    """
    raw_hrefs = (
        anchor.group(1)
        for anchor in re.finditer(r'<a\s[^>]*href="([^"]*)"', html, re.IGNORECASE)
    )
    absolute = (
        urllib.parse.urljoin(base_url, href)
        for href in raw_hrefs
        if href and not href.startswith(("#", "javascript:"))
    )
    # dict keys keep insertion order, which gives an order-preserving dedupe.
    return list(dict.fromkeys(absolute))
def extract_jsonld_product(html: str) -> Optional[dict]:
    """Return the first JSON-LD object whose ``@type`` is ``"Product"``.

    Every ``<script type="application/ld+json">`` block is tried in document
    order. Blocks that are not valid JSON, are not a JSON object, or have a
    different ``@type`` are skipped. Returns None when nothing qualifies.
    """
    script_blocks = re.finditer(
        r'<script[^>]*type="application/ld\+json"[^>]*>(.*?)</script>',
        html, re.DOTALL | re.IGNORECASE
    )
    for block in script_blocks:
        try:
            payload = json.loads(block.group(1))
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass: malformed block.
            continue
        if isinstance(payload, dict) and payload.get("@type") == "Product":
            return payload
    return None
# ── HTTP helpers ────────────────────────────────────────────────────────────
# Shared TLS context (system default trust store) for all page fetches.
_ssl_ctx = ssl.create_default_context()


def fetch_url(url: str, retries: int = 2) -> str:
    """Fetch ``url`` and return the response body as text.

    Args:
        url: Absolute URL to request.
        retries: Number of additional attempts after the first failure.

    Returns:
        The body decoded with the charset declared by the server (UTF-8 when
        none is declared). Undecodable bytes are replaced rather than raising.
        Returns "" only when ``retries`` is negative and no attempt is made.

    Raises:
        urllib.error.HTTPError, urllib.error.URLError, TimeoutError: the last
        error once all attempts are used up, or immediately for a permanent
        client error (4xx other than 408/429).
    """
    req = urllib.request.Request(url, headers={
        "User-Agent": USER_AGENT,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "de-AT,de;q=0.9,en;q=0.5",
    })
    for attempt in range(retries + 1):
        try:
            with urllib.request.urlopen(req, timeout=30, context=_ssl_ctx) as resp:
                charset = resp.headers.get_content_charset() or "utf-8"
                body = resp.read()
        except urllib.error.HTTPError as err:
            # A missing or forbidden page will not appear two seconds later;
            # only timeouts/throttling (408, 429) and 5xx are worth a retry.
            permanent = 400 <= err.code < 500 and err.code not in (408, 429)
            if permanent or attempt >= retries:
                raise
            time.sleep(2)
            continue
        except (urllib.error.URLError, TimeoutError):
            if attempt >= retries:
                raise
            time.sleep(2)
            continue
        try:
            # One mis-encoded byte must not abort a whole crawl.
            return body.decode(charset, errors="replace")
        except LookupError:
            # The server declared a charset name Python does not know.
            return body.decode("utf-8", errors="replace")
    return ""
def api_get(path: str):
    """Send an authenticated GET to ``API_BASE + path`` and return the parsed JSON.

    Errors raised by ``urlopen`` (including HTTP error statuses) and by JSON
    parsing propagate to the caller.
    """
    request_headers = {
        "Authorization": f"Bearer {AUTH_TOKEN}",
        "Accept": "application/json",
    }
    request = urllib.request.Request(f"{API_BASE}{path}", headers=request_headers)
    with urllib.request.urlopen(request, timeout=15) as response:
        payload = response.read()
    return json.loads(payload)
def api_post(path: str, data: dict):
    """Send ``data`` as a JSON body in an authenticated POST and return the parsed reply.

    On an HTTP error status the first 500 characters of the error body are
    printed before the ``HTTPError`` is re-raised for the caller to handle.
    """
    request = urllib.request.Request(
        f"{API_BASE}{path}",
        data=json.dumps(data).encode("utf-8"),
        method="POST",
        headers={
            "Authorization": f"Bearer {AUTH_TOKEN}",
            "Content-Type": "application/json",
            "Accept": "application/json",
        },
    )
    try:
        with urllib.request.urlopen(request, timeout=15) as response:
            return json.loads(response.read())
    except urllib.error.HTTPError as e:
        # Show what the API complained about, then let the caller decide.
        error_body = e.read().decode("utf-8", errors="replace")
        print(f" API ERROR {e.code}: {error_body[:500]}")
        raise
# ── Species matching ────────────────────────────────────────────────────────
def load_species() -> dict:
    """Load all species from the API, keyed by lower-cased scientific name.

    Pages of 100 are requested until the response signals the last page. Two
    response envelopes are understood, because both appear in this file:
    ``{"data": [...], "pagination": {"total_pages": N}}`` and
    ``{"data": [...], "total": T, "page": P, "per_page": S}``. A bare list or
    an envelope without paging information is treated as a single page.

    Returns:
        dict mapping ``name_scientific.lower().strip()`` to the species dict.
    """
    result = {}
    page = 1
    while True:
        data = api_get(f"/species?per_page=100&page={page}")
        species_list = data.get("data", data) if isinstance(data, dict) else data
        if not species_list:
            # An empty page always ends the loop, whatever the paging fields
            # say; this also guards against looping on inconsistent totals.
            break
        for s in species_list:
            key = s["name_scientific"].lower().strip()
            result[key] = s
        if not isinstance(data, dict):
            break
        if "pagination" in data:
            is_last = page >= data["pagination"].get("total_pages", 1)
        elif "total" in data:
            is_last = page * data.get("per_page", 100) >= data["total"]
        else:
            is_last = True
        if is_last:
            break
        page += 1
    return result
def match_species(latin_name: str, species_map: dict) -> Optional[dict]:
    """Look up ``latin_name`` in ``species_map`` (lower-case name -> species dict).

    Matching is tried from strict to loose:
      1. the cleaned name (trailing author abbreviation and ssp./subsp./var.
         parts removed),
      2. its first two words (genus + species),
      3. the first map entry that shares the genus.
    Returns None for an empty name or when no step matches.
    """
    if not latin_name:
        return None

    # Strip, in order: a trailing "L.", any trailing capitalised abbreviation
    # ("Mill."), and infraspecific ranks together with their epithet.
    name = latin_name.strip()
    for noise in (
        r'\s+L\.\s*$',
        r'\s+[A-Z][a-z]*\.\s*$',
        r'\s+(?:ssp|subsp|var)\.\s+\S+',
    ):
        name = re.sub(noise, '', name)
    key = name.lower().strip()
    words = key.split()

    # Exact name first, then the genus + species prefix.
    candidates = [key]
    if len(words) >= 2:
        candidates.append(f"{words[0]} {words[1]}")
    for candidate in candidates:
        if candidate in species_map:
            return species_map[candidate]

    # Genus-only fallback: less reliable, returns whichever species of that
    # genus comes first in the map.
    if not words:
        return None
    genus_prefix = words[0] + " "
    return next(
        (species for sci_name, species in species_map.items()
         if sci_name.startswith(genus_prefix)),
        None,
    )
# ── Product data extraction ─────────────────────────────────────────────────
@dataclass
class ProductData:
    """Values scraped from one product page, as filled in by parse_product.

    The numeric growing values stay None when the page text yields no match.
    """
    name: str = ""  # JSON-LD "name"
    latin_name: str = ""  # detected Latin name, or the category's default hint
    description: str = ""  # JSON-LD "description"
    sku: str = ""  # JSON-LD "model"; sent to the API as article_number
    url: str = ""  # URL the page was fetched from
    is_organic: bool = True  # never changed by parse_product
    sowing_depth_cm: Optional[float] = None  # mean of a range when one is given
    row_spacing_cm: Optional[float] = None  # first figure of "ROW x PLANT cm"
    plant_spacing_cm: Optional[float] = None  # second figure of "ROW x PLANT cm"
    germination_temp_c: Optional[float] = None  # mean of the matched temperatures
    perennial: bool = False  # set when a perennial keyword occurs in the page text
def parse_product(html: str, url: str, default_species: Optional[str] = None) -> Optional[ProductData]:
    """Parse a product page into a ProductData, or return None for other pages.

    A page counts as a product page when it embeds a JSON-LD ``Product``
    object. Name, description and SKU come from that object; Latin name,
    sowing depth, spacing, germination temperature and the perennial flag are
    pattern-matched from the visible page text.

    Args:
        html: Raw page HTML.
        url: URL the page was fetched from (stored on the result).
        default_species: Latin name to use when none is found on the page.

    Returns:
        The populated ProductData, or None if there is no JSON-LD Product.
    """
    jsonld = extract_jsonld_product(html)
    if not jsonld:
        return None  # Not a product page

    product = ProductData(url=url)

    # ── From JSON-LD ──
    # JSON null or a numeric value must not crash on .strip().
    product.name = str(jsonld.get("name") or "").strip()
    product.description = str(jsonld.get("description") or "").strip()
    product.sku = str(jsonld.get("model") or "").strip()

    # ── Extract full text for pattern matching ──
    extractor = TextExtractor()
    extractor.feed(html)
    full_text = " ".join(extractor.parts)

    # ── Latin name ──
    m = LATIN_PATTERN.search(full_text)
    if m:
        product.latin_name = m.group(1).strip()
    # Also check <i>/<em> tags in HTML
    if not product.latin_name:
        for italic in re.finditer(r'<(?:i|em)[^>]*>(.*?)</(?:i|em)>', html, re.IGNORECASE | re.DOTALL):
            clean = re.sub(r'<[^>]+>', '', italic.group(1)).strip()
            im = LATIN_PATTERN.search(clean)
            if im:
                product.latin_name = im.group(1).strip()
                break
    if not product.latin_name and default_species:
        product.latin_name = default_species

    # ── Sowing depth ──
    # Ranges may be written with an ASCII hyphen or an en dash (\u2013).
    depth_pats = [
        r'(?:Saattiefe|Aussaattiefe|Ablagetiefe)[:\s]*(?:ca\.?\s*)?(\d+(?:[.,]\d+)?)\s*[-\u2013]\s*(\d+(?:[.,]\d+)?)\s*cm',
        r'(?:Saattiefe|Aussaattiefe|Ablagetiefe)[:\s]*(?:ca\.?\s*)?(\d+(?:[.,]\d+)?)\s*cm',
        r'(\d+(?:[.,]\d+)?)\s*[-\u2013]\s*(\d+(?:[.,]\d+)?)\s*cm\s+(?:tief|Tiefe)',
    ]
    for pat in depth_pats:
        dm = re.search(pat, full_text, re.IGNORECASE)
        if dm:
            vals = [float(dm.group(i).replace(",", ".")) for i in range(1, dm.lastindex + 1)]
            product.sowing_depth_cm = sum(vals) / len(vals)
            break
    # Fallback: look in raw HTML for depth ranges like "0,5-1 cm" near depth
    # keywords. Raw HTML may still carry the dash as an entity.
    if product.sowing_depth_cm is None:
        dm = re.search(
            r'(?:Saattiefe|Ablagetiefe|Aussaattiefe|Saatgutablage)\D{0,30}?(\d+(?:[.,]\d+)?)\s*(?:[-\u2013]|&ndash;|&#8211;)\s*(\d+(?:[.,]\d+)?)\s*cm',
            html, re.IGNORECASE
        )
        if dm:
            d1 = float(dm.group(1).replace(",", "."))
            d2 = float(dm.group(2).replace(",", "."))
            product.sowing_depth_cm = (d1 + d2) / 2

    # ── Spacing ──
    # Look for "ROW x PLANT cm" patterns
    spacing_pats = [
        # "30-40 x 2-4 cm" (range x range)
        r'(\d+)\s*[-\u2013]\s*(\d+)\s*[x×]\s*(\d+)\s*[-\u2013]\s*(\d+)\s*cm',
        # "100 x 50 cm" (simple)
        r'(\d+(?:[.,]\d+)?)\s*[x×]\s*(\d+(?:[.,]\d+)?)\s*cm',
    ]
    for pat in spacing_pats:
        matches = re.findall(pat, full_text, re.IGNORECASE)
        if matches:
            # Prefer the last match (often the more relevant outdoor spacing)
            m = matches[-1]
            if len(m) == 4:
                product.row_spacing_cm = (float(m[0]) + float(m[1])) / 2
                product.plant_spacing_cm = (float(m[2]) + float(m[3])) / 2
            elif len(m) == 2:
                product.row_spacing_cm = float(m[0].replace(",", "."))
                product.plant_spacing_cm = float(m[1].replace(",", "."))
            break

    # ── Germination temperature ──
    temp_pats = [
        r'(?:Keimtemperatur|Keimtemp)[.:\s]*(?:ca\.?\s*)?(\d+)\s*[-\u2013]\s*(\d+)\s*°?\s*C',
        # Single value next to the keyword, e.g. "Keimtemperatur: 20 °C".
        r'(?:Keimtemperatur|Keimtemp)[.:\s]*(?:ca\.?\s*)?(\d+)\s*°\s*C',
        # Generic range. A separator is mandatory: with an optional one a
        # single value such as "19 °C" was split into 1 and 9 and stored as 5.0.
        r'(\d+)\s*(?:[-\u2013]|und|bis)\s*(\d+)\s*°\s*C',
        r'(?:mindestens|mind\.)\s*(\d+)\s*°\s*C',
    ]
    for pat in temp_pats:
        tm = re.search(pat, full_text, re.IGNORECASE)
        if tm:
            vals = [float(tm.group(i)) for i in range(1, tm.lastindex + 1)]
            # Sanity check: only accept a mean between 5 and 40 °C; otherwise
            # fall through to the next, looser pattern.
            avg = sum(vals) / len(vals)
            if 5 <= avg <= 40:
                product.germination_temp_c = avg
                break

    # ── Perennial ──
    perennial_pats = [r'mehrj[aä]hrig', r'winterhart', r'ausdauernd', r'Halbstrauch', r'Staude']
    if any(re.search(pat, full_text, re.IGNORECASE) for pat in perennial_pats):
        product.perennial = True

    return product
# ── Recursive product discovery ─────────────────────────────────────────────
def discover_products(
    category_url: str,
    default_species: Optional[str],
    max_depth: int = 3,
    _depth: int = 0,
    _visited: set = None,
) -> list[ProductData]:
    """Crawl ``category_url`` depth-first and return every product page found.

    A fetched page that parses as a product ends the descent on that branch.
    Otherwise only links exactly one path segment below the current page, on
    www.reinsaat.at, are followed. ``_depth`` and ``_visited`` are recursion
    state; callers may pass a shared ``_visited`` set to avoid re-fetching
    pages across several crawls.
    """
    if _visited is None:
        _visited = set()
    if _depth > max_depth or category_url in _visited:
        return []
    _visited.add(category_url)

    indent = " " * (_depth + 1)
    print(f"{indent}Fetching: {category_url}")
    try:
        html = fetch_url(category_url)
        time.sleep(DELAY)
    except Exception as e:
        # A page that cannot be fetched is reported and skipped.
        print(f"{indent} ERROR: {e}")
        return []

    # A product page is a leaf of the crawl.
    product = parse_product(html, category_url, default_species)
    if product:
        return [product]

    # Category/subcategory page: keep only direct children of this path.
    parent_prefix = urllib.parse.urlparse(category_url).path.rstrip("/") + "/"
    candidates = []
    for link in extract_links(html, category_url):
        target = urllib.parse.urlparse(link)
        if target.netloc not in ("", "www.reinsaat.at"):
            continue
        path = target.path.rstrip("/")
        if not path.startswith(parent_prefix):
            continue
        remainder = path[len(parent_prefix):]
        # Exactly one extra segment: non-empty and without further slashes.
        if not remainder or "/" in remainder:
            continue
        normalized = f"https://www.reinsaat.at{path}/"
        if normalized not in _visited:
            candidates.append(normalized)

    # Order-preserving dedupe.
    child_links = list(dict.fromkeys(candidates))
    print(f"{indent} Found {len(child_links)} child links")

    products = []
    for child_url in child_links:
        products.extend(
            discover_products(child_url, default_species, max_depth, _depth + 1, _visited)
        )
    return products
# ── Main ────────────────────────────────────────────────────────────────────
def main():
    """Scrape Reinsaat categories and create/link cultivars in HerbAPI.

    Steps: load species, load existing cultivars, discover product pages, then
    for each product either link the existing cultivar to the Reinsaat
    supplier or create a new cultivar and link it. Prints a summary at the end.
    """
    banner = "=" * 70
    print(banner)
    print("Reinsaat Scraper -> HerbAPI")
    print(banner)

    # [1] Species, keyed by lower-case scientific name.
    print("\n[1] Loading species from API...")
    species_map = load_species()
    sci_names = sorted(k for k in species_map if " " in k)
    print(f" {len(sci_names)} species loaded:")
    for sci_name in sci_names:
        entry = species_map[sci_name]
        print(f" {entry['name_scientific']:40s} {entry['id'][:12]}...")

    # [2] Existing cultivars: (species_id, lower-case name) -> cultivar id.
    print("\n[2] Loading existing cultivars...")
    existing_cultivars = {}
    page = 1
    while True:
        data = api_get(f"/cultivars?per_page=100&page={page}")
        clist = data.get("data", data) if isinstance(data, dict) else data
        if not clist:
            break
        for c in clist:
            existing_cultivars[(c["species_id"], c["name"].lower())] = c["id"]
        # A bare list carries no paging information: single page.
        if not isinstance(data, dict):
            break
        # Check pagination - API uses {data, total, page, per_page} format
        total = data.get("total", len(clist))
        per_page = data.get("per_page", 100)
        if page * per_page >= total:
            break
        page += 1
    print(f" {len(existing_cultivars)} existing cultivars")

    # [3] Crawl all categories with one shared visited set.
    print("\n[3] Discovering products from Reinsaat categories...")
    discovered: list[ProductData] = []
    visited: set[str] = set()
    for cat_url, species_hint in CATEGORIES:
        print(f"\n Category: {cat_url}")
        found = discover_products(cat_url, species_hint, max_depth=3, _visited=visited)
        discovered.extend(found)
        print(f" -> {len(found)} products from this category")
    print(f"\n Total products discovered: {len(discovered)}")

    # Deduplicate by URL, keeping the first occurrence.
    first_by_url = {}
    for item in discovered:
        first_by_url.setdefault(item.url, item)
    all_products = list(first_by_url.values())
    print(f" Unique products: {len(all_products)}")

    # [4] Create / link.
    print("\n[4] Creating cultivars in API...")
    stats = {"created": 0, "skipped_no_species": 0, "skipped_exists": 0, "errors": 0, "linked": 0}

    def link_supplier(cultivar_id, product):
        # Attach the cultivar to the Reinsaat supplier; raises on API failure.
        api_post(f"/cultivars/{cultivar_id}/suppliers", {
            "supplier_id": REINSAAT_SUPPLIER_ID,
            "product_url": product.url,
            "article_number": product.sku,
        })
        print(f" Linked to Reinsaat (SKU: {product.sku})")
        stats["linked"] += 1

    # (ProductData attribute, payload key, rounding digits) for optional values.
    optional_numbers = (
        ("sowing_depth_cm", "planting_depth_cm", 2),
        ("row_spacing_cm", "row_spacing_cm", 1),
        ("plant_spacing_cm", "plant_spacing_cm", 1),
        ("germination_temp_c", "germination_temp_c", 1),
    )

    count = len(all_products)
    for i, product in enumerate(all_products):
        pct = (i + 1) / count * 100
        print(f"\n [{i+1}/{count}] ({pct:.0f}%) {product.name}")

        # Match species
        species = match_species(product.latin_name, species_map)
        if not species:
            print(f" Skip: no species match for '{product.latin_name}'")
            stats["skipped_no_species"] += 1
            continue
        species_id = species["id"]
        print(f" Species: {species['name_scientific']}")
        print(f" SKU: {product.sku}, Depth: {product.sowing_depth_cm}, "
              f"Spacing: {product.row_spacing_cm}x{product.plant_spacing_cm}, "
              f"Temp: {product.germination_temp_c}, Perennial: {product.perennial}")

        key = (species_id, product.name.lower())
        known_id = existing_cultivars.get(key)
        if key in existing_cultivars:
            # Cultivar already exists: still try to link the supplier.
            print(f" Exists: {known_id[:12]}... - checking supplier link")
            try:
                link_supplier(known_id, product)
            except Exception:
                pass  # Already linked or other error
            stats["skipped_exists"] += 1
            continue

        # Build payload; optional numbers are only sent when they were found.
        payload = {
            "species_id": species_id,
            "name": product.name,
            "name_de": product.name,
            "name_en": "",
            "description": product.description,
            "is_organic": product.is_organic,
            "perennial": product.perennial,
        }
        for attr, payload_key, digits in optional_numbers:
            value = getattr(product, attr)
            if value is not None:
                payload[payload_key] = round(value, digits)

        # Create cultivar
        try:
            result = api_post("/cultivars", payload)
            cultivar_id = result["id"]
            print(f" Created: {cultivar_id}")
            stats["created"] += 1
            # Remember it so a later duplicate product is linked, not re-created.
            existing_cultivars[key] = cultivar_id
        except Exception as e:
            print(f" FAILED to create: {e}")
            stats["errors"] += 1
            continue

        # Link to supplier
        try:
            link_supplier(cultivar_id, product)
        except Exception as e:
            print(f" FAILED to link supplier: {e}")

    # Summary
    print("\n" + banner)
    print("SUMMARY")
    print(banner)
    print(f" Created: {stats['created']}")
    print(f" Linked to supplier: {stats['linked']}")
    print(f" Skipped (no species): {stats['skipped_no_species']}")
    print(f" Skipped (exists): {stats['skipped_exists']}")
    print(f" Errors: {stats['errors']}")
    print(banner)
# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()
+770
View File
@@ -0,0 +1,770 @@
#!/usr/bin/env python3
"""
Reinsaat Scraper v2 — scrape ALL Reinsaat categories, match species by extracting
genus+species from extended botanical names, create/enrich cultivars, link supplier.
Uses direct PostgreSQL access (psycopg2) for speed and reliability.
"""
import json
import re
import ssl
import sys
import time
import uuid
import html as html_mod
import urllib.request
import urllib.error
import urllib.parse
from dataclasses import dataclass, field
from typing import Optional
# Unbuffered output
# Switch stdout and stderr to line buffering.
sys.stdout.reconfigure(line_buffering=True)
sys.stderr.reconfigure(line_buffering=True)
import psycopg2
import psycopg2.extras
# ── Config ──────────────────────────────────────────────────────────────────
# PostgreSQL connection settings.
# NOTE(review): the database password is committed in source — consider
# loading it from the environment or a secrets store and rotating the value.
DB_HOST = "10.31.3.90"
DB_NAME = "herbapi"
DB_USER = "herbapi"
DB_PASS = "_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj"
# Id of the Reinsaat supplier record.
REINSAAT_SUPPLIER_ID = "019ced24-1702-72d1-9acc-90435441a5c4"
# Seconds to sleep after each page fetch in discover_products.
DELAY = 0.3
# User-Agent header sent with every page fetch.
USER_AGENT = "HerbAPI-Scraper/2.0 (florian.berthold@sub-net.at)"
# ── All Reinsaat categories ────────────────────────────────────────────────
# Root category URLs of the Reinsaat shop.
CATEGORIES = [
    "https://www.reinsaat.at/shop/DE/bohnen/",
    "https://www.reinsaat.at/shop/DE/erbsen/",
    "https://www.reinsaat.at/shop/DE/gurken/",
    "https://www.reinsaat.at/shop/DE/karotten_moehren_1/",
    "https://www.reinsaat.at/shop/DE/knollenfenchel/",
    "https://www.reinsaat.at/shop/DE/kohlgewaechse/",
    "https://www.reinsaat.at/shop/DE/kuerbis/",
    "https://www.reinsaat.at/shop/DE/mais/",
    "https://www.reinsaat.at/shop/DE/mangold/",
    "https://www.reinsaat.at/shop/DE/melanzani_1/",
    "https://www.reinsaat.at/shop/DE/melone/",
    "https://www.reinsaat.at/shop/DE/paprika/",
    "https://www.reinsaat.at/shop/DE/pastinaken_1/",
    "https://www.reinsaat.at/shop/DE/petersilie/",
    "https://www.reinsaat.at/shop/DE/pfefferoni_chili/",
    "https://www.reinsaat.at/shop/DE/porree/",
    "https://www.reinsaat.at/shop/DE/radies_rettich/",
    "https://www.reinsaat.at/shop/DE/rote_ruebe/",
    "https://www.reinsaat.at/shop/DE/salate/",
    "https://www.reinsaat.at/shop/DE/schwarzwurzeln/",
    "https://www.reinsaat.at/shop/DE/sellerie/",
    "https://www.reinsaat.at/shop/DE/spinat/",
    "https://www.reinsaat.at/shop/DE/tomaten_paradeiser/",
    "https://www.reinsaat.at/shop/DE/wurzelpetersilie_1/",
    "https://www.reinsaat.at/shop/DE/zucchini/",
    "https://www.reinsaat.at/shop/DE/zwiebel_knoblauch/",
    "https://www.reinsaat.at/shop/DE/kuechen-_und_gewuerzkraeuter/",
    "https://www.reinsaat.at/shop/DE/blumen_und_heilkraeuter/",
    "https://www.reinsaat.at/shop/DE/gruenduengung/",
]
# ── HTTP ────────────────────────────────────────────────────────────────────
# Shared TLS context (system default trust store) for all page fetches.
_ssl_ctx = ssl.create_default_context()


def fetch_url(url: str, retries: int = 2) -> str:
    """Fetch ``url`` and return the response body as text.

    Args:
        url: Absolute URL to request.
        retries: Number of additional attempts after the first failure.

    Returns:
        The body decoded with the charset declared by the server (UTF-8 when
        none is declared). Undecodable bytes are replaced rather than raising.
        Returns "" only when ``retries`` is negative and no attempt is made.

    Raises:
        urllib.error.HTTPError, urllib.error.URLError, TimeoutError: the last
        error once all attempts are used up, or immediately for a permanent
        client error (4xx other than 408/429).
    """
    req = urllib.request.Request(url, headers={
        "User-Agent": USER_AGENT,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "de-AT,de;q=0.9,en;q=0.5",
    })
    for attempt in range(retries + 1):
        try:
            with urllib.request.urlopen(req, timeout=30, context=_ssl_ctx) as resp:
                charset = resp.headers.get_content_charset() or "utf-8"
                body = resp.read()
        except urllib.error.HTTPError as err:
            # A missing or forbidden page will not appear two seconds later;
            # only timeouts/throttling (408, 429) and 5xx are worth a retry.
            permanent = 400 <= err.code < 500 and err.code not in (408, 429)
            if permanent or attempt >= retries:
                raise
            time.sleep(2)
            continue
        except (urllib.error.URLError, TimeoutError):
            if attempt >= retries:
                raise
            time.sleep(2)
            continue
        try:
            # One mis-encoded byte must not abort a whole crawl.
            return body.decode(charset, errors="replace")
        except LookupError:
            # The server declared a charset name Python does not know.
            return body.decode("utf-8", errors="replace")
    return ""
# ── HTML parsing helpers ────────────────────────────────────────────────────
def extract_links(html_text: str, base_url: str) -> list[str]:
    """List the absolute URLs of all double-quoted ``<a href>`` attributes.

    Skips empty hrefs, ``#fragment`` links and ``javascript:`` links, resolves
    relative targets against ``base_url`` and keeps only the first occurrence
    of every resulting URL.
    """
    anchor_href = re.compile(r'<a\s[^>]*href="([^"]*)"', re.IGNORECASE)
    ordered_unique = {}
    for href in anchor_href.findall(html_text):
        if href == "" or href[:1] == "#" or href.startswith("javascript:"):
            continue
        # setdefault keeps the position of the first occurrence.
        ordered_unique.setdefault(urllib.parse.urljoin(base_url, href), None)
    return list(ordered_unique)
def extract_jsonld_product(html_text: str) -> Optional[dict]:
    """Find the first embedded JSON-LD object of ``@type`` ``"Product"``.

    Scans every ``<script type="application/ld+json">`` block in order;
    unparsable blocks, non-object JSON and other types are ignored.
    Returns the parsed dict, or None when the page has no such object.
    """
    jsonld_blocks = re.findall(
        r'<script[^>]*type="application/ld\+json"[^>]*>(.*?)</script>',
        html_text, re.DOTALL | re.IGNORECASE
    )
    for raw_json in jsonld_blocks:
        try:
            candidate = json.loads(raw_json)
        except ValueError:
            # Covers json.JSONDecodeError, which subclasses ValueError.
            continue
        if not isinstance(candidate, dict):
            continue
        if candidate.get("@type") == "Product":
            return candidate
    return None
def html_to_text(html_text: str) -> str:
    """Turn an HTML fragment into plain text on a single line.

    Tags are replaced by spaces first, then character entities are decoded,
    then every whitespace run is collapsed to one space and the ends trimmed.
    Because tags are removed before decoding, escaped markup such as
    ``&lt;b&gt;`` survives as literal text.
    """
    without_tags = re.sub(r'<[^>]+>', ' ', html_text)
    decoded = html_mod.unescape(without_tags)
    return re.sub(r'\s+', ' ', decoded).strip()
def extract_botanical_name(html_text: str) -> str:
    """Find the botanical (Latin) name shown on a product page.

    Sources are tried in this order and the first hit is returned:
      1. the text of the ``fce_shop_kurztext`` element,
      2. the first Latin-looking ``<em>`` inside the ``growingInfos`` element,
      3. any ``<em>``/``<i>`` whose text starts like a Latin name and is
         shorter than 100 characters.
    The raw text is returned (it may include authority names, infraspecific
    ranks, etc.); an empty string means nothing was found.
    """
    def plain(fragment):
        return html_to_text(fragment).strip()

    def looks_latin(text):
        # Capitalised word followed by a lower-case word, anywhere in the text.
        return bool(text) and re.search(r'[A-Z][a-z]+\s+[a-z]', text) is not None

    # 1. Short description element.
    short_text = re.search(
        r'class="fce_shop_kurztext"[^>]*>(.*?)</div>', html_text, re.DOTALL | re.IGNORECASE
    )
    if short_text:
        candidate = plain(short_text.group(1))
        if looks_latin(candidate):
            return candidate

    # 2. Emphasised text inside the growing information.
    growing = re.search(
        r'class="growingInfos"[^>]*>(.*?)</div>', html_text, re.DOTALL | re.IGNORECASE
    )
    if growing:
        for emphasised in re.findall(r'<em>(.*?)</em>', growing.group(1), re.DOTALL):
            candidate = plain(emphasised)
            if looks_latin(candidate):
                return candidate

    # 3. Last resort: any emphasised text on the page, with a stricter test.
    for emphasised in re.findall(
        r'<(?:em|i)>(.*?)</(?:em|i)>', html_text, re.DOTALL | re.IGNORECASE
    ):
        candidate = plain(emphasised)
        if not candidate or len(candidate) >= 100:
            continue
        if re.search(r'^[A-Z][a-z]+\s+[a-z]+', candidate):
            return candidate
    return ""
def normalize_latin_name(raw: str) -> str:
    """Reduce an extended botanical name to "Genus species".

    Examples:
        "Pisum sativum L. convar. sat."          -> "Pisum sativum"
        "Brassica oleracea L. convar. botrytis"  -> "Brassica oleracea"
        "Cucumis sativus"                        -> "Cucumis sativus"
        "Mentha × piperita"                      -> "Mentha x piperita"

    Hybrids are always written with a plain "x". A name with fewer than two
    words, or one whose first word is not capitalised or whose second word is
    not lower-case, is returned cleaned but otherwise unchanged. Empty input
    gives "".
    """
    if not raw:
        return ""

    # Trim whitespace, then stray punctuation at either end.
    cleaned = raw.strip().strip(".,;:")
    tokens = cleaned.split()
    if len(tokens) < 2:
        return cleaned

    genus, epithet, *rest = tokens
    # Hybrid notation: the epithet follows the multiplication sign.
    if rest and epithet in ("x", "×"):
        return f"{genus} x {rest[0]}"

    well_formed = genus[0].isupper() and epithet[0].islower()
    return f"{genus} {epithet}" if well_formed else cleaned
# ── Calendar parsing ────────────────────────────────────────────────────────
# Maps a lower-case fragment of a calendar row label to the result field it
# feeds. ORDER MATTERS: parse_calendar uses the first key that occurs as a
# substring of the label, so specific keys must come before generic ones.
# The greenhouse keys are placed ahead of the generic "pflanzung" keys;
# otherwise "aussaat/ pflanzung gewächshaus" would hit "pflanzung" first and
# be filed under transplanting_months instead of glasshouse_months.
CALENDAR_ROW_TYPES = {
    "voranzucht": "indoor_sowing_months",
    "vorzucht": "indoor_sowing_months",
    "vorkultur": "indoor_sowing_months",
    "aussaat/ pflanzung freiland": "direct_sowing_months",
    "aussaat/pflanzung freiland": "direct_sowing_months",
    "aussaat freiland": "direct_sowing_months",
    "direktsaat": "direct_sowing_months",
    "aussaat/ pflanzung gewächshaus": "glasshouse_months",
    "aussaat/pflanzung gewächshaus": "glasshouse_months",
    "gewächshaus": "glasshouse_months",
    "pflanzung freiland": "transplanting_months",
    "pflanzung": "transplanting_months",
    "ernte": "harvesting_months",
}
def parse_calendar(html_text: str) -> dict:
    """Read the growing calendar table into lists of active months.

    The table is located by its ``rs-growing-time`` class. Each row's label
    cell (class ``type-lable``) is mapped to a field name through
    CALENDAR_ROW_TYPES (first key contained in the label wins). Every
    ``background-color`` declaration in the row is treated as one half-month
    cell; values other than ``none``/``transparent`` mark the month as active.

    Returns:
        dict from field name (e.g. ``'direct_sowing_months'``) to a sorted
        list of month numbers 1-12. Rows mapping to the same field are
        merged. Empty when the page has no calendar table.
    """
    table = re.search(r'class="rs-growing-time[^"]*"(.*?)</table>', html_text, re.DOTALL)
    if table is None:
        return {}

    months_by_field = {}
    for row_html in re.findall(r'<tr>(.*?)</tr>', table.group(1), re.DOTALL):
        label_cell = re.search(r'class="type-lable"[^>]*>(.*?)</td>', row_html, re.DOTALL)
        if label_cell is None:
            continue
        label = html_to_text(label_cell.group(1)).strip().lower()

        # First configured fragment that appears in the label decides the field.
        field_name = next(
            (fname for fragment, fname in CALENDAR_ROW_TYPES.items() if fragment in label),
            None,
        )
        if not field_name:
            continue

        # Two consecutive colour entries belong to one month (two halves).
        shades = re.findall(r'background-color:\s*([^;"]+)', row_html)
        active = set()
        for position, shade in enumerate(shades):
            if shade.strip().lower() in ("none", "transparent", ""):
                continue
            month = position // 2 + 1
            if 1 <= month <= 12:
                active.add(month)
        if not active:
            continue

        # Merge with an earlier row feeding the same field (e.g. two sowing rows).
        earlier = months_by_field.get(field_name, ())
        months_by_field[field_name] = sorted(active.union(earlier))
    return months_by_field
# ── Growing data extraction ─────────────────────────────────────────────────
def extract_growing_data(html_text: str) -> dict:
    """Extract depth, spacing, germination temperature and perennial flag.

    Only the text of the ``growingInfos`` element is examined. Numeric ranges
    may be written with an ASCII hyphen or an en dash and are reduced to their
    mean; decimal commas are accepted.

    Returns:
        dict with any of the keys ``planting_depth_cm``, ``row_spacing_cm``,
        ``plant_spacing_cm``, ``germination_temp_c`` (floats) and
        ``perennial`` (True). Keys without a match are absent; the dict is
        empty when the page has no ``growingInfos`` element.
    """
    data = {}
    # Get the growingInfos block
    gi = re.search(r'class="growingInfos"[^>]*>(.*?)</div>', html_text, re.DOTALL | re.IGNORECASE)
    if not gi:
        return data
    # Strip tags, decode entities (so "&ndash;" becomes a real en dash) and
    # collapse whitespace for pattern matching.
    raw_text = html_mod.unescape(re.sub(r'<[^>]+>', ' ', gi.group(1)))
    raw_text = re.sub(r'\s+', ' ', raw_text)

    # ── Sowing depth ──
    depth_pats = [
        r'(?:Saattiefe|Aussaattiefe|Ablagetiefe|Saatgutablage)[:\s]*(?:ca\.?\s*)?(\d+(?:[.,]\d+)?)\s*[-\u2013]\s*(\d+(?:[.,]\d+)?)\s*cm',
        r'(?:Saattiefe|Aussaattiefe|Ablagetiefe|Saatgutablage)[:\s]*(?:ca\.?\s*)?(\d+(?:[.,]\d+)?)\s*cm',
    ]
    for pat in depth_pats:
        dm = re.search(pat, raw_text, re.IGNORECASE)
        if dm:
            vals = [float(dm.group(i).replace(",", ".")) for i in range(1, dm.lastindex + 1)]
            data["planting_depth_cm"] = round(sum(vals) / len(vals), 2)
            break

    # ── Spacing: "ROW x PLANT cm" ──
    spacing_pats = [
        # "30-45 x 3-5 cm" (range x range)
        r'(\d+)\s*[-\u2013]\s*(\d+)\s*[x×]\s*(\d+)\s*[-\u2013]\s*(\d+)\s*cm',
        # "100 x 50 cm" (simple)
        r'(\d+(?:[.,]\d+)?)\s*[x×]\s*(\d+(?:[.,]\d+)?)\s*cm',
    ]
    for pat in spacing_pats:
        matches = re.findall(pat, raw_text, re.IGNORECASE)
        if matches:
            m = matches[-1]  # prefer last match
            if len(m) == 4:
                data["row_spacing_cm"] = round((float(m[0]) + float(m[1])) / 2, 1)
                data["plant_spacing_cm"] = round((float(m[2]) + float(m[3])) / 2, 1)
            elif len(m) == 2:
                data["row_spacing_cm"] = round(float(m[0].replace(",", ".")), 1)
                data["plant_spacing_cm"] = round(float(m[1].replace(",", ".")), 1)
            break

    # ── Germination temperature ──
    temp_pats = [
        r'(?:Keimtemperatur|Keimtemp)[.:\s]*(?:ca\.?\s*)?(\d+)\s*[-\u2013]\s*(\d+)\s*[°]?\s*C',
        r'(?:mindestens|mind\.)\s*(\d+)\s*°\s*C',
    ]
    for pat in temp_pats:
        tm = re.search(pat, raw_text, re.IGNORECASE)
        if tm:
            vals = [float(tm.group(i)) for i in range(1, tm.lastindex + 1)]
            avg = sum(vals) / len(vals)
            # Only accept a mean between 5 and 40 °C; otherwise try the next
            # pattern.
            if 5 <= avg <= 40:
                data["germination_temp_c"] = round(avg, 1)
                break

    # ── Perennial ──
    perennial_pats = [r'mehrj[aä]hrig', r'winterhart', r'ausdauernd', r'Halbstrauch', r'Staude']
    if any(re.search(pat, raw_text, re.IGNORECASE) for pat in perennial_pats):
        data["perennial"] = True
    return data
# ── Product data ────────────────────────────────────────────────────────────
@dataclass
class ProductData:
    """Values scraped from one product page, as filled in by parse_product."""
    name: str = ""  # JSON-LD "name"
    raw_latin_name: str = ""  # botanical name exactly as found on the page
    normalized_latin: str = ""  # raw_latin_name passed through normalize_latin_name
    description: str = ""  # JSON-LD "description"
    sku: str = ""  # JSON-LD "model"
    url: str = ""  # URL the page was fetched from
    is_organic: bool = True  # parse_product always sets this to True
    growing_data: dict = field(default_factory=dict)  # result of extract_growing_data
    calendar: dict = field(default_factory=dict)  # result of parse_calendar
def parse_product(html_text: str, url: str) -> Optional[ProductData]:
    """Build a ProductData from a product page; None for any other page.

    A page is a product page when it embeds a JSON-LD ``Product`` object.
    Name, description and SKU (JSON-LD ``model``) come from that object; the
    botanical name, growing data and calendar are scraped from the HTML.
    """
    jsonld = extract_jsonld_product(html_text)
    if not jsonld:
        return None

    # Botanical name: raw as printed on the page, then reduced to genus + species.
    raw_latin = extract_botanical_name(html_text)

    return ProductData(
        url=url,
        name=jsonld.get("name", "").strip(),
        description=jsonld.get("description", "").strip(),
        sku=jsonld.get("model", "").strip(),
        raw_latin_name=raw_latin,
        normalized_latin=normalize_latin_name(raw_latin),
        growing_data=extract_growing_data(html_text),
        calendar=parse_calendar(html_text),
        # Every product is recorded as organic; the page is not inspected.
        is_organic=True,
    )
# ── Recursive discovery ─────────────────────────────────────────────────────
def discover_products(
    category_url: str,
    max_depth: int = 4,
    _depth: int = 0,
    _visited: set = None,
) -> list[ProductData]:
    """Walk a category tree depth-first and collect every product page found.

    A page that parses as a product is returned as a one-element list;
    otherwise its direct child links (exactly one path segment below the
    current page, on www.reinsaat.at) are followed recursively.

    Args:
        category_url: Page to start from.
        max_depth: Recursion limit; pages deeper than this are ignored.
        _depth: Current recursion depth (internal).
        _visited: URLs already handled, shared across the recursion (internal).

    Returns:
        List of ProductData for all products reachable from category_url.
    """
    seen = set() if _visited is None else _visited
    if _depth > max_depth or category_url in seen:
        return []
    seen.add(category_url)
    pad = " " * (_depth + 1)

    try:
        page = fetch_url(category_url)
        time.sleep(DELAY)
    except Exception as exc:
        print(f"{pad}ERROR fetching {category_url}: {exc}")
        return []

    # A product page ends the descent.
    leaf = parse_product(page, category_url)
    if leaf:
        return [leaf]

    # Category page: keep only links exactly one level below this page.
    prefix = urllib.parse.urlparse(category_url).path.rstrip("/") + "/"
    candidates = {}  # dict used as an insertion-ordered set
    for href in extract_links(page, category_url):
        target = urllib.parse.urlparse(href)
        if target.netloc and target.netloc != "www.reinsaat.at":
            continue
        path = target.path.rstrip("/")
        if not path.startswith(prefix):
            continue
        tail = path[len(prefix):]
        if not tail or "/" in tail:
            continue
        candidate = f"https://www.reinsaat.at{path}/"
        if candidate not in seen:
            candidates.setdefault(candidate, None)
    children = list(candidates)

    print(f"{pad}Category {category_url} -> {len(children)} children")
    found = []
    for child in children:
        found += discover_products(child, max_depth, _depth + 1, seen)
    return found
# ── Slug generation ─────────────────────────────────────────────────────────
def make_slug(species_name: str, cultivar_name: str) -> str:
    """Generate a URL-friendly slug from a species and a cultivar name.

    The two names are joined with a hyphen and lower-cased. German umlauts
    and sharp s are transliterated (ae/oe/ue/ss); every other accented letter
    is reduced to its base letter. Any remaining run of characters outside
    ``a-z0-9`` becomes a single hyphen, and leading/trailing hyphens are
    removed.

    Args:
        species_name: Scientific species name.
        cultivar_name: Cultivar (variety) name.

    Returns:
        The slug; may be empty when neither name contains a usable character.
    """
    # Local import so the block does not depend on a file-level import.
    import unicodedata

    german = str.maketrans({'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'ß': 'ss'})

    # Compose first so that decomposed input ("a" + combining diaeresis)
    # still hits the German transliteration table.
    raw = unicodedata.normalize("NFC", f"{species_name}-{cultivar_name}".lower())
    raw = raw.translate(german)

    # Strip the accents of everything else (é -> e, ñ -> n, ž -> z, ï -> i ...)
    # instead of letting the letter degrade into a hyphen.
    decomposed = unicodedata.normalize("NFKD", raw)
    raw = "".join(ch for ch in decomposed if not unicodedata.combining(ch)).lower()

    # One substitution both replaces and collapses non-alphanumeric runs.
    return re.sub(r'[^a-z0-9]+', '-', raw).strip('-')
# ── Main ────────────────────────────────────────────────────────────────────
def db_connect():
    """Open a new database connection with autocommit switched off."""
    params = {
        "host": DB_HOST,
        "dbname": DB_NAME,
        "user": DB_USER,
        "password": DB_PASS,
    }
    connection = psycopg2.connect(**params)
    # Writes are committed explicitly by the caller.
    connection.autocommit = False
    return connection
def _missing_field_updates(existing, product):
    """Return {column: value} for data the page offers but the DB row lacks."""
    updates = {}

    # Growing data from page
    gd = product.growing_data
    for col in ("planting_depth_cm", "row_spacing_cm",
                "plant_spacing_cm", "germination_temp_c"):
        if gd.get(col) and not existing.get(col):
            updates[col] = gd[col]
    if gd.get("perennial") and not existing.get("perennial"):
        updates["perennial"] = True

    # Calendar data
    cal = product.calendar
    for col in ("indoor_sowing_months", "direct_sowing_months",
                "transplanting_months", "glasshouse_months",
                "harvesting_months"):
        if cal.get(col) and not existing.get(col):
            updates[col] = cal[col]

    # Description
    if product.description and not existing.get("description"):
        updates["description"] = product.description
    return updates


def _sync_products(conn, cur, all_products):
    """Match products against the DB, write the changes, commit and report."""
    # Load species, keyed by lower-cased scientific name
    cur.execute("SELECT id, name_scientific FROM species ORDER BY name_scientific")
    species_map = {}
    for row in cur.fetchall():
        species_map[row["name_scientific"].lower().strip()] = row
    print(f" {len(species_map)} species loaded")

    # Load existing cultivars, keyed by (species id, lower-cased name)
    cur.execute("""
        SELECT id, species_id, name, slug, description,
        row_spacing_cm, plant_spacing_cm, planting_depth_cm,
        germination_temp_c, perennial,
        indoor_sowing_months, direct_sowing_months,
        transplanting_months, glasshouse_months, harvesting_months
        FROM cultivars
    """)
    existing_cultivars = {}
    existing_slugs = set()
    for row in cur.fetchall():
        existing_cultivars[(str(row["species_id"]), row["name"].lower())] = dict(row)
        existing_slugs.add(row["slug"])
    print(f" {len(existing_cultivars)} cultivars loaded")

    # Load existing Reinsaat supplier links: cultivar id -> [(url, sku), ...]
    cur.execute("""
        SELECT cultivar_id, product_url, article_number
        FROM cultivar_suppliers
        WHERE supplier_id = %s
    """, (REINSAAT_SUPPLIER_ID,))
    existing_links = {}
    for row in cur.fetchall():
        existing_links.setdefault(str(row["cultivar_id"]), []).append(
            (row["product_url"] or "", row["article_number"] or "")
        )
    print(f" {sum(len(v) for v in existing_links.values())} existing links for {len(existing_links)} cultivars")

    # ── Phase 3: Process products ──
    print("\n[3] Processing products...")
    stats = {
        "created": 0,
        "linked": 0,
        "enriched": 0,
        "skipped_no_species": 0,
        "skipped_no_name": 0,
        "link_exists": 0,
        "errors": 0,
    }
    unmatched = []
    total = len(all_products)
    for i, product in enumerate(all_products, start=1):
        prefix = f" [{i}/{total}] ({i / total * 100:.0f}%)"
        if not product.name:
            stats["skipped_no_name"] += 1
            continue

        # Match species: normalized name first, then the first two raw words
        species = species_map.get(product.normalized_latin.lower().strip())
        if not species:
            raw_words = product.raw_latin_name.split()
            if len(raw_words) >= 2:
                attempt = f"{raw_words[0].lower()} {raw_words[1].lower()}"
                species = species_map.get(attempt)
        if not species:
            stats["skipped_no_species"] += 1
            unmatched.append((product.name, product.raw_latin_name, product.normalized_latin, product.url))
            continue
        species_id = str(species["id"])
        species_name = species["name_scientific"]

        ckey = (species_id, product.name.lower())
        existing = existing_cultivars.get(ckey)
        if existing:
            cultivar_id = str(existing["id"])

            # ── Enrich existing cultivar with missing data ──
            updates = _missing_field_updates(existing, product)
            if updates:
                # Column names come from the fixed lists in the helper above,
                # never from scraped text, so interpolating them is safe.
                set_clauses = [f"{col} = %s" for col in updates]
                set_clauses.append("updated_at = NOW()")
                values = list(updates.values())
                values.append(cultivar_id)
                cur.execute(
                    f"UPDATE cultivars SET {', '.join(set_clauses)} WHERE id = %s::uuid",
                    values
                )
                stats["enriched"] += 1
                print(f"{prefix} {product.name} -> ENRICHED ({', '.join(updates.keys())})")

            # ── Add supplier link if missing ──
            link_exists = any(
                lurl == product.url or (lsku and lsku == product.sku)
                for lurl, lsku in existing_links.get(cultivar_id, ())
            )
            if link_exists:
                stats["link_exists"] += 1
            else:
                try:
                    cur.execute("SAVEPOINT link_sp")
                    cur.execute("""
                        INSERT INTO cultivar_suppliers (cultivar_id, supplier_id, product_url, article_number, last_checked_at)
                        VALUES (%s::uuid, %s::uuid, %s, %s, NOW())
                        ON CONFLICT (cultivar_id, supplier_id, article_number) DO UPDATE
                        SET product_url = EXCLUDED.product_url, last_checked_at = NOW()
                    """, (cultivar_id, REINSAAT_SUPPLIER_ID, product.url, product.sku))
                    cur.execute("RELEASE SAVEPOINT link_sp")
                    stats["linked"] += 1
                    existing_links.setdefault(cultivar_id, []).append((product.url, product.sku))
                    print(f"{prefix} {product.name} -> LINKED ({product.sku})")
                except Exception as e:
                    print(f"{prefix} {product.name} -> LINK ERROR: {e}")
                    cur.execute("ROLLBACK TO SAVEPOINT link_sp")
                    stats["errors"] += 1
        else:
            # ── Create new cultivar ──
            base_slug = make_slug(species_name, product.name)
            slug = base_slug
            counter = 2
            while slug in existing_slugs:  # ensure unique slug
                slug = f"{base_slug}-{counter}"
                counter += 1
            gd = product.growing_data
            cal = product.calendar
            try:
                cur.execute("SAVEPOINT create_sp")
                cur.execute("""
                    INSERT INTO cultivars (
                    species_id, name, name_de, slug, description,
                    is_organic, perennial,
                    planting_depth_cm, row_spacing_cm, plant_spacing_cm,
                    germination_temp_c,
                    indoor_sowing_months, direct_sowing_months,
                    transplanting_months, glasshouse_months, harvesting_months
                    ) VALUES (
                    %s::uuid, %s, %s, %s, %s,
                    %s, %s,
                    %s, %s, %s,
                    %s,
                    %s, %s,
                    %s, %s, %s
                    )
                    RETURNING id
                """, (
                    species_id,
                    product.name,
                    product.name,
                    slug,
                    product.description,
                    product.is_organic,
                    gd.get("perennial", False),
                    gd.get("planting_depth_cm"),
                    gd.get("row_spacing_cm"),
                    gd.get("plant_spacing_cm"),
                    gd.get("germination_temp_c"),
                    cal.get("indoor_sowing_months"),
                    cal.get("direct_sowing_months"),
                    cal.get("transplanting_months"),
                    cal.get("glasshouse_months"),
                    cal.get("harvesting_months"),
                ))
                new_id = str(cur.fetchone()["id"])
                # Link to supplier
                cur.execute("""
                    INSERT INTO cultivar_suppliers (cultivar_id, supplier_id, product_url, article_number, last_checked_at)
                    VALUES (%s::uuid, %s::uuid, %s, %s, NOW())
                """, (new_id, REINSAAT_SUPPLIER_ID, product.url, product.sku))
                cur.execute("RELEASE SAVEPOINT create_sp")
            except Exception as e:
                print(f"{prefix} {product.name} -> CREATE ERROR: {e}")
                cur.execute("ROLLBACK TO SAVEPOINT create_sp")
                stats["errors"] += 1
            else:
                # Touch caches and counters only once both inserts are safely
                # inside the transaction; otherwise a failed link insert would
                # leave a rolled-back cultivar id in the in-memory maps.
                existing_slugs.add(slug)
                existing_cultivars[ckey] = {"id": new_id}
                existing_links.setdefault(new_id, []).append((product.url, product.sku))
                stats["created"] += 1
                stats["linked"] += 1
                print(f"{prefix} {product.name} -> CREATED ({species_name}, {slug})")

    # ── Commit ──
    conn.commit()

    # ── Summary ──
    print("\n" + "=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f" Total products discovered: {len(all_products)}")
    print(f" New cultivars created: {stats['created']}")
    print(f" New supplier links added: {stats['linked']}")
    print(f" Cultivars enriched: {stats['enriched']}")
    print(f" Links already existed: {stats['link_exists']}")
    print(f" Skipped (no species): {stats['skipped_no_species']}")
    print(f" Skipped (no name): {stats['skipped_no_name']}")
    print(f" Errors: {stats['errors']}")
    print("=" * 70)
    if unmatched:
        print(f"\n UNMATCHED PRODUCTS ({len(unmatched)}):")
        for name, raw_latin, normalized, url in sorted(unmatched, key=lambda x: x[2]):
            print(f" {normalized:30s} (raw: {raw_latin:40s}) {name:30s} {url}")


def main():
    """Scrape all Reinsaat categories and sync the products into the database.

    Phase 1 discovers products (no DB needed), phase 2 loads species,
    cultivars and supplier links, phase 3 enriches/creates cultivars and
    supplier links. Everything is committed in one transaction at the end;
    the cursor and connection are closed even when a step raises.
    """
    print("=" * 70)
    print("Reinsaat Scraper v2")
    print("=" * 70)

    # ── Phase 1: Discover all products (no DB needed) ──
    print("\n[1] Discovering products from Reinsaat categories...")
    all_products: list[ProductData] = []
    visited: set[str] = set()
    for cat_url in CATEGORIES:
        print(f"\n Category: {cat_url}")
        products = discover_products(cat_url, max_depth=4, _visited=visited)
        all_products.extend(products)
        print(f" -> {len(products)} products")

    # Deduplicate by URL, keeping the first occurrence
    seen_urls = set()
    unique_products = []
    for p in all_products:
        if p.url not in seen_urls:
            seen_urls.add(p.url)
            unique_products.append(p)
    all_products = unique_products
    print(f"\n Total unique products: {len(all_products)}")

    # ── Phase 2: Connect to DB and load existing data ──
    print("\n[2] Connecting to DB and loading existing data...")
    conn = db_connect()
    try:
        cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        try:
            _sync_products(conn, cur, all_products)
        finally:
            cur.close()
    finally:
        # Closing without a commit discards any uncommitted work.
        conn.close()


if __name__ == "__main__":
    main()
+635
View File
@@ -0,0 +1,635 @@
#!/usr/bin/env python3
"""Reinsaat v3 scraper - uses HerbAPI REST API, robust botanical name matching."""
import json
import os
import re
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from html import unescape
# --- Config ---
API_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
# SECURITY: a bearer token should not live in source control. It can be
# supplied through the HERBAPI_TOKEN environment variable; the literal below
# is kept only as a backward-compatible fallback and should be rotated/removed.
API_TOKEN = os.environ.get(
    "HERBAPI_TOKEN", "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
)
REINSAAT_BASE = "https://www.reinsaat.at"
# Seconds to sleep between HTTP requests.
DELAY = 0.3

# Categories to scrape (seed products only, skip books/bulbs/peonies/potatoes/gift/seed_tapes)
CATEGORIES = [
    "beans", "peas", "florence_fennel", "cucumbers", "brassica", "garden_cress",
    "pumpkins_squash", "corn", "swiss_chard", "aubergine_eggplants", "melons",
    "carrots", "sweet_pepper", "chilli_peppers_chill", "parsnips", "parsley",
    "parsley_root", "leeks", "radish", "beetroot", "lettuce", "black_salsify",
    "celery", "spinach", "tomatoes", "zucchini_courgette", "onion_garlic",
    "culinary_and_aromatic_herbs", "conservation_varieties", "flowers_and_herbs",
    "wild_flowers_seeds", "green_manure",
]

# Suffixes to strip from botanical names (authority names, infraspecific ranks)
STRIP_SUFFIXES = {
    "l.", "mill.", "dc.", "l", "convar.", "convar", "var.", "var",
    "subsp.", "subsp", "ssp.", "ssp", "f.", "em.", "auct.",
    "hort.", "medik.", "moench", "pers.", "salisb.", "thunb.",
    "crantz", "gaertn.", "lam.", "link", "siebold", "zucc.",
    "sat.", "sat", "axillare", "medikus",
}
def api_get(path, params=None, timeout=30):
    """GET from HerbAPI and return the decoded JSON response.

    Args:
        path: Path below API_BASE, starting with "/".
        params: Optional mapping encoded into the query string.
        timeout: Seconds to wait on the socket before giving up, so a stalled
            server cannot block the run forever.

    Raises:
        urllib.error.URLError: On HTTP or connection errors (HTTPError is a
            subclass).
    """
    url = f"{API_BASE}{path}"
    if params:
        url += "?" + urllib.parse.urlencode(params)
    req = urllib.request.Request(url)
    req.add_header("Authorization", f"Bearer {API_TOKEN}")
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())
def api_post(path, data, timeout=30):
    """POST *data* as JSON to HerbAPI and return the decoded JSON response.

    Args:
        path: Path below API_BASE, starting with "/".
        data: JSON-serialisable request body.
        timeout: Seconds to wait on the socket before giving up, so a stalled
            server cannot block the run forever.

    Raises:
        urllib.error.URLError: On HTTP or connection errors (HTTPError is a
            subclass).
    """
    url = f"{API_BASE}{path}"
    body = json.dumps(data).encode()
    req = urllib.request.Request(url, data=body, method="POST")
    req.add_header("Authorization", f"Bearer {API_TOKEN}")
    req.add_header("Content-Type", "application/json")
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())
def fetch_page(url):
    """Download *url* and return its body decoded as UTF-8 text.

    Undecodable bytes are replaced rather than raising. The request uses a
    15 second timeout.
    """
    request = urllib.request.Request(
        url, headers={"User-Agent": "Mozilla/5.0 (HerbAPI Scraper)"}
    )
    with urllib.request.urlopen(request, timeout=15) as response:
        payload = response.read()
    return payload.decode("utf-8", errors="replace")
# Known misspellings: keys are either a genus or a full "genus species"
# string (lower case); values are the corrected form.
BOTANICAL_TYPOS = {
    "capscicum": "capsicum",
    "capsicum frutenscens": "capsicum frutescens",
    "tropaelum": "tropaeolum",
    "lact.": "lactuca",
}

# Abbreviated binomials (lower case, including the trailing period) mapped to
# the full name.
ABBREVIATED_NAMES = {
    "origanum vulg.": "origanum vulgare",
    "helichrysum bract.": "helichrysum bracteatum",
    "campanula lat.": "campanula latifolia",
    "cosmos bip.": "cosmos bipinnatus",
    "papaver somnif.": "papaver somniferum",
}


def normalise_botanical(raw):
    """Strip botanical name to genus + species only.

    'Pisum sativum L. convar. sat.' -> 'pisum sativum'
    'Solanum lycopersicum L.' -> 'solanum lycopersicum'
    'Beta vulgaris L. ssp. vulgaris' -> 'beta vulgaris'
    'Origanum vulg.' -> 'origanum vulgare'

    Returns:
        Lower-case "genus species", or None when *raw* is empty, has fewer
        than two words, or its second word looks like an authority
        (capitalised) rather than a species epithet.
    """
    if not raw:
        return None
    # Clean HTML entities and non-breaking spaces
    text = unescape(raw).replace("\xa0", " ").strip()

    # Abbreviations are looked up BEFORE trailing punctuation is stripped:
    # the table keys end in ".", so stripping first would make a bare
    # "Origanum vulg." impossible to match.
    probe = re.sub(r"\([^)]*\)", "", text).lower().strip()
    for abbrev, full in ABBREVIATED_NAMES.items():
        if probe.startswith(abbrev):
            return full

    # Remove trailing commas/periods, then content in parentheses
    text = text.rstrip(",. ")
    text = re.sub(r"\([^)]*\)", "", text)
    parts = text.split()
    if len(parts) < 2:
        return None

    genus = parts[0].lower().rstrip(",")
    species = parts[1].lower().rstrip(",")

    # Fix known typos: genus alone first, then the full binomial
    genus = BOTANICAL_TYPOS.get(genus, genus)
    full_name = f"{genus} {species}"
    if full_name in BOTANICAL_TYPOS:
        genus, species = BOTANICAL_TYPOS[full_name].split()

    # A token that was only punctuation (e.g. ",") is empty by now.
    if not genus or not species:
        return None
    # Validate: both words must start with a letter
    if not genus[0].isalpha() or not species[0].isalpha():
        return None
    # Skip if species looks like an authority (starts with uppercase in original)
    if parts[1][0].isupper():
        return None
    return f"{genus} {species}"
def _first_jsonld_product(html):
    """Return the first JSON-LD object of @type "Product" in *html*, or None."""
    for jm in re.finditer(
        r'<script type="application/ld\+json">(.*?)</script>', html, re.S
    ):
        try:
            jd = json.loads(jm.group(1))
        except json.JSONDecodeError:
            continue
        # A JSON-LD script may hold one object or an array of objects.
        for node in (jd if isinstance(jd, list) else [jd]):
            if isinstance(node, dict) and node.get("@type") == "Product":
                return node
    return None


def _cell_texts(row_html):
    """Return the tag-stripped, trimmed text of every <td> in a table row."""
    cells = re.findall(r"<td[^>]*>(.*?)</td>", row_html, re.S)
    return [re.sub(r"<[^>]+>", "", c).strip() for c in cells]


def _apply_price_table(html, result):
    """Fill port_price / pack_sizes from the first table with two or more rows.

    Row 0 is read as pack sizes, row 1 as the matching prices.
    """
    for tbl in re.findall(r"<table[^>]*>(.*?)</table>", html, re.S):
        rows = re.findall(r"<tr[^>]*>(.*?)</tr>", tbl, re.S)
        if len(rows) < 2:
            continue
        size_texts = _cell_texts(rows[0])
        price_texts = _cell_texts(rows[1])
        # Find the "Port." entry
        for i, st in enumerate(size_texts):
            if "Port" not in st:
                continue
            if i < len(price_texts):
                # Require an actual number: a cell such as "n.a." or "." must
                # not reach float() and abort the whole page.
                pm = re.search(r"\d+(?:\.\d+)?", price_texts[i].replace(",", "."))
                if pm:
                    result["port_price"] = float(pm.group())
            break
        result["pack_sizes"] = size_texts
        break


def extract_product_data(html, url):
    """Extract product info from a Reinsaat product page.

    Args:
        html: Page source.
        url: Page URL; copied into the result.

    Returns:
        Dict that always contains "url" and, when found on the page, any of:
        name, botanical_raw, botanical_norm, article_number, price_eur,
        port_price, pack_sizes, planting_depth_cm, row_spacing_cm,
        plant_spacing_cm, germination_temp_c, pack_size, pack_unit.
    """
    result = {}

    # H1 = variety name
    m = re.search(r'<h1[^>]*>([^<]+)</h1>', html)
    if m:
        name = unescape(m.group(1)).strip()
        # Clean up names like "RS-To-01.26 (Alda)" -> "Alda"
        paren = re.search(r"\(([^)]+)\)", name)
        if paren and re.match(r"RS-", name):
            name = paren.group(1).strip()
        result["name"] = name

    # Botanical name from fce_shop_kurztext
    m = re.search(
        r'fce_shop_kurztext[^>]*>\s*(?:<em[^>]*>)?\s*([^<]+?)\s*(?:</em>)?\s*</div>',
        html,
    )
    if m:
        result["botanical_raw"] = unescape(m.group(1)).replace("\xa0", " ").strip()
        result["botanical_norm"] = normalise_botanical(result["botanical_raw"])

    # Article number and smallest pack price from JSON-LD
    product = _first_jsonld_product(html)
    if product is not None:
        if "model" in product:
            result["article_number"] = str(product["model"])
        offers = product.get("offers", {})
        if isinstance(offers, dict):
            offer_list = offers.get("offers", [])
        elif isinstance(offers, list):
            offer_list = offers
        else:
            offer_list = []
        if offer_list:
            prices = [
                o["price"]
                for o in offer_list
                if isinstance(o, dict)
                and isinstance(o.get("price"), (int, float))
                and o["price"] > 0
            ]
            if prices:
                # Smallest price (usually the Portion)
                result["price_eur"] = min(prices)

    # Price table - portion price and pack sizes
    _apply_price_table(html, result)

    # Sowing depth (a range is averaged)
    m = re.search(r"(?:sowing|seed)\s*depth[:\s]*(?:approx\.?\s*)?(\d+[\.,]?\d*)\s*(?:-\s*(\d+[\.,]?\d*)\s*)?cm", html, re.I)
    if m:
        d1 = float(m.group(1).replace(",", "."))
        d2 = float(m.group(2).replace(",", ".")) if m.group(2) else d1
        result["planting_depth_cm"] = round((d1 + d2) / 2, 2)

    # Spacing "NN x NN cm": outdoor wording first, then "row spacing",
    # then any such pair on the page.
    m = re.search(r"(?:outdoors?|field)[^.]*?(\d+)\s*(?:x|×)\s*(\d+)\s*cm", html, re.I)
    if not m:
        m = re.search(r"row\s*spacing\s*(\d+)\s*(?:x|×)\s*(\d+)\s*cm", html, re.I)
    if not m:
        m = re.search(r"(\d+)\s*(?:x|×)\s*(\d+)\s*cm", html, re.I)
    if m:
        result["row_spacing_cm"] = float(m.group(1))
        result["plant_spacing_cm"] = float(m.group(2))

    # Row spacing without plant spacing (e.g. "row spacing 30-45 cm")
    if "row_spacing_cm" not in result:
        m = re.search(r"row\s*spacing\s*(\d+)(?:\s*-\s*(\d+))?\s*cm", html, re.I)
        if m:
            r1 = int(m.group(1))
            r2 = int(m.group(2)) if m.group(2) else r1
            result["row_spacing_cm"] = float((r1 + r2) // 2)  # whole-number midpoint

    # Germination temperature
    m = re.search(r"germination\s*temp[^:]*:\s*(\d+)\s*(?:-\s*(\d+))?\s*°?\s*C", html, re.I)
    if m:
        t1 = int(m.group(1))
        t2 = int(m.group(2)) if m.group(2) else t1
        result["germination_temp_c"] = float((t1 + t2) // 2)  # whole-number midpoint

    # Pack unit from portion info - "20 seeds" or "25 g" etc
    portion_m = re.search(r"[Pp]ortion\s*(?:contents?)?[:\s]*(\d+[\.,]?\d*)\s*(seeds?|Korn|g|kg)", html)
    if not portion_m:
        # Try "Port. (20 seeds)" format
        portion_m = re.search(r"Port[.\w]*\s*\(?\s*(\d+[\.,]?\d*)\s*(seeds?|Korn|g|kg)", html)
    if portion_m:
        result["pack_size"] = float(portion_m.group(1).replace(",", "."))
        unit = portion_m.group(2).lower()
        # Seed counts are stored under the unit name "Korn".
        result["pack_unit"] = "Korn" if unit in ("seed", "seeds", "korn") else unit

    result["url"] = url
    return result
def get_all_species():
    """Page through /species and return a lookup keyed by normalised name.

    Each value is a dict with the species' "id", "slug" and scientific
    "name". Species whose name cannot be normalised are left out. Paging
    stops at the first page holding fewer than 100 entries.
    """
    lookup = {}
    page_no = 0
    more = True
    while more:
        page_no += 1
        payload = api_get("/species", {"per_page": 100, "page": page_no})
        entries = payload.get("data", [])
        for entry in entries:
            key = normalise_botanical(entry["name_scientific"])
            if not key:
                continue
            lookup[key] = {
                "id": entry["id"],
                "slug": entry["slug"],
                "name": entry["name_scientific"],
            }
        print(f" page {page_no}: {len(entries)} species (total so far: {len(lookup)})")
        more = len(entries) >= 100
    return lookup
def get_all_cultivars():
    """Page through /cultivars and return them keyed by (species_id, name).

    The name part of the key is lower-cased and stripped. Paging stops at
    the first page holding fewer than 100 entries.
    """
    by_key = {}  # (species_id, lower_name) -> cultivar record
    page_no = 0
    more = True
    while more:
        page_no += 1
        payload = api_get("/cultivars", {"per_page": 100, "page": page_no})
        entries = payload.get("data", [])
        for record in entries:
            by_key[(record["species_id"], record["name"].lower().strip())] = record
        print(f" page {page_no}: {len(entries)} cultivars (total so far: {len(by_key)})")
        more = len(entries) >= 100
    return by_key
def get_reinsaat_supplier():
    """Return the supplier record whose slug is "reinsaat".

    Raises:
        RuntimeError: If /suppliers lists no such supplier.
    """
    match = next(
        (entry for entry in api_get("/suppliers") if entry["slug"] == "reinsaat"),
        None,
    )
    if match is None:
        raise RuntimeError("Reinsaat supplier not found in API")
    return match
def get_cultivar_suppliers(cultivar_id):
    """Return whatever /cultivars/<cultivar_id>/suppliers responds with."""
    endpoint = f"/cultivars/{cultivar_id}/suppliers"
    return api_get(endpoint)
def get_product_urls_from_category(cat_slug):
    """Return the absolute URLs of all links below a category page.

    The category page is fetched once and every ``href`` pointing somewhere
    under ``/shop/EN/<cat_slug>/`` is collected, regardless of depth. The
    result may therefore contain subcategory pages as well as products; the
    caller tells them apart after fetching.

    Returns:
        Sorted, de-duplicated list of URLs; empty when the page cannot be
        fetched (a warning is printed).
    """
    cat_url = f"{REINSAAT_BASE}/shop/EN/{cat_slug}/"
    try:
        html = fetch_page(cat_url)
    except Exception as e:
        print(f" WARN: Failed to fetch category {cat_slug}: {e}")
        return []
    time.sleep(DELAY)

    # All internal links under this category
    links = re.findall(rf'href="(/shop/EN/{re.escape(cat_slug)}/[^"]+/)"', html)
    return [REINSAAT_BASE + link for link in sorted(set(links))]
def is_product_page(html):
    """Tell whether *html* looks like a product page.

    True when the page mentions the ``fce_shop_kurztext`` marker or embeds
    a JSON-LD ``"@type": "Product"`` entry.
    """
    if "fce_shop_kurztext" in html:
        return True
    return re.search(r'"@type":\s*"Product"', html) is not None
def _deeper_links(html, cat, url):
    """Return absolute URLs of links in *html* that lie deeper than *url*.

    Only links under ``/shop/EN/<cat>/`` are kept. Depth is compared on URL
    paths with trailing slashes removed on both sides, so a direct child of
    *url* counts as deeper while the page itself and its siblings do not.
    """
    page_depth = urllib.parse.urlparse(url).path.rstrip("/").count("/")
    prefix = f"/shop/EN/{cat}/"
    paths = sorted(set(re.findall(r'href="(/shop/EN/[^"]+/)"', html)))
    return [
        REINSAAT_BASE + p
        for p in paths
        if p.startswith(prefix) and p.rstrip("/").count("/") > page_depth
    ]


def main():
    """Scrape every configured category and sync the products to HerbAPI.

    Loads species, cultivars and the Reinsaat supplier from the API, then
    fetches each URL of each category. Product pages are handed to
    process_product; other pages are treated as subcategories and their
    deeper links are fetched one level down. A report is printed at the end.
    """
    print("=" * 60)
    print("Reinsaat v3 Scraper")
    print("=" * 60)

    # Step 1: Load all species
    print("\n[1/4] Loading species from API...")
    species_map = get_all_species()
    print(f" Loaded {len(species_map)} species")

    # Step 2: Load all cultivars
    print("\n[2/4] Loading cultivars from API...")
    cultivar_map = get_all_cultivars()
    print(f" Loaded {len(cultivar_map)} cultivars")

    # Step 3: Get Reinsaat supplier
    print("\n[3/4] Getting Reinsaat supplier...")
    supplier = get_reinsaat_supplier()
    supplier_id = supplier["id"]
    print(f" Reinsaat ID: {supplier_id}")

    # Step 4: Scrape categories
    print(f"\n[4/4] Scraping {len(CATEGORIES)} categories...")
    stats = {
        "products_found": 0,
        "botanical_extracted": 0,
        "species_matched": 0,
        "species_not_matched": 0,
        "cultivar_existed": 0,
        "cultivar_created": 0,
        "link_existed": 0,
        "link_created": 0,
        "errors": 0,
    }
    unmatched_species = {}  # botanical_norm -> count
    new_cultivars = []
    new_links = []

    for cat_i, cat in enumerate(CATEGORIES):
        print(f"\n--- [{cat_i+1}/{len(CATEGORIES)}] {cat} ---")
        urls = get_product_urls_from_category(cat)
        print(f" Found {len(urls)} URLs")
        # URLs that are (or will be) handled for this category; also keeps
        # sub-URLs reachable from several subcategory pages from being
        # fetched more than once.
        known = set(urls)
        for url in urls:
            time.sleep(DELAY)
            try:
                html = fetch_page(url)
            except Exception as e:
                print(f" ERROR fetching {url}: {e}")
                stats["errors"] += 1
                continue

            if is_product_page(html):
                process_product(
                    html, url, species_map, cultivar_map,
                    supplier_id, stats, unmatched_species,
                    new_cultivars, new_links,
                )
                continue

            # Not a product: might be a subcategory - follow its deeper links
            for sub_url in _deeper_links(html, cat, url):
                if sub_url in known:
                    continue  # already in list
                known.add(sub_url)
                time.sleep(DELAY)
                try:
                    sub_html = fetch_page(sub_url)
                except Exception as e:
                    print(f" ERROR fetching {sub_url}: {e}")
                    stats["errors"] += 1
                    continue
                if not is_product_page(sub_html):
                    continue
                process_product(
                    sub_html, sub_url, species_map, cultivar_map,
                    supplier_id, stats, unmatched_species,
                    new_cultivars, new_links,
                )

    # Report
    print("\n" + "=" * 60)
    print("RESULTS")
    print("=" * 60)
    print(f"Products found: {stats['products_found']}")
    print(f"Botanical extracted: {stats['botanical_extracted']}")
    print(f"Species matched: {stats['species_matched']}")
    print(f"Species NOT matched: {stats['species_not_matched']}")
    print(f"Cultivars existed: {stats['cultivar_existed']}")
    print(f"Cultivars created: {stats['cultivar_created']}")
    print(f"Links existed: {stats['link_existed']}")
    print(f"Links created: {stats['link_created']}")
    print(f"Errors: {stats['errors']}")
    if new_cultivars:
        print(f"\n--- New cultivars ({len(new_cultivars)}) ---")
        for cv in new_cultivars:
            print(f" + {cv['name']} ({cv.get('species', '?')})")
    if new_links:
        print(f"\n--- New supplier links ({len(new_links)}) ---")
        for lk in new_links:
            print(f" + {lk['cultivar']} -> {lk.get('article', '?')}")
    if unmatched_species:
        print(f"\n--- Unmatched species ({len(unmatched_species)}) ---")
        for name, count in sorted(unmatched_species.items(), key=lambda x: -x[1]):
            print(f" ? {name} (x{count})")
    print("\nDone.")
def _find_existing_cultivar(cultivar_name, species_id):
    """Search the API for a stored cultivar matching *cultivar_name*.

    Three strategies are tried in order; the first hit is returned, None if
    all fail:
      1. exact (case-insensitive) name match in the full-name search,
         regardless of species;
      2. same species and punctuation-insensitive equal/contained name;
      3. search by the last word longer than two characters, same species
         and punctuation-insensitive equal name.
    """
    cn_lower = cultivar_name.lower().strip()
    cn_clean = re.sub(r'[^\w\s]', '', cn_lower)

    search_data = api_get("/cultivars", {"search": cultivar_name, "per_page": 50})
    candidates = search_data.get("data", [])

    # Strategy 1: search by full name
    for cv in candidates:
        if cv["name"].lower().strip() == cn_lower:
            return cv

    # Strategy 2: match by species_id + partial name (ignoring punctuation)
    for cv in candidates:
        if cv["species_id"] != species_id:
            continue
        cv_clean = re.sub(r'[^\w\s]', '', cv["name"].lower())
        if cv_clean == cn_clean or cv_clean in cn_clean or cn_clean in cv_clean:
            return cv

    # Strategy 3: search by last significant word
    words = [w for w in cultivar_name.split() if len(w) > 2]
    if words:
        search2 = api_get("/cultivars", {"search": words[-1], "per_page": 50})
        for cv in search2.get("data", []):
            if cv["species_id"] != species_id:
                continue
            if re.sub(r'[^\w\s]', '', cv["name"].lower()) == cn_clean:
                return cv
    return None


def process_product(html, url, species_map, cultivar_map, supplier_id,
                    stats, unmatched_species, new_cultivars, new_links):
    """Process a single product page.

    Extracts the product data, matches its species, creates the cultivar when
    it is not known yet and adds a supplier link when the cultivar has none
    for this supplier.

    Args:
        html: Page source of the product page.
        url: URL of the page.
        species_map: normalised botanical name -> species info.
        cultivar_map: (species_id, lower-cased name) -> cultivar record;
            updated in place with created/found cultivars.
        supplier_id: ID of the supplier to link against.
        stats: Counter dict, updated in place.
        unmatched_species: botanical name -> count, updated in place.
        new_cultivars: List receiving a summary dict per created cultivar.
        new_links: List receiving a summary dict per created link.
    """
    stats["products_found"] += 1
    prod = extract_product_data(html, url)
    if not prod.get("name"):
        return

    bot_norm = prod.get("botanical_norm")
    if not bot_norm:
        # No botanical name found on page
        stats["species_not_matched"] += 1
        unmatched_species["(no botanical name)"] = unmatched_species.get("(no botanical name)", 0) + 1
        return
    stats["botanical_extracted"] += 1

    # Match species
    species = species_map.get(bot_norm)
    if not species:
        stats["species_not_matched"] += 1
        unmatched_species[bot_norm] = unmatched_species.get(bot_norm, 0) + 1
        return
    stats["species_matched"] += 1
    species_id = species["id"]
    cultivar_name = prod["name"]

    # Check if cultivar exists
    cv_key = (species_id, cultivar_name.lower().strip())
    existing_cv = cultivar_map.get(cv_key)
    if existing_cv:
        stats["cultivar_existed"] += 1
        cultivar_id = existing_cv["id"]
    else:
        # Create cultivar
        create_data = {
            "species_id": species_id,
            "name": cultivar_name,
            "is_organic": True,
            "source_urls": [url],
        }
        # Add growing data if we extracted any
        for key in ("planting_depth_cm", "row_spacing_cm",
                    "plant_spacing_cm", "germination_temp_c"):
            if key in prod:
                create_data[key] = prod[key]
        try:
            new_cv = api_post("/cultivars", create_data)
            cultivar_id = new_cv["id"]
            stats["cultivar_created"] += 1
            new_cultivars.append({
                "name": cultivar_name,
                "species": species["name"],
                "id": cultivar_id,
            })
            # Add to local cache
            cultivar_map[cv_key] = new_cv
            print(f" + Created cultivar: {cultivar_name} ({species['name']})")
        except urllib.error.HTTPError as e:
            body = e.read().decode() if hasattr(e, 'read') else str(e)
            if not (e.code == 500 and "Database error" in body):
                print(f" ERROR creating cultivar '{cultivar_name}': {e.code} {body}")
                stats["errors"] += 1
                return
            # Likely slug collision - search for existing cultivar
            try:
                found = _find_existing_cultivar(cultivar_name, species_id)
                if not found:
                    print(f" WARN: could not create or find cultivar '{cultivar_name}' (DB error + no search match)")
                    stats["errors"] += 1
                    return
                cultivar_id = found["id"]
                cultivar_map[cv_key] = found
                stats["cultivar_existed"] += 1
            except Exception as e2:
                print(f" ERROR searching for '{cultivar_name}' after collision: {e2}")
                stats["errors"] += 1
                return

    # Check if Reinsaat supplier link exists
    try:
        existing_links = get_cultivar_suppliers(cultivar_id)
    except Exception as e:
        # Best effort: carry on as if there were no links, but say so
        # instead of hiding the failure.
        print(f" WARN: could not load supplier links for '{cultivar_name}': {e}")
        existing_links = []
    has_reinsaat = any(link["supplier_id"] == supplier_id for link in existing_links)
    if has_reinsaat:
        stats["link_existed"] += 1
        return

    # Create supplier link
    link_data = {
        "supplier_id": supplier_id,
        "product_url": url,
    }
    if "article_number" in prod:
        link_data["article_number"] = prod["article_number"]
    # Prefer the portion price from the price table over the JSON-LD minimum
    if "port_price" in prod:
        link_data["price_eur"] = prod["port_price"]
    elif "price_eur" in prod:
        link_data["price_eur"] = prod["price_eur"]
    for key in ("pack_size", "pack_unit"):
        if key in prod:
            link_data[key] = prod[key]
    try:
        api_post(f"/cultivars/{cultivar_id}/suppliers", link_data)
        stats["link_created"] += 1
        new_links.append({
            "cultivar": cultivar_name,
            "article": prod.get("article_number", "?"),
            "url": url,
        })
    except urllib.error.HTTPError as e:
        body = e.read().decode() if hasattr(e, 'read') else str(e)
        print(f" ERROR linking '{cultivar_name}': {e.code} {body}")
        stats["errors"] += 1


if __name__ == "__main__":
    main()