Add scraper and enrichment scripts to tools/ directory
This commit is contained in:
@@ -0,0 +1,156 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Enrich HerbAPI species with Wikidata QID, GBIF ID, and EPPO code."""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import urllib.parse
|
||||||
|
import urllib.request
|
||||||
|
|
||||||
|
import os  # imported here because this script's import header does not include os

# Base URL that every HerbAPI request path is appended to.
HERBAPI_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
# Bearer token for HerbAPI. It is read from the environment so the secret is
# not stored in version control; the token that was previously committed in
# this file should be considered leaked and rotated.
HERBAPI_TOKEN = os.environ.get("HERBAPI_TOKEN", "")
# Wikidata SPARQL query endpoint.
WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"

# Headers sent with every Wikidata request: an identifying User-Agent and a
# request for a JSON response.
HEADERS_WD = {
    "User-Agent": "HerbAPI-Enrichment/1.0 (florian.berthold@sub-net.at)",
    "Accept": "application/json",
}
|
||||||
|
|
||||||
|
|
||||||
|
def herbapi_request(path, method="GET", data=None, timeout=60):
    """Send an authenticated JSON request to HerbAPI and return the decoded reply.

    Args:
        path: Path (including any query string) appended verbatim to
            ``HERBAPI_BASE``.
        method: HTTP method name.
        data: JSON-serialisable payload, or ``None`` to send no body.
        timeout: Socket timeout in seconds, so an unresponsive server cannot
            block the script indefinitely.

    Returns:
        The response body parsed as JSON.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` (includes
            ``HTTPError`` for non-success status codes).
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    url = f"{HERBAPI_BASE}{path}"
    # Compare against None rather than truthiness so that an intentionally
    # empty payload ({} or []) is still sent as a request body.
    body = json.dumps(data).encode() if data is not None else None
    req = urllib.request.Request(url, data=body, method=method, headers={
        "Authorization": f"Bearer {HERBAPI_TOKEN}",
        "Content-Type": "application/json",
    })
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())
|
||||||
|
|
||||||
|
|
||||||
|
def _sparql_string_literal(text):
    """Return *text* as a double-quoted SPARQL string literal with escapes applied."""
    escaped = (
        text.replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )
    return f'"{escaped}"'


def query_wikidata_batch(names):
    """Query Wikidata for a batch of scientific names.

    Args:
        names: Iterable of scientific names, matched against the taxon-name
            property (``wdt:P225``).

    Returns:
        A dict mapping each matched name to
        ``{"qid": ..., "gbif_id": ..., "eppo_code": ...}``. ``gbif_id`` and
        ``eppo_code`` are ``None`` when the row carries no such value. When
        several result rows share a name, the last row wins. An empty batch
        yields ``{}`` without contacting Wikidata.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen``.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    names = list(names)
    if not names:
        # Nothing to look up: skip the network round trip entirely.
        return {}

    # Escape every name so that a quote or backslash inside a name cannot
    # break out of the string literal and corrupt (or alter) the query.
    values = " ".join(_sparql_string_literal(n) for n in names)
    sparql = f"""SELECT ?name ?item ?gbifId ?eppoCode WHERE {{
  VALUES ?name {{ {values} }}
  ?item wdt:P225 ?name .
  OPTIONAL {{ ?item wdt:P846 ?gbifId }}
  OPTIONAL {{ ?item wdt:P3031 ?eppoCode }}
}}"""
    encoded = urllib.parse.quote(sparql)
    url = f"{WIKIDATA_SPARQL}?query={encoded}&format=json"
    req = urllib.request.Request(url, headers=HEADERS_WD)
    with urllib.request.urlopen(req, timeout=60) as resp:
        data = json.loads(resp.read())

    results = {}
    for binding in data.get("results", {}).get("bindings", []):
        name = binding["name"]["value"]
        # The item value is an entity URL; its last path segment is the QID.
        qid = binding["item"]["value"].rsplit("/", 1)[-1]
        gbif = binding.get("gbifId", {}).get("value")
        eppo = binding.get("eppoCode", {}).get("value")
        results[name] = {"qid": qid, "gbif_id": gbif, "eppo_code": eppo}
    return results
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Fill in missing Wikidata QID, GBIF ID and EPPO code on HerbAPI species.

    Steps: fetch the species listing, select entries with at least one empty
    identifier, look their scientific names up on Wikidata in batches of 20,
    then for each match GET the full species by slug, fill only the fields
    that were empty, and PUT the object back by id. A summary is printed at
    the end.
    """
    batch_size = 20

    # 1. Fetch all species
    listing = herbapi_request("/species?per_page=200")
    species_list = listing["data"]
    print(f"Fetched {len(species_list)} species from HerbAPI\n")

    # 2. Collect species needing enrichment
    def is_incomplete(entry):
        return not (entry["wikidata_qid"] and entry["gbif_id"] and entry["eppo_code"])

    to_enrich = [entry for entry in species_list if is_incomplete(entry)]
    if not to_enrich:
        print("All species already enriched.")
        return

    print(f"{len(to_enrich)} species need enrichment\n")

    # 3. Batch query Wikidata
    names = [entry["name_scientific"] for entry in to_enrich]
    wikidata_results = {}
    for batch_no, start in enumerate(range(0, len(names), batch_size), start=1):
        chunk = names[start:start + batch_size]
        print(f"Querying Wikidata batch {batch_no}: {len(chunk)} species...")
        try:
            found = query_wikidata_batch(chunk)
            wikidata_results.update(found)
            print(f" Got {len(found)} matches")
        except Exception as exc:
            # A failed batch is reported and skipped; later batches still run.
            print(f" ERROR: {exc}")
        # Pause between batches, but not after the final one.
        if start + batch_size < len(names):
            time.sleep(2)

    print(f"\nWikidata returned data for {len(wikidata_results)} / {len(names)} species\n")

    # 4. Update HerbAPI - GET full object by slug, merge, PUT by UUID
    tally = {"updated": 0, "skipped": 0, "not_found": 0, "errors": 0}
    # (HerbAPI field, Wikidata result key, label used in the log line)
    field_plan = (
        ("wikidata_qid", "qid", "qid"),
        ("gbif_id", "gbif_id", "gbif"),
        ("eppo_code", "eppo_code", "eppo"),
    )

    for entry in to_enrich:
        name = entry["name_scientific"]
        match = wikidata_results.get(name)
        if not match:
            print(f" SKIP (no Wikidata match): {name}")
            tally["not_found"] += 1
            continue

        # Only fields that are empty in HerbAPI and present on Wikidata change.
        pending = [
            (field, source, label)
            for field, source, label in field_plan
            if not entry[field] and match[source]
        ]
        if not pending:
            print(f" SKIP (nothing new): {name}")
            tally["skipped"] += 1
            continue

        try:
            # GET full species by slug for the complete object
            full_sp = herbapi_request(f"/species/{entry['slug']}")

            # Remove read-only fields
            species_id = full_sp.pop("id")
            for read_only in ("slug", "created_at", "updated_at"):
                full_sp.pop(read_only, None)

            # Merge new data (only null fields)
            for field, source, _label in pending:
                value = match[source]
                # API expects the GBIF id as a string
                full_sp[field] = str(value) if field == "gbif_id" else value

            # PUT by UUID
            herbapi_request(f"/species/{species_id}", method="PUT", data=full_sp)

            fields = [f"{label}={match[source]}" for _field, source, label in pending]
            print(f" UPDATED: {name} -> {', '.join(fields)}")
            tally["updated"] += 1
        except Exception as exc:
            print(f" ERROR updating {name}: {exc}")
            tally["errors"] += 1

    print(f"\n{'=' * 60}")
    print(f"RESULTS:")
    print(f" Updated: {tally['updated']}")
    print(f" Skipped (no new data): {tally['skipped']}")
    print(f" Not found on Wikidata: {tally['not_found']}")
    print(f" Errors: {tally['errors']}")
    print(f" Total species: {len(species_list)}")
|
||||||
|
|
||||||
|
|
||||||
|
# Run the enrichment only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||||
@@ -0,0 +1,305 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Expand HerbAPI species database with common permaculture/garden species."""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import urllib.request
|
||||||
|
import urllib.parse
|
||||||
|
import urllib.error
|
||||||
|
import ssl
|
||||||
|
|
||||||
|
import os  # imported here because this script's import header does not include os

# Base URL that every HerbAPI request path is appended to.
BASE_URL = "http://herbapi01.corp.sub-net.at:8080/api/v1"
# Value of the Authorization header. The bearer token comes from the
# environment so the secret is not stored in version control; the token that
# was previously committed in this file should be considered leaked and rotated.
AUTH = f"Bearer {os.environ.get('HERBAPI_TOKEN', '')}"
# Pause in seconds after each create request.
DELAY = 0.15

# SSL context for GBIF (https)
ssl_ctx = ssl.create_default_context()
|
||||||
|
|
||||||
|
|
||||||
|
def api_get(path, timeout=30):
    """GET ``BASE_URL + path`` with the Authorization header and return parsed JSON.

    Args:
        path: Path (including any query string) appended verbatim to ``BASE_URL``.
        timeout: Socket timeout in seconds, so an unresponsive server cannot
            block the script indefinitely.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` (includes
            ``HTTPError`` for non-success status codes).
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    req = urllib.request.Request(f"{BASE_URL}{path}", headers={"Authorization": AUTH})
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())
|
||||||
|
|
||||||
|
|
||||||
|
def api_post(path, data, timeout=30):
    """POST *data* as JSON to ``BASE_URL + path``.

    Args:
        path: Path appended verbatim to ``BASE_URL``.
        data: JSON-serialisable request payload.
        timeout: Socket timeout in seconds, so an unresponsive server cannot
            block the script indefinitely.

    Returns:
        ``(parsed_body, status)`` on success, where ``parsed_body`` is ``None``
        if the server sent an empty body. On an HTTP error the error body is
        printed and ``(None, status_code)`` is returned.

    Raises:
        urllib.error.URLError: For connection-level failures (not caught here).
        json.JSONDecodeError: If a non-empty success body is not valid JSON.
    """
    body = json.dumps(data).encode()
    req = urllib.request.Request(
        f"{BASE_URL}{path}",
        data=body,
        headers={"Authorization": AUTH, "Content-Type": "application/json"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            raw = resp.read()
            # A success response without a body must not crash the whole run.
            parsed = json.loads(raw) if raw.strip() else None
            return parsed, resp.status
    except urllib.error.HTTPError as e:
        # errors="replace": a non-UTF-8 error page must not hide the status code.
        err_body = e.read().decode(errors="replace")
        print(f" ERROR {e.code}: {err_body}")
        return None, e.code
|
||||||
|
|
||||||
|
|
||||||
|
def gbif_get_german_name(scientific_name):
    """Look up a German vernacular name for *scientific_name* on GBIF.

    The name is first matched to a GBIF usage key; the vernacular names of
    that usage are then scanned for the first entry whose language is "deu".

    Returns:
        The vernacular name, or ``None`` when there is no usage key, no German
        entry, or any error occurred (the error is printed).
    """
    def fetch(url):
        request = urllib.request.Request(url)
        with urllib.request.urlopen(request, context=ssl_ctx, timeout=10) as resp:
            return json.loads(resp.read())

    try:
        match = fetch(
            f"https://api.gbif.org/v1/species/match?name={urllib.parse.quote(scientific_name)}"
        )
        usage_key = match.get("usageKey")
        if not usage_key:
            return None

        listing = fetch(
            f"https://api.gbif.org/v1/species/{usage_key}/vernacularNames?limit=100"
        )
        german_names = (
            entry["vernacularName"]
            for entry in listing.get("results", [])
            if entry.get("language") == "deu"
        )
        return next(german_names, None)
    except Exception as e:
        # Best effort: any failure is reported and treated as "no name found".
        print(f" GBIF lookup failed for {scientific_name}: {e}")
        return None
|
||||||
|
|
||||||
|
|
||||||
|
# ── Families to ensure exist ─────────────────────────────────────────
|
||||||
|
# Plant families the species list depends on, keyed by scientific family name.
# Each value carries the English and German display names sent when main()
# has to create the family.
FAMILIES_NEEDED = {
    "Fabaceae": {"name_en": "Legumes", "name_de": "Hülsenfrüchtler"},
    "Solanaceae": {"name_en": "Nightshade family", "name_de": "Nachtschattengewächse"},
    "Cucurbitaceae": {"name_en": "Gourd family", "name_de": "Kürbisgewächse"},
    "Asteraceae": {"name_en": "Daisy family", "name_de": "Korbblütler"},
    "Chenopodiaceae": {"name_en": "Goosefoot family", "name_de": "Gänsefußgewächse"},
    "Brassicaceae": {"name_en": "Cabbage family", "name_de": "Kreuzblütler"},
    "Amaryllidaceae": {"name_en": "Amaryllis family", "name_de": "Amaryllisgewächse"},
    "Apiaceae": {"name_en": "Carrot family", "name_de": "Doldenblütler"},
    "Poaceae": {"name_en": "Grass family", "name_de": "Süßgräser"},
    "Lamiaceae": {"name_en": "Mint family", "name_de": "Lippenblütler"},
    "Caprifoliaceae": {"name_en": "Honeysuckle family", "name_de": "Geißblattgewächse"},
    "Rosaceae": {"name_en": "Rose family", "name_de": "Rosengewächse"},
    "Grossulariaceae": {"name_en": "Gooseberry family", "name_de": "Stachelbeergewächse"},
    "Ericaceae": {"name_en": "Heath family", "name_de": "Heidekrautgewächse"},
    "Moraceae": {"name_en": "Mulberry family", "name_de": "Maulbeergewächse"},
    # New families not yet in the DB:
    "Hypericaceae": {"name_en": "St John's wort family", "name_de": "Johanniskrautgewächse"},
    "Tropaeolaceae": {"name_en": "Nasturtium family", "name_de": "Kapuzinerkressengewächse"},
    "Elaeagnaceae": {"name_en": "Oleaster family", "name_de": "Ölweidengewächse"},
}
|
||||||
|
|
||||||
|
# ── Species to add ───────────────────────────────────────────────────
|
||||||
|
# Format: (scientific_name, family, name_en, name_de, plant_layer, extra_fields)
|
||||||
|
# Curated species to create. Each entry is a 6-tuple:
#   (scientific_name, family, name_en, name_de, plant_layer, extra_fields)
# `family` must be a key that main() can resolve to a family id, and
# `extra_fields` is a dict merged verbatim into the POST payload.
SPECIES = [
    # Vegetables
    ("Phaseolus vulgaris", "Fabaceae", "common bean", "Gartenbohne", "herbaceous",
     {"nitrogen_fixer": True, "food_uses": "Beans (pods, seeds)"}),
    ("Phaseolus coccineus", "Fabaceae", "runner bean", "Feuerbohne", "herbaceous",
     {"nitrogen_fixer": True, "food_uses": "Beans (pods, seeds), flowers", "attracts_pollinators": True}),
    ("Pisum sativum", "Fabaceae", "pea", "Erbse", "herbaceous",
     {"nitrogen_fixer": True, "food_uses": "Peas, shoots"}),
    ("Capsicum annuum", "Solanaceae", "pepper", "Paprika", "herbaceous",
     {"food_uses": "Fruit"}),
    ("Cucumis sativus", "Cucurbitaceae", "cucumber", "Gurke", "ground_cover",
     {"food_uses": "Fruit"}),
    ("Cucurbita maxima", "Cucurbitaceae", "winter squash", "Riesenkürbis", "ground_cover",
     {"food_uses": "Fruit, seeds, flowers"}),
    ("Cucurbita moschata", "Cucurbitaceae", "butternut squash", "Moschuskürbis", "ground_cover",
     {"food_uses": "Fruit, seeds"}),
    ("Lactuca sativa", "Asteraceae", "lettuce", "Salat", "herbaceous",
     {"food_uses": "Leaves"}),
    ("Spinacia oleracea", "Chenopodiaceae", "spinach", "Spinat", "herbaceous",
     {"food_uses": "Leaves"}),
    ("Brassica oleracea", "Brassicaceae", "cabbage / kale", "Kohl", "herbaceous",
     {"food_uses": "Leaves, flower buds, stems"}),
    ("Brassica rapa", "Brassicaceae", "turnip", "Rübe", "herbaceous",
     {"food_uses": "Root, leaves"}),
    ("Raphanus sativus", "Brassicaceae", "radish", "Rettich", "herbaceous",
     {"food_uses": "Root, leaves, seed pods"}),
    ("Allium cepa", "Amaryllidaceae", "onion", "Zwiebel", "herbaceous",
     {"food_uses": "Bulb, leaves"}),
    ("Allium sativum", "Amaryllidaceae", "garlic", "Knoblauch", "herbaceous",
     {"food_uses": "Bulb, scapes", "medicinal_uses": "Antimicrobial, cardiovascular"}),
    ("Allium schoenoprasum", "Amaryllidaceae", "chives", "Schnittlauch", "herbaceous",
     {"food_uses": "Leaves, flowers", "attracts_pollinators": True}),
    ("Petroselinum crispum", "Apiaceae", "parsley", "Petersilie", "herbaceous",
     {"food_uses": "Leaves, root"}),
    ("Apium graveolens", "Apiaceae", "celery", "Sellerie", "herbaceous",
     {"food_uses": "Stalks, root, leaves"}),
    ("Foeniculum vulgare", "Apiaceae", "fennel", "Fenchel", "herbaceous",
     {"food_uses": "Bulb, fronds, seeds", "attracts_beneficial_insects": True}),
    ("Pastinaca sativa", "Apiaceae", "parsnip", "Pastinake", "herbaceous",
     {"food_uses": "Root"}),
    ("Zea mays", "Poaceae", "corn", "Mais", "herbaceous",
     {"food_uses": "Kernels, cobs"}),
    ("Solanum melongena", "Solanaceae", "eggplant", "Melanzani", "herbaceous",
     {"food_uses": "Fruit"}),

    # Herbs
    ("Ocimum basilicum", "Lamiaceae", "basil", "Basilikum", "herbaceous",
     {"food_uses": "Leaves", "attracts_pollinators": True}),
    ("Origanum vulgare", "Lamiaceae", "oregano", "Oregano", "herbaceous",
     {"food_uses": "Leaves", "attracts_pollinators": True, "attracts_beneficial_insects": True}),
    ("Mentha x piperita", "Lamiaceae", "peppermint", "Pfefferminze", "herbaceous",
     {"food_uses": "Leaves (tea, culinary)", "medicinal_uses": "Digestive, headache relief", "invasiveness": "spreading"}),
    ("Rosmarinus officinalis", "Lamiaceae", "rosemary", "Rosmarin", "herbaceous",
     {"food_uses": "Leaves", "attracts_pollinators": True}),
    ("Anethum graveolens", "Apiaceae", "dill", "Dill", "herbaceous",
     {"food_uses": "Leaves, seeds", "attracts_beneficial_insects": True}),
    ("Coriandrum sativum", "Apiaceae", "coriander", "Koriander", "herbaceous",
     {"food_uses": "Leaves, seeds", "attracts_beneficial_insects": True}),
    ("Artemisia absinthium", "Asteraceae", "wormwood", "Wermut", "herbaceous",
     {"medicinal_uses": "Digestive, anti-parasitic", "other_uses": "Companion plant pest deterrent", "allelopathic": True}),
    ("Achillea millefolium", "Asteraceae", "yarrow", "Schafgarbe", "herbaceous",
     {"food_uses": "Young leaves (salad)", "medicinal_uses": "Wound healing, anti-inflammatory",
      "dynamic_accumulator": True, "dynamic_accumulator_nutrients": "K, P, Cu",
      "attracts_beneficial_insects": True, "attracts_pollinators": True}),
    ("Hypericum perforatum", "Hypericaceae", "St John's wort", "Johanniskraut", "herbaceous",
     {"medicinal_uses": "Antidepressant, wound healing", "attracts_pollinators": True}),
    ("Echinacea purpurea", "Asteraceae", "echinacea", "Sonnenhut", "herbaceous",
     {"medicinal_uses": "Immune stimulant", "attracts_pollinators": True, "wildlife_value": "Seeds for birds"}),
    ("Valeriana officinalis", "Caprifoliaceae", "valerian", "Baldrian", "herbaceous",
     {"medicinal_uses": "Sedative, sleep aid", "attracts_pollinators": True,
      "other_uses": "Earthworm attractant (biodynamic)"}),

    # Flowers & cover crops
    ("Tagetes patula", "Asteraceae", "French marigold", "Studentenblume", "herbaceous",
     {"other_uses": "Nematode suppression, companion plant", "attracts_pollinators": True}),
    ("Helianthus annuus", "Asteraceae", "sunflower", "Sonnenblume", "herbaceous",
     {"food_uses": "Seeds, oil", "attracts_pollinators": True, "wildlife_value": "Seeds for birds"}),
    ("Tropaeolum majus", "Tropaeolaceae", "nasturtium", "Kapuzinerkresse", "ground_cover",
     {"food_uses": "Leaves, flowers, seeds (capers)", "other_uses": "Trap crop for aphids"}),
    ("Centaurea cyanus", "Asteraceae", "cornflower", "Kornblume", "herbaceous",
     {"food_uses": "Flowers (edible garnish)", "attracts_pollinators": True, "attracts_beneficial_insects": True}),
    ("Sinapis alba", "Brassicaceae", "white mustard", "Weißer Senf", "herbaceous",
     {"food_uses": "Seeds, young leaves", "other_uses": "Green manure, biofumigant"}),
    ("Trifolium repens", "Fabaceae", "white clover", "Weißklee", "ground_cover",
     {"nitrogen_fixer": True, "food_uses": "Flowers (tea), young leaves",
      "ground_cover_quality": "excellent", "attracts_pollinators": True}),
    ("Medicago sativa", "Fabaceae", "alfalfa", "Luzerne", "herbaceous",
     {"nitrogen_fixer": True, "food_uses": "Sprouts",
      "dynamic_accumulator": True, "dynamic_accumulator_nutrients": "N, K, Ca, Mg, Fe",
      "other_uses": "Green manure, deep-rooting soil improver"}),

    # Fruit / Trees
    ("Prunus avium", "Rosaceae", "sweet cherry", "Süßkirsche", "canopy",
     {"food_uses": "Fruit", "attracts_pollinators": True, "wildlife_value": "Fruit for birds"}),
    ("Prunus cerasus", "Rosaceae", "sour cherry", "Sauerkirsche", "understory",
     {"food_uses": "Fruit (cooking, preserves)", "attracts_pollinators": True}),
    ("Pyrus communis", "Rosaceae", "pear", "Birne", "canopy",
     {"food_uses": "Fruit", "attracts_pollinators": True}),
    ("Ribes uva-crispa", "Grossulariaceae", "gooseberry", "Stachelbeere", "shrub",
     {"food_uses": "Berries"}),
    ("Rubus fruticosus", "Rosaceae", "blackberry", "Brombeere", "shrub",
     {"food_uses": "Berries, leaves (tea)", "attracts_pollinators": True,
      "wildlife_value": "Berries for birds, nesting habitat", "invasiveness": "spreading"}),
    ("Vaccinium myrtillus", "Ericaceae", "bilberry", "Heidelbeere", "shrub",
     {"food_uses": "Berries", "medicinal_uses": "Antioxidant, eye health"}),
    ("Hippophae rhamnoides", "Elaeagnaceae", "sea buckthorn", "Sanddorn", "shrub",
     {"nitrogen_fixer": True, "food_uses": "Berries (juice, oil)",
      "medicinal_uses": "High vitamin C, skin care",
      "other_uses": "Erosion control, windbreak"}),
    ("Morus alba", "Moraceae", "white mulberry", "Weiße Maulbeere", "canopy",
     {"food_uses": "Fruit, young leaves", "wildlife_value": "Fruit for birds"}),
]
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Create missing plant families, then add the curated species to HerbAPI.

    Families listed in ``FAMILIES_NEEDED`` that HerbAPI does not yet return
    are created first. Every ``SPECIES`` entry that is not already present is
    then posted with its family id, names, plant layer and extra fields. The
    GBIF German name is only looked up and logged; the curated ``name_de`` is
    what gets sent. A summary is printed at the end.
    """
    # 1. Load existing families
    print("=== Loading existing families ===")
    family_map = {
        row["name_scientific"]: row["id"]  # name_scientific -> id
        for row in api_get("/families?per_page=100")["data"]
    }
    print(f" Found {len(family_map)} existing families")

    # 2. Create missing families
    print("\n=== Creating missing families ===")
    families_created = 0
    for fam_name, fam_info in FAMILIES_NEEDED.items():
        if fam_name in family_map:
            print(f" SKIP (exists): {fam_name}")
            continue

        family_payload = {
            "name_scientific": fam_name,
            "name_en": fam_info["name_en"],
            "name_de": fam_info["name_de"],
        }
        print(f" CREATE: {fam_name} ...", end=" ")
        reply, status = api_post("/families", family_payload)
        if not (reply and "id" in reply):
            print(f"FAILED (status={status})")
        else:
            # Remember the new id so species of this family can reference it.
            family_map[fam_name] = reply["id"]
            print(f"OK ({reply['id']})")
            families_created += 1
        time.sleep(DELAY)

    print(f"\n Families created: {families_created}")

    # 3. Load existing species
    print("\n=== Loading existing species ===")
    existing_species = {
        row["name_scientific"] for row in api_get("/species?per_page=200")["data"]
    }
    print(f" Found {len(existing_species)} existing species")

    # 4. Add new species
    print("\n=== Adding new species ===")
    created = skipped = failed = 0

    for entry in SPECIES:
        sci_name, family, name_en, name_de, plant_layer, extras = entry

        if sci_name in existing_species:
            print(f" SKIP (exists): {sci_name}")
            skipped += 1
            continue

        # Look up family ID
        fam_id = family_map.get(family)
        if not fam_id:
            print(f" SKIP (no family '{family}'): {sci_name}")
            failed += 1
            continue

        # Try GBIF for German name; it is logged for validation only and the
        # curated name_de stays the value that is sent.
        gbif_de = gbif_get_german_name(sci_name)
        if gbif_de:
            print(f" GBIF name for {sci_name}: {gbif_de}")

        # Base fields first, then the per-species extra fields on top.
        species_payload = {
            "name_scientific": sci_name,
            "family_id": fam_id,
            "name_en": name_en,
            "name_de": name_de,
            "plant_layer": plant_layer,
            **extras,
        }

        print(f" CREATE: {sci_name} ({name_de}) ...", end=" ")
        reply, status = api_post("/species", species_payload)
        if reply and "id" in reply:
            print(f"OK ({reply['id']})")
            created += 1
        else:
            print(f"FAILED (status={status})")
            failed += 1
        time.sleep(DELAY)

    print(f"\n{'='*50}")
    print(f"SUMMARY")
    print(f" Families created: {families_created}")
    print(f" Species created: {created}")
    print(f" Species skipped: {skipped}")
    print(f" Species failed: {failed}")
    print(f" Total species now: {len(existing_species) + created}")
|
||||||
|
|
||||||
|
|
||||||
|
# Run the import only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||||
@@ -0,0 +1,362 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Import CC-licensed plant images from Wikimedia Commons via Wikidata into HerbAPI."""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import urllib.parse
|
||||||
|
import urllib.request
|
||||||
|
|
||||||
|
# Switch stdout and stderr to line buffering so each printed progress line is
# flushed as soon as it is complete.
sys.stdout.reconfigure(line_buffering=True)
sys.stderr.reconfigure(line_buffering=True)
|
||||||
|
|
||||||
|
# --- Configuration ---
|
||||||
|
# S3-compatible object store that receives the uploaded images.
S3_ENDPOINT = "http://garage.sub-net.at:3900"
S3_BUCKET = "herbapi"
# Credentials are read from the standard AWS environment variables so the
# secrets are not stored in version control; the key pair that was previously
# committed in this file should be considered leaked and rotated.
S3_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY_ID", "")
S3_SECRET_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "")
S3_REGION = "garage"

# PostgreSQL connection used by psql(). The password comes from PGPASSWORD for
# the same reason; the previously committed password should be rotated too.
DB_HOST = "10.31.3.90"
DB_USER = "herbapi"
DB_PASS = os.environ.get("PGPASSWORD", "")
DB_NAME = "herbapi"

# User-Agent sent with every outgoing HTTP request made by this script.
USER_AGENT = "HerbAPI/1.0 (https://herbapi.naturalised.at; florian.berthold@sub-net.at)"
# Thumbnail width in pixels requested from the Commons API.
THUMB_WIDTH = 800
# Pause in seconds before each Wikidata image lookup.
REQUEST_DELAY = 0.3
|
||||||
|
|
||||||
|
# License short names accepted by exact match. is_license_allowed() compares
# the lower-cased, stripped license string against this set before falling
# back to pattern checks, so every entry here must be lower case.
ALLOWED_LICENSES = {
    "cc0", "cc-zero", "cc0 1.0", "cc-zero 1.0",
    "public domain", "pd", "pd-self", "pd-old", "pd-old-auto", "pd-old-100",
    "pd-us", "pd-usgov", "pd-author",
    "cc by 1.0", "cc by 2.0", "cc by 2.5", "cc by 3.0", "cc by 4.0",
    "cc-by-1.0", "cc-by-2.0", "cc-by-2.5", "cc-by-3.0", "cc-by-4.0",
    "cc by-sa 1.0", "cc by-sa 2.0", "cc by-sa 2.5", "cc by-sa 3.0", "cc by-sa 4.0",
    "cc-by-sa-1.0", "cc-by-sa-2.0", "cc-by-sa-2.5", "cc-by-sa-3.0", "cc-by-sa-4.0",
}
|
||||||
|
|
||||||
|
|
||||||
|
def slugify(name: str) -> str:
    """Turn a scientific name into a lower-case, hyphen-separated slug.

    Every run of characters other than ``a-z`` and ``0-9`` (after
    lower-casing) becomes a single hyphen; leading and trailing hyphens are
    removed.
    """
    lowered = name.lower()
    hyphenated = re.sub(r'[^a-z0-9]+', '-', lowered)
    return hyphenated.strip('-')
|
||||||
|
|
||||||
|
|
||||||
|
def psql(query: str) -> str:
    """Execute *query* through the ``psql`` command-line client.

    The password is handed to the child process via ``PGPASSWORD``. Output is
    requested tuples-only and unaligned (``-t -A``).

    Returns:
        The stripped standard output. A non-zero exit status is reported on
        stderr but does not raise; whatever stdout contains is still returned.
    """
    command = ["psql", "-h", DB_HOST, "-U", DB_USER, DB_NAME, "-t", "-A", "-c", query]
    child_env = {**os.environ, "PGPASSWORD": DB_PASS}
    completed = subprocess.run(command, capture_output=True, text=True, env=child_env)
    if completed.returncode != 0:
        print(f" psql error: {completed.stderr.strip()}", file=sys.stderr)
    return completed.stdout.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_json(url: str) -> dict | None:
    """GET *url* with the script's User-Agent and decode the body as JSON.

    Returns:
        The decoded JSON value, or ``None`` if the request or the decoding
        failed (the error is printed).
    """
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            payload = response.read()
            return json.loads(payload)
    except Exception as exc:
        # Best effort: callers treat None as "no data available".
        print(f" HTTP error fetching {url}: {exc}")
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def get_wikidata_image(qid: str) -> str | None:
|
||||||
|
"""Query Wikidata SPARQL for P18 image filename."""
|
||||||
|
sparql = f"SELECT ?image WHERE {{ wd:{qid} wdt:P18 ?image }} LIMIT 1"
|
||||||
|
url = "https://query.wikidata.org/sparql?" + urllib.parse.urlencode({
|
||||||
|
"query": sparql, "format": "json"
|
||||||
|
})
|
||||||
|
data = fetch_json(url)
|
||||||
|
if not data:
|
||||||
|
return None
|
||||||
|
bindings = data.get("results", {}).get("bindings", [])
|
||||||
|
if not bindings:
|
||||||
|
return None
|
||||||
|
image_url = bindings[0]["image"]["value"]
|
||||||
|
# URL like http://commons.wikimedia.org/wiki/Special:FilePath/Filename.jpg
|
||||||
|
filename = urllib.parse.unquote(image_url.rsplit("/", 1)[-1])
|
||||||
|
return filename
|
||||||
|
|
||||||
|
|
||||||
|
def get_commons_info(filename: str) -> dict | None:
    """Get thumbnail URL, license and attribution for a Wikimedia Commons file.

    Args:
        filename: File name without the ``File:`` prefix.

    Returns:
        A dict with the keys ``thumb_url``, ``description_url``, ``license``,
        ``artist`` and ``filename``, or ``None`` when the request failed, the
        page id is ``"-1"``, or the page has no image info.
    """
    import html  # function-scope import: the file header does not import html

    url = "https://commons.wikimedia.org/w/api.php?" + urllib.parse.urlencode({
        "action": "query",
        "titles": f"File:{filename}",
        "prop": "imageinfo",
        "iiprop": "url|extmetadata",
        "iiurlwidth": str(THUMB_WIDTH),
        "format": "json",
    })
    data = fetch_json(url)
    if not data:
        return None

    pages = data.get("query", {}).get("pages", {})
    # Only the first page entry is inspected; every branch below returns.
    for page_id, page in pages.items():
        if page_id == "-1":
            return None
        imageinfo = page.get("imageinfo", [])
        if not imageinfo:
            return None
        info = imageinfo[0]
        meta = info.get("extmetadata", {})

        # Prefer the scaled thumbnail; fall back to the original file URL.
        thumb_url = info.get("thumburl") or info.get("url")
        desc_url = info.get("descriptionurl", "")

        license_short = meta.get("LicenseShortName", {}).get("value", "")
        artist_html = meta.get("Artist", {}).get("value", "")
        # Strip HTML tags from artist
        artist = re.sub(r'<[^>]+>', '', artist_html)
        # Decode entities such as &amp; or &nbsp; that tag stripping leaves
        # behind, so the stored attribution is plain text.
        artist = html.unescape(artist)
        # Clean up whitespace
        artist = re.sub(r'\s+', ' ', artist).strip()

        return {
            "thumb_url": thumb_url,
            "description_url": desc_url,
            "license": license_short,
            "artist": artist,
            "filename": filename,
        }
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def is_license_allowed(license_str: str) -> bool:
    """Check whether a license short name is acceptable for import.

    Accepted are exact entries of ``ALLOWED_LICENSES`` plus names that look
    like public domain, CC0, CC BY or CC BY-SA. Any name carrying a
    standalone ``NC`` or ``ND`` component is rejected.

    Args:
        license_str: License short name; case and surrounding whitespace are
            ignored.
    """
    normalized = license_str.lower().strip()
    # Direct match
    if normalized in ALLOWED_LICENSES:
        return True
    # Check for NC or ND as whole components only. A plain substring test
    # would also reject names that merely contain those letters, such as
    # "pd-india" or "pd-finland".
    if re.search(r'(?<![a-z])(?:nc|nd)(?![a-z])', normalized):
        return False
    # Check patterns
    if normalized.startswith(("public domain", "pd")):
        return True
    if re.match(r'^cc[- ]?by[- ]?sa[- ]?\d', normalized):
        return True
    if re.match(r'^cc[- ]?by[- ]?\d', normalized):
        return True
    if re.match(r'^cc[- ]?0', normalized) or normalized == "cc zero":
        return True
    return False
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_license(license_str: str) -> str:
    """Map a license short name onto a canonical label for storage.

    Public-domain variants become ``"Public domain"``, CC0 variants become
    ``"CC0 1.0"``, and CC BY / CC BY-SA names are rewritten as
    ``"CC BY <version>"`` / ``"CC BY-SA <version>"``. Anything else is
    returned unchanged.
    """
    key = license_str.lower().strip()

    if key.startswith("pd") or "public domain" in key:
        return "Public domain"

    looks_like_cc0 = re.match(r'^cc[- ]?0', key) is not None
    if looks_like_cc0 or "cc-zero" in key or "cc zero" in key:
        return "CC0 1.0"

    # BY-SA is tried before plain BY; the captured group is the version number.
    versioned_families = (
        (r'^cc[- ]?by[- ]?sa[- ]?(\d+\.?\d*)', "CC BY-SA"),
        (r'^cc[- ]?by[- ]?(\d+\.?\d*)', "CC BY"),
    )
    for pattern, label in versioned_families:
        found = re.match(pattern, key)
        if found:
            return f"{label} {found.group(1)}"

    return license_str
|
||||||
|
|
||||||
|
|
||||||
|
def s3_upload(s3_key: str, data: bytes, content_type: str = "image/jpeg"):
    """Upload *data* to the configured S3 bucket using the AWS CLI.

    The bytes are written to a private temporary file, copied with
    ``aws s3 cp`` against ``S3_ENDPOINT``, and the temporary file is removed
    afterwards whether or not the copy succeeded.

    Args:
        s3_key: Object key inside ``S3_BUCKET``.
        data: Raw object content.
        content_type: Value passed as ``--content-type``.

    Raises:
        RuntimeError: If the ``aws`` command exits with a non-zero status.
    """
    import tempfile  # function-scope import: the file header does not import tempfile

    # mkstemp creates a uniquely named file, so concurrent runs cannot clobber
    # each other and the path cannot be pre-created by another local user
    # (both were possible with a fixed name under /tmp).
    fd, tmp_path = tempfile.mkstemp(prefix="herbapi_upload_")
    try:
        with os.fdopen(fd, "wb") as f:
            f.write(data)

        env = os.environ.copy()
        env["AWS_ACCESS_KEY_ID"] = S3_ACCESS_KEY
        env["AWS_SECRET_ACCESS_KEY"] = S3_SECRET_KEY
        env["AWS_DEFAULT_REGION"] = S3_REGION

        result = subprocess.run(
            [
                "aws", "s3", "cp", tmp_path,
                f"s3://{S3_BUCKET}/{s3_key}",
                "--endpoint-url", S3_ENDPOINT,
                "--content-type", content_type,
            ],
            capture_output=True, text=True, env=env
        )
    finally:
        # Remove the temp file even if writing or launching the CLI raised.
        os.unlink(tmp_path)

    if result.returncode != 0:
        raise RuntimeError(f"S3 upload failed: {result.stderr.strip()}")
|
||||||
|
|
||||||
|
|
||||||
|
def download_image(url: str) -> bytes | None:
    """Fetch the raw bytes behind *url* using the script's User-Agent.

    Returns:
        The response body, or ``None`` if the download failed (the error is
        printed).
    """
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        with urllib.request.urlopen(request, timeout=60) as response:
            content = response.read()
            return content
    except Exception as exc:
        # Best effort: callers treat None as a failed download.
        print(f" Download error: {exc}")
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Import one Wikimedia Commons image for each species with a Wikidata QID.

    For every species that has no image row yet, this looks up the image on
    Wikidata, reads its Commons metadata, checks the license, downloads the
    thumbnail, uploads it to S3 and inserts a row into ``images``. A summary
    of all outcomes is printed at the end.
    """
    # 1. Get species
    species_output = psql(
        "SELECT id, name_scientific, wikidata_qid FROM species "
        "WHERE wikidata_qid IS NOT NULL AND wikidata_qid <> '' "
        "ORDER BY name_scientific"
    )
    if not species_output:
        print("No species with wikidata_qid found.")
        return

    # psql emits one "id|name|qid" record per line; anything else is ignored.
    species_list = [
        {"id": fields[0], "name": fields[1], "qid": fields[2]}
        for fields in (record.split("|") for record in species_output.split("\n"))
        if len(fields) == 3
    ]
    print(f"Found {len(species_list)} species with Wikidata QIDs.")

    # 2. Get existing images
    image_output = psql("SELECT entity_id FROM images WHERE entity_type = 'species'")
    existing = set()
    if image_output:
        existing = {record.strip() for record in image_output.split("\n") if record.strip()}
    print(f"Found {len(existing)} species that already have images.")

    tally = {
        "imported": 0,
        "existing": 0,
        "no_image": 0,
        "license": 0,
        "download": 0,
        "errors": 0,
    }
    content_types = {
        "jpg": "image/jpeg",
        "png": "image/png",
        "svg": "image/svg+xml",
        "gif": "image/gif",
    }
    total = len(species_list)

    for position, sp in enumerate(species_list, start=1):
        name, qid, sp_id = sp["name"], sp["qid"], sp["id"]
        slug = slugify(name)

        print(f"\n[{position}/{total}] {name} ({qid})")

        if sp_id in existing:
            print(" Already has image, skipping.")
            tally["existing"] += 1
            continue

        # Query Wikidata for image
        time.sleep(REQUEST_DELAY)
        filename = get_wikidata_image(qid)
        if not filename:
            print(" No image on Wikidata.")
            tally["no_image"] += 1
            continue

        # Get Commons info
        time.sleep(REQUEST_DELAY)
        info = get_commons_info(filename)
        if not info:
            print(f" Could not get Commons info for {filename}")
            tally["no_image"] += 1
            continue

        # Check license
        raw_license = info["license"]
        if not is_license_allowed(raw_license):
            print(f" License not allowed: {raw_license}")
            tally["license"] += 1
            continue

        norm_license = normalize_license(raw_license)
        artist = info["artist"]
        thumb_url = info["thumb_url"]
        desc_url = info["description_url"]

        print(f" License: {raw_license} -> {norm_license}")
        print(f" Artist: {artist[:80]}")
        print(f" Thumbnail: {thumb_url[:100]}...")

        # Download image
        time.sleep(REQUEST_DELAY)
        image_data = download_image(thumb_url)
        if not image_data:
            print(" Failed to download image.")
            tally["download"] += 1
            continue

        print(f" Downloaded {len(image_data)} bytes")

        # Determine file extension from URL; the first matching suffix wins
        # and anything unrecognised is stored as JPEG.
        lowered_url = thumb_url.lower()
        ext = "jpg"
        for candidate in ("png", "svg", "gif"):
            if f".{candidate}" in lowered_url:
                ext = candidate
                break

        s3_key = f"species/{slug}.{ext}"
        content_type = content_types.get(ext, "image/jpeg")

        # Upload to S3
        try:
            s3_upload(s3_key, image_data, content_type)
            print(f" Uploaded to s3://{S3_BUCKET}/{s3_key}")
        except RuntimeError as e:
            print(f" S3 upload failed: {e}")
            tally["errors"] += 1
            continue

        # Insert into database
        caption = f"Photo: {artist}" if artist else "Wikimedia Commons"
        # Escape single quotes for SQL
        caption_esc, desc_url_esc, norm_license_esc, s3_key_esc = (
            value.replace("'", "''")
            for value in (caption, desc_url, norm_license, s3_key)
        )

        insert_sql = (
            f"INSERT INTO images (id, entity_type, entity_id, s3_key, caption, source_url, license, is_primary) "
            f"VALUES (gen_random_uuid(), 'species', '{sp_id}', '{s3_key_esc}', "
            f"'{caption_esc}', '{desc_url_esc}', '{norm_license_esc}', true)"
        )

        # psql returns empty on success for INSERT
        psql(insert_sql)
        print(" Inserted into images table.")
        tally["imported"] += 1

    print("\n" + "=" * 60)
    print("DONE!")
    print(f" Imported: {tally['imported']}")
    print(f" Skipped (existing):{tally['existing']}")
    print(f" Skipped (no image):{tally['no_image']}")
    print(f" Skipped (license): {tally['license']}")
    print(f" Skipped (download):{tally['download']}")
    print(f" Errors: {tally['errors']}")
    print(f" Total processed: {total}")
|
||||||
|
|
||||||
|
|
||||||
|
# Run the import only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||||
@@ -0,0 +1,290 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Import CC-licensed plant images from Wikimedia Commons into HerbAPI."""
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import urllib.parse
|
||||||
|
import urllib.request
|
||||||
|
|
||||||
|
# Config
# NOTE(review): the fallback literals below are credentials committed to
# source control. Every value can be overridden through the matching
# HERBAPI_* environment variable; prefer that and rotate the committed values.
DB_HOST = os.environ.get("HERBAPI_DB_HOST", "10.31.3.90")
DB_USER = os.environ.get("HERBAPI_DB_USER", "herbapi")
DB_PASS = os.environ.get("HERBAPI_DB_PASS", "_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj")
DB_NAME = os.environ.get("HERBAPI_DB_NAME", "herbapi")
S3_BUCKET = os.environ.get("HERBAPI_S3_BUCKET", "herbapi")
S3_ENDPOINT = os.environ.get("HERBAPI_S3_ENDPOINT", "http://10.31.3.170:3900")
USER_AGENT = "HerbAPI/1.0 (https://herbapi.naturalised.at; florian.berthold@sub-net.at)"
REQUEST_DELAY = 0.3

# AWS env for subprocess calls
# Dedicated HERBAPI_S3_* names are used for the overrides so that unrelated
# AWS_* credentials already present in the environment are never picked up.
AWS_ENV = {
    **os.environ,
    "AWS_ACCESS_KEY_ID": os.environ.get(
        "HERBAPI_S3_ACCESS_KEY", "GK1a89859373a6ac56bf11958f"
    ),
    "AWS_SECRET_ACCESS_KEY": os.environ.get(
        "HERBAPI_S3_SECRET_KEY",
        "bea45a333b5c7b1efdd7466bdbcac54d8642fa19f0c617ca2fd64bd07951b899",
    ),
    "AWS_DEFAULT_REGION": os.environ.get("HERBAPI_S3_REGION", "garage"),
}

# Stats
# Outcome counters updated by process_species() and reported by main().
stats = {"total": 0, "imported": 0, "no_p18": 0, "bad_license": 0, "download_fail": 0, "upload_fail": 0, "errors": 0}
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_url(url):
    """Return the response body of *url* as bytes, sending the project User-Agent."""
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    with urllib.request.urlopen(request, timeout=30) as response:
        payload = response.read()
    return payload
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_json(url):
    """Download *url* via fetch_url() and decode the body as JSON."""
    body = fetch_url(url)
    return json.loads(body)
|
||||||
|
|
||||||
|
|
||||||
|
def psql(sql):
    """Run psql command and return output.

    The statement is executed with ``-t -A`` so the result is unaligned,
    header-less text (one record per line, fields separated by ``|``).

    Args:
        sql: SQL text passed to ``psql -c``.

    Returns:
        The stripped standard output of the psql process.

    Raises:
        RuntimeError: If psql exits with a non-zero status. Without this
            check a failed statement looked exactly like an empty result.
    """
    result = subprocess.run(
        [
            "psql", "-h", DB_HOST, "-U", DB_USER, DB_NAME,
            # Stop at the first failing statement instead of carrying on.
            "-v", "ON_ERROR_STOP=1",
            "-t", "-A", "-c", sql,
        ],
        capture_output=True, text=True,
        env={**os.environ, "PGPASSWORD": DB_PASS},
    )
    if result.returncode != 0:
        raise RuntimeError(
            f"psql failed (exit {result.returncode}): {result.stderr.strip()}"
        )
    return result.stdout.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def is_license_allowed(license_str):
    """Decide whether a Commons license short name may be imported.

    Accepted: CC0, Public Domain, CC BY (any version) and CC BY-SA (any
    version), e.g. 'CC BY-SA 3.0', 'CC BY 4.0', 'CC0', 'Public domain'.
    Rejected: anything carrying an NC or ND clause, and everything that is
    not recognised (GFDL, FAL, Copyrighted free use, empty values, ...).
    """
    if not license_str:
        return False

    normalized = license_str.lower().strip()
    words = normalized.split()

    # NC / ND clauses disqualify a license before anything else is considered.
    for clause in ("nc", "nd"):
        if clause in words or f"-{clause}" in normalized:
            return False

    # Public domain / CC0
    if normalized in ("cc0", "cc-zero", "cc0 1.0", "cc0 1.0 universal"):
        return True
    if "public domain" in normalized or normalized.startswith("pd"):
        return True

    # CC BY-SA and CC BY, any version and jurisdiction
    is_by_sa = re.match(r"cc\s+by-sa\b", normalized)
    is_by = re.match(r"cc\s+by\b", normalized)
    return bool(is_by_sa or is_by)
|
||||||
|
|
||||||
|
|
||||||
|
def get_wikidata_image(qid):
    """Look up the P18 (image) statement of a Wikidata item via SPARQL.

    Returns the URL-decoded last path segment of the first image value, or
    ``None`` when the item has no P18 statement.
    """
    query = f"SELECT ?image WHERE {{ wd:{qid} wdt:P18 ?image }} LIMIT 1"
    endpoint = f"https://query.wikidata.org/sparql?query={urllib.parse.quote(query)}&format=json"
    rows = fetch_json(endpoint).get("results", {}).get("bindings", [])
    if not rows:
        return None
    # Extract filename from commons URL
    last_segment = rows[0]["image"]["value"].split("/")[-1]
    return urllib.parse.unquote(last_segment)
|
||||||
|
|
||||||
|
|
||||||
|
def get_commons_info(filename):
    """Read license, artist and URLs for a Commons file from the MediaWiki API.

    Returns a dict with the keys ``license``, ``artist``, ``thumb_url``,
    ``desc_url`` and ``filename``, or ``None`` when the API reports no page
    for the file. Only the first page of the response is inspected.
    """
    title = f"File:{filename}"
    api_url = (
        f"https://commons.wikimedia.org/w/api.php?action=query"
        f"&titles={urllib.parse.quote(title)}"
        f"&prop=imageinfo&iiprop=url|extmetadata"
        f"&iiurlwidth=800&format=json"
    )
    pages = fetch_json(api_url).get("query", {}).get("pages", {})

    first_page = next(iter(pages.items()), None)
    if first_page is None:
        return None
    page_id, page = first_page
    # The API uses the pseudo id "-1" for titles it could not resolve.
    if page_id == "-1":
        return None

    imageinfo = page.get("imageinfo", [{}])[0]
    meta = imageinfo.get("extmetadata", {})

    # Artist arrives as HTML: drop tags, collapse whitespace, cap the length.
    artist = re.sub(r"<[^>]+>", "", meta.get("Artist", {}).get("value", "")).strip()
    artist = re.sub(r"\s+", " ", artist)
    if len(artist) > 120:
        artist = artist[:117] + "..."

    return {
        "license": meta.get("LicenseShortName", {}).get("value", "").strip(),
        "artist": artist,
        # Thumbnail rendered by the API at the requested width (iiurlwidth=800)
        "thumb_url": imageinfo.get("thumburl", ""),
        "desc_url": imageinfo.get("descriptionurl", ""),
        "filename": filename,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _sql_quote(value):
    """Return *value* as text with single quotes doubled for a SQL string literal."""
    return str(value).replace("'", "''")


def process_species(species_id, slug, name_sci, qid):
    """Process a single species: fetch image from Wikidata/Commons, upload to S3, insert to DB.

    Every outcome is recorded in the module-level ``stats`` counters.

    Args:
        species_id: Primary key of the species row; stored as ``entity_id``.
        slug: Species slug, used to build the S3 key ``species/<slug>.<ext>``.
        name_sci: Scientific name (not used by this function).
        qid: Wikidata item id whose P18 image is imported.

    Returns:
        True when the image was uploaded and the insert was issued, False
        for every skipped or failed species.
    """
    import tempfile  # local import: only this function stages files on disk

    stats["total"] += 1

    # Step 1: Get image filename from Wikidata
    try:
        filename = get_wikidata_image(qid)
    except Exception as e:
        print(f" ERROR querying Wikidata for {qid}: {e}")
        stats["errors"] += 1
        return False
    time.sleep(REQUEST_DELAY)

    if not filename:
        print(f" No P18 image for {qid}")
        stats["no_p18"] += 1
        return False

    # Step 2: Get Commons info (license, artist, thumb URL)
    try:
        info = get_commons_info(filename)
    except Exception as e:
        print(f" ERROR querying Commons for {filename}: {e}")
        stats["errors"] += 1
        return False
    time.sleep(REQUEST_DELAY)

    if not info:
        print(f" No Commons info for {filename}")
        stats["errors"] += 1
        return False

    # Step 3: Check license
    if not is_license_allowed(info["license"]):
        print(f" Bad license: {info['license']} for {filename}")
        stats["bad_license"] += 1
        return False

    # Step 4: Download thumbnail using API-provided URL
    thumb_url = info["thumb_url"]
    if not thumb_url:
        print(f" No thumbnail URL available for {filename}")
        stats["download_fail"] += 1
        return False

    # Determine file extension from the last path segment of the thumbnail URL
    thumb_basename = thumb_url.lower().split("?")[0].split("/")[-1]
    ext = "jpg"
    if ".png" in thumb_basename:
        ext = "png"
    elif ".gif" in thumb_basename:
        ext = "gif"

    try:
        img_data = fetch_url(thumb_url)
    except Exception as e:
        print(f" ERROR downloading {thumb_url}: {e}")
        stats["download_fail"] += 1
        return False
    time.sleep(REQUEST_DELAY)

    # Step 5: Upload to S3
    s3_key = f"species/{slug}.{ext}"
    tmp_path = None
    try:
        # A unique staging file keeps the slug out of the local path and is
        # always removed below, even when writing it fails half-way.
        with tempfile.NamedTemporaryFile(
            prefix="herbapi_img_", suffix=f".{ext}", delete=False
        ) as tmp:
            tmp_path = tmp.name
            tmp.write(img_data)

        result = subprocess.run(
            ["aws", "s3", "cp", tmp_path, f"s3://{S3_BUCKET}/{s3_key}", "--endpoint-url", S3_ENDPOINT],
            capture_output=True, text=True, env=AWS_ENV, timeout=60,
        )
        if result.returncode != 0:
            print(f" S3 upload failed: {result.stderr}")
            stats["upload_fail"] += 1
            return False
    except Exception as e:
        print(f" ERROR uploading to S3: {e}")
        stats["upload_fail"] += 1
        return False
    finally:
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass

    # Step 6: Insert into DB
    caption = f"Photo: {info['artist']}" if info["artist"] else ""
    source_url = info["desc_url"] or f"https://commons.wikimedia.org/wiki/File:{urllib.parse.quote(filename)}"

    # Every interpolated value is quoted, including the id and the S3 key,
    # so a quote character in any of them cannot break the statement.
    sql = (
        f"INSERT INTO images (entity_type, entity_id, s3_key, caption, source_url, license, is_primary) "
        f"VALUES ('species', '{_sql_quote(species_id)}', '{_sql_quote(s3_key)}', "
        f"'{_sql_quote(caption)}', '{_sql_quote(source_url)}', '{_sql_quote(info['license'])}', true);"
    )
    try:
        psql(sql)
    except Exception as e:
        print(f" ERROR inserting to DB: {e}")
        stats["errors"] += 1
        return False

    stats["imported"] += 1
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Import an image for every species that has a Wikidata QID but no image row.

    Species are read from the database, handed one by one to
    process_species(), and the ``stats`` counters are printed at the end.
    """
    # Get species without images
    rows = psql(
        "SELECT s.id, s.slug, s.name_scientific, s.wikidata_qid "
        "FROM species s "
        "LEFT JOIN images i ON i.entity_type = 'species' AND i.entity_id = s.id "
        "WHERE s.wikidata_qid IS NOT NULL AND s.wikidata_qid != '' AND i.id IS NULL "
        "ORDER BY s.name_scientific;"
    )
    if not rows:
        print("No species need images.")
        return

    # One "id|slug|name|qid" record per line; malformed lines are dropped.
    species_list = [
        fields
        for fields in (record.strip().split("|") for record in rows.split("\n"))
        if len(fields) == 4
    ]
    total = len(species_list)
    print(f"Processing {total} species...\n")

    for position, (sid, slug, name_sci, qid) in enumerate(species_list, 1):
        print(f"[{position}/{total}] {name_sci} ({qid})")
        if process_species(sid, slug, name_sci, qid):
            print(" OK - imported")

    print("\n" + "=" * 50)
    print("RESULTS:")
    report = (
        (" Total species processed: ", "total"),
        (" Successfully imported: ", "imported"),
        (" No P18 image: ", "no_p18"),
        (" Bad license (NC/ND/GFDL):", "bad_license"),
        (" Download failures: ", "download_fail"),
        (" Upload failures: ", "upload_fail"),
        (" Other errors: ", "errors"),
    )
    for label, counter in report:
        print(f"{label}{stats[counter]}")
|
||||||
|
|
||||||
|
|
||||||
|
# Run the import only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||||
@@ -0,0 +1,126 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Seed HerbAPI with common permaculture plant families and species via GBIF + API."""
|
||||||
|
import json, urllib.request, urllib.parse, time, sys
|
||||||
|
|
||||||
|
# HerbAPI endpoint and bearer token used by api_post().
# NOTE(review): the token is committed in clear text; consider loading it
# from the environment and rotating it.
API = "http://herbapi01.corp.sub-net.at:8080/api/v1"
TOKEN = "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"

# Base URL of the public GBIF API used for German vernacular names.
GBIF = "https://api.gbif.org/v1"
|
||||||
|
|
||||||
|
def api_post(path, data):
    """POST *data* as JSON to the HerbAPI endpoint at *path*.

    Args:
        path: Path appended to ``API``, e.g. ``"/families"``.
        data: JSON-serialisable request payload.

    Returns:
        The decoded JSON response, or ``None`` when the server answered with
        an HTTP error status (the status and the start of the error body are
        written to stderr).
    """
    # Imported explicitly: the handler below needs urllib.error, which the
    # file otherwise only gets as a side effect of importing urllib.request.
    import urllib.error

    req = urllib.request.Request(
        f"{API}{path}",
        data=json.dumps(data).encode(),
        headers={"Content-Type": "application/json", "Authorization": f"Bearer {TOKEN}"},
    )
    try:
        # The context manager closes the connection once the body is read.
        with urllib.request.urlopen(req) as resp:
            return json.loads(resp.read())
    except urllib.error.HTTPError as e:
        print(f" ERR {e.code}: {e.read().decode()[:120]}", file=sys.stderr)
        return None
|
||||||
|
|
||||||
|
def gbif_de_name(name):
    """Get German common name from GBIF.

    The scientific name is matched against the GBIF backbone and the first
    vernacular name tagged with language ``deu`` is returned.

    Args:
        name: Scientific name to look up.

    Returns:
        The German vernacular name, or ``None`` when GBIF has no match, no
        German name, or the lookup failed. The lookup is best effort:
        failures are reported on stderr and never raised.
    """
    match_url = f"{GBIF}/species/match?name={urllib.parse.quote(name)}"
    try:
        with urllib.request.urlopen(match_url, timeout=30) as resp:
            match = json.loads(resp.read())
        usage_key = match.get("usageKey")
        if not usage_key:
            return None

        names_url = f"{GBIF}/species/{usage_key}/vernacularNames?limit=100"
        with urllib.request.urlopen(names_url, timeout=30) as resp:
            data = json.loads(resp.read())
        for r in data.get("results", []):
            if r.get("language") == "deu":
                return r["vernacularName"]
    except Exception as e:
        # Still best effort, but KeyboardInterrupt/SystemExit now propagate
        # and the reason for a missing name is visible.
        print(f" WARN GBIF lookup failed for {name}: {e}", file=sys.stderr)
    return None
|
||||||
|
|
||||||
|
# Plant families to seed: (scientific name, German name, English name).
FAMILIES = [
    ("Fabaceae", "Hülsenfrüchtler", "Legumes"),
    ("Rosaceae", "Rosengewächse", "Rose family"),
    ("Brassicaceae", "Kreuzblütler", "Cabbage family"),
    ("Apiaceae", "Doldenblütler", "Carrot family"),
    ("Lamiaceae", "Lippenblütler", "Mint family"),
    ("Asteraceae", "Korbblütler", "Daisy family"),
    ("Solanaceae", "Nachtschattengewächse", "Nightshade family"),
    ("Cucurbitaceae", "Kürbisgewächse", "Gourd family"),
    ("Poaceae", "Süßgräser", "Grass family"),
    ("Amaryllidaceae", "Amaryllisgewächse", "Amaryllis family"),
    ("Boraginaceae", "Raublattgewächse", "Borage family"),
    ("Adoxaceae", "Moschuskrautgewächse", "Moschatel family"),
    ("Betulaceae", "Birkengewächse", "Birch family"),
    ("Fagaceae", "Buchengewächse", "Beech family"),
    ("Juglandaceae", "Walnussgewächse", "Walnut family"),
    ("Caprifoliaceae", "Geißblattgewächse", "Honeysuckle family"),
    ("Grossulariaceae", "Stachelbeergewächse", "Gooseberry family"),
    ("Ericaceae", "Heidekrautgewächse", "Heath family"),
    ("Moraceae", "Maulbeergewächse", "Mulberry family"),
    ("Urticaceae", "Brennnesselgewächse", "Nettle family"),
    ("Malvaceae", "Malvengewächse", "Mallow family"),
    ("Polygonaceae", "Knöterichgewächse", "Buckwheat family"),
    ("Chenopodiaceae", "Gänsefußgewächse", "Goosefoot family"),
    ("Asparagaceae", "Spargelgewächse", "Asparagus family"),
    ("Plantaginaceae", "Wegerichgewächse", "Plantain family"),
]
|
||||||
|
|
||||||
|
# Species to seed: (scientific name, family scientific name, extra API fields).
SPECIES = [
    ("Sambucus nigra", "Adoxaceae", {"plant_layer": "understory", "nitrogen_fixer": False, "food_uses": "Flowers (cordial, fritters), berries (cooked — syrup, wine)", "medicinal_uses": "Cold/flu remedy, immune support, diaphoretic", "succession_stage": "secondary"}),
    ("Symphytum officinale", "Boraginaceae", {"plant_layer": "herbaceous", "dynamic_accumulator": True, "food_uses": "Young leaves (limited, contains pyrrolizidine alkaloids)", "medicinal_uses": "Wound healing, bone knitting (external only)", "other_uses": "Dynamic accumulator, mulch/compost activator, animal fodder"}),
    ("Trifolium pratense", "Fabaceae", {"plant_layer": "ground_cover", "nitrogen_fixer": True, "food_uses": "Flowers, young leaves", "medicinal_uses": "Respiratory, menopausal symptoms", "other_uses": "Green manure, nitrogen fixer, bee forage"}),
    ("Corylus avellana", "Betulaceae", {"plant_layer": "shrub", "food_uses": "Nuts", "other_uses": "Coppice wood, hedging, wildlife habitat", "succession_stage": "secondary"}),
    ("Ribes nigrum", "Grossulariaceae", {"plant_layer": "shrub", "food_uses": "Berries, leaves (tea)", "medicinal_uses": "High vitamin C, anti-inflammatory"}),
    ("Rubus idaeus", "Rosaceae", {"plant_layer": "shrub", "food_uses": "Berries, leaves (tea)", "medicinal_uses": "Leaf tea for pregnancy/digestion", "succession_stage": "pioneer"}),
    ("Urtica dioica", "Urticaceae", {"plant_layer": "herbaceous", "dynamic_accumulator": True, "food_uses": "Young leaves, seeds", "medicinal_uses": "Anti-inflammatory, prostate, allergies", "other_uses": "Compost activator, fibre, liquid fertiliser"}),
    ("Borago officinalis", "Boraginaceae", {"plant_layer": "herbaceous", "food_uses": "Flowers, young leaves", "other_uses": "Bee forage, companion plant", "attracts_pollinators": True}),
    ("Lavandula angustifolia", "Lamiaceae", {"plant_layer": "herbaceous", "food_uses": "Flowers", "medicinal_uses": "Calming, antiseptic, sleep aid", "other_uses": "Bee forage, pest repellent, fragrance", "attracts_pollinators": True}),
    ("Malus domestica", "Rosaceae", {"plant_layer": "canopy", "food_uses": "Fruit", "pollination_type": "Insect-pollinated"}),
    ("Prunus domestica", "Rosaceae", {"plant_layer": "canopy", "food_uses": "Fruit", "pollination_type": "Insect-pollinated"}),
    ("Juglans regia", "Juglandaceae", {"plant_layer": "canopy", "food_uses": "Nuts", "other_uses": "Timber, dye", "allelopathic": True}),
    ("Fragaria vesca", "Rosaceae", {"plant_layer": "ground_cover", "food_uses": "Berries, leaves (tea)", "ground_cover_quality": "Good"}),
    ("Allium ursinum", "Amaryllidaceae", {"plant_layer": "ground_cover", "food_uses": "Leaves, flowers, bulbs", "medicinal_uses": "Antimicrobial, blood pressure"}),
    ("Phacelia tanacetifolia", "Boraginaceae", {"plant_layer": "herbaceous", "other_uses": "Green manure, bee forage, cover crop", "attracts_pollinators": True}),
    ("Lupinus polyphyllus", "Fabaceae", {"plant_layer": "herbaceous", "nitrogen_fixer": True, "other_uses": "Nitrogen fixer, green manure, ornamental"}),
    ("Vicia faba", "Fabaceae", {"plant_layer": "herbaceous", "nitrogen_fixer": True, "food_uses": "Beans", "other_uses": "Nitrogen fixer, green manure"}),
    ("Solanum lycopersicum", "Solanaceae", {"plant_layer": "herbaceous", "food_uses": "Fruit"}),
    ("Cucurbita pepo", "Cucurbitaceae", {"plant_layer": "ground_cover", "food_uses": "Fruit, seeds, flowers"}),
    ("Beta vulgaris", "Chenopodiaceae", {"plant_layer": "herbaceous", "food_uses": "Roots, leaves"}),
    ("Daucus carota", "Apiaceae", {"plant_layer": "herbaceous", "food_uses": "Root"}),
    ("Calendula officinalis", "Asteraceae", {"plant_layer": "herbaceous", "food_uses": "Flowers", "medicinal_uses": "Wound healing, anti-inflammatory, skin care", "other_uses": "Companion plant, pest deterrent", "attracts_pollinators": True}),
    ("Melissa officinalis", "Lamiaceae", {"plant_layer": "herbaceous", "food_uses": "Leaves", "medicinal_uses": "Calming, antiviral, digestive", "attracts_pollinators": True}),
    ("Salvia officinalis", "Lamiaceae", {"plant_layer": "herbaceous", "food_uses": "Leaves", "medicinal_uses": "Sore throat, digestive, antimicrobial"}),
    ("Thymus vulgaris", "Lamiaceae", {"plant_layer": "ground_cover", "food_uses": "Leaves", "medicinal_uses": "Respiratory, antimicrobial, cough"}),
]
|
||||||
|
|
||||||
|
# Create families
# Only families the API accepted end up in family_map (scientific name -> id).
print("=== Creating families ===")
family_map = {}
for sci, de, en in FAMILIES:
    r = api_post("/families", {"name_scientific": sci, "name_de": de, "name_en": en})
    if r:
        family_map[sci] = r["id"]
        print(f" ✓ {sci}")
    time.sleep(0.05)
print(f"Created {len(family_map)} families\n")

# Create species
# A species is skipped when its family was not created above.
print("=== Creating species (with GBIF German names) ===")
created = 0
for sci_name, family_sci, extra in SPECIES:
    fam_id = family_map.get(family_sci)
    if not fam_id:
        print(f" ✗ {sci_name} — family {family_sci} missing")
        continue
    de_name = gbif_de_name(sci_name)
    data = {
        "name_scientific": sci_name,
        "name_de": de_name or "",
        "name_en": "",
        "family_id": fam_id,
        **extra,
    }
    r = api_post("/species", data)
    if r:
        created += 1
        print(f" ✓ {sci_name} → {de_name or '(no DE name)'}")
    time.sleep(0.15)
print(f"Created {created} species\n")

# Create suppliers
print("=== Creating suppliers ===")
for name, url, country, organic, demeter, notes in [
    ("Reinsaat", "https://www.reinsaat.at", "AT", True, True, "Austrian biodynamic seed producer, open-pollinated varieties"),
    ("Magic Garden Seeds", "https://www.magicgardenseeds.com", "DE", False, False, "Specialist seed shop with rare and heritage varieties"),
]:
    r = api_post(
        "/suppliers",
        {
            "name": name,
            "url": url,
            "country": country,
            "is_organic": organic,
            "is_demeter": demeter,
            "notes": notes,
        },
    )
    if r:
        print(f" ✓ {name}")
print("\nDone!")
|
||||||
@@ -0,0 +1,514 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Scrape Arche Noah seed catalog and import cultivars into HerbAPI.
|
||||||
|
|
||||||
|
Uses the shop.arche-noah.at Angular SPA's backend API (ACM) to fetch
|
||||||
|
product listings and details, then creates cultivars in HerbAPI matched
|
||||||
|
to existing species.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
import urllib.request
|
||||||
|
import urllib.error
|
||||||
|
import urllib.parse
|
||||||
|
import sys
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
# --- Configuration -----------------------------------------------------------

# HerbAPI endpoint and bearer token.
# NOTE(review): the token is committed in clear text; consider loading it
# from the environment and rotating it.
HERBAPI_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
HERBAPI_TOKEN = "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"

# Backend API of the Arche Noah web shop and the browser User-Agent sent to it.
SHOP_BASE = "https://shop.arche-noah.at/ACM/api/"
SHOP_UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

REQUEST_DELAY = 0.5  # seconds between requests

# Only import products from these Arche Noah article lines (their own seeds)
ARCHE_NOAH_LINES = {
    "Bio-Saatgut von ARCHE NOAH",
    "Kostbarkeiten aus dem ARCHE NOAH Samenarchiv",
}
|
||||||
|
|
||||||
|
# Search terms to discover all seed products across the shop.
# Each term appears exactly once ("Bohne" and "Mangold" used to be listed
# twice), so no term is listed, and therefore searched, more than once.
SEARCH_TERMS = [
    "Tomate", "Paradeiser", "Paprika", "Chili", "Gurke", "Kürbis", "Zucchini",
    "Bohne", "Erbse", "Fisole", "Salat", "Kohl", "Kraut", "Melanzani", "Aubergine",
    "Mais", "Zwiebel", "Lauch", "Karotte", "Rübe", "Basilikum", "Kräuter",
    "Blume", "Sonnenblume", "Dill", "Petersilie", "Spinat", "Mangold",
    "Melone", "Fenchel", "Sellerie", "Rettich", "Radieschen",
    "Koriander", "Oregano", "Thymian", "Salbei", "Rosmarin", "Minze",
    "Ringelblume", "Kornblume", "Kapuzinerkresse", "Senf",
    "Erdbeere", "Lupine", "Luzerne", "Klee", "Mohn",
    "Radicchio", "Rucola", "Endivie", "Artischocke", "Pastinake",
    "Schnittlauch", "Knoblauch", "Bärlauch", "Wermut",
    "Baldrian", "Johanniskraut", "Sonnenhut", "Beinwell",
    "Studentenblume", "Tagetes", "Phacelia", "Buchweizen",
    "Rote Bete", "Rote Rübe", "Melde",
    "Kohlrabi", "Brokkoli", "Blumenkohl", "Rosenkohl", "Wirsing",
    "Pflücksalat", "Kopfsalat", "Feldsalat", "Asiasalat",
    "Zuckermais", "Popcorn",
]
|
||||||
|
|
||||||
|
# --- Helpers -----------------------------------------------------------------
|
||||||
|
|
||||||
|
def herbapi_request(method, path, data=None):
    """Make a request to HerbAPI.

    Args:
        method: HTTP method, e.g. ``"GET"`` or ``"POST"``.
        path: Path appended to ``HERBAPI_BASE`` after a ``/``.
        data: Optional JSON-serialisable payload. Anything other than
            ``None`` is sent, including empty containers such as ``{}``.

    Returns:
        The decoded JSON response, or ``None`` for an empty response body.

    Raises:
        urllib.error.HTTPError: Re-raised after the status and the start of
            the error body have been written to stderr.
    """
    url = f"{HERBAPI_BASE}/{path}"
    # Test against None so an intentionally empty payload is not dropped.
    payload = json.dumps(data).encode() if data is not None else None
    req = urllib.request.Request(url, data=payload, method=method, headers={
        "Authorization": f"Bearer {HERBAPI_TOKEN}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    })
    try:
        # The context manager closes the connection once the body is read.
        with urllib.request.urlopen(req, timeout=30) as resp:
            raw = resp.read().decode("utf-8")
    except urllib.error.HTTPError as e:
        detail = e.read().decode("utf-8", errors="replace")
        print(f" HerbAPI {method} {path}: HTTP {e.code} - {detail[:200]}", file=sys.stderr)
        raise
    return json.loads(raw) if raw.strip() else None
|
||||||
|
|
||||||
|
|
||||||
|
def shop_create_session():
    """Create an anonymous session on the Arche Noah shop.

    Returns:
        The ``JSESSIONID`` value issued by the shop.

    Raises:
        RuntimeError: If no response cookie carries a non-empty JSESSIONID.
    """
    req = urllib.request.Request(
        SHOP_BASE + "webshop/createanonymoususer",
        data=json.dumps({}).encode(),
        headers={
            "User-Agent": SHOP_UA,
            "Content-Type": "application/json",
            "Origin": "https://shop.arche-noah.at",
            "Referer": "https://shop.arche-noah.at/",
        },
    )
    with urllib.request.urlopen(req, timeout=15) as resp:
        # A response may carry several Set-Cookie headers; look at all of
        # them instead of only the first one.
        cookie_headers = resp.headers.get_all("Set-Cookie") or []

    for cookie in cookie_headers:
        if "JSESSIONID=" in cookie:
            session = cookie.split("JSESSIONID=")[1].split(";")[0]
            if session:
                return session
    raise RuntimeError("Failed to get shop session")
|
||||||
|
|
||||||
|
|
||||||
|
def shop_request(session, endpoint, payload):
    """Make a POST request to the shop API.

    Args:
        session: ``JSESSIONID`` value sent back as a cookie.
        endpoint: Path appended to ``SHOP_BASE``.
        payload: JSON-serialisable request body.

    Returns:
        The decoded JSON response, or ``None`` for an empty response body.
    """
    req = urllib.request.Request(
        SHOP_BASE + endpoint,
        data=json.dumps(payload).encode(),
        headers={
            "User-Agent": SHOP_UA,
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Cookie": f"JSESSIONID={session}",
            "Origin": "https://shop.arche-noah.at",
            "Referer": "https://shop.arche-noah.at/",
        },
    )
    # The context manager closes the connection once the body is read.
    with urllib.request.urlopen(req, timeout=30) as resp:
        raw = resp.read().decode("utf-8")
    return json.loads(raw) if raw.strip() else None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_latin_name(detail_headline3):
    """Pull the botanical name out of a product's ``headline3`` detail field.

    HTML tags and everything from "Hier geht" onwards are dropped. The
    remainder is returned only when it starts like a binomial (capitalised
    genus, a space, then a lower-case word); otherwise ``None`` is returned.
    """
    if not detail_headline3:
        return None

    without_tags = re.sub(r"<[^>]+>", "", detail_headline3).strip()
    # Cut off the trailing "Hier geht es zu unseren..." text
    candidate = without_tags.split("Hier geht")[0].strip()

    # Expect something like "Solanum lycopersicum" or "Capsicum annuum"
    if not candidate:
        return None
    if re.match(r"^[A-Z][a-z]+ [a-z]", candidate) is None:
        return None
    return candidate
|
||||||
|
|
||||||
|
|
||||||
|
def match_species(latin_name, species_by_scientific):
    """Look up a species for a Latin name, tolerating rank suffixes.

    The full lowercased name is tried first. If that misses, only the leading
    "genus species" pair is tried, so "Phaseolus vulgaris var. nanus" resolves
    to the entry for "phaseolus vulgaris". Returns the mapped species or
    ``None``.
    """
    if not latin_name:
        return None

    normalized = latin_name.strip().lower()

    # Candidate keys in priority order: exact name, then the bare binomial.
    candidate_keys = [normalized]
    binomial = re.match(r"^([A-Za-z]+ [a-z]+)", normalized)
    if binomial:
        candidate_keys.append(binomial.group(1).strip())

    for key in candidate_keys:
        hit = species_by_scientific.get(key)
        if hit:
            return hit
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_cultivar_name(product_name):
    """Return the cultivar name contained in a shop product name.

    A quoted segment wins when present (straight, typographic, backtick or
    acute-accent quotes):

        "Salatparadeiser 'Naama' HG026"                -> "Naama"
        "Cocktailparadeiser 'Golden Perfection' TO019" -> "Golden Perfection"

    Without quotes, the product name is returned with only a trailing
    article number (e.g. HG026, TO019) removed.
    """
    quoted = re.search(r"['\u2018\u2019`\u00b4]+([^'\u2018\u2019`\u00b4]+)['\u2018\u2019`\u00b4]+", product_name)
    if quoted is not None:
        return quoted.group(1).strip()

    # No quoted part: keep the whole name; the crop-type prefix is not stripped.
    return re.sub(r"\s+[A-Z]{1,3}\d{2,4}\s*$", "", product_name).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def parse_pack_info(unit_desc):
    """Parse pack size info from a ``unitDesc`` such as '20-30 Korn' or '2g'.

    For a range ("20-30 Korn", "20 - 30 Korn") the lower bound is used. A
    German decimal comma ("0,5 g") is accepted.

    Args:
        unit_desc: Free-text pack description from the shop, or ``None``.

    Returns:
        ``(pack_size, pack_unit)`` with ``pack_size`` as a float, or
        ``(None, None)`` when no "<number> <unit>" shape is found.
    """
    if not unit_desc:
        return None, None
    # The unit must start with a letter. Allowing digits there lets the regex
    # backtrack and split a number in two ("100" -> size 10, unit "0").
    m = re.match(
        r"\s*(\d+(?:,\d+)?)(?:\s*[-\u2013]\s*\d+(?:,\d+)?)?\s*([^\W\d_]\w*)",
        unit_desc,
    )
    if m:
        return float(m.group(1).replace(",", ".")), m.group(2)
    return None, None
|
||||||
|
|
||||||
|
|
||||||
|
# --- Main scraping logic -----------------------------------------------------
|
||||||
|
|
||||||
|
def fetch_all_arche_noah_products(session):
    """Search the shop API to find all Arche Noah seed products.

    Every distinct term in ``SEARCH_TERMS`` (compared case-insensitively) is
    queried page by page. Results are merged by product ``sid`` and then
    filtered to products whose ``articleLineDesc`` is in ``ARCHE_NOAH_LINES``.

    Args:
        session: Shop session id as accepted by ``shop_request``.

    Returns:
        Dict mapping product ``sid`` to the product dict from the search.
    """
    page_size = 200
    all_products = {}
    seen_terms = set()

    for term in SEARCH_TERMS:
        term_key = term.lower()
        if term_key in seen_terms:
            continue
        seen_terms.add(term_key)

        offset = 0
        term_sids = set()  # sids already returned for this term
        while True:
            payload = {
                "searchCriteria": term,
                "startIndex": offset,
                "numDataSets": page_size,
                "allowAllProducts": False,
            }
            try:
                data = shop_request(session, "webshop/getproducts", payload)
            except Exception as e:
                # Best effort: give up on this term, continue with the next.
                print(f" Search '{term}' offset={offset} failed: {e}", file=sys.stderr)
                break

            if not data:
                break

            page_sids = set()
            for p in data:
                page_sids.add(p["sid"])
                # First occurrence of a sid wins across all terms.
                all_products.setdefault(p["sid"], p)

            if len(data) < page_size:
                break
            if page_sids <= term_sids:
                # A full page with nothing new for this term means the server
                # is not advancing; stop instead of looping forever.
                print(f" Search '{term}' offset={offset} repeated a page, stopping", file=sys.stderr)
                break
            term_sids |= page_sids
            offset += len(data)
            time.sleep(REQUEST_DELAY)

        time.sleep(REQUEST_DELAY)

    # Filter to Arche Noah's own seed products only
    an_products = {
        sid: p for sid, p in all_products.items()
        if (p.get("articleLineDesc") or "") in ARCHE_NOAH_LINES
    }

    print(f"Found {len(all_products)} total products, {len(an_products)} Arche Noah seed products")
    return an_products
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_product_details(session, products):
    """Request the detail record (source of Latin names) for every product.

    Failed requests are reported on stderr and skipped; empty responses are
    not stored. Returns a dict mapping product sid to its detail payload.
    """
    total = len(products)
    details = {}

    for position, sid in enumerate(products, start=1):
        try:
            detail = shop_request(session, "webshop/getproductdetail", {"productSid": sid})
        except Exception as e:
            print(f" Detail for {sid} failed: {e}", file=sys.stderr)
        else:
            if detail:
                details[sid] = detail

        # Progress line every 20 products.
        if position % 20 == 0:
            print(f" Fetched details: {position}/{total}")
        time.sleep(REQUEST_DELAY)

    print(f"Fetched {len(details)} product details")
    return details
|
||||||
|
|
||||||
|
|
||||||
|
def _herbapi_fetch_all(resource, per_page=100):
    """Collect every record of a HerbAPI list endpoint, following pagination.

    Handles two response shapes: a dict with a ``data`` list (optionally with
    ``total``) or a bare list. Any other shape ends the loop.

    Args:
        resource: Endpoint name, e.g. ``"species"``.
        per_page: Page size requested from the API.

    Returns:
        List with the records of all pages.
    """
    records = []
    page = 1
    while True:
        result = herbapi_request("GET", f"{resource}?per_page={per_page}&page={page}")
        if isinstance(result, dict) and "data" in result:
            data = result["data"]
            total = result.get("total")
        elif isinstance(result, list):
            # A bare list is treated as the complete result.
            data = result
            total = len(data)
        else:
            break

        records.extend(data)
        if not data:
            break
        if total is not None:
            if len(records) >= total:
                break
        elif len(data) < per_page:
            # No total reported: a short page marks the end. Defaulting the
            # total to 0 would stop after page 1 and drop later records.
            break
        page += 1
    return records


def load_herbapi_species():
    """Load all species from HerbAPI and build lookup maps (handles pagination).

    Returns:
        ``(species_list, by_scientific)`` where ``by_scientific`` maps the
        stripped, lowercased ``name_scientific`` to the species record.
    """
    species_list = _herbapi_fetch_all("species")

    # Build lookup by scientific name (normalized lowercase)
    by_scientific = {s["name_scientific"].strip().lower(): s for s in species_list}
    return species_list, by_scientific


def load_herbapi_cultivars():
    """Load all existing cultivars from HerbAPI (handles pagination, max 100/page).

    Returns:
        ``(all_cultivars, by_key)`` where ``by_key`` maps
        ``(species_id, stripped lowercased name)`` to the cultivar record.
    """
    all_cultivars = _herbapi_fetch_all("cultivars")

    # Build lookup by (species_id, normalized cultivar name)
    by_key = {(c["species_id"], c["name"].strip().lower()): c for c in all_cultivars}
    return all_cultivars, by_key
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_supplier():
    """Return the id of the Arche Noah supplier, creating it when missing.

    An existing supplier is recognised by a name containing both "arche" and
    "noah" (case-insensitive).
    """
    listing = herbapi_request("GET", "suppliers")
    if isinstance(listing, dict) and "data" in listing:
        listing = listing["data"]

    for supplier in listing:
        lowered = supplier["name"].lower()
        if "arche" in lowered and "noah" in lowered:
            print(f"Supplier 'Arche Noah' already exists: {supplier['id']}")
            return supplier["id"]

    print("Creating supplier 'Arche Noah'...")
    new_supplier = {
        "name": "Arche Noah",
        "url": "https://www.arche-noah.at",
        "country": "AT",
        "is_organic": True,
        "is_demeter": False,
        "notes": "Austrian society for heritage seed preservation and biodiversity",
    }
    created = herbapi_request("POST", "suppliers", new_supplier)
    print(f"Created supplier: {created['id']}")
    return created["id"]
|
||||||
|
|
||||||
|
|
||||||
|
def load_existing_supplier_links(cultivar_id):
    """Load existing supplier links for a cultivar.

    Best effort: when the request fails, the error is reported on stderr and
    an empty list is returned. The caller then treats the cultivar as not yet
    linked, so a failure here can lead to a duplicate link attempt.

    Args:
        cultivar_id: HerbAPI id of the cultivar.

    Returns:
        List of link records; empty on failure or on an unexpected shape.
    """
    try:
        result = herbapi_request("GET", f"cultivars/{cultivar_id}/suppliers")
    except Exception as e:
        # Keep going, but make the failure visible instead of hiding it.
        print(f" WARN: could not load supplier links for cultivar {cultivar_id}: {e}", file=sys.stderr)
        return []

    if isinstance(result, list):
        return result
    if isinstance(result, dict) and "data" in result:
        return result["data"]
    return []
|
||||||
|
|
||||||
|
|
||||||
|
def _product_url(product, detail):
    """Return the shop URL built from the product alias, or None without one."""
    alias = product.get("alias") or detail.get("alias", "")
    return f"https://shop.arche-noah.at/produkt/{alias}" if alias else None


def main():
    """Scrape the Arche Noah shop and import its cultivars into HerbAPI.

    Steps: ensure the supplier exists, load species and cultivars from
    HerbAPI, scrape the shop catalog and product details, then create the
    missing cultivars and supplier links. Cultivars are matched by
    (species id, lowercased name) before creating, and existing supplier
    links are looked up before linking. A summary is printed at the end.
    """
    now_str = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    print("=== Arche Noah Seed Catalog Scraper ===")
    print(f"Started at {now_str}\n")

    # Step 1: Create Arche Noah supplier in HerbAPI
    print("[1/6] Ensuring Arche Noah supplier exists...")
    supplier_id = ensure_supplier()
    print()

    # Step 2: Load HerbAPI species for matching
    print("[2/6] Loading HerbAPI species...")
    species_list, species_by_scientific = load_herbapi_species()
    print(f"Loaded {len(species_list)} species")
    print()

    # Step 3: Load existing cultivars for idempotency
    print("[3/6] Loading existing cultivars...")
    existing_cultivars, cultivars_by_key = load_herbapi_cultivars()
    print(f"Loaded {len(existing_cultivars)} existing cultivars")
    print()

    # Step 4: Scrape Arche Noah shop
    print("[4/6] Scraping Arche Noah shop catalog...")
    session = shop_create_session()
    print("Got shop session")
    products = fetch_all_arche_noah_products(session)
    print()

    # Step 5: Fetch product details (to get Latin names)
    print("[5/6] Fetching product details for Latin name matching...")
    details = fetch_product_details(session, products)
    print()

    # Step 6: Create cultivars in HerbAPI
    print("[6/6] Creating cultivars in HerbAPI...")
    stats = {
        "created": 0,
        "skipped_existing": 0,
        "skipped_no_species": 0,
        "skipped_no_name": 0,
        "supplier_linked": 0,
        "supplier_link_existed": 0,
        "errors": 0,
    }

    for sid, product in sorted(products.items()):
        detail = details.get(sid, {})
        product_url = _product_url(product, detail)

        # Match to HerbAPI species (handles subspecies/variety suffixes).
        # Without a Latin name there is no fallback; the product is skipped.
        latin_name = extract_latin_name(detail.get("detailHeadline3", ""))
        species = match_species(latin_name, species_by_scientific)
        if not species:
            print(f" SKIP (no species match): {product['name']} | latin={latin_name}")
            stats["skipped_no_species"] += 1
            continue

        # Extract cultivar name
        cultivar_name = extract_cultivar_name(product["name"])
        if not cultivar_name:
            print(f" SKIP (no cultivar name): {product['name']}")
            # Counted separately so the "no species match" total stays accurate.
            stats["skipped_no_name"] += 1
            continue

        # Check if cultivar already exists (idempotency)
        lookup_key = (species["id"], cultivar_name.strip().lower())
        existing = cultivars_by_key.get(lookup_key)

        if existing:
            cultivar_id = existing["id"]
            stats["skipped_existing"] += 1
        else:
            cultivar_data = {
                "species_id": species["id"],
                "name": cultivar_name,
                "name_de": cultivar_name,
                "is_organic": product.get("articleLineDesc") == "Bio-Saatgut von ARCHE NOAH",
                "source_urls": [product_url] if product_url else None,
            }
            try:
                result = herbapi_request("POST", "cultivars", cultivar_data)
                cultivar_id = result["id"]
                stats["created"] += 1
                # Add to lookup for idempotency within this run
                cultivars_by_key[lookup_key] = result
                print(f" CREATED: {cultivar_name} ({species['name_scientific']})")
            except Exception as e:
                print(f" ERROR creating '{cultivar_name}': {e}", file=sys.stderr)
                stats["errors"] += 1
                continue

        # Link cultivar to supplier
        existing_links = load_existing_supplier_links(cultivar_id)
        already_linked = any(
            link.get("supplier_id") == supplier_id for link in existing_links
        )

        if already_linked:
            stats["supplier_link_existed"] += 1
        else:
            unit_desc = product.get("unitDesc") or detail.get("unitDesc", "")
            pack_size, pack_unit = parse_pack_info(unit_desc)

            # Price of the first price-list position, when there is one.
            price = None
            price_list = product.get("priceListPos") or detail.get("priceListPos", [])
            if price_list:
                price = price_list[0].get("singleUnitPrice")

            link_data = {
                "supplier_id": supplier_id,
                "article_number": str(product.get("articleNr", "")),
                "product_url": product_url,
                "price_eur": price,
                "pack_size": pack_size,
                "pack_unit": pack_unit,
            }
            try:
                herbapi_request("POST", f"cultivars/{cultivar_id}/suppliers", link_data)
                stats["supplier_linked"] += 1
            except Exception as e:
                print(f" ERROR linking supplier for '{cultivar_name}': {e}", file=sys.stderr)
                stats["errors"] += 1

        time.sleep(0.1)  # small delay between HerbAPI calls

    # Summary
    print(f"\n{'='*60}")
    print("Scraping complete!")
    print(f" Cultivars created: {stats['created']}")
    print(f" Cultivars already existed: {stats['skipped_existing']}")
    print(f" Skipped (no species match): {stats['skipped_no_species']}")
    print(f" Skipped (no cultivar name): {stats['skipped_no_name']}")
    print(f" Supplier links created: {stats['supplier_linked']}")
    print(f" Supplier links existed: {stats['supplier_link_existed']}")
    print(f" Errors: {stats['errors']}")


if __name__ == "__main__":
    main()
|
||||||
@@ -0,0 +1,843 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Scraper for Bingenheimer Saatgut (https://www.bingenheimersaatgut.de/)
|
||||||
|
Extracts cultivar data and imports into HerbAPI.
|
||||||
|
|
||||||
|
Categories scraped: Gemüse (vegetables), Kräuter (herbs), Gründüngung (green manure).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
import os
import re
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from html.parser import HTMLParser
from typing import Optional
|
||||||
|
|
||||||
|
# ── Configuration ─────────────────────────────────────────────────────────
|
||||||
|
# HerbAPI endpoint and credentials. Both can be overridden through the
# environment (HERBAPI_BASE / HERBAPI_TOKEN); the literals are only fallbacks
# so existing invocations keep working.
API_BASE = os.environ.get("HERBAPI_BASE", "http://herbapi01.corp.sub-net.at:8080/api/v1")
# NOTE(review): this fallback token is committed in plain text. Rotate it and
# drop the default once every caller sets HERBAPI_TOKEN.
API_TOKEN = os.environ.get("HERBAPI_TOKEN", "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk")
SITE_BASE = "https://www.bingenheimersaatgut.de"
DELAY = 0.5
USER_AGENT = "HerbAPI-Scraper/1.0 (+https://sub-net.at)"
|
||||||
|
|
||||||
|
# ── Category URLs to scrape ───────────────────────────────────────────────
|
||||||
|
# (url_path, default_species_scientific_name)
|
||||||
|
|
||||||
|
# Each entry pairs a category path with the scientific name used as the
# default species for products in that category.
VEGETABLE_CATEGORIES = [
    # Fruiting vegetables
    ("gemuese/tomaten", "Solanum lycopersicum"),
    ("gemuese/gurken/gewuerzgurke", "Cucumis sativus"),
    ("gemuese/gurken/salatgurken", "Cucumis sativus"),
    ("gemuese/aubergine", "Solanum melongena"),
    # Beans and peas
    ("gemuese/bohnen/buschbohne", "Phaseolus vulgaris"),
    ("gemuese/bohnen/stangenbohne", "Phaseolus vulgaris"),
    ("gemuese/bohnen/dicke-bohne", "Vicia faba"),
    ("gemuese/bohnen/feuerbohne", "Phaseolus coccineus"),
    ("gemuese/bohnen/edamame-sojabohne", "Glycine max"),
    ("gemuese/bohnen/spaghettibohne", "Vigna unguiculata"),
    ("gemuese/erbsen/markerbse", "Pisum sativum"),
    ("gemuese/erbsen/schalerbse", "Pisum sativum"),
    ("gemuese/erbsen/zuckererbse", "Pisum sativum"),
    ("gemuese/feldsalat", "Valerianella locusta"),
    ("gemuese/knollenfenchel", "Foeniculum vulgare"),
    # Cabbages
    ("gemuese/kohl/blumenkohl", "Brassica oleracea"),
    ("gemuese/kohl/brokkoli", "Brassica oleracea"),
    ("gemuese/kohl/chinakohlpak-choi", "Brassica rapa"),
    ("gemuese/kohl/gruenkohl", "Brassica oleracea"),
    ("gemuese/kohl/kohlrabi", "Brassica oleracea"),
    ("gemuese/kohl/rotkohl", "Brassica oleracea"),
    ("gemuese/kohl/weisskohl", "Brassica oleracea"),
    ("gemuese/kohl/wirsing", "Brassica oleracea"),
    ("gemuese/kohl/rosenkohl", "Brassica oleracea"),
    ("gemuese/kresse", "Lepidium sativum"),
    ("gemuese/kuerbis", "Cucurbita maxima"),
    ("gemuese/zuckermais", "Zea mays"),
    ("gemuese/mangold", "Beta vulgaris"),
    ("gemuese/melone", "Cucumis melo"),
    ("gemuese/moehren", "Daucus carota"),
    ("gemuese/paprika/gemuesepaprika", "Capsicum annuum"),
    ("gemuese/paprika/chili", "Capsicum annuum"),
    ("gemuese/pastinaken", "Pastinaca sativa"),
    ("gemuese/petersilienwurzel", "Petroselinum crispum"),
    ("gemuese/physalis", "Physalis peruviana"),
    ("gemuese/porreelauch", "Allium porrum"),
    ("gemuese/radies", "Raphanus sativus"),
    ("gemuese/rettich", "Raphanus sativus"),
    ("gemuese/rote-bete", "Beta vulgaris"),
    ("gemuese/rueben/mai-herbstruebennavets", "Brassica rapa"),
    ("gemuese/rueben/kohlruebe", "Brassica napus"),
    ("gemuese/rucola", "Eruca vesicaria"),
    # Salads
    ("gemuese/salat/bataviasalat", "Lactuca sativa"),
    ("gemuese/salat/eichblattsalat", "Lactuca sativa"),
    ("gemuese/salat/eissalat", "Lactuca sativa"),
    ("gemuese/salat/endivien", "Cichorium endivia"),
    ("gemuese/salat/hirschhornwegerich", "Plantago coronopus"),
    ("gemuese/salat/kopfsalat", "Lactuca sativa"),
    ("gemuese/salat/lollosalat", "Lactuca sativa"),
    ("gemuese/salat/romanasalat", "Lactuca sativa"),
    ("gemuese/salat/baby-leaf", "Lactuca sativa"),
    ("gemuese/sellerie/knollensellerie", "Apium graveolens"),
    ("gemuese/sellerie/stangen--bleichsellerie", "Apium graveolens"),
    ("gemuese/spinatspinat-aehnliche/spinat", "Spinacia oleracea"),
    ("gemuese/spinatspinat-aehnliche/neuseelaender-spinat", "Tetragonia tetragonioides"),
    ("gemuese/blattstielgemuese", "Beta vulgaris"),
    ("gemuese/zwiebeln", "Allium cepa"),
    ("gemuese/lauchzwiebeln", "Allium fistulosum"),
    ("gemuese/artischocke", "Cynara cardunculus"),
    ("gemuese/asia-salate", "Brassica juncea"),
    ("gemuese/chicoree", "Cichorium intybus"),
    ("gemuese/schwarz-haferwurzel", "Scorzonera hispanica"),
    ("gemuese/winterpostelein", "Claytonia perfoliata"),
    ("gemuese/zucchini", "Cucurbita pepo"),
    ("gemuese/catalogna", "Cichorium intybus"),
    ("gemuese/zichoriensalate", "Cichorium intybus"),
]

HERB_CATEGORIES = [
    ("kraeuter/basilikum", "Ocimum basilicum"),
    ("kraeuter/bohnenkraut", "Satureja hortensis"),
    ("kraeuter/borretsch", "Borago officinalis"),
    ("kraeuter/dill", "Anethum graveolens"),
    ("kraeuter/kuemmel", "Carum carvi"),
    ("kraeuter/kerbel", "Anthriscus cerefolium"),
    ("kraeuter/koriander", "Coriandrum sativum"),
    ("kraeuter/gewuerzfenchel", "Foeniculum vulgare"),
    ("kraeuter/kultursauerampfer", "Rumex acetosa"),
    ("kraeuter/lavendel", "Lavandula angustifolia"),
    ("kraeuter/liebstock", "Levisticum officinale"),
    ("kraeuter/majoran", "Origanum majorana"),
    ("kraeuter/oregano", "Origanum vulgare"),
    ("kraeuter/pimpinelle", "Sanguisorba minor"),
    ("kraeuter/estragon", "Artemisia dracunculus"),
    ("kraeuter/salbei", "Salvia officinalis"),
    ("kraeuter/schnittlauch", "Allium schoenoprasum"),
    ("kraeuter/schnittknoblauch", "Allium tuberosum"),
    ("kraeuter/schwarzkuemmel", "Nigella sativa"),
    ("kraeuter/speisechrysantheme", "Glebionis coronaria"),
    ("kraeuter/thymian", "Thymus vulgaris"),
    ("kraeuter/ysop", "Hyssopus officinalis"),
    ("kraeuter/winterkresse", "Barbarea vulgaris"),
    ("kraeuter/brunnenkresse", "Nasturtium officinale"),
    ("kraeuter/melisse", "Melissa officinalis"),
    ("kraeuter/petersilie", "Petroselinum crispum"),
    ("kraeuter/schnittsellerie", "Apium graveolens"),
    ("kraeuter/beifuss", "Artemisia vulgaris"),
]

# This category has no default species (None).
GREEN_MANURE_CATEGORIES = [("gruenduengung", None)]

ALL_CATEGORIES = [
    *VEGETABLE_CATEGORIES,
    *HERB_CATEGORIES,
    *GREEN_MANURE_CATEGORIES,
]
|
||||||
|
|
||||||
|
# ── Stats ─────────────────────────────────────────────────────────────────
|
||||||
|
# Run-wide counters, plus two lists: unmatched species and error records.
stats = dict(
    categories_scraped=0,
    products_found=0,
    detail_pages_fetched=0,
    cultivars_created=0,
    cultivars_existed=0,
    supplier_links_created=0,
    supplier_links_existed=0,
    species_created=0,
    families_created=0,
    species_not_matched=[],
    errors=[],
)
|
||||||
|
|
||||||
|
|
||||||
|
# ── HTTP helpers ──────────────────────────────────────────────────────────
|
||||||
|
def fetch_page(url: str) -> str:
    """Download *url* with the scraper's User-Agent and return the body text.

    The body is decoded as UTF-8 with undecodable bytes replaced. An HTTP 404
    yields an empty string; any other HTTP error is re-raised.
    """
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            body = response.read()
            return body.decode("utf-8", errors="replace")
    except urllib.error.HTTPError as err:
        if err.code != 404:
            raise
        return ""
|
||||||
|
|
||||||
|
|
||||||
|
def api_get(path: str, params: dict = None) -> dict:
    """Send an authenticated GET to HerbAPI and return the decoded JSON body.

    ``params``, when given and non-empty, is URL-encoded into the query string.
    """
    query = ("?" + urllib.parse.urlencode(params)) if params else ""
    auth_headers = {
        "Authorization": f"Bearer {API_TOKEN}",
        "Accept": "application/json",
    }
    request = urllib.request.Request(f"{API_BASE}{path}" + query, headers=auth_headers)
    with urllib.request.urlopen(request, timeout=30) as response:
        payload = response.read()
        return json.loads(payload)
|
||||||
|
|
||||||
|
|
||||||
|
def api_post(path: str, data: dict) -> tuple:
    """Send *data* as JSON to HerbAPI via POST.

    Returns ``(response_dict, status_code)``. An HTTP error is not raised:
    it is returned as ``({"error": <body text>, "_status": <code>}, <code>)``.
    """
    request = urllib.request.Request(
        f"{API_BASE}{path}",
        data=json.dumps(data).encode("utf-8"),
        method="POST",
        headers={
            "Authorization": f"Bearer {API_TOKEN}",
            "Content-Type": "application/json",
            "Accept": "application/json",
        },
    )
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            decoded = json.loads(response.read())
            return decoded, response.status
    except urllib.error.HTTPError as err:
        failure = {
            "error": err.read().decode("utf-8", errors="replace"),
            "_status": err.code,
        }
        return failure, err.code
|
||||||
|
|
||||||
|
|
||||||
|
# ── HTML parsing helpers ──────────────────────────────────────────────────
|
||||||
|
def parse_product_links(html: str) -> list:
    """Parse product links from a listing page using regex.

    Anchors with the ``product-item-link`` class are tried first. Only when
    none are found does a broader pattern for product detail URLs under
    gemuese/kraeuter/gruenduengung apply. HTML entities in names and hrefs
    are decoded ("&#039;" -> "'", "&amp;" -> "&").

    Args:
        html: Raw HTML of the listing page.

    Returns:
        List of ``(url, name)`` tuples, one per distinct URL in page order.
        URLs not starting with "http" are prefixed with ``SITE_BASE``.
    """
    # Local import: the ``html`` parameter shadows the module name here.
    from html import unescape

    def _absolute(href):
        href = unescape(href).strip()
        return href if href.startswith("http") else SITE_BASE + href

    links = []

    # Magento product-item-link pattern
    pattern = re.compile(
        r'<a[^>]+href="([^"]*?/de/bio-saatgut/[^"]+?)"[^>]*class="[^"]*product-item-link[^"]*"[^>]*>\s*(.*?)\s*</a>',
        re.DOTALL | re.IGNORECASE
    )
    for match in pattern.finditer(html):
        # Drop nested tags before decoding, so an escaped "&lt;" in a name
        # is not mistaken for markup.
        name = unescape(re.sub(r'<[^>]+>', '', match.group(2))).strip()
        if name:
            links.append((_absolute(match.group(1)), name))

    if not links:
        # Broader pattern for product detail links
        pattern2 = re.compile(
            r'href="([^"]*?/de/bio-saatgut/(?:gemuese|kraeuter|gruenduengung)/[^"]+?/[^"/.]+)"[^>]*>\s*([^<]{3,})',
            re.IGNORECASE
        )
        for match in pattern2.finditer(html):
            url = match.group(1).strip()
            name = unescape(match.group(2)).strip()
            if name and not url.endswith(".html"):
                links.append((_absolute(url), name))

    # Deduplicate by URL, keeping the first name seen for each
    unique = {}
    for url, name in links:
        unique.setdefault(url, name)
    return list(unique.items())
|
||||||
|
|
||||||
|
|
||||||
|
def extract_latin_from_detail(html: str) -> Optional[str]:
    """Extract Latin/botanical name from product detail page.

    Three places are tried in order: an ``<em>``/``<i>`` element, an element
    whose class mentions botanical/latin/species, and text after a
    "Botanischer Name" / "Lateinischer Name" / "Art" label.

    Tag names, attribute names and labels match case-insensitively; the name
    itself must be a capitalised genus followed by a lowercase epithet. A
    global IGNORECASE flag would accept any two words here, and a false
    first match would hide a valid name further down the page.

    Args:
        html: Raw HTML of the product detail page.

    Returns:
        The name with internal whitespace collapsed to single spaces, or
        ``None`` when nothing matches.
    """
    patterns = [
        r'<(?i:em|i)\b[^>]*>\s*([A-Z][a-z]+\s+[a-z]{2,}(?:\s+(?:var\.|subsp\.)\s+[a-z]+)?)\s*</(?i:em|i)>',
        r'(?i:class)="[^"]*(?i:botanical|latin|species)[^"]*"[^>]*>\s*([A-Z][a-z]+\s+[a-z]{2,})',
        r'(?i:Botanischer?\s+Name|Lateinischer?\s+Name|Art)\s*:?\s*(?:<[^>]+>)*\s*([A-Z][a-z]+\s+[a-z]{2,})',
    ]
    for pat in patterns:
        m = re.search(pat, html)
        if m:
            # The name may be wrapped across lines in the markup.
            return " ".join(m.group(1).split())
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_description_from_detail(html: str) -> str:
    """Extract product description from detail page.

    The first container matching one of the known description patterns is
    taken. Tags are removed, HTML entities decoded ("&uuml;" -> "ü",
    "&nbsp;" -> space) and whitespace collapsed. Text of 20 characters or
    fewer is ignored.

    Args:
        html: Raw HTML of the product detail page.

    Returns:
        The description, truncated to 2000 characters, or ``""``.
    """
    # Local import: the ``html`` parameter shadows the module name here.
    from html import unescape

    desc_patterns = [
        r'<div[^>]*class="[^"]*product[- ]description[^"]*"[^>]*>(.*?)</div>',
        r'<div[^>]*class="[^"]*beschreibung[^"]*"[^>]*>(.*?)</div>',
        r'data-content-type="description"[^>]*>(.*?)</div>',
    ]
    for pat in desc_patterns:
        m = re.search(pat, html, re.DOTALL | re.IGNORECASE)
        if m:
            text = re.sub(r'<[^>]+>', ' ', m.group(1))
            # Decode before collapsing whitespace, so a decoded non-breaking
            # space is normalised too.
            text = re.sub(r'\s+', ' ', unescape(text)).strip()
            if len(text) > 20:
                return text[:2000]
    return ""
|
||||||
|
|
||||||
|
|
||||||
|
def extract_article_number(product_name: str, url: str) -> Optional[str]:
    """Find the article number of a product.

    A parenthesised code in the name such as "(G802)" or "(G 802A)" wins and
    is returned without inner spaces. Otherwise the last URL path segment is
    checked for a trailing "-<letter><digits>[letter]" part, returned
    uppercased. ``None`` when neither is present.
    """
    in_name = re.search(r'\(([A-Z]\s*\d+[A-Z]?)\)', product_name)
    if in_name is not None:
        return in_name.group(1).replace(" ", "")

    last_segment = url.rstrip("/").split("/")[-1]
    in_slug = re.search(r'-([a-z]\d+[a-z]?)$', last_segment, re.IGNORECASE)
    return in_slug.group(1).upper() if in_slug is not None else None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_variety_name(product_name: str) -> str:
    """Reduce a full product name to its variety/cultivar part.

    A trailing article number like "(G802)" is removed first. Each known
    German crop-type prefix is then stripped from the start of the name, in
    list order and case-insensitively. Finally, surrounding whitespace and
    straight quotes are trimmed.
    """
    # Crop-type prefixes; every pattern is anchored to the start of the name.
    type_prefixes = (
        # Tomatoes
        r'(?:Normal(?:früchtige)?|Fleisch|Cherry|Balkon|Wild|Freiland|Roma|Ochsenherz|'
        r'Cocktail|Dattel|Mini|Snack|Stab|Busch|Salat|Zwerg)[\s-]*[Tt]omate\s+',
        # Beans
        r'(?:Busch|Stangen|Dicke|Feuer|Spaghetti)[\s-]*[Bb]ohne\s+',
        r'Edamame(?:-Sojabohne)?\s+',
        # Peas
        r'(?:Mark|Schal|Zucker|Pal)[\s-]*[Ee]rbse\s+',
        # Cucurbits
        r'(?:Salat|Einlege|Gewürz|Freiland|Schlangen)[\s-]*[Gg]urke\s+',
        r'Zucchini\s+',
        r'Kürbis\s+',
        r'(?:Wasser)?[Mm]elone\s+',
        # Brassicas
        r'(?:Blumen|Grün|Rot|Weiß|Rosen)[\s-]*[Kk]ohl\s+',
        r'Kohlrabi\s+',
        r'Wirsing\s+',
        r'Brokkoli\s+',
        r'Chinakohl\s+',
        r'Pak\s+Choi\s+',
        r'Kohlrübe\s+',
        r'Mai-/Herbstrüben?(?:/Navets)?\s+',
        # Root vegetables
        r'Möhre\s+',
        r'Karotten?(?:\s*-?\s*Mix)?\s+',
        r'Pastinake\s+',
        r'Radies(?:chen)?\s+',
        r'Rettich\s+',
        r'Schwarzwurzel\s+',
        r'Haferwurzel\s+',
        r'Petersilienwurzel\s+',
        # Beets
        r'(?:Rote|Gelbe|Weiße)\s+Bete?\s+',
        r'Mangold\s+',
        # Lettuce & leafy
        r'(?:Kopf|Eichblatt|Batavia|Eis|Lollo|Romana|Baby-Leaf)[\s-]*[Ss]alat\s+',
        r'Feldsalat\s+',
        r'Endivie\s+',
        r'Asia[\s-]*Salat\s+',
        r'Spinat\s+',
        # Alliums
        r'Zwiebel\s+',
        r'Lauchzwiebel\s+',
        r'Porree(?:/Lauch)?\s+',
        r'Schnittlauch\s+',
        r'Schnittknoblauch\s+',
        # Peppers
        r'(?:Gemüse|Block|Spitz|Papier)[\s-]*[Pp]aprika\s+',
        r'Chili\s+',
        # Celery
        r'(?:Knollen|Stangen|Bleich|Schnitt)[\s-]*[Ss]ellerie\s+',
        # Herbs
        r'Basilikum\s+',
        r'Koriander\s+',
        r'Dill\s+',
        r'Petersilie\s+',
        r'(?:Knollen|Gewürz)[\s-]*[Ff]enchel\s+',
        r'Salbei\s+',
        r'Thymian\s+',
        r'Oregano\s+',
        r'Lavendel\s+',
        r'Melisse\s+',
        r'Majoran\s+',
        r'Estragon\s+',
        r'Kresse\s+',
        r'Bohnenkraut\s+',
        r'Borretsch\s+',
        r'Kümmel\s+',
        r'Kerbel\s+',
        r'Liebstock\s+',
        r'Ysop\s+',
        r'Pimpinelle\s+',
        r'Beifuß\s+',
        r'Schwarzkümmel\s+',
        # Other
        r'Zuckermais\s+',
        r'Artischocke\s+',
        r'Physalis\s+',
        r'Aubergine\s+',
        r'Catalogna\s+',
    )

    # Drop an article number suffix such as "(G802)".
    variety = re.sub(r'\s*\([A-Z]\s*\d+[A-Z]?\)\s*$', '', product_name.strip())

    # Prefixes are applied one after another to the progressively shorter name.
    for type_prefix in type_prefixes:
        variety = re.sub(r'^' + type_prefix, '', variety, flags=re.IGNORECASE)

    return variety.strip().strip("'\"")
|
||||||
|
|
||||||
|
|
||||||
|
# ── API data caches ───────────────────────────────────────────────────────
|
||||||
|
# In-memory lookups over existing HerbAPI records, initially empty.
species_cache: dict = {}   # lowercased scientific name -> species record
family_cache: dict = {}    # lowercased scientific name -> family record
cultivar_cache: dict = {}  # slug -> {id, name, species_id, ...}
supplier_id = None         # no supplier id known yet
|
||||||
|
|
||||||
|
|
||||||
|
def _iter_api_pages(path, per_page=100):
    """Yield every record of a paginated HerbAPI collection.

    Pages are requested one after another; a page holding fewer than
    ``per_page`` records is treated as the last one.
    """
    page = 1
    while True:
        records = api_get(path, {"per_page": per_page, "page": page})["data"]
        yield from records
        if len(records) < per_page:
            break
        page += 1


def load_api_data():
    """Load all existing data from HerbAPI for matching.

    Fills ``family_cache``, ``species_cache`` and ``cultivar_cache`` and sets
    the global ``supplier_id``, creating the Bingenheimer supplier record when
    none exists yet. Exits the process if the supplier cannot be created.
    """
    global supplier_id

    print("Loading existing HerbAPI data...")

    # Families, keyed by lower-cased scientific name
    for f in _iter_api_pages("/families"):
        family_cache[f["name_scientific"].lower()] = f
    print(f" Loaded {len(family_cache)} families")

    # Species, keyed by lower-cased scientific name
    for s in _iter_api_pages("/species"):
        species_cache[s["name_scientific"].lower()] = s
    print(f" Loaded {len(species_cache)} species")

    # ALL cultivars (slug -> id + name + species_id)
    for c in _iter_api_pages("/cultivars"):
        cultivar_cache[c["slug"]] = {
            "id": c["id"],
            "name": c["name"],
            "species_id": c["species_id"],
        }
    print(f" Loaded {len(cultivar_cache)} cultivars")

    # Create or find Bingenheimer supplier
    resp = api_get("/suppliers")
    for s in resp:
        if "bingenheimer" in s["name"].lower():
            supplier_id = s["id"]
            print(f" Found existing supplier: {s['name']} ({s['id']})")
            break

    if not supplier_id:
        print(" Creating Bingenheimer Saatgut supplier...")
        s, code = api_post("/suppliers", {
            "name": "Bingenheimer Saatgut",
            "url": "https://www.bingenheimersaatgut.de",
            "country": "DE",
            "is_organic": True,
            "is_demeter": True,
            "notes": "German biodynamic seed company, Demeter certified, open-pollinated varieties"
        })
        if "id" in s:
            supplier_id = s["id"]
            print(f" Created supplier: {s['id']}")
        else:
            # Without a supplier no product can be linked, so stop here.
            print(f" ERROR creating supplier: {s}")
            sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
# Catalogue name (lower case) -> name the species is looked up under.
# NOTE(review): "allium porrum"/"allium ampeloprasum" -> "allium cepa" and
# "origanum majorana" -> "origanum vulgare" fold different species together;
# confirm this is intended before relying on the resulting links.
_SPECIES_SYNONYMS = {
    "lycopersicon esculentum": "solanum lycopersicum",
    "capsicum annuum var. annuum": "capsicum annuum",
    "brassica oleracea var. botrytis": "brassica oleracea",
    "brassica oleracea var. italica": "brassica oleracea",
    "brassica oleracea var. gemmifera": "brassica oleracea",
    "brassica oleracea var. gongylodes": "brassica oleracea",
    "brassica oleracea var. capitata": "brassica oleracea",
    "brassica oleracea var. sabauda": "brassica oleracea",
    "brassica oleracea var. sabellica": "brassica oleracea",
    "brassica rapa var. rapa": "brassica rapa",
    "brassica rapa subsp. pekinensis": "brassica rapa",
    "brassica rapa subsp. chinensis": "brassica rapa",
    "beta vulgaris var. conditiva": "beta vulgaris",
    "beta vulgaris subsp. vulgaris": "beta vulgaris",
    "beta vulgaris var. vulgaris": "beta vulgaris",
    "allium porrum": "allium cepa",
    "allium ampeloprasum": "allium cepa",
    "origanum majorana": "origanum vulgare",
    "cichorium intybus var. foliosum": "cichorium intybus",
    "petroselinum crispum var. tuberosum": "petroselinum crispum",
    "apium graveolens var. rapaceum": "apium graveolens",
    "apium graveolens var. dulce": "apium graveolens",
    "lactuca sativa var. capitata": "lactuca sativa",
    "lactuca sativa var. crispa": "lactuca sativa",
    "lactuca sativa var. longifolia": "lactuca sativa",
}

# Genus -> family used when a species has to be created.
_FAMILY_BY_GENUS = {
    "Solanum": "Solanaceae", "Capsicum": "Solanaceae", "Physalis": "Solanaceae",
    "Nicandra": "Solanaceae",
    "Cucumis": "Cucurbitaceae", "Cucurbita": "Cucurbitaceae", "Citrullus": "Cucurbitaceae",
    "Phaseolus": "Fabaceae", "Pisum": "Fabaceae", "Vicia": "Fabaceae",
    "Glycine": "Fabaceae", "Lens": "Fabaceae", "Lupinus": "Fabaceae",
    "Trifolium": "Fabaceae", "Medicago": "Fabaceae", "Vigna": "Fabaceae",
    "Brassica": "Brassicaceae", "Raphanus": "Brassicaceae", "Eruca": "Brassicaceae",
    "Lepidium": "Brassicaceae", "Nasturtium": "Brassicaceae", "Barbarea": "Brassicaceae",
    "Sinapis": "Brassicaceae", "Crambe": "Brassicaceae", "Diplotaxis": "Brassicaceae",
    "Allium": "Amaryllidaceae",
    "Daucus": "Apiaceae", "Petroselinum": "Apiaceae", "Apium": "Apiaceae",
    "Foeniculum": "Apiaceae", "Pastinaca": "Apiaceae", "Coriandrum": "Apiaceae",
    "Anethum": "Apiaceae", "Levisticum": "Apiaceae", "Anthriscus": "Apiaceae",
    "Carum": "Apiaceae", "Myrrhis": "Apiaceae", "Pimpinella": "Apiaceae",
    "Sanguisorba": "Rosaceae",
    "Lactuca": "Asteraceae", "Cichorium": "Asteraceae", "Cynara": "Asteraceae",
    "Helianthus": "Asteraceae", "Calendula": "Asteraceae", "Tagetes": "Asteraceae",
    "Scorzonera": "Asteraceae", "Tragopogon": "Asteraceae", "Glebionis": "Asteraceae",
    "Artemisia": "Asteraceae",
    "Beta": "Chenopodiaceae", "Spinacia": "Chenopodiaceae",
    "Atriplex": "Chenopodiaceae", "Chenopodium": "Chenopodiaceae",
    "Ocimum": "Lamiaceae", "Origanum": "Lamiaceae", "Thymus": "Lamiaceae",
    "Salvia": "Lamiaceae", "Melissa": "Lamiaceae", "Lavandula": "Lamiaceae",
    "Satureja": "Lamiaceae", "Hyssopus": "Lamiaceae", "Rosmarinus": "Lamiaceae",
    "Mentha": "Lamiaceae",
    "Zea": "Poaceae",
    "Borago": "Boraginaceae", "Phacelia": "Boraginaceae",
    "Valerianella": "Caprifoliaceae",
    "Tropaeolum": "Tropaeolaceae",
    "Rumex": "Polygonaceae",
    "Nigella": "Ranunculaceae",
    "Claytonia": "Montiaceae",
    "Tetragonia": "Aizoaceae",
    "Basella": "Basellaceae",
    "Plantago": "Plantaginaceae",
}


def _reload_species_cache():
    """Re-read every species page from HerbAPI into ``species_cache``."""
    page = 1
    while True:
        batch = api_get("/species", {"per_page": 100, "page": page})["data"]
        for record in batch:
            species_cache[record["name_scientific"].lower()] = record
        if len(batch) < 100:
            break
        page += 1


def find_or_create_species(latin_name: str) -> Optional[str]:
    """Find species by Latin name or create it. Returns species ID.

    Lookup order: exact name, first two words of the name (drops the
    subspecies/variety part), then the synonym table. When nothing matches,
    the species is created under the family derived from its genus.

    Returns None for an empty or blank name, for a genus missing from the
    family table, or when neither creation nor a subsequent cache reload
    yields the species.
    """
    # A blank name has no genus to split off, so treat it like an empty one.
    if not latin_name or not latin_name.strip():
        return None

    latin_name = latin_name.strip()
    key = latin_name.lower()

    # Direct match
    if key in species_cache:
        return species_cache[key]["id"]

    # Try without subspecies/variety
    base = " ".join(key.split()[:2])
    if base in species_cache:
        return species_cache[base]["id"]

    # Handle synonyms
    syn_key = _SPECIES_SYNONYMS.get(key)
    if syn_key is not None and syn_key in species_cache:
        return species_cache[syn_key]["id"]

    # Try to create the species
    genus = latin_name.split()[0]
    family_name = _FAMILY_BY_GENUS.get(genus)
    if not family_name:
        print(f" WARNING: Unknown genus '{genus}' for species '{latin_name}'")
        stats["species_not_matched"].append(latin_name)
        return None

    family_id = find_or_create_family(family_name)
    if not family_id:
        return None

    print(f" Creating species: {latin_name}")
    resp, code = api_post("/species", {
        "name_scientific": latin_name,
        "family_id": family_id,
    })
    if "id" in resp:
        # Store under the same normalised key the lookups above use.
        species_cache[key] = resp
        stats["species_created"] += 1
        return resp["id"]

    # Might already exist, reload. str() keeps the slice safe when the error
    # payload is not a plain string.
    print(f" Species creation returned {code}: {str(resp.get('error', ''))[:100]}")
    _reload_species_cache()
    if key in species_cache:
        return species_cache[key]["id"]
    stats["errors"].append(f"Species creation failed: {latin_name}")
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def find_or_create_family(family_name: str) -> Optional[str]:
    """Find or create a plant family. Returns family ID.

    The family is looked up in ``family_cache`` by lower-cased name and
    created through the API on a miss. If creation does not return an ID the
    cache is reloaded and checked once more; None is returned (and an error
    recorded in ``stats``) when the family still cannot be found.
    """
    key = family_name.lower()
    if key in family_cache:
        return family_cache[key]["id"]

    print(f" Creating family: {family_name}")
    resp, _code = api_post("/families", {"name_scientific": family_name})
    if "id" in resp:
        family_cache[key] = resp
        stats["families_created"] += 1
        return resp["id"]

    # Reload page by page, the same way the initial load does, so a family
    # beyond the first page is not missed.
    page = 1
    while True:
        batch = api_get("/families", {"per_page": 100, "page": page})["data"]
        for ff in batch:
            family_cache[ff["name_scientific"].lower()] = ff
        if len(batch) < 100:
            break
        page += 1

    if key in family_cache:
        return family_cache[key]["id"]
    stats["errors"].append(f"Family creation failed: {family_name}")
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def slugify(text: str) -> str:
    """Turn arbitrary text into a lower-case, dash-separated slug.

    Accented letters listed in the table are transliterated to ASCII, any
    other character outside ``a-z``, ``0-9``, whitespace and ``-`` is dropped,
    whitespace runs become a single dash, repeated dashes are collapsed and
    leading/trailing dashes are removed.
    """
    ascii_table = str.maketrans({
        "ä": "a", "ö": "o", "ü": "u", "ß": "ss",
        "é": "e", "è": "e", "ê": "e", "ë": "e",
        "à": "a", "â": "a", "á": "a",
        "ô": "o", "ù": "u", "û": "u", "ú": "u",
        "ï": "i", "î": "i", "í": "i",
        "ç": "c", "ñ": "n", "ó": "o",
        "œ": "oe", "æ": "ae",
    })
    transliterated = text.lower().translate(ascii_table)
    allowed_only = re.sub(r'[^a-z0-9\s-]', '', transliterated)
    dashed = re.sub(r'[\s]+', '-', allowed_only.strip())
    return re.sub(r'-+', '-', dashed).strip('-')
|
||||||
|
|
||||||
|
|
||||||
|
def find_existing_cultivar(species_name: str, variety_name: str, species_id: str) -> Optional[str]:
    """Return the ID of an already known cultivar, or None.

    A cultivar counts as known when ``cultivar_cache`` holds the slug built
    from species and variety name, or when any cached cultivar of the same
    species has the same name ignoring case.
    """
    # Direct slug match
    by_slug = cultivar_cache.get(slugify(f"{species_name} {variety_name}"))
    if by_slug is not None:
        return by_slug["id"]

    # Name match within the same species
    wanted = variety_name.lower()
    return next(
        (
            entry["id"]
            for entry in cultivar_cache.values()
            if entry["species_id"] == species_id and entry["name"].lower() == wanted
        ),
        None,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def scrape_category(cat_path: str, default_species: Optional[str]):
    """Fetch one category listing and process every product linked from it.

    ``default_species`` is handed to ``process_product`` for each product.
    Nothing is processed when the listing page cannot be fetched.
    """
    print("\n" + "=" * 60, f"Category: {cat_path}", sep="\n")

    listing = fetch_page(f"{SITE_BASE}/de/bio-saatgut/{cat_path}.html")
    if not listing:
        print(" SKIP: Page not found (404)")
        return

    # Pause between the listing request and the product requests that follow.
    time.sleep(DELAY)

    products = parse_product_links(listing)
    found = len(products)
    print(f" Found {found} products")
    stats["products_found"] += found
    stats["categories_scraped"] += 1

    for prod_url, prod_name in products:
        process_product(prod_url, prod_name, default_species)
|
||||||
|
|
||||||
|
|
||||||
|
def _find_cultivar_after_conflict(species_id, variety_name):
    """Refresh ``cultivar_cache`` page by page and look for a name match.

    Returns the ID of a cultivar of ``species_id`` whose name equals
    ``variety_name`` ignoring case, or None. Paging stops after the first
    page that contains a match.
    """
    wanted = variety_name.lower()
    found_id = None
    page = 1
    while True:
        batch = api_get("/cultivars", {"per_page": 100, "page": page})["data"]
        for c in batch:
            cultivar_cache[c["slug"]] = {
                "id": c["id"],
                "name": c["name"],
                "species_id": c["species_id"],
            }
            if c["species_id"] == species_id and c["name"].lower() == wanted:
                found_id = c["id"]
        if found_id or len(batch) < 100:
            return found_id
        page += 1


def _link_cultivar_to_supplier(cultivar_id, variety_name, prod_url, article_number):
    """Attach the cultivar to the global supplier and record the outcome."""
    link_data = {
        "supplier_id": supplier_id,
        "product_url": prod_url,
    }
    if article_number:
        link_data["article_number"] = article_number

    resp, code = api_post(f"/cultivars/{cultivar_id}/suppliers", link_data)
    error_text = str(resp.get("error", ""))

    if "id" in resp:
        stats["supplier_links_created"] += 1
        print(f" LINKED (SKU: {article_number})")
    elif code == 500 or "already" in error_text.lower():
        # NOTE(review): every HTTP 500 is counted as "link exists" here.
        stats["supplier_links_existed"] += 1
        print(" LINK EXISTS")
    else:
        print(f" LINK ERROR ({code}): {error_text[:80]}")
        stats["errors"].append(f"Link failed: {variety_name}: {error_text[:60]}")


def process_product(prod_url: str, prod_name: str, default_species: Optional[str]):
    """Process a single product: fetch detail, extract data, create cultivar.

    Steps: derive SKU and variety name from the listing entry, skip mixes,
    fetch the detail page for Latin name and description, resolve the species
    (falling back to ``default_species``), reuse or create the cultivar and
    finally link it to the supplier. Outcomes are counted in ``stats``.
    """
    article_number = extract_article_number(prod_name, prod_url)
    variety_name = extract_variety_name(prod_name)

    if not variety_name:
        print(f" SKIP (no variety): {prod_name}")
        return

    # Skip mixes, sets, bundles
    # NOTE(review): only names containing "mischung", "mix" or "trio" are
    # actually skipped; the other keywords merely gate that second check.
    skip_keywords = ["mischung", "saatscheibe", "saatband", "saatplatte",
                     "saat-set", " mix ", "trio ", "quartett", "gutschein",
                     "buch ", "düngung", "erde ", "-garten"]
    name_lower = prod_name.lower()
    # Exception: if the variety name itself is the whole thing, keep it
    if any(kw in name_lower for kw in skip_keywords) and variety_name.lower() != name_lower:
        # Only skip if it really seems like a mix
        if "mischung" in name_lower or "mix" in name_lower or "trio" in name_lower:
            print(f" SKIP (mix/set): {prod_name}")
            return

    print(f"\n Product: {prod_name}")
    print(f" Variety: {variety_name}, SKU: {article_number}")

    # Fetch detail page
    latin_name = None
    description = ""
    time.sleep(DELAY)
    try:
        detail_html = fetch_page(prod_url)
        stats["detail_pages_fetched"] += 1
        if detail_html:
            latin_name = extract_latin_from_detail(detail_html)
            description = extract_description_from_detail(detail_html)
    except Exception as e:
        # Best effort: the category default species can still be used.
        print(f" WARNING: Detail page error: {e}")

    species_name = latin_name or default_species
    if not species_name:
        print(f" SKIP: No species for '{prod_name}'")
        stats["species_not_matched"].append(prod_name)
        return

    print(f" Species: {species_name}")

    species_id = find_or_create_species(species_name)
    if not species_id:
        print(f" SKIP: Could not resolve species '{species_name}'")
        return

    # Check if cultivar already exists
    cultivar_id = find_existing_cultivar(species_name, variety_name, species_id)

    if cultivar_id:
        print(" EXISTS: cultivar already in DB")
        stats["cultivars_existed"] += 1
    else:
        # Create cultivar
        data = {
            "species_id": species_id,
            "name": variety_name,
            "name_de": variety_name,
            "is_organic": True,
        }
        if description:
            data["description"] = description

        resp, code = api_post("/cultivars", data)
        # str() keeps the slices below safe when the error payload is not a string.
        error_text = str(resp.get("error", ""))

        if "id" in resp:
            cultivar_id = resp["id"]
            cultivar_cache[resp["slug"]] = {
                "id": resp["id"],
                "name": variety_name,
                "species_id": species_id,
            }
            stats["cultivars_created"] += 1
            print(f" CREATED: {resp['slug']}")
        elif code == 500 and "Database error" in error_text:
            # Likely slug conflict - try to find existing
            print(" DB conflict - searching for existing cultivar...")
            cultivar_id = _find_cultivar_after_conflict(species_id, variety_name)
            if cultivar_id:
                print(f" Found existing after conflict: {cultivar_id}")
                stats["cultivars_existed"] += 1
            else:
                print(" ERROR: DB error and could not find existing cultivar")
                stats["errors"].append(f"DB error + not found: {species_name} / {variety_name}")
                return
        else:
            print(f" ERROR ({code}): {error_text[:100]}")
            stats["errors"].append(f"Create failed: {variety_name}: {error_text[:80]}")
            return

    # Link to supplier
    if cultivar_id and supplier_id:
        _link_cultivar_to_supplier(cultivar_id, variety_name, prod_url, article_number)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run the Bingenheimer import and print a summary.

    Loads the HerbAPI caches, scrapes every entry of ``ALL_CATEGORIES``
    (a failing category is recorded and the run continues) and reports the
    counters collected in ``stats``. Returns 0 when no errors were recorded,
    otherwise 1.
    """
    rule = "=" * 60
    print(rule)
    print("Bingenheimer Saatgut Scraper for HerbAPI")
    print(rule)

    load_api_data()

    print(f"\nScraping {len(ALL_CATEGORIES)} categories...")

    for cat_path, default_species in ALL_CATEGORIES:
        try:
            scrape_category(cat_path, default_species)
        except Exception as exc:
            # Top-level boundary: one broken category must not end the run.
            print(f" ERROR in category {cat_path}: {exc}")
            stats["errors"].append(f"Category error: {cat_path}: {exc}")

    # Summary
    print("\n" + rule)
    print("SCRAPING COMPLETE - SUMMARY")
    print(rule)
    counters = (
        ("Categories scraped", "categories_scraped"),
        ("Products found", "products_found"),
        ("Detail pages fetched", "detail_pages_fetched"),
        ("Cultivars created", "cultivars_created"),
        ("Cultivars existed", "cultivars_existed"),
        ("Supplier links created", "supplier_links_created"),
        ("Supplier links existed", "supplier_links_existed"),
        ("Species created", "species_created"),
        ("Families created", "families_created"),
    )
    for label, counter in counters:
        print(f"{label}: {stats[counter]}")
    print(f"Errors: {len(stats['errors'])}")

    # Detail lists are capped at 30 entries each.
    for heading, bucket in (("Unmatched species", "species_not_matched"),
                            ("Errors", "errors")):
        entries = stats[bucket]
        if entries:
            print(f"\n{heading} ({len(entries)}):")
            for entry in entries[:30]:
                print(f" - {entry}")

    return 1 if stats["errors"] else 0
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||||
@@ -0,0 +1,760 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Scraper for Dreschflegel organic seed catalog (dreschflegel-saatgut.de).
|
||||||
|
Extracts cultivar data and imports into HerbAPI.
|
||||||
|
|
||||||
|
Run 2 - fixes pagination (API caps at 100/page), better species matching,
|
||||||
|
caches scraped products, handles duplicates gracefully.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import urllib.request
|
||||||
|
import urllib.parse
|
||||||
|
import urllib.error
|
||||||
|
import gzip
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import html as html_mod
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
# --- Configuration ---
API_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
# NOTE(review): a bearer token committed to the repository should be treated
# as leaked and rotated. HERBAPI_TOKEN in the environment takes precedence so
# the literal fallback can be removed once a fresh token has been issued.
API_TOKEN = os.environ.get("HERBAPI_TOKEN") or "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
SITE_BASE = "https://www.dreschflegel-saatgut.de"
DELAY = 0.5  # seconds passed to time.sleep() before each product page fetch
USER_AGENT = "Mozilla/5.0 (compatible; HerbAPI-Scraper/1.0)"
CACHE_FILE = "/tmp/dreschflegel_products_cache.json"

# Unbuffered output. Streams that were replaced by objects without
# reconfigure() are left untouched instead of raising AttributeError.
for _stream in (sys.stdout, sys.stderr):
    if hasattr(_stream, "reconfigure"):
        _stream.reconfigure(line_buffering=True)

# Run counters; missing keys read as 0.
stats = defaultdict(int)
|
||||||
|
|
||||||
|
|
||||||
|
def api_request(method, path, data=None):
    """Make an API request to HerbAPI.

    Args:
        method: HTTP method, e.g. "GET" or "POST".
        path: Path appended to ``API_BASE`` (may carry a query string).
        data: Optional JSON-serialisable payload sent as the request body.

    Returns:
        The decoded JSON response, or None on any HTTP error. Responses that
        look like duplicates (409, "already exists"/"duplicate" in the body,
        or a 500 "database error") return None without logging; other HTTP
        errors are printed first.
    """
    url = f"{API_BASE}{path}"
    # "is not None" so that an empty payload such as {} is still sent.
    body = json.dumps(data).encode("utf-8") if data is not None else None
    req = urllib.request.Request(url, data=body, method=method)
    req.add_header("Authorization", f"Bearer {API_TOKEN}")
    req.add_header("Content-Type", "application/json")
    req.add_header("Accept", "application/json")
    try:
        # The context manager closes the connection; the timeout keeps a
        # stalled server from hanging the whole run.
        with urllib.request.urlopen(req, timeout=60) as resp:
            return json.loads(resp.read().decode("utf-8"))
    except urllib.error.HTTPError as e:
        body_text = e.read().decode("utf-8", errors="replace")
        lowered = body_text.lower()
        looks_like_duplicate = (
            e.code == 409
            or "already exists" in lowered
            or "duplicate" in lowered
            # Likely a unique constraint violation = duplicate
            or (e.code == 500 and "database error" in lowered)
        )
        if not looks_like_duplicate:
            print(f" API error {e.code} {method} {path}: {body_text[:200]}")
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_page(url):
    """Fetch a web page with the scraper's User-Agent.

    No delay is applied here; callers pause between requests themselves.

    Returns:
        The body decoded as UTF-8 (undecodable bytes replaced), or None when
        the request fails for any reason; the failure is printed.
    """
    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        # Context manager so the connection is closed after reading.
        with urllib.request.urlopen(req, timeout=30) as resp:
            return resp.read().decode("utf-8", errors="replace")
    except Exception as e:
        # Deliberately broad: a single unreachable page must not stop the run.
        print(f" Fetch error {url}: {e}")
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def get_sitemap_urls():
    """Download the sitemap index and collect the URLs of its child sitemaps.

    Only child sitemaps whose location ends in ``.xml.gz`` are read. A child
    that cannot be fetched or decompressed is reported and skipped.

    Returns:
        A list of all ``<loc>`` URLs found, empty if the index is unavailable.
    """
    print("Fetching sitemap index...")
    html = fetch_page(f"{SITE_BASE}/sitemap.xml")
    if not html:
        return []

    loc_pattern = re.compile(r"<loc>(.*?)</loc>")
    all_urls = []

    for raw_loc in loc_pattern.findall(html):
        # Sitemap values are XML-escaped (e.g. "&amp;"); decode before use.
        smap_url = html_mod.unescape(raw_loc.strip())
        if not smap_url.endswith(".xml.gz"):
            continue
        print(" Fetching compressed sitemap...")
        req = urllib.request.Request(smap_url, headers={"User-Agent": USER_AGENT})
        try:
            # Context manager so the connection is closed after reading.
            with urllib.request.urlopen(req, timeout=30) as resp:
                data = gzip.decompress(resp.read()).decode("utf-8")
            urls = [html_mod.unescape(loc.strip()) for loc in loc_pattern.findall(data)]
            all_urls.extend(urls)
            print(f" Found {len(urls)} URLs")
        except Exception as e:
            print(f" Error: {e}")

    return all_urls
|
||||||
|
|
||||||
|
|
||||||
|
def classify_urls(urls):
    """Keep the URLs that may be product pages.

    A URL qualifies when, after removing a trailing slash and the site's
    scheme-and-host part, a single path segment remains that does not start
    with one of the known non-product prefixes. Qualifying URLs are returned
    without their trailing slash, in input order.
    """
    non_product_prefixes = (
        "impressum", "agb", "datenschutz", "kontakt", "widerrufs",
        "versand", "abkuerz", "zertifikat", "wichtige-hinweise",
        "muster-", "gutscheine", "kalender", "flyer", "katalog",
        "sommer-herbst", "unsere-hoefe", "bestellschein",
        "dreschflegel-news", "termine", "rezepte", "anbautipps",
        "tipps-zur", "gartentelefon", "gartenfreude", "buecher",
        "navigation", "vielfalt", "sut20", "saatgut",
        "neuheiten", "kennenlernangebote", "sut25", "vielfalt25",
        "saatgut-vielfalt", "saat",
    )
    site_roots = (
        "https://dreschflegel-saatgut.de/",
        "https://www.dreschflegel-saatgut.de/",
    )

    def is_candidate(trimmed):
        segment = trimmed
        for root in site_roots:
            segment = segment.replace(root, "")
        if segment == "" or "/" in segment:
            return False
        return not segment.startswith(non_product_prefixes)

    trimmed_urls = (raw.rstrip("/") for raw in urls)
    return [trimmed for trimmed in trimmed_urls if is_candidate(trimmed)]
|
||||||
|
|
||||||
|
|
||||||
|
def parse_product_page(html_content):
    """Pull the product fields out of a Dreschflegel product page.

    Returns a dict with ``name`` and ``botanical_name`` plus, when present on
    the page, ``article_number``, ``price`` (float), ``description`` and
    ``pack_info``. Returns None for empty input, for pages without a
    ``class="botname"`` marker, and when name or botanical name is missing.
    """
    if not html_content or 'class="botname"' not in html_content:
        return None

    def capture(pattern, flags=0):
        """Return group 1 of the first match in the page, or None."""
        found = re.search(pattern, html_content, flags)
        return found.group(1) if found else None

    product = {}

    heading = capture(r"<h1>(.*?)</h1>")
    if heading is not None:
        product["name"] = html_mod.unescape(heading.strip())

    botanical = capture(r'<div class="botname">\s*(.*?)\s*</div>', re.DOTALL)
    if botanical is not None:
        product["botanical_name"] = html_mod.unescape(botanical.strip())

    order_number = capture(r'class="product-detail-ordernumber"[^>]*>\s*(\d+)', re.DOTALL)
    if order_number is not None:
        product["article_number"] = order_number

    raw_price = capture(r'itemprop="price"[^>]*content="([^"]+)"')
    if raw_price is not None:
        try:
            product["price"] = float(raw_price)
        except ValueError:
            # A non-numeric price is simply left out.
            pass

    first_paragraph = capture(r"product-detail-description-text.*?<p>(.*?)</p>", re.DOTALL)
    if first_paragraph is not None:
        without_tags = re.sub(r"<[^>]+>", "", first_paragraph.strip())
        description = html_mod.unescape(without_tags).strip()
        if description:
            product["description"] = description

    pack = capture(r"Inhalt reicht f[üu]r:</th><td>\s*(.*?)\s*</td>")
    if pack is not None:
        product["pack_info"] = html_mod.unescape(pack.strip())

    if "name" not in product or "botanical_name" not in product:
        return None
    return product
|
||||||
|
|
||||||
|
|
||||||
|
def scrape_all_products(candidate_urls):
    """Scrape product pages, using cache for already-scraped URLs.

    The cache file maps URL -> product dict, or None for a page that was
    fetched but is not a product page. Pages that could not be fetched are
    not cached, so they are retried on the next run.

    Returns:
        The list of product dicts for ``candidate_urls`` (cached entries
        first, then newly scraped ones).
    """
    def save_cache():
        with open(CACHE_FILE, "w", encoding="utf-8") as f:
            json.dump(cache, f)

    # Load cache
    cache = {}
    if os.path.exists(CACHE_FILE):
        try:
            with open(CACHE_FILE, "r", encoding="utf-8") as f:
                cache = json.load(f)
        except ValueError as e:
            # An interrupted earlier save can leave truncated JSON behind;
            # start over rather than abort.
            print(f" Ignoring unreadable cache file: {e}")
            cache = {}
        print(f" Loaded {len(cache)} cached products")

    to_fetch = [u for u in candidate_urls if u not in cache]
    already_cached = [u for u in candidate_urls if u in cache]

    # Add cached products; None means "not a product page"
    products = [cache[u] for u in already_cached if cache[u]]

    cached_products = len(products)
    cached_non_products = len(already_cached) - cached_products
    print(f" {cached_products} products from cache, "
          f"{cached_non_products} non-products cached, "
          f"{len(to_fetch)} to fetch")

    for i, url in enumerate(to_fetch):
        if (i + 1) % 50 == 0 or i == 0:
            print(f" Fetching {i + 1}/{len(to_fetch)}...")

        time.sleep(DELAY)
        html_content = fetch_page(url)
        if not html_content:
            # A failed fetch says nothing about the page, so leave it out of
            # the cache instead of marking it as a non-product forever.
            stats["fetch_errors"] += 1
            continue

        product = parse_product_page(html_content)
        if product:
            product["url"] = url
            products.append(product)
            cache[url] = product
            stats["products_scraped"] += 1
        else:
            cache[url] = None
            stats["not_product_pages"] += 1

        # Save cache periodically
        if (i + 1) % 100 == 0:
            save_cache()

    # Final cache save
    save_cache()

    print(f" Total: {len(products)} products ({stats['products_scraped']} newly scraped)")
    return products
|
||||||
|
|
||||||
|
|
||||||
|
def paginated_get(path):
    """Collect the ``data`` records of every page of an API endpoint.

    Requests pages of 100 records starting at page 1 and stops at the first
    response that is empty, lacks ``data``, or holds fewer than 100 records.
    """
    separator = "&" if "?" in path else "?"
    collected = []
    page = 1
    while True:
        resp = api_request("GET", f"{path}{separator}per_page=100&page={page}")
        has_rows = bool(resp) and "data" in resp and bool(resp["data"])
        if not has_rows:
            return collected
        rows = resp["data"]
        collected.extend(rows)
        if len(rows) < 100:
            return collected
        page += 1
|
||||||
|
|
||||||
|
|
||||||
|
def load_api_data():
    """Read families, species and cultivars from HerbAPI into lookup dicts.

    Returns:
        A ``(families, species, cultivars)`` tuple. Families and species are
        keyed by lower-cased scientific name (species keys also stripped);
        cultivars are keyed by ``(species_id, lower-cased stripped name)``.
    """
    print("Loading HerbAPI data...")

    families = {
        record["name_scientific"].lower(): record
        for record in paginated_get("/families")
    }
    print(f" {len(families)} families")

    species = {
        record["name_scientific"].lower().strip(): record
        for record in paginated_get("/species")
    }
    print(f" {len(species)} species")

    cultivars = {
        (record["species_id"], record["name"].lower().strip()): record
        for record in paginated_get("/cultivars")
    }
    print(f" {len(cultivars)} cultivars")

    return families, species, cultivars
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_supplier():
    """Return the Dreschflegel supplier record, creating it if necessary.

    The existing suppliers are scanned for a name containing
    "dreschflegel" (case-insensitive). If none is found, a new supplier is
    POSTed.

    Returns:
        The supplier record, or ``None`` when the supplier neither exists
        nor could be created.
    """
    existing = api_request("GET", "/suppliers")
    for candidate in existing or ():
        if "dreschflegel" in candidate["name"].lower():
            print(f" Supplier exists: {candidate['name']} ({candidate['id']})")
            return candidate

    payload = {
        "name": "Dreschflegel",
        "url": "https://www.dreschflegel-saatgut.de",
        "country": "DE",
        "is_organic": True,
        "is_demeter": False,
        "notes": "German organic seed cooperative, open-pollinated heritage varieties",
    }
    created = api_request("POST", "/suppliers", payload)
    if not created:
        return None
    print(f" Created supplier: {created['name']} ({created['id']})")
    return created
|
||||||
|
|
||||||
|
|
||||||
|
# Genus → family mapping for species creation.
# Keys are capitalised genus names exactly as they appear as the first word of
# a botanical name; values are the scientific family name.
# Every genus is listed exactly once: a repeated key in a dict literal is
# silently overwritten, which hides accidental disagreements.
GENUS_TO_FAMILY = {
    # Asteraceae
    "Achillea": "Asteraceae", "Artemisia": "Asteraceae", "Aster": "Asteraceae",
    "Calendula": "Asteraceae", "Carthamus": "Asteraceae", "Centaurea": "Asteraceae",
    "Chamomilla": "Asteraceae", "Chrysanthemum": "Asteraceae", "Cichorium": "Asteraceae",
    "Cnicus": "Asteraceae", "Cosmos": "Asteraceae", "Cynara": "Asteraceae",
    "Dahlia": "Asteraceae", "Dimorphotheca": "Asteraceae", "Echinacea": "Asteraceae",
    "Echinops": "Asteraceae", "Erigeron": "Asteraceae", "Eupatorium": "Asteraceae",
    "Gaillardia": "Asteraceae", "Helenium": "Asteraceae", "Helianthus": "Asteraceae",
    "Helichrysum": "Asteraceae", "Inula": "Asteraceae", "Lactuca": "Asteraceae",
    "Leontodon": "Asteraceae", "Matricaria": "Asteraceae", "Onopordum": "Asteraceae",
    "Petasites": "Asteraceae", "Rudbeckia": "Asteraceae", "Scorzonera": "Asteraceae",
    "Silphium": "Asteraceae", "Solidago": "Asteraceae", "Tagetes": "Asteraceae",
    "Tanacetum": "Asteraceae", "Taraxacum": "Asteraceae", "Telekia": "Asteraceae",
    "Tragopogon": "Asteraceae", "Tussilago": "Asteraceae", "Zinnia": "Asteraceae",
    "Xerochrysum": "Asteraceae", "Coreopsis": "Asteraceae",
    # Solanaceae
    "Capsicum": "Solanaceae", "Lycium": "Solanaceae", "Nicotiana": "Solanaceae",
    "Physalis": "Solanaceae", "Solanum": "Solanaceae", "Atropa": "Solanaceae",
    # Cucurbitaceae
    "Citrullus": "Cucurbitaceae", "Cucumis": "Cucurbitaceae", "Cucurbita": "Cucurbitaceae",
    "Luffa": "Cucurbitaceae", "Momordica": "Cucurbitaceae",
    # Fabaceae
    "Cicer": "Fabaceae", "Glycine": "Fabaceae", "Lathyrus": "Fabaceae",
    "Lens": "Fabaceae", "Lupinus": "Fabaceae", "Medicago": "Fabaceae",
    "Phaseolus": "Fabaceae", "Pisum": "Fabaceae", "Trifolium": "Fabaceae",
    "Trigonella": "Fabaceae", "Vicia": "Fabaceae", "Vigna": "Fabaceae",
    "Caragana": "Fabaceae", "Cytisus": "Fabaceae", "Robinia": "Fabaceae",
    # Brassicaceae
    "Armoracia": "Brassicaceae", "Barbarea": "Brassicaceae", "Brassica": "Brassicaceae",
    "Crambe": "Brassicaceae", "Eruca": "Brassicaceae", "Hesperis": "Brassicaceae",
    "Iberis": "Brassicaceae", "Isatis": "Brassicaceae", "Lepidium": "Brassicaceae",
    "Lunaria": "Brassicaceae", "Raphanus": "Brassicaceae", "Sinapis": "Brassicaceae",
    "Nasturtium": "Brassicaceae", "Diplotaxis": "Brassicaceae",
    # Apiaceae
    "Anethum": "Apiaceae", "Anthriscus": "Apiaceae", "Apium": "Apiaceae",
    "Carum": "Apiaceae", "Chaerophyllum": "Apiaceae", "Coriandrum": "Apiaceae",
    "Daucus": "Apiaceae", "Foeniculum": "Apiaceae", "Levisticum": "Apiaceae",
    "Myrrhis": "Apiaceae", "Pastinaca": "Apiaceae", "Petroselinum": "Apiaceae",
    "Pimpinella": "Apiaceae", "Angelica": "Apiaceae", "Aegopodium": "Apiaceae",
    # Lamiaceae
    "Agastache": "Lamiaceae", "Ajuga": "Lamiaceae", "Dracocephalum": "Lamiaceae",
    "Elsholtzia": "Lamiaceae", "Hyssopus": "Lamiaceae", "Lavandula": "Lamiaceae",
    "Melissa": "Lamiaceae", "Mentha": "Lamiaceae", "Monarda": "Lamiaceae",
    "Nepeta": "Lamiaceae", "Ocimum": "Lamiaceae", "Origanum": "Lamiaceae",
    "Perilla": "Lamiaceae", "Rosmarinus": "Lamiaceae", "Salvia": "Lamiaceae",
    "Satureja": "Lamiaceae", "Stachys": "Lamiaceae", "Thymus": "Lamiaceae",
    # Amaryllidaceae / Alliaceae
    "Allium": "Amaryllidaceae",
    # Poaceae
    "Avena": "Poaceae", "Hordeum": "Poaceae", "Panicum": "Poaceae",
    "Secale": "Poaceae", "Sorghum": "Poaceae", "Triticum": "Poaceae",
    "Zea": "Poaceae", "Setaria": "Poaceae",
    # Chenopodiaceae
    "Atriplex": "Chenopodiaceae", "Beta": "Chenopodiaceae",
    "Chenopodium": "Chenopodiaceae", "Spinacia": "Chenopodiaceae",
    # Rosaceae
    "Filipendula": "Rosaceae", "Fragaria": "Rosaceae", "Malus": "Rosaceae",
    "Prunus": "Rosaceae", "Pyrus": "Rosaceae", "Rosa": "Rosaceae",
    "Rubus": "Rosaceae", "Sanguisorba": "Rosaceae", "Sorbus": "Rosaceae",
    "Waldsteinia": "Rosaceae",
    # Boraginaceae
    "Borago": "Boraginaceae", "Phacelia": "Boraginaceae", "Symphytum": "Boraginaceae",
    "Pulmonaria": "Boraginaceae", "Myosotis": "Boraginaceae",
    # Malvaceae
    "Alcea": "Malvaceae", "Althaea": "Malvaceae", "Malva": "Malvaceae",
    "Hibiscus": "Malvaceae", "Lavatera": "Malvaceae", "Abelmoschus": "Malvaceae",
    # Polygonaceae
    "Fagopyrum": "Polygonaceae", "Rheum": "Polygonaceae", "Rumex": "Polygonaceae",
    # Caryophyllaceae
    "Agrostemma": "Caryophyllaceae", "Dianthus": "Caryophyllaceae",
    "Gypsophila": "Caryophyllaceae", "Lychnis": "Caryophyllaceae",
    "Saponaria": "Caryophyllaceae", "Silene": "Caryophyllaceae",
    # Tropaeolaceae
    "Tropaeolum": "Tropaeolaceae",
    # Papaveraceae
    "Eschscholzia": "Papaveraceae", "Papaver": "Papaveraceae",
    "Meconopsis": "Papaveraceae",
    # Caprifoliaceae (plus Sambucus, which is Adoxaceae)
    "Valerianella": "Caprifoliaceae", "Valeriana": "Caprifoliaceae",
    "Lonicera": "Caprifoliaceae", "Sambucus": "Adoxaceae",
    # Plantaginaceae
    "Digitalis": "Plantaginaceae", "Plantago": "Plantaginaceae",
    "Antirrhinum": "Plantaginaceae", "Linaria": "Plantaginaceae",
    # Violaceae
    "Viola": "Violaceae",
    # Ranunculaceae
    "Aquilegia": "Ranunculaceae", "Consolida": "Ranunculaceae",
    "Delphinium": "Ranunculaceae", "Nigella": "Ranunculaceae",
    # Linaceae
    "Linum": "Linaceae",
    # Convolvulaceae
    "Ipomoea": "Convolvulaceae", "Convolvulus": "Convolvulaceae",
    # Portulacaceae / Montiaceae
    "Claytonia": "Montiaceae", "Portulaca": "Portulacaceae",
    # Amaranthaceae
    "Amaranthus": "Amaranthaceae", "Celosia": "Amaranthaceae",
    "Gomphrena": "Amaranthaceae",
    # Asparagaceae
    "Asparagus": "Asparagaceae",
    # Resedaceae
    "Reseda": "Resedaceae",
    # Balsaminaceae
    "Impatiens": "Balsaminaceae",
    # Hydrangeaceae
    "Hydrangea": "Hydrangeaceae",
    # Campanulaceae
    "Campanula": "Campanulaceae", "Phyteuma": "Campanulaceae",
    # Scrophulariaceae
    "Verbascum": "Scrophulariaceae",
    # Verbenaceae
    "Verbena": "Verbenaceae",
    # Onagraceae
    "Oenothera": "Onagraceae", "Clarkia": "Onagraceae",
    # Cucurbitaceae extras
    "Benincasa": "Cucurbitaceae", "Lagenaria": "Cucurbitaceae",
    # Hypericaceae
    "Hypericum": "Hypericaceae",
    # Others
    "Dipsacus": "Caprifoliaceae",
    "Knautia": "Caprifoliaceae",
    "Scabiosa": "Caprifoliaceae",
    "Succisa": "Caprifoliaceae",
    "Asclepias": "Apocynaceae",
    "Cynoglossum": "Boraginaceae",
    "Echium": "Boraginaceae",
    "Anchusa": "Boraginaceae",
    "Lithospermum": "Boraginaceae",
    "Onobrychis": "Fabaceae",
    "Ornithopus": "Fabaceae",
    "Lotus": "Fabaceae",
    "Anthyllis": "Fabaceae",
    "Melilotus": "Fabaceae",
    "Galega": "Fabaceae",
    "Lespedeza": "Fabaceae",
    "Arachis": "Fabaceae",
    "Senna": "Fabaceae",
    # Additional genera found in Dreschflegel catalog
    "Acmella": "Asteraceae", "Adonis": "Ranunculaceae", "Ageratum": "Asteraceae",
    "Amethystia": "Lamiaceae", "Anacyclus": "Asteraceae", "Anthemis": "Asteraceae",
    "Asphodeline": "Asphodelaceae", "Brachyscome": "Asteraceae", "Bupleurum": "Apiaceae",
    "Callistephus": "Asteraceae", "Camelina": "Brassicaceae", "Cardaria": "Brassicaceae",
    "Cardiospermum": "Sapindaceae", "Cerinthe": "Boraginaceae",
    "Chamaemelum": "Asteraceae", "Cistanthe": "Montiaceae", "Cleome": "Cleomaceae",
    "Cochlearia": "Brassicaceae", "Codonopsis": "Campanulaceae", "Coix": "Poaceae",
    "Cyperus": "Cyperaceae", "Digitaria": "Poaceae", "Dorotheanthus": "Aizoaceae",
    "Emilia": "Asteraceae", "Eragrostis": "Poaceae", "Erysimum": "Brassicaceae",
    "Euphorbia": "Euphorbiaceae", "Gentiana": "Gentianaceae", "Geum": "Rosaceae",
    "Gilia": "Polemoniaceae", "Godetia": "Onagraceae", "Helipterum": "Asteraceae",
    "Lallemantia": "Lamiaceae", "Leonurus": "Lamiaceae", "Leuzea": "Asteraceae",
    "Liatris": "Asteraceae", "Malope": "Malvaceae", "Marrubium": "Lamiaceae",
    "Matthiola": "Brassicaceae", "Maurandya": "Plantaginaceae",
    "Melothria": "Cucurbitaceae", "Meum": "Apiaceae", "Nemesia": "Scrophulariaceae",
    # NOTE(review): "Nicotinia" is presumably the catalog's misspelling of
    # "Nicotiana" kept so that such listings still resolve - confirm.
    "Nicandra": "Solanaceae", "Nicotinia": "Solanaceae", "Oenanthe": "Apiaceae",
    "Oxalis": "Oxalidaceae", "Pennisetum": "Poaceae", "Penstemon": "Plantaginaceae",
    "Phlox": "Polemoniaceae", "Polemonium": "Polemoniaceae",
    "Porophyllum": "Asteraceae", "Primula": "Primulaceae", "Psyllium": "Plantaginaceae",
    "Quamoclit": "Convolvulaceae", "Ruta": "Rutaceae", "Salpiglossis": "Solanaceae",
    "Sanvitalia": "Asteraceae", "Sideritis": "Lamiaceae", "Silybum": "Asteraceae",
    "Talinum": "Talinaceae", "Thelesperma": "Asteraceae", "Vaccaria": "Caryophyllaceae",
    "Veronica": "Plantaginaceae", "Xeranthemum": "Asteraceae",
}
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_species_name(botanical_name):
    """Split a botanical name into ``(genus, species_epithet)`` for matching.

    Anything after the species epithet (``var.``, ``subsp.``, cultivar
    names, ...) is dropped. Hybrid names are normalised to the ASCII form
    ``"x epithet"`` whether the source writes ``Genus x epithet``,
    ``Genus × epithet`` or ``Genus ×epithet``.

    Returns:
        ``(None, None)`` when the name has fewer than two words,
        ``(genus, None)`` when the second word is only a rank/placeholder
        marker (``var.``, ``subsp.``, ``ssp.``, ``spec.``, ``sp.``), and
        ``(genus, epithet)`` otherwise.
    """
    parts = botanical_name.strip().split()
    if len(parts) < 2:
        return None, None

    genus, second = parts[0], parts[1]

    # 'Genus x species' / 'Genus × species' (spaced hybrid notation)
    if second in ("x", "×") and len(parts) >= 3:
        return genus, f"x {parts[2]}"

    # 'Genus ×species' (multiplication sign attached to the epithet)
    if second.startswith("×") and len(second) > 1:
        return genus, f"x {second[1:]}"

    if second in ("var.", "subsp.", "ssp.", "spec.", "sp."):
        # Only genus level - can't match to species
        return genus, None

    return genus, second
|
||||||
|
|
||||||
|
|
||||||
|
def _first_epithet(name_key):
    """Return the first epithet of a lower-cased name, skipping hybrid markers."""
    for token in name_key.split()[1:]:
        token = token.lstrip("×")
        if token and token != "x":
            return token
    return None


def find_species(botanical_name, species_cache):
    """Find an existing species record matching a botanical name.

    Lookup order:

    1. Exact ``"genus epithet"`` key (lower-cased) in *species_cache*.
    2. If the cache holds exactly one species of that genus, use it - but
       only when the name carries no epithet, or the epithet agrees with
       that record. Without this check a name such as
       "Solanum lycopersicum" would be attached to a lone
       "solanum tuberosum" entry.

    Args:
        botanical_name: Name as scraped from the catalog.
        species_cache: Dict of species records keyed by lower-cased
            scientific name.

    Returns:
        The matching species record, or ``None``.
    """
    genus, sp = normalize_species_name(botanical_name)
    if not genus:
        return None

    wanted_epithet = None
    if sp:
        # Try exact genus+species
        search_key = f"{genus} {sp}".lower()
        if search_key in species_cache:
            return species_cache[search_key]
        wanted_epithet = _first_epithet(search_key)

    # Fall back to the genus when it is unambiguous
    genus_prefix = genus.lower() + " "
    candidates = [
        (key, record)
        for key, record in species_cache.items()
        if key.startswith(genus_prefix)
    ]
    if len(candidates) != 1:
        return None

    key, record = candidates[0]
    if wanted_epithet is not None and _first_epithet(key) != wanted_epithet:
        # Different species of the same genus - not a match.
        return None
    return record
|
||||||
|
|
||||||
|
|
||||||
|
def find_or_create_species(botanical_name, families, species_cache):
    """Return the species record for a botanical name, creating it if needed.

    An existing species is looked up first. Otherwise the family is
    resolved through ``GENUS_TO_FAMILY`` (and created via the API when it
    is missing from *families*), then the species is POSTed. When a POST
    fails, the corresponding collection is re-fetched in case the record
    already exists.

    Both *families* and *species_cache* are updated in place, and the
    module-level ``stats`` counters are incremented.

    Returns:
        The species record, or ``None`` when the name has no usable
        epithet, the genus has no family mapping, or creation failed.
    """
    existing = find_species(botanical_name, species_cache)
    if existing:
        return existing

    genus, species_epithet = normalize_species_name(botanical_name)
    if not (genus and species_epithet):
        stats["species_no_epithet"] += 1
        return None

    sci_name = f"{genus} {species_epithet}"
    sci_key = sci_name.lower()

    # The normalised name may still be cached.
    if sci_key in species_cache:
        return species_cache[sci_key]

    # A new species needs a family.
    family_name = GENUS_TO_FAMILY.get(genus)
    if not family_name:
        stats["species_no_family"] += 1
        print(f" [SKIP] No family mapping for genus: {genus} ({botanical_name})")
        return None

    family_key = family_name.lower()
    family = families.get(family_key)
    if not family:
        print(f" Creating family: {family_name}")
        created_family = api_request("POST", "/families", {"name_scientific": family_name})
        if created_family:
            families[family_key] = created_family
            family = created_family
            stats["families_created"] += 1
        else:
            # May already exist (duplicate from previous run) - reload
            for fam in paginated_get("/families"):
                if fam["name_scientific"].lower() == family_key:
                    families[family_key] = fam
                    family = fam
                    break
        if not family:
            print(f" [SKIP] Cannot create family: {family_name}")
            return None

    print(f" Creating species: {sci_name} (family: {family_name})")
    created_species = api_request("POST", "/species", {
        "name_scientific": sci_name,
        "family_id": family["id"],
    })
    if created_species:
        species_cache[sci_key] = created_species
        stats["species_created"] += 1
        return created_species

    # May already exist - try to find it
    time.sleep(0.1)
    for rec in paginated_get("/species"):
        if rec["name_scientific"].lower() == sci_key:
            species_cache[sci_key] = rec
            return rec
    return None
|
||||||
|
|
||||||
|
|
||||||
|
# Common German crop type prefixes that precede the variety name in product
# titles. Sorted once, longest first, so that e.g. "Winterkopfsalat" is
# tried before "Kopfsalat" and "Salattomate" before "Tomate".
_CROP_TYPE_PREFIXES = tuple(sorted((
    # Tomatoes
    "Salattomate", "Stabtomate", "Buschtomate", "Cocktailtomate",
    "Cherrytomate", "Fleischtomate", "Wildtomate", "Balkontomate",
    "Flaschentomate", "Eitomate", "Datteltomate", "Tomate",
    # Lettuce
    "Winterkopfsalat", "Kopfsalat", "Bataviasalat", "Eissalat",
    "Blattsalat", "Schnittsalat", "Pflücksalat", "Römersalat",
    "Spargelsalat", "Romanasalat",
    # Beans
    "Buschbohne", "Stangenbohne", "Feuerbohne", "Puffbohne",
    "Prunkbohne",
    # Peas
    "Markerbse", "Zuckererbse", "Palerbse", "Schalerbse",
    "Knackerbse", "Kapuzinererbse",
    # Cucumbers
    "Einlegegurke", "Salatgurke", "Schälgurke", "Landgurke",
    "Freilandgurke",
    # Squash
    "Hokkaidokürbis", "Butternutkürbis", "Speisekürbis",
    "Riesenkürbis", "Zierkürbis", "Muskatkürbis", "Ölkürbis",
    # Melon
    "Wassermelone", "Zuckermelone",
    # Peppers
    "Gemüsepaprika", "Blockpaprika", "Spitzpaprika", "Tomatenpaprika",
    "Snackpaprika", "Peperoni", "Chili",
    # Brassicas
    "Kohlrabi", "Brokkoli", "Blumenkohl", "Grünkohl", "Rosenkohl",
    "Wirsing", "Rotkohl", "Weißkohl", "Spitzkohl", "Palmkohl",
    "Chinakohl", "Pak Choi", "Markstammkohl",
    # Root veg
    "Möhre", "Karotte", "Pastinake", "Rote Bete", "Rote Beete",
    "Herbstrübe", "Mairübe", "Stoppelrübe", "Schwarzer Rettich",
    "Steckrübe", "Knollensellerie", "Petersilienwurzel",
    "Rettich", "Radieschen",
    # Onions
    "Winterheckenzwiebel", "Lauchzwiebel", "Speisezwiebel",
    "Schalotte", "Wintersteckzwiebel", "Zwiebel",
    # Herbs
    "Rotes Basilikum", "Buschbasilikum", "Zitronen-Basilikum",
    "Thai-Basilikum", "Wildes Basilikum", "Zimtbasilikum",
    "Basilikum", "Schnittknoblauch",
    # Grains
    "Sommerweizen", "Winterweizen", "Sommerroggen", "Winterroggen",
    "Nackthafer", "Nacktgerste", "Dinkel", "Emmer", "Einkorn",
    # Misc
    "Zuckermais", "Popcornmais",
    "Zucchini",
), key=len, reverse=True))


def extract_cultivar_name(product_name):
    """Extract the cultivar/variety name from the full product name.

    The longest known crop-type prefix that is followed by a space is
    removed from the start of the (stripped) product name. Names without
    such a prefix - including a bare crop type such as "Tomate" - are
    returned stripped but otherwise unchanged.
    """
    name = product_name.strip()
    for prefix in _CROP_TYPE_PREFIXES:
        if name.startswith(prefix + " "):
            return name[len(prefix):].strip()
    return name
|
||||||
|
|
||||||
|
|
||||||
|
def get_existing_supplier_links(cultivar_id, supplier_id):
    """Report whether the cultivar is already linked to the supplier.

    Returns:
        ``True`` when one of the cultivar's supplier links carries
        *supplier_id*; ``False`` otherwise, including when the API call
        yields nothing.
    """
    links = api_request("GET", f"/cultivars/{cultivar_id}/suppliers")
    if not links:
        return False
    return any(link["supplier_id"] == supplier_id for link in links)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run the full Dreschflegel import.

    Steps: ensure the supplier exists, load the current HerbAPI data,
    fetch and classify the sitemap URLs, scrape the product pages, then for
    every product resolve its species, find or create the cultivar, and
    link the cultivar to the supplier. A summary of the ``stats`` counters
    is printed at the end.

    Exits with status 1 when the supplier or the sitemap is unavailable.
    """
    banner = "=" * 60
    print(banner)
    print("Dreschflegel Seed Catalog Scraper for HerbAPI (v2)")
    print(banner)

    # Step 1: Supplier
    print("\n[1] Setting up supplier...")
    supplier = ensure_supplier()
    if not supplier:
        print("FATAL: Could not create/find supplier")
        sys.exit(1)
    supplier_id = supplier["id"]

    # Step 2: Load API data
    print("\n[2] Loading existing HerbAPI data...")
    families, species_cache, cultivar_cache = load_api_data()

    # Step 3: Get product URLs
    print("\n[3] Fetching sitemap...")
    all_urls = get_sitemap_urls()
    if not all_urls:
        print("FATAL: Could not fetch sitemap")
        sys.exit(1)
    candidate_urls = classify_urls(all_urls)
    print(f" {len(all_urls)} total URLs, {len(candidate_urls)} product candidates")

    # Step 4: Scrape
    print(f"\n[4] Scraping product pages...")
    products = scrape_all_products(candidate_urls)

    # Step 5: Import
    total = len(products)
    print(f"\n[5] Importing {total} products into HerbAPI...")

    for position, product in enumerate(products, start=1):
        if position % 50 == 0:
            print(f" Processing {position}/{total}...")

        botanical = product.get("botanical_name", "")
        if not botanical:
            stats["no_botanical"] += 1
            continue

        # Find or create species
        species_record = find_or_create_species(botanical, families, species_cache)
        if not species_record:
            stats["species_not_matched"] += 1
            continue

        species_id = species_record["id"]
        cultivar_name = extract_cultivar_name(product["name"])
        normalized_name = cultivar_name.lower().strip()

        # Check if cultivar already exists
        cv_key = (species_id, normalized_name)
        if cv_key in cultivar_cache:
            cv = cultivar_cache[cv_key]
            stats["cultivars_existing"] += 1
        else:
            cv_data = {
                "species_id": species_id,
                "name": cultivar_name,
                "is_organic": True,
            }
            if product.get("description"):
                cv_data["description"] = product["description"]

            cv = api_request("POST", "/cultivars", cv_data)
            if cv:
                cultivar_cache[cv_key] = cv
                stats["cultivars_created"] += 1
            else:
                # Might already exist from previous run - try to find it
                for candidate in paginated_get(f"/cultivars?species_id={species_id}"):
                    if candidate["name"].lower().strip() == normalized_name:
                        cultivar_cache[cv_key] = candidate
                        cv = candidate
                        stats["cultivars_existing"] += 1
                        break
                else:
                    # Neither created nor found: skip this product.
                    stats["cultivar_create_errors"] += 1
                    continue

        # Link to supplier (check first for idempotency)
        if get_existing_supplier_links(cv["id"], supplier_id):
            stats["supplier_links_existing"] += 1
            continue

        link_data = {
            "supplier_id": supplier_id,
            "article_number": product.get("article_number", ""),
            "product_url": product.get("url", ""),
            "price_eur": product.get("price"),
        }
        pack_info = product.get("pack_info", "")
        if pack_info:
            pack_match = re.search(r"ca\.?\s*(\d+)\s*(Pfl|Korn|Samen|g|kg|ml)", pack_info)
            if pack_match:
                amount, unit = pack_match.group(1), pack_match.group(2)
                unit_map = {"Pfl": "Pflanzen", "Korn": "Korn", "Samen": "Korn"}
                link_data["pack_size"] = float(amount)
                link_data["pack_unit"] = unit_map.get(unit, unit)

        link_resp = api_request("POST", f"/cultivars/{cv['id']}/suppliers", link_data)
        outcome = "supplier_links_created" if link_resp else "supplier_link_errors"
        stats[outcome] += 1

    # Summary
    print("\n" + banner)
    print("RESULTS")
    print(banner)
    for key, val in sorted(stats.items()):
        print(f" {key}: {val}")
    print(f"\n Total species in DB: {len(species_cache)}")
    print(f" Total cultivars tracked: {len(cultivar_cache)}")
|
||||||
|
|
||||||
|
|
||||||
|
# Run the import only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||||
@@ -0,0 +1,380 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Scrape Magic Garden Seeds product pages and update herbapi database."""
|
||||||
|
|
||||||
|
import subprocess
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Base psql invocation: tuples only (-t), unaligned output (-A), '|' as the
# field separator (-F|). The SQL text is appended with '-c' by run_sql().
DB_CMD = [
    'psql', '-h', '10.31.3.90', '-U', 'herbapi', 'herbapi',
    '-t', '-A', '-F|'
]

# Environment for the psql subprocess. A PGPASSWORD already present in the
# caller's environment takes precedence; the literal below is only a fallback
# so that existing invocations keep working.
# SECURITY: this credential is committed to source control - rotate it and
# supply it via the environment (or ~/.pgpass) instead, then drop the fallback.
DB_ENV = dict(os.environ)
DB_ENV.setdefault('PGPASSWORD', '_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj')

# Lower-case English month name -> month number (1-12).
MONTH_MAP = {
    'january': 1, 'february': 2, 'march': 3, 'april': 4,
    'may': 5, 'june': 6, 'july': 7, 'august': 8,
    'september': 9, 'october': 10, 'november': 11, 'december': 12,
}
|
||||||
|
|
||||||
|
|
||||||
|
def run_sql(sql):
    """Execute *sql* through psql and return its stripped standard output.

    The command is ``DB_CMD`` plus ``-c sql`` and runs with ``DB_ENV``.
    A non-zero psql exit status is reported on stderr; the captured stdout
    is still returned so that callers keep receiving a string.

    Returns:
        psql's stdout with surrounding whitespace removed (``''`` when
        nothing was printed).
    """
    result = subprocess.run(
        DB_CMD + ['-c', sql],
        capture_output=True, text=True, env=DB_ENV
    )
    if result.returncode != 0:
        # Without this, a failed query looks exactly like "no rows".
        print(
            f"[run_sql] psql exited with status {result.returncode}: "
            f"{result.stderr.strip()}",
            file=sys.stderr,
        )
    return result.stdout.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_page(url):
    """Download *url* with curl and return the response body as text.

    curl runs silently, follows redirects, and is limited to 15 seconds
    (``-sL --max-time 15``). The exit status is not inspected; whatever
    curl wrote to stdout is returned.
    """
    curl_cmd = ['curl', '-sL', '--max-time', '15', url]
    completed = subprocess.run(curl_cmd, capture_output=True, text=True)
    return completed.stdout
|
||||||
|
|
||||||
|
|
||||||
|
def parse_months(text):
    """Return the sorted month numbers whose English names occur in *text*.

    Matching is a case-insensitive substring search against the names in
    ``MONTH_MAP``, longest name first; each matched name is removed from
    the working text before the next name is tried.

    Returns:
        A sorted list of month numbers, or ``None`` when *text* is empty
        or names no month.
    """
    if not text:
        return None

    remaining = text.lower().strip()
    found = []
    longest_first = sorted(MONTH_MAP.items(), key=lambda item: len(item[0]), reverse=True)
    for name, number in longest_first:
        if name not in remaining:
            continue
        if number not in found:
            found.append(number)
        remaining = remaining.replace(name, '')

    if not found:
        return None
    found.sort()
    return found
|
||||||
|
|
||||||
|
|
||||||
|
def parse_depth(text):
    """Parse a sowing depth in centimetres from free text.

    A range such as ``"1 - 2 cm"`` yields its midpoint rounded to one
    decimal; a single value such as ``"0,5 cm"`` is returned as is. Both
    ``.`` and ``,`` are accepted as the decimal separator.

    Returns:
        The depth as a float, or ``None`` when *text* is empty or holds no
        ``cm`` value.
    """
    if not text:
        return None

    def as_number(raw):
        return float(raw.replace(',', '.'))

    span = re.search(r'(\d+(?:[.,]\d+)?)\s*-\s*(\d+(?:[.,]\d+)?)\s*cm', text)
    if span is not None:
        low = as_number(span.group(1))
        high = as_number(span.group(2))
        return round((low + high) / 2, 1)

    single = re.search(r'(\d+(?:[.,]\d+)?)\s*cm', text)
    if single is not None:
        return as_number(single.group(1))
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def parse_spacing(text):
    """Parse planting distance. Returns (row_spacing, plant_spacing).

    Recognised forms, tried in this order on the lower-cased text:

    * ``"X x Y cm"`` (``x`` or ``×``): X is the plant spacing and Y the
      row spacing, i.e. the result is ``(Y, X)``.
    * ``"X - Y cm"``: the midpoint, rounded to one decimal, as plant
      spacing.
    * ``"X cm"``: X as plant spacing.

    Both ``.`` and ``,`` are accepted as the decimal separator, matching
    ``parse_depth`` (previously ``"2,5 cm"`` was read as ``5.0``).

    Returns:
        ``(row_spacing, plant_spacing)`` as floats; either element is
        ``None`` when it cannot be determined.
    """
    if not text:
        return None, None
    text = text.lower().strip()

    def as_number(raw):
        return float(raw.replace(',', '.'))

    # "X x Y cm"
    match = re.search(r'(\d+(?:[.,]\d+)?)\s*(?:x|×)\s*(\d+(?:[.,]\d+)?)\s*cm', text)
    if match:
        return as_number(match.group(2)), as_number(match.group(1))

    # "X - Y cm" range -> average as plant spacing
    match = re.search(r'(\d+(?:[.,]\d+)?)\s*-\s*(\d+(?:[.,]\d+)?)\s*cm', text)
    if match:
        midpoint = (as_number(match.group(1)) + as_number(match.group(2))) / 2
        return None, round(midpoint, 1)

    # Single value
    match = re.search(r'(\d+(?:[.,]\d+)?)\s*cm', text)
    if match:
        return None, as_number(match.group(1))

    return None, None
|
||||||
|
|
||||||
|
|
||||||
|
def parse_germination_days(text):
    """Parse a germination time from free text and return it in days.

    Tried in order on the lower-cased text: a week range, a single week
    count, a day range, a single day count. Ranges yield their rounded
    midpoint; weeks are converted with 7 days per week.

    Returns:
        The number of days as an int, or ``None`` when *text* is empty or
        contains no recognised duration.
    """
    if not text:
        return None
    lowered = text.lower()

    def midpoint(hit):
        return (int(hit.group(1)) + int(hit.group(2))) / 2

    # Order matters: ranges before single values, weeks before days.
    rules = (
        (r'(\d+)\s*-\s*(\d+)\s*weeks?', lambda hit: int(round(midpoint(hit) * 7))),
        (r'(\d+)\s*weeks?', lambda hit: int(hit.group(1)) * 7),
        (r'(\d+)\s*-\s*(\d+)\s*days?', lambda hit: int(round(midpoint(hit)))),
        (r'(\d+)\s*days?', lambda hit: int(hit.group(1))),
    )
    for pattern, to_days in rules:
        hit = re.search(pattern, lowered)
        if hit:
            return to_days(hit)
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def parse_germ_temp(text):
    """Parse a germination temperature from text containing a ``°`` sign.

    A range such as ``"18-22°C"`` yields its midpoint rounded to one
    decimal; a single value such as ``"20 °C"`` is returned as a float.
    Only whole-number temperatures are recognised.

    Returns:
        The temperature as a float, or ``None`` when *text* is empty or
        holds no degree value.
    """
    if not text:
        return None

    span = re.search(r'(\d+)\s*-\s*(\d+)\s*°', text)
    if span is not None:
        low, high = float(span.group(1)), float(span.group(2))
        return round((low + high) / 2, 1)

    single = re.search(r'(\d+)\s*°', text)
    return float(single.group(1)) if single is not None else None
|
||||||
|
|
||||||
|
|
||||||
|
def parse_lifecycle(text):
    """Classify a lifecycle description as perennial or not.

    Returns:
        ``True`` when the text mentions "perennial" (checked first, so
        "annual or perennial" counts as perennial), ``False`` when it
        mentions "annual" or "biennial", and ``None`` when *text* is empty
        or contains none of these words.
    """
    if not text:
        return None

    lowered = text.lower().strip()
    if 'perennial' in lowered:
        return True
    if any(word in lowered for word in ('annual', 'biennial')):
        return False
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def parse_light(text):
    """Normalise a sunlight description to one of a few fixed labels.

    The lower-cased, stripped text is checked in this order:

    * "full sun" together with "partial" -> ``'full sun to partial shade'``
    * "full sun"                         -> ``'full sun'``
    * "partial", "semi" or "half"        -> ``'partial shade'``
    * "shade"                            -> ``'shade'``
    * "sun"                              -> ``'full sun'``

    Text matching none of these is returned lower-cased and stripped.

    Returns:
        The label, or ``None`` when *text* is empty.
    """
    if not text:
        return None

    lowered = text.lower().strip()
    mentions_full_sun = 'full sun' in lowered

    if mentions_full_sun:
        return 'full sun to partial shade' if 'partial' in lowered else 'full sun'
    if any(word in lowered for word in ('partial', 'semi', 'half')):
        return 'partial shade'
    if 'shade' in lowered:
        return 'shade'
    if 'sun' in lowered:
        return 'full sun'
    return lowered
|
||||||
|
|
||||||
|
|
||||||
|
def extract_data(html):
    """Extract cultivar attributes from a product page's HTML.

    The text of all ``<td>`` cells is read as consecutive label/value
    pairs (cells 0-1, 2-3, ...); pairs whose label consists only of
    digits, punctuation, whitespace, ``€`` or ``*`` are ignored. The
    description comes from the ``itemprop="description"`` element, falling
    back to the ``<meta name="description">`` tag. Recognised spec labels
    are then converted with the ``parse_*`` helpers.

    Returns:
        A dict that may contain ``description``, ``plant_spacing_cm``,
        ``row_spacing_cm``, ``planting_depth_cm``, ``harvesting_months``,
        ``direct_sowing_months``, ``indoor_sowing_months``, ``perennial``,
        ``light_requirement``, ``days_to_germination`` and
        ``germination_temp_c``. Keys are only present when a value was
        found.
    """
    data = {}

    # Extract table cell pairs
    cell_texts = []
    for raw_cell in re.findall(r'<td[^>]*>(.*?)</td>', html, re.DOTALL):
        without_tags = re.sub(r'<[^>]+>', ' ', raw_cell).strip()
        cell_texts.append(re.sub(r'\s+', ' ', without_tags))

    specs = {}
    for pos in range(0, len(cell_texts) - 1, 2):
        label = cell_texts[pos].rstrip(':').strip()
        value = cell_texts[pos + 1].strip()
        if not label or not value:
            continue
        # Skip rows whose "label" is just a number/price.
        if re.match(r'^[\d,.\s€*]+$', label):
            continue
        specs[label.lower()] = value

    # Extract description from itemprop="description"
    desc_match = re.search(r'itemprop="description">(.*?)</div>\s*</div>\s*</div>', html, re.DOTALL)
    if desc_match:
        body = desc_match.group(1)
        body = re.sub(r'<style[^>]*>.*?</style>', '', body, flags=re.DOTALL)
        body = re.sub(r'<script[^>]*>.*?</script>', '', body, flags=re.DOTALL)
        body = re.sub(r'<[^>]+>', ' ', body)
        body = re.sub(r'\s+', ' ', body).strip()
        # Cut trailing boilerplate that follows the actual description.
        for marker in ['Other names', 'Additional contact mail', 'Question about']:
            cut_at = body.find(marker)
            if cut_at > 0:
                body = body[:cut_at].strip()
        if len(body) > 20:
            data['description'] = body

    if 'description' not in data:
        meta_match = re.search(r'<meta[^>]*name="description"[^>]*content="([^"]*)"', html)
        if meta_match and len(meta_match.group(1)) > 20:
            data['description'] = meta_match.group(1)

    # Parse specs
    if 'planting distance' in specs:
        row_sp, plant_sp = parse_spacing(specs['planting distance'])
        if plant_sp:
            data['plant_spacing_cm'] = plant_sp
        if row_sp:
            data['row_spacing_cm'] = row_sp

    if 'row spacing' in specs:
        row_match = re.search(r'(\d+(?:\.\d+)?)\s*cm', specs['row spacing'])
        if row_match:
            data['row_spacing_cm'] = float(row_match.group(1))

    if 'sowing depth' in specs:
        depth = parse_depth(specs['sowing depth'])
        if depth is not None:
            data['planting_depth_cm'] = depth

    # Harvesting months - prefer explicit harvest time over flowering.
    # Only the first label that is present is consulted, even if it yields
    # no months.
    for harvest_label in ('harvest time', 'harvesting months', 'flowering months'):
        if harvest_label in specs:
            harvest = parse_months(specs[harvest_label])
            if harvest:
                data['harvesting_months'] = harvest
            break

    if 'when to sow outdoors' in specs:
        outdoors = parse_months(specs['when to sow outdoors'])
        if outdoors:
            data['direct_sowing_months'] = outdoors

    # Unlike harvesting, keep trying labels until one yields months.
    for indoor_key in ['when to sow indoors', 'pre-cultivation indoors']:
        if indoor_key not in specs:
            continue
        indoors = parse_months(specs[indoor_key])
        if indoors:
            data['indoor_sowing_months'] = indoors
            break

    if 'lifecycle' in specs:
        perennial = parse_lifecycle(specs['lifecycle'])
        if perennial is not None:
            data['perennial'] = perennial

    if 'sunlight' in specs:
        light = parse_light(specs['sunlight'])
        if light:
            data['light_requirement'] = light

    if 'germination time' in specs:
        days = parse_germination_days(specs['germination time'])
        if days:
            data['days_to_germination'] = days

    if 'germination temperature' in specs:
        temp = parse_germ_temp(specs['germination temperature'])
        if temp:
            data['germination_temp_c'] = temp

    return data
|
||||||
|
|
||||||
|
|
||||||
|
def get_current_values(cultivar_id):
    """Return the non-empty enrichment columns currently stored for a cultivar.

    Issues one SELECT through ``run_sql`` and splits the returned text on
    ``|`` into the columns listed in ``fields``.

    Args:
        cultivar_id: Value compared against ``cultivars.id``.

    Returns:
        dict mapping column name to its stripped string value. Columns whose
        value is empty are omitted; an empty dict is returned when ``run_sql``
        yields nothing.
    """
    # Single source of truth for both the SELECT list and the result mapping,
    # so the two can never drift out of order.
    fields = (
        'description', 'row_spacing_cm', 'plant_spacing_cm', 'planting_depth_cm',
        'perennial', 'harvesting_months', 'direct_sowing_months', 'light_requirement',
        'days_to_germination', 'germination_temp_c', 'indoor_sowing_months',
    )
    # Double embedded single quotes so the id cannot terminate the SQL string
    # literal (same escaping build_update_sql applies to string values).
    safe_id = str(cultivar_id).replace("'", "''")
    sql = f"SELECT {', '.join(fields)} FROM cultivars WHERE id = '{safe_id}'"

    row = run_sql(sql)
    if not row:
        return {}

    # NOTE(review): a '|' inside description would shift every later column;
    # confirm the output format of run_sql before relying on free-text fields.
    values = (part.strip() for part in row.split('|'))
    # zip() stops at the shorter sequence, so a short row leaves the trailing
    # columns unset instead of raising.
    return {name: value for name, value in zip(fields, values) if value}
|
||||||
|
|
||||||
|
|
||||||
|
def build_update_sql(cultivar_id, data, current):
    """Build an UPDATE statement that fills only the columns still empty.

    Args:
        cultivar_id: Value compared against ``cultivars.id``.
        data: Mapping of column name to the scraped value. Supported value
            types are ``str``, ``bool``, ``list`` and ``int``/``float``;
            values of any other type are ignored.
        current: Mapping of column name to the value already stored; a column
            with a truthy entry here is never overwritten.

    Returns:
        ``(sql, updated_fields)`` where ``sql`` is the UPDATE statement and
        ``updated_fields`` lists exactly the columns it sets, or
        ``(None, [])`` when there is nothing to update.
    """
    sets = []
    updated_fields = []

    for field, value in data.items():
        if current.get(field):
            continue

        if isinstance(value, str):
            literal = "'" + value.replace("'", "''") + "'"
        elif isinstance(value, bool):
            # Must be tested before int: bool is a subclass of int.
            literal = 'true' if value else 'false'
        elif isinstance(value, list):
            array_text = '{' + ','.join(str(x) for x in value) + '}'
            # Quote-escape the array literal too, in case an element is text.
            literal = "'" + array_text.replace("'", "''") + "'"
        elif isinstance(value, (int, float)):
            literal = str(value)
        else:
            # Unsupported type: emit no SET clause and, crucially, do not
            # report the field as updated.
            continue

        sets.append(f"{field} = {literal}")
        updated_fields.append(field)

    if not sets:
        return None, []

    safe_id = str(cultivar_id).replace("'", "''")
    return f"UPDATE cultivars SET {', '.join(sets)} WHERE id = '{safe_id}';", updated_fields
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_cultivar_rows(rows):
    """Turn ``id|name|url`` result lines into cultivar dicts, skipping short lines."""
    cultivars = []
    for line in rows.strip().split('\n'):
        parts = line.split('|')
        if len(parts) >= 3:
            cultivars.append({
                'id': parts[0],
                # The query selects exactly three columns, so any extra '|'
                # belongs to the free-text name rather than to the URL.
                'name': '|'.join(parts[1:-1]),
                'url': parts[-1],
            })
    return cultivars


def main():
    """Enrich Magic Garden Seeds cultivars from their product pages.

    Selects cultivars of supplier 'Magic Garden Seeds' that have a product URL
    and are missing ``row_spacing_cm`` or ``description``, fetches each page,
    extracts data and writes only the columns that are still empty. Progress
    and a summary are printed to stdout; nothing is returned.
    """
    sql = """
        SELECT c.id, c.name, cs.product_url
        FROM cultivars c
        JOIN cultivar_suppliers cs ON c.id = cs.cultivar_id
        JOIN suppliers s ON cs.supplier_id = s.id
        WHERE s.name = 'Magic Garden Seeds'
          AND cs.product_url IS NOT NULL AND cs.product_url <> ''
          AND (c.row_spacing_cm IS NULL OR c.description IS NULL OR c.description = '')
        ORDER BY c.name;
    """
    rows = run_sql(sql)
    if not rows:
        print("No cultivars to process")
        return

    cultivars = _parse_cultivar_rows(rows)
    total = len(cultivars)

    print(f"Processing {total} MGS cultivars...")
    sys.stdout.flush()

    updated = 0
    skipped = 0   # every extracted field was already populated
    no_data = 0   # page fetched but nothing could be extracted
    failed = 0
    fields_updated = {}

    for i, cv in enumerate(cultivars, start=1):
        print(f"[{i}/{total}] {cv['name']}...", end=' ', flush=True)

        try:
            html = fetch_page(cv['url'])
            if not html or len(html) < 1000:
                print("FAILED (empty page)")
                failed += 1
                continue

            data = extract_data(html)
            if not data:
                print("NO DATA")
                no_data += 1
                continue

            current = get_current_values(cv['id'])
            sql_stmt, upd_fields = build_update_sql(cv['id'], data, current)

            if not sql_stmt:
                print("SKIP (all fields populated)")
                skipped += 1
            else:
                run_sql(sql_stmt)
                for f in upd_fields:
                    fields_updated[f] = fields_updated.get(f, 0) + 1
                print(f"OK ({len(upd_fields)} fields: {', '.join(upd_fields)})")
                updated += 1

        except Exception as e:
            # Per-cultivar boundary: report and move on to the next one.
            print(f"ERROR: {e}")
            failed += 1
        finally:
            # Throttle once per cultivar regardless of the outcome above.
            time.sleep(0.5)

    print("\n=== MGS Summary ===")
    print(f"Total processed: {total}")
    print(f"Updated: {updated}")
    print(f"Skipped (all fields already populated): {skipped}")
    print(f"Skipped (no data on page): {no_data}")
    print(f"Failed: {failed}")
    print("\nFields updated:")
    for field, count in sorted(fields_updated.items(), key=lambda x: -x[1]):
        print(f"  {field}: {count}")


if __name__ == '__main__':
    main()
|
||||||
@@ -0,0 +1,330 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Scrape NaturaDB wildlife interaction data and enrich HerbAPI species.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
import os
import re
import sys
import time
import urllib.error
import urllib.request

HERBAPI_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
# NOTE(review): a bearer token is committed in source control. The
# HERBAPI_TOKEN environment variable now takes precedence so the credential
# can be rotated without a code change; the literal fallback should be removed
# once callers export the variable.
HERBAPI_TOKEN = os.environ.get(
    "HERBAPI_TOKEN", "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
)
NATURADB_BASE = "https://www.naturadb.de/pflanzen"
USER_AGENT = "Mozilla/5.0 (compatible; HerbAPI-Enrichment/1.0; +https://sub-net.at)"
# Pause in seconds after each NaturaDB request.
DELAY = 0.5
|
||||||
|
|
||||||
|
|
||||||
|
def api_get(path, timeout=30):
    """GET ``path`` from HerbAPI and return the decoded JSON body.

    Args:
        path: Path (including any query string) appended to ``HERBAPI_BASE``.
        timeout: Seconds to wait on the socket before giving up. Without a
            timeout an unresponsive server would block the script forever.

    Raises:
        urllib.error.URLError: On connection failure, timeout or HTTP error.
    """
    req = urllib.request.Request(
        f"{HERBAPI_BASE}{path}",
        headers={
            "Authorization": f"Bearer {HERBAPI_TOKEN}",
            "Accept": "application/json",
        },
    )
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read().decode())
|
||||||
|
|
||||||
|
|
||||||
|
def api_put(path, data, timeout=30):
    """PUT ``data`` as JSON to HerbAPI and return the decoded JSON response.

    Args:
        path: Path appended to ``HERBAPI_BASE``.
        data: JSON-serialisable request body.
        timeout: Seconds to wait on the socket before giving up. Without a
            timeout an unresponsive server would block the script forever.

    Raises:
        urllib.error.URLError: On connection failure, timeout or HTTP error.
    """
    req = urllib.request.Request(
        f"{HERBAPI_BASE}{path}",
        data=json.dumps(data).encode(),
        method="PUT",
        headers={
            "Authorization": f"Bearer {HERBAPI_TOKEN}",
            "Content-Type": "application/json",
            "Accept": "application/json",
        },
    )
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read().decode())
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_naturadb(latin_name):
    """Fetch the NaturaDB page for a scientific name.

    The URL slug is the lower-cased name with whitespace runs replaced by a
    single hyphen, percent-encoded so names with non-ASCII characters still
    form a valid URL.

    Args:
        latin_name: Scientific name, e.g. ``"Borago officinalis"``.

    Returns:
        The page HTML as a string, or ``None`` when the page does not exist
        (HTTP 404, silent) or the request fails (message printed).
    """
    # Local import: this script's header only imports urllib.request/error.
    from urllib.parse import quote

    slug = quote("-".join(latin_name.lower().split()))
    url = f"{NATURADB_BASE}/{slug}/"
    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        with urllib.request.urlopen(req, timeout=15) as resp:
            return resp.read().decode("utf-8", errors="replace")
    except urllib.error.HTTPError as e:
        # A missing page is the expected case for many species; stay quiet.
        if e.code != 404:
            print(f"  HTTP {e.code} for {url}")
        return None
    except Exception as e:
        # Best effort: any other failure is reported and treated as "no page".
        print(f"  Error fetching {url}: {e}")
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_td_value(html, label):
    """Return the text of the table cell that follows a ``label`` cell.

    Matches ``<td>label:</td><td>value</td>`` where the colon is optional.
    Both cells may carry attributes and the label may be padded with
    whitespace. Markup inside the value cell is stripped.

    Args:
        html: Page source to search.
        label: Exact label text, without the trailing colon.

    Returns:
        The stripped cell text (possibly empty), or ``None`` when no such
        label cell exists.
    """
    pattern = (
        rf"<td\b[^>]*>\s*{re.escape(label)}\s*:?\s*</td>"
        r"\s*<td\b[^>]*>(.*?)</td>"
    )
    m = re.search(pattern, html, re.DOTALL)
    if m is None:
        return None
    # Drop any nested tags (<b>, <span>, ...) and keep only the visible text.
    return re.sub(r"<[^>]+>", "", m.group(1)).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def extract_native_status(html):
    """Extract the native-status labels shown as large coloured chips.

    Only the four known status labels are accepted. Each label is returned at
    most once, in order of first appearance, so a chip repeated on the page
    does not produce a duplicated status.

    Args:
        html: Page source to search.

    Returns:
        List of status strings (possibly empty).
    """
    known = (
        "heimische Wildform",
        "Archäophyt",
        "Neophyt",
        "nicht heimisch (Neophyt)",
    )
    chip_re = r'chip--large\s+chip--no-border\s+clr-white\s+bg-\w+"[^>]*>([^<]+)'

    statuses = []
    for m in re.finditer(chip_re, html):
        tag = m.group(1).strip()
        if tag in known and tag not in statuses:
            statuses.append(tag)
    return statuses
|
||||||
|
|
||||||
|
|
||||||
|
def extract_badge_tags(html):
    """Collect the text of the large plain-text badge chips on a page.

    Blank chips and the "winterhart" badge are left out; every other chip is
    returned in page order.
    """
    chip_texts = (
        match.group(1).strip()
        for match in re.finditer(r'chip--large\s+clr-text"[^>]*>([^<]+)', html)
    )
    return [text for text in chip_texts if text and text != "winterhart"]
|
||||||
|
|
||||||
|
|
||||||
|
def parse_count(text):
    """Return the integer a text starts with, e.g. '82 (Nektar ...)' -> 82.

    Leading whitespace is ignored. Returns ``None`` for empty input or when
    the text does not begin with a digit.
    """
    if text:
        leading = re.match(r"(\d+)", text.strip())
        if leading is not None:
            return int(leading.group(1))
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def parse_specialist_count(text):
    """Return N from a 'davon N spezialisiert' phrase, e.g. '39 (davon 5 spezialisiert)' -> 5.

    Returns ``None`` for empty input or when the phrase is absent.
    """
    if text:
        found = re.search(r"davon\s+(\d+)\s+spezialisiert", text)
        if found is not None:
            return int(found.group(1))
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def parse_nectar_pollen(text):
    """Return the numerator of a leading 'N/4' rating, e.g. '2/4 - mäßig' -> 2.

    Returns ``None`` for empty input or when the text does not start with a
    rating out of four.
    """
    if text:
        rating = re.match(r"(\d+)/4", text.strip())
        if rating is not None:
            return int(rating.group(1))
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def build_wildlife_value(data):
    """Build the English ``wildlife_value`` summary from scraped data.

    Sentences are emitted in a fixed order: nectar/pollen ratings, wild bees,
    butterflies and caterpillar hosts, hoverflies, beetles, birds, mammals,
    native status, and selected badges. Keys that are missing or ``None`` are
    skipped; a count of 0 is still reported.

    Args:
        data: Dict as produced by ``scrape_species``.

    Returns:
        The sentences joined by single spaces, or ``None`` when nothing could
        be said.
    """
    parts = []

    # Nectar and pollen
    np_parts = [
        f"{label}: {data[key]}/4"
        for key, label in (("nectar", "Nectar"), ("pollen", "Pollen"))
        if data.get(key) is not None
    ]
    if np_parts:
        parts.append(", ".join(np_parts) + ".")

    # Wild bees
    if data.get("wildbienen_count") is not None:
        s = f"Supports {data['wildbienen_count']} wild bee species"
        if data.get("wildbienen_specialists") is not None:
            s += f" ({data['wildbienen_specialists']} specialists)"
        parts.append(s + ".")

    # Butterflies / moths and caterpillar hosts
    butterflies = data.get("schmetterlinge_count")
    hosts = data.get("raupen_count")
    host_note = ""
    if hosts is not None and data.get("raupen_specialists") is not None:
        host_note = f" ({data['raupen_specialists']} specialized)"
    if butterflies is not None:
        s = f"{butterflies} butterfly/moth species"
        if hosts is not None:
            s += f", {hosts} as caterpillar host{host_note}"
        parts.append(s + ".")
    elif hosts is not None:
        # Previously the host count was silently dropped when the page had no
        # overall butterfly count; report it on its own instead.
        parts.append(f"Caterpillar host for {hosts} butterfly/moth species{host_note}.")

    # Hoverflies, beetles, birds, mammals: plain "<count> <group> species."
    for key, group in (
        ("schwebfliegen_count", "hoverfly"),
        ("kaefer_count", "beetle"),
        ("vogelarten_count", "bird"),
        ("saeugetier_count", "mammal"),
    ):
        if data.get(key) is not None:
            parts.append(f"{data[key]} {group} species.")

    # Native status
    if data.get("native_status"):
        parts.append(" ".join(data["native_status"]) + ".")

    # Notable badges: keep only chips that mention a wildlife-related keyword.
    keywords = ("insektenpflanze", "raupenfutter", "vogelschutz", "vogelnähr", "bienenweide")
    notable = [
        t for t in (data.get("badges") or [])
        if any(kw in t.lower() for kw in keywords)
    ]
    if notable:
        parts.append("Tags: " + ", ".join(notable) + ".")

    return " ".join(parts) if parts else None
|
||||||
|
|
||||||
|
|
||||||
|
def scrape_species(html):
    """Parse a NaturaDB page into the wildlife data dict.

    Every key is always present: the numeric entries are an int or ``None``,
    while ``native_status`` and ``badges`` are lists.
    """
    def cell(label):
        # Raw text of the table cell following the given label, or None.
        return extract_td_value(html, label)

    # These two cells each feed a total and a specialist count.
    bee_text = cell("Wildbienen")
    caterpillar_text = cell("Raupen")

    return {
        "nectar": parse_nectar_pollen(cell("Nektarwert")),
        "pollen": parse_nectar_pollen(cell("Pollenwert")),
        "wildbienen_count": parse_count(bee_text),
        "wildbienen_specialists": parse_specialist_count(bee_text),
        "schmetterlinge_count": parse_count(cell("Schmetterlinge")),
        "raupen_count": parse_count(caterpillar_text),
        "raupen_specialists": parse_specialist_count(caterpillar_text),
        "schwebfliegen_count": parse_count(cell("Schwebfliegen")),
        "kaefer_count": parse_count(cell("Käfer")),
        "vogelarten_count": parse_count(cell("fressende Vogelarten")),
        "saeugetier_count": parse_count(cell("fressende Säugetierarten")),
        "native_status": extract_native_status(html),
        "badges": extract_badge_tags(html),
    }
|
||||||
|
|
||||||
|
|
||||||
|
def has_any_data(data):
    """Tell whether a scraped data dict contains anything meaningful.

    The list-valued keys ``native_status`` and ``badges`` count only when
    non-empty; every other key counts as soon as its value is not ``None``.
    """
    list_keys = ("native_status", "badges")
    return any(
        bool(value) if key in list_keys else value is not None
        for key, value in data.items()
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _fetch_all_species(per_page=200):
    """Return every species from HerbAPI, following pagination when reported."""
    species = []
    page = 1
    while True:
        resp = api_get(f"/species?per_page={per_page}&page={page}")
        batch = resp["data"]
        species.extend(batch)
        pagination = resp.get("pagination") if isinstance(resp, dict) else None
        # Stop when the API reports no paging, the last page was reached, or
        # a page came back empty (guards against looping forever).
        if not pagination or not batch or page >= pagination.get("total_pages", 1):
            break
        page += 1
    return species


def main():
    """Enrich HerbAPI species that have no ``wildlife_value`` from NaturaDB.

    For each species without an existing value, the NaturaDB page is fetched
    and parsed, a summary string is built, and the full species record is
    PUT back with ``wildlife_value`` set. Progress and totals are printed.
    """
    print("Fetching species list from HerbAPI...")
    # Previously only the first 200 species were requested; any beyond that
    # were silently never enriched.
    species_list = _fetch_all_species()
    total = len(species_list)
    print(f"Found {total} species.\n")

    enriched = 0
    skipped_has_data = 0
    skipped_not_found = 0
    skipped_no_data = 0
    errors = 0

    for i, sp in enumerate(species_list):
        slug = sp["slug"]
        name = sp["name_scientific"]

        # Only enrich if wildlife_value is empty/null
        if sp.get("wildlife_value"):
            print(f"[{i+1:3d}/{total}] {slug:40s} SKIP (already has data)")
            skipped_has_data += 1
            continue

        print(f"[{i+1:3d}/{total}] {slug:40s} ", end="", flush=True)

        # Fetch NaturaDB page, then pause so requests stay spaced out.
        html = fetch_naturadb(name)
        time.sleep(DELAY)

        if html is None:
            print("NOT FOUND on NaturaDB")
            skipped_not_found += 1
            continue

        data = scrape_species(html)
        if not has_any_data(data):
            print("no wildlife data on page")
            skipped_no_data += 1
            continue

        wildlife_value = build_wildlife_value(data)
        if not wildlife_value:
            print("no wildlife data extracted")
            skipped_no_data += 1
            continue

        # GET full species, merge, PUT back
        try:
            full = api_get(f"/species/{slug}")
            full["wildlife_value"] = wildlife_value

            # Remove read-only / computed fields that the PUT endpoint might reject
            for key in ("created_at", "updated_at", "family"):
                full.pop(key, None)

            api_put(f"/species/{full['id']}", full)
            # Show an ellipsis only when the preview is actually truncated.
            preview = wildlife_value[:80] + ("..." if len(wildlife_value) > 80 else "")
            print(f"ENRICHED -> {preview}")
            enriched += 1
        except Exception as e:
            # Per-species boundary: report and continue with the next one.
            print(f"API ERROR: {e}")
            errors += 1

    print("\n" + "=" * 70)
    print("DONE. Results:")
    print(f"  Enriched:              {enriched}")
    print(f"  Already had data:      {skipped_has_data}")
    print(f"  Not on NaturaDB:       {skipped_not_found}")
    print(f"  No wildlife data:      {skipped_no_data}")
    print(f"  Errors:                {errors}")
    print(f"  Total:                 {total}")


if __name__ == "__main__":
    main()
|
||||||
@@ -0,0 +1,560 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Scrape cultivar data from Reinsaat (reinsaat.at) and push into HerbAPI.
|
||||||
|
|
||||||
|
Strategy:
|
||||||
|
1. Fetch category pages, recursively discover product pages via JSON-LD detection
|
||||||
|
2. Extract structured data from JSON-LD Product schema + HTML text for growing data
|
||||||
|
3. Match Latin names to existing species in the API
|
||||||
|
4. Create cultivar records and link them to Reinsaat supplier
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
import os
import re
import ssl
import time
import urllib.error
import urllib.parse
import urllib.request
from dataclasses import dataclass
from html.parser import HTMLParser
from typing import Optional

# ── Config ──────────────────────────────────────────────────────────────────
API_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
# NOTE(review): a bearer token is committed in source control. The
# HERBAPI_TOKEN environment variable now takes precedence so the credential
# can be rotated without a code change; the literal fallback should be removed
# once callers export the variable.
AUTH_TOKEN = os.environ.get(
    "HERBAPI_TOKEN", "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
)
REINSAAT_SUPPLIER_ID = "019ced24-1702-72d1-9acc-90435441a5c4"
DELAY = 0.5  # seconds between requests
USER_AGENT = "HerbAPI-Scraper/1.0 (florian.berthold@sub-net.at)"
|
||||||
|
|
||||||
|
# ── Categories to scrape ────────────────────────────────────────────────────
|
||||||
|
# (category_url, default_species_hint for leaf pages in this category)
|
||||||
|
# Shop root shared by every category below.
_SHOP_ROOT = "https://www.reinsaat.at/shop/DE"

# Each entry is (category URL, species assumed for products in that category
# when the page itself yields no Latin name; None means no default).
CATEGORIES = [
    (f"{_SHOP_ROOT}/{slug}/", species_hint)
    for slug, species_hint in (
        ("tomaten_paradeiser", "Solanum lycopersicum"),
        ("kuechen-_und_gewuerzkraeuter", None),
        ("kuerbis", None),
        ("zucchini", "Cucurbita pepo"),
        ("bohnen", None),
        ("karotten_moehren_1", "Daucus carota"),
        ("rote_ruebe", "Beta vulgaris"),
        ("blumen_und_heilkraeuter", None),
    )
]
|
||||||
|
|
||||||
|
# ── Known Latin name genera we can match ────────────────────────────────────
|
||||||
|
# Regex alternation of the genera this scraper recognises in page text.
KNOWN_GENERA = (
    "Solanum|Cucurbita|Vicia|Phaseolus|Glycine|Daucus|Beta|Borago|Lavandula|"
    "Salvia|Melissa|Thymus|Calendula|Allium|Ocimum|Satureja|Origanum|Anethum|"
    "Foeniculum|Carum|Nigella|Levisticum|Rumex|Majorana|Hyssopus|Coriandrum|"
    "Petroselinum|Eruca|Tropaeolum|Lupinus|Helianthus|Tagetes|Zinnia|Cosmos|"
    "Papaver|Centaurea|Matricaria|Chrysanthemum|Antirrhinum|Lathyrus|Ipomoea|"
    "Phacelia|Trifolium|Symphytum|Urtica|Fragaria|Sambucus"
)

# Genus + lowercase epithet, optionally followed by the author abbreviation
# "L." and an "ssp./var./subsp." rank. Group 1 is the whole name.
#  - \b keeps a genus from matching in the middle of a longer word.
#  - (?![A-Za-z]) stops the author "L" from swallowing the first letter of a
#    following capitalised word (e.g. "... lycopersicum Lang").
LATIN_PATTERN = re.compile(
    rf'\b((?:{KNOWN_GENERA})\s+[a-z]+'
    r'(?:\s+L\.?(?![A-Za-z]))?'
    r'(?:\s+(?:ssp|var|subsp)\.\s+[a-z]+)?)'
)
|
||||||
|
|
||||||
|
|
||||||
|
# ── HTML helpers ────────────────────────────────────────────────────────────
|
||||||
|
class TextExtractor(HTMLParser):
    """HTML parser that gathers the visible text chunks of a document.

    After ``feed()``, ``parts`` holds each non-blank text node (stripped) in
    document order. Text inside ``<script>``, ``<style>`` and ``<noscript>``
    is ignored.
    """

    # Elements whose content is never shown to the reader.
    _HIDDEN_TAGS = ("script", "style", "noscript")

    def __init__(self):
        super().__init__()
        self.parts = []
        # Number of hidden elements currently open.
        self._skip = 0

    def handle_starttag(self, tag, attrs):
        if tag not in self._HIDDEN_TAGS:
            return
        self._skip += 1

    def handle_endtag(self, tag):
        if tag not in self._HIDDEN_TAGS:
            return
        # Never drop below zero on a stray closing tag.
        self._skip = max(self._skip - 1, 0)

    def handle_data(self, data):
        if self._skip:
            return
        text = data.strip()
        if text:
            self.parts.append(text)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_links(html: str, base_url: str) -> list[str]:
    """Extract the targets of all ``<a href>`` links as absolute URLs.

    Handles double- and single-quoted ``href`` values and decodes HTML
    entities (``&amp;`` etc.) before resolving, since attribute values are
    HTML-escaped in page source. Empty targets, fragment-only links and
    ``javascript:`` links are skipped.

    Args:
        html: Page source to scan.
        base_url: URL the page was fetched from; relative links are resolved
            against it.

    Returns:
        Absolute URLs in order of first appearance, without duplicates.
    """
    # Local import by name: the ``html`` parameter shadows the ``html`` module.
    from html import unescape

    href_re = r'<a\s[^>]*href\s*=\s*(?:"([^"]*)"|\'([^\']*)\')'
    links = {}
    for m in re.finditer(href_re, html, re.IGNORECASE):
        raw = m.group(1) if m.group(1) is not None else m.group(2)
        href = unescape(raw).strip()
        if not href or href.startswith("#") or href.lower().startswith("javascript:"):
            continue
        # dict keys keep insertion order and de-duplicate in one step.
        links.setdefault(urllib.parse.urljoin(base_url, href), None)
    return list(links)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_jsonld_product(html: str) -> Optional[dict]:
    """Return the first JSON-LD object of ``@type`` "Product" found in the page.

    Each ``<script type="application/ld+json">`` body is parsed in turn;
    bodies that are not valid JSON are skipped. Returns ``None`` when no
    script holds a top-level Product object.
    """
    script_bodies = re.findall(
        r'<script[^>]*type="application/ld\+json"[^>]*>(.*?)</script>',
        html,
        re.DOTALL | re.IGNORECASE,
    )
    for body in script_bodies:
        try:
            candidate = json.loads(body)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        if isinstance(candidate, dict) and candidate.get("@type") == "Product":
            return candidate
    return None
|
||||||
|
|
||||||
|
|
||||||
|
# ── HTTP helpers ────────────────────────────────────────────────────────────
|
||||||
|
# Shared TLS context (default certificate verification) for all page fetches.
_ssl_ctx = ssl.create_default_context()


def fetch_url(url: str, retries: int = 2) -> str:
    """Fetch a URL and return its body as text, retrying transient failures.

    Args:
        url: Address to fetch.
        retries: Number of additional attempts after the first one. A
            2-second pause separates attempts.

    Returns:
        The response body, decoded with the charset announced by the server
        (UTF-8 when none or an unknown one is given). Undecodable bytes are
        replaced rather than raising. Returns ``""`` only when ``retries`` is
        negative, i.e. no attempt is made.

    Raises:
        urllib.error.HTTPError: Immediately for client errors that a retry
            cannot fix (4xx other than 408/429), otherwise after the last
            attempt.
        urllib.error.URLError, TimeoutError: After the last attempt.
    """
    req = urllib.request.Request(url, headers={
        "User-Agent": USER_AGENT,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "de-AT,de;q=0.9,en;q=0.5",
    })
    for attempt in range(retries + 1):
        try:
            with urllib.request.urlopen(req, timeout=30, context=_ssl_ctx) as resp:
                charset = resp.headers.get_content_charset() or "utf-8"
                raw = resp.read()
        except urllib.error.HTTPError as e:
            # A 404/403/... will not change on retry; only wait for statuses
            # that signal a temporary condition.
            permanent = 400 <= e.code < 500 and e.code not in (408, 429)
            if permanent or attempt >= retries:
                raise
        except (urllib.error.URLError, TimeoutError):
            if attempt >= retries:
                raise
        else:
            try:
                return raw.decode(charset, errors="replace")
            except LookupError:
                # Server announced a charset Python does not know.
                return raw.decode("utf-8", errors="replace")
        time.sleep(2)
    return ""
|
||||||
|
|
||||||
|
|
||||||
|
def api_get(path: str):
    """Send an authenticated GET to HerbAPI and return the parsed JSON body.

    ``path`` (including any query string) is appended to ``API_BASE``. The
    request times out after 15 seconds.
    """
    auth_headers = {
        "Authorization": f"Bearer {AUTH_TOKEN}",
        "Accept": "application/json",
    }
    request = urllib.request.Request(f"{API_BASE}{path}", headers=auth_headers)
    with urllib.request.urlopen(request, timeout=15) as response:
        payload = response.read()
    return json.loads(payload)
|
||||||
|
|
||||||
|
|
||||||
|
def api_post(path: str, data: dict):
    """Send ``data`` as a JSON POST to HerbAPI and return the parsed JSON reply.

    On an HTTP error the status and the first 500 characters of the error
    body are printed, then the ``HTTPError`` is re-raised for the caller.
    The request times out after 15 seconds.
    """
    request = urllib.request.Request(
        f"{API_BASE}{path}",
        data=json.dumps(data).encode("utf-8"),
        headers={
            "Authorization": f"Bearer {AUTH_TOKEN}",
            "Content-Type": "application/json",
            "Accept": "application/json",
        },
        method="POST",
    )
    try:
        with urllib.request.urlopen(request, timeout=15) as response:
            return json.loads(response.read())
    except urllib.error.HTTPError as err:
        # Surface the server's explanation before propagating the failure.
        detail = err.read().decode("utf-8", errors="replace")
        print(f"    API ERROR {err.code}: {detail[:500]}")
        raise
|
||||||
|
|
||||||
|
|
||||||
|
# ── Species matching ────────────────────────────────────────────────────────
|
||||||
|
def load_species() -> dict:
    """Load all species from the API, keyed by lower-cased scientific name.

    Pages of 100 are requested until the response's ``pagination.total_pages``
    is reached. A response without a ``pagination`` entry (or one that is not
    a dict) is treated as the only page.
    """
    by_name = {}
    page = 1
    while True:
        payload = api_get(f"/species?per_page=100&page={page}")
        is_envelope = isinstance(payload, dict)
        # The list is either wrapped in {"data": [...]} or is the payload itself.
        entries = payload.get("data", payload) if is_envelope else payload
        by_name.update(
            (entry["name_scientific"].lower().strip(), entry) for entry in entries
        )
        if not (is_envelope and "pagination" in payload):
            break
        if page >= payload["pagination"].get("total_pages", 1):
            break
        page += 1
    return by_name
|
||||||
|
|
||||||
|
|
||||||
|
def match_species(latin_name: str, species_map: dict) -> Optional[dict]:
    """Look up the species entry that best fits a scraped Latin name.

    The name is stripped of a trailing author abbreviation and any
    ssp./subsp./var. rank, then tried in three steps: the cleaned name, its
    first two words, and finally the first entry in ``species_map`` sharing
    the genus. ``species_map`` keys are expected in lower case. Returns
    ``None`` when nothing matches or the name is empty.
    """
    if not latin_name:
        return None

    # Remove author citations ("L.", "Mill.") and infraspecific ranks.
    cleaned = latin_name.strip()
    for noise in (
        r'\s+L\.\s*$',
        r'\s+[A-Z][a-z]*\.\s*$',
        r'\s+(?:ssp|subsp|var)\.\s+\S+',
    ):
        cleaned = re.sub(noise, '', cleaned)

    key = cleaned.lower().strip()
    words = key.split()

    # Exact name first, then genus + species epithet.
    candidates = [key]
    if len(words) >= 2:
        candidates.append(" ".join(words[:2]))
    for candidate in candidates:
        if candidate in species_map:
            return species_map[candidate]

    if not words:
        return None

    # Genus-only fallback (less reliable, but useful for Borago, etc.).
    genus_prefix = words[0] + " "
    return next(
        (entry for name, entry in species_map.items() if name.startswith(genus_prefix)),
        None,
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ── Product data extraction ─────────────────────────────────────────────────
|
||||||
|
@dataclass
class ProductData:
    """Data scraped from one product page; fields default to "not found"."""

    # Identity and descriptive text.
    name: str = ""
    latin_name: str = ""
    description: str = ""
    sku: str = ""
    url: str = ""

    # Defaults to True for every scraped product.
    is_organic: bool = True

    # Growing data in centimetres / degrees Celsius; None when not found.
    sowing_depth_cm: Optional[float] = None
    row_spacing_cm: Optional[float] = None
    plant_spacing_cm: Optional[float] = None
    germination_temp_c: Optional[float] = None

    # Set to True when the page text contains a perennial keyword.
    perennial: bool = False
|
||||||
|
|
||||||
|
|
||||||
|
def _decimal(text):
    """Parse a number written with either a decimal comma or a decimal point."""
    return float(text.replace(",", "."))


def _find_sowing_depth(full_text, html):
    """Return the sowing depth in cm (midpoint of a range), or None."""
    depth_pats = [
        r'(?:Saattiefe|Aussaattiefe|Ablagetiefe)[:\s]*(?:ca\.?\s*)?(\d+(?:[.,]\d+)?)\s*[-–]\s*(\d+(?:[.,]\d+)?)\s*cm',
        r'(?:Saattiefe|Aussaattiefe|Ablagetiefe)[:\s]*(?:ca\.?\s*)?(\d+(?:[.,]\d+)?)\s*cm',
        r'(\d+(?:[.,]\d+)?)\s*[-–]\s*(\d+(?:[.,]\d+)?)\s*cm\s+(?:tief|Tiefe)',
    ]
    for pat in depth_pats:
        dm = re.search(pat, full_text, re.IGNORECASE)
        if dm:
            vals = [_decimal(g) for g in dm.groups()]
            return sum(vals) / len(vals)

    # Fallback: look in the raw HTML for a range like "0,5–1 cm" within a few
    # non-digit characters of a depth keyword.
    dm = re.search(
        r'(?:Saattiefe|Ablagetiefe|Aussaattiefe|Saatgutablage)\D{0,30}?(\d+(?:[.,]\d+)?)\s*[-–]\s*(\d+(?:[.,]\d+)?)\s*cm',
        html, re.IGNORECASE
    )
    if dm:
        return (_decimal(dm.group(1)) + _decimal(dm.group(2))) / 2
    return None


def _find_spacing(full_text):
    """Return (row_spacing_cm, plant_spacing_cm) from a "ROW x PLANT cm" phrase."""
    spacing_pats = [
        # "30–40 x 2–4 cm" (range x range)
        r'(\d+)\s*[-–]\s*(\d+)\s*[x×]\s*(\d+)\s*[-–]\s*(\d+)\s*cm',
        # "100 x 50 cm" (simple)
        r'(\d+(?:[.,]\d+)?)\s*[x×]\s*(\d+(?:[.,]\d+)?)\s*cm',
    ]
    for pat in spacing_pats:
        matches = re.findall(pat, full_text, re.IGNORECASE)
        if not matches:
            continue
        # Prefer the last match (often the more relevant outdoor spacing)
        found = [_decimal(g) for g in matches[-1]]
        if len(found) == 4:
            return (found[0] + found[1]) / 2, (found[2] + found[3]) / 2
        return found[0], found[1]
    return None, None


def _find_germination_temp(full_text):
    """Return the germination temperature in °C (midpoint of a range), or None."""
    temp_pats = [
        r'(?:Keimtemperatur|Keimtemp)[.:\s]*(?:ca\.?\s*)?(\d+)\s*[-–]\s*(\d+)\s*°?\s*C',
        # Labelled single value, e.g. "Keimtemperatur: 20 °C".
        r'(?:Keimtemperatur|Keimtemp)[.:\s]*(?:ca\.?\s*)?(\d+)\s*°\s*C',
        # Unlabelled range. The separator is mandatory: with an optional one,
        # "28 °C" was read as the range 2..8 and stored as 5 °C.
        r'(\d+)\s*(?:[-–]|bis|und)\s*(\d+)\s*°\s*C',
        r'(?:mindestens|mind\.)\s*(\d+)\s*°\s*C',
    ]
    for pat in temp_pats:
        tm = re.search(pat, full_text, re.IGNORECASE)
        if tm:
            vals = [float(g) for g in tm.groups()]
            avg = sum(vals) / len(vals)
            # Sanity check: discard values outside a plausible 5-40 °C window.
            if 5 <= avg <= 40:
                return avg
    return None


def _looks_perennial(full_text):
    """Tell whether the page text contains a (non-negated) perennial keyword."""
    perennial_pats = [
        # "nicht winterhart" / "nicht mehrjährig" say the opposite.
        r'(?<!nicht\s)(?:mehrj[aä]hrig|winterhart|ausdauernd)',
        r'Halbstrauch',
        r'Staude',
    ]
    return any(re.search(pat, full_text, re.IGNORECASE) for pat in perennial_pats)


def parse_product(html: str, url: str, default_species: Optional[str] = None) -> Optional[ProductData]:
    """Parse a product page into a ``ProductData``.

    Name, description and SKU come from the page's JSON-LD Product object;
    Latin name and growing data are pattern-matched in the visible text.

    Args:
        html: Page source.
        url: Address the page was fetched from; stored on the result.
        default_species: Latin name to assume when none is found on the page.

    Returns:
        The populated ``ProductData``, or ``None`` when the page carries no
        JSON-LD Product object (i.e. it is not a product page).
    """
    jsonld = extract_jsonld_product(html)
    if not jsonld:
        return None  # Not a product page

    product = ProductData(url=url)

    # ── From JSON-LD ── (tolerate null / non-string values)
    product.name = str(jsonld.get("name") or "").strip()
    product.description = str(jsonld.get("description") or "").strip()
    product.sku = str(jsonld.get("model") or "").strip()

    # ── Extract full text for pattern matching ──
    extractor = TextExtractor()
    extractor.feed(html)
    full_text = " ".join(extractor.parts)

    # ── Latin name ──
    m = LATIN_PATTERN.search(full_text)
    if m:
        product.latin_name = m.group(1).strip()
    # Also check <i>/<em> tags in HTML
    if not product.latin_name:
        for italic in re.finditer(r'<(?:i|em)[^>]*>(.*?)</(?:i|em)>', html, re.IGNORECASE | re.DOTALL):
            clean = re.sub(r'<[^>]+>', '', italic.group(1)).strip()
            im = LATIN_PATTERN.search(clean)
            if im:
                product.latin_name = im.group(1).strip()
                break
    if not product.latin_name and default_species:
        product.latin_name = default_species

    # ── Growing data ──
    product.sowing_depth_cm = _find_sowing_depth(full_text, html)
    product.row_spacing_cm, product.plant_spacing_cm = _find_spacing(full_text)
    product.germination_temp_c = _find_germination_temp(full_text)
    product.perennial = _looks_perennial(full_text)

    return product
|
||||||
|
|
||||||
|
|
||||||
|
# ── Recursive product discovery ─────────────────────────────────────────────
|
||||||
|
def discover_products(
    category_url: str,
    default_species: Optional[str],
    max_depth: int = 3,
    _depth: int = 0,
    _visited: set = None,
) -> list[ProductData]:
    """Crawl ``category_url`` depth-first and collect the products below it.

    A page that ``parse_product`` recognises is returned as a one-element
    list.  Any other page is treated as a category: links on the same site
    that sit exactly one path segment below it are followed recursively.

    Args:
        category_url: Page to fetch.
        default_species: Passed through unchanged to ``parse_product``.
        max_depth: Deepest recursion level that is still fetched.
        _depth: Current recursion level (internal).
        _visited: URLs already handled; shared across the whole crawl so
            that no page is fetched twice (internal).

    Returns:
        All products found at or below ``category_url``; empty when the page
        was already visited, lies too deep, or could not be fetched.
    """
    seen = set() if _visited is None else _visited
    if _depth > max_depth or category_url in seen:
        return []
    seen.add(category_url)

    pad = " " * (_depth + 1)
    print(f"{pad}Fetching: {category_url}")

    try:
        page = fetch_url(category_url)
        time.sleep(DELAY)
    except Exception as exc:
        # Best effort: a page that cannot be fetched is reported and skipped.
        print(f"{pad} ERROR: {exc}")
        return []

    # A product page ends the descent.
    found = parse_product(page, category_url, default_species)
    if found:
        return [found]

    # Category page: keep only direct children of this path on the shop host.
    prefix = urllib.parse.urlparse(category_url).path.rstrip("/") + "/"
    children = {}  # dict keys give order-preserving de-duplication
    for href in extract_links(page, category_url):
        target = urllib.parse.urlparse(href)
        if target.netloc not in ("", "www.reinsaat.at"):
            continue
        path = target.path.rstrip("/")
        if not path.startswith(prefix):
            continue
        tail = path[len(prefix):]
        if not tail or "/" in tail:
            continue
        url = f"https://www.reinsaat.at{path}/"
        if url not in seen:
            children[url] = None

    child_urls = list(children)
    print(f"{pad} Found {len(child_urls)} child links")

    collected = []
    for url in child_urls:
        collected.extend(
            discover_products(url, default_species, max_depth, _depth + 1, seen)
        )
    return collected
|
||||||
|
|
||||||
|
|
||||||
|
# ── Main ────────────────────────────────────────────────────────────────────
|
||||||
|
def _load_existing_cultivars() -> dict:
    """Page through ``/cultivars`` and index the rows.

    Returns:
        Mapping of ``(species_id, lower-cased name)`` to cultivar id.
    """
    existing = {}
    page = 1
    while True:
        data = api_get(f"/cultivars?per_page=100&page={page}")
        # The API answers with {data, total, page, per_page}; a bare list is
        # accepted too.  A dict without "data" is treated as an empty page
        # instead of being iterated key by key.
        clist = (data.get("data") or []) if isinstance(data, dict) else data
        if not clist:
            break
        for c in clist:
            existing[(c["species_id"], c["name"].lower())] = c["id"]
        if not isinstance(data, dict):
            break  # a bare list carries no paging information
        total = data.get("total", len(clist))
        per_page = data.get("per_page") or 100  # guard against 0 / null
        if page * per_page >= total:
            break
        page += 1
    return existing


def _link_supplier(cultivar_id, product) -> None:
    """POST the Reinsaat supplier link (product URL and SKU) for a cultivar."""
    api_post(f"/cultivars/{cultivar_id}/suppliers", {
        "supplier_id": REINSAAT_SUPPLIER_ID,
        "product_url": product.url,
        "article_number": product.sku,
    })


def _build_cultivar_payload(product, species_id) -> dict:
    """Build the ``POST /cultivars`` body; unknown measurements are omitted."""
    payload = {
        "species_id": species_id,
        "name": product.name,
        "name_de": product.name,
        "name_en": "",
        "description": product.description,
        "is_organic": product.is_organic,
        "perennial": product.perennial,
    }
    optional = (
        ("planting_depth_cm", product.sowing_depth_cm, 2),
        ("row_spacing_cm", product.row_spacing_cm, 1),
        ("plant_spacing_cm", product.plant_spacing_cm, 1),
        ("germination_temp_c", product.germination_temp_c, 1),
    )
    for key, value, digits in optional:
        if value is not None:
            payload[key] = round(value, digits)
    return payload


def main():
    """Scrape the Reinsaat shop and mirror its products into HerbAPI.

    Steps: load species and existing cultivars from the API, crawl every
    entry of ``CATEGORIES``, de-duplicate the products by URL, then create a
    cultivar (or reuse the existing one) and link it to the Reinsaat
    supplier.  Failures for a single product are reported and counted; they
    never stop the run.
    """
    print("=" * 70)
    print("Reinsaat Scraper -> HerbAPI")
    print("=" * 70)

    # Load species
    print("\n[1] Loading species from API...")
    species_map = load_species()
    sci_names = [k for k in species_map if " " in k]
    print(f" {len(sci_names)} species loaded:")
    for k in sorted(sci_names):
        s = species_map[k]
        print(f" {s['name_scientific']:40s} {s['id'][:12]}...")

    # Load existing cultivars
    print("\n[2] Loading existing cultivars...")
    existing_cultivars = _load_existing_cultivars()
    print(f" {len(existing_cultivars)} existing cultivars")

    # Discover products from all categories
    print("\n[3] Discovering products from Reinsaat categories...")
    all_products = []
    visited = set()
    for cat_url, species_hint in CATEGORIES:
        print(f"\n Category: {cat_url}")
        products = discover_products(cat_url, species_hint, max_depth=3, _visited=visited)
        all_products.extend(products)
        print(f" -> {len(products)} products from this category")
    print(f"\n Total products discovered: {len(all_products)}")

    # Deduplicate by URL, keeping the first occurrence of each product
    unique = {}
    for p in all_products:
        unique.setdefault(p.url, p)
    all_products = list(unique.values())
    print(f" Unique products: {len(all_products)}")

    # Process products
    print("\n[4] Creating cultivars in API...")
    stats = {"created": 0, "skipped_no_species": 0, "skipped_exists": 0, "errors": 0, "linked": 0}
    total = len(all_products)

    for i, product in enumerate(all_products):
        pct = (i + 1) / total * 100
        print(f"\n [{i+1}/{total}] ({pct:.0f}%) {product.name}")

        # Match species
        species = match_species(product.latin_name, species_map)
        if not species:
            print(f" Skip: no species match for '{product.latin_name}'")
            stats["skipped_no_species"] += 1
            continue

        species_id = species["id"]
        print(f" Species: {species['name_scientific']}")
        print(f" SKU: {product.sku}, Depth: {product.sowing_depth_cm}, "
              f"Spacing: {product.row_spacing_cm}x{product.plant_spacing_cm}, "
              f"Temp: {product.germination_temp_c}, Perennial: {product.perennial}")

        key = (species_id, product.name.lower())
        if key in existing_cultivars:
            # The cultivar is already there; still try to attach the supplier.
            cultivar_id = existing_cultivars[key]
            print(f" Exists: {cultivar_id[:12]}... - checking supplier link")
            try:
                _link_supplier(cultivar_id, product)
            except Exception as e:
                # Usually "already linked"; report it rather than hiding a
                # genuine API failure.
                print(f" Not linked (already linked or API error): {e}")
            else:
                print(f" Linked to Reinsaat (SKU: {product.sku})")
                stats["linked"] += 1
            stats["skipped_exists"] += 1
            continue

        # Create cultivar
        try:
            result = api_post("/cultivars", _build_cultivar_payload(product, species_id))
            cultivar_id = result["id"]
        except Exception as e:
            print(f" FAILED to create: {e}")
            stats["errors"] += 1
            continue
        print(f" Created: {cultivar_id}")
        stats["created"] += 1
        existing_cultivars[key] = cultivar_id

        # Link to supplier
        try:
            _link_supplier(cultivar_id, product)
        except Exception as e:
            print(f" FAILED to link supplier: {e}")
        else:
            print(f" Linked to Reinsaat (SKU: {product.sku})")
            stats["linked"] += 1

    # Summary
    print("\n" + "=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f" Created: {stats['created']}")
    print(f" Linked to supplier: {stats['linked']}")
    print(f" Skipped (no species): {stats['skipped_no_species']}")
    print(f" Skipped (exists): {stats['skipped_exists']}")
    print(f" Errors: {stats['errors']}")
    print("=" * 70)
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: run the scraper only when executed directly, not on import.
if __name__ == "__main__":
    main()
|
||||||
@@ -0,0 +1,770 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Reinsaat Scraper v2 — scrape ALL Reinsaat categories, match species by extracting
|
||||||
|
genus+species from extended botanical names, create/enrich cultivars, link supplier.
|
||||||
|
|
||||||
|
Uses direct PostgreSQL access (psycopg2) for speed and reliability.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import html as html_mod
import json
import os
import re
import ssl
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
import uuid
from dataclasses import dataclass, field
from typing import Optional
|
||||||
|
|
||||||
|
# Unbuffered output
|
||||||
|
sys.stdout.reconfigure(line_buffering=True)
|
||||||
|
sys.stderr.reconfigure(line_buffering=True)
|
||||||
|
|
||||||
|
import psycopg2
|
||||||
|
import psycopg2.extras
|
||||||
|
|
||||||
|
# ── Config ──────────────────────────────────────────────────────────────────
|
||||||
|
# NOTE(security): the database connection settings, including the password,
# were committed in clear text.  Each value can now be overridden through a
# HERBAPI_DB_* environment variable; the literals remain only as
# backward-compatible defaults.  The password should be rotated and removed
# from source control.
DB_HOST = os.environ.get("HERBAPI_DB_HOST", "10.31.3.90")
DB_NAME = os.environ.get("HERBAPI_DB_NAME", "herbapi")
DB_USER = os.environ.get("HERBAPI_DB_USER", "herbapi")
DB_PASS = os.environ.get("HERBAPI_DB_PASS", "_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj")

# Supplier id attached to cultivars scraped from Reinsaat.
REINSAAT_SUPPLIER_ID = "019ced24-1702-72d1-9acc-90435441a5c4"
# Pause in seconds after every page fetch.
DELAY = 0.3
# Identifies the scraper (with a contact address) to the shop's web server.
USER_AGENT = "HerbAPI-Scraper/2.0 (florian.berthold@sub-net.at)"
|
||||||
|
|
||||||
|
# ── All Reinsaat categories ────────────────────────────────────────────────
|
||||||
|
# Entry pages of the crawl: one URL per top-level category of the German shop.
_SHOP_ROOT = "https://www.reinsaat.at/shop/DE/"
CATEGORIES = [
    f"{_SHOP_ROOT}{slug}/"
    for slug in (
        "bohnen",
        "erbsen",
        "gurken",
        "karotten_moehren_1",
        "knollenfenchel",
        "kohlgewaechse",
        "kuerbis",
        "mais",
        "mangold",
        "melanzani_1",
        "melone",
        "paprika",
        "pastinaken_1",
        "petersilie",
        "pfefferoni_chili",
        "porree",
        "radies_rettich",
        "rote_ruebe",
        "salate",
        "schwarzwurzeln",
        "sellerie",
        "spinat",
        "tomaten_paradeiser",
        "wurzelpetersilie_1",
        "zucchini",
        "zwiebel_knoblauch",
        "kuechen-_und_gewuerzkraeuter",
        "blumen_und_heilkraeuter",
        "gruenduengung",
    )
]
|
||||||
|
|
||||||
|
# ── HTTP ────────────────────────────────────────────────────────────────────
|
||||||
|
# One TLS context with default certificate verification, shared by all requests.
_ssl_ctx = ssl.create_default_context()


def fetch_url(url: str, retries: int = 2) -> str:
    """Download ``url`` and return the response body as text.

    Transient failures (network errors, timeouts, HTTP 5xx, 408, 429) are
    retried up to ``retries`` times with a two-second pause.  Other HTTP 4xx
    responses are raised immediately, since repeating the same request cannot
    succeed.

    Args:
        url: Address to fetch.
        retries: Number of additional attempts after the first one.

    Returns:
        The body decoded with the charset announced by the server (UTF-8 when
        none is given).  If that charset is unknown or does not fit the
        bytes, the body is decoded as UTF-8 with replacement characters
        rather than discarding the page.

    Raises:
        urllib.error.HTTPError, urllib.error.URLError, TimeoutError,
        ConnectionError: When the last attempt fails.
    """
    req = urllib.request.Request(url, headers={
        "User-Agent": USER_AGENT,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "de-AT,de;q=0.9,en;q=0.5",
    })
    for attempt in range(retries + 1):
        last_attempt = attempt >= retries
        try:
            with urllib.request.urlopen(req, timeout=30, context=_ssl_ctx) as resp:
                charset = resp.headers.get_content_charset() or "utf-8"
                payload = resp.read()
        except urllib.error.HTTPError as exc:
            # HTTPError is a URLError subclass, so it must be handled first.
            permanent = 400 <= exc.code < 500 and exc.code not in (408, 429)
            if permanent or last_attempt:
                raise
        except (urllib.error.URLError, TimeoutError, ConnectionError):
            if last_attempt:
                raise
        else:
            try:
                return payload.decode(charset)
            except (LookupError, UnicodeDecodeError):
                return payload.decode("utf-8", errors="replace")
        time.sleep(2)
    # Only reachable when ``retries`` is negative and no attempt was made.
    return ""
|
||||||
|
|
||||||
|
|
||||||
|
# ── HTML parsing helpers ────────────────────────────────────────────────────
|
||||||
|
def extract_links(html_text: str, base_url: str) -> list[str]:
    """Return the absolute targets of all ``<a href=...>`` links, in page order.

    Both double- and single-quoted ``href`` values are recognised, character
    references such as ``&amp;`` are decoded before the URL is resolved, and
    attributes that merely end in "href" (``data-href``) are ignored.
    Fragment-only and ``javascript:`` links are dropped; duplicates are
    reported once.

    Args:
        html_text: Raw HTML of the page.
        base_url: URL of the page, used to resolve relative links.

    Returns:
        De-duplicated absolute URLs in order of first appearance.
    """
    href_re = re.compile(
        r'''<a\s(?:[^>]*?\s)?href\s*=\s*(?:"([^"]*)"|'([^']*)')''',
        re.IGNORECASE,
    )
    links = []
    seen = set()
    for m in href_re.finditer(html_text):
        raw = m.group(1) if m.group(1) is not None else m.group(2)
        href = html_mod.unescape(raw).strip()
        if not href or href.startswith(("#", "javascript:")):
            continue
        full = urllib.parse.urljoin(base_url, href)
        if full not in seen:
            seen.add(full)
            links.append(full)
    return links
|
||||||
|
|
||||||
|
|
||||||
|
def _find_jsonld_product(node) -> Optional[dict]:
    """Return the first object typed ``Product`` in a decoded JSON-LD value."""
    if isinstance(node, list):
        for item in node:
            found = _find_jsonld_product(item)
            if found is not None:
                return found
        return None
    if not isinstance(node, dict):
        return None
    node_type = node.get("@type")
    if node_type == "Product" or (isinstance(node_type, list) and "Product" in node_type):
        return node
    # Several nodes may be bundled under "@graph".
    return _find_jsonld_product(node.get("@graph"))


def extract_jsonld_product(html_text: str) -> Optional[dict]:
    """Return the JSON-LD ``Product`` object embedded in a page, if any.

    Every ``<script type="application/ld+json">`` element is decoded in page
    order.  The product may be the top-level object, an element of a
    top-level array, or a node inside ``@graph``; ``@type`` may be a string
    or a list of strings.  Scripts that are not valid JSON are skipped.

    Args:
        html_text: Raw HTML of the page.

    Returns:
        The decoded product dict, or ``None`` when the page carries none.
    """
    for m in re.finditer(
        r'<script[^>]*type="application/ld\+json"[^>]*>(.*?)</script>',
        html_text, re.DOTALL | re.IGNORECASE
    ):
        try:
            data = json.loads(m.group(1))
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        product = _find_jsonld_product(data)
        if product is not None:
            return product
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def html_to_text(html_text: str) -> str:
    """Reduce an HTML fragment to plain text.

    Each tag becomes a space, character references are decoded, and every
    run of whitespace is collapsed to one space with the ends trimmed.
    """
    without_tags = re.sub(r'<[^>]+>', ' ', html_text)
    decoded = html_mod.unescape(without_tags)
    return re.sub(r'\s+', ' ', decoded).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def extract_botanical_name(html_text: str) -> str:
    """Find the botanical (Latin) name shown on a product page.

    Three places are tried in order, and the first text that looks like
    "Genus species" wins:

    1. the ``fce_shop_kurztext`` element,
    2. an ``<em>`` inside the ``growingInfos`` element,
    3. any ``<em>``/``<i>`` on the page whose text starts with such a name
       and is shorter than 100 characters.

    Returns:
        The text as found (it may still carry authority names or
        infraspecific ranks), or ``""`` when nothing matches.
    """
    flags = re.DOTALL | re.IGNORECASE

    def cleaned(fragment):
        return html_to_text(fragment).strip()

    def looks_latin(candidate):
        return bool(candidate) and re.search(r'[A-Z][a-z]+\s+[a-z]', candidate) is not None

    # 1) short-text element
    short = re.search(r'class="fce_shop_kurztext"[^>]*>(.*?)</div>', html_text, flags)
    if short:
        candidate = cleaned(short.group(1))
        if looks_latin(candidate):
            return candidate

    # 2) emphasised text within the growing information
    growing = re.search(r'class="growingInfos"[^>]*>(.*?)</div>', html_text, flags)
    if growing:
        for hit in re.finditer(r'<em>(.*?)</em>', growing.group(1), re.DOTALL):
            candidate = cleaned(hit.group(1))
            if looks_latin(candidate):
                return candidate

    # 3) last resort: any short emphasised/italic text that starts with a name
    for hit in re.finditer(r'<(?:em|i)>(.*?)</(?:em|i)>', html_text, flags):
        candidate = cleaned(hit.group(1))
        if candidate and len(candidate) < 100 and re.search(r'^[A-Z][a-z]+\s+[a-z]+', candidate):
            return candidate

    return ""
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_latin_name(raw: str) -> str:
    """Cut an extended botanical name down to "Genus species".

    Examples:
        "Pisum sativum L. convar. sat."  -> "Pisum sativum"
        "Brassica oleracea L. convar. botrytis" -> "Brassica oleracea"
        "Cucumis sativus"                -> "Cucumis sativus"
        "Mentha × piperita"              -> "Mentha x piperita"

    A hybrid marker ("x" or "×") as second word is kept and written as "x".
    Input that does not look like a capitalised genus followed by a
    lower-case epithet is returned trimmed but otherwise unchanged.
    """
    if not raw:
        return ""

    # Trim whitespace first, then punctuation hanging on either end.
    name = raw.strip().strip(".,;:")
    parts = name.split()
    if len(parts) < 2:
        return name

    genus, second, *rest = parts

    # Hybrid notation: genus, marker, epithet.
    if rest and second in ("x", "×"):
        return f"{genus} x {rest[0]}"

    if genus[0].isupper() and second[0].islower():
        return f"{genus} {second}"
    return name  # not parseable as a binomial; hand it back as-is
|
||||||
|
|
||||||
|
|
||||||
|
# ── Calendar parsing ────────────────────────────────────────────────────────
|
||||||
|
# Calendar row label (lower-cased substring) -> cultivar column it feeds.
CALENDAR_ROW_TYPES = {
    "voranzucht": "indoor_sowing_months",
    "vorzucht": "indoor_sowing_months",
    "vorkultur": "indoor_sowing_months",
    "aussaat/ pflanzung freiland": "direct_sowing_months",
    "aussaat/pflanzung freiland": "direct_sowing_months",
    "aussaat freiland": "direct_sowing_months",
    "direktsaat": "direct_sowing_months",
    "pflanzung freiland": "transplanting_months",
    "pflanzung": "transplanting_months",
    "aussaat/ pflanzung gewächshaus": "glasshouse_months",
    "aussaat/pflanzung gewächshaus": "glasshouse_months",
    "gewächshaus": "glasshouse_months",
    "ernte": "harvesting_months",
}

# Patterns are substrings of one another ("pflanzung" occurs inside
# "aussaat/ pflanzung gewächshaus"), so the most specific one has to be tried
# first.  Matching in plain dict order filed glasshouse rows under
# transplanting_months.
_CALENDAR_PATTERNS_LONGEST_FIRST = sorted(
    CALENDAR_ROW_TYPES.items(), key=lambda item: len(item[0]), reverse=True
)


def parse_calendar(html_text: str) -> dict:
    """Parse the growing-calendar table of a Reinsaat product page.

    The table is located through its ``rs-growing-time`` class.  Each row is
    identified by the text of its ``type-lable`` cell (spelling as used by
    the site) and mapped to a field via ``CALENDAR_ROW_TYPES``, the longest
    matching pattern winning.  Every ``background-color`` declaration in the
    row is taken as one half-month cell in order; a colour other than
    ``none``/``transparent`` marks the month as active.

    NOTE(review): cells without a ``background-color`` declaration are not
    counted, so the month mapping assumes every cell carries one — confirm
    against the live markup.

    Args:
        html_text: Raw HTML of the product page.

    Returns:
        Dict such as ``{"direct_sowing_months": [4, 5], ...}``; each value is
        a sorted list of month numbers (1-12).  Rows that map to the same
        field are merged.  Empty when the page has no calendar.
    """
    result = {}

    cal_match = re.search(r'class="rs-growing-time[^"]*"(.*?)</table>', html_text, re.DOTALL)
    if not cal_match:
        return result

    for row in re.findall(r'<tr>(.*?)</tr>', cal_match.group(1), re.DOTALL):
        label_m = re.search(r'class="type-lable"[^>]*>(.*?)</td>', row, re.DOTALL)
        if not label_m:
            continue
        label = html_to_text(label_m.group(1)).strip().lower()

        field_name = next(
            (fname for pattern, fname in _CALENDAR_PATTERNS_LONGEST_FIRST if pattern in label),
            None,
        )
        if not field_name:
            continue

        # 24 cells are expected: 12 months x 2 halves; cell i -> month i // 2 + 1.
        colors = re.findall(r'background-color:\s*([^;"]+)', row)
        active_months = set()
        for i, color in enumerate(colors):
            if color.strip().lower() in ("none", "transparent", ""):
                continue
            month = (i // 2) + 1
            if 1 <= month <= 12:
                active_months.add(month)

        if active_months:
            # Merge with an earlier row of the same kind (e.g. two sowing rows).
            result[field_name] = sorted(set(result.get(field_name, ())) | active_months)

    return result
|
||||||
|
|
||||||
|
|
||||||
|
# ── Growing data extraction ─────────────────────────────────────────────────
|
||||||
|
def extract_growing_data(html_text: str) -> dict:
    """Extract sowing depth, spacing, germination temperature and perenniality.

    Only the text of the ``growingInfos`` element is searched.  Ranges such
    as "2–3 cm" are averaged, and a decimal comma or point is accepted in
    depth and spacing values, including both sides of a range such as
    "2,5–3,5 x 30–40 cm".

    Args:
        html_text: Raw HTML of the product page.

    Returns:
        Dict with any of ``planting_depth_cm``, ``row_spacing_cm``,
        ``plant_spacing_cm``, ``germination_temp_c`` (rounded floats) and
        ``perennial`` (``True``).  Keys for which nothing was found are
        absent; the dict is empty when the page has no ``growingInfos``.
    """
    data = {}

    gi = re.search(r'class="growingInfos"[^>]*>(.*?)</div>', html_text, re.DOTALL | re.IGNORECASE)
    if not gi:
        return data

    # Tags become spaces, entities are decoded, whitespace is collapsed.
    raw_text = html_mod.unescape(re.sub(r'<[^>]+>', ' ', gi.group(1)))
    raw_text = re.sub(r'\s+', ' ', raw_text)

    def to_float(token):
        return float(token.replace(",", "."))

    # ── Sowing depth ──
    depth_pats = [
        r'(?:Saattiefe|Aussaattiefe|Ablagetiefe|Saatgutablage)[:\s]*(?:ca\.?\s*)?(\d+(?:[.,]\d+)?)\s*[-–]\s*(\d+(?:[.,]\d+)?)\s*cm',
        r'(?:Saattiefe|Aussaattiefe|Ablagetiefe|Saatgutablage)[:\s]*(?:ca\.?\s*)?(\d+(?:[.,]\d+)?)\s*cm',
    ]
    for pat in depth_pats:
        dm = re.search(pat, raw_text, re.IGNORECASE)
        if dm:
            vals = [to_float(v) for v in dm.groups()]
            data["planting_depth_cm"] = round(sum(vals) / len(vals), 2)
            break

    # ── Spacing: "ROW x PLANT cm" ──
    spacing_pats = [
        # "30–45 x 3–5 cm" (range x range)
        r'(\d+(?:[.,]\d+)?)\s*[-–]\s*(\d+(?:[.,]\d+)?)\s*[x×]\s*(\d+(?:[.,]\d+)?)\s*[-–]\s*(\d+(?:[.,]\d+)?)\s*cm',
        # "100 x 50 cm" (simple)
        r'(\d+(?:[.,]\d+)?)\s*[x×]\s*(\d+(?:[.,]\d+)?)\s*cm',
    ]
    for pat in spacing_pats:
        matches = re.findall(pat, raw_text, re.IGNORECASE)
        if not matches:
            continue
        # The last occurrence in the text is preferred.
        vals = [to_float(v) for v in matches[-1]]
        if len(vals) == 4:
            row, plant = (vals[0] + vals[1]) / 2, (vals[2] + vals[3]) / 2
        else:
            row, plant = vals
        data["row_spacing_cm"] = round(row, 1)
        data["plant_spacing_cm"] = round(plant, 1)
        break

    # ── Germination temperature ──
    temp_pats = [
        r'(?:Keimtemperatur|Keimtemp)[.:\s]*(?:ca\.?\s*)?(\d+)\s*[-–]\s*(\d+)\s*[°]?\s*C',
        r'(?:mindestens|mind\.)\s*(\d+)\s*°\s*C',
    ]
    for pat in temp_pats:
        tm = re.search(pat, raw_text, re.IGNORECASE)
        if not tm:
            continue
        vals = [float(v) for v in tm.groups()]
        avg = sum(vals) / len(vals)
        # Plausibility window; an implausible hit falls through to the next pattern.
        if 5 <= avg <= 40:
            data["germination_temp_c"] = round(avg, 1)
            break

    # ── Perennial ──
    perennial_pats = [r'mehrj[aä]hrig', r'winterhart', r'ausdauernd', r'Halbstrauch', r'Staude']
    if any(re.search(pat, raw_text, re.IGNORECASE) for pat in perennial_pats):
        data["perennial"] = True

    return data
|
||||||
|
|
||||||
|
|
||||||
|
# ── Product data ────────────────────────────────────────────────────────────
|
||||||
|
@dataclass
class ProductData:
    """Everything scraped from one product page."""

    # Product name from the page's JSON-LD block.
    name: str = ""
    # Botanical name as printed on the page.
    raw_latin_name: str = ""
    # Result of normalize_latin_name() applied to raw_latin_name.
    normalized_latin: str = ""
    # Product description from the JSON-LD block.
    description: str = ""
    # JSON-LD "model" value, used as the article number.
    sku: str = ""
    # URL the page was fetched from.
    url: str = ""
    is_organic: bool = True
    # Output of extract_growing_data().
    growing_data: dict = field(default_factory=dict)
    # Output of parse_calendar().
    calendar: dict = field(default_factory=dict)
|
||||||
|
|
||||||
|
|
||||||
|
def _jsonld_text(value) -> str:
    """Turn a JSON-LD scalar into trimmed text; anything else becomes ``""``."""
    if isinstance(value, str):
        return value.strip()
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return str(value)
    return ""


def parse_product(html_text: str, url: str) -> Optional[ProductData]:
    """Parse a product page. Returns ProductData or None if not a product page.

    A page counts as a product page when it embeds a JSON-LD ``Product``
    object.  Name, description and SKU (the JSON-LD ``model``) come from that
    object.  Null or numeric values are tolerated rather than raising, so one
    odd page cannot abort the crawl.  Botanical name, growing data and
    calendar are extracted from the HTML itself.

    Args:
        html_text: Raw HTML of the page.
        url: URL the page was fetched from; stored on the result.
    """
    jsonld = extract_jsonld_product(html_text)
    if not jsonld:
        return None

    raw_latin = extract_botanical_name(html_text)
    return ProductData(
        name=_jsonld_text(jsonld.get("name")),
        raw_latin_name=raw_latin,
        normalized_latin=normalize_latin_name(raw_latin),
        description=_jsonld_text(jsonld.get("description")),
        sku=_jsonld_text(jsonld.get("model")),
        url=url,
        # Every product is recorded as organic; the page is not inspected
        # for this.
        is_organic=True,
        growing_data=extract_growing_data(html_text),
        calendar=parse_calendar(html_text),
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ── Recursive discovery ─────────────────────────────────────────────────────
|
||||||
|
def discover_products(
    category_url: str,
    max_depth: int = 4,
    _depth: int = 0,
    _visited: set = None,
) -> list[ProductData]:
    """Collect all products reachable from ``category_url``.

    The page is fetched once.  If ``parse_product`` recognises it, that
    product is the whole result.  Otherwise the page is treated as a category
    and every same-site link exactly one path segment deeper is crawled
    recursively.

    Args:
        category_url: Page to start from.
        max_depth: Deepest recursion level that is still fetched.
        _depth: Current recursion level (internal).
        _visited: URLs already handled, shared across calls (internal).

    Returns:
        The products found; empty when the URL was seen before, lies beyond
        ``max_depth``, or could not be fetched.
    """
    seen = set() if _visited is None else _visited
    if _depth > max_depth or category_url in seen:
        return []
    seen.add(category_url)

    pad = " " * (_depth + 1)

    try:
        page = fetch_url(category_url)
        time.sleep(DELAY)
    except Exception as exc:
        # Best effort: report the failure and carry on with the other pages.
        print(f"{pad}ERROR fetching {category_url}: {exc}")
        return []

    found = parse_product(page, category_url)
    if found:
        return [found]

    def direct_children():
        """Yield normalised URLs of links one level below this category."""
        prefix = urllib.parse.urlparse(category_url).path.rstrip("/") + "/"
        for href in extract_links(page, category_url):
            target = urllib.parse.urlparse(href)
            if target.netloc not in ("", "www.reinsaat.at"):
                continue
            path = target.path.rstrip("/")
            if not path.startswith(prefix):
                continue
            tail = path[len(prefix):]
            if tail and "/" not in tail:
                yield f"https://www.reinsaat.at{path}/"

    # dict.fromkeys removes duplicates while keeping page order.
    child_urls = list(dict.fromkeys(u for u in direct_children() if u not in seen))
    print(f"{pad}Category {category_url} -> {len(child_urls)} children")

    collected = []
    for url in child_urls:
        collected.extend(discover_products(url, max_depth, _depth + 1, seen))
    return collected
|
||||||
|
|
||||||
|
|
||||||
|
# ── Slug generation ─────────────────────────────────────────────────────────
|
||||||
|
def make_slug(species_name: str, cultivar_name: str) -> str:
    """Build a lower-case, hyphen-separated slug from species and cultivar name.

    Umlauts and common accented letters are transliterated to ASCII
    ("ü" -> "ue", "ß" -> "ss", "é" -> "e", ...).  Every remaining run of
    characters outside ``a-z0-9`` becomes a single hyphen, and hyphens at
    either end are removed.
    """
    transliteration = str.maketrans({
        'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'ß': 'ss',
        'é': 'e', 'è': 'e', 'ê': 'e', 'ë': 'e',
        'á': 'a', 'à': 'a', 'â': 'a',
        'í': 'i', 'ì': 'i', 'î': 'i',
        'ó': 'o', 'ò': 'o', 'ô': 'o',
        'ú': 'u', 'ù': 'u', 'û': 'u',
        'ñ': 'n', 'ç': 'c',
    })
    ascii_text = f"{species_name}-{cultivar_name}".lower().translate(transliteration)
    # Whole runs are replaced at once, so no doubled hyphens can remain.
    return re.sub(r'[^a-z0-9]+', '-', ascii_text).strip('-')
|
||||||
|
|
||||||
|
|
||||||
|
# ── Main ────────────────────────────────────────────────────────────────────
|
||||||
|
def db_connect():
    """Open a new PostgreSQL connection using the module-level DB_* settings.

    Autocommit is switched off on the returned connection.
    """
    settings = {
        "host": DB_HOST,
        "dbname": DB_NAME,
        "user": DB_USER,
        "password": DB_PASS,
    }
    connection = psycopg2.connect(**settings)
    connection.autocommit = False
    return connection
|
||||||
|
|
||||||
|
|
||||||
|
def _collect_missing_fields(product, existing):
    """Return {column: value} for data the page has but the stored cultivar lacks."""
    updates = {}

    # Growing data from page
    gd = product.growing_data
    for col in ("planting_depth_cm", "row_spacing_cm",
                "plant_spacing_cm", "germination_temp_c"):
        if gd.get(col) and not existing.get(col):
            updates[col] = gd[col]
    if gd.get("perennial") and not existing.get("perennial"):
        updates["perennial"] = True

    # Calendar data
    cal = product.calendar
    for col in ("indoor_sowing_months", "direct_sowing_months",
                "transplanting_months", "glasshouse_months",
                "harvesting_months"):
        if cal.get(col) and not existing.get(col):
            updates[col] = cal[col]

    # Description
    if product.description and not existing.get("description"):
        updates["description"] = product.description

    return updates


def main():
    """Scrape Reinsaat and synchronise the results with the database.

    Phase 1 discovers products from every entry in CATEGORIES and
    de-duplicates them by URL. Phase 2 loads species, cultivars and existing
    Reinsaat supplier links. Phase 3 matches each product to a species and
    either enriches/links an existing cultivar or creates a new one with its
    supplier link. Every write runs inside its own savepoint so a single
    failing product is counted as an error without losing the rest; all
    changes are committed once at the end. A summary is printed afterwards.
    """
    print("=" * 70)
    print("Reinsaat Scraper v2")
    print("=" * 70)

    # ── Phase 1: Discover all products (no DB needed) ──
    print("\n[1] Discovering products from Reinsaat categories...")
    all_products: list[ProductData] = []
    visited: set[str] = set()

    for cat_url in CATEGORIES:
        print(f"\n Category: {cat_url}")
        products = discover_products(cat_url, max_depth=4, _visited=visited)
        all_products.extend(products)
        print(f" -> {len(products)} products")

    # Deduplicate by URL, keeping the first occurrence
    seen_urls = set()
    unique_products = []
    for p in all_products:
        if p.url not in seen_urls:
            seen_urls.add(p.url)
            unique_products.append(p)
    all_products = unique_products
    print(f"\n Total unique products: {len(all_products)}")

    # ── Phase 2: Connect to DB and load existing data ──
    print("\n[2] Connecting to DB and loading existing data...")
    conn = db_connect()
    cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)

    try:
        # Load species
        cur.execute("SELECT id, name_scientific FROM species ORDER BY name_scientific")
        species_map = {
            row["name_scientific"].lower().strip(): row for row in cur.fetchall()
        }
        print(f" {len(species_map)} species loaded")

        # Load existing cultivars
        cur.execute("""
            SELECT id, species_id, name, slug, description,
                   row_spacing_cm, plant_spacing_cm, planting_depth_cm,
                   germination_temp_c, perennial,
                   indoor_sowing_months, direct_sowing_months,
                   transplanting_months, glasshouse_months, harvesting_months
            FROM cultivars
        """)
        existing_cultivars = {}
        existing_slugs = set()
        for row in cur.fetchall():
            existing_cultivars[(str(row["species_id"]), row["name"].lower())] = dict(row)
            existing_slugs.add(row["slug"])
        print(f" {len(existing_cultivars)} cultivars loaded")

        # Load existing Reinsaat supplier links
        cur.execute("""
            SELECT cultivar_id, product_url, article_number
            FROM cultivar_suppliers
            WHERE supplier_id = %s
        """, (REINSAAT_SUPPLIER_ID,))
        existing_links = {}
        for row in cur.fetchall():
            existing_links.setdefault(str(row["cultivar_id"]), []).append(
                (row["product_url"] or "", row["article_number"] or "")
            )
        print(f" {sum(len(v) for v in existing_links.values())} existing links for {len(existing_links)} cultivars")

        # ── Phase 3: Process products ──
        print("\n[3] Processing products...")
        stats = {
            "created": 0,
            "linked": 0,
            "enriched": 0,
            "skipped_no_species": 0,
            "skipped_no_name": 0,
            "link_exists": 0,
            "errors": 0,
        }
        unmatched = []
        total = len(all_products)

        for position, product in enumerate(all_products, start=1):
            pct = position / total * 100
            prefix = f" [{position}/{total}] ({pct:.0f}%)"

            if not product.name:
                stats["skipped_no_name"] += 1
                continue

            # Match species
            species = species_map.get(product.normalized_latin.lower().strip())
            if not species:
                # Try exact match on raw name (first two words)
                raw_words = product.raw_latin_name.split()
                if len(raw_words) >= 2:
                    species = species_map.get(
                        f"{raw_words[0].lower()} {raw_words[1].lower()}"
                    )
            if not species:
                stats["skipped_no_species"] += 1
                unmatched.append((product.name, product.raw_latin_name,
                                  product.normalized_latin, product.url))
                continue

            species_id = str(species["id"])
            species_name = species["name_scientific"]

            # Check if cultivar exists
            ckey = (species_id, product.name.lower())
            existing = existing_cultivars.get(ckey)

            if existing:
                cultivar_id = str(existing["id"])

                # ── Enrich existing cultivar with missing data ──
                updates = _collect_missing_fields(product, existing)
                if updates:
                    # Column names come from the fixed lists in
                    # _collect_missing_fields, never from scraped text.
                    set_clauses = [f"{col} = %s" for col in updates]
                    set_clauses.append("updated_at = NOW()")
                    values = [*updates.values(), cultivar_id]
                    try:
                        cur.execute("SAVEPOINT enrich_sp")
                        cur.execute(
                            f"UPDATE cultivars SET {', '.join(set_clauses)} WHERE id = %s::uuid",
                            values
                        )
                        cur.execute("RELEASE SAVEPOINT enrich_sp")
                    except Exception as e:
                        print(f"{prefix} {product.name} -> ENRICH ERROR: {e}")
                        cur.execute("ROLLBACK TO SAVEPOINT enrich_sp")
                        stats["errors"] += 1
                    else:
                        # Keep the cache in step so a duplicate product does
                        # not overwrite what was just stored.
                        existing.update(updates)
                        stats["enriched"] += 1
                        print(f"{prefix} {product.name} -> ENRICHED ({', '.join(updates.keys())})")

                # ── Add supplier link if missing ──
                link_exists = any(
                    lurl == product.url or (lsku and lsku == product.sku)
                    for lurl, lsku in existing_links.get(cultivar_id, [])
                )
                if link_exists:
                    stats["link_exists"] += 1
                else:
                    try:
                        cur.execute("SAVEPOINT link_sp")
                        cur.execute("""
                            INSERT INTO cultivar_suppliers (cultivar_id, supplier_id, product_url, article_number, last_checked_at)
                            VALUES (%s::uuid, %s::uuid, %s, %s, NOW())
                            ON CONFLICT (cultivar_id, supplier_id, article_number) DO UPDATE
                            SET product_url = EXCLUDED.product_url, last_checked_at = NOW()
                        """, (cultivar_id, REINSAAT_SUPPLIER_ID, product.url, product.sku))
                        cur.execute("RELEASE SAVEPOINT link_sp")
                    except Exception as e:
                        print(f"{prefix} {product.name} -> LINK ERROR: {e}")
                        cur.execute("ROLLBACK TO SAVEPOINT link_sp")
                        stats["errors"] += 1
                    else:
                        stats["linked"] += 1
                        existing_links.setdefault(cultivar_id, []).append((product.url, product.sku))
                        print(f"{prefix} {product.name} -> LINKED ({product.sku})")
            else:
                # ── Create new cultivar ──
                base_slug = make_slug(species_name, product.name)
                slug = base_slug
                counter = 2
                # Ensure unique slug
                while slug in existing_slugs:
                    slug = f"{base_slug}-{counter}"
                    counter += 1

                gd = product.growing_data
                cal = product.calendar
                new_row = {
                    "description": product.description,
                    "perennial": gd.get("perennial", False),
                    "planting_depth_cm": gd.get("planting_depth_cm"),
                    "row_spacing_cm": gd.get("row_spacing_cm"),
                    "plant_spacing_cm": gd.get("plant_spacing_cm"),
                    "germination_temp_c": gd.get("germination_temp_c"),
                    "indoor_sowing_months": cal.get("indoor_sowing_months"),
                    "direct_sowing_months": cal.get("direct_sowing_months"),
                    "transplanting_months": cal.get("transplanting_months"),
                    "glasshouse_months": cal.get("glasshouse_months"),
                    "harvesting_months": cal.get("harvesting_months"),
                }

                try:
                    cur.execute("SAVEPOINT create_sp")
                    cur.execute("""
                        INSERT INTO cultivars (
                            species_id, name, name_de, slug, description,
                            is_organic, perennial,
                            planting_depth_cm, row_spacing_cm, plant_spacing_cm,
                            germination_temp_c,
                            indoor_sowing_months, direct_sowing_months,
                            transplanting_months, glasshouse_months, harvesting_months
                        ) VALUES (
                            %s::uuid, %s, %s, %s, %s,
                            %s, %s,
                            %s, %s, %s,
                            %s,
                            %s, %s,
                            %s, %s, %s
                        )
                        RETURNING id
                    """, (
                        species_id,
                        product.name,
                        product.name,
                        slug,
                        new_row["description"],
                        product.is_organic,
                        new_row["perennial"],
                        new_row["planting_depth_cm"],
                        new_row["row_spacing_cm"],
                        new_row["plant_spacing_cm"],
                        new_row["germination_temp_c"],
                        new_row["indoor_sowing_months"],
                        new_row["direct_sowing_months"],
                        new_row["transplanting_months"],
                        new_row["glasshouse_months"],
                        new_row["harvesting_months"],
                    ))
                    new_id = str(cur.fetchone()["id"])

                    # Link to supplier
                    cur.execute("""
                        INSERT INTO cultivar_suppliers (cultivar_id, supplier_id, product_url, article_number, last_checked_at)
                        VALUES (%s::uuid, %s::uuid, %s, %s, NOW())
                    """, (new_id, REINSAAT_SUPPLIER_ID, product.url, product.sku))
                    cur.execute("RELEASE SAVEPOINT create_sp")
                except Exception as e:
                    print(f"{prefix} {product.name} -> CREATE ERROR: {e}")
                    cur.execute("ROLLBACK TO SAVEPOINT create_sp")
                    stats["errors"] += 1
                else:
                    # Only record the cultivar once both inserts survived;
                    # otherwise the caches would point at a rolled-back row.
                    existing_slugs.add(slug)
                    existing_cultivars[ckey] = {"id": new_id, **new_row}
                    existing_links.setdefault(new_id, []).append((product.url, product.sku))
                    stats["created"] += 1
                    stats["linked"] += 1
                    print(f"{prefix} {product.name} -> CREATED ({species_name}, {slug})")

        # ── Commit ──
        conn.commit()

        # ── Summary ──
        print("\n" + "=" * 70)
        print("SUMMARY")
        print("=" * 70)
        print(f" Total products discovered: {len(all_products)}")
        print(f" New cultivars created: {stats['created']}")
        print(f" New supplier links added: {stats['linked']}")
        print(f" Cultivars enriched: {stats['enriched']}")
        print(f" Links already existed: {stats['link_exists']}")
        print(f" Skipped (no species): {stats['skipped_no_species']}")
        print(f" Skipped (no name): {stats['skipped_no_name']}")
        print(f" Errors: {stats['errors']}")
        print("=" * 70)

        if unmatched:
            print(f"\n UNMATCHED PRODUCTS ({len(unmatched)}):")
            for name, raw_latin, normalized, url in sorted(unmatched, key=lambda x: x[2]):
                print(f" {normalized:30s} (raw: {raw_latin:40s}) {name:30s} {url}")
    finally:
        # Release the cursor and connection even when a phase raises.
        cur.close()
        conn.close()


if __name__ == "__main__":
    main()
|
||||||
@@ -0,0 +1,635 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Reinsaat v3 scraper - uses HerbAPI REST API, robust botanical name matching."""
|
||||||
|
|
||||||
|
import json
import os
import re
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from html import unescape
|
||||||
|
|
||||||
|
# --- Config ---
# API endpoint and token can be overridden with the HERBAPI_BASE and
# HERBAPI_TOKEN environment variables; the literals are only fallbacks.
API_BASE = os.environ.get("HERBAPI_BASE", "http://herbapi01.corp.sub-net.at:8080/api/v1")
# SECURITY(review): a bearer token committed to the repository should be
# treated as leaked. Rotate it and drop this fallback once HERBAPI_TOKEN is
# set wherever the script runs.
API_TOKEN = os.environ.get("HERBAPI_TOKEN", "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk")
REINSAAT_BASE = "https://www.reinsaat.at"
# Pause between requests to the shop; the value is passed to time.sleep().
DELAY = 0.3
|
||||||
|
|
||||||
|
# Shop category slugs to scrape, in scrape order. Only seed products are
# listed; books, bulbs, peonies, potatoes, gift items and seed tapes are
# deliberately left out.
CATEGORIES = [
    "beans",
    "peas",
    "florence_fennel",
    "cucumbers",
    "brassica",
    "garden_cress",
    "pumpkins_squash",
    "corn",
    "swiss_chard",
    "aubergine_eggplants",
    "melons",
    "carrots",
    "sweet_pepper",
    "chilli_peppers_chill",
    "parsnips",
    "parsley",
    "parsley_root",
    "leeks",
    "radish",
    "beetroot",
    "lettuce",
    "black_salsify",
    "celery",
    "spinach",
    "tomatoes",
    "zucchini_courgette",
    "onion_garlic",
    "culinary_and_aromatic_herbs",
    "conservation_varieties",
    "flowers_and_herbs",
    "wild_flowers_seeds",
    "green_manure",
]
|
||||||
|
|
||||||
|
# Lower-cased tokens that are not part of a genus + species pair: author
# citations and infraspecific rank markers.
# NOTE(review): no function visible in this part of the file reads this set -
# confirm whether it is still used.
STRIP_SUFFIXES = {
    # infraspecific rank markers
    "convar.", "convar",
    "var.", "var",
    "subsp.", "subsp",
    "ssp.", "ssp",
    "f.",
    # author citations and remaining tokens
    "l.", "l", "mill.", "dc.",
    "em.", "auct.", "hort.",
    "medik.", "medikus", "moench", "pers.", "salisb.", "thunb.",
    "crantz", "gaertn.", "lam.", "link", "siebold", "zucc.",
    "sat.", "sat", "axillare",
}
|
||||||
|
|
||||||
|
|
||||||
|
def api_get(path, params=None, timeout=30):
    """GET a HerbAPI resource and return its decoded JSON body.

    Args:
        path: Path appended to API_BASE, e.g. "/species".
        params: Optional mapping encoded into the query string.
        timeout: Seconds to wait before giving up, so a stalled API cannot
            hang the whole run.

    Returns:
        The parsed JSON response.

    Raises:
        urllib.error.URLError: On connection failures or HTTP error statuses
            (HTTPError is a subclass).
    """
    url = f"{API_BASE}{path}"
    if params:
        url += "?" + urllib.parse.urlencode(params)
    req = urllib.request.Request(url)
    req.add_header("Authorization", f"Bearer {API_TOKEN}")
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())
|
||||||
|
|
||||||
|
|
||||||
|
def api_post(path, data, timeout=30):
    """POST a JSON payload to HerbAPI and return the decoded response.

    Args:
        path: Path appended to API_BASE.
        data: JSON-serialisable request body.
        timeout: Seconds to wait before giving up.

    Returns:
        The parsed JSON response, or None when the response body is empty.

    Raises:
        urllib.error.URLError: On connection failures or HTTP error statuses
            (HTTPError is a subclass).
    """
    url = f"{API_BASE}{path}"
    body = json.dumps(data).encode()
    req = urllib.request.Request(url, data=body, method="POST")
    req.add_header("Authorization", f"Bearer {API_TOKEN}")
    req.add_header("Content-Type", "application/json")
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        payload = resp.read()
    # An empty body is not valid JSON; report it as "no content" instead of
    # raising a decode error after the request already succeeded.
    if not payload.strip():
        return None
    return json.loads(payload)
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_page(url):
    """Fetch a web page and return its body as text.

    The body is decoded with the charset declared in the Content-Type
    header; UTF-8 is used when none is declared or the declared name is
    unknown. Undecodable bytes are replaced rather than raising.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page content.

    Raises:
        urllib.error.URLError: On connection failures or HTTP error statuses.
    """
    req = urllib.request.Request(url)
    req.add_header("User-Agent", "Mozilla/5.0 (HerbAPI Scraper)")
    with urllib.request.urlopen(req, timeout=15) as resp:
        body = resp.read()
        charset = resp.headers.get_content_charset() or "utf-8"
    try:
        return body.decode(charset, errors="replace")
    except LookupError:
        # Server announced a codec Python does not know.
        return body.decode("utf-8", errors="replace")
|
||||||
|
|
||||||
|
|
||||||
|
# Misspellings seen in shop data, keyed by the lower-cased wrong form.
# Keys are either a genus alone or a full "genus species" pair.
BOTANICAL_TYPOS = dict([
    ("capscicum", "capsicum"),
    ("capsicum frutenscens", "capsicum frutescens"),
    ("tropaelum", "tropaeolum"),
    ("lact.", "lactuca"),
])

# Abbreviated "genus species" prefixes and the full name they stand for.
ABBREVIATED_NAMES = dict([
    ("origanum vulg.", "origanum vulgare"),
    ("helichrysum bract.", "helichrysum bracteatum"),
    ("campanula lat.", "campanula latifolia"),
    ("cosmos bip.", "cosmos bipinnatus"),
    ("papaver somnif.", "papaver somniferum"),
])
|
||||||
|
|
||||||
|
|
||||||
|
def normalise_botanical(raw):
    """Reduce a botanical name to lower-case "genus species".

    'Pisum sativum L. convar. sat.' -> 'pisum sativum'
    'Solanum lycopersicum L.' -> 'solanum lycopersicum'
    'Beta vulgaris L. ssp. vulgaris' -> 'beta vulgaris'
    'Origanum vulg.' -> 'origanum vulgare'

    Args:
        raw: Botanical name as scraped or stored; may contain HTML entities,
            non-breaking spaces, parenthesised notes and author citations.

    Returns:
        The normalised two-word name, or None when the input is empty, has
        fewer than two words, or its second word looks like an author
        citation (starts with an upper-case letter) rather than an epithet.
    """
    if not raw:
        return None
    # Clean HTML entities
    raw = unescape(raw).replace("\xa0", " ").strip()
    # Remove trailing commas/periods
    raw = raw.rstrip(",. ")
    # Remove content in parentheses
    raw = re.sub(r"\([^)]*\)", "", raw)

    # Check abbreviated names first (before splitting). The trailing period
    # was stripped above, so a name consisting of nothing but the
    # abbreviation has to be compared against the period-less form.
    raw_lower = raw.lower().strip()
    for abbrev, full in ABBREVIATED_NAMES.items():
        if raw_lower.startswith(abbrev) or raw_lower == abbrev.rstrip("."):
            return full

    parts = raw.split()
    if len(parts) < 2:
        return None

    # Genus (capitalised) + species (lowercase)
    genus = parts[0].lower().rstrip(",")
    species = parts[1].lower().rstrip(",")
    # A token made only of commas leaves nothing to work with.
    if not genus or not species:
        return None

    # Fix known typos: genus alone first, then the full pair
    genus = BOTANICAL_TYPOS.get(genus, genus)
    full_name = f"{genus} {species}"
    if full_name in BOTANICAL_TYPOS:
        genus, species = BOTANICAL_TYPOS[full_name].split()

    # Validate: both words must start with a letter
    if not genus[0].isalpha() or not species[0].isalpha():
        return None
    # Skip if species looks like an authority (starts with uppercase in original)
    if parts[1][0].isupper():
        return None
    return f"{genus} {species}"
|
||||||
|
|
||||||
|
|
||||||
|
def _find_jsonld_product(html):
    """Return the first JSON-LD node of type Product in the page, or None."""
    for block in re.finditer(
        r'<script type="application/ld\+json">(.*?)</script>', html, re.S
    ):
        try:
            doc = json.loads(block.group(1))
        except json.JSONDecodeError:
            continue
        # JSON-LD allows either a single object or an array of objects.
        for node in (doc if isinstance(doc, list) else [doc]):
            if isinstance(node, dict) and node.get("@type") == "Product":
                return node
    return None


def _jsonld_price(value):
    """Return a positive price as a number, or None if it is not usable."""
    if isinstance(value, bool):
        return None
    if isinstance(value, (int, float)):
        return value if value > 0 else None
    if isinstance(value, str):
        # Prices are frequently serialised as strings such as "2.90".
        try:
            parsed = float(value.strip().replace(",", "."))
        except ValueError:
            return None
        return parsed if parsed > 0 else None
    return None


def extract_product_data(html, url):
    """Extract product info from a Reinsaat product page.

    Args:
        html: Page source.
        url: Address the page was fetched from; stored under "url".

    Returns:
        A dict holding whichever of these keys could be extracted: name,
        botanical_raw, botanical_norm, article_number, price_eur,
        port_price, pack_sizes, planting_depth_cm, row_spacing_cm,
        plant_spacing_cm, germination_temp_c, pack_size, pack_unit - plus
        "url", which is always present.
    """
    result = {}

    # H1 = variety name
    m = re.search(r'<h1[^>]*>([^<]+)</h1>', html)
    if m:
        name = unescape(m.group(1)).strip()
        # Clean up names like "RS-To-01.26 (Alda)" -> "Alda"
        paren = re.search(r"\(([^)]+)\)", name)
        if paren and re.match(r"RS-", name):
            name = paren.group(1).strip()
        result["name"] = name

    # Botanical name from fce_shop_kurztext
    m = re.search(
        r'fce_shop_kurztext[^>]*>\s*(?:<em[^>]*>)?\s*([^<]+?)\s*(?:</em>)?\s*</div>',
        html,
    )
    if m:
        result["botanical_raw"] = unescape(m.group(1)).replace("\xa0", " ").strip()
        result["botanical_norm"] = normalise_botanical(result["botanical_raw"])

    # Article number and price from JSON-LD
    product = _find_jsonld_product(html)
    if product is not None:
        if "model" in product:
            result["article_number"] = str(product["model"])
        # Get smallest pack price (usually the Portion)
        offers = product.get("offers", {})
        if isinstance(offers, dict):
            # Either a container with nested "offers" or a single offer.
            offer_list = offers.get("offers") or [offers]
        elif isinstance(offers, list):
            offer_list = offers
        else:
            offer_list = []
        if not isinstance(offer_list, list):
            offer_list = [offer_list]
        prices = [
            price
            for price in (
                _jsonld_price(o.get("price")) for o in offer_list if isinstance(o, dict)
            )
            if price is not None
        ]
        if prices:
            result["price_eur"] = min(prices)

    # Price table - get pack sizes (first row) and prices (second row)
    for tbl in re.findall(r"<table[^>]*>(.*?)</table>", html, re.S):
        if "€" not in tbl:
            continue
        rows = re.findall(r"<tr[^>]*>(.*?)</tr>", tbl, re.S)
        if len(rows) >= 2:
            size_texts = [
                re.sub(r"<[^>]+>", "", c).strip()
                for c in re.findall(r"<td[^>]*>(.*?)</td>", rows[0], re.S)
            ]
            price_texts = [
                re.sub(r"<[^>]+>", "", c).strip()
                for c in re.findall(r"<td[^>]*>(.*?)</td>", rows[1], re.S)
            ]
            # Find the "Port." entry
            for col, size_text in enumerate(size_texts):
                if "Port" in size_text:
                    if col < len(price_texts):
                        # Require digits so stray punctuation can never
                        # reach float().
                        pm = re.search(
                            r"\d+(?:\.\d+)?", price_texts[col].replace(",", ".")
                        )
                        if pm:
                            result["port_price"] = float(pm.group())
                    break
            # Get portion content info
            result["pack_sizes"] = size_texts
            break

    # Sowing depth; a range is stored as its midpoint
    m = re.search(r"(?:sowing|seed)\s*depth[:\s]*(?:approx\.?\s*)?(\d+[\.,]?\d*)\s*(?:-\s*(\d+[\.,]?\d*)\s*)?cm", html, re.I)
    if m:
        d1 = float(m.group(1).replace(",", "."))
        d2 = float(m.group(2).replace(",", ".")) if m.group(2) else d1
        result["planting_depth_cm"] = round((d1 + d2) / 2, 2)

    # Spacing: "row spacing NNxNN cm" or "NN x NN cm"
    # Try outdoor spacing first, then labelled, then any NN x NN cm
    m = re.search(r"(?:outdoors?|field)[^.]*?(\d+)\s*(?:x|×)\s*(\d+)\s*cm", html, re.I)
    if not m:
        m = re.search(r"row\s*spacing\s*(\d+)\s*(?:x|×)\s*(\d+)\s*cm", html, re.I)
    if not m:
        m = re.search(r"(\d+)\s*(?:x|×)\s*(\d+)\s*cm", html, re.I)
    if m:
        result["row_spacing_cm"] = float(m.group(1))
        result["plant_spacing_cm"] = float(m.group(2))

    # Row spacing without plant spacing (e.g. "row spacing 30-45 cm")
    if "row_spacing_cm" not in result:
        m = re.search(r"row\s*spacing\s*(\d+)(?:\s*-\s*(\d+))?\s*cm", html, re.I)
        if m:
            r1 = int(m.group(1))
            r2 = int(m.group(2)) if m.group(2) else r1
            # Integer midpoint, rounded down
            result["row_spacing_cm"] = float((r1 + r2) // 2)

    # Germination temperature (integer midpoint of a range)
    m = re.search(r"germination\s*temp[^:]*:\s*(\d+)\s*(?:-\s*(\d+))?\s*°?\s*C", html, re.I)
    if m:
        t1 = int(m.group(1))
        t2 = int(m.group(2)) if m.group(2) else t1
        result["germination_temp_c"] = float((t1 + t2) // 2)

    # Pack unit from portion info - "20 seeds" or "25 g" etc
    portion_m = re.search(r"[Pp]ortion\s*(?:contents?)?[:\s]*(\d+[\.,]?\d*)\s*(seeds?|Korn|g|kg)", html)
    if not portion_m:
        # Try "Port. (20 seeds)" format
        portion_m = re.search(r"Port[.\w]*\s*\(?\s*(\d+[\.,]?\d*)\s*(seeds?|Korn|g|kg)", html)
    if portion_m:
        result["pack_size"] = float(portion_m.group(1).replace(",", "."))
        unit = portion_m.group(2).lower()
        # Seed counts are stored under the unit name "Korn"
        result["pack_unit"] = "Korn" if unit in ("seed", "seeds", "korn") else unit

    result["url"] = url
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def get_all_species():
    """Download every species from the API, keyed by normalised botanical name.

    Pages of 100 are requested until a page comes back with fewer than 100
    rows. Species whose scientific name cannot be normalised are left out.

    Returns:
        Dict mapping the normalised name to {"id", "slug", "name"}.
    """
    lookup = {}
    page_no = 1
    while True:
        response = api_get("/species", {"per_page": 100, "page": page_no})
        rows = response.get("data", [])
        for entry in rows:
            key = normalise_botanical(entry["name_scientific"])
            if not key:
                continue
            lookup[key] = {
                "id": entry["id"],
                "slug": entry["slug"],
                "name": entry["name_scientific"],
            }
        print(f" page {page_no}: {len(rows)} species (total so far: {len(lookup)})")
        if len(rows) < 100:
            # A short page marks the end of the listing.
            return lookup
        page_no += 1
|
||||||
|
|
||||||
|
|
||||||
|
def get_all_cultivars():
    """Download every cultivar from the API into a lookup table.

    Pages of 100 are requested until a page comes back with fewer than 100
    rows.

    Returns:
        Dict mapping (species_id, lower-cased stripped name) to the cultivar
        record as returned by the API.
    """
    lookup = {}
    page_no = 1
    while True:
        response = api_get("/cultivars", {"per_page": 100, "page": page_no})
        rows = response.get("data", [])
        for record in rows:
            lookup[(record["species_id"], record["name"].lower().strip())] = record
        print(f" page {page_no}: {len(rows)} cultivars (total so far: {len(lookup)})")
        if len(rows) < 100:
            # A short page marks the end of the listing.
            return lookup
        page_no += 1
|
||||||
|
|
||||||
|
|
||||||
|
def get_reinsaat_supplier():
    """Return the supplier record whose slug is "reinsaat".

    Returns:
        The supplier dict as delivered by the API.

    Raises:
        RuntimeError: If no supplier with that slug is returned.
    """
    suppliers = api_get("/suppliers")
    # The other list endpoints used in this file wrap their rows in
    # {"data": [...]}; accept that envelope as well as a bare list.
    if isinstance(suppliers, dict):
        suppliers = suppliers.get("data", [])
    for s in suppliers:
        if isinstance(s, dict) and s.get("slug") == "reinsaat":
            return s
    raise RuntimeError("Reinsaat supplier not found in API")
|
||||||
|
|
||||||
|
|
||||||
|
def get_cultivar_suppliers(cultivar_id):
    """Fetch the supplier links the API holds for one cultivar."""
    endpoint = f"/cultivars/{cultivar_id}/suppliers"
    links = api_get(endpoint)
    return links
|
||||||
|
|
||||||
|
|
||||||
|
def get_product_urls_from_category(cat_slug):
    """Collect the links found on a category page that lie below that category.

    The result mixes product pages and subcategory pages at any depth; this
    function does not fetch or classify them, so the caller has to tell them
    apart.

    Args:
        cat_slug: Category slug as used in the shop URL.

    Returns:
        Sorted, de-duplicated list of absolute URLs; an empty list when the
        category page cannot be fetched.
    """
    cat_url = f"{REINSAAT_BASE}/shop/EN/{cat_slug}/"
    try:
        html = fetch_page(cat_url)
    except Exception as e:
        # Best effort: one unreachable category must not stop the run.
        print(f" WARN: Failed to fetch category {cat_slug}: {e}")
        return []

    time.sleep(DELAY)

    # Internal links under this category; at least one more path segment and
    # a trailing slash are required, so the category page itself never matches.
    link_re = rf'href="(/shop/EN/{re.escape(cat_slug)}/[^"]+/)"'
    return [REINSAAT_BASE + link for link in sorted(set(re.findall(link_re, html)))]
|
||||||
|
|
||||||
|
|
||||||
|
def is_product_page(html):
    """Tell whether the HTML looks like a product page.

    A page qualifies when it contains the botanical-name marker
    ``fce_shop_kurztext`` or a JSON-LD ``"@type": "Product"`` entry.
    """
    if re.search(r'fce_shop_kurztext', html):
        return True
    return re.search(r'"@type":\s*"Product"', html) is not None
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run the Reinsaat v3 scrape and print a report.

    Loads species, cultivars and the Reinsaat supplier from the API, then
    walks every category in CATEGORIES. Each URL found is fetched; product
    pages go to process_product(), while non-product pages are treated as
    subcategories whose deeper links are fetched and processed the same way.
    Fetch failures are counted as errors and skipped.
    """
    print("=" * 60)
    print("Reinsaat v3 Scraper")
    print("=" * 60)

    # Step 1: Load all species
    print("\n[1/4] Loading species from API...")
    species_map = get_all_species()
    print(f" Loaded {len(species_map)} species")

    # Step 2: Load all cultivars
    print("\n[2/4] Loading cultivars from API...")
    cultivar_map = get_all_cultivars()
    print(f" Loaded {len(cultivar_map)} cultivars")

    # Step 3: Get Reinsaat supplier
    print("\n[3/4] Getting Reinsaat supplier...")
    supplier = get_reinsaat_supplier()
    supplier_id = supplier["id"]
    print(f" Reinsaat ID: {supplier_id}")

    # Step 4: Scrape categories
    print(f"\n[4/4] Scraping {len(CATEGORIES)} categories...")

    stats = {
        "products_found": 0,
        "botanical_extracted": 0,
        "species_matched": 0,
        "species_not_matched": 0,
        "cultivar_existed": 0,
        "cultivar_created": 0,
        "link_existed": 0,
        "link_created": 0,
        "errors": 0,
    }
    unmatched_species = {}  # botanical_norm -> count
    new_cultivars = []
    new_links = []

    for cat_i, cat in enumerate(CATEGORIES):
        print(f"\n--- [{cat_i+1}/{len(CATEGORIES)}] {cat} ---")
        urls = get_product_urls_from_category(cat)
        print(f" Found {len(urls)} URLs")

        # URLs already scheduled or handled in this category; prevents a
        # product reachable from several subcategory pages being processed twice.
        seen = set(urls)

        for url in urls:
            time.sleep(DELAY)
            try:
                html = fetch_page(url)
            except Exception as e:
                print(f" ERROR fetching {url}: {e}")
                stats["errors"] += 1
                continue

            if is_product_page(html):
                process_product(
                    html, url, species_map, cultivar_map,
                    supplier_id, stats, unmatched_species,
                    new_cultivars, new_links,
                )
                continue

            # Not a product page: might be a subcategory - get links from it.
            # Depth is measured on the path only, with trailing slashes
            # removed on both sides, so direct children count as deeper.
            current_depth = urllib.parse.urlsplit(url).path.rstrip("/").count("/")
            found = re.findall(r'href="(/shop/EN/[^"]+/)"', html)
            sub_links = [
                REINSAAT_BASE + link
                for link in sorted(set(found))
                if link.startswith(f"/shop/EN/{cat}/")
                and link.rstrip("/").count("/") > current_depth
            ]

            for sub_url in sub_links:
                if sub_url in seen:
                    continue  # already in list or already handled
                seen.add(sub_url)
                time.sleep(DELAY)
                try:
                    sub_html = fetch_page(sub_url)
                except Exception as e:
                    print(f" ERROR fetching {sub_url}: {e}")
                    stats["errors"] += 1
                    continue
                if not is_product_page(sub_html):
                    continue
                process_product(
                    sub_html, sub_url, species_map, cultivar_map,
                    supplier_id, stats, unmatched_species,
                    new_cultivars, new_links,
                )

    # Report
    print("\n" + "=" * 60)
    print("RESULTS")
    print("=" * 60)
    print(f"Products found: {stats['products_found']}")
    print(f"Botanical extracted: {stats['botanical_extracted']}")
    print(f"Species matched: {stats['species_matched']}")
    print(f"Species NOT matched: {stats['species_not_matched']}")
    print(f"Cultivars existed: {stats['cultivar_existed']}")
    print(f"Cultivars created: {stats['cultivar_created']}")
    print(f"Links existed: {stats['link_existed']}")
    print(f"Links created: {stats['link_created']}")
    print(f"Errors: {stats['errors']}")

    if new_cultivars:
        print(f"\n--- New cultivars ({len(new_cultivars)}) ---")
        for cv in new_cultivars:
            print(f" + {cv['name']} ({cv.get('species', '?')})")

    if new_links:
        print(f"\n--- New supplier links ({len(new_links)}) ---")
        for lk in new_links:
            print(f" + {lk['cultivar']} -> {lk.get('article', '?')}")

    if unmatched_species:
        print(f"\n--- Unmatched species ({len(unmatched_species)}) ---")
        # Most frequent first
        for name, count in sorted(unmatched_species.items(), key=lambda x: -x[1]):
            print(f" ? {name} (x{count})")

    print("\nDone.")
|
||||||
|
|
||||||
|
|
||||||
|
def _read_http_error_body(err):
    """Return the response body of an HTTP error as text for log messages."""
    if not hasattr(err, 'read'):
        return str(err)
    # errors="replace": a non-UTF-8 error page must not raise a second
    # exception from inside the error handler.
    return err.read().decode("utf-8", errors="replace")


def _clean_cultivar_name(name):
    """Lower-case *name*, drop punctuation and trim surrounding whitespace."""
    return re.sub(r'[^\w\s]', '', name.lower()).strip()


def _find_colliding_cultivar(cultivar_name, species_id):
    """Look up an already-stored cultivar that matches *cultivar_name*.

    Three strategies are tried in order; the first hit wins:

    1. exact, case-insensitive name match in the full-name search results
       (the species is not checked here);
    2. same species and a punctuation-insensitive equal or substring match
       in those same results;
    3. same species and a punctuation-insensitive equal match in a second
       search for the last word longer than two characters.

    Returns the matching cultivar dict, or None when nothing matches.
    Exceptions raised by ``api_get`` propagate to the caller.
    """
    cn_lower = cultivar_name.lower().strip()
    # Loop-invariant, so computed once instead of once per candidate.
    cn_clean = _clean_cultivar_name(cn_lower)

    search_data = api_get("/cultivars", {"search": cultivar_name, "per_page": 50})
    candidates = search_data.get("data", [])

    # Strategy 1: search by full name
    for cv in candidates:
        if cv["name"].lower().strip() == cn_lower:
            return cv

    # Strategy 2: match by species_id + partial name
    for cv in candidates:
        if cv["species_id"] != species_id:
            continue
        cv_clean = _clean_cultivar_name(cv["name"])
        if cv_clean == cn_clean:
            return cv
        # "" is a substring of every string, so a name that cleans to
        # nothing must not take part in the substring comparison.
        if cv_clean and cn_clean and (cv_clean in cn_clean or cn_clean in cv_clean):
            return cv

    # Strategy 3: search by last significant word
    words = [w for w in cultivar_name.split() if len(w) > 2]
    if words:
        search2 = api_get("/cultivars", {"search": words[-1], "per_page": 50})
        for cv in search2.get("data", []):
            if cv["species_id"] == species_id and _clean_cultivar_name(cv["name"]) == cn_clean:
                return cv

    return None


def process_product(html, url, species_map, cultivar_map, supplier_id,
                    stats, unmatched_species, new_cultivars, new_links):
    """Process a single product page.

    Extracts the product data from the page, matches its botanical name to
    a known species, makes sure a cultivar record exists (creating it via
    the API if necessary) and makes sure the cultivar is linked to the
    supplier.

    Args:
        html: Page content, passed to ``extract_product_data``.
        url: Page URL; stored as the cultivar source URL and product URL.
        species_map: Mapping of normalised botanical name to a species
            dict; the "id" and "name" entries are read.
        cultivar_map: Local cache mapping ``(species_id, lowercase stripped
            name)`` to a cultivar dict. Updated in place when a cultivar is
            created or found.
        supplier_id: Id of the supplier the product is linked to.
        stats: Dict of counters, updated in place.
        unmatched_species: Dict mapping an unmatched botanical name to its
            number of occurrences, updated in place.
        new_cultivars: List that receives one summary dict per cultivar
            created.
        new_links: List that receives one summary dict per supplier link
            created.

    Returns:
        None. All results are reported through the mutable arguments.
    """
    stats["products_found"] += 1
    prod = extract_product_data(html, url)

    if not prod.get("name"):
        return

    bot_norm = prod.get("botanical_norm")
    if not bot_norm:
        # No botanical name found on page
        stats["species_not_matched"] += 1
        unmatched_species["(no botanical name)"] = unmatched_species.get("(no botanical name)", 0) + 1
        return

    stats["botanical_extracted"] += 1

    # Match species
    species = species_map.get(bot_norm)
    if not species:
        stats["species_not_matched"] += 1
        unmatched_species[bot_norm] = unmatched_species.get(bot_norm, 0) + 1
        return

    stats["species_matched"] += 1
    species_id = species["id"]
    cultivar_name = prod["name"]

    # Check if cultivar exists
    cv_key = (species_id, cultivar_name.lower().strip())
    existing_cv = cultivar_map.get(cv_key)

    if existing_cv:
        stats["cultivar_existed"] += 1
        cultivar_id = existing_cv["id"]
    else:
        # Create cultivar
        create_data = {
            "species_id": species_id,
            "name": cultivar_name,
            "is_organic": True,
            "source_urls": [url],
        }
        # Add growing data if we extracted any
        for field in ("planting_depth_cm", "row_spacing_cm",
                      "plant_spacing_cm", "germination_temp_c"):
            if field in prod:
                create_data[field] = prod[field]

        try:
            new_cv = api_post("/cultivars", create_data)
            cultivar_id = new_cv["id"]
            stats["cultivar_created"] += 1
            new_cultivars.append({
                "name": cultivar_name,
                "species": species["name"],
                "id": cultivar_id,
            })
            # Add to local cache
            cultivar_map[cv_key] = new_cv
            print(f" + Created cultivar: {cultivar_name} ({species['name']})")
        except urllib.error.HTTPError as e:
            body = _read_http_error_body(e)
            if e.code != 500 or "Database error" not in body:
                print(f" ERROR creating cultivar '{cultivar_name}': {e.code} {body}")
                stats["errors"] += 1
                return

            # Likely slug collision - search for existing cultivar
            try:
                found = _find_colliding_cultivar(cultivar_name, species_id)
                if not found:
                    print(f" WARN: could not create or find cultivar '{cultivar_name}' (DB error + no search match)")
                    stats["errors"] += 1
                    return
                cultivar_id = found["id"]
                cultivar_map[cv_key] = found
                stats["cultivar_existed"] += 1
            except Exception as e2:
                print(f" ERROR searching for '{cultivar_name}' after collision: {e2}")
                stats["errors"] += 1
                return

    # Check if Reinsaat supplier link exists
    try:
        existing_links = get_cultivar_suppliers(cultivar_id)
    except Exception as e:
        # Best effort: carry on as if there were no links, but say so,
        # because the link created below may then be a duplicate.
        print(f" WARN: could not fetch supplier links for '{cultivar_name}': {e}")
        existing_links = []

    has_reinsaat = any(link["supplier_id"] == supplier_id for link in existing_links)

    if has_reinsaat:
        stats["link_existed"] += 1
        return

    # Create supplier link
    link_data = {
        "supplier_id": supplier_id,
        "product_url": url,
    }
    if "article_number" in prod:
        link_data["article_number"] = prod["article_number"]
    # "port_price" takes precedence over "price_eur" when both are present.
    if "port_price" in prod:
        link_data["price_eur"] = prod["port_price"]
    elif "price_eur" in prod:
        link_data["price_eur"] = prod["price_eur"]
    for field in ("pack_size", "pack_unit"):
        if field in prod:
            link_data[field] = prod[field]

    try:
        api_post(f"/cultivars/{cultivar_id}/suppliers", link_data)
        stats["link_created"] += 1
        new_links.append({
            "cultivar": cultivar_name,
            "article": prod.get("article_number", "?"),
            "url": url,
        })
    except urllib.error.HTTPError as e:
        body = _read_http_error_body(e)
        print(f" ERROR linking '{cultivar_name}': {e.code} {body}")
        stats["errors"] += 1
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: call main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user