Add scraper and enrichment scripts to tools/ directory
This commit is contained in:
@@ -0,0 +1,156 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Enrich HerbAPI species with Wikidata QID, GBIF ID, and EPPO code."""
|
||||
|
||||
import json
|
||||
import time
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
|
||||
HERBAPI_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
|
||||
HERBAPI_TOKEN = "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
|
||||
WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"
|
||||
|
||||
HEADERS_WD = {
|
||||
"User-Agent": "HerbAPI-Enrichment/1.0 (florian.berthold@sub-net.at)",
|
||||
"Accept": "application/json",
|
||||
}
|
||||
|
||||
|
||||
def herbapi_request(path, method="GET", data=None, timeout=60):
    """Send an authenticated JSON request to HerbAPI and return the decoded reply.

    Args:
        path: Path (plus optional query string) appended to ``HERBAPI_BASE``.
        method: HTTP method name, e.g. ``"GET"`` or ``"PUT"``.
        data: JSON-serialisable payload, or ``None`` to send no request body.
        timeout: Socket timeout in seconds passed to ``urlopen`` so a stalled
            server cannot hang the script forever.

    Returns:
        The response body parsed as JSON.

    Raises:
        urllib.error.URLError: On connection failures or HTTP error statuses
            (``HTTPError`` is a subclass).
        ValueError: If the response body is not valid JSON.
    """
    url = f"{HERBAPI_BASE}{path}"
    # Compare against None (not truthiness) so empty payloads such as {} or []
    # are still serialised and sent.
    body = json.dumps(data).encode() if data is not None else None
    headers = {
        "Authorization": f"Bearer {HERBAPI_TOKEN}",
        "Content-Type": "application/json",
    }
    req = urllib.request.Request(url, data=body, method=method, headers=headers)
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())
|
||||
|
||||
|
||||
def query_wikidata_batch(names):
    """Query Wikidata for a batch of scientific names.

    Builds one SPARQL query that matches each name against property P225 and
    optionally fetches P846 (bound to ``?gbifId``) and P3031 (bound to
    ``?eppoCode``).

    Args:
        names: Iterable of scientific-name strings.

    Returns:
        Dict mapping each matched name to
        ``{"qid": ..., "gbif_id": ..., "eppo_code": ...}``; the optional values
        are ``None`` when Wikidata has no binding. If several result rows share
        a name, the last row wins. An empty batch returns ``{}`` without
        contacting Wikidata.

    Raises:
        urllib.error.URLError: On network/HTTP failure.
        ValueError: If the response is not valid JSON.
    """
    names = list(names)
    if not names:
        return {}

    def sparql_literal(text):
        # Escape characters that would otherwise terminate or corrupt a
        # double-quoted SPARQL string literal.
        escaped = (
            text.replace("\\", "\\\\")
            .replace('"', '\\"')
            .replace("\n", "\\n")
            .replace("\r", "\\r")
        )
        return f'"{escaped}"'

    values = " ".join(sparql_literal(n) for n in names)
    sparql = f"""SELECT ?name ?item ?gbifId ?eppoCode WHERE {{
  VALUES ?name {{ {values} }}
  ?item wdt:P225 ?name .
  OPTIONAL {{ ?item wdt:P846 ?gbifId }}
  OPTIONAL {{ ?item wdt:P3031 ?eppoCode }}
}}"""
    encoded = urllib.parse.quote(sparql)
    url = f"{WIKIDATA_SPARQL}?query={encoded}&format=json"
    req = urllib.request.Request(url, headers=HEADERS_WD)
    with urllib.request.urlopen(req, timeout=60) as resp:
        data = json.loads(resp.read())

    results = {}
    for binding in data.get("results", {}).get("bindings", []):
        name = binding["name"]["value"]
        # The item value is an entity URL; the QID is its last path segment.
        qid = binding["item"]["value"].rsplit("/", 1)[-1]
        results[name] = {
            "qid": qid,
            "gbif_id": binding.get("gbifId", {}).get("value"),
            "eppo_code": binding.get("eppoCode", {}).get("value"),
        }
    return results
|
||||
|
||||
|
||||
def main():
    """Fill in missing Wikidata QID, GBIF ID and EPPO code on HerbAPI species.

    Fetches the species listing, selects species lacking at least one of the
    three identifiers, looks their scientific names up on Wikidata in batches,
    then for each match GETs the full species object, fills only the empty
    identifier fields and PUTs it back. Prints a summary at the end.
    """
    # (HerbAPI field, key in the Wikidata result dict, label used in the log)
    id_fields = (
        ("wikidata_qid", "qid", "qid"),
        ("gbif_id", "gbif_id", "gbif"),
        ("eppo_code", "eppo_code", "eppo"),
    )

    # 1. Fetch all species
    # NOTE(review): only one request with per_page=200 is made; whether the API
    # paginates beyond that is not visible here - confirm no species are missed.
    resp = herbapi_request("/species?per_page=200")
    species_list = resp["data"]
    print(f"Fetched {len(species_list)} species from HerbAPI\n")

    # 2. Collect species needing enrichment (.get tolerates absent keys)
    to_enrich = [
        sp for sp in species_list
        if not all(sp.get(field) for field, _, _ in id_fields)
    ]

    if not to_enrich:
        print("All species already enriched.")
        return

    print(f"{len(to_enrich)} species need enrichment\n")

    # 3. Batch query Wikidata
    batch_size = 20
    wikidata_results = {}
    names = [sp["name_scientific"] for sp in to_enrich]

    for start in range(0, len(names), batch_size):
        batch = names[start:start + batch_size]
        print(f"Querying Wikidata batch {start // batch_size + 1}: {len(batch)} species...")
        try:
            results = query_wikidata_batch(batch)
        except Exception as e:  # best effort: a failed batch must not abort the run
            print(f" ERROR: {e}")
        else:
            wikidata_results.update(results)
            print(f" Got {len(results)} matches")
        # Pause between batches, but not after the last one.
        if start + batch_size < len(names):
            time.sleep(2)

    print(f"\nWikidata returned data for {len(wikidata_results)} / {len(names)} species\n")

    # 4. Update HerbAPI - GET full object by slug, merge, PUT by UUID
    updated = 0
    skipped = 0
    not_found = 0
    errors = 0

    for sp in to_enrich:
        name = sp["name_scientific"]
        wd = wikidata_results.get(name)
        if not wd:
            print(f" SKIP (no Wikidata match): {name}")
            not_found += 1
            continue

        # Work out which empty fields Wikidata can actually fill.
        new_values = {}
        fields = []
        for api_field, wd_key, label in id_fields:
            value = wd[wd_key]
            if not sp.get(api_field) and value:
                # API expects gbif_id as a string
                new_values[api_field] = str(value) if api_field == "gbif_id" else value
                fields.append(f"{label}={value}")

        if not new_values:
            print(f" SKIP (nothing new): {name}")
            skipped += 1
            continue

        try:
            # GET full species by slug for the complete object
            full_sp = herbapi_request(f"/species/{sp['slug']}")

            # Remove read-only fields
            species_id = full_sp.pop("id")
            for read_only in ("slug", "created_at", "updated_at"):
                full_sp.pop(read_only, None)

            # Merge new data (only fields that were empty in the listing)
            full_sp.update(new_values)

            # PUT by UUID
            herbapi_request(f"/species/{species_id}", method="PUT", data=full_sp)

            print(f" UPDATED: {name} -> {', '.join(fields)}")
            updated += 1
        except Exception as e:  # keep going; the failure is counted and reported
            print(f" ERROR updating {name}: {e}")
            errors += 1

    print(f"\n{'=' * 60}")
    print("RESULTS:")
    print(f" Updated: {updated}")
    print(f" Skipped (no new data): {skipped}")
    print(f" Not found on Wikidata: {not_found}")
    print(f" Errors: {errors}")
    print(f" Total species: {len(species_list)}")


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,305 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Expand HerbAPI species database with common permaculture/garden species."""
|
||||
|
||||
import json
|
||||
import time
|
||||
import urllib.request
|
||||
import urllib.parse
|
||||
import urllib.error
|
||||
import ssl
|
||||
|
||||
BASE_URL = "http://herbapi01.corp.sub-net.at:8080/api/v1"
|
||||
AUTH = "Bearer km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
|
||||
DELAY = 0.15
|
||||
|
||||
# SSL context for GBIF (https)
|
||||
ssl_ctx = ssl.create_default_context()
|
||||
|
||||
|
||||
def api_get(path, timeout=30):
    """GET ``BASE_URL + path`` with the HerbAPI auth header and return parsed JSON.

    Args:
        path: Path (plus optional query string) appended to ``BASE_URL``.
        timeout: Socket timeout in seconds, so a stalled server cannot block
            the script indefinitely.

    Raises:
        urllib.error.URLError: On connection failures or HTTP error statuses.
        ValueError: If the response body is not valid JSON.
    """
    req = urllib.request.Request(f"{BASE_URL}{path}", headers={"Authorization": AUTH})
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())
|
||||
|
||||
|
||||
def api_post(path, data, timeout=30):
    """POST *data* as JSON to ``BASE_URL + path``.

    Args:
        path: Path appended to ``BASE_URL``.
        data: JSON-serialisable payload.
        timeout: Socket timeout in seconds for the request.

    Returns:
        ``(parsed_json, status)`` on success, or ``(None, http_error_code)``
        when the server answers with an HTTP error; in that case the error
        body is printed.

    Raises:
        urllib.error.URLError: On connection-level failures (not HTTP errors).
    """
    body = json.dumps(data).encode()
    req = urllib.request.Request(
        f"{BASE_URL}{path}",
        data=body,
        headers={"Authorization": AUTH, "Content-Type": "application/json"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return json.loads(resp.read()), resp.status
    except urllib.error.HTTPError as e:
        # errors="replace": an undecodable error body must not mask the
        # HTTP failure with a UnicodeDecodeError.
        err_body = e.read().decode(errors="replace")
        print(f" ERROR {e.code}: {err_body}")
        return None, e.code
|
||||
|
||||
|
||||
def gbif_get_german_name(scientific_name):
    """Query GBIF for the German vernacular name.

    First resolves *scientific_name* through the GBIF ``species/match``
    endpoint, then scans up to 100 vernacular names of the matched usage for
    the first entry whose ``language`` is ``"deu"``.

    Returns:
        The vernacular name string, or ``None`` when there is no match, no
        German entry, or the lookup fails (the failure is printed).
    """
    try:
        match_url = (
            "https://api.gbif.org/v1/species/match?name="
            + urllib.parse.quote(scientific_name)
        )
        req = urllib.request.Request(match_url)
        with urllib.request.urlopen(req, context=ssl_ctx, timeout=10) as resp:
            match = json.loads(resp.read())

        usage_key = match.get("usageKey")
        if not usage_key:
            return None

        names_url = f"https://api.gbif.org/v1/species/{usage_key}/vernacularNames?limit=100"
        req2 = urllib.request.Request(names_url)
        with urllib.request.urlopen(req2, context=ssl_ctx, timeout=10) as resp:
            vn = json.loads(resp.read())

        for entry in vn.get("results", []):
            # Skip German entries that carry no name instead of failing the
            # whole lookup on a missing key.
            if entry.get("language") == "deu" and entry.get("vernacularName"):
                return entry["vernacularName"]
        return None
    except Exception as e:  # best effort: the name is only used for logging
        print(f" GBIF lookup failed for {scientific_name}: {e}")
        return None
|
||||
|
||||
|
||||
# ── Families to ensure exist ─────────────────────────────────────────
|
||||
FAMILIES_NEEDED = {
|
||||
"Fabaceae": {"name_en": "Legumes", "name_de": "Hülsenfrüchtler"},
|
||||
"Solanaceae": {"name_en": "Nightshade family", "name_de": "Nachtschattengewächse"},
|
||||
"Cucurbitaceae": {"name_en": "Gourd family", "name_de": "Kürbisgewächse"},
|
||||
"Asteraceae": {"name_en": "Daisy family", "name_de": "Korbblütler"},
|
||||
"Chenopodiaceae": {"name_en": "Goosefoot family", "name_de": "Gänsefußgewächse"},
|
||||
"Brassicaceae": {"name_en": "Cabbage family", "name_de": "Kreuzblütler"},
|
||||
"Amaryllidaceae": {"name_en": "Amaryllis family", "name_de": "Amaryllisgewächse"},
|
||||
"Apiaceae": {"name_en": "Carrot family", "name_de": "Doldenblütler"},
|
||||
"Poaceae": {"name_en": "Grass family", "name_de": "Süßgräser"},
|
||||
"Lamiaceae": {"name_en": "Mint family", "name_de": "Lippenblütler"},
|
||||
"Caprifoliaceae": {"name_en": "Honeysuckle family", "name_de": "Geißblattgewächse"},
|
||||
"Rosaceae": {"name_en": "Rose family", "name_de": "Rosengewächse"},
|
||||
"Grossulariaceae": {"name_en": "Gooseberry family", "name_de": "Stachelbeergewächse"},
|
||||
"Ericaceae": {"name_en": "Heath family", "name_de": "Heidekrautgewächse"},
|
||||
"Moraceae": {"name_en": "Mulberry family", "name_de": "Maulbeergewächse"},
|
||||
# New families not yet in the DB:
|
||||
"Hypericaceae": {"name_en": "St John's wort family", "name_de": "Johanniskrautgewächse"},
|
||||
"Tropaeolaceae": {"name_en": "Nasturtium family", "name_de": "Kapuzinerkressengewächse"},
|
||||
"Elaeagnaceae": {"name_en": "Oleaster family", "name_de": "Ölweidengewächse"},
|
||||
}
|
||||
|
||||
# ── Species to add ───────────────────────────────────────────────────
|
||||
# Format: (scientific_name, family, name_en, name_de, plant_layer, extra_fields)
|
||||
SPECIES = [
|
||||
# Vegetables
|
||||
("Phaseolus vulgaris", "Fabaceae", "common bean", "Gartenbohne", "herbaceous",
|
||||
{"nitrogen_fixer": True, "food_uses": "Beans (pods, seeds)"}),
|
||||
("Phaseolus coccineus", "Fabaceae", "runner bean", "Feuerbohne", "herbaceous",
|
||||
{"nitrogen_fixer": True, "food_uses": "Beans (pods, seeds), flowers", "attracts_pollinators": True}),
|
||||
("Pisum sativum", "Fabaceae", "pea", "Erbse", "herbaceous",
|
||||
{"nitrogen_fixer": True, "food_uses": "Peas, shoots"}),
|
||||
("Capsicum annuum", "Solanaceae", "pepper", "Paprika", "herbaceous",
|
||||
{"food_uses": "Fruit"}),
|
||||
("Cucumis sativus", "Cucurbitaceae", "cucumber", "Gurke", "ground_cover",
|
||||
{"food_uses": "Fruit"}),
|
||||
("Cucurbita maxima", "Cucurbitaceae", "winter squash", "Riesenkürbis", "ground_cover",
|
||||
{"food_uses": "Fruit, seeds, flowers"}),
|
||||
("Cucurbita moschata", "Cucurbitaceae", "butternut squash", "Moschuskürbis", "ground_cover",
|
||||
{"food_uses": "Fruit, seeds"}),
|
||||
("Lactuca sativa", "Asteraceae", "lettuce", "Salat", "herbaceous",
|
||||
{"food_uses": "Leaves"}),
|
||||
("Spinacia oleracea", "Chenopodiaceae", "spinach", "Spinat", "herbaceous",
|
||||
{"food_uses": "Leaves"}),
|
||||
("Brassica oleracea", "Brassicaceae", "cabbage / kale", "Kohl", "herbaceous",
|
||||
{"food_uses": "Leaves, flower buds, stems"}),
|
||||
("Brassica rapa", "Brassicaceae", "turnip", "Rübe", "herbaceous",
|
||||
{"food_uses": "Root, leaves"}),
|
||||
("Raphanus sativus", "Brassicaceae", "radish", "Rettich", "herbaceous",
|
||||
{"food_uses": "Root, leaves, seed pods"}),
|
||||
("Allium cepa", "Amaryllidaceae", "onion", "Zwiebel", "herbaceous",
|
||||
{"food_uses": "Bulb, leaves"}),
|
||||
("Allium sativum", "Amaryllidaceae", "garlic", "Knoblauch", "herbaceous",
|
||||
{"food_uses": "Bulb, scapes", "medicinal_uses": "Antimicrobial, cardiovascular"}),
|
||||
("Allium schoenoprasum", "Amaryllidaceae", "chives", "Schnittlauch", "herbaceous",
|
||||
{"food_uses": "Leaves, flowers", "attracts_pollinators": True}),
|
||||
("Petroselinum crispum", "Apiaceae", "parsley", "Petersilie", "herbaceous",
|
||||
{"food_uses": "Leaves, root"}),
|
||||
("Apium graveolens", "Apiaceae", "celery", "Sellerie", "herbaceous",
|
||||
{"food_uses": "Stalks, root, leaves"}),
|
||||
("Foeniculum vulgare", "Apiaceae", "fennel", "Fenchel", "herbaceous",
|
||||
{"food_uses": "Bulb, fronds, seeds", "attracts_beneficial_insects": True}),
|
||||
("Pastinaca sativa", "Apiaceae", "parsnip", "Pastinake", "herbaceous",
|
||||
{"food_uses": "Root"}),
|
||||
("Zea mays", "Poaceae", "corn", "Mais", "herbaceous",
|
||||
{"food_uses": "Kernels, cobs"}),
|
||||
("Solanum melongena", "Solanaceae", "eggplant", "Melanzani", "herbaceous",
|
||||
{"food_uses": "Fruit"}),
|
||||
|
||||
# Herbs
|
||||
("Ocimum basilicum", "Lamiaceae", "basil", "Basilikum", "herbaceous",
|
||||
{"food_uses": "Leaves", "attracts_pollinators": True}),
|
||||
("Origanum vulgare", "Lamiaceae", "oregano", "Oregano", "herbaceous",
|
||||
{"food_uses": "Leaves", "attracts_pollinators": True, "attracts_beneficial_insects": True}),
|
||||
("Mentha x piperita", "Lamiaceae", "peppermint", "Pfefferminze", "herbaceous",
|
||||
{"food_uses": "Leaves (tea, culinary)", "medicinal_uses": "Digestive, headache relief", "invasiveness": "spreading"}),
|
||||
("Rosmarinus officinalis", "Lamiaceae", "rosemary", "Rosmarin", "herbaceous",
|
||||
{"food_uses": "Leaves", "attracts_pollinators": True}),
|
||||
("Anethum graveolens", "Apiaceae", "dill", "Dill", "herbaceous",
|
||||
{"food_uses": "Leaves, seeds", "attracts_beneficial_insects": True}),
|
||||
("Coriandrum sativum", "Apiaceae", "coriander", "Koriander", "herbaceous",
|
||||
{"food_uses": "Leaves, seeds", "attracts_beneficial_insects": True}),
|
||||
("Artemisia absinthium", "Asteraceae", "wormwood", "Wermut", "herbaceous",
|
||||
{"medicinal_uses": "Digestive, anti-parasitic", "other_uses": "Companion plant pest deterrent", "allelopathic": True}),
|
||||
("Achillea millefolium", "Asteraceae", "yarrow", "Schafgarbe", "herbaceous",
|
||||
{"food_uses": "Young leaves (salad)", "medicinal_uses": "Wound healing, anti-inflammatory",
|
||||
"dynamic_accumulator": True, "dynamic_accumulator_nutrients": "K, P, Cu",
|
||||
"attracts_beneficial_insects": True, "attracts_pollinators": True}),
|
||||
("Hypericum perforatum", "Hypericaceae", "St John's wort", "Johanniskraut", "herbaceous",
|
||||
{"medicinal_uses": "Antidepressant, wound healing", "attracts_pollinators": True}),
|
||||
("Echinacea purpurea", "Asteraceae", "echinacea", "Sonnenhut", "herbaceous",
|
||||
{"medicinal_uses": "Immune stimulant", "attracts_pollinators": True, "wildlife_value": "Seeds for birds"}),
|
||||
("Valeriana officinalis", "Caprifoliaceae", "valerian", "Baldrian", "herbaceous",
|
||||
{"medicinal_uses": "Sedative, sleep aid", "attracts_pollinators": True,
|
||||
"other_uses": "Earthworm attractant (biodynamic)"}),
|
||||
|
||||
# Flowers & cover crops
|
||||
("Tagetes patula", "Asteraceae", "French marigold", "Studentenblume", "herbaceous",
|
||||
{"other_uses": "Nematode suppression, companion plant", "attracts_pollinators": True}),
|
||||
("Helianthus annuus", "Asteraceae", "sunflower", "Sonnenblume", "herbaceous",
|
||||
{"food_uses": "Seeds, oil", "attracts_pollinators": True, "wildlife_value": "Seeds for birds"}),
|
||||
("Tropaeolum majus", "Tropaeolaceae", "nasturtium", "Kapuzinerkresse", "ground_cover",
|
||||
{"food_uses": "Leaves, flowers, seeds (capers)", "other_uses": "Trap crop for aphids"}),
|
||||
("Centaurea cyanus", "Asteraceae", "cornflower", "Kornblume", "herbaceous",
|
||||
{"food_uses": "Flowers (edible garnish)", "attracts_pollinators": True, "attracts_beneficial_insects": True}),
|
||||
("Sinapis alba", "Brassicaceae", "white mustard", "Weißer Senf", "herbaceous",
|
||||
{"food_uses": "Seeds, young leaves", "other_uses": "Green manure, biofumigant"}),
|
||||
("Trifolium repens", "Fabaceae", "white clover", "Weißklee", "ground_cover",
|
||||
{"nitrogen_fixer": True, "food_uses": "Flowers (tea), young leaves",
|
||||
"ground_cover_quality": "excellent", "attracts_pollinators": True}),
|
||||
("Medicago sativa", "Fabaceae", "alfalfa", "Luzerne", "herbaceous",
|
||||
{"nitrogen_fixer": True, "food_uses": "Sprouts",
|
||||
"dynamic_accumulator": True, "dynamic_accumulator_nutrients": "N, K, Ca, Mg, Fe",
|
||||
"other_uses": "Green manure, deep-rooting soil improver"}),
|
||||
|
||||
# Fruit / Trees
|
||||
("Prunus avium", "Rosaceae", "sweet cherry", "Süßkirsche", "canopy",
|
||||
{"food_uses": "Fruit", "attracts_pollinators": True, "wildlife_value": "Fruit for birds"}),
|
||||
("Prunus cerasus", "Rosaceae", "sour cherry", "Sauerkirsche", "understory",
|
||||
{"food_uses": "Fruit (cooking, preserves)", "attracts_pollinators": True}),
|
||||
("Pyrus communis", "Rosaceae", "pear", "Birne", "canopy",
|
||||
{"food_uses": "Fruit", "attracts_pollinators": True}),
|
||||
("Ribes uva-crispa", "Grossulariaceae", "gooseberry", "Stachelbeere", "shrub",
|
||||
{"food_uses": "Berries"}),
|
||||
("Rubus fruticosus", "Rosaceae", "blackberry", "Brombeere", "shrub",
|
||||
{"food_uses": "Berries, leaves (tea)", "attracts_pollinators": True,
|
||||
"wildlife_value": "Berries for birds, nesting habitat", "invasiveness": "spreading"}),
|
||||
("Vaccinium myrtillus", "Ericaceae", "bilberry", "Heidelbeere", "shrub",
|
||||
{"food_uses": "Berries", "medicinal_uses": "Antioxidant, eye health"}),
|
||||
("Hippophae rhamnoides", "Elaeagnaceae", "sea buckthorn", "Sanddorn", "shrub",
|
||||
{"nitrogen_fixer": True, "food_uses": "Berries (juice, oil)",
|
||||
"medicinal_uses": "High vitamin C, skin care",
|
||||
"other_uses": "Erosion control, windbreak"}),
|
||||
("Morus alba", "Moraceae", "white mulberry", "Weiße Maulbeere", "canopy",
|
||||
{"food_uses": "Fruit, young leaves", "wildlife_value": "Fruit for birds"}),
|
||||
]
|
||||
|
||||
|
||||
def main():
    """Create the missing families and species from the module tables in HerbAPI.

    Loads existing families, creates those from ``FAMILIES_NEEDED`` that are
    absent, loads existing species, then POSTs every entry of ``SPECIES`` that
    is not present yet. Prints progress and a final summary.
    """
    # 1. Load existing families
    # NOTE(review): a single page (per_page=100) is requested; confirm the API
    # cannot hold more families than that.
    print("=== Loading existing families ===")
    fam_resp = api_get("/families?per_page=100")
    family_map = {f["name_scientific"]: f["id"] for f in fam_resp["data"]}  # name -> id
    print(f" Found {len(family_map)} existing families")

    # 2. Create missing families
    print("\n=== Creating missing families ===")
    families_created = 0
    for fam_name, fam_info in FAMILIES_NEEDED.items():
        if fam_name in family_map:
            print(f" SKIP (exists): {fam_name}")
            continue
        payload = {
            "name_scientific": fam_name,
            "name_en": fam_info["name_en"],
            "name_de": fam_info["name_de"],
        }
        print(f" CREATE: {fam_name} ...", end=" ")
        result, status = api_post("/families", payload)
        if result and "id" in result:
            family_map[fam_name] = result["id"]
            print(f"OK ({result['id']})")
            families_created += 1
        else:
            print(f"FAILED (status={status})")
        time.sleep(DELAY)

    print(f"\n Families created: {families_created}")

    # 3. Load existing species
    # NOTE(review): single page (per_page=200) - species beyond it would be
    # treated as missing and re-POSTed; confirm against the API.
    print("\n=== Loading existing species ===")
    sp_resp = api_get("/species?per_page=200")
    existing_species = {s["name_scientific"] for s in sp_resp["data"]}
    print(f" Found {len(existing_species)} existing species")

    # 4. Add new species
    print("\n=== Adding new species ===")
    created = 0
    skipped = 0
    failed = 0

    for sci_name, family, name_en, name_de, plant_layer, extras in SPECIES:
        if sci_name in existing_species:
            print(f" SKIP (exists): {sci_name}")
            skipped += 1
            continue

        # Look up family ID
        fam_id = family_map.get(family)
        if not fam_id:
            print(f" SKIP (no family '{family}'): {sci_name}")
            failed += 1
            continue

        # GBIF's German name is only logged for comparison; the curated
        # name_de from SPECIES is what gets stored.
        gbif_de = gbif_get_german_name(sci_name)
        if gbif_de:
            print(f" GBIF name for {sci_name}: {gbif_de}")

        payload = {
            "name_scientific": sci_name,
            "family_id": fam_id,
            "name_en": name_en,
            "name_de": name_de,
            "plant_layer": plant_layer,
            **extras,  # extra fields
        }

        print(f" CREATE: {sci_name} ({name_de}) ...", end=" ")
        result, status = api_post("/species", payload)
        if result and "id" in result:
            print(f"OK ({result['id']})")
            created += 1
        else:
            print(f"FAILED (status={status})")
            failed += 1
        time.sleep(DELAY)

    print(f"\n{'='*50}")
    print("SUMMARY")
    print(f" Families created: {families_created}")
    print(f" Species created: {created}")
    print(f" Species skipped: {skipped}")
    print(f" Species failed: {failed}")
    print(f" Total species now: {len(existing_species) + created}")


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,362 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Import CC-licensed plant images from Wikimedia Commons via Wikidata into HerbAPI."""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
|
||||
# Force unbuffered output
|
||||
sys.stdout.reconfigure(line_buffering=True)
|
||||
sys.stderr.reconfigure(line_buffering=True)
|
||||
|
||||
# --- Configuration ---
|
||||
S3_ENDPOINT = "http://garage.sub-net.at:3900"
|
||||
S3_BUCKET = "herbapi"
|
||||
S3_ACCESS_KEY = "GK1a89859373a6ac56bf11958f"
|
||||
S3_SECRET_KEY = "bea45a333b5c7b1efdd7466bdbcac54d8642fa19f0c617ca2fd64bd07951b899"
|
||||
S3_REGION = "garage"
|
||||
|
||||
DB_HOST = "10.31.3.90"
|
||||
DB_USER = "herbapi"
|
||||
DB_PASS = "_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj"
|
||||
DB_NAME = "herbapi"
|
||||
|
||||
USER_AGENT = "HerbAPI/1.0 (https://herbapi.naturalised.at; florian.berthold@sub-net.at)"
|
||||
THUMB_WIDTH = 800
|
||||
REQUEST_DELAY = 0.3
|
||||
|
||||
ALLOWED_LICENSES = {
|
||||
"cc0", "cc-zero", "cc0 1.0", "cc-zero 1.0",
|
||||
"public domain", "pd", "pd-self", "pd-old", "pd-old-auto", "pd-old-100",
|
||||
"pd-us", "pd-usgov", "pd-author",
|
||||
"cc by 1.0", "cc by 2.0", "cc by 2.5", "cc by 3.0", "cc by 4.0",
|
||||
"cc-by-1.0", "cc-by-2.0", "cc-by-2.5", "cc-by-3.0", "cc-by-4.0",
|
||||
"cc by-sa 1.0", "cc by-sa 2.0", "cc by-sa 2.5", "cc by-sa 3.0", "cc by-sa 4.0",
|
||||
"cc-by-sa-1.0", "cc-by-sa-2.0", "cc-by-sa-2.5", "cc-by-sa-3.0", "cc-by-sa-4.0",
|
||||
}
|
||||
|
||||
|
||||
def slugify(name: str) -> str:
    """Convert scientific name to a URL-safe slug.

    The name is lower-cased, every run of characters outside ``a-z0-9`` is
    collapsed to a single hyphen, and leading/trailing hyphens are removed.
    """
    return re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')
|
||||
|
||||
|
||||
def psql(query: str, check: bool = False) -> str:
    """Run a psql query and return output.

    The query is executed with ``-t -A`` (tuples only, unaligned), and the
    password is supplied through the ``PGPASSWORD`` environment variable.

    Args:
        query: SQL text passed to ``psql -c``.
        check: When true, raise instead of returning if psql exits non-zero.
            Defaults to false, which keeps the historical behaviour of only
            printing the error to stderr.

    Returns:
        psql's stdout with surrounding whitespace stripped. With
        ``check=False`` a failed query is reported on stderr and whatever
        stdout was produced (typically empty) is returned.

    Raises:
        RuntimeError: If ``check`` is true and psql exits with a non-zero code.
    """
    env = os.environ.copy()
    env["PGPASSWORD"] = DB_PASS
    result = subprocess.run(
        ["psql", "-h", DB_HOST, "-U", DB_USER, DB_NAME, "-t", "-A", "-c", query],
        capture_output=True, text=True, env=env
    )
    if result.returncode != 0:
        message = result.stderr.strip()
        if check:
            raise RuntimeError(f"psql failed: {message}")
        print(f" psql error: {message}", file=sys.stderr)
    return result.stdout.strip()
|
||||
|
||||
|
||||
def fetch_json(url: str) -> dict | None:
    """Fetch JSON from a URL with proper User-Agent.

    Returns:
        The decoded JSON document, or ``None`` if the request or the decoding
        fails for any reason (the error is printed).
    """
    try:
        # Build the Request inside the try block: a malformed URL raises
        # ValueError at construction time and should also yield None.
        req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
        with urllib.request.urlopen(req, timeout=30) as resp:
            return json.loads(resp.read())
    except Exception as e:  # deliberate best effort: callers only test for None
        print(f" HTTP error fetching {url}: {e}")
        return None
|
||||
|
||||
|
||||
def get_wikidata_image(qid: str) -> str | None:
|
||||
"""Query Wikidata SPARQL for P18 image filename."""
|
||||
sparql = f"SELECT ?image WHERE {{ wd:{qid} wdt:P18 ?image }} LIMIT 1"
|
||||
url = "https://query.wikidata.org/sparql?" + urllib.parse.urlencode({
|
||||
"query": sparql, "format": "json"
|
||||
})
|
||||
data = fetch_json(url)
|
||||
if not data:
|
||||
return None
|
||||
bindings = data.get("results", {}).get("bindings", [])
|
||||
if not bindings:
|
||||
return None
|
||||
image_url = bindings[0]["image"]["value"]
|
||||
# URL like http://commons.wikimedia.org/wiki/Special:FilePath/Filename.jpg
|
||||
filename = urllib.parse.unquote(image_url.rsplit("/", 1)[-1])
|
||||
return filename
|
||||
|
||||
|
||||
def get_commons_info(filename: str) -> dict | None:
|
||||
"""Get image info from Wikimedia Commons API."""
|
||||
url = "https://commons.wikimedia.org/w/api.php?" + urllib.parse.urlencode({
|
||||
"action": "query",
|
||||
"titles": f"File:{filename}",
|
||||
"prop": "imageinfo",
|
||||
"iiprop": "url|extmetadata",
|
||||
"iiurlwidth": str(THUMB_WIDTH),
|
||||
"format": "json",
|
||||
})
|
||||
data = fetch_json(url)
|
||||
if not data:
|
||||
return None
|
||||
pages = data.get("query", {}).get("pages", {})
|
||||
for page_id, page in pages.items():
|
||||
if page_id == "-1":
|
||||
return None
|
||||
imageinfo = page.get("imageinfo", [])
|
||||
if not imageinfo:
|
||||
return None
|
||||
info = imageinfo[0]
|
||||
meta = info.get("extmetadata", {})
|
||||
|
||||
thumb_url = info.get("thumburl") or info.get("url")
|
||||
desc_url = info.get("descriptionurl", "")
|
||||
|
||||
license_short = meta.get("LicenseShortName", {}).get("value", "")
|
||||
artist_html = meta.get("Artist", {}).get("value", "")
|
||||
# Strip HTML tags from artist
|
||||
artist = re.sub(r'<[^>]+>', '', artist_html).strip()
|
||||
# Clean up whitespace
|
||||
artist = re.sub(r'\s+', ' ', artist)
|
||||
|
||||
return {
|
||||
"thumb_url": thumb_url,
|
||||
"description_url": desc_url,
|
||||
"license": license_short,
|
||||
"artist": artist,
|
||||
"filename": filename,
|
||||
}
|
||||
return None
|
||||
|
||||
|
||||
def is_license_allowed(license_str: str) -> bool:
    """Check if a license is in our allowed list.

    A license is accepted when its lower-cased, stripped form is listed in
    ``ALLOWED_LICENSES`` or when it looks like public domain, CC0, CC BY or
    CC BY-SA. Anything carrying a NonCommercial ("nc") or NoDerivatives
    ("nd") element is rejected.
    """
    normalized = license_str.lower().strip()
    # Direct match
    if normalized in ALLOWED_LICENSES:
        return True
    # Check for NC or ND as standalone tokens. A plain substring test would
    # also reject unrelated words that merely contain the letters (e.g. the
    # "nd" in "PD India").
    if re.search(r'(?<![a-z])n[cd](?![a-z])', normalized):
        return False
    # Check patterns
    if normalized.startswith(("public domain", "pd")):
        return True
    if re.match(r'^cc[- ]?by[- ]?sa[- ]?\d', normalized):
        return True
    if re.match(r'^cc[- ]?by[- ]?\d', normalized):
        return True
    if re.match(r'^cc[- ]?0', normalized) or normalized == "cc zero":
        return True
    return False
|
||||
|
||||
|
||||
def normalize_license(license_str: str) -> str:
    """Normalize license string for storage.

    Returns ``"Public domain"``, ``"CC0 1.0"``, ``"CC BY-SA <version>"`` or
    ``"CC BY <version>"`` for recognised inputs (anything after the version
    number is dropped). Unrecognised strings are returned unchanged.
    """
    low = license_str.lower().strip()
    if "public domain" in low or low.startswith("pd"):
        return "Public domain"
    if re.match(r'^cc[- ]?0', low) or "cc-zero" in low or "cc zero" in low:
        return "CC0 1.0"

    # Try BY-SA before BY so the share-alike suffix is not lost.
    for pattern, label in (
        (r'^cc[- ]?by[- ]?sa[- ]?(\d+\.?\d*)', "CC BY-SA"),
        (r'^cc[- ]?by[- ]?(\d+\.?\d*)', "CC BY"),
    ):
        m = re.match(pattern, low)
        if m:
            version = m.group(1)
            # The pattern also accepts "4" and "4."; store a full "4.0".
            if "." not in version:
                version += ".0"
            elif version.endswith("."):
                version += "0"
            return f"{label} {version}"
    return license_str
|
||||
|
||||
|
||||
def s3_upload(s3_key: str, data: bytes, content_type: str = "image/jpeg"):
    """Upload to S3 Garage using AWS CLI.

    Writes *data* to a private temporary file and runs ``aws s3 cp`` against
    ``S3_ENDPOINT`` to store it as ``s3://S3_BUCKET/<s3_key>``.

    Args:
        s3_key: Object key inside the bucket.
        data: Raw bytes to upload.
        content_type: Value passed to ``--content-type``.

    Raises:
        RuntimeError: If the AWS CLI exits with a non-zero status.
        OSError: If the temporary file cannot be written or the ``aws``
            executable cannot be started.
    """
    import tempfile  # local import: not among this file's top-level imports

    env = os.environ.copy()
    env["AWS_ACCESS_KEY_ID"] = S3_ACCESS_KEY
    env["AWS_SECRET_ACCESS_KEY"] = S3_SECRET_KEY
    env["AWS_DEFAULT_REGION"] = S3_REGION

    # mkstemp creates a uniquely named file readable only by this user, so
    # concurrent runs cannot clobber each other and the path is not guessable.
    fd, tmp_path = tempfile.mkstemp(prefix="herbapi_upload_")
    try:
        with os.fdopen(fd, "wb") as f:
            f.write(data)
        result = subprocess.run(
            [
                "aws", "s3", "cp", tmp_path,
                f"s3://{S3_BUCKET}/{s3_key}",
                "--endpoint-url", S3_ENDPOINT,
                "--content-type", content_type,
            ],
            capture_output=True, text=True, env=env
        )
    finally:
        # Remove the temp file even when the CLI could not be launched.
        os.unlink(tmp_path)

    if result.returncode != 0:
        raise RuntimeError(f"S3 upload failed: {result.stderr.strip()}")
|
||||
|
||||
|
||||
def download_image(url: str) -> bytes | None:
    """Download image data from URL.

    Returns:
        The response body as bytes, or ``None`` if the request fails for any
        reason (the error is printed).
    """
    try:
        # Request construction is inside the try block so a malformed URL is
        # reported like any other download failure instead of raising.
        req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
        with urllib.request.urlopen(req, timeout=60) as resp:
            return resp.read()
    except Exception as e:  # deliberate best effort: callers only test for None
        print(f" Download error: {e}")
        return None
|
||||
|
||||
|
||||
def _image_extension(url):
    """Return 'png', 'svg' or 'gif' from the final suffix of the URL path, else 'jpg'."""
    suffix = os.path.splitext(urllib.parse.urlparse(url).path)[1].lower().lstrip(".")
    return suffix if suffix in ("png", "svg", "gif") else "jpg"


def main():
    """Import one Wikimedia Commons image per species into S3 and the images table.

    For every species row with a Wikidata QID and no existing image: look up
    the P18 file on Wikidata, fetch its Commons metadata, skip disallowed
    licenses, download the thumbnail, upload it to S3 and insert an ``images``
    row. Prints per-species progress and a final summary.
    """
    # 1. Get species
    rows = psql(
        "SELECT id, name_scientific, wikidata_qid FROM species "
        "WHERE wikidata_qid IS NOT NULL AND wikidata_qid <> '' "
        "ORDER BY name_scientific"
    )
    if not rows:
        print("No species with wikidata_qid found.")
        return

    # psql -A separates columns with "|"; lines of any other shape are ignored.
    species_list = []
    for line in rows.split("\n"):
        parts = line.split("|")
        if len(parts) == 3:
            sp_id, name, qid = parts
            species_list.append({"id": sp_id, "name": name, "qid": qid})

    print(f"Found {len(species_list)} species with Wikidata QIDs.")

    # 2. Get existing images
    existing_rows = psql("SELECT entity_id FROM images WHERE entity_type = 'species'")
    existing = {line.strip() for line in existing_rows.split("\n") if line.strip()}

    print(f"Found {len(existing)} species that already have images.")

    content_types = {
        "jpg": "image/jpeg",
        "png": "image/png",
        "svg": "image/svg+xml",
        "gif": "image/gif",
    }

    imported = 0
    skipped_existing = 0
    skipped_no_image = 0
    skipped_license = 0
    skipped_download = 0
    errors = 0

    for i, sp in enumerate(species_list):
        name = sp["name"]
        qid = sp["qid"]
        sp_id = sp["id"]
        slug = slugify(name)

        print(f"\n[{i+1}/{len(species_list)}] {name} ({qid})")

        if sp_id in existing:
            print(" Already has image, skipping.")
            skipped_existing += 1
            continue

        # Query Wikidata for image
        time.sleep(REQUEST_DELAY)
        filename = get_wikidata_image(qid)
        if not filename:
            print(" No image on Wikidata.")
            skipped_no_image += 1
            continue

        # Get Commons info
        time.sleep(REQUEST_DELAY)
        info = get_commons_info(filename)
        # Also guard against a missing URL: it is sliced and downloaded below.
        if not info or not info["thumb_url"]:
            print(f" Could not get Commons info for {filename}")
            skipped_no_image += 1
            continue

        # Check license
        raw_license = info["license"]
        if not is_license_allowed(raw_license):
            print(f" License not allowed: {raw_license}")
            skipped_license += 1
            continue

        norm_license = normalize_license(raw_license)
        artist = info["artist"]
        thumb_url = info["thumb_url"]
        desc_url = info["description_url"]

        print(f" License: {raw_license} -> {norm_license}")
        print(f" Artist: {artist[:80]}")
        print(f" Thumbnail: {thumb_url[:100]}...")

        # Download image
        time.sleep(REQUEST_DELAY)
        image_data = download_image(thumb_url)
        if not image_data:
            print(" Failed to download image.")
            skipped_download += 1
            continue

        print(f" Downloaded {len(image_data)} bytes")

        # Determine file extension from the end of the URL path; a substring
        # search would misfire on names that merely contain ".png"/".gif".
        ext = _image_extension(thumb_url)
        s3_key = f"species/{slug}.{ext}"
        content_type = content_types.get(ext, "image/jpeg")

        # Upload to S3
        try:
            s3_upload(s3_key, image_data, content_type)
            print(f" Uploaded to s3://{S3_BUCKET}/{s3_key}")
        except (RuntimeError, OSError) as e:
            # OSError covers a missing `aws` executable or temp-file failure.
            print(f" S3 upload failed: {e}")
            errors += 1
            continue

        # Insert into database
        caption = f"Photo: {artist}" if artist else "Wikimedia Commons"
        # Escape single quotes for SQL
        # NOTE(review): values are spliced into the statement text; this relies
        # on quote doubling being sufficient for the server's string settings.
        caption_esc = caption.replace("'", "''")
        desc_url_esc = desc_url.replace("'", "''")
        norm_license_esc = norm_license.replace("'", "''")
        s3_key_esc = s3_key.replace("'", "''")

        # RETURNING id makes psql print the new row id, so a failed INSERT
        # (empty stdout) can be told apart from a successful one.
        insert_sql = (
            f"INSERT INTO images (id, entity_type, entity_id, s3_key, caption, source_url, license, is_primary) "
            f"VALUES (gen_random_uuid(), 'species', '{sp_id}', '{s3_key_esc}', "
            f"'{caption_esc}', '{desc_url_esc}', '{norm_license_esc}', true) "
            f"RETURNING id"
        )

        result = psql(insert_sql)
        if not result:
            print(" Database insert failed; uploaded object has no images row.")
            errors += 1
            continue
        print(" Inserted into images table.")
        imported += 1

    print(f"\n{'='*60}")
    print("DONE!")
    print(f" Imported: {imported}")
    print(f" Skipped (existing):{skipped_existing}")
    print(f" Skipped (no image):{skipped_no_image}")
    print(f" Skipped (license): {skipped_license}")
    print(f" Skipped (download):{skipped_download}")
    print(f" Errors: {errors}")
    print(f" Total processed: {len(species_list)}")


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,290 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Import CC-licensed plant images from Wikimedia Commons into HerbAPI."""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
|
||||
# Config
# NOTE(review): the literal defaults below are credentials committed to the
# repository; rotate them and supply the real values through the environment.
# Each connection setting can be overridden with a HERBAPI_* environment
# variable; the literals are kept only so existing invocations keep working.
DB_HOST = os.environ.get("HERBAPI_DB_HOST", "10.31.3.90")
DB_USER = os.environ.get("HERBAPI_DB_USER", "herbapi")
DB_PASS = os.environ.get("HERBAPI_DB_PASS", "_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj")
DB_NAME = os.environ.get("HERBAPI_DB_NAME", "herbapi")
S3_BUCKET = os.environ.get("HERBAPI_S3_BUCKET", "herbapi")
S3_ENDPOINT = os.environ.get("HERBAPI_S3_ENDPOINT", "http://10.31.3.170:3900")
USER_AGENT = "HerbAPI/1.0 (https://herbapi.naturalised.at; florian.berthold@sub-net.at)"
REQUEST_DELAY = 0.3

# AWS env for subprocess calls.
# Dedicated HERBAPI_S3_* names are used for the overrides so that ambient
# AWS_* variables belonging to an unrelated account are still replaced, exactly
# as before.
AWS_ENV = {
    **os.environ,
    "AWS_ACCESS_KEY_ID": os.environ.get(
        "HERBAPI_S3_ACCESS_KEY_ID", "GK1a89859373a6ac56bf11958f"
    ),
    "AWS_SECRET_ACCESS_KEY": os.environ.get(
        "HERBAPI_S3_SECRET_ACCESS_KEY",
        "bea45a333b5c7b1efdd7466bdbcac54d8642fa19f0c617ca2fd64bd07951b899",
    ),
    "AWS_DEFAULT_REGION": "garage",
}

# Stats: per-run counters updated by process_species() and reported by main().
stats = {"total": 0, "imported": 0, "no_p18": 0, "bad_license": 0, "download_fail": 0, "upload_fail": 0, "errors": 0}
|
||||
|
||||
|
||||
def fetch_url(url):
    """Return the raw response body of *url* as bytes.

    The request is sent with the module's ``USER_AGENT`` header and a
    30-second timeout. Errors raised by ``urllib`` propagate to the caller.
    """
    request_headers = {"User-Agent": USER_AGENT}
    request = urllib.request.Request(url, headers=request_headers)
    with urllib.request.urlopen(request, timeout=30) as response:
        payload = response.read()
        return payload
|
||||
|
||||
|
||||
def fetch_json(url):
    """Download *url* with ``fetch_url`` and return the decoded JSON document."""
    raw_body = fetch_url(url)
    return json.loads(raw_body)
|
||||
|
||||
|
||||
def psql(sql):
    """Run *sql* through the ``psql`` command-line client and return its output.

    The client is invoked in tuples-only, unaligned mode (``-t -A``), so the
    result is the stripped stdout text with one row per line and columns
    separated by ``|``.

    Raises:
        RuntimeError: if ``psql`` exits with a non-zero status. Previously the
            exit status was ignored, so a failed query was indistinguishable
            from one that returned no rows, and the callers' ``except``
            clauses could never trigger.
    """
    completed = subprocess.run(
        ["psql", "-h", DB_HOST, "-U", DB_USER, DB_NAME, "-t", "-A", "-c", sql],
        capture_output=True, text=True,
        # The password is passed via the environment so it never appears on
        # the command line.
        env={**os.environ, "PGPASSWORD": DB_PASS},
    )
    if completed.returncode != 0:
        raise RuntimeError(
            f"psql exited with status {completed.returncode}: {completed.stderr.strip()}"
        )
    return completed.stdout.strip()
|
||||
|
||||
|
||||
def is_license_allowed(license_str):
    """Return True when *license_str* names a licence we may redistribute.

    Accepted: CC0, public-domain markers (including ``PD...`` prefixes),
    and ``CC BY`` / ``CC BY-SA`` in any version or jurisdiction.
    Rejected: anything carrying an NC or ND clause, and every licence that is
    not recognised (GFDL, FAL, "Copyrighted free use", empty values, ...).
    Matching is case-insensitive and ignores surrounding whitespace.
    """
    if not license_str:
        return False

    normalized = license_str.lower().strip()
    words = normalized.split()

    # Non-commercial / no-derivatives clauses disqualify a licence outright,
    # whether written as a separate word or as a hyphenated suffix.
    has_restriction = (
        "nc" in words
        or "-nc" in normalized
        or "nd" in words
        or "-nd" in normalized
    )
    if has_restriction:
        return False

    # Public domain / CC0
    if normalized in ("cc0", "cc-zero", "cc0 1.0", "cc0 1.0 universal"):
        return True
    if "public domain" in normalized or normalized.startswith("pd"):
        return True

    # CC BY-SA and CC BY; the NC/ND variants were already filtered out above.
    attribution_patterns = (r"cc\s+by-sa\b", r"cc\s+by\b")
    return any(re.match(pattern, normalized) for pattern in attribution_patterns)
|
||||
|
||||
|
||||
def get_wikidata_image(qid):
    """Look up the P18 (image) statement of Wikidata item *qid*.

    Returns the URL-decoded last path segment of the first image value the
    SPARQL endpoint reports, or None when the item has no P18 value.
    """
    query = f"SELECT ?image WHERE {{ wd:{qid} wdt:P18 ?image }} LIMIT 1"
    endpoint = f"https://query.wikidata.org/sparql?query={urllib.parse.quote(query)}&format=json"
    rows = fetch_json(endpoint).get("results", {}).get("bindings", [])
    if rows:
        # The value is a URL; its last path segment is taken as the file name.
        last_segment = rows[0]["image"]["value"].split("/")[-1]
        return urllib.parse.unquote(last_segment)
    return None
|
||||
|
||||
|
||||
def get_commons_info(filename):
    """Fetch licence, artist and URLs for a Commons file.

    Queries the Commons ``imageinfo`` API for ``File:<filename>`` with an
    800px thumbnail width. Returns a dict with the keys ``license``,
    ``artist``, ``thumb_url``, ``desc_url`` and ``filename``, or None when
    the API returns no pages or the first page has id ``"-1"``.
    """
    title = f"File:{filename}"
    api_url = (
        f"https://commons.wikimedia.org/w/api.php?action=query"
        f"&titles={urllib.parse.quote(title)}"
        f"&prop=imageinfo&iiprop=url|extmetadata"
        f"&iiurlwidth=800&format=json"
    )
    pages = fetch_json(api_url).get("query", {}).get("pages", {})

    # Only the first page entry is ever examined.
    first_entry = next(iter(pages.items()), None)
    if first_entry is None:
        return None
    page_id, page = first_entry
    if page_id == "-1":
        return None

    image_info = page.get("imageinfo", [{}])[0]
    metadata = image_info.get("extmetadata", {})

    license_short = metadata.get("LicenseShortName", {}).get("value", "").strip()

    # The artist field is HTML: drop the tags, then collapse whitespace runs.
    artist_markup = metadata.get("Artist", {}).get("value", "")
    artist = re.sub(r"\s+", " ", re.sub(r"<[^>]+>", "", artist_markup).strip())
    if len(artist) > 120:
        artist = artist[:117] + "..."

    return {
        "license": license_short,
        "artist": artist,
        # Thumbnail URL as provided by the API for the requested width.
        "thumb_url": image_info.get("thumburl", ""),
        "desc_url": image_info.get("descriptionurl", ""),
        "filename": filename,
    }
|
||||
|
||||
|
||||
def process_species(species_id, slug, name_sci, qid):
    """Import one species image: Wikidata -> Commons -> S3 -> ``images`` table.

    Args:
        species_id: id of the species row the image is attached to.
        slug: species slug; used for the temp file name and the S3 key.
        name_sci: scientific name (not used here; kept for the caller's
            signature).
        qid: Wikidata item id used to look up the P18 image.

    Returns:
        True when the image was uploaded and the INSERT was issued, False
        otherwise. The module-level ``stats`` counters are updated either way.
    """

    def sql_literal(value):
        """Render *value* as a single-quoted SQL string literal."""
        return "'" + str(value).replace("'", "''") + "'"

    def remove_quietly(path):
        """Delete *path*, ignoring the case where it cannot be removed."""
        try:
            os.unlink(path)
        except OSError:
            pass

    stats["total"] += 1

    # Step 1: Get image filename from Wikidata
    try:
        filename = get_wikidata_image(qid)
    except Exception as e:
        print(f" ERROR querying Wikidata for {qid}: {e}")
        stats["errors"] += 1
        return False
    time.sleep(REQUEST_DELAY)

    if not filename:
        print(f" No P18 image for {qid}")
        stats["no_p18"] += 1
        return False

    # Step 2: Get Commons info (license, artist, thumb URL)
    try:
        info = get_commons_info(filename)
    except Exception as e:
        print(f" ERROR querying Commons for {filename}: {e}")
        stats["errors"] += 1
        return False
    time.sleep(REQUEST_DELAY)

    if not info:
        print(f" No Commons info for {filename}")
        stats["errors"] += 1
        return False

    # Step 3: Check license
    if not is_license_allowed(info["license"]):
        print(f" Bad license: {info['license']} for {filename}")
        stats["bad_license"] += 1
        return False

    # Step 4: Download thumbnail using API-provided URL
    thumb_url = info["thumb_url"]
    if not thumb_url:
        print(f" No thumbnail URL available for {filename}")
        stats["download_fail"] += 1
        return False

    # Determine file extension from the last path segment of the thumbnail
    # URL (query string removed); anything unrecognised is treated as JPEG.
    thumb_name = thumb_url.lower().split("?")[0].split("/")[-1]
    ext = "jpg"
    if ".png" in thumb_name:
        ext = "png"
    elif ".gif" in thumb_name:
        ext = "gif"

    tmp_path = f"/tmp/herbapi_img_{slug}.{ext}"
    try:
        img_data = fetch_url(thumb_url)
        with open(tmp_path, "wb") as f:
            f.write(img_data)
    except Exception as e:
        print(f" ERROR downloading {thumb_url}: {e}")
        stats["download_fail"] += 1
        # A failed write may leave a partial file behind; do not leak it.
        remove_quietly(tmp_path)
        return False
    time.sleep(REQUEST_DELAY)

    # Step 5: Upload to S3
    s3_key = f"species/{slug}.{ext}"
    try:
        result = subprocess.run(
            ["aws", "s3", "cp", tmp_path, f"s3://{S3_BUCKET}/{s3_key}", "--endpoint-url", S3_ENDPOINT],
            capture_output=True, text=True, env=AWS_ENV, timeout=60,
        )
        if result.returncode != 0:
            print(f" S3 upload failed: {result.stderr}")
            stats["upload_fail"] += 1
            return False
    except Exception as e:
        print(f" ERROR uploading to S3: {e}")
        stats["upload_fail"] += 1
        return False
    finally:
        # The temp file is no longer needed whether or not the upload worked.
        remove_quietly(tmp_path)

    # Step 6: Insert into DB
    caption = f"Photo: {info['artist']}" if info["artist"] else ""
    source_url = info["desc_url"] or f"https://commons.wikimedia.org/wiki/File:{urllib.parse.quote(filename)}"

    # Every interpolated value is quoted the same way; previously species_id
    # and s3_key were inserted without escaping embedded single quotes.
    sql = (
        "INSERT INTO images (entity_type, entity_id, s3_key, caption, source_url, license, is_primary) "
        f"VALUES ('species', {sql_literal(species_id)}, {sql_literal(s3_key)}, "
        f"{sql_literal(caption)}, {sql_literal(source_url)}, {sql_literal(info['license'])}, true);"
    )
    try:
        psql(sql)
    except Exception as e:
        print(f" ERROR inserting to DB: {e}")
        stats["errors"] += 1
        return False

    stats["imported"] += 1
    return True
|
||||
|
||||
|
||||
def main():
    """Import an image for every species that has a Wikidata QID but no image.

    Reads the candidate species through ``psql``, runs ``process_species`` on
    each one and prints a summary of the ``stats`` counters at the end.
    """
    # Get species without images
    query = (
        "SELECT s.id, s.slug, s.name_scientific, s.wikidata_qid "
        "FROM species s "
        "LEFT JOIN images i ON i.entity_type = 'species' AND i.entity_id = s.id "
        "WHERE s.wikidata_qid IS NOT NULL AND s.wikidata_qid != '' AND i.id IS NULL "
        "ORDER BY s.name_scientific;"
    )
    rows = psql(query)
    if not rows:
        print("No species need images.")
        return

    # psql emits one row per line with "|" between columns; lines that do not
    # split into exactly four fields are dropped.
    species_list = [
        fields
        for fields in (line.strip().split("|") for line in rows.split("\n"))
        if len(fields) == 4
    ]
    species_count = len(species_list)

    print(f"Processing {species_count} species...\n")

    for position, (sid, slug, name_sci, qid) in enumerate(species_list, 1):
        print(f"[{position}/{species_count}] {name_sci} ({qid})")
        if process_species(sid, slug, name_sci, qid):
            print(" OK - imported")

    print("\n" + "=" * 50)
    print("RESULTS:")
    summary_rows = (
        (" Total species processed: ", "total"),
        (" Successfully imported: ", "imported"),
        (" No P18 image: ", "no_p18"),
        (" Bad license (NC/ND/GFDL):", "bad_license"),
        (" Download failures: ", "download_fail"),
        (" Upload failures: ", "upload_fail"),
        (" Other errors: ", "errors"),
    )
    for label, counter in summary_rows:
        print(f"{label}{stats[counter]}")
|
||||
|
||||
|
||||
# Script entry point: run the Commons import only when executed directly.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,126 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Seed HerbAPI with common permaculture plant families and species via GBIF + API."""
|
||||
import json, urllib.request, urllib.parse, time, sys
|
||||
|
||||
API = "http://herbapi01.corp.sub-net.at:8080/api/v1"
|
||||
TOKEN = "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
|
||||
GBIF = "https://api.gbif.org/v1"
|
||||
|
||||
def api_post(path, data):
    """POST *data* as JSON to the HerbAPI endpoint *path*.

    Returns the decoded JSON response, or None when the server answers with
    an HTTP error status (the status and the first 120 characters of the
    error body are written to stderr).
    """
    req = urllib.request.Request(
        f"{API}{path}",
        data=json.dumps(data).encode(),
        headers={"Content-Type": "application/json", "Authorization": f"Bearer {TOKEN}"},
    )
    try:
        # The context manager closes the connection; previously the response
        # object was never closed.
        with urllib.request.urlopen(req) as resp:
            return json.loads(resp.read())
    except urllib.error.HTTPError as e:
        # errors="replace" keeps a non-UTF-8 error body from raising while we
        # are only trying to report it.
        detail = e.read().decode(errors="replace")
        print(f" ERR {e.code}: {detail[:120]}", file=sys.stderr)
        return None
|
||||
|
||||
def gbif_de_name(name):
    """Get German common name from GBIF.

    Matches *name* against the GBIF backbone, then returns the first
    vernacular name whose language is ``"deu"``. The lookup is best-effort:
    it returns None when there is no match, no German name, or the request
    fails (failures are reported on stderr).
    """
    match_url = f"{GBIF}/species/match?name={urllib.parse.quote(name)}"
    try:
        with urllib.request.urlopen(match_url) as resp:
            match = json.loads(resp.read())
        usage_key = match.get("usageKey")
        if not usage_key:
            return None
        names_url = f"{GBIF}/species/{usage_key}/vernacularNames?limit=100"
        with urllib.request.urlopen(names_url) as resp:
            data = json.loads(resp.read())
        for entry in data.get("results", []):
            if entry.get("language") == "deu":
                return entry["vernacularName"]
    except Exception as e:
        # Was a bare ``except: pass``, which also swallowed KeyboardInterrupt
        # and SystemExit and hid every failure. Still best-effort, but now
        # interruptible and visible.
        print(f" GBIF lookup failed for {name}: {e}", file=sys.stderr)
    return None
|
||||
|
||||
FAMILIES = [
|
||||
("Fabaceae", "Hülsenfrüchtler", "Legumes"),
|
||||
("Rosaceae", "Rosengewächse", "Rose family"),
|
||||
("Brassicaceae", "Kreuzblütler", "Cabbage family"),
|
||||
("Apiaceae", "Doldenblütler", "Carrot family"),
|
||||
("Lamiaceae", "Lippenblütler", "Mint family"),
|
||||
("Asteraceae", "Korbblütler", "Daisy family"),
|
||||
("Solanaceae", "Nachtschattengewächse", "Nightshade family"),
|
||||
("Cucurbitaceae", "Kürbisgewächse", "Gourd family"),
|
||||
("Poaceae", "Süßgräser", "Grass family"),
|
||||
("Amaryllidaceae", "Amaryllisgewächse", "Amaryllis family"),
|
||||
("Boraginaceae", "Raublattgewächse", "Borage family"),
|
||||
("Adoxaceae", "Moschuskrautgewächse", "Moschatel family"),
|
||||
("Betulaceae", "Birkengewächse", "Birch family"),
|
||||
("Fagaceae", "Buchengewächse", "Beech family"),
|
||||
("Juglandaceae", "Walnussgewächse", "Walnut family"),
|
||||
("Caprifoliaceae", "Geißblattgewächse", "Honeysuckle family"),
|
||||
("Grossulariaceae", "Stachelbeergewächse", "Gooseberry family"),
|
||||
("Ericaceae", "Heidekrautgewächse", "Heath family"),
|
||||
("Moraceae", "Maulbeergewächse", "Mulberry family"),
|
||||
("Urticaceae", "Brennnesselgewächse", "Nettle family"),
|
||||
("Malvaceae", "Malvengewächse", "Mallow family"),
|
||||
("Polygonaceae", "Knöterichgewächse", "Buckwheat family"),
|
||||
("Chenopodiaceae", "Gänsefußgewächse", "Goosefoot family"),
|
||||
("Asparagaceae", "Spargelgewächse", "Asparagus family"),
|
||||
("Plantaginaceae", "Wegerichgewächse", "Plantain family"),
|
||||
]
|
||||
|
||||
SPECIES = [
|
||||
("Sambucus nigra", "Adoxaceae", {"plant_layer": "understory", "nitrogen_fixer": False, "food_uses": "Flowers (cordial, fritters), berries (cooked — syrup, wine)", "medicinal_uses": "Cold/flu remedy, immune support, diaphoretic", "succession_stage": "secondary"}),
|
||||
("Symphytum officinale", "Boraginaceae", {"plant_layer": "herbaceous", "dynamic_accumulator": True, "food_uses": "Young leaves (limited, contains pyrrolizidine alkaloids)", "medicinal_uses": "Wound healing, bone knitting (external only)", "other_uses": "Dynamic accumulator, mulch/compost activator, animal fodder"}),
|
||||
("Trifolium pratense", "Fabaceae", {"plant_layer": "ground_cover", "nitrogen_fixer": True, "food_uses": "Flowers, young leaves", "medicinal_uses": "Respiratory, menopausal symptoms", "other_uses": "Green manure, nitrogen fixer, bee forage"}),
|
||||
("Corylus avellana", "Betulaceae", {"plant_layer": "shrub", "food_uses": "Nuts", "other_uses": "Coppice wood, hedging, wildlife habitat", "succession_stage": "secondary"}),
|
||||
("Ribes nigrum", "Grossulariaceae", {"plant_layer": "shrub", "food_uses": "Berries, leaves (tea)", "medicinal_uses": "High vitamin C, anti-inflammatory"}),
|
||||
("Rubus idaeus", "Rosaceae", {"plant_layer": "shrub", "food_uses": "Berries, leaves (tea)", "medicinal_uses": "Leaf tea for pregnancy/digestion", "succession_stage": "pioneer"}),
|
||||
("Urtica dioica", "Urticaceae", {"plant_layer": "herbaceous", "dynamic_accumulator": True, "food_uses": "Young leaves, seeds", "medicinal_uses": "Anti-inflammatory, prostate, allergies", "other_uses": "Compost activator, fibre, liquid fertiliser"}),
|
||||
("Borago officinalis", "Boraginaceae", {"plant_layer": "herbaceous", "food_uses": "Flowers, young leaves", "other_uses": "Bee forage, companion plant", "attracts_pollinators": True}),
|
||||
("Lavandula angustifolia", "Lamiaceae", {"plant_layer": "herbaceous", "food_uses": "Flowers", "medicinal_uses": "Calming, antiseptic, sleep aid", "other_uses": "Bee forage, pest repellent, fragrance", "attracts_pollinators": True}),
|
||||
("Malus domestica", "Rosaceae", {"plant_layer": "canopy", "food_uses": "Fruit", "pollination_type": "Insect-pollinated"}),
|
||||
("Prunus domestica", "Rosaceae", {"plant_layer": "canopy", "food_uses": "Fruit", "pollination_type": "Insect-pollinated"}),
|
||||
("Juglans regia", "Juglandaceae", {"plant_layer": "canopy", "food_uses": "Nuts", "other_uses": "Timber, dye", "allelopathic": True}),
|
||||
("Fragaria vesca", "Rosaceae", {"plant_layer": "ground_cover", "food_uses": "Berries, leaves (tea)", "ground_cover_quality": "Good"}),
|
||||
("Allium ursinum", "Amaryllidaceae", {"plant_layer": "ground_cover", "food_uses": "Leaves, flowers, bulbs", "medicinal_uses": "Antimicrobial, blood pressure"}),
|
||||
("Phacelia tanacetifolia", "Boraginaceae", {"plant_layer": "herbaceous", "other_uses": "Green manure, bee forage, cover crop", "attracts_pollinators": True}),
|
||||
("Lupinus polyphyllus", "Fabaceae", {"plant_layer": "herbaceous", "nitrogen_fixer": True, "other_uses": "Nitrogen fixer, green manure, ornamental"}),
|
||||
("Vicia faba", "Fabaceae", {"plant_layer": "herbaceous", "nitrogen_fixer": True, "food_uses": "Beans", "other_uses": "Nitrogen fixer, green manure"}),
|
||||
("Solanum lycopersicum", "Solanaceae", {"plant_layer": "herbaceous", "food_uses": "Fruit"}),
|
||||
("Cucurbita pepo", "Cucurbitaceae", {"plant_layer": "ground_cover", "food_uses": "Fruit, seeds, flowers"}),
|
||||
("Beta vulgaris", "Chenopodiaceae", {"plant_layer": "herbaceous", "food_uses": "Roots, leaves"}),
|
||||
("Daucus carota", "Apiaceae", {"plant_layer": "herbaceous", "food_uses": "Root"}),
|
||||
("Calendula officinalis", "Asteraceae", {"plant_layer": "herbaceous", "food_uses": "Flowers", "medicinal_uses": "Wound healing, anti-inflammatory, skin care", "other_uses": "Companion plant, pest deterrent", "attracts_pollinators": True}),
|
||||
("Melissa officinalis", "Lamiaceae", {"plant_layer": "herbaceous", "food_uses": "Leaves", "medicinal_uses": "Calming, antiviral, digestive", "attracts_pollinators": True}),
|
||||
("Salvia officinalis", "Lamiaceae", {"plant_layer": "herbaceous", "food_uses": "Leaves", "medicinal_uses": "Sore throat, digestive, antimicrobial"}),
|
||||
("Thymus vulgaris", "Lamiaceae", {"plant_layer": "ground_cover", "food_uses": "Leaves", "medicinal_uses": "Respiratory, antimicrobial, cough"}),
|
||||
]
|
||||
|
||||
# Create families
# Each family is POSTed individually; the returned ids are remembered by
# scientific name so the species below can reference them.
print("=== Creating families ===")
family_map = {}
for sci, de, en in FAMILIES:
    family_payload = {"name_scientific": sci, "name_de": de, "name_en": en}
    r = api_post("/families", family_payload)
    if r:
        family_map[sci] = r["id"]
        print(f" ✓ {sci}")
    time.sleep(0.05)
print(f"Created {len(family_map)} families\n")

# Create species
# A species is skipped when its family was not created above.
print("=== Creating species (with GBIF German names) ===")
created = 0
for sci_name, family_sci, extra in SPECIES:
    fam_id = family_map.get(family_sci)
    if fam_id:
        de_name = gbif_de_name(sci_name)
        data = {
            "name_scientific": sci_name,
            "name_de": de_name or "",
            "name_en": "",
            "family_id": fam_id,
            **extra,
        }
        r = api_post("/species", data)
        if r:
            created += 1
            print(f" ✓ {sci_name} → {de_name or '(no DE name)'}")
        time.sleep(0.15)
    else:
        print(f" ✗ {sci_name} — family {family_sci} missing")
print(f"Created {created} species\n")

# Create suppliers
print("=== Creating suppliers ===")
SUPPLIER_ROWS = [
    ("Reinsaat", "https://www.reinsaat.at", "AT", True, True, "Austrian biodynamic seed producer, open-pollinated varieties"),
    ("Magic Garden Seeds", "https://www.magicgardenseeds.com", "DE", False, False, "Specialist seed shop with rare and heritage varieties"),
]
for name, url, country, organic, demeter, notes in SUPPLIER_ROWS:
    supplier_payload = {
        "name": name,
        "url": url,
        "country": country,
        "is_organic": organic,
        "is_demeter": demeter,
        "notes": notes,
    }
    r = api_post("/suppliers", supplier_payload)
    if r:
        print(f" ✓ {name}")
print("\nDone!")
|
||||
@@ -0,0 +1,514 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Scrape Arche Noah seed catalog and import cultivars into HerbAPI.
|
||||
|
||||
Uses the shop.arche-noah.at Angular SPA's backend API (ACM) to fetch
|
||||
product listings and details, then creates cultivars in HerbAPI matched
|
||||
to existing species.
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
|
||||
# --- Configuration -----------------------------------------------------------
|
||||
|
||||
HERBAPI_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
|
||||
HERBAPI_TOKEN = "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
|
||||
|
||||
SHOP_BASE = "https://shop.arche-noah.at/ACM/api/"
|
||||
SHOP_UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
||||
|
||||
REQUEST_DELAY = 0.5 # seconds between requests
|
||||
|
||||
# Only import products from these Arche Noah article lines (their own seeds)
|
||||
ARCHE_NOAH_LINES = {
|
||||
"Bio-Saatgut von ARCHE NOAH",
|
||||
"Kostbarkeiten aus dem ARCHE NOAH Samenarchiv",
|
||||
}
|
||||
|
||||
# Search terms to discover all seed products across the shop
|
||||
SEARCH_TERMS = [
|
||||
"Tomate", "Paradeiser", "Paprika", "Chili", "Gurke", "Kürbis", "Zucchini",
|
||||
"Bohne", "Erbse", "Fisole", "Salat", "Kohl", "Kraut", "Melanzani", "Aubergine",
|
||||
"Mais", "Zwiebel", "Lauch", "Karotte", "Rübe", "Basilikum", "Kräuter",
|
||||
"Blume", "Sonnenblume", "Dill", "Petersilie", "Spinat", "Mangold",
|
||||
"Melone", "Fenchel", "Sellerie", "Rettich", "Radieschen",
|
||||
"Koriander", "Oregano", "Thymian", "Salbei", "Rosmarin", "Minze",
|
||||
"Ringelblume", "Kornblume", "Kapuzinerkresse", "Senf",
|
||||
"Erdbeere", "Lupine", "Luzerne", "Klee", "Bohne", "Mohn",
|
||||
"Radicchio", "Rucola", "Endivie", "Artischocke", "Pastinake",
|
||||
"Schnittlauch", "Knoblauch", "Bärlauch", "Wermut",
|
||||
"Baldrian", "Johanniskraut", "Sonnenhut", "Beinwell",
|
||||
"Studentenblume", "Tagetes", "Phacelia", "Buchweizen",
|
||||
"Rote Bete", "Rote Rübe", "Mangold", "Melde",
|
||||
"Kohlrabi", "Brokkoli", "Blumenkohl", "Rosenkohl", "Wirsing",
|
||||
"Pflücksalat", "Kopfsalat", "Feldsalat", "Asiasalat",
|
||||
"Zuckermais", "Popcorn",
|
||||
]
|
||||
|
||||
# --- Helpers -----------------------------------------------------------------
|
||||
|
||||
def herbapi_request(method, path, data=None):
    """Make a request to HerbAPI.

    Args:
        method: HTTP method, e.g. ``"GET"`` or ``"POST"``.
        path: path relative to ``HERBAPI_BASE`` (no leading slash).
        data: optional JSON-serialisable payload; sent whenever it is not
            None, so an explicit empty object is no longer dropped.

    Returns:
        The decoded JSON response, or None when the response body is empty.

    Raises:
        urllib.error.HTTPError: re-raised after the status and the first 200
            characters of the error body have been written to stderr.
    """
    url = f"{HERBAPI_BASE}/{path}"
    payload = json.dumps(data).encode() if data is not None else None
    req = urllib.request.Request(url, data=payload, method=method, headers={
        "Authorization": f"Bearer {HERBAPI_TOKEN}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    })
    try:
        # The context manager closes the connection; previously the response
        # object was never closed.
        with urllib.request.urlopen(req, timeout=30) as resp:
            raw = resp.read().decode("utf-8")
    except urllib.error.HTTPError as e:
        error_text = e.read().decode("utf-8", errors="replace")
        print(f" HerbAPI {method} {path}: HTTP {e.code} - {error_text[:200]}", file=sys.stderr)
        raise
    return json.loads(raw) if raw.strip() else None
|
||||
|
||||
|
||||
def shop_create_session():
    """Create an anonymous session on the Arche Noah shop.

    POSTs an empty JSON object to ``webshop/createanonymoususer`` and returns
    the ``JSESSIONID`` value from the response cookies.

    Raises:
        RuntimeError: if no non-empty ``JSESSIONID`` cookie is returned.
    """
    req = urllib.request.Request(
        SHOP_BASE + "webshop/createanonymoususer",
        data=json.dumps({}).encode(),
        headers={
            "User-Agent": SHOP_UA,
            "Content-Type": "application/json",
            "Origin": "https://shop.arche-noah.at",
            "Referer": "https://shop.arche-noah.at/",
        },
    )
    with urllib.request.urlopen(req, timeout=15) as resp:
        # A response may carry several Set-Cookie headers; ``get`` returns
        # only the first, so the session cookie was missed whenever another
        # cookie came before it.
        cookie_headers = resp.headers.get_all("Set-Cookie") or []

    for cookie in cookie_headers:
        if "JSESSIONID=" in cookie:
            session = cookie.split("JSESSIONID=")[1].split(";")[0]
            if session:
                return session
    raise RuntimeError("Failed to get shop session")
|
||||
|
||||
|
||||
def shop_request(session, endpoint, payload):
    """Make a POST request to the shop API.

    Args:
        session: ``JSESSIONID`` value obtained from ``shop_create_session``.
        endpoint: path relative to ``SHOP_BASE``.
        payload: JSON-serialisable request body.

    Returns:
        The decoded JSON response, or None when the response body is empty.
        ``urllib`` errors propagate to the caller.
    """
    req = urllib.request.Request(
        SHOP_BASE + endpoint,
        data=json.dumps(payload).encode(),
        headers={
            "User-Agent": SHOP_UA,
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Cookie": f"JSESSIONID={session}",
            "Origin": "https://shop.arche-noah.at",
            "Referer": "https://shop.arche-noah.at/",
        },
    )
    # The context manager closes the connection; this function runs once per
    # product, and the response object was previously never closed.
    with urllib.request.urlopen(req, timeout=30) as resp:
        raw = resp.read().decode("utf-8")
    return json.loads(raw) if raw.strip() else None
|
||||
|
||||
|
||||
def extract_latin_name(detail_headline3):
    """Pull the botanical name out of a product's ``detailHeadline3`` value.

    HTML tags are removed and everything from "Hier geht" onwards is cut off.
    The remainder is returned only when it starts like a binomial (capitalised
    word, space, lowercase letter); otherwise the result is None.
    """
    if not detail_headline3:
        return None

    without_tags = re.sub(r"<[^>]+>", "", detail_headline3).strip()
    # Drop the trailing "Hier geht es zu unseren..." text.
    candidate = without_tags.partition("Hier geht")[0].strip()

    # Expected shape: "Solanum lycopersicum", "Capsicum annuum", ...
    if not candidate:
        return None
    if re.match(r"^[A-Z][a-z]+ [a-z]", candidate) is None:
        return None
    return candidate
|
||||
|
||||
|
||||
def match_species(latin_name, species_by_scientific):
    """Find the species entry for *latin_name*, tolerating infraspecific ranks.

    *species_by_scientific* maps lowercase scientific names to species
    records. The full (trimmed, lowercased) name is tried first; if that
    fails, only the leading "genus species" pair is looked up, so
    "Phaseolus vulgaris var. nanus" resolves to "Phaseolus vulgaris".
    Returns the matching record, or None.
    """
    if not latin_name:
        return None

    lookup_key = latin_name.strip().lower()

    # Direct match
    exact = species_by_scientific.get(lookup_key)
    if exact:
        return exact

    # Fall back to the first two words, dropping var./subsp./convar./f./ssp.
    # qualifiers and whatever follows them.
    binomial = re.match(r"^([A-Za-z]+ [a-z]+)", lookup_key)
    if binomial is None:
        return None
    return species_by_scientific.get(binomial.group(1).strip()) or None
|
||||
|
||||
|
||||
def extract_cultivar_name(product_name):
    """Return the variety name contained in a shop product name.

    The text between the first pair of quote marks (straight, typographic,
    backtick or acute accent) is preferred:

        "Salatparadeiser 'Naama' HG026"                  -> "Naama"
        "Cocktailparadeiser 'Golden Perfection' TO019"   -> "Golden Perfection"
        "Buschbohne 'Marmorierter Mond' HG055"           -> "Marmorierter Mond"

    Without quotes, the whole name is returned with a trailing article number
    (such as HG026 or TO019) removed; the type prefix is kept.
    """
    quoted = re.search(r"['\u2018\u2019`\u00b4]+([^'\u2018\u2019`\u00b4]+)['\u2018\u2019`\u00b4]+", product_name)
    if quoted is not None:
        return quoted.group(1).strip()

    # Fallback: only the article-number suffix is stripped.
    return re.sub(r"\s+[A-Z]{1,3}\d{2,4}\s*$", "", product_name).strip()
|
||||
|
||||
|
||||
def parse_pack_info(unit_desc):
    """Parse pack size info from unitDesc like '20-30 Korn' or '2g'.

    For a range the lower bound is used. Decimal sizes written with either a
    dot or a comma ("0,5 g") and spaces around the range dash are accepted.

    Returns:
        ``(pack_size, pack_unit)`` with ``pack_size`` as a float, or
        ``(None, None)`` when *unit_desc* is empty or does not consist of a
        number followed by a unit.
    """
    if not unit_desc:
        return None, None
    # The unit must start with a letter. The previous pattern let the unit
    # group match digits, so "100" parsed as (10.0, "0") and "20 - 30 Korn"
    # as (2.0, "0").
    m = re.match(
        r"\s*(\d+(?:[.,]\d+)?)"                   # size (lower bound of a range)
        r"(?:\s*[-\u2013]\s*\d+(?:[.,]\d+)?)?"    # optional upper bound
        r"\s*([^\W\d_]\w*)",                      # unit
        unit_desc,
    )
    if m:
        return float(m.group(1).replace(",", ".")), m.group(2)
    return None, None
|
||||
|
||||
|
||||
# --- Main scraping logic -----------------------------------------------------
|
||||
|
||||
def fetch_all_arche_noah_products(session):
    """Collect Arche Noah's own seed products by running every search term.

    Each distinct term in ``SEARCH_TERMS`` (case-insensitive) is searched in
    pages of 200. Products are de-duplicated by ``sid``, keeping the first
    occurrence. The result is a ``{sid: product}`` dict restricted to products
    whose ``articleLineDesc`` is in ``ARCHE_NOAH_LINES``. A failing request
    ends the paging for that term only.
    """
    page_size = 200
    products_by_sid = {}
    searched = set()

    for term in SEARCH_TERMS:
        term_key = term.lower()
        if term_key in searched:
            continue
        searched.add(term_key)

        start = 0
        while True:
            query = {
                "searchCriteria": term,
                "startIndex": start,
                "numDataSets": page_size,
                "allowAllProducts": False,
            }
            try:
                batch = shop_request(session, "webshop/getproducts", query)
            except Exception as e:
                print(f" Search '{term}' offset={start} failed: {e}", file=sys.stderr)
                break

            if not batch:
                break

            for item in batch:
                # First occurrence wins; later duplicates are ignored.
                products_by_sid.setdefault(item["sid"], item)

            # A short page means this was the last one for the term.
            if len(batch) < page_size:
                break
            start += len(batch)
            time.sleep(REQUEST_DELAY)

        time.sleep(REQUEST_DELAY)

    # Filter to Arche Noah's own seed products only
    own_products = {}
    for sid, item in products_by_sid.items():
        if (item.get("articleLineDesc") or "") in ARCHE_NOAH_LINES:
            own_products[sid] = item

    print(f"Found {len(products_by_sid)} total products, {len(own_products)} Arche Noah seed products")
    return own_products
|
||||
|
||||
|
||||
def fetch_product_details(session, products):
    """Request the detail record of every product in *products*.

    Returns a ``{sid: detail}`` dict containing only the products for which
    the shop returned a non-empty detail. Failed requests are reported on
    stderr and skipped. Progress is printed after every 20 products.
    """
    details = {}
    total = len(products)

    for done, (sid, _product) in enumerate(products.items(), start=1):
        try:
            detail = shop_request(session, "webshop/getproductdetail", {"productSid": sid})
            if detail:
                details[sid] = detail
        except Exception as e:
            print(f" Detail for {sid} failed: {e}", file=sys.stderr)

        if done % 20 == 0:
            print(f" Fetched details: {done}/{total}")
        time.sleep(REQUEST_DELAY)

    print(f"Fetched {len(details)} product details")
    return details
|
||||
|
||||
|
||||
def load_herbapi_species():
    """Fetch every species from HerbAPI, 100 per page.

    Accepts either a ``{"data": [...], "total": n}`` envelope or a bare list
    as the response. Returns ``(species_list, by_scientific)``, where
    ``by_scientific`` maps the trimmed, lowercased scientific name to its
    species record.
    """
    collected = []
    page = 1
    while True:
        result = herbapi_request("GET", f"species?per_page=100&page={page}")
        if isinstance(result, dict) and "data" in result:
            batch, total = result["data"], result.get("total", 0)
        elif isinstance(result, list):
            batch, total = result, len(result)
        else:
            break

        collected.extend(batch)
        # Stop once the reported total is reached or a page comes back empty.
        if len(collected) >= total or not batch:
            break
        page += 1

    # Build lookup by scientific name (normalized lowercase); when two records
    # share a name the later one wins.
    by_scientific = {
        record["name_scientific"].strip().lower(): record for record in collected
    }
    return collected, by_scientific
|
||||
|
||||
|
||||
def load_herbapi_cultivars():
    """Fetch every existing cultivar from HerbAPI, 100 per page.

    Accepts either a ``{"data": [...], "total": n}`` envelope or a bare list
    as the response. Returns ``(all_cultivars, by_key)``, where ``by_key``
    maps ``(species_id, lowercased trimmed name)`` to the cultivar record.
    """
    collected = []
    page = 1
    while True:
        result = herbapi_request("GET", f"cultivars?per_page=100&page={page}")
        if isinstance(result, dict) and "data" in result:
            batch, total = result["data"], result.get("total", 0)
        elif isinstance(result, list):
            batch, total = result, len(result)
        else:
            break

        collected.extend(batch)
        # Stop once the reported total is reached or a page comes back empty.
        if len(collected) >= total or not batch:
            break
        page += 1

    # Build lookup by (species_id, normalized cultivar name); when two records
    # share a key the later one wins.
    by_key = {
        (record["species_id"], record["name"].strip().lower()): record
        for record in collected
    }
    return collected, by_key
|
||||
|
||||
|
||||
def ensure_supplier():
    """Return the id of the Arche Noah supplier, creating it if necessary.

    An existing supplier is recognised by a name that contains both "arche"
    and "noah" (case-insensitive).
    """
    listing = herbapi_request("GET", "suppliers")
    if isinstance(listing, dict) and "data" in listing:
        listing = listing["data"]

    for supplier in listing:
        if "arche" in supplier["name"].lower() and "noah" in supplier["name"].lower():
            existing_id = supplier["id"]
            print(f"Supplier 'Arche Noah' already exists: {existing_id}")
            return existing_id

    print("Creating supplier 'Arche Noah'...")
    new_supplier = {
        "name": "Arche Noah",
        "url": "https://www.arche-noah.at",
        "country": "AT",
        "is_organic": True,
        "is_demeter": False,
        "notes": "Austrian society for heritage seed preservation and biodiversity",
    }
    created = herbapi_request("POST", "suppliers", new_supplier)
    print(f"Created supplier: {created['id']}")
    return created["id"]
|
||||
|
||||
|
||||
def load_existing_supplier_links(cultivar_id):
    """Return the supplier links already stored for *cultivar_id*.

    Best-effort: any failure of the request, or a response that is neither a
    list nor a dict with a ``"data"`` key, yields an empty list.
    """
    try:
        response = herbapi_request("GET", f"cultivars/{cultivar_id}/suppliers")
    except Exception:
        return []

    if isinstance(response, list):
        return response
    if isinstance(response, dict) and "data" in response:
        return response["data"]
    return []
|
||||
|
||||
|
||||
def main():
    """Import the Arche Noah seed catalog into HerbAPI.

    The run has six steps: ensure the supplier record exists, load the
    HerbAPI species, load the existing cultivars, scrape the shop catalog,
    fetch the product details (they carry the Latin names) and finally
    create missing cultivars and supplier links. Progress is printed to
    stdout, per-item errors to stderr, and a summary of all counters is
    printed at the end.
    """
    now_str = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    print("=== Arche Noah Seed Catalog Scraper ===")
    print(f"Started at {now_str}\n")

    # Step 1: Create Arche Noah supplier in HerbAPI
    print("[1/6] Ensuring Arche Noah supplier exists...")
    supplier_id = ensure_supplier()
    print()

    # Step 2: Load HerbAPI species for matching
    print("[2/6] Loading HerbAPI species...")
    species_list, species_by_scientific = load_herbapi_species()
    print(f"Loaded {len(species_list)} species")
    print()

    # Step 3: Load existing cultivars for idempotency
    print("[3/6] Loading existing cultivars...")
    existing_cultivars, cultivars_by_key = load_herbapi_cultivars()
    print(f"Loaded {len(existing_cultivars)} existing cultivars")
    print()

    # Step 4: Scrape Arche Noah shop
    print("[4/6] Scraping Arche Noah shop catalog...")
    session = shop_create_session()
    print("Got shop session")
    products = fetch_all_arche_noah_products(session)
    print()

    # Step 5: Fetch product details (to get Latin names)
    print("[5/6] Fetching product details for Latin name matching...")
    details = fetch_product_details(session, products)
    print()

    # Step 6: Create cultivars in HerbAPI
    print("[6/6] Creating cultivars in HerbAPI...")
    stats = {
        "created": 0,
        "skipped_existing": 0,
        "skipped_no_species": 0,
        "skipped_no_name": 0,
        "supplier_linked": 0,
        "supplier_link_existed": 0,
        "errors": 0,
    }

    for sid, product in sorted(products.items()):
        detail = details.get(sid, {})

        # The Latin name comes from the detail record only; an empty result
        # is normalised to None. No category-based fallback is implemented.
        latin_name = extract_latin_name(detail.get("detailHeadline3", "")) or None

        # Match to HerbAPI species (handles subspecies/variety suffixes)
        species = match_species(latin_name, species_by_scientific)
        if not species:
            print(f" SKIP (no species match): {product['name']} | latin={latin_name}")
            stats["skipped_no_species"] += 1
            continue

        cultivar_name = extract_cultivar_name(product["name"])
        if not cultivar_name:
            print(f" SKIP (no cultivar name): {product['name']}")
            # Counted separately so the "no species match" figure stays accurate.
            stats["skipped_no_name"] += 1
            continue

        # The product URL is needed for both the cultivar and the supplier link.
        alias = product.get("alias") or detail.get("alias", "")
        product_url = f"https://shop.arche-noah.at/produkt/{alias}" if alias else None

        # Check if cultivar already exists (idempotency)
        lookup_key = (species["id"], cultivar_name.strip().lower())
        existing = cultivars_by_key.get(lookup_key)

        if existing:
            cultivar_id = existing["id"]
            stats["skipped_existing"] += 1
        else:
            is_organic = product.get("articleLineDesc") == "Bio-Saatgut von ARCHE NOAH"
            cultivar_data = {
                "species_id": species["id"],
                "name": cultivar_name,
                "name_de": cultivar_name,
                "is_organic": is_organic,
                "source_urls": [product_url] if product_url else None,
            }

            try:
                result = herbapi_request("POST", "cultivars", cultivar_data)
                cultivar_id = result["id"]
                stats["created"] += 1
                # Add to lookup for idempotency within this run
                cultivars_by_key[lookup_key] = result
                print(f" CREATED: {cultivar_name} ({species['name_scientific']})")
            except Exception as e:
                print(f" ERROR creating '{cultivar_name}': {e}", file=sys.stderr)
                stats["errors"] += 1
                continue

        # Link cultivar to supplier
        existing_links = load_existing_supplier_links(cultivar_id)
        # .get() so that a link record without the key cannot abort the run.
        already_linked = any(
            link.get("supplier_id") == supplier_id for link in existing_links
        )

        if already_linked:
            stats["supplier_link_existed"] += 1
        else:
            # Parse pack info
            unit_desc = product.get("unitDesc") or detail.get("unitDesc", "")
            pack_size, pack_unit = parse_pack_info(unit_desc)

            # Price of the first price-list position, when there is one
            price = None
            price_list = product.get("priceListPos") or detail.get("priceListPos", [])
            if price_list:
                price = price_list[0].get("singleUnitPrice")

            link_data = {
                "supplier_id": supplier_id,
                "article_number": str(product.get("articleNr", "")),
                "product_url": product_url,
                "price_eur": price,
                "pack_size": pack_size,
                "pack_unit": pack_unit,
            }

            try:
                herbapi_request("POST", f"cultivars/{cultivar_id}/suppliers", link_data)
                stats["supplier_linked"] += 1
            except Exception as e:
                print(f" ERROR linking supplier for '{cultivar_name}': {e}", file=sys.stderr)
                stats["errors"] += 1

        time.sleep(0.1)  # small delay between HerbAPI calls

    # Summary
    print(f"\n{'='*60}")
    print("Scraping complete!")
    print(f" Cultivars created: {stats['created']}")
    print(f" Cultivars already existed: {stats['skipped_existing']}")
    print(f" Skipped (no species match): {stats['skipped_no_species']}")
    print(f" Skipped (no cultivar name): {stats['skipped_no_name']}")
    print(f" Supplier links created: {stats['supplier_linked']}")
    print(f" Supplier links existed: {stats['supplier_link_existed']}")
    print(f" Errors: {stats['errors']}")
|
||||
|
||||
|
||||
# Run the import only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,843 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Scraper for Bingenheimer Saatgut (https://www.bingenheimersaatgut.de/)
|
||||
Extracts cultivar data and imports into HerbAPI.
|
||||
|
||||
Categories scraped: Gemüse (vegetables), Kräuter (herbs), Gründüngung (green manure).
|
||||
"""
|
||||
|
||||
import json
import os
import re
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from html.parser import HTMLParser
from typing import Optional
|
||||
|
||||
# ── Configuration ─────────────────────────────────────────────────────────
# The HerbAPI endpoint and token can be overridden through the environment
# variables HERBAPI_BASE and HERBAPI_TOKEN; the literals below are only the
# fallback values.
API_BASE = os.environ.get("HERBAPI_BASE", "http://herbapi01.corp.sub-net.at:8080/api/v1")
# NOTE(review): a bearer token is committed here. It should be rotated and
# this default removed once every caller supplies HERBAPI_TOKEN.
API_TOKEN = os.environ.get("HERBAPI_TOKEN", "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk")
SITE_BASE = "https://www.bingenheimersaatgut.de"
DELAY = 0.5  # seconds slept around requests to the shop site
USER_AGENT = "HerbAPI-Scraper/1.0 (+https://sub-net.at)"
|
||||
|
||||
# ── Category URLs to scrape ───────────────────────────────────────────────
# Each entry is (url_path, default_species_scientific_name).
# url_path is inserted into "<SITE_BASE>/de/bio-saatgut/<url_path>.html" to
# build the listing URL. The default species is used for a product when its
# detail page yields no Latin name.

VEGETABLE_CATEGORIES = [
    ("gemuese/tomaten", "Solanum lycopersicum"),
    ("gemuese/gurken/gewuerzgurke", "Cucumis sativus"),
    ("gemuese/gurken/salatgurken", "Cucumis sativus"),
    ("gemuese/aubergine", "Solanum melongena"),
    ("gemuese/bohnen/buschbohne", "Phaseolus vulgaris"),
    ("gemuese/bohnen/stangenbohne", "Phaseolus vulgaris"),
    ("gemuese/bohnen/dicke-bohne", "Vicia faba"),
    ("gemuese/bohnen/feuerbohne", "Phaseolus coccineus"),
    ("gemuese/bohnen/edamame-sojabohne", "Glycine max"),
    ("gemuese/bohnen/spaghettibohne", "Vigna unguiculata"),
    ("gemuese/erbsen/markerbse", "Pisum sativum"),
    ("gemuese/erbsen/schalerbse", "Pisum sativum"),
    ("gemuese/erbsen/zuckererbse", "Pisum sativum"),
    ("gemuese/feldsalat", "Valerianella locusta"),
    ("gemuese/knollenfenchel", "Foeniculum vulgare"),
    ("gemuese/kohl/blumenkohl", "Brassica oleracea"),
    ("gemuese/kohl/brokkoli", "Brassica oleracea"),
    ("gemuese/kohl/chinakohlpak-choi", "Brassica rapa"),
    ("gemuese/kohl/gruenkohl", "Brassica oleracea"),
    ("gemuese/kohl/kohlrabi", "Brassica oleracea"),
    ("gemuese/kohl/rotkohl", "Brassica oleracea"),
    ("gemuese/kohl/weisskohl", "Brassica oleracea"),
    ("gemuese/kohl/wirsing", "Brassica oleracea"),
    ("gemuese/kohl/rosenkohl", "Brassica oleracea"),
    ("gemuese/kresse", "Lepidium sativum"),
    ("gemuese/kuerbis", "Cucurbita maxima"),
    ("gemuese/zuckermais", "Zea mays"),
    ("gemuese/mangold", "Beta vulgaris"),
    ("gemuese/melone", "Cucumis melo"),
    ("gemuese/moehren", "Daucus carota"),
    ("gemuese/paprika/gemuesepaprika", "Capsicum annuum"),
    ("gemuese/paprika/chili", "Capsicum annuum"),
    ("gemuese/pastinaken", "Pastinaca sativa"),
    ("gemuese/petersilienwurzel", "Petroselinum crispum"),
    ("gemuese/physalis", "Physalis peruviana"),
    ("gemuese/porreelauch", "Allium porrum"),
    ("gemuese/radies", "Raphanus sativus"),
    ("gemuese/rettich", "Raphanus sativus"),
    ("gemuese/rote-bete", "Beta vulgaris"),
    ("gemuese/rueben/mai-herbstruebennavets", "Brassica rapa"),
    ("gemuese/rueben/kohlruebe", "Brassica napus"),
    ("gemuese/rucola", "Eruca vesicaria"),
    ("gemuese/salat/bataviasalat", "Lactuca sativa"),
    ("gemuese/salat/eichblattsalat", "Lactuca sativa"),
    ("gemuese/salat/eissalat", "Lactuca sativa"),
    ("gemuese/salat/endivien", "Cichorium endivia"),
    ("gemuese/salat/hirschhornwegerich", "Plantago coronopus"),
    ("gemuese/salat/kopfsalat", "Lactuca sativa"),
    ("gemuese/salat/lollosalat", "Lactuca sativa"),
    ("gemuese/salat/romanasalat", "Lactuca sativa"),
    ("gemuese/salat/baby-leaf", "Lactuca sativa"),
    ("gemuese/sellerie/knollensellerie", "Apium graveolens"),
    ("gemuese/sellerie/stangen--bleichsellerie", "Apium graveolens"),
    ("gemuese/spinatspinat-aehnliche/spinat", "Spinacia oleracea"),
    ("gemuese/spinatspinat-aehnliche/neuseelaender-spinat", "Tetragonia tetragonioides"),
    ("gemuese/blattstielgemuese", "Beta vulgaris"),
    ("gemuese/zwiebeln", "Allium cepa"),
    ("gemuese/lauchzwiebeln", "Allium fistulosum"),
    ("gemuese/artischocke", "Cynara cardunculus"),
    ("gemuese/asia-salate", "Brassica juncea"),
    ("gemuese/chicoree", "Cichorium intybus"),
    ("gemuese/schwarz-haferwurzel", "Scorzonera hispanica"),
    ("gemuese/winterpostelein", "Claytonia perfoliata"),
    ("gemuese/zucchini", "Cucurbita pepo"),
    ("gemuese/catalogna", "Cichorium intybus"),
    ("gemuese/zichoriensalate", "Cichorium intybus"),
]
|
||||
|
||||
# Herb listing pages, as (url_path, default_species_scientific_name) pairs.
HERB_CATEGORIES = [
    ("kraeuter/basilikum", "Ocimum basilicum"),
    ("kraeuter/bohnenkraut", "Satureja hortensis"),
    ("kraeuter/borretsch", "Borago officinalis"),
    ("kraeuter/dill", "Anethum graveolens"),
    ("kraeuter/kuemmel", "Carum carvi"),
    ("kraeuter/kerbel", "Anthriscus cerefolium"),
    ("kraeuter/koriander", "Coriandrum sativum"),
    ("kraeuter/gewuerzfenchel", "Foeniculum vulgare"),
    ("kraeuter/kultursauerampfer", "Rumex acetosa"),
    ("kraeuter/lavendel", "Lavandula angustifolia"),
    ("kraeuter/liebstock", "Levisticum officinale"),
    ("kraeuter/majoran", "Origanum majorana"),
    ("kraeuter/oregano", "Origanum vulgare"),
    ("kraeuter/pimpinelle", "Sanguisorba minor"),
    ("kraeuter/estragon", "Artemisia dracunculus"),
    ("kraeuter/salbei", "Salvia officinalis"),
    ("kraeuter/schnittlauch", "Allium schoenoprasum"),
    ("kraeuter/schnittknoblauch", "Allium tuberosum"),
    ("kraeuter/schwarzkuemmel", "Nigella sativa"),
    ("kraeuter/speisechrysantheme", "Glebionis coronaria"),
    ("kraeuter/thymian", "Thymus vulgaris"),
    ("kraeuter/ysop", "Hyssopus officinalis"),
    ("kraeuter/winterkresse", "Barbarea vulgaris"),
    ("kraeuter/brunnenkresse", "Nasturtium officinale"),
    ("kraeuter/melisse", "Melissa officinalis"),
    ("kraeuter/petersilie", "Petroselinum crispum"),
    ("kraeuter/schnittsellerie", "Apium graveolens"),
    ("kraeuter/beifuss", "Artemisia vulgaris"),
]
|
||||
|
||||
# Green manure has no default species (None): such a product is skipped
# unless its detail page yields a Latin name.
GREEN_MANURE_CATEGORIES = [
    ("gruenduengung", None),
]

# Every category to scrape: vegetables, then herbs, then green manure.
ALL_CATEGORIES = VEGETABLE_CATEGORIES + HERB_CATEGORIES + GREEN_MANURE_CATEGORIES
|
||||
|
||||
# ── Stats ─────────────────────────────────────────────────────────────────
# Run-wide counters and message lists, mutated in place by the scraping and
# import functions below.
stats = {
    "categories_scraped": 0,  # category pages fetched and parsed
    "products_found": 0,  # product links found on category pages
    "detail_pages_fetched": 0,  # product detail pages requested
    "cultivars_created": 0,
    "cultivars_existed": 0,
    "supplier_links_created": 0,
    "supplier_links_existed": 0,
    "species_created": 0,  # species records newly posted to HerbAPI
    "families_created": 0,  # family records newly posted to HerbAPI
    # Latin names with an unknown genus, or product names without any species
    "species_not_matched": [],
    # Human-readable messages for failed creations
    "errors": [],
}
|
||||
|
||||
|
||||
# ── HTTP helpers ──────────────────────────────────────────────────────────
|
||||
def fetch_page(url: str) -> str:
    """Download *url* and return the body decoded as UTF-8 text.

    The request carries the scraper's User-Agent and times out after 30
    seconds. A 404 response yields an empty string; every other HTTP error
    is re-raised. Undecodable bytes are replaced rather than raising.
    """
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            payload = response.read()
            return payload.decode("utf-8", errors="replace")
    except urllib.error.HTTPError as err:
        if err.code != 404:
            raise
        return ""
|
||||
|
||||
|
||||
def api_get(path: str, params: dict = None) -> dict:
    """Send an authenticated GET to HerbAPI and return the decoded JSON.

    *path* is appended to API_BASE; *params*, when non-empty, is URL-encoded
    into the query string. The request times out after 30 seconds.
    """
    query = f"?{urllib.parse.urlencode(params)}" if params else ""
    request = urllib.request.Request(
        f"{API_BASE}{path}{query}",
        headers={
            "Authorization": f"Bearer {API_TOKEN}",
            "Accept": "application/json",
        },
    )
    with urllib.request.urlopen(request, timeout=30) as response:
        raw = response.read()
        return json.loads(raw)
|
||||
|
||||
|
||||
def api_post(path: str, data: dict) -> tuple:
    """Send *data* as JSON to HerbAPI with POST.

    Returns ``(response_dict, status_code)``. An HTTP error does not raise:
    it is returned as ``({"error": <body text>, "_status": <code>}, <code>)``
    so callers can inspect the failure.
    """
    request = urllib.request.Request(
        f"{API_BASE}{path}",
        data=json.dumps(data).encode("utf-8"),
        method="POST",
        headers={
            "Authorization": f"Bearer {API_TOKEN}",
            "Content-Type": "application/json",
            "Accept": "application/json",
        },
    )
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            return json.loads(response.read()), response.status
    except urllib.error.HTTPError as err:
        failure_text = err.read().decode("utf-8", errors="replace")
        return {"error": failure_text, "_status": err.code}, err.code
|
||||
|
||||
|
||||
# ── HTML parsing helpers ──────────────────────────────────────────────────
|
||||
def parse_product_links(html: str) -> list:
    """Collect ``(url, name)`` pairs for the products on a listing page.

    Anchors carrying the ``product-item-link`` class are tried first. Only
    when none of them yields a link is a broader pattern for product detail
    URLs used. Relative URLs are prefixed with SITE_BASE, and the result is
    de-duplicated by URL, keeping the first name seen for each.
    """
    def absolute(href):
        return href if href.startswith("http") else SITE_BASE + href

    # Magento product-item-link pattern
    item_link = re.compile(
        r'<a[^>]+href="([^"]*?/de/bio-saatgut/[^"]+?)"[^>]*class="[^"]*product-item-link[^"]*"[^>]*>\s*(.*?)\s*</a>',
        re.DOTALL | re.IGNORECASE
    )
    found = []
    for hit in item_link.finditer(html):
        label = re.sub(r'<[^>]+>', '', hit.group(2)).strip()
        if label:
            found.append((absolute(hit.group(1)), label))

    if not found:
        # Broader pattern for product detail links
        detail_link = re.compile(
            r'href="([^"]*?/de/bio-saatgut/(?:gemuese|kraeuter|gruenduengung)/[^"]+?/[^"/.]+)"[^>]*>\s*([^<]{3,})',
            re.IGNORECASE
        )
        visited = set()
        for hit in detail_link.finditer(html):
            href = hit.group(1).strip()
            label = hit.group(2).strip()
            if not label or href in visited or href.endswith(".html"):
                continue
            visited.add(href)
            found.append((absolute(href), label))

    # Deduplicate by URL; dicts keep insertion order, so the first name wins.
    first_by_url = {}
    for href, label in found:
        first_by_url.setdefault(href, label)
    return list(first_by_url.items())
|
||||
|
||||
|
||||
def extract_latin_from_detail(html: str) -> Optional[str]:
    """Extract the Latin/botanical name from a product detail page.

    Three places are tried in order: an italic element (``<em>``/``<i>``)
    holding a binomial with an optional ``var.``/``subsp.`` epithet, an
    element whose class mentions botanical/latin/species, and text following
    a "Botanischer Name" / "Lateinischer Name" / "Art" label.

    Returns the matched name, or None when no candidate is found.
    """
    # The binomial itself is matched case-sensitively (capitalised genus,
    # lower-case epithet). Only tag names, attribute names and labels are
    # case-insensitive, via scoped flags. With a global IGNORECASE the first
    # italic phrase of any casing was picked, rejected afterwards, and a
    # genuine name further down the page was never reached.
    binomial = r'[A-Z][a-z]+\s+[a-z]{2,}'
    patterns = (
        # \b keeps <i ...> from also matching tags such as <img ...>.
        r'<(?i:em|i)\b[^>]*>\s*(' + binomial
        + r'(?:\s+(?:var\.|subsp\.)\s+[a-z]+)?)\s*</(?i:em|i)>',
        r'(?i:class)="[^"]*(?i:botanical|latin|species)[^"]*"[^>]*>\s*(' + binomial + r')',
        r'(?i:Botanischer?\s+Name|Lateinischer?\s+Name|Art)\s*:?\s*(?:<[^>]+>)*\s*('
        + binomial + r')',
    )
    for pattern in patterns:
        found = re.search(pattern, html)
        if found:
            return found.group(1).strip()
    return None
|
||||
|
||||
|
||||
def extract_description_from_detail(html: str) -> str:
    """Return the product description found on a detail page, or "".

    Several container patterns are tried in turn. The first one whose text,
    with tags removed and whitespace collapsed, is longer than 20 characters
    wins. The result is capped at 2000 characters.
    """
    containers = (
        r'<div[^>]*class="[^"]*product[- ]description[^"]*"[^>]*>(.*?)</div>',
        r'<div[^>]*class="[^"]*beschreibung[^"]*"[^>]*>(.*?)</div>',
        r'data-content-type="description"[^>]*>(.*?)</div>',
    )
    search_flags = re.DOTALL | re.IGNORECASE
    for container in containers:
        found = re.search(container, html, search_flags)
        if found is None:
            continue
        without_tags = re.sub(r'<[^>]+>', ' ', found.group(1))
        cleaned = re.sub(r'\s+', ' ', without_tags).strip()
        if len(cleaned) > 20:
            return cleaned[:2000]
    return ""
|
||||
|
||||
|
||||
def extract_article_number(product_name: str, url: str) -> Optional[str]:
    """Return the article number of a product, or None.

    A parenthesised code in the product name such as ``(G 802)`` is preferred
    and returned without spaces. Otherwise a trailing ``-g802`` style suffix
    of the last URL path segment is used, upper-cased.
    """
    in_name = re.search(r'\(([A-Z]\s*\d+[A-Z]?)\)', product_name)
    if in_name is not None:
        return in_name.group(1).replace(" ", "")

    last_segment = url.rstrip("/").rsplit("/", 1)[-1]
    in_slug = re.search(r'-([a-z]\d+[a-z]?)$', last_segment, re.IGNORECASE)
    return in_slug.group(1).upper() if in_slug else None
|
||||
|
||||
|
||||
# German vegetable/herb type prefixes removed from product names. Each
# pattern is anchored at the start of the name and matched case-insensitively.
_TYPE_PREFIX_PATTERNS = tuple(
    re.compile(r'^' + fragment, re.IGNORECASE)
    for fragment in (
        # Tomatoes
        r'(?:Normal(?:früchtige)?|Fleisch|Cherry|Balkon|Wild|Freiland|Roma|Ochsenherz|'
        r'Cocktail|Dattel|Mini|Snack|Stab|Busch|Salat|Zwerg)[\s-]*[Tt]omate\s+',
        # Beans
        r'(?:Busch|Stangen|Dicke|Feuer|Spaghetti)[\s-]*[Bb]ohne\s+',
        r'Edamame(?:-Sojabohne)?\s+',
        # Peas
        r'(?:Mark|Schal|Zucker|Pal)[\s-]*[Ee]rbse\s+',
        # Cucurbits
        r'(?:Salat|Einlege|Gewürz|Freiland|Schlangen)[\s-]*[Gg]urke\s+',
        r'Zucchini\s+',
        r'Kürbis\s+',
        r'(?:Wasser)?[Mm]elone\s+',
        # Brassicas
        r'(?:Blumen|Grün|Rot|Weiß|Rosen)[\s-]*[Kk]ohl\s+',
        r'Kohlrabi\s+',
        r'Wirsing\s+',
        r'Brokkoli\s+',
        r'Chinakohl\s+',
        r'Pak\s+Choi\s+',
        r'Kohlrübe\s+',
        r'Mai-/Herbstrüben?(?:/Navets)?\s+',
        # Root vegetables
        r'Möhre\s+',
        r'Karotten?(?:\s*-?\s*Mix)?\s+',
        r'Pastinake\s+',
        r'Radies(?:chen)?\s+',
        r'Rettich\s+',
        r'Schwarzwurzel\s+',
        r'Haferwurzel\s+',
        r'Petersilienwurzel\s+',
        # Beets
        r'(?:Rote|Gelbe|Weiße)\s+Bete?\s+',
        r'Mangold\s+',
        # Lettuce & leafy
        r'(?:Kopf|Eichblatt|Batavia|Eis|Lollo|Romana|Baby-Leaf)[\s-]*[Ss]alat\s+',
        r'Feldsalat\s+',
        r'Endivie\s+',
        r'Asia[\s-]*Salat\s+',
        r'Spinat\s+',
        # Alliums
        r'Zwiebel\s+',
        r'Lauchzwiebel\s+',
        r'Porree(?:/Lauch)?\s+',
        r'Schnittlauch\s+',
        r'Schnittknoblauch\s+',
        # Peppers
        r'(?:Gemüse|Block|Spitz|Papier)[\s-]*[Pp]aprika\s+',
        r'Chili\s+',
        # Celery
        r'(?:Knollen|Stangen|Bleich|Schnitt)[\s-]*[Ss]ellerie\s+',
        # Herbs
        r'Basilikum\s+',
        r'Koriander\s+',
        r'Dill\s+',
        r'Petersilie\s+',
        r'(?:Knollen|Gewürz)[\s-]*[Ff]enchel\s+',
        r'Salbei\s+',
        r'Thymian\s+',
        r'Oregano\s+',
        r'Lavendel\s+',
        r'Melisse\s+',
        r'Majoran\s+',
        r'Estragon\s+',
        r'Kresse\s+',
        r'Bohnenkraut\s+',
        r'Borretsch\s+',
        r'Kümmel\s+',
        r'Kerbel\s+',
        r'Liebstock\s+',
        r'Ysop\s+',
        r'Pimpinelle\s+',
        r'Beifuß\s+',
        r'Schwarzkümmel\s+',
        # Other
        r'Zuckermais\s+',
        r'Artischocke\s+',
        r'Physalis\s+',
        r'Aubergine\s+',
        r'Catalogna\s+',
    )
)


def extract_variety_name(product_name: str) -> str:
    """Reduce a full product name to its variety/cultivar name.

    A trailing article number such as ``(G802)`` is dropped first. Every
    known type prefix is then stripped in table order, so several prefixes
    in a row are all removed. Finally surrounding whitespace and quote
    characters are trimmed.
    """
    variety = re.sub(r'\s*\([A-Z]\s*\d+[A-Z]?\)\s*$', '', product_name.strip())
    for type_prefix in _TYPE_PREFIX_PATTERNS:
        variety = type_prefix.sub('', variety)
    return variety.strip().strip("'\"")
|
||||
|
||||
|
||||
# ── API data caches ───────────────────────────────────────────────────────
# In-memory copies of HerbAPI records, filled by load_api_data() and kept up
# to date when records are created during the run.
species_cache = {}  # scientific_name_lower -> {id, name_scientific, ...}
family_cache = {}  # name_scientific_lower -> {id, name_scientific}
cultivar_cache = {}  # slug -> {id, name, species_id, ...}
supplier_id = None  # ID of the Bingenheimer supplier; set by load_api_data()
|
||||
|
||||
|
||||
def _fetch_collection(path: str, per_page: int = 100) -> list:
    """Return every record of a paginated HerbAPI collection.

    Pages are requested until one comes back with fewer than *per_page*
    records.
    """
    records = []
    page = 1
    while True:
        batch = api_get(path, {"per_page": per_page, "page": page})["data"]
        records.extend(batch)
        if len(batch) < per_page:
            return records
        page += 1


def load_api_data():
    """Load all existing data from HerbAPI for matching.

    Fills ``family_cache``, ``species_cache`` and ``cultivar_cache`` and sets
    the module-level ``supplier_id`` to the Bingenheimer supplier, creating
    that supplier when none exists. Exits the process with status 1 when the
    supplier cannot be created.
    """
    global supplier_id

    print("Loading existing HerbAPI data...")

    for family in _fetch_collection("/families"):
        family_cache[family["name_scientific"].lower()] = family
    print(f" Loaded {len(family_cache)} families")

    for species in _fetch_collection("/species"):
        species_cache[species["name_scientific"].lower()] = species
    print(f" Loaded {len(species_cache)} species")

    # Load ALL cultivars (slug + id + name + species_id)
    for cultivar in _fetch_collection("/cultivars"):
        cultivar_cache[cultivar["slug"]] = {
            "id": cultivar["id"],
            "name": cultivar["name"],
            "species_id": cultivar["species_id"],
        }
    print(f" Loaded {len(cultivar_cache)} cultivars")

    # Create or find Bingenheimer supplier. The listing is accepted both as a
    # bare list and as a {"data": [...]} envelope like the other collections;
    # iterating an envelope dict directly would yield its keys.
    suppliers = api_get("/suppliers")
    if isinstance(suppliers, dict) and "data" in suppliers:
        suppliers = suppliers["data"]
    for s in suppliers:
        if "bingenheimer" in s["name"].lower():
            supplier_id = s["id"]
            print(f" Found existing supplier: {s['name']} ({s['id']})")
            break

    if not supplier_id:
        print(" Creating Bingenheimer Saatgut supplier...")
        s, _status = api_post("/suppliers", {
            "name": "Bingenheimer Saatgut",
            "url": "https://www.bingenheimersaatgut.de",
            "country": "DE",
            "is_organic": True,
            "is_demeter": True,
            "notes": "German biodynamic seed company, Demeter certified, open-pollinated varieties"
        })
        if "id" in s:
            supplier_id = s["id"]
            print(f" Created supplier: {s['id']}")
        else:
            print(f" ERROR creating supplier: {s}")
            sys.exit(1)
|
||||
|
||||
|
||||
# Names that are filed under another species already present in HerbAPI.
# NOTE(review): "allium porrum"/"allium ampeloprasum" -> "allium cepa" and
# "origanum majorana" -> "origanum vulgare" map distinct species onto each
# other rather than true synonyms — confirm this is intended.
_SPECIES_SYNONYMS = {
    "lycopersicon esculentum": "solanum lycopersicum",
    "capsicum annuum var. annuum": "capsicum annuum",
    "brassica oleracea var. botrytis": "brassica oleracea",
    "brassica oleracea var. italica": "brassica oleracea",
    "brassica oleracea var. gemmifera": "brassica oleracea",
    "brassica oleracea var. gongylodes": "brassica oleracea",
    "brassica oleracea var. capitata": "brassica oleracea",
    "brassica oleracea var. sabauda": "brassica oleracea",
    "brassica oleracea var. sabellica": "brassica oleracea",
    "brassica rapa var. rapa": "brassica rapa",
    "brassica rapa subsp. pekinensis": "brassica rapa",
    "brassica rapa subsp. chinensis": "brassica rapa",
    "beta vulgaris var. conditiva": "beta vulgaris",
    "beta vulgaris subsp. vulgaris": "beta vulgaris",
    "beta vulgaris var. vulgaris": "beta vulgaris",
    "allium porrum": "allium cepa",
    "allium ampeloprasum": "allium cepa",
    "origanum majorana": "origanum vulgare",
    "cichorium intybus var. foliosum": "cichorium intybus",
    "petroselinum crispum var. tuberosum": "petroselinum crispum",
    "apium graveolens var. rapaceum": "apium graveolens",
    "apium graveolens var. dulce": "apium graveolens",
    "lactuca sativa var. capitata": "lactuca sativa",
    "lactuca sativa var. crispa": "lactuca sativa",
    "lactuca sativa var. longifolia": "lactuca sativa",
}

# Genus -> plant family, used to pick the family of a newly created species.
_GENUS_TO_FAMILY = {
    "Solanum": "Solanaceae", "Capsicum": "Solanaceae", "Physalis": "Solanaceae",
    "Nicandra": "Solanaceae",
    "Cucumis": "Cucurbitaceae", "Cucurbita": "Cucurbitaceae", "Citrullus": "Cucurbitaceae",
    "Phaseolus": "Fabaceae", "Pisum": "Fabaceae", "Vicia": "Fabaceae",
    "Glycine": "Fabaceae", "Lens": "Fabaceae", "Lupinus": "Fabaceae",
    "Trifolium": "Fabaceae", "Medicago": "Fabaceae", "Vigna": "Fabaceae",
    "Brassica": "Brassicaceae", "Raphanus": "Brassicaceae", "Eruca": "Brassicaceae",
    "Lepidium": "Brassicaceae", "Nasturtium": "Brassicaceae", "Barbarea": "Brassicaceae",
    "Sinapis": "Brassicaceae", "Crambe": "Brassicaceae", "Diplotaxis": "Brassicaceae",
    "Allium": "Amaryllidaceae",
    "Daucus": "Apiaceae", "Petroselinum": "Apiaceae", "Apium": "Apiaceae",
    "Foeniculum": "Apiaceae", "Pastinaca": "Apiaceae", "Coriandrum": "Apiaceae",
    "Anethum": "Apiaceae", "Levisticum": "Apiaceae", "Anthriscus": "Apiaceae",
    "Carum": "Apiaceae", "Myrrhis": "Apiaceae", "Pimpinella": "Apiaceae",
    "Sanguisorba": "Rosaceae",
    "Lactuca": "Asteraceae", "Cichorium": "Asteraceae", "Cynara": "Asteraceae",
    "Helianthus": "Asteraceae", "Calendula": "Asteraceae", "Tagetes": "Asteraceae",
    "Scorzonera": "Asteraceae", "Tragopogon": "Asteraceae", "Glebionis": "Asteraceae",
    "Artemisia": "Asteraceae",
    "Beta": "Chenopodiaceae", "Spinacia": "Chenopodiaceae",
    "Atriplex": "Chenopodiaceae", "Chenopodium": "Chenopodiaceae",
    "Ocimum": "Lamiaceae", "Origanum": "Lamiaceae", "Thymus": "Lamiaceae",
    "Salvia": "Lamiaceae", "Melissa": "Lamiaceae", "Lavandula": "Lamiaceae",
    "Satureja": "Lamiaceae", "Hyssopus": "Lamiaceae", "Rosmarinus": "Lamiaceae",
    "Mentha": "Lamiaceae",
    "Zea": "Poaceae",
    "Borago": "Boraginaceae", "Phacelia": "Boraginaceae",
    "Valerianella": "Caprifoliaceae",
    "Tropaeolum": "Tropaeolaceae",
    "Rumex": "Polygonaceae",
    "Nigella": "Ranunculaceae",
    "Claytonia": "Montiaceae",
    "Tetragonia": "Aizoaceae",
    "Basella": "Basellaceae",
    "Plantago": "Plantaginaceae",
}


def _reload_species_cache() -> None:
    """Re-read every species page from HerbAPI into ``species_cache``."""
    page = 1
    while True:
        batch = api_get("/species", {"per_page": 100, "page": page})["data"]
        for record in batch:
            species_cache[record["name_scientific"].lower()] = record
        if len(batch) < 100:
            return
        page += 1


def find_or_create_species(latin_name: str) -> Optional[str]:
    """Find a species by Latin name or create it. Returns the species ID.

    Lookup order: exact name, genus + epithet only (drops var./subsp.), the
    synonym table. If nothing matches, the species is created under the
    family mapped from its genus.

    Returns None for an empty or whitespace-only name, an unknown genus
    (recorded in ``stats["species_not_matched"]``), an unresolvable family,
    or a creation that failed (recorded in ``stats["errors"]``).
    """
    # Strip once, so lookups, the posted name and the cache key all agree.
    # A whitespace-only name would otherwise fail on split()[0] below.
    latin_name = latin_name.strip() if latin_name else ""
    if not latin_name:
        return None

    key = latin_name.lower()

    # Direct match
    if key in species_cache:
        return species_cache[key]["id"]

    # Try without subspecies/variety
    base = " ".join(key.split()[:2])
    if base in species_cache:
        return species_cache[base]["id"]

    # Handle synonyms
    accepted = _SPECIES_SYNONYMS.get(key)
    if accepted is not None and accepted in species_cache:
        return species_cache[accepted]["id"]

    # Try to create the species
    genus = latin_name.split()[0]
    family_name = _GENUS_TO_FAMILY.get(genus)
    if not family_name:
        print(f" WARNING: Unknown genus '{genus}' for species '{latin_name}'")
        stats["species_not_matched"].append(latin_name)
        return None

    family_id = find_or_create_family(family_name)
    if not family_id:
        return None

    print(f" Creating species: {latin_name}")
    resp, code = api_post("/species", {
        "name_scientific": latin_name,
        "family_id": family_id,
    })
    if "id" in resp:
        species_cache[key] = resp
        stats["species_created"] += 1
        return resp["id"]

    # Might already exist, reload. str() guards against a non-string error.
    print(f" Species creation returned {code}: {str(resp.get('error', ''))[:100]}")
    _reload_species_cache()
    if key in species_cache:
        return species_cache[key]["id"]
    stats["errors"].append(f"Species creation failed: {latin_name}")
    return None
|
||||
|
||||
|
||||
def find_or_create_family(family_name: str) -> Optional[str]:
    """Find or create a plant family. Returns the family ID.

    The lower-cased name is looked up in ``family_cache`` first. On a miss
    the family is posted to HerbAPI. If that does not return an ID, the
    cache is reloaded and checked once more before the failure is recorded
    in ``stats["errors"]`` and None is returned.
    """
    key = family_name.lower()
    if key in family_cache:
        return family_cache[key]["id"]

    print(f" Creating family: {family_name}")
    resp, _status = api_post("/families", {"name_scientific": family_name})
    if "id" in resp:
        family_cache[key] = resp
        stats["families_created"] += 1
        return resp["id"]

    # Reload every page, the same way the initial load does; a single
    # oversized page could miss families beyond the first page.
    page = 1
    while True:
        batch = api_get("/families", {"per_page": 100, "page": page})["data"]
        for family in batch:
            family_cache[family["name_scientific"].lower()] = family
        if len(batch) < 100:
            break
        page += 1

    if key in family_cache:
        return family_cache[key]["id"]
    stats["errors"].append(f"Family creation failed: {family_name}")
    return None
|
||||
|
||||
|
||||
# Accented characters and ligatures mapped to their ASCII stand-ins.
_SLUG_TRANSLITERATION = str.maketrans({
    "ä": "a", "ö": "o", "ü": "u", "ß": "ss",
    "é": "e", "è": "e", "ê": "e", "ë": "e",
    "à": "a", "â": "a", "á": "a",
    "ô": "o", "ù": "u", "û": "u", "ú": "u",
    "ï": "i", "î": "i", "í": "i",
    "ç": "c", "ñ": "n", "ó": "o",
    "œ": "oe", "æ": "ae",
})


def slugify(text: str) -> str:
    """Turn *text* into a lower-case, hyphen-separated, URL-safe slug.

    Known accented characters are transliterated, every other character
    outside ``a-z``, ``0-9``, whitespace and ``-`` is dropped, whitespace
    runs become a single hyphen and repeated or outer hyphens are removed.
    """
    ascii_text = text.lower().translate(_SLUG_TRANSLITERATION)
    kept = re.sub(r'[^a-z0-9\s-]', '', ascii_text).strip()
    hyphenated = re.sub(r'[\s]+', '-', kept)
    return re.sub(r'-+', '-', hyphenated).strip('-')
|
||||
|
||||
|
||||
def find_existing_cultivar(species_name: str, variety_name: str, species_id: str) -> Optional[str]:
    """Return the ID of an already known cultivar, or None.

    The cultivar is looked up by the slug built from species and variety
    name first. Failing that, the cache is searched for a cultivar of the
    same species whose name matches case-insensitively.
    """
    by_slug = cultivar_cache.get(slugify(f"{species_name} {variety_name}"))
    if by_slug is not None:
        return by_slug["id"]

    wanted = variety_name.lower()
    return next(
        (
            entry["id"]
            for entry in cultivar_cache.values()
            if entry["species_id"] == species_id and entry["name"].lower() == wanted
        ),
        None,
    )
|
||||
|
||||
|
||||
def scrape_category(cat_path: str, default_species: Optional[str]):
    """Fetch one category listing page and process every product on it.

    A category whose page comes back empty is reported and skipped.
    Otherwise the product counters in ``stats`` are updated and each product
    link is handed to ``process_product`` together with *default_species*.
    """
    listing_url = f"{SITE_BASE}/de/bio-saatgut/{cat_path}.html"
    print(f"\n{'='*60}")
    print(f"Category: {cat_path}")

    listing_html = fetch_page(listing_url)
    if not listing_html:
        print(" SKIP: Page not found (404)")
        return

    time.sleep(DELAY)

    entries = parse_product_links(listing_html)
    entry_count = len(entries)
    print(f" Found {entry_count} products")
    stats["products_found"] += entry_count
    stats["categories_scraped"] += 1

    for entry_url, entry_name in entries:
        process_product(entry_url, entry_name, default_species)
|
||||
|
||||
|
||||
def process_product(prod_url: str, prod_name: str, default_species: Optional[str]):
    """Process a single product: fetch detail, extract data, create cultivar.

    Steps, in order:

    1. Derive the article number and variety name from the listing entry.
    2. Skip entries without a variety name and entries that look like
       mixes/sets.
    3. Fetch the detail page to obtain the Latin name and a description.
    4. Resolve the species (Latin name, else ``default_species``).
    5. Reuse an existing cultivar or create a new one through the API.
    6. Link the cultivar to the supplier (module-level ``supplier_id``).

    Args:
        prod_url: URL of the product detail page.
        prod_name: Product title as shown in the category listing.
        default_species: Species name used when the detail page yields no
            Latin name; may be ``None``.

    Returns nothing. Progress is printed and the module-level ``stats`` and
    ``cultivar_cache`` are updated as a side effect.
    """
    article_number = extract_article_number(prod_name, prod_url)
    variety_name = extract_variety_name(prod_name)

    if not variety_name:
        print(f" SKIP (no variety): {prod_name}")
        return

    # Skip mixes, sets, bundles
    skip_keywords = ["mischung", "saatscheibe", "saatband", "saatplatte",
                     "saat-set", " mix ", "trio ", "quartett", "gutschein",
                     "buch ", "düngung", "erde ", "-garten"]
    name_lower = prod_name.lower()
    # Exception: if the variety name itself is the whole thing, keep it
    if any(kw in name_lower for kw in skip_keywords) and variety_name.lower() != name_lower:
        # Only skip if it really seems like a mix
        if "mischung" in name_lower or "mix" in name_lower or "trio" in name_lower:
            print(f" SKIP (mix/set): {prod_name}")
            return

    print(f"\n Product: {prod_name}")
    print(f" Variety: {variety_name}, SKU: {article_number}")

    # Fetch detail page; a failure here is non-fatal because the category's
    # default species can still be used.
    latin_name = None
    description = ""
    time.sleep(DELAY)
    try:
        detail_html = fetch_page(prod_url)
        stats["detail_pages_fetched"] += 1
        if detail_html:
            latin_name = extract_latin_from_detail(detail_html)
            description = extract_description_from_detail(detail_html)
    except Exception as e:
        print(f" WARNING: Detail page error: {e}")

    species_name = latin_name or default_species
    if not species_name:
        print(f" SKIP: No species for '{prod_name}'")
        stats["species_not_matched"].append(prod_name)
        return

    print(f" Species: {species_name}")

    species_id = find_or_create_species(species_name)
    if not species_id:
        print(f" SKIP: Could not resolve species '{species_name}'")
        return

    # Check if cultivar already exists
    existing_id = find_existing_cultivar(species_name, variety_name, species_id)

    cultivar_id = None

    if existing_id:
        cultivar_id = existing_id
        print(f" EXISTS: cultivar already in DB")
        stats["cultivars_existed"] += 1
    else:
        # Create cultivar
        data = {
            "species_id": species_id,
            "name": variety_name,
            "name_de": variety_name,
            "is_organic": True,
        }
        if description:
            data["description"] = description

        resp, code = api_post("/cultivars", data)

        if "id" in resp:
            cultivar_id = resp["id"]
            cultivar_cache[resp["slug"]] = {
                "id": resp["id"],
                "name": variety_name,
                "species_id": species_id,
            }
            stats["cultivars_created"] += 1
            print(f" CREATED: {resp['slug']}")
        elif code == 500 and "Database error" in str(resp.get("error", "")):
            # Likely slug conflict - try to find existing
            print(f" DB conflict - searching for existing cultivar...")
            # Page through all cultivars, refreshing the cache along the way,
            # until a name match within this species is found or the last
            # (short) page is reached.
            page = 1
            while True:
                r = api_get("/cultivars", {"per_page": 100, "page": page})
                for c in r["data"]:
                    cultivar_cache[c["slug"]] = {
                        "id": c["id"],
                        "name": c["name"],
                        "species_id": c["species_id"],
                    }
                    if c["species_id"] == species_id and c["name"].lower() == variety_name.lower():
                        cultivar_id = c["id"]
                if cultivar_id or len(r["data"]) < 100:
                    break
                page += 1

            if cultivar_id:
                print(f" Found existing after conflict: {cultivar_id}")
                stats["cultivars_existed"] += 1
            else:
                print(f" ERROR: DB error and could not find existing cultivar")
                stats["errors"].append(f"DB error + not found: {species_name} / {variety_name}")
                return
        else:
            # The error payload is stringified before slicing: it is not
            # guaranteed to be a string, and slicing e.g. a dict would raise.
            error_text = str(resp.get("error", ""))
            print(f" ERROR ({code}): {error_text[:100]}")
            stats["errors"].append(f"Create failed: {variety_name}: {error_text[:80]}")
            return

    # Link to supplier
    if cultivar_id and supplier_id:
        link_data = {
            "supplier_id": supplier_id,
            "product_url": prod_url,
        }
        if article_number:
            link_data["article_number"] = article_number

        resp, code = api_post(f"/cultivars/{cultivar_id}/suppliers", link_data)

        if "id" in resp:
            stats["supplier_links_created"] += 1
            print(f" LINKED (SKU: {article_number})")
        elif code == 500 or "already" in str(resp.get("error", "")).lower():
            # A 500 on this endpoint is treated as "link already present".
            stats["supplier_links_existed"] += 1
            print(f" LINK EXISTS")
        else:
            error_text = str(resp.get("error", ""))
            print(f" LINK ERROR ({code}): {error_text[:80]}")
            stats["errors"].append(f"Link failed: {variety_name}: {error_text[:60]}")
|
||||
|
||||
|
||||
def main():
    """Run the Bingenheimer scrape over all categories and print a summary.

    Returns:
        ``0`` when no errors were recorded in ``stats["errors"]``, else ``1``.
    """
    rule = "=" * 60
    print(rule)
    print("Bingenheimer Saatgut Scraper for HerbAPI")
    print(rule)

    load_api_data()

    print(f"\nScraping {len(ALL_CATEGORIES)} categories...")

    for cat_path, default_species in ALL_CATEGORIES:
        # One failing category must not abort the remaining ones.
        try:
            scrape_category(cat_path, default_species)
        except Exception as e:
            print(f" ERROR in category {cat_path}: {e}")
            stats["errors"].append(f"Category error: {cat_path}: {e}")

    # Summary
    print("\n" + rule)
    print("SCRAPING COMPLETE - SUMMARY")
    print(rule)
    counter_lines = (
        ("Categories scraped:", "categories_scraped"),
        ("Products found:", "products_found"),
        ("Detail pages fetched:", "detail_pages_fetched"),
        ("Cultivars created:", "cultivars_created"),
        ("Cultivars existed:", "cultivars_existed"),
        ("Supplier links created:", "supplier_links_created"),
        ("Supplier links existed:", "supplier_links_existed"),
        ("Species created:", "species_created"),
        ("Families created:", "families_created"),
    )
    for label, key in counter_lines:
        print(f"{label} {stats[key]}")
    print(f"Errors: {len(stats['errors'])}")

    # Both detail lists are capped at 30 entries to keep the output readable.
    unmatched = stats["species_not_matched"]
    if unmatched:
        print(f"\nUnmatched species ({len(unmatched)}):")
        for entry in unmatched[:30]:
            print(f" - {entry}")

    failures = stats["errors"]
    if failures:
        print(f"\nErrors ({len(failures)}):")
        for entry in failures[:30]:
            print(f" - {entry}")

    return 1 if stats["errors"] else 0


if __name__ == "__main__":
    sys.exit(main())
|
||||
@@ -0,0 +1,760 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Scraper for Dreschflegel organic seed catalog (dreschflegel-saatgut.de).
|
||||
Extracts cultivar data and imports into HerbAPI.
|
||||
|
||||
Run 2 - fixes pagination (API caps at 100/page), better species matching,
|
||||
caches scraped products, handles duplicates gracefully.
|
||||
"""
|
||||
|
||||
import urllib.request
|
||||
import urllib.parse
|
||||
import urllib.error
|
||||
import gzip
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
import sys
|
||||
import os
|
||||
import html as html_mod
|
||||
from collections import defaultdict
|
||||
|
||||
# --- Configuration ---
# NOTE(security): an API token is committed in this file. The values below
# can be overridden through the HERBAPI_BASE / HERBAPI_TOKEN environment
# variables; the literals remain only as backward-compatible defaults and the
# token should be rotated and removed from source control.
API_BASE = os.environ.get("HERBAPI_BASE", "http://herbapi01.corp.sub-net.at:8080/api/v1")
API_TOKEN = os.environ.get("HERBAPI_TOKEN", "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk")
SITE_BASE = "https://www.dreschflegel-saatgut.de"
DELAY = 0.5  # seconds to sleep between page fetches
USER_AGENT = "Mozilla/5.0 (compatible; HerbAPI-Scraper/1.0)"
CACHE_FILE = "/tmp/dreschflegel_products_cache.json"

# Unbuffered output. The streams may have been replaced by objects without
# ``reconfigure`` (e.g. when output is captured), so guard the call.
for _stream in (sys.stdout, sys.stderr):
    if hasattr(_stream, "reconfigure"):
        _stream.reconfigure(line_buffering=True)

# Run counters; missing keys start at 0.
stats = defaultdict(int)
|
||||
|
||||
|
||||
def api_request(method, path, data=None):
    """Make an API request to HerbAPI.

    Args:
        method: HTTP method, e.g. ``"GET"`` or ``"POST"``.
        path: Path appended to ``API_BASE`` (may include a query string).
        data: Optional JSON-serializable request body.

    Returns:
        The decoded JSON response, or ``None`` when the server answered with
        an HTTP error. Duplicate-like errors (409, "already exists",
        "duplicate", or a 500 "database error") are swallowed silently; other
        HTTP errors are printed first. Callers therefore cannot distinguish a
        duplicate from a failure by the return value alone.

    Non-HTTP failures (e.g. connection errors) are not caught here.
    """
    url = f"{API_BASE}{path}"
    # ``is not None`` so that an intentionally empty payload is still sent.
    body = json.dumps(data).encode("utf-8") if data is not None else None
    req = urllib.request.Request(url, data=body, method=method)
    req.add_header("Authorization", f"Bearer {API_TOKEN}")
    req.add_header("Content-Type", "application/json")
    req.add_header("Accept", "application/json")
    try:
        # The context manager closes the connection once the body is read.
        with urllib.request.urlopen(req) as resp:
            return json.loads(resp.read().decode("utf-8"))
    except urllib.error.HTTPError as e:
        body_text = e.read().decode("utf-8", errors="replace")
        lowered = body_text.lower()
        if e.code == 409 or "already exists" in lowered or "duplicate" in lowered:
            return None  # Duplicate, handled silently
        if e.code == 500 and "database error" in lowered:
            # Likely a unique constraint violation = duplicate
            return None
        print(f" API error {e.code} {method} {path}: {body_text[:200]}")
        return None
|
||||
|
||||
|
||||
def fetch_page(url):
    """Fetch a web page using the scraper's User-Agent.

    Args:
        url: Absolute URL to download.

    Returns:
        The response body decoded as UTF-8 (undecodable bytes replaced), or
        ``None`` if the request failed for any reason; the error is printed.

    Note: this function does not sleep; callers apply ``DELAY`` themselves.
    """
    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        # Close the response deterministically instead of leaking the socket.
        with urllib.request.urlopen(req, timeout=30) as resp:
            return resp.read().decode("utf-8", errors="replace")
    except Exception as e:
        print(f" Fetch error {url}: {e}")
        return None
|
||||
|
||||
|
||||
def get_sitemap_urls():
    """Collect every URL listed in the site's gzip-compressed sitemaps.

    The sitemap index at ``<SITE_BASE>/sitemap.xml`` is read first; each
    ``<loc>`` entry ending in ``.xml.gz`` is then downloaded, decompressed
    and scanned for ``<loc>`` entries. Other index entries are ignored.

    Returns:
        A list of URLs; empty when the index could not be fetched.
    """
    loc_pattern = r"<loc>(.*?)</loc>"

    print("Fetching sitemap index...")
    index_xml = fetch_page(f"{SITE_BASE}/sitemap.xml")
    if not index_xml:
        return []

    collected = []
    for child_url in re.findall(loc_pattern, index_xml):
        if not child_url.endswith(".xml.gz"):
            continue

        print(f" Fetching compressed sitemap...")
        request = urllib.request.Request(child_url, headers={"User-Agent": USER_AGENT})
        try:
            response = urllib.request.urlopen(request, timeout=30)
            xml_text = gzip.decompress(response.read()).decode("utf-8")
            found = re.findall(loc_pattern, xml_text)
            collected.extend(found)
            print(f" Found {len(found)} URLs")
        except Exception as e:
            # A broken child sitemap is reported and skipped.
            print(f" Error: {e}")

    return collected
|
||||
|
||||
|
||||
def classify_urls(urls):
    """Keep only URLs that look like product pages.

    A URL qualifies when, after removing a trailing slash and the site's
    base (with or without ``www.``), the remaining path is a single
    non-empty segment that does not start with one of the known
    non-product prefixes.

    Returns:
        The qualifying URLs (trailing slashes removed), in input order.
    """
    non_product_prefixes = (
        "impressum", "agb", "datenschutz", "kontakt", "widerrufs",
        "versand", "abkuerz", "zertifikat", "wichtige-hinweise",
        "muster-", "gutscheine", "kalender", "flyer", "katalog",
        "sommer-herbst", "unsere-hoefe", "bestellschein",
        "dreschflegel-news", "termine", "rezepte", "anbautipps",
        "tipps-zur", "gartentelefon", "gartenfreude", "buecher",
        "navigation", "vielfalt", "sut20", "saatgut",
        "neuheiten", "kennenlernangebote", "sut25", "vielfalt25",
        "saatgut-vielfalt", "saat",
    )
    site_roots = (
        "https://dreschflegel-saatgut.de/",
        "https://www.dreschflegel-saatgut.de/",
    )

    kept = []
    for raw_url in urls:
        trimmed = raw_url.rstrip("/")

        segment = trimmed
        for root in site_roots:
            segment = segment.replace(root, "")

        # Anything still containing a slash is either nested or off-site.
        single_segment = bool(segment) and "/" not in segment
        if single_segment and not segment.startswith(non_product_prefixes):
            kept.append(trimmed)
    return kept
|
||||
|
||||
|
||||
def parse_product_page(html_content):
    """Pull product fields out of a Dreschflegel product page.

    Args:
        html_content: Raw HTML of the page (may be ``None`` or empty).

    Returns:
        A dict with ``name`` and ``botanical_name`` plus, when present,
        ``article_number``, ``price`` (float), ``description`` and
        ``pack_info``. ``None`` when the page has no ``class="botname"``
        marker or either of the two mandatory fields is missing.
    """
    if not html_content or 'class="botname"' not in html_content:
        return None

    def capture(pattern, flags=0):
        """Return group 1 of the first match, or None when nothing matches."""
        found = re.search(pattern, html_content, flags)
        return None if found is None else found.group(1)

    fields = {}

    title = capture(r"<h1>(.*?)</h1>")
    if title is not None:
        fields["name"] = html_mod.unescape(title.strip())

    botanical = capture(r'<div class="botname">\s*(.*?)\s*</div>', re.DOTALL)
    if botanical is not None:
        fields["botanical_name"] = html_mod.unescape(botanical.strip())

    order_number = capture(
        r'class="product-detail-ordernumber"[^>]*>\s*(\d+)', re.DOTALL
    )
    if order_number is not None:
        fields["article_number"] = order_number

    raw_price = capture(r'itemprop="price"[^>]*content="([^"]+)"')
    if raw_price is not None:
        try:
            fields["price"] = float(raw_price)
        except ValueError:
            # Non-numeric price content: leave the field out.
            pass

    first_paragraph = capture(
        r"product-detail-description-text.*?<p>(.*?)</p>", re.DOTALL
    )
    if first_paragraph is not None:
        # Drop inline tags before decoding entities.
        plain = re.sub(r"<[^>]+>", "", first_paragraph.strip())
        plain = html_mod.unescape(plain).strip()
        if plain:
            fields["description"] = plain

    pack = capture(r"Inhalt reicht f[üu]r:</th><td>\s*(.*?)\s*</td>")
    if pack is not None:
        fields["pack_info"] = html_mod.unescape(pack.strip())

    if "name" in fields and "botanical_name" in fields:
        return fields
    return None
|
||||
|
||||
|
||||
def _write_product_cache(cache):
    """Persist the URL -> product (or None) cache to ``CACHE_FILE`` as JSON."""
    with open(CACHE_FILE, "w", encoding="utf-8") as f:
        json.dump(cache, f)


def scrape_all_products(candidate_urls):
    """Scrape product pages, using cache for already-scraped URLs.

    The cache maps each URL to its parsed product dict, or to ``None`` when
    the page was fetched successfully but is not a product page. URLs whose
    fetch failed are deliberately NOT cached, so a transient network error
    is retried on the next run instead of being remembered as "not a
    product".

    Args:
        candidate_urls: URLs that may be product pages.

    Returns:
        A list of product dicts (cached ones first, then newly scraped ones),
        each carrying its source ``url``.
    """
    # Load cache; an unreadable or corrupt file only costs a re-scrape.
    cache = {}
    if os.path.exists(CACHE_FILE):
        try:
            with open(CACHE_FILE, "r", encoding="utf-8") as f:
                cache = json.load(f)
        except (OSError, ValueError) as e:
            print(f" Ignoring unreadable cache {CACHE_FILE}: {e}")
            cache = {}
        else:
            print(f" Loaded {len(cache)} cached products")

    products = []
    to_fetch = [u for u in candidate_urls if u not in cache]
    already_cached = [u for u in candidate_urls if u in cache]

    # Add cached products
    for u in already_cached:
        if cache[u]:  # None means "not a product page"
            products.append(cache[u])

    cached_products = len(products)
    cached_non_products = len(already_cached) - cached_products
    print(f" {cached_products} products from cache, "
          f"{cached_non_products} non-products cached, "
          f"{len(to_fetch)} to fetch")

    for i, url in enumerate(to_fetch):
        if (i + 1) % 50 == 0 or i == 0:
            print(f" Fetching {i + 1}/{len(to_fetch)}...")

        time.sleep(DELAY)
        html_content = fetch_page(url)
        if not html_content:
            # Leave the URL out of the cache so it is retried next run.
            stats["fetch_errors"] += 1
            continue

        product = parse_product_page(html_content)
        if product:
            product["url"] = url
            products.append(product)
            cache[url] = product
            stats["products_scraped"] += 1
        else:
            cache[url] = None
            stats["not_product_pages"] += 1

        # Save cache periodically so an interrupted run keeps its progress
        if (i + 1) % 100 == 0:
            _write_product_cache(cache)

    # Final cache save
    _write_product_cache(cache)

    print(f" Total: {len(products)} products ({stats['products_scraped']} newly scraped)")
    return products
|
||||
|
||||
|
||||
def paginated_get(path):
    """Collect the ``data`` items of every page of a paginated endpoint.

    Pages of 100 items are requested starting at page 1. Paging stops at the
    first failed/empty response or at the first page holding fewer than 100
    items.

    Args:
        path: API path; paging parameters are appended with ``?`` or ``&``
            depending on whether the path already has a query string.

    Returns:
        A flat list of all items retrieved.
    """
    page_size = 100
    joiner = "&" if "?" in path else "?"

    collected = []
    page_no = 0
    while True:
        page_no += 1
        reply = api_request("GET", f"{path}{joiner}per_page=100&page={page_no}")
        rows = reply["data"] if reply and "data" in reply else None
        if not rows:
            break
        collected.extend(rows)
        if len(rows) < page_size:
            # Short page: nothing further to fetch.
            break
    return collected
|
||||
|
||||
|
||||
def load_api_data():
    """Fetch families, species and cultivars from HerbAPI into lookup dicts.

    Returns:
        A ``(families, species, cultivars)`` tuple where

        * ``families`` is keyed by lower-cased scientific name,
        * ``species`` is keyed by lower-cased, stripped scientific name,
        * ``cultivars`` is keyed by ``(species_id, lower-cased stripped name)``.

        Later records overwrite earlier ones that share a key.
    """
    print("Loading HerbAPI data...")

    families = {
        record["name_scientific"].lower(): record
        for record in paginated_get("/families")
    }
    print(f" {len(families)} families")

    species = {
        record["name_scientific"].lower().strip(): record
        for record in paginated_get("/species")
    }
    print(f" {len(species)} species")

    cultivars = {
        (record["species_id"], record["name"].lower().strip()): record
        for record in paginated_get("/cultivars")
    }
    print(f" {len(cultivars)} cultivars")

    return families, species, cultivars
|
||||
|
||||
|
||||
def ensure_supplier():
    """Return the Dreschflegel supplier record, creating it if necessary.

    The existing suppliers are searched for a name containing
    "dreschflegel" (case-insensitive); if none is found a new supplier is
    posted.

    Returns:
        The supplier record, or ``None`` when neither lookup nor creation
        produced one.
    """
    # NOTE(review): the response is iterated directly, i.e. it is assumed to
    # be a plain list of supplier records rather than a {"data": [...]}
    # envelope - confirm against the API.
    known_suppliers = api_request("GET", "/suppliers") or []
    for candidate in known_suppliers:
        if "dreschflegel" in candidate["name"].lower():
            print(f" Supplier exists: {candidate['name']} ({candidate['id']})")
            return candidate

    payload = {
        "name": "Dreschflegel",
        "url": "https://www.dreschflegel-saatgut.de",
        "country": "DE",
        "is_organic": True,
        "is_demeter": False,
        "notes": "German organic seed cooperative, open-pollinated heritage varieties",
    }
    created = api_request("POST", "/suppliers", payload)
    if not created:
        return None
    print(f" Created supplier: {created['name']} ({created['id']})")
    return created
|
||||
|
||||
|
||||
# Genus → family mapping for species creation.
# Keys are genus names exactly as they appear in catalog botanical names
# (lookup is case-sensitive); values are scientific family names.
# Each genus is listed exactly once.
GENUS_TO_FAMILY = {
    # Asteraceae
    "Achillea": "Asteraceae", "Artemisia": "Asteraceae", "Aster": "Asteraceae",
    "Calendula": "Asteraceae", "Carthamus": "Asteraceae", "Centaurea": "Asteraceae",
    "Chamomilla": "Asteraceae", "Chrysanthemum": "Asteraceae", "Cichorium": "Asteraceae",
    "Cnicus": "Asteraceae", "Cosmos": "Asteraceae", "Cynara": "Asteraceae",
    "Dahlia": "Asteraceae", "Dimorphotheca": "Asteraceae", "Echinacea": "Asteraceae",
    "Echinops": "Asteraceae", "Erigeron": "Asteraceae", "Eupatorium": "Asteraceae",
    "Gaillardia": "Asteraceae", "Helenium": "Asteraceae", "Helianthus": "Asteraceae",
    "Helichrysum": "Asteraceae", "Inula": "Asteraceae", "Lactuca": "Asteraceae",
    "Leontodon": "Asteraceae", "Matricaria": "Asteraceae", "Onopordum": "Asteraceae",
    "Petasites": "Asteraceae", "Rudbeckia": "Asteraceae", "Scorzonera": "Asteraceae",
    "Silphium": "Asteraceae", "Solidago": "Asteraceae", "Tagetes": "Asteraceae",
    "Tanacetum": "Asteraceae", "Taraxacum": "Asteraceae", "Telekia": "Asteraceae",
    "Tragopogon": "Asteraceae", "Tussilago": "Asteraceae", "Zinnia": "Asteraceae",
    "Xerochrysum": "Asteraceae", "Coreopsis": "Asteraceae",
    # Solanaceae
    "Capsicum": "Solanaceae", "Lycium": "Solanaceae", "Nicotiana": "Solanaceae",
    "Physalis": "Solanaceae", "Solanum": "Solanaceae", "Atropa": "Solanaceae",
    # Cucurbitaceae
    "Citrullus": "Cucurbitaceae", "Cucumis": "Cucurbitaceae", "Cucurbita": "Cucurbitaceae",
    "Luffa": "Cucurbitaceae", "Momordica": "Cucurbitaceae",
    # Fabaceae
    "Cicer": "Fabaceae", "Glycine": "Fabaceae", "Lathyrus": "Fabaceae",
    "Lens": "Fabaceae", "Lupinus": "Fabaceae", "Medicago": "Fabaceae",
    "Phaseolus": "Fabaceae", "Pisum": "Fabaceae", "Trifolium": "Fabaceae",
    "Trigonella": "Fabaceae", "Vicia": "Fabaceae", "Vigna": "Fabaceae",
    "Caragana": "Fabaceae", "Cytisus": "Fabaceae", "Robinia": "Fabaceae",
    # Brassicaceae
    "Armoracia": "Brassicaceae", "Barbarea": "Brassicaceae", "Brassica": "Brassicaceae",
    "Crambe": "Brassicaceae", "Eruca": "Brassicaceae", "Hesperis": "Brassicaceae",
    "Iberis": "Brassicaceae", "Isatis": "Brassicaceae", "Lepidium": "Brassicaceae",
    "Lunaria": "Brassicaceae", "Raphanus": "Brassicaceae", "Sinapis": "Brassicaceae",
    "Nasturtium": "Brassicaceae", "Diplotaxis": "Brassicaceae",
    # Apiaceae
    "Anethum": "Apiaceae", "Anthriscus": "Apiaceae", "Apium": "Apiaceae",
    "Carum": "Apiaceae", "Chaerophyllum": "Apiaceae", "Coriandrum": "Apiaceae",
    "Daucus": "Apiaceae", "Foeniculum": "Apiaceae", "Levisticum": "Apiaceae",
    "Myrrhis": "Apiaceae", "Pastinaca": "Apiaceae", "Petroselinum": "Apiaceae",
    "Pimpinella": "Apiaceae", "Angelica": "Apiaceae", "Aegopodium": "Apiaceae",
    # Lamiaceae
    "Agastache": "Lamiaceae", "Ajuga": "Lamiaceae", "Dracocephalum": "Lamiaceae",
    "Elsholtzia": "Lamiaceae", "Hyssopus": "Lamiaceae", "Lavandula": "Lamiaceae",
    "Melissa": "Lamiaceae", "Mentha": "Lamiaceae", "Monarda": "Lamiaceae",
    "Nepeta": "Lamiaceae", "Ocimum": "Lamiaceae", "Origanum": "Lamiaceae",
    "Perilla": "Lamiaceae", "Rosmarinus": "Lamiaceae", "Salvia": "Lamiaceae",
    "Satureja": "Lamiaceae", "Stachys": "Lamiaceae", "Thymus": "Lamiaceae",
    # Amaryllidaceae / Alliaceae
    "Allium": "Amaryllidaceae",
    # Poaceae
    "Avena": "Poaceae", "Hordeum": "Poaceae", "Panicum": "Poaceae",
    "Secale": "Poaceae", "Sorghum": "Poaceae", "Triticum": "Poaceae",
    "Zea": "Poaceae", "Setaria": "Poaceae",
    # Chenopodiaceae
    "Atriplex": "Chenopodiaceae", "Beta": "Chenopodiaceae",
    "Chenopodium": "Chenopodiaceae", "Spinacia": "Chenopodiaceae",
    # Rosaceae
    "Filipendula": "Rosaceae", "Fragaria": "Rosaceae", "Malus": "Rosaceae",
    "Prunus": "Rosaceae", "Pyrus": "Rosaceae", "Rosa": "Rosaceae",
    "Rubus": "Rosaceae", "Sanguisorba": "Rosaceae", "Sorbus": "Rosaceae",
    "Waldsteinia": "Rosaceae",
    # Boraginaceae
    "Borago": "Boraginaceae", "Phacelia": "Boraginaceae", "Symphytum": "Boraginaceae",
    "Pulmonaria": "Boraginaceae", "Myosotis": "Boraginaceae",
    # Malvaceae
    "Alcea": "Malvaceae", "Althaea": "Malvaceae", "Malva": "Malvaceae",
    "Hibiscus": "Malvaceae", "Lavatera": "Malvaceae", "Abelmoschus": "Malvaceae",
    # Polygonaceae
    "Fagopyrum": "Polygonaceae", "Rheum": "Polygonaceae", "Rumex": "Polygonaceae",
    # Caryophyllaceae
    "Agrostemma": "Caryophyllaceae", "Dianthus": "Caryophyllaceae",
    "Gypsophila": "Caryophyllaceae", "Lychnis": "Caryophyllaceae",
    "Saponaria": "Caryophyllaceae", "Silene": "Caryophyllaceae",
    # Tropaeolaceae
    "Tropaeolum": "Tropaeolaceae",
    # Papaveraceae
    "Eschscholzia": "Papaveraceae", "Papaver": "Papaveraceae",
    "Meconopsis": "Papaveraceae",
    # Caprifoliaceae
    "Valerianella": "Caprifoliaceae", "Valeriana": "Caprifoliaceae",
    "Lonicera": "Caprifoliaceae",
    # Adoxaceae
    "Sambucus": "Adoxaceae",
    # Plantaginaceae
    "Digitalis": "Plantaginaceae", "Plantago": "Plantaginaceae",
    "Antirrhinum": "Plantaginaceae", "Linaria": "Plantaginaceae",
    # Violaceae
    "Viola": "Violaceae",
    # Ranunculaceae
    "Aquilegia": "Ranunculaceae", "Consolida": "Ranunculaceae",
    "Delphinium": "Ranunculaceae", "Nigella": "Ranunculaceae",
    # Linaceae
    "Linum": "Linaceae",
    # Convolvulaceae
    "Ipomoea": "Convolvulaceae", "Convolvulus": "Convolvulaceae",
    # Portulacaceae / Montiaceae
    "Claytonia": "Montiaceae", "Portulaca": "Portulacaceae",
    # Amaranthaceae
    "Amaranthus": "Amaranthaceae", "Celosia": "Amaranthaceae",
    "Gomphrena": "Amaranthaceae",
    # Asparagaceae
    "Asparagus": "Asparagaceae",
    # Resedaceae
    "Reseda": "Resedaceae",
    # Balsaminaceae
    "Impatiens": "Balsaminaceae",
    # Hydrangeaceae
    "Hydrangea": "Hydrangeaceae",
    # Campanulaceae
    "Campanula": "Campanulaceae", "Phyteuma": "Campanulaceae",
    # Scrophulariaceae
    "Verbascum": "Scrophulariaceae",
    # Verbenaceae
    "Verbena": "Verbenaceae",
    # Onagraceae
    "Oenothera": "Onagraceae", "Clarkia": "Onagraceae",
    # Cucurbitaceae extras
    "Benincasa": "Cucurbitaceae", "Lagenaria": "Cucurbitaceae",
    # Hypericaceae
    "Hypericum": "Hypericaceae",
    # Others
    "Dipsacus": "Caprifoliaceae",
    "Knautia": "Caprifoliaceae",
    "Scabiosa": "Caprifoliaceae",
    "Succisa": "Caprifoliaceae",
    "Asclepias": "Apocynaceae",
    "Cynoglossum": "Boraginaceae",
    "Echium": "Boraginaceae",
    "Anchusa": "Boraginaceae",
    "Lithospermum": "Boraginaceae",
    "Onobrychis": "Fabaceae",
    "Ornithopus": "Fabaceae",
    "Lotus": "Fabaceae",
    "Anthyllis": "Fabaceae",
    "Melilotus": "Fabaceae",
    "Galega": "Fabaceae",
    "Lespedeza": "Fabaceae",
    "Arachis": "Fabaceae",
    "Senna": "Fabaceae",
    # Additional genera found in Dreschflegel catalog
    "Acmella": "Asteraceae", "Adonis": "Ranunculaceae", "Ageratum": "Asteraceae",
    "Amethystia": "Lamiaceae", "Anacyclus": "Asteraceae", "Anthemis": "Asteraceae",
    "Asphodeline": "Asphodelaceae", "Brachyscome": "Asteraceae", "Bupleurum": "Apiaceae",
    "Callistephus": "Asteraceae", "Camelina": "Brassicaceae", "Cardaria": "Brassicaceae",
    "Cardiospermum": "Sapindaceae", "Cerinthe": "Boraginaceae",
    "Chamaemelum": "Asteraceae", "Cistanthe": "Montiaceae", "Cleome": "Cleomaceae",
    "Cochlearia": "Brassicaceae", "Codonopsis": "Campanulaceae", "Coix": "Poaceae",
    "Cyperus": "Cyperaceae", "Digitaria": "Poaceae", "Dorotheanthus": "Aizoaceae",
    "Emilia": "Asteraceae", "Eragrostis": "Poaceae", "Erysimum": "Brassicaceae",
    "Euphorbia": "Euphorbiaceae", "Gentiana": "Gentianaceae", "Geum": "Rosaceae",
    "Gilia": "Polemoniaceae", "Godetia": "Onagraceae", "Helipterum": "Asteraceae",
    "Lallemantia": "Lamiaceae", "Leonurus": "Lamiaceae", "Leuzea": "Asteraceae",
    "Liatris": "Asteraceae", "Malope": "Malvaceae", "Marrubium": "Lamiaceae",
    "Matthiola": "Brassicaceae", "Maurandya": "Plantaginaceae",
    "Melothria": "Cucurbitaceae", "Meum": "Apiaceae", "Nemesia": "Scrophulariaceae",
    # NOTE(review): "Nicotinia" looks like a catalog misspelling of
    # "Nicotiana"; kept because lookups use the catalog's spelling - confirm.
    "Nicandra": "Solanaceae", "Nicotinia": "Solanaceae", "Oenanthe": "Apiaceae",
    "Oxalis": "Oxalidaceae", "Pennisetum": "Poaceae", "Penstemon": "Plantaginaceae",
    "Phlox": "Polemoniaceae", "Polemonium": "Polemoniaceae",
    "Porophyllum": "Asteraceae", "Primula": "Primulaceae", "Psyllium": "Plantaginaceae",
    "Quamoclit": "Convolvulaceae", "Ruta": "Rutaceae", "Salpiglossis": "Solanaceae",
    "Sanvitalia": "Asteraceae", "Sideritis": "Lamiaceae", "Silybum": "Asteraceae",
    "Talinum": "Talinaceae", "Thelesperma": "Asteraceae", "Vaccaria": "Caryophyllaceae",
    "Veronica": "Plantaginaceae", "Xeranthemum": "Asteraceae",
}
|
||||
|
||||
|
||||
def normalize_species_name(botanical_name):
    """Split a botanical name into ``(genus, species_epithet)`` for matching.

    Only the first two meaningful tokens are used, so infraspecific ranks
    after the epithet (``var.``, ``subsp.`` ...) are dropped. Hybrid
    notation is normalized to ``"x <epithet>"`` whether the marker is the
    letter ``x`` or the multiplication sign, and whether or not the sign is
    attached to the epithet.

    Args:
        botanical_name: Name as printed in the catalog.

    Returns:
        ``(genus, epithet)``; ``(genus, None)`` when the second token is
        only a rank marker such as ``sp.``; ``(None, None)`` when the name
        has fewer than two tokens.
    """
    hybrid_sign = "\u00d7"  # multiplication sign used in hybrid names
    rank_markers = ("var.", "subsp.", "ssp.", "spec.", "sp.")

    parts = botanical_name.strip().split()
    if len(parts) < 2:
        return None, None

    genus, second = parts[0], parts[1]

    # Handle 'Genus x species' / 'Genus × species' (hybrid notation)
    if second in ("x", hybrid_sign) and len(parts) >= 3:
        return genus, f"x {parts[2]}"

    # Handle the sign written directly against the epithet: 'Genus ×species'
    if second.startswith(hybrid_sign) and len(second) > 1:
        return genus, f"x {second[1:]}"

    if second in rank_markers:
        # Only genus level - can't match to species
        return genus, None

    return genus, second
|
||||
|
||||
|
||||
def find_species(botanical_name, species_cache):
    """Find an existing species record matching a botanical name.

    * When the name carries a species epithet, only an exact
      ``"genus epithet"`` match (case-insensitive) is accepted. Falling back
      to another species of the same genus would attach cultivars to the
      wrong species and would prevent a second species of that genus from
      ever being created.
    * When the name is genus-only (e.g. ``"Allium sp."``), the genus' sole
      cached species is used if there is exactly one.

    Args:
        botanical_name: Name as printed in the catalog.
        species_cache: Dict of species records keyed by lower-cased
            scientific name.

    Returns:
        The matching species record, or ``None``.
    """
    genus, sp = normalize_species_name(botanical_name)
    if not genus:
        return None

    if sp:
        # Exact genus+species match only.
        return species_cache.get(f"{genus} {sp}".lower())

    # Genus-level name: unambiguous only if the genus has a single species.
    genus_prefix = genus.lower() + " "
    same_genus = [
        record for key, record in species_cache.items()
        if key.startswith(genus_prefix)
    ]
    if len(same_genus) == 1:
        return same_genus[0]

    return None
|
||||
|
||||
|
||||
def find_or_create_species(botanical_name, families, species_cache):
    """Return the species record for a botanical name, creating it if needed.

    Creation requires a species epithet and a ``GENUS_TO_FAMILY`` entry for
    the genus; the family itself is created on demand. When a create call
    yields no response, the corresponding collection is reloaded and
    searched, since the record may already exist.

    Args:
        botanical_name: Name as printed in the catalog.
        families: Family records keyed by lower-cased scientific name;
            updated in place.
        species_cache: Species records keyed by lower-cased scientific name;
            updated in place.

    Returns:
        The species record, or ``None`` when it can neither be found nor
        created. Counters in the module-level ``stats`` are updated.
    """
    known = find_species(botanical_name, species_cache)
    if known:
        return known

    genus, species_epithet = normalize_species_name(botanical_name)
    if not (genus and species_epithet):
        stats["species_no_epithet"] += 1
        return None

    sci_name = f"{genus} {species_epithet}"
    species_key = sci_name.lower()

    # Second look-up under the normalized name.
    if species_key in species_cache:
        return species_cache[species_key]

    family_name = GENUS_TO_FAMILY.get(genus)
    if not family_name:
        stats["species_no_family"] += 1
        print(f" [SKIP] No family mapping for genus: {genus} ({botanical_name})")
        return None

    family_key = family_name.lower()
    family = families.get(family_key)
    if not family:
        print(f" Creating family: {family_name}")
        created_family = api_request("POST", "/families", {"name_scientific": family_name})
        if created_family:
            family = created_family
            families[family_key] = created_family
            stats["families_created"] += 1
        else:
            # May already exist (duplicate from previous run) - reload
            existing_family = next(
                (
                    record for record in paginated_get("/families")
                    if record["name_scientific"].lower() == family_key
                ),
                None,
            )
            if existing_family is not None:
                families[family_key] = existing_family
                family = existing_family
        if not family:
            print(f" [SKIP] Cannot create family: {family_name}")
            return None

    print(f" Creating species: {sci_name} (family: {family_name})")
    created_species = api_request("POST", "/species", {
        "name_scientific": sci_name,
        "family_id": family["id"],
    })
    if created_species:
        species_cache[species_key] = created_species
        stats["species_created"] += 1
        return created_species

    # May already exist - try to find it
    time.sleep(0.1)
    existing_species = next(
        (
            record for record in paginated_get("/species")
            if record["name_scientific"].lower() == species_key
        ),
        None,
    )
    if existing_species is not None:
        species_cache[species_key] = existing_species
        return existing_species
    return None
|
||||
|
||||
|
||||
def extract_cultivar_name(product_name):
    """Strip a leading German crop-type word from a product name.

    The longest known crop-type prefix that is followed by a space is
    removed; what remains (stripped) is the cultivar name. Names without a
    known prefix are returned stripped but otherwise unchanged.
    """
    crop_type_prefixes = (
        # Tomatoes
        "Salattomate", "Stabtomate", "Buschtomate", "Cocktailtomate",
        "Cherrytomate", "Fleischtomate", "Wildtomate", "Balkontomate",
        "Flaschentomate", "Eitomate", "Datteltomate", "Tomate",
        # Lettuce
        "Winterkopfsalat", "Kopfsalat", "Bataviasalat", "Eissalat",
        "Blattsalat", "Schnittsalat", "Pflücksalat", "Römersalat",
        "Spargelsalat", "Romanasalat",
        # Beans
        "Buschbohne", "Stangenbohne", "Feuerbohne", "Puffbohne",
        "Prunkbohne",
        # Peas
        "Markerbse", "Zuckererbse", "Palerbse", "Schalerbse",
        "Knackerbse", "Kapuzinererbse",
        # Cucumbers
        "Einlegegurke", "Salatgurke", "Schälgurke", "Landgurke",
        "Freilandgurke",
        # Squash
        "Hokkaidokürbis", "Butternutkürbis", "Speisekürbis",
        "Riesenkürbis", "Zierkürbis", "Muskatkürbis", "Ölkürbis",
        # Melon
        "Wassermelone", "Zuckermelone",
        # Peppers
        "Gemüsepaprika", "Blockpaprika", "Spitzpaprika", "Tomatenpaprika",
        "Snackpaprika", "Peperoni", "Chili",
        # Brassicas
        "Kohlrabi", "Brokkoli", "Blumenkohl", "Grünkohl", "Rosenkohl",
        "Wirsing", "Rotkohl", "Weißkohl", "Spitzkohl", "Palmkohl",
        "Chinakohl", "Pak Choi", "Markstammkohl",
        # Root veg
        "Möhre", "Karotte", "Pastinake", "Rote Bete", "Rote Beete",
        "Herbstrübe", "Mairübe", "Stoppelrübe", "Schwarzer Rettich",
        "Steckrübe", "Knollensellerie", "Petersilienwurzel",
        "Rettich", "Radieschen",
        # Onions
        "Winterheckenzwiebel", "Lauchzwiebel", "Speisezwiebel",
        "Schalotte", "Wintersteckzwiebel", "Zwiebel",
        # Herbs
        "Rotes Basilikum", "Buschbasilikum", "Zitronen-Basilikum",
        "Thai-Basilikum", "Wildes Basilikum", "Zimtbasilikum",
        "Basilikum", "Schnittknoblauch",
        # Grains
        "Sommerweizen", "Winterweizen", "Sommerroggen", "Winterroggen",
        "Nackthafer", "Nacktgerste", "Dinkel", "Emmer", "Einkorn",
        # Misc
        "Zuckermais", "Popcornmais",
        "Zucchini",
    )

    cleaned = product_name.strip()

    # Longest match wins, so e.g. "Winterkopfsalat" beats "Kopfsalat".
    longest = max(
        (p for p in crop_type_prefixes if cleaned.startswith(p + " ")),
        key=len,
        default=None,
    )
    if longest is None:
        return cleaned
    return cleaned[len(longest):].strip()
|
||||
|
||||
|
||||
def get_existing_supplier_links(cultivar_id, supplier_id):
    """Return True when the cultivar is already linked to the given supplier.

    Fetches ``/cultivars/{cultivar_id}/suppliers`` through ``api_request`` and
    looks for an entry whose ``supplier_id`` equals *supplier_id*. A falsy
    response counts as "not linked".
    """
    links = api_request("GET", f"/cultivars/{cultivar_id}/suppliers")
    if not links:
        return False
    return any(entry["supplier_id"] == supplier_id for entry in links)
|
||||
|
||||
|
||||
def _find_or_create_cultivar(species_id, cultivar_name, product, cultivar_cache):
    """Return the cultivar record for (species, name), creating it if needed.

    Updates the module-level ``stats`` counters and *cultivar_cache* as a side
    effect. Returns None when the cultivar can neither be created nor found.
    """
    cv_key = (species_id, cultivar_name.lower().strip())
    if cv_key in cultivar_cache:
        stats["cultivars_existing"] += 1
        return cultivar_cache[cv_key]

    cv_data = {
        "species_id": species_id,
        "name": cultivar_name,
        "is_organic": True,
    }
    if product.get("description"):
        cv_data["description"] = product["description"]

    cv = api_request("POST", "/cultivars", cv_data)
    if cv:
        cultivar_cache[cv_key] = cv
        stats["cultivars_created"] += 1
        return cv

    # Creation failed: it might already exist from a previous run, so look
    # for a cultivar with the same normalised name under this species.
    wanted = cultivar_name.lower().strip()
    for existing in paginated_get(f"/cultivars?species_id={species_id}"):
        if existing["name"].lower().strip() == wanted:
            cultivar_cache[cv_key] = existing
            stats["cultivars_existing"] += 1
            return existing

    stats["cultivar_create_errors"] += 1
    return None


def _build_supplier_link(product, supplier_id):
    """Build the payload that links a scraped product to the supplier."""
    link_data = {
        "supplier_id": supplier_id,
        "article_number": product.get("article_number", ""),
        "product_url": product.get("url", ""),
        "price_eur": product.get("price"),
    }
    pack_info = product.get("pack_info", "")
    if pack_info:
        # e.g. "ca. 50 Korn" -> pack_size 50.0, pack_unit "Korn"
        m = re.search(r"ca\.?\s*(\d+)\s*(Pfl|Korn|Samen|g|kg|ml)", pack_info)
        if m:
            link_data["pack_size"] = float(m.group(1))
            unit_map = {"Pfl": "Pflanzen", "Korn": "Korn", "Samen": "Korn"}
            link_data["pack_unit"] = unit_map.get(m.group(2), m.group(2))
    return link_data


def main():
    """Scrape the supplier catalogue and import cultivars plus supplier links.

    Steps: ensure the supplier exists, load the existing API data, collect
    product URLs from the sitemap, scrape them, then create any missing
    cultivars and supplier links. Exits with status 1 when the supplier or the
    sitemap is unavailable. Progress and a final counter summary are printed.
    """
    print("=" * 60)
    print("Dreschflegel Seed Catalog Scraper for HerbAPI (v2)")
    print("=" * 60)

    # Step 1: Supplier
    print("\n[1] Setting up supplier...")
    supplier = ensure_supplier()
    if not supplier:
        print("FATAL: Could not create/find supplier")
        sys.exit(1)
    supplier_id = supplier["id"]

    # Step 2: Load API data
    print("\n[2] Loading existing HerbAPI data...")
    families, species_cache, cultivar_cache = load_api_data()

    # Step 3: Get product URLs
    print("\n[3] Fetching sitemap...")
    all_urls = get_sitemap_urls()
    if not all_urls:
        print("FATAL: Could not fetch sitemap")
        sys.exit(1)
    candidate_urls = classify_urls(all_urls)
    print(f" {len(all_urls)} total URLs, {len(candidate_urls)} product candidates")

    # Step 4: Scrape
    print("\n[4] Scraping product pages...")
    products = scrape_all_products(candidate_urls)

    # Step 5: Import
    print(f"\n[5] Importing {len(products)} products into HerbAPI...")

    for i, product in enumerate(products):
        if (i + 1) % 50 == 0:
            print(f" Processing {i + 1}/{len(products)}...")

        botanical = product.get("botanical_name", "")
        if not botanical:
            stats["no_botanical"] += 1
            continue

        # Find or create species
        sp = find_or_create_species(botanical, families, species_cache)
        if not sp:
            stats["species_not_matched"] += 1
            continue

        cultivar_name = extract_cultivar_name(product["name"])
        cv = _find_or_create_cultivar(sp["id"], cultivar_name, product, cultivar_cache)
        if cv is None:
            continue

        # Link to supplier (check first for idempotency)
        if get_existing_supplier_links(cv["id"], supplier_id):
            stats["supplier_links_existing"] += 1
            continue

        link_data = _build_supplier_link(product, supplier_id)
        resp = api_request("POST", f"/cultivars/{cv['id']}/suppliers", link_data)
        if resp:
            stats["supplier_links_created"] += 1
        else:
            stats["supplier_link_errors"] += 1

    # Summary
    print("\n" + "=" * 60)
    print("RESULTS")
    print("=" * 60)
    for key, val in sorted(stats.items()):
        print(f" {key}: {val}")
    print(f"\n Total species in DB: {len(species_cache)}")
    print(f" Total cultivars tracked: {len(cultivar_cache)}")


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,380 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Scrape Magic Garden Seeds product pages and update herbapi database."""
|
||||
|
||||
import subprocess
|
||||
import re
|
||||
import time
|
||||
import os
|
||||
import sys
|
||||
|
||||
# psql invocation: tuples only (-t), unaligned output (-A), '|' as field separator.
DB_CMD = [
    'psql', '-h', '10.31.3.90', '-U', 'herbapi', 'herbapi',
    '-t', '-A', '-F|',
]

# Environment handed to psql. A PGPASSWORD already present in the caller's
# environment takes precedence; the literal below is only the fallback.
# NOTE(review): the fallback credential is committed in source -- rotate it and
# supply the password through the environment (or ~/.pgpass) instead.
DB_ENV = {'PGPASSWORD': '_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj', **os.environ}

# Lower-case English month names -> month number (1-12), used by parse_months.
MONTH_MAP = {
    'january': 1, 'february': 2, 'march': 3, 'april': 4,
    'may': 5, 'june': 6, 'july': 7, 'august': 8,
    'september': 9, 'october': 10, 'november': 11, 'december': 12,
}
|
||||
|
||||
|
||||
def run_sql(sql):
    """Run *sql* through psql and return its stripped stdout.

    Raises:
        RuntimeError: if psql exits with a non-zero status. The message carries
            psql's stderr, so a failed statement is no longer mistaken for an
            empty result or a successful update.
    """
    result = subprocess.run(
        DB_CMD + ['-c', sql],
        capture_output=True, text=True, env=DB_ENV
    )
    if result.returncode != 0:
        raise RuntimeError(
            f"psql failed (exit {result.returncode}): {result.stderr.strip()}"
        )
    return result.stdout.strip()
|
||||
|
||||
|
||||
def fetch_page(url):
    """Download *url* with curl and return whatever curl wrote to stdout.

    curl runs silently, follows redirects and is capped at 15 seconds; its
    exit status is not inspected, so a failed fetch yields an empty string.
    """
    curl_cmd = ['curl', '-sL', '--max-time', '15', url]
    completed = subprocess.run(curl_cmd, capture_output=True, text=True)
    return completed.stdout
|
||||
|
||||
|
||||
def parse_months(text):
    """Return the sorted month numbers whose English names occur in *text*.

    Matching is case-insensitive substring search against ``MONTH_MAP``.
    Returns None for empty input or when no month name is found.
    """
    if not text:
        return None

    remaining = text.lower().strip()
    found = set()
    # Longest names first; each matched name is removed from the working text
    # before the shorter names are tried.
    for name in sorted(MONTH_MAP, key=len, reverse=True):
        if name in remaining:
            found.add(MONTH_MAP[name])
            remaining = remaining.replace(name, '')

    return sorted(found) or None
|
||||
|
||||
|
||||
def parse_depth(text):
    """Parse a sowing depth in centimetres from free text.

    A range such as "1 - 2 cm" yields the midpoint rounded to one decimal; a
    single value such as "0,5 cm" is returned as-is. Both '.' and ',' are
    accepted as decimal separators. Returns None when nothing matches.
    """
    if not text:
        return None

    number = r'(\d+(?:[.,]\d+)?)'

    span = re.search(number + r'\s*-\s*' + number + r'\s*cm', text)
    if span:
        low, high = (float(part.replace(',', '.')) for part in span.groups())
        return round((low + high) / 2, 1)

    single = re.search(number + r'\s*cm', text)
    return float(single.group(1).replace(',', '.')) if single else None
|
||||
|
||||
|
||||
def parse_spacing(text):
    """Parse planting distance. Returns (row_spacing, plant_spacing).

    Recognised forms, tried in this order:

    * ``"X x Y cm"`` (also with ``×``): X is the plant spacing, Y the row spacing.
    * ``"X - Y cm"`` (hyphen or en dash): the midpoint, rounded to one decimal,
      is the plant spacing; row spacing is None.
    * ``"X cm"``: X is the plant spacing; row spacing is None.

    Numbers may use '.' or ',' as decimal separator, matching ``parse_depth``.
    Returns ``(None, None)`` for empty or unparseable input.
    """
    if not text:
        return None, None
    text = text.lower().strip()

    num = r'(\d+(?:[.,]\d+)?)'

    def to_float(raw):
        # German-style decimal comma -> dot
        return float(raw.replace(',', '.'))

    # "X x Y cm"
    match = re.search(rf'{num}\s*(?:x|×)\s*{num}\s*cm', text)
    if match:
        return to_float(match.group(2)), to_float(match.group(1))

    # "X - Y cm" range -> average as plant spacing
    match = re.search(rf'{num}\s*[-–]\s*{num}\s*cm', text)
    if match:
        return None, round((to_float(match.group(1)) + to_float(match.group(2))) / 2, 1)

    # Single value
    match = re.search(rf'{num}\s*cm', text)
    if match:
        return None, to_float(match.group(1))

    return None, None
|
||||
|
||||
|
||||
def parse_germination_days(text):
    """Convert a germination-time phrase into a number of days.

    Weeks are tried before days; for each unit a range ("2-3 weeks") is tried
    before a single value and yields the rounded midpoint. Weeks are
    multiplied by 7. Returns None when no week/day figure is present.
    """
    if not text:
        return None

    lowered = text.lower()
    for unit, days_per_unit in (('weeks?', 7), ('days?', 1)):
        ranged = re.search(r'(\d+)\s*-\s*(\d+)\s*' + unit, lowered)
        if ranged:
            first, second = map(int, ranged.groups())
            return int(round((first + second) / 2 * days_per_unit))

        lone = re.search(r'(\d+)\s*' + unit, lowered)
        if lone:
            return int(lone.group(1)) * days_per_unit

    return None
|
||||
|
||||
|
||||
def parse_germ_temp(text):
    """Parse a germination temperature from text containing a degree sign.

    "18-22°C" yields the midpoint rounded to one decimal; "20 °C" yields 20.0.
    Only whole numbers directly before '°' are recognised. Returns None when
    nothing matches.
    """
    if not text:
        return None

    ranged = re.search(r'(\d+)\s*-\s*(\d+)\s*°', text)
    if ranged is not None:
        low, high = (float(part) for part in ranged.groups())
        return round((low + high) / 2, 1)

    single = re.search(r'(\d+)\s*°', text)
    return None if single is None else float(single.group(1))
|
||||
|
||||
|
||||
def parse_lifecycle(text):
    """Map a lifecycle description to a perennial flag.

    Returns True if the text mentions "perennial", False if it mentions
    "annual" or "biennial", and None otherwise (including empty input).
    """
    if not text:
        return None

    lowered = text.lower().strip()
    rules = (
        (('perennial',), True),
        (('annual', 'biennial'), False),
    )
    for keywords, is_perennial in rules:
        if any(word in lowered for word in keywords):
            return is_perennial
    return None
|
||||
|
||||
|
||||
def parse_light(text):
    """Normalise a sunlight description to one of a few canonical labels.

    Checks, in order: "full sun" (optionally combined with "partial"),
    partial/semi/half shade, "shade", then any mention of "sun". Text that
    matches none of these is returned lower-cased and stripped. Returns None
    for empty input.
    """
    if not text:
        return None

    lowered = text.lower().strip()

    if 'full sun' in lowered:
        return 'full sun to partial shade' if 'partial' in lowered else 'full sun'
    if any(word in lowered for word in ('partial', 'semi', 'half')):
        return 'partial shade'
    for keyword, label in (('shade', 'shade'), ('sun', 'full sun')):
        if keyword in lowered:
            return label
    return lowered
|
||||
|
||||
|
||||
def extract_data(html):
    """Extract cultivar attributes from a product page.

    Reads consecutive ``<td>`` cells as label/value pairs, pulls a description
    from the ``itemprop="description"`` block (falling back to the meta
    description), and converts the recognised spec labels into database
    fields via the ``parse_*`` helpers. Returns a dict containing only the
    fields that could be determined.
    """
    data = {}

    # Spec table: cells are paired up as (label, value) in document order.
    raw_cells = re.findall(r'<td[^>]*>(.*?)</td>', html, re.DOTALL)
    cell_texts = [
        re.sub(r'\s+', ' ', re.sub(r'<[^>]+>', ' ', raw).strip())
        for raw in raw_cells
    ]
    specs = {}
    for label, value in zip(cell_texts[0::2], cell_texts[1::2]):
        label = label.rstrip(':').strip()
        value = value.strip()
        # Labels made up only of digits/currency characters are skipped.
        if label and value and not re.match(r'^[\d,.\s€*]+$', label):
            specs[label.lower()] = value

    # Extract description from itemprop="description"
    block = re.search(r'itemprop="description">(.*?)</div>\s*</div>\s*</div>', html, re.DOTALL)
    if block:
        prose = block.group(1)
        for noise in (r'<style[^>]*>.*?</style>', r'<script[^>]*>.*?</script>'):
            prose = re.sub(noise, '', prose, flags=re.DOTALL)
        prose = re.sub(r'\s+', ' ', re.sub(r'<[^>]+>', ' ', prose)).strip()
        # Cut the text at the first occurrence of each trailing-section marker.
        for marker in ('Other names', 'Additional contact mail', 'Question about'):
            cut = prose.find(marker)
            if cut > 0:
                prose = prose[:cut].strip()
        if len(prose) > 20:
            data['description'] = prose

    if 'description' not in data:
        meta = re.search(r'<meta[^>]*name="description"[^>]*content="([^"]*)"', html)
        if meta and len(meta.group(1)) > 20:
            data['description'] = meta.group(1)

    # Parse specs
    if 'planting distance' in specs:
        row_gap, plant_gap = parse_spacing(specs['planting distance'])
        if plant_gap:
            data['plant_spacing_cm'] = plant_gap
        if row_gap:
            data['row_spacing_cm'] = row_gap

    if 'row spacing' in specs:
        row_match = re.search(r'(\d+(?:\.\d+)?)\s*cm', specs['row spacing'])
        if row_match:
            data['row_spacing_cm'] = float(row_match.group(1))

    if 'sowing depth' in specs:
        depth = parse_depth(specs['sowing depth'])
        if depth is not None:
            data['planting_depth_cm'] = depth

    # Harvesting months - prefer explicit harvest time over flowering.
    # Only the first label present is consulted, even if it yields nothing.
    harvest_label = next(
        (label for label in ('harvest time', 'harvesting months', 'flowering months')
         if label in specs),
        None,
    )
    if harvest_label is not None:
        harvest = parse_months(specs[harvest_label])
        if harvest:
            data['harvesting_months'] = harvest

    if 'when to sow outdoors' in specs:
        outdoor = parse_months(specs['when to sow outdoors'])
        if outdoor:
            data['direct_sowing_months'] = outdoor

    for indoor_label in ('when to sow indoors', 'pre-cultivation indoors'):
        if indoor_label not in specs:
            continue
        indoor = parse_months(specs[indoor_label])
        if indoor:
            data['indoor_sowing_months'] = indoor
            break

    if 'lifecycle' in specs:
        perennial = parse_lifecycle(specs['lifecycle'])
        if perennial is not None:
            data['perennial'] = perennial

    if 'sunlight' in specs:
        light = parse_light(specs['sunlight'])
        if light:
            data['light_requirement'] = light

    if 'germination time' in specs:
        days = parse_germination_days(specs['germination time'])
        if days:
            data['days_to_germination'] = days

    if 'germination temperature' in specs:
        temperature = parse_germ_temp(specs['germination temperature'])
        if temperature:
            data['germination_temp_c'] = temperature

    return data
|
||||
|
||||
|
||||
def get_current_values(cultivar_id):
    """Return the non-empty enrichment columns currently stored for a cultivar.

    Queries the ``cultivars`` row through ``run_sql`` and returns a dict mapping
    column name to its textual psql representation. Columns whose output is
    empty are omitted; an empty result yields ``{}``.

    The description is the first selected column and is free text, so the
    row is split from the right with a bounded count: any '|' inside the
    description stays in the description instead of shifting the other columns.
    """
    fields = ['description', 'row_spacing_cm', 'plant_spacing_cm', 'planting_depth_cm',
              'perennial', 'harvesting_months', 'direct_sowing_months', 'light_requirement',
              'days_to_germination', 'germination_temp_c', 'indoor_sowing_months']
    sql = f"""SELECT description, row_spacing_cm, plant_spacing_cm, planting_depth_cm,
       perennial, harvesting_months, direct_sowing_months, light_requirement,
       days_to_germination, germination_temp_c, indoor_sowing_months
FROM cultivars WHERE id = '{cultivar_id}'"""
    row = run_sql(sql)
    if not row:
        return {}

    parts = row.rsplit('|', len(fields) - 1)
    current = {}
    for name, raw in zip(fields, parts):
        val = raw.strip()
        if val:
            current[name] = val
    return current
|
||||
|
||||
|
||||
def build_update_sql(cultivar_id, data, current):
    """Build an UPDATE statement for the columns that are still empty.

    Args:
        cultivar_id: id used in the WHERE clause.
        data: mapping of column name to the newly scraped value.
        current: mapping of column name to the value already stored; columns
            with a truthy entry here are left untouched.

    Returns:
        ``(sql, columns)`` where *columns* lists the fields considered for
        update, or ``(None, [])`` when no assignment was produced.
    """
    assignments = []
    touched = []

    for column, new_value in data.items():
        if current.get(column):
            continue

        # bool is tested before the numeric case because bool is an int subclass.
        if isinstance(new_value, str):
            literal = "'" + new_value.replace("'", "''") + "'"
        elif isinstance(new_value, bool):
            literal = 'true' if new_value else 'false'
        elif isinstance(new_value, list):
            literal = "'{" + ','.join(map(str, new_value)) + "}'"
        elif isinstance(new_value, (int, float)):
            literal = str(new_value)
        else:
            literal = None

        if literal is not None:
            assignments.append(f"{column} = {literal}")
        touched.append(column)

    if not assignments:
        return None, []

    statement = f"UPDATE cultivars SET {', '.join(assignments)} WHERE id = '{cultivar_id}';"
    return statement, touched
|
||||
|
||||
|
||||
def main():
    """Enrich Magic Garden Seeds cultivars that are missing spacing or description.

    Selects the affected cultivars with their product URL, scrapes each page,
    fills only the columns that are still empty, and prints a per-cultivar
    status line followed by a summary. Sleeps 0.5 s after every cultivar.
    """
    sql = """
SELECT c.id, c.name, cs.product_url
FROM cultivars c
JOIN cultivar_suppliers cs ON c.id = cs.cultivar_id
JOIN suppliers s ON cs.supplier_id = s.id
WHERE s.name = 'Magic Garden Seeds'
AND cs.product_url IS NOT NULL AND cs.product_url <> ''
AND (c.row_spacing_cm IS NULL OR c.description IS NULL OR c.description = '')
ORDER BY c.name;
"""
    rows = run_sql(sql)
    if not rows:
        print("No cultivars to process")
        return

    cultivars = [
        {'id': cols[0], 'name': cols[1], 'url': cols[2]}
        for cols in (line.split('|') for line in rows.strip().split('\n'))
        if len(cols) >= 3
    ]
    total = len(cultivars)

    print(f"Processing {total} MGS cultivars...")
    sys.stdout.flush()

    tally = {'updated': 0, 'skipped': 0, 'failed': 0}
    fields_updated = {}

    def enrich(cv):
        """Handle one cultivar and return the tally bucket it belongs to."""
        html = fetch_page(cv['url'])
        if not html or len(html) < 1000:
            print("FAILED (empty page)")
            return 'failed'

        data = extract_data(html)
        if not data:
            print("NO DATA")
            return 'skipped'

        current = get_current_values(cv['id'])
        statement, changed = build_update_sql(cv['id'], data, current)
        if not statement:
            print("SKIP (all fields populated)")
            return 'skipped'

        run_sql(statement)
        for column in changed:
            fields_updated[column] = fields_updated.get(column, 0) + 1
        print(f"OK ({len(changed)} fields: {', '.join(changed)})")
        return 'updated'

    for position, cv in enumerate(cultivars, start=1):
        print(f"[{position}/{total}] {cv['name']}...", end=' ', flush=True)
        try:
            bucket = enrich(cv)
        except Exception as e:
            print(f"ERROR: {e}")
            bucket = 'failed'
        tally[bucket] += 1
        # Pause between cultivars regardless of the outcome.
        time.sleep(0.5)

    print("\n=== MGS Summary ===")
    print(f"Total processed: {total}")
    print(f"Updated: {tally['updated']}")
    print(f"Skipped (all fields already populated): {tally['skipped']}")
    print(f"Failed: {tally['failed']}")
    print("\nFields updated:")
    for field, count in sorted(fields_updated.items(), key=lambda item: -item[1]):
        print(f" {field}: {count}")


if __name__ == '__main__':
    main()
|
||||
@@ -0,0 +1,330 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Scrape NaturaDB wildlife interaction data and enrich HerbAPI species.
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
import sys
|
||||
|
||||
# Base URL that api_get/api_put prefix to every HerbAPI path.
HERBAPI_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
# Bearer token sent in the Authorization header of every HerbAPI request.
# NOTE(review): this credential is committed in source -- consider rotating it
# and loading it from the environment instead.
HERBAPI_TOKEN = "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
# Root of the NaturaDB plant pages; fetch_naturadb appends "<slug>/" to it.
NATURADB_BASE = "https://www.naturadb.de/pflanzen"
# User-Agent header sent with every NaturaDB request.
USER_AGENT = "Mozilla/5.0 (compatible; HerbAPI-Enrichment/1.0; +https://sub-net.at)"
# Seconds main() sleeps after each NaturaDB fetch.
DELAY = 0.5
|
||||
|
||||
|
||||
def api_get(path, timeout=30):
    """GET *path* from HerbAPI and return the decoded JSON body.

    Args:
        path: API path appended to ``HERBAPI_BASE``.
        timeout: socket timeout in seconds, so an unresponsive server raises
            instead of blocking the whole run.

    Raises:
        urllib.error.URLError: on HTTP or connection failure.
    """
    req = urllib.request.Request(
        f"{HERBAPI_BASE}{path}",
        headers={
            "Authorization": f"Bearer {HERBAPI_TOKEN}",
            "Accept": "application/json",
        },
    )
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read().decode())
|
||||
|
||||
|
||||
def api_put(path, data, timeout=30):
    """PUT *data* as JSON to HerbAPI and return the decoded JSON response.

    Args:
        path: API path appended to ``HERBAPI_BASE``.
        data: JSON-serialisable payload.
        timeout: socket timeout in seconds, so an unresponsive server raises
            instead of blocking the whole run.

    Raises:
        urllib.error.URLError: on HTTP or connection failure.
    """
    req = urllib.request.Request(
        f"{HERBAPI_BASE}{path}",
        data=json.dumps(data).encode(),
        method="PUT",
        headers={
            "Authorization": f"Bearer {HERBAPI_TOKEN}",
            "Content-Type": "application/json",
            "Accept": "application/json",
        },
    )
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read().decode())
|
||||
|
||||
|
||||
def fetch_naturadb(latin_name):
    """Fetch the NaturaDB page for a Latin name.

    The page slug is the lower-cased name with spaces replaced by hyphens.
    Returns the HTML as text, or None on any failure. A 404 is silent; other
    HTTP errors and exceptions are printed before returning None.
    """
    url = f"{NATURADB_BASE}/{latin_name.lower().replace(' ', '-')}/"
    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        with urllib.request.urlopen(req, timeout=15) as resp:
            return resp.read().decode("utf-8", errors="replace")
    except urllib.error.HTTPError as e:
        if e.code != 404:
            print(f" HTTP {e.code} for {url}")
    except Exception as e:
        print(f" Error fetching {url}: {e}")
    return None
|
||||
|
||||
|
||||
def extract_td_value(html, label):
    """Return the text of the cell following ``<td>label</td>``, or None.

    The label cell must be a bare ``<td>`` and may end with a colon; the value
    cell may carry attributes. Tags inside the value are removed and the
    result is stripped.
    """
    cell = re.search(
        rf"<td>{re.escape(label)}:?</td>\s*<td[^>]*>(.*?)</td>", html, re.DOTALL
    )
    if cell is None:
        return None
    return re.sub(r"<[^>]+>", "", cell.group(1)).strip()
|
||||
|
||||
|
||||
def extract_native_status(html):
    """Return the native-status labels found in the large coloured chips.

    Only chips whose text is one of the recognised status labels are kept,
    in page order.
    """
    recognised = (
        "heimische Wildform",
        "Archäophyt",
        "Neophyt",
        "nicht heimisch (Neophyt)",
    )
    chip_pattern = r'chip--large\s+chip--no-border\s+clr-white\s+bg-\w+"[^>]*>([^<]+)'
    labels = (hit.group(1).strip() for hit in re.finditer(chip_pattern, html))
    return [label for label in labels if label in recognised]
|
||||
|
||||
|
||||
def extract_badge_tags(html):
    """Return the text of the large plain-text chips on the page.

    Empty chips and the "winterhart" chip are left out.
    """
    chip_pattern = r'chip--large\s+clr-text"[^>]*>([^<]+)'
    texts = (hit.group(1).strip() for hit in re.finditer(chip_pattern, html))
    return [text for text in texts if text and text != "winterhart"]
|
||||
|
||||
|
||||
def parse_count(text):
    """Return the integer that starts *text* (e.g. '82 (Nektar ...)' -> 82).

    Returns None for empty input or when the text does not begin with digits.
    """
    if not text:
        return None
    leading = re.match(r"(\d+)", text.strip())
    if leading is None:
        return None
    return int(leading.group(1))
|
||||
|
||||
|
||||
def parse_specialist_count(text):
    """Return N from a phrase like '39 (davon 5 spezialisiert)' (here 5).

    Returns None for empty input or when the phrase is absent.
    """
    if not text:
        return None
    hit = re.search(r"davon\s+(\d+)\s+spezialisiert", text)
    if hit is None:
        return None
    return int(hit.group(1))
|
||||
|
||||
|
||||
def parse_nectar_pollen(text):
    """Return the score from a rating such as '2/4 - mäßig' (here 2).

    Returns None for empty input or when the text does not start with 'N/4'.
    """
    if not text:
        return None
    rating = re.match(r"(\d+)/4", text.strip())
    if rating is None:
        return None
    return int(rating.group(1))
|
||||
|
||||
|
||||
def build_wildlife_value(data):
    """Render the scraped wildlife data as one English summary string.

    Sentences are emitted in a fixed order: nectar/pollen ratings, wild bees,
    butterflies (with caterpillar-host detail), hoverflies, beetles, birds,
    mammals, native status, notable badge tags. Entries that are missing or
    None are skipped. Returns None when there is nothing to report.
    """
    sentences = []

    # Nectar and pollen
    ratings = [
        f"{title}: {data[key]}/4"
        for title, key in (("Nectar", "nectar"), ("Pollen", "pollen"))
        if data.get(key) is not None
    ]
    if ratings:
        sentences.append(", ".join(ratings) + ".")

    # Wild bees
    bees = data.get("wildbienen_count")
    if bees is not None:
        line = f"Supports {bees} wild bee species"
        bee_specialists = data.get("wildbienen_specialists")
        if bee_specialists is not None:
            line += f" ({bee_specialists} specialists)"
        sentences.append(line + ".")

    # Butterflies / moths; caterpillar detail is only reported alongside them.
    butterflies = data.get("schmetterlinge_count")
    if butterflies is not None:
        line = f"{butterflies} butterfly/moth species"
        hosts = data.get("raupen_count")
        if hosts is not None:
            host_specialists = data.get("raupen_specialists")
            detail = "" if host_specialists is None else f" ({host_specialists} specialized)"
            line += f", {hosts} as caterpillar host{detail}"
        sentences.append(line + ".")

    # Hoverflies, beetles, birds, mammals share one sentence shape.
    simple_counts = (
        ("schwebfliegen_count", "hoverfly"),
        ("kaefer_count", "beetle"),
        ("vogelarten_count", "bird"),
        ("saeugetier_count", "mammal"),
    )
    for key, noun in simple_counts:
        if data.get(key) is not None:
            sentences.append(f"{data[key]} {noun} species.")

    # Native status
    if data.get("native_status"):
        sentences.append(" ".join(data["native_status"]) + ".")

    # Notable badges
    keywords = (
        "insektenpflanze",
        "raupenfutter",
        "vogelschutz",
        "vogelnähr",
        "bienenweide",
    )
    notable = [
        badge
        for badge in data.get("badges", [])
        if any(keyword in badge.lower() for keyword in keywords)
    ]
    if notable:
        sentences.append("Tags: " + ", ".join(notable) + ".")

    if not sentences:
        return None
    return " ".join(sentences)
|
||||
|
||||
|
||||
def scrape_species(html):
    """Parse a NaturaDB page into a dict of wildlife figures.

    Each table value is read with ``extract_td_value`` and converted by the
    matching ``parse_*`` helper; values that are absent come back as None.
    ``native_status`` and ``badges`` are lists taken from the chip badges.
    """
    def cell(label):
        return extract_td_value(html, label)

    nectar_cell = cell("Nektarwert")
    pollen_cell = cell("Pollenwert")
    bee_cell = cell("Wildbienen")
    butterfly_cell = cell("Schmetterlinge")
    caterpillar_cell = cell("Raupen")
    hoverfly_cell = cell("Schwebfliegen")
    beetle_cell = cell("Käfer")
    bird_cell = cell("fressende Vogelarten")
    mammal_cell = cell("fressende Säugetierarten")

    return {
        "nectar": parse_nectar_pollen(nectar_cell),
        "pollen": parse_nectar_pollen(pollen_cell),
        "wildbienen_count": parse_count(bee_cell),
        "wildbienen_specialists": parse_specialist_count(bee_cell),
        "schmetterlinge_count": parse_count(butterfly_cell),
        "raupen_count": parse_count(caterpillar_cell),
        "raupen_specialists": parse_specialist_count(caterpillar_cell),
        "schwebfliegen_count": parse_count(hoverfly_cell),
        "kaefer_count": parse_count(beetle_cell),
        "vogelarten_count": parse_count(bird_cell),
        "saeugetier_count": parse_count(mammal_cell),
        "native_status": extract_native_status(html),
        "badges": extract_badge_tags(html),
    }
|
||||
|
||||
|
||||
def has_any_data(data):
    """Report whether the scraped dict holds at least one real value.

    The list-valued entries ``native_status`` and ``badges`` count only when
    non-empty; every other entry counts when it is not None.
    """
    list_fields = ("native_status", "badges")
    return any(
        bool(value) if key in list_fields else value is not None
        for key, value in data.items()
    )
|
||||
|
||||
|
||||
def main():
    """Fill in ``wildlife_value`` for HerbAPI species from NaturaDB pages.

    Species that already have a wildlife value are skipped. For the rest the
    NaturaDB page is fetched (with a ``DELAY`` pause after each fetch) and
    parsed, and the resulting summary is written back by GETting the full
    species record and PUTting it with the new value. One status line is
    printed per species, followed by a totals block.
    """
    print("Fetching species list from HerbAPI...")
    species_list = api_get("/species?per_page=200")["data"]
    total = len(species_list)
    print(f"Found {total} species.\n")

    counts = {
        "enriched": 0,
        "has_data": 0,
        "not_found": 0,
        "no_data": 0,
        "errors": 0,
    }

    for number, sp in enumerate(species_list, start=1):
        slug = sp["slug"]
        name = sp["name_scientific"]
        prefix = f"[{number:3d}/{total}] {slug:40s} "

        # Only enrich if wildlife_value is empty/null
        if sp.get("wildlife_value"):
            print(prefix + "SKIP (already has data)")
            counts["has_data"] += 1
            continue

        print(prefix, end="", flush=True)

        # Fetch NaturaDB page
        html = fetch_naturadb(name)
        time.sleep(DELAY)
        if html is None:
            print("NOT FOUND on NaturaDB")
            counts["not_found"] += 1
            continue

        # Parse wildlife data
        data = scrape_species(html)
        if not has_any_data(data):
            print("no wildlife data on page")
            counts["no_data"] += 1
            continue

        # Build wildlife_value string
        wildlife_value = build_wildlife_value(data)
        if not wildlife_value:
            print("no wildlife data extracted")
            counts["no_data"] += 1
            continue

        # GET full species, merge, PUT back
        try:
            full = api_get(f"/species/{slug}")
            full["wildlife_value"] = wildlife_value

            # Remove read-only / computed fields that the PUT endpoint might reject
            for readonly in ("created_at", "updated_at", "family"):
                full.pop(readonly, None)

            api_put(f"/species/{full['id']}", full)
            print(f"ENRICHED -> {wildlife_value[:80]}...")
            counts["enriched"] += 1
        except Exception as e:
            print(f"API ERROR: {e}")
            counts["errors"] += 1

    print("\n" + "=" * 70)
    print("DONE. Results:")
    print(f" Enriched: {counts['enriched']}")
    print(f" Already had data: {counts['has_data']}")
    print(f" Not on NaturaDB: {counts['not_found']}")
    print(f" No wildlife data: {counts['no_data']}")
    print(f" Errors: {counts['errors']}")
    print(f" Total: {total}")


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,560 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Scrape cultivar data from Reinsaat (reinsaat.at) and push into HerbAPI.
|
||||
|
||||
Strategy:
|
||||
1. Fetch category pages, recursively discover product pages via JSON-LD detection
|
||||
2. Extract structured data from JSON-LD Product schema + HTML text for growing data
|
||||
3. Match Latin names to existing species in the API
|
||||
4. Create cultivar records and link them to Reinsaat supplier
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
import ssl
|
||||
import time
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
from html.parser import HTMLParser
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
# ── Config ──────────────────────────────────────────────────────────────────
# NOTE(review): AUTH_TOKEN is a credential committed in source.
API_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
AUTH_TOKEN = "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
REINSAAT_SUPPLIER_ID = "019ced24-1702-72d1-9acc-90435441a5c4"
DELAY = 0.5  # seconds between requests
USER_AGENT = "HerbAPI-Scraper/1.0 (florian.berthold@sub-net.at)"

# ── Categories to scrape ────────────────────────────────────────────────────
# Each entry is (category_url, default_species_hint); the hint applies to leaf
# pages in that category and is None when there is no single default species.
_SHOP_ROOT = "https://www.reinsaat.at/shop/DE/"
CATEGORIES = [
    (_SHOP_ROOT + "tomaten_paradeiser/", "Solanum lycopersicum"),
    (_SHOP_ROOT + "kuechen-_und_gewuerzkraeuter/", None),
    (_SHOP_ROOT + "kuerbis/", None),
    (_SHOP_ROOT + "zucchini/", "Cucurbita pepo"),
    (_SHOP_ROOT + "bohnen/", None),
    (_SHOP_ROOT + "karotten_moehren_1/", "Daucus carota"),
    (_SHOP_ROOT + "rote_ruebe/", "Beta vulgaris"),
    (_SHOP_ROOT + "blumen_und_heilkraeuter/", None),
]

# ── Known Latin name genera we can match ────────────────────────────────────
# Joined with '|' so the string can be dropped into a regex alternation.
KNOWN_GENERA = "|".join((
    "Solanum", "Cucurbita", "Vicia", "Phaseolus", "Glycine", "Daucus", "Beta",
    "Borago", "Lavandula", "Salvia", "Melissa", "Thymus", "Calendula", "Allium",
    "Ocimum", "Satureja", "Origanum", "Anethum", "Foeniculum", "Carum",
    "Nigella", "Levisticum", "Rumex", "Majorana", "Hyssopus", "Coriandrum",
    "Petroselinum", "Eruca", "Tropaeolum", "Lupinus", "Helianthus", "Tagetes",
    "Zinnia", "Cosmos", "Papaver", "Centaurea", "Matricaria", "Chrysanthemum",
    "Antirrhinum", "Lathyrus", "Ipomoea", "Phacelia", "Trifolium", "Symphytum",
    "Urtica", "Fragaria", "Sambucus",
))

# Genus + lower-case epithet, optionally followed by the author "L." and a
# ssp./var./subsp. rank with its own epithet.
LATIN_PATTERN = re.compile(
    r'((?:' + KNOWN_GENERA + r')\s+[a-z]+(?:\s+L\.?)?(?:\s+(?:ssp|var|subsp)\.\s+[a-z]+)?)'
)
|
||||
|
||||
|
||||
# ── HTML helpers ────────────────────────────────────────────────────────────
|
||||
class TextExtractor(HTMLParser):
    """HTML parser that collects visible text fragments in ``self.parts``.

    Text inside ``<script>``, ``<style>`` and ``<noscript>`` elements is
    ignored; every other non-blank text node is stored stripped, in document
    order.
    """

    def __init__(self):
        super().__init__()
        self.parts = []
        # Nesting depth of the elements whose text must be ignored.
        self._skip = 0

    def handle_starttag(self, tag, attrs):
        """Enter an ignored element."""
        if tag in ("script", "style", "noscript"):
            self._skip += 1

    def handle_endtag(self, tag):
        """Leave an ignored element; the depth never drops below zero."""
        if self._skip > 0 and tag in ("script", "style", "noscript"):
            self._skip -= 1

    def handle_data(self, data):
        """Record a text node unless it is blank or inside an ignored element."""
        if self._skip != 0:
            return
        chunk = data.strip()
        if chunk:
            self.parts.append(chunk)
|
||||
|
||||
|
||||
def extract_links(html: str, base_url: str) -> list[str]:
    """Return the distinct absolute link targets of *html*, in first-seen order.

    Only double-quoted ``href`` attributes of ``<a>`` tags are considered.
    Empty hrefs, fragment-only links and ``javascript:`` links are skipped;
    everything else is resolved against *base_url*.
    """
    ordered = {}  # dict keys keep first-seen order and drop duplicates
    for anchor in re.finditer(r'<a\s[^>]*href="([^"]*)"', html, re.IGNORECASE):
        href = anchor.group(1)
        if not href or href.startswith(("#", "javascript:")):
            continue
        ordered.setdefault(urllib.parse.urljoin(base_url, href), None)
    return list(ordered)
|
||||
|
||||
|
||||
def extract_jsonld_product(html: str) -> Optional[dict]:
    """Extract the JSON-LD Product object from HTML, if present.

    Every ``<script type="application/ld+json">`` element is parsed in
    document order. Besides a plain top-level Product object, a top-level
    JSON array, an ``@graph`` container and a list-valued ``@type`` are
    understood. Script blocks that are not valid JSON are ignored.

    Args:
        html: Raw HTML markup of the page.

    Returns:
        The first object typed "Product", or None when there is none.
    """
    def _is_product(node) -> bool:
        if not isinstance(node, dict):
            return False
        node_type = node.get("@type")
        if isinstance(node_type, list):
            return "Product" in node_type
        return node_type == "Product"

    for m in re.finditer(
        r'<script[^>]*type=["\']application/ld\+json["\'][^>]*>(.*?)</script>',
        html, re.DOTALL | re.IGNORECASE,
    ):
        try:
            data = json.loads(m.group(1))
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue

        top_level = data if isinstance(data, list) else [data]
        candidates = []
        for node in top_level:
            candidates.append(node)
            # Several entities may be bundled under "@graph".
            if isinstance(node, dict) and isinstance(node.get("@graph"), list):
                candidates.extend(node["@graph"])

        for node in candidates:
            if _is_product(node):
                return node
    return None
|
||||
|
||||
|
||||
# ── HTTP helpers ────────────────────────────────────────────────────────────
|
||||
_ssl_ctx = ssl.create_default_context()


def fetch_url(url: str, retries: int = 2) -> str:
    """Fetch a URL with retries and return the response body as text.

    Args:
        url: Absolute URL to request.
        retries: Number of additional attempts after the first failure.

    Returns:
        The body decoded with the charset announced by the server (UTF-8
        when none is announced). Undecodable bytes are replaced instead of
        aborting the whole page.

    Raises:
        urllib.error.HTTPError: Immediately for client errors that a retry
            cannot fix (4xx other than 408/429), otherwise after the last
            attempt.
        urllib.error.URLError, TimeoutError: After the last attempt.
    """
    req = urllib.request.Request(url, headers={
        "User-Agent": USER_AGENT,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "de-AT,de;q=0.9,en;q=0.5",
    })
    for attempt in range(retries + 1):
        try:
            with urllib.request.urlopen(req, timeout=30, context=_ssl_ctx) as resp:
                charset = resp.headers.get_content_charset() or "utf-8"
                body = resp.read()
        except urllib.error.HTTPError as e:
            # HTTPError subclasses URLError, so it has to be handled first.
            # A 404/403 will not change on retry; 408 and 429 are transient.
            permanent = 400 <= e.code < 500 and e.code not in (408, 429)
            if permanent or attempt >= retries:
                raise
        except (urllib.error.URLError, TimeoutError):
            if attempt >= retries:
                raise
        else:
            try:
                return body.decode(charset, errors="replace")
            except LookupError:
                # The server announced a charset Python does not know.
                return body.decode("utf-8", errors="replace")
        time.sleep(2)
    # Only reachable when ``retries`` is negative and no attempt was made.
    return ""
|
||||
|
||||
|
||||
def api_get(path: str):
    """Send an authenticated GET to HerbAPI and return the decoded JSON body.

    ``path`` is appended verbatim to ``API_BASE``. Errors raised by urllib or
    by JSON decoding propagate to the caller.
    """
    request_headers = {
        "Authorization": f"Bearer {AUTH_TOKEN}",
        "Accept": "application/json",
    }
    request = urllib.request.Request(f"{API_BASE}{path}", headers=request_headers)
    with urllib.request.urlopen(request, timeout=15) as response:
        raw = response.read()
    return json.loads(raw)
|
||||
|
||||
|
||||
def api_post(path: str, data: dict):
    """Send ``data`` as a JSON POST to HerbAPI and return the decoded reply.

    On an HTTP error status the first 500 characters of the error body are
    printed and the ``HTTPError`` is re-raised unchanged.
    """
    request_headers = {
        "Authorization": f"Bearer {AUTH_TOKEN}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    }
    request = urllib.request.Request(
        f"{API_BASE}{path}",
        data=json.dumps(data).encode("utf-8"),
        headers=request_headers,
        method="POST",
    )
    try:
        with urllib.request.urlopen(request, timeout=15) as response:
            raw = response.read()
    except urllib.error.HTTPError as err:
        detail = err.read().decode("utf-8", errors="replace")
        print(f" API ERROR {err.code}: {detail[:500]}")
        raise
    return json.loads(raw)
|
||||
|
||||
|
||||
# ── Species matching ────────────────────────────────────────────────────────
|
||||
def load_species() -> dict:
    """Load species from API. Returns dict: lowercase scientific name -> species dict.

    Pages of 100 records are requested until the response says there are no
    more. Two response envelopes are understood:

    * ``{"data": [...], "pagination": {"total_pages": N}}``
    * ``{"data": [...], "total": T, "per_page": P}`` (the envelope the
      cultivar listing in ``main`` is written against)

    A bare list, or a dict carrying neither hint, is treated as a single
    page. An empty page always ends the loop.
    """
    result = {}
    page = 1
    while True:
        data = api_get(f"/species?per_page=100&page={page}")
        species_list = data.get("data", data) if isinstance(data, dict) else data
        if not species_list:
            break
        for s in species_list:
            key = s["name_scientific"].lower().strip()
            result[key] = s

        if not isinstance(data, dict):
            break
        if "pagination" in data:
            last_page = page >= data["pagination"].get("total_pages", 1)
        elif "total" in data:
            per_page = data.get("per_page", 100) or 100
            last_page = page * per_page >= data["total"]
        else:
            last_page = True
        if last_page:
            break
        page += 1
    return result
|
||||
|
||||
|
||||
def match_species(latin_name: str, species_map: dict) -> Optional[dict]:
    """Find the species entry that best fits a scraped Latin name.

    ``species_map`` is keyed by lower-cased scientific name. Lookup order:

    1. the cleaned name (trailing author abbreviation and an infraspecific
       "ssp./subsp./var. <name>" part removed),
    2. its first two words,
    3. the first key that starts with the same genus (least reliable).

    Returns the matching value, or None when nothing fits or the name is
    empty.
    """
    if not latin_name:
        return None

    cleaned = latin_name.strip()
    for junk in (
        r'\s+L\.\s*$',                      # trailing "L."
        r'\s+[A-Z][a-z]*\.\s*$',            # any other trailing author abbreviation
        r'\s+(?:ssp|subsp|var)\.\s+\S+',    # infraspecific rank and its name
    ):
        cleaned = re.sub(junk, '', cleaned)

    lookup = cleaned.lower().strip()
    words = lookup.split()

    candidates = [lookup]
    if len(words) >= 2:
        candidates.append(f"{words[0]} {words[1]}")
    for candidate in candidates:
        if candidate in species_map:
            return species_map[candidate]

    if not words:
        return None

    # Genus-only fallback: first entry (in map order) of the same genus.
    genus_prefix = words[0] + " "
    return next(
        (entry for name, entry in species_map.items() if name.startswith(genus_prefix)),
        None,
    )
|
||||
|
||||
|
||||
# ── Product data extraction ─────────────────────────────────────────────────
|
||||
@dataclass
class ProductData:
    """Values scraped from a single product page.

    Numeric growing parameters stay ``None`` when the page does not state
    them.
    """

    # Filled from the page's JSON-LD product object.
    name: str = ""
    # Botanical name found in the page text, or the category's default.
    latin_name: str = ""
    description: str = ""
    sku: str = ""
    # Address of the product page.
    url: str = ""
    is_organic: bool = True
    # Growing parameters parsed from the page text.
    sowing_depth_cm: Optional[float] = None
    row_spacing_cm: Optional[float] = None
    plant_spacing_cm: Optional[float] = None
    germination_temp_c: Optional[float] = None
    perennial: bool = False
|
||||
|
||||
|
||||
def parse_product(html: str, url: str, default_species: Optional[str] = None) -> Optional[ProductData]:
    """Parse a product page. Returns ProductData or None if not a product page.

    A page counts as a product page when it carries a JSON-LD Product object.
    Name, description and SKU come from that object; everything else is
    pattern-matched against the page's visible text.

    Args:
        html: Raw HTML of the page.
        url: Address the page was fetched from (stored on the result).
        default_species: Latin name to use when none is found in the page.

    Returns:
        The populated ProductData, or None when no JSON-LD Product exists.
    """
    jsonld = extract_jsonld_product(html)
    if not jsonld:
        return None  # Not a product page

    def _jsonld_text(key: str) -> str:
        # JSON-LD values may be null or non-strings; never call .strip() on those.
        value = jsonld.get(key)
        return "" if value is None else str(value).strip()

    def _num(text: str) -> float:
        # German pages write decimals with a comma.
        return float(text.replace(",", "."))

    product = ProductData(url=url)

    # ── From JSON-LD ──
    product.name = _jsonld_text("name")
    product.description = _jsonld_text("description")
    product.sku = _jsonld_text("model")

    # ── Extract full text for pattern matching ──
    extractor = TextExtractor()
    extractor.feed(html)
    full_text = " ".join(extractor.parts)

    # ── Latin name: visible text first, then <i>/<em> contents, then default ──
    latin = LATIN_PATTERN.search(full_text)
    if latin:
        product.latin_name = latin.group(1).strip()
    if not product.latin_name:
        for italic in re.finditer(r'<(?:i|em)[^>]*>(.*?)</(?:i|em)>', html, re.IGNORECASE | re.DOTALL):
            inner = re.sub(r'<[^>]+>', '', italic.group(1)).strip()
            hit = LATIN_PATTERN.search(inner)
            if hit:
                product.latin_name = hit.group(1).strip()
                break
    if not product.latin_name and default_species:
        product.latin_name = default_species

    # ── Sowing depth (a range is stored as its midpoint) ──
    depth_pats = [
        r'(?:Saattiefe|Aussaattiefe|Ablagetiefe)[:\s]*(?:ca\.?\s*)?(\d+(?:[.,]\d+)?)\s*[-–]\s*(\d+(?:[.,]\d+)?)\s*cm',
        r'(?:Saattiefe|Aussaattiefe|Ablagetiefe)[:\s]*(?:ca\.?\s*)?(\d+(?:[.,]\d+)?)\s*cm',
        r'(\d+(?:[.,]\d+)?)\s*[-–]\s*(\d+(?:[.,]\d+)?)\s*cm\s+(?:tief|Tiefe)',
    ]
    for pat in depth_pats:
        dm = re.search(pat, full_text, re.IGNORECASE)
        if dm:
            vals = [_num(g) for g in dm.groups()]
            product.sowing_depth_cm = sum(vals) / len(vals)
            break

    # Fallback: look in raw HTML for depth ranges like "0,5–1 cm" near depth keywords
    if product.sowing_depth_cm is None:
        dm = re.search(
            r'(?:Saattiefe|Ablagetiefe|Aussaattiefe|Saatgutablage)\D{0,30}?(\d+(?:[.,]\d+)?)\s*[-–]\s*(\d+(?:[.,]\d+)?)\s*cm',
            html, re.IGNORECASE
        )
        if dm:
            product.sowing_depth_cm = (_num(dm.group(1)) + _num(dm.group(2))) / 2

    # ── Spacing: "ROW x PLANT cm" ──
    spacing_pats = [
        # "30–40 x 2–4 cm" (range x range)
        r'(\d+)\s*[-–]\s*(\d+)\s*[x×]\s*(\d+)\s*[-–]\s*(\d+)\s*cm',
        # "100 x 50 cm" (simple)
        r'(\d+(?:[.,]\d+)?)\s*[x×]\s*(\d+(?:[.,]\d+)?)\s*cm',
    ]
    for pat in spacing_pats:
        found = re.findall(pat, full_text, re.IGNORECASE)
        if found:
            # Prefer the last match (often the more relevant outdoor spacing)
            groups = found[-1]
            if len(groups) == 4:
                product.row_spacing_cm = (float(groups[0]) + float(groups[1])) / 2
                product.plant_spacing_cm = (float(groups[2]) + float(groups[3])) / 2
            elif len(groups) == 2:
                product.row_spacing_cm = _num(groups[0])
                product.plant_spacing_cm = _num(groups[1])
            break

    # ── Germination temperature ──
    temp_pats = [
        r'(?:Keimtemperatur|Keimtemp)[.:\s]*(?:ca\.?\s*)?(\d+)\s*[-–]\s*(\d+)\s*°?\s*C',
        r'(\d+)\s*[-–und ]*\s*(\d+)\s*°\s*C',
        r'(?:mindestens|mind\.)\s*(\d+)\s*°\s*C',
    ]
    for pat in temp_pats:
        tm = re.search(pat, full_text, re.IGNORECASE)
        if tm:
            vals = [float(g) for g in tm.groups()]
            avg = sum(vals) / len(vals)
            # Sanity check: only 5-40 °C is accepted. An implausible hit does
            # not end the search; the next pattern still gets its chance.
            if 5 <= avg <= 40:
                product.germination_temp_c = avg
                break

    # ── Perennial ──
    perennial_pats = [r'mehrj[aä]hrig', r'winterhart', r'ausdauernd', r'Halbstrauch', r'Staude']
    if any(re.search(pat, full_text, re.IGNORECASE) for pat in perennial_pats):
        product.perennial = True

    return product
|
||||
|
||||
|
||||
# ── Recursive product discovery ─────────────────────────────────────────────
|
||||
def discover_products(
    category_url: str,
    default_species: Optional[str],
    max_depth: int = 3,
    _depth: int = 0,
    _visited: set = None,
) -> list[ProductData]:
    """Walk a category tree depth-first and return every product page parsed.

    A fetched page that parses as a product ends the descent on that branch.
    Otherwise only links exactly one path segment below the current page, on
    www.reinsaat.at, are followed. ``_depth`` and ``_visited`` carry the
    recursion state; ``_visited`` may be shared between calls so no URL is
    fetched twice.
    """
    _visited = set() if _visited is None else _visited
    if _depth > max_depth or category_url in _visited:
        return []
    _visited.add(category_url)

    pad = " " * (_depth + 1)
    print(f"{pad}Fetching: {category_url}")

    try:
        html = fetch_url(category_url)
        time.sleep(DELAY)
    except Exception as e:
        print(f"{pad} ERROR: {e}")
        return []

    # The page itself may already be a product page.
    product = parse_product(html, category_url, default_species)
    if product:
        return [product]

    # Otherwise treat it as a (sub)category and collect direct children.
    prefix = urllib.parse.urlparse(category_url).path.rstrip("/") + "/"
    children = {}  # insertion-ordered set of normalised child URLs
    for link in extract_links(html, category_url):
        target = urllib.parse.urlparse(link)
        if target.netloc and target.netloc != "www.reinsaat.at":
            continue
        path = target.path.rstrip("/")
        if not path.startswith(prefix):
            continue
        tail = path[len(prefix):]
        # Keep only paths exactly one level deeper than the category.
        if not tail or "/" in tail:
            continue
        candidate = f"https://www.reinsaat.at{path}/"
        if candidate not in _visited:
            children[candidate] = None

    child_links = list(children)
    print(f"{pad} Found {len(child_links)} child links")

    products = []
    for child_url in child_links:
        products.extend(
            discover_products(child_url, default_species, max_depth, _depth + 1, _visited)
        )
    return products
|
||||
|
||||
|
||||
# ── Main ────────────────────────────────────────────────────────────────────
|
||||
def _load_existing_cultivars() -> dict:
    """Page through /cultivars; map (species_id, lower-cased name) -> cultivar id."""
    existing = {}
    page = 1
    while True:
        data = api_get(f"/cultivars?per_page=100&page={page}")
        clist = data.get("data", data) if isinstance(data, dict) else data
        if not clist:
            break
        for c in clist:
            existing[(c["species_id"], c["name"].lower())] = c["id"]
        # Check pagination - API uses {data, total, page, per_page} format
        if not isinstance(data, dict):
            break
        total = data.get("total", len(clist))
        per_page = data.get("per_page", 100)
        if page * per_page >= total:
            break
        page += 1
    return existing


def _link_supplier(cultivar_id, product, failure_note: str) -> bool:
    """POST the Reinsaat supplier link for a cultivar; report and return success."""
    try:
        api_post(f"/cultivars/{cultivar_id}/suppliers", {
            "supplier_id": REINSAAT_SUPPLIER_ID,
            "product_url": product.url,
            "article_number": product.sku,
        })
    except Exception as e:
        # Best effort: one failed link must not abort the whole run, but it
        # must not vanish silently either.
        print(f" {failure_note}: {e}")
        return False
    print(f" Linked to Reinsaat (SKU: {product.sku})")
    return True


def _build_cultivar_payload(species_id, product) -> dict:
    """Build the POST /cultivars body; optional numbers are sent only when known."""
    payload = {
        "species_id": species_id,
        "name": product.name,
        "name_de": product.name,
        "name_en": "",
        "description": product.description,
        "is_organic": product.is_organic,
        "perennial": product.perennial,
    }
    if product.sowing_depth_cm is not None:
        payload["planting_depth_cm"] = round(product.sowing_depth_cm, 2)
    if product.row_spacing_cm is not None:
        payload["row_spacing_cm"] = round(product.row_spacing_cm, 1)
    if product.plant_spacing_cm is not None:
        payload["plant_spacing_cm"] = round(product.plant_spacing_cm, 1)
    if product.germination_temp_c is not None:
        payload["germination_temp_c"] = round(product.germination_temp_c, 1)
    return payload


def main():
    """Scrape all configured Reinsaat categories and sync them into HerbAPI.

    Steps: load species, load existing cultivars, crawl the categories,
    then create a cultivar for each new product and link it to the Reinsaat
    supplier. Products whose species cannot be matched, or that have no name,
    are skipped. A summary of the counters is printed at the end.
    """
    print("=" * 70)
    print("Reinsaat Scraper -> HerbAPI")
    print("=" * 70)

    # Load species
    print("\n[1] Loading species from API...")
    species_map = load_species()
    sci_names = [k for k in species_map if " " in k]
    print(f" {len(sci_names)} species loaded:")
    for k in sorted(sci_names):
        s = species_map[k]
        print(f" {s['name_scientific']:40s} {s['id'][:12]}...")

    # Load existing cultivars
    print("\n[2] Loading existing cultivars...")
    existing_cultivars = _load_existing_cultivars()  # (species_id, name_lower) -> cultivar_id
    print(f" {len(existing_cultivars)} existing cultivars")

    # Discover products from all categories
    print("\n[3] Discovering products from Reinsaat categories...")
    all_products: list[ProductData] = []
    visited: set[str] = set()

    for cat_url, species_hint in CATEGORIES:
        print(f"\n Category: {cat_url}")
        products = discover_products(cat_url, species_hint, max_depth=3, _visited=visited)
        all_products.extend(products)
        print(f" -> {len(products)} products from this category")

    print(f"\n Total products discovered: {len(all_products)}")

    # Deduplicate by URL, keeping the first occurrence
    seen_urls = set()
    unique_products = []
    for p in all_products:
        if p.url not in seen_urls:
            seen_urls.add(p.url)
            unique_products.append(p)
    all_products = unique_products
    print(f" Unique products: {len(all_products)}")

    # Process products
    print("\n[4] Creating cultivars in API...")
    stats = {"created": 0, "skipped_no_species": 0, "skipped_exists": 0, "errors": 0, "linked": 0}

    for i, product in enumerate(all_products):
        pct = (i + 1) / len(all_products) * 100
        print(f"\n [{i+1}/{len(all_products)}] ({pct:.0f}%) {product.name}")

        # A cultivar without a name is useless and would collide on the
        # (species_id, "") key, so it is never sent to the API.
        if not product.name:
            print(f" Skip: product page without a name ({product.url})")
            stats["errors"] += 1
            continue

        # Match species
        species = match_species(product.latin_name, species_map)
        if not species:
            print(f" Skip: no species match for '{product.latin_name}'")
            stats["skipped_no_species"] += 1
            continue

        species_id = species["id"]
        print(f" Species: {species['name_scientific']}")
        print(f" SKU: {product.sku}, Depth: {product.sowing_depth_cm}, "
              f"Spacing: {product.row_spacing_cm}x{product.plant_spacing_cm}, "
              f"Temp: {product.germination_temp_c}, Perennial: {product.perennial}")

        # Check duplicates
        key = (species_id, product.name.lower())
        if key in existing_cultivars:
            # Still try to link supplier if cultivar exists
            cultivar_id = existing_cultivars[key]
            print(f" Exists: {cultivar_id[:12]}... - checking supplier link")
            if _link_supplier(cultivar_id, product,
                              "Supplier link not created (possibly already linked)"):
                stats["linked"] += 1
            stats["skipped_exists"] += 1
            continue

        # Create cultivar
        try:
            result = api_post("/cultivars", _build_cultivar_payload(species_id, product))
            cultivar_id = result["id"]
        except Exception as e:
            print(f" FAILED to create: {e}")
            stats["errors"] += 1
            continue
        print(f" Created: {cultivar_id}")
        stats["created"] += 1
        existing_cultivars[key] = cultivar_id

        # Link to supplier
        if _link_supplier(cultivar_id, product, "FAILED to link supplier"):
            stats["linked"] += 1

    # Summary
    print("\n" + "=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f" Created: {stats['created']}")
    print(f" Linked to supplier: {stats['linked']}")
    print(f" Skipped (no species): {stats['skipped_no_species']}")
    print(f" Skipped (exists): {stats['skipped_exists']}")
    print(f" Errors: {stats['errors']}")
    print("=" * 70)


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,770 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Reinsaat Scraper v2 — scrape ALL Reinsaat categories, match species by extracting
|
||||
genus+species from extended botanical names, create/enrich cultivars, link supplier.
|
||||
|
||||
Uses direct PostgreSQL access (psycopg2) for speed and reliability.
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
import ssl
|
||||
import sys
|
||||
import time
|
||||
import uuid
|
||||
import html as html_mod
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
# Unbuffered output
|
||||
sys.stdout.reconfigure(line_buffering=True)
|
||||
sys.stderr.reconfigure(line_buffering=True)
|
||||
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
|
||||
# ── Config ──────────────────────────────────────────────────────────────────
|
||||
import os

# Connection settings can be overridden through environment variables so that
# deployments do not depend on values stored in the repository. The literals
# below are only the fallbacks.
# NOTE(security): the fallback password is committed to version control; it
# should be rotated and then supplied via HERBAPI_DB_PASS only.
DB_HOST = os.environ.get("HERBAPI_DB_HOST", "10.31.3.90")
DB_NAME = os.environ.get("HERBAPI_DB_NAME", "herbapi")
DB_USER = os.environ.get("HERBAPI_DB_USER", "herbapi")
DB_PASS = os.environ.get("HERBAPI_DB_PASS", "_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj")

# Supplier id used when linking cultivars to Reinsaat.
REINSAAT_SUPPLIER_ID = "019ced24-1702-72d1-9acc-90435441a5c4"
# Pause in seconds after each successful page fetch.
DELAY = 0.3
USER_AGENT = "HerbAPI-Scraper/2.0 (florian.berthold@sub-net.at)"
|
||||
|
||||
# ── All Reinsaat categories ────────────────────────────────────────────────
|
||||
# Every category URL has the form <shop root>/<slug>/ ; only the slugs differ.
_SHOP_ROOT = "https://www.reinsaat.at/shop/DE/"

CATEGORIES = [
    f"{_SHOP_ROOT}{slug}/"
    for slug in (
        "bohnen",
        "erbsen",
        "gurken",
        "karotten_moehren_1",
        "knollenfenchel",
        "kohlgewaechse",
        "kuerbis",
        "mais",
        "mangold",
        "melanzani_1",
        "melone",
        "paprika",
        "pastinaken_1",
        "petersilie",
        "pfefferoni_chili",
        "porree",
        "radies_rettich",
        "rote_ruebe",
        "salate",
        "schwarzwurzeln",
        "sellerie",
        "spinat",
        "tomaten_paradeiser",
        "wurzelpetersilie_1",
        "zucchini",
        "zwiebel_knoblauch",
        "kuechen-_und_gewuerzkraeuter",
        "blumen_und_heilkraeuter",
        "gruenduengung",
    )
]
|
||||
|
||||
# ── HTTP ────────────────────────────────────────────────────────────────────
|
||||
_ssl_ctx = ssl.create_default_context()


def fetch_url(url: str, retries: int = 2) -> str:
    """Download ``url`` and return its body as text, retrying transient errors.

    Args:
        url: Absolute URL to request.
        retries: How many further attempts follow a failed one.

    Returns:
        The body decoded with the server-announced charset (UTF-8 if none).
        Bytes that cannot be decoded are replaced rather than raising.

    Raises:
        urllib.error.HTTPError: At once for 4xx statuses other than 408/429
            (a retry cannot help), otherwise once the attempts are used up.
        urllib.error.URLError, TimeoutError: Once the attempts are used up.
    """
    req = urllib.request.Request(url, headers={
        "User-Agent": USER_AGENT,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "de-AT,de;q=0.9,en;q=0.5",
    })
    for attempt in range(retries + 1):
        out_of_attempts = attempt >= retries
        try:
            with urllib.request.urlopen(req, timeout=30, context=_ssl_ctx) as resp:
                charset = resp.headers.get_content_charset() or "utf-8"
                body = resp.read()
        except urllib.error.HTTPError as e:
            # Must precede URLError (its base class). 408 and 429 are the
            # only client errors worth another attempt.
            if out_of_attempts or (400 <= e.code < 500 and e.code not in (408, 429)):
                raise
        except (urllib.error.URLError, TimeoutError):
            if out_of_attempts:
                raise
        else:
            try:
                return body.decode(charset, errors="replace")
            except LookupError:
                # Charset name unknown to Python: fall back to UTF-8.
                return body.decode("utf-8", errors="replace")
        time.sleep(2)
    # Reached only when ``retries`` is negative (no attempt was made).
    return ""
|
||||
|
||||
|
||||
# ── HTML parsing helpers ────────────────────────────────────────────────────
|
||||
def extract_links(html_text: str, base_url: str) -> list[str]:
    """Return the absolute targets of all ``<a href=...>`` links in the markup.

    Args:
        html_text: Raw HTML markup.
        base_url: URL of the page; relative hrefs are resolved against it.

    Returns:
        Unique absolute URLs in document order. Empty hrefs, pure fragments
        ("#...") and "javascript:" pseudo-links are left out.
    """
    links: list[str] = []
    seen: set[str] = set()
    # Single- and double-quoted values are accepted; whitespace is required
    # before ``href`` so that e.g. ``data-href`` does not match.
    pattern = r'<a\s+(?:[^>]*?\s)?href\s*=\s*(["\'])(.*?)\1'
    for m in re.finditer(pattern, html_text, re.IGNORECASE | re.DOTALL):
        # Attribute values are entity-encoded ("&amp;" stands for "&").
        href = html_mod.unescape(m.group(2)).strip()
        if not href or href.startswith("#") or href.lower().startswith("javascript:"):
            continue
        full = urllib.parse.urljoin(base_url, href)
        if full not in seen:
            seen.add(full)
            links.append(full)
    return links
|
||||
|
||||
|
||||
def extract_jsonld_product(html_text: str) -> Optional[dict]:
    """Return the first JSON-LD object typed "Product" found in the page.

    All ``<script type="application/ld+json">`` elements are inspected in
    document order. A top-level object, a top-level array, objects nested in
    an ``@graph`` list and a list-valued ``@type`` are all handled. Blocks
    containing invalid JSON are skipped.

    Args:
        html_text: Raw HTML markup of the page.

    Returns:
        The Product object as a dict, or None when the page has none.
    """
    def _nodes(data):
        # Yield the top-level object(s) followed by their @graph members.
        for node in (data if isinstance(data, list) else [data]):
            yield node
            if isinstance(node, dict) and isinstance(node.get("@graph"), list):
                yield from node["@graph"]

    def _is_product(node) -> bool:
        if not isinstance(node, dict):
            return False
        kind = node.get("@type")
        return "Product" in kind if isinstance(kind, list) else kind == "Product"

    for m in re.finditer(
        r'<script[^>]*type=["\']application/ld\+json["\'][^>]*>(.*?)</script>',
        html_text, re.DOTALL | re.IGNORECASE,
    ):
        try:
            data = json.loads(m.group(1))
        except ValueError:  # includes json.JSONDecodeError
            continue
        for node in _nodes(data):
            if _is_product(node):
                return node
    return None
|
||||
|
||||
|
||||
def html_to_text(html_text: str) -> str:
    """Reduce an HTML fragment to plain text.

    Tags are replaced by a space, character references are decoded, and
    every run of whitespace is collapsed to one space with the ends trimmed.
    """
    without_tags = re.sub(r'<[^>]+>', ' ', html_text)
    decoded = html_mod.unescape(without_tags)
    return re.sub(r'\s+', ' ', decoded).strip()
|
||||
|
||||
|
||||
def extract_botanical_name(html_text: str) -> str:
    """
    Find the botanical/Latin name on a product page.

    Sources, tried in this order:
      1. the content of the element with class "fce_shop_kurztext",
      2. the first <em> inside the "growingInfos" block that looks Latin,
      3. any <em>/<i> element in the page whose text starts like a Latin
         name and is shorter than 100 characters.

    The text is returned as found (authority names, infraspecific ranks etc.
    included); an empty string means nothing suitable was found.
    """
    def _looks_latin(candidate: str) -> bool:
        # Capitalised word followed by a word that starts lower-case.
        return bool(candidate) and re.search(r'[A-Z][a-z]+\s+[a-z]', candidate) is not None

    # 1. Short-text block
    short = re.search(r'class="fce_shop_kurztext"[^>]*>(.*?)</div>', html_text, re.DOTALL | re.IGNORECASE)
    if short:
        candidate = html_to_text(short.group(1)).strip()
        if _looks_latin(candidate):
            return candidate

    # 2. Emphasised text inside the growing information
    growing = re.search(r'class="growingInfos"[^>]*>(.*?)</div>', html_text, re.DOTALL | re.IGNORECASE)
    if growing:
        for em in re.finditer(r'<em>(.*?)</em>', growing.group(1), re.DOTALL):
            candidate = html_to_text(em.group(1)).strip()
            if _looks_latin(candidate):
                return candidate

    # 3. Any emphasised/italic text that begins like a Latin name
    for tag in re.finditer(r'<(?:em|i)>(.*?)</(?:em|i)>', html_text, re.DOTALL | re.IGNORECASE):
        candidate = html_to_text(tag.group(1)).strip()
        if candidate and len(candidate) < 100 and re.search(r'^[A-Z][a-z]+\s+[a-z]+', candidate):
            return candidate

    return ""
|
||||
|
||||
|
||||
def normalize_latin_name(raw: str) -> str:
    """
    Reduce an extended botanical name to "Genus species".

    Examples:
        "Pisum sativum L. convar. sat."          -> "Pisum sativum"
        "Capsicum annuum L."                     -> "Capsicum annuum"
        "Brassica oleracea L. convar. botrytis"  -> "Brassica oleracea"
        "Cucumis sativus"                        -> "Cucumis sativus"
        "Mentha × piperita"                      -> "Mentha x piperita"

    Input with fewer than two words, or whose first two words do not look
    like "Capitalised lower-case", is returned trimmed but otherwise as-is.
    """
    if not raw:
        return ""

    # Trim whitespace first, then stray punctuation at either end.
    name = raw.strip().strip(".,;:")
    words = name.split()
    if len(words) < 2:
        return name

    genus, second = words[0], words[1]

    # Hybrid notation: "Genus x epithet" / "Genus × epithet".
    if second in ("x", "×") and len(words) >= 3:
        return f"{genus} x {words[2]}"

    if genus[0].isupper() and second[0].islower():
        return f"{genus} {second}"

    return name  # Not parseable as a binomial
|
||||
|
||||
|
||||
# ── Calendar parsing ────────────────────────────────────────────────────────
|
||||
# Maps a lower-cased calendar row label (matched as a substring) to the field
# it fills. ORDER MATTERS for consumers that take the first matching key:
# the generic "pflanzung" must stay after the greenhouse ("gewächshaus") keys,
# otherwise a label such as "aussaat/ pflanzung gewächshaus" is picked up by
# "pflanzung" and filed under transplanting_months.
CALENDAR_ROW_TYPES = {
    "voranzucht": "indoor_sowing_months",
    "vorzucht": "indoor_sowing_months",
    "vorkultur": "indoor_sowing_months",
    "aussaat/ pflanzung freiland": "direct_sowing_months",
    "aussaat/pflanzung freiland": "direct_sowing_months",
    "aussaat freiland": "direct_sowing_months",
    "direktsaat": "direct_sowing_months",
    "pflanzung freiland": "transplanting_months",
    "aussaat/ pflanzung gewächshaus": "glasshouse_months",
    "aussaat/pflanzung gewächshaus": "glasshouse_months",
    "gewächshaus": "glasshouse_months",
    "pflanzung": "transplanting_months",
    "ernte": "harvesting_months",
}
|
||||
|
||||
|
||||
def parse_calendar(html_text: str) -> dict:
    """
    Parse the Reinsaat growing calendar table.

    The table is located via an element whose class starts with
    "rs-growing-time". Each row with a "type-lable" cell is mapped to a field
    through CALENDAR_ROW_TYPES; when several keys occur in the label the
    longest (most specific) one wins, so "aussaat/ pflanzung gewächshaus" is
    not claimed by the generic "pflanzung".

    Returns dict with keys like 'direct_sowing_months', 'harvesting_months'
    etc. Each value is a sorted list of month integers (1-12). Rows mapping
    to the same field are merged. An empty dict means no calendar was found.
    """
    result = {}

    cal_match = re.search(r'class="rs-growing-time[^"]*"(.*?)</table>', html_text, re.DOTALL)
    if not cal_match:
        return result

    # Rows may carry attributes (<tr class="...">), so do not insist on "<tr>".
    for row in re.findall(r'<tr\b[^>]*>(.*?)</tr>', cal_match.group(1), re.DOTALL):
        label_m = re.search(r'class="type-lable"[^>]*>(.*?)</td>', row, re.DOTALL)
        if not label_m:
            continue
        label = html_to_text(label_m.group(1)).strip().lower()

        matching = [pattern for pattern in CALENDAR_ROW_TYPES if pattern in label]
        if not matching:
            continue
        field_name = CALENDAR_ROW_TYPES[max(matching, key=len)]

        # One background-color per half-month cell is expected (24 per row);
        # the i-th colour found is attributed to month i // 2 + 1.
        # NOTE(review): this assumes every cell declares a background-color,
        # including inactive ones - confirm against the live markup.
        colors = re.findall(r'background-color:\s*([^;"]+)', row)

        active_months = set()
        for i, color in enumerate(colors):
            if color.strip().lower() in ("none", "transparent", ""):
                continue
            month = (i // 2) + 1
            if 1 <= month <= 12:
                active_months.add(month)

        if active_months:
            # Merge if same field already found (e.g. two sowing rows)
            result[field_name] = sorted(set(result.get(field_name, ())) | active_months)

    return result
|
||||
|
||||
|
||||
# ── Growing data extraction ─────────────────────────────────────────────────
|
||||
def extract_growing_data(html_text: str) -> dict:
    """Extract spacing, depth, germination temp from the growing text.

    Only the content of the element with class "growingInfos" is examined.

    Returns:
        A dict containing any of these keys, each only when found:
        ``planting_depth_cm`` (range midpoint, 2 decimals),
        ``row_spacing_cm`` / ``plant_spacing_cm`` (1 decimal, taken from the
        last "A x B cm" occurrence), ``germination_temp_c`` (1 decimal, only
        if within 5-40 °C) and ``perennial`` (True). Empty when the page has
        no growing information block.
    """
    data = {}

    gi = re.search(r'class="growingInfos"[^>]*>(.*?)</div>', html_text, re.DOTALL | re.IGNORECASE)
    if not gi:
        return data

    # Strip tags, decode entities (e.g. &ndash;, &deg;) and collapse whitespace
    # so the patterns below see plain text.
    raw_text = html_mod.unescape(re.sub(r'<[^>]+>', ' ', gi.group(1)))
    raw_text = re.sub(r'\s+', ' ', raw_text)

    def _num(text: str) -> float:
        # German pages write decimals with a comma.
        return float(text.replace(",", "."))

    # ── Sowing depth ──
    depth_pats = [
        r'(?:Saattiefe|Aussaattiefe|Ablagetiefe|Saatgutablage)[:\s]*(?:ca\.?\s*)?(\d+(?:[.,]\d+)?)\s*[-–]\s*(\d+(?:[.,]\d+)?)\s*cm',
        r'(?:Saattiefe|Aussaattiefe|Ablagetiefe|Saatgutablage)[:\s]*(?:ca\.?\s*)?(\d+(?:[.,]\d+)?)\s*cm',
    ]
    for pat in depth_pats:
        dm = re.search(pat, raw_text, re.IGNORECASE)
        if dm:
            vals = [_num(g) for g in dm.groups()]
            data["planting_depth_cm"] = round(sum(vals) / len(vals), 2)
            break

    # ── Spacing: "ROW x PLANT cm" ──
    spacing_pats = [
        # "30–45 x 3–5 cm" (range x range)
        r'(\d+)\s*[-–]\s*(\d+)\s*[x×]\s*(\d+)\s*[-–]\s*(\d+)\s*cm',
        # "100 x 50 cm" (simple)
        r'(\d+(?:[.,]\d+)?)\s*[x×]\s*(\d+(?:[.,]\d+)?)\s*cm',
    ]
    for pat in spacing_pats:
        found = re.findall(pat, raw_text, re.IGNORECASE)
        if found:
            groups = found[-1]  # prefer last match
            if len(groups) == 4:
                data["row_spacing_cm"] = round((float(groups[0]) + float(groups[1])) / 2, 1)
                data["plant_spacing_cm"] = round((float(groups[2]) + float(groups[3])) / 2, 1)
            elif len(groups) == 2:
                data["row_spacing_cm"] = round(_num(groups[0]), 1)
                data["plant_spacing_cm"] = round(_num(groups[1]), 1)
            break

    # ── Germination temperature ──
    temp_pats = [
        r'(?:Keimtemperatur|Keimtemp)[.:\s]*(?:ca\.?\s*)?(\d+)\s*[-–]\s*(\d+)\s*[°]?\s*C',
        r'(?:mindestens|mind\.)\s*(\d+)\s*°\s*C',
    ]
    for pat in temp_pats:
        tm = re.search(pat, raw_text, re.IGNORECASE)
        if tm:
            vals = [float(g) for g in tm.groups()]
            avg = sum(vals) / len(vals)
            # An implausible value does not end the search; the next pattern
            # still gets its chance.
            if 5 <= avg <= 40:
                data["germination_temp_c"] = round(avg, 1)
                break

    # ── Perennial ──
    perennial_pats = [r'mehrj[aä]hrig', r'winterhart', r'ausdauernd', r'Halbstrauch', r'Staude']
    if any(re.search(pat, raw_text, re.IGNORECASE) for pat in perennial_pats):
        data["perennial"] = True

    return data
|
||||
|
||||
|
||||
# ── Product data ────────────────────────────────────────────────────────────
|
||||
@dataclass
class ProductData:
    """Everything scraped from one product page."""

    # From the page's JSON-LD product object.
    name: str = ""
    # Botanical name as found on the page, and its "Genus species" form.
    raw_latin_name: str = ""
    normalized_latin: str = ""
    description: str = ""
    sku: str = ""
    # Address of the product page.
    url: str = ""
    is_organic: bool = True
    # Parsed growing parameters and calendar; each instance gets its own dict.
    growing_data: dict = field(default_factory=dict)
    calendar: dict = field(default_factory=dict)
|
||||
|
||||
|
||||
def parse_product(html_text: str, url: str) -> Optional[ProductData]:
    """Parse a product page. Returns ProductData or None if not a product page.

    A page is treated as a product page when it contains a JSON-LD Product
    object. Name, description and SKU ("model") come from that object; the
    botanical name, growing data and calendar are extracted from the markup.

    Args:
        html_text: Raw HTML of the page.
        url: Address the page was fetched from (stored on the result).
    """
    jsonld = extract_jsonld_product(html_text)
    if not jsonld:
        return None

    def _text(key: str) -> str:
        # JSON-LD values may be null or non-strings; never call .strip() on those.
        value = jsonld.get(key)
        return "" if value is None else str(value).strip()

    raw_latin = extract_botanical_name(html_text)

    return ProductData(
        name=_text("name"),
        raw_latin_name=raw_latin,
        normalized_latin=normalize_latin_name(raw_latin),
        description=_text("description"),
        sku=_text("model"),
        url=url,
        # Always True here: the page is not inspected for organic status.
        is_organic=True,
        growing_data=extract_growing_data(html_text),
        calendar=parse_calendar(html_text),
    )
|
||||
|
||||
|
||||
# ── Recursive discovery ─────────────────────────────────────────────────────
|
||||
def discover_products(
    category_url: str,
    max_depth: int = 4,
    _depth: int = 0,
    _visited: set = None,
) -> list[ProductData]:
    """Recursively crawl a category URL and collect the products below it.

    A URL that parses as a product page yields a single-element list.  Any
    other page is treated as a category: its links that sit exactly one path
    segment below it on www.reinsaat.at are crawled in turn.

    Args:
        category_url: Page to start from.
        max_depth: Deepest recursion level that is still fetched.
        _depth: Current recursion level (internal).
        _visited: Set of URLs already crawled; shared across the recursion
            and updated in place (internal).

    Returns:
        The products found, in crawl order.  Fetch errors are printed and
        produce an empty list for that branch.
    """
    seen = set() if _visited is None else _visited
    if category_url in seen or _depth > max_depth:
        return []
    seen.add(category_url)

    pad = " " * (_depth + 1)

    try:
        page = fetch_url(category_url)
        time.sleep(DELAY)
    except Exception as exc:
        print(f"{pad}ERROR fetching {category_url}: {exc}")
        return []

    # A page that parses as a product ends the recursion here.
    found = parse_product(page, category_url)
    if found:
        return [found]

    # Otherwise collect the direct children of this category.
    base_path = urllib.parse.urlparse(category_url).path.rstrip("/")
    child_prefix = base_path + "/"
    candidates = []
    for href in extract_links(page, category_url):
        pieces = urllib.parse.urlparse(href)
        foreign_host = pieces.netloc and pieces.netloc != "www.reinsaat.at"
        path = pieces.path.rstrip("/")
        if foreign_host or not path.startswith(child_prefix):
            continue
        tail = path[len(child_prefix):]
        # Keep only links exactly one segment below the category.
        if not tail or "/" in tail:
            continue
        target = f"https://www.reinsaat.at{path}/"
        if target not in seen:
            candidates.append(target)

    # Drop duplicates while keeping first-seen order.
    candidates = list(dict.fromkeys(candidates))
    print(f"{pad}Category {category_url} -> {len(candidates)} children")

    return [
        item
        for target in candidates
        for item in discover_products(target, max_depth, _depth + 1, seen)
    ]
|
||||
|
||||
|
||||
# ── Slug generation ─────────────────────────────────────────────────────────
|
||||
def make_slug(species_name: str, cultivar_name: str) -> str:
    """Build a URL-friendly slug from a species name and a cultivar name.

    The two names are joined with a hyphen and lower-cased, German umlauts
    and common accented letters are transliterated to ASCII, and every run
    of remaining non-alphanumeric characters becomes a single hyphen.
    Leading and trailing hyphens are removed.
    """
    transliteration = str.maketrans({
        'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'ß': 'ss',
        'é': 'e', 'è': 'e', 'ê': 'e', 'ë': 'e',
        'á': 'a', 'à': 'a', 'â': 'a',
        'í': 'i', 'ì': 'i', 'î': 'i',
        'ó': 'o', 'ò': 'o', 'ô': 'o',
        'ú': 'u', 'ù': 'u', 'û': 'u',
        'ñ': 'n', 'ç': 'c',
    })
    lowered = f"{species_name}-{cultivar_name}".lower()
    ascii_text = lowered.translate(transliteration)
    # The "+" already folds each run of separators into one hyphen, so no
    # separate collapse step is needed afterwards.
    return re.sub(r'[^a-z0-9]+', '-', ascii_text).strip('-')
|
||||
|
||||
|
||||
# ── Main ────────────────────────────────────────────────────────────────────
|
||||
def db_connect():
    """Open a new database connection with autocommit switched off.

    Connection settings come from the module-level ``DB_*`` constants.
    """
    connection = psycopg2.connect(
        host=DB_HOST,
        dbname=DB_NAME,
        user=DB_USER,
        password=DB_PASS,
    )
    # Writes are grouped into one explicit transaction by the caller.
    connection.autocommit = False
    return connection
|
||||
|
||||
|
||||
def main():
    """Scrape the Reinsaat catalogue and sync it into the database.

    Phase 1 crawls every URL in ``CATEGORIES`` with ``discover_products`` and
    de-duplicates the results by URL.  Phase 2 loads species, cultivars and
    the existing Reinsaat supplier links.  Phase 3 matches each product to a
    species and then either fills in missing columns of an existing cultivar
    (adding a supplier link when none exists) or creates a new cultivar plus
    its supplier link.  All writes are committed in one transaction at the
    end, and a summary is printed.

    Per-product link/create failures are rolled back to a savepoint, counted
    and skipped.  The in-memory caches and counters are only updated after
    the savepoint has been released, so a rolled-back insert never leaves a
    phantom cultivar or slug behind.  The cursor and connection are closed
    even when an unexpected error aborts the run.
    """
    print("=" * 70)
    print("Reinsaat Scraper v2")
    print("=" * 70)

    # ── Phase 1: Discover all products (no DB needed) ──
    print("\n[1] Discovering products from Reinsaat categories...")
    all_products: list[ProductData] = []
    visited: set[str] = set()

    for cat_url in CATEGORIES:
        print(f"\n Category: {cat_url}")
        products = discover_products(cat_url, max_depth=4, _visited=visited)
        all_products.extend(products)
        print(f" -> {len(products)} products")

    # Deduplicate by URL
    seen_urls = set()
    unique_products = []
    for p in all_products:
        if p.url not in seen_urls:
            seen_urls.add(p.url)
            unique_products.append(p)
    all_products = unique_products
    print(f"\n Total unique products: {len(all_products)}")

    # Column groups shared by the enrichment and creation paths.  The order
    # matches the order in which the original UPDATE listed the columns.
    growing_fields = (
        "planting_depth_cm", "row_spacing_cm", "plant_spacing_cm",
        "germination_temp_c",
    )
    calendar_fields = (
        "indoor_sowing_months", "direct_sowing_months",
        "transplanting_months", "glasshouse_months", "harvesting_months",
    )

    stats = {
        "created": 0,
        "linked": 0,
        "enriched": 0,
        "skipped_no_species": 0,
        "skipped_no_name": 0,
        "link_exists": 0,
        "errors": 0,
    }
    unmatched = []

    # ── Phase 2: Connect to DB and load existing data ──
    print("\n[2] Connecting to DB and loading existing data...")
    conn = db_connect()
    cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
    try:
        # Load species
        cur.execute("SELECT id, name_scientific FROM species ORDER BY name_scientific")
        species_rows = cur.fetchall()
        species_map = {}
        for row in species_rows:
            key = row["name_scientific"].lower().strip()
            species_map[key] = row
        print(f" {len(species_map)} species loaded")

        # Load existing cultivars
        cur.execute("""
            SELECT id, species_id, name, slug, description,
                   row_spacing_cm, plant_spacing_cm, planting_depth_cm,
                   germination_temp_c, perennial,
                   indoor_sowing_months, direct_sowing_months,
                   transplanting_months, glasshouse_months, harvesting_months
            FROM cultivars
        """)
        cultivar_rows = cur.fetchall()
        existing_cultivars = {}
        existing_slugs = set()
        for row in cultivar_rows:
            sid = str(row["species_id"])
            name_lower = row["name"].lower()
            existing_cultivars[(sid, name_lower)] = dict(row)
            existing_slugs.add(row["slug"])
        print(f" {len(existing_cultivars)} cultivars loaded")

        # Load existing Reinsaat supplier links
        cur.execute("""
            SELECT cultivar_id, product_url, article_number
            FROM cultivar_suppliers
            WHERE supplier_id = %s
        """, (REINSAAT_SUPPLIER_ID,))
        existing_links = {}
        for row in cur.fetchall():
            cid = str(row["cultivar_id"])
            url = row["product_url"] or ""
            sku = row["article_number"] or ""
            existing_links.setdefault(cid, []).append((url, sku))
        print(f" {sum(len(v) for v in existing_links.values())} existing links for {len(existing_links)} cultivars")

        # ── Phase 3: Process products ──
        print("\n[3] Processing products...")

        for i, product in enumerate(all_products):
            pct = (i + 1) / len(all_products) * 100
            prefix = f" [{i+1}/{len(all_products)}] ({pct:.0f}%)"

            if not product.name:
                stats["skipped_no_name"] += 1
                continue

            # Match species
            normalized = product.normalized_latin.lower().strip()
            species = species_map.get(normalized)

            if not species:
                # Try exact match on raw name (first two words)
                raw_words = product.raw_latin_name.split()
                if len(raw_words) >= 2:
                    attempt = f"{raw_words[0].lower()} {raw_words[1].lower()}"
                    species = species_map.get(attempt)

            if not species:
                stats["skipped_no_species"] += 1
                unmatched.append((product.name, product.raw_latin_name, product.normalized_latin, product.url))
                continue

            species_id = str(species["id"])
            species_name = species["name_scientific"]

            # Check if cultivar exists
            ckey = (species_id, product.name.lower())
            existing = existing_cultivars.get(ckey)
            gd = product.growing_data
            cal = product.calendar

            if existing:
                cultivar_id = str(existing["id"])

                # ── Enrich existing cultivar with missing data ──
                updates = {}
                for col in growing_fields:
                    if gd.get(col) and not existing.get(col):
                        updates[col] = gd[col]
                if gd.get("perennial") and not existing.get("perennial"):
                    updates["perennial"] = True
                for col in calendar_fields:
                    if cal.get(col) and not existing.get(col):
                        updates[col] = cal[col]
                if product.description and not existing.get("description"):
                    updates["description"] = product.description

                if updates:
                    # Column names come from the fixed field lists above,
                    # never from scraped text; values are bound parameters.
                    set_clauses = [f"{col} = %s" for col in updates]
                    set_clauses.append("updated_at = NOW()")
                    values = [*updates.values(), cultivar_id]
                    cur.execute(
                        f"UPDATE cultivars SET {', '.join(set_clauses)} WHERE id = %s::uuid",
                        values
                    )
                    # Remember what was filled in so a later duplicate of
                    # this cultivar does not overwrite it again.
                    existing.update(updates)
                    stats["enriched"] += 1
                    print(f"{prefix} {product.name} -> ENRICHED ({', '.join(updates.keys())})")

                # ── Add supplier link if missing ──
                link_exists = any(
                    lurl == product.url or (lsku and lsku == product.sku)
                    for lurl, lsku in existing_links.get(cultivar_id, [])
                )

                if link_exists:
                    stats["link_exists"] += 1
                else:
                    try:
                        cur.execute("SAVEPOINT link_sp")
                        cur.execute("""
                            INSERT INTO cultivar_suppliers (cultivar_id, supplier_id, product_url, article_number, last_checked_at)
                            VALUES (%s::uuid, %s::uuid, %s, %s, NOW())
                            ON CONFLICT (cultivar_id, supplier_id, article_number) DO UPDATE
                            SET product_url = EXCLUDED.product_url, last_checked_at = NOW()
                        """, (cultivar_id, REINSAAT_SUPPLIER_ID, product.url, product.sku))
                        cur.execute("RELEASE SAVEPOINT link_sp")
                    except Exception as e:
                        print(f"{prefix} {product.name} -> LINK ERROR: {e}")
                        cur.execute("ROLLBACK TO SAVEPOINT link_sp")
                        stats["errors"] += 1
                    else:
                        stats["linked"] += 1
                        existing_links.setdefault(cultivar_id, []).append((product.url, product.sku))
                        print(f"{prefix} {product.name} -> LINKED ({product.sku})")
            else:
                # ── Create new cultivar ──
                slug = make_slug(species_name, product.name)
                # Ensure unique slug
                base_slug = slug
                counter = 2
                while slug in existing_slugs:
                    slug = f"{base_slug}-{counter}"
                    counter += 1

                try:
                    cur.execute("SAVEPOINT create_sp")
                    cur.execute("""
                        INSERT INTO cultivars (
                            species_id, name, name_de, slug, description,
                            is_organic, perennial,
                            planting_depth_cm, row_spacing_cm, plant_spacing_cm,
                            germination_temp_c,
                            indoor_sowing_months, direct_sowing_months,
                            transplanting_months, glasshouse_months, harvesting_months
                        ) VALUES (
                            %s::uuid, %s, %s, %s, %s,
                            %s, %s,
                            %s, %s, %s,
                            %s,
                            %s, %s,
                            %s, %s, %s
                        )
                        RETURNING id
                    """, (
                        species_id,
                        product.name,
                        product.name,
                        slug,
                        product.description,
                        product.is_organic,
                        gd.get("perennial", False),
                        gd.get("planting_depth_cm"),
                        gd.get("row_spacing_cm"),
                        gd.get("plant_spacing_cm"),
                        gd.get("germination_temp_c"),
                        cal.get("indoor_sowing_months"),
                        cal.get("direct_sowing_months"),
                        cal.get("transplanting_months"),
                        cal.get("glasshouse_months"),
                        cal.get("harvesting_months"),
                    ))
                    new_id = str(cur.fetchone()["id"])

                    # Link to supplier
                    cur.execute("""
                        INSERT INTO cultivar_suppliers (cultivar_id, supplier_id, product_url, article_number, last_checked_at)
                        VALUES (%s::uuid, %s::uuid, %s, %s, NOW())
                    """, (new_id, REINSAAT_SUPPLIER_ID, product.url, product.sku))
                    cur.execute("RELEASE SAVEPOINT create_sp")
                except Exception as e:
                    print(f"{prefix} {product.name} -> CREATE ERROR: {e}")
                    cur.execute("ROLLBACK TO SAVEPOINT create_sp")
                    stats["errors"] += 1
                else:
                    # Only now is the cultivar really in the transaction, so
                    # only now may the caches and counters reflect it.
                    new_record = {
                        "id": new_id,
                        "description": product.description,
                        "perennial": gd.get("perennial", False),
                    }
                    new_record.update((col, gd.get(col)) for col in growing_fields)
                    new_record.update((col, cal.get(col)) for col in calendar_fields)
                    existing_slugs.add(slug)
                    existing_cultivars[ckey] = new_record
                    existing_links.setdefault(new_id, []).append((product.url, product.sku))
                    stats["created"] += 1
                    stats["linked"] += 1
                    print(f"{prefix} {product.name} -> CREATED ({species_name}, {slug})")

        # ── Commit ──
        conn.commit()
    finally:
        # Closing without a commit discards any uncommitted work.
        cur.close()
        conn.close()

    # ── Summary ──
    print("\n" + "=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f" Total products discovered: {len(all_products)}")
    print(f" New cultivars created: {stats['created']}")
    print(f" New supplier links added: {stats['linked']}")
    print(f" Cultivars enriched: {stats['enriched']}")
    print(f" Links already existed: {stats['link_exists']}")
    print(f" Skipped (no species): {stats['skipped_no_species']}")
    print(f" Skipped (no name): {stats['skipped_no_name']}")
    print(f" Errors: {stats['errors']}")
    print("=" * 70)

    if unmatched:
        print(f"\n UNMATCHED PRODUCTS ({len(unmatched)}):")
        for name, raw_latin, normalized, url in sorted(unmatched, key=lambda x: x[2]):
            print(f" {normalized:30s} (raw: {raw_latin:40s}) {name:30s} {url}")
|
||||
|
||||
|
||||
# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,635 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Reinsaat v3 scraper - uses HerbAPI REST API, robust botanical name matching."""
|
||||
|
||||
import json
import os
import re
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from html import unescape
|
||||
|
||||
# --- Config ---
# The HerbAPI endpoint and bearer token can be overridden through the
# HERBAPI_BASE / HERBAPI_TOKEN environment variables; the literals below are
# only fallbacks that keep existing invocations working.
# NOTE(review): the fallback token is a credential committed to source
# control. Rotate it and remove the literal once deployments set
# HERBAPI_TOKEN.
API_BASE = os.environ.get("HERBAPI_BASE", "http://herbapi01.corp.sub-net.at:8080/api/v1")
API_TOKEN = os.environ.get("HERBAPI_TOKEN", "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk")
REINSAAT_BASE = "https://www.reinsaat.at"
# Seconds to sleep between HTTP requests to the shop.
DELAY = 0.3

# Categories to scrape (seed products only, skip books/bulbs/peonies/potatoes/gift/seed_tapes)
CATEGORIES = [
    "beans", "peas", "florence_fennel", "cucumbers", "brassica", "garden_cress",
    "pumpkins_squash", "corn", "swiss_chard", "aubergine_eggplants", "melons",
    "carrots", "sweet_pepper", "chilli_peppers_chill", "parsnips", "parsley",
    "parsley_root", "leeks", "radish", "beetroot", "lettuce", "black_salsify",
    "celery", "spinach", "tomatoes", "zucchini_courgette", "onion_garlic",
    "culinary_and_aromatic_herbs", "conservation_varieties", "flowers_and_herbs",
    "wild_flowers_seeds", "green_manure",
]

# Suffixes to strip from botanical names (authority names, infraspecific ranks)
STRIP_SUFFIXES = {
    "l.", "mill.", "dc.", "l", "convar.", "convar", "var.", "var",
    "subsp.", "subsp", "ssp.", "ssp", "f.", "em.", "auct.",
    "hort.", "medik.", "moench", "pers.", "salisb.", "thunb.",
    "crantz", "gaertn.", "lam.", "link", "siebold", "zucc.",
    "sat.", "sat", "axillare", "medikus",
}
|
||||
|
||||
|
||||
def api_get(path, params=None, timeout=30):
    """Send an authenticated GET request to HerbAPI and decode the JSON reply.

    Args:
        path: Path below ``API_BASE``, starting with ``/``.
        params: Optional mapping of query parameters.
        timeout: Seconds to wait on the socket before giving up, so a
            stalled API cannot hang the whole run.

    Returns:
        The decoded JSON body.

    Raises:
        urllib.error.URLError: On connection failures, timeouts and HTTP
            error statuses (``HTTPError`` is a subclass).
    """
    url = f"{API_BASE}{path}"
    if params:
        url += "?" + urllib.parse.urlencode(params)
    req = urllib.request.Request(url)
    req.add_header("Authorization", f"Bearer {API_TOKEN}")
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())
|
||||
|
||||
|
||||
def api_post(path, data, timeout=30):
    """Send an authenticated JSON POST request to HerbAPI.

    Args:
        path: Path below ``API_BASE``, starting with ``/``.
        data: JSON-serialisable request payload.
        timeout: Seconds to wait on the socket before giving up, so a
            stalled API cannot hang the whole run.

    Returns:
        The decoded JSON body of the response.

    Raises:
        urllib.error.URLError: On connection failures, timeouts and HTTP
            error statuses (``HTTPError`` is a subclass).
    """
    url = f"{API_BASE}{path}"
    body = json.dumps(data).encode()
    req = urllib.request.Request(url, data=body, method="POST")
    req.add_header("Authorization", f"Bearer {API_TOKEN}")
    req.add_header("Content-Type", "application/json")
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())
|
||||
|
||||
|
||||
def fetch_page(url):
    """Download *url* and return its body as text.

    The request carries a browser-like User-Agent and uses a 15 second
    timeout.  The body is decoded as UTF-8; undecodable bytes are replaced
    rather than raising.
    """
    request = urllib.request.Request(
        url, headers={"User-Agent": "Mozilla/5.0 (HerbAPI Scraper)"}
    )
    with urllib.request.urlopen(request, timeout=15) as response:
        payload = response.read()
    return payload.decode("utf-8", errors="replace")
|
||||
|
||||
|
||||
# Known misspellings seen in shop data, keyed by lower-case genus or by
# lower-case "genus species", mapped to the corrected spelling.
BOTANICAL_TYPOS = {
    "capscicum": "capsicum",
    "capsicum frutenscens": "capsicum frutescens",
    "tropaelum": "tropaeolum",
    "lact.": "lactuca",
}

# Abbreviated binomials (lower-case, including the trailing period) mapped to
# the full "genus species" name.
ABBREVIATED_NAMES = {
    "origanum vulg.": "origanum vulgare",
    "helichrysum bract.": "helichrysum bracteatum",
    "campanula lat.": "campanula latifolia",
    "cosmos bip.": "cosmos bipinnatus",
    "papaver somnif.": "papaver somniferum",
}


def normalise_botanical(raw):
    """Strip botanical name to genus + species only.

    'Pisum sativum L. convar. sat.' -> 'pisum sativum'
    'Solanum lycopersicum L.' -> 'solanum lycopersicum'
    'Beta vulgaris L. ssp. vulgaris' -> 'beta vulgaris'
    'Origanum vulg.' -> 'origanum vulgare'

    Args:
        raw: Botanical name as scraped; may contain HTML entities,
            non-breaking spaces, parenthesised remarks and author citations.

    Returns:
        The lower-case "genus species" string, or ``None`` when *raw* is
        empty, has fewer than two words, or its second word looks like an
        author citation rather than a species epithet.
    """
    if not raw:
        return None
    # Clean HTML entities
    raw = unescape(raw).replace("\xa0", " ").strip()

    # Check abbreviated names BEFORE trailing punctuation is removed: the
    # table keys end in a period, so a name such as "Origanum vulg." would
    # never match once that period has been stripped.
    probe = re.sub(r"\([^)]*\)", "", raw).lower().strip()
    for abbrev, full in ABBREVIATED_NAMES.items():
        if probe.startswith(abbrev):
            return full

    # Remove trailing commas/periods
    raw = raw.rstrip(",. ")
    # Remove content in parentheses
    raw = re.sub(r"\([^)]*\)", "", raw)

    parts = raw.split()
    if len(parts) < 2:
        return None
    # Genus (capitalised) + species (lowercase)
    genus = parts[0].lower().rstrip(",")
    species = parts[1].lower().rstrip(",")
    # A token made only of commas leaves nothing to index below.
    if not genus or not species:
        return None

    # Fix known typos
    if genus in BOTANICAL_TYPOS:
        genus = BOTANICAL_TYPOS[genus]
    full_name = f"{genus} {species}"
    if full_name in BOTANICAL_TYPOS:
        full_name = BOTANICAL_TYPOS[full_name]
        genus, species = full_name.split()

    # Validate: both words must start with a letter
    if not genus[0].isalpha() or not species[0].isalpha():
        return None
    # Skip if species looks like an authority (starts with uppercase in original)
    if parts[1][0].isupper():
        return None
    return f"{genus} {species}"
|
||||
|
||||
|
||||
def extract_product_data(html, url):
    """Extract product info from a Reinsaat product page.

    Args:
        html: Page HTML as text.
        url: URL of the page; stored under ``"url"`` in the result.

    Returns:
        A dict that always contains ``"url"`` and, when the matching markup
        is present, any of: ``name``, ``botanical_raw``, ``botanical_norm``,
        ``article_number``, ``price_eur``, ``port_price``, ``pack_sizes``,
        ``planting_depth_cm``, ``row_spacing_cm``, ``plant_spacing_cm``,
        ``germination_temp_c``, ``pack_size`` and ``pack_unit``.

    Malformed structured data (a JSON-LD document whose top level is a list,
    offers that are not objects, a price cell that is not a plain number) is
    skipped instead of aborting the extraction.
    """
    result = {}

    # H1 = variety name
    m = re.search(r'<h1[^>]*>([^<]+)</h1>', html)
    if m:
        name = unescape(m.group(1)).strip()
        # Clean up names like "RS-To-01.26 (Alda)" -> "Alda"
        paren = re.search(r"\(([^)]+)\)", name)
        if paren and re.match(r"RS-", name):
            name = paren.group(1).strip()
        result["name"] = name

    # Botanical name from fce_shop_kurztext
    m = re.search(
        r'fce_shop_kurztext[^>]*>\s*(?:<em[^>]*>)?\s*([^<]+?)\s*(?:</em>)?\s*</div>',
        html,
    )
    if m:
        result["botanical_raw"] = unescape(m.group(1)).replace("\xa0", " ").strip()
        result["botanical_norm"] = normalise_botanical(result["botanical_raw"])

    # Article number and price from the first JSON-LD Product node
    for jm in re.finditer(
        r'<script type="application/ld\+json">(.*?)</script>', html, re.S
    ):
        try:
            jd = json.loads(jm.group(1))
        except json.JSONDecodeError:
            continue
        # A JSON-LD script may hold one object or a list of them.
        nodes = jd if isinstance(jd, list) else [jd]
        product_node = next(
            (n for n in nodes if isinstance(n, dict) and n.get("@type") == "Product"),
            None,
        )
        if product_node is None:
            continue
        if "model" in product_node:
            result["article_number"] = str(product_node["model"])
        # Get smallest pack price (usually the Portion)
        offers = product_node.get("offers", {})
        if isinstance(offers, dict):
            offer_list = offers.get("offers", [])
        elif isinstance(offers, list):
            offer_list = offers
        else:
            offer_list = []
        if not isinstance(offer_list, list):
            offer_list = []
        prices = [
            o["price"]
            for o in offer_list
            if isinstance(o, dict)
            and isinstance(o.get("price"), (int, float))
            and o["price"] > 0
        ]
        if prices:
            result["price_eur"] = min(prices)
        break

    # Price table - get pack sizes
    tables = re.findall(r"<table[^>]*>(.*?)</table>", html, re.S)
    for tbl in tables:
        if "€" not in tbl:
            continue
        rows = re.findall(r"<tr[^>]*>(.*?)</tr>", tbl, re.S)
        if len(rows) >= 2:
            size_cells = re.findall(r"<td[^>]*>(.*?)</td>", rows[0], re.S)
            size_texts = [re.sub(r"<[^>]+>", "", c).strip() for c in size_cells]
            price_cells = re.findall(r"<td[^>]*>(.*?)</td>", rows[1], re.S)
            price_texts = [re.sub(r"<[^>]+>", "", c).strip() for c in price_cells]
            # Find the "Port." entry
            for i, st in enumerate(size_texts):
                if "Port" in st:
                    if i < len(price_texts):
                        pm = re.search(r"[\d,\.]+", price_texts[i].replace(",", "."))
                        if pm:
                            try:
                                result["port_price"] = float(pm.group())
                            except ValueError:
                                # e.g. "1.234,50" becomes "1.234.50"; leave
                                # the price unset rather than abort the page.
                                pass
                    break
            # Get portion content info
            result["pack_sizes"] = size_texts
            break

    # Sowing depth
    m = re.search(r"(?:sowing|seed)\s*depth[:\s]*(?:approx\.?\s*)?(\d+[\.,]?\d*)\s*(?:-\s*(\d+[\.,]?\d*)\s*)?cm", html, re.I)
    if m:
        d1 = float(m.group(1).replace(",", "."))
        d2 = float(m.group(2).replace(",", ".")) if m.group(2) else d1
        result["planting_depth_cm"] = round((d1 + d2) / 2, 2)

    # Spacing: "row spacing NNxNN cm" or "NN x NN cm"
    # Try outdoor spacing first
    m = re.search(r"(?:outdoors?|field)[^.]*?(\d+)\s*(?:x|×)\s*(\d+)\s*cm", html, re.I)
    if not m:
        m = re.search(r"row\s*spacing\s*(\d+)\s*(?:x|×)\s*(\d+)\s*cm", html, re.I)
    if not m:
        m = re.search(r"(\d+)\s*(?:x|×)\s*(\d+)\s*cm", html, re.I)
    if m:
        result["row_spacing_cm"] = float(m.group(1))
        result["plant_spacing_cm"] = float(m.group(2))

    # Row spacing without plant spacing (e.g. "row spacing 30-45 cm")
    if "row_spacing_cm" not in result:
        m = re.search(r"row\s*spacing\s*(\d+)(?:\s*-\s*(\d+))?\s*cm", html, re.I)
        if m:
            r1 = int(m.group(1))
            r2 = int(m.group(2)) if m.group(2) else r1
            # Midpoint of the range, rounded down to a whole centimetre.
            result["row_spacing_cm"] = float((r1 + r2) // 2)

    # Germination temperature
    m = re.search(r"germination\s*temp[^:]*:\s*(\d+)\s*(?:-\s*(\d+))?\s*°?\s*C", html, re.I)
    if m:
        t1 = int(m.group(1))
        t2 = int(m.group(2)) if m.group(2) else t1
        # Midpoint of the range, rounded down to a whole degree.
        result["germination_temp_c"] = float((t1 + t2) // 2)

    # Pack unit from portion info - "20 seeds" or "25 g" etc
    portion_m = re.search(r"[Pp]ortion\s*(?:contents?)?[:\s]*(\d+[\.,]?\d*)\s*(seeds?|Korn|g|kg)", html)
    if not portion_m:
        # Try "Port. (20 seeds)" format
        portion_m = re.search(r"Port[.\w]*\s*\(?\s*(\d+[\.,]?\d*)\s*(seeds?|Korn|g|kg)", html)
    if portion_m:
        result["pack_size"] = float(portion_m.group(1).replace(",", "."))
        unit = portion_m.group(2).lower()
        # Seed counts are stored under the unit name "Korn".
        if unit in ("seed", "seeds", "korn"):
            result["pack_unit"] = "Korn"
        else:
            result["pack_unit"] = unit

    result["url"] = url
    return result
|
||||
|
||||
|
||||
def get_all_species():
    """Page through ``/species`` and index the records by normalised name.

    Returns:
        Dict mapping the normalised "genus species" string to a dict with
        the keys ``id``, ``slug`` and ``name``.  Records whose scientific
        name cannot be normalised are left out.
    """
    lookup = {}
    page_no = 1
    received = 100  # a full page, so the loop body runs at least once
    while received >= 100:
        payload = api_get("/species", {"per_page": 100, "page": page_no})
        records = payload.get("data", [])
        for record in records:
            normalised = normalise_botanical(record["name_scientific"])
            if normalised:
                lookup[normalised] = {
                    "id": record["id"],
                    "slug": record["slug"],
                    "name": record["name_scientific"],
                }
        print(f" page {page_no}: {len(records)} species (total so far: {len(lookup)})")
        # A page shorter than the requested size is the last one.
        received = len(records)
        page_no += 1
    return lookup
|
||||
|
||||
|
||||
def get_all_cultivars():
    """Page through ``/cultivars`` and index the records.

    Returns:
        Dict mapping ``(species_id, lower-cased stripped name)`` to the
        cultivar record as returned by the API.
    """
    by_key = {}  # (species_id, lower_name) -> cultivar
    page_no = 1
    received = 100  # a full page, so the loop body runs at least once
    while received >= 100:
        payload = api_get("/cultivars", {"per_page": 100, "page": page_no})
        records = payload.get("data", [])
        for record in records:
            by_key[(record["species_id"], record["name"].lower().strip())] = record
        print(f" page {page_no}: {len(records)} cultivars (total so far: {len(by_key)})")
        # A page shorter than the requested size is the last one.
        received = len(records)
        page_no += 1
    return by_key
|
||||
|
||||
|
||||
def get_reinsaat_supplier():
    """Return the supplier record whose slug is ``"reinsaat"``.

    Raises:
        RuntimeError: If ``/suppliers`` lists no such supplier.
    """
    match = next(
        (entry for entry in api_get("/suppliers") if entry["slug"] == "reinsaat"),
        None,
    )
    if match is None:
        raise RuntimeError("Reinsaat supplier not found in API")
    return match
|
||||
|
||||
|
||||
def get_cultivar_suppliers(cultivar_id):
    """Fetch the supplier links recorded for one cultivar."""
    endpoint = f"/cultivars/{cultivar_id}/suppliers"
    return api_get(endpoint)
|
||||
|
||||
|
||||
def get_product_urls_from_category(cat_slug):
    """Collect every link below a shop category from its listing page.

    The returned URLs may point at products or at subcategories at any
    depth; this function does not tell them apart, the caller decides by
    fetching each page.

    Args:
        cat_slug: Category slug as used in ``/shop/EN/<slug>/``.

    Returns:
        Sorted, de-duplicated absolute URLs.  An empty list when the
        category page cannot be fetched (a warning is printed).
    """
    cat_url = f"{REINSAAT_BASE}/shop/EN/{cat_slug}/"
    try:
        html = fetch_page(cat_url)
    except Exception as e:
        print(f" WARN: Failed to fetch category {cat_slug}: {e}")
        return []

    time.sleep(DELAY)

    # Every href that lies strictly below this category's path.
    links = re.findall(rf'href="(/shop/EN/{re.escape(cat_slug)}/[^"]+/)"', html)
    return [REINSAAT_BASE + link for link in sorted(set(links))]
|
||||
|
||||
|
||||
def is_product_page(html):
    """Tell whether *html* looks like a product page.

    A page counts as a product page when it contains the
    ``fce_shop_kurztext`` marker or a JSON-LD ``"@type": "Product"`` entry.
    """
    if "fce_shop_kurztext" in html:
        return True
    return re.search(r'"@type":\s*"Product"', html) is not None
|
||||
|
||||
|
||||
def _url_path_depth(url_or_path):
    """Count the non-empty path segments of a URL or site-relative path."""
    path = urllib.parse.urlparse(url_or_path).path
    return sum(1 for segment in path.split("/") if segment)


def main():
    """Run the Reinsaat v3 scrape against the HerbAPI REST API.

    Loads species, cultivars and the Reinsaat supplier record from the API,
    then visits every category in ``CATEGORIES``.  Each listed URL is
    fetched; product pages are handed to ``process_product``, while any
    other page is treated as a subcategory whose deeper links are fetched
    and processed in the same way.  A report of the counters and of
    new or unmatched items is printed at the end.

    Link depth is compared on URL paths, so direct children of a
    subcategory are followed, and every URL is handled at most once per
    category.
    """
    print("=" * 60)
    print("Reinsaat v3 Scraper")
    print("=" * 60)

    # Step 1: Load all species
    print("\n[1/4] Loading species from API...")
    species_map = get_all_species()
    print(f" Loaded {len(species_map)} species")

    # Step 2: Load all cultivars
    print("\n[2/4] Loading cultivars from API...")
    cultivar_map = get_all_cultivars()
    print(f" Loaded {len(cultivar_map)} cultivars")

    # Step 3: Get Reinsaat supplier
    print("\n[3/4] Getting Reinsaat supplier...")
    supplier = get_reinsaat_supplier()
    supplier_id = supplier["id"]
    print(f" Reinsaat ID: {supplier_id}")

    # Step 4: Scrape categories
    print(f"\n[4/4] Scraping {len(CATEGORIES)} categories...")

    stats = {
        "products_found": 0,
        "botanical_extracted": 0,
        "species_matched": 0,
        "species_not_matched": 0,
        "cultivar_existed": 0,
        "cultivar_created": 0,
        "link_existed": 0,
        "link_created": 0,
        "errors": 0,
    }
    unmatched_species = {}  # botanical_norm -> count
    new_cultivars = []
    new_links = []

    for cat_i, cat in enumerate(CATEGORIES):
        print(f"\n--- [{cat_i+1}/{len(CATEGORIES)}] {cat} ---")
        urls = get_product_urls_from_category(cat)
        print(f" Found {len(urls)} URLs")
        # URLs already queued or handled for this category.
        seen = set(urls)
        cat_prefix = f"/shop/EN/{cat}/"

        for url in urls:
            time.sleep(DELAY)
            try:
                html = fetch_page(url)
            except Exception as e:
                print(f" ERROR fetching {url}: {e}")
                stats["errors"] += 1
                continue

            # Check if this is actually a product page
            if not is_product_page(html):
                # Might be a subcategory - get links from it
                page_depth = _url_path_depth(url)
                hrefs = re.findall(r'href="(/shop/EN/[^"]+/)"', html)
                sub_links = [
                    REINSAAT_BASE + href
                    for href in sorted(set(hrefs))
                    if href.startswith(cat_prefix)
                    # Compare path depth on both sides; raw slash counts of a
                    # relative path and an absolute URL are not comparable.
                    and _url_path_depth(href) > page_depth
                ]
                # It's a subcategory, process its product links
                for sub_url in sub_links:
                    if sub_url in seen:
                        continue  # already queued or processed
                    seen.add(sub_url)
                    time.sleep(DELAY)
                    try:
                        sub_html = fetch_page(sub_url)
                    except Exception as e:
                        print(f" ERROR fetching {sub_url}: {e}")
                        stats["errors"] += 1
                        continue
                    if not is_product_page(sub_html):
                        continue
                    process_product(
                        sub_html, sub_url, species_map, cultivar_map,
                        supplier_id, stats, unmatched_species,
                        new_cultivars, new_links,
                    )
                continue

            process_product(
                html, url, species_map, cultivar_map,
                supplier_id, stats, unmatched_species,
                new_cultivars, new_links,
            )

    # Report
    print("\n" + "=" * 60)
    print("RESULTS")
    print("=" * 60)
    print(f"Products found: {stats['products_found']}")
    print(f"Botanical extracted: {stats['botanical_extracted']}")
    print(f"Species matched: {stats['species_matched']}")
    print(f"Species NOT matched: {stats['species_not_matched']}")
    print(f"Cultivars existed: {stats['cultivar_existed']}")
    print(f"Cultivars created: {stats['cultivar_created']}")
    print(f"Links existed: {stats['link_existed']}")
    print(f"Links created: {stats['link_created']}")
    print(f"Errors: {stats['errors']}")

    if new_cultivars:
        print(f"\n--- New cultivars ({len(new_cultivars)}) ---")
        for cv in new_cultivars:
            print(f" + {cv['name']} ({cv.get('species', '?')})")

    if new_links:
        print(f"\n--- New supplier links ({len(new_links)}) ---")
        for lk in new_links:
            print(f" + {lk['cultivar']} -> {lk.get('article', '?')}")

    if unmatched_species:
        print(f"\n--- Unmatched species ({len(unmatched_species)}) ---")
        for name, count in sorted(unmatched_species.items(), key=lambda x: -x[1]):
            print(f" ? {name} (x{count})")

    print("\nDone.")
|
||||
|
||||
|
||||
def _http_error_body(err):
    """Return the response body of an HTTP error as text, for logging."""
    if hasattr(err, "read"):
        # Replace undecodable bytes so a non-UTF-8 error page cannot mask
        # the original HTTP failure with a UnicodeDecodeError.
        return err.read().decode(errors="replace")
    return str(err)


def _strip_punctuation(text):
    """Remove every character that is neither a word character nor whitespace."""
    return re.sub(r'[^\w\s]', '', text)


def _build_cultivar_payload(prod, species_id, cultivar_name, url):
    """Build the request body used to create a cultivar from scraped data."""
    payload = {
        "species_id": species_id,
        "name": cultivar_name,
        # NOTE(review): every cultivar is flagged organic unconditionally;
        # presumably because the supplier sells organic seed only - confirm.
        "is_organic": True,
        "source_urls": [url],
    }
    # Growing data is optional: copy only the fields the scraper extracted.
    for field in ("planting_depth_cm", "row_spacing_cm",
                  "plant_spacing_cm", "germination_temp_c"):
        if field in prod:
            payload[field] = prod[field]
    return payload


def _build_link_payload(prod, supplier_id, url):
    """Build the request body used to link a cultivar to the supplier."""
    payload = {
        "supplier_id": supplier_id,
        "product_url": url,
    }
    if "article_number" in prod:
        payload["article_number"] = prod["article_number"]
    # "port_price" takes precedence over "price_eur" when both were extracted.
    if "port_price" in prod:
        payload["price_eur"] = prod["port_price"]
    elif "price_eur" in prod:
        payload["price_eur"] = prod["price_eur"]
    for field in ("pack_size", "pack_unit"):
        if field in prod:
            payload[field] = prod[field]
    return payload


def _find_existing_cultivar(cultivar_name, species_id):
    """Search the API for an already stored cultivar matching the scraped name.

    Three strategies are tried in order; the first hit wins. Returns the
    matching cultivar record, or None when nothing matches. Exceptions
    raised by ``api_get`` (or by unexpected record shapes) propagate to the
    caller.
    """
    cn_lower = cultivar_name.lower().strip()
    # Invariant for all candidates, so normalise the scraped name only once.
    cn_clean = _strip_punctuation(cn_lower)

    # Strategy 1: search by full name, exact case-insensitive match.
    search_data = api_get("/cultivars", {"search": cultivar_name, "per_page": 50})
    candidates = search_data.get("data", [])
    for cv in candidates:
        if cv["name"].lower().strip() == cn_lower:
            return cv

    # Strategy 2: same species and similar name (ignoring punctuation).
    for cv in candidates:
        if cv["species_id"] != species_id:
            continue
        cv_clean = _strip_punctuation(cv["name"].lower())
        if cv_clean == cn_clean:
            return cv
        # Substring matching is only meaningful for non-blank names: an empty
        # string is "contained" in every string and would match any cultivar.
        if cv_clean.strip() and cn_clean.strip() and (
                cv_clean in cn_clean or cn_clean in cv_clean):
            return cv

    # Strategy 3: search by the last significant word, then require the
    # punctuation-free names to be equal within the same species.
    words = [w for w in cultivar_name.split() if len(w) > 2]
    if words:
        search2 = api_get("/cultivars", {"search": words[-1], "per_page": 50})
        for cv in search2.get("data", []):
            if cv["species_id"] != species_id:
                continue
            if _strip_punctuation(cv["name"].lower()) == cn_clean:
                return cv

    return None


def process_product(html, url, species_map, cultivar_map, supplier_id,
                    stats, unmatched_species, new_cultivars, new_links):
    """Process a single product page.

    Extracts the product data from ``html``, matches its botanical name
    against ``species_map``, makes sure a cultivar exists for it (creating
    one through the API if necessary) and makes sure that cultivar is linked
    to the supplier.

    Args:
        html: Page content, passed to ``extract_product_data``.
        url: URL of the product page; stored as source/product URL.
        species_map: Mapping of normalised botanical name to species record
            (records are read via ``["id"]`` and ``["name"]``).
        cultivar_map: Cache mapping ``(species_id, lowercased name)`` to a
            cultivar record; updated in place when a cultivar is created or
            found.
        supplier_id: ID of the supplier the link is created for.
        stats: Dict of counters, incremented in place.
        unmatched_species: Dict counting botanical names without a species
            match, updated in place.
        new_cultivars: List that receives a summary dict per created cultivar.
        new_links: List that receives a summary dict per created link.

    Returns:
        None. All results are reported through the mutable arguments.

    Raises:
        Anything other than ``urllib.error.HTTPError`` raised by the cultivar
        or link POST requests propagates to the caller.
    """
    stats["products_found"] += 1
    prod = extract_product_data(html, url)

    if not prod.get("name"):
        return

    bot_norm = prod.get("botanical_norm")
    if not bot_norm:
        # No botanical name found on page
        stats["species_not_matched"] += 1
        unmatched_species["(no botanical name)"] = unmatched_species.get("(no botanical name)", 0) + 1
        return

    stats["botanical_extracted"] += 1

    # Match species
    species = species_map.get(bot_norm)
    if not species:
        stats["species_not_matched"] += 1
        unmatched_species[bot_norm] = unmatched_species.get(bot_norm, 0) + 1
        return

    stats["species_matched"] += 1
    species_id = species["id"]
    cultivar_name = prod["name"]

    # Check if cultivar exists
    cv_key = (species_id, cultivar_name.lower().strip())
    existing_cv = cultivar_map.get(cv_key)

    if existing_cv:
        stats["cultivar_existed"] += 1
        cultivar_id = existing_cv["id"]
    else:
        # Create cultivar
        create_data = _build_cultivar_payload(prod, species_id, cultivar_name, url)
        try:
            new_cv = api_post("/cultivars", create_data)
            cultivar_id = new_cv["id"]
            stats["cultivar_created"] += 1
            new_cultivars.append({
                "name": cultivar_name,
                "species": species["name"],
                "id": cultivar_id,
            })
            # Add to local cache
            cultivar_map[cv_key] = new_cv
            print(f" + Created cultivar: {cultivar_name} ({species['name']})")
        except urllib.error.HTTPError as e:
            body = _http_error_body(e)
            if e.code == 500 and "Database error" in body:
                # Likely slug collision - search for existing cultivar
                try:
                    found = _find_existing_cultivar(cultivar_name, species_id)
                    if found:
                        cultivar_id = found["id"]
                        cultivar_map[cv_key] = found
                        stats["cultivar_existed"] += 1
                    else:
                        print(f" WARN: could not create or find cultivar '{cultivar_name}' (DB error + no search match)")
                        stats["errors"] += 1
                        return
                except Exception as e2:
                    print(f" ERROR searching for '{cultivar_name}' after collision: {e2}")
                    stats["errors"] += 1
                    return
            else:
                print(f" ERROR creating cultivar '{cultivar_name}': {e.code} {body}")
                stats["errors"] += 1
                return

    # Check if Reinsaat supplier link exists
    try:
        existing_links = get_cultivar_suppliers(cultivar_id)
    except Exception as exc:
        # Best effort: carry on as if no link exists, but make the failed
        # lookup visible instead of swallowing it silently.
        print(f" WARN: could not fetch supplier links for '{cultivar_name}': {exc}")
        existing_links = []

    has_reinsaat = any(link["supplier_id"] == supplier_id for link in existing_links)

    if has_reinsaat:
        stats["link_existed"] += 1
    else:
        # Create supplier link
        link_data = _build_link_payload(prod, supplier_id, url)
        try:
            api_post(f"/cultivars/{cultivar_id}/suppliers", link_data)
            stats["link_created"] += 1
            new_links.append({
                "cultivar": cultivar_name,
                "article": prod.get("article_number", "?"),
                "url": url,
            })
        except urllib.error.HTTPError as e:
            body = _http_error_body(e)
            print(f" ERROR linking '{cultivar_name}': {e.code} {body}")
            stats["errors"] += 1
|
||||
|
||||
|
||||
# Script entry point: run main() only when this file is executed directly,
# so importing the module does not start the scrape.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user