Add scraper and enrichment scripts to tools/ directory

This commit is contained in:
2026-03-16 11:10:18 +01:00
parent 83ab8c4cf9
commit 0ef902cc91
13 changed files with 6031 additions and 0 deletions
+156
View File
@@ -0,0 +1,156 @@
#!/usr/bin/env python3
"""Enrich HerbAPI species with Wikidata QID, GBIF ID, and EPPO code."""
import json
import time
import urllib.parse
import urllib.request
# Base URL of the HerbAPI REST API that this script reads from and writes to.
HERBAPI_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
# NOTE(review): this bearer token is committed in plain text; rotate it and
# load it from the environment or a secrets store instead of source control.
HERBAPI_TOKEN = "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
# SPARQL endpoint of the Wikidata Query Service.
WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"
# Headers sent with every Wikidata request: an identifying User-Agent with a
# contact address, and a request for JSON results.
HEADERS_WD = {
"User-Agent": "HerbAPI-Enrichment/1.0 (florian.berthold@sub-net.at)",
"Accept": "application/json",
}
def herbapi_request(path, method="GET", data=None, timeout=60):
    """Send an authenticated JSON request to HerbAPI and return the decoded reply.

    Args:
        path: API path appended verbatim to ``HERBAPI_BASE`` (include the
            leading slash and any query string).
        method: HTTP method, e.g. ``"GET"`` or ``"PUT"``.
        data: JSON-serialisable payload, or ``None`` to send no body.
        timeout: Seconds to wait on the socket before giving up, so a stalled
            server cannot hang the script forever.

    Returns:
        The response body parsed as JSON.

    Raises:
        urllib.error.HTTPError: The API answered with an error status.
        urllib.error.URLError: The API could not be reached.
        json.JSONDecodeError: The response body was not valid JSON.
    """
    url = f"{HERBAPI_BASE}{path}"
    # Compare against None so that an empty payload such as {} is still sent.
    body = json.dumps(data).encode() if data is not None else None
    headers = {
        "Authorization": f"Bearer {HERBAPI_TOKEN}",
        "Content-Type": "application/json",
    }
    req = urllib.request.Request(url, data=body, method=method, headers=headers)
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())
def query_wikidata_batch(names):
    """Query Wikidata for a batch of scientific names.

    Args:
        names: Iterable of scientific names, matched exactly against the
            Wikidata taxon-name property (P225).

    Returns:
        Dict mapping each matched name to
        ``{"qid": ..., "gbif_id": ..., "eppo_code": ...}``; ``gbif_id`` and
        ``eppo_code`` are ``None`` when Wikidata has no value. Names without
        a match are absent. If several result rows share a name, the last
        row wins.

    Raises:
        urllib.error.URLError: The SPARQL endpoint could not be reached or
            answered with an error status.
    """
    names = list(names)
    if not names:
        # Nothing to look up: skip the network round trip entirely.
        return {}

    def sparql_literal(text):
        # Escape characters that would terminate or corrupt a double-quoted
        # SPARQL string literal.
        escaped = (
            text.replace("\\", "\\\\")
            .replace('"', '\\"')
            .replace("\n", "\\n")
            .replace("\r", "\\r")
        )
        return f'"{escaped}"'

    values = " ".join(sparql_literal(n) for n in names)
    sparql = "\n".join([
        "SELECT ?name ?item ?gbifId ?eppoCode WHERE {",
        f"VALUES ?name {{ {values} }}",
        "?item wdt:P225 ?name .",
        "OPTIONAL { ?item wdt:P846 ?gbifId }",
        "OPTIONAL { ?item wdt:P3031 ?eppoCode }",
        "}",
    ])
    encoded = urllib.parse.quote(sparql)
    url = f"{WIKIDATA_SPARQL}?query={encoded}&format=json"
    req = urllib.request.Request(url, headers=HEADERS_WD)
    with urllib.request.urlopen(req, timeout=60) as resp:
        data = json.loads(resp.read())

    results = {}
    for binding in data.get("results", {}).get("bindings", []):
        name = binding["name"]["value"]
        # The item value is an entity URL; the QID is its last path segment.
        qid = binding["item"]["value"].rsplit("/", 1)[-1]
        gbif = binding.get("gbifId", {}).get("value")
        eppo = binding.get("eppoCode", {}).get("value")
        results[name] = {"qid": qid, "gbif_id": gbif, "eppo_code": eppo}
    return results
def main():
    """Fill in missing Wikidata QID, GBIF ID and EPPO code on HerbAPI species.

    Fetches the species list, queries Wikidata in batches for every species
    lacking at least one identifier, then updates each affected species with
    a GET / merge / PUT round trip and prints a summary.
    """
    # (HerbAPI field, label used in log output, key in the Wikidata result)
    field_map = (
        ("wikidata_qid", "qid", "qid"),
        ("gbif_id", "gbif", "gbif_id"),
        ("eppo_code", "eppo", "eppo_code"),
    )
    batch_size = 20

    # 1. Fetch all species
    # NOTE(review): only a single page of up to 200 entries is requested —
    # confirm the API cannot hold more species than that.
    resp = herbapi_request("/species?per_page=200")
    species_list = resp["data"]
    print(f"Fetched {len(species_list)} species from HerbAPI\n")

    # 2. Collect species needing enrichment (.get tolerates list entries
    #    that omit an identifier field altogether).
    to_enrich = [
        sp for sp in species_list
        if not all(sp.get(field) for field, _, _ in field_map)
    ]
    if not to_enrich:
        print("All species already enriched.")
        return
    print(f"{len(to_enrich)} species need enrichment\n")

    # 3. Batch query Wikidata
    wikidata_results = {}
    names = [sp["name_scientific"] for sp in to_enrich]
    for i in range(0, len(names), batch_size):
        batch = names[i:i + batch_size]
        print(f"Querying Wikidata batch {i // batch_size + 1}: {len(batch)} species...")
        try:
            results = query_wikidata_batch(batch)
        except Exception as e:
            # Best effort: a failed batch only means its species stay unmatched.
            print(f" ERROR: {e}")
        else:
            wikidata_results.update(results)
            print(f" Got {len(results)} matches")
        if i + batch_size < len(names):
            # Pause between batches to stay polite to the public endpoint.
            time.sleep(2)
    print(f"\nWikidata returned data for {len(wikidata_results)} / {len(names)} species\n")

    # 4. Update HerbAPI - GET full object by slug, merge, PUT by UUID
    updated = 0
    skipped = 0
    not_found = 0
    errors = 0
    for sp in to_enrich:
        name = sp["name_scientific"]
        wd = wikidata_results.get(name)
        if not wd:
            print(f" SKIP (no Wikidata match): {name}")
            not_found += 1
            continue

        # Identifiers Wikidata can supply that the list entry is missing.
        candidates = [
            (field, label, wd[key])
            for field, label, key in field_map
            if wd[key] and not sp.get(field)
        ]
        if not candidates:
            print(f" SKIP (nothing new): {name}")
            skipped += 1
            continue

        try:
            # GET full species by slug for the complete object
            full_sp = herbapi_request(f"/species/{sp['slug']}")
            # Remove read-only fields
            species_id = full_sp.pop("id")
            for read_only in ("slug", "created_at", "updated_at"):
                full_sp.pop(read_only, None)

            # Merge new data, re-checking against the freshly fetched object
            # so a value set since the list was loaded is never overwritten.
            fields = []
            for field, label, value in candidates:
                if full_sp.get(field):
                    continue
                full_sp[field] = str(value)  # API expects string
                fields.append(f"{label}={value}")
            if not fields:
                print(f" SKIP (nothing new): {name}")
                skipped += 1
                continue

            # PUT by UUID
            herbapi_request(f"/species/{species_id}", method="PUT", data=full_sp)
            print(f" UPDATED: {name} -> {', '.join(fields)}")
            updated += 1
        except Exception as e:
            print(f" ERROR updating {name}: {e}")
            errors += 1

    print(f"\n{'=' * 60}")
    print("RESULTS:")
    print(f" Updated: {updated}")
    print(f" Skipped (no new data): {skipped}")
    print(f" Not found on Wikidata: {not_found}")
    print(f" Errors: {errors}")
    print(f" Total species: {len(species_list)}")


if __name__ == "__main__":
    main()
+305
View File
@@ -0,0 +1,305 @@
#!/usr/bin/env python3
"""Expand HerbAPI species database with common permaculture/garden species."""
import json
import time
import urllib.request
import urllib.parse
import urllib.error
import ssl
# Base URL of the HerbAPI REST API.
BASE_URL = "http://herbapi01.corp.sub-net.at:8080/api/v1"
# NOTE(review): this bearer token is committed in plain text; rotate it and
# load it from the environment or a secrets store instead of source control.
AUTH = "Bearer km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
# Pause in seconds used by main() after HerbAPI create requests.
DELAY = 0.15
# SSL context for GBIF (https)
ssl_ctx = ssl.create_default_context()
def api_get(path, timeout=30):
    """GET ``BASE_URL + path`` from HerbAPI and return the decoded JSON body.

    Args:
        path: API path including the leading slash and any query string.
        timeout: Seconds to wait on the socket, so a stalled server cannot
            hang the script indefinitely.

    Raises:
        urllib.error.URLError: The request failed or returned an error status.
        json.JSONDecodeError: The response body was not valid JSON.
    """
    req = urllib.request.Request(f"{BASE_URL}{path}", headers={"Authorization": AUTH})
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())
def api_post(path, data, timeout=30):
    """POST *data* as JSON to HerbAPI.

    Args:
        path: API path including the leading slash.
        data: JSON-serialisable payload.
        timeout: Seconds to wait on the socket before giving up.

    Returns:
        ``(result, status)``. ``result`` is the decoded JSON body, or ``None``
        when the API answered with an HTTP error (which is printed) or with an
        empty body; ``status`` is the HTTP status code in every case.

    Raises:
        urllib.error.URLError: The API could not be reached at all.
        json.JSONDecodeError: A non-empty success body was not valid JSON.
    """
    body = json.dumps(data).encode()
    req = urllib.request.Request(
        f"{BASE_URL}{path}",
        data=body,
        headers={"Authorization": AUTH, "Content-Type": "application/json"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            raw = resp.read()
            status = resp.status
    except urllib.error.HTTPError as e:
        # Never let an oddly encoded error page mask the real HTTP failure.
        err_body = e.read().decode(errors="replace")
        print(f" ERROR {e.code}: {err_body}")
        return None, e.code
    # A success response without a body would make json.loads raise.
    result = json.loads(raw) if raw.strip() else None
    return result, status
def gbif_get_german_name(scientific_name):
    """Query GBIF for the German vernacular name.

    Matches *scientific_name* against the GBIF backbone, then returns the
    first vernacular name whose language code is ``"deu"``.

    Returns:
        The German name, or ``None`` when there is no usable match, no German
        name, or the lookup fails (failures are printed, never raised).
    """
    try:
        match_url = (
            "https://api.gbif.org/v1/species/match"
            f"?name={urllib.parse.quote(scientific_name)}"
        )
        with urllib.request.urlopen(
            urllib.request.Request(match_url), context=ssl_ctx, timeout=10
        ) as resp:
            match = json.loads(resp.read())

        usage_key = match.get("usageKey")
        # A HIGHERRANK match points at the genus or family rather than the
        # species, so its vernacular names would describe the wrong taxon.
        if not usage_key or match.get("matchType") == "HIGHERRANK":
            return None

        names_url = f"https://api.gbif.org/v1/species/{usage_key}/vernacularNames?limit=100"
        with urllib.request.urlopen(
            urllib.request.Request(names_url), context=ssl_ctx, timeout=10
        ) as resp:
            vn = json.loads(resp.read())

        for entry in vn.get("results", []):
            if entry.get("language") == "deu":
                return entry["vernacularName"]
        return None
    except Exception as e:
        # Deliberately best effort: the GBIF name is only informational.
        print(f" GBIF lookup failed for {scientific_name}: {e}")
        return None
# ── Families to ensure exist ─────────────────────────────────────────
# Maps a family's scientific name to its English and German common names.
# main() creates every family listed here that HerbAPI does not return yet.
FAMILIES_NEEDED = {
"Fabaceae": {"name_en": "Legumes", "name_de": "Hülsenfrüchtler"},
"Solanaceae": {"name_en": "Nightshade family", "name_de": "Nachtschattengewächse"},
"Cucurbitaceae": {"name_en": "Gourd family", "name_de": "Kürbisgewächse"},
"Asteraceae": {"name_en": "Daisy family", "name_de": "Korbblütler"},
"Chenopodiaceae": {"name_en": "Goosefoot family", "name_de": "Gänsefußgewächse"},
"Brassicaceae": {"name_en": "Cabbage family", "name_de": "Kreuzblütler"},
"Amaryllidaceae": {"name_en": "Amaryllis family", "name_de": "Amaryllisgewächse"},
"Apiaceae": {"name_en": "Carrot family", "name_de": "Doldenblütler"},
"Poaceae": {"name_en": "Grass family", "name_de": "Süßgräser"},
"Lamiaceae": {"name_en": "Mint family", "name_de": "Lippenblütler"},
"Caprifoliaceae": {"name_en": "Honeysuckle family", "name_de": "Geißblattgewächse"},
"Rosaceae": {"name_en": "Rose family", "name_de": "Rosengewächse"},
"Grossulariaceae": {"name_en": "Gooseberry family", "name_de": "Stachelbeergewächse"},
"Ericaceae": {"name_en": "Heath family", "name_de": "Heidekrautgewächse"},
"Moraceae": {"name_en": "Mulberry family", "name_de": "Maulbeergewächse"},
# New families not yet in the DB:
"Hypericaceae": {"name_en": "St John's wort family", "name_de": "Johanniskrautgewächse"},
"Tropaeolaceae": {"name_en": "Nasturtium family", "name_de": "Kapuzinerkressengewächse"},
"Elaeagnaceae": {"name_en": "Oleaster family", "name_de": "Ölweidengewächse"},
}
# ── Species to add ───────────────────────────────────────────────────
# Format: (scientific_name, family, name_en, name_de, plant_layer, extra_fields)
# `family` must be a scientific family name known to HerbAPI or listed in
# FAMILIES_NEEDED; main() skips a species whose family cannot be resolved.
# `extra_fields` is copied key-by-key into the POST payload, so its keys are
# sent to the API as species fields.
SPECIES = [
# Vegetables
("Phaseolus vulgaris", "Fabaceae", "common bean", "Gartenbohne", "herbaceous",
{"nitrogen_fixer": True, "food_uses": "Beans (pods, seeds)"}),
("Phaseolus coccineus", "Fabaceae", "runner bean", "Feuerbohne", "herbaceous",
{"nitrogen_fixer": True, "food_uses": "Beans (pods, seeds), flowers", "attracts_pollinators": True}),
("Pisum sativum", "Fabaceae", "pea", "Erbse", "herbaceous",
{"nitrogen_fixer": True, "food_uses": "Peas, shoots"}),
("Capsicum annuum", "Solanaceae", "pepper", "Paprika", "herbaceous",
{"food_uses": "Fruit"}),
("Cucumis sativus", "Cucurbitaceae", "cucumber", "Gurke", "ground_cover",
{"food_uses": "Fruit"}),
("Cucurbita maxima", "Cucurbitaceae", "winter squash", "Riesenkürbis", "ground_cover",
{"food_uses": "Fruit, seeds, flowers"}),
("Cucurbita moschata", "Cucurbitaceae", "butternut squash", "Moschuskürbis", "ground_cover",
{"food_uses": "Fruit, seeds"}),
("Lactuca sativa", "Asteraceae", "lettuce", "Salat", "herbaceous",
{"food_uses": "Leaves"}),
("Spinacia oleracea", "Chenopodiaceae", "spinach", "Spinat", "herbaceous",
{"food_uses": "Leaves"}),
("Brassica oleracea", "Brassicaceae", "cabbage / kale", "Kohl", "herbaceous",
{"food_uses": "Leaves, flower buds, stems"}),
("Brassica rapa", "Brassicaceae", "turnip", "Rübe", "herbaceous",
{"food_uses": "Root, leaves"}),
("Raphanus sativus", "Brassicaceae", "radish", "Rettich", "herbaceous",
{"food_uses": "Root, leaves, seed pods"}),
("Allium cepa", "Amaryllidaceae", "onion", "Zwiebel", "herbaceous",
{"food_uses": "Bulb, leaves"}),
("Allium sativum", "Amaryllidaceae", "garlic", "Knoblauch", "herbaceous",
{"food_uses": "Bulb, scapes", "medicinal_uses": "Antimicrobial, cardiovascular"}),
("Allium schoenoprasum", "Amaryllidaceae", "chives", "Schnittlauch", "herbaceous",
{"food_uses": "Leaves, flowers", "attracts_pollinators": True}),
("Petroselinum crispum", "Apiaceae", "parsley", "Petersilie", "herbaceous",
{"food_uses": "Leaves, root"}),
("Apium graveolens", "Apiaceae", "celery", "Sellerie", "herbaceous",
{"food_uses": "Stalks, root, leaves"}),
("Foeniculum vulgare", "Apiaceae", "fennel", "Fenchel", "herbaceous",
{"food_uses": "Bulb, fronds, seeds", "attracts_beneficial_insects": True}),
("Pastinaca sativa", "Apiaceae", "parsnip", "Pastinake", "herbaceous",
{"food_uses": "Root"}),
("Zea mays", "Poaceae", "corn", "Mais", "herbaceous",
{"food_uses": "Kernels, cobs"}),
("Solanum melongena", "Solanaceae", "eggplant", "Melanzani", "herbaceous",
{"food_uses": "Fruit"}),
# Herbs
("Ocimum basilicum", "Lamiaceae", "basil", "Basilikum", "herbaceous",
{"food_uses": "Leaves", "attracts_pollinators": True}),
("Origanum vulgare", "Lamiaceae", "oregano", "Oregano", "herbaceous",
{"food_uses": "Leaves", "attracts_pollinators": True, "attracts_beneficial_insects": True}),
("Mentha x piperita", "Lamiaceae", "peppermint", "Pfefferminze", "herbaceous",
{"food_uses": "Leaves (tea, culinary)", "medicinal_uses": "Digestive, headache relief", "invasiveness": "spreading"}),
("Rosmarinus officinalis", "Lamiaceae", "rosemary", "Rosmarin", "herbaceous",
{"food_uses": "Leaves", "attracts_pollinators": True}),
("Anethum graveolens", "Apiaceae", "dill", "Dill", "herbaceous",
{"food_uses": "Leaves, seeds", "attracts_beneficial_insects": True}),
("Coriandrum sativum", "Apiaceae", "coriander", "Koriander", "herbaceous",
{"food_uses": "Leaves, seeds", "attracts_beneficial_insects": True}),
("Artemisia absinthium", "Asteraceae", "wormwood", "Wermut", "herbaceous",
{"medicinal_uses": "Digestive, anti-parasitic", "other_uses": "Companion plant pest deterrent", "allelopathic": True}),
("Achillea millefolium", "Asteraceae", "yarrow", "Schafgarbe", "herbaceous",
{"food_uses": "Young leaves (salad)", "medicinal_uses": "Wound healing, anti-inflammatory",
"dynamic_accumulator": True, "dynamic_accumulator_nutrients": "K, P, Cu",
"attracts_beneficial_insects": True, "attracts_pollinators": True}),
("Hypericum perforatum", "Hypericaceae", "St John's wort", "Johanniskraut", "herbaceous",
{"medicinal_uses": "Antidepressant, wound healing", "attracts_pollinators": True}),
("Echinacea purpurea", "Asteraceae", "echinacea", "Sonnenhut", "herbaceous",
{"medicinal_uses": "Immune stimulant", "attracts_pollinators": True, "wildlife_value": "Seeds for birds"}),
("Valeriana officinalis", "Caprifoliaceae", "valerian", "Baldrian", "herbaceous",
{"medicinal_uses": "Sedative, sleep aid", "attracts_pollinators": True,
"other_uses": "Earthworm attractant (biodynamic)"}),
# Flowers & cover crops
("Tagetes patula", "Asteraceae", "French marigold", "Studentenblume", "herbaceous",
{"other_uses": "Nematode suppression, companion plant", "attracts_pollinators": True}),
("Helianthus annuus", "Asteraceae", "sunflower", "Sonnenblume", "herbaceous",
{"food_uses": "Seeds, oil", "attracts_pollinators": True, "wildlife_value": "Seeds for birds"}),
("Tropaeolum majus", "Tropaeolaceae", "nasturtium", "Kapuzinerkresse", "ground_cover",
{"food_uses": "Leaves, flowers, seeds (capers)", "other_uses": "Trap crop for aphids"}),
("Centaurea cyanus", "Asteraceae", "cornflower", "Kornblume", "herbaceous",
{"food_uses": "Flowers (edible garnish)", "attracts_pollinators": True, "attracts_beneficial_insects": True}),
("Sinapis alba", "Brassicaceae", "white mustard", "Weißer Senf", "herbaceous",
{"food_uses": "Seeds, young leaves", "other_uses": "Green manure, biofumigant"}),
("Trifolium repens", "Fabaceae", "white clover", "Weißklee", "ground_cover",
{"nitrogen_fixer": True, "food_uses": "Flowers (tea), young leaves",
"ground_cover_quality": "excellent", "attracts_pollinators": True}),
("Medicago sativa", "Fabaceae", "alfalfa", "Luzerne", "herbaceous",
{"nitrogen_fixer": True, "food_uses": "Sprouts",
"dynamic_accumulator": True, "dynamic_accumulator_nutrients": "N, K, Ca, Mg, Fe",
"other_uses": "Green manure, deep-rooting soil improver"}),
# Fruit / Trees
("Prunus avium", "Rosaceae", "sweet cherry", "Süßkirsche", "canopy",
{"food_uses": "Fruit", "attracts_pollinators": True, "wildlife_value": "Fruit for birds"}),
("Prunus cerasus", "Rosaceae", "sour cherry", "Sauerkirsche", "understory",
{"food_uses": "Fruit (cooking, preserves)", "attracts_pollinators": True}),
("Pyrus communis", "Rosaceae", "pear", "Birne", "canopy",
{"food_uses": "Fruit", "attracts_pollinators": True}),
("Ribes uva-crispa", "Grossulariaceae", "gooseberry", "Stachelbeere", "shrub",
{"food_uses": "Berries"}),
("Rubus fruticosus", "Rosaceae", "blackberry", "Brombeere", "shrub",
{"food_uses": "Berries, leaves (tea)", "attracts_pollinators": True,
"wildlife_value": "Berries for birds, nesting habitat", "invasiveness": "spreading"}),
("Vaccinium myrtillus", "Ericaceae", "bilberry", "Heidelbeere", "shrub",
{"food_uses": "Berries", "medicinal_uses": "Antioxidant, eye health"}),
("Hippophae rhamnoides", "Elaeagnaceae", "sea buckthorn", "Sanddorn", "shrub",
{"nitrogen_fixer": True, "food_uses": "Berries (juice, oil)",
"medicinal_uses": "High vitamin C, skin care",
"other_uses": "Erosion control, windbreak"}),
("Morus alba", "Moraceae", "white mulberry", "Weiße Maulbeere", "canopy",
{"food_uses": "Fruit, young leaves", "wildlife_value": "Fruit for birds"}),
]
def main():
    """Create any missing families, then add every catalogued species that is new.

    Progress is printed per item; a summary with the created / skipped /
    failed counts is printed at the end.
    """
    # 1. Load existing families
    print("=== Loading existing families ===")
    fam_resp = api_get("/families?per_page=100")
    # name_scientific -> id
    family_map = {row["name_scientific"]: row["id"] for row in fam_resp["data"]}
    print(f" Found {len(family_map)} existing families")

    # 2. Create missing families
    print("\n=== Creating missing families ===")
    families_created = 0
    for fam_name, fam_info in FAMILIES_NEEDED.items():
        if fam_name in family_map:
            print(f" SKIP (exists): {fam_name}")
            continue
        print(f" CREATE: {fam_name} ...", end=" ")
        result, status = api_post("/families", {
            "name_scientific": fam_name,
            "name_en": fam_info["name_en"],
            "name_de": fam_info["name_de"],
        })
        if not (result and "id" in result):
            print(f"FAILED (status={status})")
        else:
            new_id = result["id"]
            family_map[fam_name] = new_id
            print(f"OK ({new_id})")
            families_created += 1
        time.sleep(DELAY)
    print(f"\n Families created: {families_created}")

    # 3. Load existing species
    print("\n=== Loading existing species ===")
    sp_resp = api_get("/species?per_page=200")
    existing_species = {row["name_scientific"] for row in sp_resp["data"]}
    print(f" Found {len(existing_species)} existing species")

    # 4. Add new species
    print("\n=== Adding new species ===")
    created = skipped = failed = 0
    for sci_name, family, name_en, name_de, plant_layer, extras in SPECIES:
        if sci_name in existing_species:
            print(f" SKIP (exists): {sci_name}")
            skipped += 1
            continue

        # Resolve the family; an unknown family counts as a failure.
        fam_id = family_map.get(family)
        if not fam_id:
            print(f" SKIP (no family '{family}'): {sci_name}")
            failed += 1
            continue

        # The GBIF name is logged for validation only; the curated name_de
        # stays the value that is sent.
        gbif_de = gbif_get_german_name(sci_name)
        if gbif_de:
            print(f" GBIF name for {sci_name}: {gbif_de}")

        # Core fields first, then the per-species extras layered on top.
        payload = {
            "name_scientific": sci_name,
            "family_id": fam_id,
            "name_en": name_en,
            "name_de": name_de,
            "plant_layer": plant_layer,
            **extras,
        }
        print(f" CREATE: {sci_name} ({name_de}) ...", end=" ")
        result, status = api_post("/species", payload)
        if not (result and "id" in result):
            print(f"FAILED (status={status})")
            failed += 1
        else:
            print(f"OK ({result['id']})")
            created += 1
        time.sleep(DELAY)

    print(f"\n{'='*50}")
    print("SUMMARY")
    print(f" Families created: {families_created}")
    print(f" Species created: {created}")
    print(f" Species skipped: {skipped}")
    print(f" Species failed: {failed}")
    print(f" Total species now: {len(existing_species) + created}")


if __name__ == "__main__":
    main()
+362
View File
@@ -0,0 +1,362 @@
#!/usr/bin/env python3
"""Import CC-licensed plant images from Wikimedia Commons via Wikidata into HerbAPI."""
import json
import os
import re
import subprocess
import sys
import time
import urllib.parse
import urllib.request
# Switch stdout/stderr to line buffering so each progress line is flushed
# as soon as it is printed.
sys.stdout.reconfigure(line_buffering=True)
sys.stderr.reconfigure(line_buffering=True)
# --- Configuration ---
# NOTE(review): the S3 keys and the database password below are committed in
# plain text; rotate them and load them from the environment or a secrets
# store instead of source control.
# S3 endpoint, bucket and credentials handed to the AWS CLI by s3_upload().
S3_ENDPOINT = "http://garage.sub-net.at:3900"
S3_BUCKET = "herbapi"
S3_ACCESS_KEY = "GK1a89859373a6ac56bf11958f"
S3_SECRET_KEY = "bea45a333b5c7b1efdd7466bdbcac54d8642fa19f0c617ca2fd64bd07951b899"
S3_REGION = "garage"
# PostgreSQL connection settings handed to the psql CLI by psql().
DB_HOST = "10.31.3.90"
DB_USER = "herbapi"
DB_PASS = "_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj"
DB_NAME = "herbapi"
# User-Agent sent with every outgoing HTTP request.
USER_AGENT = "HerbAPI/1.0 (https://herbapi.naturalised.at; florian.berthold@sub-net.at)"
# Thumbnail width requested from the Commons API (iiurlwidth).
THUMB_WIDTH = 800
# Pause in seconds before each external request made by main().
REQUEST_DELAY = 0.3
# Lower-cased licence short names accepted outright by is_license_allowed().
ALLOWED_LICENSES = {
"cc0", "cc-zero", "cc0 1.0", "cc-zero 1.0",
"public domain", "pd", "pd-self", "pd-old", "pd-old-auto", "pd-old-100",
"pd-us", "pd-usgov", "pd-author",
"cc by 1.0", "cc by 2.0", "cc by 2.5", "cc by 3.0", "cc by 4.0",
"cc-by-1.0", "cc-by-2.0", "cc-by-2.5", "cc-by-3.0", "cc-by-4.0",
"cc by-sa 1.0", "cc by-sa 2.0", "cc by-sa 2.5", "cc by-sa 3.0", "cc by-sa 4.0",
"cc-by-sa-1.0", "cc-by-sa-2.0", "cc-by-sa-2.5", "cc-by-sa-3.0", "cc-by-sa-4.0",
}
def slugify(name: str) -> str:
    """Turn a scientific name into a lower-case, hyphen-separated slug.

    Every run of characters outside ``a-z0-9`` becomes a single hyphen, and
    hyphens at either end are trimmed.
    """
    lowered = name.lower()
    hyphenated = re.sub(r'[^a-z0-9]+', '-', lowered)
    return hyphenated.strip('-')
def psql(query: str) -> str:
    """Execute *query* through the ``psql`` CLI and return its stripped stdout.

    Output is tuples-only and unaligned (``-t -A``), so columns are separated
    by ``|``. A non-zero exit status is reported on stderr; whatever stdout
    was produced is still returned.
    """
    command = ["psql", "-h", DB_HOST, "-U", DB_USER, DB_NAME, "-t", "-A", "-c", query]
    completed = subprocess.run(
        command,
        capture_output=True,
        text=True,
        # The password travels via the environment, not the command line.
        env={**os.environ, "PGPASSWORD": DB_PASS},
    )
    if completed.returncode:
        print(f" psql error: {completed.stderr.strip()}", file=sys.stderr)
    return completed.stdout.strip()
def fetch_json(url: str) -> dict | None:
    """GET *url* with the project User-Agent and decode the body as JSON.

    Any failure (network, HTTP status, invalid JSON) is printed and reported
    to the caller as ``None``.
    """
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            payload = response.read()
        return json.loads(payload)
    except Exception as exc:
        print(f" HTTP error fetching {url}: {exc}")
        return None
def get_wikidata_image(qid: str) -> str | None:
    """Return the Commons filename of the item's P18 image, if it has one.

    Returns ``None`` when the query fails or the item has no image.
    """
    sparql = f"SELECT ?image WHERE {{ wd:{qid} wdt:P18 ?image }} LIMIT 1"
    params = urllib.parse.urlencode({"query": sparql, "format": "json"})
    data = fetch_json("https://query.wikidata.org/sparql?" + params)
    rows = data.get("results", {}).get("bindings", []) if data else []
    if not rows:
        return None
    # The value is a URL such as
    # http://commons.wikimedia.org/wiki/Special:FilePath/Filename.jpg;
    # the filename is its percent-decoded last path segment.
    commons_url = rows[0]["image"]["value"]
    _, _, encoded_name = commons_url.rpartition("/")
    return urllib.parse.unquote(encoded_name)
def get_commons_info(filename: str) -> dict | None:
    """Look up thumbnail URL, licence and artist for a Commons file.

    Returns:
        Dict with ``thumb_url``, ``description_url``, ``license``, ``artist``
        and ``filename``, or ``None`` when the request fails, the file is
        unknown to Commons, or no image info is returned.
    """
    query = urllib.parse.urlencode({
        "action": "query",
        "titles": f"File:{filename}",
        "prop": "imageinfo",
        "iiprop": "url|extmetadata",
        "iiurlwidth": str(THUMB_WIDTH),
        "format": "json",
    })
    data = fetch_json("https://commons.wikimedia.org/w/api.php?" + query)
    if not data:
        return None

    # Only the first page of the reply is ever inspected.
    pages = data.get("query", {}).get("pages", {})
    first = next(iter(pages.items()), None)
    if first is None:
        return None
    page_id, page = first
    if page_id == "-1":
        return None

    revisions = page.get("imageinfo", [])
    if not revisions:
        return None
    info = revisions[0]
    meta = info.get("extmetadata", {})

    # The Artist field is HTML: drop the tags, then collapse whitespace.
    artist_html = meta.get("Artist", {}).get("value", "")
    artist = re.sub(r'\s+', ' ', re.sub(r'<[^>]+>', '', artist_html).strip())

    return {
        # Fall back to the full-size URL when no thumbnail URL is present.
        "thumb_url": info.get("thumburl") or info.get("url"),
        "description_url": info.get("descriptionurl", ""),
        "license": meta.get("LicenseShortName", {}).get("value", ""),
        "artist": artist,
        "filename": filename,
    }
def is_license_allowed(license_str: str) -> bool:
    """Check if a license is in our allowed list.

    Accepts exact entries of ``ALLOWED_LICENSES`` plus anything that looks
    like Public Domain, CC0, CC BY or CC BY-SA. Licences carrying a
    NonCommercial (NC) or NoDerivatives (ND) clause are rejected.

    Args:
        license_str: Licence short name as reported by Commons; may be empty.

    Returns:
        ``True`` when images under this licence may be imported.
    """
    if not license_str:
        return False
    normalized = license_str.lower().strip()
    # Direct match
    if normalized in ALLOWED_LICENSES:
        return True
    # Reject NC / ND only when they stand alone as a clause ("by-nc",
    # "by nd 2.0"), not when the letters merely occur inside another word
    # such as a jurisdiction name.
    if re.search(r'(?<![a-z])(?:nc|nd)(?![a-z])', normalized):
        return False
    # Check patterns
    if normalized.startswith(("public domain", "pd")):
        return True
    if re.match(r'^cc[- ]?by[- ]?sa[- ]?\d', normalized):
        return True
    if re.match(r'^cc[- ]?by[- ]?\d', normalized):
        return True
    if re.match(r'^cc[- ]?0', normalized) or normalized == "cc zero":
        return True
    return False
def normalize_license(license_str: str) -> str:
    """Map a licence short name onto the canonical label stored in the DB.

    Public-domain variants become ``"Public domain"``, CC0 variants become
    ``"CC0 1.0"``, and CC BY / CC BY-SA keep their version number.
    Anything unrecognised is returned exactly as given.
    """
    key = license_str.lower().strip()
    if key.startswith("pd") or "public domain" in key:
        return "Public domain"
    if re.match(r'^cc[- ]?0', key) or any(tag in key for tag in ("cc-zero", "cc zero")):
        return "CC0 1.0"
    # BY-SA is listed first to mirror the precedence of the checks.
    versioned = (
        (r'^cc[- ]?by[- ]?sa[- ]?(\d+\.?\d*)', "CC BY-SA"),
        (r'^cc[- ]?by[- ]?(\d+\.?\d*)', "CC BY"),
    )
    for pattern, label in versioned:
        found = re.match(pattern, key)
        if found:
            return f"{label} {found.group(1)}"
    return license_str
def s3_upload(s3_key: str, data: bytes, content_type: str = "image/jpeg"):
    """Upload to S3 Garage using AWS CLI.

    Args:
        s3_key: Object key inside ``S3_BUCKET``.
        data: Raw bytes to store.
        content_type: MIME type recorded on the object.

    Raises:
        RuntimeError: The AWS CLI exited with a non-zero status.
    """
    # Imported locally so the block needs no change to the file's import list.
    import tempfile

    env = {
        **os.environ,
        "AWS_ACCESS_KEY_ID": S3_ACCESS_KEY,
        "AWS_SECRET_ACCESS_KEY": S3_SECRET_KEY,
        "AWS_DEFAULT_REGION": S3_REGION,
    }
    # mkstemp gives a unique, private file: a fixed name in /tmp is
    # predictable and collides when two imports run at once.
    fd, tmp_path = tempfile.mkstemp(prefix="herbapi_upload_")
    try:
        with os.fdopen(fd, "wb") as f:
            f.write(data)
        result = subprocess.run(
            [
                "aws", "s3", "cp", tmp_path,
                f"s3://{S3_BUCKET}/{s3_key}",
                "--endpoint-url", S3_ENDPOINT,
                "--content-type", content_type,
            ],
            capture_output=True, text=True, env=env,
        )
    finally:
        # Remove the temp file even if the CLI could not be started.
        os.unlink(tmp_path)
    if result.returncode != 0:
        raise RuntimeError(f"S3 upload failed: {result.stderr.strip()}")
def download_image(url: str) -> bytes | None:
    """Fetch the raw bytes behind *url*; print the error and return None on failure."""
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        with urllib.request.urlopen(request, timeout=60) as response:
            payload = response.read()
    except Exception as exc:
        print(f" Download error: {exc}")
        return None
    return payload
def main():
    """Import one primary Commons image for every species with a Wikidata QID.

    For each species without an image row: look up the P18 image on Wikidata,
    fetch licence/artist/thumbnail from Commons, skip disallowed licences,
    download the thumbnail, upload it to S3 and insert a row into ``images``.
    A summary of all counters is printed at the end.
    """
    content_types = {
        "jpg": "image/jpeg",
        "png": "image/png",
        "svg": "image/svg+xml",
        "gif": "image/gif",
    }

    def sql_quote(value: str) -> str:
        """Return *value* as a single-quoted SQL literal (quotes doubled)."""
        return "'" + value.replace("'", "''") + "'"

    # 1. Get species
    rows = psql(
        "SELECT id, name_scientific, wikidata_qid FROM species "
        "WHERE wikidata_qid IS NOT NULL AND wikidata_qid <> '' "
        "ORDER BY name_scientific"
    )
    if not rows:
        print("No species with wikidata_qid found.")
        return
    species_list = []
    for line in rows.split("\n"):
        parts = line.split("|")
        if len(parts) == 3:
            species_list.append({"id": parts[0], "name": parts[1], "qid": parts[2]})
    print(f"Found {len(species_list)} species with Wikidata QIDs.")

    # 2. Get existing images
    existing_rows = psql("SELECT entity_id FROM images WHERE entity_type = 'species'")
    existing = {line.strip() for line in existing_rows.split("\n") if line.strip()}
    print(f"Found {len(existing)} species that already have images.")

    imported = 0
    skipped_existing = 0
    skipped_no_image = 0
    skipped_license = 0
    skipped_download = 0
    errors = 0

    for position, sp in enumerate(species_list, start=1):
        name = sp["name"]
        qid = sp["qid"]
        sp_id = sp["id"]
        slug = slugify(name)
        print(f"\n[{position}/{len(species_list)}] {name} ({qid})")

        if sp_id in existing:
            print(" Already has image, skipping.")
            skipped_existing += 1
            continue

        # Query Wikidata for image
        time.sleep(REQUEST_DELAY)
        filename = get_wikidata_image(qid)
        if not filename:
            print(" No image on Wikidata.")
            skipped_no_image += 1
            continue

        # Get Commons info
        time.sleep(REQUEST_DELAY)
        info = get_commons_info(filename)
        if not info:
            print(f" Could not get Commons info for {filename}")
            skipped_no_image += 1
            continue

        # Check license
        raw_license = info["license"]
        if not is_license_allowed(raw_license):
            print(f" License not allowed: {raw_license}")
            skipped_license += 1
            continue
        norm_license = normalize_license(raw_license)
        artist = info["artist"]
        thumb_url = info["thumb_url"]
        desc_url = info["description_url"]

        # Commons may report neither a thumbnail nor an original URL; the
        # slicing and download below would otherwise fail on None.
        if not thumb_url:
            print(" No image URL in Commons info.")
            skipped_download += 1
            continue

        print(f" License: {raw_license} -> {norm_license}")
        print(f" Artist: {artist[:80]}")
        print(f" Thumbnail: {thumb_url[:100]}...")

        # Download image
        time.sleep(REQUEST_DELAY)
        image_data = download_image(thumb_url)
        if not image_data:
            print(" Failed to download image.")
            skipped_download += 1
            continue
        print(f" Downloaded {len(image_data)} bytes")

        # Determine the extension from the final suffix of the URL path, so a
        # rasterised SVG ("Foo.svg/800px-Foo.svg.png") is stored as PNG and a
        # stray ".png" elsewhere in the URL cannot mislead the check.
        url_path = urllib.parse.urlparse(thumb_url).path.lower()
        suffix = url_path.rsplit(".", 1)[-1]
        ext = suffix if suffix in content_types else "jpg"
        s3_key = f"species/{slug}.{ext}"
        content_type = content_types[ext]

        # Upload to S3
        try:
            s3_upload(s3_key, image_data, content_type)
            print(f" Uploaded to s3://{S3_BUCKET}/{s3_key}")
        except RuntimeError as e:
            print(f" S3 upload failed: {e}")
            errors += 1
            continue

        # Insert into database. RETURNING makes psql print the new id on
        # success, so empty output reliably signals a failed insert.
        caption = f"Photo: {artist}" if artist else "Wikimedia Commons"
        insert_sql = (
            "INSERT INTO images (id, entity_type, entity_id, s3_key, caption, source_url, license, is_primary) "
            f"VALUES (gen_random_uuid(), 'species', {sql_quote(sp_id)}, {sql_quote(s3_key)}, "
            f"{sql_quote(caption)}, {sql_quote(desc_url)}, {sql_quote(norm_license)}, true) "
            "RETURNING id"
        )
        if not psql(insert_sql):
            print(" Database insert failed.")
            errors += 1
            continue
        print(" Inserted into images table.")
        imported += 1

    print(f"\n{'='*60}")
    print("DONE!")
    print(f" Imported: {imported}")
    print(f" Skipped (existing):{skipped_existing}")
    print(f" Skipped (no image):{skipped_no_image}")
    print(f" Skipped (license): {skipped_license}")
    print(f" Skipped (download):{skipped_download}")
    print(f" Errors: {errors}")
    print(f" Total processed: {len(species_list)}")


if __name__ == "__main__":
    main()
+290
View File
@@ -0,0 +1,290 @@
#!/usr/bin/env python3
"""Import CC-licensed plant images from Wikimedia Commons into HerbAPI."""
import hashlib
import json
import os
import re
import subprocess
import sys
import time
import urllib.parse
import urllib.request
# Config
# NOTE(review): the database password and the S3 keys below are committed in
# plain text; rotate them and load them from the environment or a secrets
# store instead of source control.
# PostgreSQL connection settings handed to the psql CLI by psql().
DB_HOST = "10.31.3.90"
DB_USER = "herbapi"
DB_PASS = "_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj"
DB_NAME = "herbapi"
# Target bucket and endpoint for the AWS CLI upload.
S3_BUCKET = "herbapi"
S3_ENDPOINT = "http://10.31.3.170:3900"
# User-Agent sent with every outgoing HTTP request.
USER_AGENT = "HerbAPI/1.0 (https://herbapi.naturalised.at; florian.berthold@sub-net.at)"
# Pause in seconds after each external request made by process_species().
REQUEST_DELAY = 0.3
# AWS env for subprocess calls: the current environment plus S3 credentials.
AWS_ENV = {
**os.environ,
"AWS_ACCESS_KEY_ID": "GK1a89859373a6ac56bf11958f",
"AWS_SECRET_ACCESS_KEY": "bea45a333b5c7b1efdd7466bdbcac54d8642fa19f0c617ca2fd64bd07951b899",
"AWS_DEFAULT_REGION": "garage",
}
# Stats: module-level outcome counters incremented by process_species().
stats = {"total": 0, "imported": 0, "no_p18": 0, "bad_license": 0, "download_fail": 0, "upload_fail": 0, "errors": 0}
def fetch_url(url):
    """Return the raw response body of *url*, requested with the project User-Agent.

    Network and HTTP errors propagate to the caller.
    """
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    with urllib.request.urlopen(request, timeout=30) as response:
        body = response.read()
    return body
def fetch_json(url):
    """Download *url* via fetch_url() and decode the body as JSON."""
    raw = fetch_url(url)
    return json.loads(raw)
def psql(sql):
    """Run psql command and return output.

    Args:
        sql: SQL statement passed to ``psql -c``.

    Returns:
        Stripped stdout in tuples-only, unaligned form (``-t -A``). On a
        non-zero exit status the error is reported on stderr and whatever
        stdout was produced is still returned.
    """
    result = subprocess.run(
        ["psql", "-h", DB_HOST, "-U", DB_USER, DB_NAME, "-t", "-A", "-c", sql],
        capture_output=True, text=True,
        # The password travels via the environment, not the command line.
        env={**os.environ, "PGPASSWORD": DB_PASS},
    )
    if result.returncode != 0:
        # Surface failures instead of returning silently empty output.
        print(f" psql error: {result.stderr.strip()}", file=sys.stderr)
    return result.stdout.strip()
def is_license_allowed(license_str):
    """Check if license is CC0/CC-BY/CC-BY-SA or Public Domain.

    Wikimedia returns things like 'CC BY-SA 3.0', 'CC BY 4.0', 'CC0',
    'Public domain'; hyphenated spellings such as 'CC-BY-SA-4.0' are
    accepted as well.
    We allow CC0, Public Domain, CC BY (any version), CC BY-SA (any version).
    We reject: GFDL, CC BY-NC, CC BY-ND, CC BY-NC-SA, CC BY-NC-ND, FAL,
    Copyrighted free use.

    Args:
        license_str: Licence short name; may be empty or ``None``.

    Returns:
        ``True`` when the licence is acceptable, ``False`` otherwise.
    """
    if not license_str:
        return False
    ls = license_str.lower().strip()
    words = ls.split()
    # Reject NC and ND explicitly first
    if "nc" in words or "nd" in words or "-nc" in ls or "-nd" in ls:
        return False
    # Public domain
    if "public domain" in ls or ls.startswith("pd"):
        return True
    # CC0 in any spelling: "cc0", "cc0 1.0 universal", "cc-zero", "cc zero"
    if re.match(r"cc[\s-]?(?:0|zero)\b", ls):
        return True
    # CC BY and CC BY-SA (any version, any jurisdiction), separated by spaces
    # or hyphens. NC/ND variants were already rejected above.
    return re.match(r"cc[\s-]+by\b", ls) is not None
def get_wikidata_image(qid):
    """Return the Commons filename of the item's P18 image, or None if it has none.

    Errors raised while fetching or decoding the query propagate to the caller.
    """
    sparql = f"SELECT ?image WHERE {{ wd:{qid} wdt:P18 ?image }} LIMIT 1"
    endpoint = "https://query.wikidata.org/sparql"
    data = fetch_json(f"{endpoint}?query={urllib.parse.quote(sparql)}&format=json")
    rows = data.get("results", {}).get("bindings", [])
    if rows:
        # The filename is the percent-decoded last segment of the image URL.
        commons_url = rows[0]["image"]["value"]
        return urllib.parse.unquote(commons_url.rsplit("/", 1)[-1])
    return None
def get_commons_info(filename):
    """Fetch licence, artist, thumbnail URL and description URL for a Commons file.

    Returns:
        Dict with ``license``, ``artist``, ``thumb_url``, ``desc_url`` and
        ``filename``; ``None`` when Commons does not know the file or the
        reply contains no page.
    """
    query = "&".join((
        "action=query",
        f"titles={urllib.parse.quote(f'File:{filename}')}",
        "prop=imageinfo",
        "iiprop=url|extmetadata",
        "iiurlwidth=800",
        "format=json",
    ))
    data = fetch_json(f"https://commons.wikimedia.org/w/api.php?{query}")

    # Only the first page of the reply is ever inspected.
    pages = data.get("query", {}).get("pages", {})
    first = next(iter(pages.items()), None)
    if first is None:
        return None
    page_id, page = first
    if page_id == "-1":
        return None

    imageinfo = page.get("imageinfo", [{}])[0]
    meta = imageinfo.get("extmetadata", {})

    # The Artist field is HTML: drop the tags, collapse whitespace and cap
    # the length at 120 characters.
    artist_html = meta.get("Artist", {}).get("value", "")
    artist = re.sub(r"\s+", " ", re.sub(r"<[^>]+>", "", artist_html).strip())
    if len(artist) > 120:
        artist = artist[:117] + "..."

    return {
        "license": meta.get("LicenseShortName", {}).get("value", "").strip(),
        "artist": artist,
        # API-provided thumbnail URL (iiurlwidth=800); empty when absent.
        "thumb_url": imageinfo.get("thumburl", ""),
        "desc_url": imageinfo.get("descriptionurl", ""),
        "filename": filename,
    }
def _thumb_extension(thumb_url):
    """Return ``png``, ``gif`` or ``jpg`` from the thumbnail URL's file name."""
    basename = thumb_url.lower().split("?")[0].split("/")[-1]
    if basename.endswith(".png"):
        return "png"
    if basename.endswith(".gif"):
        return "gif"
    return "jpg"


def _sql_quote(value):
    """Render *value* as a single-quoted SQL string literal (quotes doubled)."""
    return "'" + str(value).replace("'", "''") + "'"


def process_species(species_id, slug, name_sci, qid):
    """Process a single species: fetch image from Wikidata/Commons, upload to S3, insert to DB.

    Args:
        species_id: Value written to ``images.entity_id``.
        slug: Species slug; used to build the S3 key ``species/<slug>.<ext>``.
        name_sci: Scientific name (not used here; kept for the caller's signature).
        qid: Wikidata item id whose P18 image is looked up.

    Returns:
        True when the image was uploaded and the DB row inserted, False on any
        failure. Every outcome increments exactly one counter in ``stats``
        besides ``total``.
    """
    # Local import so this function does not depend on the file's import block.
    import tempfile

    stats["total"] += 1

    # Step 1: Get image filename from Wikidata
    try:
        filename = get_wikidata_image(qid)
    except Exception as e:
        print(f" ERROR querying Wikidata for {qid}: {e}")
        stats["errors"] += 1
        return False
    time.sleep(REQUEST_DELAY)
    if not filename:
        print(f" No P18 image for {qid}")
        stats["no_p18"] += 1
        return False

    # Step 2: Get Commons info (license, artist, thumb URL)
    try:
        info = get_commons_info(filename)
    except Exception as e:
        print(f" ERROR querying Commons for {filename}: {e}")
        stats["errors"] += 1
        return False
    time.sleep(REQUEST_DELAY)
    if not info:
        print(f" No Commons info for {filename}")
        stats["errors"] += 1
        return False

    # Step 3: Check license
    if not is_license_allowed(info["license"]):
        print(f" Bad license: {info['license']} for {filename}")
        stats["bad_license"] += 1
        return False

    # Step 4: Download thumbnail using API-provided URL
    thumb_url = info["thumb_url"]
    if not thumb_url:
        print(f" No thumbnail URL available for {filename}")
        stats["download_fail"] += 1
        return False
    ext = _thumb_extension(thumb_url)

    # A tempfile-generated name avoids a predictable /tmp path and cannot be
    # broken by path separators in the slug. The outer finally removes the file
    # on every exit path, including a failed or partial download.
    tmp_path = None
    try:
        try:
            img_data = fetch_url(thumb_url)
            with tempfile.NamedTemporaryFile(
                prefix="herbapi_img_", suffix=f".{ext}", delete=False
            ) as tmp:
                tmp_path = tmp.name
                tmp.write(img_data)
        except Exception as e:
            print(f" ERROR downloading {thumb_url}: {e}")
            stats["download_fail"] += 1
            return False
        time.sleep(REQUEST_DELAY)

        # Step 5: Upload to S3
        s3_key = f"species/{slug}.{ext}"
        try:
            result = subprocess.run(
                ["aws", "s3", "cp", tmp_path, f"s3://{S3_BUCKET}/{s3_key}", "--endpoint-url", S3_ENDPOINT],
                capture_output=True, text=True, env=AWS_ENV, timeout=60,
            )
        except Exception as e:
            print(f" ERROR uploading to S3: {e}")
            stats["upload_fail"] += 1
            return False
        if result.returncode != 0:
            print(f" S3 upload failed: {result.stderr}")
            stats["upload_fail"] += 1
            return False
    finally:
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass

    # Step 6: Insert into DB
    # NOTE(review): the statement is assembled as text because psql() is only
    # known to accept a SQL string; every interpolated value therefore goes
    # through _sql_quote, including the id and the S3 key.
    caption = f"Photo: {info['artist']}" if info["artist"] else ""
    source_url = info["desc_url"] or f"https://commons.wikimedia.org/wiki/File:{urllib.parse.quote(filename)}"
    sql = (
        "INSERT INTO images (entity_type, entity_id, s3_key, caption, source_url, license, is_primary) "
        f"VALUES ('species', {_sql_quote(species_id)}, {_sql_quote(s3_key)}, "
        f"{_sql_quote(caption)}, {_sql_quote(source_url)}, {_sql_quote(info['license'])}, true);"
    )
    try:
        psql(sql)
    except Exception as e:
        print(f" ERROR inserting to DB: {e}")
        stats["errors"] += 1
        return False

    stats["imported"] += 1
    return True
def main():
    """Import a primary image for every species that has a QID but no image.

    Reads the candidate species via ``psql``, runs ``process_species`` on each
    row and finally prints the counters collected in ``stats``.
    """
    # Species with a Wikidata QID and no row in `images` yet.
    query = (
        "SELECT s.id, s.slug, s.name_scientific, s.wikidata_qid "
        "FROM species s "
        "LEFT JOIN images i ON i.entity_type = 'species' AND i.entity_id = s.id "
        "WHERE s.wikidata_qid IS NOT NULL AND s.wikidata_qid != '' AND i.id IS NULL "
        "ORDER BY s.name_scientific;"
    )
    rows = psql(query)
    if not rows:
        print("No species need images.")
        return

    # Keep only lines that split into exactly four pipe-separated fields.
    candidates = (raw.strip().split("|") for raw in rows.split("\n"))
    species_list = [fields for fields in candidates if len(fields) == 4]
    total = len(species_list)

    print(f"Processing {total} species...\n")
    for position, (sid, slug, name_sci, qid) in enumerate(species_list, 1):
        print(f"[{position}/{total}] {name_sci} ({qid})")
        if process_species(sid, slug, name_sci, qid):
            print(f" OK - imported")

    print(f"\n{'='*50}")
    print(f"RESULTS:")
    print(f" Total species processed: {stats['total']}")
    print(f" Successfully imported: {stats['imported']}")
    print(f" No P18 image: {stats['no_p18']}")
    print(f" Bad license (NC/ND/GFDL):{stats['bad_license']}")
    print(f" Download failures: {stats['download_fail']}")
    print(f" Upload failures: {stats['upload_fail']}")
    print(f" Other errors: {stats['errors']}")


if __name__ == "__main__":
    main()
+126
View File
@@ -0,0 +1,126 @@
#!/usr/bin/env python3
"""Seed HerbAPI with common permaculture plant families and species via GBIF + API."""
import json, urllib.request, urllib.parse, time, sys
import os

# Base URL of the HerbAPI REST interface.
API = "http://herbapi01.corp.sub-net.at:8080/api/v1"
# Bearer token sent with every HerbAPI request. It can be supplied through the
# HERBAPI_TOKEN environment variable; the literal is kept only as a
# backward-compatible fallback.
# NOTE(review): a credential committed to source control should be rotated and
# the fallback removed.
TOKEN = os.environ.get("HERBAPI_TOKEN") or "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
# Base URL of the public GBIF REST API.
GBIF = "https://api.gbif.org/v1"
def api_post(path, data):
    """POST *data* as JSON to the HerbAPI endpoint at *path*.

    Args:
        path: Path appended to ``API``, e.g. ``"/families"``.
        data: JSON-serialisable request payload.

    Returns:
        The decoded JSON response, or ``None`` when the request fails. The
        failure (HTTP error status, connection problem or timeout) is reported
        on stderr so the caller can simply skip the record.
    """
    req = urllib.request.Request(
        f"{API}{path}",
        data=json.dumps(data).encode(),
        headers={"Content-Type": "application/json", "Authorization": f"Bearer {TOKEN}"},
    )
    try:
        # The context manager closes the connection; the timeout keeps a
        # stalled server from hanging the run.
        with urllib.request.urlopen(req, timeout=30) as resp:
            return json.loads(resp.read())
    except urllib.error.HTTPError as e:
        print(f" ERR {e.code}: {e.read().decode(errors='replace')[:120]}", file=sys.stderr)
        return None
    except OSError as e:
        # URLError (unreachable host, refused connection) and socket timeouts
        # are OSError subclasses; treat them like any other failed request.
        print(f" ERR {e}", file=sys.stderr)
        return None
def gbif_de_name(name):
    """Get German common name from GBIF.

    Matches *name* against the GBIF backbone, then returns the first
    vernacular name whose language is ``"deu"``.

    Returns:
        The German vernacular name, or ``None`` when GBIF has no match, no
        German name, or the lookup fails. This is a best-effort lookup:
        failures are reported on stderr and never raised.
    """
    match_url = f"{GBIF}/species/match?name={urllib.parse.quote(name)}"
    try:
        with urllib.request.urlopen(match_url, timeout=30) as resp:
            match = json.loads(resp.read())
        usage_key = match.get("usageKey")
        if not usage_key:
            return None
        names_url = f"{GBIF}/species/{usage_key}/vernacularNames?limit=100"
        with urllib.request.urlopen(names_url, timeout=30) as resp:
            data = json.loads(resp.read())
        for entry in data.get("results", []):
            if entry.get("language") == "deu" and entry.get("vernacularName"):
                return entry["vernacularName"]
    except Exception as e:
        # Still best-effort, but unlike a bare `except:` this lets
        # KeyboardInterrupt/SystemExit through and leaves a trace.
        print(f" GBIF lookup failed for {name}: {e}", file=sys.stderr)
    return None
# Plant families to create. Each entry is a tuple
# (scientific name, German name, English name); the seeding loop posts them as
# name_scientific / name_de / name_en.
FAMILIES = [
    ("Fabaceae", "Hülsenfrüchtler", "Legumes"),
    ("Rosaceae", "Rosengewächse", "Rose family"),
    ("Brassicaceae", "Kreuzblütler", "Cabbage family"),
    ("Apiaceae", "Doldenblütler", "Carrot family"),
    ("Lamiaceae", "Lippenblütler", "Mint family"),
    ("Asteraceae", "Korbblütler", "Daisy family"),
    ("Solanaceae", "Nachtschattengewächse", "Nightshade family"),
    ("Cucurbitaceae", "Kürbisgewächse", "Gourd family"),
    ("Poaceae", "Süßgräser", "Grass family"),
    ("Amaryllidaceae", "Amaryllisgewächse", "Amaryllis family"),
    ("Boraginaceae", "Raublattgewächse", "Borage family"),
    ("Adoxaceae", "Moschuskrautgewächse", "Moschatel family"),
    ("Betulaceae", "Birkengewächse", "Birch family"),
    ("Fagaceae", "Buchengewächse", "Beech family"),
    ("Juglandaceae", "Walnussgewächse", "Walnut family"),
    ("Caprifoliaceae", "Geißblattgewächse", "Honeysuckle family"),
    ("Grossulariaceae", "Stachelbeergewächse", "Gooseberry family"),
    ("Ericaceae", "Heidekrautgewächse", "Heath family"),
    ("Moraceae", "Maulbeergewächse", "Mulberry family"),
    ("Urticaceae", "Brennnesselgewächse", "Nettle family"),
    ("Malvaceae", "Malvengewächse", "Mallow family"),
    ("Polygonaceae", "Knöterichgewächse", "Buckwheat family"),
    ("Chenopodiaceae", "Gänsefußgewächse", "Goosefoot family"),
    ("Asparagaceae", "Spargelgewächse", "Asparagus family"),
    ("Plantaginaceae", "Wegerichgewächse", "Plantain family"),
]
# Species to create. Each entry is a tuple
# (scientific name, scientific family name, extra fields). The family name is
# looked up among the families created from FAMILIES, and the extra-fields
# dict is merged into the payload posted to /species.
SPECIES = [
    ("Sambucus nigra", "Adoxaceae", {"plant_layer": "understory", "nitrogen_fixer": False, "food_uses": "Flowers (cordial, fritters), berries (cooked — syrup, wine)", "medicinal_uses": "Cold/flu remedy, immune support, diaphoretic", "succession_stage": "secondary"}),
    ("Symphytum officinale", "Boraginaceae", {"plant_layer": "herbaceous", "dynamic_accumulator": True, "food_uses": "Young leaves (limited, contains pyrrolizidine alkaloids)", "medicinal_uses": "Wound healing, bone knitting (external only)", "other_uses": "Dynamic accumulator, mulch/compost activator, animal fodder"}),
    ("Trifolium pratense", "Fabaceae", {"plant_layer": "ground_cover", "nitrogen_fixer": True, "food_uses": "Flowers, young leaves", "medicinal_uses": "Respiratory, menopausal symptoms", "other_uses": "Green manure, nitrogen fixer, bee forage"}),
    ("Corylus avellana", "Betulaceae", {"plant_layer": "shrub", "food_uses": "Nuts", "other_uses": "Coppice wood, hedging, wildlife habitat", "succession_stage": "secondary"}),
    ("Ribes nigrum", "Grossulariaceae", {"plant_layer": "shrub", "food_uses": "Berries, leaves (tea)", "medicinal_uses": "High vitamin C, anti-inflammatory"}),
    ("Rubus idaeus", "Rosaceae", {"plant_layer": "shrub", "food_uses": "Berries, leaves (tea)", "medicinal_uses": "Leaf tea for pregnancy/digestion", "succession_stage": "pioneer"}),
    ("Urtica dioica", "Urticaceae", {"plant_layer": "herbaceous", "dynamic_accumulator": True, "food_uses": "Young leaves, seeds", "medicinal_uses": "Anti-inflammatory, prostate, allergies", "other_uses": "Compost activator, fibre, liquid fertiliser"}),
    ("Borago officinalis", "Boraginaceae", {"plant_layer": "herbaceous", "food_uses": "Flowers, young leaves", "other_uses": "Bee forage, companion plant", "attracts_pollinators": True}),
    ("Lavandula angustifolia", "Lamiaceae", {"plant_layer": "herbaceous", "food_uses": "Flowers", "medicinal_uses": "Calming, antiseptic, sleep aid", "other_uses": "Bee forage, pest repellent, fragrance", "attracts_pollinators": True}),
    ("Malus domestica", "Rosaceae", {"plant_layer": "canopy", "food_uses": "Fruit", "pollination_type": "Insect-pollinated"}),
    ("Prunus domestica", "Rosaceae", {"plant_layer": "canopy", "food_uses": "Fruit", "pollination_type": "Insect-pollinated"}),
    ("Juglans regia", "Juglandaceae", {"plant_layer": "canopy", "food_uses": "Nuts", "other_uses": "Timber, dye", "allelopathic": True}),
    ("Fragaria vesca", "Rosaceae", {"plant_layer": "ground_cover", "food_uses": "Berries, leaves (tea)", "ground_cover_quality": "Good"}),
    ("Allium ursinum", "Amaryllidaceae", {"plant_layer": "ground_cover", "food_uses": "Leaves, flowers, bulbs", "medicinal_uses": "Antimicrobial, blood pressure"}),
    ("Phacelia tanacetifolia", "Boraginaceae", {"plant_layer": "herbaceous", "other_uses": "Green manure, bee forage, cover crop", "attracts_pollinators": True}),
    ("Lupinus polyphyllus", "Fabaceae", {"plant_layer": "herbaceous", "nitrogen_fixer": True, "other_uses": "Nitrogen fixer, green manure, ornamental"}),
    ("Vicia faba", "Fabaceae", {"plant_layer": "herbaceous", "nitrogen_fixer": True, "food_uses": "Beans", "other_uses": "Nitrogen fixer, green manure"}),
    ("Solanum lycopersicum", "Solanaceae", {"plant_layer": "herbaceous", "food_uses": "Fruit"}),
    ("Cucurbita pepo", "Cucurbitaceae", {"plant_layer": "ground_cover", "food_uses": "Fruit, seeds, flowers"}),
    ("Beta vulgaris", "Chenopodiaceae", {"plant_layer": "herbaceous", "food_uses": "Roots, leaves"}),
    ("Daucus carota", "Apiaceae", {"plant_layer": "herbaceous", "food_uses": "Root"}),
    ("Calendula officinalis", "Asteraceae", {"plant_layer": "herbaceous", "food_uses": "Flowers", "medicinal_uses": "Wound healing, anti-inflammatory, skin care", "other_uses": "Companion plant, pest deterrent", "attracts_pollinators": True}),
    ("Melissa officinalis", "Lamiaceae", {"plant_layer": "herbaceous", "food_uses": "Leaves", "medicinal_uses": "Calming, antiviral, digestive", "attracts_pollinators": True}),
    ("Salvia officinalis", "Lamiaceae", {"plant_layer": "herbaceous", "food_uses": "Leaves", "medicinal_uses": "Sore throat, digestive, antimicrobial"}),
    ("Thymus vulgaris", "Lamiaceae", {"plant_layer": "ground_cover", "food_uses": "Leaves", "medicinal_uses": "Respiratory, antimicrobial, cough"}),
]
# Phase 1: families. Remember the id returned for each scientific family name
# so species can reference it.
print("=== Creating families ===")
family_map = {}
for family_name, family_de, family_en in FAMILIES:
    family_payload = {
        "name_scientific": family_name,
        "name_de": family_de,
        "name_en": family_en,
    }
    family_resp = api_post("/families", family_payload)
    if family_resp:
        family_map[family_name] = family_resp["id"]
        print(f"{family_name}")
    # Short pause between requests.
    time.sleep(0.05)
print(f"Created {len(family_map)} families\n")

# Phase 2: species. The German name comes from GBIF; a species whose family
# was not created above is skipped.
print("=== Creating species (with GBIF German names) ===")
created = 0
for sci_name, family_sci, extra in SPECIES:
    fam_id = family_map.get(family_sci)
    if not fam_id:
        print(f"{sci_name} — family {family_sci} missing")
        continue
    de_name = gbif_de_name(sci_name)
    species_payload = {
        "name_scientific": sci_name,
        "name_de": de_name or "",
        "name_en": "",
        "family_id": fam_id,
    }
    # Per-species extras are applied last, as in a trailing ** expansion.
    species_payload.update(extra)
    if api_post("/species", species_payload):
        created += 1
        print(f"{sci_name}{de_name or '(no DE name)'}")
    time.sleep(0.15)
print(f"Created {created} species\n")

# Phase 3: suppliers. Tuple positions map onto the payload keys below.
print("=== Creating suppliers ===")
supplier_keys = ("name", "url", "country", "is_organic", "is_demeter", "notes")
supplier_rows = [
    ("Reinsaat", "https://www.reinsaat.at", "AT", True, True, "Austrian biodynamic seed producer, open-pollinated varieties"),
    ("Magic Garden Seeds", "https://www.magicgardenseeds.com", "DE", False, False, "Specialist seed shop with rare and heritage varieties"),
]
for supplier_row in supplier_rows:
    supplier_payload = dict(zip(supplier_keys, supplier_row))
    if api_post("/suppliers", supplier_payload):
        print(f"{supplier_payload['name']}")
print("\nDone!")