Add scraper and enrichment scripts to tools/ directory
This commit is contained in:
@@ -0,0 +1,156 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Enrich HerbAPI species with Wikidata QID, GBIF ID, and EPPO code."""
|
||||
|
||||
import json
|
||||
import time
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
|
||||
HERBAPI_BASE = "http://herbapi01.corp.sub-net.at:8080/api/v1"
|
||||
HERBAPI_TOKEN = "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
|
||||
WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"
|
||||
|
||||
HEADERS_WD = {
|
||||
"User-Agent": "HerbAPI-Enrichment/1.0 (florian.berthold@sub-net.at)",
|
||||
"Accept": "application/json",
|
||||
}
|
||||
|
||||
|
||||
def herbapi_request(path, method="GET", data=None, timeout=60):
    """Send an authenticated JSON request to HerbAPI and return the decoded reply.

    Args:
        path: Path (plus optional query string) appended to ``HERBAPI_BASE``.
        method: HTTP verb such as ``"GET"`` or ``"PUT"``.
        data: JSON-serialisable payload, or ``None`` to send no body.
        timeout: Socket timeout in seconds, so a stalled server cannot hang
            the whole run.

    Returns:
        The response body parsed with ``json.loads``.

    Raises:
        urllib.error.URLError: (including ``HTTPError``) raised by ``urlopen``.
        ValueError: if the response body is not valid JSON.
    """
    url = f"{HERBAPI_BASE}{path}"
    # Compare against None so that falsy payloads such as {} or [] are still
    # transmitted instead of being silently dropped.
    body = json.dumps(data).encode() if data is not None else None
    req = urllib.request.Request(url, data=body, method=method, headers={
        "Authorization": f"Bearer {HERBAPI_TOKEN}",
        "Content-Type": "application/json",
    })
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())
|
||||
|
||||
|
||||
def _sparql_string_literal(text):
    """Return *text* as a double-quoted SPARQL string literal with escapes."""
    escaped = (
        str(text)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )
    return f'"{escaped}"'


def query_wikidata_batch(names):
    """Query Wikidata for a batch of scientific names.

    Args:
        names: Iterable of taxon names, matched against Wikidata property
            P225 (taxon name).

    Returns:
        A dict mapping each matched name to
        ``{"qid": ..., "gbif_id": ..., "eppo_code": ...}``; ``gbif_id`` and
        ``eppo_code`` are ``None`` when Wikidata has no value. Names without a
        match are absent. An empty batch returns ``{}`` without any request.

    Raises:
        urllib.error.URLError: (including ``HTTPError``) raised by ``urlopen``.
        ValueError: if the endpoint does not answer with valid JSON.
    """
    names = list(names)
    if not names:
        # An empty VALUES clause would match nothing; skip the round trip.
        return {}

    # Escape every name so quotes or backslashes cannot break (or alter) the query.
    values = " ".join(_sparql_string_literal(n) for n in names)
    sparql = f"""SELECT ?name ?item ?gbifId ?eppoCode WHERE {{
  VALUES ?name {{ {values} }}
  ?item wdt:P225 ?name .
  OPTIONAL {{ ?item wdt:P846 ?gbifId }}
  OPTIONAL {{ ?item wdt:P3031 ?eppoCode }}
}}"""
    encoded = urllib.parse.quote(sparql)
    url = f"{WIKIDATA_SPARQL}?query={encoded}&format=json"
    req = urllib.request.Request(url, headers=HEADERS_WD)
    with urllib.request.urlopen(req, timeout=60) as resp:
        data = json.loads(resp.read())

    results = {}
    for binding in data.get("results", {}).get("bindings", []):
        name = binding["name"]["value"]
        qid = binding["item"]["value"].rsplit("/", 1)[-1]
        # The first item seen for a name wins. Further rows for the same item
        # (one per GBIF/EPPO combination) only fill gaps; rows for a different
        # item sharing the name are ignored so identifiers are never mixed.
        entry = results.setdefault(
            name, {"qid": qid, "gbif_id": None, "eppo_code": None}
        )
        if entry["qid"] != qid:
            continue
        for field, variable in (("gbif_id", "gbifId"), ("eppo_code", "eppoCode")):
            if entry[field] is None:
                entry[field] = binding.get(variable, {}).get("value")
    return results
|
||||
|
||||
|
||||
def main():
    """Fill in missing Wikidata QID, GBIF ID and EPPO code on HerbAPI species.

    Fetches the species list, looks the incomplete species up on Wikidata in
    batches, then for every match GETs the full record by slug, fills only the
    identifier fields that are still empty and PUTs the record back by id.
    Progress and a final summary are printed to stdout.
    """
    id_fields = ("wikidata_qid", "gbif_id", "eppo_code")

    # 1. Fetch all species
    # NOTE(review): a single page of at most 200 records is requested —
    # confirm the API never holds more species than that.
    resp = herbapi_request("/species?per_page=200")
    species_list = resp["data"]
    print(f"Fetched {len(species_list)} species from HerbAPI\n")

    # 2. Collect species needing enrichment (.get: an absent field counts as empty)
    to_enrich = [sp for sp in species_list
                 if not all(sp.get(field) for field in id_fields)]

    if not to_enrich:
        print("All species already enriched.")
        return

    print(f"{len(to_enrich)} species need enrichment\n")

    # 3. Batch query Wikidata
    BATCH_SIZE = 20
    wikidata_results = {}
    failed_names = set()  # names whose lookup failed, as opposed to "no match"
    names = [sp["name_scientific"] for sp in to_enrich]

    for i in range(0, len(names), BATCH_SIZE):
        batch = names[i:i + BATCH_SIZE]
        print(f"Querying Wikidata batch {i // BATCH_SIZE + 1}: {len(batch)} species...")
        try:
            results = query_wikidata_batch(batch)
        except Exception as e:
            # Best effort: keep going with the remaining batches, but remember
            # which names were never actually looked up.
            print(f" ERROR: {e}")
            failed_names.update(batch)
        else:
            wikidata_results.update(results)
            print(f" Got {len(results)} matches")
        if i + BATCH_SIZE < len(names):
            time.sleep(2)  # pause between requests to the public endpoint

    print(f"\nWikidata returned data for {len(wikidata_results)} / {len(names)} species\n")

    # 4. Update HerbAPI - GET full object by slug, merge, PUT by UUID
    updated = 0
    skipped = 0
    not_found = 0
    errors = 0

    for sp in to_enrich:
        name = sp["name_scientific"]
        wd = wikidata_results.get(name)
        if not wd:
            if name in failed_names:
                # The query itself failed, so "no match" would be misleading.
                print(f" ERROR (Wikidata query failed): {name}")
                errors += 1
            else:
                print(f" SKIP (no Wikidata match): {name}")
                not_found += 1
            continue

        # Check what needs updating
        needs_qid = bool(not sp.get("wikidata_qid") and wd["qid"])
        needs_gbif = bool(not sp.get("gbif_id") and wd["gbif_id"])
        needs_eppo = bool(not sp.get("eppo_code") and wd["eppo_code"])

        if not (needs_qid or needs_gbif or needs_eppo):
            print(f" SKIP (nothing new): {name}")
            skipped += 1
            continue

        try:
            # GET full species by slug for the complete object
            full_sp = herbapi_request(f"/species/{sp['slug']}")

            # Remove read-only fields
            species_id = full_sp.pop("id")
            for read_only in ("slug", "created_at", "updated_at"):
                full_sp.pop(read_only, None)

            # Merge new data (only null fields)
            fields = []
            if needs_qid:
                full_sp["wikidata_qid"] = wd["qid"]
                fields.append(f"qid={wd['qid']}")
            if needs_gbif:
                full_sp["gbif_id"] = str(wd["gbif_id"])  # API expects string
                fields.append(f"gbif={wd['gbif_id']}")
            if needs_eppo:
                full_sp["eppo_code"] = wd["eppo_code"]
                fields.append(f"eppo={wd['eppo_code']}")

            # PUT by UUID
            herbapi_request(f"/species/{species_id}", method="PUT", data=full_sp)

            print(f" UPDATED: {name} -> {', '.join(fields)}")
            updated += 1
        except Exception as e:
            # One failing record must not abort the rest of the run.
            print(f" ERROR updating {name}: {e}")
            errors += 1

    print(f"\n{'=' * 60}")
    print(f"RESULTS:")
    print(f" Updated: {updated}")
    print(f" Skipped (no new data): {skipped}")
    print(f" Not found on Wikidata: {not_found}")
    print(f" Errors: {errors}")
    print(f" Total species: {len(species_list)}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,305 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Expand HerbAPI species database with common permaculture/garden species."""
|
||||
|
||||
import json
|
||||
import time
|
||||
import urllib.request
|
||||
import urllib.parse
|
||||
import urllib.error
|
||||
import ssl
|
||||
|
||||
BASE_URL = "http://herbapi01.corp.sub-net.at:8080/api/v1"
|
||||
AUTH = "Bearer km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
|
||||
DELAY = 0.15
|
||||
|
||||
# SSL context for GBIF (https)
|
||||
ssl_ctx = ssl.create_default_context()
|
||||
|
||||
|
||||
def api_get(path, timeout=30):
    """GET ``BASE_URL + path`` with the auth header and return the parsed JSON.

    Args:
        path: Path (plus optional query string) below ``BASE_URL``.
        timeout: Socket timeout in seconds; without one a stalled server
            would block the script indefinitely.

    Raises:
        urllib.error.URLError: (including ``HTTPError``) raised by ``urlopen``.
        ValueError: if the response body is not valid JSON.
    """
    req = urllib.request.Request(f"{BASE_URL}{path}", headers={"Authorization": AUTH})
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())
|
||||
|
||||
|
||||
def api_post(path, data, timeout=30):
    """POST *data* as JSON to ``BASE_URL + path``.

    Args:
        path: Path below ``BASE_URL``.
        data: JSON-serialisable payload.
        timeout: Socket timeout in seconds.

    Returns:
        ``(parsed_body, status)`` on success, where ``parsed_body`` is
        ``None`` if the server sent an empty body; ``(None, status)`` when the
        server answers with an HTTP error (the error body is printed).

    Raises:
        urllib.error.URLError: for connection-level failures.
        ValueError: if a non-empty success body is not valid JSON.
    """
    body = json.dumps(data).encode()
    req = urllib.request.Request(
        f"{BASE_URL}{path}",
        data=body,
        headers={"Authorization": AUTH, "Content-Type": "application/json"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            raw = resp.read()
            status = resp.status
    except urllib.error.HTTPError as e:
        # errors="replace": a badly encoded error page must not mask the HTTP error.
        err_body = e.read().decode("utf-8", errors="replace")
        print(f" ERROR {e.code}: {err_body}")
        return None, e.code
    # An empty body (e.g. 204) is reported as "no result" instead of crashing.
    return (json.loads(raw) if raw.strip() else None), status
|
||||
|
||||
|
||||
def gbif_get_german_name(scientific_name):
    """Look up the German vernacular name of a species via the GBIF API.

    Two requests are made: a name match to obtain the ``usageKey``, then the
    vernacular-name listing for that key. The first entry whose ``language``
    is ``"deu"`` wins.

    Returns:
        The vernacular name, or ``None`` when there is no match, no German
        entry, or any step fails (the failure is printed, never raised).
    """
    try:
        match_url = (
            "https://api.gbif.org/v1/species/match?name="
            + urllib.parse.quote(scientific_name)
        )
        match_req = urllib.request.Request(match_url)
        with urllib.request.urlopen(match_req, context=ssl_ctx, timeout=10) as resp:
            match = json.loads(resp.read())

        usage_key = match.get("usageKey")
        if not usage_key:
            return None

        names_url = f"https://api.gbif.org/v1/species/{usage_key}/vernacularNames?limit=100"
        names_req = urllib.request.Request(names_url)
        with urllib.request.urlopen(names_req, context=ssl_ctx, timeout=10) as resp:
            listing = json.loads(resp.read())

        german_entries = (
            entry for entry in listing.get("results", [])
            if entry.get("language") == "deu"
        )
        first_german = next(german_entries, None)
        if first_german is None:
            return None
        return first_german["vernacularName"]
    except Exception as e:
        print(f" GBIF lookup failed for {scientific_name}: {e}")
        return None
|
||||
|
||||
|
||||
# ── Families to ensure exist ─────────────────────────────────────────
|
||||
FAMILIES_NEEDED = {
|
||||
"Fabaceae": {"name_en": "Legumes", "name_de": "Hülsenfrüchtler"},
|
||||
"Solanaceae": {"name_en": "Nightshade family", "name_de": "Nachtschattengewächse"},
|
||||
"Cucurbitaceae": {"name_en": "Gourd family", "name_de": "Kürbisgewächse"},
|
||||
"Asteraceae": {"name_en": "Daisy family", "name_de": "Korbblütler"},
|
||||
"Chenopodiaceae": {"name_en": "Goosefoot family", "name_de": "Gänsefußgewächse"},
|
||||
"Brassicaceae": {"name_en": "Cabbage family", "name_de": "Kreuzblütler"},
|
||||
"Amaryllidaceae": {"name_en": "Amaryllis family", "name_de": "Amaryllisgewächse"},
|
||||
"Apiaceae": {"name_en": "Carrot family", "name_de": "Doldenblütler"},
|
||||
"Poaceae": {"name_en": "Grass family", "name_de": "Süßgräser"},
|
||||
"Lamiaceae": {"name_en": "Mint family", "name_de": "Lippenblütler"},
|
||||
"Caprifoliaceae": {"name_en": "Honeysuckle family", "name_de": "Geißblattgewächse"},
|
||||
"Rosaceae": {"name_en": "Rose family", "name_de": "Rosengewächse"},
|
||||
"Grossulariaceae": {"name_en": "Gooseberry family", "name_de": "Stachelbeergewächse"},
|
||||
"Ericaceae": {"name_en": "Heath family", "name_de": "Heidekrautgewächse"},
|
||||
"Moraceae": {"name_en": "Mulberry family", "name_de": "Maulbeergewächse"},
|
||||
# New families not yet in the DB:
|
||||
"Hypericaceae": {"name_en": "St John's wort family", "name_de": "Johanniskrautgewächse"},
|
||||
"Tropaeolaceae": {"name_en": "Nasturtium family", "name_de": "Kapuzinerkressengewächse"},
|
||||
"Elaeagnaceae": {"name_en": "Oleaster family", "name_de": "Ölweidengewächse"},
|
||||
}
|
||||
|
||||
# ── Species to add ───────────────────────────────────────────────────
|
||||
# Format: (scientific_name, family, name_en, name_de, plant_layer, extra_fields)
|
||||
SPECIES = [
|
||||
# Vegetables
|
||||
("Phaseolus vulgaris", "Fabaceae", "common bean", "Gartenbohne", "herbaceous",
|
||||
{"nitrogen_fixer": True, "food_uses": "Beans (pods, seeds)"}),
|
||||
("Phaseolus coccineus", "Fabaceae", "runner bean", "Feuerbohne", "herbaceous",
|
||||
{"nitrogen_fixer": True, "food_uses": "Beans (pods, seeds), flowers", "attracts_pollinators": True}),
|
||||
("Pisum sativum", "Fabaceae", "pea", "Erbse", "herbaceous",
|
||||
{"nitrogen_fixer": True, "food_uses": "Peas, shoots"}),
|
||||
("Capsicum annuum", "Solanaceae", "pepper", "Paprika", "herbaceous",
|
||||
{"food_uses": "Fruit"}),
|
||||
("Cucumis sativus", "Cucurbitaceae", "cucumber", "Gurke", "ground_cover",
|
||||
{"food_uses": "Fruit"}),
|
||||
("Cucurbita maxima", "Cucurbitaceae", "winter squash", "Riesenkürbis", "ground_cover",
|
||||
{"food_uses": "Fruit, seeds, flowers"}),
|
||||
("Cucurbita moschata", "Cucurbitaceae", "butternut squash", "Moschuskürbis", "ground_cover",
|
||||
{"food_uses": "Fruit, seeds"}),
|
||||
("Lactuca sativa", "Asteraceae", "lettuce", "Salat", "herbaceous",
|
||||
{"food_uses": "Leaves"}),
|
||||
("Spinacia oleracea", "Chenopodiaceae", "spinach", "Spinat", "herbaceous",
|
||||
{"food_uses": "Leaves"}),
|
||||
("Brassica oleracea", "Brassicaceae", "cabbage / kale", "Kohl", "herbaceous",
|
||||
{"food_uses": "Leaves, flower buds, stems"}),
|
||||
("Brassica rapa", "Brassicaceae", "turnip", "Rübe", "herbaceous",
|
||||
{"food_uses": "Root, leaves"}),
|
||||
("Raphanus sativus", "Brassicaceae", "radish", "Rettich", "herbaceous",
|
||||
{"food_uses": "Root, leaves, seed pods"}),
|
||||
("Allium cepa", "Amaryllidaceae", "onion", "Zwiebel", "herbaceous",
|
||||
{"food_uses": "Bulb, leaves"}),
|
||||
("Allium sativum", "Amaryllidaceae", "garlic", "Knoblauch", "herbaceous",
|
||||
{"food_uses": "Bulb, scapes", "medicinal_uses": "Antimicrobial, cardiovascular"}),
|
||||
("Allium schoenoprasum", "Amaryllidaceae", "chives", "Schnittlauch", "herbaceous",
|
||||
{"food_uses": "Leaves, flowers", "attracts_pollinators": True}),
|
||||
("Petroselinum crispum", "Apiaceae", "parsley", "Petersilie", "herbaceous",
|
||||
{"food_uses": "Leaves, root"}),
|
||||
("Apium graveolens", "Apiaceae", "celery", "Sellerie", "herbaceous",
|
||||
{"food_uses": "Stalks, root, leaves"}),
|
||||
("Foeniculum vulgare", "Apiaceae", "fennel", "Fenchel", "herbaceous",
|
||||
{"food_uses": "Bulb, fronds, seeds", "attracts_beneficial_insects": True}),
|
||||
("Pastinaca sativa", "Apiaceae", "parsnip", "Pastinake", "herbaceous",
|
||||
{"food_uses": "Root"}),
|
||||
("Zea mays", "Poaceae", "corn", "Mais", "herbaceous",
|
||||
{"food_uses": "Kernels, cobs"}),
|
||||
("Solanum melongena", "Solanaceae", "eggplant", "Melanzani", "herbaceous",
|
||||
{"food_uses": "Fruit"}),
|
||||
|
||||
# Herbs
|
||||
("Ocimum basilicum", "Lamiaceae", "basil", "Basilikum", "herbaceous",
|
||||
{"food_uses": "Leaves", "attracts_pollinators": True}),
|
||||
("Origanum vulgare", "Lamiaceae", "oregano", "Oregano", "herbaceous",
|
||||
{"food_uses": "Leaves", "attracts_pollinators": True, "attracts_beneficial_insects": True}),
|
||||
("Mentha x piperita", "Lamiaceae", "peppermint", "Pfefferminze", "herbaceous",
|
||||
{"food_uses": "Leaves (tea, culinary)", "medicinal_uses": "Digestive, headache relief", "invasiveness": "spreading"}),
|
||||
("Rosmarinus officinalis", "Lamiaceae", "rosemary", "Rosmarin", "herbaceous",
|
||||
{"food_uses": "Leaves", "attracts_pollinators": True}),
|
||||
("Anethum graveolens", "Apiaceae", "dill", "Dill", "herbaceous",
|
||||
{"food_uses": "Leaves, seeds", "attracts_beneficial_insects": True}),
|
||||
("Coriandrum sativum", "Apiaceae", "coriander", "Koriander", "herbaceous",
|
||||
{"food_uses": "Leaves, seeds", "attracts_beneficial_insects": True}),
|
||||
("Artemisia absinthium", "Asteraceae", "wormwood", "Wermut", "herbaceous",
|
||||
{"medicinal_uses": "Digestive, anti-parasitic", "other_uses": "Companion plant pest deterrent", "allelopathic": True}),
|
||||
("Achillea millefolium", "Asteraceae", "yarrow", "Schafgarbe", "herbaceous",
|
||||
{"food_uses": "Young leaves (salad)", "medicinal_uses": "Wound healing, anti-inflammatory",
|
||||
"dynamic_accumulator": True, "dynamic_accumulator_nutrients": "K, P, Cu",
|
||||
"attracts_beneficial_insects": True, "attracts_pollinators": True}),
|
||||
("Hypericum perforatum", "Hypericaceae", "St John's wort", "Johanniskraut", "herbaceous",
|
||||
{"medicinal_uses": "Antidepressant, wound healing", "attracts_pollinators": True}),
|
||||
("Echinacea purpurea", "Asteraceae", "echinacea", "Sonnenhut", "herbaceous",
|
||||
{"medicinal_uses": "Immune stimulant", "attracts_pollinators": True, "wildlife_value": "Seeds for birds"}),
|
||||
("Valeriana officinalis", "Caprifoliaceae", "valerian", "Baldrian", "herbaceous",
|
||||
{"medicinal_uses": "Sedative, sleep aid", "attracts_pollinators": True,
|
||||
"other_uses": "Earthworm attractant (biodynamic)"}),
|
||||
|
||||
# Flowers & cover crops
|
||||
("Tagetes patula", "Asteraceae", "French marigold", "Studentenblume", "herbaceous",
|
||||
{"other_uses": "Nematode suppression, companion plant", "attracts_pollinators": True}),
|
||||
("Helianthus annuus", "Asteraceae", "sunflower", "Sonnenblume", "herbaceous",
|
||||
{"food_uses": "Seeds, oil", "attracts_pollinators": True, "wildlife_value": "Seeds for birds"}),
|
||||
("Tropaeolum majus", "Tropaeolaceae", "nasturtium", "Kapuzinerkresse", "ground_cover",
|
||||
{"food_uses": "Leaves, flowers, seeds (capers)", "other_uses": "Trap crop for aphids"}),
|
||||
("Centaurea cyanus", "Asteraceae", "cornflower", "Kornblume", "herbaceous",
|
||||
{"food_uses": "Flowers (edible garnish)", "attracts_pollinators": True, "attracts_beneficial_insects": True}),
|
||||
("Sinapis alba", "Brassicaceae", "white mustard", "Weißer Senf", "herbaceous",
|
||||
{"food_uses": "Seeds, young leaves", "other_uses": "Green manure, biofumigant"}),
|
||||
("Trifolium repens", "Fabaceae", "white clover", "Weißklee", "ground_cover",
|
||||
{"nitrogen_fixer": True, "food_uses": "Flowers (tea), young leaves",
|
||||
"ground_cover_quality": "excellent", "attracts_pollinators": True}),
|
||||
("Medicago sativa", "Fabaceae", "alfalfa", "Luzerne", "herbaceous",
|
||||
{"nitrogen_fixer": True, "food_uses": "Sprouts",
|
||||
"dynamic_accumulator": True, "dynamic_accumulator_nutrients": "N, K, Ca, Mg, Fe",
|
||||
"other_uses": "Green manure, deep-rooting soil improver"}),
|
||||
|
||||
# Fruit / Trees
|
||||
("Prunus avium", "Rosaceae", "sweet cherry", "Süßkirsche", "canopy",
|
||||
{"food_uses": "Fruit", "attracts_pollinators": True, "wildlife_value": "Fruit for birds"}),
|
||||
("Prunus cerasus", "Rosaceae", "sour cherry", "Sauerkirsche", "understory",
|
||||
{"food_uses": "Fruit (cooking, preserves)", "attracts_pollinators": True}),
|
||||
("Pyrus communis", "Rosaceae", "pear", "Birne", "canopy",
|
||||
{"food_uses": "Fruit", "attracts_pollinators": True}),
|
||||
("Ribes uva-crispa", "Grossulariaceae", "gooseberry", "Stachelbeere", "shrub",
|
||||
{"food_uses": "Berries"}),
|
||||
("Rubus fruticosus", "Rosaceae", "blackberry", "Brombeere", "shrub",
|
||||
{"food_uses": "Berries, leaves (tea)", "attracts_pollinators": True,
|
||||
"wildlife_value": "Berries for birds, nesting habitat", "invasiveness": "spreading"}),
|
||||
("Vaccinium myrtillus", "Ericaceae", "bilberry", "Heidelbeere", "shrub",
|
||||
{"food_uses": "Berries", "medicinal_uses": "Antioxidant, eye health"}),
|
||||
("Hippophae rhamnoides", "Elaeagnaceae", "sea buckthorn", "Sanddorn", "shrub",
|
||||
{"nitrogen_fixer": True, "food_uses": "Berries (juice, oil)",
|
||||
"medicinal_uses": "High vitamin C, skin care",
|
||||
"other_uses": "Erosion control, windbreak"}),
|
||||
("Morus alba", "Moraceae", "white mulberry", "Weiße Maulbeere", "canopy",
|
||||
{"food_uses": "Fruit, young leaves", "wildlife_value": "Fruit for birds"}),
|
||||
]
|
||||
|
||||
|
||||
def main():
    """Create the missing plant families, then add the catalogued species.

    Existing families and species are read from the API first so nothing is
    created twice. For each new species the GBIF German name is looked up and
    printed for comparison; the curated ``name_de`` is what gets stored.
    """
    # 1. Load existing families (name_scientific -> id)
    print("=== Loading existing families ===")
    fam_resp = api_get("/families?per_page=100")
    family_map = {fam["name_scientific"]: fam["id"] for fam in fam_resp["data"]}
    print(f" Found {len(family_map)} existing families")

    # 2. Create missing families
    print("\n=== Creating missing families ===")
    families_created = 0
    for fam_name, fam_info in FAMILIES_NEEDED.items():
        if fam_name in family_map:
            print(f" SKIP (exists): {fam_name}")
            continue
        family_payload = {
            "name_scientific": fam_name,
            "name_en": fam_info["name_en"],
            "name_de": fam_info["name_de"],
        }
        print(f" CREATE: {fam_name} ...", end=" ")
        result, status = api_post("/families", family_payload)
        if result and "id" in result:
            family_map[fam_name] = result["id"]
            print(f"OK ({result['id']})")
            families_created += 1
        else:
            print(f"FAILED (status={status})")
        time.sleep(DELAY)

    print(f"\n Families created: {families_created}")

    # 3. Load existing species
    print("\n=== Loading existing species ===")
    sp_resp = api_get("/species?per_page=200")
    existing_species = {record["name_scientific"] for record in sp_resp["data"]}
    print(f" Found {len(existing_species)} existing species")

    # 4. Add new species
    print("\n=== Adding new species ===")
    created = skipped = failed = 0

    for sci_name, family, name_en, name_de, plant_layer, extras in SPECIES:
        if sci_name in existing_species:
            print(f" SKIP (exists): {sci_name}")
            skipped += 1
            continue

        # A species cannot be created without its family's id.
        fam_id = family_map.get(family)
        if not fam_id:
            print(f" SKIP (no family '{family}'): {sci_name}")
            failed += 1
            continue

        # The GBIF name is only logged for validation; the curated name_de is kept.
        gbif_de = gbif_get_german_name(sci_name)
        if gbif_de:
            print(f" GBIF name for {sci_name}: {gbif_de}")

        # Extra fields are layered on top of the mandatory ones.
        species_payload = {
            "name_scientific": sci_name,
            "family_id": fam_id,
            "name_en": name_en,
            "name_de": name_de,
            "plant_layer": plant_layer,
            **extras,
        }

        print(f" CREATE: {sci_name} ({name_de}) ...", end=" ")
        result, status = api_post("/species", species_payload)
        if result and "id" in result:
            print(f"OK ({result['id']})")
            created += 1
        else:
            print(f"FAILED (status={status})")
            failed += 1
        time.sleep(DELAY)

    print(f"\n{'=' * 50}")
    print("SUMMARY")
    print(f" Families created: {families_created}")
    print(f" Species created: {created}")
    print(f" Species skipped: {skipped}")
    print(f" Species failed: {failed}")
    print(f" Total species now: {len(existing_species) + created}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,362 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Import CC-licensed plant images from Wikimedia Commons via Wikidata into HerbAPI."""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
|
||||
# Force unbuffered output
|
||||
sys.stdout.reconfigure(line_buffering=True)
|
||||
sys.stderr.reconfigure(line_buffering=True)
|
||||
|
||||
# --- Configuration ---
|
||||
S3_ENDPOINT = "http://garage.sub-net.at:3900"
|
||||
S3_BUCKET = "herbapi"
|
||||
S3_ACCESS_KEY = "GK1a89859373a6ac56bf11958f"
|
||||
S3_SECRET_KEY = "bea45a333b5c7b1efdd7466bdbcac54d8642fa19f0c617ca2fd64bd07951b899"
|
||||
S3_REGION = "garage"
|
||||
|
||||
DB_HOST = "10.31.3.90"
|
||||
DB_USER = "herbapi"
|
||||
DB_PASS = "_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj"
|
||||
DB_NAME = "herbapi"
|
||||
|
||||
USER_AGENT = "HerbAPI/1.0 (https://herbapi.naturalised.at; florian.berthold@sub-net.at)"
|
||||
THUMB_WIDTH = 800
|
||||
REQUEST_DELAY = 0.3
|
||||
|
||||
ALLOWED_LICENSES = {
|
||||
"cc0", "cc-zero", "cc0 1.0", "cc-zero 1.0",
|
||||
"public domain", "pd", "pd-self", "pd-old", "pd-old-auto", "pd-old-100",
|
||||
"pd-us", "pd-usgov", "pd-author",
|
||||
"cc by 1.0", "cc by 2.0", "cc by 2.5", "cc by 3.0", "cc by 4.0",
|
||||
"cc-by-1.0", "cc-by-2.0", "cc-by-2.5", "cc-by-3.0", "cc-by-4.0",
|
||||
"cc by-sa 1.0", "cc by-sa 2.0", "cc by-sa 2.5", "cc by-sa 3.0", "cc by-sa 4.0",
|
||||
"cc-by-sa-1.0", "cc-by-sa-2.0", "cc-by-sa-2.5", "cc-by-sa-3.0", "cc-by-sa-4.0",
|
||||
}
|
||||
|
||||
|
||||
def slugify(name: str) -> str:
    """Turn a scientific name into a lowercase, hyphen-separated slug.

    Every run of characters outside ``a-z0-9`` becomes a single hyphen, and
    hyphens at either end are removed.
    """
    lowered = name.lower()
    hyphenated = re.sub(r'[^a-z0-9]+', '-', lowered)
    return hyphenated.strip('-')
|
||||
|
||||
|
||||
def psql(query: str) -> str:
    """Execute *query* through the ``psql`` CLI and return its stripped stdout.

    The password is handed over via ``PGPASSWORD`` in the child environment.
    ``-t -A`` request tuples-only, unaligned output. A non-zero exit status is
    reported on stderr but not raised; stdout is returned either way.
    """
    child_env = {**os.environ, "PGPASSWORD": DB_PASS}
    command = ["psql", "-h", DB_HOST, "-U", DB_USER, DB_NAME, "-t", "-A", "-c", query]
    completed = subprocess.run(command, capture_output=True, text=True, env=child_env)
    if completed.returncode != 0:
        print(f" psql error: {completed.stderr.strip()}", file=sys.stderr)
    return completed.stdout.strip()
|
||||
|
||||
|
||||
def fetch_json(url: str) -> dict | None:
    """GET *url* with the project User-Agent and return the decoded JSON.

    Any failure (network, HTTP status, invalid JSON) is printed and turned
    into a ``None`` return value instead of an exception.
    """
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            raw = response.read()
        return json.loads(raw)
    except Exception as e:
        print(f" HTTP error fetching {url}: {e}")
        return None
|
||||
|
||||
|
||||
def get_wikidata_image(qid: str) -> str | None:
|
||||
"""Query Wikidata SPARQL for P18 image filename."""
|
||||
sparql = f"SELECT ?image WHERE {{ wd:{qid} wdt:P18 ?image }} LIMIT 1"
|
||||
url = "https://query.wikidata.org/sparql?" + urllib.parse.urlencode({
|
||||
"query": sparql, "format": "json"
|
||||
})
|
||||
data = fetch_json(url)
|
||||
if not data:
|
||||
return None
|
||||
bindings = data.get("results", {}).get("bindings", [])
|
||||
if not bindings:
|
||||
return None
|
||||
image_url = bindings[0]["image"]["value"]
|
||||
# URL like http://commons.wikimedia.org/wiki/Special:FilePath/Filename.jpg
|
||||
filename = urllib.parse.unquote(image_url.rsplit("/", 1)[-1])
|
||||
return filename
|
||||
|
||||
|
||||
def get_commons_info(filename: str) -> dict | None:
    """Fetch thumbnail URL, license and author of a Wikimedia Commons file.

    Args:
        filename: Commons filename without the ``File:`` prefix.

    Returns:
        A dict with the keys ``thumb_url``, ``description_url``, ``license``,
        ``artist`` and ``filename``, or ``None`` when the request fails, the
        file is unknown, or the API reports no usable image URL.
    """
    import html  # local import: this script's header does not import html

    url = "https://commons.wikimedia.org/w/api.php?" + urllib.parse.urlencode({
        "action": "query",
        "titles": f"File:{filename}",
        "prop": "imageinfo",
        "iiprop": "url|extmetadata",
        "iiurlwidth": str(THUMB_WIDTH),
        "format": "json",
    })
    data = fetch_json(url)
    if not data:
        return None

    pages = data.get("query", {}).get("pages", {})
    # Only one title was requested, so only the first page entry is inspected.
    page_id, page = next(iter(pages.items()), ("-1", {}))
    if page_id == "-1":
        return None
    imageinfo = page.get("imageinfo", [])
    if not imageinfo:
        return None

    info = imageinfo[0]
    meta = info.get("extmetadata", {})

    thumb_url = info.get("thumburl") or info.get("url")
    if not thumb_url:
        # Callers slice and download this URL, so None must not escape.
        return None

    artist_html = meta.get("Artist", {}).get("value", "")
    # Drop the tags first, then decode entities (&amp;, &nbsp; ...) so that
    # escaped angle brackets in the text are not mistaken for markup.
    artist = html.unescape(re.sub(r'<[^>]+>', '', artist_html))
    artist = re.sub(r'\s+', ' ', artist).strip()

    return {
        "thumb_url": thumb_url,
        "description_url": info.get("descriptionurl", ""),
        "license": meta.get("LicenseShortName", {}).get("value", ""),
        "artist": artist,
        "filename": filename,
    }
|
||||
|
||||
|
||||
def is_license_allowed(license_str: str) -> bool:
    """Decide whether a Commons license name permits importing the image.

    An exact (case-insensitive) hit in ``ALLOWED_LICENSES`` is accepted
    outright. Otherwise anything containing ``nc`` or ``nd`` is rejected, and
    what remains is accepted only if it looks like public domain, CC BY,
    CC BY-SA or CC0.
    """
    candidate = license_str.lower().strip()

    if candidate in ALLOWED_LICENSES:
        return True

    # Reject before pattern matching so NC/ND variants never slip through.
    if any(marker in candidate for marker in ("nc", "nd")):
        return False

    if candidate.startswith(("public domain", "pd")) or candidate == "cc zero":
        return True

    accepted_patterns = (
        r'^cc[- ]?by[- ]?sa[- ]?\d',
        r'^cc[- ]?by[- ]?\d',
        r'^cc[- ]?0',
    )
    return any(re.match(pattern, candidate) for pattern in accepted_patterns)
|
||||
|
||||
|
||||
def normalize_license(license_str: str) -> str:
    """Map a raw license name onto the canonical label stored in the database.

    Public-domain variants become ``"Public domain"``, CC0 variants become
    ``"CC0 1.0"``, and CC BY / CC BY-SA keep their version number. Anything
    unrecognised is returned exactly as passed in.
    """
    key = license_str.lower().strip()

    if key.startswith("pd") or "public domain" in key:
        return "Public domain"

    if re.match(r'^cc[- ]?0', key) or "cc-zero" in key or "cc zero" in key:
        return "CC0 1.0"

    # BY-SA is tried before BY so each label gets the pattern written for it.
    versioned = (
        (r'^cc[- ]?by[- ]?sa[- ]?(\d+\.?\d*)', "CC BY-SA"),
        (r'^cc[- ]?by[- ]?(\d+\.?\d*)', "CC BY"),
    )
    for pattern, label in versioned:
        found = re.match(pattern, key)
        if found:
            return f"{label} {found.group(1)}"

    return license_str
|
||||
|
||||
|
||||
def s3_upload(s3_key: str, data: bytes, content_type: str = "image/jpeg"):
    """Upload *data* to the Garage S3 bucket by shelling out to the AWS CLI.

    Args:
        s3_key: Object key inside ``S3_BUCKET``.
        data: Raw bytes to store.
        content_type: MIME type recorded on the object.

    Raises:
        RuntimeError: if ``aws s3 cp`` exits with a non-zero status.
    """
    import tempfile  # local import: this script's header does not import tempfile

    env = os.environ.copy()
    env["AWS_ACCESS_KEY_ID"] = S3_ACCESS_KEY
    env["AWS_SECRET_ACCESS_KEY"] = S3_SECRET_KEY
    env["AWS_DEFAULT_REGION"] = S3_REGION

    # mkstemp gives a unique, owner-only file, unlike a fixed /tmp name that
    # parallel runs would share and other local users could pre-create.
    fd, tmp_path = tempfile.mkstemp(prefix="herbapi_upload_")
    try:
        with os.fdopen(fd, "wb") as f:
            f.write(data)
        result = subprocess.run(
            [
                "aws", "s3", "cp", tmp_path,
                f"s3://{S3_BUCKET}/{s3_key}",
                "--endpoint-url", S3_ENDPOINT,
                "--content-type", content_type,
            ],
            capture_output=True, text=True, env=env
        )
    finally:
        # Remove the temp file even when writing or the CLI call raises.
        os.unlink(tmp_path)

    if result.returncode != 0:
        raise RuntimeError(f"S3 upload failed: {result.stderr.strip()}")
|
||||
|
||||
|
||||
def download_image(url: str) -> bytes | None:
    """Download *url* with the project User-Agent and return the raw bytes.

    Failures are printed and reported as ``None`` rather than raised.
    """
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        with urllib.request.urlopen(request, timeout=60) as response:
            content = response.read()
    except Exception as e:
        print(f" Download error: {e}")
        return None
    return content
|
||||
|
||||
|
||||
def main():
    """Import one Commons image per species that has a Wikidata QID.

    For each species without an image row: resolve the P18 filename on
    Wikidata, read license and author from Commons, skip disallowed licenses,
    download the thumbnail, upload it to S3 and insert a row into ``images``.
    Progress and a final tally are printed.
    """

    def sql_escape(text):
        # Double single quotes so the value can sit inside a SQL string literal.
        return text.replace("'", "''")

    # 1. Get species
    rows = psql(
        "SELECT id, name_scientific, wikidata_qid FROM species "
        "WHERE wikidata_qid IS NOT NULL AND wikidata_qid <> '' "
        "ORDER BY name_scientific"
    )
    if not rows:
        print("No species with wikidata_qid found.")
        return

    # psql -A separates columns with "|"; lines of any other shape are ignored.
    split_rows = (line.split("|") for line in rows.split("\n"))
    species_list = [
        {"id": cols[0], "name": cols[1], "qid": cols[2]}
        for cols in split_rows
        if len(cols) == 3
    ]
    print(f"Found {len(species_list)} species with Wikidata QIDs.")

    # 2. Get existing images
    existing_rows = psql("SELECT entity_id FROM images WHERE entity_type = 'species'")
    existing = set()
    if existing_rows:
        existing = {row.strip() for row in existing_rows.split("\n") if row.strip()}
    print(f"Found {len(existing)} species that already have images.")

    tally = dict.fromkeys(
        ("imported", "existing", "no_image", "license", "download", "errors"), 0
    )
    mime_types = {
        "jpg": "image/jpeg",
        "png": "image/png",
        "svg": "image/svg+xml",
        "gif": "image/gif",
    }
    total = len(species_list)

    for position, sp in enumerate(species_list, start=1):
        name, qid, sp_id = sp["name"], sp["qid"], sp["id"]
        print(f"\n[{position}/{total}] {name} ({qid})")

        if sp_id in existing:
            print(" Already has image, skipping.")
            tally["existing"] += 1
            continue

        # Query Wikidata for image
        time.sleep(REQUEST_DELAY)
        filename = get_wikidata_image(qid)
        if not filename:
            print(" No image on Wikidata.")
            tally["no_image"] += 1
            continue

        # Get Commons info
        time.sleep(REQUEST_DELAY)
        info = get_commons_info(filename)
        if not info:
            print(f" Could not get Commons info for {filename}")
            tally["no_image"] += 1
            continue

        # Check license
        raw_license = info["license"]
        if not is_license_allowed(raw_license):
            print(f" License not allowed: {raw_license}")
            tally["license"] += 1
            continue

        norm_license = normalize_license(raw_license)
        artist = info["artist"]
        thumb_url = info["thumb_url"]
        desc_url = info["description_url"]

        print(f" License: {raw_license} -> {norm_license}")
        print(f" Artist: {artist[:80]}")
        print(f" Thumbnail: {thumb_url[:100]}...")

        # Download image
        time.sleep(REQUEST_DELAY)
        image_data = download_image(thumb_url)
        if not image_data:
            print(" Failed to download image.")
            tally["download"] += 1
            continue
        print(f" Downloaded {len(image_data)} bytes")

        # Determine file extension from URL: first hit of png, svg, gif, else jpg
        lowered_url = thumb_url.lower()
        ext = next(
            (kind for kind in ("png", "svg", "gif") if f".{kind}" in lowered_url),
            "jpg",
        )
        s3_key = f"species/{slugify(name)}.{ext}"
        content_type = mime_types.get(ext, "image/jpeg")

        # Upload to S3
        try:
            s3_upload(s3_key, image_data, content_type)
        except RuntimeError as e:
            print(f" S3 upload failed: {e}")
            tally["errors"] += 1
            continue
        print(f" Uploaded to s3://{S3_BUCKET}/{s3_key}")

        # Insert into database
        caption = f"Photo: {artist}" if artist else "Wikimedia Commons"
        insert_sql = (
            f"INSERT INTO images (id, entity_type, entity_id, s3_key, caption, source_url, license, is_primary) "
            f"VALUES (gen_random_uuid(), 'species', '{sp_id}', '{sql_escape(s3_key)}', "
            f"'{sql_escape(caption)}', '{sql_escape(desc_url)}', '{sql_escape(norm_license)}', true)"
        )
        # The helper's return value is not inspected here.
        psql(insert_sql)
        print(" Inserted into images table.")
        tally["imported"] += 1

    print(f"\n{'=' * 60}")
    print("DONE!")
    print(f" Imported: {tally['imported']}")
    print(f" Skipped (existing):{tally['existing']}")
    print(f" Skipped (no image):{tally['no_image']}")
    print(f" Skipped (license): {tally['license']}")
    print(f" Skipped (download):{tally['download']}")
    print(f" Errors: {tally['errors']}")
    print(f" Total processed: {total}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,290 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Import CC-licensed plant images from Wikimedia Commons into HerbAPI."""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
|
||||
# Config
#
# SECURITY NOTE(review): the literal credentials below are committed to version
# control. They remain only as fallbacks so existing invocations keep working;
# rotate them and supply the real values through the HERBAPI_* environment
# variables instead.
DB_HOST = os.environ.get("HERBAPI_DB_HOST", "10.31.3.90")
DB_USER = os.environ.get("HERBAPI_DB_USER", "herbapi")
DB_PASS = os.environ.get("HERBAPI_DB_PASS", "_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj")
DB_NAME = os.environ.get("HERBAPI_DB_NAME", "herbapi")
S3_BUCKET = os.environ.get("HERBAPI_S3_BUCKET", "herbapi")
S3_ENDPOINT = os.environ.get("HERBAPI_S3_ENDPOINT", "http://10.31.3.170:3900")
USER_AGENT = "HerbAPI/1.0 (https://herbapi.naturalised.at; florian.berthold@sub-net.at)"
# Seconds to sleep after each remote request.
REQUEST_DELAY = 0.3

# AWS env for subprocess calls.
# The HERBAPI_S3_* overrides are deliberately distinct from the ambient AWS_*
# variables: as before, whatever AWS_* values the caller's shell exports are
# replaced by the credentials for this S3 endpoint.
AWS_ENV = {
    **os.environ,
    "AWS_ACCESS_KEY_ID": os.environ.get(
        "HERBAPI_S3_ACCESS_KEY_ID", "GK1a89859373a6ac56bf11958f"
    ),
    "AWS_SECRET_ACCESS_KEY": os.environ.get(
        "HERBAPI_S3_SECRET_ACCESS_KEY",
        "bea45a333b5c7b1efdd7466bdbcac54d8642fa19f0c617ca2fd64bd07951b899",
    ),
    "AWS_DEFAULT_REGION": os.environ.get("HERBAPI_S3_REGION", "garage"),
}

# Stats: per-outcome counters for the current run.
stats = {
    "total": 0,
    "imported": 0,
    "no_p18": 0,
    "bad_license": 0,
    "download_fail": 0,
    "upload_fail": 0,
    "errors": 0,
}
|
||||
|
||||
|
||||
def fetch_url(url):
    """Return the raw response body of *url*, requested with the project User-Agent.

    The request uses a 30 second timeout; any urllib error propagates to the
    caller.
    """
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    response = urllib.request.urlopen(request, timeout=30)
    with response:
        body = response.read()
    return body
|
||||
|
||||
|
||||
def fetch_json(url):
    """Download *url* via fetch_url() and return the decoded JSON document."""
    raw_body = fetch_url(url)
    return json.loads(raw_body)
|
||||
|
||||
|
||||
def psql(sql):
    """Run one SQL command through the ``psql`` CLI and return its output.

    The command runs with ``-t -A`` (tuples only, unaligned), so a SELECT
    yields one line per row with columns separated by ``|``. The password is
    passed through the ``PGPASSWORD`` environment variable.

    Args:
        sql: SQL text handed to ``psql -c``.

    Returns:
        The stripped standard output of psql (empty string when the command
        prints nothing, e.g. a SELECT with no rows).

    Raises:
        RuntimeError: if psql exits with a non-zero status (connection
            failure or SQL error). Previously such failures were silently
            reported as empty output, which callers mistook for success.
    """
    result = subprocess.run(
        [
            "psql",
            # Make SQL errors produce a non-zero exit status.
            "-v", "ON_ERROR_STOP=1",
            "-h", DB_HOST, "-U", DB_USER, DB_NAME, "-t", "-A", "-c", sql,
        ],
        capture_output=True, text=True,
        env={**os.environ, "PGPASSWORD": DB_PASS},
    )
    if result.returncode != 0:
        raise RuntimeError(
            f"psql exited with status {result.returncode}: {result.stderr.strip()}"
        )
    return result.stdout.strip()
|
||||
|
||||
|
||||
def is_license_allowed(license_str):
    """Decide whether a Wikimedia licence short name permits re-use here.

    Accepted: CC0, anything mentioning "public domain" or starting with "pd",
    and "CC BY" / "CC BY-SA" in any version or jurisdiction.
    Rejected: empty input, any licence carrying an NC or ND clause, and
    everything not matched above (GFDL, FAL, "Copyrighted free use", ...).

    Matching is case-insensitive and ignores surrounding whitespace.
    """
    if not license_str:
        return False

    normalized = license_str.lower().strip()
    words = normalized.split()

    # NC / ND restrictions disqualify the licence no matter what else matches.
    has_restriction = (
        "nc" in words
        or "-nc" in normalized
        or "nd" in words
        or "-nd" in normalized
    )
    if has_restriction:
        return False

    # Public domain / CC0
    if normalized in ("cc0", "cc-zero", "cc0 1.0", "cc0 1.0 universal"):
        return True
    if "public domain" in normalized or normalized.startswith("pd"):
        return True

    # CC BY-SA and CC BY; the NC/ND variants were already filtered out above.
    attribution_patterns = (r"cc\s+by-sa\b", r"cc\s+by\b")
    return any(re.match(pattern, normalized) for pattern in attribution_patterns)
|
||||
|
||||
|
||||
def get_wikidata_image(qid):
    """Look up the P18 (image) statement of Wikidata item *qid*.

    Returns the URL-decoded last path segment of the first image URL the
    SPARQL endpoint reports, or None when the item has no P18 value.
    """
    sparql = f"SELECT ?image WHERE {{ wd:{qid} wdt:P18 ?image }} LIMIT 1"
    url = f"https://query.wikidata.org/sparql?query={urllib.parse.quote(sparql)}&format=json"
    rows = fetch_json(url).get("results", {}).get("bindings", [])
    if not rows:
        return None
    # The binding value is a URL; its last path segment is the file name.
    last_segment = rows[0]["image"]["value"].split("/")[-1]
    return urllib.parse.unquote(last_segment)
|
||||
|
||||
|
||||
def get_commons_info(filename):
    """Fetch licence, artist and URLs for a Commons file via the MediaWiki API.

    Returns a dict with the keys ``license``, ``artist``, ``thumb_url``,
    ``desc_url`` and ``filename``, or None when the API reports no page (or
    a page id of "-1") for ``File:<filename>``. Only the first page of the
    response is considered.
    """
    title = f"File:{filename}"
    url = (
        f"https://commons.wikimedia.org/w/api.php?action=query"
        f"&titles={urllib.parse.quote(title)}"
        f"&prop=imageinfo&iiprop=url|extmetadata"
        f"&iiurlwidth=800&format=json"
    )
    pages = fetch_json(url).get("query", {}).get("pages", {})
    if not pages:
        return None

    page_id, page = next(iter(pages.items()))
    if page_id == "-1":
        return None

    imageinfo = page.get("imageinfo", [{}])[0]
    meta = imageinfo.get("extmetadata", {})

    # Artist arrives as HTML: drop the tags, then collapse runs of whitespace.
    artist_without_tags = re.sub(r"<[^>]+>", "", meta.get("Artist", {}).get("value", ""))
    artist = re.sub(r"\s+", " ", artist_without_tags.strip())
    if len(artist) > 120:
        artist = artist[:117] + "..."

    return {
        "license": meta.get("LicenseShortName", {}).get("value", "").strip(),
        "artist": artist,
        # Thumbnail URL produced by the API for iiurlwidth=800.
        "thumb_url": imageinfo.get("thumburl", ""),
        "desc_url": imageinfo.get("descriptionurl", ""),
        "filename": filename,
    }
|
||||
|
||||
|
||||
def _sql_literal(value):
    """Return *value* as a single-quoted SQL string literal with quotes doubled."""
    return "'" + str(value).replace("'", "''") + "'"


def _remove_quietly(path):
    """Delete *path* if one was created, ignoring filesystem errors."""
    if path is None:
        return
    try:
        os.unlink(path)
    except OSError:
        pass


def process_species(species_id, slug, name_sci, qid):
    """Import one species image: Wikidata -> Commons -> S3 -> ``images`` table.

    Steps: look up the P18 file name for *qid*, fetch its Commons metadata,
    check the licence, download the thumbnail, upload it to
    ``species/<slug>.<ext>`` in the S3 bucket and insert a row into ``images``.
    The matching counter in the module-level ``stats`` dict is incremented for
    every outcome.

    Args:
        species_id: primary key of the species row (written to entity_id).
        slug: species slug, used to build the S3 key.
        name_sci: scientific name (accepted for the caller's convenience; not
            used here).
        qid: Wikidata item id of the species.

    Returns:
        True when the image was uploaded and the INSERT was issued without an
        exception, False for every skip or failure.
    """
    # Function-scope import so this block does not depend on the file header.
    import tempfile

    stats["total"] += 1

    # Step 1: Get image filename from Wikidata
    try:
        filename = get_wikidata_image(qid)
    except Exception as e:
        print(f" ERROR querying Wikidata for {qid}: {e}")
        stats["errors"] += 1
        return False
    time.sleep(REQUEST_DELAY)

    if not filename:
        print(f" No P18 image for {qid}")
        stats["no_p18"] += 1
        return False

    # Step 2: Get Commons info (license, artist, thumb URL)
    try:
        info = get_commons_info(filename)
    except Exception as e:
        print(f" ERROR querying Commons for {filename}: {e}")
        stats["errors"] += 1
        return False
    time.sleep(REQUEST_DELAY)

    if not info:
        print(f" No Commons info for {filename}")
        stats["errors"] += 1
        return False

    # Step 3: Check license
    if not is_license_allowed(info["license"]):
        print(f" Bad license: {info['license']} for {filename}")
        stats["bad_license"] += 1
        return False

    # Step 4: Download thumbnail using API-provided URL
    thumb_url = info["thumb_url"]
    if not thumb_url:
        print(f" No thumbnail URL available for {filename}")
        stats["download_fail"] += 1
        return False

    # Determine file extension from the last path segment of the thumbnail
    # URL (query string removed); anything that is not PNG/GIF is stored as jpg.
    thumb_name = thumb_url.lower().split("?")[0].split("/")[-1]
    ext = "jpg"
    if ".png" in thumb_name:
        ext = "png"
    elif ".gif" in thumb_name:
        ext = "gif"

    # mkstemp gives an unpredictable, exclusively created file instead of a
    # guessable /tmp/herbapi_img_<slug> path. The suffix keeps the extension
    # on the local file that is handed to `aws s3 cp`.
    tmp_path = None
    try:
        img_data = fetch_url(thumb_url)
        fd, tmp_path = tempfile.mkstemp(prefix="herbapi_img_", suffix=f".{ext}")
        with os.fdopen(fd, "wb") as f:
            f.write(img_data)
    except Exception as e:
        print(f" ERROR downloading {thumb_url}: {e}")
        stats["download_fail"] += 1
        _remove_quietly(tmp_path)
        return False
    time.sleep(REQUEST_DELAY)

    # Step 5: Upload to S3
    s3_key = f"species/{slug}.{ext}"
    try:
        result = subprocess.run(
            ["aws", "s3", "cp", tmp_path, f"s3://{S3_BUCKET}/{s3_key}", "--endpoint-url", S3_ENDPOINT],
            capture_output=True, text=True, env=AWS_ENV, timeout=60,
        )
        if result.returncode != 0:
            print(f" S3 upload failed: {result.stderr}")
            stats["upload_fail"] += 1
            return False
    except Exception as e:
        print(f" ERROR uploading to S3: {e}")
        stats["upload_fail"] += 1
        return False
    finally:
        # The local copy is no longer needed whether or not the upload worked.
        _remove_quietly(tmp_path)

    # Step 6: Insert into DB
    caption = f"Photo: {info['artist']}" if info["artist"] else ""
    source_url = info["desc_url"] or f"https://commons.wikimedia.org/wiki/File:{urllib.parse.quote(filename)}"

    # Every interpolated value goes through _sql_literal, including species_id
    # and s3_key, which were previously inserted without quote escaping.
    sql = (
        "INSERT INTO images (entity_type, entity_id, s3_key, caption, source_url, license, is_primary) "
        f"VALUES ('species', {_sql_literal(species_id)}, {_sql_literal(s3_key)}, "
        f"{_sql_literal(caption)}, {_sql_literal(source_url)}, {_sql_literal(info['license'])}, true);"
    )
    try:
        psql(sql)
    except Exception as e:
        print(f" ERROR inserting to DB: {e}")
        stats["errors"] += 1
        return False

    stats["imported"] += 1
    return True
|
||||
|
||||
|
||||
def main():
    """Import images for every species that has a Wikidata QID but no image row.

    Selects the candidate species through psql(), runs process_species() on
    each one in scientific-name order and prints a summary of the ``stats``
    counters at the end.
    """
    # Get species without images
    query = (
        "SELECT s.id, s.slug, s.name_scientific, s.wikidata_qid "
        "FROM species s "
        "LEFT JOIN images i ON i.entity_type = 'species' AND i.entity_id = s.id "
        "WHERE s.wikidata_qid IS NOT NULL AND s.wikidata_qid != '' AND i.id IS NULL "
        "ORDER BY s.name_scientific;"
    )
    rows = psql(query)
    if not rows:
        print("No species need images.")
        return

    # psql prints one row per line with '|' between columns; keep only
    # lines that split into exactly the four selected columns.
    split_rows = (line.strip().split("|") for line in rows.split("\n"))
    species_list = [fields for fields in split_rows if len(fields) == 4]

    print(f"Processing {len(species_list)} species...\n")

    for position, (sid, slug, name_sci, qid) in enumerate(species_list, 1):
        print(f"[{position}/{len(species_list)}] {name_sci} ({qid})")
        if process_species(sid, slug, name_sci, qid):
            print(f" OK - imported")

    summary_rows = (
        (" Total species processed: ", "total"),
        (" Successfully imported: ", "imported"),
        (" No P18 image: ", "no_p18"),
        (" Bad license (NC/ND/GFDL):", "bad_license"),
        (" Download failures: ", "download_fail"),
        (" Upload failures: ", "upload_fail"),
        (" Other errors: ", "errors"),
    )
    print(f"\n{'='*50}")
    print(f"RESULTS:")
    for label, counter in summary_rows:
        print(f"{label}{stats[counter]}")
|
||||
|
||||
|
||||
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,126 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Seed HerbAPI with common permaculture plant families and species via GBIF + API."""
|
||||
import json, urllib.request, urllib.parse, time, sys
|
||||
|
||||
# Base URL of the HerbAPI REST service that api_post() targets.
API = "http://herbapi01.corp.sub-net.at:8080/api/v1"
# Bearer token sent in the Authorization header by api_post().
# NOTE(review): this credential is committed in source; consider loading it
# from the environment and rotating it.
TOKEN = "km2WjhgyMTHlltwgch5TZADHQ-4uIg0NxBeowD-DHGk"
# Base URL of the GBIF API used by gbif_de_name().
GBIF = "https://api.gbif.org/v1"
|
||||
|
||||
def api_post(path, data):
    """POST *data* as JSON to the HerbAPI endpoint *path*.

    Args:
        path: endpoint path appended to ``API`` (e.g. "/families").
        data: JSON-serialisable request body.

    Returns:
        The decoded JSON response, or None when the request fails. HTTP error
        statuses and network-level failures (refused connection, timeout,
        DNS) are reported on stderr instead of aborting the seeding run.
    """
    req = urllib.request.Request(
        f"{API}{path}",
        data=json.dumps(data).encode(),
        headers={"Content-Type": "application/json", "Authorization": f"Bearer {TOKEN}"},
    )
    try:
        # The context manager closes the connection; the timeout keeps a
        # stalled server from hanging the script forever.
        with urllib.request.urlopen(req, timeout=30) as resp:
            return json.loads(resp.read())
    except urllib.error.HTTPError as e:
        print(f" ERR {e.code}: {e.read().decode(errors='replace')[:120]}", file=sys.stderr)
        return None
    except OSError as e:
        # URLError and socket timeouts are OSError subclasses.
        print(f" ERR network: {e}", file=sys.stderr)
        return None
|
||||
|
||||
def gbif_de_name(name):
    """Get German common name from GBIF.

    Matches *name* against the GBIF backbone, then returns the first
    vernacular name whose ``language`` is "deu" among the first 100 entries.

    Returns:
        The German vernacular name, or None when the name does not match,
        has no German entry, or the lookup fails. The lookup is best-effort:
        failures are reported on stderr and never raised.
    """
    match_url = f"{GBIF}/species/match?name={urllib.parse.quote(name)}"
    try:
        with urllib.request.urlopen(match_url, timeout=30) as resp:
            match = json.loads(resp.read())
        usage_key = match.get("usageKey")
        if not usage_key:
            return None
        names_url = f"{GBIF}/species/{usage_key}/vernacularNames?limit=100"
        with urllib.request.urlopen(names_url, timeout=30) as resp:
            data = json.loads(resp.read())
        for r in data.get("results", []):
            if r.get("language") == "deu":
                return r["vernacularName"]
    except Exception as e:
        # Deliberately broad to keep the lookup best-effort, but unlike a bare
        # `except:` it lets KeyboardInterrupt / SystemExit through.
        print(f" WARN GBIF lookup failed for {name}: {e}", file=sys.stderr)
    return None
|
||||
|
||||
FAMILIES = [
|
||||
("Fabaceae", "Hülsenfrüchtler", "Legumes"),
|
||||
("Rosaceae", "Rosengewächse", "Rose family"),
|
||||
("Brassicaceae", "Kreuzblütler", "Cabbage family"),
|
||||
("Apiaceae", "Doldenblütler", "Carrot family"),
|
||||
("Lamiaceae", "Lippenblütler", "Mint family"),
|
||||
("Asteraceae", "Korbblütler", "Daisy family"),
|
||||
("Solanaceae", "Nachtschattengewächse", "Nightshade family"),
|
||||
("Cucurbitaceae", "Kürbisgewächse", "Gourd family"),
|
||||
("Poaceae", "Süßgräser", "Grass family"),
|
||||
("Amaryllidaceae", "Amaryllisgewächse", "Amaryllis family"),
|
||||
("Boraginaceae", "Raublattgewächse", "Borage family"),
|
||||
("Adoxaceae", "Moschuskrautgewächse", "Moschatel family"),
|
||||
("Betulaceae", "Birkengewächse", "Birch family"),
|
||||
("Fagaceae", "Buchengewächse", "Beech family"),
|
||||
("Juglandaceae", "Walnussgewächse", "Walnut family"),
|
||||
("Caprifoliaceae", "Geißblattgewächse", "Honeysuckle family"),
|
||||
("Grossulariaceae", "Stachelbeergewächse", "Gooseberry family"),
|
||||
("Ericaceae", "Heidekrautgewächse", "Heath family"),
|
||||
("Moraceae", "Maulbeergewächse", "Mulberry family"),
|
||||
("Urticaceae", "Brennnesselgewächse", "Nettle family"),
|
||||
("Malvaceae", "Malvengewächse", "Mallow family"),
|
||||
("Polygonaceae", "Knöterichgewächse", "Buckwheat family"),
|
||||
("Chenopodiaceae", "Gänsefußgewächse", "Goosefoot family"),
|
||||
("Asparagaceae", "Spargelgewächse", "Asparagus family"),
|
||||
("Plantaginaceae", "Wegerichgewächse", "Plantain family"),
|
||||
]
|
||||
|
||||
SPECIES = [
|
||||
("Sambucus nigra", "Adoxaceae", {"plant_layer": "understory", "nitrogen_fixer": False, "food_uses": "Flowers (cordial, fritters), berries (cooked — syrup, wine)", "medicinal_uses": "Cold/flu remedy, immune support, diaphoretic", "succession_stage": "secondary"}),
|
||||
("Symphytum officinale", "Boraginaceae", {"plant_layer": "herbaceous", "dynamic_accumulator": True, "food_uses": "Young leaves (limited, contains pyrrolizidine alkaloids)", "medicinal_uses": "Wound healing, bone knitting (external only)", "other_uses": "Dynamic accumulator, mulch/compost activator, animal fodder"}),
|
||||
("Trifolium pratense", "Fabaceae", {"plant_layer": "ground_cover", "nitrogen_fixer": True, "food_uses": "Flowers, young leaves", "medicinal_uses": "Respiratory, menopausal symptoms", "other_uses": "Green manure, nitrogen fixer, bee forage"}),
|
||||
("Corylus avellana", "Betulaceae", {"plant_layer": "shrub", "food_uses": "Nuts", "other_uses": "Coppice wood, hedging, wildlife habitat", "succession_stage": "secondary"}),
|
||||
("Ribes nigrum", "Grossulariaceae", {"plant_layer": "shrub", "food_uses": "Berries, leaves (tea)", "medicinal_uses": "High vitamin C, anti-inflammatory"}),
|
||||
("Rubus idaeus", "Rosaceae", {"plant_layer": "shrub", "food_uses": "Berries, leaves (tea)", "medicinal_uses": "Leaf tea for pregnancy/digestion", "succession_stage": "pioneer"}),
|
||||
("Urtica dioica", "Urticaceae", {"plant_layer": "herbaceous", "dynamic_accumulator": True, "food_uses": "Young leaves, seeds", "medicinal_uses": "Anti-inflammatory, prostate, allergies", "other_uses": "Compost activator, fibre, liquid fertiliser"}),
|
||||
("Borago officinalis", "Boraginaceae", {"plant_layer": "herbaceous", "food_uses": "Flowers, young leaves", "other_uses": "Bee forage, companion plant", "attracts_pollinators": True}),
|
||||
("Lavandula angustifolia", "Lamiaceae", {"plant_layer": "herbaceous", "food_uses": "Flowers", "medicinal_uses": "Calming, antiseptic, sleep aid", "other_uses": "Bee forage, pest repellent, fragrance", "attracts_pollinators": True}),
|
||||
("Malus domestica", "Rosaceae", {"plant_layer": "canopy", "food_uses": "Fruit", "pollination_type": "Insect-pollinated"}),
|
||||
("Prunus domestica", "Rosaceae", {"plant_layer": "canopy", "food_uses": "Fruit", "pollination_type": "Insect-pollinated"}),
|
||||
("Juglans regia", "Juglandaceae", {"plant_layer": "canopy", "food_uses": "Nuts", "other_uses": "Timber, dye", "allelopathic": True}),
|
||||
("Fragaria vesca", "Rosaceae", {"plant_layer": "ground_cover", "food_uses": "Berries, leaves (tea)", "ground_cover_quality": "Good"}),
|
||||
("Allium ursinum", "Amaryllidaceae", {"plant_layer": "ground_cover", "food_uses": "Leaves, flowers, bulbs", "medicinal_uses": "Antimicrobial, blood pressure"}),
|
||||
("Phacelia tanacetifolia", "Boraginaceae", {"plant_layer": "herbaceous", "other_uses": "Green manure, bee forage, cover crop", "attracts_pollinators": True}),
|
||||
("Lupinus polyphyllus", "Fabaceae", {"plant_layer": "herbaceous", "nitrogen_fixer": True, "other_uses": "Nitrogen fixer, green manure, ornamental"}),
|
||||
("Vicia faba", "Fabaceae", {"plant_layer": "herbaceous", "nitrogen_fixer": True, "food_uses": "Beans", "other_uses": "Nitrogen fixer, green manure"}),
|
||||
("Solanum lycopersicum", "Solanaceae", {"plant_layer": "herbaceous", "food_uses": "Fruit"}),
|
||||
("Cucurbita pepo", "Cucurbitaceae", {"plant_layer": "ground_cover", "food_uses": "Fruit, seeds, flowers"}),
|
||||
("Beta vulgaris", "Chenopodiaceae", {"plant_layer": "herbaceous", "food_uses": "Roots, leaves"}),
|
||||
("Daucus carota", "Apiaceae", {"plant_layer": "herbaceous", "food_uses": "Root"}),
|
||||
("Calendula officinalis", "Asteraceae", {"plant_layer": "herbaceous", "food_uses": "Flowers", "medicinal_uses": "Wound healing, anti-inflammatory, skin care", "other_uses": "Companion plant, pest deterrent", "attracts_pollinators": True}),
|
||||
("Melissa officinalis", "Lamiaceae", {"plant_layer": "herbaceous", "food_uses": "Leaves", "medicinal_uses": "Calming, antiviral, digestive", "attracts_pollinators": True}),
|
||||
("Salvia officinalis", "Lamiaceae", {"plant_layer": "herbaceous", "food_uses": "Leaves", "medicinal_uses": "Sore throat, digestive, antimicrobial"}),
|
||||
("Thymus vulgaris", "Lamiaceae", {"plant_layer": "ground_cover", "food_uses": "Leaves", "medicinal_uses": "Respiratory, antimicrobial, cough"}),
|
||||
]
|
||||
|
||||
# Create families: POST each family and remember the id the API returns for it.
print("=== Creating families ===")
family_map = {}
for fam_sci, fam_de, fam_en in FAMILIES:
    family_payload = {"name_scientific": fam_sci, "name_de": fam_de, "name_en": fam_en}
    family_reply = api_post("/families", family_payload)
    if family_reply:
        family_map[fam_sci] = family_reply["id"]
        print(f" ✓ {fam_sci}")
    time.sleep(0.05)
print(f"Created {len(family_map)} families\n")

# Create species: skip any whose family was not created above, otherwise
# look up the German name on GBIF and POST the record.
print("=== Creating species (with GBIF German names) ===")
created = 0
for species_sci, species_family, traits in SPECIES:
    family_id = family_map.get(species_family)
    if not family_id:
        print(f" ✗ {species_sci} — family {species_family} missing")
        continue
    german_name = gbif_de_name(species_sci)
    species_payload = {
        "name_scientific": species_sci,
        "name_de": german_name or "",
        "name_en": "",
        "family_id": family_id,
    }
    # Trait keys are applied last, so they win over the base keys on a clash.
    species_payload.update(traits)
    species_reply = api_post("/species", species_payload)
    if species_reply:
        created += 1
        print(f" ✓ {species_sci} → {german_name or '(no DE name)'}")
    time.sleep(0.15)
print(f"Created {created} species\n")

# Create suppliers
print("=== Creating suppliers ===")
supplier_rows = [
    ("Reinsaat", "https://www.reinsaat.at", "AT", True, True, "Austrian biodynamic seed producer, open-pollinated varieties"),
    ("Magic Garden Seeds", "https://www.magicgardenseeds.com", "DE", False, False, "Specialist seed shop with rare and heritage varieties"),
]
supplier_fields = ("name", "url", "country", "is_organic", "is_demeter", "notes")
for supplier_row in supplier_rows:
    supplier_payload = dict(zip(supplier_fields, supplier_row))
    if api_post("/suppliers", supplier_payload):
        print(f" ✓ {supplier_row[0]}")
print("\nDone!")
|
||||
Reference in New Issue
Block a user