Files
herbapi/tools/enrichment/import_images_v2.py

291 lines
9.2 KiB
Python

#!/usr/bin/env python3
"""Import CC-licensed plant images from Wikimedia Commons into HerbAPI."""
import hashlib
import json
import os
import re
import subprocess
import sys
import tempfile
import time
import urllib.parse
import urllib.request
# Config
#
# Connection settings and credentials can be overridden with HERBAPI_*
# environment variables; the literals are fallbacks so that an unconfigured
# run behaves exactly as before.
# NOTE(review): the fallback secrets below are committed in plain text --
# rotate them and supply the real values through the environment instead.
DB_HOST = os.environ.get("HERBAPI_DB_HOST", "10.31.3.90")
DB_USER = os.environ.get("HERBAPI_DB_USER", "herbapi")
DB_PASS = os.environ.get("HERBAPI_DB_PASS", "_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj")
DB_NAME = os.environ.get("HERBAPI_DB_NAME", "herbapi")
S3_BUCKET = os.environ.get("HERBAPI_S3_BUCKET", "herbapi")
S3_ENDPOINT = os.environ.get("HERBAPI_S3_ENDPOINT", "http://10.31.3.170:3900")
USER_AGENT = "HerbAPI/1.0 (https://herbapi.naturalised.at; florian.berthold@sub-net.at)"
# Pause (seconds) that callers sleep between consecutive outbound requests.
REQUEST_DELAY = 0.3
# AWS env for subprocess calls
# The three AWS_* keys always replace whatever the parent environment holds;
# only the dedicated HERBAPI_S3_* variables can change them.
AWS_ENV = {
    **os.environ,
    "AWS_ACCESS_KEY_ID": os.environ.get(
        "HERBAPI_S3_ACCESS_KEY_ID", "GK1a89859373a6ac56bf11958f"
    ),
    "AWS_SECRET_ACCESS_KEY": os.environ.get(
        "HERBAPI_S3_SECRET_ACCESS_KEY",
        "bea45a333b5c7b1efdd7466bdbcac54d8642fa19f0c617ca2fd64bd07951b899",
    ),
    "AWS_DEFAULT_REGION": os.environ.get("HERBAPI_S3_REGION", "garage"),
}
# Stats
# Per-run outcome counters, all starting at zero; process_species() bumps
# them and main() prints them in the final summary.
stats = dict.fromkeys(
    (
        "total",
        "imported",
        "no_p18",
        "bad_license",
        "download_fail",
        "upload_fail",
        "errors",
    ),
    0,
)
def fetch_url(url):
    """Download *url* and return the raw response body as bytes.

    The request carries the module-level ``USER_AGENT`` header and uses a
    30-second timeout. Any error raised by ``urllib`` propagates to the caller.
    """
    request = urllib.request.Request(url)
    request.add_header("User-Agent", USER_AGENT)
    with urllib.request.urlopen(request, timeout=30) as response:
        body = response.read()
    return body
def fetch_json(url):
    """Return the JSON document served at *url*, decoded into Python objects."""
    payload = fetch_url(url)
    return json.loads(payload)
def psql(sql):
    """Execute *sql* through the ``psql`` CLI and return its stripped stdout.

    psql is invoked with ``-t -A -c`` against ``DB_HOST``/``DB_USER``/
    ``DB_NAME``; the password is handed over through ``PGPASSWORD`` in the
    child environment rather than on the command line.

    Args:
        sql: Command string passed verbatim to ``psql -c``.

    Returns:
        Standard output of psql with surrounding whitespace removed.

    Raises:
        RuntimeError: psql exited with a non-zero status (connection failure,
            SQL error, ...). Previously such failures were indistinguishable
            from an empty result, so callers treated them as success.
    """
    result = subprocess.run(
        ["psql", "-h", DB_HOST, "-U", DB_USER, DB_NAME, "-t", "-A", "-c", sql],
        capture_output=True,
        text=True,
        env={**os.environ, "PGPASSWORD": DB_PASS},
    )
    if result.returncode != 0:
        raise RuntimeError(
            f"psql exited with status {result.returncode}: {result.stderr.strip()}"
        )
    return result.stdout.strip()
def is_license_allowed(license_str):
    """Decide whether a Commons licence short name is acceptable for import.

    Accepted: CC0, anything mentioning "public domain" or starting with "PD",
    and names beginning with "CC BY" / "CC BY-SA" (whitespace between "CC" and
    "BY" is required). Anything carrying an NC or ND component is refused, as
    is every other licence (GFDL, FAL, "Copyrighted free use", ...) and an
    empty or missing value. Matching is case-insensitive.
    """
    if not license_str:
        return False

    normalized = license_str.lower().strip()
    words = normalized.split()

    # Non-commercial / no-derivatives markers win over every other rule,
    # whether they appear as a separate word or as a hyphenated suffix.
    has_nc = "nc" in words or "-nc" in normalized
    has_nd = "nd" in words or "-nd" in normalized
    if has_nc or has_nd:
        return False

    # Exact spellings of CC0.
    if normalized in ("cc0", "cc-zero", "cc0 1.0", "cc0 1.0 universal"):
        return True

    # Public-domain wording or a PD-* tag.
    if normalized.startswith("pd") or "public domain" in normalized:
        return True

    # Attribution licences, with or without share-alike, any version.
    attribution_patterns = (r"cc\s+by-sa\b", r"cc\s+by\b")
    return any(re.match(pattern, normalized) for pattern in attribution_patterns)
def get_wikidata_image(qid):
    """Look up the P18 (image) statement of a Wikidata item via SPARQL.

    Returns the URL-decoded last path segment of the first image URL in the
    result, or ``None`` when the query yields no binding.
    """
    query = f"SELECT ?image WHERE {{ wd:{qid} wdt:P18 ?image }} LIMIT 1"
    encoded_query = urllib.parse.quote(query)
    endpoint = f"https://query.wikidata.org/sparql?query={encoded_query}&format=json"

    response = fetch_json(endpoint)
    rows = response.get("results", {}).get("bindings", [])
    if rows:
        image_url = rows[0]["image"]["value"]
        # The file name is whatever follows the final slash of the image URL.
        return urllib.parse.unquote(image_url.rsplit("/", 1)[-1])
    return None
def get_commons_info(filename):
    """Fetch licence, artist and thumbnail data for a Commons file.

    Queries the Commons ``imageinfo`` API (800px thumbnail requested) for
    ``File:<filename>`` and inspects only the first page of the response.

    Returns:
        A dict with keys ``license``, ``artist``, ``thumb_url``, ``desc_url``
        and ``filename``; ``None`` when the response contains no page or the
        first page id is ``"-1"``.
    """
    encoded_title = urllib.parse.quote(f"File:{filename}")
    api_url = (
        "https://commons.wikimedia.org/w/api.php?action=query"
        f"&titles={encoded_title}"
        "&prop=imageinfo&iiprop=url|extmetadata"
        "&iiurlwidth=800&format=json"
    )
    response = fetch_json(api_url)
    pages = response.get("query", {}).get("pages", {})

    first_entry = next(iter(pages.items()), None)
    if first_entry is None:
        return None
    page_id, page = first_entry
    if page_id == "-1":
        return None

    imageinfo = page.get("imageinfo", [{}])[0]
    extmeta = imageinfo.get("extmetadata", {})

    def meta_value(key):
        # extmetadata entries are small dicts; the text sits under "value".
        return extmeta.get(key, {}).get("value", "")

    license_short = meta_value("LicenseShortName").strip()

    # The artist field is HTML: drop the tags, then squeeze whitespace runs.
    artist = re.sub(r"<[^>]+>", "", meta_value("Artist")).strip()
    artist = re.sub(r"\s+", " ", artist)
    if len(artist) > 120:
        # Cap at 120 characters including the ellipsis.
        artist = f"{artist[:117]}..."

    return {
        "license": license_short,
        "artist": artist,
        # Thumbnail URL as supplied by the API for the requested width.
        "thumb_url": imageinfo.get("thumburl", ""),
        # File description page on Commons.
        "desc_url": imageinfo.get("descriptionurl", ""),
        "filename": filename,
    }
def _thumbnail_extension(thumb_url):
    """Return "png", "gif" or "jpg" based on the suffix of the URL's file name."""
    basename = thumb_url.lower().split("?")[0].split("/")[-1]
    for candidate in ("png", "gif"):
        # Suffix test, so a name such as "a.png-scan.jpg" is not taken for a PNG.
        if basename.endswith(f".{candidate}"):
            return candidate
    return "jpg"


def _sql_literal(value):
    """Render *value* as a single-quoted SQL string literal, doubling quotes."""
    return "'" + str(value).replace("'", "''") + "'"


def _remove_file(path):
    """Best-effort removal of *path*; a missing file or ``None`` is ignored."""
    if path is None:
        return
    try:
        os.unlink(path)
    except OSError:
        pass


def process_species(species_id, slug, name_sci, qid):
    """Import the Wikidata P18 image of one species.

    Steps: resolve the Commons file name from Wikidata, fetch its licence and
    thumbnail URL from Commons, reject disallowed licences, download the
    thumbnail to a temporary file, upload it to ``s3://S3_BUCKET/species/<slug>.<ext>``
    with the ``aws`` CLI, and insert a row into ``images``. The matching
    counter in ``stats`` is incremented for every outcome.

    Args:
        species_id: Value stored in ``images.entity_id``.
        slug: Species slug; becomes the object name in the S3 key.
        name_sci: Scientific name (not used here; kept for the caller's signature).
        qid: Wikidata item id to query for a P18 statement.

    Returns:
        ``True`` when the image was uploaded and the insert did not raise,
        ``False`` for every skip or failure.
    """
    stats["total"] += 1

    # Step 1: Get image filename from Wikidata
    try:
        filename = get_wikidata_image(qid)
    except Exception as e:
        print(f" ERROR querying Wikidata for {qid}: {e}")
        stats["errors"] += 1
        return False
    time.sleep(REQUEST_DELAY)
    if not filename:
        print(f" No P18 image for {qid}")
        stats["no_p18"] += 1
        return False

    # Step 2: Get Commons info (license, artist, thumb URL)
    try:
        info = get_commons_info(filename)
    except Exception as e:
        print(f" ERROR querying Commons for {filename}: {e}")
        stats["errors"] += 1
        return False
    time.sleep(REQUEST_DELAY)
    if not info:
        print(f" No Commons info for {filename}")
        stats["errors"] += 1
        return False

    # Step 3: Check license
    if not is_license_allowed(info["license"]):
        print(f" Bad license: {info['license']} for {filename}")
        stats["bad_license"] += 1
        return False

    # Step 4: Download thumbnail using API-provided URL
    thumb_url = info["thumb_url"]
    if not thumb_url:
        print(f" No thumbnail URL available for {filename}")
        stats["download_fail"] += 1
        return False
    ext = _thumbnail_extension(thumb_url)

    # A uniquely named temp file avoids a predictable /tmp path and keeps a
    # slug containing path separators from affecting where the file lands.
    tmp_path = None
    try:
        img_data = fetch_url(thumb_url)
        with tempfile.NamedTemporaryFile(
            prefix="herbapi_img_", suffix=f".{ext}", delete=False
        ) as tmp:
            tmp_path = tmp.name
            tmp.write(img_data)
    except Exception as e:
        print(f" ERROR downloading {thumb_url}: {e}")
        stats["download_fail"] += 1
        # Do not leave a partially written file behind.
        _remove_file(tmp_path)
        return False
    time.sleep(REQUEST_DELAY)

    # Step 5: Upload to S3
    s3_key = f"species/{slug}.{ext}"
    try:
        result = subprocess.run(
            ["aws", "s3", "cp", tmp_path, f"s3://{S3_BUCKET}/{s3_key}", "--endpoint-url", S3_ENDPOINT],
            capture_output=True, text=True, env=AWS_ENV, timeout=60,
        )
        if result.returncode != 0:
            print(f" S3 upload failed: {result.stderr}")
            stats["upload_fail"] += 1
            return False
    except Exception as e:
        print(f" ERROR uploading to S3: {e}")
        stats["upload_fail"] += 1
        return False
    finally:
        _remove_file(tmp_path)

    # Step 6: Insert into DB
    caption = f"Photo: {info['artist']}" if info["artist"] else ""
    source_url = info["desc_url"] or f"https://commons.wikimedia.org/wiki/File:{urllib.parse.quote(filename)}"
    # Every interpolated value goes through the same quoting helper; artist
    # and licence text originate from Commons and may contain apostrophes.
    values = ", ".join(
        _sql_literal(v)
        for v in ("species", species_id, s3_key, caption, source_url, info["license"])
    )
    sql = (
        "INSERT INTO images (entity_type, entity_id, s3_key, caption, source_url, license, is_primary) "
        f"VALUES ({values}, true);"
    )
    try:
        psql(sql)
    except Exception as e:
        print(f" ERROR inserting to DB: {e}")
        stats["errors"] += 1
        return False

    stats["imported"] += 1
    return True
def main():
    """Import images for every species that has a Wikidata QID but no image row.

    Selects the candidates through ``psql``, hands each one to
    ``process_species`` and finally prints the counters gathered in ``stats``.
    """
    # Get species without images
    rows = psql(
        "SELECT s.id, s.slug, s.name_scientific, s.wikidata_qid "
        "FROM species s "
        "LEFT JOIN images i ON i.entity_type = 'species' AND i.entity_id = s.id "
        "WHERE s.wikidata_qid IS NOT NULL AND s.wikidata_qid != '' AND i.id IS NULL "
        "ORDER BY s.name_scientific;"
    )
    if not rows:
        print("No species need images.")
        return

    # One record per output line, fields separated by "|"; lines that do not
    # split into exactly four fields are dropped.
    split_lines = (line.strip().split("|") for line in rows.split("\n"))
    species_list = [fields for fields in split_lines if len(fields) == 4]
    total = len(species_list)
    print(f"Processing {total} species...\n")

    for position, (sid, slug, name_sci, qid) in enumerate(species_list, 1):
        print(f"[{position}/{total}] {name_sci} ({qid})")
        if process_species(sid, slug, name_sci, qid):
            print(" OK - imported")

    summary_rows = (
        (" Total species processed: ", "total"),
        (" Successfully imported: ", "imported"),
        (" No P18 image: ", "no_p18"),
        (" Bad license (NC/ND/GFDL):", "bad_license"),
        (" Download failures: ", "download_fail"),
        (" Upload failures: ", "upload_fail"),
        (" Other errors: ", "errors"),
    )
    print("\n" + "=" * 50)
    print("RESULTS:")
    for label, key in summary_rows:
        print(f"{label}{stats[key]}")


if __name__ == "__main__":
    main()