Files
herbapi/tools/enrichment/import_images.py
T

363 lines
11 KiB
Python

#!/usr/bin/env python3
"""Import CC-licensed plant images from Wikimedia Commons via Wikidata into HerbAPI."""
import json
import os
import re
import subprocess
import sys
import time
import urllib.parse
import urllib.request
# Switch stdout and stderr to line-buffered mode: with line_buffering=True the
# stream is flushed whenever a newline is written, so every progress line
# printed below shows up immediately. (This is line buffering, not fully
# unbuffered output.)
sys.stdout.reconfigure(line_buffering=True)
sys.stderr.reconfigure(line_buffering=True)
# --- Configuration ---
# Every connection setting can be overridden through a HERBAPI_* environment
# variable; the literals below are only the fallback used when the variable is
# not set, so existing invocations keep working unchanged.
# NOTE(review): the fallback S3 and database credentials are committed in
# plain text. They should be rotated and removed from the source once the
# environment variables are provisioned wherever this script runs.
S3_ENDPOINT = os.environ.get("HERBAPI_S3_ENDPOINT", "http://garage.sub-net.at:3900")
S3_BUCKET = os.environ.get("HERBAPI_S3_BUCKET", "herbapi")
S3_ACCESS_KEY = os.environ.get("HERBAPI_S3_ACCESS_KEY", "GK1a89859373a6ac56bf11958f")
S3_SECRET_KEY = os.environ.get(
    "HERBAPI_S3_SECRET_KEY",
    "bea45a333b5c7b1efdd7466bdbcac54d8642fa19f0c617ca2fd64bd07951b899",
)
S3_REGION = os.environ.get("HERBAPI_S3_REGION", "garage")
DB_HOST = os.environ.get("HERBAPI_DB_HOST", "10.31.3.90")
DB_USER = os.environ.get("HERBAPI_DB_USER", "herbapi")
DB_PASS = os.environ.get("HERBAPI_DB_PASS", "_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj")
DB_NAME = os.environ.get("HERBAPI_DB_NAME", "herbapi")
# Sent as the User-Agent header on every HTTP request this script makes.
USER_AGENT = "HerbAPI/1.0 (https://herbapi.naturalised.at; florian.berthold@sub-net.at)"
# Thumbnail width requested from the Commons API (passed as iiurlwidth).
THUMB_WIDTH = 800
# Seconds to sleep before each Wikidata / Commons / download request.
REQUEST_DELAY = 0.3
# Lower-cased licence short names that are accepted without further pattern
# checks, grouped by licence family.
ALLOWED_LICENSES = (
    # CC0 / CC Zero
    {"cc0", "cc-zero", "cc0 1.0", "cc-zero 1.0"}
    # Public-domain tags
    | {
        "public domain",
        "pd",
        "pd-self",
        "pd-old",
        "pd-old-auto",
        "pd-old-100",
        "pd-us",
        "pd-usgov",
        "pd-author",
    }
    # CC BY, spelled with spaces and with hyphens
    | {"cc by 1.0", "cc by 2.0", "cc by 2.5", "cc by 3.0", "cc by 4.0"}
    | {"cc-by-1.0", "cc-by-2.0", "cc-by-2.5", "cc-by-3.0", "cc-by-4.0"}
    # CC BY-SA, spelled with spaces and with hyphens
    | {"cc by-sa 1.0", "cc by-sa 2.0", "cc by-sa 2.5", "cc by-sa 3.0", "cc by-sa 4.0"}
    | {"cc-by-sa-1.0", "cc-by-sa-2.0", "cc-by-sa-2.5", "cc-by-sa-3.0", "cc-by-sa-4.0"}
)
def slugify(name: str) -> str:
    """Turn a scientific name into a lowercase, hyphen-separated slug.

    Each run of characters outside ``a-z0-9`` (after lower-casing) collapses
    into a single hyphen, and hyphens at either end are removed.
    """
    lowered = name.lower()
    hyphenated = re.sub(r'[^a-z0-9]+', '-', lowered)
    return hyphenated.strip('-')
def psql(query: str) -> str:
    """Execute ``query`` through the ``psql`` CLI and return its stripped stdout.

    The password is handed to the child process via ``PGPASSWORD``. The command
    is run with ``-t -A``. A non-zero exit status is only reported on stderr;
    whatever stdout contains is still returned to the caller.
    """
    command = ["psql", "-h", DB_HOST, "-U", DB_USER, DB_NAME, "-t", "-A", "-c", query]
    child_env = {**os.environ, "PGPASSWORD": DB_PASS}
    completed = subprocess.run(command, capture_output=True, text=True, env=child_env)
    if completed.returncode != 0:
        print(f" psql error: {completed.stderr.strip()}", file=sys.stderr)
    return completed.stdout.strip()
def fetch_json(url: str) -> dict | None:
    """GET ``url`` with the project User-Agent and decode the body as JSON.

    Returns the decoded document. If the request or the decoding raises, a
    message is printed and ``None`` is returned instead.
    """
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            payload = response.read()
        return json.loads(payload)
    except Exception as exc:
        print(f" HTTP error fetching {url}: {exc}")
    return None
def get_wikidata_image(qid: str) -> str | None:
"""Query Wikidata SPARQL for P18 image filename."""
sparql = f"SELECT ?image WHERE {{ wd:{qid} wdt:P18 ?image }} LIMIT 1"
url = "https://query.wikidata.org/sparql?" + urllib.parse.urlencode({
"query": sparql, "format": "json"
})
data = fetch_json(url)
if not data:
return None
bindings = data.get("results", {}).get("bindings", [])
if not bindings:
return None
image_url = bindings[0]["image"]["value"]
# URL like http://commons.wikimedia.org/wiki/Special:FilePath/Filename.jpg
filename = urllib.parse.unquote(image_url.rsplit("/", 1)[-1])
return filename
def get_commons_info(filename: str) -> dict | None:
    """Query the Wikimedia Commons API for one file's image info.

    Only the first entry of the response's ``pages`` mapping is examined.

    Returns:
        A dict with ``thumb_url`` (``thumburl``, falling back to ``url``),
        ``description_url``, ``license`` (``LicenseShortName``), ``artist``
        (HTML tags removed, whitespace collapsed) and ``filename``. ``None``
        is returned when the request fails, there is no page, the page id is
        ``"-1"``, or the page carries no ``imageinfo``.
    """
    params = {
        "action": "query",
        "titles": f"File:{filename}",
        "prop": "imageinfo",
        "iiprop": "url|extmetadata",
        "iiurlwidth": str(THUMB_WIDTH),
        "format": "json",
    }
    data = fetch_json(
        "https://commons.wikimedia.org/w/api.php?" + urllib.parse.urlencode(params)
    )
    if not data:
        return None

    pages = data.get("query", {}).get("pages", {})
    first_entry = next(iter(pages.items()), None)
    if first_entry is None:
        return None
    page_id, page = first_entry
    if page_id == "-1":
        return None

    entries = page.get("imageinfo", [])
    if not entries:
        return None
    info = entries[0]
    meta = info.get("extmetadata", {})

    # Strip HTML tags from the artist field, then collapse whitespace runs.
    artist_text = re.sub(r'<[^>]+>', '', meta.get("Artist", {}).get("value", "")).strip()
    artist_text = re.sub(r'\s+', ' ', artist_text)

    return {
        "thumb_url": info.get("thumburl") or info.get("url"),
        "description_url": info.get("descriptionurl", ""),
        "license": meta.get("LicenseShortName", {}).get("value", ""),
        "artist": artist_text,
        "filename": filename,
    }
def is_license_allowed(license_str: str) -> bool:
    """Decide whether a Commons licence short name may be imported.

    Args:
        license_str: Licence short name as reported by Commons; compared
            case-insensitively with surrounding whitespace ignored.

    Returns:
        ``True`` for an exact ``ALLOWED_LICENSES`` entry, or for a name that
        starts like a public-domain, CC BY, CC BY-SA or CC0 licence and has
        no NC / ND component. ``False`` otherwise.
    """
    normalized = license_str.lower().strip()
    # Direct match
    if normalized in ALLOWED_LICENSES:
        return True
    # Reject NonCommercial / NoDerivatives licences. The check looks at whole
    # tokens: a plain substring test also fires on unrelated words such as
    # "india" or "france" and wrongly rejects e.g. "pd-india".
    tokens = set(re.split(r'[^a-z0-9]+', normalized))
    if tokens & {"nc", "nd"}:
        return False
    # Check patterns
    if normalized.startswith(("public domain", "pd")):
        return True
    if re.match(r'^cc[- ]?by[- ]?sa[- ]?\d', normalized):
        return True
    if re.match(r'^cc[- ]?by[- ]?\d', normalized):
        return True
    if re.match(r'^cc[- ]?0', normalized) or normalized == "cc zero":
        return True
    return False
def normalize_license(license_str: str) -> str:
    """Map a licence short name onto the canonical label stored in the database.

    Public-domain variants become ``"Public domain"``, CC0 variants become
    ``"CC0 1.0"``, and CC BY / CC BY-SA names are rewritten as
    ``"CC BY <version>"`` / ``"CC BY-SA <version>"``. Anything else is
    returned exactly as it was passed in.
    """
    key = license_str.lower().strip()

    if key.startswith("pd") or "public domain" in key:
        return "Public domain"

    looks_like_cc0 = (
        re.match(r'^cc[- ]?0', key) is not None
        or "cc-zero" in key
        or "cc zero" in key
    )
    if looks_like_cc0:
        return "CC0 1.0"

    # BY-SA must be tried before plain BY; the first matching pattern wins.
    versioned_families = (
        (r'^cc[- ]?by[- ]?sa[- ]?(\d+\.?\d*)', "CC BY-SA"),
        (r'^cc[- ]?by[- ]?(\d+\.?\d*)', "CC BY"),
    )
    for pattern, label in versioned_families:
        found = re.match(pattern, key)
        if found:
            return f"{label} {found.group(1)}"

    return license_str
def s3_upload(s3_key: str, data: bytes, content_type: str = "image/jpeg") -> None:
    """Upload ``data`` to the configured S3 bucket with ``aws s3 cp``.

    The bytes are staged in a temporary file, which is removed again whether
    or not the upload succeeds.

    Args:
        s3_key: Object key inside ``S3_BUCKET``.
        data: Raw object content.
        content_type: Value passed to ``--content-type``.

    Raises:
        RuntimeError: If the AWS CLI exits with a non-zero status.
    """
    import tempfile  # local import so this block does not depend on the file header

    env = os.environ.copy()
    env["AWS_ACCESS_KEY_ID"] = S3_ACCESS_KEY
    env["AWS_SECRET_ACCESS_KEY"] = S3_SECRET_KEY
    env["AWS_DEFAULT_REGION"] = S3_REGION

    # A uniquely named temp file replaces the former fixed /tmp path, which
    # let concurrent runs overwrite each other's upload and was predictable.
    staged = tempfile.NamedTemporaryFile(prefix="herbapi_upload_", delete=False)
    try:
        with staged:
            staged.write(data)
        result = subprocess.run(
            [
                "aws", "s3", "cp", staged.name,
                f"s3://{S3_BUCKET}/{s3_key}",
                "--endpoint-url", S3_ENDPOINT,
                "--content-type", content_type,
            ],
            capture_output=True, text=True, env=env
        )
    finally:
        # Runs even when the CLI cannot be started, so no file is left behind.
        os.unlink(staged.name)

    if result.returncode != 0:
        raise RuntimeError(f"S3 upload failed: {result.stderr.strip()}")
def download_image(url: str) -> bytes | None:
    """Fetch the raw bytes behind ``url`` using the project User-Agent.

    Returns the response body. If the request raises, a message is printed
    and ``None`` is returned instead.
    """
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        with urllib.request.urlopen(request, timeout=60) as response:
            body = response.read()
    except Exception as exc:
        print(f" Download error: {exc}")
        return None
    return body
# Content type sent to S3 for each extension used in the object key.
_CONTENT_TYPES = {
    "jpg": "image/jpeg",
    "png": "image/png",
    "svg": "image/svg+xml",
    "gif": "image/gif",
}


def _sql_literal(value: str) -> str:
    """Quote ``value`` as a SQL string literal, doubling embedded single quotes."""
    return "'" + value.replace("'", "''") + "'"


def _load_species() -> list[dict]:
    """Return id/name/qid dicts for every species row that has a Wikidata QID."""
    rows = psql(
        "SELECT id, name_scientific, wikidata_qid FROM species "
        "WHERE wikidata_qid IS NOT NULL AND wikidata_qid <> '' "
        "ORDER BY name_scientific"
    )
    species = []
    for line in rows.split("\n"):
        # The name sits between the first and the last separator, so a "|"
        # inside a name does not make the row unparseable.
        sp_id, first_sep, rest = line.partition("|")
        name, last_sep, qid = rest.rpartition("|")
        if first_sep and last_sep:
            species.append({"id": sp_id, "name": name, "qid": qid})
    return species


def _load_existing_species_ids() -> set[str]:
    """Return the entity ids of all species that already own an image row."""
    rows = psql("SELECT entity_id FROM images WHERE entity_type = 'species'")
    return {line.strip() for line in rows.split("\n") if line.strip()}


def _thumb_extension(url: str) -> str:
    """Return the storage extension (png/svg/gif, else jpg) for an image URL.

    Only the final suffix of the URL path is considered, so a name such as
    ``Plant.png.jpg`` is stored as ``jpg`` rather than ``png``.
    """
    path = url.split("?", 1)[0].split("#", 1)[0]
    suffix = path.rsplit(".", 1)[-1].lower() if "." in path else ""
    return suffix if suffix in ("png", "svg", "gif") else "jpg"


def main():
    """Import one primary image for every species that does not have one yet.

    For each species with a Wikidata QID and no existing image row, the
    function looks up the image file name on Wikidata, fetches its Commons
    info, checks the licence, downloads the thumbnail, uploads it to S3 and
    inserts a row into ``images``. A summary of all counters is printed at
    the end.
    """
    # 1. Get species
    species_list = _load_species()
    if not species_list:
        print("No species with wikidata_qid found.")
        return
    print(f"Found {len(species_list)} species with Wikidata QIDs.")

    # 2. Get existing images
    existing = _load_existing_species_ids()
    print(f"Found {len(existing)} species that already have images.")

    imported = 0
    skipped_existing = 0
    skipped_no_image = 0
    skipped_license = 0
    skipped_download = 0
    errors = 0

    total = len(species_list)
    for position, sp in enumerate(species_list, start=1):
        name = sp["name"]
        qid = sp["qid"]
        sp_id = sp["id"]
        slug = slugify(name)
        print(f"\n[{position}/{total}] {name} ({qid})")

        if sp_id in existing:
            print(" Already has image, skipping.")
            skipped_existing += 1
            continue

        # Query Wikidata for image
        time.sleep(REQUEST_DELAY)
        filename = get_wikidata_image(qid)
        if not filename:
            print(" No image on Wikidata.")
            skipped_no_image += 1
            continue

        # Get Commons info
        time.sleep(REQUEST_DELAY)
        info = get_commons_info(filename)
        if not info:
            print(f" Could not get Commons info for {filename}")
            skipped_no_image += 1
            continue

        # Check license
        raw_license = info["license"]
        if not is_license_allowed(raw_license):
            print(f" License not allowed: {raw_license}")
            skipped_license += 1
            continue
        norm_license = normalize_license(raw_license)

        artist = info["artist"]
        thumb_url = info["thumb_url"]
        desc_url = info["description_url"]

        # The Commons info may carry neither a thumbnail nor an original URL;
        # without this guard the slicing below would raise on None.
        if not thumb_url:
            print(f" No download URL for {filename}")
            skipped_download += 1
            continue

        print(f" License: {raw_license} -> {norm_license}")
        print(f" Artist: {artist[:80]}")
        print(f" Thumbnail: {thumb_url[:100]}...")

        # Download image
        time.sleep(REQUEST_DELAY)
        image_data = download_image(thumb_url)
        if not image_data:
            print(" Failed to download image.")
            skipped_download += 1
            continue
        print(f" Downloaded {len(image_data)} bytes")

        # Determine file extension from URL
        ext = _thumb_extension(thumb_url)
        s3_key = f"species/{slug}.{ext}"
        content_type = _CONTENT_TYPES.get(ext, "image/jpeg")

        # Upload to S3
        try:
            s3_upload(s3_key, image_data, content_type)
            print(f" Uploaded to s3://{S3_BUCKET}/{s3_key}")
        except RuntimeError as e:
            print(f" S3 upload failed: {e}")
            errors += 1
            continue

        # Insert into database
        caption = f"Photo: {artist}" if artist else "Wikimedia Commons"
        insert_sql = (
            "INSERT INTO images (id, entity_type, entity_id, s3_key, caption, source_url, license, is_primary) "
            f"VALUES (gen_random_uuid(), 'species', {_sql_literal(sp_id)}, {_sql_literal(s3_key)}, "
            f"{_sql_literal(caption)}, {_sql_literal(desc_url)}, {_sql_literal(norm_license)}, true) "
            "RETURNING id"
        )
        # RETURNING makes a successful insert produce output, so an empty
        # result means psql failed (its error was already printed to stderr).
        if not psql(insert_sql):
            print(f" Database insert failed; s3://{S3_BUCKET}/{s3_key} has no images row.")
            errors += 1
            continue
        print(f" Inserted into images table.")
        imported += 1

    print(f"\n{'='*60}")
    print(f"DONE!")
    print(f" Imported: {imported}")
    print(f" Skipped (existing):{skipped_existing}")
    print(f" Skipped (no image):{skipped_no_image}")
    print(f" Skipped (license): {skipped_license}")
    print(f" Skipped (download):{skipped_download}")
    print(f" Errors: {errors}")
    print(f" Total processed: {len(species_list)}")


if __name__ == "__main__":
    main()