363 lines
11 KiB
Python
363 lines
11 KiB
Python
#!/usr/bin/env python3
|
|
"""Import CC-licensed plant images from Wikimedia Commons via Wikidata into HerbAPI."""
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
import urllib.parse
|
|
import urllib.request
|
|
|
|
# Switch both standard streams to line buffering so every progress line is
# flushed as soon as it is printed, even when output is piped or redirected.
for _stream in (sys.stdout, sys.stderr):
    _stream.reconfigure(line_buffering=True)
del _stream
|
|
|
|
# --- Configuration ---
# Every connection setting can be overridden with a HERBAPI_* environment
# variable; the literals below are only fallbacks so that existing
# deployments keep working unchanged.
# NOTE(review): the fallback credentials are committed in source. They should
# be rotated and the literals removed once the environment supplies them.
S3_ENDPOINT = os.environ.get("HERBAPI_S3_ENDPOINT", "http://garage.sub-net.at:3900")
S3_BUCKET = os.environ.get("HERBAPI_S3_BUCKET", "herbapi")
S3_ACCESS_KEY = os.environ.get("HERBAPI_S3_ACCESS_KEY", "GK1a89859373a6ac56bf11958f")
S3_SECRET_KEY = os.environ.get(
    "HERBAPI_S3_SECRET_KEY",
    "bea45a333b5c7b1efdd7466bdbcac54d8642fa19f0c617ca2fd64bd07951b899",
)
S3_REGION = os.environ.get("HERBAPI_S3_REGION", "garage")

DB_HOST = os.environ.get("HERBAPI_DB_HOST", "10.31.3.90")
DB_USER = os.environ.get("HERBAPI_DB_USER", "herbapi")
DB_PASS = os.environ.get("HERBAPI_DB_PASS", "_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj")
DB_NAME = os.environ.get("HERBAPI_DB_NAME", "herbapi")

# Sent with every HTTP request made by this script.
USER_AGENT = "HerbAPI/1.0 (https://herbapi.naturalised.at; florian.berthold@sub-net.at)"
# Width in pixels requested from the Commons API for the thumbnail rendition.
THUMB_WIDTH = 800
# Pause in seconds inserted before each outgoing HTTP request.
REQUEST_DELAY = 0.3

# Lower-cased license short names that are accepted by exact match.
ALLOWED_LICENSES = {
    "cc0", "cc-zero", "cc0 1.0", "cc-zero 1.0",
    "public domain", "pd", "pd-self", "pd-old", "pd-old-auto", "pd-old-100",
    "pd-us", "pd-usgov", "pd-author",
    "cc by 1.0", "cc by 2.0", "cc by 2.5", "cc by 3.0", "cc by 4.0",
    "cc-by-1.0", "cc-by-2.0", "cc-by-2.5", "cc-by-3.0", "cc-by-4.0",
    "cc by-sa 1.0", "cc by-sa 2.0", "cc by-sa 2.5", "cc by-sa 3.0", "cc by-sa 4.0",
    "cc-by-sa-1.0", "cc-by-sa-2.0", "cc-by-sa-2.5", "cc-by-sa-3.0", "cc-by-sa-4.0",
}
|
|
|
|
|
|
def slugify(name: str) -> str:
    """Turn a scientific name into a lowercase, dash-separated slug.

    Every run of characters other than ``a-z`` and ``0-9`` becomes a single
    dash, and dashes at either end of the result are removed.
    """
    lowered = name.lower()
    dashed = re.sub(r'[^a-z0-9]+', '-', lowered)
    return dashed.strip('-')
|
|
|
|
|
|
def psql(query: str) -> str:
    """Run one SQL command through the ``psql`` CLI and return its stdout.

    The command runs with ``-t -A`` (tuples only, unaligned), so rows come
    back one per line with the default ``|`` field separator.

    Args:
        query: SQL text passed verbatim to ``psql -c``.

    Returns:
        The stripped standard output. When psql exits non-zero the error is
        printed to stderr and whatever stdout was produced (usually an empty
        string) is still returned; no exception is raised for SQL errors.
    """
    env = os.environ.copy()
    # Hand the password over via the environment rather than the command line.
    env["PGPASSWORD"] = DB_PASS
    command = [
        "psql",
        # Skip the user's ~/.psqlrc so local settings cannot change the
        # output format that callers parse.
        "-X",
        "-h", DB_HOST,
        "-U", DB_USER,
        # Name the database with -d instead of a positional argument placed
        # before the options, so parsing does not rely on getopt reordering.
        "-d", DB_NAME,
        "-t", "-A",
        "-c", query,
    ]
    result = subprocess.run(command, capture_output=True, text=True, env=env)
    if result.returncode != 0:
        print(f" psql error: {result.stderr.strip()}", file=sys.stderr)
    return result.stdout.strip()
|
|
|
|
|
|
def fetch_json(url: str) -> dict | None:
    """Download *url* and decode the response body as JSON.

    The request carries the script's User-Agent header and a 30 second
    timeout. Any failure (network, HTTP status, invalid JSON) is reported
    on stdout and results in ``None`` instead of an exception.
    """
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            payload = response.read()
            return json.loads(payload)
    except Exception as exc:
        # Best effort: the caller treats None as "nothing available".
        print(f" HTTP error fetching {url}: {exc}")
    return None
|
|
|
|
|
|
def get_wikidata_image(qid: str) -> str | None:
|
|
"""Query Wikidata SPARQL for P18 image filename."""
|
|
sparql = f"SELECT ?image WHERE {{ wd:{qid} wdt:P18 ?image }} LIMIT 1"
|
|
url = "https://query.wikidata.org/sparql?" + urllib.parse.urlencode({
|
|
"query": sparql, "format": "json"
|
|
})
|
|
data = fetch_json(url)
|
|
if not data:
|
|
return None
|
|
bindings = data.get("results", {}).get("bindings", [])
|
|
if not bindings:
|
|
return None
|
|
image_url = bindings[0]["image"]["value"]
|
|
# URL like http://commons.wikimedia.org/wiki/Special:FilePath/Filename.jpg
|
|
filename = urllib.parse.unquote(image_url.rsplit("/", 1)[-1])
|
|
return filename
|
|
|
|
|
|
def get_commons_info(filename: str) -> dict | None:
    """Fetch URL, license and author metadata for a Commons file.

    Queries the Wikimedia Commons ``imageinfo`` API for ``File:<filename>``
    and asks for a rendition ``THUMB_WIDTH`` pixels wide.

    Args:
        filename: Commons file name without the ``File:`` prefix.

    Returns:
        A dict with the keys ``thumb_url``, ``description_url``, ``license``,
        ``artist`` and ``filename``, or ``None`` when the request fails, the
        file does not exist, or no downloadable URL is reported.
    """
    import html  # local import: only needed to decode entities in the artist

    url = "https://commons.wikimedia.org/w/api.php?" + urllib.parse.urlencode({
        "action": "query",
        "titles": f"File:{filename}",
        "prop": "imageinfo",
        "iiprop": "url|extmetadata",
        "iiurlwidth": str(THUMB_WIDTH),
        "format": "json",
    })
    data = fetch_json(url)
    if not data:
        return None

    pages = data.get("query", {}).get("pages", {})
    # A single title was requested, so only the first page entry is relevant.
    page_id, page = next(iter(pages.items()), (None, None))
    if page is None or page_id == "-1" or "missing" in page:
        return None

    imageinfo = page.get("imageinfo", [])
    if not imageinfo:
        return None
    info = imageinfo[0]
    meta = info.get("extmetadata") or {}

    # Prefer the scaled rendition; fall back to the original file URL.
    thumb_url = info.get("thumburl") or info.get("url")
    if not thumb_url:
        # Without a URL there is nothing to download, and callers slice
        # this value as a string.
        return None
    desc_url = info.get("descriptionurl", "")

    license_short = (meta.get("LicenseShortName") or {}).get("value") or ""
    artist_html = (meta.get("Artist") or {}).get("value") or ""
    # Strip HTML tags first, then decode entities such as "&amp;", so that
    # escaped angle brackets in the text are not mistaken for markup.
    artist = html.unescape(re.sub(r'<[^>]+>', '', artist_html))
    # Collapse runs of whitespace (including non-breaking spaces).
    artist = re.sub(r'\s+', ' ', artist).strip()

    return {
        "thumb_url": thumb_url,
        "description_url": desc_url,
        "license": license_short,
        "artist": artist,
        "filename": filename,
    }
|
|
|
|
|
|
def is_license_allowed(license_str: str) -> bool:
    """Decide whether a Commons license short name permits reuse here.

    Accepted: public-domain / ``PD-*`` marks, CC0 / CC-Zero, and any version
    of CC BY or CC BY-SA. Anything carrying a NonCommercial or NoDerivatives
    restriction is rejected.

    Args:
        license_str: License short name as reported by Commons.

    Returns:
        ``True`` when the license is acceptable, ``False`` otherwise
        (including for an empty string).
    """
    normalized = license_str.lower().strip()
    if not normalized:
        return False

    # Reject NC/ND restrictions. They are matched as whole tokens so that
    # unrelated words merely containing those letters (e.g. "pd-finland")
    # are not rejected by accident.
    restricted = re.search(
        r'(?<![a-z0-9])(?:nc|nd|non[- ]?commercial|no[- ]?deriv[a-z]*)(?![a-z0-9])',
        normalized,
    )
    if restricted:
        return False

    # Direct match
    if normalized in ALLOWED_LICENSES:
        return True

    # Check patterns
    if normalized.startswith(("public domain", "pd")):
        return True
    # CC BY and CC BY-SA, any version, with "-" or " " as separators.
    if re.match(r'^cc[- ]?by[- ]?(?:sa[- ]?)?\d', normalized):
        return True
    # CC0, also spelled "CC-Zero" / "CC Zero", matching what
    # normalize_license() recognises.
    if re.match(r'^cc[- ]?(?:0|zero)', normalized):
        return True
    return False
|
|
|
|
|
|
def normalize_license(license_str: str) -> str:
    """Map a license short name onto the canonical label that gets stored.

    Public-domain variants become ``"Public domain"``, CC0 variants become
    ``"CC0 1.0"``, and CC BY / CC BY-SA keep only their version number.
    Strings that match none of these are returned unchanged.
    """
    key = license_str.lower().strip()

    if key.startswith("pd") or "public domain" in key:
        return "Public domain"

    looks_like_cc0 = (
        re.match(r'^cc[- ]?0', key) is not None
        or "cc-zero" in key
        or "cc zero" in key
    )
    if looks_like_cc0:
        return "CC0 1.0"

    # BY-SA has to be tried before plain BY.
    versioned = (
        ("CC BY-SA", r'^cc[- ]?by[- ]?sa[- ]?(\d+\.?\d*)'),
        ("CC BY", r'^cc[- ]?by[- ]?(\d+\.?\d*)'),
    )
    for label, pattern in versioned:
        found = re.match(pattern, key)
        if found:
            return f"{label} {found.group(1)}"

    return license_str
|
|
|
|
|
|
def s3_upload(s3_key: str, data: bytes, content_type: str = "image/jpeg"):
    """Upload *data* to the configured S3 bucket with the AWS CLI.

    The bytes are written to a private temporary file, copied with
    ``aws s3 cp`` against ``S3_ENDPOINT``, and the temporary file is removed
    again whether or not the upload succeeded.

    Args:
        s3_key: Object key inside ``S3_BUCKET``.
        data: Raw object content.
        content_type: MIME type stored with the object.

    Raises:
        RuntimeError: If the AWS CLI cannot be started or exits non-zero.
    """
    import tempfile  # local import: only this function needs it

    env = os.environ.copy()
    env["AWS_ACCESS_KEY_ID"] = S3_ACCESS_KEY
    env["AWS_SECRET_ACCESS_KEY"] = S3_SECRET_KEY
    env["AWS_DEFAULT_REGION"] = S3_REGION

    # mkstemp creates a uniquely named file readable only by this user, which
    # avoids clashes between concurrent runs and a predictable /tmp path.
    fd, tmp_path = tempfile.mkstemp(prefix="herbapi_upload_")
    try:
        with os.fdopen(fd, "wb") as f:
            f.write(data)
        try:
            result = subprocess.run(
                [
                    "aws", "s3", "cp", tmp_path,
                    f"s3://{S3_BUCKET}/{s3_key}",
                    "--endpoint-url", S3_ENDPOINT,
                    "--content-type", content_type,
                ],
                capture_output=True, text=True, env=env
            )
        except OSError as err:
            # E.g. the aws executable is not installed; surface it as the
            # exception type callers already handle.
            raise RuntimeError(f"S3 upload failed: {err}") from err
    finally:
        try:
            os.unlink(tmp_path)
        except FileNotFoundError:
            pass

    if result.returncode != 0:
        raise RuntimeError(f"S3 upload failed: {result.stderr.strip()}")
|
|
|
|
|
|
def download_image(url: str) -> bytes | None:
    """Fetch the raw bytes behind *url*.

    Uses the script's User-Agent header and a 60 second timeout. Any failure
    is reported on stdout and yields ``None`` rather than an exception.
    """
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        with urllib.request.urlopen(request, timeout=60) as response:
            payload = response.read()
    except Exception as exc:
        # Best effort: the caller counts None as a skipped download.
        print(f" Download error: {exc}")
        return None
    return payload
|
|
|
|
|
|
def _sql_literal(value: str) -> str:
    """Quote *value* as a SQL string literal by doubling single quotes."""
    # Relies on PostgreSQL's standard_conforming_strings behaviour, where a
    # backslash inside '...' has no special meaning.
    return "'" + str(value).replace("'", "''") + "'"


def _thumb_extension(thumb_url: str) -> str:
    """Return the file extension to store, based on the URL path's suffix."""
    suffix = os.path.splitext(urllib.parse.urlparse(thumb_url).path)[1].lower()
    # Anything that is not PNG/SVG/GIF is stored as JPEG, as before.
    return {".png": "png", ".svg": "svg", ".gif": "gif"}.get(suffix, "jpg")


def main():
    """Import one Commons image per species that does not have one yet.

    For every species row with a Wikidata QID: look up the item's P18 image,
    fetch its Commons metadata, skip it unless the license is allowed,
    download the thumbnail, upload it to S3 under ``species/<slug>.<ext>``
    and record it in the ``images`` table. Progress and a final summary are
    printed to stdout.
    """
    # 1. Get species
    rows = psql(
        "SELECT id, name_scientific, wikidata_qid FROM species "
        "WHERE wikidata_qid IS NOT NULL AND wikidata_qid <> '' "
        "ORDER BY name_scientific"
    )
    if not rows:
        print("No species with wikidata_qid found.")
        return

    species_list = []
    for line in rows.split("\n"):
        parts = line.split("|")
        # psql -A separates fields with "|"; ignore anything that does not
        # split into exactly the three selected columns.
        if len(parts) == 3:
            species_list.append({
                "id": parts[0],
                "name": parts[1],
                "qid": parts[2],
            })

    print(f"Found {len(species_list)} species with Wikidata QIDs.")

    # 2. Get existing images
    existing_rows = psql("SELECT entity_id FROM images WHERE entity_type = 'species'")
    existing = {line.strip() for line in existing_rows.split("\n") if line.strip()}

    print(f"Found {len(existing)} species that already have images.")

    imported = 0
    skipped_existing = 0
    skipped_no_image = 0
    skipped_license = 0
    skipped_download = 0
    errors = 0

    content_types = {
        "jpg": "image/jpeg",
        "png": "image/png",
        "svg": "image/svg+xml",
        "gif": "image/gif",
    }

    for i, sp in enumerate(species_list, start=1):
        name = sp["name"]
        qid = sp["qid"]
        sp_id = sp["id"]
        slug = slugify(name)

        print(f"\n[{i}/{len(species_list)}] {name} ({qid})")

        if sp_id in existing:
            print(" Already has image, skipping.")
            skipped_existing += 1
            continue

        # Query Wikidata for image
        time.sleep(REQUEST_DELAY)
        filename = get_wikidata_image(qid)
        if not filename:
            print(" No image on Wikidata.")
            skipped_no_image += 1
            continue

        # Get Commons info
        time.sleep(REQUEST_DELAY)
        info = get_commons_info(filename)
        if not info:
            print(f" Could not get Commons info for {filename}")
            skipped_no_image += 1
            continue

        # Check license
        raw_license = info["license"]
        if not is_license_allowed(raw_license):
            print(f" License not allowed: {raw_license}")
            skipped_license += 1
            continue

        norm_license = normalize_license(raw_license)
        artist = info["artist"]
        thumb_url = info["thumb_url"]
        desc_url = info["description_url"]

        if not thumb_url:
            # The metadata carried no URL at all; nothing can be downloaded.
            print(f" Could not get Commons info for {filename}")
            skipped_no_image += 1
            continue

        print(f" License: {raw_license} -> {norm_license}")
        print(f" Artist: {artist[:80]}")
        print(f" Thumbnail: {thumb_url[:100]}...")

        # Download image
        time.sleep(REQUEST_DELAY)
        image_data = download_image(thumb_url)
        if not image_data:
            print(" Failed to download image.")
            skipped_download += 1
            continue

        print(f" Downloaded {len(image_data)} bytes")

        # Determine file extension from the URL path's final suffix
        ext = _thumb_extension(thumb_url)
        s3_key = f"species/{slug}.{ext}"
        content_type = content_types.get(ext, "image/jpeg")

        # Upload to S3
        try:
            s3_upload(s3_key, image_data, content_type)
            print(f" Uploaded to s3://{S3_BUCKET}/{s3_key}")
        except RuntimeError as e:
            print(f" S3 upload failed: {e}")
            errors += 1
            continue

        # Insert into database
        caption = f"Photo: {artist}" if artist else "Wikimedia Commons"
        insert_sql = (
            "INSERT INTO images (id, entity_type, entity_id, s3_key, caption, source_url, license, is_primary) "
            f"VALUES (gen_random_uuid(), 'species', {_sql_literal(sp_id)}, {_sql_literal(s3_key)}, "
            f"{_sql_literal(caption)}, {_sql_literal(desc_url)}, {_sql_literal(norm_license)}, true) "
            "RETURNING id"
        )

        # RETURNING makes a successful insert print the new row id, so empty
        # output means the statement failed (psql() already logged why).
        if not psql(insert_sql):
            print(" Database insert failed.")
            errors += 1
            continue

        print(" Inserted into images table.")
        imported += 1

    print(f"\n{'='*60}")
    print("DONE!")
    print(f" Imported: {imported}")
    print(f" Skipped (existing):{skipped_existing}")
    print(f" Skipped (no image):{skipped_no_image}")
    print(f" Skipped (license): {skipped_license}")
    print(f" Skipped (download):{skipped_download}")
    print(f" Errors: {errors}")
    print(f" Total processed: {len(species_list)}")
|
|
|
|
|
|
# Run the import only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|