#!/usr/bin/env python3
"""Import CC-licensed plant images from Wikimedia Commons into HerbAPI."""

import hashlib
import html
import json
import os
import re
import subprocess
import sys
import tempfile
import time
import urllib.parse
import urllib.request

# Config
#
# Every connection setting can be overridden through a HERBAPI_* environment
# variable. The literals below are kept only as fallbacks so existing
# invocations keep working unchanged.
# NOTE(review): the fallback password and S3 keys are real secrets committed
# to source control -- rotate them and supply the values via the environment.
DB_HOST = os.environ.get("HERBAPI_DB_HOST", "10.31.3.90")
DB_USER = os.environ.get("HERBAPI_DB_USER", "herbapi")
DB_PASS = os.environ.get("HERBAPI_DB_PASS", "_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj")
DB_NAME = os.environ.get("HERBAPI_DB_NAME", "herbapi")
S3_BUCKET = os.environ.get("HERBAPI_S3_BUCKET", "herbapi")
S3_ENDPOINT = os.environ.get("HERBAPI_S3_ENDPOINT", "http://10.31.3.170:3900")
# Sent as the User-Agent header on every HTTP request made by fetch_url().
USER_AGENT = "HerbAPI/1.0 (https://herbapi.naturalised.at; florian.berthold@sub-net.at)"
# Seconds to sleep after each remote request.
REQUEST_DELAY = 0.3

# AWS env for subprocess calls
#
# Starts from the current process environment and then forces the S3
# credentials and region, so ambient AWS_* variables never leak into the
# `aws` CLI calls. Dedicated HERBAPI_S3_* variables override the fallbacks.
AWS_ENV = {
    **os.environ,
    "AWS_ACCESS_KEY_ID": os.environ.get(
        "HERBAPI_S3_ACCESS_KEY_ID", "GK1a89859373a6ac56bf11958f"
    ),
    "AWS_SECRET_ACCESS_KEY": os.environ.get(
        "HERBAPI_S3_SECRET_ACCESS_KEY",
        "bea45a333b5c7b1efdd7466bdbcac54d8642fa19f0c617ca2fd64bd07951b899",
    ),
    "AWS_DEFAULT_REGION": "garage",
}

# Stats
#
# Run-wide outcome counters, incremented by process_species() and printed
# by main() at the end of the run.
stats = {
    "total": 0,
    "imported": 0,
    "no_p18": 0,
    "bad_license": 0,
    "download_fail": 0,
    "upload_fail": 0,
    "errors": 0,
}


def fetch_url(url):
    """Download *url* and return the raw response body as bytes.

    The request carries the module's USER_AGENT header and uses a
    30-second timeout. Errors raised by ``urllib`` propagate to the caller.
    """
    headers = {"User-Agent": USER_AGENT}
    request = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(request, timeout=30) as response:
        body = response.read()
    return body


def fetch_json(url):
    """Download *url* via fetch_url() and return the decoded JSON document."""
    payload = fetch_url(url)
    return json.loads(payload)


def psql(sql):
    """Run one SQL command through the ``psql`` CLI and return its output.

    The command is executed with ``-t -A`` (tuples only, unaligned), so rows
    come back one per line with ``|`` between columns. The password is
    passed through the ``PGPASSWORD`` environment variable rather than on
    the command line.

    Args:
        sql: The SQL text handed to ``psql -c``.

    Returns:
        The command's standard output with surrounding whitespace stripped.

    Raises:
        RuntimeError: If ``psql`` exits with a non-zero status. Without this
            check a failed statement is indistinguishable from one that
            simply produced no output.
    """
    result = subprocess.run(
        ["psql", "-h", DB_HOST, "-U", DB_USER, DB_NAME, "-t", "-A", "-c", sql],
        capture_output=True,
        text=True,
        env={**os.environ, "PGPASSWORD": DB_PASS},
    )
    if result.returncode != 0:
        raise RuntimeError(
            f"psql exited with status {result.returncode}: {result.stderr.strip()}"
        )
    return result.stdout.strip()


def is_license_allowed(license_str):
    """Decide whether a Commons license short name permits import.

    Accepted: CC0, Public Domain ("public domain" anywhere, or a "pd"
    prefix), and any "CC BY" / "CC BY-SA" variant written with whitespace
    after "CC" (e.g. 'CC BY-SA 3.0', 'CC BY 4.0').

    Rejected: anything carrying an NC or ND component, plus everything that
    matches none of the accepted forms (GFDL, FAL, Copyrighted free use, ...).
    Empty or missing input is rejected as well.

    Returns:
        True if the license is allowed, False otherwise.
    """
    if not license_str:
        return False

    normalized = license_str.lower().strip()
    words = normalized.split()

    # NonCommercial / NoDerivatives markers veto everything else, whether
    # they appear as a separate word or as a hyphenated suffix.
    restricted = (
        "nc" in words
        or "-nc" in normalized
        or "nd" in words
        or "-nd" in normalized
    )
    if restricted:
        return False

    # Exact spellings of CC0.
    if normalized in ("cc0", "cc-zero", "cc0 1.0", "cc0 1.0 universal"):
        return True

    # Public-domain wording or PD-* template names.
    if normalized.startswith("pd") or "public domain" in normalized:
        return True

    # CC BY-SA and CC BY, any version or jurisdiction.
    attribution_patterns = (r"cc\s+by-sa\b", r"cc\s+by\b")
    return any(re.match(pattern, normalized) for pattern in attribution_patterns)


def get_wikidata_image(qid):
    """Look up the P18 (image) statement of a Wikidata item.

    Args:
        qid: Wikidata item identifier, interpolated into the SPARQL query
            as ``wd:<qid>``.

    Returns:
        The percent-decoded last path segment of the image URL, or None
        when the query returns no binding.
    """
    query = f"SELECT ?image WHERE {{ wd:{qid} wdt:P18 ?image }} LIMIT 1"
    endpoint = f"https://query.wikidata.org/sparql?query={urllib.parse.quote(query)}&format=json"
    rows = fetch_json(endpoint).get("results", {}).get("bindings", [])
    if not rows:
        return None

    # The binding is a URL; the file name is everything after the last "/".
    _, _, encoded_name = rows[0]["image"]["value"].rpartition("/")
    return urllib.parse.unquote(encoded_name)


def get_commons_info(filename):
    """Fetch license, artist and thumbnail data for a Commons file.

    Queries the Commons ``imageinfo`` API for ``File:<filename>``, asking
    for the URL fields and extended metadata with an 800-pixel-wide
    thumbnail (``iiurlwidth=800``).

    Args:
        filename: File name without the ``File:`` prefix.

    Returns:
        A dict with the keys ``license``, ``artist``, ``thumb_url``,
        ``desc_url`` and ``filename``; or None when the API reports the
        page as missing (page id ``"-1"``), returns no pages, or returns a
        page without image info.
    """
    title = f"File:{filename}"
    url = (
        f"https://commons.wikimedia.org/w/api.php?action=query"
        f"&titles={urllib.parse.quote(title)}"
        f"&prop=imageinfo&iiprop=url|extmetadata"
        f"&iiurlwidth=800&format=json"
    )
    data = fetch_json(url)
    pages = data.get("query", {}).get("pages", {})
    if not pages:
        return None

    # Only one title is requested, so only the first page entry is relevant.
    page_id, page = next(iter(pages.items()))
    if page_id == "-1":
        return None

    # Guard against a missing or empty imageinfo list instead of indexing
    # blindly into it.
    revisions = page.get("imageinfo")
    if not revisions:
        return None
    imageinfo = revisions[0]
    meta = imageinfo.get("extmetadata", {})

    license_short = meta.get("LicenseShortName", {}).get("value", "").strip()
    artist_html = meta.get("Artist", {}).get("value", "")

    # The artist field is HTML: drop the tags first, then decode entities
    # (&amp;, &#160;, ...) so they do not end up verbatim in the caption.
    # Decoding after tag removal keeps an escaped "&lt;b&gt;" as literal text.
    artist = re.sub(r"<[^>]+>", "", artist_html)
    artist = html.unescape(artist)
    # Collapse whitespace (including decoded non-breaking spaces) and trim.
    artist = re.sub(r"\s+", " ", artist).strip()
    if len(artist) > 120:
        artist = artist[:117] + "..."

    return {
        "license": license_short,
        "artist": artist,
        # Thumbnail URL generated by the API for the requested width.
        "thumb_url": imageinfo.get("thumburl", ""),
        # URL of the file's description page.
        "desc_url": imageinfo.get("descriptionurl", ""),
        "filename": filename,
    }


def _thumb_extension(thumb_url):
    """Return "png", "gif" or "jpg" based on the thumbnail URL's file suffix."""
    basename = thumb_url.lower().split("?")[0].split("/")[-1]
    # Match the real suffix, not a substring: "x.png.scan.jpg" is a JPEG.
    if basename.endswith(".png"):
        return "png"
    if basename.endswith(".gif"):
        return "gif"
    return "jpg"


def _sql_literal(value):
    """Render *value* as a single-quoted SQL literal with quotes doubled."""
    return "'" + str(value).replace("'", "''") + "'"


def _remove_file(path):
    """Delete *path* if set; a file that is already gone is not an error."""
    if path is None:
        return
    try:
        os.unlink(path)
    except OSError:
        # Best-effort cleanup of a temp file; nothing useful to do on failure.
        pass


def process_species(species_id, slug, name_sci, qid):
    """Import the primary image for one species.

    Steps: look up the P18 image on Wikidata, fetch license/artist/thumbnail
    data from Commons, check the license, download the thumbnail, upload it
    to S3 as ``species/<slug>.<ext>`` and insert a row into ``images``.
    Every outcome is recorded in the module-level ``stats`` counters.

    Args:
        species_id: Value stored in ``images.entity_id``.
        slug: Species slug; used to build the S3 key.
        name_sci: Scientific name. Accepted for interface compatibility;
            not used by this function.
        qid: Wikidata item identifier of the species.

    Returns:
        True if the image was uploaded and the insert did not raise,
        False for every skipped or failed case.
    """
    stats["total"] += 1

    # Step 1: Get image filename from Wikidata
    try:
        filename = get_wikidata_image(qid)
    except Exception as e:
        print(f" ERROR querying Wikidata for {qid}: {e}")
        stats["errors"] += 1
        return False
    time.sleep(REQUEST_DELAY)

    if not filename:
        print(f" No P18 image for {qid}")
        stats["no_p18"] += 1
        return False

    # Step 2: Get Commons info (license, artist, thumb URL)
    try:
        info = get_commons_info(filename)
    except Exception as e:
        print(f" ERROR querying Commons for {filename}: {e}")
        stats["errors"] += 1
        return False
    time.sleep(REQUEST_DELAY)

    if not info:
        print(f" No Commons info for {filename}")
        stats["errors"] += 1
        return False

    # Step 3: Check license
    if not is_license_allowed(info["license"]):
        print(f" Bad license: {info['license']} for {filename}")
        stats["bad_license"] += 1
        return False

    # Step 4: Download thumbnail using API-provided URL
    thumb_url = info["thumb_url"]
    if not thumb_url:
        print(f" No thumbnail URL available for {filename}")
        stats["download_fail"] += 1
        return False

    ext = _thumb_extension(thumb_url)

    # mkstemp creates a uniquely named file atomically, so the path can be
    # neither predicted nor redirected through a slug containing "/". The
    # suffix keeps the extension visible on the file handed to the aws CLI.
    tmp_path = None
    try:
        img_data = fetch_url(thumb_url)
        fd, tmp_path = tempfile.mkstemp(prefix="herbapi_img_", suffix=f".{ext}")
        with os.fdopen(fd, "wb") as f:
            f.write(img_data)
    except Exception as e:
        print(f" ERROR downloading {thumb_url}: {e}")
        stats["download_fail"] += 1
        _remove_file(tmp_path)
        return False
    time.sleep(REQUEST_DELAY)

    # Step 5: Upload to S3
    s3_key = f"species/{slug}.{ext}"
    try:
        result = subprocess.run(
            ["aws", "s3", "cp", tmp_path, f"s3://{S3_BUCKET}/{s3_key}", "--endpoint-url", S3_ENDPOINT],
            capture_output=True, text=True, env=AWS_ENV, timeout=60,
        )
        if result.returncode != 0:
            print(f" S3 upload failed: {result.stderr}")
            stats["upload_fail"] += 1
            return False
    except Exception as e:
        print(f" ERROR uploading to S3: {e}")
        stats["upload_fail"] += 1
        return False
    finally:
        # The local copy is no longer needed whether or not the upload worked.
        _remove_file(tmp_path)

    # Step 6: Insert into DB
    caption = f"Photo: {info['artist']}" if info["artist"] else ""
    source_url = info["desc_url"] or f"https://commons.wikimedia.org/wiki/File:{urllib.parse.quote(filename)}"

    # Every interpolated value goes through the same quoting helper,
    # including the id and the slug-derived key.
    sql = (
        "INSERT INTO images (entity_type, entity_id, s3_key, caption, source_url, license, is_primary) "
        f"VALUES ('species', {_sql_literal(species_id)}, {_sql_literal(s3_key)}, "
        f"{_sql_literal(caption)}, {_sql_literal(source_url)}, {_sql_literal(info['license'])}, true);"
    )
    try:
        psql(sql)
    except Exception as e:
        print(f" ERROR inserting to DB: {e}")
        stats["errors"] += 1
        return False

    stats["imported"] += 1
    return True


def main():
    """Import images for every species that has a Wikidata QID but no image.

    Selects the pending species through psql(), runs process_species() on
    each one in scientific-name order, and prints a summary of the ``stats``
    counters at the end.
    """
    pending_query = (
        "SELECT s.id, s.slug, s.name_scientific, s.wikidata_qid "
        "FROM species s "
        "LEFT JOIN images i ON i.entity_type = 'species' AND i.entity_id = s.id "
        "WHERE s.wikidata_qid IS NOT NULL AND s.wikidata_qid != '' AND i.id IS NULL "
        "ORDER BY s.name_scientific;"
    )
    rows = psql(pending_query)
    if not rows:
        print("No species need images.")
        return

    # psql output is one row per line with "|" between the four columns;
    # lines that do not split into exactly four fields are ignored.
    candidates = (line.strip().split("|") for line in rows.split("\n"))
    species_list = [fields for fields in candidates if len(fields) == 4]
    total = len(species_list)

    print(f"Processing {total} species...\n")

    for position, (sid, slug, name_sci, qid) in enumerate(species_list, 1):
        print(f"[{position}/{total}] {name_sci} ({qid})")
        if process_species(sid, slug, name_sci, qid):
            print(" OK - imported")

    # (label, stats key) pairs in the order they are reported.
    summary = (
        (" Total species processed: ", "total"),
        (" Successfully imported: ", "imported"),
        (" No P18 image: ", "no_p18"),
        (" Bad license (NC/ND/GFDL):", "bad_license"),
        (" Download failures: ", "download_fail"),
        (" Upload failures: ", "upload_fail"),
        (" Other errors: ", "errors"),
    )
    print("\n" + "=" * 50)
    print("RESULTS:")
    for label, key in summary:
        print(f"{label}{stats[key]}")


# Run the import only when executed as a script, not when imported.
if __name__ == "__main__":
    main()