#!/usr/bin/env python3
"""Import CC-licensed plant images from Wikimedia Commons into HerbAPI.

For every species row that has a Wikidata QID but no image yet, the script:

1. asks Wikidata for the species' P18 (image) file name,
2. asks the Commons API for that file's license, artist and an 800px thumbnail,
3. skips the file unless the license is CC0 / Public Domain / CC BY / CC BY-SA,
4. downloads the thumbnail and uploads it to the S3 bucket via the ``aws`` CLI,
5. inserts a row into the ``images`` table via the ``psql`` CLI.
"""

import hashlib
import html
import json
import os
import re
import subprocess
import sys
import tempfile
import time
import urllib.parse
import urllib.request

# Config
# NOTE(review): the database password and S3 keys below are hard-coded in
# source. They are kept unchanged so the script keeps working, but they should
# be moved to the environment or a secrets store and rotated.
DB_HOST = "10.31.3.90"
DB_USER = "herbapi"
DB_PASS = "_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj"
DB_NAME = "herbapi"
S3_BUCKET = "herbapi"
S3_ENDPOINT = "http://10.31.3.170:3900"
USER_AGENT = "HerbAPI/1.0 (https://herbapi.naturalised.at; florian.berthold@sub-net.at)"
REQUEST_DELAY = 0.3

# AWS env for subprocess calls
AWS_ENV = {
    **os.environ,
    "AWS_ACCESS_KEY_ID": "GK1a89859373a6ac56bf11958f",
    "AWS_SECRET_ACCESS_KEY": "bea45a333b5c7b1efdd7466bdbcac54d8642fa19f0c617ca2fd64bd07951b899",
    "AWS_DEFAULT_REGION": "garage",
}

# Stats
stats = {
    "total": 0,
    "imported": 0,
    "no_p18": 0,
    "bad_license": 0,
    "download_fail": 0,
    "upload_fail": 0,
    "errors": 0,
}

# Wikidata item identifiers: a capital "Q" followed by digits.
_QID_RE = re.compile(r"Q\d+")


def fetch_url(url):
    """Fetch *url* with the project User-Agent and return the raw body bytes.

    Raises whatever ``urllib.request.urlopen`` raises (HTTP errors, timeouts).
    """
    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    with urllib.request.urlopen(req, timeout=30) as resp:
        return resp.read()


def fetch_json(url):
    """Fetch *url* and return its body parsed as JSON."""
    return json.loads(fetch_url(url))


def psql(sql):
    """Run a single SQL command through the ``psql`` CLI.

    Returns the command's stdout (tuples-only, unaligned), stripped.

    Raises:
        RuntimeError: if ``psql`` exits with a non-zero status. Without this
            check a failed statement would be indistinguishable from an empty
            result, and failed INSERTs would be counted as successful.
    """
    result = subprocess.run(
        [
            "psql",
            "-h", DB_HOST,
            "-U", DB_USER,
            DB_NAME,
            "-v", "ON_ERROR_STOP=1",
            "-t",
            "-A",
            "-c", sql,
        ],
        capture_output=True,
        text=True,
        env={**os.environ, "PGPASSWORD": DB_PASS},
    )
    if result.returncode != 0:
        raise RuntimeError(
            f"psql exited with status {result.returncode}: {result.stderr.strip()}"
        )
    return result.stdout.strip()


def _sql_literal(value):
    """Render *value* as a single-quoted SQL string literal.

    Single quotes are doubled and NUL bytes dropped. This assumes the server
    runs with ``standard_conforming_strings = on`` (the PostgreSQL default),
    so backslashes need no escaping.
    """
    text = str(value).replace("\x00", "")
    return "'" + text.replace("'", "''") + "'"


def is_license_allowed(license_str):
    """Return True if the license is CC0, Public Domain, CC BY or CC BY-SA.

    Wikimedia returns short names like 'CC BY-SA 3.0', 'CC BY 4.0', 'CC0',
    'Public domain', and also hyphenated forms such as 'CC-BY-SA-3.0'.
    Any version or jurisdiction of CC BY / CC BY-SA is accepted.

    Rejected: anything with an NC or ND component, and everything that is not
    explicitly recognised (GFDL, FAL, 'Copyrighted free use', ...). An empty
    or missing license string is rejected.
    """
    if not license_str:
        return False

    ls = license_str.lower().strip()
    words = ls.split()

    # Reject NC and ND explicitly first, so "CC BY-NC" never reaches the
    # permissive "CC BY" match below.
    if "nc" in words or "-nc" in ls or "nd" in words or "-nd" in ls:
        return False

    # Public domain / CC0
    if ls in ("cc0", "cc-zero", "cc0 1.0", "cc0 1.0 universal"):
        return True
    if "public domain" in ls or ls.startswith("pd"):
        return True

    # CC BY-SA / CC BY, separated from "cc" by whitespace or hyphens.
    if re.match(r"cc[\s-]+by-sa\b", ls):
        return True
    if re.match(r"cc[\s-]+by\b", ls):
        return True

    return False


def get_wikidata_image(qid):
    """Query Wikidata SPARQL for the P18 image file name of item *qid*.

    Returns the Commons file name (URL-decoded, without a ``File:`` prefix),
    or None when the item has no P18 statement.

    Raises:
        ValueError: if *qid* is not of the form ``Q<digits>``; it is
            interpolated into the SPARQL query, so anything else is refused.
    """
    qid = qid.strip()
    if not _QID_RE.fullmatch(qid):
        raise ValueError(f"invalid Wikidata QID: {qid!r}")

    sparql = f"SELECT ?image WHERE {{ wd:{qid} wdt:P18 ?image }} LIMIT 1"
    query = urllib.parse.urlencode({"query": sparql, "format": "json"})
    data = fetch_json(f"https://query.wikidata.org/sparql?{query}")

    bindings = data.get("results", {}).get("bindings", [])
    if not bindings:
        return None

    image_url = bindings[0]["image"]["value"]
    # Extract filename from commons URL
    return urllib.parse.unquote(image_url.split("/")[-1])


def get_commons_info(filename):
    """Get image info from the Commons API: license, artist, thumbnail URL.

    Returns a dict with keys ``license``, ``artist``, ``thumb_url``,
    ``desc_url`` and ``filename``, or None when the file is unknown to
    Commons or the response carries no image info.
    """
    query = urllib.parse.urlencode(
        {
            "action": "query",
            "titles": f"File:{filename}",
            "prop": "imageinfo",
            "iiprop": "url|extmetadata",
            "iiurlwidth": "800",
            "format": "json",
        }
    )
    data = fetch_json(f"https://commons.wikimedia.org/w/api.php?{query}")

    pages = data.get("query", {}).get("pages", {})
    # Only one title was requested, so only the first page entry matters.
    for page_id, page in pages.items():
        if page_id == "-1":
            return None

        imageinfo_list = page.get("imageinfo")
        if not imageinfo_list:
            return None
        imageinfo = imageinfo_list[0]
        meta = imageinfo.get("extmetadata", {})

        license_short = meta.get("LicenseShortName", {}).get("value", "").strip()
        artist_html = meta.get("Artist", {}).get("value", "")

        # Clean up artist: strip HTML tags, decode entities such as "&amp;",
        # then collapse whitespace.
        artist = re.sub(r"<[^>]+>", "", artist_html)
        artist = html.unescape(artist).strip()
        artist = re.sub(r"\s+", " ", artist)
        if len(artist) > 120:
            artist = artist[:117] + "..."

        return {
            "license": license_short,
            "artist": artist,
            # API-provided thumbnail URL (iiurlwidth=800)
            "thumb_url": imageinfo.get("thumburl", ""),
            "desc_url": imageinfo.get("descriptionurl", ""),
            "filename": filename,
        }

    return None


def _thumbnail_extension(thumb_url):
    """Return 'png', 'gif' or 'jpg' based on the thumbnail URL's file suffix."""
    name = thumb_url.lower().split("?")[0].split("/")[-1]
    if name.endswith(".png"):
        return "png"
    if name.endswith(".gif"):
        return "gif"
    return "jpg"


def _remove_quietly(path):
    """Best-effort removal of a temporary file; a missing file is not an error."""
    if path is None:
        return
    try:
        os.unlink(path)
    except OSError:
        pass


def process_species(species_id, slug, name_sci, qid):
    """Process a single species: fetch image from Wikidata/Commons, upload to S3, insert to DB.

    Updates the module-level ``stats`` counters and prints progress lines.
    Returns True when the image was uploaded and its row inserted, False on
    any skip or failure. *name_sci* is accepted for the caller's convenience
    and is not used here.
    """
    stats["total"] += 1

    # Step 1: Get image filename from Wikidata
    try:
        filename = get_wikidata_image(qid)
    except Exception as e:
        print(f" ERROR querying Wikidata for {qid}: {e}")
        stats["errors"] += 1
        return False
    time.sleep(REQUEST_DELAY)

    if not filename:
        print(f" No P18 image for {qid}")
        stats["no_p18"] += 1
        return False

    # Step 2: Get Commons info (license, artist, thumb URL)
    try:
        info = get_commons_info(filename)
    except Exception as e:
        print(f" ERROR querying Commons for {filename}: {e}")
        stats["errors"] += 1
        return False
    time.sleep(REQUEST_DELAY)

    if not info:
        print(f" No Commons info for {filename}")
        stats["errors"] += 1
        return False

    # Step 3: Check license
    if not is_license_allowed(info["license"]):
        print(f" Bad license: {info['license']} for {filename}")
        stats["bad_license"] += 1
        return False

    # Step 4: Download thumbnail using API-provided URL
    thumb_url = info["thumb_url"]
    if not thumb_url:
        print(f" No thumbnail URL available for {filename}")
        stats["download_fail"] += 1
        return False

    ext = _thumbnail_extension(thumb_url)
    s3_key = f"species/{slug}.{ext}"

    # A uniquely named temp file avoids a predictable /tmp path and does not
    # break when the slug contains path separators. The single ``finally``
    # removes it on every exit path, including a partially written download.
    tmp_path = None
    try:
        try:
            img_data = fetch_url(thumb_url)
            with tempfile.NamedTemporaryFile(
                prefix="herbapi_img_", suffix=f".{ext}", delete=False
            ) as tmp:
                tmp_path = tmp.name
                tmp.write(img_data)
        except Exception as e:
            print(f" ERROR downloading {thumb_url}: {e}")
            stats["download_fail"] += 1
            return False
        time.sleep(REQUEST_DELAY)

        # Step 5: Upload to S3
        try:
            result = subprocess.run(
                [
                    "aws", "s3", "cp", tmp_path, f"s3://{S3_BUCKET}/{s3_key}",
                    "--endpoint-url", S3_ENDPOINT,
                ],
                capture_output=True,
                text=True,
                env=AWS_ENV,
                timeout=60,
            )
        except Exception as e:
            print(f" ERROR uploading to S3: {e}")
            stats["upload_fail"] += 1
            return False
        if result.returncode != 0:
            print(f" S3 upload failed: {result.stderr}")
            stats["upload_fail"] += 1
            return False
    finally:
        _remove_quietly(tmp_path)

    # Step 6: Insert into DB
    caption = f"Photo: {info['artist']}" if info["artist"] else ""
    source_url = (
        info["desc_url"]
        or f"https://commons.wikimedia.org/wiki/File:{urllib.parse.quote(filename)}"
    )
    # Every interpolated value goes through _sql_literal, including the ones
    # that come from our own database (species_id, slug inside s3_key).
    sql = (
        "INSERT INTO images (entity_type, entity_id, s3_key, caption, source_url, license, is_primary) "
        f"VALUES ('species', {_sql_literal(species_id)}, {_sql_literal(s3_key)}, "
        f"{_sql_literal(caption)}, {_sql_literal(source_url)}, "
        f"{_sql_literal(info['license'])}, true);"
    )
    try:
        psql(sql)
    except Exception as e:
        print(f" ERROR inserting to DB: {e}")
        stats["errors"] += 1
        return False

    stats["imported"] += 1
    return True


def main():
    """Import images for all species that have a Wikidata QID but no image."""
    # Get species without images
    try:
        rows = psql(
            "SELECT s.id, s.slug, s.name_scientific, s.wikidata_qid "
            "FROM species s "
            "LEFT JOIN images i ON i.entity_type = 'species' AND i.entity_id = s.id "
            "WHERE s.wikidata_qid IS NOT NULL AND s.wikidata_qid != '' AND i.id IS NULL "
            "ORDER BY s.name_scientific;"
        )
    except (RuntimeError, OSError) as e:
        # A failed query must not be reported as "nothing to do".
        print(f"ERROR querying species list: {e}", file=sys.stderr)
        sys.exit(1)

    if not rows:
        print("No species need images.")
        return

    species_list = []
    for line in rows.split("\n"):
        line = line.strip()
        if not line:
            continue
        parts = line.split("|")
        if len(parts) == 4:
            species_list.append(parts)
        else:
            # psql's unaligned output uses "|" as separator; a value that
            # itself contains "|" cannot be split reliably, so skip it loudly.
            print(f"WARNING: skipping unparseable row: {line!r}", file=sys.stderr)

    print(f"Processing {len(species_list)} species...\n")

    for i, (sid, slug, name_sci, qid) in enumerate(species_list, 1):
        print(f"[{i}/{len(species_list)}] {name_sci} ({qid})")
        ok = process_species(sid, slug, name_sci, qid)
        if ok:
            print(" OK - imported")

    print(f"\n{'='*50}")
    print("RESULTS:")
    print(f" Total species processed: {stats['total']}")
    print(f" Successfully imported: {stats['imported']}")
    print(f" No P18 image: {stats['no_p18']}")
    print(f" Bad license (NC/ND/GFDL):{stats['bad_license']}")
    print(f" Download failures: {stats['download_fail']}")
    print(f" Upload failures: {stats['upload_fail']}")
    print(f" Other errors: {stats['errors']}")


if __name__ == "__main__":
    main()