# herbapi/tools/enrichment/import_images_v2.py

#!/usr/bin/env python3
"""Import CC-licensed plant images from Wikimedia Commons into HerbAPI."""

import hashlib
import html
import json
import os
import re
import subprocess
import sys
import tempfile
import time
import urllib.parse
import urllib.request

# Config
# Connection settings can be overridden with HERBAPI_* environment variables;
# the literals below are fallbacks so existing invocations keep working.
# SECURITY: DB_PASS is a live credential committed to source. Rotate it,
# supply it through HERBAPI_DB_PASS, and then remove the fallback value.
DB_HOST = os.environ.get("HERBAPI_DB_HOST", "10.31.3.90")
DB_USER = os.environ.get("HERBAPI_DB_USER", "herbapi")
DB_PASS = os.environ.get("HERBAPI_DB_PASS", "_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj")
DB_NAME = os.environ.get("HERBAPI_DB_NAME", "herbapi")
S3_BUCKET = os.environ.get("HERBAPI_S3_BUCKET", "herbapi")
S3_ENDPOINT = os.environ.get("HERBAPI_S3_ENDPOINT", "http://10.31.3.170:3900")
# Sent with every HTTP request made through fetch_url().
USER_AGENT = "HerbAPI/1.0 (https://herbapi.naturalised.at; florian.berthold@sub-net.at)"
# Seconds slept after each remote request.
REQUEST_DELAY = 0.3

# AWS env for subprocess calls
# The caller's environment is inherited, then the S3 credentials are forced so
# that unrelated ambient AWS_* values are never used against this endpoint.
# SECURITY: the fallback key pair is a live credential committed to source.
# Rotate it, supply it through HERBAPI_S3_ACCESS_KEY / HERBAPI_S3_SECRET_KEY,
# and then remove the fallback values.
AWS_ENV = {
    **os.environ,
    "AWS_ACCESS_KEY_ID": os.environ.get(
        "HERBAPI_S3_ACCESS_KEY", "GK1a89859373a6ac56bf11958f"
    ),
    "AWS_SECRET_ACCESS_KEY": os.environ.get(
        "HERBAPI_S3_SECRET_KEY",
        "bea45a333b5c7b1efdd7466bdbcac54d8642fa19f0c617ca2fd64bd07951b899",
    ),
    "AWS_DEFAULT_REGION": "garage",
}

# Stats
# Run-wide counters, all starting at zero; updated by process_species() and
# printed by main().
stats = dict.fromkeys(
    (
        "total",
        "imported",
        "no_p18",
        "bad_license",
        "download_fail",
        "upload_fail",
        "errors",
    ),
    0,
)


def fetch_url(url):
    """Download *url* and return the raw response body as bytes.

    The request carries the module's ``USER_AGENT`` header and uses a
    30 second timeout. Errors raised by ``urllib`` propagate to the caller.
    """
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    response = urllib.request.urlopen(request, timeout=30)
    with response:
        body = response.read()
    return body


def fetch_json(url):
    """Return the document served at *url*, decoded from JSON."""
    raw_body = fetch_url(url)
    return json.loads(raw_body)


def psql(sql):
    """Run *sql* through the ``psql`` command-line client and return its output.

    The client is invoked with ``-t -A`` (tuples only, unaligned), so the
    result is the bare rows, one per line, with surrounding whitespace
    stripped. The password is passed via the ``PGPASSWORD`` environment
    variable rather than on the command line.

    Raises:
        RuntimeError: if psql exits with a non-zero status (connection
            failure, SQL error, ...). Without this check a failed statement
            would be indistinguishable from one that returned no rows.
    """
    result = subprocess.run(
        ["psql", "-h", DB_HOST, "-U", DB_USER, DB_NAME, "-t", "-A", "-c", sql],
        capture_output=True, text=True,
        env={**os.environ, "PGPASSWORD": DB_PASS},
    )
    if result.returncode != 0:
        raise RuntimeError(
            f"psql exited with status {result.returncode}: {result.stderr.strip()}"
        )
    return result.stdout.strip()


# "CC BY" or "CC BY-SA", written with spaces or hyphens ("CC BY-SA 3.0",
# "CC-BY-SA-4.0"). The lookahead requires the name to end there or continue
# with whitespace or "-<digit>", so an unrecognised "CC BY-<something>"
# variant is not mistaken for plain CC BY.
_CC_BY_PATTERN = re.compile(r"cc[\s-]+by(?:-sa)?(?=$|\s|-\d)")


def is_license_allowed(license_str):
    """Check if license is CC0/CC-BY/CC-BY-SA or Public Domain.

    Wikimedia returns things like 'CC BY-SA 3.0', 'CC BY 4.0', 'CC0',
    'Public domain'; hyphenated spellings such as 'CC-BY-SA-4.0' are
    accepted as well.

    We allow CC0, Public Domain, CC BY (any version), CC BY-SA (any version).
    We reject: GFDL, CC BY-NC, CC BY-ND, CC BY-NC-SA, CC BY-NC-ND, FAL,
    Copyrighted free use, and any 'CC BY-<suffix>' whose suffix is not 'SA'
    or a version number.

    Args:
        license_str: License short name; may be empty or None.

    Returns:
        True if the license is on the allow list, otherwise False.
    """
    if not license_str:
        return False
    ls = license_str.lower().strip()

    # Reject NC and ND explicitly first, whether written as a separate word
    # ("cc by nc") or as a hyphenated component ("cc by-nc-sa").
    words = ls.split()
    if any(tag in words or f"-{tag}" in ls for tag in ("nc", "nd")):
        return False

    # Public domain / CC0
    if ls in ("cc0", "cc-zero", "cc0 1.0", "cc0 1.0 universal"):
        return True
    if "public domain" in ls or ls.startswith("pd"):
        return True

    # CC BY / CC BY-SA (any version, any jurisdiction)
    return _CC_BY_PATTERN.match(ls) is not None


def get_wikidata_image(qid):
    """Look up the P18 (image) statement of Wikidata item *qid* via SPARQL.

    Returns the percent-decoded last path segment of the image URL (the
    Commons file name), or None when the query yields no binding.
    """
    query = f"SELECT ?image WHERE {{ wd:{qid} wdt:P18 ?image }} LIMIT 1"
    encoded_query = urllib.parse.quote(query)
    response = fetch_json(
        f"https://query.wikidata.org/sparql?query={encoded_query}&format=json"
    )
    rows = response.get("results", {}).get("bindings", [])
    if rows:
        # The file name is the final path component of the returned URL.
        last_segment = rows[0]["image"]["value"].split("/")[-1]
        return urllib.parse.unquote(last_segment)
    return None


def get_commons_info(filename):
    """Get image info from Commons API: license, artist, thumbnail URL.

    Args:
        filename: Commons file name without the ``File:`` prefix.

    Returns:
        None if the API reports no page (page id "-1") or returns no pages
        at all; otherwise a dict with the keys ``license`` (short license
        name), ``artist`` (plain-text credit, at most 120 characters),
        ``thumb_url`` (800px-wide rendition), ``desc_url`` (file description
        page) and ``filename``. Missing values are empty strings.
    """
    title = f"File:{filename}"
    url = (
        f"https://commons.wikimedia.org/w/api.php?action=query"
        f"&titles={urllib.parse.quote(title)}"
        f"&prop=imageinfo&iiprop=url|extmetadata"
        f"&iiurlwidth=800&format=json"
    )
    data = fetch_json(url)
    pages = data.get("query", {}).get("pages", {})
    # Only one title is queried, so the first page entry decides the result.
    for page_id, page in pages.items():
        if page_id == "-1":
            return None
        # Treat a missing *or empty* imageinfo list as "no metadata" instead
        # of raising IndexError on the empty list.
        imageinfo = (page.get("imageinfo") or [{}])[0]
        meta = imageinfo.get("extmetadata", {})

        license_short = meta.get("LicenseShortName", {}).get("value", "").strip()
        artist_html = meta.get("Artist", {}).get("value", "")

        # Clean up artist: strip HTML tags first, then decode entities such
        # as "&amp;" or "&#160;" so they do not end up verbatim in captions.
        artist = html.unescape(re.sub(r"<[^>]+>", "", artist_html))
        # Collapse whitespace (including decoded non-breaking spaces)
        artist = re.sub(r"\s+", " ", artist).strip()
        if len(artist) > 120:
            artist = artist[:117] + "..."

        # Use the API-provided thumbnail URL (iiurlwidth=800)
        thumb_url = imageinfo.get("thumburl", "")
        # Also get the description URL
        desc_url = imageinfo.get("descriptionurl", "")

        return {
            "license": license_short,
            "artist": artist,
            "thumb_url": thumb_url,
            "desc_url": desc_url,
            "filename": filename,
        }
    return None


def _sql_literal(value):
    """Return *value* as a single-quoted SQL string literal with quotes doubled."""
    return "'" + str(value).replace("'", "''") + "'"


def _thumb_extension(thumb_url):
    """Return "png" or "gif" if the thumbnail file name ends with it, else "jpg"."""
    name = thumb_url.lower().split("?")[0].split("/")[-1]
    for candidate in ("png", "gif"):
        if name.endswith(f".{candidate}"):
            return candidate
    return "jpg"


def _remove_quietly(path):
    """Delete *path*, ignoring the error if it is already gone or not removable."""
    try:
        os.unlink(path)
    except OSError:
        pass


def process_species(species_id, slug, name_sci, qid):
    """Process a single species: fetch image from Wikidata/Commons, upload to S3, insert to DB.

    Args:
        species_id: Primary key of the species row; stored as ``entity_id``.
        slug: Species slug; used to build the S3 key ``species/<slug>.<ext>``.
        name_sci: Scientific name (not used here; kept for the caller's signature).
        qid: Wikidata item id whose P18 image is looked up.

    Returns:
        True if an image was uploaded and its row inserted, False otherwise.
        Every outcome is also recorded in the module-level ``stats`` counters.
    """
    stats["total"] += 1

    # Step 1: Get image filename from Wikidata
    try:
        filename = get_wikidata_image(qid)
    except Exception as e:
        print(f"  ERROR querying Wikidata for {qid}: {e}")
        stats["errors"] += 1
        return False
    time.sleep(REQUEST_DELAY)

    if not filename:
        print(f"  No P18 image for {qid}")
        stats["no_p18"] += 1
        return False

    # Step 2: Get Commons info (license, artist, thumb URL)
    try:
        info = get_commons_info(filename)
    except Exception as e:
        print(f"  ERROR querying Commons for {filename}: {e}")
        stats["errors"] += 1
        return False
    time.sleep(REQUEST_DELAY)

    if not info:
        print(f"  No Commons info for {filename}")
        stats["errors"] += 1
        return False

    # Step 3: Check license
    if not is_license_allowed(info["license"]):
        print(f"  Bad license: {info['license']} for {filename}")
        stats["bad_license"] += 1
        return False

    # Step 4: Download thumbnail using API-provided URL
    thumb_url = info["thumb_url"]
    if not thumb_url:
        print(f"  No thumbnail URL available for {filename}")
        stats["download_fail"] += 1
        return False

    # Determine file extension from the end of the thumbnail file name
    ext = _thumb_extension(thumb_url)

    # mkstemp gives a unique, privately created file, so the path cannot
    # collide between runs and does not depend on the slug's characters.
    tmp_path = None
    try:
        img_data = fetch_url(thumb_url)
        fd, tmp_path = tempfile.mkstemp(prefix="herbapi_img_", suffix=f".{ext}")
        with os.fdopen(fd, "wb") as f:
            f.write(img_data)
    except Exception as e:
        if tmp_path is not None:
            # Do not leave a partially written file behind.
            _remove_quietly(tmp_path)
        print(f"  ERROR downloading {thumb_url}: {e}")
        stats["download_fail"] += 1
        return False
    time.sleep(REQUEST_DELAY)

    # Step 5: Upload to S3
    s3_key = f"species/{slug}.{ext}"
    try:
        result = subprocess.run(
            ["aws", "s3", "cp", tmp_path, f"s3://{S3_BUCKET}/{s3_key}", "--endpoint-url", S3_ENDPOINT],
            capture_output=True, text=True, env=AWS_ENV, timeout=60,
        )
        if result.returncode != 0:
            print(f"  S3 upload failed: {result.stderr}")
            stats["upload_fail"] += 1
            return False
    except Exception as e:
        print(f"  ERROR uploading to S3: {e}")
        stats["upload_fail"] += 1
        return False
    finally:
        # The local copy is no longer needed whether or not the upload worked.
        _remove_quietly(tmp_path)

    # Step 6: Insert into DB
    # NOTE: the object is already in S3 at this point; if the insert raises,
    # the uploaded object is left behind without a matching row.
    caption = f"Photo: {info['artist']}" if info["artist"] else ""
    source_url = info["desc_url"] or f"https://commons.wikimedia.org/wiki/File:{urllib.parse.quote(filename)}"

    # Every interpolated value goes through the same quoting helper, including
    # species_id and s3_key, so a stray apostrophe cannot break the statement.
    values = ", ".join(
        _sql_literal(value)
        for value in ("species", species_id, s3_key, caption, source_url, info["license"])
    )
    sql = (
        f"INSERT INTO images (entity_type, entity_id, s3_key, caption, source_url, license, is_primary) "
        f"VALUES ({values}, true);"
    )
    try:
        psql(sql)
    except Exception as e:
        print(f"  ERROR inserting to DB: {e}")
        stats["errors"] += 1
        return False

    stats["imported"] += 1
    return True


def main():
    """Import an image for each species that has a Wikidata QID but no image row.

    Reads the candidate species via psql(), runs process_species() on each
    one, and finally prints the counters collected in ``stats``.
    """
    # Get species without images
    query = (
        "SELECT s.id, s.slug, s.name_scientific, s.wikidata_qid "
        "FROM species s "
        "LEFT JOIN images i ON i.entity_type = 'species' AND i.entity_id = s.id "
        "WHERE s.wikidata_qid IS NOT NULL AND s.wikidata_qid != '' AND i.id IS NULL "
        "ORDER BY s.name_scientific;"
    )
    rows = psql(query)
    if not rows:
        print("No species need images.")
        return

    # One record per output line, columns separated by "|"; lines that do
    # not split into exactly four fields are skipped.
    split_lines = (raw.strip().split("|") for raw in rows.split("\n"))
    species_list = [fields for fields in split_lines if len(fields) == 4]
    total = len(species_list)

    print(f"Processing {total} species...\n")

    for position, (sid, slug, name_sci, qid) in enumerate(species_list, 1):
        print(f"[{position}/{total}] {name_sci} ({qid})")
        if process_species(sid, slug, name_sci, qid):
            print("  OK - imported")

    print("\n" + "=" * 50)
    print("RESULTS:")
    summary = (
        ("  Total species processed: ", "total"),
        ("  Successfully imported:   ", "imported"),
        ("  No P18 image:            ", "no_p18"),
        ("  Bad license (NC/ND/GFDL):", "bad_license"),
        ("  Download failures:       ", "download_fail"),
        ("  Upload failures:         ", "upload_fail"),
        ("  Other errors:            ", "errors"),
    )
    for label, key in summary:
        print(f"{label}{stats[key]}")


# Run the import only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()