# herbapi/tools/scrapers/scrape_mgs.py

#!/usr/bin/env python3
"""Scrape Magic Garden Seeds product pages and update herbapi database."""

import subprocess
import re
import time
import os
import sys

# Base psql invocation. -t (tuples only) and -A (unaligned) together with
# -F| make every result row a plain '|'-separated line that is easy to split.
DB_CMD = [
    'psql', '-h', '10.31.3.90', '-U', 'herbapi', 'herbapi',
    '-t', '-A', '-F|'
]
# Environment for the psql child process; psql reads the password from
# PGPASSWORD.
# SECURITY: a credential is committed in source here. Set HERBAPI_DB_PASSWORD
# in the environment to override it; the literal below is kept only as a
# backward-compatible fallback and should be rotated and removed.
DB_ENV = {
    **os.environ,
    'PGPASSWORD': os.environ.get(
        'HERBAPI_DB_PASSWORD', '_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj'
    ),
}

# Lowercase English month names mapped to their calendar number (1-12),
# inserted in calendar order.
MONTH_MAP = {
    month_name: month_number
    for month_number, month_name in enumerate(
        (
            'january', 'february', 'march', 'april',
            'may', 'june', 'july', 'august',
            'september', 'october', 'november', 'december',
        ),
        start=1,
    )
}


def run_sql(sql):
    """Run *sql* through psql and return its stripped standard output.

    Args:
        sql: SQL text handed to psql with ``-c``. It is sent as-is, so the
            caller is responsible for escaping any interpolated values.

    Returns:
        psql's stdout with leading/trailing whitespace removed. With the
        flags in DB_CMD, rows are newline-separated and columns are
        '|'-separated.

    Raises:
        RuntimeError: if psql exits with a non-zero status (connection
            failure, SQL error, ...). Previously such failures were silent
            and indistinguishable from an empty result.
    """
    result = subprocess.run(
        DB_CMD + ['-c', sql],
        capture_output=True, text=True, env=DB_ENV
    )
    if result.returncode != 0:
        raise RuntimeError(
            f"psql exited with status {result.returncode}: {result.stderr.strip()}"
        )
    return result.stdout.strip()


def fetch_page(url):
    """Download *url* with curl and return the response body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The body decoded as UTF-8 (undecodable bytes are replaced rather
        than raising). An empty string is returned when curl produced no
        output, e.g. on timeout or on an HTTP error status.
    """
    result = subprocess.run(
        # -s: silent, -L: follow redirects, --fail: produce no body and a
        # non-zero exit on HTTP errors so an error page is never mistaken
        # for a product page.
        ['curl', '-sL', '--fail', '--max-time', '15', url],
        # Decode explicitly instead of relying on the locale's preferred
        # encoding, which may not be able to represent the page.
        capture_output=True, encoding='utf-8', errors='replace'
    )
    return result.stdout


def parse_months(text):
    """Return the sorted month numbers whose English names occur in *text*.

    Matching is a case-insensitive substring search against MONTH_MAP.
    Returns None when *text* is empty or names no month.
    """
    if not text:
        return None

    remaining = text.lower().strip()
    found = set()
    # Longest names first; each matched name is removed from the working
    # string before shorter names are tried.
    longest_first = sorted(
        MONTH_MAP.items(), key=lambda item: len(item[0]), reverse=True
    )
    for name, number in longest_first:
        if name not in remaining:
            continue
        found.add(number)
        remaining = remaining.replace(name, '')

    return sorted(found) or None


def parse_depth(text):
    """Parse a sowing depth in centimetres from free text.

    Accepts a range ("1 - 2 cm", averaged and rounded to one decimal) or a
    single value ("0,5 cm"). Decimals may use '.' or ','. Ranges may be
    written with a hyphen, en dash or em dash; without the dash variants a
    typographic range would silently yield only its upper bound.

    Returns:
        The depth as a float, or None if *text* is empty or has no
        "<number> cm" value.
    """
    if not text:
        return None

    number = r'(\d+(?:[.,]\d+)?)'
    range_match = re.search(
        number + r'\s*[-\u2013\u2014]\s*' + number + r'\s*cm', text
    )
    if range_match:
        low, high = (float(g.replace(',', '.')) for g in range_match.groups())
        return round((low + high) / 2, 1)

    single_match = re.search(number + r'\s*cm', text)
    if single_match:
        return float(single_match.group(1).replace(',', '.'))
    return None


def parse_spacing(text):
    """Parse planting distance. Returns (row_spacing, plant_spacing).

    Recognised forms, tried in this order:
      * "X x Y cm" (also "X × Y cm" and "X cm x Y cm"): X is returned as the
        plant spacing and Y as the row spacing.
      * "X - Y cm": the average, rounded to one decimal, as plant spacing.
      * "X cm": X as plant spacing.

    Decimals may use '.' or ',' (matching parse_depth; previously "2,5 cm"
    was read as 5.0) and ranges may use a hyphen, en dash or em dash.
    Values are in centimetres. Returns (None, None) when nothing matches.
    """
    if not text:
        return None, None
    text = text.lower().strip()

    number = r'(\d+(?:[.,]\d+)?)'

    def to_float(raw):
        return float(raw.replace(',', '.'))

    # "X x Y cm", optionally with the unit repeated after X
    match = re.search(
        number + r'\s*(?:cm\s*)?(?:x|×)\s*' + number + r'\s*cm', text
    )
    if match:
        return to_float(match.group(2)), to_float(match.group(1))

    # "X - Y cm" range -> average as plant spacing
    match = re.search(
        number + r'\s*[-\u2013\u2014]\s*' + number + r'\s*cm', text
    )
    if match:
        average = (to_float(match.group(1)) + to_float(match.group(2))) / 2
        return None, round(average, 1)

    # Single value
    match = re.search(number + r'\s*cm', text)
    if match:
        return None, to_float(match.group(1))
    return None, None


def parse_germination_days(text):
    """Parse a germination time from free text and return it in days.

    Weeks are tried before days, and within each unit a range ("2-3 weeks")
    is tried before a single value ("10 days"). A range is averaged and
    rounded to a whole number of days. Ranges may be written with a hyphen,
    en dash or em dash; without the dash variants a typographic range would
    silently yield only its upper bound.

    Returns:
        The number of days as an int, or None if *text* is empty or
        contains no recognised duration.
    """
    if not text:
        return None
    text = text.lower()

    range_sep = r'\s*[-\u2013\u2014]\s*'
    for unit, days_per_unit in ((r'weeks?', 7), (r'days?', 1)):
        match = re.search(r'(\d+)' + range_sep + r'(\d+)\s*' + unit, text)
        if match:
            midpoint = (int(match.group(1)) + int(match.group(2))) / 2
            return int(round(midpoint * days_per_unit))
        match = re.search(r'(\d+)\s*' + unit, text)
        if match:
            return int(match.group(1)) * days_per_unit
    return None


def parse_germ_temp(text):
    """Parse a germination temperature from free text.

    Accepts a range ("18-22°C", averaged and rounded to one decimal) or a
    single value ("20 °C"). The number must be followed by a degree sign.
    Decimals with '.' or ',' are supported (previously "18.5 °C" was read
    as 5.0), and ranges may use a hyphen, en dash or em dash.

    Returns:
        The temperature as a float, or None if *text* is empty or has no
        "<number>°" value.
    """
    if not text:
        return None

    number = r'(\d+(?:[.,]\d+)?)'

    def to_float(raw):
        return float(raw.replace(',', '.'))

    match = re.search(
        number + r'\s*[-\u2013\u2014]\s*' + number + r'\s*°', text
    )
    if match:
        return round((to_float(match.group(1)) + to_float(match.group(2))) / 2, 1)

    match = re.search(number + r'\s*°', text)
    if match:
        return to_float(match.group(1))
    return None


def parse_lifecycle(text):
    """Classify a lifecycle description as perennial or not.

    Returns True if the text mentions "perennial", False if it mentions
    "annual" or "biennial" (and not "perennial"), and None when the text is
    empty or mentions none of these words. Matching is case-insensitive.
    """
    if not text:
        return None

    normalized = text.lower().strip()
    if 'perennial' in normalized:
        return True

    short_lived = any(word in normalized for word in ('annual', 'biennial'))
    return False if short_lived else None


def parse_light(text):
    """Normalise a sunlight description to a canonical label.

    Possible labels are 'full sun to partial shade', 'full sun',
    'partial shade' and 'shade'. Text that matches none of the known
    keywords is returned lowercased and stripped. Returns None for empty
    input.
    """
    if not text:
        return None

    lowered = text.lower().strip()

    if 'full sun' in lowered:
        return 'full sun to partial shade' if 'partial' in lowered else 'full sun'

    if any(word in lowered for word in ('partial', 'semi', 'half')):
        return 'partial shade'

    # Remaining keywords, checked in priority order.
    for keyword, label in (('shade', 'shade'), ('sun', 'full sun')):
        if keyword in lowered:
            return label

    return lowered


def _clean_fragment(fragment):
    """Turn an HTML fragment into plain text.

    Tags are replaced by spaces, HTML entities (``&amp;``, ``&deg;``,
    ``&nbsp;`` ...) are decoded, and runs of whitespace are collapsed.
    """
    # Function-scope import: callers use `html` as a parameter name, so the
    # module is never bound to that name here.
    from html import unescape

    text = re.sub(r'<[^>]+>', ' ', fragment)
    # Decode after stripping tags so an escaped "&lt;b&gt;" is kept as text.
    text = unescape(text)
    return re.sub(r'\s+', ' ', text).strip()


def _extract_specs(page):
    """Return {lowercased label: value} built from consecutive <td> pairs."""
    cells = [
        _clean_fragment(cell)
        for cell in re.findall(r'<td[^>]*>(.*?)</td>', page, re.DOTALL)
    ]
    specs = {}
    # Cells are consumed two at a time: (label, value), (label, value), ...
    for i in range(0, len(cells) - 1, 2):
        key = cells[i].rstrip(':').strip()
        val = cells[i + 1].strip()
        # Skip pairs whose "label" consists only of digits, punctuation,
        # whitespace, '€' or '*'.
        if key and val and not re.match(r'^[\d,.\s€*]+$', key):
            specs[key.lower()] = val
    return specs


def _extract_description(page):
    """Return the product description text, or None if none is usable."""
    from html import unescape

    desc_match = re.search(
        r'itemprop="description">(.*?)</div>\s*</div>\s*</div>', page, re.DOTALL
    )
    if desc_match:
        content = desc_match.group(1)
        # Drop embedded CSS/JS bodies before stripping the remaining tags.
        content = re.sub(r'<style[^>]*>.*?</style>', '', content, flags=re.DOTALL)
        content = re.sub(r'<script[^>]*>.*?</script>', '', content, flags=re.DOTALL)
        content = _clean_fragment(content)
        # Cut the text at the first occurrence of each trailing-section marker.
        for marker in ['Other names', 'Additional contact mail', 'Question about']:
            idx = content.find(marker)
            if idx > 0:
                content = content[:idx].strip()
        if len(content) > 20:
            return content

    # Fall back to the page's <meta name="description"> tag.
    meta_match = re.search(
        r'<meta[^>]*name="description"[^>]*content="([^"]*)"', page
    )
    if meta_match:
        meta_text = unescape(meta_match.group(1))
        if len(meta_text) > 20:
            return meta_text
    return None


def extract_data(html):
    """Extract cultivar attributes from a product page.

    The page's table cells are read as label/value pairs and the recognised
    labels are parsed into typed values; the description is taken from the
    ``itemprop="description"`` block or, failing that, the meta description.
    HTML entities are decoded in all extracted text.

    Args:
        html: Raw HTML of the product page.

    Returns:
        A dict containing only the fields that could be extracted. Possible
        keys: description, plant_spacing_cm, row_spacing_cm,
        planting_depth_cm, harvesting_months, direct_sowing_months,
        indoor_sowing_months, perennial, light_requirement,
        days_to_germination, germination_temp_c.
    """
    data = {}
    specs = _extract_specs(html)

    description = _extract_description(html)
    if description is not None:
        data['description'] = description

    # Parse specs
    if 'planting distance' in specs:
        row_sp, plant_sp = parse_spacing(specs['planting distance'])
        if plant_sp:
            data['plant_spacing_cm'] = plant_sp
        if row_sp:
            data['row_spacing_cm'] = row_sp

    # An explicit row spacing overrides one derived from the planting distance.
    if 'row spacing' in specs:
        match = re.search(r'(\d+(?:\.\d+)?)\s*cm', specs['row spacing'])
        if match:
            data['row_spacing_cm'] = float(match.group(1))

    if 'sowing depth' in specs:
        depth = parse_depth(specs['sowing depth'])
        if depth is not None:
            data['planting_depth_cm'] = depth

    # Harvesting months - prefer explicit harvest time over flowering.
    # Only the first label that is present is consulted, even if its value
    # turns out to contain no month.
    for harvest_key in ('harvest time', 'harvesting months', 'flowering months'):
        if harvest_key in specs:
            months = parse_months(specs[harvest_key])
            if months:
                data['harvesting_months'] = months
            break

    if 'when to sow outdoors' in specs:
        months = parse_months(specs['when to sow outdoors'])
        if months:
            data['direct_sowing_months'] = months

    # Unlike harvesting, the second label is tried when the first one is
    # present but yields no month.
    for indoor_key in ['when to sow indoors', 'pre-cultivation indoors']:
        if indoor_key in specs:
            months = parse_months(specs[indoor_key])
            if months:
                data['indoor_sowing_months'] = months
                break

    if 'lifecycle' in specs:
        perennial = parse_lifecycle(specs['lifecycle'])
        if perennial is not None:
            data['perennial'] = perennial

    if 'sunlight' in specs:
        light = parse_light(specs['sunlight'])
        if light:
            data['light_requirement'] = light

    if 'germination time' in specs:
        days = parse_germination_days(specs['germination time'])
        if days:
            data['days_to_germination'] = days

    if 'germination temperature' in specs:
        temp = parse_germ_temp(specs['germination temperature'])
        if temp:
            data['germination_temp_c'] = temp

    return data


def get_current_values(cultivar_id):
    """Fetch the currently stored, non-empty scraped fields of one cultivar.

    Args:
        cultivar_id: Primary key of the row in ``cultivars``.

    Returns:
        A dict mapping field name to the value's psql text representation,
        containing only fields whose text is non-empty. An empty dict is
        returned when the query yields no output.
    """
    fields = ['description', 'row_spacing_cm', 'plant_spacing_cm', 'planting_depth_cm',
              'perennial', 'harvesting_months', 'direct_sowing_months', 'light_requirement',
              'days_to_germination', 'germination_temp_c', 'indoor_sowing_months']
    # The free-text description is selected LAST and the split below is
    # bounded, so a '|' inside the description can no longer shift the
    # other columns out of position.
    select_order = [f for f in fields if f != 'description'] + ['description']

    # Double any single quote so the id cannot terminate the SQL literal.
    safe_id = str(cultivar_id).replace("'", "''")
    sql = f"""SELECT {', '.join(select_order)}
              FROM cultivars WHERE id = '{safe_id}'"""
    row = run_sql(sql)
    if not row:
        return {}

    parts = row.split('|', len(select_order) - 1)
    # zip() stops at the shorter sequence, so missing trailing columns are
    # simply absent from the mapping.
    raw = dict(zip(select_order, parts))

    current = {}
    for field in fields:  # keep the original field order in the result
        val = raw.get(field, '').strip()
        if val:
            current[field] = val
    return current


def _sql_literal(value):
    """Render *value* as a SQL literal, or return None for unsupported types."""
    if isinstance(value, str):
        return "'" + value.replace("'", "''") + "'"
    # bool must be tested before int because bool is a subclass of int.
    if isinstance(value, bool):
        return 'true' if value else 'false'
    if isinstance(value, list):
        # Rendered in PostgreSQL array-literal syntax, e.g. '{6,7,8}'.
        return "'{" + ','.join(str(x) for x in value) + "}'"
    if isinstance(value, (int, float)):
        return str(value)
    return None


def build_update_sql(cultivar_id, data, current):
    """Build an UPDATE statement that fills in only the missing fields.

    Args:
        cultivar_id: Primary key of the row to update.
        data: Newly scraped values keyed by column name. Supported value
            types are str, bool, list, int and float; values of any other
            type are skipped.
        current: Existing values keyed by column name; a field with a
            truthy entry here is left untouched.

    Returns:
        ``(sql, updated_fields)`` where *sql* is the UPDATE statement and
        *updated_fields* lists the columns it sets, in *data* order.
        ``(None, [])`` when there is nothing to update.

    Note:
        Column names are interpolated verbatim, so *data* keys must be
        trusted identifiers. String values and the id have their single
        quotes doubled.
    """
    sets = []
    updated_fields = []
    for field, value in data.items():
        if current.get(field):
            continue

        literal = _sql_literal(value)
        if literal is None:
            # Unsupported type: emit no SET clause and do not report the
            # field as updated.
            continue
        sets.append(f"{field} = {literal}")
        updated_fields.append(field)

    if not sets:
        return None, []

    safe_id = str(cultivar_id).replace("'", "''")
    return f"UPDATE cultivars SET {', '.join(sets)} WHERE id = '{safe_id}';", updated_fields


def _process_cultivar(cv):
    """Scrape one cultivar, apply its update, and print the outcome.

    Returns:
        ``(outcome, updated_fields)`` where *outcome* is one of 'updated',
        'skipped' or 'failed'. Exceptions are left to the caller.
    """
    html = fetch_page(cv['url'])
    # Very short responses are treated as failed downloads.
    if not html or len(html) < 1000:
        print("FAILED (empty page)")
        return 'failed', []

    data = extract_data(html)
    if not data:
        print("NO DATA")
        return 'skipped', []

    current = get_current_values(cv['id'])
    sql_stmt, upd_fields = build_update_sql(cv['id'], data, current)
    if not sql_stmt:
        print("SKIP (all fields populated)")
        return 'skipped', []

    run_sql(sql_stmt)
    print(f"OK ({len(upd_fields)} fields: {', '.join(upd_fields)})")
    return 'updated', upd_fields


def main():
    """Scrape every Magic Garden Seeds cultivar that is missing data.

    Selects cultivars of that supplier that have a product URL and lack a
    row spacing or a description, scrapes each product page, fills in the
    fields that are still empty, and prints per-cultivar progress followed
    by a summary.
    """
    sql = """
    SELECT c.id, c.name, cs.product_url
    FROM cultivars c
    JOIN cultivar_suppliers cs ON c.id = cs.cultivar_id
    JOIN suppliers s ON cs.supplier_id = s.id
    WHERE s.name = 'Magic Garden Seeds'
    AND cs.product_url IS NOT NULL AND cs.product_url <> ''
    AND (c.row_spacing_cm IS NULL OR c.description IS NULL OR c.description = '')
    ORDER BY c.name;
    """
    rows = run_sql(sql)
    if not rows:
        print("No cultivars to process")
        return

    cultivars = []
    for line in rows.strip().split('\n'):
        # Take the id from the left and the URL from the right so that a
        # '|' inside the cultivar name cannot corrupt either of them.
        cultivar_id, id_sep, rest = line.partition('|')
        name, url_sep, url = rest.rpartition('|')
        if id_sep and url_sep:
            cultivars.append({
                'id': cultivar_id,
                'name': name,
                'url': url
            })

    print(f"Processing {len(cultivars)} MGS cultivars...")
    sys.stdout.flush()

    counts = {'updated': 0, 'skipped': 0, 'failed': 0}
    fields_updated = {}

    for position, cv in enumerate(cultivars, start=1):
        print(f"[{position}/{len(cultivars)}] {cv['name']}...", end=' ', flush=True)

        try:
            outcome, upd_fields = _process_cultivar(cv)
        except Exception as e:
            # Best effort: one broken page or query must not abort the run.
            print(f"ERROR: {e}")
            outcome, upd_fields = 'failed', []

        counts[outcome] += 1
        for f in upd_fields:
            fields_updated[f] = fields_updated.get(f, 0) + 1

        # Pause between cultivars regardless of the outcome.
        time.sleep(0.5)

    print("\n=== MGS Summary ===")
    print(f"Total processed: {len(cultivars)}")
    print(f"Updated: {counts['updated']}")
    print(f"Skipped (all fields already populated): {counts['skipped']}")
    print(f"Failed: {counts['failed']}")
    print("\nFields updated:")
    for field, count in sorted(fields_updated.items(), key=lambda x: -x[1]):
        print(f"  {field}: {count}")


# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()