#!/usr/bin/env python3
"""Scrape Magic Garden Seeds product pages and update herbapi database."""

import subprocess
import re
import time
import os
import sys

# NOTE(review): the database host and password are hard-coded in source.
# Consider moving them to the environment or a ~/.pgpass file.
DB_CMD = [
    'psql', '-h', '10.31.3.90', '-U', 'herbapi', 'herbapi',
    '-t', '-A', '-F|'
]
DB_ENV = {**os.environ, 'PGPASSWORD': '_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj'}

MONTH_MAP = {
    'january': 1, 'february': 2, 'march': 3, 'april': 4,
    'may': 5, 'june': 6, 'july': 7, 'august': 8,
    'september': 9, 'october': 10, 'november': 11, 'december': 12,
}

# Month names ordered longest-first; computed once instead of on every call.
_MONTHS_LONGEST_FIRST = sorted(MONTH_MAP.items(), key=lambda x: -len(x[0]))

# A decimal number written with either '.' or ',' as the decimal separator.
_NUM = r'\d+(?:[.,]\d+)?'

# Columns read by get_current_values(), in SELECT order.
_CURRENT_FIELDS = [
    'description', 'row_spacing_cm', 'plant_spacing_cm',
    'planting_depth_cm', 'perennial', 'harvesting_months',
    'direct_sowing_months', 'light_requirement', 'days_to_germination',
    'germination_temp_c', 'indoor_sowing_months',
]


def _to_float(token):
    """Convert a matched number to float, accepting ',' as decimal point."""
    return float(token.replace(',', '.'))


def _sql_quote(text):
    """Return *text* as a single-quoted SQL string literal ('' escaping)."""
    return "'" + str(text).replace("'", "''") + "'"


def run_sql(sql):
    """Run *sql* through psql and return its stripped standard output.

    Raises:
        RuntimeError: if psql exits with a non-zero status. Previously the
            exit status was ignored, so failed statements looked successful.
    """
    result = subprocess.run(
        DB_CMD + ['-c', sql],
        capture_output=True, text=True, env=DB_ENV
    )
    if result.returncode != 0:
        raise RuntimeError(
            f"psql failed (exit {result.returncode}): {result.stderr.strip()}"
        )
    return result.stdout.strip()


def fetch_page(url):
    """Download *url* with curl (following redirects, 15 s limit).

    Returns curl's standard output as text; the exit status is not checked,
    so a failed download yields whatever (possibly empty) output curl wrote.
    """
    result = subprocess.run(
        ['curl', '-sL', '--max-time', '15', url],
        capture_output=True, text=True
    )
    return result.stdout


def parse_months(text):
    """Return the sorted month numbers whose English names occur in *text*.

    Matching is a case-insensitive substring search. Returns None when
    *text* is empty or names no month.
    """
    if not text:
        return None
    text_lower = text.lower().strip()
    months = []
    for month_name, month_num in _MONTHS_LONGEST_FIRST:
        if month_name in text_lower:
            if month_num not in months:
                months.append(month_num)
            # Remove the matched name so it cannot be matched again.
            text_lower = text_lower.replace(month_name, '')
    return sorted(months) if months else None


def parse_depth(text):
    """Parse a sowing depth in cm; an "A - B cm" range yields its average.

    Returns a float, or None when no "<number> cm" is found.
    """
    if not text:
        return None
    match = re.search(rf'({_NUM})\s*-\s*({_NUM})\s*cm', text)
    if match:
        low, high = _to_float(match.group(1)), _to_float(match.group(2))
        return round((low + high) / 2, 1)
    match = re.search(rf'({_NUM})\s*cm', text)
    if match:
        return _to_float(match.group(1))
    return None


def parse_spacing(text):
    """Parse planting distance. Returns (row_spacing, plant_spacing).

    Recognised forms, tried in this order:
      * "X x Y cm"  -> (Y, X)
      * "X - Y cm"  -> (None, average of X and Y)
      * "X cm"      -> (None, X)
    Anything else (or empty input) gives (None, None). Numbers may use
    ',' or '.' as the decimal separator, as in parse_depth().
    """
    if not text:
        return None, None
    text = text.lower().strip()

    # "X x Y cm"
    match = re.search(rf'({_NUM})\s*(?:x|×)\s*({_NUM})\s*cm', text)
    if match:
        return _to_float(match.group(2)), _to_float(match.group(1))

    # "X - Y cm" range -> average as plant spacing
    match = re.search(rf'({_NUM})\s*-\s*({_NUM})\s*cm', text)
    if match:
        low, high = _to_float(match.group(1)), _to_float(match.group(2))
        return None, round((low + high) / 2, 1)

    # Single value
    match = re.search(rf'({_NUM})\s*cm', text)
    if match:
        return None, _to_float(match.group(1))

    return None, None


def parse_germination_days(text):
    """Parse a germination time into whole days.

    Week forms are tried before day forms; ranges are averaged. Returns an
    int, or None when nothing matches.
    """
    if not text:
        return None
    text = text.lower()
    match = re.search(r'(\d+)\s*-\s*(\d+)\s*weeks?', text)
    if match:
        mean_weeks = (int(match.group(1)) + int(match.group(2))) / 2
        return int(round(mean_weeks * 7))
    match = re.search(r'(\d+)\s*weeks?', text)
    if match:
        return int(match.group(1)) * 7
    match = re.search(r'(\d+)\s*-\s*(\d+)\s*days?', text)
    if match:
        return int(round((int(match.group(1)) + int(match.group(2))) / 2))
    match = re.search(r'(\d+)\s*days?', text)
    if match:
        return int(match.group(1))
    return None


def parse_germ_temp(text):
    """Parse a temperature followed by '°'; an "A - B °" range is averaged.

    Returns a float, or None when nothing matches.
    """
    if not text:
        return None
    match = re.search(r'(\d+)\s*-\s*(\d+)\s*°', text)
    if match:
        return round((float(match.group(1)) + float(match.group(2))) / 2, 1)
    match = re.search(r'(\d+)\s*°', text)
    if match:
        return float(match.group(1))
    return None


def parse_lifecycle(text):
    """Return True for perennial, False for annual/biennial, else None."""
    if not text:
        return None
    text = text.lower().strip()
    if 'perennial' in text:
        return True
    if 'annual' in text or 'biennial' in text:
        return False
    return None


def parse_light(text):
    """Normalise a sunlight description to a canonical phrase.

    Text containing none of the known keywords is returned lower-cased and
    stripped; empty input gives None.
    """
    if not text:
        return None
    text = text.lower().strip()
    if 'full sun' in text and 'partial' in text:
        return 'full sun to partial shade'
    if 'full sun' in text:
        return 'full sun'
    if 'partial' in text or 'semi' in text or 'half' in text:
        return 'partial shade'
    if 'shade' in text:
        return 'shade'
    if 'sun' in text:
        return 'full sun'
    return text


def _extract_specs(html):
    """Collect "key: value" pairs from consecutive table cells.

    Cells are paired positionally (1st with 2nd, 3rd with 4th, ...). Keys
    are lower-cased with any trailing ':' removed; pairs whose key is only
    digits/punctuation/currency symbols are dropped.
    """
    # NOTE(review): the tag names in this pattern were missing from the
    # source; <td> is reconstructed from the "table cell" intent. Confirm
    # against a real product page.
    raw_cells = re.findall(r'<td[^>]*>(.*?)</td>', html, re.DOTALL)
    cells = [
        re.sub(r'\s+', ' ', re.sub(r'<[^>]+>', ' ', cell).strip())
        for cell in raw_cells
    ]

    specs = {}
    for raw_key, raw_val in zip(cells[::2], cells[1::2]):
        key = raw_key.rstrip(':').strip()
        val = raw_val.strip()
        if key and val and not re.match(r'^[\d,.\s€*]+$', key):
            specs[key.lower()] = val
    return specs


def _extract_description(html):
    """Return the product description text, or None if none is usable.

    Tries the itemprop="description" element first and falls back to the
    <meta name="description"> tag. Candidates of 20 characters or fewer
    are rejected.
    """
    # NOTE(review): the closing-tag part of this pattern was missing from
    # the source; two consecutive </div> tags is an assumption to verify.
    desc_match = re.search(
        r'itemprop="description">(.*?)</div>\s*</div>', html, re.DOTALL
    )
    if desc_match:
        content = desc_match.group(1)
        # Drop embedded scripts and styles with their contents before
        # stripping the remaining tags.
        content = re.sub(r'<script[^>]*>.*?</script>', '', content,
                         flags=re.DOTALL)
        content = re.sub(r'<style[^>]*>.*?</style>', '', content,
                         flags=re.DOTALL)
        content = re.sub(r'<[^>]+>', ' ', content)
        content = re.sub(r'\s+', ' ', content).strip()
        # Cut everything from the first of these markers onwards.
        for marker in ['Other names', 'Additional contact mail',
                       'Question about']:
            idx = content.find(marker)
            if idx > 0:
                content = content[:idx].strip()
        if len(content) > 20:
            return content

    meta_match = re.search(
        r'<meta[^>]*name="description"[^>]*content="([^"]*)"', html
    )
    if meta_match and len(meta_match.group(1)) > 20:
        return meta_match.group(1)
    return None


def extract_data(html):
    """Extract cultivar attributes from a product page.

    Returns a dict containing only the fields that could be parsed, keyed
    by the column names used in the UPDATE statement: description,
    plant_spacing_cm, row_spacing_cm, planting_depth_cm, harvesting_months,
    direct_sowing_months, indoor_sowing_months, perennial,
    light_requirement, days_to_germination, germination_temp_c.
    """
    data = {}
    specs = _extract_specs(html)

    description = _extract_description(html)
    if description is not None:
        data['description'] = description

    if 'planting distance' in specs:
        row_sp, plant_sp = parse_spacing(specs['planting distance'])
        if plant_sp:
            data['plant_spacing_cm'] = plant_sp
        if row_sp:
            data['row_spacing_cm'] = row_sp

    # An explicit "row spacing" entry overrides the value derived above.
    if 'row spacing' in specs:
        match = re.search(r'(\d+(?:\.\d+)?)\s*cm', specs['row spacing'])
        if match:
            data['row_spacing_cm'] = float(match.group(1))

    if 'sowing depth' in specs:
        depth = parse_depth(specs['sowing depth'])
        if depth is not None:
            data['planting_depth_cm'] = depth

    # Harvesting months - prefer explicit harvest time over flowering.
    # Only the first key that is present is consulted.
    for harvest_key in ('harvest time', 'harvesting months',
                        'flowering months'):
        if harvest_key in specs:
            months = parse_months(specs[harvest_key])
            if months:
                data['harvesting_months'] = months
            break

    if 'when to sow outdoors' in specs:
        months = parse_months(specs['when to sow outdoors'])
        if months:
            data['direct_sowing_months'] = months

    for indoor_key in ['when to sow indoors', 'pre-cultivation indoors']:
        if indoor_key in specs:
            months = parse_months(specs[indoor_key])
            if months:
                data['indoor_sowing_months'] = months
                break

    if 'lifecycle' in specs:
        perennial = parse_lifecycle(specs['lifecycle'])
        if perennial is not None:
            data['perennial'] = perennial

    if 'sunlight' in specs:
        light = parse_light(specs['sunlight'])
        if light:
            data['light_requirement'] = light

    if 'germination time' in specs:
        days = parse_germination_days(specs['germination time'])
        if days:
            data['days_to_germination'] = days

    if 'germination temperature' in specs:
        temp = parse_germ_temp(specs['germination temperature'])
        if temp:
            data['germination_temp_c'] = temp

    return data


def get_current_values(cultivar_id):
    """Fetch the non-empty column values currently stored for a cultivar.

    Returns a dict mapping column name to the raw text psql printed for it;
    empty columns are omitted. Returns {} when psql prints nothing.

    Raises:
        RuntimeError: propagated from run_sql() when psql fails.
    """
    sql = (
        f"SELECT {', '.join(_CURRENT_FIELDS)} "
        f"FROM cultivars WHERE id = {_sql_quote(cultivar_id)}"
    )
    row = run_sql(sql)
    if not row:
        return {}
    # Split from the right with a bounded count: description is the first
    # column and is free text, so any '|' inside it stays in that column
    # instead of shifting every later one.
    parts = row.rsplit('|', len(_CURRENT_FIELDS) - 1)
    current = {}
    for field, raw in zip(_CURRENT_FIELDS, parts):
        val = raw.strip()
        if val:
            current[field] = val
    return current


def _sql_value(value):
    """Render a scraped value as SQL text, or None for unsupported types."""
    if isinstance(value, str):
        return _sql_quote(value)
    # bool must be tested before int because bool is a subclass of int.
    if isinstance(value, bool):
        return 'true' if value else 'false'
    if isinstance(value, list):
        return "'{" + ','.join(str(x) for x in value) + "}'"
    if isinstance(value, (int, float)):
        return str(value)
    return None


def build_update_sql(cultivar_id, data, current):
    """Build an UPDATE that fills only the columns that are still empty.

    Args:
        cultivar_id: id of the row to update.
        data: scraped values keyed by column name.
        current: existing values keyed by column name; any column with a
            truthy entry here is left untouched.

    Returns:
        (sql, updated_fields), or (None, []) when there is nothing to set.
    """
    sets = []
    updated_fields = []
    for field, value in data.items():
        if current.get(field):
            continue
        rendered = _sql_value(value)
        if rendered is None:
            # No SET clause can be produced, so do not report the field.
            continue
        sets.append(f"{field} = {rendered}")
        updated_fields.append(field)

    if not sets:
        return None, []

    sql = (
        f"UPDATE cultivars SET {', '.join(sets)} "
        f"WHERE id = {_sql_quote(cultivar_id)};"
    )
    return sql, updated_fields


def _parse_cultivar_rows(rows):
    """Turn psql 'id|name|url' lines into dicts; malformed lines are skipped.

    The id is taken from the left and the URL from the right so that a '|'
    inside the name does not corrupt the URL.
    """
    cultivars = []
    for line in rows.strip().split('\n'):
        cv_id, first_sep, rest = line.partition('|')
        name, last_sep, url = rest.rpartition('|')
        if first_sep and last_sep:
            cultivars.append({'id': cv_id, 'name': name, 'url': url})
    return cultivars


def _process_cultivar(cv):
    """Scrape and update one cultivar, printing its status.

    Returns (outcome, fields) where outcome is 'failed', 'skipped' or
    'updated' and fields lists the columns written.
    """
    html = fetch_page(cv['url'])
    if not html or len(html) < 1000:
        print("FAILED (empty page)")
        return 'failed', []

    data = extract_data(html)
    if not data:
        print("NO DATA")
        return 'skipped', []

    current = get_current_values(cv['id'])
    sql_stmt, upd_fields = build_update_sql(cv['id'], data, current)
    if not sql_stmt:
        print("SKIP (all fields populated)")
        return 'skipped', []

    run_sql(sql_stmt)
    print(f"OK ({len(upd_fields)} fields: {', '.join(upd_fields)})")
    return 'updated', upd_fields


def main():
    """Update every Magic Garden Seeds cultivar that is missing data.

    Selects cultivars lacking a row spacing or description, scrapes each
    product page, fills the empty columns and prints a summary. Exits with
    status 1 if the initial query fails.
    """
    sql = """
    SELECT c.id, c.name, cs.product_url
    FROM cultivars c
    JOIN cultivar_suppliers cs ON c.id = cs.cultivar_id
    JOIN suppliers s ON cs.supplier_id = s.id
    WHERE s.name = 'Magic Garden Seeds'
      AND cs.product_url IS NOT NULL AND cs.product_url <> ''
      AND (c.row_spacing_cm IS NULL OR c.description IS NULL
           OR c.description = '')
    ORDER BY c.name;
    """
    try:
        rows = run_sql(sql)
    except RuntimeError as e:
        print(f"ERROR: {e}", file=sys.stderr)
        sys.exit(1)
    if not rows:
        print("No cultivars to process")
        return

    cultivars = _parse_cultivar_rows(rows)

    print(f"Processing {len(cultivars)} MGS cultivars...")
    sys.stdout.flush()

    counts = {'updated': 0, 'skipped': 0, 'failed': 0}
    fields_updated = {}

    for i, cv in enumerate(cultivars, start=1):
        print(f"[{i}/{len(cultivars)}] {cv['name']}...", end=' ', flush=True)
        try:
            outcome, upd_fields = _process_cultivar(cv)
        except Exception as e:
            # Top-level boundary: one bad cultivar must not stop the run.
            print(f"ERROR: {e}")
            outcome, upd_fields = 'failed', []
        counts[outcome] += 1
        for f in upd_fields:
            fields_updated[f] = fields_updated.get(f, 0) + 1
        # Pause between requests, whatever the outcome.
        time.sleep(0.5)

    print("\n=== MGS Summary ===")
    print(f"Total processed: {len(cultivars)}")
    print(f"Updated: {counts['updated']}")
    print(f"Skipped (all fields already populated): {counts['skipped']}")
    print(f"Failed: {counts['failed']}")
    print("\nFields updated:")
    for field, count in sorted(fields_updated.items(), key=lambda x: -x[1]):
        print(f"  {field}: {count}")


if __name__ == '__main__':
    main()