#!/usr/bin/env python3
"""Scrape Magic Garden Seeds product pages and update herbapi database."""
import subprocess
import re
import time
import os
import sys
# psql invocation shared by every query: -t (tuples only), -A (unaligned) and
# -F| (pipe as field separator) produce output that is easy to split.
# Host, user and database keep their historical defaults but can be overridden
# with HERBAPI_DB_HOST, HERBAPI_DB_USER and HERBAPI_DB_NAME.
DB_CMD = [
    'psql',
    '-h', os.environ.get('HERBAPI_DB_HOST', '10.31.3.90'),
    '-U', os.environ.get('HERBAPI_DB_USER', 'herbapi'),
    os.environ.get('HERBAPI_DB_NAME', 'herbapi'),
    '-t', '-A', '-F|',
]

# Environment handed to psql; the password travels via PGPASSWORD.
# NOTE(review): the fallback below is a credential committed to source control.
# Set HERBAPI_DB_PASSWORD (or move to ~/.pgpass), rotate the password, and then
# drop the literal.
DB_ENV = {
    **os.environ,
    'PGPASSWORD': os.environ.get(
        'HERBAPI_DB_PASSWORD', '_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj'
    ),
}

# Lower-case English month names mapped to their 1-based month number.
MONTH_MAP = {
    'january': 1, 'february': 2, 'march': 3, 'april': 4,
    'may': 5, 'june': 6, 'july': 7, 'august': 8,
    'september': 9, 'october': 10, 'november': 11, 'december': 12,
}
def run_sql(sql):
    """Run *sql* through the ``psql`` command-line client and return its output.

    The command is ``DB_CMD`` plus ``-c sql``, executed with ``DB_ENV`` as the
    process environment.

    Args:
        sql: SQL text to execute.

    Returns:
        psql's standard output with leading and trailing whitespace removed.
        An empty string means the statement produced no rows.

    Raises:
        RuntimeError: if psql exits with a non-zero status (for example when
            the connection fails). The message carries psql's stderr so the
            caller can report it instead of mistaking the failure for an
            empty result.
    """
    result = subprocess.run(
        DB_CMD + ['-c', sql],
        capture_output=True, text=True, env=DB_ENV
    )
    if result.returncode != 0:
        detail = result.stderr.strip() or 'no error output'
        raise RuntimeError(
            f"psql exited with status {result.returncode}: {detail}"
        )
    return result.stdout.strip()
def fetch_page(url):
    """Download *url* with ``curl`` and return whatever it wrote to stdout.

    curl runs silently (``-s``), follows redirects (``-L``) and is limited to
    15 seconds (``--max-time 15``). Its exit status is not inspected, so a
    failed download simply yields whatever text was captured.
    """
    curl_cmd = ['curl', '-sL', '--max-time', '15', url]
    completed = subprocess.run(curl_cmd, capture_output=True, text=True)
    return completed.stdout
def parse_months(text):
    """Return the sorted month numbers whose English names occur in *text*.

    Matching is a case-insensitive substring search over the names in
    ``MONTH_MAP``. Longer names are tried first, and each matched name is
    removed from the working copy before the next name is tried.

    Returns:
        A sorted list of month numbers, or ``None`` when *text* is empty or
        contains no month name.
    """
    if not text:
        return None

    remaining = text.lower().strip()
    found = []
    longest_first = sorted(MONTH_MAP.items(), key=lambda item: -len(item[0]))
    for name, number in longest_first:
        if name not in remaining:
            continue
        if number not in found:
            found.append(number)
        remaining = remaining.replace(name, '')

    if not found:
        return None
    found.sort()
    return found
def parse_depth(text):
    """Extract a sowing depth in centimetres from *text*.

    A range such as ``"1-2 cm"`` yields the midpoint rounded to one decimal;
    a single value such as ``"0,5 cm"`` is returned as is. Both ``.`` and
    ``,`` are accepted as the decimal separator.

    Returns:
        The depth as a float, or ``None`` when *text* is empty or holds no
        ``cm`` value.
    """
    if not text:
        return None

    def as_float(raw):
        # Normalise a decimal comma before converting.
        return float(raw.replace(',', '.'))

    span = re.search(r'(\d+(?:[.,]\d+)?)\s*-\s*(\d+(?:[.,]\d+)?)\s*cm', text)
    if span is not None:
        low, high = (as_float(part) for part in span.groups())
        return round((low + high) / 2, 1)

    single = re.search(r'(\d+(?:[.,]\d+)?)\s*cm', text)
    if single is None:
        return None
    return as_float(single.group(1))
def parse_spacing(text):
    """Parse planting distance. Returns (row_spacing, plant_spacing).

    Recognised forms, tried in this order (case-insensitive):

    * ``"X x Y cm"`` (also with ``×``): X is the plant spacing, Y the row
      spacing.
    * ``"X - Y cm"``: the midpoint, rounded to one decimal, is the plant
      spacing; no row spacing.
    * ``"X cm"``: X is the plant spacing; no row spacing.

    Numbers may use ``.`` or ``,`` as the decimal separator, matching
    ``parse_depth``. Previously ``"12,5 cm"`` was read as 5.0.

    Returns:
        A ``(row_spacing, plant_spacing)`` tuple of floats; either element is
        ``None`` when it could not be determined.
    """
    if not text:
        return None, None
    text = text.lower().strip()

    number = r'(\d+(?:[.,]\d+)?)'

    def to_float(raw):
        # Normalise a decimal comma before converting.
        return float(raw.replace(',', '.'))

    # "X x Y cm"
    match = re.search(rf'{number}\s*(?:x|×)\s*{number}\s*cm', text)
    if match:
        return to_float(match.group(2)), to_float(match.group(1))

    # "X - Y cm" range -> average as plant spacing
    match = re.search(rf'{number}\s*-\s*{number}\s*cm', text)
    if match:
        low, high = to_float(match.group(1)), to_float(match.group(2))
        return None, round((low + high) / 2, 1)

    # Single value
    match = re.search(rf'{number}\s*cm', text)
    if match:
        return None, to_float(match.group(1))

    return None, None
def parse_germination_days(text):
    """Convert a germination-time phrase into a whole number of days.

    Week expressions are checked before day expressions, and within each unit
    a range (``"2-3 weeks"``) is checked before a single value. Ranges use the
    rounded midpoint; weeks are multiplied by 7.

    Returns:
        The number of days as an int, or ``None`` when *text* is empty or no
        week/day quantity is found.
    """
    if not text:
        return None
    lowered = text.lower()

    # (range pattern, single-value pattern, days per unit), in priority order.
    units = (
        (r'(\d+)\s*-\s*(\d+)\s*weeks?', r'(\d+)\s*weeks?', 7),
        (r'(\d+)\s*-\s*(\d+)\s*days?', r'(\d+)\s*days?', 1),
    )
    for range_pattern, single_pattern, days_per_unit in units:
        hit = re.search(range_pattern, lowered)
        if hit:
            first, second = int(hit.group(1)), int(hit.group(2))
            return int(round((first + second) / 2 * days_per_unit))
        hit = re.search(single_pattern, lowered)
        if hit:
            return int(hit.group(1)) * days_per_unit
    return None
def parse_germ_temp(text):
    """Extract a germination temperature from *text*.

    Only whole numbers followed by a degree sign are recognised. A range such
    as ``"18-22 °C"`` yields the midpoint rounded to one decimal; a single
    value is returned as a float.

    Returns:
        The temperature as a float, or ``None`` when *text* is empty or no
        degree value is found.
    """
    if not text:
        return None

    interval = re.search(r'(\d+)\s*-\s*(\d+)\s*°', text)
    if interval is not None:
        lower, upper = map(float, interval.groups())
        return round((lower + upper) / 2, 1)

    point = re.search(r'(\d+)\s*°', text)
    if point is None:
        return None
    return float(point.group(1))
def parse_lifecycle(text):
    """Classify a lifecycle phrase as perennial or not.

    Returns:
        ``True`` when the text mentions "perennial" (checked first),
        ``False`` when it mentions "annual" or "biennial", and ``None`` when
        *text* is empty or mentions none of these.
    """
    if not text:
        return None
    lowered = text.lower().strip()
    if 'perennial' in lowered:
        return True
    if any(word in lowered for word in ('annual', 'biennial')):
        return False
    return None
def parse_light(text):
    """Normalise a sunlight description to one of a few canonical labels.

    The checks run in priority order: "full sun" together with "partial",
    then "full sun" alone, then "partial"/"semi"/"half", then "shade", then
    any remaining mention of "sun". Text matching none of these is returned
    lower-cased and stripped.

    Returns:
        The normalised label, or ``None`` when *text* is empty.
    """
    if not text:
        return None
    lowered = text.lower().strip()

    mentions_partial = 'partial' in lowered
    if 'full sun' in lowered:
        return 'full sun to partial shade' if mentions_partial else 'full sun'
    if mentions_partial or 'semi' in lowered or 'half' in lowered:
        return 'partial shade'

    for keyword, label in (('shade', 'shade'), ('sun', 'full sun')):
        if keyword in lowered:
            return label
    return lowered
def _strip_markup(fragment):
    """Replace every tag in *fragment* with a space and collapse whitespace."""
    without_tags = re.sub(r'<[^>]+>', ' ', fragment)
    return re.sub(r'\s+', ' ', without_tags).strip()


def _extract_specs(html):
    """Return the page's table cells paired up as ``{lower-cased label: value}``."""
    # NOTE(review): the tag portions of this pattern were missing from the
    # file and have been reconstructed as <td ...>...</td> -- confirm against
    # the live page markup.
    raw_cells = re.findall(r'<td[^>]*>(.*?)</td>', html, re.DOTALL)
    cells = [_strip_markup(cell) for cell in raw_cells]

    specs = {}
    # Cells are consumed two at a time: label cell followed by value cell.
    for label, value in zip(cells[0::2], cells[1::2]):
        key = label.rstrip(':').strip()
        # Skip pairs whose "label" is only digits, punctuation, "€" or "*".
        if key and value and not re.match(r'^[\d,.\s€*]+$', key):
            specs[key.lower()] = value
    return specs


def _extract_description(html):
    """Return the product description text, or ``None`` if none is usable."""
    # NOTE(review): the closing-tag sequence of this pattern and the two
    # element-removal patterns below were missing from the file; they are
    # reconstructed here as a best guess -- confirm against the live page.
    block = re.search(
        r'itemprop="description">(.*?)</div>\s*</div>\s*</div>',
        html, re.DOTALL)
    if block:
        content = block.group(1)
        content = re.sub(r'<script[^>]*>.*?</script>', '', content,
                         flags=re.DOTALL)
        content = re.sub(r'<style[^>]*>.*?</style>', '', content,
                         flags=re.DOTALL)
        content = _strip_markup(content)
        # Cut the text at the first of these markers found after position 0.
        for marker in ['Other names', 'Additional contact mail', 'Question about']:
            cut = content.find(marker)
            if cut > 0:
                content = content[:cut].strip()
        if len(content) > 20:
            return content

    # Fall back to the page's meta description.
    # NOTE(review): the leading "<meta[^>" was reconstructed -- confirm.
    meta = re.search(
        r'<meta[^>]*name="description"[^>]*content="([^"]*)"', html)
    if meta and len(meta.group(1)) > 20:
        return meta.group(1)
    return None


def extract_data(html):
    """Extract cultivar attributes from a product page.

    Args:
        html: Full HTML text of the product page.

    Returns:
        A dict containing only the fields that could be parsed. Possible keys:
        ``description``, ``plant_spacing_cm``, ``row_spacing_cm``,
        ``planting_depth_cm``, ``harvesting_months``, ``direct_sowing_months``,
        ``indoor_sowing_months``, ``perennial``, ``light_requirement``,
        ``days_to_germination`` and ``germination_temp_c``. The dict is empty
        when nothing could be extracted.
    """
    data = {}
    specs = _extract_specs(html)

    description = _extract_description(html)
    if description:
        data['description'] = description

    # Parse specs
    if 'planting distance' in specs:
        row_sp, plant_sp = parse_spacing(specs['planting distance'])
        if plant_sp:
            data['plant_spacing_cm'] = plant_sp
        if row_sp:
            data['row_spacing_cm'] = row_sp

    # An explicit "row spacing" entry overrides the value derived above.
    if 'row spacing' in specs:
        match = re.search(r'(\d+(?:\.\d+)?)\s*cm', specs['row spacing'])
        if match:
            data['row_spacing_cm'] = float(match.group(1))

    if 'sowing depth' in specs:
        depth = parse_depth(specs['sowing depth'])
        if depth is not None:
            data['planting_depth_cm'] = depth

    # Harvesting months - prefer explicit harvest time over flowering.
    # Only the first key present is consulted, even if it fails to parse.
    harvest_key = next(
        (key for key in ('harvest time', 'harvesting months', 'flowering months')
         if key in specs),
        None)
    if harvest_key is not None:
        months = parse_months(specs[harvest_key])
        if months:
            data['harvesting_months'] = months

    if 'when to sow outdoors' in specs:
        months = parse_months(specs['when to sow outdoors'])
        if months:
            data['direct_sowing_months'] = months

    # Use the first indoor-sowing entry that yields at least one month.
    for indoor_key in ['when to sow indoors', 'pre-cultivation indoors']:
        if indoor_key in specs:
            months = parse_months(specs[indoor_key])
            if months:
                data['indoor_sowing_months'] = months
                break

    if 'lifecycle' in specs:
        perennial = parse_lifecycle(specs['lifecycle'])
        if perennial is not None:
            data['perennial'] = perennial

    if 'sunlight' in specs:
        light = parse_light(specs['sunlight'])
        if light:
            data['light_requirement'] = light

    if 'germination time' in specs:
        days = parse_germination_days(specs['germination time'])
        if days:
            data['days_to_germination'] = days

    if 'germination temperature' in specs:
        temp = parse_germ_temp(specs['germination temperature'])
        if temp:
            data['germination_temp_c'] = temp

    return data
def get_current_values(cultivar_id):
    """Fetch the currently stored values of the scraped columns for one cultivar.

    Args:
        cultivar_id: Value matched against ``cultivars.id``.

    Returns:
        A dict mapping column name to the raw text psql printed for it.
        Columns whose output is empty are omitted. An empty dict is returned
        when the query produces no output.
    """
    # ``description`` is free text and may contain the "|" separator, so it is
    # selected last and the split below is capped; any extra pipes then stay
    # inside the description instead of shifting the other columns.
    fields = ['row_spacing_cm', 'plant_spacing_cm', 'planting_depth_cm',
              'perennial', 'harvesting_months', 'direct_sowing_months',
              'light_requirement', 'days_to_germination', 'germination_temp_c',
              'indoor_sowing_months', 'description']

    # Double single quotes so the id cannot terminate the SQL string literal.
    safe_id = str(cultivar_id).replace("'", "''")
    sql = (f"SELECT {', '.join(fields)} "
           f"FROM cultivars WHERE id = '{safe_id}'")

    row = run_sql(sql)
    if not row:
        return {}

    parts = row.split('|', len(fields) - 1)
    current = {}
    for name, raw in zip(fields, parts):
        value = raw.strip()
        if value:
            current[name] = value
    return current
def build_update_sql(cultivar_id, data, current):
    """Build an UPDATE statement that fills in fields the database lacks.

    Args:
        cultivar_id: Value matched against ``cultivars.id``.
        data: Mapping of column name to the newly scraped value. Supported
            value types are ``str``, ``bool``, ``list`` and ``int``/``float``.
        current: Mapping of column name to the value already stored; a column
            with a truthy entry here is left untouched.

    Returns:
        ``(sql, updated_fields)`` where *sql* is the UPDATE statement and
        *updated_fields* lists the columns it sets, in ``data`` order.
        ``(None, [])`` when there is nothing to update.
    """
    assignments = []
    updated_fields = []
    for field, value in data.items():
        if current.get(field):
            continue

        if isinstance(value, str):
            literal = "'" + value.replace("'", "''") + "'"
        elif isinstance(value, bool):
            # bool must be tested before int: bool is a subclass of int.
            literal = 'true' if value else 'false'
        elif isinstance(value, list):
            literal = "'{" + ','.join(str(item) for item in value) + "}'"
        elif isinstance(value, (int, float)):
            literal = str(value)
        else:
            # Unsupported type: emit no SET clause and do not report the
            # field as updated.
            continue

        assignments.append(f"{field} = {literal}")
        updated_fields.append(field)

    if not assignments:
        return None, []

    # Double single quotes so the id cannot terminate the SQL string literal.
    safe_id = str(cultivar_id).replace("'", "''")
    statement = (
        f"UPDATE cultivars SET {', '.join(assignments)} "
        f"WHERE id = '{safe_id}';"
    )
    return statement, updated_fields
def main():
    """Scrape every incomplete Magic Garden Seeds cultivar and update the database.

    Selects cultivars from that supplier that have a product URL and lack a
    row spacing or a description, fetches each product page, extracts what it
    can, and writes only the fields that are still empty. Progress is printed
    per cultivar, followed by a summary.
    """
    query = """
SELECT c.id, c.name, cs.product_url
FROM cultivars c
JOIN cultivar_suppliers cs ON c.id = cs.cultivar_id
JOIN suppliers s ON cs.supplier_id = s.id
WHERE s.name = 'Magic Garden Seeds'
AND cs.product_url IS NOT NULL AND cs.product_url <> ''
AND (c.row_spacing_cm IS NULL OR c.description IS NULL OR c.description = '')
ORDER BY c.name;
"""
    rows = run_sql(query)
    if not rows:
        print("No cultivars to process")
        return

    # One output line per cultivar: id|name|product_url.
    cultivars = []
    for line in rows.strip().split('\n'):
        columns = line.split('|')
        if len(columns) < 3:
            continue
        cultivars.append(
            {'id': columns[0], 'name': columns[1], 'url': columns[2]}
        )

    total = len(cultivars)
    print(f"Processing {total} MGS cultivars...")
    sys.stdout.flush()

    tally = {'updated': 0, 'skipped': 0, 'failed': 0}
    fields_updated = {}

    def handle(cv):
        """Process one cultivar and return the name of the counter to bump."""
        html = fetch_page(cv['url'])
        if not html or len(html) < 1000:
            print("FAILED (empty page)")
            return 'failed'

        data = extract_data(html)
        if not data:
            print("NO DATA")
            return 'skipped'

        current = get_current_values(cv['id'])
        sql_stmt, upd_fields = build_update_sql(cv['id'], data, current)
        if not sql_stmt:
            print("SKIP (all fields populated)")
            return 'skipped'

        run_sql(sql_stmt)
        for name in upd_fields:
            fields_updated[name] = fields_updated.get(name, 0) + 1
        print(f"OK ({len(upd_fields)} fields: {', '.join(upd_fields)})")
        return 'updated'

    for position, cv in enumerate(cultivars, start=1):
        print(f"[{position}/{total}] {cv['name']}...", end=' ', flush=True)
        try:
            outcome = handle(cv)
        except Exception as exc:
            print(f"ERROR: {exc}")
            outcome = 'failed'
        tally[outcome] += 1
        # Pause between cultivars regardless of the outcome.
        time.sleep(0.5)

    print("\n=== MGS Summary ===")
    print(f"Total processed: {total}")
    print(f"Updated: {tally['updated']}")
    print(f"Skipped (all fields already populated): {tally['skipped']}")
    print(f"Failed: {tally['failed']}")
    print("\nFields updated:")
    # Most frequently updated fields first.
    for field, count in sorted(fields_updated.items(), key=lambda x: -x[1]):
        print(f" {field}: {count}")
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()