381 lines
12 KiB
Python
381 lines
12 KiB
Python
#!/usr/bin/env python3
|
||
"""Scrape Magic Garden Seeds product pages and update herbapi database."""
|
||
|
||
import subprocess
|
||
import re
|
||
import time
|
||
import os
|
||
import sys
|
||
|
||
# psql command line shared by every query in this script.
# HERBAPI_DB_HOST may override the default host. The -t -A -F| flags request
# tuples-only, unaligned output with '|' between columns, which is the format
# the row-splitting code in this file relies on.
DB_CMD = [
    'psql', '-h', os.environ.get('HERBAPI_DB_HOST', '10.31.3.90'),
    '-U', 'herbapi', 'herbapi',
    '-t', '-A', '-F|',
]
# Environment handed to psql: the caller's environment plus PGPASSWORD.
# SECURITY: a credential committed to source control should be treated as
# leaked. Set HERBAPI_DB_PASSWORD to supply the password from outside the
# source; the literal below is kept only as a backward-compatible fallback
# and should be rotated and removed.
DB_ENV = {
    **os.environ,
    'PGPASSWORD': os.environ.get(
        'HERBAPI_DB_PASSWORD', '_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj'
    ),
}
|
||
|
||
# Lower-case English month names mapped to their calendar number (1-12),
# in calendar order.
MONTH_MAP = {
    name: number
    for number, name in enumerate(
        (
            'january', 'february', 'march', 'april',
            'may', 'june', 'july', 'august',
            'september', 'october', 'november', 'december',
        ),
        start=1,
    )
}
|
||
|
||
|
||
def run_sql(sql):
    """Run one SQL statement through the psql CLI and return its output.

    Args:
        sql: SQL text handed to psql via ``-c``.

    Returns:
        psql's standard output with leading/trailing whitespace removed.
        An empty string means the statement produced no rows.

    Raises:
        RuntimeError: if psql exits with a non-zero status (connection
            failure, SQL error, ...). The message carries psql's stderr so
            the failure is not mistaken for "no rows".
    """
    result = subprocess.run(
        DB_CMD + ['-c', sql],
        capture_output=True, text=True, env=DB_ENV,
    )
    # Without this check a failed UPDATE looked exactly like a successful
    # one, because only stdout was ever inspected.
    if result.returncode != 0:
        raise RuntimeError(
            f"psql exited with status {result.returncode}: "
            f"{result.stderr.strip()}"
        )
    return result.stdout.strip()
|
||
|
||
|
||
def fetch_page(url):
    """Download ``url`` with curl and return the response body as text.

    curl runs silently (``-s``), follows redirects (``-L``) and gives up
    after 15 seconds (``--max-time 15``).

    Args:
        url: Address of the page to fetch.

    Returns:
        The body as a string; empty when curl produced no output. curl's
        exit status is not inspected, so callers must validate the content.
    """
    result = subprocess.run(
        ['curl', '-sL', '--max-time', '15', url],
        capture_output=True,
        # Decode explicitly instead of relying on the locale encoding, and
        # replace undecodable bytes so one stray byte in a page cannot
        # raise UnicodeDecodeError and discard the whole page.
        encoding='utf-8', errors='replace',
    )
    return result.stdout
|
||
|
||
|
||
def parse_months(text):
    """Return the sorted month numbers whose English names occur in ``text``.

    Matching is case-insensitive and only recognises the full month names
    listed in MONTH_MAP. Returns None for empty input or when no month name
    is found.
    """
    if not text:
        return None

    remaining = text.lower().strip()
    found = set()
    # Longest names are tried first; each matched name is removed from the
    # working copy before the next name is tested.
    for name in sorted(MONTH_MAP, key=lambda candidate: -len(candidate)):
        if name not in remaining:
            continue
        found.add(MONTH_MAP[name])
        remaining = remaining.replace(name, '')

    return sorted(found) or None
|
||
|
||
|
||
def parse_depth(text):
    """Extract a sowing depth in centimetres from free text.

    A range such as "1-2 cm" yields the midpoint rounded to one decimal; a
    single value such as "0,5 cm" yields that value. Both '.' and ',' are
    accepted as the decimal separator. Returns None for empty input or when
    no "<number> cm" pattern is present.
    """
    if not text:
        return None

    def to_number(raw):
        # Normalise a decimal comma before converting.
        return float(raw.replace(',', '.'))

    span = re.search(r'(\d+(?:[.,]\d+)?)\s*-\s*(\d+(?:[.,]\d+)?)\s*cm', text)
    if span is not None:
        low, high = (to_number(part) for part in span.groups())
        return round((low + high) / 2, 1)

    single = re.search(r'(\d+(?:[.,]\d+)?)\s*cm', text)
    return to_number(single.group(1)) if single else None
|
||
|
||
|
||
def parse_spacing(text):
    """Parse a planting-distance string into ``(row_spacing, plant_spacing)``.

    Recognised forms, tried in this order (case-insensitive):

    * ``"X x Y cm"`` (``x`` or ``×``): returns ``(Y, X)``.
    * ``"X - Y cm"``: returns ``(None, midpoint)`` rounded to one decimal.
    * ``"X cm"``: returns ``(None, X)``.

    Numbers may use '.' or ',' as the decimal separator, matching
    parse_depth; previously "2,5 cm" was read as 5.0.

    Args:
        text: Raw spec value, may be None or empty.

    Returns:
        A 2-tuple of floats/None; ``(None, None)`` when nothing matches.
    """
    if not text:
        return None, None
    text = text.lower().strip()

    number = r'(\d+(?:[.,]\d+)?)'

    def to_float(raw):
        # Normalise a decimal comma before converting.
        return float(raw.replace(',', '.'))

    # "X x Y cm"
    grid = re.search(number + r'\s*(?:x|×)\s*' + number + r'\s*cm', text)
    if grid:
        return to_float(grid.group(2)), to_float(grid.group(1))

    # "X - Y cm" range -> average as plant spacing
    span = re.search(number + r'\s*-\s*' + number + r'\s*cm', text)
    if span:
        midpoint = (to_float(span.group(1)) + to_float(span.group(2))) / 2
        return None, round(midpoint, 1)

    # Single value
    single = re.search(number + r'\s*cm', text)
    if single:
        return None, to_float(single.group(1))

    return None, None
|
||
|
||
|
||
def parse_germination_days(text):
    """Convert a germination-time description into a whole number of days.

    Week-based wording is tried before day-based wording, and for each unit
    a range ("2-3 weeks") is tried before a single value ("3 weeks"). Ranges
    yield the rounded midpoint; weeks are multiplied by 7. Returns None for
    empty input or when neither unit is found.
    """
    if not text:
        return None
    lowered = text.lower()

    # (range pattern, single-value pattern, days per unit), in priority order.
    units = (
        (r'(\d+)\s*-\s*(\d+)\s*weeks?', r'(\d+)\s*weeks?', 7),
        (r'(\d+)\s*-\s*(\d+)\s*days?', r'(\d+)\s*days?', 1),
    )
    for range_pattern, single_pattern, days_per_unit in units:
        span = re.search(range_pattern, lowered)
        if span:
            low, high = map(int, span.groups())
            return int(round((low + high) / 2 * days_per_unit))
        single = re.search(single_pattern, lowered)
        if single:
            return int(single.group(1)) * days_per_unit
    return None
|
||
|
||
|
||
def parse_germ_temp(text):
    """Extract a germination temperature from text containing a degree sign.

    A range such as "18-22°C" yields the midpoint rounded to one decimal; a
    single value such as "20°C" yields that value as a float. Only whole
    numbers followed by '°' are recognised. Returns None otherwise.
    """
    if not text:
        return None

    span = re.search(r'(\d+)\s*-\s*(\d+)\s*°', text)
    if span is not None:
        low, high = (float(part) for part in span.groups())
        return round((low + high) / 2, 1)

    single = re.search(r'(\d+)\s*°', text)
    if single is None:
        return None
    return float(single.group(1))
|
||
|
||
|
||
def parse_lifecycle(text):
    """Map a lifecycle description to a perennial flag.

    Returns True when the text mentions "perennial", False when it mentions
    "annual" or "biennial" (and not "perennial"), and None when the input is
    empty or contains none of those words. Matching is case-insensitive.
    """
    if not text:
        return None

    lowered = text.lower().strip()
    # "perennial" takes precedence over the non-perennial keywords.
    if 'perennial' in lowered:
        return True
    if any(word in lowered for word in ('annual', 'biennial')):
        return False
    return None
|
||
|
||
|
||
def parse_light(text):
    """Normalise a sunlight description to one of a few canonical labels.

    Possible results are 'full sun to partial shade', 'full sun',
    'partial shade' and 'shade'. Text that matches none of the known
    keywords is returned lower-cased and stripped. Empty input gives None.
    """
    if not text:
        return None

    label = text.lower().strip()
    mentions_partial = 'partial' in label

    if 'full sun' in label:
        return 'full sun to partial shade' if mentions_partial else 'full sun'
    if mentions_partial or 'semi' in label or 'half' in label:
        return 'partial shade'
    # Remaining keywords, checked in priority order.
    for keyword, canonical in (('shade', 'shade'), ('sun', 'full sun')):
        if keyword in label:
            return canonical
    return label
|
||
|
||
|
||
def _clean_html_text(fragment):
    """Strip tags from an HTML fragment, decode entities, collapse whitespace."""
    # Imported locally because `html` is also the parameter name of
    # extract_data; only `unescape` is bound here.
    from html import unescape

    text = re.sub(r'<[^>]+>', ' ', fragment)
    # Decode after tag removal so escaped '<' / '>' are not mistaken for
    # markup. '&nbsp;' becomes U+00A0, which the \s+ below folds into ' '.
    text = unescape(text)
    return re.sub(r'\s+', ' ', text).strip()


def _extract_specs(page):
    """Pair consecutive <td> cells into a {lower-cased label: value} dict."""
    cells = [
        _clean_html_text(cell)
        for cell in re.findall(r'<td[^>]*>(.*?)</td>', page, re.DOTALL)
    ]
    specs = {}
    # Cells are consumed two at a time: (0, 1), (2, 3), ...; a trailing
    # unpaired cell is ignored.
    for label_cell, value in zip(cells[::2], cells[1::2]):
        label = label_cell.rstrip(':').strip()
        # Skip empty pairs and labels made only of digits, punctuation,
        # '€' or '*'.
        if label and value and not re.match(r'^[\d,.\s€*]+$', label):
            specs[label.lower()] = value
    return specs


def _extract_description(page):
    """Return the product description text, or None when none is usable."""
    from html import unescape

    block = re.search(
        r'itemprop="description">(.*?)</div>\s*</div>\s*</div>', page, re.DOTALL
    )
    if block:
        content = block.group(1)
        # Drop embedded CSS/JS bodies before the generic tag stripping.
        content = re.sub(r'<style[^>]*>.*?</style>', '', content, flags=re.DOTALL)
        content = re.sub(r'<script[^>]*>.*?</script>', '', content, flags=re.DOTALL)
        content = _clean_html_text(content)
        # Cut the text at each trailing-section marker that is found.
        for marker in ['Other names', 'Additional contact mail', 'Question about']:
            idx = content.find(marker)
            if idx > 0:
                content = content[:idx].strip()
        if len(content) > 20:
            return content

    # Fall back to the <meta name="description"> tag.
    meta = re.search(r'<meta[^>]*name="description"[^>]*content="([^"]*)"', page)
    if meta:
        meta_text = unescape(meta.group(1))
        if len(meta_text) > 20:
            return meta_text
    return None


def extract_data(html):
    """Extract cultivar attributes from a product page's HTML.

    The page's <td> cells are read as label/value pairs and the known labels
    are converted with the parse_* helpers of this module. HTML entities in
    cell and description text are decoded before parsing, so values such as
    "20&nbsp;cm" or "Tom &amp; Jerry" are handled as displayed text.

    Args:
        html: Raw page markup.

    Returns:
        A dict holding any of: description, plant_spacing_cm,
        row_spacing_cm, planting_depth_cm, harvesting_months,
        direct_sowing_months, indoor_sowing_months, perennial,
        light_requirement, days_to_germination, germination_temp_c.
        Keys are present only when a usable value was found; the dict is
        empty when nothing could be extracted.
    """
    data = {}
    specs = _extract_specs(html)

    description = _extract_description(html)
    if description:
        data['description'] = description

    if 'planting distance' in specs:
        row_sp, plant_sp = parse_spacing(specs['planting distance'])
        if plant_sp:
            data['plant_spacing_cm'] = plant_sp
        if row_sp:
            data['row_spacing_cm'] = row_sp

    # An explicit "row spacing" entry overrides the value derived above.
    if 'row spacing' in specs:
        match = re.search(r'(\d+(?:\.\d+)?)\s*cm', specs['row spacing'])
        if match:
            data['row_spacing_cm'] = float(match.group(1))

    if 'sowing depth' in specs:
        depth = parse_depth(specs['sowing depth'])
        if depth is not None:
            data['planting_depth_cm'] = depth

    # Harvesting months - prefer explicit harvest time over flowering.
    # Only the first label that is present is consulted, even when it yields
    # no months.
    for harvest_key in ('harvest time', 'harvesting months', 'flowering months'):
        if harvest_key in specs:
            months = parse_months(specs[harvest_key])
            if months:
                data['harvesting_months'] = months
            break

    if 'when to sow outdoors' in specs:
        months = parse_months(specs['when to sow outdoors'])
        if months:
            data['direct_sowing_months'] = months

    # Unlike harvesting, the indoor labels are tried until one parses.
    for indoor_key in ['when to sow indoors', 'pre-cultivation indoors']:
        if indoor_key in specs:
            months = parse_months(specs[indoor_key])
            if months:
                data['indoor_sowing_months'] = months
                break

    if 'lifecycle' in specs:
        perennial = parse_lifecycle(specs['lifecycle'])
        if perennial is not None:
            data['perennial'] = perennial

    if 'sunlight' in specs:
        light = parse_light(specs['sunlight'])
        if light:
            data['light_requirement'] = light

    if 'germination time' in specs:
        days = parse_germination_days(specs['germination time'])
        if days:
            data['days_to_germination'] = days

    if 'germination temperature' in specs:
        temp = parse_germ_temp(specs['germination temperature'])
        if temp:
            data['germination_temp_c'] = temp

    return data
|
||
|
||
|
||
def get_current_values(cultivar_id):
    """Fetch the currently stored, non-empty attribute values of a cultivar.

    Args:
        cultivar_id: Value matched against ``cultivars.id``.

    Returns:
        A dict mapping column name to the stripped text psql printed for it.
        Columns whose output is empty are omitted; an empty dict is returned
        when the query produced no output at all.
    """
    fields = ['description', 'row_spacing_cm', 'plant_spacing_cm', 'planting_depth_cm',
              'perennial', 'harvesting_months', 'direct_sowing_months', 'light_requirement',
              'days_to_germination', 'germination_temp_c', 'indoor_sowing_months']

    # Double any single quote so the id cannot terminate the SQL literal.
    safe_id = str(cultivar_id).replace("'", "''")
    # The SELECT list is generated from `fields` so that column order and
    # parsing order cannot drift apart.
    sql = f"SELECT {', '.join(fields)} FROM cultivars WHERE id = '{safe_id}'"

    row = run_sql(sql)
    if not row:
        return {}

    # description is the first column and is free text, so it may itself
    # contain '|' or newlines. Splitting from the right keeps the remaining
    # columns aligned and leaves any extra separators inside description.
    parts = row.rsplit('|', len(fields) - 1)

    current = {}
    for name, raw in zip(fields, parts):
        value = raw.strip()
        if value:
            current[name] = value
    return current
|
||
|
||
|
||
def build_update_sql(cultivar_id, data, current):
    """Build an UPDATE statement that fills only the missing cultivar fields.

    Args:
        cultivar_id: Value matched against ``cultivars.id``.
        data: Newly scraped ``{column: value}`` pairs. Supported value types
            are str, bool, list (rendered as an array literal such as
            ``'{1,2}'``), int and float.
        current: ``{column: value}`` already stored; any column with a
            truthy entry here is left untouched.

    Returns:
        ``(sql, updated_fields)``: the statement and the columns it sets, in
        the iteration order of ``data``. ``(None, [])`` when there is
        nothing to update.
    """
    sets = []
    updated_fields = []
    for field, value in data.items():
        # Never overwrite a value that is already populated.
        if current.get(field):
            continue

        if isinstance(value, str):
            literal = "'" + value.replace("'", "''") + "'"
        elif isinstance(value, bool):
            # Must precede the int check: bool is a subclass of int.
            literal = 'true' if value else 'false'
        elif isinstance(value, list):
            literal = "'{" + ','.join(str(item) for item in value) + "}'"
        elif isinstance(value, (int, float)):
            literal = str(value)
        else:
            # Unsupported type: emit no clause and, unlike before, do not
            # report the field as updated either.
            continue

        sets.append(f"{field} = {literal}")
        updated_fields.append(field)

    if not sets:
        return None, []

    # Double any single quote so the id cannot terminate the SQL literal.
    safe_id = str(cultivar_id).replace("'", "''")
    return f"UPDATE cultivars SET {', '.join(sets)} WHERE id = '{safe_id}';", updated_fields
|
||
|
||
|
||
def main():
    """Scrape every Magic Garden Seeds cultivar that lacks data and update it.

    Selects cultivars from that supplier which have a product URL and are
    missing row spacing or a description, fetches each product page,
    extracts attributes, writes the fields that are still empty, and prints
    a per-cultivar status line followed by a summary.
    """
    query = """
        SELECT c.id, c.name, cs.product_url
        FROM cultivars c
        JOIN cultivar_suppliers cs ON c.id = cs.cultivar_id
        JOIN suppliers s ON cs.supplier_id = s.id
        WHERE s.name = 'Magic Garden Seeds'
          AND cs.product_url IS NOT NULL AND cs.product_url <> ''
          AND (c.row_spacing_cm IS NULL OR c.description IS NULL OR c.description = '')
        ORDER BY c.name;
    """
    rows = run_sql(query)
    if not rows:
        print("No cultivars to process")
        return

    # One output line per cultivar: id|name|product_url.
    cultivars = [
        {'id': cols[0], 'name': cols[1], 'url': cols[2]}
        for cols in (line.split('|') for line in rows.strip().split('\n'))
        if len(cols) >= 3
    ]
    total = len(cultivars)

    print(f"Processing {total} MGS cultivars...")
    sys.stdout.flush()

    updated = skipped = failed = 0
    fields_updated = {}

    for position, cultivar in enumerate(cultivars, start=1):
        print(f"[{position}/{total}] {cultivar['name']}...", end=' ', flush=True)

        try:
            page = fetch_page(cultivar['url'])
            # Very short responses are treated as failed downloads.
            page_ok = bool(page) and len(page) >= 1000
            scraped = extract_data(page) if page_ok else None

            if not page_ok:
                print("FAILED (empty page)")
                failed += 1
            elif not scraped:
                print("NO DATA")
                skipped += 1
            else:
                existing = get_current_values(cultivar['id'])
                statement, changed = build_update_sql(cultivar['id'], scraped, existing)
                if statement:
                    run_sql(statement)
                    for name in changed:
                        fields_updated[name] = fields_updated.get(name, 0) + 1
                    print(f"OK ({len(changed)} fields: {', '.join(changed)})")
                    updated += 1
                else:
                    print("SKIP (all fields populated)")
                    skipped += 1
        except Exception as exc:
            # Keep going: one broken page must not abort the whole run.
            print(f"ERROR: {exc}")
            failed += 1

        # Pause between cultivars so requests are spaced out.
        time.sleep(0.5)

    print("\n=== MGS Summary ===")
    print(f"Total processed: {total}")
    print(f"Updated: {updated}")
    print(f"Skipped (all fields already populated): {skipped}")
    print(f"Failed: {failed}")
    print("\nFields updated:")
    # Most frequently updated fields first; ties keep first-seen order.
    by_count = sorted(fields_updated.items(), key=lambda item: item[1], reverse=True)
    for field, count in by_count:
        print(f" {field}: {count}")
|
||
|
||
|
||
# Run the scraper only when this file is executed as a script, so importing
# the module (e.g. to reuse the parse_* helpers) has no side effects.
if __name__ == '__main__':
    main()
|