Files
herbapi/tools/scrapers/scrape_mgs.py
T

381 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""Scrape Magic Garden Seeds product pages and update herbapi database."""
import subprocess
import re
import time
import os
import sys
# Base psql invocation. -t (tuples only), -A (unaligned) and -F| make psql
# print bare rows with '|' between columns, which is what the callers parse.
# The host can be overridden with HERBAPI_DB_HOST; the default is unchanged.
DB_CMD = [
    'psql',
    '-h', os.environ.get('HERBAPI_DB_HOST', '10.31.3.90'),
    '-U', 'herbapi',
    'herbapi',
    '-t', '-A', '-F|',
]

# Environment handed to psql; the password travels via PGPASSWORD.
# SECURITY: the fallback below is a credential committed to source control.
# Set HERBAPI_DB_PASSWORD in the environment instead, and rotate/remove the
# hard-coded value once every deployment provides it.
DB_ENV = {
    **os.environ,
    'PGPASSWORD': os.environ.get(
        'HERBAPI_DB_PASSWORD', '_6Qo_jEFhE9LZOEbwLynEWoLbc6B4Ipj'
    ),
}

# Lower-case English month name -> month number (1-12).
MONTH_MAP = {
    'january': 1,
    'february': 2,
    'march': 3,
    'april': 4,
    'may': 5,
    'june': 6,
    'july': 7,
    'august': 8,
    'september': 9,
    'october': 10,
    'november': 11,
    'december': 12,
}
def run_sql(sql):
    """Run *sql* through the psql command line client and return its output.

    The command is DB_CMD plus ``-c sql`` and runs with DB_ENV as its
    environment. With the flags in DB_CMD the output is one line per row,
    columns separated by '|'.

    Returns:
        psql's stdout with leading/trailing whitespace stripped.

    Raises:
        RuntimeError: if psql exits with a non-zero status (connection
            failure, SQL error, ...). The message carries psql's stderr so a
            failed statement is no longer mistaken for an empty result.
    """
    completed = subprocess.run(
        DB_CMD + ['-c', sql],
        capture_output=True, text=True, env=DB_ENV,
    )
    if completed.returncode != 0:
        raise RuntimeError(
            f"psql exited with status {completed.returncode}: "
            f"{completed.stderr.strip()}"
        )
    return completed.stdout.strip()
def fetch_page(url):
    """Download *url* with curl and return the response body as text.

    curl runs silently (-s), follows redirects (-L) and gives up after
    15 seconds. ``--fail`` makes curl produce no body for HTTP error
    responses, so an error page is never handed on as product HTML.

    Returns:
        The page body, or an empty string when curl reports any failure
        (HTTP error status, timeout, DNS/connection problem).
    """
    completed = subprocess.run(
        ['curl', '-sL', '--fail', '--max-time', '15', url],
        capture_output=True, text=True,
    )
    if completed.returncode != 0:
        # A partial or error response must not be parsed as a product page.
        return ''
    return completed.stdout
def parse_months(text):
    """Return the sorted month numbers whose English names occur in *text*.

    Matching is case-insensitive and only accepts whole words: a month name
    directly preceded or followed by another letter is ignored, so "may"
    inside "maybe" or "dismay" is not reported as month 5.

    Returns:
        A sorted list of month numbers taken from MONTH_MAP, or None when
        *text* is empty or names no month.
    """
    if not text:
        return None
    lowered = text.lower()
    found = [
        number
        for name, number in MONTH_MAP.items()
        # Lookarounds instead of \b so digits or punctuation next to the
        # name (e.g. "march-april", "june2") do not block a match.
        if re.search(r'(?<![a-z])' + re.escape(name) + r'(?![a-z])', lowered)
    ]
    return sorted(found) if found else None
def parse_depth(text):
    """Parse a sowing depth in centimetres from free text.

    A range such as "1-2 cm" anywhere in the text wins and yields the
    midpoint rounded to one decimal; otherwise the first single "N cm" value
    is returned. Both '.' and ',' are accepted as decimal separators.

    Returns:
        The depth as a float, or None if *text* is empty or has no cm value.
    """
    if not text:
        return None

    number = r'(\d+(?:[.,]\d+)?)'

    def as_float(raw):
        # Normalise a decimal comma before converting.
        return float(raw.replace(',', '.'))

    span = re.search(number + r'\s*-\s*' + number + r'\s*cm', text)
    if span is not None:
        low, high = (as_float(part) for part in span.groups())
        return round((low + high) / 2, 1)

    single = re.search(number + r'\s*cm', text)
    return as_float(single.group(1)) if single is not None else None
def parse_spacing(text):
    """Parse a planting distance and return ``(row_spacing, plant_spacing)``.

    Recognised forms, tried in this order (case-insensitive, values in cm):

    * "X x Y cm" / "X × Y cm" -> row spacing Y, plant spacing X
    * "X - Y cm"              -> no row spacing, plant spacing = midpoint
                                 rounded to one decimal
    * "X cm"                  -> no row spacing, plant spacing X

    Numbers may use '.' or ',' as the decimal separator, matching
    parse_depth; previously "2,5 cm" was misread as 5.0.

    Returns:
        A tuple of floats/None; ``(None, None)`` when nothing can be parsed.
    """
    if not text:
        return None, None

    lowered = text.lower().strip()
    number = r'(\d+(?:[.,]\d+)?)'

    def as_float(raw):
        # Normalise a decimal comma before converting.
        return float(raw.replace(',', '.'))

    # "X x Y cm": the first number is the plant spacing, the second the row.
    cross = re.search(number + r'\s*(?:x|×)\s*' + number + r'\s*cm', lowered)
    if cross:
        return as_float(cross.group(2)), as_float(cross.group(1))

    # "X - Y cm": a range, averaged into a single plant spacing.
    span = re.search(number + r'\s*-\s*' + number + r'\s*cm', lowered)
    if span:
        midpoint = (as_float(span.group(1)) + as_float(span.group(2))) / 2
        return None, round(midpoint, 1)

    # Single value.
    single = re.search(number + r'\s*cm', lowered)
    if single:
        return None, as_float(single.group(1))

    return None, None
def parse_germination_days(text):
    """Convert a germination time description into a whole number of days.

    Patterns are tried in a fixed order: week range, single week value, day
    range, single day value. Ranges are reduced to their midpoint (weeks are
    multiplied by 7) and rounded with the built-in ``round``.

    Returns:
        The number of days as an int, or None if *text* is empty or has no
        week/day figure.
    """
    if not text:
        return None

    lowered = text.lower()
    # (pattern, days per unit) in priority order.
    rules = (
        (r'(\d+)\s*-\s*(\d+)\s*weeks?', 7),
        (r'(\d+)\s*weeks?', 7),
        (r'(\d+)\s*-\s*(\d+)\s*days?', 1),
        (r'(\d+)\s*days?', 1),
    )
    for pattern, days_per_unit in rules:
        found = re.search(pattern, lowered)
        if found is None:
            continue
        figures = [int(group) for group in found.groups()]
        if len(figures) == 2:
            return int(round(sum(figures) / 2 * days_per_unit))
        return figures[0] * days_per_unit
    return None
def parse_germ_temp(text):
    """Parse a germination temperature from text containing a degree sign.

    A range such as "18-22 °C" yields its midpoint rounded to one decimal;
    otherwise the first single "N°" value is used. Only whole numbers are
    recognised.

    Returns:
        The temperature as a float, or None if nothing matches.
    """
    if not text:
        return None

    span = re.search(r'(\d+)\s*-\s*(\d+)\s*°', text)
    if span is not None:
        low, high = map(float, span.groups())
        return round((low + high) / 2, 1)

    single = re.search(r'(\d+)\s*°', text)
    if single is None:
        return None
    return float(single.group(1))
def parse_lifecycle(text):
    """Map a lifecycle description to a "is perennial" flag.

    Returns:
        True if the text mentions "perennial" (checked first), False if it
        mentions "annual" or "biennial", and None when *text* is empty or
        contains none of these words. Matching is case-insensitive.
    """
    if not text:
        return None

    lowered = text.lower().strip()
    # Checked in order, so "perennial" wins over "annual"/"biennial".
    verdicts = (
        (True, ('perennial',)),
        (False, ('annual', 'biennial')),
    )
    for is_perennial, keywords in verdicts:
        if any(keyword in lowered for keyword in keywords):
            return is_perennial
    return None
def parse_light(text):
    """Normalise a sunlight description to one of a few fixed labels.

    Returns:
        'full sun to partial shade', 'full sun', 'partial shade' or 'shade'
        when a known keyword is present; otherwise the lower-cased, stripped
        input. None when *text* is empty.
    """
    if not text:
        return None

    normalized = text.lower().strip()
    mentions_partial = 'partial' in normalized

    if 'full sun' in normalized:
        return 'full sun to partial shade' if mentions_partial else 'full sun'
    if mentions_partial or 'semi' in normalized or 'half' in normalized:
        return 'partial shade'

    # Remaining single-keyword cases, in priority order.
    for keyword, label in (('shade', 'shade'), ('sun', 'full sun')):
        if keyword in normalized:
            return label
    return normalized
def extract_data(html):
    """Extract cultivar attributes from the HTML of a product page.

    Two sources are read:

    * ``<td>`` cells, taken pairwise as "label, value" to form a spec table
      keyed by the lower-cased label (trailing ':' removed). Labels made up
      only of digits, ',', '.', whitespace, '€' or '*' are skipped.
    * The element marked ``itemprop="description"``, falling back to the
      ``<meta name="description">`` tag. Descriptions of 20 characters or
      fewer are ignored.

    All extracted text has tags removed, HTML entities decoded and
    whitespace collapsed.

    Returns:
        A dict containing any of: description, plant_spacing_cm,
        row_spacing_cm, planting_depth_cm, harvesting_months,
        direct_sowing_months, indoor_sowing_months, perennial,
        light_requirement, days_to_germination, germination_temp_c.
        Keys whose value could not be parsed are omitted.
    """
    # Imported here because the ``html`` parameter shadows the module name.
    from html import unescape

    def to_text(fragment):
        """Strip tags, decode entities and collapse whitespace."""
        without_tags = re.sub(r'<[^>]+>', ' ', fragment)
        return re.sub(r'\s+', ' ', unescape(without_tags)).strip()

    data = {}

    # Spec table: consecutive <td> cells are treated as (label, value) pairs.
    cells = [
        to_text(cell)
        for cell in re.findall(r'<td[^>]*>(.*?)</td>', html, re.DOTALL)
    ]
    specs = {}
    for label, value in zip(cells[0::2], cells[1::2]):
        key = label.rstrip(':').strip()
        if key and value and not re.match(r'^[\d,.\s€*]+$', key):
            specs[key.lower()] = value

    # Description from the itemprop="description" container.
    desc_match = re.search(
        r'itemprop="description">(.*?)</div>\s*</div>\s*</div>', html, re.DOTALL
    )
    if desc_match:
        content = desc_match.group(1)
        content = re.sub(r'<style[^>]*>.*?</style>', '', content, flags=re.DOTALL)
        content = re.sub(r'<script[^>]*>.*?</script>', '', content, flags=re.DOTALL)
        content = to_text(content)
        # Cut everything from the first trailing boilerplate section onwards.
        for marker in ['Other names', 'Additional contact mail', 'Question about']:
            idx = content.find(marker)
            if idx > 0:
                content = content[:idx].strip()
        if len(content) > 20:
            data['description'] = content

    # Fallback: the page's meta description.
    if 'description' not in data:
        meta_match = re.search(
            r'<meta[^>]*name="description"[^>]*content="([^"]*)"', html
        )
        if meta_match:
            meta_text = unescape(meta_match.group(1)).strip()
            if len(meta_text) > 20:
                data['description'] = meta_text

    # Spacing.
    if 'planting distance' in specs:
        row_sp, plant_sp = parse_spacing(specs['planting distance'])
        if plant_sp:
            data['plant_spacing_cm'] = plant_sp
        if row_sp:
            data['row_spacing_cm'] = row_sp
    if 'row spacing' in specs:
        # An explicit row spacing overrides one derived from "X x Y cm".
        match = re.search(r'(\d+(?:\.\d+)?)\s*cm', specs['row spacing'])
        if match:
            data['row_spacing_cm'] = float(match.group(1))

    if 'sowing depth' in specs:
        depth = parse_depth(specs['sowing depth'])
        if depth is not None:
            data['planting_depth_cm'] = depth

    # Harvesting months: prefer explicit harvest time over flowering, but
    # fall through to the next key when one has no recognisable month.
    for harvest_key in ('harvest time', 'harvesting months', 'flowering months'):
        if harvest_key in specs:
            months = parse_months(specs[harvest_key])
            if months:
                data['harvesting_months'] = months
                break

    if 'when to sow outdoors' in specs:
        months = parse_months(specs['when to sow outdoors'])
        if months:
            data['direct_sowing_months'] = months

    for indoor_key in ('when to sow indoors', 'pre-cultivation indoors'):
        if indoor_key in specs:
            months = parse_months(specs[indoor_key])
            if months:
                data['indoor_sowing_months'] = months
                break

    if 'lifecycle' in specs:
        perennial = parse_lifecycle(specs['lifecycle'])
        if perennial is not None:
            data['perennial'] = perennial

    if 'sunlight' in specs:
        light = parse_light(specs['sunlight'])
        if light:
            data['light_requirement'] = light

    if 'germination time' in specs:
        days = parse_germination_days(specs['germination time'])
        if days:
            data['days_to_germination'] = days

    if 'germination temperature' in specs:
        temp = parse_germ_temp(specs['germination temperature'])
        if temp:
            data['germination_temp_c'] = temp

    return data
def get_current_values(cultivar_id):
    """Fetch the currently stored, non-empty attribute values of a cultivar.

    The row is requested as a single JSON object (``row_to_json``) with every
    column cast to text. Unlike splitting psql's '|'-separated output, this
    stays correct when a text column such as the description contains '|' or
    a line break.

    Returns:
        A dict mapping column name to its stripped text value for each
        column that is neither NULL nor blank. Values are PostgreSQL's text
        renderings (e.g. 'true'/'false' for booleans, '{6,7}' for arrays).
        An empty dict if no row was returned.
    """
    import json  # local import: not among the module-level imports

    fields = ['description', 'row_spacing_cm', 'plant_spacing_cm', 'planting_depth_cm',
              'perennial', 'harvesting_months', 'direct_sowing_months', 'light_requirement',
              'days_to_germination', 'germination_temp_c', 'indoor_sowing_months']
    columns = ', '.join(f'{name}::text AS {name}' for name in fields)
    # Double any single quote so the id cannot break out of the SQL literal.
    safe_id = str(cultivar_id).replace("'", "''")
    sql = (
        f"SELECT row_to_json(t) FROM ("
        f"SELECT {columns} FROM cultivars WHERE id = '{safe_id}'"
        f") t"
    )
    output = run_sql(sql)
    if not output:
        return {}

    # One JSON document per row; only the first row is relevant.
    record = json.loads(output.splitlines()[0])
    current = {}
    for name in fields:
        value = record.get(name)
        if value is not None and value.strip():
            current[name] = value.strip()
    return current
def _sql_literal(value):
    """Render *value* as a SQL literal, or return None for unsupported types."""
    if isinstance(value, str):
        return "'" + value.replace("'", "''") + "'"
    # bool must be tested before int: bool is a subclass of int.
    if isinstance(value, bool):
        return 'true' if value else 'false'
    if isinstance(value, list):
        return "'{" + ','.join(str(item) for item in value) + "}'"
    if isinstance(value, (int, float)):
        return str(value)
    return None


def build_update_sql(cultivar_id, data, current):
    """Build an UPDATE statement that fills in a cultivar's empty fields.

    A field from *data* is written only when *current* has no truthy value
    for it, so existing database values are never overwritten.

    Args:
        cultivar_id: id of the row to update; single quotes are escaped.
        data: mapping of column name to new value (str, bool, list, int or
            float). Values of any other type are skipped.
        current: mapping of column name to the value already stored.

    Returns:
        ``(sql, updated_fields)`` where *updated_fields* lists exactly the
        columns that appear in the SET clause, or ``(None, [])`` when there
        is nothing to update.
    """
    assignments = []
    updated_fields = []
    for field, value in data.items():
        if current.get(field):
            continue
        literal = _sql_literal(value)
        if literal is None:
            # Unsupported type: no SET clause, so do not report it as updated.
            continue
        assignments.append(f"{field} = {literal}")
        updated_fields.append(field)

    if not assignments:
        return None, []

    safe_id = str(cultivar_id).replace("'", "''")
    statement = (
        f"UPDATE cultivars SET {', '.join(assignments)} WHERE id = '{safe_id}';"
    )
    return statement, updated_fields
def main():
    """Scrape every Magic Garden Seeds cultivar that still lacks data.

    Selects the cultivars supplied by 'Magic Garden Seeds' that have a
    product URL and are missing a row spacing or a description, downloads
    each product page, extracts attributes and applies the UPDATE produced
    by build_update_sql. Progress is printed per cultivar, followed by a
    summary. Half a second is slept after every cultivar, whatever the
    outcome.
    """
    sql = """
        SELECT c.id, c.name, cs.product_url
        FROM cultivars c
        JOIN cultivar_suppliers cs ON c.id = cs.cultivar_id
        JOIN suppliers s ON cs.supplier_id = s.id
        WHERE s.name = 'Magic Garden Seeds'
        AND cs.product_url IS NOT NULL AND cs.product_url <> ''
        AND (c.row_spacing_cm IS NULL OR c.description IS NULL OR c.description = '')
        ORDER BY c.name;
    """
    rows = run_sql(sql)
    if not rows:
        print("No cultivars to process")
        return

    cultivars = []
    for line in rows.strip().split('\n'):
        parts = line.split('|')
        if len(parts) >= 3:
            cultivars.append({
                'id': parts[0],
                # The name sits between the id and the URL; rejoining keeps a
                # '|' inside the name from shifting the URL column.
                'name': '|'.join(parts[1:-1]),
                'url': parts[-1],
            })

    total = len(cultivars)
    print(f"Processing {total} MGS cultivars...")
    sys.stdout.flush()

    updated = 0
    skipped = 0
    no_data = 0
    failed = 0
    fields_updated = {}

    for position, cv in enumerate(cultivars, start=1):
        print(f"[{position}/{total}] {cv['name']}...", end=' ', flush=True)
        try:
            html = fetch_page(cv['url'])
            if not html or len(html) < 1000:
                print("FAILED (empty page)")
                failed += 1
                continue

            data = extract_data(html)
            if not data:
                print("NO DATA")
                no_data += 1
                continue

            current = get_current_values(cv['id'])
            sql_stmt, upd_fields = build_update_sql(cv['id'], data, current)
            if not sql_stmt:
                print("SKIP (all fields populated)")
                skipped += 1
                continue

            run_sql(sql_stmt)
            for name in upd_fields:
                fields_updated[name] = fields_updated.get(name, 0) + 1
            print(f"OK ({len(upd_fields)} fields: {', '.join(upd_fields)})")
            updated += 1
        except Exception as e:
            # Per-cultivar boundary: report the problem and carry on.
            print(f"ERROR: {e}")
            failed += 1
        finally:
            # Pause between cultivars on every path.
            time.sleep(0.5)

    print("\n=== MGS Summary ===")
    print(f"Total processed: {total}")
    print(f"Updated: {updated}")
    print(f"Skipped (all fields already populated): {skipped}")
    print(f"No data extracted: {no_data}")
    print(f"Failed: {failed}")
    print("\nFields updated:")
    for field, count in sorted(fields_updated.items(), key=lambda item: -item[1]):
        print(f" {field}: {count}")


if __name__ == '__main__':
    main()