#!/usr/bin/env python3
"""
FLHIP Link Recovery Script
Parses all feed_results CSVs to recover article URLs for records
where info_source is NULL in the archive and links is NULL in WordPress.

Run on the server:
    python3 recover_links.py --dry-run   # Preview matches
    python3 recover_links.py             # Apply fixes
"""

import csv
import os
import re
import argparse
import mysql.connector
from pathlib import Path
from dotenv import load_dotenv

# Load credentials from .env - check scraper3 dir and parent flhip dir
_script_dir = Path(__file__).resolve().parent
load_dotenv(_script_dir / '.env')
load_dotenv(_script_dir.parent / '.env')

# ── Configuration ─────────────────────────────────────────────────────────────

CSV_DIR = "/home/dh_q8pqvp/data.flhip.com/scraper1/results"
WEEKLY_LEADS_DIR = "/home/dh_q8pqvp/data.flhip.com/scraper1/results/weekly-leads"

# Leads database
LEADS_DB = {
    'host': os.getenv('DB_HOST', 'localhost'),
    'user': os.getenv('DB_USER'),
    'password': os.getenv('DB_PASSWORD'),
    'database': 'restaurant_openings'
}

# WordPress database
WP_DB = {
    'host': os.getenv('DB_HOST', 'localhost'),
    'user': os.getenv('DB_USER'),
    'password': os.getenv('DB_PASSWORD'),
    'database': 'wpflhip'
}

ARCHIVE_TABLE = 'restaurant_leads_usa2_archive'
HOTEL_ARCHIVE_TABLE = 'hotel_leads_usa_archive'

# ── Helpers ───────────────────────────────────────────────────────────────────

def normalize(name: str) -> str:
    """Lowercase, strip punctuation and extra spaces for fuzzy matching."""
    if not name:
        return ''
    name = name.lower().strip()
    name = re.sub(r"[^a-z0-9\s]", "", name)
    name = re.sub(r"\s+", " ", name).strip()
    return name


def load_weekly_lead_csvs(weekly_dir: str) -> dict:
    """
    Parse all weekly flhip_restaurants_*.csv, flhip_hotels_*.csv, and *.xlsx files.
    These have business, city, state, and info_source columns directly.
    Returns dict: (normalized_business, normalized_city, normalized_state) -> url
    """
    print(f"Loading weekly lead CSVs from {weekly_dir}...")
    
    lead_map = {}  # (norm_business, norm_city, norm_state) -> url

    def add_entry(business, city, state, info_source, filename):
        if not business or not info_source:
            return
        if 'google.com/alerts' in info_source or '/feed' in info_source:
            return
        key = (normalize(business), normalize(city), normalize(state))
        if key not in lead_map:
            lead_map[key] = info_source

    # Load CSVs
    csv_files = list(Path(weekly_dir).glob("flhip_restaurants_*.csv")) +                 list(Path(weekly_dir).glob("flhip_hotels_*.csv"))
    print(f"Found {len(csv_files)} weekly lead CSV files")
    
    for csv_file in csv_files:
        try:
            with open(csv_file, 'r', encoding='utf-8', errors='replace') as f:
                reader = csv.DictReader(f)
                for row in reader:
                    add_entry(
                        row.get('business', '').strip(),
                        row.get('city', '').strip(),
                        row.get('state', '').strip(),
                        row.get('info_source', '').strip(),
                        csv_file.name
                    )
        except Exception as e:
            print(f"  Error reading {csv_file.name}: {e}")

    # Load XLSX files (e.g. jan-6-2026.xlsx)
    xlsx_files = list(Path(weekly_dir).glob("*.xlsx"))
    print(f"Found {len(xlsx_files)} XLSX files")

    try:
        import openpyxl
        for xlsx_file in xlsx_files:
            try:
                wb = openpyxl.load_workbook(xlsx_file)
                ws = wb.active
                headers = [str(cell.value).strip() if cell.value else '' for cell in ws[1]]
                
                # Map column names (handle variations)
                col = {}
                for i, h in enumerate(headers):
                    hl = h.lower()
                    if 'business' in hl: col['business'] = i
                    elif 'city' in hl: col['city'] = i
                    elif 'state' in hl: col['state'] = i
                    elif 'info source' in hl or 'info_source' in hl: col['info_source'] = i

                for row in ws.iter_rows(min_row=2, values_only=True):
                    add_entry(
                        str(row[col['business']] or '').strip() if 'business' in col else '',
                        str(row[col['city']] or '').strip() if 'city' in col else '',
                        str(row[col['state']] or '').strip() if 'state' in col else '',
                        str(row[col['info_source']] or '').strip() if 'info_source' in col else '',
                        xlsx_file.name
                    )
            except Exception as e:
                print(f"  Error reading {xlsx_file.name}: {e}")
    except ImportError:
        print("  openpyxl not installed - skipping XLSX files")

    print(f"Loaded {len(lead_map)} unique business+city+state entries from weekly CSVs/XLSX")
    return lead_map


def load_all_csvs(csv_dir: str) -> dict:
    """
    Parse all feed_results_*.csv files.
    Returns dict: normalized_title -> best article URL
    Also builds: normalized_title -> list of (url, date) for disambiguation
    """
    print(f"Loading CSVs from {csv_dir}...")
    
    url_map = {}       # normalized_title -> url
    all_entries = []   # list of (normalized_title, url, date, original_title)
    
    csv_files = sorted(Path(csv_dir).glob("feed_results_*.csv"))
    print(f"Found {len(csv_files)} CSV files")
    
    for csv_file in csv_files:
        try:
            with open(csv_file, 'r', encoding='utf-8', errors='replace') as f:
                reader = csv.DictReader(f)
                for row in reader:
                    title = row.get('title', '').strip()
                    link = row.get('link', '').strip()
                    date = row.get('publish_date', '').strip()
                    
                    if not title or not link:
                        continue
                    
                    # Skip Google wrapper URLs
                    if 'google.com/url' in link or 'google.com/alerts' in link:
                        continue
                    
                    # Skip feed URLs
                    if '/feed/' in link or link.endswith('/feed'):
                        continue
                    
                    norm = normalize(title)
                    if norm:
                        all_entries.append((norm, link, date, title))
                        # Keep most recent URL for each title
                        if norm not in url_map:
                            url_map[norm] = link
                        else:
                            # Prefer newer entries (files are sorted by date)
                            url_map[norm] = link
                            
        except Exception as e:
            print(f"  Error reading {csv_file.name}: {e}")
    
    print(f"Loaded {len(url_map)} unique article titles from CSVs")
    return url_map, all_entries


def get_null_archive_records(conn, table):
    """Get all archive records with NULL info_source."""
    cursor = conn.cursor(dictionary=True)
    cursor.execute(f"""
        SELECT id, business, city, state, rss_title
        FROM {table}
        WHERE info_source IS NULL OR info_source = ''
        ORDER BY id
    """)
    results = cursor.fetchall()
    cursor.close()
    return results


def get_null_wp_records(wp_conn):
    """Get all WordPress leads with NULL or empty links meta."""
    cursor = wp_conn.cursor(dictionary=True)
    cursor.execute("""
        SELECT pm_links.post_id, 
               pm_business.meta_value as business,
               pm_city.meta_value as city,
               pm_state.meta_value as state
        FROM wp_postmeta pm_links
        JOIN wp_postmeta pm_business ON pm_links.post_id = pm_business.post_id
        JOIN wp_postmeta pm_city ON pm_links.post_id = pm_city.post_id
        JOIN wp_postmeta pm_state ON pm_links.post_id = pm_state.post_id
        WHERE pm_links.meta_key = 'links'
        AND pm_business.meta_key = 'business_name'
        AND pm_city.meta_key = 'business_city'
        AND pm_state.meta_key = 'business_state'
        AND (pm_links.meta_value IS NULL OR pm_links.meta_value = '')
    """)
    results = cursor.fetchall()
    cursor.close()
    return results


# Generic chain names that appear in many cities — require city to match
GENERIC_CHAINS = {
    'chick-fil-a', 'mcdonalds', '7 brew', '7brew', 'starbucks', 'dunkin',
    'waffle house', 'five guys', 'shake shack', 'wingstop', 'panera bread',
    'panera', 'chipotle', 'subway', 'wendys', 'taco bell', 'burger king',
    'arby', 'raising canes', 'raising cane', 'el pollo loco', 'ihop',
    'applebees', 'chilis', 'olive garden', 'red lobster', 'outback',
    'buc-ees', 'bucees', 'dairy queen', 'sonic', 'popeyes', 'culvers',
    'jersey mikes', 'firehouse subs', 'jimmy johns', 'potbelly',
    'tropical smoothie', 'smoothie king', 'jamba juice', 'panda express',
    'dominos', 'pizza hut', 'papa johns', 'little caesars', 'marco',
    'hooters', 'buffalo wild wings', 'bdubs', 'cheddars', 'dennys',
    'ihop', 'cracker barrel', 'texas roadhouse', 'longhorn', 'benihana',
    'dave hot chicken', 'hot chicken', 'wingstop', 'slim chickens',
    'auntie annes', 'cinnabon', 'joes crab', 'red robin', 'applebee'
}


def is_generic_chain(norm_business):
    """Check if business name is a generic chain requiring city for matching."""
    for chain in GENERIC_CHAINS:
        if chain in norm_business or norm_business in chain:
            return True
    return False


def find_url_for_record(business, city, state, rss_title, url_map, lead_map):
    """
    Try to find a matching URL from the CSV data.
    Uses tiered matching based on available data quality.
    """
    norm_business = normalize(business)
    norm_city = normalize(city) if city else ''
    has_city = bool(norm_city and len(norm_city) >= 3)
    is_generic = is_generic_chain(norm_business)

    # Strategy 0: Direct business+city+state match from weekly lead CSVs (most reliable)
    norm_state = normalize(state) if state else ''
    key_full = (norm_business, norm_city, norm_state)
    key_no_state = (norm_business, norm_city, '')
    if key_full in lead_map:
        return lead_map[key_full], 'weekly_csv_exact'
    if norm_city and key_no_state in lead_map:
        return lead_map[key_no_state], 'weekly_csv_no_state'
    # Try business only if unique enough
    if norm_business and len(norm_business) >= 10 and not is_generic:
        for (b, c, s), url in lead_map.items():
            if b == norm_business:
                return url, 'weekly_csv_business_only'

    # Strategy 1: Exact RSS title match — always most reliable
    if rss_title:
        norm_title = normalize(rss_title)
        if norm_title and norm_title in url_map:
            return url_map[norm_title], 'exact_rss_title'

    # Strategy 2: Business + city both in article title
    if norm_business and len(norm_business) >= 4 and has_city:
        for norm_title, url in url_map.items():
            if norm_business in norm_title and norm_city in norm_title:
                return url, 'business_and_city_in_title'

    # Strategy 3: Business + city both in URL path
    if norm_business and len(norm_business) >= 4 and has_city:
        url_business = norm_business.replace(' ', '-')
        url_city = norm_city.replace(' ', '-')
        for norm_title, url in url_map.items():
            url_lower = url.lower()
            if url_business in url_lower and url_city in url_lower:
                return url, 'business_and_city_in_url'

    # Strategy 4: Unique business name only (no city needed, not a generic chain)
    # Only use if business name is specific enough (10+ chars) and not a generic chain
    if norm_business and len(norm_business) >= 10 and not is_generic:
        for norm_title, url in url_map.items():
            if norm_business in norm_title:
                return url, 'unique_business_in_title'

    # Strategy 5: Long RSS title partial match (4+ significant words)
    if rss_title:
        norm_title = normalize(rss_title)
        words = [w for w in norm_title.split() if len(w) >= 5]
        if len(words) >= 4:
            search_phrase = ' '.join(words[:4])
            for csv_title, url in url_map.items():
                if search_phrase in csv_title:
                    return url, 'partial_rss_title'

    return None, None


# ── Main ──────────────────────────────────────────────────────────────────────

def main():
    parser = argparse.ArgumentParser(description='Recover NULL links from feed CSVs')
    parser.add_argument('--dry-run', action='store_true', help='Preview without making changes')
    parser.add_argument('--archive-only', action='store_true', help='Only fix archive table, not WordPress')
    args = parser.parse_args()

    print("=" * 60)
    print("FLHIP LINK RECOVERY SCRIPT")
    print("=" * 60)
    
    if args.dry_run:
        print("DRY RUN - no changes will be made\n")

    # Load weekly lead CSVs (most reliable - have business+city+state+url)
    lead_map = load_weekly_lead_csvs(WEEKLY_LEADS_DIR)
    
    # Load feed CSVs (fallback - title-based matching)
    url_map, all_entries = load_all_csvs(CSV_DIR)

    # Connect to databases
    leads_conn = mysql.connector.connect(**LEADS_DB)
    wp_conn = mysql.connector.connect(**WP_DB)
    
    print(f"\nConnected to databases")

    # Get NULL records from archive
    print(f"\nFetching NULL records from archive tables...")
    null_restaurant = get_null_archive_records(leads_conn, ARCHIVE_TABLE)
    null_hotel = get_null_archive_records(leads_conn, HOTEL_ARCHIVE_TABLE)
    null_archive = null_restaurant + null_hotel
    print(f"Found {len(null_restaurant)} NULL restaurant archive records")
    print(f"Found {len(null_hotel)} NULL hotel archive records")

    # Get NULL WordPress records
    print(f"Fetching NULL WordPress postmeta records...")
    null_wp = get_null_wp_records(wp_conn)
    print(f"Found {len(null_wp)} NULL WordPress link records")

    # ── Fix Archive Records ──
    print(f"\n{'─' * 60}")
    print("FIXING ARCHIVE RECORDS")
    print(f"{'─' * 60}")
    
    archive_cursor = leads_conn.cursor()
    archive_matched = 0
    archive_unmatched = 0
    used_urls = set()  # Track URLs already assigned to avoid duplicates

    for record in null_archive:
        url, strategy = find_url_for_record(
            record['business'], 
            record['city'],
            record.get('state', ''),
            record.get('rss_title', ''),
            url_map,
            lead_map
        )
        
        if url:
            if url in used_urls:
                print(f"  SKIP (URL already used): {record['business']} [{record['city']}]")
                archive_unmatched += 1
                continue
            used_urls.add(url)
            archive_matched += 1
            print(f"  MATCH ({strategy}): {record['business']} [{record['city']}]")
            print(f"    URL: {url[:80]}")
            
            if not args.dry_run:
                # Determine which table
                table = ARCHIVE_TABLE if record in null_restaurant else HOTEL_ARCHIVE_TABLE
                try:
                    archive_cursor.execute(
                        f"UPDATE {table} SET info_source = %s WHERE id = %s",
                        (url, record['id'])
                    )
                except Exception as e:
                    print(f"    SKIP (duplicate URL): {url[:60]}")
        else:
            archive_unmatched += 1
            if args.dry_run:
                print(f"  NO MATCH: {record['business']} [{record['city']}]")

    if not args.dry_run:
        leads_conn.commit()
        print(f"\nArchive: {archive_matched} records updated")
    else:
        print(f"\nArchive: {archive_matched} would be updated, {archive_unmatched} unmatched")

    # ── Fix WordPress Records ──
    if not args.archive_only:
        print(f"\n{'─' * 60}")
        print("FIXING WORDPRESS POSTMETA RECORDS")
        print(f"{'─' * 60}")
        
        wp_cursor = wp_conn.cursor()
        wp_matched = 0
        wp_unmatched = 0

        for record in null_wp:
            url, strategy = find_url_for_record(
                record['business'],
                record['city'],
                record.get('state', ''),
                '',  # No rss_title in WP records
                url_map,
                lead_map
            )
            
            if url:
                wp_matched += 1
                print(f"  MATCH ({strategy}): {record['business']} [{record['city']}]")
                
                if not args.dry_run:
                    wp_cursor.execute("""
                        UPDATE wp_postmeta 
                        SET meta_value = %s
                        WHERE post_id = %s AND meta_key = 'links'
                    """, (url, record['post_id']))
            else:
                wp_unmatched += 1

        if not args.dry_run:
            wp_conn.commit()
            print(f"\nWordPress: {wp_matched} records updated, {wp_unmatched} unmatched")
        else:
            print(f"\nWordPress: {wp_matched} would be updated, {wp_unmatched} unmatched")

    # ── Summary ──
    print(f"\n{'=' * 60}")
    print("SUMMARY")
    print(f"{'=' * 60}")
    print(f"CSV titles loaded:        {len(url_map)}")
    print(f"Archive NULL records:     {len(null_archive)}")
    print(f"Archive matched:          {archive_matched}")
    print(f"Archive unmatched:        {len(null_archive) - archive_matched}")
    if not args.archive_only:
        print(f"WordPress NULL records:   {len(null_wp)}")
        print(f"WordPress matched:        {wp_matched}")
        print(f"WordPress unmatched:      {wp_unmatched}")
    
    leads_conn.close()
    wp_conn.close()
    print("\nDone.")


if __name__ == '__main__':
    main()
