#!/usr/bin/env python3
"""
FLHIP Hotel Contact Enricher (Batched Version)
Updated: 2026-01-10

Single script that:
1. First tries regex extraction from notes (FREE)
2. If still missing info, uses BATCHED Claude API calls that:
   - Processes up to 50 hotels per API call
   - Uses web search to find public contact info
   - Significantly reduces cost vs per-record calls

Cost: ~$2.50 per batch of ~31 hotels with web search
"""

import os
import sys
import time
import json
import re
import argparse
import mysql.connector
from mysql.connector import Error
from dotenv import load_dotenv
from pathlib import Path

# Load environment variables
load_dotenv()

# Anthropic API configuration
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")

# Try to import anthropic library
try:
    import anthropic
    ANTHROPIC_AVAILABLE = True
except ImportError:
    ANTHROPIC_AVAILABLE = False
    print("[WARN] anthropic library not installed. Run: pip install anthropic")

# Database configuration
DB_CONFIG = {
    'host': os.getenv('DB_HOST', 'localhost'),
    'user': os.getenv('DB_USER'),
    'password': os.getenv('DB_PASSWORD'),
    'database': os.getenv('DB_NAME')
}

# Table name
TABLE_NAME = "hotel_leads_usa"

# Tracking
TRACKING_DIR = Path(__file__).resolve().parent / "tracking"
PROGRESS_FILE = TRACKING_DIR / "hotel_contact_enricher_progress.txt"

# Batch size for API calls (50 hotels per call)
API_BATCH_SIZE = 50


def get_db_connection():
    """Create database connection"""
    try:
        conn = mysql.connector.connect(**DB_CONFIG)
        return conn
    except Error as e:
        print(f"[ERROR] Database connection error: {e}")
        sys.exit(1)


def get_records_needing_contacts(conn, limit=50, start_id=0):
    """
    Get records missing phone OR email
    Prioritizes records with contact_name (better for web search)
    """
    cursor = conn.cursor(dictionary=True)
    
    query = f"""
        SELECT id, business, address, city, state, zip,
               phone, contact_name, contact_email, 
               notes, info_source
        FROM {TABLE_NAME}
        WHERE 
            id > %s
            AND (
                (phone IS NULL OR phone = '')
                OR (contact_email IS NULL OR contact_email = '')
            )
        ORDER BY 
            CASE WHEN contact_name IS NOT NULL AND contact_name != '' THEN 0 ELSE 1 END,
            id ASC
        LIMIT %s
    """
    
    cursor.execute(query, (start_id, limit))
    records = cursor.fetchall()
    cursor.close()
    
    return records


def get_total_needing_contacts(conn):
    """Get total count of records needing contact info"""
    cursor = conn.cursor()
    
    query = f"""
        SELECT COUNT(*) 
        FROM {TABLE_NAME}
        WHERE 
            (phone IS NULL OR phone = '')
            OR (contact_email IS NULL OR contact_email = '')
    """
    
    cursor.execute(query)
    count = cursor.fetchone()[0]
    cursor.close()
    
    return count


def extract_contacts_with_regex(text):
    """
    FREE: Use regex patterns to extract phone numbers and emails
    """
    results = {
        'phone': None,
        'email': None
    }
    
    if not text:
        return results
    
    # Phone patterns (US formats)
    phone_patterns = [
        r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}',
        r'\d{3}[-.\s]\d{4}',
        r'1[-.\s]?\d{3}[-.\s]?\d{3}[-.\s]?\d{4}',
    ]
    
    for pattern in phone_patterns:
        match = re.search(pattern, text)
        if match:
            phone = match.group()
            digits = re.sub(r'[^\d]', '', phone)
            if len(digits) == 10:
                results['phone'] = f"({digits[:3]}) {digits[3:6]}-{digits[6:]}"
                break
            elif len(digits) == 11 and digits.startswith('1'):
                digits = digits[1:]
                results['phone'] = f"({digits[:3]}) {digits[3:6]}-{digits[6:]}"
                break
    
    # Email pattern
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    email_match = re.search(email_pattern, text)
    if email_match:
        email = email_match.group().lower()
        # Filter out generic personal emails
        exclude_domains = ['example.com', 'test.com', 'gmail.com', 'yahoo.com',
                          'hotmail.com', 'outlook.com', 'aol.com']
        if not any(domain in email for domain in exclude_domains):
            results['email'] = email
    
    return results


def enrich_contacts_batch_with_claude(client, records, use_web_search=True):
    """
    BATCHED Claude API call that processes multiple hotels at once.
    
    Args:
        client: Anthropic client
        records: List of record dicts (up to 50)
        use_web_search: Whether to enable web search tool
    
    Returns:
        Dict mapping record_id to {phone, email, contact_name, source}
    """
    
    if not records:
        return {}
    
    # Build the batch prompt
    hotel_list = []
    for i, record in enumerate(records, 1):
        business = record.get('business') or 'Unknown Hotel'
        contact_name = record.get('contact_name') or ''
        city = record.get('city') or ''
        state = record.get('state') or ''
        location = f"{city}, {state}" if city and state else (city or state or "Unknown")
        
        need_phone = not record.get('phone')
        need_email = not record.get('contact_email')
        
        hotel_list.append(f"""
HOTEL {i} (ID: {record['id']}):
  Business: {business}
  Location: {location}
  Contact Name: {contact_name or 'Unknown'}
  Need: {'phone ' if need_phone else ''}{'email' if need_email else ''}
""")
    
    hotels_text = "\n".join(hotel_list)
    
    prompt = f"""Find contact information for these hotels. Use web search to find public business contact info.

{hotels_text}

SEARCH STRATEGY:
- For each hotel with a contact name, search: "[contact name]" "[hotel name]" contact
- Also search: [hotel name] [city] official website contact
- Look for: official websites, booking sites, press releases, business directories
- Only use PUBLICLY AVAILABLE business information

RULES:
- Extract ONLY information explicitly found in search results
- Do NOT make up or guess any information
- Phone format: (XXX) XXX-XXXX (US numbers only)
- Prefer BUSINESS contact info over personal
- Skip generic emails (gmail, yahoo, hotmail) unless clearly business-related
- If no info found for a hotel, still include it with null values

Respond with ONLY valid JSON array (no markdown, no explanation):
[
  {{
    "id": <record_id>,
    "phone": "(XXX) XXX-XXXX or null",
    "email": "email@domain.com or null",
    "contact_name": "Name or null (only if found new one)",
    "source": "where you found it or null"
  }},
  ...
]"""

    try:
        if use_web_search:
            # Allow more web searches for batch - up to 1 per hotel
            max_searches = min(len(records), 50)
            message = client.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=4000,
                tools=[
                    {
                        "type": "web_search_20250305",
                        "name": "web_search",
                        "max_uses": max_searches
                    }
                ],
                messages=[{"role": "user", "content": prompt}]
            )
        else:
            message = client.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=3000,
                messages=[{"role": "user", "content": prompt}]
            )
        
        # Extract response text from content blocks
        response_text = ""
        for block in message.content:
            if hasattr(block, 'text'):
                response_text += block.text
        
        response_text = response_text.strip()
        
        # Clean up markdown code blocks
        if "```json" in response_text:
            response_text = response_text.split("```json")[1].split("```")[0]
        elif "```" in response_text:
            parts = response_text.split("```")
            for part in parts:
                if "[" in part and "]" in part:
                    response_text = part
                    break
        
        response_text = response_text.strip()
        
        # Find JSON array in response
        json_match = re.search(r'\[[\s\S]*\]', response_text)
        if json_match:
            response_text = json_match.group()
        
        # Parse JSON array
        results_list = json.loads(response_text)
        
        # Convert to dict keyed by record ID
        results = {}
        for item in results_list:
            record_id = item.get('id')
            if record_id:
                # Validate and clean phone
                phone = item.get('phone')
                if phone and phone not in ['null', 'None', None, '']:
                    digits = re.sub(r'[^\d]', '', phone)
                    if len(digits) == 10:
                        phone = f"({digits[:3]}) {digits[3:6]}-{digits[6:]}"
                    elif len(digits) == 11 and digits.startswith('1'):
                        digits = digits[1:]
                        phone = f"({digits[:3]}) {digits[3:6]}-{digits[6:]}"
                    else:
                        phone = None
                else:
                    phone = None
                
                # Validate and clean email
                email = item.get('email')
                if email and email not in ['null', 'None', None, '']:
                    email = email.lower().strip()
                    if not re.match(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$', email):
                        email = None
                    else:
                        # Filter generic personal emails
                        personal_domains = ['gmail.com', 'yahoo.com', 'hotmail.com',
                                           'outlook.com', 'aol.com', 'icloud.com']
                        if any(domain in email for domain in personal_domains):
                            email = None
                else:
                    email = None
                
                # Clean contact_name
                contact_name = item.get('contact_name')
                if contact_name in ['null', 'None', None, '']:
                    contact_name = None
                
                results[record_id] = {
                    'phone': phone,
                    'email': email,
                    'contact_name': contact_name,
                    'source': item.get('source')
                }
        
        return results
        
    except json.JSONDecodeError as e:
        print(f"  [ERROR] JSON parse error: {e}")
        print(f"  Response was: {response_text[:500]}...")
        return {}
    except anthropic.APIError as e:
        print(f"  [ERROR] Anthropic API error: {e}")
        return {}
    except Exception as e:
        print(f"  [ERROR] Unexpected error: {e}")
        return {}


def enrich_contacts_with_claude(client, record, need_phone, need_email, use_web_search=True):
    """
    SINGLE record Claude API call (legacy - for testing single records)
    """
    results = enrich_contacts_batch_with_claude(client, [record], use_web_search)
    return results.get(record['id'])


def update_record_contacts(conn, record_id, contact_data):
    """Update a record with contact information"""
    cursor = conn.cursor()
    
    updates = []
    values = []
    
    if contact_data.get('phone'):
        updates.append("phone = %s")
        values.append(contact_data['phone'])
    
    if contact_data.get('email'):
        updates.append("contact_email = %s")
        values.append(contact_data['email'])
    
    if contact_data.get('contact_name'):
        updates.append("contact_name = %s")
        values.append(contact_data['contact_name'])
    
    if not updates:
        return False
    
    query = f"""
        UPDATE {TABLE_NAME}
        SET {', '.join(updates)}
        WHERE id = %s
    """
    values.append(record_id)
    
    try:
        cursor.execute(query, values)
        conn.commit()
        return cursor.rowcount > 0
    except Error as e:
        print(f"[ERROR] Database update error: {e}")
        return False
    finally:
        cursor.close()


def save_progress(last_id):
    """Save last processed ID for resume capability"""
    TRACKING_DIR.mkdir(exist_ok=True)
    with open(PROGRESS_FILE, 'w') as f:
        f.write(str(last_id))


def load_progress():
    """Load last processed ID"""
    if PROGRESS_FILE.exists():
        with open(PROGRESS_FILE, 'r') as f:
            try:
                return int(f.read().strip())
            except ValueError:
                return 0
    return 0


def main():
    parser = argparse.ArgumentParser(description='FLHIP Hotel Contact Enricher (Batched)')
    parser.add_argument('--limit', type=int, default=50, help='Max records to process')
    parser.add_argument('--batch-size', type=int, default=25, help='Records per DB query batch')
    parser.add_argument('--api-batch', type=int, default=50, help='Records per API call (max 50)')
    parser.add_argument('--start-id', type=int, default=0, help='Start from specific ID')
    parser.add_argument('--resume', action='store_true', help='Resume from last progress')
    parser.add_argument('--delay', type=float, default=5, help='Delay between API batches (seconds)')
    parser.add_argument('--regex-only', action='store_true', help='Only use regex extraction (free)')
    parser.add_argument('--no-web-search', action='store_true', help='Disable web search (cheaper)')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done')
    parser.add_argument('--auto', action='store_true', help='Run without prompts')
    parser.add_argument('--test', type=int, help='Test single record by ID')
    args = parser.parse_args()
    
    # Cap API batch size at 50
    args.api_batch = min(args.api_batch, 50)
    
    print("=" * 60)
    print("FLHIP HOTEL CONTACT ENRICHER (Batched)")
    print("=" * 60)
    
    # Initialize Anthropic client if needed
    client = None
    if not args.regex_only:
        if not ANTHROPIC_AVAILABLE:
            print("[ERROR] anthropic library required for AI enrichment")
            sys.exit(1)
        if not ANTHROPIC_API_KEY:
            print("[ERROR] ANTHROPIC_API_KEY not set")
            sys.exit(1)
        
        client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)
        print(f"[OK] Anthropic API Key: ...{ANTHROPIC_API_KEY[-8:]}")
        
        if args.no_web_search:
            print("[MODE] Article analysis only (no web search)")
        else:
            print("[MODE] Batched web search ({} hotels per API call)".format(args.api_batch))
    else:
        print("[MODE] Regex extraction only (FREE)")
    
    # Database connection
    conn = get_db_connection()
    print(f"[OK] Connected to database: {DB_CONFIG['database']}")
    print(f"[OK] Using table: {TABLE_NAME}")
    
    # Handle resume
    if args.resume:
        args.start_id = load_progress()
        if args.start_id:
            print(f"[RESUME] Continuing from ID {args.start_id}")
    
    # Test mode
    if args.test:
        cursor = conn.cursor(dictionary=True)
        cursor.execute(f"SELECT * FROM {TABLE_NAME} WHERE id = %s", (args.test,))
        record = cursor.fetchone()
        cursor.close()
        
        if not record:
            print(f"[ERROR] Record ID {args.test} not found")
            conn.close()
            return
        
        print(f"\n[TEST] Record ID {args.test}")
        print(f"   Business: {record['business']}")
        print(f"   Contact: {record['contact_name'] or 'None'}")
        print(f"   Location: {record['city']}, {record['state']}")
        print(f"   Phone: {record['phone'] or 'MISSING'}")
        print(f"   Email: {record['contact_email'] or 'MISSING'}")
        
        need_phone = not record['phone']
        need_email = not record['contact_email']
        
        # Try regex first
        print("\n[REGEX] Trying regex extraction...")
        regex_results = extract_contacts_with_regex(record.get('notes', ''))
        if regex_results['phone']:
            print(f"   Found phone: {regex_results['phone']}")
        if regex_results['email']:
            print(f"   Found email: {regex_results['email']}")
        
        if not args.regex_only and (need_phone or need_email):
            print("\n[AI] Calling Claude API (batched mode with 1 record)...")
            results = enrich_contacts_batch_with_claude(
                client, [record],
                use_web_search=not args.no_web_search
            )
            
            result = results.get(args.test)
            if result:
                print(f"\n[RESULT]")
                print(json.dumps(result, indent=2))
            else:
                print("\n[RESULT] No contact info found")
        
        conn.close()
        return
    
    # Get total count
    total_needed = get_total_needing_contacts(conn)
    print(f"[INFO] Records needing contact info: {total_needed}")
    
    if total_needed == 0:
        print("[OK] All records have contact data!")
        conn.close()
        return
    
    if args.dry_run:
        print("\n[DRY RUN] No changes will be made\n")
    
    # Cost estimate - UPDATED for batched processing
    if not args.regex_only and not args.dry_run:
        if args.no_web_search:
            cost_per_batch = 0.03  # ~$0.03 per batch without search
        else:
            cost_per_batch = 2.50  # ~$2.50 per batch of ~31 with web search
        
        num_batches = (args.limit + args.api_batch - 1) // args.api_batch
        estimated_cost = num_batches * cost_per_batch
        print(f"[COST] Estimated: ~${estimated_cost:.2f} for {args.limit} records ({num_batches} API calls)")
    
    # Process records
    processed = 0
    updated = 0
    regex_found = 0
    ai_found = 0
    skipped = 0
    api_calls = 0
    current_start_id = args.start_id
    
    if args.start_id > 0:
        print(f"[INFO] Starting after ID {args.start_id}")
    
    while processed < args.limit:
        batch_size = min(args.batch_size, args.limit - processed)
        records = get_records_needing_contacts(conn, batch_size, current_start_id)
        
        if not records:
            print("\n[OK] No more records to process")
            break
        
        print(f"\n--- DB Batch of {len(records)} records ---")
        
        # STEP 1: Try regex on all records first (FREE)
        records_for_ai = []
        for record in records:
            record_id = record['id']
            business = record['business'] or 'Unknown'
            has_notes = bool(record.get('notes') and len(record.get('notes', '')) > 50)
            
            need_phone = not record['phone']
            need_email = not record['contact_email']
            
            print(f"\n[{processed + 1}/{args.limit}] ID {record_id}: {business[:40]}")
            
            if args.dry_run:
                print("   [SKIP] Dry run")
                processed += 1
                current_start_id = record_id
                continue
            
            # Try regex first
            regex_results = {'phone': None, 'email': None}
            if has_notes:
                regex_results = extract_contacts_with_regex(record.get('notes', ''))
                
                if regex_results['phone'] and need_phone:
                    print(f"   [REGEX] Phone: {regex_results['phone']}")
                    update_record_contacts(conn, record_id, {'phone': regex_results['phone']})
                    regex_found += 1
                    updated += 1
                    need_phone = False
                
                if regex_results['email'] and need_email:
                    print(f"   [REGEX] Email: {regex_results['email']}")
                    update_record_contacts(conn, record_id, {'email': regex_results['email']})
                    regex_found += 1
                    updated += 1
                    need_email = False
            
            # If still need info, queue for AI batch
            if not args.regex_only and (need_phone or need_email):
                if record.get('contact_name') or has_notes:
                    record['_need_phone'] = need_phone
                    record['_need_email'] = need_email
                    records_for_ai.append(record)
                else:
                    print(f"   [SKIP] No contact name or notes")
                    skipped += 1
            
            processed += 1
            current_start_id = record_id
            save_progress(record_id)
        
        # STEP 2: Process AI batch
        if records_for_ai and not args.regex_only:
            # Process in API batch sizes
            for i in range(0, len(records_for_ai), args.api_batch):
                api_batch = records_for_ai[i:i + args.api_batch]
                
                print(f"\n   [AI BATCH] Processing {len(api_batch)} hotels in single API call...")
                api_calls += 1
                
                results = enrich_contacts_batch_with_claude(
                    client, api_batch,
                    use_web_search=not args.no_web_search
                )
                
                # Apply results
                for record in api_batch:
                    record_id = record['id']
                    result = results.get(record_id, {})
                    
                    contact_data = {}
                    if result.get('phone') and record['_need_phone']:
                        contact_data['phone'] = result['phone']
                        print(f"   [AI] ID {record_id}: Phone {result['phone']}")
                        ai_found += 1
                    
                    if result.get('email') and record['_need_email']:
                        contact_data['email'] = result['email']
                        print(f"   [AI] ID {record_id}: Email {result['email']}")
                        ai_found += 1
                    
                    if result.get('contact_name') and not record.get('contact_name'):
                        contact_data['contact_name'] = result['contact_name']
                    
                    if contact_data:
                        success = update_record_contacts(conn, record_id, contact_data)
                        if success:
                            updated += 1
                        if result.get('source'):
                            print(f"   [SOURCE] {result['source'][:60]}")
                    else:
                        print(f"   [AI] ID {record_id}: No info found")
                        skipped += 1
                
                # Delay between API batches
                if i + args.api_batch < len(records_for_ai):
                    time.sleep(args.delay)
        
        # Prompt to continue
        if not args.auto and processed < args.limit:
            remaining = args.limit - processed
            response = input(f"\nContinue with next batch? (y/n): ")
            if response.lower() != 'y':
                break
    
    # Summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"   Processed: {processed}")
    print(f"   Updated: {updated}")
    print(f"   Regex finds: {regex_found}")
    print(f"   AI finds: {ai_found}")
    print(f"   No data found: {skipped}")
    print(f"   API calls made: {api_calls}")
    
    if not args.regex_only and not args.dry_run and api_calls > 0:
        cost_per_batch = 0.03 if args.no_web_search else 2.50
        print(f"   Est. cost: ~${api_calls * cost_per_batch:.2f}")
    
    conn.close()
    print("\n[DONE]")


if __name__ == "__main__":
    main()