"""
FLHIP Scraper v3 - Hotel Configuration
Using Claude API for intelligent hotel lead extraction

FEED STRATEGY:
- Primary: Google News RSS (Playwright resolves wrapper URLs to publisher URLs)
- Secondary: Working industry feeds (Lodging Magazine, Hotel Dive)

NOTE: Google News RSS returns wrapper URLs like:
  https://news.google.com/rss/articles/CBMi...
These are resolved to real publisher URLs using Playwright/Chromium in hotel_rss_fetcher.py
"""

import os
from pathlib import Path
from dotenv import load_dotenv
# Load variables from the `.env` file that sits one directory above this
# file's directory. This runs at import time, ahead of the os.getenv()
# lookups further down in this module.
load_dotenv(Path(__file__).parent.parent / '.env')
from datetime import datetime

# =============================================================================
# API KEYS (set via environment variables)
# =============================================================================
# Anthropic API key taken from the process environment; empty string when unset.
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")

# =============================================================================
# DATABASE CONFIG
# =============================================================================
# Database connection settings. Every value except the charset can be
# overridden through the environment (DB_HOST, DB_USER, DB_PASSWORD, DB_NAME).
DB_CONFIG = dict(
    host=os.environ.get("DB_HOST", "localhost"),
    user=os.environ.get("DB_USER", "root"),
    password=os.environ.get("DB_PASSWORD", ""),
    database=os.environ.get("DB_NAME", "restaurant_openings"),
    charset="utf8mb4",
)

# =============================================================================
# EMAIL CONFIG
# =============================================================================
# Outgoing-mail settings, all read from the environment.
#   smtp_host / smtp_port      - SMTP server to connect to
#   smtp_user / smtp_password  - SMTP credentials
#   from_address               - sender address
#   to_addresses               - list of recipient addresses parsed from the
#                                comma-separated EMAIL_TO variable
EMAIL_CONFIG = {
    "smtp_host": os.getenv("SMTP_HOST", "smtp.gmail.com"),
    # `or` instead of a getenv default: a variable that is present but blank
    # (e.g. `SMTP_PORT=` in .env) would otherwise make int("") raise on import.
    "smtp_port": int(os.getenv("SMTP_PORT") or "587"),
    "smtp_user": os.getenv("SMTP_USER", ""),
    "smtp_password": os.getenv("SMTP_PASSWORD", ""),
    "from_address": os.getenv("EMAIL_FROM", ""),
    # Strip whitespace around each address and drop empty entries, so that an
    # unset/blank EMAIL_TO gives [] rather than [""], and "a@x.com, b@x.com"
    # does not leave a leading space on the second address.
    "to_addresses": [
        address.strip()
        for address in os.getenv("EMAIL_TO", "").split(",")
        if address.strip()
    ],
}

# =============================================================================
# CLAUDE API CONFIG
# =============================================================================
# Model identifier string used for Claude API requests.
CLAUDE_MODEL: str = "claude-sonnet-4-20250514"
# Token cap for a response (presumably passed as `max_tokens`; the caller is
# not part of this file).
CLAUDE_MAX_TOKENS: int = 2_000
# Sampling temperature.
CLAUDE_TEMPERATURE: int = 0

# =============================================================================
# GOOGLE NEWS RSS FEEDS
# These return wrapper URLs that need Playwright resolution
# Template: https://news.google.com/rss/search?q={QUERY}&hl=en-US&gl=US&ceid=US:en
# =============================================================================

def gnews_url(query: str) -> str:
    """Return the Google News RSS search URL for *query*.

    The query is percent-encoded with ``urllib.parse.quote`` and placed in the
    ``q`` parameter; the ``hl``/``gl``/``ceid`` parameters are fixed to the
    US English edition.
    """
    from urllib.parse import quote

    encoded_query = quote(query)
    return (
        "https://news.google.com/rss/search"
        f"?q={encoded_query}"
        "&hl=en-US&gl=US&ceid=US:en"
    )

# Core hotel development queries (US focused)
# Each entry is a dict with "name", "url" (built by gnews_url from the search
# query) and "type" ("google_news" for every entry). The feeds are declared as
# (name, query) pairs so the shared dict shape is written only once.
GOOGLE_NEWS_FEEDS = [
    {"name": feed_name, "url": gnews_url(search_query), "type": "google_news"}
    for feed_name, search_query in (
        # === DEVELOPMENT & CONSTRUCTION ===
        ("Hotel Opening Development US",
         '(hotel OR resort) ("set to open" OR "will open" OR "plans to open" OR "breaks ground" OR groundbreaking) when:14d -restaurant -bar -airbnb'),
        ("Hotel Construction Permits",
         '(hotel OR resort) (construction OR permit OR development OR approved) when:14d -restaurant -bar -review -recipes'),
        ("New Hotel Announced",
         '"new hotel" (announced OR planned OR proposed OR approved) when:14d -restaurant -bar -review'),
        ("Hotel Grand Opening",
         'hotel "grand opening" when:14d -restaurant -bar'),
        ("Hotel Opening 2026",
         'hotel (opening OR opens) 2026 when:14d -restaurant -bar -review'),

        # === MAJOR HOTEL BRANDS ===
        ("Hilton New Hotel",
         'Hilton (opening OR "new hotel" OR construction OR groundbreaking) when:14d -restaurant -bar'),
        ("Marriott New Hotel",
         'Marriott (opening OR "new hotel" OR construction OR groundbreaking) when:14d -restaurant -bar'),
        ("Hyatt New Hotel",
         'Hyatt (opening OR "new hotel" OR construction OR groundbreaking) when:14d -restaurant -bar'),
        ("IHG Holiday Inn New Hotel",
         '(IHG OR "Holiday Inn" OR "InterContinental") (opening OR "new hotel" OR construction) when:14d -restaurant -bar'),
        ("Best Western Choice Hotels",
         '("Best Western" OR "Choice Hotels" OR "Comfort Inn" OR "Quality Inn") (opening OR construction) when:14d -restaurant -bar'),
        ("Boutique Hotel Opening",
         '"boutique hotel" (opening OR "coming soon" OR construction) when:14d -restaurant -bar'),
        ("Extended Stay Hotel",
         '("extended stay" OR "Residence Inn" OR "Home2 Suites" OR "TownePlace") hotel opening when:14d -restaurant -bar'),

        # === MAJOR US MARKETS ===
        ("New York Hotel Development",
         '("New York" OR Manhattan OR Brooklyn OR Queens) hotel (opening OR development OR construction) when:14d -restaurant -bar'),
        ("Los Angeles Hotel Development",
         '("Los Angeles" OR Hollywood OR "Beverly Hills" OR Pasadena) hotel (opening OR development) when:14d -restaurant -bar'),
        ("Miami Florida Hotel",
         '(Miami OR "Fort Lauderdale" OR "Palm Beach" OR Orlando) hotel (opening OR development) when:14d -restaurant -bar'),
        ("Las Vegas Hotel Casino",
         '"Las Vegas" (hotel OR resort OR casino) (opening OR development OR construction) when:14d -restaurant -bar'),
        ("Texas Hotel Development",
         '(Houston OR Dallas OR Austin OR "San Antonio" OR "Fort Worth") hotel (opening OR development) when:14d -restaurant -bar'),
        ("Chicago Hotel Development",
         'Chicago hotel (opening OR development OR construction) when:14d -restaurant -bar'),
        ("Phoenix Arizona Hotel",
         '(Phoenix OR Scottsdale OR Arizona) hotel (opening OR development) when:14d -restaurant -bar'),
        ("Denver Colorado Hotel",
         '(Denver OR Colorado) hotel (opening OR development OR construction) when:14d -restaurant -bar'),
        ("Seattle Pacific NW Hotel",
         '(Seattle OR Portland OR "Pacific Northwest") hotel (opening OR development) when:14d -restaurant -bar'),
        ("Nashville Tennessee Hotel",
         '(Nashville OR Tennessee OR Memphis) hotel (opening OR development) when:14d -restaurant -bar'),
        ("Atlanta Georgia Hotel",
         '(Atlanta OR Georgia) hotel (opening OR development OR construction) when:14d -restaurant -bar'),
        ("Boston Northeast Hotel",
         '(Boston OR Massachusetts OR "New England") hotel (opening OR development) when:14d -restaurant -bar'),
    )
]

# =============================================================================
# WORKING INDUSTRY FEEDS (direct RSS, no resolution needed)
# =============================================================================
# Direct publisher RSS feeds; every entry carries type "hotel_industry".
INDUSTRY_FEEDS = [
    {"name": feed_name, "url": feed_url, "type": "hotel_industry"}
    for feed_name, feed_url in (
        ("Lodging Magazine", "https://lodgingmagazine.com/feed/"),
        ("Hotel Dive", "https://www.hoteldive.com/feeds/news/"),
    )
]

# =============================================================================
# ALL FEEDS - Google News first, then industry
# =============================================================================
# New list holding every Google News feed followed by every industry feed.
RSS_FEEDS = [*GOOGLE_NEWS_FEEDS, *INDUSTRY_FEEDS]

# =============================================================================
# BLOCKED DOMAINS - These need Zyte to scrape (paywall/bot wall)
# =============================================================================
# Flat list of domain names, assembled from per-category groups. The category
# labels are the file's own classification; how the list is matched against
# URLs is decided by the consumer, which is not part of this file.
BLOCKED_DOMAINS = (
    # Social media (never scrapeable)
    [
        "facebook.com",
        "instagram.com",
        "twitter.com",
        "x.com",
        "youtube.com",
        "tiktok.com",
        "linkedin.com",
    ]
    # Aggregators that block scraping
    + [
        "msn.com",
        "pressreader.com",
        "yahoo.com",
    ]
    # Paywalled news
    + [
        "nytimes.com",
        "wsj.com",
        "bloomberg.com",
        "washingtonpost.com",
        "latimes.com",
        "bostonglobe.com",
        "chicagotribune.com",
        "sfchronicle.com",
        "dallasnews.com",
        "houstonchronicle.com",
        "denverpost.com",
        "startribune.com",
        "phillymag.com",
    ]
    # Business journals (all paywalled)
    + [
        "bizjournals.com",
    ]
    # Hotel industry sites with bot protection
    + [
        "hotelmanagement.net",
        "tophotel.news",
        "hoteliermiddleeast.com",
        "hoteldesigns.net",
    ]
    # Real estate sites with heavy protection
    + [
        "newyorkyimby.com",
        "therealdeal.com",
    ]
    # Regional news with paywalls
    + [
        "8newsnow.com",
        "thenationalherald.com",
        "reviewjournal.com",
        "commercialappeal.com",
        "tennessean.com",
    ]
)

# =============================================================================
# TITLE FAIL PATTERNS - Skip these titles BEFORE URL resolution (saves time)
# =============================================================================
# Flat list of lowercase title phrases, assembled from per-category groups.
# NOTE(review): the matching code is not in this file. If these are applied as
# plain substring tests on a lowercased title, short entries can over-match --
# e.g. "mexico" would also hit "New Mexico" -- confirm against the consumer.
TITLE_FAIL_PATTERNS = (
    # Already opened (not future leads)
    [
        "just opened",
        "now open",
        "opens its doors",
        "opened its doors",
        "officially opens",
        "officially opened",
        "soft opening",
        "ribbon cutting",
        "has opened",
        "is now open",
        "opened last",
        "opened yesterday",
        "opened this week",
    ]
    # International locations (not US market)
    + [
        "europe",
        "european",
        "asia",
        "asian",
        "middle east",
        "africa",
        "african",
        "australia",
        "australian",
        "canada",
        "canadian",
        "mexico",
        "mexican",
        "caribbean",
        "uk hotel",
        "london hotel",
        "paris hotel",
        "dubai",
        "singapore",
        "tokyo",
        "hong kong",
        "bali",
        "maldives",
        "cancun",
        "jamaica",
        "bahamas",
        "puerto rico",
        "san juan",
        "osaka",
        "nairobi",
        "kenya",
        "south africa",
    ]
    # Non-hotel content
    + [
        "restaurant",
        "cafe opens",
        "bar opens",
        "brewery",
        "winery",
        "distillery",
        "airbnb",
        "vrbo",
        "vacation rental",
    ]
    # Reviews/rankings (not openings)
    + [
        "best hotels",
        "top hotels",
        "hotel review",
        "hotel rating",
        "where to stay",
        "hotel deals",
        "hotel prices",
    ]
    # Job/personnel news
    + [
        "appointed",
        "director of sales",
        "general manager named",
        "new gm",
        "joins as",
    ]
    # Closures/conversions (not new openings)
    + [
        "hotel closes",
        "hotel closing",
        "shuttered",
        "demolished",
        "converted to apartments",
        "become apartments",
        "homeless shelter",
        "affordable housing",
    ]
    # Year-in-review/retrospective
    + [
        "year in review",
        "best of 2024",
        "best of 2025",
        "hotels that opened in",
        "businesses that opened in",
        "highlights from",
    ]
    # Generic/irrelevant
    + [
        "holiday hours",
        "christmas",
        "thanksgiving",
        "new year",
        "fire at hotel",
        "hotel fire",
        "shooting at",
        "crime at",
    ]
)

# =============================================================================
# US STATES (for validation)
# =============================================================================
# Two-letter postal codes for the 50 states plus DC, as a list of strings.
US_STATES = (
    "AL AK AZ AR CA CO CT DE FL GA "
    "HI ID IL IN IA KS KY LA ME MD "
    "MA MI MN MS MO MT NE NV NH NJ "
    "NM NY NC ND OH OK OR PA RI SC "
    "SD TN TX UT VT VA WA WV WI WY DC"
).split()

# =============================================================================
# PROCESSING LIMITS
# =============================================================================
# Upper bound on articles handled in one run (enforced by the consumer, which
# is not part of this file).
MAX_ARTICLES_PER_RUN: int = 500
# Delay between requests, in seconds.
REQUEST_DELAY_SECONDS: float = 0.5
# Scrape timeout, in seconds.
SCRAPE_TIMEOUT_SECONDS: int = 15

# =============================================================================
# OUTPUT
# =============================================================================
# Log file name stamped with the local date (YYYYMMDD). The date is taken once,
# when this module is imported.
LOG_FILE = "scraper4_" + datetime.now().strftime("%Y%m%d") + ".log"

# =============================================================================
# VALIDATION
# =============================================================================
if __name__ == "__main__":
    # Manual sanity check: when this file is executed directly, print how many
    # entries each configuration list holds and preview the first few feeds.
    preview_count = 5

    print("Hotel Config Loaded:")
    print(f"  Total feeds: {len(RSS_FEEDS)}")
    print(f"    Google News: {len(GOOGLE_NEWS_FEEDS)}")
    print(f"    Industry: {len(INDUSTRY_FEEDS)}")
    print(f"  Blocked domains: {len(BLOCKED_DOMAINS)}")
    print(f"  Title fail patterns: {len(TITLE_FAIL_PATTERNS)}")
    print("\nGoogle News feeds:")
    for feed in GOOGLE_NEWS_FEEDS[:preview_count]:
        print(f"  - {feed['name']}")
    # Only report a remainder when one exists; with preview_count or fewer
    # feeds the subtraction would otherwise print zero or a negative count.
    remaining = len(GOOGLE_NEWS_FEEDS) - preview_count
    if remaining > 0:
        print(f"  ... and {remaining} more")
