"""
FLHIP Hotel Scraper v3 - RSS Fetcher
Pulls hotel articles from RSS feeds and scrapes content

Updated: 2026-01-04
- Added Google News URL resolution via Playwright
- Added tiered scraping: requests first, Zyte fallback
- Added block detection for intelligent Zyte routing
- Added title normalization (strip HTML tags, decode entities)
"""

import os
import html
import feedparser
import requests
import base64
from bs4 import BeautifulSoup
from urllib.parse import urlparse, unquote
import re
import time
from datetime import datetime
from hotel_config import (
    RSS_FEEDS, 
    BLOCKED_DOMAINS, 
    TITLE_FAIL_PATTERNS
)

# Import Google News resolver
try:
    from google_news_resolver import is_google_news_wrapper, resolve_google_news_url
except ImportError:
    # The resolver module is optional. When it is missing, define inert
    # stand-ins so the rest of this module can call the same names
    # unconditionally.
    RESOLVER_AVAILABLE = False

    def is_google_news_wrapper(url):
        """Fallback stub: never classifies a URL as a Google News wrapper."""
        return False

    def resolve_google_news_url(url):
        """Fallback stub: resolution is unavailable, so always returns None."""
        return None
else:
    RESOLVER_AVAILABLE = True
    print("[OK] Google News resolver loaded")

# =============================================================================
# ZYTE CONFIG
# =============================================================================
# API key is read from the environment; an empty string disables the Zyte tier
# (scrape_with_zyte returns None and the fallback is never attempted).
ZYTE_API_KEY = os.getenv("ZYTE_API_KEY", "")
# Endpoint that Zyte extraction requests are POSTed to.
ZYTE_API_URL = "https://api.zyte.com/v1/extract"

# Track Zyte usage per domain to avoid bill shock.
# Maps a domain (as returned by get_domain) to the number of Zyte requests
# made for it so far in this process.
ZYTE_DOMAIN_CALLS = {}
ZYTE_MAX_PER_DOMAIN = 10  # Max Zyte calls per domain per run

# Scrape settings
# Timeout (seconds) for the direct, tier 1 requests.get call.
SCRAPE_TIMEOUT_SECONDS = 15
# Pause (seconds) after each article processed by scrape_articles.
REQUEST_DELAY_SECONDS = 0.5
# Extracted text shorter than this many characters counts as "thin" and makes
# the article eligible for the Zyte fallback.
MIN_CONTENT_LENGTH = 600

# Block detection signals.
# Each entry is searched for in the lowercased beginning of a response body
# (see is_blocked_response), so entries must themselves be lowercase. A single
# hit is enough to classify the response as blocked.
BLOCK_SIGNALS = [
    "captcha",
    "cloudflare",
    "attention required",
    "verify you are human",
    "verify you are a human",
    "enable javascript",
    "access denied",
    "incident id",
    "subscribe to continue",
    "sign in to continue",
    "create an account to continue",
    "please enable cookies",
    "unusual traffic",
    "too many requests",
    "rate limit",
]

# Domains that are never worth Zyte calls (always fail).
# Consulted through is_never_zyte_domain() before a Zyte request is issued.
NEVER_ZYTE_DOMAINS = [
    "facebook.com",
    "instagram.com",
    "twitter.com",
    "x.com",
    "youtube.com",
    "tiktok.com",
    "linkedin.com",
]

# =============================================================================
# HTTP HEADERS
# =============================================================================
# Browser-like request headers passed to the direct requests.get calls in this
# module (Google News redirect probing and tier 1 scraping). They are not sent
# with the Zyte API request.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
}


# =============================================================================
# TITLE NORMALIZATION
# =============================================================================
# Matches anything shaped like an HTML/XML tag: "<", one or more non-">", ">".
TAG_RE = re.compile(r"<[^>]+>")

def normalize_title(raw: str) -> str:
    """
    Clean up a feed title.

    Entities are decoded first (so escaped markup such as ``&lt;b&gt;`` turns
    into real tags), tag-shaped text is then removed, and finally every run of
    whitespace is squeezed to one space with both ends trimmed.
    Falsy input (None, "") yields an empty string.
    """
    if not raw:
        return ""
    decoded = html.unescape(raw)
    untagged = TAG_RE.sub("", decoded)
    squeezed = re.sub(r"\s+", " ", untagged)
    return squeezed.strip()


# =============================================================================
# URL UTILITIES
# =============================================================================
def is_blocked_domain(url: str) -> bool:
    """
    Check if URL is from a known blocked domain.

    The lowercased network location of the URL is tested against every entry
    of BLOCKED_DOMAINS using substring containment.

    Args:
        url: Absolute URL to classify.

    Returns:
        True if any BLOCKED_DOMAINS entry occurs in the URL's netloc; False
        otherwise, including when the URL cannot be parsed.
    """
    try:
        domain = urlparse(url).netloc.lower()
        # NOTE(review): containment also matches unrelated hosts that merely
        # contain an entry; kept as-is because the shape of BLOCKED_DOMAINS
        # entries is not visible from this module - confirm before tightening.
        return any(blocked in domain for blocked in BLOCKED_DOMAINS)
    except (ValueError, TypeError, AttributeError):
        # Best-effort classification: a malformed or non-string URL is simply
        # "not blocked". Only parse/type errors are absorbed so that
        # KeyboardInterrupt and SystemExit still propagate.
        return False


def is_never_zyte_domain(url: str) -> bool:
    """
    Check if domain should never be sent to Zyte.

    The URL's hostname (port and credentials excluded) must either equal an
    entry of NEVER_ZYTE_DOMAINS or be a subdomain of one. Plain substring
    matching is deliberately avoided: with it, the entry "x.com" would also
    exclude unrelated sites such as "vox.com" or "netflix.com".

    Args:
        url: Absolute URL to classify.

    Returns:
        True if the host is, or is a subdomain of, a never-Zyte domain; False
        otherwise, including when the URL has no host or cannot be parsed.
    """
    try:
        # .hostname is already lowercased by urllib and is None when absent.
        host = urlparse(url).hostname
    except (ValueError, TypeError, AttributeError):
        return False

    if not isinstance(host, str) or not host:
        return False

    return any(
        host == never or host.endswith("." + never)
        for never in NEVER_ZYTE_DOMAINS
    )


def is_restaurant_url(url: str) -> bool:
    """
    Tell whether a URL points at a restaurant/dining section.

    Such links are noise in hotel feeds. The comparison is case-insensitive
    and looks for any of a fixed set of slash-delimited path segments anywhere
    in the URL. Falsy input is never a restaurant URL.
    """
    if not url:
        return False

    lowered = url.lower()
    for segment in (
        "/restaurants/",
        "/restaurant/",
        "/dining/",
        "/food/",
        "/eatery/",
        "/cafe/",
        "/bar/",
    ):
        if segment in lowered:
            return True
    return False


def get_domain(url: str) -> str:
    """
    Extract domain from URL.

    Args:
        url: Absolute URL. Anything that is not a non-empty string is treated
            as having no domain.

    Returns:
        The lowercased network location (including any port), or "" when the
        URL is empty, not a string, has no netloc, or cannot be parsed.
    """
    # Guard first: urlparse() silently switches to bytes results for
    # None/bytes input, which would violate the declared str return type.
    if not url or not isinstance(url, str):
        return ""
    try:
        return urlparse(url).netloc.lower()
    except ValueError:
        # Raised by urlparse for malformed netlocs (e.g. unbalanced IPv6
        # brackets).
        return ""


def title_fails_filter(title: str) -> bool:
    """
    Decide whether a title should be rejected before any scraping happens.

    A missing/empty title always fails. Otherwise the title fails when its
    lowercased form contains any entry of TITLE_FAIL_PATTERNS.
    """
    if not title:
        return True
    lowered = title.lower()
    return any(pattern in lowered for pattern in TITLE_FAIL_PATTERNS)


def extract_real_url(google_url: str) -> str:
    """
    Extract the actual URL from various wrapper formats:
    1. Google Alerts: google.com/url?url=REAL_URL&... -> parse query param
    2. Google News RSS: news.google.com/rss/articles/CBMi... -> browser resolution
    3. Direct URLs: pass through unchanged

    Args:
        google_url: Link taken from a feed entry.

    Returns:
        The unwrapped target URL when one can be determined, otherwise the
        input unchanged (falsy input is returned as-is).
    """
    if not google_url:
        return google_url

    # Handle Google Alerts redirect URLs (simple regex extraction).
    if "google.com/url" in google_url:
        # Anchor on a query delimiter so only a parameter literally named
        # "url" is captured; an unanchored "url=" would also match the tail of
        # names such as "returnurl=". ";" is accepted as a delimiter so links
        # whose "&" is still HTML-escaped ("&amp;url=") keep working.
        match = re.search(r"[?&;]url=([^&]+)", google_url)
        if match:
            return unquote(match.group(1))

    # Handle Google News RSS article URLs (require browser/JS).
    if is_google_news_wrapper(google_url):
        # First try a plain HTTP redirect (sometimes works).
        try:
            response = requests.get(
                google_url,
                headers=HEADERS,
                allow_redirects=True,
                timeout=10,
            )
        except requests.RequestException:
            # Network-level failure: fall through to the browser resolver.
            response = None

        if response is not None and response.url and "news.google.com" not in response.url:
            return response.url

        # Fall back to Playwright browser resolution.
        if RESOLVER_AVAILABLE:
            resolved = resolve_google_news_url(google_url)
            if resolved and "news.google.com" not in resolved:
                return resolved

    # Either a direct URL or a wrapper that could not be resolved.
    return google_url


# =============================================================================
# BLOCK DETECTION
# =============================================================================
def is_blocked_response(response: requests.Response) -> bool:
    """
    Detect if response indicates blocking (paywall, bot wall, etc).

    Args:
        response: Response of a direct HTTP fetch.

    Returns:
        True when the status code is one of 401/403/429/451/503, or when any
        BLOCK_SIGNALS entry occurs in the lowercased first 5000 characters of
        the body; False otherwise, including when the body cannot be read.
    """
    if response.status_code in (401, 403, 429, 451, 503):
        return True

    try:
        # Slice before lowercasing so only the inspected prefix is processed
        # instead of the whole (possibly very large) document.
        head = response.text[:5000].lower()
    except Exception:
        # Deliberate best-effort: an unreadable body is treated as "no block
        # signal found". Unlike a bare except this lets KeyboardInterrupt and
        # SystemExit propagate.
        return False

    return any(signal in head for signal in BLOCK_SIGNALS)


def extract_text_from_html(html_content: str) -> str | None:
    """
    Extract clean text from HTML.

    Non-content elements (scripts, styles, navigation, forms, ...) are removed,
    then the text of the first matching article-like container is used. If no
    container matches, or the match has no text, the whole <body> is used.

    Args:
        html_content: Raw HTML markup.

    Returns:
        The extracted text with runs of 3+ newlines and 2+ spaces collapsed,
        or None when no text could be extracted or parsing failed.
    """
    try:
        soup = BeautifulSoup(html_content, 'html.parser')

        for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside', 'iframe', 'noscript', 'form']):
            tag.decompose()

        content = None
        # Ordered from most to least specific; only the first selector that
        # matches an element is considered.
        for selector in ('article', 'main', '.article-body', '.post-content', '.entry-content', '.story-body', '.article-content'):
            element = soup.select_one(selector)
            if element:
                content = element.get_text(separator='\n', strip=True)
                break

        if not content:
            body = soup.find('body')
            if body:
                content = body.get_text(separator='\n', strip=True)

        if not content:
            return None

        content = re.sub(r'\n{3,}', '\n\n', content)
        content = re.sub(r' {2,}', ' ', content)
        return content.strip()
    except Exception:
        # Best-effort extraction: any parser failure means "no content".
        # Catching Exception rather than using a bare except keeps Ctrl+C and
        # SystemExit working during long scraping runs.
        return None


# =============================================================================
# TIER 1: REQUESTS SCRAPE
# =============================================================================
def scrape_with_requests(url: str) -> tuple[str | None, bool]:
    """
    Scrape URL using requests library.

    Args:
        url: Article URL to fetch directly.

    Returns:
        (content, was_blocked):
        - content is the extracted text, or None if nothing was extracted.
        - was_blocked is True when the response looks like a bot wall/paywall
          or the request failed at the transport level (timeout, connection
          error, ...). It is False for ordinary HTTP errors such as 404 or
          500: those pages are missing or broken rather than blocked, so they
          should not be reported as blocks to the caller.
    """
    try:
        response = requests.get(
            url,
            headers=HEADERS,
            timeout=SCRAPE_TIMEOUT_SECONDS,
            allow_redirects=True,
        )

        if is_blocked_response(response):
            return None, True

        # Checked explicitly instead of raise_for_status(): the HTTPError it
        # raises is a RequestException and would be misreported as a block by
        # the handler below.
        if not response.ok:
            return None, False

        return extract_text_from_html(response.text), False

    except requests.RequestException:
        # Transport-level failure; treated as a block so a fallback can retry.
        return None, True
    except Exception:
        return None, False


# =============================================================================
# TIER 2: ZYTE SCRAPE
# =============================================================================
def scrape_with_zyte(url: str) -> str | None:
    """
    Fetch a page through the Zyte API (browser-rendered) and extract its text.

    Returns None without calling the API when no API key is configured, when
    the per-domain call cap has been reached, or when the domain is on the
    never-Zyte list. Also returns None on a non-200 API response, an empty
    payload, or any exception. Each API request that returns a response is
    counted in ZYTE_DOMAIN_CALLS.
    """
    if not ZYTE_API_KEY:
        return None

    domain = get_domain(url)
    calls_so_far = ZYTE_DOMAIN_CALLS.get(domain)
    if calls_so_far is not None and calls_so_far >= ZYTE_MAX_PER_DOMAIN:
        print(f"  [Zyte] Domain cap reached for {domain}")
        return None

    if is_never_zyte_domain(url):
        return None

    request_body = {
        "url": url,
        "browserHtml": True,
    }

    try:
        api_response = requests.post(
            ZYTE_API_URL,
            auth=(ZYTE_API_KEY, ""),
            json=request_body,
            timeout=60,
        )

        # Count the call as soon as the API has answered, whatever the status.
        ZYTE_DOMAIN_CALLS[domain] = ZYTE_DOMAIN_CALLS.get(domain, 0) + 1

        if api_response.status_code != 200:
            print(f"  [Zyte] Error: {api_response.status_code}")
            return None

        payload = api_response.json()
        page_html = payload.get("browserHtml", "")

        if not page_html:
            # Fall back to the base64-encoded raw body if the payload has one.
            encoded_body = payload.get("httpResponseBody", "")
            if encoded_body:
                page_html = base64.b64decode(encoded_body).decode("utf-8", errors="ignore")

        return extract_text_from_html(page_html) if page_html else None

    except Exception as exc:
        print(f"  [Zyte] Exception: {exc}")
        return None


# =============================================================================
# TIERED SCRAPER (requests -> Zyte fallback)
# =============================================================================
def scrape_article_content(url: str) -> str | None:
    """
    Two-tier scraper.

    Tier 1 is a direct fetch (free, fast). Tier 2 (Zyte, paid) is attempted
    only if tier 1 was blocked, returned text shorter than MIN_CONTENT_LENGTH,
    or the URL is on a known blocked domain - and only when an API key is
    configured. If Zyte does not produce long enough text, whatever tier 1
    returned (possibly short text or None) is handed back.

    Unresolved Google News wrapper URLs are skipped outright.
    """
    if is_google_news_wrapper(url):
        print("  Unresolved Google News URL")
        return None

    # Tier 1: direct request.
    content, was_blocked = scrape_with_requests(url)
    if content and len(content) >= MIN_CONTENT_LENGTH:
        return content

    # From here on, content is None, empty, or shorter than the minimum.
    # Work out why (if at all) the paid tier should be tried.
    if was_blocked:
        reason = "blocked"
    elif content:
        reason = "thin content"
    elif is_blocked_domain(url):
        reason = "known blocked domain"
    else:
        reason = None

    # Tier 2: Zyte.
    if reason is not None and ZYTE_API_KEY:
        print(f"  [Zyte] Retrying ({reason})...")

        zyte_content = scrape_with_zyte(url)
        if zyte_content and len(zyte_content) >= MIN_CONTENT_LENGTH:
            print(f"  [Zyte] Success: {len(zyte_content)} chars")
            return zyte_content

    return content


# =============================================================================
# FEED FETCHER
# =============================================================================
def _entry_pub_date(entry) -> str:
    """Return the entry's publication date as YYYY-MM-DD, or today's date if unavailable."""
    parsed = getattr(entry, "published_parsed", None)
    if parsed:
        try:
            return datetime(*parsed[:6]).strftime("%Y-%m-%d")
        except (TypeError, ValueError, OverflowError):
            # Malformed time tuple supplied by the feed: use today's date.
            pass
    return datetime.now().strftime("%Y-%m-%d")


def fetch_all_feeds(max_per_feed: int = 15) -> list[dict]:
    """
    Fetch articles from all configured RSS feeds.

    For every feed in RSS_FEEDS the entries are title-filtered, their links
    are unwrapped to the real article URL, restaurant URLs are dropped and
    URLs already seen in this call are skipped.

    Args:
        max_per_feed: Maximum number of accepted articles per feed.

    Returns:
        A list of dicts with keys "title", "url", "date" (YYYY-MM-DD),
        "feed_name" and "content" (always None here; filled in by the
        scraping step).
    """
    print("=" * 60)
    print("FLHIP Hotel Scraper v3 - RSS Fetch & Scrape")
    print("=" * 60)

    if ZYTE_API_KEY:
        print("[OK] Zyte API key configured (fallback enabled)")
    else:
        print("[WARN] ZYTE_API_KEY not set - no fallback for blocked sites")

    print(f"Processing {len(RSS_FEEDS)} feeds (max {max_per_feed} articles per feed)...")

    all_articles = []
    seen_urls = set()

    for feed_config in RSS_FEEDS:
        feed_name = feed_config.get("name", "Unknown")
        feed_url = feed_config.get("url", "")

        if not feed_url:
            continue

        print(f"\nFetching feed: {feed_name}")

        try:
            feed = feedparser.parse(feed_url)

            if not feed.entries:
                print("  No entries found")
                continue

            count = 0
            for entry in feed.entries:
                if count >= max_per_feed:
                    print(f"  Reached per-feed limit ({max_per_feed})")
                    break

                raw_title = entry.get("title", "")
                link = entry.get("link", "")

                if not link:
                    continue

                # Normalize title
                title = normalize_title(raw_title)

                # Early title filter (before any network work for this entry)
                if title_fails_filter(title):
                    print(f"  Title fail: {title[:50]}")
                    continue

                # Resolve URL
                is_gnews = is_google_news_wrapper(link)
                print(f"  Resolving URL {count + 1}...", end=" ")
                real_url = extract_real_url(link)

                try:
                    shown = urlparse(real_url).netloc[:40]
                except (ValueError, TypeError, AttributeError):
                    # Unparseable URL: show a prefix of the raw value instead.
                    shown = real_url[:50]
                print(f"-> {shown}")

                # Skip restaurant URLs
                if is_restaurant_url(real_url):
                    print("  Restaurant URL skipped")
                    continue

                # Dedupe on the resolved URL, across all feeds
                if real_url in seen_urls:
                    continue
                seen_urls.add(real_url)

                all_articles.append({
                    "title": title,
                    "url": real_url,
                    "date": _entry_pub_date(entry),
                    "feed_name": feed_name,
                    "content": None,
                })

                count += 1

                # Brief pause after accepting an entry that was a Google News
                # wrapper link.
                if is_gnews:
                    time.sleep(0.3)

            print(f"  Found {count} candidate articles")

        except Exception as e:
            # Per-feed boundary: one failing feed must not abort the run.
            print(f"  Feed error: {e}")
            continue

        time.sleep(0.5)

    print(f"\nTotal unique hotel articles: {len(all_articles)}")
    return all_articles


def scrape_articles(articles: list[dict]) -> list[dict]:
    """
    Run the tiered scraper over every article.

    Articles whose scraped text is at least 200 characters long get that text
    stored under "content" (the dict is updated in place) and are included in
    the returned list; the rest are left out. A Zyte usage summary is printed
    at the end if any Zyte calls were made.
    """
    scraped = []

    for position, article in enumerate(articles, start=1):
        url = article.get("url", "")
        print(f"Scraping {position}/{len(articles)}: {url[:60]}...")

        content = scrape_article_content(url)

        if not content or len(content) < 200:
            print("  No content")
        else:
            article["content"] = content
            scraped.append(article)
            print(f"  Got {len(content)} chars")

        time.sleep(REQUEST_DELAY_SECONDS)

    # Zyte usage summary: total, then the five busiest domains.
    if ZYTE_DOMAIN_CALLS:
        print(f"\n[Zyte] Total API calls this run: {sum(ZYTE_DOMAIN_CALLS.values())}")
        by_usage = sorted(ZYTE_DOMAIN_CALLS.items(), key=lambda item: item[1], reverse=True)
        for domain, count in by_usage[:5]:
            print(f"  {domain}: {count}")

    print(f"\nSuccessfully scraped: {len(scraped)}/{len(articles)}")
    return scraped


def fetch_and_scrape(max_per_feed: int = 15) -> list[dict]:
    """
    Main entry point.

    Collects candidate articles from the feeds, scrapes each one, and returns
    only the articles that ended up with non-empty "content".
    """
    candidates = fetch_all_feeds(max_per_feed=max_per_feed)
    scraped = scrape_articles(candidates)
    with_content = list(filter(lambda item: item.get("content"), scraped))
    print(f"\nArticles with content: {len(with_content)}")
    return with_content


# Compatibility alias used by hotel_main.py
def fetch_and_scrape_all() -> list[dict]:
    """Run fetch_and_scrape with the standard limit of 15 articles per feed."""
    per_feed_limit = 15
    return fetch_and_scrape(max_per_feed=per_feed_limit)


# =============================================================================
# TEST
# =============================================================================
if __name__ == "__main__":
    # Manual smoke test: small per-feed limit, then list the first few results.
    print("Testing hotel RSS fetcher...")
    results = fetch_and_scrape(max_per_feed=5)
    print(f"\nGot {len(results)} articles with content")
    for item in results[:5]:
        print(f"  - {item['title'][:50]}")
        print(f"    {item['url'][:60]}")
