"""
FLHIP Scraper v3 - RSS Fetcher
Pulls articles from RSS feeds and scrapes content

Updated: 2026-01-04
- Added Google News URL resolution via Playwright
- Added tiered scraping: requests first, Zyte fallback
- Added block detection for intelligent Zyte routing
"""

import os
import feedparser
import requests
import base64
from bs4 import BeautifulSoup
from urllib.parse import urlparse, unquote
import re
import time
from datetime import datetime
from config import (
    RSS_FEEDS, 
    BLOCKED_DOMAINS, 
    TITLE_FAIL_PATTERNS,
    SCRAPE_TIMEOUT_SECONDS,
    REQUEST_DELAY_SECONDS
)

# Import Google News resolver (optional - works without it for Google Alerts).
# NOTE: a successful import prints a status line at import time; a failed
# import is silent and installs the no-op stubs below instead.
try:
    from google_news_resolver import is_google_news_wrapper, resolve_google_news_url
    RESOLVER_AVAILABLE = True
    print("[OK] Google News resolver loaded")
except ImportError:
    RESOLVER_AVAILABLE = False
    # Stubs keep the call sites unconditional: no URL is ever reported as a
    # Google News wrapper, and resolution always yields None.
    def is_google_news_wrapper(url): return False
    def resolve_google_news_url(url): return None

# =============================================================================
# ZYTE CONFIG
# =============================================================================
# The API key is read from the environment once, at import time. An empty key
# disables the Zyte tier: scrape_with_zyte() returns None without sending a
# request.
ZYTE_API_KEY = os.getenv("ZYTE_API_KEY", "")
ZYTE_API_URL = "https://api.zyte.com/v1/extract"

# Track Zyte usage per domain to avoid bill shock.
# Maps the value returned by get_domain() -> number of Zyte requests sent for
# it. Module-level state, so counts accumulate for the life of the process.
ZYTE_DOMAIN_CALLS = {}
ZYTE_MAX_PER_DOMAIN = 10  # Max Zyte calls per domain per run

# Block detection signals.
# is_blocked_response() searches for each entry as a substring of the
# lowercased start of the response body, so every entry must be lowercase.
# NOTE(review): broad entries such as "cloudflare" or "enable javascript" may
# also match ordinary pages that merely mention them - confirm this is the
# intended trade-off.
BLOCK_SIGNALS = [
    "captcha",
    "cloudflare",
    "attention required",
    "verify you are human",
    "verify you are a human",
    "enable javascript",
    "access denied",
    "incident id",
    "subscribe to continue",
    "sign in to continue",
    "create an account to continue",
    "please enable cookies",
    "unusual traffic",
    "too many requests",
    "rate limit",
]

# Domains that are never worth Zyte calls (always fail).
# Consulted through is_never_zyte_domain() inside scrape_with_zyte() before
# any Zyte request is sent. Entries are bare, lowercase domain names.
NEVER_ZYTE_DOMAINS = [
    "facebook.com",
    "instagram.com",
    "twitter.com",
    "x.com",
    "youtube.com",
    "tiktok.com",
    "linkedin.com",
]

# =============================================================================
# HTTP HEADERS
# =============================================================================
# Browser-like request headers sent with every direct `requests.get` in this
# module (Google News redirect probing and tier-1 scraping). They are not
# passed to the Zyte API call.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
}


# =============================================================================
# URL UTILITIES
# =============================================================================
def is_blocked_domain(url: str) -> bool:
    """Check whether a URL belongs to a domain listed in ``BLOCKED_DOMAINS``.

    Each configured entry is tested as a substring of the URL's lowercased
    network location (host plus any port or credentials).

    Args:
        url: Absolute URL to test.

    Returns:
        True if any ``BLOCKED_DOMAINS`` entry occurs in the network location;
        False otherwise, including when the URL cannot be parsed.
    """
    try:
        netloc = urlparse(url).netloc.lower()
        return any(blocked in netloc for blocked in BLOCKED_DOMAINS)
    except (ValueError, AttributeError, TypeError):
        # Malformed or non-string input is treated as "not blocked" instead of
        # aborting the run. The catch is narrow so Ctrl-C / SystemExit still
        # propagate (a bare ``except`` would swallow them).
        return False


def is_never_zyte_domain(url: str) -> bool:
    """Check if domain should never be sent to Zyte."""
    try:
        domain = urlparse(url).netloc.lower()
        for never in NEVER_ZYTE_DOMAINS:
            if never in domain:
                return True
        return False
    except:
        return False


def get_domain(url: str) -> str:
    """Extract the lowercased network location from a URL.

    The value is ``urlparse(url).netloc`` lowercased, so it still includes any
    port or credentials present in the URL.

    Args:
        url: URL to parse.

    Returns:
        The lowercased network location, or an empty string when the URL has
        none or cannot be parsed.
    """
    try:
        return urlparse(url).netloc.lower()
    except (ValueError, AttributeError, TypeError):
        # Narrow catch (instead of a bare ``except``) so KeyboardInterrupt and
        # SystemExit are not swallowed.
        return ""


def title_fails_early(title: str) -> bool:
    """Decide whether an entry can be rejected on its title alone.

    A missing or empty title is always rejected. Otherwise the title is
    lowercased and rejected if any regex in ``TITLE_FAIL_PATTERNS`` is found
    anywhere in it.

    Args:
        title: Feed entry title; may be empty or None.

    Returns:
        True if the entry should be dropped, False if it may proceed.
    """
    if not title:
        return True

    lowered = title.lower()
    return any(re.search(pattern, lowered) for pattern in TITLE_FAIL_PATTERNS)


def extract_real_url(google_url: str) -> str:
    """
    Extract the actual URL from various wrapper formats:
    1. Google Alerts: google.com/url?url=REAL_URL&... -> parse query param
    2. Google News RSS: news.google.com/rss/articles/CBMi... -> browser resolution
    3. Direct URLs: pass through unchanged
    """
    if not google_url:
        return google_url
    
    # Handle Google Alerts redirect URLs (simple regex extraction)
    if "google.com/url" in google_url:
        match = re.search(r'url=([^&]+)', google_url)
        if match:
            return unquote(match.group(1))
    
    # Handle Google News RSS article URLs (require browser/JS)
    if is_google_news_wrapper(google_url):
        # First try quick HTTP redirect (sometimes works)
        try:
            response = requests.get(
                google_url,
                headers=HEADERS,
                allow_redirects=True,
                timeout=10
            )
            if response.url and "news.google.com" not in response.url:
                return response.url
        except:
            pass
        
        # Fall back to Playwright browser resolution
        if RESOLVER_AVAILABLE:
            resolved = resolve_google_news_url(google_url)
            if resolved and "news.google.com" not in resolved:
                return resolved
        
        # Resolution failed - return original
        return google_url
    
    return google_url


# =============================================================================
# BLOCK DETECTION
# =============================================================================
def is_blocked_response(response: requests.Response) -> bool:
    """Detect whether a response indicates blocking (paywall, bot wall, etc).

    A response counts as blocked when its status code is 401, 403, 429, 451
    or 503, or when any ``BLOCK_SIGNALS`` entry appears in the lowercased
    first 5,000 characters of its body.

    Args:
        response: Response returned by ``requests``.

    Returns:
        True if the response looks blocked, False otherwise (including when
        the body cannot be decoded).
    """
    # Check status code
    if response.status_code in (401, 403, 429, 451, 503):
        return True

    # Check content for block signals. Slice before lowercasing so only the
    # inspected prefix is processed, not the whole page.
    try:
        head = response.text[:5000].lower()
    except Exception:
        # An undecodable body gives no evidence of blocking.
        return False

    return any(signal in head for signal in BLOCK_SIGNALS)


def extract_text_from_html(html: str) -> str | None:
    """Extract clean article text from an HTML document.

    Non-content elements (scripts, styles, navigation, forms, ...) are removed
    first. The text of the first matching article-like container is used; if
    none matches or it is empty, the whole ``<body>`` is used instead.

    Args:
        html: Raw HTML markup.

    Returns:
        Newline-separated text with runs of blank lines and spaces collapsed,
        or None when no text could be extracted or parsing failed.
    """
    try:
        soup = BeautifulSoup(html, 'html.parser')

        # Remove junk elements
        for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside', 'iframe', 'noscript', 'form']):
            tag.decompose()

        # Try to find main content; selectors are ordered most-specific-first
        # and the first one present in the page wins.
        content = None
        for selector in ['article', 'main', '.article-body', '.post-content', '.entry-content', '.story-body', '[itemprop="articleBody"]']:
            element = soup.select_one(selector)
            if element:
                content = element.get_text(separator='\n', strip=True)
                break

        # Fallback to body (also used when the matched container was empty)
        if not content:
            body = soup.find('body')
            if body:
                content = body.get_text(separator='\n', strip=True)

        if not content:
            return None

        content = re.sub(r'\n{3,}', '\n\n', content)
        content = re.sub(r' {2,}', ' ', content)
        return content.strip()
    except Exception:
        # Parsing is best-effort, but a bare ``except`` would also swallow
        # KeyboardInterrupt/SystemExit; ``Exception`` does not.
        return None


# =============================================================================
# TIER 1: REQUESTS SCRAPE
# =============================================================================
def scrape_with_requests(url: str) -> tuple[str | None, bool]:
    """Tier 1: fetch a page directly with ``requests`` and extract its text.

    Args:
        url: Article URL to fetch.

    Returns:
        A ``(content, was_blocked)`` pair. ``content`` is the extracted text
        or None. ``was_blocked`` is True when the response looked blocked or
        any ``requests`` error occurred - this includes connection errors,
        timeouts and the HTTP errors raised by ``raise_for_status()``. It is
        False on success and on any other unexpected exception.
    """
    try:
        page = requests.get(
            url,
            headers=HEADERS,
            timeout=SCRAPE_TIMEOUT_SECONDS,
            allow_redirects=True,
        )

        # Block detection runs before raise_for_status so that block pages
        # are reported without relying on an exception.
        if is_blocked_response(page):
            return None, True

        page.raise_for_status()
        return extract_text_from_html(page.text), False
    except requests.RequestException:
        # Connection errors, timeouts, HTTP errors, etc
        return None, True
    except Exception:
        return None, False


# =============================================================================
# TIER 2: ZYTE SCRAPE
# =============================================================================
def scrape_with_zyte(url: str) -> str | None:
    """Tier 2: fetch a page through the Zyte API and extract its text.

    No request is sent when the API key is unset, when the per-domain cap has
    been reached, or when the domain is on the never-Zyte list. Every request
    that returns (whatever its status) is counted against the domain in
    ``ZYTE_DOMAIN_CALLS``.

    Args:
        url: Article URL to fetch.

    Returns:
        Extracted text, or None when the call was skipped, failed, or
        returned no HTML.
    """
    if not ZYTE_API_KEY:
        return None

    # Check domain cap
    domain = get_domain(url)
    calls_so_far = ZYTE_DOMAIN_CALLS.get(domain)
    if calls_so_far is not None and calls_so_far >= ZYTE_MAX_PER_DOMAIN:
        print(f"  [Zyte] Domain cap reached for {domain}")
        return None

    # Skip domains that never work
    if is_never_zyte_domain(url):
        return None

    try:
        api_response = requests.post(
            ZYTE_API_URL,
            auth=(ZYTE_API_KEY, ""),
            json={
                "url": url,
                "browserHtml": True,  # Use browser rendering
            },
            timeout=60,
        )

        # Track usage
        ZYTE_DOMAIN_CALLS[domain] = ZYTE_DOMAIN_CALLS.get(domain, 0) + 1

        if api_response.status_code != 200:
            print(f"  [Zyte] Error: {api_response.status_code}")
            return None

        payload = api_response.json()

        # Prefer browser-rendered HTML; otherwise decode the base64-encoded
        # raw response body if the API returned one.
        rendered = payload.get("browserHtml", "")
        if not rendered:
            encoded_body = payload.get("httpResponseBody", "")
            if encoded_body:
                rendered = base64.b64decode(encoded_body).decode("utf-8", errors="ignore")

        if not rendered:
            return None

        return extract_text_from_html(rendered)
    except Exception as exc:
        print(f"  [Zyte] Exception: {exc}")
        return None


# =============================================================================
# TIERED SCRAPER (requests -> Zyte fallback)
# =============================================================================
def scrape_article_content(url: str) -> str | None:
    """Scrape an article with the tiered strategy.

    1. Try ``requests`` (free, fast).
    2. If that was blocked, returned thin content, or returned nothing for a
       known blocked domain, retry through Zyte (paid) when a key is set.

    Args:
        url: Resolved article URL.

    Returns:
        Text of at least 600 characters from either tier when available.
        Otherwise whatever tier 1 produced, which may be thin or None. None is
        also returned for an unresolved Google News wrapper URL.
    """
    MIN_CONTENT_LENGTH = 600

    # Skip if still a Google News wrapper (resolution failed)
    if is_google_news_wrapper(url):
        print("  Unresolved Google News URL")
        return None

    # Tier 1: Try requests
    content, was_blocked = scrape_with_requests(url)
    if content and len(content) >= MIN_CONTENT_LENGTH:
        return content

    # Tier 2: decide whether (and why) to escalate. Past the early return
    # above, any non-empty content is necessarily below the minimum length.
    if was_blocked:
        reason = "blocked"
    elif content:
        reason = "thin content"
    elif is_blocked_domain(url):
        reason = "known blocked domain"
    else:
        reason = None

    if reason and ZYTE_API_KEY:
        print(f"  [Zyte] Retrying ({reason})...")

        zyte_content = scrape_with_zyte(url)
        if zyte_content and len(zyte_content) >= MIN_CONTENT_LENGTH:
            print(f"  [Zyte] Success: {len(zyte_content)} chars")
            return zyte_content

    # Return whatever we have (might be thin or None)
    return content


# =============================================================================
# FEED FETCHER
# =============================================================================
def _entry_pub_date(entry) -> str:
    """Return an entry's publication date as ``YYYY-MM-DD`` (today if unknown)."""
    parsed = getattr(entry, "published_parsed", None)
    if parsed:
        try:
            return datetime(*parsed[:6]).strftime("%Y-%m-%d")
        except (TypeError, ValueError, OverflowError):
            # Unusable date fields: fall through to today's date.
            pass
    return datetime.now().strftime("%Y-%m-%d")


def fetch_feed(feed_config: dict) -> list[dict]:
    """Fetch and parse a single RSS feed into candidate article records.

    Entries are dropped when their title matches an early-fail pattern or
    when they carry no link. Remaining links are unwrapped with
    ``extract_real_url``; article content is not fetched here.

    Args:
        feed_config: Mapping with optional ``"name"`` and ``"url"`` keys.

    Returns:
        A list of dicts with keys ``title``, ``url``, ``date``
        (``YYYY-MM-DD``), ``feed_name``, ``feed_url`` and ``content`` (always
        None at this stage). Empty when the config has no URL or the feed
        could not be processed.
    """
    feed_name = feed_config.get("name", "Unknown")
    feed_url = feed_config.get("url", "")

    if not feed_url:
        return []

    print(f"Fetching feed: {feed_name}")

    try:
        feed = feedparser.parse(feed_url)

        # feedparser reports fetch/parse problems via ``bozo`` instead of
        # raising, so surface them when nothing usable came back.
        if getattr(feed, "bozo", False) and not feed.entries:
            print(f"  Feed error: {feed.get('bozo_exception')}")

        articles = []
        for entry in feed.entries:
            title = entry.get("title", "")
            link = entry.get("link", "")

            # Early title check (before URL resolution)
            if title_fails_early(title):
                print(f"  Title fail: {title[:50]}")
                continue

            # An entry without a link cannot be scraped; keeping it would
            # later trigger a failed request and a pointless Zyte retry.
            if not link:
                print(f"  No link: {title[:50]}")
                continue

            articles.append({
                "title": title,
                # Extract real URL (handles Google Alerts AND Google News)
                "url": extract_real_url(link),
                "date": _entry_pub_date(entry),
                "feed_name": feed_name,
                "feed_url": feed_url,
                "content": None,
            })

        print(f"  Found {len(articles)} candidate articles")
        return articles

    except Exception as e:
        print(f"  Feed error: {e}")
        return []


def fetch_all_feeds() -> list[dict]:
    """Collect candidate articles from every feed in ``RSS_FEEDS``.

    Articles are de-duplicated by resolved URL, both across feeds and within
    a single feed; the first occurrence wins. The function pauses one second
    after each feed.

    Returns:
        The unique article records, in the order first seen.
    """
    unique_articles = []
    known_urls = set()

    for config in RSS_FEEDS:
        for candidate in fetch_feed(config):
            link = candidate["url"]
            # Dedupe by URL
            if link in known_urls:
                continue
            known_urls.add(link)
            unique_articles.append(candidate)

        time.sleep(1)  # Be nice to feed servers

    print(f"\nTotal unique articles: {len(unique_articles)}")
    return unique_articles


def scrape_articles(articles: list[dict]) -> list[dict]:
    """Scrape content for each article using the tiered approach.

    Each article dict that yields at least 200 characters of text is updated
    in place (its ``"content"`` key is set) and included in the result. The
    function sleeps ``REQUEST_DELAY_SECONDS`` after every article and prints a
    summary of Zyte usage (top five domains) at the end.

    Args:
        articles: Article records with at least a ``"url"`` key.

    Returns:
        The subset of ``articles`` (same dict objects) that now have content.
    """
    scraped = []
    total = len(articles)

    for position, article in enumerate(articles, start=1):
        url = article["url"]
        print(f"Scraping {position}/{total}: {url[:60]}...")

        content = scrape_article_content(url)

        # 200 is the floor for keeping an article; the scraper itself may
        # return thinner-than-ideal text as a last resort.
        if content and len(content) >= 200:
            article["content"] = content
            scraped.append(article)
            print(f"  Got {len(content)} chars")
        else:
            print("  No content")

        time.sleep(REQUEST_DELAY_SECONDS)

    # Print Zyte usage stats
    if ZYTE_DOMAIN_CALLS:
        total_zyte = sum(ZYTE_DOMAIN_CALLS.values())
        print(f"\n[Zyte] Total API calls this run: {total_zyte}")
        busiest = sorted(ZYTE_DOMAIN_CALLS.items(), key=lambda item: -item[1])[:5]
        for domain, count in busiest:
            print(f"  {domain}: {count}")

    print(f"\nSuccessfully scraped: {len(scraped)}/{total}")
    return scraped


def fetch_and_scrape_all() -> list[dict]:
    """Run the full pipeline: fetch all feeds, then scrape each article.

    Prints a banner and reports whether the Zyte fallback is available before
    starting.

    Returns:
        Article records that ended up with scraped content.
    """
    rule = "=" * 60
    print(rule)
    print("FLHIP Scraper v3 - RSS Fetch & Scrape")
    print(rule)

    if not ZYTE_API_KEY:
        print("[WARN] ZYTE_API_KEY not set - no fallback for blocked sites")
    else:
        print(f"[OK] Zyte API key configured (fallback enabled)")

    # Fetch from all feeds, then scrape content for each candidate
    candidates = fetch_all_feeds()
    return scrape_articles(candidates)


# =============================================================================
# TEST
# =============================================================================
if __name__ == "__main__":
    # Smoke test: fetch every configured feed (no content scraping)
    preview = fetch_all_feeds()
    print(f"\nFound {len(preview)} articles")

    # Show the first five candidates
    for item in preview[:5]:
        print(f"- {item['title'][:60]}")
        print(f"  {item['url'][:60]}")
        print(f"  Feed: {item['feed_name']}")
