#!/usr/bin/env python3
"""
Google News URL Resolver
Uses Playwright/Chromium to resolve Google News wrapper URLs to actual publisher URLs.

Google News RSS feeds return URLs like:
  https://news.google.com/rss/articles/CBMikwJBVV95cUxPSURxWnJ...

These URLs carry a base64-encoded protobuf payload, so they cannot be resolved by
following plain HTTP redirects. This module uses a headless browser to follow the
JS redirects to the real article.
"""

import re
from urllib.parse import urlparse

# Playwright is an optional dependency. If the import fails, the module still
# loads: PLAYWRIGHT_AVAILABLE is set to False and a one-time install hint is
# printed at import time. The resolver functions below check this flag before
# attempting to launch a browser.
try:
    from playwright.sync_api import sync_playwright
    PLAYWRIGHT_AVAILABLE = True
except ImportError:
    PLAYWRIGHT_AVAILABLE = False
    print("WARNING: Playwright not installed. Run: pip install playwright && playwright install chromium")


# Hosts (lower-case, without port) whose article URLs are Google News wrappers.
GOOGLE_NEWS_HOSTS = {"news.google.com"}


def is_google_news_wrapper(url: str) -> bool:
    """Check if URL is a Google News wrapper that needs browser resolution.

    Args:
        url: Candidate URL. Empty, None and non-string values are rejected.

    Returns:
        True when the URL's host is in GOOGLE_NEWS_HOSTS and its path contains
        "/articles/" (which also covers "/rss/articles/"); False otherwise,
        including for URLs that cannot be parsed.
    """
    if not url or not isinstance(url, str):
        return False
    try:
        parsed = urlparse(url)
        # hostname is lower-cased and excludes any port or userinfo, so
        # "NEWS.GOOGLE.COM" and "news.google.com:443" are matched as well.
        host = parsed.hostname
    except ValueError:
        # urlparse rejects malformed authorities (e.g. an unclosed IPv6 bracket).
        return False
    return host in GOOGLE_NEWS_HOSTS and "/articles/" in parsed.path


def resolve_google_news_url(url: str, timeout_ms: int = 20000) -> str | None:
    """
    Opens the Google News wrapper in headless Chromium and returns final destination URL.

    URLs that are not Google News wrappers need no browser and are returned
    unchanged, whether or not Playwright is installed.

    Args:
        url: Google News wrapper URL
        timeout_ms: Max time to wait for the initial navigation (default 20
            seconds). A fixed 1.5 second settle delay is added afterwards so
            the JS redirect can run.

    Returns:
        Publisher URL if resolution succeeds, None if it fails (Playwright
        missing, navigation error, or the page never left Google News).
    """
    # Pass-through first: a non-wrapper URL does not depend on Playwright.
    if not is_google_news_wrapper(url):
        return url

    if not PLAYWRIGHT_AVAILABLE:
        print("  Playwright not available, cannot resolve Google News URL")
        return None

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=[
                    "--no-sandbox",
                    "--disable-dev-shm-usage",
                    "--disable-gpu",
                    "--single-process",
                ],
            )
            # finally guarantees the browser is closed on every path,
            # including a failure in new_page() before navigation starts.
            try:
                page = browser.new_page()
                try:
                    page.goto(url, wait_until="domcontentloaded", timeout=timeout_ms)
                    # Give it time to perform JS redirects
                    page.wait_for_timeout(1500)
                    final_url = page.url
                except Exception as e:
                    print(f"  Browser navigation error: {e}")
                    return None
            finally:
                browser.close()

    except Exception as e:
        print(f"  Playwright error: {e}")
        return None

    # If it didn't move away from Google News, resolution failed. The substring
    # test is kept on purpose: it also rejects Google interstitials (e.g. consent
    # pages) that embed the original wrapper URL in their query string.
    if final_url == url or "news.google.com" in final_url:
        return None

    return final_url


def resolve_google_news_batch(urls: list[str], timeout_ms: int = 15000) -> dict[str, str | None]:
    """
    Resolve multiple Google News URLs efficiently using a single browser instance.

    Non-Google-News URLs are passed through unchanged (mapped to themselves),
    whether or not Playwright is installed. Duplicate input URLs are resolved
    only once.

    Args:
        urls: List of URLs; Google News wrappers are resolved, others pass through
        timeout_ms: Max time per URL for the initial navigation. A fixed 1.2
            second settle delay is added afterwards so the JS redirect can run.

    Returns:
        Dict mapping original URL -> resolved URL (or None if failed)
    """
    # De-duplicate while keeping first-seen order, and classify each URL once.
    unique_urls = list(dict.fromkeys(urls))
    gnews_urls = [u for u in unique_urls if is_google_news_wrapper(u)]
    wrapper_set = set(gnews_urls)

    # Non-Google-News URLs pass through unchanged
    results: dict[str, str | None] = {u: u for u in unique_urls if u not in wrapper_set}

    if not gnews_urls:
        return results

    if not PLAYWRIGHT_AVAILABLE:
        print("  Playwright not available")
        results.update(dict.fromkeys(gnews_urls))
        return results

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=[
                    "--no-sandbox",
                    "--disable-dev-shm-usage",
                    "--disable-gpu",
                ],
            )
            try:
                for url in gnews_urls:
                    final_url = None
                    try:
                        page = browser.new_page()
                        try:
                            page.goto(url, wait_until="domcontentloaded", timeout=timeout_ms)
                            page.wait_for_timeout(1200)
                            final_url = page.url
                        finally:
                            # Always release the page; otherwise every failed
                            # navigation leaves a tab open in the shared browser.
                            page.close()
                    except Exception as e:
                        # One bad URL must not abort the rest of the batch.
                        print(f"  Failed to resolve {url[:60]}...: {e}")
                        final_url = None

                    # Still on Google News (or no URL at all) means resolution failed.
                    if final_url and "news.google.com" not in final_url:
                        results[url] = final_url
                    else:
                        results[url] = None
            finally:
                browser.close()

    except Exception as e:
        print(f"  Batch resolution error: {e}")

    # Anything not reached (e.g. the browser failed to launch) is a failure;
    # results already recorded are left untouched.
    for url in gnews_urls:
        results.setdefault(url, None)

    return results


# Manual smoke test: try to resolve one sample wrapper URL and report the outcome.
if __name__ == "__main__":
    sample_url = "https://news.google.com/rss/articles/CBMikwJBVV95cUxPSURxWnJ"
    print(f"Testing resolution of: {sample_url[:60]}...")

    if not PLAYWRIGHT_AVAILABLE:
        print("Playwright not installed")
    else:
        resolved = resolve_google_news_url(sample_url)
        # A falsy result (None) means the resolver could not leave Google News.
        print(f"Resolved to: {resolved}" if resolved else "Resolution failed")
