#!/usr/bin/env python3
"""
FLHIP Scraper v3 - Zyte Retry
Retry blocked/paywalled URLs using Zyte proxy

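Usage:
    python zyte_retry.py              # Collect blocked-domain URLs from the configured RSS feeds and retry them with Zyte
    python zyte_retry.py --dry-run    # Scrape and process as usual, but print the resulting leads instead of saving them
    python zyte_retry.py --limit 25   # Retry at most 25 URLs (default: 100)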
"""

import requests
import base64
import json
import time
import argparse
from datetime import datetime

from config import RSS_FEEDS, BLOCKED_DOMAINS, TITLE_FAIL_PATTERNS
from claude_processor import process_article
from db_handler import insert_leads_batch, check_duplicate, get_connection
import feedparser
import re
import os

# Zyte API settings. The key comes from the environment; when the variable is
# unset it falls back to an empty string, which the rest of the script treats
# as "not configured".
ZYTE_API_URL = "https://api.zyte.com/v1/extract"
ZYTE_API_KEY = os.environ.get("ZYTE_API_KEY", "")


def extract_real_url(google_url: str) -> str:
    """Return the target URL wrapped inside a Google redirect link.

    Redirect links have the form
    ``https://www.google.com/url?rct=j&sa=t&url=<target>&ct=ga``.  The value
    of the ``url`` query parameter is percent-decoded and returned.  Input
    that is not a Google redirect, or that carries no non-empty ``url``
    parameter, is returned unchanged.

    Args:
        google_url: Link as found in the feed entry.

    Returns:
        The decoded target URL, or ``google_url`` itself when nothing could
        be extracted.
    """
    from urllib.parse import unquote

    if "google.com/url" not in google_url:
        return google_url

    # Walk the query string pair by pair so that only a parameter named
    # exactly ``url`` matches.  A plain substring search for "url=" would
    # also hit keys such as ``ourl=`` or ``source_url=`` that come first.
    _, _, query = google_url.partition("?")
    for pair in query.split("&"):
        key, _, value = pair.partition("=")
        # Tolerate links whose separators were left HTML-escaped ("&amp;").
        if key.removeprefix("amp;") == "url" and value:
            return unquote(value)
    return google_url


def is_blocked_domain(url: str) -> bool:
    """Report whether *url* points at a host listed in ``BLOCKED_DOMAINS``.

    The host part of the URL is lower-cased and every ``BLOCKED_DOMAINS``
    entry is tested as a substring of it.  An entry therefore also matches
    subdomains, and any other host that merely contains the entry.

    Args:
        url: URL to classify.

    Returns:
        ``True`` if any blocked entry occurs in the URL's host, ``False``
        otherwise, including when the URL cannot be parsed.
    """
    from urllib.parse import urlparse

    try:
        host = urlparse(url).netloc.lower()
        return any(blocked in host for blocked in BLOCKED_DOMAINS)
    except (ValueError, TypeError, AttributeError):
        # urlparse raises ValueError for malformed hosts (e.g. a broken IPv6
        # literal); TypeError/AttributeError cover non-string input.  Such
        # URLs count as "not blocked" so the scan keeps going, but unlike a
        # bare ``except`` this does not swallow KeyboardInterrupt/SystemExit.
        return False


def title_fails_early(title: str) -> bool:
    """Decide whether an article can be discarded from its title alone.

    A missing or empty title always fails.  Otherwise the title is
    lower-cased and fails as soon as any regex in ``TITLE_FAIL_PATTERNS``
    matches somewhere in it.
    """
    if title:
        lowered = title.lower()
        return any(re.search(rx, lowered) for rx in TITLE_FAIL_PATTERNS)
    return True


def _entry_pub_date(entry) -> str:
    """Return the entry's publication date as ``YYYY-MM-DD``, else today's date."""
    parsed = getattr(entry, "published_parsed", None)
    if parsed:
        try:
            return datetime(*parsed[:6]).strftime("%Y-%m-%d")
        except (TypeError, ValueError, OverflowError):
            # Malformed time tuple in the feed: fall back to today's date.
            pass
    return datetime.now().strftime("%Y-%m-%d")


def fetch_blocked_urls() -> list[dict]:
    """Collect feed entries whose target URL is on a blocked domain.

    Every feed in ``RSS_FEEDS`` that has a ``url`` is parsed with
    ``feedparser``.  An entry is kept when, in this order:

    1. its link (after unwrapping Google redirects) is on a blocked domain,
    2. its title does not match an early-fail pattern, and
    3. its URL has not already been collected during this call.

    A feed that raises while being processed is reported on stdout and
    skipped; the remaining feeds are still scanned.

    Returns:
        A list of dicts with the keys ``title``, ``url``, ``date``
        (``YYYY-MM-DD``) and ``feed_name``, in feed order.
    """
    blocked_articles = []
    seen_urls = set()

    print("Scanning RSS feeds for blocked URLs...")

    for feed_config in RSS_FEEDS:
        feed_name = feed_config.get("name", "Unknown")
        feed_url = feed_config.get("url", "")

        if not feed_url:
            continue

        try:
            feed = feedparser.parse(feed_url)

            for entry in feed.entries:
                title = entry.get("title", "")
                real_url = extract_real_url(entry.get("link", ""))

                # Only URLs on blocked domains are of interest here.
                if not is_blocked_domain(real_url):
                    continue

                # Skip entries whose title already rules them out.
                if title_fails_early(title):
                    continue

                # The same article can appear in several feeds.
                if real_url in seen_urls:
                    continue
                seen_urls.add(real_url)

                blocked_articles.append({
                    "title": title,
                    "url": real_url,
                    "date": _entry_pub_date(entry),
                    "feed_name": feed_name,
                })

        except Exception as e:
            # Per-feed boundary: one broken feed must not abort the whole scan.
            print(f"  Feed error {feed_name}: {e}")
            continue

        # Short pause between feeds.
        time.sleep(0.5)

    print(f"Found {len(blocked_articles)} blocked URLs to retry")
    return blocked_articles


def _extract_article_text(html: str) -> str | None:
    """Reduce an HTML document to the plain text of its main content.

    Non-content tags are removed first.  The text of the first matching
    content container is used; if that yields nothing, the text of
    ``<body>`` is used instead.  Returns ``None`` when no text is found.
    """
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'html.parser')

    # Drop scripts, styling and page chrome before extracting text.
    for junk in soup(['script', 'style', 'nav', 'footer', 'header', 'aside', 'iframe', 'noscript']):
        junk.decompose()

    # Try the usual article containers in order; the first hit wins.
    text = None
    for selector in ('article', 'main', '.article-body', '.post-content', '.entry-content', '.story-body'):
        node = soup.select_one(selector)
        if node:
            text = node.get_text(separator='\n', strip=True)
            break

    if not text:
        page_body = soup.find('body')
        if page_body:
            text = page_body.get_text(separator='\n', strip=True)

    if not text:
        return None

    # Collapse runs of blank lines and repeated spaces.
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()


def scrape_with_zyte(url: str) -> str | None:
    """Fetch *url* through the Zyte extract API and return its article text.

    The request asks Zyte for the raw HTTP response body, which arrives
    base64-encoded.  It is decoded as UTF-8 (undecodable bytes are dropped)
    and reduced to plain text.

    Args:
        url: Page to fetch.

    Returns:
        The extracted text, or ``None`` when the API key is missing, Zyte
        answers with a non-200 status, the body is empty, no text can be
        extracted, or any exception occurs along the way.
    """
    if not ZYTE_API_KEY:
        print("  ZYTE_API_KEY not set!")
        return None

    payload = {
        "url": url,
        "httpResponseBody": True,
        "httpResponseHeaders": True,
    }

    try:
        response = requests.post(
            ZYTE_API_URL,
            auth=(ZYTE_API_KEY, ""),
            json=payload,
            timeout=60,
        )
        if response.status_code != 200:
            print(f"  Zyte error: {response.status_code}")
            return None

        encoded_body = response.json().get("httpResponseBody", "")
        if not encoded_body:
            return None

        html = base64.b64decode(encoded_body).decode("utf-8", errors="ignore")
        return _extract_article_text(html)

    except Exception as e:
        # Per-URL boundary: a failure here only costs this one URL.
        print(f"  Zyte exception: {e}")
        return None


def main():
    """Command-line entry point: retry blocked feed URLs through Zyte.

    Steps:
        1. Parse ``--dry-run`` and ``--limit``; stop if no Zyte key is set.
        2. Collect blocked-domain articles from the RSS feeds.
        3. Drop those ``check_duplicate`` reports as already known.
        4. Scrape up to ``--limit`` of the rest with Zyte and pass each page
           with at least 200 characters of text to ``process_article``;
           truthy results are kept as leads.
        5. Print the leads (dry run) or store them with
           ``insert_leads_batch`` and print its statistics.
    """
    parser = argparse.ArgumentParser(description="Retry blocked URLs with Zyte")
    parser.add_argument("--dry-run", action="store_true", help="Don't save to DB")
    parser.add_argument("--limit", type=int, default=100, help="Max URLs to retry")
    args = parser.parse_args()

    # A negative limit would turn the slice below into "drop the last N".
    if args.limit < 0:
        parser.error("--limit must be zero or greater")

    if not ZYTE_API_KEY:
        print("ERROR: ZYTE_API_KEY not set in environment")
        return

    print("=" * 60)
    print("FLHIP Scraper v3 - Zyte Retry for Blocked URLs")
    print("=" * 60)

    # Get blocked URLs
    blocked = fetch_blocked_urls()

    if not blocked:
        print("No blocked URLs to retry")
        return

    # Check which ones we already have.  The connection is closed even if a
    # duplicate check raises.
    conn = get_connection()
    try:
        new_blocked = [
            article for article in blocked
            if not check_duplicate(conn, article["url"])
        ]
    finally:
        conn.close()

    print(f"After dedup: {len(new_blocked)} new URLs to try")

    if len(new_blocked) > args.limit:
        print(f"Limiting to {args.limit}")
        new_blocked = new_blocked[:args.limit]

    # Process with Zyte
    leads = []
    total = len(new_blocked)
    for position, article in enumerate(new_blocked, start=1):
        print(f"\n[{position}/{total}] {article['url'][:60]}...")

        content = scrape_with_zyte(article["url"])

        # Very short pages are treated as having no usable content.
        if not content or len(content) < 200:
            print("  No content")
            continue

        print(f"  Got {len(content)} chars via Zyte")

        # Process through Claude
        result = process_article(
            title=article["title"],
            content=content,
            source_url=article["url"],
            article_date=article["date"],
        )

        if result:
            result["feed_name"] = article.get("feed_name")
            leads.append(result)
            print(f"  PASS: {result.get('business_name')} in {result.get('city')}, {result.get('state')}")
        else:
            print("  FAIL (Claude rejected)")

        time.sleep(1)  # Rate limit

    # Save results
    print("\n" + "=" * 60)
    print(f"Results: {len(leads)} leads from {len(new_blocked)} URLs")
    print("=" * 60)

    if not leads:
        return

    if args.dry_run:
        print("\nDRY RUN - Would insert:")
        for lead in leads:
            print(f"  - {lead.get('business_name')} in {lead.get('city')}, {lead.get('state')}")
    else:
        stats = insert_leads_batch(leads)
        print(f"\nInserted: {stats['inserted']}")
        print(f"Duplicates: {stats['duplicates']}")
        print(f"Errors: {stats['errors']}")


# Run the retry job only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
