"""
FLHIP Scraper v3 - Hotel Claude Processor
Single intelligent prompt that qualifies AND extracts hotel leads in one call
"""

import anthropic
import json
import re
from datetime import datetime, timedelta
from hotel_config import ANTHROPIC_API_KEY, CLAUDE_MODEL, CLAUDE_MAX_TOKENS, CLAUDE_TEMPERATURE, US_STATES


# Shared Anthropic API client, built once at import time from the key in
# hotel_config; process_article() sends every request through it.
client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)

# Date shown to the model as "today". NOTE: evaluated once, when the module is
# imported, so a long-running process keeps reporting the import-day date.
CURRENT_DATE = datetime.now().strftime("%Y-%m-%d")


# Prompt template for the single qualify-and-extract call.
#
# This is an f-string only so that CURRENT_DATE is baked in. Every other brace
# is doubled, which leaves the JSON examples intact and leaves the literal
# placeholders {title}, {source}, {article_date} and {content} in the final
# string; those four are filled in per article by process_article().
#
# The AUTOMATIC FAIL header points at the FAIL response defined under
# RESPONSE FORMAT (it previously said "return null", which contradicted that
# section and invited a bare `null` reply that is not a JSON object).
EXTRACTION_PROMPT = f"""You are a lead qualification system for FLHIP, a hospitality industry database. Your job is to identify FUTURE hotel openings in the United States.

TODAY'S DATE: {CURRENT_DATE}

TASK: Analyze the article and determine if it contains a legitimate future hotel opening lead.

=== AUTOMATIC FAIL (return the FAIL response) ===
- Hotel has ALREADY OPENED (past tense: "opened", "launched", "debuted", "welcomed guests")
- Article is a "best of" list, retrospective, or year-in-review
- Article is about closures, fires, accidents, or negative events
- Article is a hotel review or "where to stay" guide
- Article is about a non-US location
- Article is primarily about restaurants (not hotels)
- Article is corporate/financial news without a specific location opening
- Article mentions an opening date that is IN THE PAST
- Content is blocked/paywalled (captcha, "verify you are human", etc.)

=== PASS CRITERIA (ALL must be true) ===
1. Article describes a hotel that WILL open in the future
2. Location is in the United States
3. At least one of these future signals:
   - Specific future opening date mentioned
   - "Coming soon", "set to open", "will open", "plans to open"
   - Construction/buildout underway
   - Hiring for new hotel
   - Permit/license approved for new hotel
   - Groundbreaking ceremony
   - Development approved

=== HOTEL TYPES TO INCLUDE ===
- Traditional hotels (Marriott, Hilton, Hyatt, IHG, etc.)
- Boutique hotels
- Resorts
- Extended stay hotels
- Casino hotels
- Conference/convention hotels
- Airport hotels

=== EXTRACTION RULES ===
- Business name: The actual hotel name, NOT the publisher, NOT generic descriptions
- If multiple hotels mentioned, extract the PRIMARY one (the main subject)
- Opening date: Use YYYY-MM-DD format. If "Spring 2026" use "2026-04-15". If "early 2026" use "2026-02-28". If "late 2026" use "2026-11-15". If unknown, use null.
- State: Use 2-letter code (TX, CA, NY, etc.)
- Phone: Format as found, or null
- Contact: Owner/manager/developer name if mentioned, or null

=== RESPONSE FORMAT ===
Return ONLY valid JSON. No explanation. No markdown.

If FAIL:
{{"status": "FAIL", "reason": "brief reason"}}

If PASS:
{{
  "status": "PASS",
  "confidence": "high" | "medium" | "low",
  "business_name": "Hotel Name",
  "address": "123 Main St" | null,
  "city": "City Name",
  "state": "TX",
  "zip": "12345" | null,
  "opening_date": "YYYY-MM-DD" | null,
  "phone": "555-123-4567" | null,
  "contact_name": "John Smith" | null,
  "contact_email": "email@example.com" | null,
  "notes": "Brief description of the hotel and opening details (rooms, amenities, brand)",
  "future_signals": ["signal1", "signal2"]
}}

=== ARTICLE TO ANALYZE ===
TITLE: {{title}}
SOURCE: {{source}}
ARTICLE DATE: {{article_date}}

CONTENT:
{{content}}
"""


def process_article(title: str, content: str, source_url: str, article_date: str) -> dict | None:
    """
    Process a single article through Claude.
    Returns extracted data dict if PASS, None if FAIL.
    """
    
    # Quick pre-checks before API call
    if not content or len(content.strip()) < 100:
        return None
    
    # Check for bot walls in content
    bot_wall_patterns = [
        "verifying you are human",
        "unusual traffic",
        "enable javascript",
        "captcha",
        "access denied",
        "subscribe to continue",
        "create an account"
    ]
    content_lower = content.lower()
    for pattern in bot_wall_patterns:
        if pattern in content_lower:
            return None
    
    # Build the prompt - use single braces for replacement
    prompt = EXTRACTION_PROMPT.replace("{title}", title or "")
    prompt = prompt.replace("{source}", source_url or "")
    prompt = prompt.replace("{article_date}", article_date or "")
    prompt = prompt.replace("{content}", content[:8000])
    
    try:
        response = client.messages.create(
            model=CLAUDE_MODEL,
            max_tokens=CLAUDE_MAX_TOKENS,
            temperature=CLAUDE_TEMPERATURE,
            messages=[
                {"role": "user", "content": prompt}
            ]
        )
        
        # Extract response text
        response_text = response.content[0].text.strip()
        
        # Parse JSON response
        # Handle potential markdown code blocks
        if response_text.startswith("```"):
            response_text = re.sub(r"```json?\n?", "", response_text)
            response_text = response_text.replace("```", "").strip()
        
        result = json.loads(response_text)
        
        # Check status
        if result.get("status") != "PASS":
            return None
        
        # Validate required fields
        if not result.get("business_name"):
            return None
        if not result.get("city"):
            return None
        if not result.get("state"):
            return None
        
        # Validate state is US
        state = result.get("state", "").upper()
        if state not in US_STATES:
            return None
        
        # Validate opening date is in the future (if provided)
        opening_date = result.get("opening_date")
        if opening_date:
            try:
                open_dt = datetime.strptime(opening_date, "%Y-%m-%d")
                if open_dt < datetime.now():
                    return None  # Past date = FAIL
            except ValueError:
                result["opening_date"] = None  # Invalid format, clear it
        
        # Clean up the result
        result["state"] = state
        result["source_url"] = source_url
        result["article_title"] = title
        result["article_date"] = article_date
        result["processed_at"] = datetime.now().isoformat()
        
        return result
        
    except json.JSONDecodeError as e:
        print(f"JSON parse error: {e}")
        return None
    except anthropic.APIError as e:
        print(f"Claude API error: {e}")
        return None
    except Exception as e:
        print(f"Unexpected error: {e}")
        return None


def batch_process_articles(articles: list[dict]) -> list[dict]:
    """
    Run every article through process_article() and collect the leads.

    Args:
        articles: list of {"title": str, "content": str, "url": str,
            "date": str, "feed_name": str}. Missing keys default to "".

    Returns:
        The leads for the articles that passed, in input order, each with the
        originating ``feed_name`` attached. Progress is printed per article.
    """
    leads = []
    total = len(articles)

    for position, article in enumerate(articles, start=1):
        # `or ""` rather than a .get() default: a key that is present but
        # None would otherwise reach the slice below and raise TypeError.
        title = article.get("title") or ""
        print(f"Processing {position}/{total}: {title[:60]}...")

        result = process_article(
            title=title,
            content=article.get("content", ""),
            source_url=article.get("url", ""),
            article_date=article.get("date", "")
        )

        if not result:
            print("  FAIL")
            continue

        # Pass through feed_name for tracking
        result["feed_name"] = article.get("feed_name", "")
        leads.append(result)
        print(f"  PASS: {result.get('business_name')} in {result.get('city')}, {result.get('state')}")

    return leads


# =============================================================================
# TEST
# =============================================================================
if __name__ == "__main__":
    # Manual smoke test: sends one synthetic article through the live API.
    # The years are derived from today's date so the sample always describes
    # a FUTURE opening; hard-coded years would make this start reporting FAIL
    # (past opening date) once that date has gone by.
    opening_year = datetime.now().year + 1
    test_article = {
        "title": f"New Marriott Hotel Coming to Downtown Austin in Spring {opening_year}",
        "content": f"""
        AUSTIN, TX - Marriott International is bringing a new 250-room hotel to
        downtown Austin. The Marriott Austin Downtown, located at 500 Congress Avenue,
        is set to open in April {opening_year}.

        The 15-story property will feature a rooftop bar, full-service restaurant,
        fitness center, and 10,000 square feet of meeting space. The hotel will
        be managed by Lark Hospitality.

        "Austin's growth makes it a perfect market for this property," said
        John Smith, VP of Development. Construction broke ground last month.

        For more information, contact development@marriott.com or call 512-555-0123.
        """,
        "url": "https://example.com/austin-hotel-news/marriott",
        "date": datetime.now().strftime("%Y-%m-%d"),
        "feed_name": "hotel opening soon"
    }

    print("Testing Hotel Claude processor...")
    print("=" * 60)

    result = process_article(
        title=test_article["title"],
        content=test_article["content"],
        source_url=test_article["url"],
        article_date=test_article["date"]
    )

    if result:
        print("\nExtracted Lead:")
        print(json.dumps(result, indent=2))
    else:
        print("\nNo lead extracted (FAIL)")
