#!/usr/bin/env python3
"""
FLHIP Scraper 5b - Texas DSHS Food Establishment Permits
Uses Zyte API - confirmed working on state.tx.us (200 OK in playground)

KEY FIX: Zyte is stateless per request. DSHS requires JSESSIONID session cookie
to be captured from the warmup GET and forwarded in all subsequent POSTs via
customHttpRequestHeaders Cookie header.

.env location: ~/flhip/.env (parent of scraper5)

Usage:
    python3 tx_dshs_scraper.py
    python3 tx_dshs_scraper.py --counties 101,57,220

Cron (Saturday 5am):
    0 5 * * 6 cd ~/flhip/scraper5 && python3 tx_dshs_scraper.py >> logs/tx_$(date +%%Y-%%m-%%d).log 2>&1
"""

import os, sys, time, argparse, re, urllib.parse
from base64 import b64decode, b64encode
import requests, mysql.connector
from bs4 import BeautifulSoup
from datetime import datetime
from dotenv import load_dotenv
from pathlib import Path
from gov_leads_archive import archive_gov_leads

load_dotenv(Path(__file__).resolve().parent.parent / '.env')

DB_CONFIG = {
    "host":     os.getenv("DB_HOST", "localhost"),
    "user":     os.getenv("DB_USER"),
    "password": os.getenv("DB_PASSWORD"),
    "database": os.getenv("DB_NAME", "restaurant_openings"),
    "charset":  "utf8mb4",
}

ZYTE_API_URL = "https://api.zyte.com/v1/extract"
ZYTE_API_KEY = os.getenv("ZYTE_API_KEY", "")

STATE         = "TX"
REQUEST_DELAY = 2.0
MAX_PAGES     = 500

BASE        = "https://vo.ras.dshs.state.tx.us/datamart"
WARMUP_URL  = f"{BASE}/selSearchTypeTXRAS.do"
SEARCH_URL  = f"{BASE}/searchByCountyTXRAS.do"
LIST_URL    = f"{BASE}/listTXRAS.do"
BOARD_CODE  = "1523"
LICENSE_TYPE = "2301"

COUNTIES = {
    "1":"ANDERSON","2":"ANDREWS","3":"ANGELINA","4":"ARANSAS","5":"ARCHER",
    "6":"ARMSTRONG","7":"ATASCOSA","8":"AUSTIN","9":"BAILEY","10":"BANDERA",
    "11":"BASTROP","12":"BAYLOR","13":"BEE","14":"BELL","15":"BEXAR",
    "16":"BLANCO","17":"BORDEN","18":"BOSQUE","19":"BOWIE","20":"BRAZORIA",
    "21":"BRAZOS","22":"BREWSTER","23":"BRISCOE","24":"BROOKS","25":"BROWN",
    "26":"BURLESON","27":"BURNET","28":"CALDWELL","29":"CALHOUN","30":"CALLAHAN",
    "31":"CAMERON","32":"CAMP","33":"CARSON","34":"CASS","35":"CASTRO",
    "36":"CHAMBERS","37":"CHEROKEE","38":"CHILDRESS","39":"CLAY","40":"COCHRAN",
    "41":"COKE","42":"COLEMAN","43":"COLLIN","44":"COLLINGSWORTH","45":"COLORADO",
    "46":"COMAL","47":"COMANCHE","48":"CONCHO","49":"COOKE","50":"CORYELL",
    "51":"COTTLE","52":"CRANE","53":"CROCKETT","54":"CROSBY","55":"CULBERSON",
    "56":"DALLAM","57":"DALLAS","58":"DAWSON","59":"DEAF SMITH","60":"DELTA",
    "61":"DENTON","62":"DICKENS","63":"DIMMIT","64":"DONLEY","65":"DUVAL",
    "66":"EASTLAND","67":"ECTOR","68":"EDWARDS","69":"EL PASO","70":"ELLIS",
    "71":"ERATH","72":"FALLS","73":"FANNIN","74":"FAYETTE","75":"FISHER",
    "76":"FLOYD","77":"FOARD","78":"FORT BEND","79":"FRANKLIN","80":"FREESTONE",
    "81":"FRIO","82":"GAINES","83":"GALVESTON","84":"GARZA","85":"GILLESPIE",
    "86":"GLASSCOCK","87":"GOLIAD","88":"GONZALES","89":"GRAY","90":"GRAYSON",
    "91":"GREGG","92":"GRIMES","93":"GUADALUPE","94":"HALE","95":"HALL",
    "96":"HAMILTON","97":"HANSFORD","98":"HARDEMAN","99":"HARDIN","100":"HARDIN",
    "101":"HARRIS","102":"HARRISON","103":"HARTLEY","104":"HASKELL","105":"HAYS",
    "106":"HEMPHILL","107":"HENDERSON","108":"HIDALGO","109":"HILL","110":"HOCKLEY",
    "111":"HOOD","112":"HOPKINS","113":"HOUSTON","114":"HOWARD","115":"HUDSPETH",
    "116":"HUNT","117":"HUTCHINSON","118":"IRION","119":"JACK","120":"JACKSON",
    "121":"JASPER","122":"JEFF DAVIS","123":"JEFFERSON","124":"JIM HOGG",
    "125":"JIM WELLS","126":"JOHNSON","127":"JONES","128":"KARNES","129":"KAUFMAN",
    "130":"KENDALL","131":"KENEDY","132":"KENT","133":"KERR","134":"KIMBLE",
    "135":"KING","136":"KINNEY","137":"KLEBERG","138":"KNOX","139":"LAMAR",
    "140":"LAMB","141":"LAMPASAS","142":"LA SALLE","143":"LAVACA","144":"LEE",
    "145":"LEON","146":"LIBERTY","147":"LIMESTONE","148":"LIPSCOMB","149":"LIVE OAK",
    "150":"LLANO","151":"LOVING","152":"LUBBOCK","153":"LYNN","154":"MCCULLOCH",
    "155":"MCLENNAN","156":"MCMULLEN","157":"MADISON","158":"MARION","159":"MARTIN",
    "160":"MASON","161":"MATAGORDA","162":"MAVERICK","163":"MEDINA","164":"MENARD",
    "165":"MIDLAND","166":"MILAM","167":"MILLS","168":"MITCHELL","169":"MONTAGUE",
    "170":"MONTGOMERY","171":"MOORE","172":"MORRIS","173":"MOTLEY",
    "174":"NACOGDOCHES","175":"NAVARRO","176":"NEWTON","177":"NOLAN","178":"NUECES",
    "179":"OCHILTREE","180":"OLDHAM","181":"ORANGE","182":"PALO PINTO",
    "183":"PANOLA","184":"PARKER","185":"PARMER","186":"PECOS","187":"POLK",
    "188":"POTTER","189":"PRESIDIO","190":"RAINS","191":"RANDALL","192":"REAGAN",
    "193":"REAL","194":"RED RIVER","195":"REEVES","196":"REFUGIO","197":"ROBERTS",
    "198":"ROBERTSON","199":"ROCKWALL","200":"RUNNELS","201":"RUSK","202":"SABINE",
    "203":"SAN AUGUSTINE","204":"SAN JACINTO","205":"SAN PATRICIO","206":"SAN SABA",
    "207":"SCHLEICHER","208":"SCURRY","209":"SHACKELFORD","210":"SHELBY",
    "211":"SHERMAN","212":"SMITH","213":"SOMERVELL","214":"STARR","215":"STEPHENS",
    "216":"STERLING","217":"STONEWALL","218":"SUTTON","219":"SWISHER",
    "220":"TARRANT","221":"TAYLOR","222":"TERRELL","223":"TERRY",
    "224":"THROCKMORTON","225":"TITUS","226":"TOM GREEN","227":"TRAVIS",
    "228":"TRINITY","229":"TYLER","230":"UPSHUR","231":"UPTON","232":"UVALDE",
    "233":"VAL VERDE","234":"VAN ZANDT","235":"VICTORIA","236":"WALKER",
    "237":"WALLER","238":"WARD","239":"WASHINGTON","240":"WEBB","241":"WHARTON",
    "242":"WHEELER","243":"WICHITA","244":"WILBARGER","245":"WILLACY",
    "246":"WILLIAMSON","247":"WILSON","248":"WINKLER","249":"WISE","250":"WOOD",
    "251":"YOAKUM","252":"YOUNG","253":"ZAPATA","254":"ZAVALA",
}

PRIORITY = ["101","57","220","61","43","105","227","108","69","83",
            "70","93","15","31","126","90","178","14","20","246"]


def zyte_request(url, method="GET", form_data=None, cookie_str=None, referer=None, timeout=90):
    """
    Make a request via Zyte API.
    Returns (html, set_cookie_list) tuple.
    Captures Set-Cookie headers so session can be maintained across requests.
    """
    payload = {
        "url":                 url,
        "httpResponseBody":    True,
        "httpResponseHeaders": True,  # needed to capture Set-Cookie
        "followRedirect":      True,
    }

    if method == "POST" and form_data:
        encoded = urllib.parse.urlencode(form_data)
        payload["httpRequestMethod"] = "POST"
        payload["httpRequestBody"]   = b64encode(encoded.encode()).decode()

    # Build custom headers
    headers = [{"name": "Content-Type", "value": "application/x-www-form-urlencoded"}] \
              if method == "POST" else []
    if referer:
        headers.append({"name": "Referer", "value": referer})
    if cookie_str:
        headers.append({"name": "Cookie", "value": cookie_str})
    if headers:
        payload["customHttpRequestHeaders"] = headers

    resp = requests.post(ZYTE_API_URL, auth=(ZYTE_API_KEY, ""), json=payload, timeout=timeout)
    resp.raise_for_status()
    data = resp.json()

    html = b64decode(data.get("httpResponseBody", "")).decode("latin-1")
    resp_headers = data.get("httpResponseHeaders", [])
    set_cookies = [h["value"] for h in resp_headers if h["name"].lower() == "set-cookie"]
    return html, set_cookies


def build_cookie_str(cookie_list):
    """Extract name=value pairs from Set-Cookie headers."""
    return "; ".join(c.split(";")[0] for c in cookie_list if c)


def is_expired(html):
    return "login.do" in html[:500] or html[:300].strip().startswith("Access is denied")


def search_county(county_code):
    """
    Step 1: GET warmup to get JSESSIONID
    Step 2: POST search with that cookie
    Returns (anchor, soup, cookie_str) or (None, None, None)
    """
    # Warmup GET — capture session cookie
    try:
        html1, cookies1 = zyte_request(WARMUP_URL, timeout=60)
    except requests.RequestException as e:
        print(f"    Warmup error: {e}")
        return None, None, None

    cookie_str = build_cookie_str(cookies1)

    # POST county search — carry session cookie, use Referer
    try:
        html2, cookies2 = zyte_request(
            SEARCH_URL,
            method="POST",
            form_data={
                "countyAgencyKey": county_code,
                "licenseType":     LICENSE_TYPE,
                "boardCode":       BOARD_CODE,
            },
            cookie_str=cookie_str,
            referer=WARMUP_URL,
            timeout=90,
        )
    except requests.RequestException as e:
        print(f"    Search POST error: {e}")
        return None, None, None

    # Merge any new cookies from search response
    all_cookies = build_cookie_str(cookies1 + cookies2)

    if is_expired(html2):
        print("    Expired/denied")
        return None, None, None

    soup = BeautifulSoup(html2, "html.parser")
    anchor_el = soup.find("input", {"name": "anchor"})

    if not anchor_el:
        # Try regex fallback
        m = re.search(r'name="anchor"\s+value="([^"]+)"', html2)
        if m:
            anchor = m.group(1)
        else:
            print(f"    No anchor — response preview:\n    {html2[:600]}")
            return None, None, None
    else:
        anchor = anchor_el["value"]

    return anchor, soup, all_cookies


def parse_page(soup):
    records = []
    for table in soup.find_all("table"):
        headers = [th.get_text(strip=True) for th in table.find_all("th")]
        if "Name" in headers and "Status" in headers:
            for row in table.find_all("tr")[1:]:
                cells = [td.get_text(strip=True) for td in row.find_all("td")]
                if len(cells) >= 4:
                    records.append({
                        "name":        cells[0],
                        "license_num": cells[1],
                        "status":      cells[3],
                        "expiry":      cells[4] if len(cells) > 4 else "",
                    })
            break
    return records


def has_next_page(soup):
    return bool(soup.find("input", {"name": "nextPage", "value": "Next Page"}))


def scrape_county(code, name):
    print(f"  [{code}] {name}...", end="", flush=True)

    anchor, soup, cookie_str = search_county(code)
    if not anchor:
        print(" FAILED")
        return []

    records = parse_page(soup)
    page = 1

    while has_next_page(soup):
        time.sleep(REQUEST_DELAY)
        page += 1
        if page > MAX_PAGES:
            break
        try:
            html, _ = zyte_request(
                LIST_URL,
                method="POST",
                form_data={
                    "anchor":         anchor,
                    "nextPage":       "Next Page",
                    "param_rsdIndex": "null",
                },
                cookie_str=cookie_str,
                referer=SEARCH_URL,
                timeout=30,
            )
        except requests.RequestException as e:
            print(f" err p{page}: {e}", end="")
            break

        if is_expired(html):
            print(f" expired p{page}", end="")
            break

        soup = BeautifulSoup(html, "html.parser")
        chunk = parse_page(soup)
        if not chunk:
            break
        records.extend(chunk)

    print(f" {len(records)} records ({page}p)")
    return records


def parse_expiry(val):
    if not val:
        return None
    for fmt in ("%m/%d/%Y", "%Y-%m-%d"):
        try:
            return datetime.strptime(val.strip(), fmt).strftime("%Y-%m-%d %H:%M:%S")
        except ValueError:
            continue
    return None


def is_duplicate(cursor, name):
    cursor.execute(
        "SELECT id FROM restaurant_leads_gov_usa WHERE business=%s AND state=%s LIMIT 1",
        (name, STATE)
    )
    return cursor.fetchone() is not None


def ingest(records, county_name, conn, cursor):
    inserted = 0
    county = county_name.title()
    for r in records:
        name = r["name"].strip()
        if not name:
            continue
        if any(x in r["status"].lower() for x in ("expired", "null and void", "inactive")):
            continue
        if is_duplicate(cursor, name):
            continue
        notes = (f"Gov Lead - TX DSHS Food Establishment | "
                 f"License: {r['license_num']} | Status: {r['status']}")
        try:
            cursor.execute("""
                INSERT INTO restaurant_leads_gov_usa
                    (county, region, business, address, city, state, source_state, zip,
                     phone, contact_name, contact_email,
                     info_source, internet_info,
                     notes, gov_status, plan_review_type, facility_type,
                     license_number, application_number,
                     lat, lng, article_date, review_date, sent_to_wp)
                VALUES (%s,NULL,%s,NULL,NULL,%s,%s,NULL,
                        NULL,NULL,NULL,
                        NULL,NULL,
                        %s,%s,NULL,NULL,
                        %s,NULL,
                        NULL,NULL,%s,NULL,0)
            """, (county, name, STATE, STATE,
                  notes, r["status"],
                  r["license_num"],
                  parse_expiry(r.get("expiry"))))
            inserted += 1
        except mysql.connector.Error as e:
            print(f"    DB err ({name}): {e}")
    conn.commit()
    return inserted


def run(county_filter=None):
    print("=" * 60)
    print("FLHIP Scraper 5b - Texas DSHS Food Establishments")
    print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    print("=" * 60)

    missing = [k for k, v in {"DB_USER": DB_CONFIG["user"],
               "DB_PASSWORD": DB_CONFIG["password"],
               "ZYTE_API_KEY": ZYTE_API_KEY}.items() if not v]
    if missing:
        print(f"ERROR: Missing from ~/flhip/.env: {', '.join(missing)}")
        sys.exit(1)

    print(f"Proxy: Zyte API")

    try:
        conn = mysql.connector.connect(**DB_CONFIG)
        cursor = conn.cursor()
        print(f"DB: connected as {DB_CONFIG['user']}@{DB_CONFIG['host']}\n")
    except mysql.connector.Error as e:
        print(f"DB ERROR: {e}")
        sys.exit(1)

    # Archive last week before loading new data
    # Only archive on full runs, not single-county tests
    if not county_filter:
        archive_gov_leads(conn, cursor)

    if county_filter:
        ordered = [(c, COUNTIES.get(c, f"County {c}")) for c in county_filter]
    else:
        seen, ordered = set(), []
        for c in PRIORITY:
            if c in COUNTIES:
                ordered.append((c, COUNTIES[c]))
                seen.add(c)
        for c, n in COUNTIES.items():
            if c not in seen:
                ordered.append((c, n))

    total_records = total_inserted = 0
    failed = []

    for i, (code, name) in enumerate(ordered):
        try:
            records = scrape_county(code, name)
            total_records += len(records)
            if records:
                total_inserted += ingest(records, name, conn, cursor)
        except Exception as e:
            print(f"  [{code}] FAILED: {e}")
            failed.append(code)
        time.sleep(REQUEST_DELAY)
        if (i + 1) % 25 == 0:
            print(f"\n  --- {i+1}/{len(ordered)} counties | {total_inserted} new leads ---\n")

    cursor.close()
    conn.close()

    print(f"\n{'=' * 60}")
    print("COMPLETE")
    print(f"  Counties:     {len(ordered) - len(failed)}/{len(ordered)}")
    print(f"  Records seen: {total_records}")
    print(f"  New leads:    {total_inserted}")
    if failed:
        print(f"  Failed:       {failed}")
    print(f"{'=' * 60}\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--counties", default=None)
    args = parser.parse_args()
    cf = [c.strip() for c in args.counties.split(",")] if args.counties else None
    run(county_filter=cf)