"""
DEWA PR Tracker — Scraper
  - Press Releases: Playwright (Chromium) — iterates all years via year filter
  - Podcasts:       Direct RSS feed parse from media.rss.com (no browser needed)

Usage:
  python scraper.py              # normal run
  python scraper.py --debug      # visible browser + saves debug HTML/screenshots
  python scraper.py --visible    # visible browser only
"""
import argparse
import re
import xml.etree.ElementTree as ET
from datetime import datetime
from email.utils import parsedate_to_datetime
from pathlib import Path
from typing import Optional
from urllib.request import urlopen, Request

from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout

import database as db
from config import (
    DEWA_NEWS_URL, DEWA_BASE_URL,
    SCRAPER_HEADLESS, SCRAPER_TIMEOUT_MS, SCRAPER_WAIT_AFTER_LOAD_MS,
)

RSS_FEED_URL = "https://media.rss.com/dewa-news/feed.xml"

# Years available on the DEWA news page
SCRAPE_YEARS = ["2019", "2020", "2021", "2022", "2023", "2024", "2025", "2026"]


# ── Utilities ─────────────────────────────────────────────────────────────────

def normalise_url(href: str) -> str:
    """Resolve a raw href into an absolute URL, prefixing site-relative paths."""
    if not href:
        return ""
    cleaned = href.strip()
    if cleaned.startswith("http"):
        return cleaned
    # Site-relative path → prepend the DEWA origin; anything else passes through.
    return DEWA_BASE_URL + cleaned if cleaned.startswith("/") else cleaned


def parse_dewa_date(raw: str) -> Optional[str]:
    """Parse a DEWA date string such as '30 March 2026' into ISO 'YYYY-MM-DD'.

    Tries the full month name first, then the abbreviated form. If neither
    matches, falls back to extracting a bare 4-digit year and pinning it to
    January 1st so date sorting stays roughly correct.

    Returns:
        ISO date string, or None when *raw* is empty or unparseable.
    """
    if not raw:
        return None
    raw = raw.strip()
    # Known DEWA formats, in order of likelihood.
    for fmt in ("%d %B %Y", "%d %b %Y"):
        try:
            return datetime.strptime(raw, fmt).strftime("%Y-%m-%d")
        except ValueError:
            continue
    # Last resort: salvage just the year.
    m = re.search(r"(\d{4})", raw)
    return f"{m.group(1)}-01-01" if m else None


def seconds_to_mmss(secs) -> Optional[str]:
    """Format a duration in seconds as 'M:SS'.

    Accepts ints or numeric strings. Values that cannot be parsed as an
    integer (e.g. an already-formatted 'HH:MM:SS' duration string) are
    returned unchanged; empty/None input yields None.
    """
    try:
        minutes, seconds = divmod(int(secs), 60)
        return f"{minutes}:{seconds:02d}"
    except (TypeError, ValueError):
        # Pass through pre-formatted strings; falsy input → None.
        return str(secs) if secs else None


# ── Press Release scraper (Playwright) ───────────────────────────────────────

def extract_items_from_page(page) -> list:
    """Extract every '.m9-teaser--news_all' card from the page's current DOM.

    Returns a list of dicts with title/url/date/description/category keys.
    Cards missing a title or link are skipped; per-card errors are logged
    and do not abort the scrape.
    """
    results = []
    for card in page.query_selector_all(".m9-teaser--news_all"):
        try:
            # The whole card is wrapped in <a class="teaser__link">; fall
            # back to any anchor if that class is absent.
            anchor = card.query_selector("a.teaser__link") or card.query_selector("a")
            url = normalise_url(anchor.get_attribute("href") if anchor else "")

            heading = (
                card.query_selector("h4.teaser__title")
                or card.query_selector("h2, h3, h4, h5, .title")
            )
            title = heading.inner_text().strip() if heading else ""

            if not (title and url):
                continue

            # Date rendered as e.g. "30 March 2026".
            date_node = card.query_selector("h6.teaser__date")
            iso_date = parse_dewa_date(date_node.inner_text().strip() if date_node else "")

            # The teaser copy keeps its full text in the data-vl attribute.
            copy_node = card.query_selector("p.teaser__copy")
            summary = None
            if copy_node:
                summary = (
                    copy_node.get_attribute("data-vl")
                    or copy_node.inner_text().strip()
                    or None
                )

            results.append({
                "title": title,
                "url": url,
                "date": iso_date,
                "description": summary,
                "category": None,
            })
        except Exception as e:
            print(f"  [warn] Skipping item: {e}")
    return results


def scrape_press_releases(page, debug: bool = False) -> list:
    """Scrape DEWA press releases across every year in SCRAPE_YEARS.

    Drives the news page's month/year filter <select> elements, then clicks
    the "More" button repeatedly to load each year's full listing before
    extracting the teaser cards. Items whose URL was already seen in an
    earlier year are dropped.

    Args:
        page:  Playwright page object (browser already launched).
        debug: When True, saves a full-page screenshot and the raw HTML of
               the news page before filtering, for offline inspection.

    Returns:
        List of press-release dicts (see extract_items_from_page).
    """
    print(f"[scraper] → News page: {DEWA_NEWS_URL}")
    page.goto(DEWA_NEWS_URL, timeout=SCRAPER_TIMEOUT_MS, wait_until="networkidle")
    # Extra settling time for client-side rendering after network idle.
    page.wait_for_timeout(SCRAPER_WAIT_AFTER_LOAD_MS)

    if debug:
        page.screenshot(path="debug_news.png", full_page=True)
        Path("debug_news.html").write_text(page.content(), encoding="utf-8")
        print("[debug] Saved debug_news.png + debug_news.html")

    all_results = []
    seen_urls   = set()  # de-duplicates items that appear under multiple years

    month_select = page.query_selector("select[name='month']")
    year_select  = page.query_selector("select[name='year']")

    if not year_select:
        # Page layout changed or filter missing — degrade to a single-page scrape.
        print("[scraper] WARNING: year selector not found — extracting current page only")
        return extract_items_from_page(page)

    for year in SCRAPE_YEARS:
        print(f"  [scraper] Year {year}:", end=" ", flush=True)
        try:
            # IMPORTANT: reset month first (fires its own API call), then select
            # the year — doing it in the other order leaves a stale month filter.
            if month_select:
                month_select.select_option(value="")
                page.wait_for_timeout(2000)
            year_select.select_option(value=year)
            page.wait_for_timeout(2500)

            # Click "More" until it disappears — loads all pages for this year.
            pages_loaded = 1
            while True:
                more_btn = page.query_selector("#loadmore")
                if more_btn and more_btn.is_visible():
                    try:
                        more_btn.scroll_into_view_if_needed()
                        more_btn.click()
                        page.wait_for_timeout(2500)
                        pages_loaded += 1
                    except Exception:
                        # Button went stale mid-click — treat as "no more pages".
                        break
                else:
                    break

            # The DOM now holds every card for this year (plus leftovers from
            # previous iterations) — keep only URLs we haven't recorded yet.
            items     = extract_items_from_page(page)
            new_items = [i for i in items if i["url"] not in seen_urls]
            for i in new_items:
                seen_urls.add(i["url"])
            all_results.extend(new_items)
            print(f"{len(new_items)} items ({pages_loaded} page(s) loaded)")

        except Exception as e:
            # A failed year should not abort the remaining years.
            print(f"error: {e}")
            continue

    print(f"[scraper] Total press releases extracted: {len(all_results)}")
    return all_results


# ── Podcast scraper (RSS feed) ────────────────────────────────────────────────

def scrape_podcasts_from_rss() -> list:
    """Fetch and parse the DEWA podcast RSS feed (no browser needed).

    Downloads the feed XML and extracts one dict per <item>: title, episode
    page url, mp3 audio url, ISO date, 'M:SS' duration, plain-text
    description, and a best-effort language tag ("ar" when the title
    contains Arabic script, otherwise "en").

    Returns:
        List of podcast dicts; empty list on any fetch or parse failure
        (a feed outage must not abort the whole scrape run).
    """
    print(f"[scraper] → Fetching RSS feed: {RSS_FEED_URL}")
    try:
        req = Request(
            RSS_FEED_URL,
            headers={"User-Agent": "Mozilla/5.0 (compatible; DEWATracker/1.0)"}
        )
        with urlopen(req, timeout=30) as resp:
            raw_xml = resp.read()
    except Exception as e:
        print(f"[scraper] RSS fetch error: {e}")
        return []

    # Namespaces declared by the feed (only "itunes" is queried below).
    ns = {
        "itunes":  "http://www.itunes.com/dtds/podcast-1.0.dtd",
        "podcast": "https://podcastindex.org/namespace/1.0",
        "content": "http://purl.org/rss/1.0/modules/content/",
    }

    try:
        root = ET.fromstring(raw_xml)
    except ET.ParseError as e:
        print(f"[scraper] XML parse error: {e}")
        return []

    channel = root.find("channel")
    # BUGFIX: compare against None explicitly — an ElementTree Element with
    # no children is falsy, so `if not channel` could wrongly reject a match.
    if channel is None:
        return []

    results = []
    for item in channel.findall("item"):
        try:
            title = (item.findtext("title") or "").strip()
            link  = (item.findtext("link")  or "").strip()

            if not title or not link:
                continue

            # Date — RFC 822 format e.g. "Wed, 01 Apr 2026 08:58:31 GMT"
            pub_date_raw = item.findtext("pubDate") or ""
            date = None
            if pub_date_raw:
                try:
                    date = parsedate_to_datetime(pub_date_raw).strftime("%Y-%m-%d")
                except Exception:
                    pass  # malformed date → leave as None rather than drop item

            # MP3 URL from <enclosure url="...">
            enclosure = item.find("enclosure")
            audio_url = enclosure.get("url") if enclosure is not None else None

            # Duration (seconds → M:SS; already-formatted values pass through)
            dur_raw  = item.findtext("itunes:duration", namespaces=ns) or ""
            duration = seconds_to_mmss(dur_raw) if dur_raw else None

            # Description — strip any embedded HTML tags.
            desc_raw = item.findtext("description") or ""
            desc_raw = re.sub(r"<[^>]+>", "", desc_raw).strip()
            description = desc_raw or None

            # Language: Arabic if the title contains any Arabic Unicode block.
            is_arabic = bool(re.search(r'[\u0600-\u06FF\u0750-\u077F\uFB50-\uFDFF\uFE70-\uFEFF]', title))
            language  = "ar" if is_arabic else "en"

            results.append({
                "title":       title,
                "url":         link,
                "audio_url":   audio_url,
                "date":        date,
                "duration":    duration,
                "description": description,
                "language":    language,
            })
        except Exception as e:
            print(f"  [warn] Skipping podcast item: {e}")
            continue

    print(f"[scraper] Total podcast episodes from RSS: {len(results)}")
    return results


# ── Match podcasts → press releases by title ──────────────────────────────────

def normalise_title(t: str) -> str:
    """Normalise a title for fuzzy matching: lowercase, punctuation squashed
    to spaces, whitespace collapsed to single spaces."""
    cleaned = re.sub(r"[^\w\s]", " ", t.lower())
    return " ".join(cleaned.split())


def match_podcasts_to_prs(podcasts: list, pr_id_map: dict) -> list:
    """Link podcast episodes to press releases by fuzzy title overlap.

    Args:
        podcasts:  Podcast dicts (each must contain a "title" key).
        pr_id_map: Mapping of PR url → (pr_id, normalised_title).

    For each podcast, the PR whose normalised title shares the largest
    fraction of its words with the podcast title wins; the link is kept only
    when at least 50% of the PR title's words appear in the podcast title.
    Ties keep the first PR in the map's insertion order.

    Returns:
        A new list of podcast dicts, each with an added "pr_id" key
        (the matched PR id, or None when nothing clears the threshold).
    """
    # Only (pr_id, normalised_title) pairs matter; the map's URL keys do not.
    pr_titles = list(pr_id_map.values())

    matched = []
    for pod in podcasts:
        pod_words = set(normalise_title(pod["title"]).split())

        best_score = 0.0
        best_pid   = None
        for pid, pr_norm in pr_titles:
            pr_words = set(pr_norm.split())
            if not pr_words:
                continue
            # Fraction of the PR title's words found in the podcast title.
            overlap = len(pod_words & pr_words) / len(pr_words)
            if overlap > best_score:  # strict '>' keeps the first PR on ties
                best_score = overlap
                best_pid   = pid

        pr_id = best_pid if best_score >= 0.5 else None
        matched.append({**pod, "pr_id": pr_id})

    linked = sum(1 for p in matched if p["pr_id"])
    print(f"[scraper] Podcasts linked to PRs: {linked}/{len(matched)}")
    return matched


# ── Main ──────────────────────────────────────────────────────────────────────

def run_scrape(debug: bool = False, headless: Optional[bool] = None) -> tuple[int, int]:
    """Run the full scrape: press releases (Playwright) then podcasts (RSS).

    Results are upserted into the database; podcasts are fuzzily linked to
    press releases by title before saving. A scrape-log row is written on
    both success and failure.

    Args:
        debug:    Force a visible browser and save debug HTML/screenshots.
        headless: Browser headless mode; None defers to SCRAPER_HEADLESS
                  from config (debug=True always forces headed).

    Returns:
        (press_releases_saved, podcasts_saved) counts.
    """
    if headless is None:
        headless = SCRAPER_HEADLESS
    if debug:
        headless = False

    db.init_db()

    prs_saved  = 0
    pods_saved = 0
    status     = "success"
    notes      = ""

    try:
        # ── 1. Press Releases ─────────────────────────────────────────────────
        with sync_playwright() as pw:
            browser = pw.chromium.launch(
                headless=headless,
                # Hides the headless automation fingerprint from the site.
                args=["--disable-blink-features=AutomationControlled"],
            )
            ctx = browser.new_context(
                # Realistic desktop UA/viewport to avoid bot heuristics.
                user_agent=(
                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/122.0.0.0 Safari/537.36"
                ),
                viewport={"width": 1440, "height": 900},
            )
            page = ctx.new_page()
            pr_data = scrape_press_releases(page, debug=debug)
            browser.close()

        # Upsert PRs and build url → (id, normalised_title) for podcast linking.
        pr_id_map = {}
        for pr in pr_data:
            pid = db.upsert_press_release(
                title=pr["title"],
                url=pr["url"],
                date=pr["date"],
                category=pr.get("category"),
                description=pr.get("description"),
            )
            pr_id_map[pr["url"]] = (pid, normalise_title(pr["title"]))
            prs_saved += 1

        # ── 2. Podcasts from RSS feed ─────────────────────────────────────────
        pod_data = scrape_podcasts_from_rss()
        pod_data = match_podcasts_to_prs(pod_data, pr_id_map)
        for pod in pod_data:
            db.upsert_podcast(
                title=pod["title"],
                url=pod["url"],
                pr_id=pod.get("pr_id"),
                audio_url=pod.get("audio_url"),
                date=pod.get("date"),
                duration=pod.get("duration"),
            )
            pods_saved += 1

    except Exception as e:
        # Top-level boundary: record the failure but still log partial counts.
        status = "error"
        notes  = str(e)
        print(f"[scraper] ERROR: {e}")

    # Always log the run, even after an error, so history shows the attempt.
    db.log_scrape(prs_saved, pods_saved, status, notes)
    print(f"\n✅ Done — {prs_saved} PRs, {pods_saved} podcasts saved. Status: {status}")
    if notes:
        print(f"   Notes: {notes}")

    return prs_saved, pods_saved


if __name__ == "__main__":
    cli = argparse.ArgumentParser(description="DEWA PR & Podcast Scraper")
    cli.add_argument("--debug",   action="store_true", help="Visible browser + save debug files")
    cli.add_argument("--visible", action="store_true", help="Visible browser only")
    opts = cli.parse_args()

    # --visible forces a headed browser; otherwise defer to the config flag.
    run_scrape(debug=opts.debug, headless=not opts.visible and SCRAPER_HEADLESS)
