# src/scraping/scrape_bbc.py

from typing import List, Dict
from pathlib import Path
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup

from .utils import get_project_root


def _get_sample_path() -> Path:
    """Return the path of the locally saved BBC Budget HTML sample.

    The file is expected at ``data/raw/samples/bbc_budget.html`` below the
    directory returned by ``get_project_root()`` (for example
    ``uk_budget_project/data/raw/samples/bbc_budget.html``). The path is only
    built here; whether the file exists is not checked.
    """
    relative_location = Path("data") / "raw" / "samples" / "bbc_budget.html"
    return get_project_root() / relative_location


def _parse_bbc_page(html: str) -> List[Dict[str, str]]:
    """Extract budget-related headlines and their links from a BBC page.

    The parser deliberately does not depend on BBC's CSS structure. It scans
    every ``<a>`` tag that carries an ``href`` and keeps a link only when all
    of the following hold:

      - its visible text is non-empty and contains "budget"
        (case-insensitive);
      - its visible text is at least 10 characters long (shorter texts are
        usually navigation entries);
      - its URL, resolved against ``https://www.bbc.co.uk``, points at the
        host ``bbc.co.uk`` or one of its subdomains;
      - the resolved URL has not already been collected from this page.

    Args:
        html: Raw HTML markup of the page.

    Returns:
        One dict per kept link, in document order, with the keys ``source``,
        ``source_type``, ``headline``, ``url`` and ``date``. ``date`` is
        always an empty string because dates are not parsed yet.
    """
    soup = BeautifulSoup(html, "html.parser")
    articles: List[Dict[str, str]] = []
    seen_urls = set()

    for a in soup.find_all("a", href=True):
        text = a.get_text(" ", strip=True)
        href = a["href"]

        if not text:
            continue

        # Relative hrefs are resolved against the BBC site root.
        url = urljoin("https://www.bbc.co.uk", href)

        # Keep BBC's own links only. The parsed host is compared instead of
        # searching the whole URL, because a substring test would also accept
        # links such as "https://example.com/?ref=bbc.co.uk",
        # "https://www.bbc.co.uk.evil.test/" or "mailto:x@bbc.co.uk".
        host = urlparse(url).hostname or ""
        if host != "bbc.co.uk" and not host.endswith(".bbc.co.uk"):
            continue

        # The headline must mention the budget.
        if "budget" not in text.lower():
            continue

        # Very short link texts are usually navigation items.
        if len(text) < 10:
            continue

        # Report each target URL once, keeping its first headline.
        if url in seen_urls:
            continue
        seen_urls.add(url)

        articles.append(
            {
                "source": "bbc",
                "source_type": "public_service",
                "headline": text,
                "url": url,
                "date": "",  # dates are not parsed yet
            }
        )

    return articles


def scrape_bbc() -> List[Dict[str, str]]:
    """Parse headlines out of the locally stored BBC HTML sample.

    This is the module's main entry point; the original note states that it
    is called by run_scrapers.py. Progress messages are written with
    ``print``.

    Returns:
        The list produced by ``_parse_bbc_page`` for the sample file, or an
        empty list when the file does not exist or holds only whitespace.
    """
    sample_path = _get_sample_path()

    if sample_path.exists():
        print(f"[BBC] Using local sample HTML from: {sample_path}")
    else:
        print(f"[BBC] Local sample HTML NOT FOUND at: {sample_path}")
        return []

    # Bytes that are not valid UTF-8 are dropped instead of raising.
    markup = sample_path.read_text(encoding="utf-8", errors="ignore")
    if markup.strip() == "":
        print("[BBC] Sample HTML is empty. Returning [].")
        return []

    articles = _parse_bbc_page(markup)
    print(f"[BBC] Parsed {len(articles)} headlines from local sample.")
    return articles


if __name__ == "__main__":
    # Manual run: print the total count and preview at most ten headlines.
    data = scrape_bbc()
    print(f"Total BBC headlines: {len(data)}")
    preview = data[:10]
    for article in preview:
        print(article["headline"], "->", article["url"])