# src/scraping/scrape_ft.py

from typing import List, Dict
from pathlib import Path

from bs4 import BeautifulSoup

from .utils import get_project_root


def _get_sample_path() -> Path:
    """
    Return the path of the local Financial Times (FT) sample HTML file.

    The file lives under the directory returned by ``get_project_root()``:

        <project root>/data/raw/samples/ft_budget.html

    The path is only constructed here; whether the file exists is not checked.
    """
    return get_project_root().joinpath("data", "raw", "samples", "ft_budget.html")


def _parse_ft_page(html: str) -> List[Dict[str, str]]:
    """
    Extract Budget-related headlines from a locally saved FT HTML page.

    Every ``<a>`` element whose visible text contains "budget"
    (case-insensitive) and that carries a non-empty ``href`` yields one record.

    Args:
        html: Raw HTML markup of the page.

    Returns:
        A list of dicts, in document order, each with the keys ``source``
        (always ``"ft"``), ``source_type`` (always ``"financial"``), ``date``
        (always the empty string; no date is extracted here), ``headline``
        (the link text) and ``url`` (the link target).
    """
    soup = BeautifulSoup(html, "html.parser")
    articles: List[Dict[str, str]] = []

    for a in soup.find_all("a"):
        # Collapse whitespace runs (newlines/indentation from nested markup)
        # so the headline is a clean single-line string.
        text = " ".join((a.get_text() or "").split())
        # Surrounding whitespace in an href is not part of the URL and would
        # defeat the prefix checks below.
        href = (a.get("href") or "").strip()

        if not text:
            continue
        if "budget" not in text.lower():
            continue
        if not href:
            continue

        if href.startswith("//"):
            # Protocol-relative URL: it already names a host, so only the
            # scheme is missing. Prefixing the FT origin would corrupt it.
            href = "https:" + href
        elif href.startswith("/"):
            # Site-relative path: resolve against the FT origin.
            href = "https://www.ft.com" + href

        art = {
            "source": "ft",
            "source_type": "financial",
            "date": "",
            "headline": text,
            "url": href,
        }
        articles.append(art)

    return articles


def scrape_ft() -> List[Dict[str, str]]:
    """
    Load the local FT sample HTML and return the headlines parsed from it.

    Progress messages are printed to stdout. An empty list is returned when
    the sample file does not exist or contains only whitespace; otherwise the
    result of ``_parse_ft_page`` on the file content is returned.
    """
    sample_path = _get_sample_path()

    if sample_path.exists():
        print(f"[FT] Using local sample HTML from: {sample_path}")
        # Undecodable bytes are dropped rather than raising.
        markup = sample_path.read_text(encoding="utf-8", errors="ignore")

        if markup.strip():
            parsed = _parse_ft_page(markup)
            print(f"[FT] Parsed {len(parsed)} headlines from local sample.")
            return parsed

        print("[FT] Sample HTML is empty. Returning [].")
        return []

    print(f"[FT] Local sample NOT FOUND at: {sample_path}")
    return []


if __name__ == "__main__":
    # Manual smoke run: print the total count, then one line per headline.
    headlines = scrape_ft()
    print(f"Total FT headlines: {len(headlines)}")
    for item in headlines:
        print(item["headline"], "=>", item["url"])
