# src/scraping/scrape_sky.py

from typing import List, Dict
from pathlib import Path

from bs4 import BeautifulSoup

from .utils import get_project_root


def _get_sample_path() -> Path:
    """Return the path of the saved Sky sample page.

    The path is ``data/raw/samples/sky_budget.html`` resolved against
    whatever ``get_project_root()`` returns. No check is made here that
    the file actually exists.
    """
    return get_project_root().joinpath(
        "data", "raw", "samples", "sky_budget.html"
    )


def _parse_sky_page(html: str) -> List[Dict[str, str]]:
    """Extract budget-related headline links from Sky News HTML.

    Every ``<a>`` element whose stripped text contains "budget"
    (case-insensitive) and which carries a non-empty ``href`` yields one
    record.

    Args:
        html: Raw HTML markup to parse.

    Returns:
        One dict per matching link, in document order, with the keys
        ``source``, ``source_type``, ``date``, ``headline`` and ``url``.
        ``date`` is always an empty string; no date is extracted here.
    """
    base_url = "https://news.sky.com"

    soup = BeautifulSoup(html, "html.parser")
    articles: List[Dict[str, str]] = []

    for a in soup.find_all("a"):
        text = (a.get_text() or "").strip()
        href = a.get("href") or ""

        # Anchors without visible text or without a target are not headlines.
        if not text or not href:
            continue
        if "budget" not in text.lower():
            continue

        if href.startswith("//"):
            # Protocol-relative link ("//host/path"): it already names a
            # host, so only the scheme is missing. Prefixing the site root
            # would produce a broken "https://news.sky.com//host/path".
            href = "https:" + href
        elif href.startswith("/"):
            # Root-relative link: resolve against the Sky News site root.
            href = base_url + href

        articles.append(
            {
                "source": "sky",
                "source_type": "right",
                "date": "",
                "headline": text,
                "url": href,
            }
        )

    return articles


def scrape_sky() -> List[Dict[str, str]]:
    """Parse Sky headlines from the locally stored sample HTML file.

    Progress messages are printed to stdout. An empty list is returned
    when the sample file does not exist or contains only whitespace.

    Returns:
        The article records produced by ``_parse_sky_page`` for the
        sample file's contents, or ``[]`` if there is nothing to parse.
    """
    html_file = _get_sample_path()

    if html_file.exists():
        print(f"[Sky] Using local sample HTML from: {html_file}")
    else:
        print(f"[Sky] Local sample NOT FOUND at: {html_file}")
        return []

    # Undecodable bytes are dropped rather than raising.
    markup = html_file.read_text(encoding="utf-8", errors="ignore")
    if markup.strip() == "":
        print("[Sky] Sample HTML is empty. Returning [].")
        return []

    parsed = _parse_sky_page(markup)
    print(f"[Sky] Parsed {len(parsed)} headlines from local sample.")
    return parsed


if __name__ == "__main__":
    # Manual run: print the headline count, then one "headline => url"
    # line per parsed article.
    headlines = scrape_sky()
    print(f"Total Sky headlines: {len(headlines)}")
    for entry in headlines:
        print(entry["headline"], "=>", entry["url"])
