

from pathlib import Path
from typing import Dict, List
from urllib.parse import urljoin

from bs4 import BeautifulSoup

from .utils import get_project_root


def _get_sample_path() -> Path:
    """Return the path of the locally stored Telegraph budget sample page.

    The path is built as ``<project root>/data/raw/samples/telegraph_budget.html``,
    where the project root is whatever ``get_project_root()`` returns. The
    file's existence is not checked here.
    """
    samples_dir = get_project_root() / "data" / "raw" / "samples"
    return samples_dir / "telegraph_budget.html"


def _parse_telegraph_page(html: str) -> List[Dict[str, str]]:
    """Extract budget-related headline links from Telegraph page markup.

    Every ``<a>`` element is inspected and kept only when its stripped text
    is non-empty, contains the word "budget" (case-insensitive) and it has a
    non-empty ``href``.

    Args:
        html: Raw HTML markup of the page.

    Returns:
        One dict per matching link with the keys ``source``,
        ``source_type``, ``date`` (always an empty string here),
        ``headline`` (the link text) and ``url`` (an absolute URL whenever
        the href could be resolved against the Telegraph site root).
    """
    base_url = "https://www.telegraph.co.uk"

    soup = BeautifulSoup(html, "html.parser")
    articles: List[Dict[str, str]] = []

    for a in soup.find_all("a"):
        text = (a.get_text() or "").strip()
        # HTML allows whitespace around attribute values; strip it so the
        # emptiness check and URL resolution see the real reference.
        href = (a.get("href") or "").strip()

        if not text:
            continue
        if "budget" not in text.lower():
            continue
        if not href:
            continue

        # Resolve the reference against the site root instead of blindly
        # prefixing it: a plain prefix corrupts protocol-relative links
        # ("//host/path") and leaves bare relative paths ("news/x")
        # unresolved. Already-absolute URLs pass through unchanged.
        href = urljoin(base_url, href)

        art = {
            "source": "telegraph",
            "source_type": "right",
            "date": "",
            "headline": text,
            "url": href,
        }
        articles.append(art)

    return articles


def scrape_telegraph() -> List[Dict[str, str]]:
    """Load the local Telegraph sample page and return its parsed headlines.

    No network request is made: the HTML is read from the path given by
    ``_get_sample_path()`` and handed to ``_parse_telegraph_page()``.
    Progress messages are printed to stdout.

    Returns:
        The list produced by ``_parse_telegraph_page()``, or an empty list
        when the sample file is missing or contains only whitespace.
    """
    path = _get_sample_path()

    if path.exists():
        print(f"[Telegraph] Using local sample HTML from: {path}")
    else:
        print(f"[Telegraph] Local sample NOT FOUND at: {path}")
        return []

    # Undecodable bytes are dropped rather than raising, so a partially
    # corrupt sample can still be parsed.
    markup = path.read_text(encoding="utf-8", errors="ignore")

    if markup.strip() == "":
        print("[Telegraph] Sample HTML is empty. Returning [].")
        return []

    parsed = _parse_telegraph_page(markup)
    print(f"[Telegraph] Parsed {len(parsed)} headlines from local sample.")
    return parsed


if __name__ == "__main__":
    # Manual smoke run: parse the local sample and list what was found.
    # NOTE: this module uses a package-relative import, so it has to be
    # launched as a module (``python -m <package>.<module>``) for this
    # branch to be reachable.
    headlines = scrape_telegraph()
    print(f"Total Telegraph headlines: {len(headlines)}")
    for item in headlines:
        print(item["headline"], "=>", item["url"])
