
from typing import List, Dict
from pathlib import Path
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup

from .utils import get_project_root


def _get_sample_path() -> Path:
    """Return the path of the saved Guardian search-results sample page.

    The path is ``data/raw/samples/guardian_search.html`` beneath the
    directory returned by ``get_project_root()``. No check is made here
    that the file actually exists.
    """
    samples_dir = get_project_root() / "data" / "raw" / "samples"
    return samples_dir / "guardian_search.html"


def _parse_guardian_page(html: str) -> List[Dict[str, str]]:
    """Extract budget-related Guardian headlines from a page of HTML.

    Every ``<a>`` element carrying an ``href`` is considered. A link is kept
    when all of the following hold:

    * its visible text is at least 10 characters long and contains the word
      "budget" (case-insensitive);
    * its href, resolved against ``https://www.theguardian.com``, is an
      http(s) URL whose host is ``theguardian.com`` or a subdomain of it;
    * the resolved URL has not already been emitted for an earlier link.

    Args:
        html: Raw HTML markup to scan.

    Returns:
        A list of dicts, in document order, each with the keys ``source``,
        ``source_type``, ``headline``, ``url`` and ``date``. ``date`` is
        always an empty string because no date is extracted here.
    """
    base_url = "https://www.theguardian.com"
    domain = "theguardian.com"

    soup = BeautifulSoup(html, "html.parser")
    articles: List[Dict[str, str]] = []
    seen_urls = set()

    for anchor in soup.find_all("a", href=True):
        headline = anchor.get_text(" ", strip=True)

        # Empty text is covered by the length check as well.
        if len(headline) < 10 or "budget" not in headline.lower():
            continue

        try:
            # Relative hrefs become absolute; absolute hrefs pass through.
            url = urljoin(base_url, anchor["href"])
            parsed = urlparse(url)
            host = parsed.hostname or ""
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal): skip this link
            # instead of aborting the whole page.
            continue

        # Compare the parsed host rather than searching the whole URL string,
        # so "mailto:someone@theguardian.com" or an external URL that merely
        # mentions the domain in its path or query is not accepted.
        if parsed.scheme not in ("http", "https"):
            continue
        if host != domain and not host.endswith("." + domain):
            continue

        if url in seen_urls:
            continue
        seen_urls.add(url)

        articles.append(
            {
                "source": "guardian",
                "source_type": "left",
                "headline": headline,
                "url": url,
                "date": "",
            }
        )

    return articles


def scrape_guardian() -> List[Dict[str, str]]:
    """Load the local Guardian sample page and return its parsed headlines.

    The HTML is read from the file given by ``_get_sample_path()`` and
    handed to ``_parse_guardian_page``. A progress message is printed at
    each step.

    Returns:
        The list produced by ``_parse_guardian_page``, or an empty list when
        the sample file does not exist or contains only whitespace.
    """
    html_file = _get_sample_path()

    if html_file.exists():
        print(f"[Guardian] Using local sample HTML from: {html_file}")
    else:
        print(f"[Guardian] Local sample HTML NOT FOUND at: {html_file}")
        return []

    # Bytes that are not valid UTF-8 are dropped rather than raising.
    markup = html_file.read_text(encoding="utf-8", errors="ignore")
    if markup.strip() == "":
        print("[Guardian] Sample HTML is empty. Returning [].")
        return []

    headlines = _parse_guardian_page(markup)
    print(f"[Guardian] Parsed {len(headlines)} headlines from local sample.")
    return headlines


if __name__ == "__main__":
    # Manual smoke run: scrape the local sample, report the total, and show
    # at most the first ten headline/URL pairs.
    headlines = scrape_guardian()
    print(f"Total Guardian headlines: {len(headlines)}")
    for item in headlines[:10]:
        print(item["headline"], "->", item["url"])
