import json
import re
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Union

import requests
from bs4 import BeautifulSoup

from typing import Type
from pydantic import BaseModel, Field, PrivateAttr
from langchain.tools import BaseTool

# ---(A) Fetch + plain-text conversion ----------------------------------------

def fetch_html(url: str, timeout: int = 20) -> str:
    """Download *url* and return the response body decoded to text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Returns:
        The decoded response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status
            (raised by ``raise_for_status``).
        requests.RequestException: For connection errors and timeouts.
    """
    r = requests.get(url, timeout=timeout, headers={"User-Agent": "SchemaDemo/1.0"})
    r.raise_for_status()
    # When a text/* response carries no charset in its Content-Type header,
    # requests assumes ISO-8859-1, which garbles UTF-8 pages. In that case
    # let requests sniff the encoding from the body instead.
    if "charset" not in r.headers.get("Content-Type", "").lower():
        r.encoding = r.apparent_encoding
    return r.text

def html_to_plain_text(html: str) -> str:
    """Reduce an HTML document to a single line of visible text.

    Deliberately simplistic; a production pipeline should prefer a dedicated
    extractor such as trafilatura or readability.
    """
    document = BeautifulSoup(html, "html.parser")

    # Two removal passes, in this order: first elements that never carry
    # readable text, then common page chrome.
    non_content = ["script", "style", "noscript", "template", "iframe"]
    page_chrome = ["nav", "aside", "footer"]
    for tag_names in (non_content, page_chrome):
        for element in document.find_all(tag_names):
            element.decompose()

    flattened = document.get_text(separator=" ", strip=True)
    # Squash every run of whitespace down to one space.
    return re.sub(r"\s+", " ", flattened)


# ---(B) Structured data detection (JSON-LD + microdata-lite) -----------------

def parse_json_ld(html: str) -> List[Dict[str, Any]]:
    """Collect JSON-LD objects from ``<script type="application/ld+json">`` tags.

    Each script body is parsed as JSON. A top-level list contributes its
    elements, a top-level object contributes itself. An object that wraps its
    nodes in an ``@graph`` list is replaced by those nodes, so callers see the
    typed entities rather than the wrapper.

    Only dict nodes are returned: downstream code calls ``.get`` on every
    block, so stray strings/numbers inside a list would crash it.

    Args:
        html: Raw HTML of the page.

    Returns:
        A flat list of JSON-LD node dicts, in document order. Malformed
        script blocks are skipped silently (best effort).
    """
    soup = BeautifulSoup(html, "html.parser")
    result: List[Dict[str, Any]] = []
    for script in soup.find_all("script", attrs={"type": "application/ld+json"}):
        try:
            data = json.loads(script.string or "")
        except (ValueError, RecursionError):
            # tolerate malformed (or pathologically nested) blocks
            continue

        candidates = data if isinstance(data, list) else [data]
        for node in candidates:
            if not isinstance(node, dict):
                continue
            graph = node.get("@graph")
            if isinstance(graph, list):
                result.extend(g for g in graph if isinstance(g, dict))
            else:
                result.append(node)
    return result

def parse_microdata_min(html: str) -> List[Dict[str, Any]]:
    """
    Minimal microdata scraper (not full spec; e.g. ``itemref`` is ignored).

    Every element carrying ``itemscope`` becomes one dict. ``itemtype`` is
    reduced to its last path segment and stored under ``"@type"``; ``itemid``
    is stored under ``"@id"``. Each ``itemprop`` that belongs to the scope is
    stored under its property name; repeated names are collected into a list.

    A property belongs to the nearest enclosing ``itemscope`` only, so the
    properties of a nested item (e.g. the ``name`` of an author Person inside
    an Article) are not merged into the outer item. The nested item still
    appears in the parent as the text of its own ``itemprop`` element and as
    a separate entry in the returned list.

    Args:
        html: Raw HTML of the page.

    Returns:
        One dict per non-empty item, in document order.
    """
    soup = BeautifulSoup(html, "html.parser")
    items: List[Dict[str, Any]] = []
    for scope in soup.find_all(attrs={"itemscope": True}):
        item: Dict[str, Any] = {}
        if scope.has_attr("itemtype"):
            item["@type"] = scope["itemtype"].split("/")[-1]
        if scope.has_attr("itemid"):
            item["@id"] = scope["itemid"]

        # gather properties
        for p in scope.find_all(attrs={"itemprop": True}):
            # Skip properties owned by a nested itemscope. Identity (not ==)
            # is required because bs4 tags compare equal by content.
            if p.find_parent(attrs={"itemscope": True}) is not scope:
                continue

            key = p["itemprop"]
            if p.has_attr("content"):
                val = p["content"]
            elif p.name in ("meta", "img"):
                # no content attribute here: fall back to img alt/src
                val = p.get("alt") or p.get("src")
            else:
                val = p.get_text(" ", strip=True)

            if key in item:
                # normalize multi-values
                if isinstance(item[key], list):
                    item[key].append(val)
                else:
                    item[key] = [item[key], val]
            else:
                item[key] = val
        if item:
            items.append(item)
    return items

def extract_structured_data(html: str) -> List[Dict[str, Any]]:
    """Return the page's JSON-LD blocks followed by its (minimal) microdata items."""
    json_ld_blocks = parse_json_ld(html)
    microdata_items = parse_microdata_min(html)
    return [*json_ld_blocks, *microdata_items]


# ---(C) Heuristic text-only extractor ----------------------------------------

def meta_tag(soup: BeautifulSoup, names: List[str]) -> Optional[str]:
    """Return the stripped ``content`` of the first matching ``<meta>`` tag.

    Candidates are tried in the order given. For each one a ``<meta name=...>``
    is looked up first and a ``<meta property=...>`` second. A tag without a
    non-empty ``content`` attribute does not count as a match. Returns ``None``
    when no candidate matches.
    """
    for candidate in names:
        element = soup.find("meta", attrs={"name": candidate})
        if not element:
            element = soup.find("meta", attrs={"property": candidate})
        if element and element.get("content"):
            return element["content"].strip()
    return None

def guess_article_from_text(html: str, plain_text: str) -> Dict[str, Any]:
    """
    Naive extractor: use <title>, meta tags, simple regexes.
    Works poorly compared to schema — intentionally, to demonstrate the delta.

    Args:
        html: Raw HTML of the page (used for <title> and <meta> lookups).
        plain_text: Plain-text rendering of the same page (used for the
            regex fallbacks and the summary).

    Returns:
        A dict with the keys ``type``, ``headline``, ``author``,
        ``datePublished`` and ``summary``; any value that could not be
        guessed is ``None``.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Headline: <title> text up to the first " | " or " - " site-name separator.
    title = (soup.title.string.strip() if soup.title and soup.title.string else None)
    title = title.split(" | ")[0].split(" - ")[0] if title else None

    author = meta_tag(soup, ["author", "article:author", "og:article:author"])
    if not author:
        # fuzzy: look for "By <Name>".
        # Only the word "by" is case-insensitive. Applying IGNORECASE to the
        # whole pattern would let [A-Z] match lowercase, so "by the way ..."
        # would be taken for a byline. The name is limited to at most four
        # capitalized tokens so the capture cannot run on into the sentence.
        m = re.search(
            r"\b(?i:by)\s+([A-Z][\w.\-']*(?:\s+[A-Z][\w.\-']*){0,3})",
            plain_text,
        )
        author = m.group(1) if m else None

    date = meta_tag(soup, [
        "article:published_time", "date", "publish-date", "pubdate", "og:article:published_time"
    ])
    if not date:
        # Fallback: first "12 March 2024"-style or ISO "2024-03-12" date in the text.
        m = re.search(r"(\b\d{1,2}\s+[A-Za-z]{3,9}\s+\d{4}\b|\b\d{4}-\d{2}-\d{2}\b)", plain_text)
        date = m.group(1) if m else None

    # crude summary: the first 60 words of the plain text
    words = plain_text.split()
    summary = " ".join(words[:60]) if words else None

    return {
        "type": "Article (guessed)",
        "headline": title,
        "author": author,
        "datePublished": date,
        "summary": summary,
    }


# ---(D) Schema-aware normalization -------------------------------------------

# Per-type whitelists of schema.org properties that normalization surfaces.
# Properties not listed here are dropped for that type.
ARTICLE_KEYS = [
    "headline",
    "name",
    "alternativeHeadline",
    "description",
    "author",
    "datePublished",
]
EVENT_KEYS = [
    "name",
    "startDate",
    "endDate",
    "location",
    "performer",
    "description",
]
PRODUCT_KEYS = [
    "name",
    "brand",
    "sku",
    "gtin13",
    "offers",
    "description",
]

def pick(d: Dict[str, Any], keys: List[str]) -> Dict[str, Any]:
    """Return the sub-dict of *d* limited to *keys*, in the order of *keys*.

    Keys absent from *d* are skipped; keys present with a ``None`` value are kept.
    """
    selected: Dict[str, Any] = {}
    for key in keys:
        if key in d:
            selected[key] = d.get(key)
    return selected

def flatten_author(a: Any) -> Any:
    """Collapse a schema.org author value to something printable.

    * list  -> each element flattened recursively
    * dict  -> its truthy ``"name"``, else its truthy ``"@id"``, else the
      dict itself
    * anything else -> returned unchanged
    """
    if isinstance(a, list):
        return list(map(flatten_author, a))
    if not isinstance(a, dict):
        return a
    for field in ("name", "@id"):
        candidate = a.get(field)
        if candidate:
            return candidate
    return a

def normalize_structured(block: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Normalize a JSON-LD/microdata block into a friendly dict for demo.

    The block's ``@type`` selects which fields are surfaced:

    * ``Article`` / ``NewsArticle`` / ``BlogPosting``: ``ARTICLE_KEYS``, with
      the author flattened to a name/id where possible.
    * ``Event``: ``EVENT_KEYS``, with a dict location reduced to its name or
      address.
    * ``Product``: ``PRODUCT_KEYS``, with a dict brand reduced to its name and
      a dict offer trimmed to price/currency/availability.
    * anything else: only ``name`` and ``description``.

    Type handling: a list ``@type`` uses its first entry (an empty list counts
    as no type); a string type given as an IRI such as
    ``https://schema.org/Article`` is reduced to its last path segment; a
    missing type is guessed as ``Article`` when the block has ``headline`` or
    ``articleBody``, otherwise reported as ``Thing``.

    Args:
        block: One structured-data node.

    Returns:
        A dict with at least a ``"type"`` key, or ``None`` when *block* is not
        a dict and therefore cannot be normalized.
    """
    if not isinstance(block, dict):
        # Defensive: JSON-LD arrays may contain bare strings/numbers.
        return None

    t = block.get("@type")
    if isinstance(t, list):
        # An empty list must not raise IndexError; treat it as "no type".
        t = t[0] if t else None
    if isinstance(t, str):
        # Accept full IRIs by keeping only the last path segment.
        t = t.rstrip("/").rsplit("/", 1)[-1]
    if not t and ("headline" in block or "articleBody" in block):
        # try guesses
        t = "Article"
    out: Dict[str, Any] = {"type": t or "Thing"}

    if t in ("NewsArticle", "BlogPosting", "Article"):
        x = pick(block, ARTICLE_KEYS)
        if "author" in x:
            x["author"] = flatten_author(x["author"])
        out.update(x)
    elif t == "Event":
        x = pick(block, EVENT_KEYS)
        loc = x.get("location")
        if isinstance(loc, dict):
            x["location"] = loc.get("name") or loc.get("address") or loc
        out.update(x)
    elif t == "Product":
        x = pick(block, PRODUCT_KEYS)
        brand = x.get("brand")
        if isinstance(brand, dict):
            x["brand"] = brand.get("name") or brand
        offers = x.get("offers")
        if isinstance(offers, dict):
            x["offers"] = {k: offers.get(k) for k in ["price", "priceCurrency", "availability"]}
        out.update(x)
    else:
        # generic pass-through of some helpful fields
        for k in ["name", "description"]:
            if k in block:
                out[k] = block[k]

    return out


# ---(E) “Orchestrator” simulation with LangChain tools -----------------------

from langchain.tools import BaseTool
from pydantic import BaseModel, Field

@dataclass
class PageExtraction:
    """Bundle of everything extracted from one page.

    Instances are assembled by ``process_url`` and handed to the demo tools.
    """
    # URL the page was requested from
    url: str
    # raw HTML as returned by fetch_html
    html: str
    # plain-text rendering of `html` (html_to_plain_text)
    text: str
    # JSON-LD and microdata blocks exactly as extracted (extract_structured_data)
    structured_raw: List[Dict[str, Any]]
    # normalize_structured applied to each raw block, falsy results dropped
    structured_norm: List[Dict[str, Any]]
    # heuristic guess from guess_article_from_text
    naive_guess: Dict[str, Any]

def process_url(url: str) -> PageExtraction:
    """Fetch *url* and run every extractor over it.

    Produces the plain text, the raw and normalized structured data, and the
    heuristic guess, packaged as a ``PageExtraction``.
    """
    html = fetch_html(url)
    text = html_to_plain_text(html)
    structured_raw = extract_structured_data(html)

    # Keep only blocks whose normalized form is truthy.
    structured_norm = []
    for raw_block in structured_raw:
        normalized = normalize_structured(raw_block)
        if normalized:
            structured_norm.append(normalized)

    naive_guess = guess_article_from_text(html, text)
    return PageExtraction(
        url=url,
        html=html,
        text=text,
        structured_raw=structured_raw,
        structured_norm=structured_norm,
        naive_guess=naive_guess,
    )

class SchemaQueryInput(BaseModel):
    # Argument schema used as `args_schema` by both SchemaTool and TextTool:
    # a single required free-text question (`...` marks the field required).
    question: str = Field(..., description="Question about the page content")

class SchemaTool(BaseTool):
    # Tool metadata. The explicit type annotations are there for pydantic v2.
    name: str = "schema_lookup"
    description: str = "Use when the page exposes JSON-LD or microdata; returns normalized structured fields."
    args_schema: Type[BaseModel] = SchemaQueryInput

    # Private, non-pydantic state: the page this tool answers questions about.
    _extraction: PageExtraction = PrivateAttr()

    def __init__(self, extraction: PageExtraction, **data):
        """Bind the tool to *extraction*; remaining kwargs go to ``BaseTool``."""
        super().__init__(**data)
        self._extraction = extraction

    def _run(self, question: str) -> str:
        """Return the page's normalized structured data as pretty-printed JSON.

        *question* is accepted to satisfy the tool interface but is not used.
        """
        normalized_blocks = self._extraction.structured_norm
        return json.dumps(normalized_blocks, ensure_ascii=False, indent=2)

    async def _arun(self, question: str) -> str:
        """Async entry point; delegates to the synchronous ``_run``."""
        return self._run(question)


class TextTool(BaseTool):
    # Tool metadata (annotated fields).
    name: str = "text_search"
    description: str = "Use when no schema is present; searches the plain text and meta to guess key facts."
    args_schema: Type[BaseModel] = SchemaQueryInput

    # Private, non-pydantic state: the page this tool answers questions about.
    _extraction: PageExtraction = PrivateAttr()

    def __init__(self, extraction: PageExtraction, **data):
        """Bind the tool to *extraction*; remaining kwargs go to ``BaseTool``."""
        super().__init__(**data)
        self._extraction = extraction

    def _run(self, question: str) -> str:
        """Return the heuristic guess plus a short text sample as JSON.

        The sample is the first 300 characters of the page text, with "..."
        appended when the text is longer. *question* is not used.
        """
        page_text = self._extraction.text
        sample = page_text[:300]
        if len(page_text) > 300:
            sample += "..."
        payload = {
            "naive_guess": self._extraction.naive_guess,
            "sample_text": sample,
        }
        return json.dumps(payload, ensure_ascii=False, indent=2)

    async def _arun(self, question: str) -> str:
        """Async entry point; delegates to the synchronous ``_run``."""
        return self._run(question)


# ---(F) Demo runner -----------------------------------------------------------

def demo(url_with_schema: str, url_without_schema: str) -> None:
    """Fetch two pages and print a side-by-side extraction comparison.

    Page A is expected to expose structured data and is answered through
    ``SchemaTool``; page B is expected to lack it and is answered through
    ``TextTool``. Which tool serves which page is fixed by the argument
    roles, not detected from the content.

    Args:
        url_with_schema: Page that carries JSON-LD or microdata.
        url_without_schema: Comparable page without structured data.

    Both pages are fetched via ``process_url``, so any exception it raises
    propagates to the caller. Output goes to stdout.
    """
    def preview(text: str, limit: int = 200) -> str:
        # First `limit` characters, with an ellipsis when truncated.
        return text[:limit] + ("..." if len(text) > limit else "")

    print("=== Processing pages ===")
    a = process_url(url_with_schema)
    b = process_url(url_without_schema)

    print("\n--- Page A (with schema) ---")
    print(f"URL: {a.url}")
    print(f"Plain text sample: {preview(a.text)}")
    print("Found structured blocks:", len(a.structured_raw))
    print("Normalized (first 1):")
    print(json.dumps(a.structured_norm[:1], ensure_ascii=False, indent=2))

    print("\n--- Page B (without schema) ---")
    print(f"URL: {b.url}")
    print(f"Plain text sample: {preview(b.text)}")
    print("Found structured blocks:", len(b.structured_raw))
    print("Naive guess:")
    print(json.dumps(b.naive_guess, ensure_ascii=False, indent=2))

    # Simulate an "orchestrator" deciding which tool to call. Only the two
    # tools that are actually invoked are constructed.
    schema_tool_a = SchemaTool(a)
    text_tool_b = TextTool(b)

    user_question = "What are the key fields (title/author/date/name/time/location/price) this page exposes?"

    print("\n=== Orchestrator decision ===")
    print("For Page A (schema present): choose schema_lookup")
    print(schema_tool_a.run(user_question))

    print("\nFor Page B (no schema): fall back to text_search")
    print(text_tool_b.run(user_question))


# ---(G) Example usage ---------------------------------------------------------

if __name__ == "__main__":
    # Demo inputs: swap in two pages you control or trust.
    #   url_with_schema    - e.g. a blog post or event page carrying JSON-LD
    #   url_without_schema - a comparable page with no structured data
    url_with_schema, url_without_schema = (
        "https://storage.googleapis.com/getheard-schema-org-test/seopage-with-schema.html",
        "https://storage.googleapis.com/getheard-schema-org-test/seopage-no-schema.html",
    )
    demo(url_with_schema=url_with_schema, url_without_schema=url_without_schema)
