#!/usr/bin/env python3
import argparse
import os
import re
import sys
from pathlib import Path
from typing import List, Sequence

from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_transformers import MarkdownifyTransformer
from langchain_core.documents import Document


def slugify(text: str) -> str:
    """Return a lowercase, hyphen-separated slug derived from *text*.

    Every ``http://`` / ``https://`` occurrence is removed, each run of
    characters other than word characters, ``-`` and ``.`` becomes a single
    hyphen, repeated hyphens are collapsed, and leading/trailing hyphens are
    trimmed. If nothing is left, the literal ``"page"`` is returned.
    """
    without_scheme = re.sub(r"https?://", "", text)
    hyphenated = re.sub(r"[^\w\-\.]+", "-", without_scheme, flags=re.UNICODE)
    collapsed = re.sub(r"-{2,}", "-", hyphenated)
    slug = collapsed.strip("-").lower()
    if not slug:
        # Nothing usable survived the clean-up; fall back to a fixed name.
        return "page"
    return slug


def load_from_url(url: str) -> List[Document]:
    """Download *url* and return it as documents whose content is raw HTML.

    ``WebBaseLoader.load()`` passes the page through BeautifulSoup's
    ``get_text()``, so its ``page_content`` is plain text with the markup
    already removed; that leaves nothing for an HTML-to-Markdown step to
    convert. ``AsyncHtmlLoader`` stores the response body unchanged, which is
    what a Markdown transformer needs as input.

    Args:
        url: Address of the page to fetch.

    Returns:
        The documents produced by the loader for this single URL.

    Raises:
        Exception: Errors from the loader (for example network failures) are
            not caught here and propagate to the caller.
    """
    # Local import keeps this function self-contained; the class lives in the
    # same package the module already imports WebBaseLoader from.
    from langchain_community.document_loaders import AsyncHtmlLoader

    loader = AsyncHtmlLoader(url)
    return loader.load()


def load_from_file(path: str) -> List[Document]:
    """Read a local file and wrap its text in a single-element document list.

    The file is decoded as UTF-8 and undecodable bytes are dropped
    (``errors="ignore"``). The document's ``source`` metadata holds the
    resolved path as a string.
    """
    source_path = Path(path)
    with source_path.open(encoding="utf-8", errors="ignore") as handle:
        contents = handle.read()
    metadata = {"source": str(source_path.resolve())}
    return [Document(page_content=contents, metadata=metadata)]


def convert_docs_to_markdown(docs: List[Document],
                             strip_tags: Sequence[str] | None = None,
                             convert_only: Sequence[str] | None = None) -> List[Document]:
    """
    Run the documents through MarkdownifyTransformer (markdownify under the hood).

    ``strip_tags`` is forwarded as the transformer's ``strip`` option and
    ``convert_only`` as its ``convert`` option. A missing or empty sequence is
    forwarded as ``None``; anything else is copied into a list first.
    """
    tags_to_strip = None
    if strip_tags:
        tags_to_strip = list(strip_tags)

    tags_to_convert = None
    if convert_only:
        tags_to_convert = list(convert_only)

    transformer = MarkdownifyTransformer(strip=tags_to_strip, convert=tags_to_convert)
    return transformer.transform_documents(docs)


def write_output(docs: List[Document], out: str | None) -> None:
    """Write every document's content into one UTF-8 file at *out*.

    The ``page_content`` values are joined with a blank line between them and
    a confirmation line is printed afterwards. When *out* is ``None`` or
    empty, nothing is written or printed.
    """
    if not out:
        return

    # Merge all docs into one file
    merged = "\n\n".join(doc.page_content for doc in docs)
    Path(out).write_text(merged, encoding="utf-8")
    print(f"✓ Wrote Markdown -> {out}")

def main() -> None:
    """Fetch the configured page, convert it to Markdown and save it.

    Each stage reports its failure on stderr and exits with its own status
    code: 2 for loading, 3 for conversion, 4 for writing the output file.
    """
    source_url = "https://storage.googleapis.com/getheard-schema-org-test/seopage-no-schema.html"
    output_file = "transformed.md"  # output markdown file name
    tags_to_strip = ["script", "style"]  # tags to ignore
    tags_to_convert = None  # specific tags to convert, if any

    try:
        loaded: List[Document] = list(load_from_url(source_url))
    except Exception as exc:
        print(f"Error while loading inputs: {exc}", file=sys.stderr)
        sys.exit(2)

    try:
        markdown_docs = convert_docs_to_markdown(
            loaded,
            strip_tags=tags_to_strip,
            convert_only=tags_to_convert,
        )
    except Exception as exc:
        print(f"Error during HTML→Markdown conversion: {exc}", file=sys.stderr)
        sys.exit(3)

    try:
        write_output(markdown_docs, output_file)
    except Exception as exc:
        print(f"Error while writing output: {exc}", file=sys.stderr)
        sys.exit(4)


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
