# rag/pdf_utils.py
from pypdf import PdfReader
import re


def load_pdf_pages(pdf_path: str):
    """
    Read a PDF and return one record per page.

    Each record is a dict with:
      - "page": the 1-based page number
      - "text": the page's extracted text with surrounding whitespace removed

    Extraction is best-effort: a page whose text extraction raises, or
    yields nothing, is still included with an empty "text" value.
    """
    def _safe_text(pdf_page):
        # A single unreadable page must not abort loading the whole document.
        try:
            return pdf_page.extract_text() or ""
        except Exception:
            return ""

    document = PdfReader(pdf_path)
    return [
        {"page": number, "text": _safe_text(pdf_page).strip()}
        for number, pdf_page in enumerate(document.pages, start=1)
    ]


def extract_section_heading(text: str) -> str | None:
    """
    Best-effort heuristic to extract a section heading.
    Looks for uppercase or numbered headings at the top of the page.
    """
    if not text:
        return None

    lines = [l.strip() for l in text.split("\n") if l.strip()]
    if not lines:
        return None

    first_line = lines[0]

    # Examples:
    # "RISK MANAGEMENT"
    # "1. REGULATORY COMPLIANCE"
    # "QUALITY ASSURANCE"
    if (
        first_line.isupper()
        or re.match(r"^\d+(\.\d+)*\s+[A-Z]", first_line)
    ):
        return first_line[:120]

    return None


def chunk_text(text: str, chunk_size: int, overlap: int) -> list[str]:
    """
    Deterministic sliding-window chunking with whitespace normalization.

    All runs of whitespace are collapsed to single spaces, then the text is
    cut into windows of at most ``chunk_size`` characters. When a window
    does not reach the end of the text, it is shortened to end just after
    the last period inside it, provided that period lies more than 100
    characters past the window start (so chunks do not become tiny).

    Each new window starts ``overlap`` characters before the end of the
    previous one. If that would not move the window forward (``overlap`` is
    at least as large as the chunk just emitted), the overlap is dropped for
    that step and the next window starts where the previous one ended, so
    the function always terminates.

    Args:
        text: Text to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of trailing characters of a chunk to repeat at the
            start of the next one.

    Returns:
        The list of non-empty, stripped chunks in document order. Empty or
        whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive and there is text to
            split.
    """
    text = " ".join(text.split())
    if not text:
        return []

    # A non-positive window can never consume the text.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    chunks = []
    start = 0
    text_len = len(text)

    while start < text_len:
        end = min(text_len, start + chunk_size)

        # Try not to cut mid-sentence (best-effort)
        if end < text_len:
            last_period = text.rfind(".", start, end)
            if last_period != -1 and last_period > start + 100:
                end = last_period + 1

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        if end == text_len:
            break

        # Step back by `overlap`, but never to or before the current start:
        # that would re-emit the same window forever.
        next_start = end - overlap
        if next_start <= start:
            next_start = end
        start = next_start

    return chunks
