#!/usr/bin/env python3
"""
tools/index-pdf.py
==================
Script untuk mengekstrak teks PDF dan menyimpannya sebagai JSON chunks.
Dijalankan sekali (offline) di server via CLI.

Cara pakai:
  python3 tools/index-pdf.py --pdf data/PKB_2024.pdf --name "PKB 2024" --kategori pkb
  python3 tools/index-pdf.py --pdf data/UU_Ketenagakerjaan.pdf --name "UU No.13/2003" --kategori uu
  python3 tools/index-pdf.py --list   (tampilkan semua dokumen terindeks)
  python3 tools/index-pdf.py --delete pkb_2024  (hapus indeks dokumen)

Persyaratan:
  pip install pdfplumber
"""

import sys
import os
import json
import argparse
import hashlib
from datetime import datetime
from pathlib import Path

try:
    import pdfplumber
except ImportError:
    print("❌ Install dulu: pip install pdfplumber")
    sys.exit(1)

# ── Konfigurasi ──────────────────────────────────────────────
# Directory named "data" located next to this script's own directory
# (i.e. <script dir>/../data); the index file lives inside it.
DATA_DIR   = Path(__file__).parent.parent / "data"
# Single JSON file that stores every indexed document and its chunks.
INDEX_FILE = DATA_DIR / "pdf_index.json"

CHUNK_SIZE    = 800   # characters per chunk
CHUNK_OVERLAP = 150   # overlap between consecutive chunks (keeps context from being cut off)

# ── Helper ────────────────────────────────────────────────────

def load_index() -> dict:
    """Read the persisted index from INDEX_FILE.

    Returns:
        The parsed JSON content of INDEX_FILE, or an empty dict when the
        file does not exist yet.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    if not INDEX_FILE.exists():
        return {}
    return json.loads(INDEX_FILE.read_text(encoding="utf-8"))

def save_index(data: dict):
    """Persist the whole index to INDEX_FILE as pretty-printed UTF-8 JSON.

    The JSON is written to a temporary sibling file first and then moved over
    INDEX_FILE with os.replace(). Because the index file holds every indexed
    document, writing it in place would leave a truncated file (and lose all
    documents) if the dump failed or the process was interrupted half-way.

    Args:
        data: The complete index mapping to serialize.

    Raises:
        OSError: If the data directory or the file cannot be written.
        TypeError: If data contains a value that json cannot serialize.
    """
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    # Same directory as INDEX_FILE so the final rename stays on one filesystem.
    tmp_file = INDEX_FILE.with_name(INDEX_FILE.name + ".tmp")
    try:
        with open(tmp_file, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        os.replace(tmp_file, INDEX_FILE)
    finally:
        # No-op after a successful replace; removes the partial file otherwise.
        tmp_file.unlink(missing_ok=True)

def make_doc_id(name: str) -> str:
    """Derive a document ID from a human-readable document name.

    The name is lowercased, every space and "/" becomes "_", and the result
    is cut to at most 30 characters. Other characters (e.g. ".") are kept.
    Because of the truncation, two different names can yield the same ID.
    """
    separators = str.maketrans({" ": "_", "/": "_"})
    slug = name.lower().translate(separators)
    return slug[:30]

def chunk_text(text: str, size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> list[str]:
    """Split a long text into overlapping chunks.

    Whitespace is normalized first (every run of whitespace becomes a single
    space). Each chunk is at most ``size`` characters long; when a chunk does
    not reach the end of the text it is shortened to the last ". ", "? " or
    ": " found in its second half, so chunks tend to end on a sentence
    boundary. The next chunk starts ``overlap`` characters before the end of
    the previous one.

    Args:
        text: The text to split.
        size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks.

    Returns:
        The stripped chunks in document order. Chunks of 50 characters or
        fewer are discarded, so a very short text yields an empty list.

    Raises:
        ValueError: If ``size`` is not positive (the loop could never advance).
    """
    if size <= 0:
        raise ValueError("size must be a positive number of characters")

    text   = " ".join(text.split())  # normalize whitespace (also removes newlines)
    total  = len(text)
    chunks = []
    start  = 0

    while start < total:
        end   = start + size
        chunk = text[start:end]

        # Try to cut at a sentence-like boundary so the chunk does not stop
        # mid-sentence. Newlines no longer exist after normalization, so only
        # punctuation followed by a space is considered.
        if end < total:
            cut = max(
                chunk.rfind('. '),
                chunk.rfind('? '),
                chunk.rfind(': ')
            )
            if cut > size // 2:
                chunk = chunk[:cut + 1]
                end   = start + cut + 1

        chunks.append(chunk.strip())

        # This chunk already reaches the end of the text. Stepping back by
        # `overlap` again would only emit a chunk that is a pure suffix of
        # this one, i.e. duplicated content in the index.
        if end >= total:
            break

        # Always move forward, even if `overlap` is as large as the chunk
        # that was just emitted; otherwise the loop would never terminate.
        start = max(end - overlap, start + 1)

    return [c for c in chunks if len(c) > 50]  # drop chunks that are too short

def extract_pdf_text(pdf_path: str) -> list[dict]:
    """Extract the text of every page of a PDF, reporting progress on stdout.

    Args:
        pdf_path: Path of the PDF file, passed to pdfplumber.open().

    Returns:
        A list of {'page': int, 'text': str} dicts, one per page that has
        non-blank text. Page numbers are 1-based; pages whose extracted text
        is empty or whitespace-only are left out.
    """
    pages = []

    with pdfplumber.open(pdf_path) as document:
        total = len(document.pages)
        print(f"📄 Total halaman: {total}")

        for i, pdf_page in enumerate(document.pages, start=1):
            # "\r" rewrites the same console line to act as a progress counter.
            sys.stdout.write(f"\r  Memproses halaman {i}/{total}...")
            sys.stdout.flush()

            # extract_text() may yield nothing for a page; treat that as "".
            text = (pdf_page.extract_text() or "").strip()
            if not text:
                continue

            pages.append({"page": i, "text": text})

    print(f"\n✅ Berhasil ekstrak {len(pages)} halaman berisi teks")
    return pages

def build_chunks(pages: list[dict], doc_id: str, doc_name: str) -> list[dict]:
    """Turn extracted pages into chunk records with metadata.

    Every page is chunked on its own with chunk_text(), so a chunk never
    spans two pages. Chunks are numbered consecutively across the whole
    document, starting at 0.

    Args:
        pages: Page dicts with a "page" number and a "text" string.
        doc_id: Document ID, used as the prefix of every chunk ID.
        doc_name: Document name stored on every chunk.

    Returns:
        A list of dicts with the keys "id", "doc_id", "doc_name", "page",
        "text" and "keywords", in page order.
    """
    # First flatten to (page number, chunk text) pairs ...
    page_pieces = [
        (page_data["page"], piece)
        for page_data in pages
        for piece in chunk_text(page_data["text"])
    ]

    # ... then number them document-wide and attach the metadata.
    return [
        {
            "id"       : f"{doc_id}_chunk_{chunk_id:04d}",
            "doc_id"   : doc_id,
            "doc_name" : doc_name,
            "page"     : page_num,
            "text"     : piece,
            "keywords" : extract_keywords(piece),
        }
        for chunk_id, (page_num, piece) in enumerate(page_pieces)
    ]

def extract_keywords(text: str) -> list[str]:
    """Pick search keywords out of a piece of text.

    The text is lowercased and split on whitespace; every token is reduced
    to its alphabetic characters (digits and punctuation are removed, so
    "undang-undang" becomes "undangundang"). Tokens shorter than 4 letters
    and tokens in the stopword list are skipped.

    Returns:
        Up to 20 distinct keywords, in order of first appearance.
    """
    stopwords = {
        "yang", "dengan", "untuk", "dalam", "pada", "adalah", "atau",
        "jika", "maka", "serta", "telah", "oleh", "dari", "kepada",
        "tidak", "dapat", "akan", "setiap", "harus", "wajib", "bahwa",
        "tersebut", "dimaksud", "ketentuan", "sebagaimana", "pasal",
        "ayat", "huruf", "undang", "peraturan", "nomor", "tahun"
    }

    # A dict keeps insertion order, which gives "unique, first seen first".
    seen = {}
    for token in text.lower().split():
        letters = "".join(filter(str.isalpha, token))
        if len(letters) < 4 or letters in stopwords:
            continue
        seen.setdefault(letters, None)

    return list(seen)[:20]

# ── Commands ──────────────────────────────────────────────────

def cmd_index(args):
    """Index one PDF file and store its chunks in the index file.

    Reads ``args.pdf`` (path of the PDF), ``args.name`` (document name, also
    the source of the document ID) and ``args.kategori`` (optional category,
    defaults to "umum"). An existing index entry with the same ID is replaced.

    The process exits with status 1 when the path is not a regular file, when
    no page yields any text, or when the extracted text produces no chunk.
    """
    pdf_path = Path(args.pdf)

    # is_file() rather than exists(): a directory also "exists" but cannot be
    # opened as a PDF and would otherwise fail later with a raw traceback.
    if not pdf_path.is_file():
        print(f"❌ File tidak ditemukan: {pdf_path}")
        sys.exit(1)

    doc_id   = make_doc_id(args.name)
    doc_name = args.name
    kategori = args.kategori or "umum"

    print(f"\n📚 Mengindeks: {doc_name}")
    print(f"   ID       : {doc_id}")
    print(f"   Kategori : {kategori}")
    print(f"   File     : {pdf_path}\n")

    # Extract the text page by page
    pages = extract_pdf_text(str(pdf_path))

    if not pages:
        print("❌ Tidak ada teks yang bisa diekstrak (PDF mungkin hasil scan/gambar)")
        print("   Gunakan OCR dulu: tesseract atau Adobe Acrobat")
        sys.exit(1)

    # Build the chunks
    chunks = build_chunks(pages, doc_id, doc_name)

    # Pages may contain only tiny text fragments that are all discarded
    # during chunking; storing a document with zero chunks would report
    # success for an entry that can never be searched.
    if not chunks:
        print("❌ Teks yang diekstrak terlalu sedikit, tidak ada chunk yang bisa dibuat")
        print("   Gunakan OCR dulu: tesseract atau Adobe Acrobat")
        sys.exit(1)

    print(f"🔧 Total chunks: {len(chunks)}")

    # Store in the index
    index = load_index()

    # IDs are derived from the name, so two different documents can end up
    # with the same ID. Tell the user before the other entry is replaced.
    previous = index.get(doc_id)
    if isinstance(previous, dict) and previous.get("name") != doc_name:
        print(f"⚠️  ID '{doc_id}' sudah dipakai oleh '{previous.get('name')}' — indeks lama akan ditimpa")

    index[doc_id] = {
        "id"            : doc_id,
        "name"          : doc_name,
        "kategori"      : kategori,
        "file"          : str(pdf_path.name),
        "total_pages"   : len(pages),   # pages that contained text
        "total_chunks"  : len(chunks),
        "indexed_at"    : datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "chunks"        : chunks,
    }

    save_index(index)

    print(f"\n✅ Indeks berhasil disimpan ke: {INDEX_FILE}")
    print(f"   Dokumen : {doc_name}")
    print(f"   Halaman : {len(pages)}")
    print(f"   Chunks  : {len(chunks)}")

def cmd_list(args):
    """Print a table of all indexed documents.

    One row per index entry: ID, name, category, page count, chunk count
    and indexing timestamp. Prints a usage hint instead when the index is
    empty. ``args`` is accepted for a uniform command signature but is not
    read.
    """
    index = load_index()

    if index:
        print(f"\n📚 Dokumen Terindeks ({len(index)} dokumen):\n")
        print(f"{'ID':<25} {'Nama':<30} {'Kat':<10} {'Hal':>4} {'Chunks':>6} {'Diindeks':<20}")
        print("-" * 100)

        for doc_id, doc in index.items():
            row = (
                f"{doc_id:<25} "
                f"{doc['name']:<30} "
                f"{doc['kategori']:<10} "
                f"{doc['total_pages']:>4} "
                f"{doc['total_chunks']:>6} "
                f"{doc['indexed_at']:<20}"
            )
            print(row)
        return

    # Nothing indexed yet: show how to add the first document.
    print("📭 Belum ada dokumen terindeks.")
    print("   Gunakan: python3 tools/index-pdf.py --pdf file.pdf --name 'Nama Dokumen'")

def cmd_delete(args):
    """Remove one document from the index.

    ``args.delete`` is the ID of the document to remove. When the ID is not
    in the index, an error is printed followed by the list of indexed
    documents, and nothing is changed.
    """
    index = load_index()
    doc_id = args.delete

    if doc_id in index:
        # pop() removes the entry and hands back its data for the message.
        doc_name = index.pop(doc_id)["name"]
        save_index(index)
        print(f"🗑️  Indeks '{doc_name}' (ID: {doc_id}) berhasil dihapus.")
        return

    print(f"❌ Dokumen dengan ID '{doc_id}' tidak ditemukan.")
    cmd_list(args)

# ── Main ──────────────────────────────────────────────────────

def main():
    """Parse the command line and run the selected command.

    Exactly one of --pdf, --list or --delete must be given (they form a
    required, mutually exclusive group). --pdf additionally requires --name;
    --kategori is optional. Invalid combinations and empty values end the
    process through parser.error(), i.e. a usage message and exit status 2.
    """
    parser = argparse.ArgumentParser(
        description="🤖 PDF Indexer untuk WA Bot",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Contoh penggunaan:
  python3 tools/index-pdf.py --pdf data/PKB_2024.pdf --name "PKB 2024" --kategori pkb
  python3 tools/index-pdf.py --pdf data/UU13_2003.pdf --name "UU Ketenagakerjaan 13/2003" --kategori uu
  python3 tools/index-pdf.py --list
  python3 tools/index-pdf.py --delete pkb_2024
        """
    )

    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--pdf",    metavar="FILE", help="Path ke file PDF yang akan diindeks")
    group.add_argument("--list",   action="store_true", help="Tampilkan daftar dokumen terindeks")
    group.add_argument("--delete", metavar="ID",   help="Hapus indeks dokumen berdasarkan ID")

    parser.add_argument("--name",     metavar="NAMA",     help="Nama dokumen (wajib jika --pdf)")
    parser.add_argument("--kategori", metavar="KATEGORI", help="Kategori: pkb, uu, pp, sk, dll")

    args = parser.parse_args()

    # Dispatch on "is not None" instead of truthiness: an empty value such as
    # --delete "" is falsy, so it used to fall through every branch and the
    # script exited successfully without doing anything.
    if args.pdf is not None:
        if not args.pdf.strip():
            parser.error("--pdf tidak boleh kosong")
        if not (args.name and args.name.strip()):
            parser.error("--name wajib diisi saat menggunakan --pdf")
        cmd_index(args)
    elif args.delete is not None:
        if not args.delete.strip():
            parser.error("--delete tidak boleh kosong")
        cmd_delete(args)
    else:
        # The group is required, so with neither --pdf nor --delete present
        # the only remaining possibility is --list.
        cmd_list(args)

# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
