// app-extract.jsx — real text extraction from PDF (.pdf), Word (.docx), and plain text.
// Exposes window.extractAgendaText(file) -> Promise<{ text, method }>.
(function () {
  /**
   * Decode XML character entities in a string: the five predefined named
   * entities (`&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`) plus decimal
   * (`&#NN;`) and hexadecimal (`&#xHH;`) character references.
   *
   * Everything is decoded in one left-to-right pass, so text produced by one
   * replacement is never re-scanned (`&amp;lt;` yields the literal `&lt;`, not
   * `<`). Numeric references are resolved as full Unicode code points, so
   * characters outside the Basic Multilingual Plane decode correctly.
   *
   * @param {*} s - Value to decode; coerced with `String()`.
   * @returns {string} The decoded text. Unknown entities and numeric
   *   references beyond U+10FFFF are left untouched.
   */
  function decodeEntities(s) {
    const named = { amp: "&", lt: "<", gt: ">", quot: '"', apos: "'" };
    return String(s).replace(
      /&(?:(amp|lt|gt|quot|apos)|#(\d+)|#[xX]([0-9a-fA-F]+));/g,
      (match, name, dec, hex) => {
        if (name !== undefined) return named[name];
        const codePoint = dec !== undefined ? parseInt(dec, 10) : parseInt(hex, 16);
        // String.fromCodePoint throws on out-of-range values; keep such
        // references verbatim instead of failing the whole extraction.
        return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
      }
    );
  }

  // ---------- DOCX (Office Open XML) ----------
  /**
   * Extract the visible text of a .docx file, one output line per paragraph
   * or explicit line break.
   *
   * The archive is opened with `window.JSZip` and `word/document.xml` is
   * scanned for a handful of WordprocessingML tokens:
   *   - `<w:t>…</w:t>`  text run content (entities decoded via decodeEntities)
   *   - `<w:tab/>`      a tab character inside a run
   *   - `<w:br/>`       a line break
   *   - `</w:p>`        end of paragraph -> line break
   * Only these tokens contribute to the output. Whitespace that merely formats
   * the XML source (such as the newline after the XML declaration) is ignored,
   * and tab-stop definitions (`<w:tabs>…</w:tabs>` in paragraph properties)
   * are skipped so they are not mistaken for tab characters.
   *
   * @param {{ arrayBuffer: function(): Promise<ArrayBuffer> }} file - The
   *   uploaded file (anything exposing `arrayBuffer()`).
   * @returns {Promise<string>} Extracted lines joined with "\n".
   * @throws {Error} "docx-lib-missing" when `window.JSZip` is not loaded;
   *   "not-a-docx" when the archive has no `word/document.xml`.
   */
  async function extractDocx(file) {
    if (!window.JSZip) throw new Error("docx-lib-missing");
    const buf = await file.arrayBuffer();
    const zip = await window.JSZip.loadAsync(buf);
    const docFile = zip.file("word/document.xml");
    if (!docFile) throw new Error("not-a-docx");
    const xml = await docFile.async("string");

    // Group 1: markup to skip (tab-stop definitions, empty self-closing <w:t/>).
    //          Both must be tried before the alternatives below, otherwise the
    //          <w:tab .../> children would become tabs and <w:t/> would make
    //          the text pattern run on to the next </w:t>.
    // Group 2: content of a text run.
    // Group 3: a tab element.
    // No group: <w:br/> or </w:p>, both of which end the current line.
    const tokenRe =
      /(<w:tabs\b[\s\S]*?<\/w:tabs>|<w:t\b[^>]*\/>)|<w:t\b[^>]*>([\s\S]*?)<\/w:t>|(<w:tab\b[^>]*>)|<w:br\b[^>]*>|<\/w:p>/g;

    const out = [];
    let line = "";
    let m;
    while ((m = tokenRe.exec(xml)) !== null) {
      if (m[1] !== undefined) continue;
      if (m[2] !== undefined) {
        line += decodeEntities(m[2]);
      } else if (m[3] !== undefined) {
        line += "\t";
      } else {
        out.push(line);
        line = "";
      }
    }
    // Text after the last paragraph end (malformed input) is still kept.
    if (line) out.push(line);
    return out.join("\n");
  }

  // ---------- PDF (via pdf.js) ----------
  /**
   * Extract text from a PDF using `window.pdfjsLib`, reconstructing lines from
   * the positioned text items of each page.
   *
   * Per page, items are bucketed into rows by their rounded y coordinate
   * (items within 4 units of a row's first item share that row), rows are
   * ordered top to bottom and items within a row left to right. When an
   * item starts noticeably to the right of where the previous item ended
   * (judged from the item's `width`/`height`, when the library provides them),
   * a space is inserted so separate words or columns do not fuse together.
   * Runs of whitespace are collapsed and empty rows dropped.
   *
   * The loaded document is always destroyed afterwards so pdf.js can release
   * the resources it holds for it, even when extraction fails part-way.
   *
   * @param {{ arrayBuffer: function(): Promise<ArrayBuffer> }} file - The
   *   uploaded file (anything exposing `arrayBuffer()`).
   * @returns {Promise<string>} Extracted lines of all pages joined with "\n".
   * @throws {Error} "pdf-lib-missing" when `window.pdfjsLib` is not loaded;
   *   any error raised by pdf.js while loading or reading the document.
   */
  async function extractPdf(file) {
    const lib = window.pdfjsLib;
    if (!lib) throw new Error("pdf-lib-missing");
    const data = new Uint8Array(await file.arrayBuffer());
    const pdf = await lib.getDocument({ data, disableWorker: false }).promise;
    const allLines = [];
    try {
      for (let p = 1; p <= pdf.numPages; p++) {
        const page = await pdf.getPage(p);
        const tc = await page.getTextContent();
        // group text items into lines by their y position
        const rows = [];
        tc.items.forEach((it) => {
          if (!it.str) return;
          const x = it.transform[4];
          const y = Math.round(it.transform[5]);
          let row = rows.find((r) => Math.abs(r.y - y) < 4);
          if (!row) { row = { y, parts: [] }; rows.push(row); }
          row.parts.push({ x, str: it.str, width: it.width, height: it.height });
        });
        rows.sort((a, b) => b.y - a.y); // top to bottom
        rows.forEach((r) => {
          r.parts.sort((a, b) => a.x - b.x);
          let joined = "";
          // NaN until an item with a usable width has been seen; every
          // comparison against NaN is false, so no space is ever inserted
          // when the library supplies no widths.
          let prevEnd = NaN;
          r.parts.forEach((part) => {
            // Heuristic: a gap wider than ~15% of the text height (at least
            // one unit) is a word/column separation rather than kerning.
            const minGap = Math.max(1, 0.15 * (part.height || 0));
            if (part.x - prevEnd > minGap) joined += " ";
            joined += part.str;
            prevEnd = part.x + part.width;
          });
          const text = joined.replace(/\s+/g, " ").trim();
          if (text) allLines.push(text);
        });
      }
    } finally {
      if (typeof pdf.destroy === "function") await pdf.destroy();
    }
    return allLines.join("\n");
  }

  // ---------- legacy .doc / plain text ----------
  /**
   * Promise wrapper around `FileReader.readAsText`.
   *
   * @param {Blob} file - The file (or blob) to read.
   * @returns {Promise<string>} Resolves with the reader's result coerced to a
   *   string (an empty string when the result is empty or missing). Rejects
   *   with whatever the reader passes to its `onerror` handler.
   */
  function readAsText(file) {
    return new Promise((resolve, reject) => {
      const reader = new FileReader();
      reader.onerror = reject;
      reader.onload = () => {
        const contents = reader.result || "";
        resolve(String(contents));
      };
      reader.readAsText(file);
    });
  }

  /**
   * Best-effort removal of RTF markup, leaving the document's plain text.
   *
   * `\par` and `\line` become newlines and `\tab` becomes a tab; each is
   * matched only as a whole control word (so `\pard`, `\pardirnatural`,
   * `\partightenfactor0` and similar are not mistaken for `\par`) and takes
   * its single delimiter space with it. All remaining control words and group
   * braces are then dropped. This is not a full RTF parser: the text inside
   * header groups is not removed and escaped characters are not decoded.
   */
  function stripRtfMarkup(rtf) {
    return rtf
      .replace(/\\(?:par|line)(?![a-z]) ?/g, "\n")
      .replace(/\\tab(?![a-z]) ?/g, "\t")
      .replace(/\\[a-z]+-?\d* ?/g, "")
      .replace(/[{}]/g, "")
      .trim();
  }

  /**
   * Extract the text of an uploaded agenda file, choosing the extractor from
   * the file name extension or MIME type.
   *
   *   - `.pdf` / `application/pdf`                       -> extractPdf,  method "PDF"
   *   - `.docx` / type containing
   *     "officedocument.wordprocessing"                  -> extractDocx, method "Word"
   *   - anything else is read as text; content starting
   *     with `{\rtf` has its RTF markup stripped          -> method "RTF"
   *   - otherwise the text is returned unchanged          -> method "Text"
   *
   * @param {File} file - The uploaded file; `name` and `type` may be empty.
   * @returns {Promise<{ text: string, method: string }>} The extracted text
   *   and a label naming how it was obtained.
   * @throws Whatever the selected extractor throws or rejects with.
   */
  async function extractAgendaText(file) {
    const name = (file.name || "").toLowerCase();
    const type = file.type || "";

    if (name.endsWith(".pdf") || type === "application/pdf") {
      return { text: await extractPdf(file), method: "PDF" };
    }
    if (name.endsWith(".docx") || type.includes("officedocument.wordprocessing")) {
      return { text: await extractDocx(file), method: "Word" };
    }
    // .txt / .md / .rtf / legacy .doc — best-effort text read
    const text = await readAsText(file);
    if (/^\s*{\\rtf/.test(text)) {
      return { text: stripRtfMarkup(text), method: "RTF" };
    }
    return { text, method: "Text" };
  }

  // Publish the entry point on `window`; it is the only name this IIFE
  // exposes, the helper functions above stay private to the closure.
  window.extractAgendaText = extractAgendaText;
})();