// app-extract.jsx — real text extraction from PDF (.pdf), Word (.docx), and plain text.
// Exposes window.extractAgendaText(file) -> Promise<{ text, method }>.
(function () {
  // Host global object. This is `window` in a page; the fallback only matters
  // when the script is loaded somewhere `window` does not exist.
  const root = typeof window !== "undefined" ? window : globalThis;

  // The five entities predefined by XML, keyed by name (without "&" and ";").
  const NAMED_ENTITIES = new Map([
    ["amp", "&"],
    ["lt", "<"],
    ["gt", ">"],
    ["quot", '"'],
    ["apos", "'"],
  ]);

  /**
   * Decodes XML character/entity references in a string.
   *
   * Every reference is decoded exactly once in a single pass, so text that is
   * itself an escaped entity is not decoded a second time. Unknown names and
   * out-of-range numeric references are left untouched.
   *
   * @param {*} s - Value to decode; coerced with String().
   * @returns {string} The decoded text.
   */
  function decodeEntities(s) {
    return String(s).replace(
      /&(?:#x([0-9a-f]+)|#(\d+)|([a-z]+));/gi,
      (match, hex, dec, name) => {
        if (name !== undefined) {
          // Map lookup is case-sensitive, as XML entity names are.
          return NAMED_ENTITIES.has(name) ? NAMED_ENTITIES.get(name) : match;
        }
        const code = hex !== undefined ? parseInt(hex, 16) : parseInt(dec, 10);
        // fromCodePoint throws above U+10FFFF; keep such references verbatim.
        return code <= 0x10ffff ? String.fromCodePoint(code) : match;
      }
    );
  }

  // ---------- DOCX (Office Open XML) ----------

  /**
   * Extracts the visible text of a .docx file.
   *
   * Reads word/document.xml through JSZip and walks it with one tokenizer:
   * text runs are decoded and appended, <w:tab> becomes a tab character, and
   * <w:br> or the end of a paragraph ends the current output line.
   *
   * @param {{ arrayBuffer: function(): Promise<ArrayBuffer> }} file - File/Blob-like input.
   * @returns {Promise<string>} Extracted lines joined with "\n".
   * @throws {Error} "docx-lib-missing" when JSZip is not loaded;
   *   "not-a-docx" when the archive has no word/document.xml.
   */
  async function extractDocx(file) {
    if (!root.JSZip) throw new Error("docx-lib-missing");
    const buf = await file.arrayBuffer();
    const zip = await root.JSZip.loadAsync(buf);
    const docFile = zip.file("word/document.xml");
    if (!docFile) throw new Error("not-a-docx");
    const xml = await docFile.async("string");

    // Alternatives, in order:
    //   (no group) <w:tabs>…</w:tabs>  tab-stop definitions — consumed and ignored
    //   group 1    <w:t …>text</w:t>   visible text run (not <w:tab>, <w:tbl>, <w:tc>, …)
    //   group 2    <w:tab …>           tab character
    //   group 3    <w:br …> | </w:p>   line break / end of paragraph
    // Built per call so the /g lastIndex state is never shared between calls.
    const tokenRe =
      /<w:tabs(?:\s[^>]*)?>[\s\S]*?<\/w:tabs>|<w:t(?:\s[^>]*)?>([\s\S]*?)<\/w:t>|(<w:tab\b[^>]*>)|(<w:br\b[^>]*>|<\/w:p>)/g;

    const lines = [];
    let line = "";
    let m;
    while ((m = tokenRe.exec(xml)) !== null) {
      if (m[1] !== undefined) {
        line += decodeEntities(m[1]);
      } else if (m[2] !== undefined) {
        line += "\t";
      } else if (m[3] !== undefined) {
        // Empty paragraphs are kept as empty lines.
        lines.push(line);
        line = "";
      }
    }
    if (line) lines.push(line);
    return lines.join("\n");
  }

  // ---------- PDF (via pdf.js) ----------

  /**
   * Extracts text from a PDF using pdf.js.
   *
   * On each page, text items whose rounded y coordinate is within 4 units of
   * an existing row are grouped into that row. Rows are emitted from highest
   * y to lowest and items within a row from lowest x to highest, with runs of
   * whitespace collapsed to one space.
   *
   * @param {{ arrayBuffer: function(): Promise<ArrayBuffer> }} file - File/Blob-like input.
   * @returns {Promise<string>} Extracted lines joined with "\n".
   * @throws {Error} "pdf-lib-missing" when pdfjsLib is not loaded.
   */
  async function extractPdf(file) {
    const lib = root.pdfjsLib;
    if (!lib) throw new Error("pdf-lib-missing");
    const data = new Uint8Array(await file.arrayBuffer());
    const loadingTask = lib.getDocument({ data, disableWorker: false });
    try {
      const pdf = await loadingTask.promise;
      const allLines = [];
      for (let p = 1; p <= pdf.numPages; p++) {
        const page = await pdf.getPage(p);
        const tc = await page.getTextContent();

        // Group text items into rows by their y position.
        const rows = [];
        for (const it of tc.items) {
          if (!it.str) continue;
          const x = it.transform[4];
          const y = Math.round(it.transform[5]);
          let row = rows.find((r) => Math.abs(r.y - y) < 4);
          if (!row) {
            row = { y, parts: [] };
            rows.push(row);
          }
          row.parts.push({ x, str: it.str });
        }

        rows.sort((a, b) => b.y - a.y); // top to bottom
        for (const row of rows) {
          row.parts.sort((a, b) => a.x - b.x); // left to right
          const text = row.parts
            .map((part) => part.str)
            .join("")
            .replace(/\s+/g, " ")
            .trim();
          if (text) allLines.push(text);
        }
      }
      return allLines.join("\n");
    } finally {
      // Release the document whether extraction succeeded or failed.
      await loadingTask.destroy();
    }
  }

  // ---------- legacy .doc / plain text ----------

  /**
   * Reads a file as text with FileReader.
   *
   * @param {Blob} file - File to read.
   * @returns {Promise<string>} File contents; "" when the reader yields no result.
   *   Rejects with the reader's error when reading fails.
   */
  function readAsText(file) {
    return new Promise((resolve, reject) => {
      const reader = new FileReader();
      reader.onload = () => resolve(String(reader.result || ""));
      reader.onerror = () => reject(reader.error || new Error("read-failed"));
      reader.readAsText(file);
    });
  }

  /**
   * Extracts agenda text from an uploaded file.
   *
   * Dispatch is by file name extension or MIME type: PDF, then DOCX, otherwise
   * the file is read as text. Text that starts with an RTF header gets a
   * best-effort stripping of control words and braces.
   *
   * @param {File} file - The uploaded file.
   * @returns {Promise<{ text: string, method: string }>} Extracted text and the
   *   method used: "PDF", "Word", "RTF" or "Text".
   */
  async function extractAgendaText(file) {
    const name = (file.name || "").toLowerCase();
    const type = file.type || "";

    if (name.endsWith(".pdf") || type === "application/pdf") {
      return { text: await extractPdf(file), method: "PDF" };
    }
    if (name.endsWith(".docx") || type.includes("officedocument.wordprocessing")) {
      return { text: await extractDocx(file), method: "Word" };
    }

    // .txt / .md / .rtf / legacy .doc — best-effort text read
    const text = await readAsText(file);

    // Strip RTF control words if present.
    if (/^\s*\{\\rtf/.test(text)) {
      const stripped = text
        // \par and \pard become newlines, but only as whole control words, so
        // longer words that merely start with "\par" are left for the next
        // step. The single delimiter space after the word is consumed too.
        .replace(/\\pard?(?![a-z]) ?/g, "\n")
        .replace(/\\[a-z]+-?\d* ?/g, "")
        .replace(/[{}]/g, "")
        .trim();
      return { text: stripped, method: "RTF" };
    }
    return { text, method: "Text" };
  }

  root.extractAgendaText = extractAgendaText;
})();