846 lines
30 KiB
TypeScript
846 lines
30 KiB
TypeScript
// Utilities to extract DocuSeal-style signature placeholders from PDFs
// and estimate reasonable positions when exact text coordinates are unavailable.
|
|
|
|
import { PDFDocument } from 'pdf-lib';
|
|
import pako from 'pako';
|
|
import { execFile } from 'child_process';
|
|
import { promisify } from 'util';
|
|
import * as fs from 'fs';
|
|
import * as os from 'os';
|
|
import * as path from 'path';
|
|
import { PDFiumLibrary } from '@hyzyla/pdfium';
|
|
|
|
// Promise-returning wrapper around child_process.execFile (built with util.promisify),
// used below to await the external text-extraction script.
const execFileAsync = promisify(execFile);
|
|
|
|
/**
 * One DocuSeal-style placeholder tag located in extracted PDF text.
 *
 * Example tag: {{Signature;role=Employeur;type=signature;height=60;width=150}}
 */
export type PlaceholderMatch = {
  /** Complete tag text as matched, including the surrounding double braces. */
  fullMatch: string;
  /** Trimmed text found before the first `;` of the tag. */
  label: string;
  /** Trimmed value of the `role=` attribute. */
  role: string;
  /** Trimmed value of the `type=` attribute. */
  type: string;
  /** Value of the `width=` attribute; DocuSeal placeholders declare dimensions in pixels (px). */
  width: number;
  /** Value of the `height=` attribute, in pixels (px). */
  height: number;
  /** Offset of the first character of the tag in the searched text. */
  startIndex: number;
  /** Offset just past the last character of the tag (`startIndex + fullMatch.length`). */
  endIndex: number;
};
|
|
|
|
/**
 * Approximate on-page placement of a placeholder, produced when exact text
 * coordinates are not available. All geometry is expressed in percentages.
 */
export type EstimatedPosition = {
  /** Role copied from the placeholder. */
  role: string;
  /** Label copied from the placeholder. */
  label: string;
  /** Page number, 1-indexed. */
  page: number;
  /** Horizontal offset from the left edge, in percent (%). */
  x: number;
  /** Vertical offset from the top edge, in percent (%). */
  y: number;
  /** Field width, in percent (%). */
  width: number;
  /** Field height, in percent (%). */
  height: number;
};
|
|
|
|
/**
 * Placement of a placeholder obtained from the PDF content itself.
 * All geometry is expressed in percentages.
 */
export type ExtractedPosition = {
  /** Role declared in the placeholder. */
  role: string;
  /** Label declared in the placeholder. */
  label: string;
  /** Page number, 1-indexed. */
  page: number;
  /** Horizontal offset from the left edge, in percent (%). */
  x: number;
  /** Vertical offset from the top edge, in percent (%). */
  y: number;
  /** Field width, in percent (%). */
  width: number;
  /** Field height, in percent (%). */
  height: number;
  /** Text of the placeholder that was found. */
  text: string;
};
|
|
|
|
/**
 * Matches DocuSeal-style tags such as
 * {{Signature;role=Employeur;type=signature;height=60;width=150}}.
 * Capture groups: 1 = label, 2 = role, 3 = type, 4 = height, 5 = width.
 *
 * Braces are excluded from the label/role/type groups so that a match cannot
 * start at an earlier, unrelated "{{" and swallow the text in between.
 *
 * The regex carries the `g` flag and is therefore stateful: reset `lastIndex`
 * to 0 before each scan.
 */
const PLACEHOLDER_REGEX = /\{\{([^;{}]+);role=([^;{}]+);type=([^;{}]+);height=(\d+);width=(\d+)\}\}/g;
|
|
|
|
/**
 * Heuristic page counter: scans the raw PDF bytes (read as Latin-1 text) for
 * '/Type /Page' markers, excluding '/Type /Pages'. No real PDF parsing is done.
 *
 * @param bytes Raw PDF file content.
 * @returns The number of markers found, or 1 when none is found or the scan throws.
 */
export function countPdfPagesFromBytes(bytes: Uint8Array | Buffer): number {
  let pageObjectCount = 0;

  try {
    const raw = bufferToLatin1String(bytes);
    // The negative lookahead rejects the '/Pages' tree node.
    const found = raw.match(/\/Type\s*\/Page(?!s)\b/g);
    pageObjectCount = found ? found.length : 0;
  } catch {
    // Best effort only: fall through to the default below.
  }

  // Unknown page count defaults to a single page.
  return pageObjectCount > 0 ? pageObjectCount : 1;
}
|
|
|
|
/**
 * Extracts placeholders by running the external Node.js script
 * `scripts/extract-pdf-text.js` (pdf-parse) on a temporary copy of the PDF.
 * Running the parser out of process works around Webpack/Next.js
 * compatibility problems with pdfjs-dist.
 *
 * The script is expected to print a JSON object on stdout with a `text`
 * string, an optional `numPages` number and an optional `error` field.
 *
 * This function never rejects: any failure (script error, timeout, invalid
 * JSON, I/O error) is logged and reported as an empty result.
 *
 * @param bytes Raw PDF file content.
 * @returns The placeholders found, the extracted text and the page count
 *          reported by the script (undefined when unavailable).
 */
export async function extractPlaceholdersWithPdfParse(bytes: Uint8Array | Buffer): Promise<{ placeholders: PlaceholderMatch[]; text: string; numPages?: number } > {
  let tempDir: string | null = null;

  try {
    console.log('[extractPlaceholdersWithPdfParse] Extraction du texte avec script externe...');

    // mkdtemp creates a uniquely named directory, so concurrent calls can
    // never write to (or delete) each other's temporary file.
    tempDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'pdf-extract-'));
    const tempFile = path.join(tempDir, 'input.pdf');
    await fs.promises.writeFile(tempFile, bytes);
    console.log('[extractPlaceholdersWithPdfParse] Fichier temporaire:', tempFile);

    // Path to the extraction script, resolved from the current working directory.
    const scriptPath = path.join(process.cwd(), 'scripts', 'extract-pdf-text.js');
    console.log('[extractPlaceholdersWithPdfParse] Script path:', scriptPath);

    // The `timeout` option makes Node kill the child after 30s; a hand-rolled
    // Promise.race would leave the process (and its timer) running.
    const { stdout, stderr } = await execFileAsync('node', [scriptPath, tempFile], {
      maxBuffer: 10 * 1024 * 1024, // 10MB max
      timeout: 30000,
    });

    if (stderr) {
      console.error('[extractPlaceholdersWithPdfParse] Stderr:', stderr);
    }

    console.log('[extractPlaceholdersWithPdfParse] Stdout length:', stdout.length);

    // Parse and validate the JSON reply before trusting its shape.
    const parsed: unknown = JSON.parse(stdout);
    if (typeof parsed !== 'object' || parsed === null) {
      throw new Error('Réponse JSON invalide du script d\'extraction');
    }
    const data = parsed as { error?: unknown; text?: unknown; numPages?: unknown };

    if (data.error) {
      throw new Error(String(data.error));
    }
    if (typeof data.text !== 'string') {
      throw new Error('Réponse du script d\'extraction sans champ "text"');
    }

    const text = data.text;
    const numPages = typeof data.numPages === 'number' ? data.numPages : undefined;

    console.log('[extractPlaceholdersWithPdfParse] Texte extrait:', {
      textLength: text.length,
      numPages,
      textPreview: text.substring(0, 500),
      containsPlaceholder: text.includes('{{'),
      containsSignature: text.includes('Signature'),
    });

    if (!text.includes('{{')) {
      console.warn('[extractPlaceholdersWithPdfParse] Aucun placeholder {{ trouvé dans le texte');
      return { placeholders: [], text, numPages };
    }

    const placeholders: PlaceholderMatch[] = [];

    // The shared regex is global (stateful): always restart from the beginning.
    PLACEHOLDER_REGEX.lastIndex = 0;
    let match: RegExpExecArray | null;

    while ((match = PLACEHOLDER_REGEX.exec(text)) !== null) {
      console.log('[extractPlaceholdersWithPdfParse] ✅ Placeholder trouvé:', match[0]);
      placeholders.push({
        fullMatch: match[0],
        label: match[1].trim(),
        role: match[2].trim(),
        type: match[3].trim(),
        height: parseInt(match[4], 10),
        width: parseInt(match[5], 10),
        startIndex: match.index,
        endIndex: match.index + match[0].length,
      });
    }

    console.log('[extractPlaceholdersWithPdfParse] Total trouvé:', placeholders.length);
    return { placeholders, text, numPages };
  } catch (error) {
    console.error('[extractPlaceholdersWithPdfParse] Erreur:', error);
    return { placeholders: [], text: '', numPages: undefined };
  } finally {
    // Remove the temporary directory together with the PDF copy it contains.
    if (tempDir) {
      try {
        await fs.promises.rm(tempDir, { recursive: true, force: true });
        console.log('[extractPlaceholdersWithPdfParse] Fichier temporaire supprimé');
      } catch (e) {
        console.error('[extractPlaceholdersWithPdfParse] Erreur lors de la suppression du fichier temporaire:', e);
      }
    }
  }
}
|
|
|
|
/**
 * Extracts placeholders directly from raw PDF bytes.
 *
 * The searched text is built in three layers:
 *  1. the raw file read as Latin-1,
 *  2. the inflated content of every FlateDecode stream found in it,
 *  3. every `<...>` hex string of layers 1-2 decoded as 16-bit code units.
 *
 * Because the layers are concatenated, `startIndex`/`endIndex` of the returned
 * matches are offsets into that combined text, not into the original file.
 *
 * @param bytes Raw PDF file content.
 * @returns Every placeholder found in the combined text (possibly empty).
 */
export function extractPlaceholdersFromPdfBuffer(bytes: Uint8Array | Buffer): PlaceholderMatch[] {
  const text = bufferToLatin1String(bytes);

  let decompressedText = text;

  // The payload is binary: consume exactly one end-of-line after `stream` and
  // at most one before `endstream`. Stripping arbitrary whitespace (which in
  // JS also includes byte 0xA0) would corrupt streams whose data starts or
  // ends with such a byte.
  const streamRegex = /\/Filter\s*\/FlateDecode[^>]*>>\s*stream(?:\r\n|[\r\n])([\s\S]*?)\r?\n?endstream/g;
  let streamMatch: RegExpExecArray | null;

  console.log('[extractPlaceholdersFromPdfBuffer] Recherche de streams FlateDecode...');
  let streamCount = 0;

  while ((streamMatch = streamRegex.exec(text)) !== null) {
    streamCount++;
    try {
      const compressedData = streamMatch[1];

      // Latin-1 text maps one char to one byte, so char codes are the original bytes.
      const streamBytes = new Uint8Array(compressedData.length);
      for (let i = 0; i < compressedData.length; i++) {
        streamBytes[i] = compressedData.charCodeAt(i);
      }

      // Inflate with pako (zlib).
      const decompressed = pako.inflate(streamBytes, { to: 'string' });

      decompressedText += '\n' + decompressed;

      console.log(`[extractPlaceholdersFromPdfBuffer] Stream ${streamCount} décompressé:`, {
        compressedSize: compressedData.length,
        decompressedSize: decompressed.length,
        preview: decompressed.substring(0, 200),
        containsPlaceholder: decompressed.includes('{{'),
      });
    } catch (err) {
      // A stream that cannot be inflated is skipped; the others are still used.
      console.warn(`[extractPlaceholdersFromPdfBuffer] Erreur décompression stream ${streamCount}:`, err);
    }
  }

  const placeholderIndex = decompressedText.indexOf('{{');
  console.log('[extractPlaceholdersFromPdfBuffer] Texte final:', {
    textLength: decompressedText.length,
    streamCount,
    containsPlaceholder: placeholderIndex >= 0,
    containsSignature: decompressedText.includes('Signature'),
    placeholderIndex,
    contextAroundPlaceholder: placeholderIndex >= 0
      ? decompressedText.substring(
          Math.max(0, placeholderIndex - 100),
          Math.min(decompressedText.length, placeholderIndex + 300)
        )
      : 'N/A',
  });

  // Text may be hex-encoded inside PDF text operators: decode every <...> sequence.
  console.log('[extractPlaceholdersFromPdfBuffer] Décodage des séquences hexadécimales...');
  const hexRegex = /<([0-9A-Fa-f\s]+)>/g;
  let hexMatch: RegExpExecArray | null;
  let hexDecodeCount = 0;

  // Scan a snapshot and append afterwards, so freshly decoded text is never
  // itself re-scanned as if it were PDF source.
  const hexSource = decompressedText;
  const decodedChunks: string[] = [];

  while ((hexMatch = hexRegex.exec(hexSource)) !== null) {
    const hexString = hexMatch[1].replace(/\s/g, '');
    let decoded = '';
    // Attempt a UTF-16 style decoding: 4 hex digits per character.
    for (let i = 0; i < hexString.length; i += 4) {
      const charCode = parseInt(hexString.slice(i, i + 4), 16);
      if (charCode > 0 && charCode < 65536) {
        decoded += String.fromCharCode(charCode);
      }
    }
    if (decoded) {
      decodedChunks.push(decoded);
      hexDecodeCount++;
      if (decoded.includes('{{') || decoded.includes('Signature')) {
        console.log('[extractPlaceholdersFromPdfBuffer] ✅ Texte hex décodé:', decoded.substring(0, 200));
      }
    }
  }

  if (decodedChunks.length > 0) {
    decompressedText += '\n' + decodedChunks.join('\n');
  }

  console.log('[extractPlaceholdersFromPdfBuffer] Après décodage hex:', {
    hexDecodeCount,
    newTextLength: decompressedText.length,
    containsPlaceholder: decompressedText.includes('{{'),
    containsSignature: decompressedText.includes('Signature'),
  });

  const placeholders: PlaceholderMatch[] = [];

  // The shared regex is global (stateful): always restart from the beginning.
  PLACEHOLDER_REGEX.lastIndex = 0;
  let match: RegExpExecArray | null;
  while ((match = PLACEHOLDER_REGEX.exec(decompressedText)) !== null) {
    console.log('[extractPlaceholdersFromPdfBuffer] ✅ Match trouvé:', match[0]);
    placeholders.push({
      fullMatch: match[0],
      label: match[1].trim(),
      role: match[2].trim(),
      type: match[3].trim(),
      height: parseInt(match[4], 10),
      width: parseInt(match[5], 10),
      startIndex: match.index,
      endIndex: match.index + match[0].length,
    });
  }

  console.log('[extractPlaceholdersFromPdfBuffer] Total trouvé:', placeholders.length);
  return placeholders;
}
|
|
|
|
/**
 * Estimates reasonable positions (in PERCENTAGES, not millimetres) for
 * placeholders whose exact coordinates are unknown.
 *
 * Every field is placed on the last page. The first placeholder sits just
 * above the bottom margin and each following one is stacked ABOVE the previous
 * one. Roles recognised as employee ("salari…", "employé", "employe", but not
 * "employeur") go to the right; every other role goes to the left.
 *
 * Pixel dimensions are converted with 1px = 0.75pt and an assumed A4 page
 * (595 x 842 pt). Results are clamped so a field never starts outside the
 * page, however many placeholders are stacked or however wide a field is.
 *
 * @param placeholders Placeholders to position; a width/height of 0 falls back to 150x60 px.
 * @param pageCount Number of pages of the document; values below 1 or non-finite are treated as 1.
 * @returns One estimated position per placeholder, in input order.
 */
export function estimatePositionsFromPlaceholders(
  placeholders: PlaceholderMatch[],
  pageCount: number
): EstimatedPosition[] {
  const MARGIN_X_PERCENT = 9.5; // ~20mm of 210mm
  const MARGIN_BOTTOM_PERCENT = 10; // ~30mm of 297mm
  const SPACING_PERCENT = 5; // vertical gap between stacked signatures
  // Assumed A4 page size in points.
  const A4_WIDTH_PT = 595;
  const A4_HEIGHT_PT = 842;
  const PX_TO_PT = 0.75;

  const clamp = (v: number, min: number, max: number) => Math.max(min, Math.min(max, v));

  // Prefer placing on the last page by default.
  const defaultPage = Number.isFinite(pageCount) ? Math.max(1, pageCount) : 1;

  return placeholders.map((ph, index) => {
    const widthPercent = (((ph.width || 150) * PX_TO_PT) / A4_WIDTH_PT) * 100;
    const heightPercent = (((ph.height || 60) * PX_TO_PT) / A4_HEIGHT_PT) * 100;

    // Stack upward from the bottom margin, one slot per placeholder.
    const baseY = 100 - MARGIN_BOTTOM_PERCENT - heightPercent;
    const rawY = baseY - (index * (heightPercent + SPACING_PERCENT));

    // "employeur" must be tested first because it contains "employe".
    const roleLc = ph.role.toLowerCase();
    const isEmployer = roleLc.includes('employeur');
    const isEmployee = !isEmployer && (roleLc.includes('salari') || roleLc.includes('employé') || roleLc.includes('employe'));

    const rawX = isEmployee
      ? 100 - MARGIN_X_PERCENT - widthPercent // right side for the employee
      : MARGIN_X_PERCENT; // left side for the employer and any other role

    return {
      role: ph.role,
      label: ph.label,
      page: defaultPage,
      // Clamping only changes values that would otherwise fall off the page.
      x: clamp(rawX, 0, Math.max(0, 100 - widthPercent)),
      y: clamp(rawY, 0, Math.max(0, 100 - heightPercent)),
      width: widthPercent,
      height: heightPercent,
    };
  });
}
|
|
|
|
/**
 * Improved estimation based on the full text extracted from the PDF.
 *
 * - The page comes from "Page X / N" marker lines when the text contains
 *   any; otherwise the lines are split evenly across `pageCount` pages.
 * - Y is derived from the line's relative position inside its page.
 * - X is derived from the placeholder's offset inside its line.
 * - Width/height are converted px -> pt (x0.75) -> % of an assumed A4 page.
 *
 * Each placeholder is located at its own `startIndex` when the text really
 * contains the tag there, so repeated identical tags keep distinct positions.
 * The first occurrence in the text is only used as a fallback.
 *
 * @param placeholders Placeholders found in `text`.
 * @param text Full extracted text (LF or CRLF line endings).
 * @param pageCount Number of pages; values below 1 or non-finite are treated as 1.
 * @returns One estimated position per placeholder, in input order.
 */
export function estimatePositionsFromPlaceholdersUsingText(
  placeholders: PlaceholderMatch[],
  text: string,
  pageCount: number
): EstimatedPosition[] {
  const A4_WIDTH_PT = 595;
  const A4_HEIGHT_PT = 842;
  const PX_TO_PT = 0.75;
  const TOP_MARGIN_PERCENT = 5;
  const BOTTOM_MARGIN_PERCENT = 5;
  const LEFT_MARGIN_PERCENT = 5;
  const RIGHT_MARGIN_PERCENT = 5;

  const clamp = (v: number, min: number, max: number) => Math.max(min, Math.min(max, v));
  const safePageCount = Number.isFinite(pageCount) ? Math.max(1, pageCount) : 1;

  // Split into lines while recording the exact start offset of each one.
  // Offsets are taken from the real line-break positions so CRLF text does
  // not drift by one character per line.
  const lines: string[] = [];
  const lineOffsets: number[] = [];
  const eol = /\r?\n/g;
  let cursor = 0;
  for (let br = eol.exec(text); br !== null; br = eol.exec(text)) {
    lines.push(text.slice(cursor, br.index));
    lineOffsets.push(cursor);
    cursor = br.index + br[0].length;
  }
  lines.push(text.slice(cursor));
  lineOffsets.push(cursor);
  const totalLines = lines.length; // always >= 1

  // A line consisting solely of "Page X / N" marks a page boundary.
  const pageMarkers: { page: number; offset: number; lineIndex: number }[] = [];
  lines.forEach((ln, lineIndex) => {
    const m = ln.match(/^Page\s+(\d+)\s*\/\s*(\d+)\s*$/);
    if (m) {
      pageMarkers.push({ page: parseInt(m[1], 10), offset: lineOffsets[lineIndex], lineIndex });
    }
  });

  // Index of the last line starting at or before `pos` (binary search; offsets are ascending).
  const findLineIndex = (pos: number): number => {
    let lo = 0;
    let hi = lineOffsets.length - 1;
    while (lo < hi) {
      const mid = (lo + hi + 1) >> 1;
      if (lineOffsets[mid] <= pos) lo = mid; else hi = mid - 1;
    }
    return lo;
  };

  const linesPerPage = Math.max(1, Math.floor(totalLines / safePageCount));

  return placeholders.map((ph) => {
    const widthPercent = (((ph.width || 150) * PX_TO_PT) / A4_WIDTH_PT) * 100;
    const heightPercent = (((ph.height || 60) * PX_TO_PT) / A4_HEIGHT_PT) * 100;

    // Trust the recorded offset when the tag is really there; otherwise fall
    // back to the first occurrence, then to the recorded offset as a last resort.
    let idx = text.startsWith(ph.fullMatch, ph.startIndex) ? ph.startIndex : text.indexOf(ph.fullMatch);
    if (idx === -1) idx = ph.startIndex;

    const lineIdx = findLineIndex(idx);

    // Page: last marker located at or before the tag, or uniform split without markers.
    let page: number;
    if (pageMarkers.length > 0) {
      let chosen = pageMarkers[0];
      for (const pm of pageMarkers) {
        if (pm.offset <= idx) chosen = pm; else break;
      }
      page = chosen.page;
    } else {
      page = Math.floor(lineIdx / linesPerPage) + 1;
    }
    page = clamp(page, 1, safePageCount);

    // Relative vertical position inside the page: 0 = top, 1 = bottom.
    let relInPage: number;
    if (pageMarkers.length > 0) {
      const thisMarkerIdx = pageMarkers.findIndex((pm) => pm.page === page);
      const startLine = thisMarkerIdx >= 0 ? pageMarkers[thisMarkerIdx].lineIndex : 0;
      const endLine = thisMarkerIdx + 1 < pageMarkers.length ? pageMarkers[thisMarkerIdx + 1].lineIndex : totalLines - 1;
      const span = Math.max(1, endLine - startLine);
      relInPage = clamp(lineIdx - startLine, 0, span) / span;
    } else {
      const firstLineOfPage = (page - 1) * linesPerPage;
      // Lines left over by the integer division land on the last page; keep them within it.
      relInPage = clamp((lineIdx - firstLineOfPage) / linesPerPage, 0, 1);
    }

    const usableHeight = 100 - TOP_MARGIN_PERCENT - BOTTOM_MARGIN_PERCENT - heightPercent;
    const yPercent = clamp(TOP_MARGIN_PERCENT + relInPage * usableHeight, 0, 100 - heightPercent);

    // X from the tag's offset inside its line.
    const posInLine = Math.max(0, idx - lineOffsets[lineIdx]);
    const lineLen = Math.max(1, lines[lineIdx]?.length || 1);
    const usableWidth = Math.max(0, 100 - LEFT_MARGIN_PERCENT - RIGHT_MARGIN_PERCENT - widthPercent);
    const xPercent = clamp(LEFT_MARGIN_PERCENT + (posInLine / lineLen) * usableWidth, 0, 100 - widthPercent);

    return {
      role: ph.role,
      label: ph.label,
      page,
      x: xPercent,
      y: yPercent,
      width: widthPercent,
      height: heightPercent,
    };
  });
}
|
|
|
|
/**
 * Converts raw bytes to a string with exactly one character per byte
 * (Latin-1), so binary content can be scanned with string and regex tools.
 *
 * A plain Uint8Array is wrapped in a Buffer view (no copy) before decoding:
 * calling `toString('latin1')` directly on a Uint8Array ignores the argument
 * and returns the comma-separated byte values instead of the text.
 *
 * @param bytes Bytes to convert.
 * @returns A string whose char codes equal the input bytes.
 */
function bufferToLatin1String(bytes: Uint8Array | Buffer): string {
  if (typeof Buffer !== 'undefined') {
    const buf = Buffer.isBuffer(bytes)
      ? bytes
      : Buffer.from(bytes.buffer, bytes.byteOffset, bytes.byteLength);
    return buf.toString('latin1');
  }

  // Fallback for environments without Node's Buffer: convert in chunks so the
  // argument list passed to fromCharCode stays small.
  let result = '';
  const chunk = 8192;
  for (let i = 0; i < bytes.length; i += chunk) {
    result += String.fromCharCode(...bytes.subarray(i, i + chunk));
  }
  return result;
}
|
|
|
|
/**
 * Locates placeholders page by page with pdf-lib and reports where each one
 * sits, as PERCENTAGES of the page so the result is resolution independent.
 *
 * For every page the content text is obtained through extractPageTextContent,
 * scanned with PLACEHOLDER_REGEX, and each hit is positioned through
 * findTextPositionInPage. Hits whose position cannot be determined are logged
 * and left out.
 *
 * @param pdfBytes Raw PDF file content.
 * @returns The positions found; an empty array when nothing is found or when
 *          loading/processing the document throws (the error is logged).
 */
export async function extractPrecisePositionsFromPdf(
  pdfBytes: Uint8Array | Buffer
): Promise<ExtractedPosition[]> {
  try {
    const pdfDoc = await PDFDocument.load(pdfBytes);
    const found: ExtractedPosition[] = [];

    for (const [pageIndex, page] of pdfDoc.getPages().entries()) {
      const pageNo = pageIndex + 1; // 1-indexed, used for logs and results
      const { width: pageWidthPt, height: pageHeightPt } = page.getSize();

      // Text content of the page.
      const pageContent = await extractPageTextContent(pdfDoc, pageIndex);

      console.log(`[PLACEHOLDER] Page ${pageNo} - Contenu extrait:`, {
        hasContent: !!pageContent,
        contentLength: pageContent?.length || 0,
        contentPreview: pageContent?.substring(0, 200),
      });

      if (!pageContent) {
        console.warn(`[PLACEHOLDER] Page ${pageNo} - Aucun contenu extrait`);
        continue;
      }

      // The shared regex is global (stateful): restart from the beginning.
      PLACEHOLDER_REGEX.lastIndex = 0;
      console.log(`[PLACEHOLDER] Page ${pageNo} - Recherche de placeholders avec regex:`, PLACEHOLDER_REGEX);

      let hits = 0;
      for (
        let m = PLACEHOLDER_REGEX.exec(pageContent);
        m !== null;
        m = PLACEHOLDER_REGEX.exec(pageContent)
      ) {
        hits += 1;
        const fullText = m[0];
        const label = m[1].trim();
        const role = m[2].trim();
        // DocuSeal dimensions are declared in pixels.
        const heightPx = parseInt(m[4], 10);
        const widthPx = parseInt(m[5], 10);

        const textPosition = findTextPositionInPage(pageContent, fullText);
        if (!textPosition) {
          console.warn(`[PLACEHOLDER] Position non trouvée pour: ${label} sur page ${pageNo}`);
          continue;
        }

        // PDF position (points, bottom-left origin) -> % (top-left origin).
        const xPercent = (textPosition.x / pageWidthPt) * 100;
        const yPercent = ((pageHeightPt - textPosition.y) / pageHeightPt) * 100;

        // Dimensions: px -> pt (x0.75) -> % of the page.
        const widthPt = widthPx * 0.75;
        const heightPt = heightPx * 0.75;
        const widthPercent = (widthPt / pageWidthPt) * 100;
        const heightPercent = (heightPt / pageHeightPt) * 100;

        found.push({
          role,
          label,
          page: pageNo,
          x: xPercent,
          y: yPercent,
          width: widthPercent,
          height: heightPercent,
          text: fullText,
        });

        console.log(`[PLACEHOLDER] Trouvé sur page ${pageNo}: ${label} (${role})`);
        console.log(`  Position: x=${xPercent.toFixed(1)}%, y=${yPercent.toFixed(1)}%, w=${widthPercent.toFixed(1)}%, h=${heightPercent.toFixed(1)}%`);
      }

      if (hits === 0) {
        console.log(`[PLACEHOLDER] Page ${pageNo} - Aucun placeholder trouvé dans le contenu`);
      } else {
        console.log(`[PLACEHOLDER] Page ${pageNo} - ${hits} placeholder(s) trouvé(s)`);
      }
    }

    console.log(`[PLACEHOLDER] Total trouvé: ${found.length} positions précises`);
    return found;
  } catch (error) {
    console.error('[PLACEHOLDER] Erreur lors de l\'extraction précise:', error);
    // An empty result lets the caller fall back to another method.
    return [];
  }
}
|
|
|
|
/**
 * Tries to inflate (zlib) the raw bytes of a content stream.
 * Returns the input unchanged when it is not zlib data or cannot be inflated.
 */
function inflateContentStreamBytes(raw: Uint8Array): Uint8Array {
  try {
    const inflated = pako.inflate(raw);
    return inflated instanceof Uint8Array && inflated.length > 0 ? inflated : raw;
  } catch {
    // Not compressed (or not FlateDecode): use the bytes as they are.
    return raw;
  }
}

/**
 * Returns the raw content-stream text of one page, with every stream
 * inflated when it is zlib-compressed and the streams joined by newlines.
 *
 * pdf-lib's `normalizedEntries()` exposes `Contents` as a PDFArray of stream
 * references, which is not a JavaScript array; it is unwrapped here through
 * its `asArray()` method. A plain array or a single stream is also accepted.
 * NOTE(review): relies on parsed streams exposing their still-encoded bytes
 * as `.contents` — confirm against the pdf-lib version in use.
 *
 * @param pdfDoc Loaded pdf-lib document.
 * @param pageIndex Zero-based page index.
 * @returns The decoded content text, or null when the page has no readable
 *          content stream or an error occurs (the error is logged).
 */
async function extractPageTextContent(pdfDoc: PDFDocument, pageIndex: number): Promise<string | null> {
  try {
    const page = pdfDoc.getPages()[pageIndex];
    const { Contents } = page.node.normalizedEntries();

    if (!Contents) {
      console.log(`[extractPageTextContent] Page ${pageIndex}: Pas de Contents`);
      return null;
    }

    // Contents may be a PDFArray, a plain array or a single stream/reference.
    const contentsObj: any = Contents;
    let entries: any[];
    if (Array.isArray(contentsObj)) {
      entries = contentsObj;
    } else if (typeof contentsObj.asArray === 'function') {
      entries = contentsObj.asArray();
    } else {
      entries = [contentsObj];
    }

    const decoder = new TextDecoder('utf-8', { fatal: false });
    const parts: string[] = [];
    for (const entry of entries) {
      const stream: any = pdfDoc.context.lookup(entry);
      const raw: unknown = stream?.contents;
      if (raw instanceof Uint8Array) {
        parts.push(decoder.decode(inflateContentStreamBytes(raw)));
      }
    }

    if (parts.length === 0) {
      return null;
    }

    // Consecutive content streams form one logical stream; a newline keeps
    // the last operator of one from fusing with the first of the next.
    const text = parts.join('\n');
    console.log(`[extractPageTextContent] Page ${pageIndex}: ${text.length} caractères extraits`);
    return text || null;
  } catch (error) {
    console.error(`[PLACEHOLDER] Erreur extraction contenu page ${pageIndex}:`, error);
    return null;
  }
}
|
|
|
|
/**
 * Finds where a piece of text is drawn by looking at the text-positioning
 * operators that precede it in a page's content stream.
 *
 * The text is searched verbatim first, then without its `{`/`}` characters.
 * Within the 2000 characters before the hit, the last `Tm` operator
 * (`a b c d e f Tm`, where e/f are x/y) is preferred; if there is none, the
 * last `Td`/`TD` operator is used.
 *
 * This is an approximation: `Td`/`TD` operands are relative moves and are
 * returned as-is, a `Td` that follows the chosen `Tm` is ignored, and no
 * outer transformation is applied.
 *
 * @param pageContent Decoded content stream of the page.
 * @param searchText Text to locate.
 * @returns The x/y operands found (PDF units), or null when the text or a
 *          positioning operator cannot be found.
 */
function findTextPositionInPage(pageContent: string, searchText: string): { x: number; y: number } | null {
  try {
    // Exact text first.
    let textIndex = pageContent.indexOf(searchText);

    // Braces are sometimes encoded or split off: retry without them.
    if (textIndex === -1) {
      const simplifiedSearch = searchText.replace(/[{}]/g, '');
      textIndex = pageContent.indexOf(simplifiedSearch);
    }

    if (textIndex === -1) {
      console.warn('[PLACEHOLDER] Texte non trouvé dans le contenu de la page:', searchText.substring(0, 50));
      return null;
    }

    // Only the operators shortly before the text are relevant.
    const beforeText = pageContent.substring(Math.max(0, textIndex - 2000), textIndex);

    // PDF real numbers may omit the integer part (".5") or the fraction ("4.").
    const NUM = '([+-]?(?:\\d+\\.?\\d*|\\.\\d+))';
    const tmRegex = new RegExp(`${Array(6).fill(NUM).join('\\s+')}\\s+Tm`, 'g');
    const tdRegex = new RegExp(`${NUM}\\s+${NUM}\\s+T[dD]`, 'g');

    // Last text matrix before the text: operands 5 and 6 are x and y.
    const tmArray = Array.from(beforeText.matchAll(tmRegex));
    if (tmArray.length > 0) {
      const lastTm = tmArray[tmArray.length - 1];
      const x = parseFloat(lastTm[5]); // X position (operand e)
      const y = parseFloat(lastTm[6]); // Y position (operand f)

      console.log('[PLACEHOLDER] Position trouvée via Tm:', { x, y, text: searchText.substring(0, 30) });
      return { x, y };
    }

    // Fallback: last text move operator before the text.
    const tdArray = Array.from(beforeText.matchAll(tdRegex));
    if (tdArray.length > 0) {
      const lastTd = tdArray[tdArray.length - 1];
      const x = parseFloat(lastTd[1]);
      const y = parseFloat(lastTd[2]);

      console.log('[PLACEHOLDER] Position trouvée via Td:', { x, y, text: searchText.substring(0, 30) });
      return { x, y };
    }

    console.warn('[PLACEHOLDER] Aucun opérateur de position trouvé pour:', searchText.substring(0, 50));
    return null;
  } catch (error) {
    console.error('[PLACEHOLDER] Erreur recherche position texte:', error);
    return null;
  }
}
|
|
|
|
/**
 * Extracts placeholders with Pdfium and returns their positions in PERCENTAGES.
 *
 * Pdfium supplies the text of each page and the page size. The position
 * itself is an estimate derived from the text layout:
 *  - Y from the index of the placeholder's line among the page's lines,
 *  - X from the placeholder's offset inside that line,
 * both mapped inside 5% margins. Width/height are converted px -> pt (x0.75)
 * -> % of the actual page size.
 *
 * Page numbers are derived from iteration order so the result is always
 * 1-indexed, whatever numbering convention the library uses for `page.number`.
 * Line offsets are computed from the real line-break positions, so CRLF text
 * does not shift the estimated column.
 *
 * @param pdfBuffer Raw PDF file content.
 * @returns The positions found, in page then text order; an empty array when
 *          nothing is found or when any step throws (the error is logged).
 */
export async function extractPlaceholdersWithPdfium(
  pdfBuffer: Buffer | Uint8Array
): Promise<ExtractedPosition[]> {
  let library: any = null;
  let document: any = null;

  const clamp = (v: number, min: number, max: number) => Math.max(min, Math.min(max, v));
  const LEFT_MARGIN_PERCENT = 5;
  const RIGHT_MARGIN_PERCENT = 5;
  const TOP_MARGIN_PERCENT = 5;
  const BOTTOM_MARGIN_PERCENT = 5;
  const PX_TO_PT = 0.75;

  try {
    console.log('[PDFIUM] Initialisation de Pdfium...');

    library = await PDFiumLibrary.init();

    // Load the document from the in-memory buffer.
    document = await library.loadDocument(pdfBuffer);
    const pageCount = document.getPageCount();

    console.log(`[PDFIUM] Document chargé: ${pageCount} page(s)`);

    const positions: ExtractedPosition[] = [];

    let pageNumber = 0; // 1-indexed once incremented inside the loop
    for (const page of document.pages()) {
      pageNumber += 1;
      console.log(`[PDFIUM] Extraction texte page ${pageNumber}...`);

      const pageText: string = page.getText();

      console.log(`[PDFIUM] Page ${pageNumber} - Texte extrait:`, {
        length: pageText.length,
        preview: pageText.substring(0, 200),
        containsPlaceholder: pageText.includes('{{'),
      });

      // Page dimensions, treated as points (1/72 inch).
      const { width: pageWidthPt, height: pageHeightPt } = page.getSize();

      // Split into lines while recording the exact start offset of each line.
      const rawLines: string[] = [];
      const lineStarts: number[] = [];
      const eol = /\r?\n/g;
      let cursor = 0;
      for (let br = eol.exec(pageText); br !== null; br = eol.exec(pageText)) {
        rawLines.push(pageText.slice(cursor, br.index));
        lineStarts.push(cursor);
        cursor = br.index + br[0].length;
      }
      rawLines.push(pageText.slice(cursor));
      lineStarts.push(cursor);
      const totalLines = rawLines.length; // always >= 1

      // The shared regex is global (stateful): restart from the beginning.
      // Matches come out in text order, so no sorting is needed.
      PLACEHOLDER_REGEX.lastIndex = 0;
      let match: RegExpExecArray | null;
      let matchCount = 0;

      while ((match = PLACEHOLDER_REGEX.exec(pageText)) !== null) {
        matchCount++;
        const label = match[1].trim();
        const role = match[2].trim();
        // DocuSeal dimensions are declared in pixels.
        const heightPx = parseInt(match[4], 10);
        const widthPx = parseInt(match[5], 10);
        const fullText = match[0];
        const textIndex = match.index;

        console.log(`[PDFIUM] Placeholder trouvé: ${label} (${role}) à index ${textIndex}`);

        // px -> pt -> % of the page.
        const widthPercent = ((widthPx * PX_TO_PT) / pageWidthPt) * 100;
        const heightPercent = ((heightPx * PX_TO_PT) / pageHeightPt) * 100;

        // Last line starting at or before the placeholder.
        let lineIdx = 0;
        for (let i = 0; i < lineStarts.length; i++) {
          if (lineStarts[i] <= textIndex) lineIdx = i; else break;
        }
        const relLine = lineIdx / totalLines; // 0 at the top, towards 1 at the bottom

        // Y: map the line to a vertical percentage inside the margins,
        // leaving room for the height of the field.
        const usableHeight = 100 - TOP_MARGIN_PERCENT - BOTTOM_MARGIN_PERCENT - heightPercent;
        const yPercent = clamp(TOP_MARGIN_PERCENT + relLine * usableHeight, 0, 100 - heightPercent);

        // X: offset of the placeholder inside its line.
        const posInLine = Math.max(0, textIndex - (lineStarts[lineIdx] ?? 0));
        const lineLen = Math.max(1, rawLines[lineIdx]?.length || 1);
        const usableWidth = Math.max(0, 100 - LEFT_MARGIN_PERCENT - RIGHT_MARGIN_PERCENT - widthPercent);
        const xPercent = clamp(LEFT_MARGIN_PERCENT + (posInLine / lineLen) * usableWidth, 0, 100 - widthPercent);

        positions.push({
          role,
          label,
          page: pageNumber,
          x: xPercent,
          y: yPercent,
          width: widthPercent,
          height: heightPercent,
          text: fullText,
        });

        console.log(`[PDFIUM] Position calculée (texte) #${matchCount}: x=${xPercent.toFixed(1)}%, y=${yPercent.toFixed(1)}% (line ${lineIdx + 1}/${totalLines})`);
      }

      console.log(`[PDFIUM] Page ${pageNumber} - ${matchCount} placeholder(s) trouvé(s)`);
    }

    console.log(`[PDFIUM] Total trouvé: ${positions.length} placeholder(s)`);
    return positions;
  } catch (error) {
    console.error('[PDFIUM] Erreur lors de l\'extraction:', error);
    return [];
  } finally {
    // Release native memory: the document first, then the library.
    if (document) {
      try {
        document.destroy();
        console.log('[PDFIUM] Document détruit');
      } catch (e) {
        console.error('[PDFIUM] Erreur destruction document:', e);
      }
    }
    if (library) {
      try {
        library.destroy();
        console.log('[PDFIUM] Bibliothèque détruite');
      } catch (e) {
        console.error('[PDFIUM] Erreur destruction bibliothèque:', e);
      }
    }
  }
}
|