Add Marker to parse PDFs effectively

This commit is contained in:
Adrien
2026-04-04 21:30:18 +02:00
parent b154e29f2d
commit ea1276dc2e
25 changed files with 2318 additions and 285 deletions
@@ -0,0 +1,26 @@
package com.aiteacher.document;
import java.util.List;
/**
 * Internal DTO produced by MarkerPageParser for one PDF page.
 * Decouples the Marker HTTP API from downstream services.
 *
 * <p>The {@code figures} list is snapshotted at construction time, so later changes to the
 * list passed in by the caller are not visible through this record, and the list returned by
 * {@link #figures()} is unmodifiable. A {@code null} list is normalised to an empty one.
 *
 * @param pageNumber   1-based, derived from Marker page block index
 * @param orderedText  full page text in correct reading order (blocks joined by \n\n)
 * @param headingTitle first SectionHeader block on page, or {@code null}
 * @param figures      extracted figure images (may be empty); must not contain {@code null}
 *                     elements
 * @param markdown     markdown representation with marker://{blockId} image placeholders
 */
public record PageResult(
        int pageNumber,
        String orderedText,
        String headingTitle,
        List<FigureData> figures,
        String markdown
) {
    /**
     * Takes an unmodifiable snapshot of {@code figures} so the record cannot be altered
     * through the list instance the caller still holds.
     *
     * @throws NullPointerException if {@code figures} contains a {@code null} element
     */
    public PageResult {
        figures = (figures == null) ? List.of() : List.copyOf(figures);
    }

    /**
     * A figure extracted from the page.
     * Image bytes are PNG data decoded from the Marker JSON {@code images} map.
     *
     * <p>The byte array is stored and returned by reference (no copy is made), and the
     * generated {@code equals}/{@code hashCode} compare it by identity, not by content.
     *
     * @param imageBytes     PNG image data (base64-decoded from Marker response)
     * @param nearestCaption text of the adjacent Caption block, or {@code null}
     * @param blockId        Marker block ID (e.g. "/page/0/Figure/2") for traceability
     */
    public record FigureData(
            byte[] imageBytes,
            String nearestCaption,
            String blockId
    ) {}
}