adding Marker to parse PDFs effectively

2026-04-04 21:30:18 +02:00
parent b154e29f2d
commit ea1276dc2e
25 changed files with 2318 additions and 285 deletions
@@ -0,0 +1,26 @@
+package com.aiteacher.document;
+
+import java.util.List;
+
+/**
+ * Internal DTO produced by MarkerPageParser for one PDF page; decouples the Marker HTTP API from
+ * downstream services. No validation or copying is done here: components are stored as passed in.
+ */
+public record PageResult(
+        int pageNumber,           // 1-based page number, derived from the Marker page block index
+        String orderedText,       // full page text in correct reading order (blocks joined by \n\n)
+        String headingTitle,      // first SectionHeader block on the page, or null if there is none
+        List<FigureData> figures, // extracted figure images (may be empty); list is not copied
+        String markdown           // markdown representation with marker://{blockId} image placeholders
+) {
+
+    /**
+     * A figure extracted from the page; image bytes are PNG data decoded from the Marker JSON
+     * {@code images} map. The array is kept by reference, so equals/hashCode use its identity.
+     */
+    public record FigureData(
+            byte[] imageBytes,       // PNG image data (base64-decoded from Marker response)
+            String nearestCaption,   // text of the adjacent Caption block, or null if there is none
+            String blockId           // Marker block ID (e.g. "/page/0/Figure/2") for traceability
+    ) {}
+}