Add Marker to parse PDFs effectively
This commit is contained in:
@@ -0,0 +1,26 @@
|
||||
package com.aiteacher.document;
|
||||
|
||||
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
|
||||
|
||||
/**
|
||||
* Internal DTO produced by MarkerPageParser for one PDF page.
|
||||
* Decouples the Marker HTTP API from downstream services.
|
||||
*/
|
||||
public record PageResult(
|
||||
int pageNumber, // 1-based, derived from Marker page block index
|
||||
String orderedText, // full page text in correct reading order (blocks joined by \n\n)
|
||||
String headingTitle, // first SectionHeader block on page, or null
|
||||
List<FigureData> figures, // extracted figure images (may be empty)
|
||||
String markdown // markdown representation with marker://{blockId} image placeholders
|
||||
) {
|
||||
|
||||
/**
|
||||
* A figure extracted from the page.
|
||||
* Image bytes are PNG data decoded from the Marker JSON {@code images} map.
|
||||
*/
|
||||
public record FigureData(
|
||||
byte[] imageBytes, // PNG image data (base64-decoded from Marker response)
|
||||
String nearestCaption, // text of the adjacent Caption block, or null
|
||||
String blockId // Marker block ID (e.g. "/page/0/Figure/2") for traceability
|
||||
) {}
|
||||
}
|
||||
Reference in New Issue
Block a user