enhance page parsing using json output and html

This commit is contained in:
Adrien
2026-04-05 21:55:30 +02:00
parent ea1276dc2e
commit 5c641f4bcc
9 changed files with 292 additions and 258 deletions
@@ -64,8 +64,8 @@ public class BookController {
)); ));
} }
@GetMapping(value = "/{id}/pages/{pageNumber}/markdown", produces = MediaType.TEXT_PLAIN_VALUE) @GetMapping(value = "/{id}/pages/{pageNumber}/html", produces = MediaType.TEXT_HTML_VALUE)
public ResponseEntity<String> getPageMarkdown(@PathVariable UUID id, public ResponseEntity<String> getPageHtml(@PathVariable UUID id,
@PathVariable int pageNumber) { @PathVariable int pageNumber) {
bookService.getById(id); // 404 if not found bookService.getById(id); // 404 if not found
try { try {
@@ -3,8 +3,6 @@ package com.aiteacher.book;
import com.aiteacher.document.*; import com.aiteacher.document.*;
import com.aiteacher.figure.FigureStorageService; import com.aiteacher.figure.FigureStorageService;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import org.springframework.ai.document.Document; import org.springframework.ai.document.Document;
@@ -38,15 +36,15 @@ public class BookEmbeddingService {
private final FigureStorageService figureStorageService; private final FigureStorageService figureStorageService;
private final MarkdownStorageService markdownStorageService; private final MarkdownStorageService markdownStorageService;
private static final Pattern MARKER_PLACEHOLDER =
Pattern.compile("!\\[([^\\]]*)\\]\\(marker://([^)]+)\\)");
@Value("${app.embedding.batch-size:50}") @Value("${app.embedding.batch-size:50}")
private int embeddingBatchSize; private int embeddingBatchSize;
@Value("${app.embedding.batch-delay-ms:1000}") @Value("${app.embedding.batch-delay-ms:1000}")
private long embeddingBatchDelayMs; private long embeddingBatchDelayMs;
@Value("${app.embedding.skip-embedding:false}")
private boolean skipEmbedding;
public BookEmbeddingService( public BookEmbeddingService(
VectorStore vectorStore, VectorStore vectorStore,
BookRepository bookRepository, BookRepository bookRepository,
@@ -94,8 +92,10 @@ public class BookEmbeddingService {
ChapterEntity chapter = new ChapterEntity(chapterId, bookId, 1, bookTitle, 1); ChapterEntity chapter = new ChapterEntity(chapterId, bookId, 1, bookTitle, 1);
chapterRepository.save(chapter); chapterRepository.save(chapter);
// Step 1: Parse every page with Marker — correct reading order + pre-cropped figures // Step 1: Parse with Marker — JSON (structured) + Markdown (per-page) in parallel
List<PageResult> pageResults = markerPageParser.parse(pdfPath); ParsedBook parsed = markerPageParser.parse(pdfPath);
List<PageResult> pageResults = parsed.pages();
// Step 2: Build SectionEntity per page and persist // Step 2: Build SectionEntity per page and persist
List<SectionEntity> sections = buildAndSaveSections(bookId, bookTitle, chapterId, pageResults); List<SectionEntity> sections = buildAndSaveSections(bookId, bookTitle, chapterId, pageResults);
@@ -105,22 +105,24 @@ public class BookEmbeddingService {
for (SectionEntity section : sections) { for (SectionEntity section : sections) {
allChunks.addAll(textChunkingService.chunk(section, bookTitle)); allChunks.addAll(textChunkingService.chunk(section, bookTitle));
} }
if (skipEmbedding) {
log.info("skip-embedding=true — skipping text embedding for book {}", bookId);
} else {
embedInBatches(allChunks, bookId); embedInBatches(allChunks, bookId);
log.info("Embedded {} text chunks for book {}", allChunks.size(), bookId); log.info("Embedded {} text chunks for book {}", allChunks.size(), bookId);
}
// Step 4: Decode pre-cropped figures from Marker output // Step 4: Decode pre-cropped figures from Marker output
FigureExtractionService.ExtractionResult extraction = FigureExtractionService.ExtractionResult extraction =
figureExtractionService.extract(bookId, chapterId, pageResults); figureExtractionService.extract(bookId, chapterId, pageResults);
List<FigureEntity> figures = extraction.figures(); List<FigureEntity> figures = extraction.figures();
// Step 4b: Upload per-page markdown with resolved figure URLs to S3 // Step 4b: Save per-page HTML to S3, replacing Marker image src with API URLs
for (PageResult page : pageResults) { parsed.htmlByPage().forEach((pageNumber, html) -> {
if (!page.markdown().isBlank()) { String resolved = resolveImageSrcs(html, bookId, extraction.blockIdToFigureId());
String resolved = resolvePlaceholders(page.markdown(), bookId, markdownStorageService.save(bookId, pageNumber, resolved);
extraction.blockIdToFigureId()); });
markdownStorageService.save(bookId, page.pageNumber(), resolved); log.info("Saved {} HTML pages to S3 for book {}", parsed.htmlByPage().size(), bookId);
}
}
// Step 5: Vision analysis (description + visible text) → embed figure chunks // Step 5: Vision analysis (description + visible text) → embed figure chunks
for (FigureEntity figure : figures) { for (FigureEntity figure : figures) {
@@ -139,11 +141,12 @@ public class BookEmbeddingService {
+ (analysis.imageText().isEmpty() ? "" : "\n" + analysis.imageText()); + (analysis.imageText().isEmpty() ? "" : "\n" + analysis.imageText());
String embeddingId = UUID.randomUUID().toString(); String embeddingId = UUID.randomUUID().toString();
if (!skipEmbedding) {
Document figureDoc = new Document(embeddingId, embeddingContent, Document figureDoc = new Document(embeddingId, embeddingContent,
buildFigureMetadata(figure, bookTitle, embeddingId, analysis.imageText())); buildFigureMetadata(figure, bookTitle, embeddingId, analysis.imageText()));
vectorStore.add(List.of(figureDoc)); vectorStore.add(List.of(figureDoc));
figure.setCaptionEmbeddingId(UUID.fromString(embeddingId)); figure.setCaptionEmbeddingId(UUID.fromString(embeddingId));
}
figureRepository.save(figure); figureRepository.save(figure);
} }
log.info("Embedded {} figure chunks for book {}", figures.size(), bookId); log.info("Embedded {} figure chunks for book {}", figures.size(), bookId);
@@ -252,25 +255,20 @@ public class BookEmbeddingService {
return m; return m;
} }
/** Replaces {@code marker://{blockId}} placeholders with resolved API URLs. */ /**
private String resolvePlaceholders(String markdown, UUID bookId, * Replaces Marker's {@code src='{blockId}'} image attributes with resolved API URLs.
Map<String, String> blockIdToFigureId) { * Block IDs look like {@code /page/0/Figure/2}.
Matcher m = MARKER_PLACEHOLDER.matcher(markdown); */
StringBuilder sb = new StringBuilder(); private String resolveImageSrcs(String html, UUID bookId, Map<String, String> blockIdToFigureId) {
while (m.find()) { for (Map.Entry<String, String> entry : blockIdToFigureId.entrySet()) {
String altText = m.group(1); String blockId = entry.getKey();
String blockId = m.group(2); String figureId = entry.getValue();
String figureId = blockIdToFigureId.get(blockId); String apiUrl = "/api/v1/figures/" + bookId + "/" + figureId + ".png";
if (figureId != null) { // Marker emits both single and double-quoted src attributes
String url = "/api/v1/figures/" + bookId + "/" + figureId + ".png"; html = html.replace("src='" + blockId + "'", "src='" + apiUrl + "'");
m.appendReplacement(sb, "![" + altText.replace("\\", "\\\\") html = html.replace("src=\"" + blockId + "\"", "src=\"" + apiUrl + "\"");
.replace("$", "\\$") + "](" + url + ")");
} else {
m.appendReplacement(sb, ""); // figure was filtered out (too small, etc.)
} }
} return html;
m.appendTail(sb);
return sb.toString().strip();
} }
private String truncate(String msg, int max) { private String truncate(String msg, int max) {
@@ -18,17 +18,22 @@ import java.nio.file.Path;
import java.util.*; import java.util.*;
/** /**
* Parses a PDF using the local Marker server ({@code POST /marker/upload}). * Parses a PDF with a single call to the Marker server using {@code output_format=json}.
* *
* <p>A single HTTP call returns: * <p>The JSON response contains an {@code output} field that is itself a JSON string with a
* tree structure: the root has a {@code children} array where each item is a {@code Page} block.
* Each block carries an {@code html} field with {@code <content-ref src='blockId'>} placeholders
* that reference its {@code children} by ID.
*
* <p>{@link #jsonToHtml} mirrors the Marker Python {@code json_to_html} utility: it walks the
* tree recursively and resolves every {@code content-ref} with the rendered HTML of the
* referenced child block.
*
* <p>Returns a {@link ParsedBook} with:
* <ul> * <ul>
* <li>Reading-order text blocks — correct for multi-column and scanned pages</li> * <li>{@code pages} — one {@link PageResult} per non-empty page (drives embeddings)</li>
* <li>Section headings extracted from {@code SectionHeader} blocks</li> * <li>{@code htmlByPage} — full resolved HTML per page (saved to S3 for the reader)</li>
* <li>Pre-cropped figure images as base64-encoded PNG in each {@code Figure} block's
* {@code images} map</li>
* </ul> * </ul>
*
* <p>The response is mapped to one {@link PageResult} per page block.
*/ */
@Service @Service
public class MarkerPageParser { public class MarkerPageParser {
@@ -36,24 +41,21 @@ public class MarkerPageParser {
private static final Logger log = LoggerFactory.getLogger(MarkerPageParser.class); private static final Logger log = LoggerFactory.getLogger(MarkerPageParser.class);
private static final Set<String> TEXT_BLOCK_TYPES = Set.of( private static final Set<String> TEXT_BLOCK_TYPES = Set.of(
"Text", "TextInlineMath", "ListItem", "Table", "Code", "Equation", "Text", "TextInlineMath", "ListItem", "Table", "TableOfContents", "Code", "Equation",
"Footnote", "Caption", "PageHeader", "PageFooter", "Handwriting" "Footnote", "Caption", "PageHeader", "PageFooter", "Handwriting"
); );
private static final Set<String> FIGURE_BLOCK_TYPES = Set.of("Figure", "Picture", "FigureGroup", "PictureGroup"); private static final Set<String> FIGURE_BLOCK_TYPES = Set.of("Figure", "Picture", "FigureGroup", "PictureGroup");
private final RestClient restClient; private static final ObjectMapper MAPPER = new ObjectMapper();
private final ObjectMapper objectMapper;
public MarkerPageParser(@Qualifier("markerRestClient") RestClient restClient, ObjectMapper objectMapper) { private final RestClient restClient;
public MarkerPageParser(@Qualifier("markerRestClient") RestClient restClient) {
this.restClient = restClient; this.restClient = restClient;
this.objectMapper = objectMapper;
} }
/** public ParsedBook parse(Path pdfPath) {
* Parses the entire PDF and returns one {@link PageResult} per non-empty page. log.info("Submitting {} to Marker (json)", pdfPath.getFileName());
*/
public List<PageResult> parse(Path pdfPath) {
log.info("Submitting {} to Marker for parsing", pdfPath.getFileName());
MultiValueMap<String, Object> body = new LinkedMultiValueMap<>(); MultiValueMap<String, Object> body = new LinkedMultiValueMap<>();
body.add("file", new FileSystemResource(pdfPath)); body.add("file", new FileSystemResource(pdfPath));
@@ -67,207 +69,219 @@ public class MarkerPageParser {
.body(JsonNode.class); .body(JsonNode.class);
try { try {
Path debugFile = Path.of("/tmp/marker-response-md.json"); Files.writeString(Path.of("/tmp/marker-response-json.json"), response.toPrettyString());
Files.writeString(debugFile, response.toPrettyString());
log.info("Marker response saved to {}", debugFile);
} catch (IOException e) { } catch (IOException e) {
log.warn("Could not save Marker response to file", e); log.warn("Could not save Marker response to /tmp/marker-response-json.json", e);
} }
List<PageResult> results = parseResponse(response); List<JsonNode> pageNodes = extractPages(response);
log.info("Marker produced {} page results from {}", results.size(), pdfPath.getFileName()); if (pageNodes.isEmpty()) {
return results; log.warn("Marker returned no pages for {}", pdfPath.getFileName());
return new ParsedBook(List.of(), Map.of());
}
log.info("Marker returned {} pages for {}", pageNodes.size(), pdfPath.getFileName());
List<PageResult> pages = new ArrayList<>();
Map<Integer, String> htmlByPage = new LinkedHashMap<>();
for (int i = 0; i < pageNodes.size(); i++) {
JsonNode pageNode = pageNodes.get(i);
int pageNumber = i + 1; // 1-based
PageResult result = buildPageResult(pageNode, pageNumber);
String html = jsonToHtml(pageNode);
if (!result.orderedText().isBlank() || !result.figures().isEmpty()) {
pages.add(result);
htmlByPage.put(pageNumber, html);
}
} }
// --- Private helpers --- log.info("Marker produced {} non-empty pages from {}", pages.size(), pdfPath.getFileName());
return new ParsedBook(pages, htmlByPage);
}
private List<PageResult> parseResponse(JsonNode response) { // ── Page extraction ───────────────────────────────────────────────────────
/**
* Parses the {@code output} JSON string and returns the list of page nodes
* (the top-level {@code children} of the document root).
*/
private List<JsonNode> extractPages(JsonNode response) {
if (response == null) return List.of(); if (response == null) return List.of();
// The "output" field is a JSON-encoded string — parse it first.
// Fall back to treating the whole response as the root if "output" is absent.
JsonNode root;
JsonNode outputNode = response.path("output"); JsonNode outputNode = response.path("output");
if (!outputNode.isMissingNode() && outputNode.isTextual()) { if (outputNode.isMissingNode()) {
try { log.warn("Marker response has no 'output' field");
root = objectMapper.readTree(outputNode.asText());
} catch (tools.jackson.core.JacksonException e) {
log.warn("Could not parse Marker 'output' field as JSON", e);
return List.of(); return List.of();
} }
} else if (!outputNode.isMissingNode()) { try {
root = outputNode; JsonNode root = MAPPER.readTree(outputNode.stringValue());
} else {
root = response;
}
JsonNode children = root.path("children"); JsonNode children = root.path("children");
if (children.isMissingNode() || !children.isArray()) { if (children.isMissingNode() || !children.isArray()) {
log.warn("Marker response has no 'children' array — empty result"); log.warn("Marker output root has no 'children' array");
return List.of(); return List.of();
} }
List<JsonNode> result = new ArrayList<>();
List<PageResult> results = new ArrayList<>(); children.forEach(result::add);
int pageIndex = 0; return result;
for (JsonNode pageBlock : children) { } catch (Exception e) {
String blockType = pageBlock.path("block_type").asText(); log.warn("Could not parse Marker 'output' string as JSON: {}", e.getMessage());
if (!"Page".equals(blockType)) continue; return List.of();
int pageNumber = pageIndex + 1;
pageIndex++;
PageResult result = parsePage(pageBlock, pageNumber);
if (!result.orderedText().isBlank() || !result.figures().isEmpty()) {
results.add(result);
} }
} }
return results;
// ── HTML rendering ────────────────────────────────────────────────────────
/**
* Java equivalent of the Marker Python {@code json_to_html} utility.
*
* <p>Algorithm:
* <ol>
* <li>If the block has no children, return its {@code html} as-is (leaf node).</li>
* <li>Otherwise recursively render each child, then replace every
* {@code <content-ref src='childId'>} placeholder in the block's own {@code html}
* with the rendered child HTML.</li>
* </ol>
*/
String jsonToHtml(JsonNode block) {
String html = str(block.path("html"));
// If the block carries image data, inject <img> data-URI tags.
// Marker stores base64 image bytes in block.images keyed by block ID.
// Picture/Figure leaf blocks have empty html, so this is the only way to
// get the image into the rendered output.
JsonNode images = block.path("images");
if (!images.isMissingNode() && !images.isNull() && !images.isEmpty()) {
StringBuilder imgTags = new StringBuilder();
images.properties().forEach(entry -> {
String base64 = str(entry.getValue());
if (!base64.isEmpty()) {
String mime = detectImageMime(base64);
imgTags.append("<img src=\"data:").append(mime)
.append(";base64,").append(base64).append("\">");
}
});
if (!imgTags.isEmpty()) {
html = html + imgTags;
}
} }
private PageResult parsePage(JsonNode pageBlock, int pageNumber) { JsonNode children = block.path("children");
JsonNode children = pageBlock.path("children"); if (children.isMissingNode() || children.isNull() || !children.isArray() || children.isEmpty()) {
if (children.isMissingNode() || !children.isArray()) { return html; // leaf node
return new PageResult(pageNumber, "", null, List.of(), "");
} }
StringBuilder textBuilder = new StringBuilder(); // Build id → rendered-html map for all direct children
StringBuilder markdownBuilder = new StringBuilder(); Map<String, String> childHtml = new LinkedHashMap<>();
String headingTitle = null; for (JsonNode child : children) {
String id = str(child.path("id"));
childHtml.put(id, jsonToHtml(child));
}
// Replace every <content-ref src='id'></content-ref> with the child's HTML
for (Map.Entry<String, String> entry : childHtml.entrySet()) {
String ref = "<content-ref src='" + entry.getKey() + "'></content-ref>";
html = html.replace(ref, entry.getValue());
}
return html;
}
// ── PageResult (text + figures for embeddings) ────────────────────────────
private PageResult buildPageResult(JsonNode pageBlock, int pageNumber) {
StringBuilder text = new StringBuilder();
String[] headingTitle = {null};
List<PageResult.FigureData> figures = new ArrayList<>(); List<PageResult.FigureData> figures = new ArrayList<>();
Set<Integer> consumed = new HashSet<>(); // indices of Caption nodes consumed by a figure
List<JsonNode> childList = new ArrayList<>(); walkBlock(pageBlock, text, headingTitle, figures);
children.forEach(childList::add); return new PageResult(pageNumber, text.toString().strip(), headingTitle[0], figures);
}
for (int i = 0; i < childList.size(); i++) { /** Recursively walks the block tree, collecting text and figures in reading order. */
if (consumed.contains(i)) continue; private void walkBlock(JsonNode block, StringBuilder text, String[] headingTitle,
List<PageResult.FigureData> figures) {
JsonNode child = childList.get(i); String type = str(block.path("block_type"));
String type = child.path("block_type").asText();
if ("SectionHeader".equals(type)) { if ("SectionHeader".equals(type)) {
String heading = stripHtml(child.path("html").asText()).strip(); String heading = stripHtml(str(block.path("html"))).strip();
if (!heading.isEmpty() && headingTitle == null) { if (!heading.isEmpty() && headingTitle[0] == null) headingTitle[0] = heading;
headingTitle = heading; appendText(text, heading);
}
appendText(textBuilder, heading);
appendMarkdown(markdownBuilder, "## " + heading);
} else if (TEXT_BLOCK_TYPES.contains(type)) { } else if (TEXT_BLOCK_TYPES.contains(type)) {
String text = stripHtml(child.path("html").asText()); appendText(text, stripHtml(str(block.path("html"))));
appendText(textBuilder, text);
appendMarkdown(markdownBuilder, text.strip());
} else if (FIGURE_BLOCK_TYPES.contains(type)) { } else if (FIGURE_BLOCK_TYPES.contains(type)) {
extractFigures(child, i, childList, figures, markdownBuilder, consumed); String caption = findCaption(block);
} extractFigures(block, caption, figures);
} }
return new PageResult(pageNumber, textBuilder.toString().strip(), headingTitle, // Recurse into children (content-ref ordering is implicit via tree order)
figures, markdownBuilder.toString().strip()); JsonNode children = block.path("children");
} if (!children.isMissingNode() && !children.isNull() && children.isArray()) {
for (JsonNode child : children) {
/** walkBlock(child, text, headingTitle, figures);
* Handles a figure/picture block at {@code index} in {@code siblings}.
* For group blocks (FigureGroup, PictureGroup) the image lives in a child Picture/Figure,
* and the caption is a sibling Caption child inside the group.
* For leaf blocks the caption is the next sibling in the page child list.
* Image refs are appended to {@code markdown} as {@code ![caption](marker://{blockId})}.
* Consumed caption sibling indices are added to {@code consumed}.
*/
private void extractFigures(JsonNode block, int index, List<JsonNode> siblings,
List<PageResult.FigureData> out, StringBuilder markdown,
Set<Integer> consumed) {
String type = block.path("block_type").asText();
boolean isGroup = type.endsWith("Group");
if (isGroup) {
JsonNode groupChildren = block.path("children");
if (groupChildren.isMissingNode() || !groupChildren.isArray()) return;
String groupCaption = null;
for (JsonNode sub : groupChildren) {
if ("Caption".equals(sub.path("block_type").asText())) {
String c = stripHtml(sub.path("html").asText()).strip();
if (!c.isEmpty()) groupCaption = c;
}
}
for (JsonNode sub : groupChildren) {
String subType = sub.path("block_type").asText();
if ("Figure".equals(subType) || "Picture".equals(subType)) {
String blockId = sub.path("id").asText();
byte[] imageBytes = extractImageBytes(sub, blockId);
if (imageBytes != null) {
out.add(new PageResult.FigureData(imageBytes, groupCaption, blockId));
String altText = groupCaption != null ? groupCaption : blockId;
appendMarkdown(markdown, "![" + altText + "](marker://" + blockId + ")");
}
}
}
} else {
String blockId = block.path("id").asText();
byte[] imageBytes = extractImageBytes(block, blockId);
if (imageBytes != null) {
String caption = null;
if (index + 1 < siblings.size()) {
JsonNode next = siblings.get(index + 1);
if ("Caption".equals(next.path("block_type").asText())) {
String c = stripHtml(next.path("html").asText()).strip();
if (!c.isEmpty()) caption = c;
consumed.add(index + 1);
}
}
out.add(new PageResult.FigureData(imageBytes, caption, blockId));
String altText = caption != null ? caption : blockId;
appendMarkdown(markdown, "![" + altText + "](marker://" + blockId + ")");
} }
} }
} }
/** /** Finds the first Caption child inside a figure block, if any. */
* Extracts and base64-decodes the image bytes for this block. private String findCaption(JsonNode figureBlock) {
* Marker stores images in the block's {@code images} map keyed by block ID. JsonNode children = figureBlock.path("children");
*/ if (children.isMissingNode() || !children.isArray()) return null;
private byte[] extractImageBytes(JsonNode block, String blockId) { for (JsonNode child : children) {
JsonNode images = block.path("images"); if ("Caption".equals(str(child.path("block_type")))) {
if (images.isMissingNode() || images.isEmpty()) return null; String caption = stripHtml(str(child.path("html"))).strip();
return caption.isEmpty() ? null : caption;
// Try the block's own ID first, then fall back to the first entry }
JsonNode imgNode = images.path(blockId);
if (imgNode.isMissingNode()) {
imgNode = images.properties().stream()
.findFirst()
.map(e -> e.getValue())
.orElse(imgNode);
} }
String base64 = imgNode.asText();
if (base64.isEmpty()) return null;
try {
return Base64.getDecoder().decode(base64);
} catch (IllegalArgumentException ex) {
log.warn("Could not decode base64 image for block {}: {}", blockId, ex.getMessage());
return null; return null;
} }
private void extractFigures(JsonNode block, String caption, List<PageResult.FigureData> out) {
JsonNode images = block.path("images");
if (images.isMissingNode() || images.isEmpty()) return;
images.properties().forEach(entry -> {
String blockId = entry.getKey();
String base64 = str(entry.getValue());
if (base64.isEmpty()) return;
try {
byte[] bytes = Base64.getDecoder().decode(base64);
out.add(new PageResult.FigureData(bytes, caption, blockId));
} catch (IllegalArgumentException ex) {
log.warn("Could not decode base64 image for block {}: {}", blockId, ex.getMessage());
}
});
} }
// ── Utilities ─────────────────────────────────────────────────────────────
private void appendText(StringBuilder sb, String text) { private void appendText(StringBuilder sb, String text) {
if (text == null) return;
String stripped = text.strip(); String stripped = text.strip();
if (stripped.isEmpty()) return; if (stripped.isEmpty()) return;
if (sb.length() > 0) sb.append("\n\n"); if (sb.length() > 0) sb.append("\n\n");
sb.append(stripped); sb.append(stripped);
} }
private void appendMarkdown(StringBuilder sb, String text) {
if (text == null || text.isBlank()) return;
if (sb.length() > 0) sb.append("\n\n");
sb.append(text.strip());
}
/** Strips HTML tags and normalises whitespace. */
private String stripHtml(String html) { private String stripHtml(String html) {
if (html == null || html.isEmpty()) return ""; if (html == null || html.isEmpty()) return "";
return html.replaceAll("<[^>]*>", "").replaceAll("\\s{2,}", " ").strip(); return html.replaceAll("<[^>]*>", "").replaceAll("\\s{2,}", " ").strip();
} }
/**
 * Guesses an image MIME type from the leading characters of its base64 encoding.
 *
 * <p>Each known prefix is checked in order (JPEG, PNG, GIF, WebP); data that matches
 * none of them is reported as PNG.
 *
 * @param base64 base64-encoded image data
 * @return the MIME type for the first matching prefix, or {@code image/png} otherwise
 */
private static String detectImageMime(String base64) {
    // {base64 prefix, MIME type} pairs, tested top to bottom.
    String[][] signatures = {
        {"/9j/", "image/jpeg"},
        {"iVBOR", "image/png"},
        {"R0lGO", "image/gif"},
        {"UklGR", "image/webp"},
    };
    for (String[] signature : signatures) {
        if (base64.startsWith(signature[0])) {
            return signature[1];
        }
    }
    return "image/png"; // fallback for unrecognised data
}
/**
 * Null-safe string extraction from a {@link JsonNode}.
 *
 * <p>Returns the node's text only when the node is a JSON string; a {@code null} reference,
 * a missing node, a JSON null, or any other node type yields the empty string.
 *
 * <p>The node type is checked before calling {@code stringValue()} because Jackson 3
 * documents that accessor as throwing for non-string nodes rather than returning
 * {@code null}. NOTE(review): confirm against the Jackson version on the classpath; the
 * guard is harmless either way.
 *
 * @param node node to read; may be {@code null} or a missing node
 * @return the string value, or {@code ""} when the node does not hold a string
 */
private static String str(JsonNode node) {
    if (node == null || !node.isString()) {
        return "";
    }
    String value = node.stringValue();
    return value != null ? value : "";
}
} }
@@ -10,8 +10,7 @@ public record PageResult(
int pageNumber, // 1-based, derived from Marker page block index int pageNumber, // 1-based, derived from Marker page block index
String orderedText, // full page text in correct reading order (blocks joined by \n\n) String orderedText, // full page text in correct reading order (blocks joined by \n\n)
String headingTitle, // first SectionHeader block on page, or null String headingTitle, // first SectionHeader block on page, or null
List<FigureData> figures, // extracted figure images (may be empty) List<FigureData> figures // extracted figure images (may be empty)
String markdown // markdown representation with marker://{blockId} image placeholders
) { ) {
/** /**
@@ -0,0 +1,16 @@
package com.aiteacher.document;
import java.util.List;
import java.util.Map;
/**
 * Result of a Marker parse: structured per-page data plus the rendered HTML of each page.
 *
 * <p>Both components are keyed to the same 1-based page numbering.
 * NOTE(review): the producer visible in this change builds both from a single JSON
 * response and only records pages that have text or figures — confirm no other
 * producer populates this record differently.
 *
 * @param pages      one {@link PageResult} per retained page (text and figures)
 * @param htmlByPage rendered HTML for each retained page, keyed by 1-based page number
 */
public record ParsedBook(
List<PageResult> pages,
Map<Integer, String> htmlByPage
) {}
@@ -53,7 +53,7 @@ public class S3MarkdownStorageService implements MarkdownStorageService {
byte[] bytes = markdown.getBytes(StandardCharsets.UTF_8); byte[] bytes = markdown.getBytes(StandardCharsets.UTF_8);
s3.putObject( s3.putObject(
PutObjectRequest.builder().bucket(bucket).key(key) PutObjectRequest.builder().bucket(bucket).key(key)
.contentType("text/markdown; charset=utf-8") .contentType("text/html; charset=utf-8")
.contentLength((long) bytes.length).build(), .contentLength((long) bytes.length).build(),
RequestBody.fromBytes(bytes)); RequestBody.fromBytes(bytes));
return key; return key;
@@ -69,7 +69,7 @@ public class S3MarkdownStorageService implements MarkdownStorageService {
@Override @Override
public void deleteAll(UUID bookId) { public void deleteAll(UUID bookId) {
String prefix = "markdown/" + bookId + "/"; String prefix = "html/" + bookId + "/";
try { try {
List<ObjectIdentifier> toDelete = new ArrayList<>(); List<ObjectIdentifier> toDelete = new ArrayList<>();
s3.listObjectsV2Paginator(ListObjectsV2Request.builder() s3.listObjectsV2Paginator(ListObjectsV2Request.builder()
@@ -92,6 +92,6 @@ public class S3MarkdownStorageService implements MarkdownStorageService {
} }
private static String key(UUID bookId, int pageNumber) { private static String key(UUID bookId, int pageNumber) {
return "markdown/" + bookId + "/page-" + pageNumber + ".md"; return "html/" + bookId + "/page-" + pageNumber + ".html";
} }
} }
+2 -1
View File
@@ -64,5 +64,6 @@ app:
embedding: embedding:
batch-size: 20 batch-size: 20
batch-delay-ms: 2000 batch-delay-ms: 2000
skip-embedding: true
marker: marker:
base-url: ${MARKER_BASE_URL:http://localhost:8000} base-url: ${MARKER_BASE_URL:http://192.168.1.105:8000}
+5 -2
View File
@@ -64,11 +64,11 @@ body {
Ubuntu, Cantarell, 'Fira Sans', 'Droid Sans', 'Helvetica Neue', sans-serif; Ubuntu, Cantarell, 'Fira Sans', 'Droid Sans', 'Helvetica Neue', sans-serif;
background: #f0f4f8; background: #f0f4f8;
color: #2d3748; color: #2d3748;
min-height: 100vh; height: 100vh;
} }
#app { #app {
min-height: 100vh; height: 100vh;
display: flex; display: flex;
flex-direction: column; flex-direction: column;
} }
@@ -133,6 +133,9 @@ body {
.main-content { .main-content {
flex: 1; flex: 1;
min-height: 0;
display: flex;
flex-direction: column;
padding: 2rem; padding: 2rem;
max-width: 1200px; max-width: 1200px;
margin: 0 auto; margin: 0 auto;
+14 -11
View File
@@ -44,7 +44,6 @@
<script setup lang="ts"> <script setup lang="ts">
import { ref, watch, onMounted } from 'vue' import { ref, watch, onMounted } from 'vue'
import { useRoute } from 'vue-router' import { useRoute } from 'vue-router'
import { marked } from 'marked'
import { api } from '@/services/api' import { api } from '@/services/api'
import { useBookStore } from '@/stores/bookStore' import { useBookStore } from '@/stores/bookStore'
import type { Book } from '@/stores/bookStore' import type { Book } from '@/stores/bookStore'
@@ -104,15 +103,11 @@ async function loadPage(page: number) {
activeBlobUrls = [] activeBlobUrls = []
try { try {
const res = await api.get<string>(`/books/${bookId}/pages/${page}/markdown`, { const res = await api.get<string>(`/books/${bookId}/pages/${page}/html`, {
headers: { Accept: 'text/plain' }, headers: { Accept: 'text/html' },
responseType: 'text' responseType: 'text'
}) })
const markdownText = res.data let html = await resolveImages(res.data)
// Render markdown to HTML, then resolve image src via authenticated fetch
let html = await marked.parse(markdownText) as string
html = await resolveImages(html)
renderedHtml.value = html renderedHtml.value = html
} catch (e: any) { } catch (e: any) {
error.value = e.message ?? 'Failed to load page.' error.value = e.message ?? 'Failed to load page.'
@@ -123,8 +118,8 @@ async function loadPage(page: number) {
/** /**
* Finds <img src="/api/v1/figures/..."> in the HTML, fetches each image * Finds <img src="/api/v1/figures/..."> in the HTML, fetches each image
* with the authenticated axios instance (which carries Basic auth headers), * through the authenticated axios instance, and replaces the src with a
* and replaces the src with a temporary blob URL so the browser can display it. * temporary blob URL so the browser can render it without re-authenticating.
*/ */
async function resolveImages(html: string): Promise<string> { async function resolveImages(html: string): Promise<string> {
const srcPattern = /src="(\/api\/v1\/figures\/[^"]+)"/g const srcPattern = /src="(\/api\/v1\/figures\/[^"]+)"/g
@@ -137,7 +132,7 @@ async function resolveImages(html: string): Promise<string> {
await Promise.all( await Promise.all(
unique.map(async (src) => { unique.map(async (src) => {
try { try {
const res = await api.get(src, { responseType: 'blob' }) const res = await api.get(src.replace(/^\/api\/v1/, ''), { responseType: 'blob' })
const blobUrl = URL.createObjectURL(res.data) const blobUrl = URL.createObjectURL(res.data)
activeBlobUrls.push(blobUrl) activeBlobUrls.push(blobUrl)
blobMap[src] = blobUrl blobMap[src] = blobUrl
@@ -160,6 +155,8 @@ async function resolveImages(html: string): Promise<string> {
gap: 1rem; gap: 1rem;
max-width: 860px; max-width: 860px;
margin: 0 auto; margin: 0 auto;
flex: 1;
min-height: 0;
} }
.reader-header { .reader-header {
@@ -238,6 +235,9 @@ async function resolveImages(html: string): Promise<string> {
.reader-body { .reader-body {
flex: 1; flex: 1;
min-height: 0;
display: flex;
flex-direction: column;
} }
.reader-loading { .reader-loading {
@@ -255,6 +255,9 @@ async function resolveImages(html: string): Promise<string> {
} }
.reader-content { .reader-content {
flex: 1;
min-height: 0;
overflow-y: auto;
padding: 2rem; padding: 2rem;
} }