Enhance page parsing using Marker's JSON output and per-page HTML

2026-04-05 21:55:30 +02:00
parent ea1276dc2e
commit 5c641f4bcc
9 changed files with 292 additions and 258 deletions
@@ -64,9 +64,9 @@ public class BookController {
        ));
    }

-    @GetMapping(value = "/{id}/pages/{pageNumber}/markdown", produces = MediaType.TEXT_PLAIN_VALUE)
-    public ResponseEntity<String> getPageMarkdown(@PathVariable UUID id,
-                                                   @PathVariable int pageNumber) {
+    @GetMapping(value = "/{id}/pages/{pageNumber}/html", produces = MediaType.TEXT_HTML_VALUE)
+    public ResponseEntity<String> getPageHtml(@PathVariable UUID id,
+                                               @PathVariable int pageNumber) {
        bookService.getById(id); // 404 if not found
        try {
            return ResponseEntity.ok(markdownStorageService.getText(id, pageNumber));
@@ -3,8 +3,6 @@ package com.aiteacher.book;
 import com.aiteacher.document.*;
 import com.aiteacher.figure.FigureStorageService;

-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.springframework.ai.document.Document;
@@ -38,15 +36,15 @@ public class BookEmbeddingService {
    private final FigureStorageService figureStorageService;
    private final MarkdownStorageService markdownStorageService;

-    private static final Pattern MARKER_PLACEHOLDER =
-            Pattern.compile("!\\[([^\\]]*)\\]\\(marker://([^)]+)\\)");
-
    @Value("${app.embedding.batch-size:50}")
    private int embeddingBatchSize;

    @Value("${app.embedding.batch-delay-ms:1000}")
    private long embeddingBatchDelayMs;

+    @Value("${app.embedding.skip-embedding:false}")
+    private boolean skipEmbedding;
+
    public BookEmbeddingService(
            VectorStore vectorStore,
            BookRepository bookRepository,
@@ -94,8 +92,10 @@ public class BookEmbeddingService {
            ChapterEntity chapter = new ChapterEntity(chapterId, bookId, 1, bookTitle, 1);
            chapterRepository.save(chapter);

-            // Step 1: Parse every page with Marker — correct reading order + pre-cropped figures
-            List<PageResult> pageResults = markerPageParser.parse(pdfPath);
+            // Step 1: Parse with Marker — JSON (structured) + Markdown (per-page) in parallel
+            ParsedBook parsed = markerPageParser.parse(pdfPath);
+
+            List<PageResult> pageResults = parsed.pages();

            // Step 2: Build SectionEntity per page and persist
            List<SectionEntity> sections = buildAndSaveSections(bookId, bookTitle, chapterId, pageResults);
@@ -105,22 +105,24 @@ public class BookEmbeddingService {
            for (SectionEntity section : sections) {
                allChunks.addAll(textChunkingService.chunk(section, bookTitle));
            }
-            embedInBatches(allChunks, bookId);
-            log.info("Embedded {} text chunks for book {}", allChunks.size(), bookId);
+            if (skipEmbedding) {
+                log.info("skip-embedding=true — skipping text embedding for book {}", bookId);
+            } else {
+                embedInBatches(allChunks, bookId);
+                log.info("Embedded {} text chunks for book {}", allChunks.size(), bookId);
+            }

            // Step 4: Decode pre-cropped figures from Marker output
            FigureExtractionService.ExtractionResult extraction =
                    figureExtractionService.extract(bookId, chapterId, pageResults);
            List<FigureEntity> figures = extraction.figures();

-            // Step 4b: Upload per-page markdown with resolved figure URLs to S3
-            for (PageResult page : pageResults) {
-                if (!page.markdown().isBlank()) {
-                    String resolved = resolvePlaceholders(page.markdown(), bookId,
-                            extraction.blockIdToFigureId());
-                    markdownStorageService.save(bookId, page.pageNumber(), resolved);
-                }
-            }
+            // Step 4b: Save per-page HTML to S3, replacing Marker image src with API URLs
+            parsed.htmlByPage().forEach((pageNumber, html) -> {
+                String resolved = resolveImageSrcs(html, bookId, extraction.blockIdToFigureId());
+                markdownStorageService.save(bookId, pageNumber, resolved);
+            });
+            log.info("Saved {} HTML pages to S3 for book {}", parsed.htmlByPage().size(), bookId);

            // Step 5: Vision analysis (description + visible text) → embed figure chunks
            for (FigureEntity figure : figures) {
@@ -139,11 +141,12 @@ public class BookEmbeddingService {
                        + (analysis.imageText().isEmpty() ? "" : "\n" + analysis.imageText());

                String embeddingId = UUID.randomUUID().toString();
-                Document figureDoc = new Document(embeddingId, embeddingContent,
-                        buildFigureMetadata(figure, bookTitle, embeddingId, analysis.imageText()));
-                vectorStore.add(List.of(figureDoc));
-
-                figure.setCaptionEmbeddingId(UUID.fromString(embeddingId));
+                if (!skipEmbedding) {
+                    Document figureDoc = new Document(embeddingId, embeddingContent,
+                            buildFigureMetadata(figure, bookTitle, embeddingId, analysis.imageText()));
+                    vectorStore.add(List.of(figureDoc));
+                    figure.setCaptionEmbeddingId(UUID.fromString(embeddingId));
+                }
                figureRepository.save(figure);
            }
            log.info("Embedded {} figure chunks for book {}", figures.size(), bookId);
@@ -252,25 +255,20 @@ public class BookEmbeddingService {
        return m;
    }

-    /** Replaces {@code marker://{blockId}} placeholders with resolved API URLs. */
-    private String resolvePlaceholders(String markdown, UUID bookId,
-                                       Map<String, String> blockIdToFigureId) {
-        Matcher m = MARKER_PLACEHOLDER.matcher(markdown);
-        StringBuilder sb = new StringBuilder();
-        while (m.find()) {
-            String altText = m.group(1);
-            String blockId = m.group(2);
-            String figureId = blockIdToFigureId.get(blockId);
-            if (figureId != null) {
-                String url = "/api/v1/figures/" + bookId + "/" + figureId + ".png";
-                m.appendReplacement(sb, "![" + altText.replace("\\", "\\\\")
-                        .replace("$", "\\$") + "](" + url + ")");
-            } else {
-                m.appendReplacement(sb, ""); // figure was filtered out (too small, etc.)
-            }
+    /**
+     * Rewrites {@code src} attributes in {@code html} that hold a Marker block ID (e.g. {@code /page/0/Figure/2})
+     * to {@code /api/v1/figures/{bookId}/{figureId}.png}; block IDs missing from the map are left unchanged.
+     */
+    private String resolveImageSrcs(String html, UUID bookId, Map<String, String> blockIdToFigureId) {
+        for (Map.Entry<String, String> entry : blockIdToFigureId.entrySet()) {
+            String blockId = entry.getKey();
+            String figureId = entry.getValue();
+            String apiUrl = "/api/v1/figures/" + bookId + "/" + figureId + ".png";
+            // Marker emits both single- and double-quoted src attributes; rewrite exact matches of each form
+            html = html.replace("src='" + blockId + "'", "src='" + apiUrl + "'");
+            html = html.replace("src=\"" + blockId + "\"", "src=\"" + apiUrl + "\"");
         }
-        m.appendTail(sb);
-        return sb.toString().strip();
+        return html;
     }

    private String truncate(String msg, int max) {