adding Marker to parse effectively pdf

2026-04-04 21:30:18 +02:00
parent b154e29f2d
commit ea1276dc2e
25 changed files with 2318 additions and 285 deletions
@@ -2,7 +2,9 @@ package com.aiteacher.book;

 import com.aiteacher.document.FigureEntity;
 import com.aiteacher.document.FigureRepository;
+import com.aiteacher.document.MarkdownStorageService;
 import org.springframework.http.HttpStatus;
+import org.springframework.http.MediaType;
 import org.springframework.http.ResponseEntity;
 import org.springframework.web.bind.annotation.*;
 import org.springframework.web.multipart.MultipartFile;
@@ -18,10 +20,13 @@ public class BookController {

    private final BookService bookService;
    private final FigureRepository figureRepository;
+    private final MarkdownStorageService markdownStorageService;

-    public BookController(BookService bookService, FigureRepository figureRepository) {
+    public BookController(BookService bookService, FigureRepository figureRepository,
+                          MarkdownStorageService markdownStorageService) {
        this.bookService = bookService;
        this.figureRepository = figureRepository;
+        this.markdownStorageService = markdownStorageService;
    }

    @PostMapping(consumes = "multipart/form-data")
@@ -59,6 +64,17 @@ public class BookController {
        ));
    }

+    @GetMapping(value = "/{id}/pages/{pageNumber}/markdown", produces = MediaType.TEXT_PLAIN_VALUE)
+    public ResponseEntity<String> getPageMarkdown(@PathVariable UUID id,
+                                                   @PathVariable int pageNumber) {
+        bookService.getById(id); // 404 if not found
+        try {
+            return ResponseEntity.ok(markdownStorageService.getText(id, pageNumber));
+        } catch (Exception e) {
+            return ResponseEntity.notFound().build();
+        }
+    }
+
    @GetMapping("/{id}/figures")
    public ResponseEntity<List<FigureResponse>> figures(@PathVariable UUID id) {
        bookService.getById(id); // 404 if not found
@@ -2,6 +2,9 @@ package com.aiteacher.book;

 import com.aiteacher.document.*;
 import com.aiteacher.figure.FigureStorageService;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.springframework.ai.document.Document;
@@ -23,13 +26,7 @@ public class BookEmbeddingService {

    private final VectorStore vectorStore;
    private final BookRepository bookRepository;
-
-    @Value("${app.embedding.batch-size:50}")
-    private int embeddingBatchSize;
-
-    @Value("${app.embedding.batch-delay-ms:1000}")
-    private long embeddingBatchDelayMs;
-    private final PdfStructureParser pdfStructureParser;
+    private final MarkerPageParser markerPageParser;
    private final FigureExtractionService figureExtractionService;
    private final VisionDescriptionService visionDescriptionService;
    private final TextChunkingService textChunkingService;
@@ -39,11 +36,21 @@ public class BookEmbeddingService {
    private final FigureRepository figureRepository;
    private final ChunkFigureRefRepository chunkFigureRefRepository;
    private final FigureStorageService figureStorageService;
+    private final MarkdownStorageService markdownStorageService;
+
+    private static final Pattern MARKER_PLACEHOLDER =
+            Pattern.compile("!\\[([^\\]]*)\\]\\(marker://([^)]+)\\)");
+
+    @Value("${app.embedding.batch-size:50}")
+    private int embeddingBatchSize;
+
+    @Value("${app.embedding.batch-delay-ms:1000}")
+    private long embeddingBatchDelayMs;

    public BookEmbeddingService(
            VectorStore vectorStore,
            BookRepository bookRepository,
-            PdfStructureParser pdfStructureParser,
+            MarkerPageParser markerPageParser,
            FigureExtractionService figureExtractionService,
            VisionDescriptionService visionDescriptionService,
            TextChunkingService textChunkingService,
@@ -52,10 +59,11 @@ public class BookEmbeddingService {
            ChapterRepository chapterRepository,
            FigureRepository figureRepository,
            ChunkFigureRefRepository chunkFigureRefRepository,
-            FigureStorageService figureStorageService) {
+            FigureStorageService figureStorageService,
+            MarkdownStorageService markdownStorageService) {
        this.vectorStore = vectorStore;
        this.bookRepository = bookRepository;
-        this.pdfStructureParser = pdfStructureParser;
+        this.markerPageParser = markerPageParser;
        this.figureExtractionService = figureExtractionService;
        this.visionDescriptionService = visionDescriptionService;
        this.textChunkingService = textChunkingService;
@@ -65,11 +73,12 @@ public class BookEmbeddingService {
        this.figureRepository = figureRepository;
        this.chunkFigureRefRepository = chunkFigureRefRepository;
        this.figureStorageService = figureStorageService;
+        this.markdownStorageService = markdownStorageService;
    }

    @Async
    public void embedBook(UUID bookId, String bookTitle, Path pdfPath) {
-        log.info("Starting image-aware embedding for book {} ({})", bookId, bookTitle);
+        log.info("Starting Marker-powered embedding for book {} ({})", bookId, bookTitle);

        Book book = bookRepository.findById(bookId).orElse(null);
        if (book == null) {
@@ -81,59 +90,73 @@ public class BookEmbeddingService {
            book.setStatus(BookStatus.PROCESSING);
            bookRepository.save(book);

-            // Step 1: Parse PDF into page-level sections persisted in Postgres
-            List<SectionEntity> sections = pdfStructureParser.parse(bookId, bookTitle, pdfPath);
            String chapterId = bookId + "-ch1";
+            ChapterEntity chapter = new ChapterEntity(chapterId, bookId, 1, bookTitle, 1);
+            chapterRepository.save(chapter);

-            // Step 2: Build and embed text chunks for all sections in batches
+            // Step 1: Parse every page with Marker — correct reading order + pre-cropped figures
+            List<PageResult> pageResults = markerPageParser.parse(pdfPath);
+
+            // Step 2: Build SectionEntity per page and persist
+            List<SectionEntity> sections = buildAndSaveSections(bookId, bookTitle, chapterId, pageResults);
+
+            // Step 3: Chunk and embed text
            List<Document> allChunks = new ArrayList<>();
            for (SectionEntity section : sections) {
-                List<Document> chunks = textChunkingService.chunk(section, bookTitle);
-                allChunks.addAll(chunks);
+                allChunks.addAll(textChunkingService.chunk(section, bookTitle));
            }
            embedInBatches(allChunks, bookId);
            log.info("Embedded {} text chunks for book {}", allChunks.size(), bookId);

-            // Step 3: Extract images from the PDF, save to file store, persist FigureEntity
-            List<FigureEntity> figures = figureExtractionService.extract(
-                bookId, chapterId, sections, pdfPath);
+            // Step 4: Decode pre-cropped figures from Marker output
+            FigureExtractionService.ExtractionResult extraction =
+                    figureExtractionService.extract(bookId, chapterId, pageResults);
+            List<FigureEntity> figures = extraction.figures();

-            // Step 4: For each figure, generate vision description and embed caption
+            // Step 4b: Upload per-page markdown with resolved figure URLs to S3
+            for (PageResult page : pageResults) {
+                if (!page.markdown().isBlank()) {
+                    String resolved = resolvePlaceholders(page.markdown(), bookId,
+                            extraction.blockIdToFigureId());
+                    markdownStorageService.save(bookId, page.pageNumber(), resolved);
+                }
+            }
+
+            // Step 5: Vision analysis (description + visible text) → embed figure chunks
            for (FigureEntity figure : figures) {
                byte[] imageBytes = figureStorageService.getBytes(figure.getImagePath());
-                String description = visionDescriptionService.describe(
-                    imageBytes, figure.getCaption());
+                VisionDescriptionService.ImageAnalysis analysis =
+                        visionDescriptionService.analyze(imageBytes, figure.getCaption());

-                // Use description as caption fallback if no caption was detected
                if (figure.getCaption() == null || figure.getCaption().isBlank()) {
-                    figure.setCaption(description);
+                    figure.setCaption(analysis.description());
                    figureRepository.save(figure);
                }

-                // Content for embedding = vision description + caption for maximum signal
-                String embeddingContent = description
-                    + (figure.getCaption() != null ? "\n" + figure.getCaption() : "");
+                // Embedding content: description + caption + visible image text
+                String embeddingContent = analysis.description()
+                        + (figure.getCaption() != null ? "\n" + figure.getCaption() : "")
+                        + (analysis.imageText().isEmpty() ? "" : "\n" + analysis.imageText());

                String embeddingId = UUID.randomUUID().toString();
-                Map<String, Object> metadata = buildFigureMetadata(figure, bookTitle, embeddingId);
-                Document figureDoc = new Document(embeddingId, embeddingContent, metadata);
+                Document figureDoc = new Document(embeddingId, embeddingContent,
+                        buildFigureMetadata(figure, bookTitle, embeddingId, analysis.imageText()));
                vectorStore.add(List.of(figureDoc));

                figure.setCaptionEmbeddingId(UUID.fromString(embeddingId));
                figureRepository.save(figure);
            }
-            log.info("Embedded {} figure captions for book {}", figures.size(), bookId);
+            log.info("Embedded {} figure chunks for book {}", figures.size(), bookId);

-            // Step 5: Link text chunks to figures via text references
+            // Step 6: Link text chunks to figures via in-text references
            for (SectionEntity section : sections) {
                List<Document> sectionChunks = allChunks.stream()
-                    .filter(d -> section.getId().equals(d.getMetadata().get("section_id")))
-                    .toList();
+                        .filter(d -> section.getId().equals(d.getMetadata().get("section_id")))
+                        .toList();
                List<FigureEntity> sectionFigures = figures.stream()
-                    .filter(f -> section.getId().equals(f.getSectionId()))
-                    .toList();
-                chunkFigureRefService.linkChunksToFigures(
-                    sectionChunks, sectionFigures, section.getPageStart());
+                        .filter(f -> section.getId().equals(f.getSectionId()))
+                        .toList();
+                chunkFigureRefService.linkChunksToFigures(sectionChunks, sectionFigures, section.getPageStart());
            }

            book.setStatus(BookStatus.READY);
@@ -142,7 +165,7 @@ public class BookEmbeddingService {
            bookRepository.save(book);

            log.info("Finished embedding book {} — {} pages, {} figures",
-                bookId, sections.size(), figures.size());
+                    bookId, sections.size(), figures.size());

        } catch (Exception ex) {
            log.error("Failed to embed book {}", bookId, ex);
@@ -156,53 +179,63 @@ public class BookEmbeddingService {
    public void deleteBookChunks(UUID bookId) {
        log.info("Deleting all data for book {}", bookId);
        try {
-            // Delete chunk-figure refs (by figureId for this book)
            List<String> figureIds = figureRepository.findAllByBookId(bookId)
-                .stream().map(FigureEntity::getId).toList();
+                    .stream().map(FigureEntity::getId).toList();
            if (!figureIds.isEmpty()) {
                chunkFigureRefRepository.deleteByFigureIdIn(figureIds);
            }
-
-            // Delete figures from Postgres
            figureRepository.deleteAllByBookId(bookId);
-
-            // Delete figure files from disk
            figureStorageService.deleteAll(bookId);
-
-            // Delete sections and chapters from Postgres
+            markdownStorageService.deleteAll(bookId);
            sectionRepository.deleteAllByBookId(bookId);
            chapterRepository.deleteAllByBookId(bookId);

-            // Delete vector store entries (text chunks + figure embeddings)
            FilterExpressionBuilder b = new FilterExpressionBuilder();
            vectorStore.delete(b.eq("book_id", bookId.toString()).build());
-
        } catch (Exception ex) {
            log.warn("Error during cleanup for book {}: {}", bookId, ex.getMessage());
        }
    }

+    // --- Private helpers ---
+
+    private List<SectionEntity> buildAndSaveSections(UUID bookId, String bookTitle,
+                                                      String chapterId,
+                                                      List<PageResult> pageResults) {
+        List<SectionEntity> sections = new ArrayList<>();
+        for (PageResult page : pageResults) {
+            if (page.orderedText().isBlank()) continue;
+
+            String sectionId = bookId + "-p" + page.pageNumber();
+            String title = page.headingTitle() != null ? page.headingTitle() : "Page " + page.pageNumber();
+
+            SectionEntity section = new SectionEntity(
+                    sectionId, chapterId, bookId,
+                    String.valueOf(page.pageNumber()),
+                    title,
+                    page.pageNumber(), page.pageNumber(),
+                    page.orderedText());
+            sections.add(sectionRepository.save(section));
+        }
+        return sections;
+    }
+
    private void embedInBatches(List<Document> docs, UUID bookId) {
        int total = docs.size();
        for (int i = 0; i < total; i += embeddingBatchSize) {
            List<Document> batch = docs.subList(i, Math.min(i + embeddingBatchSize, total));
            vectorStore.add(batch);
-            int batchNum = i / embeddingBatchSize + 1;
-            int totalBatches = (total - 1) / embeddingBatchSize + 1;
-            log.debug("Embedded batch {}/{} for book {}", batchNum, totalBatches, bookId);
+            log.debug("Embedded batch {}/{} for book {}",
+                    i / embeddingBatchSize + 1, (total - 1) / embeddingBatchSize + 1, bookId);
            if (i + embeddingBatchSize < total) {
-                try {
-                    Thread.sleep(embeddingBatchDelayMs);
-                } catch (InterruptedException e) {
-                    Thread.currentThread().interrupt();
-                    log.warn("Embedding batch sleep interrupted for book {}", bookId);
-                }
+                try { Thread.sleep(embeddingBatchDelayMs); }
+                catch (InterruptedException e) { Thread.currentThread().interrupt(); }
            }
        }
    }

    private Map<String, Object> buildFigureMetadata(FigureEntity figure, String bookTitle,
-                                                     String embeddingId) {
+                                                     String embeddingId, String imageText) {
        Map<String, Object> m = new HashMap<>();
        m.put("type", "FIGURE");
        m.put("book_id", figure.getBookId().toString());
@@ -215,9 +248,31 @@ public class BookEmbeddingService {
        m.put("label", figure.getLabel() != null ? figure.getLabel() : "");
        m.put("page", figure.getPage());
        m.put("embedding_id", embeddingId);
+        m.put("image_text", imageText);  // verbatim text visible inside the image
        return m;
    }

+    /** Replaces {@code marker://{blockId}} placeholders with resolved API URLs. */
+    private String resolvePlaceholders(String markdown, UUID bookId,
+                                       Map<String, String> blockIdToFigureId) {
+        Matcher m = MARKER_PLACEHOLDER.matcher(markdown);
+        StringBuilder sb = new StringBuilder();
+        while (m.find()) {
+            String altText = m.group(1);
+            String blockId = m.group(2);
+            String figureId = blockIdToFigureId.get(blockId);
+            if (figureId != null) {
+                String url = "/api/v1/figures/" + bookId + "/" + figureId + ".png";
+                m.appendReplacement(sb, "![" + altText.replace("\\", "\\\\")
+                        .replace("$", "\\$") + "](" + url + ")");
+            } else {
+                m.appendReplacement(sb, ""); // figure was filtered out (too small, etc.)
+            }
+        }
+        m.appendTail(sb);
+        return sb.toString().strip();
+    }
+
    private String truncate(String msg, int max) {
        if (msg == null) return null;
        return msg.length() <= max ? msg : msg.substring(0, max);