enhance rag retrieval + summary

2026-04-07 22:39:28 +02:00
parent 0cf318f0a7
commit aee6a9dfba
34 changed files with 2306 additions and 279 deletions
@@ -17,6 +17,7 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.*;

+
 /**
 * Parses a PDF with a single call to the Marker server using {@code output_format=json}.
 *
@@ -46,19 +47,65 @@ public class MarkerPageParser {
    );
    private static final Set<String> FIGURE_BLOCK_TYPES = Set.of("Figure", "Picture", "FigureGroup", "PictureGroup");

+    private static final int CHUNK_SIZE = 100;
+
    private static final ObjectMapper MAPPER = new ObjectMapper();

    private final RestClient restClient;
+    private final PdfSplitterService pdfSplitterService;

-    public MarkerPageParser(@Qualifier("markerRestClient") RestClient restClient) {
+    public MarkerPageParser(@Qualifier("markerRestClient") RestClient restClient,
+                            PdfSplitterService pdfSplitterService) {
        this.restClient = restClient;
+        this.pdfSplitterService = pdfSplitterService;
    }

-    public ParsedBook parse(Path pdfPath) {
-        log.info("Submitting {} to Marker (json)", pdfPath.getFileName());
+    /**
+     * Parses a PDF by splitting it into {@value #CHUNK_SIZE}-page chunks, submitting each
+     * chunk to Marker individually, and merging the results into a single {@link ParsedBook}.
+     * Page numbers in the merged result are absolute (1-based across the whole document).
+     */
+    public ParsedBook parse(Path pdfPath) throws IOException {
+        List<PdfSplitterService.PdfChunk> chunks = pdfSplitterService.split(pdfPath, CHUNK_SIZE);
+        log.info("Processing {} chunk(s) for {}", chunks.size(), pdfPath.getFileName());

+        List<PageResult> allPages = new ArrayList<>();
+        Map<Integer, String> allHtml = new LinkedHashMap<>();
+
+        try {
+            for (int c = 0; c < chunks.size(); c++) {
+                PdfSplitterService.PdfChunk chunk = chunks.get(c);
+                log.info("Submitting chunk {}/{} to Marker (page offset {})", c + 1, chunks.size(), chunk.pageOffset());
+
+                ParsedBook chunkResult = submitChunk(chunk.tempFile());
+
+                // Rebase page numbers from chunk-relative to document-absolute
+                for (PageResult page : chunkResult.pages()) {
+                    int absolutePage = chunk.pageOffset() + page.pageNumber();
+                    allPages.add(new PageResult(absolutePage, page.orderedText(), page.headingTitle(), page.figures()));
+                }
+                chunkResult.htmlByPage().forEach((chunkPage, html) ->
+                        allHtml.put(chunk.pageOffset() + chunkPage, html));
+            }
+        } finally {
+            // Delete temporary chunk files (skip if the chunk is the original PDF)
+            for (PdfSplitterService.PdfChunk chunk : chunks) {
+                if (!chunk.tempFile().equals(pdfPath)) {
+                    try { Files.deleteIfExists(chunk.tempFile()); }
+                    catch (IOException e) { log.warn("Could not delete temp chunk {}", chunk.tempFile()); }
+                }
+            }
+        }
+
+        log.info("Marker produced {} non-empty pages from {} chunk(s) of {}",
+                allPages.size(), chunks.size(), pdfPath.getFileName());
+        return new ParsedBook(allPages, allHtml);
+    }
+
+    /** Submits a single PDF file to Marker and returns the parsed result with chunk-relative page numbers. */
+    private ParsedBook submitChunk(Path chunkPath) {
        MultiValueMap<String, Object> body = new LinkedMultiValueMap<>();
-        body.add("file", new FileSystemResource(pdfPath));
+        body.add("file", new FileSystemResource(chunkPath));
        body.add("output_format", "json");

        JsonNode response = restClient.post()
@@ -76,28 +123,29 @@ public class MarkerPageParser {

        List<JsonNode> pageNodes = extractPages(response);
        if (pageNodes.isEmpty()) {
-            log.warn("Marker returned no pages for {}", pdfPath.getFileName());
+            log.warn("Marker returned no pages for chunk {}", chunkPath.getFileName());
            return new ParsedBook(List.of(), Map.of());
        }
-        log.info("Marker returned {} pages for {}", pageNodes.size(), pdfPath.getFileName());

        List<PageResult> pages = new ArrayList<>();
        Map<Integer, String> htmlByPage = new LinkedHashMap<>();

        for (int i = 0; i < pageNodes.size(); i++) {
            JsonNode pageNode = pageNodes.get(i);
-            int pageNumber = i + 1; // 1-based
+            int pageNumber = i + 1; // 1-based, chunk-relative

            PageResult result = buildPageResult(pageNode, pageNumber);
            String html = jsonToHtml(pageNode);

+            // Always save HTML so the reader can navigate to every page
+            htmlByPage.put(pageNumber, html);
+
+            // Only queue for embedding if the page has extractable content
            if (!result.orderedText().isBlank() || !result.figures().isEmpty()) {
                pages.add(result);
-                htmlByPage.put(pageNumber, html);
            }
        }

-        log.info("Marker produced {} non-empty pages from {}", pages.size(), pdfPath.getFileName());
        return new ParsedBook(pages, htmlByPage);
    }

@@ -0,0 +1,72 @@
+package com.aiteacher.document;
+
+import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
+import org.apache.pdfbox.multipdf.Splitter;
+import org.apache.pdfbox.pdfparser.PDFParser;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.stereotype.Service;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Splits a PDF file into fixed-size chunks using PDFBox.
+ * Each chunk is saved as a temporary file so it can be submitted independently to Marker.
+ */
+@Service
+public class PdfSplitterService {
+
+    private static final Logger log = LoggerFactory.getLogger(PdfSplitterService.class);
+
+    /**
+     * A chunk of a split PDF.
+     *
+     * @param tempFile   path to the temporary PDF file (caller must delete when done)
+     * @param pageOffset 0-based index of the first page in this chunk within the original document
+     */
+    public record PdfChunk(Path tempFile, int pageOffset) {}
+
+    /**
+     * Splits {@code pdfPath} into chunks of at most {@code maxPagesPerChunk} pages.
+     * Returns a single-element list when the document fits in one chunk; in that case the
+     * chunk's {@code tempFile} is {@code pdfPath} itself and no temporary file is created.
+     *
+     * <p>If writing any chunk fails, every temporary file created so far is deleted and all
+     * remaining chunk documents are closed before the failure is propagated, so a failed
+     * split leaves nothing behind for the caller to clean up.
+     *
+     * @param pdfPath          source PDF
+     * @param maxPagesPerChunk maximum pages per chunk; must be at least 1
+     * @return ordered list of chunks; caller is responsible for deleting {@code tempFile}s
+     * @throws IllegalArgumentException if {@code maxPagesPerChunk} is smaller than 1
+     * @throws IOException              if the PDF cannot be read or a chunk cannot be written
+     */
+    public List<PdfChunk> split(Path pdfPath, int maxPagesPerChunk) throws IOException {
+        if (maxPagesPerChunk < 1) {
+            throw new IllegalArgumentException("maxPagesPerChunk must be >= 1 but was " + maxPagesPerChunk);
+        }
+
+        try (PDDocument doc = new PDFParser(new RandomAccessReadBufferedFile(pdfPath.toFile())).parse()) {
+            int totalPages = doc.getNumberOfPages();
+            log.info("PDF {} has {} pages, splitting into chunks of {}", pdfPath.getFileName(), totalPages, maxPagesPerChunk);
+
+            if (totalPages <= maxPagesPerChunk) {
+                // No split needed — return the original file as a single virtual chunk
+                return List.of(new PdfChunk(pdfPath, 0));
+            }
+
+            Splitter splitter = new Splitter();
+            splitter.setSplitAtPage(maxPagesPerChunk);
+            List<PDDocument> parts = splitter.split(doc);
+
+            List<PdfChunk> chunks = new ArrayList<>(parts.size());
+            int offset = 0;
+            int index = 0;
+            try {
+                for (; index < parts.size(); index++) {
+                    // try-with-resources closes the part as soon as it has been written
+                    try (PDDocument part = parts.get(index)) {
+                        Path tmp = Files.createTempFile("marker-chunk-", ".pdf");
+                        // Track the file before saving so a failed save is still cleaned up
+                        chunks.add(new PdfChunk(tmp, offset));
+                        part.save(tmp.toFile());
+                        log.debug("Created chunk at {} (page offset {})", tmp, offset);
+                        offset += part.getNumberOfPages();
+                    }
+                }
+                return chunks;
+            } catch (IOException | RuntimeException e) {
+                // The part at 'index' was already closed above; release the ones never reached
+                for (int i = index + 1; i < parts.size(); i++) {
+                    closeQuietly(parts.get(i));
+                }
+                // The caller never sees the list, so the temp files must be removed here
+                for (PdfChunk chunk : chunks) {
+                    deleteQuietly(chunk.tempFile());
+                }
+                throw e;
+            }
+        }
+    }
+
+    /** Closes a chunk document, logging instead of throwing so cleanup can continue. */
+    private static void closeQuietly(PDDocument part) {
+        try {
+            part.close();
+        } catch (IOException e) {
+            log.warn("Could not close PDF chunk document", e);
+        }
+    }
+
+    /** Deletes a temporary chunk file, logging instead of throwing so cleanup can continue. */
+    private static void deleteQuietly(Path file) {
+        try {
+            Files.deleteIfExists(file);
+        } catch (IOException e) {
+            log.warn("Could not delete temp chunk {}", file, e);
+        }
+    }
+}
@@ -1,6 +1,8 @@
 package com.aiteacher.document;

 import org.springframework.data.jpa.repository.JpaRepository;
+import org.springframework.data.jpa.repository.Query;
+import org.springframework.data.repository.query.Param;

 import java.util.List;
 import java.util.UUID;
@@ -8,4 +10,10 @@ import java.util.UUID;
 public interface SectionRepository extends JpaRepository<SectionEntity, String> {
     List<SectionEntity> findAllByBookId(UUID bookId);
     void deleteAllByBookId(UUID bookId);
+
+    /**
+     * Finds the sections of a book whose page range overlaps the given page window.
+     * A section matches when it starts at or before {@code windowEnd} and ends at or after
+     * {@code windowStart}, i.e. both window bounds are inclusive. Results are ordered by
+     * the section's starting page.
+     *
+     * @param bookId      the book whose sections are searched
+     * @param windowStart first page of the window (inclusive)
+     * @param windowEnd   last page of the window (inclusive)
+     * @return the overlapping sections, ordered by starting page
+     */
+    @Query("SELECT s FROM SectionEntity s WHERE s.bookId = :bookId AND s.pageStart <= :windowEnd AND s.pageEnd >= :windowStart ORDER BY s.pageStart")
+    List<SectionEntity> findByBookIdAndPageOverlap(
+            @Param("bookId") UUID bookId,
+            @Param("windowStart") int windowStart,
+            @Param("windowEnd") int windowEnd);
 }
@@ -3,6 +3,7 @@ package com.aiteacher.document;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.springframework.ai.chat.client.ChatClient;
+import org.springframework.beans.factory.annotation.Value;
 import org.springframework.core.io.ByteArrayResource;
 import org.springframework.stereotype.Service;
 import org.springframework.util.MimeTypeUtils;
@@ -32,10 +33,16 @@ public class VisionDescriptionService {
            IMAGE_TEXT: <all visible text, labels, measurements, and annotations copied verbatim, comma-separated; write NONE if no text visible>
            """;

+    /** Minimum ms between vision API calls. Configurable via app.vision.min-interval-ms. */
+    private final long minIntervalMs;
    private final ChatClient chatClient;
+    private volatile long lastCallAt = 0;

-    public VisionDescriptionService(ChatClient chatClient) {
+    public VisionDescriptionService(
+            ChatClient chatClient,
+            @Value("${app.vision.min-interval-ms:2000}") long minIntervalMs) {
        this.chatClient = chatClient;
+        this.minIntervalMs = minIntervalMs;
    }

    /**
@@ -55,6 +62,7 @@ public class VisionDescriptionService {
     * @param captionFallback caption detected from surrounding text, may be null
     */
    public ImageAnalysis analyze(byte[] imageBytes, String captionFallback) {
+        throttle();
        try {
            String raw = chatClient.prompt()
                    .user(u -> u
@@ -71,6 +79,15 @@ public class VisionDescriptionService {
        }
    }

+    /**
+     * Blocks until at least {@code minIntervalMs} milliseconds have elapsed since the previous
+     * call passed through, then records the current time as the new reference point.
+     *
+     * <p>The method is {@code synchronized}, so concurrent callers are released one at a time.
+     * If the waiting thread is interrupted, its interrupt flag is restored and the method
+     * returns without waiting out the rest of the interval.
+     */
+    private synchronized void throttle() {
+        long elapsed = System.currentTimeMillis() - lastCallAt;
+        // The wall clock can step backwards (e.g. a time sync), which makes 'elapsed' negative.
+        // Cap the wait at one interval so such a step cannot stall callers for longer than that.
+        long wait = Math.min(minIntervalMs - elapsed, minIntervalMs);
+        if (wait > 0) {
+            try {
+                Thread.sleep(wait);
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+            }
+        }
+        lastCallAt = System.currentTimeMillis();
+    }
+
    private ImageAnalysis parse(String raw, String captionFallback) {
        String description = captionFallback != null ? captionFallback : "Figure";
        String imageText = "";