enhance rag retrieval + summary

2026-04-07 22:39:28 +02:00
parent 0cf318f0a7
commit aee6a9dfba
34 changed files with 2306 additions and 279 deletions
@@ -0,0 +1,72 @@
+package com.aiteacher.document;
+
+import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
+import org.apache.pdfbox.multipdf.Splitter;
+import org.apache.pdfbox.pdfparser.PDFParser;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.stereotype.Service;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Splits a PDF file into fixed-size chunks using PDFBox.
+ * Each chunk is saved as a temporary file so it can be submitted independently to Marker.
+ */
+@Service
+public class PdfSplitterService {
+
+    private static final Logger log = LoggerFactory.getLogger(PdfSplitterService.class);
+
+    /**
+     * A chunk of a split PDF.
+     *
+     * @param tempFile   path to the temporary PDF file (caller must delete when done)
+     * @param pageOffset 0-based index of the first page in this chunk within the original document
+     */
+    public record PdfChunk(Path tempFile, int pageOffset) {}
+
+    /**
+     * Splits {@code pdfPath} into chunks of at most {@code maxPagesPerChunk} pages.
+     * Returns a single-element list when the document fits in one chunk.
+     *
+     * @param pdfPath          source PDF
+     * @param maxPagesPerChunk maximum pages per chunk
+     * @return ordered list of chunks; caller is responsible for deleting {@code tempFile}s
+     */
+    public List<PdfChunk> split(Path pdfPath, int maxPagesPerChunk) throws IOException {
+        try (PDDocument doc = new PDFParser(new RandomAccessReadBufferedFile(pdfPath.toFile())).parse()) {
+            int totalPages = doc.getNumberOfPages();
+            log.info("PDF {} has {} pages, splitting into chunks of {}", pdfPath.getFileName(), totalPages, maxPagesPerChunk);
+
+            if (totalPages <= maxPagesPerChunk) {
+                // No split needed — return the original file as a single virtual chunk
+                return List.of(new PdfChunk(pdfPath, 0));
+            }
+
+            Splitter splitter = new Splitter();
+            splitter.setSplitAtPage(maxPagesPerChunk);
+            List<PDDocument> parts = splitter.split(doc);
+
+            List<PdfChunk> chunks = new ArrayList<>(parts.size());
+            int offset = 0;
+            for (PDDocument part : parts) {
+                try {
+                    Path tmp = Files.createTempFile("marker-chunk-", ".pdf");
+                    part.save(tmp.toFile());
+                    chunks.add(new PdfChunk(tmp, offset));
+                    log.debug("Created chunk at {} (page offset {})", tmp, offset);
+                    offset += part.getNumberOfPages();
+                } finally {
+                    part.close();
+                }
+            }
+            return chunks;
+        }
+    }
+}