enhance rag retrieval + summary

This commit is contained in:
Adrien
2026-04-07 22:39:28 +02:00
parent 0cf318f0a7
commit aee6a9dfba
34 changed files with 2306 additions and 279 deletions
@@ -0,0 +1,72 @@
package com.aiteacher.document;
import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
import org.apache.pdfbox.multipdf.Splitter;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
/**
 * Splits a PDF file into fixed-size chunks using PDFBox.
 * Each chunk is saved as a temporary file so it can be submitted independently to Marker.
 */
@Service
public class PdfSplitterService {

    private static final Logger log = LoggerFactory.getLogger(PdfSplitterService.class);

    /**
     * A chunk of a split PDF.
     *
     * @param tempFile   path to the PDF file holding this chunk. When the source document was
     *                   actually split this is a freshly created temporary file that the caller
     *                   must delete when done. When no split was needed it is the ORIGINAL
     *                   source path, which the caller most likely does not want to delete.
     * @param pageOffset 0-based index of the first page in this chunk within the original document
     */
    public record PdfChunk(Path tempFile, int pageOffset) {}

    /**
     * Splits {@code pdfPath} into chunks of at most {@code maxPagesPerChunk} pages.
     *
     * <p>When the document fits in one chunk, no file is written and a single-element list is
     * returned whose {@code tempFile} is {@code pdfPath} itself. Callers that clean up chunk
     * files should therefore skip any chunk whose path equals the source path.
     *
     * <p>If writing any chunk fails, every temporary file created by this call is deleted
     * (best effort) and every intermediate PDFBox document is closed before the exception
     * propagates, so a failed call leaves nothing behind for the caller to clean up.
     *
     * @param pdfPath          source PDF
     * @param maxPagesPerChunk maximum pages per chunk; must be at least 1
     * @return ordered list of chunks; caller is responsible for deleting the temporary files
     * @throws IllegalArgumentException if {@code maxPagesPerChunk} is less than 1
     * @throws IOException              if the PDF cannot be read or a chunk cannot be written
     */
    public List<PdfChunk> split(Path pdfPath, int maxPagesPerChunk) throws IOException {
        // Fail fast, before opening and parsing the file, instead of deep inside Splitter.
        if (maxPagesPerChunk < 1) {
            throw new IllegalArgumentException(
                    "maxPagesPerChunk must be at least 1, got " + maxPagesPerChunk);
        }

        try (PDDocument doc = new PDFParser(new RandomAccessReadBufferedFile(pdfPath.toFile())).parse()) {
            int totalPages = doc.getNumberOfPages();
            log.info("PDF {} has {} pages, splitting into chunks of {}", pdfPath.getFileName(), totalPages, maxPagesPerChunk);

            if (totalPages <= maxPagesPerChunk) {
                // No split needed — return the original file as a single virtual chunk
                return List.of(new PdfChunk(pdfPath, 0));
            }

            Splitter splitter = new Splitter();
            splitter.setSplitAtPage(maxPagesPerChunk);
            List<PDDocument> parts = splitter.split(doc);

            List<PdfChunk> chunks = new ArrayList<>(parts.size());
            int handled = 0;          // number of parts whose close() has been attempted
            boolean success = false;
            try {
                int offset = 0;
                for (PDDocument part : parts) {
                    handled++;
                    try {
                        Path tmp = Files.createTempFile("marker-chunk-", ".pdf");
                        // Register before saving so a failed save still gets its file cleaned up.
                        chunks.add(new PdfChunk(tmp, offset));
                        part.save(tmp.toFile());
                        log.debug("Created chunk at {} (page offset {})", tmp, offset);
                        offset += part.getNumberOfPages();
                    } finally {
                        part.close();
                    }
                }
                success = true;
                return chunks;
            } finally {
                if (!success) {
                    // The caller never receives the list on failure, so release everything here.
                    closeQuietly(parts.subList(handled, parts.size()));
                    deleteQuietly(chunks);
                }
            }
        }
    }

    /** Closes the given documents, logging (not propagating) any failure. */
    private static void closeQuietly(List<PDDocument> documents) {
        for (PDDocument document : documents) {
            try {
                document.close();
            } catch (IOException e) {
                log.warn("Failed to close PDF chunk document during cleanup", e);
            }
        }
    }

    /** Deletes the files backing the given chunks, logging (not propagating) any failure. */
    private static void deleteQuietly(List<PdfChunk> chunks) {
        for (PdfChunk chunk : chunks) {
            try {
                Files.deleteIfExists(chunk.tempFile());
            } catch (IOException e) {
                log.warn("Failed to delete temporary chunk file {}", chunk.tempFile(), e);
            }
        }
    }
}