enhance rag retrieval + summary

This commit is contained in:
Adrien
2026-04-07 22:39:28 +02:00
parent 0cf318f0a7
commit aee6a9dfba
34 changed files with 2306 additions and 279 deletions
@@ -17,6 +17,7 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
/**
* Parses a PDF with a single call to the Marker server using {@code output_format=json}.
*
@@ -46,19 +47,65 @@ public class MarkerPageParser {
);
private static final Set<String> FIGURE_BLOCK_TYPES = Set.of("Figure", "Picture", "FigureGroup", "PictureGroup");
private static final int CHUNK_SIZE = 100;
private static final ObjectMapper MAPPER = new ObjectMapper();
private final RestClient restClient;
private final PdfSplitterService pdfSplitterService;
public MarkerPageParser(@Qualifier("markerRestClient") RestClient restClient) {
/**
 * Creates the parser with its two collaborators.
 *
 * @param restClient         REST client injected under the {@code markerRestClient} qualifier
 * @param pdfSplitterService service used by {@code parse} to split a PDF into page chunks
 */
public MarkerPageParser(@Qualifier("markerRestClient") RestClient restClient,
PdfSplitterService pdfSplitterService) {
this.restClient = restClient;
this.pdfSplitterService = pdfSplitterService;
}
public ParsedBook parse(Path pdfPath) {
log.info("Submitting {} to Marker (json)", pdfPath.getFileName());
/**
 * Parses a PDF by splitting it into {@value #CHUNK_SIZE}-page chunks, submitting each
 * chunk to Marker individually, and merging the results into a single {@link ParsedBook}.
 * Page numbers in the merged result are absolute (1-based across the whole document).
 *
 * <p>Temporary chunk files are deleted before this method returns, whether or not the
 * submission succeeded. The original PDF is never deleted.
 *
 * @param pdfPath the PDF to parse
 * @return the merged pages and per-page HTML, keyed by absolute page number
 * @throws IOException if the PDF cannot be split into chunks
 */
public ParsedBook parse(Path pdfPath) throws IOException {
    List<PdfSplitterService.PdfChunk> chunks = pdfSplitterService.split(pdfPath, CHUNK_SIZE);
    log.info("Processing {} chunk(s) for {}", chunks.size(), pdfPath.getFileName());

    List<PageResult> allPages = new ArrayList<>();
    Map<Integer, String> allHtml = new LinkedHashMap<>();
    try {
        for (int c = 0; c < chunks.size(); c++) {
            PdfSplitterService.PdfChunk chunk = chunks.get(c);
            int pageOffset = chunk.pageOffset();
            log.info("Submitting chunk {}/{} to Marker (page offset {})", c + 1, chunks.size(), pageOffset);
            ParsedBook chunkResult = submitChunk(chunk.tempFile());

            // Rebase page numbers from chunk-relative (1-based) to document-absolute:
            // the offset is 0-based, so offset + relative page is already 1-based.
            for (PageResult page : chunkResult.pages()) {
                allPages.add(new PageResult(
                        pageOffset + page.pageNumber(),
                        page.orderedText(),
                        page.headingTitle(),
                        page.figures()));
            }
            for (Map.Entry<Integer, String> entry : chunkResult.htmlByPage().entrySet()) {
                allHtml.put(pageOffset + entry.getKey(), entry.getValue());
            }
        }
    } finally {
        deleteTempChunks(chunks, pdfPath);
    }

    log.info("Marker produced {} non-empty pages from {} chunk(s) of {}",
            allPages.size(), chunks.size(), pdfPath.getFileName());
    return new ParsedBook(allPages, allHtml);
}

/**
 * Best-effort removal of temporary chunk files. A chunk whose path equals {@code pdfPath}
 * is the original document (no split happened) and is left untouched.
 */
private void deleteTempChunks(List<PdfSplitterService.PdfChunk> chunks, Path pdfPath) {
    for (PdfSplitterService.PdfChunk chunk : chunks) {
        if (chunk.tempFile().equals(pdfPath)) {
            continue;
        }
        try {
            Files.deleteIfExists(chunk.tempFile());
        } catch (IOException e) {
            // Keep going: a leftover temp file must not mask the parse result or the
            // original failure, but log the cause so the leak can be diagnosed.
            log.warn("Could not delete temp chunk {}", chunk.tempFile(), e);
        }
    }
}
/** Submits a single PDF file to Marker and returns the parsed result with chunk-relative page numbers. */
private ParsedBook submitChunk(Path chunkPath) {
MultiValueMap<String, Object> body = new LinkedMultiValueMap<>();
body.add("file", new FileSystemResource(pdfPath));
body.add("file", new FileSystemResource(chunkPath));
body.add("output_format", "json");
JsonNode response = restClient.post()
@@ -76,28 +123,29 @@ public class MarkerPageParser {
List<JsonNode> pageNodes = extractPages(response);
if (pageNodes.isEmpty()) {
log.warn("Marker returned no pages for {}", pdfPath.getFileName());
log.warn("Marker returned no pages for chunk {}", chunkPath.getFileName());
return new ParsedBook(List.of(), Map.of());
}
log.info("Marker returned {} pages for {}", pageNodes.size(), pdfPath.getFileName());
List<PageResult> pages = new ArrayList<>();
Map<Integer, String> htmlByPage = new LinkedHashMap<>();
for (int i = 0; i < pageNodes.size(); i++) {
JsonNode pageNode = pageNodes.get(i);
int pageNumber = i + 1; // 1-based
int pageNumber = i + 1; // 1-based, chunk-relative
PageResult result = buildPageResult(pageNode, pageNumber);
String html = jsonToHtml(pageNode);
// Always save HTML so the reader can navigate to every page
htmlByPage.put(pageNumber, html);
// Only queue for embedding if the page has extractable content
if (!result.orderedText().isBlank() || !result.figures().isEmpty()) {
pages.add(result);
htmlByPage.put(pageNumber, html);
}
}
log.info("Marker produced {} non-empty pages from {}", pages.size(), pdfPath.getFileName());
return new ParsedBook(pages, htmlByPage);
}
@@ -0,0 +1,72 @@
package com.aiteacher.document;
import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
import org.apache.pdfbox.multipdf.Splitter;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
/**
 * Splits a PDF file into fixed-size chunks using PDFBox.
 * Each chunk is saved as a temporary file so it can be submitted independently to Marker.
 */
@Service
public class PdfSplitterService {

    private static final Logger log = LoggerFactory.getLogger(PdfSplitterService.class);

    /**
     * A chunk of a split PDF.
     *
     * @param tempFile   path to the temporary PDF file (caller must delete when done)
     * @param pageOffset 0-based index of the first page in this chunk within the original document
     */
    public record PdfChunk(Path tempFile, int pageOffset) {}

    /**
     * Splits {@code pdfPath} into chunks of at most {@code maxPagesPerChunk} pages.
     * Returns a single-element list when the document fits in one chunk; in that case the
     * chunk's {@code tempFile} is {@code pdfPath} itself and no temporary file is created.
     *
     * <p>If writing any chunk fails, the temporary files created so far are deleted and the
     * remaining in-memory parts are closed before the exception propagates.
     *
     * @param pdfPath          source PDF
     * @param maxPagesPerChunk maximum pages per chunk; must be at least 1
     * @return ordered list of chunks; caller is responsible for deleting {@code tempFile}s
     * @throws IOException              if the PDF cannot be read or a chunk cannot be written
     * @throws IllegalArgumentException if {@code maxPagesPerChunk} is less than 1
     */
    public List<PdfChunk> split(Path pdfPath, int maxPagesPerChunk) throws IOException {
        if (maxPagesPerChunk < 1) {
            throw new IllegalArgumentException("maxPagesPerChunk must be >= 1, got " + maxPagesPerChunk);
        }
        try (PDDocument doc = new PDFParser(new RandomAccessReadBufferedFile(pdfPath.toFile())).parse()) {
            int totalPages = doc.getNumberOfPages();
            log.info("PDF {} has {} pages, splitting into chunks of {}", pdfPath.getFileName(), totalPages, maxPagesPerChunk);

            if (totalPages <= maxPagesPerChunk) {
                // No split needed — return the original file as a single virtual chunk
                return List.of(new PdfChunk(pdfPath, 0));
            }

            Splitter splitter = new Splitter();
            splitter.setSplitAtPage(maxPagesPerChunk);
            // The parts are written out while the source document is still open.
            return saveParts(splitter.split(doc));
        }
    }

    /** Writes each split part to its own temp file, closing every part and undoing partial work on failure. */
    private List<PdfChunk> saveParts(List<PDDocument> parts) throws IOException {
        List<PdfChunk> chunks = new ArrayList<>(parts.size());
        int next = 0; // index of the first part not yet handed to try-with-resources
        boolean completed = false;
        try {
            int offset = 0;
            while (next < parts.size()) {
                try (PDDocument part = parts.get(next++)) {
                    Path tmp = Files.createTempFile("marker-chunk-", ".pdf");
                    // Register before saving so a failed save still gets its temp file cleaned up
                    chunks.add(new PdfChunk(tmp, offset));
                    part.save(tmp.toFile());
                    log.debug("Created chunk at {} (page offset {})", tmp, offset);
                    offset += part.getNumberOfPages();
                }
            }
            completed = true;
            return chunks;
        } finally {
            if (!completed) {
                closeQuietly(parts.subList(next, parts.size()));
                deleteQuietly(chunks);
            }
        }
    }

    /** Closes parts that were never processed; failures are logged so the original exception is not masked. */
    private void closeQuietly(List<PDDocument> unprocessed) {
        for (PDDocument part : unprocessed) {
            try {
                part.close();
            } catch (IOException e) {
                log.warn("Could not close unprocessed PDF part", e);
            }
        }
    }

    /** Deletes temp files of an aborted split; failures are logged so the original exception is not masked. */
    private void deleteQuietly(List<PdfChunk> partial) {
        for (PdfChunk chunk : partial) {
            try {
                Files.deleteIfExists(chunk.tempFile());
            } catch (IOException e) {
                log.warn("Could not delete temp chunk {}", chunk.tempFile(), e);
            }
        }
    }
}
@@ -1,6 +1,8 @@
package com.aiteacher.document;
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.data.jpa.repository.Query;
import org.springframework.data.repository.query.Param;
import java.util.List;
import java.util.UUID;
@@ -8,4 +10,10 @@ import java.util.UUID;
/**
 * JPA repository for {@link SectionEntity} rows, keyed by a {@code String} id.
 */
public interface SectionRepository extends JpaRepository<SectionEntity, String> {
/** Finder by {@code bookId}; presumably a Spring Data derived query (no explicit {@code @Query}). */
List<SectionEntity> findAllByBookId(UUID bookId);
/** Delete by {@code bookId}; presumably a Spring Data derived delete — confirm transaction handling at the call site. */
void deleteAllByBookId(UUID bookId);
/**
 * Returns the sections of a book whose page range {@code [pageStart, pageEnd]} overlaps the
 * inclusive window {@code [windowStart, windowEnd]}, ordered by {@code pageStart}.
 *
 * @param bookId      book whose sections are searched
 * @param windowStart first page of the window (inclusive)
 * @param windowEnd   last page of the window (inclusive)
 */
@Query("SELECT s FROM SectionEntity s WHERE s.bookId = :bookId AND s.pageStart <= :windowEnd AND s.pageEnd >= :windowStart ORDER BY s.pageStart")
List<SectionEntity> findByBookIdAndPageOverlap(
@Param("bookId") UUID bookId,
@Param("windowStart") int windowStart,
@Param("windowEnd") int windowEnd);
}
@@ -3,6 +3,7 @@ package com.aiteacher.document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.ai.chat.client.ChatClient;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.core.io.ByteArrayResource;
import org.springframework.stereotype.Service;
import org.springframework.util.MimeTypeUtils;
@@ -32,10 +33,16 @@ public class VisionDescriptionService {
IMAGE_TEXT: <all visible text, labels, measurements, and annotations copied verbatim, comma-separated; write NONE if no text visible>
""";
/** Minimum ms between vision API calls. Configurable via app.vision.min-interval-ms. */
private final long minIntervalMs;
private final ChatClient chatClient;
private volatile long lastCallAt = 0;
public VisionDescriptionService(ChatClient chatClient) {
/**
 * Creates the service.
 *
 * @param chatClient    chat client used for the vision prompt
 * @param minIntervalMs minimum milliseconds between vision calls; read from the
 *                      {@code app.vision.min-interval-ms} property, defaulting to 2000
 */
public VisionDescriptionService(
ChatClient chatClient,
@Value("${app.vision.min-interval-ms:2000}") long minIntervalMs) {
this.chatClient = chatClient;
this.minIntervalMs = minIntervalMs;
}
/**
@@ -55,6 +62,7 @@ public class VisionDescriptionService {
* @param captionFallback caption detected from surrounding text, may be null
*/
public ImageAnalysis analyze(byte[] imageBytes, String captionFallback) {
throttle();
try {
String raw = chatClient.prompt()
.user(u -> u
@@ -71,6 +79,15 @@ public class VisionDescriptionService {
}
}
/**
 * Blocks until at least {@code minIntervalMs} milliseconds have elapsed since the previous
 * call passed through this method, then records the current time as the new reference.
 *
 * <p>The method is {@code synchronized}, so concurrent callers are let through one at a time.
 * If the thread is interrupted while waiting, the interrupt flag is restored and the method
 * returns without waiting out the remainder.
 */
private synchronized void throttle() {
    long elapsed = System.currentTimeMillis() - lastCallAt;
    // currentTimeMillis() is wall-clock time: if the clock is set backwards, elapsed goes
    // negative and the raw wait would grow by the size of the jump. Never pause for more
    // than one full interval.
    long wait = Math.min(minIntervalMs - elapsed, minIntervalMs);
    if (wait > 0) {
        try {
            Thread.sleep(wait);
        } catch (InterruptedException e) {
            // Preserve the interrupt for the caller instead of swallowing it
            Thread.currentThread().interrupt();
        }
    }
    lastCallAt = System.currentTimeMillis();
}
private ImageAnalysis parse(String raw, String captionFallback) {
String description = captionFallback != null ? captionFallback : "Figure";
String imageText = "";