enhance rag retrieval + summary
This commit is contained in:
@@ -17,6 +17,7 @@ import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
* Parses a PDF with a single call to the Marker server using {@code output_format=json}.
|
||||
*
|
||||
@@ -46,19 +47,65 @@ public class MarkerPageParser {
|
||||
);
|
||||
// NOTE(review): presumably the Marker block types that are treated as figures; the code
// that reads this set is outside the visible excerpt — confirm against its usage.
private static final Set<String> FIGURE_BLOCK_TYPES = Set.of("Figure", "Picture", "FigureGroup", "PictureGroup");
|
||||
|
||||
/** Maximum number of pages per chunk; passed to the PDF splitter by {@code parse(Path)}. */
private static final int CHUNK_SIZE = 100;
|
||||
|
||||
private static final ObjectMapper MAPPER = new ObjectMapper();
|
||||
|
||||
/** Client used by {@code submitChunk} to POST a chunk to the Marker server. */
private final RestClient restClient;
|
||||
/** Splits the input PDF into chunk files before they are submitted. */
private final PdfSplitterService pdfSplitterService;
|
||||
|
||||
public MarkerPageParser(@Qualifier("markerRestClient") RestClient restClient) {
|
||||
/**
 * Creates a parser wired to the Marker REST client and the PDF splitter.
 *
 * @param restClient         the client registered under the {@code markerRestClient} qualifier
 * @param pdfSplitterService service that breaks a PDF into chunk files
 */
public MarkerPageParser(
        @Qualifier("markerRestClient") RestClient restClient,
        PdfSplitterService pdfSplitterService) {
    this.pdfSplitterService = pdfSplitterService;
    this.restClient = restClient;
}
|
||||
|
||||
public ParsedBook parse(Path pdfPath) {
|
||||
log.info("Submitting {} to Marker (json)", pdfPath.getFileName());
|
||||
/**
|
||||
* Parses a PDF by splitting it into {@value #CHUNK_SIZE}-page chunks, submitting each
|
||||
* chunk to Marker individually, and merging the results into a single {@link ParsedBook}.
|
||||
* Page numbers in the merged result are absolute (1-based across the whole document).
|
||||
*/
|
||||
public ParsedBook parse(Path pdfPath) throws IOException {
|
||||
List<PdfSplitterService.PdfChunk> chunks = pdfSplitterService.split(pdfPath, CHUNK_SIZE);
|
||||
log.info("Processing {} chunk(s) for {}", chunks.size(), pdfPath.getFileName());
|
||||
|
||||
List<PageResult> allPages = new ArrayList<>();
|
||||
Map<Integer, String> allHtml = new LinkedHashMap<>();
|
||||
|
||||
try {
|
||||
for (int c = 0; c < chunks.size(); c++) {
|
||||
PdfSplitterService.PdfChunk chunk = chunks.get(c);
|
||||
log.info("Submitting chunk {}/{} to Marker (page offset {})", c + 1, chunks.size(), chunk.pageOffset());
|
||||
|
||||
ParsedBook chunkResult = submitChunk(chunk.tempFile());
|
||||
|
||||
// Rebase page numbers from chunk-relative to document-absolute
|
||||
for (PageResult page : chunkResult.pages()) {
|
||||
int absolutePage = chunk.pageOffset() + page.pageNumber();
|
||||
allPages.add(new PageResult(absolutePage, page.orderedText(), page.headingTitle(), page.figures()));
|
||||
}
|
||||
chunkResult.htmlByPage().forEach((chunkPage, html) ->
|
||||
allHtml.put(chunk.pageOffset() + chunkPage, html));
|
||||
}
|
||||
} finally {
|
||||
// Delete temporary chunk files (skip if the chunk is the original PDF)
|
||||
for (PdfSplitterService.PdfChunk chunk : chunks) {
|
||||
if (!chunk.tempFile().equals(pdfPath)) {
|
||||
try { Files.deleteIfExists(chunk.tempFile()); }
|
||||
catch (IOException e) { log.warn("Could not delete temp chunk {}", chunk.tempFile()); }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
log.info("Marker produced {} non-empty pages from {} chunk(s) of {}",
|
||||
allPages.size(), chunks.size(), pdfPath.getFileName());
|
||||
return new ParsedBook(allPages, allHtml);
|
||||
}
|
||||
|
||||
/** Submits a single PDF file to Marker and returns the parsed result with chunk-relative page numbers. */
|
||||
private ParsedBook submitChunk(Path chunkPath) {
|
||||
MultiValueMap<String, Object> body = new LinkedMultiValueMap<>();
|
||||
body.add("file", new FileSystemResource(pdfPath));
|
||||
body.add("file", new FileSystemResource(chunkPath));
|
||||
body.add("output_format", "json");
|
||||
|
||||
JsonNode response = restClient.post()
|
||||
@@ -76,28 +123,29 @@ public class MarkerPageParser {
|
||||
|
||||
List<JsonNode> pageNodes = extractPages(response);
|
||||
if (pageNodes.isEmpty()) {
|
||||
log.warn("Marker returned no pages for {}", pdfPath.getFileName());
|
||||
log.warn("Marker returned no pages for chunk {}", chunkPath.getFileName());
|
||||
return new ParsedBook(List.of(), Map.of());
|
||||
}
|
||||
log.info("Marker returned {} pages for {}", pageNodes.size(), pdfPath.getFileName());
|
||||
|
||||
List<PageResult> pages = new ArrayList<>();
|
||||
Map<Integer, String> htmlByPage = new LinkedHashMap<>();
|
||||
|
||||
for (int i = 0; i < pageNodes.size(); i++) {
|
||||
JsonNode pageNode = pageNodes.get(i);
|
||||
int pageNumber = i + 1; // 1-based
|
||||
int pageNumber = i + 1; // 1-based, chunk-relative
|
||||
|
||||
PageResult result = buildPageResult(pageNode, pageNumber);
|
||||
String html = jsonToHtml(pageNode);
|
||||
|
||||
// Always save HTML so the reader can navigate to every page
|
||||
htmlByPage.put(pageNumber, html);
|
||||
|
||||
// Only queue for embedding if the page has extractable content
|
||||
if (!result.orderedText().isBlank() || !result.figures().isEmpty()) {
|
||||
pages.add(result);
|
||||
htmlByPage.put(pageNumber, html);
|
||||
}
|
||||
}
|
||||
|
||||
log.info("Marker produced {} non-empty pages from {}", pages.size(), pdfPath.getFileName());
|
||||
return new ParsedBook(pages, htmlByPage);
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,72 @@
|
||||
package com.aiteacher.document;
|
||||
|
||||
import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
|
||||
import org.apache.pdfbox.multipdf.Splitter;
|
||||
import org.apache.pdfbox.pdfparser.PDFParser;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Splits a PDF file into fixed-size chunks using PDFBox.
|
||||
* Each chunk is saved as a temporary file so it can be submitted independently to Marker.
|
||||
*/
|
||||
@Service
|
||||
public class PdfSplitterService {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(PdfSplitterService.class);
|
||||
|
||||
/**
|
||||
* A chunk of a split PDF.
|
||||
*
|
||||
* @param tempFile path to the temporary PDF file (caller must delete when done)
|
||||
* @param pageOffset 0-based index of the first page in this chunk within the original document
|
||||
*/
|
||||
public record PdfChunk(Path tempFile, int pageOffset) {}
|
||||
|
||||
/**
|
||||
* Splits {@code pdfPath} into chunks of at most {@code maxPagesPerChunk} pages.
|
||||
* Returns a single-element list when the document fits in one chunk.
|
||||
*
|
||||
* @param pdfPath source PDF
|
||||
* @param maxPagesPerChunk maximum pages per chunk
|
||||
* @return ordered list of chunks; caller is responsible for deleting {@code tempFile}s
|
||||
*/
|
||||
public List<PdfChunk> split(Path pdfPath, int maxPagesPerChunk) throws IOException {
|
||||
try (PDDocument doc = new PDFParser(new RandomAccessReadBufferedFile(pdfPath.toFile())).parse()) {
|
||||
int totalPages = doc.getNumberOfPages();
|
||||
log.info("PDF {} has {} pages, splitting into chunks of {}", pdfPath.getFileName(), totalPages, maxPagesPerChunk);
|
||||
|
||||
if (totalPages <= maxPagesPerChunk) {
|
||||
// No split needed — return the original file as a single virtual chunk
|
||||
return List.of(new PdfChunk(pdfPath, 0));
|
||||
}
|
||||
|
||||
Splitter splitter = new Splitter();
|
||||
splitter.setSplitAtPage(maxPagesPerChunk);
|
||||
List<PDDocument> parts = splitter.split(doc);
|
||||
|
||||
List<PdfChunk> chunks = new ArrayList<>(parts.size());
|
||||
int offset = 0;
|
||||
for (PDDocument part : parts) {
|
||||
try {
|
||||
Path tmp = Files.createTempFile("marker-chunk-", ".pdf");
|
||||
part.save(tmp.toFile());
|
||||
chunks.add(new PdfChunk(tmp, offset));
|
||||
log.debug("Created chunk at {} (page offset {})", tmp, offset);
|
||||
offset += part.getNumberOfPages();
|
||||
} finally {
|
||||
part.close();
|
||||
}
|
||||
}
|
||||
return chunks;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,6 +1,8 @@
|
||||
package com.aiteacher.document;
|
||||
|
||||
import org.springframework.data.jpa.repository.JpaRepository;
|
||||
import org.springframework.data.jpa.repository.Query;
|
||||
import org.springframework.data.repository.query.Param;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
@@ -8,4 +10,10 @@ import java.util.UUID;
|
||||
public interface SectionRepository extends JpaRepository<SectionEntity, String> {
|
||||
List<SectionEntity> findAllByBookId(UUID bookId);
|
||||
void deleteAllByBookId(UUID bookId);
|
||||
|
||||
@Query("SELECT s FROM SectionEntity s WHERE s.bookId = :bookId AND s.pageStart <= :windowEnd AND s.pageEnd >= :windowStart ORDER BY s.pageStart")
|
||||
List<SectionEntity> findByBookIdAndPageOverlap(
|
||||
@Param("bookId") UUID bookId,
|
||||
@Param("windowStart") int windowStart,
|
||||
@Param("windowEnd") int windowEnd);
|
||||
}
|
||||
|
||||
@@ -3,6 +3,7 @@ package com.aiteacher.document;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.ai.chat.client.ChatClient;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.core.io.ByteArrayResource;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.util.MimeTypeUtils;
|
||||
@@ -32,10 +33,16 @@ public class VisionDescriptionService {
|
||||
IMAGE_TEXT: <all visible text, labels, measurements, and annotations copied verbatim, comma-separated; write NONE if no text visible>
|
||||
""";
|
||||
|
||||
/** Minimum ms between vision API calls. Configurable via app.vision.min-interval-ms. */
|
||||
private final long minIntervalMs;
|
||||
/** Chat client that {@code analyze} uses to send the image prompt. */
private final ChatClient chatClient;
|
||||
/**
 * {@code System.currentTimeMillis()} value recorded at the end of the most recent
 * {@code throttle()} call; 0 until the first call.
 */
private volatile long lastCallAt = 0;
|
||||
|
||||
public VisionDescriptionService(ChatClient chatClient) {
|
||||
/**
 * Creates the service.
 *
 * @param chatClient    chat client used for the vision requests
 * @param minIntervalMs minimum number of milliseconds between vision calls, bound from
 *                      {@code app.vision.min-interval-ms} (2000 when the property is unset)
 */
public VisionDescriptionService(
        ChatClient chatClient,
        @Value("${app.vision.min-interval-ms:2000}") long minIntervalMs) {
    this.minIntervalMs = minIntervalMs;
    this.chatClient = chatClient;
}
|
||||
|
||||
/**
|
||||
@@ -55,6 +62,7 @@ public class VisionDescriptionService {
|
||||
* @param captionFallback caption detected from surrounding text, may be null
|
||||
*/
|
||||
public ImageAnalysis analyze(byte[] imageBytes, String captionFallback) {
|
||||
throttle();
|
||||
try {
|
||||
String raw = chatClient.prompt()
|
||||
.user(u -> u
|
||||
@@ -71,6 +79,15 @@ public class VisionDescriptionService {
|
||||
}
|
||||
}
|
||||
|
||||
private synchronized void throttle() {
|
||||
long now = System.currentTimeMillis();
|
||||
long wait = minIntervalMs - (now - lastCallAt);
|
||||
if (wait > 0) {
|
||||
try { Thread.sleep(wait); } catch (InterruptedException e) { Thread.currentThread().interrupt(); }
|
||||
}
|
||||
lastCallAt = System.currentTimeMillis();
|
||||
}
|
||||
|
||||
private ImageAnalysis parse(String raw, String captionFallback) {
|
||||
String description = captionFallback != null ? captionFallback : "Figure";
|
||||
String imageText = "";
|
||||
|
||||
Reference in New Issue
Block a user