enhance rag retrieval + summary

This commit is contained in:
Adrien
2026-04-07 22:39:28 +02:00
parent 0cf318f0a7
commit aee6a9dfba
34 changed files with 2306 additions and 279 deletions
@@ -92,7 +92,7 @@ public class BookEmbeddingService {
ChapterEntity chapter = new ChapterEntity(chapterId, bookId, 1, bookTitle, 1);
chapterRepository.save(chapter);
// Step 1: Parse with Marker — JSON (structured) + Markdown (per-page) in parallel
// Step 1: Parse with Marker — split into 100-page chunks, then merge results
ParsedBook parsed = markerPageParser.parse(pdfPath);
List<PageResult> pageResults = parsed.pages();
@@ -125,25 +125,32 @@ public class BookEmbeddingService {
log.info("Saved {} HTML pages to S3 for book {}", parsed.htmlByPage().size(), bookId);
// Step 5: Vision analysis (description + visible text) → embed figure chunks
for (FigureEntity figure : figures) {
byte[] imageBytes = figureStorageService.getBytes(figure.getImagePath());
VisionDescriptionService.ImageAnalysis analysis =
visionDescriptionService.analyze(imageBytes, figure.getCaption());
Map<String, SectionEntity> sectionById = new HashMap<>();
for (SectionEntity s : sections) sectionById.put(s.getId(), s);
for (FigureEntity figure : figures) {
// Prefer caption extracted from the linked section's full text
if (figure.getCaption() == null || figure.getCaption().isBlank()) {
figure.setCaption(analysis.description());
figureRepository.save(figure);
String sectionCaption = extractCaptionFromSection(sectionById.get(figure.getSectionId()));
if (sectionCaption != null) {
figure.setCaption(sectionCaption);
figureRepository.save(figure);
} else {
byte[] imageBytes = figureStorageService.getBytes(figure.getImagePath());
VisionDescriptionService.ImageAnalysis analysis =
visionDescriptionService.analyze(imageBytes, figure.getCaption());
figure.setCaption(analysis.description());
figureRepository.save(figure);
}
}
// Embedding content: description + caption + visible image text
String embeddingContent = analysis.description()
+ (figure.getCaption() != null ? "\n" + figure.getCaption() : "")
+ (analysis.imageText().isEmpty() ? "" : "\n" + analysis.imageText());
// Embedding content: the figure caption (extracted from the section text, or the vision description as fallback)
String embeddingContent = (figure.getCaption() != null ? "\n" + figure.getCaption() : "");
String embeddingId = UUID.randomUUID().toString();
if (!skipEmbedding) {
Document figureDoc = new Document(embeddingId, embeddingContent,
buildFigureMetadata(figure, bookTitle, embeddingId, analysis.imageText()));
buildFigureMetadata(figure, bookTitle, embeddingId, ""));
vectorStore.add(List.of(figureDoc));
figure.setCaptionEmbeddingId(UUID.fromString(embeddingId));
}
@@ -163,7 +170,7 @@ public class BookEmbeddingService {
}
book.setStatus(BookStatus.READY);
book.setPageCount(sections.size());
book.setPageCount(parsed.htmlByPage().size());
book.setProcessedAt(Instant.now());
bookRepository.save(book);
@@ -210,7 +217,7 @@ public class BookEmbeddingService {
if (page.orderedText().isBlank()) continue;
String sectionId = bookId + "-p" + page.pageNumber();
String title = page.headingTitle() != null ? page.headingTitle() : "Page " + page.pageNumber();
String title = truncate(page.headingTitle() != null ? page.headingTitle() : "Page " + page.pageNumber(), 500);
SectionEntity section = new SectionEntity(
sectionId, chapterId, bookId,
@@ -271,6 +278,17 @@ public class BookEmbeddingService {
return html;
}
/**
 * Looks for a caption line in the full text of the given section.
 *
 * <p>The text is split on newlines, and the first line that, once stripped, starts with
 * {@code "Fig."}, {@code "Figure"} or {@code "Algorithm"} is returned.
 *
 * @param section the section to search; may be {@code null}
 * @return the stripped caption line, or {@code null} when the section is {@code null},
 *         has no full text, or contains no matching line
 */
private String extractCaptionFromSection(SectionEntity section) {
    if (section == null) return null;
    String fullText = section.getFullText();
    // NOTE(review): nullability of getFullText() is not visible here; without this guard
    // a null text would throw a NullPointerException on split instead of yielding "no caption".
    if (fullText == null) return null;
    for (String line : fullText.split("\n")) {
        String trimmed = line.strip();
        if (trimmed.startsWith("Fig.") || trimmed.startsWith("Figure") || trimmed.startsWith("Algorithm")) {
            return trimmed;
        }
    }
    return null;
}
private String truncate(String msg, int max) {
if (msg == null) return null;
return msg.length() <= max ? msg : msg.substring(0, max);