enhance rag retrieval + summary
This commit is contained in:
@@ -92,7 +92,7 @@ public class BookEmbeddingService {
|
||||
ChapterEntity chapter = new ChapterEntity(chapterId, bookId, 1, bookTitle, 1);
|
||||
chapterRepository.save(chapter);
|
||||
|
||||
// Step 1: Parse with Marker — JSON (structured) + Markdown (per-page) in parallel
|
||||
// Step 1: Parse with Marker — split into 100-page chunks, then merge results
|
||||
ParsedBook parsed = markerPageParser.parse(pdfPath);
|
||||
|
||||
List<PageResult> pageResults = parsed.pages();
|
||||
@@ -125,25 +125,32 @@ public class BookEmbeddingService {
|
||||
log.info("Saved {} HTML pages to S3 for book {}", parsed.htmlByPage().size(), bookId);
|
||||
|
||||
// Step 5: Vision analysis (description + visible text) → embed figure chunks
|
||||
for (FigureEntity figure : figures) {
|
||||
byte[] imageBytes = figureStorageService.getBytes(figure.getImagePath());
|
||||
VisionDescriptionService.ImageAnalysis analysis =
|
||||
visionDescriptionService.analyze(imageBytes, figure.getCaption());
|
||||
Map<String, SectionEntity> sectionById = new HashMap<>();
|
||||
for (SectionEntity s : sections) sectionById.put(s.getId(), s);
|
||||
|
||||
for (FigureEntity figure : figures) {
|
||||
// Prefer caption extracted from the linked section's full text
|
||||
if (figure.getCaption() == null || figure.getCaption().isBlank()) {
|
||||
figure.setCaption(analysis.description());
|
||||
figureRepository.save(figure);
|
||||
String sectionCaption = extractCaptionFromSection(sectionById.get(figure.getSectionId()));
|
||||
if (sectionCaption != null) {
|
||||
figure.setCaption(sectionCaption);
|
||||
figureRepository.save(figure);
|
||||
} else {
|
||||
byte[] imageBytes = figureStorageService.getBytes(figure.getImagePath());
|
||||
VisionDescriptionService.ImageAnalysis analysis =
|
||||
visionDescriptionService.analyze(imageBytes, figure.getCaption());
|
||||
figure.setCaption(analysis.description());
|
||||
figureRepository.save(figure);
|
||||
}
|
||||
}
|
||||
|
||||
// Embedding content: description + caption + visible image text
|
||||
String embeddingContent = analysis.description()
|
||||
+ (figure.getCaption() != null ? "\n" + figure.getCaption() : "")
|
||||
+ (analysis.imageText().isEmpty() ? "" : "\n" + analysis.imageText());
|
||||
// Embedding content: the figure caption (section-derived caption, or the vision description when none was found)
|
||||
String embeddingContent = (figure.getCaption() != null ? "\n" + figure.getCaption() : "");
|
||||
|
||||
String embeddingId = UUID.randomUUID().toString();
|
||||
if (!skipEmbedding) {
|
||||
Document figureDoc = new Document(embeddingId, embeddingContent,
|
||||
buildFigureMetadata(figure, bookTitle, embeddingId, analysis.imageText()));
|
||||
buildFigureMetadata(figure, bookTitle, embeddingId, ""));
|
||||
vectorStore.add(List.of(figureDoc));
|
||||
figure.setCaptionEmbeddingId(UUID.fromString(embeddingId));
|
||||
}
|
||||
@@ -163,7 +170,7 @@ public class BookEmbeddingService {
|
||||
}
|
||||
|
||||
book.setStatus(BookStatus.READY);
|
||||
book.setPageCount(sections.size());
|
||||
book.setPageCount(parsed.htmlByPage().size());
|
||||
book.setProcessedAt(Instant.now());
|
||||
bookRepository.save(book);
|
||||
|
||||
@@ -210,7 +217,7 @@ public class BookEmbeddingService {
|
||||
if (page.orderedText().isBlank()) continue;
|
||||
|
||||
String sectionId = bookId + "-p" + page.pageNumber();
|
||||
String title = page.headingTitle() != null ? page.headingTitle() : "Page " + page.pageNumber();
|
||||
String title = truncate(page.headingTitle() != null ? page.headingTitle() : "Page " + page.pageNumber(), 500);
|
||||
|
||||
SectionEntity section = new SectionEntity(
|
||||
sectionId, chapterId, bookId,
|
||||
@@ -271,6 +278,17 @@ public class BookEmbeddingService {
|
||||
return html;
|
||||
}
|
||||
|
||||
private String extractCaptionFromSection(SectionEntity section) {
|
||||
if (section == null) return null;
|
||||
for (String line : section.getFullText().split("\n")) {
|
||||
String trimmed = line.strip();
|
||||
if (trimmed.startsWith("Fig.") || trimmed.startsWith("Figure") || trimmed.startsWith("Algorithm")) {
|
||||
return trimmed;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private String truncate(String msg, int max) {
|
||||
if (msg == null) return null;
|
||||
return msg.length() <= max ? msg : msg.substring(0, max);
|
||||
|
||||
Reference in New Issue
Block a user