Add Marker to parse PDFs effectively
This commit is contained in:
@@ -2,7 +2,9 @@ package com.aiteacher.book;
|
||||
|
||||
import com.aiteacher.document.FigureEntity;
|
||||
import com.aiteacher.document.FigureRepository;
|
||||
import com.aiteacher.document.MarkdownStorageService;
|
||||
import org.springframework.http.HttpStatus;
|
||||
import org.springframework.http.MediaType;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.web.bind.annotation.*;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
@@ -18,10 +20,13 @@ public class BookController {
|
||||
|
||||
private final BookService bookService;
|
||||
private final FigureRepository figureRepository;
|
||||
private final MarkdownStorageService markdownStorageService;
|
||||
|
||||
/**
 * Creates the controller with its collaborators.
 *
 * @param bookService            service used to look up books by id
 * @param figureRepository       figure repository held by this controller
 * @param markdownStorageService storage service used to read per-page markdown text
 */
public BookController(BookService bookService, FigureRepository figureRepository,
                      MarkdownStorageService markdownStorageService) {
    this.bookService = bookService;
    this.figureRepository = figureRepository;
    this.markdownStorageService = markdownStorageService;
}
|
||||
|
||||
@PostMapping(consumes = "multipart/form-data")
|
||||
@@ -59,6 +64,17 @@ public class BookController {
|
||||
));
|
||||
}
|
||||
|
||||
@GetMapping(value = "/{id}/pages/{pageNumber}/markdown", produces = MediaType.TEXT_PLAIN_VALUE)
|
||||
public ResponseEntity<String> getPageMarkdown(@PathVariable UUID id,
|
||||
@PathVariable int pageNumber) {
|
||||
bookService.getById(id); // 404 if not found
|
||||
try {
|
||||
return ResponseEntity.ok(markdownStorageService.getText(id, pageNumber));
|
||||
} catch (Exception e) {
|
||||
return ResponseEntity.notFound().build();
|
||||
}
|
||||
}
|
||||
|
||||
@GetMapping("/{id}/figures")
|
||||
public ResponseEntity<List<FigureResponse>> figures(@PathVariable UUID id) {
|
||||
bookService.getById(id); // 404 if not found
|
||||
|
||||
@@ -2,6 +2,9 @@ package com.aiteacher.book;
|
||||
|
||||
import com.aiteacher.document.*;
|
||||
import com.aiteacher.figure.FigureStorageService;
|
||||
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.ai.document.Document;
|
||||
@@ -23,13 +26,7 @@ public class BookEmbeddingService {
|
||||
|
||||
private final VectorStore vectorStore;
|
||||
private final BookRepository bookRepository;
|
||||
|
||||
@Value("${app.embedding.batch-size:50}")
|
||||
private int embeddingBatchSize;
|
||||
|
||||
@Value("${app.embedding.batch-delay-ms:1000}")
|
||||
private long embeddingBatchDelayMs;
|
||||
private final PdfStructureParser pdfStructureParser;
|
||||
private final MarkerPageParser markerPageParser;
|
||||
private final FigureExtractionService figureExtractionService;
|
||||
private final VisionDescriptionService visionDescriptionService;
|
||||
private final TextChunkingService textChunkingService;
|
||||
@@ -39,11 +36,21 @@ public class BookEmbeddingService {
|
||||
private final FigureRepository figureRepository;
|
||||
private final ChunkFigureRefRepository chunkFigureRefRepository;
|
||||
private final FigureStorageService figureStorageService;
|
||||
private final MarkdownStorageService markdownStorageService;
|
||||
|
||||
private static final Pattern MARKER_PLACEHOLDER =
|
||||
Pattern.compile("!\\[([^\\]]*)\\]\\(marker://([^)]+)\\)");
|
||||
|
||||
@Value("${app.embedding.batch-size:50}")
|
||||
private int embeddingBatchSize;
|
||||
|
||||
@Value("${app.embedding.batch-delay-ms:1000}")
|
||||
private long embeddingBatchDelayMs;
|
||||
|
||||
public BookEmbeddingService(
|
||||
VectorStore vectorStore,
|
||||
BookRepository bookRepository,
|
||||
PdfStructureParser pdfStructureParser,
|
||||
MarkerPageParser markerPageParser,
|
||||
FigureExtractionService figureExtractionService,
|
||||
VisionDescriptionService visionDescriptionService,
|
||||
TextChunkingService textChunkingService,
|
||||
@@ -52,10 +59,11 @@ public class BookEmbeddingService {
|
||||
ChapterRepository chapterRepository,
|
||||
FigureRepository figureRepository,
|
||||
ChunkFigureRefRepository chunkFigureRefRepository,
|
||||
FigureStorageService figureStorageService) {
|
||||
FigureStorageService figureStorageService,
|
||||
MarkdownStorageService markdownStorageService) {
|
||||
this.vectorStore = vectorStore;
|
||||
this.bookRepository = bookRepository;
|
||||
this.pdfStructureParser = pdfStructureParser;
|
||||
this.markerPageParser = markerPageParser;
|
||||
this.figureExtractionService = figureExtractionService;
|
||||
this.visionDescriptionService = visionDescriptionService;
|
||||
this.textChunkingService = textChunkingService;
|
||||
@@ -65,11 +73,12 @@ public class BookEmbeddingService {
|
||||
this.figureRepository = figureRepository;
|
||||
this.chunkFigureRefRepository = chunkFigureRefRepository;
|
||||
this.figureStorageService = figureStorageService;
|
||||
this.markdownStorageService = markdownStorageService;
|
||||
}
|
||||
|
||||
@Async
|
||||
public void embedBook(UUID bookId, String bookTitle, Path pdfPath) {
|
||||
log.info("Starting image-aware embedding for book {} ({})", bookId, bookTitle);
|
||||
log.info("Starting Marker-powered embedding for book {} ({})", bookId, bookTitle);
|
||||
|
||||
Book book = bookRepository.findById(bookId).orElse(null);
|
||||
if (book == null) {
|
||||
@@ -81,59 +90,73 @@ public class BookEmbeddingService {
|
||||
book.setStatus(BookStatus.PROCESSING);
|
||||
bookRepository.save(book);
|
||||
|
||||
// Step 1: Parse PDF into page-level sections persisted in Postgres
|
||||
List<SectionEntity> sections = pdfStructureParser.parse(bookId, bookTitle, pdfPath);
|
||||
String chapterId = bookId + "-ch1";
|
||||
ChapterEntity chapter = new ChapterEntity(chapterId, bookId, 1, bookTitle, 1);
|
||||
chapterRepository.save(chapter);
|
||||
|
||||
// Step 2: Build and embed text chunks for all sections in batches
|
||||
// Step 1: Parse every page with Marker — correct reading order + pre-cropped figures
|
||||
List<PageResult> pageResults = markerPageParser.parse(pdfPath);
|
||||
|
||||
// Step 2: Build SectionEntity per page and persist
|
||||
List<SectionEntity> sections = buildAndSaveSections(bookId, bookTitle, chapterId, pageResults);
|
||||
|
||||
// Step 3: Chunk and embed text
|
||||
List<Document> allChunks = new ArrayList<>();
|
||||
for (SectionEntity section : sections) {
|
||||
List<Document> chunks = textChunkingService.chunk(section, bookTitle);
|
||||
allChunks.addAll(chunks);
|
||||
allChunks.addAll(textChunkingService.chunk(section, bookTitle));
|
||||
}
|
||||
embedInBatches(allChunks, bookId);
|
||||
log.info("Embedded {} text chunks for book {}", allChunks.size(), bookId);
|
||||
|
||||
// Step 3: Extract images from the PDF, save to file store, persist FigureEntity
|
||||
List<FigureEntity> figures = figureExtractionService.extract(
|
||||
bookId, chapterId, sections, pdfPath);
|
||||
// Step 4: Decode pre-cropped figures from Marker output
|
||||
FigureExtractionService.ExtractionResult extraction =
|
||||
figureExtractionService.extract(bookId, chapterId, pageResults);
|
||||
List<FigureEntity> figures = extraction.figures();
|
||||
|
||||
// Step 4: For each figure, generate vision description and embed caption
|
||||
// Step 4b: Upload per-page markdown with resolved figure URLs to S3
|
||||
for (PageResult page : pageResults) {
|
||||
if (!page.markdown().isBlank()) {
|
||||
String resolved = resolvePlaceholders(page.markdown(), bookId,
|
||||
extraction.blockIdToFigureId());
|
||||
markdownStorageService.save(bookId, page.pageNumber(), resolved);
|
||||
}
|
||||
}
|
||||
|
||||
// Step 5: Vision analysis (description + visible text) → embed figure chunks
|
||||
for (FigureEntity figure : figures) {
|
||||
byte[] imageBytes = figureStorageService.getBytes(figure.getImagePath());
|
||||
String description = visionDescriptionService.describe(
|
||||
imageBytes, figure.getCaption());
|
||||
VisionDescriptionService.ImageAnalysis analysis =
|
||||
visionDescriptionService.analyze(imageBytes, figure.getCaption());
|
||||
|
||||
// Use description as caption fallback if no caption was detected
|
||||
if (figure.getCaption() == null || figure.getCaption().isBlank()) {
|
||||
figure.setCaption(description);
|
||||
figure.setCaption(analysis.description());
|
||||
figureRepository.save(figure);
|
||||
}
|
||||
|
||||
// Content for embedding = vision description + caption for maximum signal
|
||||
String embeddingContent = description
|
||||
+ (figure.getCaption() != null ? "\n" + figure.getCaption() : "");
|
||||
// Embedding content: description + caption + visible image text
|
||||
String embeddingContent = analysis.description()
|
||||
+ (figure.getCaption() != null ? "\n" + figure.getCaption() : "")
|
||||
+ (analysis.imageText().isEmpty() ? "" : "\n" + analysis.imageText());
|
||||
|
||||
String embeddingId = UUID.randomUUID().toString();
|
||||
Map<String, Object> metadata = buildFigureMetadata(figure, bookTitle, embeddingId);
|
||||
Document figureDoc = new Document(embeddingId, embeddingContent, metadata);
|
||||
Document figureDoc = new Document(embeddingId, embeddingContent,
|
||||
buildFigureMetadata(figure, bookTitle, embeddingId, analysis.imageText()));
|
||||
vectorStore.add(List.of(figureDoc));
|
||||
|
||||
figure.setCaptionEmbeddingId(UUID.fromString(embeddingId));
|
||||
figureRepository.save(figure);
|
||||
}
|
||||
log.info("Embedded {} figure captions for book {}", figures.size(), bookId);
|
||||
log.info("Embedded {} figure chunks for book {}", figures.size(), bookId);
|
||||
|
||||
// Step 5: Link text chunks to figures via text references
|
||||
// Step 6: Link text chunks to figures via in-text references
|
||||
for (SectionEntity section : sections) {
|
||||
List<Document> sectionChunks = allChunks.stream()
|
||||
.filter(d -> section.getId().equals(d.getMetadata().get("section_id")))
|
||||
.toList();
|
||||
.filter(d -> section.getId().equals(d.getMetadata().get("section_id")))
|
||||
.toList();
|
||||
List<FigureEntity> sectionFigures = figures.stream()
|
||||
.filter(f -> section.getId().equals(f.getSectionId()))
|
||||
.toList();
|
||||
chunkFigureRefService.linkChunksToFigures(
|
||||
sectionChunks, sectionFigures, section.getPageStart());
|
||||
.filter(f -> section.getId().equals(f.getSectionId()))
|
||||
.toList();
|
||||
chunkFigureRefService.linkChunksToFigures(sectionChunks, sectionFigures, section.getPageStart());
|
||||
}
|
||||
|
||||
book.setStatus(BookStatus.READY);
|
||||
@@ -142,7 +165,7 @@ public class BookEmbeddingService {
|
||||
bookRepository.save(book);
|
||||
|
||||
log.info("Finished embedding book {} — {} pages, {} figures",
|
||||
bookId, sections.size(), figures.size());
|
||||
bookId, sections.size(), figures.size());
|
||||
|
||||
} catch (Exception ex) {
|
||||
log.error("Failed to embed book {}", bookId, ex);
|
||||
@@ -156,53 +179,63 @@ public class BookEmbeddingService {
|
||||
public void deleteBookChunks(UUID bookId) {
|
||||
log.info("Deleting all data for book {}", bookId);
|
||||
try {
|
||||
// Delete chunk-figure refs (by figureId for this book)
|
||||
List<String> figureIds = figureRepository.findAllByBookId(bookId)
|
||||
.stream().map(FigureEntity::getId).toList();
|
||||
.stream().map(FigureEntity::getId).toList();
|
||||
if (!figureIds.isEmpty()) {
|
||||
chunkFigureRefRepository.deleteByFigureIdIn(figureIds);
|
||||
}
|
||||
|
||||
// Delete figures from Postgres
|
||||
figureRepository.deleteAllByBookId(bookId);
|
||||
|
||||
// Delete figure files from disk
|
||||
figureStorageService.deleteAll(bookId);
|
||||
|
||||
// Delete sections and chapters from Postgres
|
||||
markdownStorageService.deleteAll(bookId);
|
||||
sectionRepository.deleteAllByBookId(bookId);
|
||||
chapterRepository.deleteAllByBookId(bookId);
|
||||
|
||||
// Delete vector store entries (text chunks + figure embeddings)
|
||||
FilterExpressionBuilder b = new FilterExpressionBuilder();
|
||||
vectorStore.delete(b.eq("book_id", bookId.toString()).build());
|
||||
|
||||
} catch (Exception ex) {
|
||||
log.warn("Error during cleanup for book {}: {}", bookId, ex.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
// --- Private helpers ---
|
||||
|
||||
private List<SectionEntity> buildAndSaveSections(UUID bookId, String bookTitle,
|
||||
String chapterId,
|
||||
List<PageResult> pageResults) {
|
||||
List<SectionEntity> sections = new ArrayList<>();
|
||||
for (PageResult page : pageResults) {
|
||||
if (page.orderedText().isBlank()) continue;
|
||||
|
||||
String sectionId = bookId + "-p" + page.pageNumber();
|
||||
String title = page.headingTitle() != null ? page.headingTitle() : "Page " + page.pageNumber();
|
||||
|
||||
SectionEntity section = new SectionEntity(
|
||||
sectionId, chapterId, bookId,
|
||||
String.valueOf(page.pageNumber()),
|
||||
title,
|
||||
page.pageNumber(), page.pageNumber(),
|
||||
page.orderedText());
|
||||
sections.add(sectionRepository.save(section));
|
||||
}
|
||||
return sections;
|
||||
}
|
||||
|
||||
private void embedInBatches(List<Document> docs, UUID bookId) {
|
||||
int total = docs.size();
|
||||
for (int i = 0; i < total; i += embeddingBatchSize) {
|
||||
List<Document> batch = docs.subList(i, Math.min(i + embeddingBatchSize, total));
|
||||
vectorStore.add(batch);
|
||||
int batchNum = i / embeddingBatchSize + 1;
|
||||
int totalBatches = (total - 1) / embeddingBatchSize + 1;
|
||||
log.debug("Embedded batch {}/{} for book {}", batchNum, totalBatches, bookId);
|
||||
log.debug("Embedded batch {}/{} for book {}",
|
||||
i / embeddingBatchSize + 1, (total - 1) / embeddingBatchSize + 1, bookId);
|
||||
if (i + embeddingBatchSize < total) {
|
||||
try {
|
||||
Thread.sleep(embeddingBatchDelayMs);
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
log.warn("Embedding batch sleep interrupted for book {}", bookId);
|
||||
}
|
||||
try { Thread.sleep(embeddingBatchDelayMs); }
|
||||
catch (InterruptedException e) { Thread.currentThread().interrupt(); }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private Map<String, Object> buildFigureMetadata(FigureEntity figure, String bookTitle,
|
||||
String embeddingId) {
|
||||
String embeddingId, String imageText) {
|
||||
Map<String, Object> m = new HashMap<>();
|
||||
m.put("type", "FIGURE");
|
||||
m.put("book_id", figure.getBookId().toString());
|
||||
@@ -215,9 +248,31 @@ public class BookEmbeddingService {
|
||||
m.put("label", figure.getLabel() != null ? figure.getLabel() : "");
|
||||
m.put("page", figure.getPage());
|
||||
m.put("embedding_id", embeddingId);
|
||||
m.put("image_text", imageText); // verbatim text visible inside the image
|
||||
return m;
|
||||
}
|
||||
|
||||
/** Replaces {@code marker://{blockId}} placeholders with resolved API URLs. */
|
||||
private String resolvePlaceholders(String markdown, UUID bookId,
|
||||
Map<String, String> blockIdToFigureId) {
|
||||
Matcher m = MARKER_PLACEHOLDER.matcher(markdown);
|
||||
StringBuilder sb = new StringBuilder();
|
||||
while (m.find()) {
|
||||
String altText = m.group(1);
|
||||
String blockId = m.group(2);
|
||||
String figureId = blockIdToFigureId.get(blockId);
|
||||
if (figureId != null) {
|
||||
String url = "/api/v1/figures/" + bookId + "/" + figureId + ".png";
|
||||
m.appendReplacement(sb, "");
|
||||
} else {
|
||||
m.appendReplacement(sb, ""); // figure was filtered out (too small, etc.)
|
||||
}
|
||||
}
|
||||
m.appendTail(sb);
|
||||
return sb.toString().strip();
|
||||
}
|
||||
|
||||
private String truncate(String msg, int max) {
|
||||
if (msg == null) return null;
|
||||
return msg.length() <= max ? msg : msg.substring(0, max);
|
||||
|
||||
Reference in New Issue
Block a user