Add Marker to parse PDFs effectively

This commit is contained in:
Adrien
2026-04-04 21:30:18 +02:00
parent b154e29f2d
commit ea1276dc2e
25 changed files with 2318 additions and 285 deletions
@@ -2,7 +2,9 @@ package com.aiteacher.book;
import com.aiteacher.document.FigureEntity;
import com.aiteacher.document.FigureRepository;
import com.aiteacher.document.MarkdownStorageService;
import org.springframework.http.HttpStatus;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.*;
import org.springframework.web.multipart.MultipartFile;
@@ -18,10 +20,13 @@ public class BookController {
private final BookService bookService;
private final FigureRepository figureRepository;
private final MarkdownStorageService markdownStorageService;
/**
 * Stores the collaborators this controller delegates to: the book service,
 * the figure repository, and the markdown storage service.
 *
 * NOTE(review): both a two-argument and a three-argument signature line are
 * shown below; presumably the first is the pre-change line of this diff and
 * the second is its replacement — confirm against the actual file.
 */
public BookController(BookService bookService, FigureRepository figureRepository) {
public BookController(BookService bookService, FigureRepository figureRepository,
MarkdownStorageService markdownStorageService) {
this.bookService = bookService;
this.figureRepository = figureRepository;
this.markdownStorageService = markdownStorageService;
}
@PostMapping(consumes = "multipart/form-data")
@@ -59,6 +64,17 @@ public class BookController {
));
}
/**
 * Serves the stored markdown of a single book page as plain text.
 *
 * <p>The book is looked up first; per the inline note that lookup is expected
 * to produce a 404 for unknown ids. Any exception raised while reading the
 * page's markdown is translated into an empty 404 response.
 *
 * @param id         identifier of the book
 * @param pageNumber page whose markdown is requested
 * @return 200 with the markdown text, or 404 when it cannot be read
 */
@GetMapping(value = "/{id}/pages/{pageNumber}/markdown", produces = MediaType.TEXT_PLAIN_VALUE)
public ResponseEntity<String> getPageMarkdown(@PathVariable UUID id,
        @PathVariable int pageNumber) {
    bookService.getById(id); // 404 if not found

    String pageMarkdown;
    try {
        pageMarkdown = markdownStorageService.getText(id, pageNumber);
    } catch (Exception e) {
        // Every read failure is reported to the client as "not found".
        return ResponseEntity.notFound().build();
    }
    return ResponseEntity.ok(pageMarkdown);
}
@GetMapping("/{id}/figures")
public ResponseEntity<List<FigureResponse>> figures(@PathVariable UUID id) {
bookService.getById(id); // 404 if not found
@@ -2,6 +2,9 @@ package com.aiteacher.book;
import com.aiteacher.document.*;
import com.aiteacher.figure.FigureStorageService;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.ai.document.Document;
@@ -23,13 +26,7 @@ public class BookEmbeddingService {
private final VectorStore vectorStore;
private final BookRepository bookRepository;
@Value("${app.embedding.batch-size:50}")
private int embeddingBatchSize;
@Value("${app.embedding.batch-delay-ms:1000}")
private long embeddingBatchDelayMs;
private final PdfStructureParser pdfStructureParser;
private final MarkerPageParser markerPageParser;
private final FigureExtractionService figureExtractionService;
private final VisionDescriptionService visionDescriptionService;
private final TextChunkingService textChunkingService;
@@ -39,11 +36,21 @@ public class BookEmbeddingService {
private final FigureRepository figureRepository;
private final ChunkFigureRefRepository chunkFigureRefRepository;
private final FigureStorageService figureStorageService;
private final MarkdownStorageService markdownStorageService;
private static final Pattern MARKER_PLACEHOLDER =
Pattern.compile("!\\[([^\\]]*)\\]\\(marker://([^)]+)\\)");
@Value("${app.embedding.batch-size:50}")
private int embeddingBatchSize;
@Value("${app.embedding.batch-delay-ms:1000}")
private long embeddingBatchDelayMs;
public BookEmbeddingService(
VectorStore vectorStore,
BookRepository bookRepository,
PdfStructureParser pdfStructureParser,
MarkerPageParser markerPageParser,
FigureExtractionService figureExtractionService,
VisionDescriptionService visionDescriptionService,
TextChunkingService textChunkingService,
@@ -52,10 +59,11 @@ public class BookEmbeddingService {
ChapterRepository chapterRepository,
FigureRepository figureRepository,
ChunkFigureRefRepository chunkFigureRefRepository,
FigureStorageService figureStorageService) {
FigureStorageService figureStorageService,
MarkdownStorageService markdownStorageService) {
this.vectorStore = vectorStore;
this.bookRepository = bookRepository;
this.pdfStructureParser = pdfStructureParser;
this.markerPageParser = markerPageParser;
this.figureExtractionService = figureExtractionService;
this.visionDescriptionService = visionDescriptionService;
this.textChunkingService = textChunkingService;
@@ -65,11 +73,12 @@ public class BookEmbeddingService {
this.figureRepository = figureRepository;
this.chunkFigureRefRepository = chunkFigureRefRepository;
this.figureStorageService = figureStorageService;
this.markdownStorageService = markdownStorageService;
}
@Async
public void embedBook(UUID bookId, String bookTitle, Path pdfPath) {
log.info("Starting image-aware embedding for book {} ({})", bookId, bookTitle);
log.info("Starting Marker-powered embedding for book {} ({})", bookId, bookTitle);
Book book = bookRepository.findById(bookId).orElse(null);
if (book == null) {
@@ -81,59 +90,73 @@ public class BookEmbeddingService {
book.setStatus(BookStatus.PROCESSING);
bookRepository.save(book);
// Step 1: Parse PDF into page-level sections persisted in Postgres
List<SectionEntity> sections = pdfStructureParser.parse(bookId, bookTitle, pdfPath);
String chapterId = bookId + "-ch1";
ChapterEntity chapter = new ChapterEntity(chapterId, bookId, 1, bookTitle, 1);
chapterRepository.save(chapter);
// Step 2: Build and embed text chunks for all sections in batches
// Step 1: Parse every page with Marker — correct reading order + pre-cropped figures
List<PageResult> pageResults = markerPageParser.parse(pdfPath);
// Step 2: Build SectionEntity per page and persist
List<SectionEntity> sections = buildAndSaveSections(bookId, bookTitle, chapterId, pageResults);
// Step 3: Chunk and embed text
List<Document> allChunks = new ArrayList<>();
for (SectionEntity section : sections) {
List<Document> chunks = textChunkingService.chunk(section, bookTitle);
allChunks.addAll(chunks);
allChunks.addAll(textChunkingService.chunk(section, bookTitle));
}
embedInBatches(allChunks, bookId);
log.info("Embedded {} text chunks for book {}", allChunks.size(), bookId);
// Step 3: Extract images from the PDF, save to file store, persist FigureEntity
List<FigureEntity> figures = figureExtractionService.extract(
bookId, chapterId, sections, pdfPath);
// Step 4: Decode pre-cropped figures from Marker output
FigureExtractionService.ExtractionResult extraction =
figureExtractionService.extract(bookId, chapterId, pageResults);
List<FigureEntity> figures = extraction.figures();
// Step 4: For each figure, generate vision description and embed caption
// Step 4b: Upload per-page markdown with resolved figure URLs to S3
for (PageResult page : pageResults) {
if (!page.markdown().isBlank()) {
String resolved = resolvePlaceholders(page.markdown(), bookId,
extraction.blockIdToFigureId());
markdownStorageService.save(bookId, page.pageNumber(), resolved);
}
}
// Step 5: Vision analysis (description + visible text) → embed figure chunks
for (FigureEntity figure : figures) {
byte[] imageBytes = figureStorageService.getBytes(figure.getImagePath());
String description = visionDescriptionService.describe(
imageBytes, figure.getCaption());
VisionDescriptionService.ImageAnalysis analysis =
visionDescriptionService.analyze(imageBytes, figure.getCaption());
// Use description as caption fallback if no caption was detected
if (figure.getCaption() == null || figure.getCaption().isBlank()) {
figure.setCaption(description);
figure.setCaption(analysis.description());
figureRepository.save(figure);
}
// Content for embedding = vision description + caption for maximum signal
String embeddingContent = description
+ (figure.getCaption() != null ? "\n" + figure.getCaption() : "");
// Embedding content: description + caption + visible image text
String embeddingContent = analysis.description()
+ (figure.getCaption() != null ? "\n" + figure.getCaption() : "")
+ (analysis.imageText().isEmpty() ? "" : "\n" + analysis.imageText());
String embeddingId = UUID.randomUUID().toString();
Map<String, Object> metadata = buildFigureMetadata(figure, bookTitle, embeddingId);
Document figureDoc = new Document(embeddingId, embeddingContent, metadata);
Document figureDoc = new Document(embeddingId, embeddingContent,
buildFigureMetadata(figure, bookTitle, embeddingId, analysis.imageText()));
vectorStore.add(List.of(figureDoc));
figure.setCaptionEmbeddingId(UUID.fromString(embeddingId));
figureRepository.save(figure);
}
log.info("Embedded {} figure captions for book {}", figures.size(), bookId);
log.info("Embedded {} figure chunks for book {}", figures.size(), bookId);
// Step 5: Link text chunks to figures via text references
// Step 6: Link text chunks to figures via in-text references
for (SectionEntity section : sections) {
List<Document> sectionChunks = allChunks.stream()
.filter(d -> section.getId().equals(d.getMetadata().get("section_id")))
.toList();
.filter(d -> section.getId().equals(d.getMetadata().get("section_id")))
.toList();
List<FigureEntity> sectionFigures = figures.stream()
.filter(f -> section.getId().equals(f.getSectionId()))
.toList();
chunkFigureRefService.linkChunksToFigures(
sectionChunks, sectionFigures, section.getPageStart());
.filter(f -> section.getId().equals(f.getSectionId()))
.toList();
chunkFigureRefService.linkChunksToFigures(sectionChunks, sectionFigures, section.getPageStart());
}
book.setStatus(BookStatus.READY);
@@ -142,7 +165,7 @@ public class BookEmbeddingService {
bookRepository.save(book);
log.info("Finished embedding book {} — {} pages, {} figures",
bookId, sections.size(), figures.size());
bookId, sections.size(), figures.size());
} catch (Exception ex) {
log.error("Failed to embed book {}", bookId, ex);
@@ -156,53 +179,63 @@ public class BookEmbeddingService {
/**
 * Removes the data stored for a book: chunk-figure references, figure rows,
 * figure files, stored markdown, sections, chapters, and the vector store
 * entries whose {@code book_id} metadata equals the book id.
 *
 * <p>Cleanup is best-effort: any exception is caught and logged at WARN level
 * instead of being propagated, so a failure part-way through leaves the
 * remaining steps unexecuted.
 *
 * @param bookId identifier of the book whose data is removed
 */
public void deleteBookChunks(UUID bookId) {
log.info("Deleting all data for book {}", bookId);
try {
// Delete chunk-figure refs (by figureId for this book)
List<String> figureIds = figureRepository.findAllByBookId(bookId)
.stream().map(FigureEntity::getId).toList();
.stream().map(FigureEntity::getId).toList();
// Skip the bulk delete when the book has no figures.
if (!figureIds.isEmpty()) {
chunkFigureRefRepository.deleteByFigureIdIn(figureIds);
}
// Delete figures from Postgres
figureRepository.deleteAllByBookId(bookId);
// Delete figure files from disk
figureStorageService.deleteAll(bookId);
// Delete stored markdown for the book, then sections and chapters from Postgres
markdownStorageService.deleteAll(bookId);
sectionRepository.deleteAllByBookId(bookId);
chapterRepository.deleteAllByBookId(bookId);
// Delete vector store entries (text chunks + figure embeddings)
FilterExpressionBuilder b = new FilterExpressionBuilder();
vectorStore.delete(b.eq("book_id", bookId.toString()).build());
} catch (Exception ex) {
// NOTE(review): only ex.getMessage() is logged, so the stack trace is not recorded.
log.warn("Error during cleanup for book {}: {}", bookId, ex.getMessage());
}
}
// --- Private helpers ---
/**
 * Turns parsed pages into persisted sections, one section per page.
 *
 * <p>Pages whose ordered text is blank produce no section. A section's id is
 * {@code <bookId>-p<pageNumber>}, its title is the page heading or
 * {@code "Page <pageNumber>"} when the heading is {@code null}, and it starts
 * and ends on that single page.
 *
 * @param bookId      book the sections belong to
 * @param bookTitle   title of the book (not used when building sections)
 * @param chapterId   chapter every section is attached to
 * @param pageResults parsed pages, in the order they should be stored
 * @return the entities returned by the repository, in page order
 */
private List<SectionEntity> buildAndSaveSections(UUID bookId, String bookTitle,
        String chapterId,
        List<PageResult> pageResults) {
    List<SectionEntity> persisted = new ArrayList<>();
    for (PageResult pageResult : pageResults) {
        if (!pageResult.orderedText().isBlank()) {
            String heading = pageResult.headingTitle();
            if (heading == null) {
                heading = "Page " + pageResult.pageNumber();
            }
            SectionEntity candidate = new SectionEntity(
                    bookId + "-p" + pageResult.pageNumber(),
                    chapterId, bookId,
                    String.valueOf(pageResult.pageNumber()),
                    heading,
                    pageResult.pageNumber(), pageResult.pageNumber(),
                    pageResult.orderedText());
            persisted.add(sectionRepository.save(candidate));
        }
    }
    return persisted;
}
/**
 * Adds documents to the vector store in consecutive batches of at most
 * {@code embeddingBatchSize}, sleeping {@code embeddingBatchDelayMs}
 * milliseconds between batches.
 *
 * NOTE(review): assumes {@code embeddingBatchSize} is positive; it is used as
 * both the loop step and a divisor — confirm the configured value is validated.
 *
 * @param docs   documents to add, in order
 * @param bookId used only in log messages
 */
private void embedInBatches(List<Document> docs, UUID bookId) {
int total = docs.size();
for (int i = 0; i < total; i += embeddingBatchSize) {
// The end index is clamped to the list size, so the last batch may be shorter.
List<Document> batch = docs.subList(i, Math.min(i + embeddingBatchSize, total));
vectorStore.add(batch);
int batchNum = i / embeddingBatchSize + 1;
int totalBatches = (total - 1) / embeddingBatchSize + 1;
log.debug("Embedded batch {}/{} for book {}", batchNum, totalBatches, bookId);
log.debug("Embedded batch {}/{} for book {}",
i / embeddingBatchSize + 1, (total - 1) / embeddingBatchSize + 1, bookId);
// No delay is needed after the final batch.
if (i + embeddingBatchSize < total) {
try {
Thread.sleep(embeddingBatchDelayMs);
} catch (InterruptedException e) {
// Restore the interrupt flag; the loop keeps processing the remaining batches.
Thread.currentThread().interrupt();
log.warn("Embedding batch sleep interrupted for book {}", bookId);
}
try { Thread.sleep(embeddingBatchDelayMs); }
catch (InterruptedException e) { Thread.currentThread().interrupt(); }
}
}
}
private Map<String, Object> buildFigureMetadata(FigureEntity figure, String bookTitle,
String embeddingId) {
String embeddingId, String imageText) {
Map<String, Object> m = new HashMap<>();
m.put("type", "FIGURE");
m.put("book_id", figure.getBookId().toString());
@@ -215,9 +248,31 @@ public class BookEmbeddingService {
m.put("label", figure.getLabel() != null ? figure.getLabel() : "");
m.put("page", figure.getPage());
m.put("embedding_id", embeddingId);
m.put("image_text", imageText); // verbatim text visible inside the image
return m;
}
/**
 * Replaces {@code ![alt](marker://{blockId})} image placeholders with image
 * links that point at the figure API.
 *
 * <p>For each placeholder whose block id has an entry in
 * {@code blockIdToFigureId}, the link target becomes
 * {@code /api/v1/figures/{bookId}/{figureId}.png} and the alt text is kept.
 * Placeholders with no mapped figure (e.g. the figure was filtered out
 * upstream) are removed entirely.
 *
 * <p>The whole replacement is passed through
 * {@link Matcher#quoteReplacement(String)} so that a {@code $} or {@code \}
 * in either the alt text or the figure id is inserted literally instead of
 * being interpreted as a group reference or escape.
 *
 * @param markdown          page markdown containing zero or more placeholders
 * @param bookId            book id used to build the figure URLs
 * @param blockIdToFigureId mapping from placeholder block id to figure id
 * @return the markdown with placeholders resolved, stripped of leading and
 *         trailing whitespace
 */
private String resolvePlaceholders(String markdown, UUID bookId,
        Map<String, String> blockIdToFigureId) {
    String resolved = MARKER_PLACEHOLDER.matcher(markdown).replaceAll(match -> {
        String figureId = blockIdToFigureId.get(match.group(2));
        if (figureId == null) {
            // No figure for this block: drop the placeholder.
            return "";
        }
        String url = "/api/v1/figures/" + bookId + "/" + figureId + ".png";
        return Matcher.quoteReplacement("![" + match.group(1) + "](" + url + ")");
    });
    return resolved.strip();
}
private String truncate(String msg, int max) {
if (msg == null) return null;
return msg.length() <= max ? msg : msg.substring(0, max);