enhance page parsing using json output and html
This commit is contained in:
@@ -64,9 +64,9 @@ public class BookController {
|
||||
));
|
||||
}
|
||||
|
||||
@GetMapping(value = "/{id}/pages/{pageNumber}/markdown", produces = MediaType.TEXT_PLAIN_VALUE)
|
||||
public ResponseEntity<String> getPageMarkdown(@PathVariable UUID id,
|
||||
@PathVariable int pageNumber) {
|
||||
@GetMapping(value = "/{id}/pages/{pageNumber}/html", produces = MediaType.TEXT_HTML_VALUE)
|
||||
public ResponseEntity<String> getPageHtml(@PathVariable UUID id,
|
||||
@PathVariable int pageNumber) {
|
||||
bookService.getById(id); // 404 if not found
|
||||
try {
|
||||
return ResponseEntity.ok(markdownStorageService.getText(id, pageNumber));
|
||||
|
||||
@@ -3,8 +3,6 @@ package com.aiteacher.book;
|
||||
import com.aiteacher.document.*;
|
||||
import com.aiteacher.figure.FigureStorageService;
|
||||
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.ai.document.Document;
|
||||
@@ -38,15 +36,15 @@ public class BookEmbeddingService {
|
||||
private final FigureStorageService figureStorageService;
|
||||
private final MarkdownStorageService markdownStorageService;
|
||||
|
||||
private static final Pattern MARKER_PLACEHOLDER =
|
||||
Pattern.compile("!\\[([^\\]]*)\\]\\(marker://([^)]+)\\)");
|
||||
|
||||
@Value("${app.embedding.batch-size:50}")
|
||||
private int embeddingBatchSize;
|
||||
|
||||
@Value("${app.embedding.batch-delay-ms:1000}")
|
||||
private long embeddingBatchDelayMs;
|
||||
|
||||
@Value("${app.embedding.skip-embedding:false}")
|
||||
private boolean skipEmbedding;
|
||||
|
||||
public BookEmbeddingService(
|
||||
VectorStore vectorStore,
|
||||
BookRepository bookRepository,
|
||||
@@ -94,8 +92,10 @@ public class BookEmbeddingService {
|
||||
ChapterEntity chapter = new ChapterEntity(chapterId, bookId, 1, bookTitle, 1);
|
||||
chapterRepository.save(chapter);
|
||||
|
||||
// Step 1: Parse every page with Marker — correct reading order + pre-cropped figures
|
||||
List<PageResult> pageResults = markerPageParser.parse(pdfPath);
|
||||
// Step 1: Parse with Marker — JSON (structured) + Markdown (per-page) in parallel
|
||||
ParsedBook parsed = markerPageParser.parse(pdfPath);
|
||||
|
||||
List<PageResult> pageResults = parsed.pages();
|
||||
|
||||
// Step 2: Build SectionEntity per page and persist
|
||||
List<SectionEntity> sections = buildAndSaveSections(bookId, bookTitle, chapterId, pageResults);
|
||||
@@ -105,22 +105,24 @@ public class BookEmbeddingService {
|
||||
for (SectionEntity section : sections) {
|
||||
allChunks.addAll(textChunkingService.chunk(section, bookTitle));
|
||||
}
|
||||
embedInBatches(allChunks, bookId);
|
||||
log.info("Embedded {} text chunks for book {}", allChunks.size(), bookId);
|
||||
if (skipEmbedding) {
|
||||
log.info("skip-embedding=true — skipping text embedding for book {}", bookId);
|
||||
} else {
|
||||
embedInBatches(allChunks, bookId);
|
||||
log.info("Embedded {} text chunks for book {}", allChunks.size(), bookId);
|
||||
}
|
||||
|
||||
// Step 4: Decode pre-cropped figures from Marker output
|
||||
FigureExtractionService.ExtractionResult extraction =
|
||||
figureExtractionService.extract(bookId, chapterId, pageResults);
|
||||
List<FigureEntity> figures = extraction.figures();
|
||||
|
||||
// Step 4b: Upload per-page markdown with resolved figure URLs to S3
|
||||
for (PageResult page : pageResults) {
|
||||
if (!page.markdown().isBlank()) {
|
||||
String resolved = resolvePlaceholders(page.markdown(), bookId,
|
||||
extraction.blockIdToFigureId());
|
||||
markdownStorageService.save(bookId, page.pageNumber(), resolved);
|
||||
}
|
||||
}
|
||||
// Step 4b: Save per-page HTML to S3, replacing Marker image src with API URLs
|
||||
parsed.htmlByPage().forEach((pageNumber, html) -> {
|
||||
String resolved = resolveImageSrcs(html, bookId, extraction.blockIdToFigureId());
|
||||
markdownStorageService.save(bookId, pageNumber, resolved);
|
||||
});
|
||||
log.info("Saved {} HTML pages to S3 for book {}", parsed.htmlByPage().size(), bookId);
|
||||
|
||||
// Step 5: Vision analysis (description + visible text) → embed figure chunks
|
||||
for (FigureEntity figure : figures) {
|
||||
@@ -139,11 +141,12 @@ public class BookEmbeddingService {
|
||||
+ (analysis.imageText().isEmpty() ? "" : "\n" + analysis.imageText());
|
||||
|
||||
String embeddingId = UUID.randomUUID().toString();
|
||||
Document figureDoc = new Document(embeddingId, embeddingContent,
|
||||
buildFigureMetadata(figure, bookTitle, embeddingId, analysis.imageText()));
|
||||
vectorStore.add(List.of(figureDoc));
|
||||
|
||||
figure.setCaptionEmbeddingId(UUID.fromString(embeddingId));
|
||||
if (!skipEmbedding) {
|
||||
Document figureDoc = new Document(embeddingId, embeddingContent,
|
||||
buildFigureMetadata(figure, bookTitle, embeddingId, analysis.imageText()));
|
||||
vectorStore.add(List.of(figureDoc));
|
||||
figure.setCaptionEmbeddingId(UUID.fromString(embeddingId));
|
||||
}
|
||||
figureRepository.save(figure);
|
||||
}
|
||||
log.info("Embedded {} figure chunks for book {}", figures.size(), bookId);
|
||||
@@ -252,25 +255,20 @@ public class BookEmbeddingService {
|
||||
return m;
|
||||
}
|
||||
|
||||
/** Replaces {@code marker://{blockId}} placeholders with resolved API URLs. */
|
||||
private String resolvePlaceholders(String markdown, UUID bookId,
|
||||
Map<String, String> blockIdToFigureId) {
|
||||
Matcher m = MARKER_PLACEHOLDER.matcher(markdown);
|
||||
StringBuilder sb = new StringBuilder();
|
||||
while (m.find()) {
|
||||
String altText = m.group(1);
|
||||
String blockId = m.group(2);
|
||||
String figureId = blockIdToFigureId.get(blockId);
|
||||
if (figureId != null) {
|
||||
String url = "/api/v1/figures/" + bookId + "/" + figureId + ".png";
|
||||
m.appendReplacement(sb, "");
|
||||
} else {
|
||||
m.appendReplacement(sb, ""); // figure was filtered out (too small, etc.)
|
||||
}
|
||||
/**
|
||||
* Replaces Marker's {@code src='{blockId}'} image attributes with resolved API URLs.
|
||||
* Block IDs look like {@code /page/0/Figure/2}.
|
||||
*/
|
||||
private String resolveImageSrcs(String html, UUID bookId, Map<String, String> blockIdToFigureId) {
|
||||
for (Map.Entry<String, String> entry : blockIdToFigureId.entrySet()) {
|
||||
String blockId = entry.getKey();
|
||||
String figureId = entry.getValue();
|
||||
String apiUrl = "/api/v1/figures/" + bookId + "/" + figureId + ".png";
|
||||
// Marker emits both single and double-quoted src attributes
|
||||
html = html.replace("src='" + blockId + "'", "src='" + apiUrl + "'");
|
||||
html = html.replace("src=\"" + blockId + "\"", "src=\"" + apiUrl + "\"");
|
||||
}
|
||||
m.appendTail(sb);
|
||||
return sb.toString().strip();
|
||||
return html;
|
||||
}
|
||||
|
||||
private String truncate(String msg, int max) {
|
||||
|
||||
Reference in New Issue
Block a user