Add Marker to parse PDFs effectively
This commit is contained in:
@@ -2,7 +2,9 @@ package com.aiteacher.book;
|
||||
|
||||
import com.aiteacher.document.FigureEntity;
|
||||
import com.aiteacher.document.FigureRepository;
|
||||
import com.aiteacher.document.MarkdownStorageService;
|
||||
import org.springframework.http.HttpStatus;
|
||||
import org.springframework.http.MediaType;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.web.bind.annotation.*;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
@@ -18,10 +20,13 @@ public class BookController {
|
||||
|
||||
private final BookService bookService;
|
||||
private final FigureRepository figureRepository;
|
||||
private final MarkdownStorageService markdownStorageService;
|
||||
|
||||
public BookController(BookService bookService, FigureRepository figureRepository) {
|
||||
public BookController(BookService bookService, FigureRepository figureRepository,
|
||||
MarkdownStorageService markdownStorageService) {
|
||||
this.bookService = bookService;
|
||||
this.figureRepository = figureRepository;
|
||||
this.markdownStorageService = markdownStorageService;
|
||||
}
|
||||
|
||||
@PostMapping(consumes = "multipart/form-data")
|
||||
@@ -59,6 +64,17 @@ public class BookController {
|
||||
));
|
||||
}
|
||||
|
||||
@GetMapping(value = "/{id}/pages/{pageNumber}/markdown", produces = MediaType.TEXT_PLAIN_VALUE)
|
||||
public ResponseEntity<String> getPageMarkdown(@PathVariable UUID id,
|
||||
@PathVariable int pageNumber) {
|
||||
bookService.getById(id); // 404 if not found
|
||||
try {
|
||||
return ResponseEntity.ok(markdownStorageService.getText(id, pageNumber));
|
||||
} catch (Exception e) {
|
||||
return ResponseEntity.notFound().build();
|
||||
}
|
||||
}
|
||||
|
||||
@GetMapping("/{id}/figures")
|
||||
public ResponseEntity<List<FigureResponse>> figures(@PathVariable UUID id) {
|
||||
bookService.getById(id); // 404 if not found
|
||||
|
||||
@@ -2,6 +2,9 @@ package com.aiteacher.book;
|
||||
|
||||
import com.aiteacher.document.*;
|
||||
import com.aiteacher.figure.FigureStorageService;
|
||||
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.ai.document.Document;
|
||||
@@ -23,13 +26,7 @@ public class BookEmbeddingService {
|
||||
|
||||
private final VectorStore vectorStore;
|
||||
private final BookRepository bookRepository;
|
||||
|
||||
@Value("${app.embedding.batch-size:50}")
|
||||
private int embeddingBatchSize;
|
||||
|
||||
@Value("${app.embedding.batch-delay-ms:1000}")
|
||||
private long embeddingBatchDelayMs;
|
||||
private final PdfStructureParser pdfStructureParser;
|
||||
private final MarkerPageParser markerPageParser;
|
||||
private final FigureExtractionService figureExtractionService;
|
||||
private final VisionDescriptionService visionDescriptionService;
|
||||
private final TextChunkingService textChunkingService;
|
||||
@@ -39,11 +36,21 @@ public class BookEmbeddingService {
|
||||
private final FigureRepository figureRepository;
|
||||
private final ChunkFigureRefRepository chunkFigureRefRepository;
|
||||
private final FigureStorageService figureStorageService;
|
||||
private final MarkdownStorageService markdownStorageService;
|
||||
|
||||
private static final Pattern MARKER_PLACEHOLDER =
|
||||
Pattern.compile("!\\[([^\\]]*)\\]\\(marker://([^)]+)\\)");
|
||||
|
||||
@Value("${app.embedding.batch-size:50}")
|
||||
private int embeddingBatchSize;
|
||||
|
||||
@Value("${app.embedding.batch-delay-ms:1000}")
|
||||
private long embeddingBatchDelayMs;
|
||||
|
||||
public BookEmbeddingService(
|
||||
VectorStore vectorStore,
|
||||
BookRepository bookRepository,
|
||||
PdfStructureParser pdfStructureParser,
|
||||
MarkerPageParser markerPageParser,
|
||||
FigureExtractionService figureExtractionService,
|
||||
VisionDescriptionService visionDescriptionService,
|
||||
TextChunkingService textChunkingService,
|
||||
@@ -52,10 +59,11 @@ public class BookEmbeddingService {
|
||||
ChapterRepository chapterRepository,
|
||||
FigureRepository figureRepository,
|
||||
ChunkFigureRefRepository chunkFigureRefRepository,
|
||||
FigureStorageService figureStorageService) {
|
||||
FigureStorageService figureStorageService,
|
||||
MarkdownStorageService markdownStorageService) {
|
||||
this.vectorStore = vectorStore;
|
||||
this.bookRepository = bookRepository;
|
||||
this.pdfStructureParser = pdfStructureParser;
|
||||
this.markerPageParser = markerPageParser;
|
||||
this.figureExtractionService = figureExtractionService;
|
||||
this.visionDescriptionService = visionDescriptionService;
|
||||
this.textChunkingService = textChunkingService;
|
||||
@@ -65,11 +73,12 @@ public class BookEmbeddingService {
|
||||
this.figureRepository = figureRepository;
|
||||
this.chunkFigureRefRepository = chunkFigureRefRepository;
|
||||
this.figureStorageService = figureStorageService;
|
||||
this.markdownStorageService = markdownStorageService;
|
||||
}
|
||||
|
||||
@Async
|
||||
public void embedBook(UUID bookId, String bookTitle, Path pdfPath) {
|
||||
log.info("Starting image-aware embedding for book {} ({})", bookId, bookTitle);
|
||||
log.info("Starting Marker-powered embedding for book {} ({})", bookId, bookTitle);
|
||||
|
||||
Book book = bookRepository.findById(bookId).orElse(null);
|
||||
if (book == null) {
|
||||
@@ -81,59 +90,73 @@ public class BookEmbeddingService {
|
||||
book.setStatus(BookStatus.PROCESSING);
|
||||
bookRepository.save(book);
|
||||
|
||||
// Step 1: Parse PDF into page-level sections persisted in Postgres
|
||||
List<SectionEntity> sections = pdfStructureParser.parse(bookId, bookTitle, pdfPath);
|
||||
String chapterId = bookId + "-ch1";
|
||||
ChapterEntity chapter = new ChapterEntity(chapterId, bookId, 1, bookTitle, 1);
|
||||
chapterRepository.save(chapter);
|
||||
|
||||
// Step 2: Build and embed text chunks for all sections in batches
|
||||
// Step 1: Parse every page with Marker — correct reading order + pre-cropped figures
|
||||
List<PageResult> pageResults = markerPageParser.parse(pdfPath);
|
||||
|
||||
// Step 2: Build SectionEntity per page and persist
|
||||
List<SectionEntity> sections = buildAndSaveSections(bookId, bookTitle, chapterId, pageResults);
|
||||
|
||||
// Step 3: Chunk and embed text
|
||||
List<Document> allChunks = new ArrayList<>();
|
||||
for (SectionEntity section : sections) {
|
||||
List<Document> chunks = textChunkingService.chunk(section, bookTitle);
|
||||
allChunks.addAll(chunks);
|
||||
allChunks.addAll(textChunkingService.chunk(section, bookTitle));
|
||||
}
|
||||
embedInBatches(allChunks, bookId);
|
||||
log.info("Embedded {} text chunks for book {}", allChunks.size(), bookId);
|
||||
|
||||
// Step 3: Extract images from the PDF, save to file store, persist FigureEntity
|
||||
List<FigureEntity> figures = figureExtractionService.extract(
|
||||
bookId, chapterId, sections, pdfPath);
|
||||
// Step 4: Decode pre-cropped figures from Marker output
|
||||
FigureExtractionService.ExtractionResult extraction =
|
||||
figureExtractionService.extract(bookId, chapterId, pageResults);
|
||||
List<FigureEntity> figures = extraction.figures();
|
||||
|
||||
// Step 4: For each figure, generate vision description and embed caption
|
||||
// Step 4b: Upload per-page markdown with resolved figure URLs to S3
|
||||
for (PageResult page : pageResults) {
|
||||
if (!page.markdown().isBlank()) {
|
||||
String resolved = resolvePlaceholders(page.markdown(), bookId,
|
||||
extraction.blockIdToFigureId());
|
||||
markdownStorageService.save(bookId, page.pageNumber(), resolved);
|
||||
}
|
||||
}
|
||||
|
||||
// Step 5: Vision analysis (description + visible text) → embed figure chunks
|
||||
for (FigureEntity figure : figures) {
|
||||
byte[] imageBytes = figureStorageService.getBytes(figure.getImagePath());
|
||||
String description = visionDescriptionService.describe(
|
||||
imageBytes, figure.getCaption());
|
||||
VisionDescriptionService.ImageAnalysis analysis =
|
||||
visionDescriptionService.analyze(imageBytes, figure.getCaption());
|
||||
|
||||
// Use description as caption fallback if no caption was detected
|
||||
if (figure.getCaption() == null || figure.getCaption().isBlank()) {
|
||||
figure.setCaption(description);
|
||||
figure.setCaption(analysis.description());
|
||||
figureRepository.save(figure);
|
||||
}
|
||||
|
||||
// Content for embedding = vision description + caption for maximum signal
|
||||
String embeddingContent = description
|
||||
+ (figure.getCaption() != null ? "\n" + figure.getCaption() : "");
|
||||
// Embedding content: description + caption + visible image text
|
||||
String embeddingContent = analysis.description()
|
||||
+ (figure.getCaption() != null ? "\n" + figure.getCaption() : "")
|
||||
+ (analysis.imageText().isEmpty() ? "" : "\n" + analysis.imageText());
|
||||
|
||||
String embeddingId = UUID.randomUUID().toString();
|
||||
Map<String, Object> metadata = buildFigureMetadata(figure, bookTitle, embeddingId);
|
||||
Document figureDoc = new Document(embeddingId, embeddingContent, metadata);
|
||||
Document figureDoc = new Document(embeddingId, embeddingContent,
|
||||
buildFigureMetadata(figure, bookTitle, embeddingId, analysis.imageText()));
|
||||
vectorStore.add(List.of(figureDoc));
|
||||
|
||||
figure.setCaptionEmbeddingId(UUID.fromString(embeddingId));
|
||||
figureRepository.save(figure);
|
||||
}
|
||||
log.info("Embedded {} figure captions for book {}", figures.size(), bookId);
|
||||
log.info("Embedded {} figure chunks for book {}", figures.size(), bookId);
|
||||
|
||||
// Step 5: Link text chunks to figures via text references
|
||||
// Step 6: Link text chunks to figures via in-text references
|
||||
for (SectionEntity section : sections) {
|
||||
List<Document> sectionChunks = allChunks.stream()
|
||||
.filter(d -> section.getId().equals(d.getMetadata().get("section_id")))
|
||||
.toList();
|
||||
.filter(d -> section.getId().equals(d.getMetadata().get("section_id")))
|
||||
.toList();
|
||||
List<FigureEntity> sectionFigures = figures.stream()
|
||||
.filter(f -> section.getId().equals(f.getSectionId()))
|
||||
.toList();
|
||||
chunkFigureRefService.linkChunksToFigures(
|
||||
sectionChunks, sectionFigures, section.getPageStart());
|
||||
.filter(f -> section.getId().equals(f.getSectionId()))
|
||||
.toList();
|
||||
chunkFigureRefService.linkChunksToFigures(sectionChunks, sectionFigures, section.getPageStart());
|
||||
}
|
||||
|
||||
book.setStatus(BookStatus.READY);
|
||||
@@ -142,7 +165,7 @@ public class BookEmbeddingService {
|
||||
bookRepository.save(book);
|
||||
|
||||
log.info("Finished embedding book {} — {} pages, {} figures",
|
||||
bookId, sections.size(), figures.size());
|
||||
bookId, sections.size(), figures.size());
|
||||
|
||||
} catch (Exception ex) {
|
||||
log.error("Failed to embed book {}", bookId, ex);
|
||||
@@ -156,53 +179,63 @@ public class BookEmbeddingService {
|
||||
public void deleteBookChunks(UUID bookId) {
|
||||
log.info("Deleting all data for book {}", bookId);
|
||||
try {
|
||||
// Delete chunk-figure refs (by figureId for this book)
|
||||
List<String> figureIds = figureRepository.findAllByBookId(bookId)
|
||||
.stream().map(FigureEntity::getId).toList();
|
||||
.stream().map(FigureEntity::getId).toList();
|
||||
if (!figureIds.isEmpty()) {
|
||||
chunkFigureRefRepository.deleteByFigureIdIn(figureIds);
|
||||
}
|
||||
|
||||
// Delete figures from Postgres
|
||||
figureRepository.deleteAllByBookId(bookId);
|
||||
|
||||
// Delete figure files from disk
|
||||
figureStorageService.deleteAll(bookId);
|
||||
|
||||
// Delete sections and chapters from Postgres
|
||||
markdownStorageService.deleteAll(bookId);
|
||||
sectionRepository.deleteAllByBookId(bookId);
|
||||
chapterRepository.deleteAllByBookId(bookId);
|
||||
|
||||
// Delete vector store entries (text chunks + figure embeddings)
|
||||
FilterExpressionBuilder b = new FilterExpressionBuilder();
|
||||
vectorStore.delete(b.eq("book_id", bookId.toString()).build());
|
||||
|
||||
} catch (Exception ex) {
|
||||
log.warn("Error during cleanup for book {}: {}", bookId, ex.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
// --- Private helpers ---
|
||||
|
||||
private List<SectionEntity> buildAndSaveSections(UUID bookId, String bookTitle,
|
||||
String chapterId,
|
||||
List<PageResult> pageResults) {
|
||||
List<SectionEntity> sections = new ArrayList<>();
|
||||
for (PageResult page : pageResults) {
|
||||
if (page.orderedText().isBlank()) continue;
|
||||
|
||||
String sectionId = bookId + "-p" + page.pageNumber();
|
||||
String title = page.headingTitle() != null ? page.headingTitle() : "Page " + page.pageNumber();
|
||||
|
||||
SectionEntity section = new SectionEntity(
|
||||
sectionId, chapterId, bookId,
|
||||
String.valueOf(page.pageNumber()),
|
||||
title,
|
||||
page.pageNumber(), page.pageNumber(),
|
||||
page.orderedText());
|
||||
sections.add(sectionRepository.save(section));
|
||||
}
|
||||
return sections;
|
||||
}
|
||||
|
||||
private void embedInBatches(List<Document> docs, UUID bookId) {
|
||||
int total = docs.size();
|
||||
for (int i = 0; i < total; i += embeddingBatchSize) {
|
||||
List<Document> batch = docs.subList(i, Math.min(i + embeddingBatchSize, total));
|
||||
vectorStore.add(batch);
|
||||
int batchNum = i / embeddingBatchSize + 1;
|
||||
int totalBatches = (total - 1) / embeddingBatchSize + 1;
|
||||
log.debug("Embedded batch {}/{} for book {}", batchNum, totalBatches, bookId);
|
||||
log.debug("Embedded batch {}/{} for book {}",
|
||||
i / embeddingBatchSize + 1, (total - 1) / embeddingBatchSize + 1, bookId);
|
||||
if (i + embeddingBatchSize < total) {
|
||||
try {
|
||||
Thread.sleep(embeddingBatchDelayMs);
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
log.warn("Embedding batch sleep interrupted for book {}", bookId);
|
||||
}
|
||||
try { Thread.sleep(embeddingBatchDelayMs); }
|
||||
catch (InterruptedException e) { Thread.currentThread().interrupt(); }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private Map<String, Object> buildFigureMetadata(FigureEntity figure, String bookTitle,
|
||||
String embeddingId) {
|
||||
String embeddingId, String imageText) {
|
||||
Map<String, Object> m = new HashMap<>();
|
||||
m.put("type", "FIGURE");
|
||||
m.put("book_id", figure.getBookId().toString());
|
||||
@@ -215,9 +248,31 @@ public class BookEmbeddingService {
|
||||
m.put("label", figure.getLabel() != null ? figure.getLabel() : "");
|
||||
m.put("page", figure.getPage());
|
||||
m.put("embedding_id", embeddingId);
|
||||
m.put("image_text", imageText); // verbatim text visible inside the image
|
||||
return m;
|
||||
}
|
||||
|
||||
/** Replaces {@code marker://{blockId}} placeholders with resolved API URLs. */
|
||||
private String resolvePlaceholders(String markdown, UUID bookId,
|
||||
Map<String, String> blockIdToFigureId) {
|
||||
Matcher m = MARKER_PLACEHOLDER.matcher(markdown);
|
||||
StringBuilder sb = new StringBuilder();
|
||||
while (m.find()) {
|
||||
String altText = m.group(1);
|
||||
String blockId = m.group(2);
|
||||
String figureId = blockIdToFigureId.get(blockId);
|
||||
if (figureId != null) {
|
||||
String url = "/api/v1/figures/" + bookId + "/" + figureId + ".png";
|
||||
m.appendReplacement(sb, "");
|
||||
} else {
|
||||
m.appendReplacement(sb, ""); // figure was filtered out (too small, etc.)
|
||||
}
|
||||
}
|
||||
m.appendTail(sb);
|
||||
return sb.toString().strip();
|
||||
}
|
||||
|
||||
private String truncate(String msg, int max) {
|
||||
if (msg == null) return null;
|
||||
return msg.length() <= max ? msg : msg.substring(0, max);
|
||||
|
||||
@@ -0,0 +1,30 @@
|
||||
package com.aiteacher.config;
|
||||
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
import org.springframework.http.client.JdkClientHttpRequestFactory;
|
||||
import org.springframework.web.client.RestClient;
|
||||
|
||||
import java.net.http.HttpClient;
|
||||
|
||||
@Configuration
|
||||
public class MarkerConfig {
|
||||
|
||||
@Value("${app.marker.base-url:http://localhost:8000}")
|
||||
private String markerBaseUrl;
|
||||
|
||||
@Bean
|
||||
RestClient markerRestClient() {
|
||||
// Use the JDK HTTP client with no timeout — Marker conversions can take several minutes.
|
||||
HttpClient httpClient = HttpClient.newBuilder()
|
||||
.build();
|
||||
JdkClientHttpRequestFactory factory = new JdkClientHttpRequestFactory(httpClient);
|
||||
// No read timeout set: JDK HTTP client defaults to no deadline.
|
||||
|
||||
return RestClient.builder()
|
||||
.baseUrl(markerBaseUrl)
|
||||
.requestFactory(factory)
|
||||
.build();
|
||||
}
|
||||
}
|
||||
@@ -1,43 +1,43 @@
|
||||
package com.aiteacher.document;
|
||||
|
||||
import com.aiteacher.figure.FigureStorageService;
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
|
||||
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.UUID;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* Extracts images from each PDF page using PDFBox.
|
||||
* Images below the configured minimum size are skipped.
|
||||
* Caption is detected by the "Fig." pattern in page text.
|
||||
* Extracts figure images from {@link PageResult.FigureData} entries produced by
|
||||
* {@link MarkerPageParser}.
|
||||
*
|
||||
* <p>Marker returns pre-cropped PNG bytes for each detected figure, so no PDFBox
|
||||
* page rendering or bounding-box cropping is needed. This service:
|
||||
* <ol>
|
||||
* <li>Decodes the PNG bytes to check dimensions (skip images below min size)</li>
|
||||
* <li>Classifies the figure type from caption and surrounding text keywords</li>
|
||||
* <li>Persists the image via {@link FigureStorageService}</li>
|
||||
* <li>Persists a {@link FigureEntity} to the database</li>
|
||||
* </ol>
|
||||
*/
|
||||
@Service
|
||||
public class FigureExtractionService {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(FigureExtractionService.class);
|
||||
|
||||
// Caption: line starting with "Fig." or "Figure" followed by a number
|
||||
private static final Pattern CAPTION_PATTERN =
|
||||
Pattern.compile("(?m)^(Fig\\.?\\s*\\d+[\\-.]?\\d*[^\\n]*)", Pattern.CASE_INSENSITIVE);
|
||||
|
||||
// Figure label: "Fig. 12-4" or "Fig. 12.4"
|
||||
private static final Pattern LABEL_PATTERN =
|
||||
Pattern.compile("(?i)Fig\\.?\\s*(\\d+[\\-.\\d]*)");
|
||||
Pattern.compile("(?i)Fig\\.?\\s*(\\d+[\\-.\\d]*)");
|
||||
|
||||
private final FigureStorageService storageService;
|
||||
private final FigureRepository figureRepository;
|
||||
@@ -52,65 +52,77 @@ public class FigureExtractionService {
|
||||
this.minImageSizePx = minImageSizePx;
|
||||
}
|
||||
|
||||
/** Holds the extraction output: persisted figures and a Marker blockId → DB figureId map. */
|
||||
public record ExtractionResult(List<FigureEntity> figures, Map<String, String> blockIdToFigureId) {}
|
||||
|
||||
/**
|
||||
* Extracts all qualifying images from the PDF for the given book.
|
||||
* Returns persisted FigureEntity list (without vision descriptions — set later).
|
||||
* Extracts and persists figures for all pages described by {@code pageResults}.
|
||||
*
|
||||
* @param bookId owning book
|
||||
* @param chapterId chapter bucket for these sections
|
||||
* @param pageResults Marker parse output — each entry's {@code figures} list
|
||||
* carries pre-cropped PNG bytes for that page
|
||||
* @return {@link ExtractionResult} with persisted figures and blockId→figureId map
|
||||
* (used to resolve markdown image placeholders)
|
||||
*/
|
||||
public List<FigureEntity> extract(UUID bookId, String chapterId,
|
||||
List<SectionEntity> sections, Path pdfPath) {
|
||||
public ExtractionResult extract(UUID bookId, String chapterId,
|
||||
List<PageResult> pageResults) {
|
||||
List<FigureEntity> figures = new ArrayList<>();
|
||||
Map<String, String> blockIdToFigureId = new HashMap<>();
|
||||
int figureCounter = 0;
|
||||
|
||||
try (PDDocument doc = Loader.loadPDF(pdfPath.toFile())) {
|
||||
for (SectionEntity section : sections) {
|
||||
int pageIndex = section.getPageStart() - 1; // 0-based
|
||||
if (pageIndex < 0 || pageIndex >= doc.getNumberOfPages()) continue;
|
||||
|
||||
PDPage page = doc.getPage(pageIndex);
|
||||
String pageText = section.getFullText();
|
||||
for (PageResult page : pageResults) {
|
||||
if (page.figures().isEmpty()) continue;
|
||||
|
||||
for (PageResult.FigureData figureData : page.figures()) {
|
||||
try {
|
||||
for (COSName name : page.getResources().getXObjectNames()) {
|
||||
PDXObject xObject = page.getResources().getXObject(name);
|
||||
if (!(xObject instanceof PDImageXObject image)) continue;
|
||||
|
||||
BufferedImage bufferedImage = image.getImage();
|
||||
if (bufferedImage.getWidth() < minImageSizePx
|
||||
|| bufferedImage.getHeight() < minImageSizePx) {
|
||||
continue; // skip decorative images
|
||||
}
|
||||
|
||||
figureCounter++;
|
||||
String figureId = bookId + "-fig-" + pageIndex + "-" + figureCounter;
|
||||
String caption = detectCaption(pageText);
|
||||
String label = detectLabel(caption, figureCounter);
|
||||
FigureType type = classifyType(caption, pageText);
|
||||
|
||||
String imagePath = storageService.save(bookId, figureId, bufferedImage);
|
||||
|
||||
FigureEntity figure = new FigureEntity(
|
||||
figureId, bookId, section.getId(), chapterId,
|
||||
label, caption, type, section.getPageStart(), imagePath
|
||||
);
|
||||
figures.add(figureRepository.save(figure));
|
||||
BufferedImage image = decodeImage(figureData.imageBytes());
|
||||
if (image == null) {
|
||||
log.debug("Could not decode image on page {} of book {} (block {})",
|
||||
page.pageNumber(), bookId, figureData.blockId());
|
||||
continue;
|
||||
}
|
||||
} catch (IOException ex) {
|
||||
log.warn("Failed to extract images from page {} of book {}: {}",
|
||||
section.getPageStart(), bookId, ex.getMessage());
|
||||
if (image.getWidth() < minImageSizePx || image.getHeight() < minImageSizePx) {
|
||||
log.debug("Skipping small figure on page {} ({}×{})",
|
||||
page.pageNumber(), image.getWidth(), image.getHeight());
|
||||
continue;
|
||||
}
|
||||
|
||||
figureCounter++;
|
||||
String figureId = bookId + "-fig-" + page.pageNumber() + "-" + figureCounter;
|
||||
String caption = figureData.nearestCaption();
|
||||
String label = detectLabel(caption, figureCounter);
|
||||
FigureType type = classifyType(caption, page.orderedText());
|
||||
|
||||
String sectionId = bookId + "-p" + page.pageNumber();
|
||||
String imagePath = storageService.save(bookId, figureId, image);
|
||||
|
||||
FigureEntity figure = new FigureEntity(
|
||||
figureId, bookId, sectionId, chapterId,
|
||||
label, caption, type, page.pageNumber(), imagePath);
|
||||
figures.add(figureRepository.save(figure));
|
||||
blockIdToFigureId.put(figureData.blockId(), figureId);
|
||||
|
||||
} catch (Exception ex) {
|
||||
log.warn("Failed to extract figure on page {} of book {}: {}",
|
||||
page.pageNumber(), bookId, ex.getMessage());
|
||||
}
|
||||
}
|
||||
} catch (IOException ex) {
|
||||
log.error("Could not open PDF for image extraction, book {}", bookId, ex);
|
||||
}
|
||||
|
||||
log.info("Extracted {} figures for book {}", figures.size(), bookId);
|
||||
return figures;
|
||||
return new ExtractionResult(figures, blockIdToFigureId);
|
||||
}
|
||||
|
||||
private String detectCaption(String pageText) {
|
||||
if (pageText == null) return null;
|
||||
Matcher m = CAPTION_PATTERN.matcher(pageText);
|
||||
return m.find() ? m.group(1).trim() : null;
|
||||
// --- Private helpers ---
|
||||
|
||||
private BufferedImage decodeImage(byte[] imageBytes) {
|
||||
if (imageBytes == null || imageBytes.length == 0) return null;
|
||||
try {
|
||||
return ImageIO.read(new ByteArrayInputStream(imageBytes));
|
||||
} catch (IOException ex) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private String detectLabel(String caption, int counter) {
|
||||
@@ -122,14 +134,18 @@ public class FigureExtractionService {
|
||||
}
|
||||
|
||||
private FigureType classifyType(String caption, String pageText) {
|
||||
String combined = ((caption != null ? caption : "") + " " + (pageText != null ? pageText : "")).toLowerCase();
|
||||
String combined = ((caption != null ? caption : "") + " " +
|
||||
(pageText != null ? pageText : "")).toLowerCase();
|
||||
if (combined.contains("mri") || combined.contains("ct ") || combined.contains("magnetic")
|
||||
|| combined.contains("tomography")) return FigureType.MRI_CT_SCAN;
|
||||
if (combined.contains("intraoperative") || combined.contains("intra-op")) return FigureType.INTRAOPERATIVE_IMAGE;
|
||||
if (caption != null && caption.toLowerCase().startsWith("table")) return FigureType.TABLE;
|
||||
|| combined.contains("tomography")) return FigureType.MRI_CT_SCAN;
|
||||
if (combined.contains("intraoperative") || combined.contains("intra-op"))
|
||||
return FigureType.INTRAOPERATIVE_IMAGE;
|
||||
if (caption != null && caption.toLowerCase().startsWith("table"))
|
||||
return FigureType.TABLE;
|
||||
if (combined.contains("chart") || combined.contains("histogram") || combined.contains("graph"))
|
||||
return FigureType.CHART;
|
||||
if (combined.contains("photograph") || combined.contains("photo")) return FigureType.SURGICAL_PHOTOGRAPH;
|
||||
return FigureType.CHART;
|
||||
if (combined.contains("photograph") || combined.contains("photo"))
|
||||
return FigureType.SURGICAL_PHOTOGRAPH;
|
||||
return FigureType.ANATOMICAL_DIAGRAM;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,14 @@
|
||||
package com.aiteacher.document;
|
||||
|
||||
import java.util.UUID;
|
||||
|
||||
/**
 * Storage abstraction for the per-page markdown of a book.
 */
public interface MarkdownStorageService {

    /**
     * Uploads the markdown content of one page.
     *
     * @param bookId     book the page belongs to
     * @param pageNumber page the markdown was produced from
     * @param markdown   markdown content to store
     * @return the S3 key the content was stored under
     */
    String save(UUID bookId, int pageNumber, String markdown);

    /**
     * Downloads the markdown content of one page.
     *
     * @param bookId     book the page belongs to
     * @param pageNumber page to fetch
     * @return the stored markdown content
     */
    String getText(UUID bookId, int pageNumber);

    /**
     * Deletes all markdown files stored for a book.
     *
     * @param bookId book whose markdown files are removed
     */
    void deleteAll(UUID bookId);
}
|
||||
@@ -0,0 +1,273 @@
|
||||
package com.aiteacher.document;
|
||||
|
||||
import tools.jackson.databind.JsonNode;
|
||||
import tools.jackson.databind.ObjectMapper;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.beans.factory.annotation.Qualifier;
|
||||
import org.springframework.core.io.FileSystemResource;
|
||||
import org.springframework.http.MediaType;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.util.LinkedMultiValueMap;
|
||||
import org.springframework.util.MultiValueMap;
|
||||
import org.springframework.web.client.RestClient;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Parses a PDF using the local Marker server ({@code POST /marker/upload}).
|
||||
*
|
||||
* <p>A single HTTP call returns:
|
||||
* <ul>
|
||||
* <li>Reading-order text blocks — correct for multi-column and scanned pages</li>
|
||||
* <li>Section headings extracted from {@code SectionHeader} blocks</li>
|
||||
* <li>Pre-cropped figure images as base64-encoded PNG in each {@code Figure} block's
|
||||
* {@code images} map</li>
|
||||
* </ul>
|
||||
*
|
||||
* <p>The response is mapped to one {@link PageResult} per page block.
|
||||
*/
|
||||
@Service
|
||||
public class MarkerPageParser {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(MarkerPageParser.class);
|
||||
|
||||
private static final Set<String> TEXT_BLOCK_TYPES = Set.of(
|
||||
"Text", "TextInlineMath", "ListItem", "Table", "Code", "Equation",
|
||||
"Footnote", "Caption", "PageHeader", "PageFooter", "Handwriting"
|
||||
);
|
||||
private static final Set<String> FIGURE_BLOCK_TYPES = Set.of("Figure", "Picture", "FigureGroup", "PictureGroup");
|
||||
|
||||
private final RestClient restClient;
|
||||
private final ObjectMapper objectMapper;
|
||||
|
||||
/**
 * Creates the parser.
 *
 * @param restClient   the {@code markerRestClient} bean used for calls to the Marker server
 * @param objectMapper mapper used to read JSON carried inside the Marker response
 */
public MarkerPageParser(@Qualifier("markerRestClient") RestClient restClient,
                        ObjectMapper objectMapper) {
    this.objectMapper = objectMapper;
    this.restClient = restClient;
}
|
||||
|
||||
/**
|
||||
* Parses the entire PDF and returns one {@link PageResult} per non-empty page.
|
||||
*/
|
||||
public List<PageResult> parse(Path pdfPath) {
|
||||
log.info("Submitting {} to Marker for parsing", pdfPath.getFileName());
|
||||
|
||||
MultiValueMap<String, Object> body = new LinkedMultiValueMap<>();
|
||||
body.add("file", new FileSystemResource(pdfPath));
|
||||
body.add("output_format", "json");
|
||||
|
||||
JsonNode response = restClient.post()
|
||||
.uri("/marker/upload")
|
||||
.contentType(MediaType.MULTIPART_FORM_DATA)
|
||||
.body(body)
|
||||
.retrieve()
|
||||
.body(JsonNode.class);
|
||||
|
||||
try {
|
||||
Path debugFile = Path.of("/tmp/marker-response-md.json");
|
||||
Files.writeString(debugFile, response.toPrettyString());
|
||||
log.info("Marker response saved to {}", debugFile);
|
||||
} catch (IOException e) {
|
||||
log.warn("Could not save Marker response to file", e);
|
||||
}
|
||||
|
||||
List<PageResult> results = parseResponse(response);
|
||||
log.info("Marker produced {} page results from {}", results.size(), pdfPath.getFileName());
|
||||
return results;
|
||||
}
|
||||
|
||||
// --- Private helpers ---
|
||||
|
||||
private List<PageResult> parseResponse(JsonNode response) {
|
||||
if (response == null) return List.of();
|
||||
|
||||
// The "output" field is a JSON-encoded string — parse it first.
|
||||
// Fall back to treating the whole response as the root if "output" is absent.
|
||||
JsonNode root;
|
||||
JsonNode outputNode = response.path("output");
|
||||
if (!outputNode.isMissingNode() && outputNode.isTextual()) {
|
||||
try {
|
||||
root = objectMapper.readTree(outputNode.asText());
|
||||
} catch (tools.jackson.core.JacksonException e) {
|
||||
log.warn("Could not parse Marker 'output' field as JSON", e);
|
||||
return List.of();
|
||||
}
|
||||
} else if (!outputNode.isMissingNode()) {
|
||||
root = outputNode;
|
||||
} else {
|
||||
root = response;
|
||||
}
|
||||
|
||||
JsonNode children = root.path("children");
|
||||
if (children.isMissingNode() || !children.isArray()) {
|
||||
log.warn("Marker response has no 'children' array — empty result");
|
||||
return List.of();
|
||||
}
|
||||
|
||||
List<PageResult> results = new ArrayList<>();
|
||||
int pageIndex = 0;
|
||||
for (JsonNode pageBlock : children) {
|
||||
String blockType = pageBlock.path("block_type").asText();
|
||||
if (!"Page".equals(blockType)) continue;
|
||||
|
||||
int pageNumber = pageIndex + 1;
|
||||
pageIndex++;
|
||||
|
||||
PageResult result = parsePage(pageBlock, pageNumber);
|
||||
if (!result.orderedText().isBlank() || !result.figures().isEmpty()) {
|
||||
results.add(result);
|
||||
}
|
||||
}
|
||||
return results;
|
||||
}
|
||||
|
||||
private PageResult parsePage(JsonNode pageBlock, int pageNumber) {
|
||||
JsonNode children = pageBlock.path("children");
|
||||
if (children.isMissingNode() || !children.isArray()) {
|
||||
return new PageResult(pageNumber, "", null, List.of(), "");
|
||||
}
|
||||
|
||||
StringBuilder textBuilder = new StringBuilder();
|
||||
StringBuilder markdownBuilder = new StringBuilder();
|
||||
String headingTitle = null;
|
||||
List<PageResult.FigureData> figures = new ArrayList<>();
|
||||
Set<Integer> consumed = new HashSet<>(); // indices of Caption nodes consumed by a figure
|
||||
|
||||
List<JsonNode> childList = new ArrayList<>();
|
||||
children.forEach(childList::add);
|
||||
|
||||
for (int i = 0; i < childList.size(); i++) {
|
||||
if (consumed.contains(i)) continue;
|
||||
|
||||
JsonNode child = childList.get(i);
|
||||
String type = child.path("block_type").asText();
|
||||
|
||||
if ("SectionHeader".equals(type)) {
|
||||
String heading = stripHtml(child.path("html").asText()).strip();
|
||||
if (!heading.isEmpty() && headingTitle == null) {
|
||||
headingTitle = heading;
|
||||
}
|
||||
appendText(textBuilder, heading);
|
||||
appendMarkdown(markdownBuilder, "## " + heading);
|
||||
|
||||
} else if (TEXT_BLOCK_TYPES.contains(type)) {
|
||||
String text = stripHtml(child.path("html").asText());
|
||||
appendText(textBuilder, text);
|
||||
appendMarkdown(markdownBuilder, text.strip());
|
||||
|
||||
} else if (FIGURE_BLOCK_TYPES.contains(type)) {
|
||||
extractFigures(child, i, childList, figures, markdownBuilder, consumed);
|
||||
}
|
||||
}
|
||||
|
||||
return new PageResult(pageNumber, textBuilder.toString().strip(), headingTitle,
|
||||
figures, markdownBuilder.toString().strip());
|
||||
}
|
||||
|
||||
/**
|
||||
* Handles a figure/picture block at {@code index} in {@code siblings}.
|
||||
* For group blocks (FigureGroup, PictureGroup) the image lives in a child Picture/Figure,
|
||||
* and the caption is a sibling Caption child inside the group.
|
||||
* For leaf blocks the caption is the next sibling in the page child list.
|
||||
* Image refs are appended to {@code markdown} as {@code }.
|
||||
* Consumed caption sibling indices are added to {@code consumed}.
|
||||
*/
|
||||
private void extractFigures(JsonNode block, int index, List<JsonNode> siblings,
|
||||
List<PageResult.FigureData> out, StringBuilder markdown,
|
||||
Set<Integer> consumed) {
|
||||
String type = block.path("block_type").asText();
|
||||
boolean isGroup = type.endsWith("Group");
|
||||
|
||||
if (isGroup) {
|
||||
JsonNode groupChildren = block.path("children");
|
||||
if (groupChildren.isMissingNode() || !groupChildren.isArray()) return;
|
||||
|
||||
String groupCaption = null;
|
||||
for (JsonNode sub : groupChildren) {
|
||||
if ("Caption".equals(sub.path("block_type").asText())) {
|
||||
String c = stripHtml(sub.path("html").asText()).strip();
|
||||
if (!c.isEmpty()) groupCaption = c;
|
||||
}
|
||||
}
|
||||
for (JsonNode sub : groupChildren) {
|
||||
String subType = sub.path("block_type").asText();
|
||||
if ("Figure".equals(subType) || "Picture".equals(subType)) {
|
||||
String blockId = sub.path("id").asText();
|
||||
byte[] imageBytes = extractImageBytes(sub, blockId);
|
||||
if (imageBytes != null) {
|
||||
out.add(new PageResult.FigureData(imageBytes, groupCaption, blockId));
|
||||
String altText = groupCaption != null ? groupCaption : blockId;
|
||||
appendMarkdown(markdown, "");
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
String blockId = block.path("id").asText();
|
||||
byte[] imageBytes = extractImageBytes(block, blockId);
|
||||
if (imageBytes != null) {
|
||||
String caption = null;
|
||||
if (index + 1 < siblings.size()) {
|
||||
JsonNode next = siblings.get(index + 1);
|
||||
if ("Caption".equals(next.path("block_type").asText())) {
|
||||
String c = stripHtml(next.path("html").asText()).strip();
|
||||
if (!c.isEmpty()) caption = c;
|
||||
consumed.add(index + 1);
|
||||
}
|
||||
}
|
||||
out.add(new PageResult.FigureData(imageBytes, caption, blockId));
|
||||
String altText = caption != null ? caption : blockId;
|
||||
appendMarkdown(markdown, "");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Extracts and base64-decodes the image bytes for this block.
|
||||
* Marker stores images in the block's {@code images} map keyed by block ID.
|
||||
*/
|
||||
private byte[] extractImageBytes(JsonNode block, String blockId) {
|
||||
JsonNode images = block.path("images");
|
||||
if (images.isMissingNode() || images.isEmpty()) return null;
|
||||
|
||||
// Try the block's own ID first, then fall back to the first entry
|
||||
JsonNode imgNode = images.path(blockId);
|
||||
if (imgNode.isMissingNode()) {
|
||||
imgNode = images.properties().stream()
|
||||
.findFirst()
|
||||
.map(e -> e.getValue())
|
||||
.orElse(imgNode);
|
||||
}
|
||||
|
||||
String base64 = imgNode.asText();
|
||||
if (base64.isEmpty()) return null;
|
||||
|
||||
try {
|
||||
return Base64.getDecoder().decode(base64);
|
||||
} catch (IllegalArgumentException ex) {
|
||||
log.warn("Could not decode base64 image for block {}: {}", blockId, ex.getMessage());
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private void appendText(StringBuilder sb, String text) {
|
||||
String stripped = text.strip();
|
||||
if (stripped.isEmpty()) return;
|
||||
if (sb.length() > 0) sb.append("\n\n");
|
||||
sb.append(stripped);
|
||||
}
|
||||
|
||||
private void appendMarkdown(StringBuilder sb, String text) {
|
||||
if (text == null || text.isBlank()) return;
|
||||
if (sb.length() > 0) sb.append("\n\n");
|
||||
sb.append(text.strip());
|
||||
}
|
||||
|
||||
/** Strips HTML tags and normalises whitespace. */
|
||||
private String stripHtml(String html) {
|
||||
if (html == null || html.isEmpty()) return "";
|
||||
return html.replaceAll("<[^>]*>", "").replaceAll("\\s{2,}", " ").strip();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,26 @@
|
||||
package com.aiteacher.document;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Internal DTO produced by MarkerPageParser for one PDF page.
 * Decouples the Marker HTTP API from downstream services.
 *
 * @param pageNumber   1-based page number, derived from the Marker page block index
 * @param orderedText  full page text in reading order (blocks joined by {@code \n\n})
 * @param headingTitle first SectionHeader block on the page, or {@code null}
 * @param figures      extracted figure images (may be empty)
 * @param markdown     markdown representation with {@code marker://{blockId}} image
 *                     placeholders
 */
public record PageResult(
        int pageNumber,
        String orderedText,
        String headingTitle,
        List<FigureData> figures,
        String markdown) {

    /**
     * A figure extracted from the page.
     * Image bytes are PNG data decoded from the Marker JSON {@code images} map.
     *
     * @param imageBytes     PNG image data (base64-decoded from the Marker response)
     * @param nearestCaption text of the adjacent Caption block, or {@code null}
     * @param blockId        Marker block ID (e.g. "/page/0/Figure/2") for traceability
     */
    public record FigureData(
            byte[] imageBytes,
            String nearestCaption,
            String blockId) {
    }
}
|
||||
@@ -1,13 +1,17 @@
|
||||
package com.aiteacher.document;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.apache.pdfbox.text.PDFTextStripperByArea;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.ai.reader.pdf.PagePdfDocumentReader;
|
||||
import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig;
|
||||
import org.springframework.core.io.FileSystemResource;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.transaction.annotation.Transactional;
|
||||
|
||||
import java.awt.Rectangle;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
@@ -15,13 +19,18 @@ import java.util.UUID;
|
||||
|
||||
/**
|
||||
* Parses a PDF into page-level SectionEntity records stored in Postgres.
|
||||
* Each page becomes one section, grouped under a single chapter per book.
|
||||
* Uses column-aware extraction via PDFTextStripperByArea: for two-column pages,
|
||||
* left column is extracted first then right, preserving correct reading order.
|
||||
* Text is also normalized (collapsed whitespace) before storage.
|
||||
*/
|
||||
@Service
|
||||
public class PdfStructureParser {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(PdfStructureParser.class);
|
||||
|
||||
// Right column is considered empty (single-column page) if it has < 20% of left column's content
|
||||
private static final double TWO_COLUMN_THRESHOLD = 0.2;
|
||||
|
||||
private final ChapterRepository chapterRepository;
|
||||
private final SectionRepository sectionRepository;
|
||||
|
||||
@@ -35,37 +44,71 @@ public class PdfStructureParser {
|
||||
public List<SectionEntity> parse(UUID bookId, String bookTitle, Path pdfPath) {
|
||||
log.info("Parsing PDF structure for book {}", bookId);
|
||||
|
||||
// One chapter per book
|
||||
String chapterId = bookId + "-ch1";
|
||||
ChapterEntity chapter = new ChapterEntity(chapterId, bookId, 1, bookTitle, 1);
|
||||
chapterRepository.save(chapter);
|
||||
|
||||
// One section per page
|
||||
PagePdfDocumentReader reader = new PagePdfDocumentReader(
|
||||
new FileSystemResource(pdfPath.toFile()),
|
||||
PdfDocumentReaderConfig.builder().withPagesPerDocument(1).build()
|
||||
);
|
||||
|
||||
List<org.springframework.ai.document.Document> pages = reader.get();
|
||||
List<SectionEntity> sections = new ArrayList<>();
|
||||
|
||||
for (int i = 0; i < pages.size(); i++) {
|
||||
int pageNum = i + 1;
|
||||
String text = pages.get(i).getText();
|
||||
if (text == null || text.isBlank()) continue;
|
||||
try (PDDocument doc = Loader.loadPDF(pdfPath.toFile())) {
|
||||
List<PDPage> pages = new ArrayList<>();
|
||||
doc.getPages().forEach(pages::add);
|
||||
|
||||
String sectionId = bookId + "-p" + pageNum;
|
||||
SectionEntity section = new SectionEntity(
|
||||
sectionId, chapterId, bookId,
|
||||
String.valueOf(pageNum),
|
||||
"Page " + pageNum,
|
||||
pageNum, pageNum,
|
||||
text
|
||||
);
|
||||
sections.add(sectionRepository.save(section));
|
||||
for (int i = 0; i < 25; i++) {
|
||||
int pageNum = i + 1;
|
||||
String text = normalizeWhitespace(extractPageText(pages.get(i)));
|
||||
if (text.isBlank()) continue;
|
||||
|
||||
String sectionId = bookId + "-p" + pageNum;
|
||||
SectionEntity section = new SectionEntity(
|
||||
sectionId, chapterId, bookId,
|
||||
String.valueOf(pageNum),
|
||||
"Page " + pageNum,
|
||||
pageNum, pageNum,
|
||||
text
|
||||
);
|
||||
sections.add(sectionRepository.save(section));
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Failed to parse PDF for book " + bookId, e);
|
||||
}
|
||||
|
||||
log.info("Parsed {} sections for book {}", sections.size(), bookId);
|
||||
return sections;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extracts text from a single page using column-aware region extraction.
|
||||
* Splits the page at the horizontal midpoint. If the right region has fewer
|
||||
* than 20% of the characters of the left region, treats the page as single-column.
|
||||
*/
|
||||
private String extractPageText(PDPage page) throws IOException {
|
||||
PDRectangle mediaBox = page.getMediaBox();
|
||||
int width = (int) mediaBox.getWidth();
|
||||
int height = (int) mediaBox.getHeight();
|
||||
int mid = width / 2;
|
||||
|
||||
PDFTextStripperByArea stripper = new PDFTextStripperByArea();
|
||||
stripper.setSortByPosition(true);
|
||||
stripper.addRegion("left", new Rectangle(0, 0, mid, height));
|
||||
stripper.addRegion("right", new Rectangle(mid, 0, width - mid, height));
|
||||
stripper.extractRegions(page);
|
||||
|
||||
String left = stripper.getTextForRegion("left").strip();
|
||||
String right = stripper.getTextForRegion("right").strip();
|
||||
|
||||
if (right.length() < left.length() * TWO_COLUMN_THRESHOLD) {
|
||||
// Single-column page — left holds all (or nearly all) content
|
||||
return left.isEmpty() ? right : left;
|
||||
}
|
||||
return left + "\n\n" + right;
|
||||
}
|
||||
|
||||
/** Collapses multi-space/tab runs and excessive blank lines. */
|
||||
private String normalizeWhitespace(String text) {
|
||||
return text
|
||||
.replaceAll("[ \t]{2,}", " ")
|
||||
.replaceAll("\n{3,}", "\n\n")
|
||||
.trim();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,97 @@
|
||||
package com.aiteacher.document;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Service;
|
||||
import software.amazon.awssdk.auth.credentials.AwsBasicCredentials;
|
||||
import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider;
|
||||
import software.amazon.awssdk.core.sync.RequestBody;
|
||||
import software.amazon.awssdk.regions.Region;
|
||||
import software.amazon.awssdk.services.s3.S3Client;
|
||||
import software.amazon.awssdk.services.s3.S3Configuration;
|
||||
import software.amazon.awssdk.services.s3.model.*;
|
||||
|
||||
import java.net.URI;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
|
||||
@Service
|
||||
public class S3MarkdownStorageService implements MarkdownStorageService {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(S3MarkdownStorageService.class);
|
||||
|
||||
private final S3Client s3;
|
||||
private final String bucket;
|
||||
|
||||
public S3MarkdownStorageService(
|
||||
@Value("${app.figure-storage.endpoint}") String endpoint,
|
||||
@Value("${app.figure-storage.region}") String region,
|
||||
@Value("${app.figure-storage.bucket}") String bucket,
|
||||
@Value("${app.figure-storage.access-key-id}") String accessKeyId,
|
||||
@Value("${app.figure-storage.secret-access-key}") String secretKey) {
|
||||
this.bucket = bucket;
|
||||
URI endpointUri = URI.create(endpoint);
|
||||
StaticCredentialsProvider credentials = StaticCredentialsProvider.create(
|
||||
AwsBasicCredentials.create(accessKeyId, secretKey));
|
||||
Region awsRegion = Region.of(region);
|
||||
S3Configuration s3Config = S3Configuration.builder().pathStyleAccessEnabled(true).build();
|
||||
|
||||
this.s3 = S3Client.builder()
|
||||
.endpointOverride(endpointUri)
|
||||
.region(awsRegion)
|
||||
.credentialsProvider(credentials)
|
||||
.serviceConfiguration(s3Config)
|
||||
.build();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String save(UUID bookId, int pageNumber, String markdown) {
|
||||
String key = key(bookId, pageNumber);
|
||||
byte[] bytes = markdown.getBytes(StandardCharsets.UTF_8);
|
||||
s3.putObject(
|
||||
PutObjectRequest.builder().bucket(bucket).key(key)
|
||||
.contentType("text/markdown; charset=utf-8")
|
||||
.contentLength((long) bytes.length).build(),
|
||||
RequestBody.fromBytes(bytes));
|
||||
return key;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getText(UUID bookId, int pageNumber) {
|
||||
byte[] bytes = s3.getObjectAsBytes(
|
||||
GetObjectRequest.builder().bucket(bucket).key(key(bookId, pageNumber)).build()
|
||||
).asByteArray();
|
||||
return new String(bytes, StandardCharsets.UTF_8);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void deleteAll(UUID bookId) {
|
||||
String prefix = "markdown/" + bookId + "/";
|
||||
try {
|
||||
List<ObjectIdentifier> toDelete = new ArrayList<>();
|
||||
s3.listObjectsV2Paginator(ListObjectsV2Request.builder()
|
||||
.bucket(bucket).prefix(prefix).build()).stream()
|
||||
.flatMap(page -> page.contents().stream())
|
||||
.map(S3Object::key)
|
||||
.map(k -> ObjectIdentifier.builder().key(k).build())
|
||||
.forEach(toDelete::add);
|
||||
|
||||
if (toDelete.isEmpty()) return;
|
||||
|
||||
s3.deleteObjects(DeleteObjectsRequest.builder()
|
||||
.bucket(bucket)
|
||||
.delete(Delete.builder().objects(toDelete).build())
|
||||
.build());
|
||||
log.info("Deleted {} markdown files from S3 for book {}", toDelete.size(), bookId);
|
||||
} catch (S3Exception ex) {
|
||||
log.warn("Could not fully delete markdown for book {} from S3: {}", bookId, ex.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
private static String key(UUID bookId, int pageNumber) {
|
||||
return "markdown/" + bookId + "/page-" + pageNumber + ".md";
|
||||
}
|
||||
}
|
||||
@@ -38,14 +38,52 @@ public class TextChunkingService {
|
||||
List<String> windows = new ArrayList<>();
|
||||
int start = 0;
|
||||
while (start < text.length()) {
|
||||
int end = Math.min(start + TARGET_CHARS, text.length());
|
||||
windows.add(text.substring(start, end));
|
||||
if (end == text.length()) break;
|
||||
start = end - OVERLAP_CHARS;
|
||||
int hardEnd = Math.min(start + TARGET_CHARS, text.length());
|
||||
if (hardEnd == text.length()) {
|
||||
String last = text.substring(start).strip();
|
||||
if (!last.isEmpty()) windows.add(last);
|
||||
break;
|
||||
}
|
||||
int splitAt = findSplitPoint(text, start, hardEnd);
|
||||
String chunk = text.substring(start, splitAt).strip();
|
||||
if (!chunk.isEmpty()) windows.add(chunk);
|
||||
// Overlap: back up from split point, align to a word start
|
||||
int overlapStart = Math.max(start + 1, splitAt - OVERLAP_CHARS);
|
||||
while (overlapStart < splitAt && text.charAt(overlapStart) != ' ') overlapStart++;
|
||||
start = overlapStart < splitAt ? overlapStart + 1 : splitAt;
|
||||
}
|
||||
return windows;
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds the best split point at or before hardEnd, preferring (in order):
|
||||
* paragraph boundary, sentence boundary, word boundary, hard cut.
|
||||
*/
|
||||
private int findSplitPoint(String text, int start, int hardEnd) {
|
||||
int lookback = Math.min(400, (hardEnd - start) / 2);
|
||||
|
||||
// 1. Paragraph boundary
|
||||
int paraIdx = text.lastIndexOf("\n\n", hardEnd);
|
||||
if (paraIdx > hardEnd - lookback && paraIdx > start) return paraIdx + 2;
|
||||
|
||||
// 2. Sentence boundary (. ! ?) followed by space or newline
|
||||
for (int i = hardEnd - 1; i > hardEnd - lookback && i > start; i--) {
|
||||
char c = text.charAt(i);
|
||||
if ((c == '.' || c == '!' || c == '?') && i + 1 < text.length()) {
|
||||
char next = text.charAt(i + 1);
|
||||
if (next == ' ' || next == '\n') return i + 1;
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Word boundary
|
||||
for (int i = hardEnd - 1; i > hardEnd - 100 && i > start; i--) {
|
||||
if (text.charAt(i) == ' ') return i + 1;
|
||||
}
|
||||
|
||||
// 4. Hard cut
|
||||
return hardEnd;
|
||||
}
|
||||
|
||||
private Map<String, Object> buildMetadata(SectionEntity section, String bookTitle,
|
||||
int index, int total, String chunkId) {
|
||||
Map<String, Object> m = new HashMap<>();
|
||||
|
||||
@@ -8,18 +8,29 @@ import org.springframework.stereotype.Service;
|
||||
import org.springframework.util.MimeTypeUtils;
|
||||
|
||||
/**
|
||||
* Generates a clinical text description for an extracted figure image
|
||||
* using the OpenAI vision model via Spring AI ChatClient.
|
||||
* Analyses an extracted figure image using the OpenAI vision model.
|
||||
*
|
||||
* <p>Returns an {@link ImageAnalysis} record containing:
|
||||
* <ul>
|
||||
* <li>{@code description} — 2-3 sentence clinical description of the image</li>
|
||||
* <li>{@code imageText} — all visible text, labels, and annotations copied verbatim
|
||||
* from the image (empty string when none present)</li>
|
||||
* </ul>
|
||||
*
|
||||
* <p>Both fields are stored: {@code description} drives the embedding; {@code imageText}
|
||||
* is added to chunk metadata so queries can match exact labels (e.g., "Circle of Willis").
|
||||
*/
|
||||
@Service
|
||||
public class VisionDescriptionService {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(VisionDescriptionService.class);
|
||||
|
||||
private static final String PROMPT =
|
||||
"You are a neurosurgery educator. Provide a brief 2-3 sentence clinical description of " +
|
||||
"this image. Focus on anatomical structures, surgical landmarks, labels, and clinical " +
|
||||
"significance. If text or labels are visible, include them verbatim.";
|
||||
private static final String PROMPT = """
|
||||
You are a neurosurgery educator analysing a medical image.
|
||||
Respond in EXACTLY this format — no other text, no markdown:
|
||||
DESCRIPTION: <2-3 sentence clinical description focusing on anatomical structures, surgical landmarks, and clinical significance>
|
||||
IMAGE_TEXT: <all visible text, labels, measurements, and annotations copied verbatim, comma-separated; write NONE if no text visible>
|
||||
""";
|
||||
|
||||
private final ChatClient chatClient;
|
||||
|
||||
@@ -28,19 +39,53 @@ public class VisionDescriptionService {
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a description string. Falls back to the provided caption if vision fails.
|
||||
* Holds the structured output of a vision model call on one figure image.
|
||||
*
|
||||
* @param description clinical description of the image content
|
||||
* @param imageText verbatim text visible inside the image; empty string if none
|
||||
*/
|
||||
public String describe(byte[] imageBytes, String captionFallback) {
|
||||
public record ImageAnalysis(String description, String imageText) {}
|
||||
|
||||
/**
|
||||
* Analyses the image bytes and returns an {@link ImageAnalysis}.
|
||||
* Falls back gracefully: if the vision call fails, the caption is used as description
|
||||
* and imageText is left empty.
|
||||
*
|
||||
* @param imageBytes PNG bytes of the extracted figure
|
||||
* @param captionFallback caption detected from surrounding text, may be null
|
||||
*/
|
||||
public ImageAnalysis analyze(byte[] imageBytes, String captionFallback) {
|
||||
try {
|
||||
return chatClient.prompt()
|
||||
.user(u -> u
|
||||
.text(PROMPT)
|
||||
.media(MimeTypeUtils.IMAGE_PNG, new ByteArrayResource(imageBytes)))
|
||||
.call()
|
||||
.content();
|
||||
String raw = chatClient.prompt()
|
||||
.user(u -> u
|
||||
.text(PROMPT)
|
||||
.media(MimeTypeUtils.IMAGE_PNG, new ByteArrayResource(imageBytes)))
|
||||
.call()
|
||||
.content();
|
||||
return parse(raw, captionFallback);
|
||||
} catch (Exception ex) {
|
||||
log.warn("Vision description failed: {} — using caption as fallback", ex.getMessage());
|
||||
return captionFallback != null ? captionFallback : "Figure";
|
||||
log.warn("Vision analysis failed: {} — using caption as fallback", ex.getMessage());
|
||||
return new ImageAnalysis(
|
||||
captionFallback != null ? captionFallback : "Figure",
|
||||
"");
|
||||
}
|
||||
}
|
||||
|
||||
private ImageAnalysis parse(String raw, String captionFallback) {
|
||||
String description = captionFallback != null ? captionFallback : "Figure";
|
||||
String imageText = "";
|
||||
|
||||
if (raw != null) {
|
||||
for (String line : raw.split("\n")) {
|
||||
if (line.startsWith("DESCRIPTION:")) {
|
||||
String val = line.substring("DESCRIPTION:".length()).strip();
|
||||
if (!val.isEmpty()) description = val;
|
||||
} else if (line.startsWith("IMAGE_TEXT:")) {
|
||||
String val = line.substring("IMAGE_TEXT:".length()).strip();
|
||||
if (!val.isEmpty() && !"NONE".equalsIgnoreCase(val)) imageText = val;
|
||||
}
|
||||
}
|
||||
}
|
||||
return new ImageAnalysis(description, imageText);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user