first implementation - image/drawing integration
This commit is contained in:
@@ -1,5 +1,7 @@
|
||||
package com.aiteacher.book;
|
||||
|
||||
import com.aiteacher.document.FigureEntity;
|
||||
import com.aiteacher.document.FigureRepository;
|
||||
import org.springframework.http.HttpStatus;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.web.bind.annotation.*;
|
||||
@@ -15,9 +17,11 @@ import java.util.UUID;
|
||||
public class BookController {
|
||||
|
||||
private final BookService bookService;
|
||||
private final FigureRepository figureRepository;
|
||||
|
||||
public BookController(BookService bookService) {
|
||||
/**
 * Creates the controller with the collaborators its endpoints use.
 *
 * @param bookService      service the book endpoints delegate to
 * @param figureRepository repository queried when listing a book's figures
 */
public BookController(BookService bookService, FigureRepository figureRepository) {
    this.figureRepository = figureRepository;
    this.bookService = bookService;
}
|
||||
|
||||
@PostMapping(consumes = "multipart/form-data")
|
||||
@@ -46,6 +50,36 @@ public class BookController {
|
||||
return ResponseEntity.noContent().build();
|
||||
}
|
||||
|
||||
@PostMapping("/{id}/reembed")
|
||||
public ResponseEntity<Map<String, Object>> reembed(@PathVariable UUID id) {
|
||||
Book book = bookService.reembed(id);
|
||||
return ResponseEntity.accepted().body(Map.of(
|
||||
"bookId", book.getId(),
|
||||
"status", BookStatus.PROCESSING.name()
|
||||
));
|
||||
}
|
||||
|
||||
@GetMapping("/{id}/figures")
|
||||
public ResponseEntity<List<FigureResponse>> figures(@PathVariable UUID id) {
|
||||
bookService.getById(id); // 404 if not found
|
||||
List<FigureResponse> responses = figureRepository.findAllByBookId(id)
|
||||
.stream()
|
||||
.map(f -> toFigureResponse(id, f))
|
||||
.toList();
|
||||
return ResponseEntity.ok(responses);
|
||||
}
|
||||
|
||||
private FigureResponse toFigureResponse(UUID bookId, FigureEntity f) {
|
||||
String filename = f.getImagePath().substring(f.getImagePath().lastIndexOf('/') + 1);
|
||||
String imageUrl = "/api/v1/figures/" + bookId + "/" + filename;
|
||||
return new FigureResponse(
|
||||
f.getId(), f.getLabel(), f.getCaption(),
|
||||
f.getFigureType().name(), f.getPage(), imageUrl,
|
||||
f.getSectionId(),
|
||||
null // section title not eagerly loaded here
|
||||
);
|
||||
}
|
||||
|
||||
private Map<String, Object> toSummaryResponse(Book book) {
|
||||
return Map.of(
|
||||
"id", book.getId(),
|
||||
|
||||
@@ -1,41 +1,75 @@
|
||||
package com.aiteacher.book;
|
||||
|
||||
import com.aiteacher.document.*;
|
||||
import com.aiteacher.figure.FigureStorageService;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.ai.document.Document;
|
||||
import org.springframework.ai.reader.pdf.PagePdfDocumentReader;
|
||||
import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig;
|
||||
import org.springframework.ai.vectorstore.VectorStore;
|
||||
import org.springframework.ai.vectorstore.filter.FilterExpressionBuilder;
|
||||
import org.springframework.core.io.FileSystemResource;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.scheduling.annotation.Async;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.transaction.annotation.Transactional;
|
||||
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
import java.util.regex.Pattern;
|
||||
import java.time.Instant;
|
||||
import java.util.*;
|
||||
|
||||
@Service
|
||||
public class BookEmbeddingService {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(BookEmbeddingService.class);
|
||||
|
||||
// Pattern to detect diagram/figure captions
|
||||
private static final Pattern CAPTION_PATTERN =
|
||||
Pattern.compile("^(Figure|Fig\\.|Table|Diagram)\\s+[\\d.]+", Pattern.CASE_INSENSITIVE);
|
||||
|
||||
private final VectorStore vectorStore;
|
||||
private final BookRepository bookRepository;
|
||||
|
||||
public BookEmbeddingService(VectorStore vectorStore, BookRepository bookRepository) {
|
||||
@Value("${app.embedding.batch-size:50}")
|
||||
private int embeddingBatchSize;
|
||||
|
||||
@Value("${app.embedding.batch-delay-ms:1000}")
|
||||
private long embeddingBatchDelayMs;
|
||||
private final PdfStructureParser pdfStructureParser;
|
||||
private final FigureExtractionService figureExtractionService;
|
||||
private final VisionDescriptionService visionDescriptionService;
|
||||
private final TextChunkingService textChunkingService;
|
||||
private final ChunkFigureRefService chunkFigureRefService;
|
||||
private final SectionRepository sectionRepository;
|
||||
private final ChapterRepository chapterRepository;
|
||||
private final FigureRepository figureRepository;
|
||||
private final ChunkFigureRefRepository chunkFigureRefRepository;
|
||||
private final FigureStorageService figureStorageService;
|
||||
|
||||
/**
 * Wires the embedding pipeline with all of its collaborators.
 *
 * <p>The constructor only stores the references; no work is performed here.
 */
public BookEmbeddingService(
        VectorStore vectorStore,
        BookRepository bookRepository,
        PdfStructureParser pdfStructureParser,
        FigureExtractionService figureExtractionService,
        VisionDescriptionService visionDescriptionService,
        TextChunkingService textChunkingService,
        ChunkFigureRefService chunkFigureRefService,
        SectionRepository sectionRepository,
        ChapterRepository chapterRepository,
        FigureRepository figureRepository,
        ChunkFigureRefRepository chunkFigureRefRepository,
        FigureStorageService figureStorageService) {
    // Stores
    this.vectorStore = vectorStore;
    this.figureStorageService = figureStorageService;

    // Repositories
    this.bookRepository = bookRepository;
    this.sectionRepository = sectionRepository;
    this.chapterRepository = chapterRepository;
    this.figureRepository = figureRepository;
    this.chunkFigureRefRepository = chunkFigureRefRepository;

    // Processing services
    this.pdfStructureParser = pdfStructureParser;
    this.figureExtractionService = figureExtractionService;
    this.visionDescriptionService = visionDescriptionService;
    this.textChunkingService = textChunkingService;
    this.chunkFigureRefService = chunkFigureRefService;
}
|
||||
|
||||
@Async
|
||||
public void embedBook(UUID bookId, String bookTitle, Path pdfPath) {
|
||||
log.info("Starting embedding for book {} ({})", bookId, bookTitle);
|
||||
log.info("Starting image-aware embedding for book {} ({})", bookId, bookTitle);
|
||||
|
||||
Book book = bookRepository.findById(bookId).orElse(null);
|
||||
if (book == null) {
|
||||
@@ -47,29 +81,68 @@ public class BookEmbeddingService {
|
||||
book.setStatus(BookStatus.PROCESSING);
|
||||
bookRepository.save(book);
|
||||
|
||||
PagePdfDocumentReader reader = new PagePdfDocumentReader(
|
||||
new FileSystemResource(pdfPath.toFile()),
|
||||
PdfDocumentReaderConfig.builder()
|
||||
.withPagesPerDocument(1)
|
||||
.build()
|
||||
);
|
||||
// Step 1: Parse PDF into page-level sections persisted in Postgres
|
||||
List<SectionEntity> sections = pdfStructureParser.parse(bookId, bookTitle, pdfPath);
|
||||
String chapterId = bookId + "-ch1";
|
||||
|
||||
List<Document> pages = reader.get();
|
||||
int pageCount = pages.size();
|
||||
// Step 2: Build and embed text chunks for all sections in batches
|
||||
List<Document> allChunks = new ArrayList<>();
|
||||
for (SectionEntity section : sections) {
|
||||
List<Document> chunks = textChunkingService.chunk(section, bookTitle);
|
||||
allChunks.addAll(chunks);
|
||||
}
|
||||
embedInBatches(allChunks, bookId);
|
||||
log.info("Embedded {} text chunks for book {}", allChunks.size(), bookId);
|
||||
|
||||
// Enrich metadata and tag diagram captions
|
||||
List<Document> enriched = pages.stream()
|
||||
.map(doc -> enrichDocument(doc, bookId.toString(), bookTitle))
|
||||
.toList();
|
||||
// Step 3: Extract images from the PDF, save to file store, persist FigureEntity
|
||||
List<FigureEntity> figures = figureExtractionService.extract(
|
||||
bookId, chapterId, sections, pdfPath);
|
||||
|
||||
vectorStore.add(enriched);
|
||||
// Step 4: For each figure, generate vision description and embed caption
|
||||
for (FigureEntity figure : figures) {
|
||||
Path imagePath = figureStorageService.resolve(figure.getImagePath());
|
||||
String description = visionDescriptionService.describe(
|
||||
imagePath, figure.getCaption());
|
||||
|
||||
// Use description as caption fallback if no caption was detected
|
||||
if (figure.getCaption() == null || figure.getCaption().isBlank()) {
|
||||
figure.setCaption(description);
|
||||
figureRepository.save(figure);
|
||||
}
|
||||
|
||||
// Content for embedding = vision description + caption for maximum signal
|
||||
String embeddingContent = description
|
||||
+ (figure.getCaption() != null ? "\n" + figure.getCaption() : "");
|
||||
|
||||
String embeddingId = UUID.randomUUID().toString();
|
||||
Map<String, Object> metadata = buildFigureMetadata(figure, bookTitle, embeddingId);
|
||||
Document figureDoc = new Document(embeddingId, embeddingContent, metadata);
|
||||
vectorStore.add(List.of(figureDoc));
|
||||
|
||||
figure.setCaptionEmbeddingId(UUID.fromString(embeddingId));
|
||||
figureRepository.save(figure);
|
||||
}
|
||||
log.info("Embedded {} figure captions for book {}", figures.size(), bookId);
|
||||
|
||||
// Step 5: Link text chunks to figures via text references
|
||||
for (SectionEntity section : sections) {
|
||||
List<Document> sectionChunks = allChunks.stream()
|
||||
.filter(d -> section.getId().equals(d.getMetadata().get("section_id")))
|
||||
.toList();
|
||||
List<FigureEntity> sectionFigures = figures.stream()
|
||||
.filter(f -> section.getId().equals(f.getSectionId()))
|
||||
.toList();
|
||||
chunkFigureRefService.linkChunksToFigures(
|
||||
sectionChunks, sectionFigures, section.getPageStart());
|
||||
}
|
||||
|
||||
book.setStatus(BookStatus.READY);
|
||||
book.setPageCount(pageCount);
|
||||
book.setProcessedAt(java.time.Instant.now());
|
||||
book.setPageCount(sections.size());
|
||||
book.setProcessedAt(Instant.now());
|
||||
bookRepository.save(book);
|
||||
|
||||
log.info("Finished embedding book {} — {} pages", bookId, pageCount);
|
||||
log.info("Finished embedding book {} — {} pages, {} figures",
|
||||
bookId, sections.size(), figures.size());
|
||||
|
||||
} catch (Exception ex) {
|
||||
log.error("Failed to embed book {}", bookId, ex);
|
||||
@@ -79,40 +152,74 @@ public class BookEmbeddingService {
|
||||
}
|
||||
}
|
||||
|
||||
private Document enrichDocument(Document doc, String bookId, String bookTitle) {
|
||||
String content = doc.getText();
|
||||
String chunkType = detectChunkType(content);
|
||||
@Transactional
|
||||
public void deleteBookChunks(UUID bookId) {
|
||||
log.info("Deleting all data for book {}", bookId);
|
||||
try {
|
||||
// Delete chunk-figure refs (by figureId for this book)
|
||||
List<String> figureIds = figureRepository.findAllByBookId(bookId)
|
||||
.stream().map(FigureEntity::getId).toList();
|
||||
if (!figureIds.isEmpty()) {
|
||||
chunkFigureRefRepository.deleteByFigureIdIn(figureIds);
|
||||
}
|
||||
|
||||
doc.getMetadata().put("book_id", bookId);
|
||||
doc.getMetadata().put("book_title", bookTitle);
|
||||
doc.getMetadata().put("chunk_type", chunkType);
|
||||
// Delete figures from Postgres
|
||||
figureRepository.deleteAllByBookId(bookId);
|
||||
|
||||
return doc;
|
||||
// Delete figure files from disk
|
||||
figureStorageService.deleteAll(bookId);
|
||||
|
||||
// Delete sections and chapters from Postgres
|
||||
sectionRepository.deleteAllByBookId(bookId);
|
||||
chapterRepository.deleteAllByBookId(bookId);
|
||||
|
||||
// Delete vector store entries (text chunks + figure embeddings)
|
||||
FilterExpressionBuilder b = new FilterExpressionBuilder();
|
||||
vectorStore.delete(b.eq("book_id", bookId.toString()).build());
|
||||
|
||||
} catch (Exception ex) {
|
||||
log.warn("Error during cleanup for book {}: {}", bookId, ex.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
private String detectChunkType(String content) {
|
||||
if (content != null) {
|
||||
for (String line : content.split("\\r?\\n")) {
|
||||
if (CAPTION_PATTERN.matcher(line.trim()).find()) {
|
||||
return "diagram";
|
||||
private void embedInBatches(List<Document> docs, UUID bookId) {
|
||||
int total = docs.size();
|
||||
for (int i = 0; i < total; i += embeddingBatchSize) {
|
||||
List<Document> batch = docs.subList(i, Math.min(i + embeddingBatchSize, total));
|
||||
vectorStore.add(batch);
|
||||
int batchNum = i / embeddingBatchSize + 1;
|
||||
int totalBatches = (total - 1) / embeddingBatchSize + 1;
|
||||
log.debug("Embedded batch {}/{} for book {}", batchNum, totalBatches, bookId);
|
||||
if (i + embeddingBatchSize < total) {
|
||||
try {
|
||||
Thread.sleep(embeddingBatchDelayMs);
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
log.warn("Embedding batch sleep interrupted for book {}", bookId);
|
||||
}
|
||||
}
|
||||
}
|
||||
return "text";
|
||||
}
|
||||
|
||||
public void deleteBookChunks(UUID bookId) {
|
||||
log.info("Deleting vector chunks for book {}", bookId);
|
||||
try {
|
||||
FilterExpressionBuilder b = new FilterExpressionBuilder();
|
||||
vectorStore.delete(b.eq("book_id", bookId.toString()).build());
|
||||
} catch (Exception ex) {
|
||||
log.warn("Could not delete vector chunks for book {}: {}", bookId, ex.getMessage());
|
||||
}
|
||||
private Map<String, Object> buildFigureMetadata(FigureEntity figure, String bookTitle,
|
||||
String embeddingId) {
|
||||
Map<String, Object> m = new HashMap<>();
|
||||
m.put("type", "FIGURE");
|
||||
m.put("book_id", figure.getBookId().toString());
|
||||
m.put("book_title", bookTitle);
|
||||
m.put("chapter_id", figure.getChapterId() != null ? figure.getChapterId() : "");
|
||||
m.put("section_id", figure.getSectionId() != null ? figure.getSectionId() : "");
|
||||
m.put("figure_id", figure.getId());
|
||||
m.put("figure_type", figure.getFigureType().name());
|
||||
m.put("image_path", figure.getImagePath());
|
||||
m.put("label", figure.getLabel() != null ? figure.getLabel() : "");
|
||||
m.put("page", figure.getPage());
|
||||
m.put("embedding_id", embeddingId);
|
||||
return m;
|
||||
}
|
||||
|
||||
private String truncate(String message, int maxLength) {
|
||||
if (message == null) return null;
|
||||
return message.length() <= maxLength ? message : message.substring(0, maxLength);
|
||||
private String truncate(String msg, int max) {
|
||||
if (msg == null) return null;
|
||||
return msg.length() <= max ? msg : msg.substring(0, max);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
package com.aiteacher.book;
|
||||
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.List;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.UUID;
|
||||
@@ -15,10 +17,15 @@ public class BookService {
|
||||
|
||||
private final BookRepository bookRepository;
|
||||
private final BookEmbeddingService bookEmbeddingService;
|
||||
private final Path bookStoragePath;
|
||||
|
||||
public BookService(BookRepository bookRepository, BookEmbeddingService bookEmbeddingService) {
|
||||
/**
 * Creates the service and derives the directory in which uploaded PDFs are kept.
 *
 * <p>The PDF directory is the {@code books} sub-directory of the absolute, normalized
 * form of {@code basePath}.
 *
 * @param bookRepository       repository for book rows
 * @param bookEmbeddingService service that embeds and cleans up book content
 * @param basePath             value of {@code app.figure-storage.base-path}
 *                             ({@code ./uploads} when the property is not set)
 */
public BookService(
        BookRepository bookRepository,
        BookEmbeddingService bookEmbeddingService,
        @Value("${app.figure-storage.base-path:./uploads}") String basePath) {
    Path storageRoot = Paths.get(basePath).toAbsolutePath().normalize();
    this.bookStoragePath = storageRoot.resolve("books");
    this.bookRepository = bookRepository;
    this.bookEmbeddingService = bookEmbeddingService;
}
|
||||
|
||||
public Book upload(MultipartFile file) throws IOException {
|
||||
@@ -28,20 +35,35 @@ public class BookService {
|
||||
}
|
||||
|
||||
String title = deriveTitle(originalFilename);
|
||||
|
||||
Book book = new Book(title, originalFilename, file.getSize());
|
||||
book = bookRepository.save(book);
|
||||
|
||||
// Write to a temp file so the async task can read it
|
||||
Path tempFile = Files.createTempFile("aiteacher-", "-" + book.getId() + ".pdf");
|
||||
file.transferTo(tempFile.toFile());
|
||||
// Persist PDF in a stable location for potential re-embedding
|
||||
Files.createDirectories(bookStoragePath);
|
||||
Path pdfPath = bookStoragePath.resolve(book.getId() + ".pdf");
|
||||
file.transferTo(pdfPath.toFile());
|
||||
|
||||
UUID bookId = book.getId();
|
||||
Path pdfPath = tempFile;
|
||||
String bookTitle = title;
|
||||
bookEmbeddingService.embedBook(bookId, title, pdfPath);
|
||||
return book;
|
||||
}
|
||||
|
||||
bookEmbeddingService.embedBook(bookId, bookTitle, pdfPath);
|
||||
public Book reembed(UUID id) {
|
||||
Book book = bookRepository.findById(id)
|
||||
.orElseThrow(() -> new NoSuchElementException("Book not found."));
|
||||
|
||||
if (book.getStatus() == BookStatus.PROCESSING) {
|
||||
throw new IllegalStateException("Book is already being processed.");
|
||||
}
|
||||
|
||||
Path pdfPath = bookStoragePath.resolve(id + ".pdf");
|
||||
if (!Files.exists(pdfPath)) {
|
||||
throw new IllegalStateException(
|
||||
"Original PDF not found. Please re-upload the book before re-embedding.");
|
||||
}
|
||||
|
||||
bookEmbeddingService.deleteBookChunks(id);
|
||||
bookEmbeddingService.embedBook(id, book.getTitle(), pdfPath);
|
||||
return book;
|
||||
}
|
||||
|
||||
@@ -63,14 +85,21 @@ public class BookService {
|
||||
}
|
||||
|
||||
bookEmbeddingService.deleteBookChunks(id);
|
||||
|
||||
// Delete the stored PDF
|
||||
Path pdfPath = bookStoragePath.resolve(id + ".pdf");
|
||||
try {
|
||||
Files.deleteIfExists(pdfPath);
|
||||
} catch (IOException ex) {
|
||||
// Non-fatal — log only
|
||||
}
|
||||
|
||||
bookRepository.deleteById(id);
|
||||
}
|
||||
|
||||
private String deriveTitle(String filename) {
|
||||
// Strip .pdf extension and replace separators with spaces
|
||||
String name = filename.replaceAll("(?i)\\.pdf$", "");
|
||||
name = name.replaceAll("[-_]", " ");
|
||||
// Capitalise first letter
|
||||
if (!name.isEmpty()) {
|
||||
name = Character.toUpperCase(name.charAt(0)) + name.substring(1);
|
||||
}
|
||||
|
||||
@@ -0,0 +1,12 @@
|
||||
package com.aiteacher.book;
|
||||
|
||||
/**
 * API representation of a single figure of a book.
 *
 * @param figureId     identifier of the figure
 * @param label        label of the figure
 * @param caption      caption text of the figure
 * @param figureType   name of the figure's type
 * @param page         page the figure is on
 * @param imageUrl     URL under which the image can be fetched
 * @param sectionId    identifier of the section the figure belongs to
 * @param sectionTitle title of that section
 */
public record FigureResponse(
        String figureId, String label, String caption, String figureType,
        int page, String imageUrl,
        String sectionId, String sectionTitle) {
}
|
||||
@@ -3,22 +3,16 @@ package com.aiteacher.chat;
|
||||
import com.aiteacher.book.BookRepository;
|
||||
import com.aiteacher.book.BookStatus;
|
||||
import com.aiteacher.book.NoKnowledgeSourceException;
|
||||
import com.aiteacher.document.FigureEntity;
|
||||
import com.aiteacher.document.SectionEntity;
|
||||
import com.aiteacher.retrieval.NeurosurgeryRetriever;
|
||||
import com.aiteacher.retrieval.RetrievalResult;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.ai.chat.client.ChatClient;
|
||||
import org.springframework.ai.chat.client.advisor.vectorstore.QuestionAnswerAdvisor;
|
||||
import org.springframework.ai.chat.model.ChatResponse;
|
||||
import org.springframework.ai.document.Document;
|
||||
import org.springframework.ai.vectorstore.SearchRequest;
|
||||
import org.springframework.ai.vectorstore.VectorStore;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.UUID;
|
||||
import java.util.*;
|
||||
|
||||
@Service
|
||||
public class ChatService {
|
||||
@@ -35,26 +29,28 @@ public class ChatService {
|
||||
- Build answers from what is present: procedures, conditions, techniques, and descriptions all contribute; combine them into a rich, structured response
|
||||
- Use clear structure: headings, bullet points, or numbered steps where appropriate to maximize clarity
|
||||
- Only say you cannot answer if the context is entirely unrelated to the question
|
||||
- Cite sources for each major point (book title and page number from the context metadata)
|
||||
- Cite sources for each major point (book title and page number from the context)
|
||||
- When referencing diagrams or figures, cite them as [Fig. X, p.N]
|
||||
- Maintain continuity with the conversation history
|
||||
- Never fabricate clinical information not present in the context
|
||||
""";
|
||||
|
||||
private final ChatClient chatClient;
|
||||
private final VectorStore vectorStore;
|
||||
private final BookRepository bookRepository;
|
||||
private final ChatSessionRepository sessionRepository;
|
||||
private final MessageRepository messageRepository;
|
||||
private final NeurosurgeryRetriever retriever;
|
||||
|
||||
public ChatService(ChatClient chatClient, VectorStore vectorStore,
|
||||
public ChatService(ChatClient chatClient,
|
||||
BookRepository bookRepository,
|
||||
ChatSessionRepository sessionRepository,
|
||||
MessageRepository messageRepository) {
|
||||
MessageRepository messageRepository,
|
||||
NeurosurgeryRetriever retriever) {
|
||||
this.chatClient = chatClient;
|
||||
this.vectorStore = vectorStore;
|
||||
this.bookRepository = bookRepository;
|
||||
this.sessionRepository = sessionRepository;
|
||||
this.messageRepository = messageRepository;
|
||||
this.retriever = retriever;
|
||||
}
|
||||
|
||||
public ChatSession createSession(String topicId) {
|
||||
@@ -73,7 +69,11 @@ public class ChatService {
|
||||
ChatSession session = sessionRepository.findById(sessionId)
|
||||
.orElseThrow(() -> new NoSuchElementException("Session not found."));
|
||||
|
||||
if (!bookRepository.existsByStatus(BookStatus.READY)) {
|
||||
List<com.aiteacher.book.Book> readyBooks = bookRepository.findAll().stream()
|
||||
.filter(b -> b.getStatus() == BookStatus.READY)
|
||||
.toList();
|
||||
|
||||
if (readyBooks.isEmpty()) {
|
||||
throw new NoKnowledgeSourceException("No books are available as knowledge sources.");
|
||||
}
|
||||
|
||||
@@ -81,27 +81,31 @@ public class ChatService {
|
||||
Message userMessage = new Message(sessionId, MessageRole.USER, userContent);
|
||||
messageRepository.save(userMessage);
|
||||
|
||||
// Build conversation history for context
|
||||
// Build full question with conversation history
|
||||
List<Message> history = messageRepository.findBySessionIdOrderByCreatedAtAsc(sessionId);
|
||||
|
||||
// Build the prompt with full conversation history as context
|
||||
String fullQuestion = buildQuestionWithHistory(history, userContent, session.getTopicId());
|
||||
|
||||
var qaAdvisor = QuestionAnswerAdvisor.builder(vectorStore)
|
||||
.searchRequest(SearchRequest.builder().similarityThreshold(0.5d).topK(6).build())
|
||||
.build();
|
||||
|
||||
ChatResponse response = chatClient.prompt()
|
||||
.advisors(qaAdvisor)
|
||||
// Retrieve context from all ready books (aggregate across books)
|
||||
List<SectionEntity> allSections = new ArrayList<>();
|
||||
List<FigureEntity> allFigures = new ArrayList<>();
|
||||
for (com.aiteacher.book.Book book : readyBooks) {
|
||||
RetrievalResult result = retriever.retrieve(fullQuestion, book.getId());
|
||||
allSections.addAll(result.parentSections());
|
||||
allFigures.addAll(result.figures());
|
||||
}
|
||||
|
||||
// Build LLM prompt with section full texts and figure references
|
||||
String contextPrompt = buildContextPrompt(fullQuestion, allSections, allFigures);
|
||||
|
||||
String assistantContent = chatClient.prompt()
|
||||
.system(SYSTEM_PROMPT)
|
||||
.user(fullQuestion)
|
||||
.user(contextPrompt)
|
||||
.call()
|
||||
.chatResponse();
|
||||
.content();
|
||||
|
||||
String assistantContent = response.getResult().getOutput().getText();
|
||||
List<Map<String, Object>> sources = extractSources(response);
|
||||
// Build sources list with TEXT and FIGURE entries
|
||||
List<Map<String, Object>> sources = buildSources(allSections, allFigures);
|
||||
|
||||
// Persist assistant message
|
||||
Message assistantMessage = new Message(sessionId, MessageRole.ASSISTANT, assistantContent);
|
||||
assistantMessage.setSources(sources);
|
||||
return messageRepository.save(assistantMessage);
|
||||
@@ -118,24 +122,95 @@ public class ChatService {
|
||||
sessionRepository.deleteById(sessionId);
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Private helpers
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
private String buildContextPrompt(String question,
|
||||
List<SectionEntity> sections,
|
||||
List<FigureEntity> figures) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
if (!sections.isEmpty()) {
|
||||
sb.append("CONTEXT:\n\n");
|
||||
for (SectionEntity section : sections) {
|
||||
sb.append("[").append(section.getTitle())
|
||||
.append(", p.").append(section.getPageStart()).append("]\n");
|
||||
sb.append(section.getFullText()).append("\n\n");
|
||||
}
|
||||
}
|
||||
|
||||
if (!figures.isEmpty()) {
|
||||
sb.append("AVAILABLE FIGURES:\n");
|
||||
for (FigureEntity figure : figures) {
|
||||
sb.append("- ").append(figure.getLabel() != null ? figure.getLabel() : "Figure")
|
||||
.append(" (p.").append(figure.getPage()).append("): ")
|
||||
.append(figure.getCaption() != null ? figure.getCaption() : "")
|
||||
.append("\n");
|
||||
}
|
||||
sb.append("\nWhen referencing diagrams, cite them as [Fig. X, p.N].\n\n");
|
||||
}
|
||||
|
||||
sb.append("QUESTION:\n").append(question);
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
private List<Map<String, Object>> buildSources(List<SectionEntity> sections,
|
||||
List<FigureEntity> figures) {
|
||||
List<Map<String, Object>> sources = new ArrayList<>();
|
||||
|
||||
for (SectionEntity section : sections) {
|
||||
Map<String, Object> source = new LinkedHashMap<>();
|
||||
source.put("type", "TEXT");
|
||||
source.put("bookTitle", deriveTitleFromSection(section));
|
||||
source.put("page", section.getPageStart());
|
||||
source.put("chunkText", truncate(section.getFullText(), 500));
|
||||
sources.add(source);
|
||||
}
|
||||
|
||||
for (FigureEntity figure : figures) {
|
||||
Map<String, Object> source = new LinkedHashMap<>();
|
||||
source.put("type", "FIGURE");
|
||||
source.put("bookTitle", bookRepository.findById(figure.getBookId())
|
||||
.map(com.aiteacher.book.Book::getTitle).orElse("Book"));
|
||||
source.put("page", figure.getPage());
|
||||
source.put("figureId", figure.getId());
|
||||
source.put("label", figure.getLabel() != null ? figure.getLabel() : "");
|
||||
source.put("caption", figure.getCaption() != null ? figure.getCaption() : "");
|
||||
source.put("figureType", figure.getFigureType().name());
|
||||
// imageUrl assembled from relative path: figures/{bookId}/{filename}
|
||||
String filename = figure.getImagePath().substring(
|
||||
figure.getImagePath().lastIndexOf('/') + 1);
|
||||
source.put("imageUrl", "/api/v1/figures/" + figure.getBookId() + "/" + filename);
|
||||
sources.add(source);
|
||||
}
|
||||
|
||||
return sources;
|
||||
}
|
||||
|
||||
private String deriveTitleFromSection(SectionEntity section) {
|
||||
if (section == null) return "Book";
|
||||
return bookRepository.findById(section.getBookId())
|
||||
.map(com.aiteacher.book.Book::getTitle)
|
||||
.orElse("Book");
|
||||
}
|
||||
|
||||
private String buildQuestionWithHistory(List<Message> history, String currentQuestion,
|
||||
String topicId) {
|
||||
boolean hasTopic = topicId != null && !topicId.equals("free-form");
|
||||
|
||||
if (history.size() <= 1) {
|
||||
return hasTopic
|
||||
? String.format("[Context: This is a question about the neurosurgery topic '%s']\n%s",
|
||||
? String.format("[Context: question about neurosurgery topic '%s']\n%s",
|
||||
topicId, currentQuestion)
|
||||
: currentQuestion;
|
||||
}
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
if (hasTopic) {
|
||||
sb.append(String.format("[Context: This conversation is about the neurosurgery topic '%s']\n\n",
|
||||
topicId));
|
||||
sb.append(String.format("[Context: conversation about '%s']\n\n", topicId));
|
||||
}
|
||||
sb.append("Previous conversation:\n");
|
||||
// Include all messages except the last (which is the current user message just saved)
|
||||
for (int i = 0; i < history.size() - 1; i++) {
|
||||
Message msg = history.get(i);
|
||||
sb.append(msg.getRole().name()).append(": ").append(msg.getContent()).append("\n");
|
||||
@@ -144,30 +219,8 @@ public class ChatService {
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
private List<Map<String, Object>> extractSources(ChatResponse response) {
|
||||
List<Map<String, Object>> sources = new ArrayList<>();
|
||||
|
||||
if (response.getMetadata() != null) {
|
||||
Object retrieved = response.getMetadata().get(QuestionAnswerAdvisor.RETRIEVED_DOCUMENTS);
|
||||
if (retrieved instanceof List<?> docs) {
|
||||
for (Object docObj : docs) {
|
||||
if (docObj instanceof Document doc) {
|
||||
Map<String, Object> metadata = doc.getMetadata();
|
||||
String bookTitle = (String) metadata.get("book_title");
|
||||
Object pageObj = metadata.get("page_number");
|
||||
Integer page = pageObj instanceof Number n ? n.intValue() : null;
|
||||
if (bookTitle != null) {
|
||||
Map<String, Object> source = new HashMap<>();
|
||||
source.put("bookTitle", bookTitle);
|
||||
source.put("page", page);
|
||||
source.put("chunkText", doc.getText());
|
||||
sources.add(source);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return sources;
|
||||
private String truncate(String text, int maxChars) {
|
||||
if (text == null) return "";
|
||||
return text.length() <= maxChars ? text : text.substring(0, maxChars) + "…";
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,25 @@
|
||||
package com.aiteacher.config;
|
||||
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
import org.springframework.web.servlet.config.annotation.ResourceHandlerRegistry;
|
||||
import org.springframework.web.servlet.config.annotation.WebMvcConfigurer;
|
||||
|
||||
import java.nio.file.Paths;
|
||||
|
||||
@Configuration
|
||||
public class FigureStorageConfig implements WebMvcConfigurer {
|
||||
|
||||
private final String basePath;
|
||||
|
||||
public FigureStorageConfig(@Value("${app.figure-storage.base-path:./uploads}") String basePath) {
|
||||
this.basePath = Paths.get(basePath).toAbsolutePath().normalize().toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void addResourceHandlers(ResourceHandlerRegistry registry) {
|
||||
// Serve GET /api/v1/figures/** from the local file store
|
||||
registry.addResourceHandler("/api/v1/figures/**")
|
||||
.addResourceLocations("file:" + basePath + "/figures/");
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,47 @@
|
||||
package com.aiteacher.document;
|
||||
|
||||
import jakarta.persistence.*;
|
||||
import java.time.Instant;
|
||||
import java.util.UUID;
|
||||
|
||||
@Entity
|
||||
@Table(name = "chapter")
|
||||
public class ChapterEntity {
|
||||
|
||||
@Id
|
||||
@Column(name = "id", length = 200)
|
||||
private String id;
|
||||
|
||||
@Column(name = "book_id", nullable = false)
|
||||
private UUID bookId;
|
||||
|
||||
@Column(name = "number", nullable = false)
|
||||
private int number;
|
||||
|
||||
@Column(name = "title", length = 500)
|
||||
private String title;
|
||||
|
||||
@Column(name = "page_start")
|
||||
private Integer pageStart;
|
||||
|
||||
@Column(name = "created_at", nullable = false)
|
||||
private Instant createdAt;
|
||||
|
||||
public ChapterEntity() {}
|
||||
|
||||
public ChapterEntity(String id, UUID bookId, int number, String title, Integer pageStart) {
|
||||
this.id = id;
|
||||
this.bookId = bookId;
|
||||
this.number = number;
|
||||
this.title = title;
|
||||
this.pageStart = pageStart;
|
||||
this.createdAt = Instant.now();
|
||||
}
|
||||
|
||||
public String getId() { return id; }
|
||||
public UUID getBookId() { return bookId; }
|
||||
public int getNumber() { return number; }
|
||||
public String getTitle() { return title; }
|
||||
public Integer getPageStart() { return pageStart; }
|
||||
public Instant getCreatedAt() { return createdAt; }
|
||||
}
|
||||
@@ -0,0 +1,9 @@
|
||||
package com.aiteacher.document;
|
||||
|
||||
import org.springframework.data.jpa.repository.JpaRepository;
|
||||
|
||||
import java.util.UUID;
|
||||
|
||||
public interface ChapterRepository extends JpaRepository<ChapterEntity, String> {
|
||||
void deleteAllByBookId(UUID bookId);
|
||||
}
|
||||
@@ -0,0 +1,58 @@
|
||||
package com.aiteacher.document;
|
||||
|
||||
import jakarta.persistence.*;
|
||||
import java.io.Serializable;
|
||||
import java.util.Objects;
|
||||
import java.util.UUID;
|
||||
|
||||
@Entity
|
||||
@Table(name = "chunk_figure_ref")
|
||||
@IdClass(ChunkFigureRefEntity.PK.class)
|
||||
public class ChunkFigureRefEntity {
|
||||
|
||||
@Id
|
||||
@Column(name = "chunk_id", nullable = false)
|
||||
private UUID chunkId;
|
||||
|
||||
@Id
|
||||
@Column(name = "figure_id", nullable = false, length = 200)
|
||||
private String figureId;
|
||||
|
||||
@Column(name = "mention_page")
|
||||
private Integer mentionPage;
|
||||
|
||||
public ChunkFigureRefEntity() {}
|
||||
|
||||
public ChunkFigureRefEntity(UUID chunkId, String figureId, Integer mentionPage) {
|
||||
this.chunkId = chunkId;
|
||||
this.figureId = figureId;
|
||||
this.mentionPage = mentionPage;
|
||||
}
|
||||
|
||||
public UUID getChunkId() { return chunkId; }
|
||||
public String getFigureId() { return figureId; }
|
||||
public Integer getMentionPage() { return mentionPage; }
|
||||
|
||||
public static class PK implements Serializable {
|
||||
private UUID chunkId;
|
||||
private String figureId;
|
||||
|
||||
public PK() {}
|
||||
public PK(UUID chunkId, String figureId) {
|
||||
this.chunkId = chunkId;
|
||||
this.figureId = figureId;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (!(o instanceof PK pk)) return false;
|
||||
return Objects.equals(chunkId, pk.chunkId) && Objects.equals(figureId, pk.figureId);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(chunkId, figureId);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,18 @@
|
||||
package com.aiteacher.document;
|
||||
|
||||
import org.springframework.data.jpa.repository.JpaRepository;
|
||||
import org.springframework.data.jpa.repository.Query;
|
||||
import org.springframework.data.repository.query.Param;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
|
||||
public interface ChunkFigureRefRepository extends JpaRepository<ChunkFigureRefEntity, ChunkFigureRefEntity.PK> {
|
||||
|
||||
@Query("SELECT r FROM ChunkFigureRefEntity r WHERE r.chunkId IN :chunkIds")
|
||||
List<ChunkFigureRefEntity> findByChunkIdIn(@Param("chunkIds") List<UUID> chunkIds);
|
||||
|
||||
@Query("DELETE FROM ChunkFigureRefEntity r WHERE r.figureId IN :figureIds")
|
||||
@org.springframework.data.jpa.repository.Modifying
|
||||
void deleteByFigureIdIn(@Param("figureIds") List<String> figureIds);
|
||||
}
|
||||
@@ -0,0 +1,62 @@
|
||||
package com.aiteacher.document;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.ai.document.Document;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* Scans chunk text for "Fig. X" and "Figure X" references and persists
|
||||
* ChunkFigureRefEntity rows linking that chunk to its referenced figures.
|
||||
*/
|
||||
@Service
|
||||
public class ChunkFigureRefService {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(ChunkFigureRefService.class);
|
||||
|
||||
// Matches: "Fig. 12-4", "Fig. 12.4", "Fig 12", "Figure 12-4", etc.
|
||||
private static final Pattern REF_PATTERN =
|
||||
Pattern.compile("(?i)\\b(Fig\\.?|Figure)\\s+(\\d+[\\-.\\d]*)");
|
||||
|
||||
private final ChunkFigureRefRepository refRepository;
|
||||
|
||||
public ChunkFigureRefService(ChunkFigureRefRepository refRepository) {
|
||||
this.refRepository = refRepository;
|
||||
}
|
||||
|
||||
/**
|
||||
* For each text chunk, finds figure references and persists ChunkFigureRefEntity rows.
|
||||
*/
|
||||
public void linkChunksToFigures(List<Document> chunks, List<FigureEntity> bookFigures,
|
||||
int pageNum) {
|
||||
if (bookFigures.isEmpty()) return;
|
||||
|
||||
for (Document chunk : chunks) {
|
||||
String chunkIdStr = chunk.getId();
|
||||
UUID chunkId;
|
||||
try {
|
||||
chunkId = UUID.fromString(chunkIdStr);
|
||||
} catch (IllegalArgumentException ex) {
|
||||
log.warn("Chunk has non-UUID id: {}", chunkIdStr);
|
||||
continue;
|
||||
}
|
||||
|
||||
Matcher m = REF_PATTERN.matcher(chunk.getText());
|
||||
while (m.find()) {
|
||||
String refNum = m.group(2).trim();
|
||||
// Find matching figure by label suffix
|
||||
for (FigureEntity figure : bookFigures) {
|
||||
if (figure.getLabel() != null && figure.getLabel().endsWith(refNum)) {
|
||||
refRepository.save(new ChunkFigureRefEntity(chunkId, figure.getId(), pageNum));
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,82 @@
|
||||
package com.aiteacher.document;
|
||||
|
||||
import jakarta.persistence.*;
|
||||
import java.time.Instant;
|
||||
import java.util.UUID;
|
||||
|
||||
@Entity
|
||||
@Table(name = "figure")
|
||||
public class FigureEntity {
|
||||
|
||||
@Id
|
||||
@Column(name = "id", length = 200)
|
||||
private String id;
|
||||
|
||||
@Column(name = "book_id", nullable = false)
|
||||
private UUID bookId;
|
||||
|
||||
@Column(name = "section_id", length = 200)
|
||||
private String sectionId;
|
||||
|
||||
@Column(name = "chapter_id", length = 200)
|
||||
private String chapterId;
|
||||
|
||||
@Column(name = "label", length = 100)
|
||||
private String label;
|
||||
|
||||
@Column(name = "caption", columnDefinition = "TEXT")
|
||||
private String caption;
|
||||
|
||||
@Enumerated(EnumType.STRING)
|
||||
@Column(name = "figure_type", nullable = false, length = 50)
|
||||
private FigureType figureType;
|
||||
|
||||
@Column(name = "page", nullable = false)
|
||||
private int page;
|
||||
|
||||
@Column(name = "image_path", nullable = false, length = 1000)
|
||||
private String imagePath;
|
||||
|
||||
@Column(name = "caption_embedding_id")
|
||||
private UUID captionEmbeddingId;
|
||||
|
||||
@Column(name = "created_at", nullable = false)
|
||||
private Instant createdAt;
|
||||
|
||||
public FigureEntity() {}
|
||||
|
||||
public FigureEntity(String id, UUID bookId, String sectionId, String chapterId,
|
||||
String label, String caption, FigureType figureType,
|
||||
int page, String imagePath) {
|
||||
this.id = id;
|
||||
this.bookId = bookId;
|
||||
this.sectionId = sectionId;
|
||||
this.chapterId = chapterId;
|
||||
this.label = label;
|
||||
this.caption = caption;
|
||||
this.figureType = figureType;
|
||||
this.page = page;
|
||||
this.imagePath = imagePath;
|
||||
this.createdAt = Instant.now();
|
||||
}
|
||||
|
||||
public String getId() { return id; }
|
||||
public UUID getBookId() { return bookId; }
|
||||
public String getSectionId() { return sectionId; }
|
||||
public String getChapterId() { return chapterId; }
|
||||
public String getLabel() { return label; }
|
||||
public String getCaption() { return caption; }
|
||||
public FigureType getFigureType() { return figureType; }
|
||||
public int getPage() { return page; }
|
||||
public String getImagePath() { return imagePath; }
|
||||
public UUID getCaptionEmbeddingId() { return captionEmbeddingId; }
|
||||
public Instant getCreatedAt() { return createdAt; }
|
||||
|
||||
public void setCaptionEmbeddingId(UUID captionEmbeddingId) {
|
||||
this.captionEmbeddingId = captionEmbeddingId;
|
||||
}
|
||||
|
||||
public void setCaption(String caption) {
|
||||
this.caption = caption;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,135 @@
|
||||
package com.aiteacher.document;
|
||||
|
||||
import com.aiteacher.figure.FigureStorageService;
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
|
||||
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* Extracts images from each PDF page using PDFBox.
|
||||
* Images below the configured minimum size are skipped.
|
||||
* Caption is detected by the "Fig." pattern in page text.
|
||||
*/
|
||||
@Service
|
||||
public class FigureExtractionService {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(FigureExtractionService.class);
|
||||
|
||||
// Caption: line starting with "Fig." or "Figure" followed by a number
|
||||
private static final Pattern CAPTION_PATTERN =
|
||||
Pattern.compile("(?m)^(Fig\\.?\\s*\\d+[\\-.]?\\d*[^\\n]*)", Pattern.CASE_INSENSITIVE);
|
||||
|
||||
// Figure label: "Fig. 12-4" or "Fig. 12.4"
|
||||
private static final Pattern LABEL_PATTERN =
|
||||
Pattern.compile("(?i)Fig\\.?\\s*(\\d+[\\-.\\d]*)");
|
||||
|
||||
private final FigureStorageService storageService;
|
||||
private final FigureRepository figureRepository;
|
||||
private final int minImageSizePx;
|
||||
|
||||
public FigureExtractionService(
|
||||
FigureStorageService storageService,
|
||||
FigureRepository figureRepository,
|
||||
@Value("${app.figure-storage.min-image-size-px:100}") int minImageSizePx) {
|
||||
this.storageService = storageService;
|
||||
this.figureRepository = figureRepository;
|
||||
this.minImageSizePx = minImageSizePx;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extracts all qualifying images from the PDF for the given book.
|
||||
* Returns persisted FigureEntity list (without vision descriptions — set later).
|
||||
*/
|
||||
public List<FigureEntity> extract(UUID bookId, String chapterId,
|
||||
List<SectionEntity> sections, Path pdfPath) {
|
||||
List<FigureEntity> figures = new ArrayList<>();
|
||||
int figureCounter = 0;
|
||||
|
||||
try (PDDocument doc = Loader.loadPDF(pdfPath.toFile())) {
|
||||
for (SectionEntity section : sections) {
|
||||
int pageIndex = section.getPageStart() - 1; // 0-based
|
||||
if (pageIndex < 0 || pageIndex >= doc.getNumberOfPages()) continue;
|
||||
|
||||
PDPage page = doc.getPage(pageIndex);
|
||||
String pageText = section.getFullText();
|
||||
|
||||
try {
|
||||
for (COSName name : page.getResources().getXObjectNames()) {
|
||||
PDXObject xObject = page.getResources().getXObject(name);
|
||||
if (!(xObject instanceof PDImageXObject image)) continue;
|
||||
|
||||
BufferedImage bufferedImage = image.getImage();
|
||||
if (bufferedImage.getWidth() < minImageSizePx
|
||||
|| bufferedImage.getHeight() < minImageSizePx) {
|
||||
continue; // skip decorative images
|
||||
}
|
||||
|
||||
figureCounter++;
|
||||
String figureId = bookId + "-fig-" + pageIndex + "-" + figureCounter;
|
||||
String caption = detectCaption(pageText);
|
||||
String label = detectLabel(caption, figureCounter);
|
||||
FigureType type = classifyType(caption, pageText);
|
||||
|
||||
String imagePath = storageService.save(bookId, figureId, bufferedImage);
|
||||
|
||||
FigureEntity figure = new FigureEntity(
|
||||
figureId, bookId, section.getId(), chapterId,
|
||||
label, caption, type, section.getPageStart(), imagePath
|
||||
);
|
||||
figures.add(figureRepository.save(figure));
|
||||
}
|
||||
} catch (IOException ex) {
|
||||
log.warn("Failed to extract images from page {} of book {}: {}",
|
||||
section.getPageStart(), bookId, ex.getMessage());
|
||||
}
|
||||
}
|
||||
} catch (IOException ex) {
|
||||
log.error("Could not open PDF for image extraction, book {}", bookId, ex);
|
||||
}
|
||||
|
||||
log.info("Extracted {} figures for book {}", figures.size(), bookId);
|
||||
return figures;
|
||||
}
|
||||
|
||||
private String detectCaption(String pageText) {
|
||||
if (pageText == null) return null;
|
||||
Matcher m = CAPTION_PATTERN.matcher(pageText);
|
||||
return m.find() ? m.group(1).trim() : null;
|
||||
}
|
||||
|
||||
private String detectLabel(String caption, int counter) {
|
||||
if (caption != null) {
|
||||
Matcher m = LABEL_PATTERN.matcher(caption);
|
||||
if (m.find()) return "Fig. " + m.group(1).trim();
|
||||
}
|
||||
return "Fig. " + counter;
|
||||
}
|
||||
|
||||
private FigureType classifyType(String caption, String pageText) {
|
||||
String combined = ((caption != null ? caption : "") + " " + (pageText != null ? pageText : "")).toLowerCase();
|
||||
if (combined.contains("mri") || combined.contains("ct ") || combined.contains("magnetic")
|
||||
|| combined.contains("tomography")) return FigureType.MRI_CT_SCAN;
|
||||
if (combined.contains("intraoperative") || combined.contains("intra-op")) return FigureType.INTRAOPERATIVE_IMAGE;
|
||||
if (caption != null && caption.toLowerCase().startsWith("table")) return FigureType.TABLE;
|
||||
if (combined.contains("chart") || combined.contains("histogram") || combined.contains("graph"))
|
||||
return FigureType.CHART;
|
||||
if (combined.contains("photograph") || combined.contains("photo")) return FigureType.SURGICAL_PHOTOGRAPH;
|
||||
return FigureType.ANATOMICAL_DIAGRAM;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,11 @@
|
||||
package com.aiteacher.document;
|
||||
|
||||
import org.springframework.data.jpa.repository.JpaRepository;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
|
||||
public interface FigureRepository extends JpaRepository<FigureEntity, String> {
|
||||
List<FigureEntity> findAllByBookId(UUID bookId);
|
||||
void deleteAllByBookId(UUID bookId);
|
||||
}
|
||||
@@ -0,0 +1,10 @@
|
||||
package com.aiteacher.document;
|
||||
|
||||
/**
 * Category assigned to an extracted figure.
 * Constants are persisted by name, so renaming one affects stored data.
 */
public enum FigureType {

    /** Anatomical diagram. */
    ANATOMICAL_DIAGRAM,

    /** Surgical photograph. */
    SURGICAL_PHOTOGRAPH,

    /** MRI or CT scan. */
    MRI_CT_SCAN,

    /** Table. */
    TABLE,

    /** Chart. */
    CHART,

    /** Intraoperative image. */
    INTRAOPERATIVE_IMAGE
}
|
||||
@@ -0,0 +1,71 @@
|
||||
package com.aiteacher.document;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.ai.reader.pdf.PagePdfDocumentReader;
|
||||
import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig;
|
||||
import org.springframework.core.io.FileSystemResource;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.transaction.annotation.Transactional;
|
||||
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
|
||||
/**
|
||||
* Parses a PDF into page-level SectionEntity records stored in Postgres.
|
||||
* Each page becomes one section, grouped under a single chapter per book.
|
||||
*/
|
||||
@Service
|
||||
public class PdfStructureParser {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(PdfStructureParser.class);
|
||||
|
||||
private final ChapterRepository chapterRepository;
|
||||
private final SectionRepository sectionRepository;
|
||||
|
||||
public PdfStructureParser(ChapterRepository chapterRepository,
|
||||
SectionRepository sectionRepository) {
|
||||
this.chapterRepository = chapterRepository;
|
||||
this.sectionRepository = sectionRepository;
|
||||
}
|
||||
|
||||
@Transactional
|
||||
public List<SectionEntity> parse(UUID bookId, String bookTitle, Path pdfPath) {
|
||||
log.info("Parsing PDF structure for book {}", bookId);
|
||||
|
||||
// One chapter per book
|
||||
String chapterId = bookId + "-ch1";
|
||||
ChapterEntity chapter = new ChapterEntity(chapterId, bookId, 1, bookTitle, 1);
|
||||
chapterRepository.save(chapter);
|
||||
|
||||
// One section per page
|
||||
PagePdfDocumentReader reader = new PagePdfDocumentReader(
|
||||
new FileSystemResource(pdfPath.toFile()),
|
||||
PdfDocumentReaderConfig.builder().withPagesPerDocument(1).build()
|
||||
);
|
||||
|
||||
List<org.springframework.ai.document.Document> pages = reader.get();
|
||||
List<SectionEntity> sections = new ArrayList<>();
|
||||
|
||||
for (int i = 0; i < pages.size(); i++) {
|
||||
int pageNum = i + 1;
|
||||
String text = pages.get(i).getText();
|
||||
if (text == null || text.isBlank()) continue;
|
||||
|
||||
String sectionId = bookId + "-p" + pageNum;
|
||||
SectionEntity section = new SectionEntity(
|
||||
sectionId, chapterId, bookId,
|
||||
String.valueOf(pageNum),
|
||||
"Page " + pageNum,
|
||||
pageNum, pageNum,
|
||||
text
|
||||
);
|
||||
sections.add(sectionRepository.save(section));
|
||||
}
|
||||
|
||||
log.info("Parsed {} sections for book {}", sections.size(), bookId);
|
||||
return sections;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,63 @@
|
||||
package com.aiteacher.document;
|
||||
|
||||
import jakarta.persistence.*;
|
||||
import java.time.Instant;
|
||||
import java.util.UUID;
|
||||
|
||||
@Entity
|
||||
@Table(name = "section")
|
||||
public class SectionEntity {
|
||||
|
||||
@Id
|
||||
@Column(name = "id", length = 200)
|
||||
private String id;
|
||||
|
||||
@Column(name = "chapter_id", nullable = false, length = 200)
|
||||
private String chapterId;
|
||||
|
||||
@Column(name = "book_id", nullable = false)
|
||||
private UUID bookId;
|
||||
|
||||
@Column(name = "number", length = 50)
|
||||
private String number;
|
||||
|
||||
@Column(name = "title", length = 500)
|
||||
private String title;
|
||||
|
||||
@Column(name = "page_start", nullable = false)
|
||||
private int pageStart;
|
||||
|
||||
@Column(name = "page_end", nullable = false)
|
||||
private int pageEnd;
|
||||
|
||||
@Column(name = "full_text", nullable = false, columnDefinition = "TEXT")
|
||||
private String fullText;
|
||||
|
||||
@Column(name = "created_at", nullable = false)
|
||||
private Instant createdAt;
|
||||
|
||||
public SectionEntity() {}
|
||||
|
||||
public SectionEntity(String id, String chapterId, UUID bookId, String number,
|
||||
String title, int pageStart, int pageEnd, String fullText) {
|
||||
this.id = id;
|
||||
this.chapterId = chapterId;
|
||||
this.bookId = bookId;
|
||||
this.number = number;
|
||||
this.title = title;
|
||||
this.pageStart = pageStart;
|
||||
this.pageEnd = pageEnd;
|
||||
this.fullText = fullText;
|
||||
this.createdAt = Instant.now();
|
||||
}
|
||||
|
||||
public String getId() { return id; }
|
||||
public String getChapterId() { return chapterId; }
|
||||
public UUID getBookId() { return bookId; }
|
||||
public String getNumber() { return number; }
|
||||
public String getTitle() { return title; }
|
||||
public int getPageStart() { return pageStart; }
|
||||
public int getPageEnd() { return pageEnd; }
|
||||
public String getFullText() { return fullText; }
|
||||
public Instant getCreatedAt() { return createdAt; }
|
||||
}
|
||||
@@ -0,0 +1,11 @@
|
||||
package com.aiteacher.document;
|
||||
|
||||
import org.springframework.data.jpa.repository.JpaRepository;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
|
||||
public interface SectionRepository extends JpaRepository<SectionEntity, String> {
|
||||
List<SectionEntity> findAllByBookId(UUID bookId);
|
||||
void deleteAllByBookId(UUID bookId);
|
||||
}
|
||||
@@ -0,0 +1,65 @@
|
||||
package com.aiteacher.document;
|
||||
|
||||
import org.springframework.ai.document.Document;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.UUID;
|
||||
|
||||
/**
|
||||
* Splits a SectionEntity's full text into overlapping chunks for vector embedding.
|
||||
* Target size: ~1800 characters (~450 tokens); overlap: 200 characters.
|
||||
*/
|
||||
@Service
|
||||
public class TextChunkingService {
|
||||
|
||||
private static final int TARGET_CHARS = 1800;
|
||||
private static final int OVERLAP_CHARS = 200;
|
||||
|
||||
public List<Document> chunk(SectionEntity section, String bookTitle) {
|
||||
String text = section.getFullText();
|
||||
if (text == null || text.isBlank()) return List.of();
|
||||
|
||||
List<String> windows = split(text);
|
||||
List<Document> documents = new ArrayList<>();
|
||||
|
||||
for (int i = 0; i < windows.size(); i++) {
|
||||
String chunkId = UUID.randomUUID().toString();
|
||||
Map<String, Object> metadata = buildMetadata(section, bookTitle, i, windows.size(), chunkId);
|
||||
documents.add(new Document(chunkId, windows.get(i), metadata));
|
||||
}
|
||||
return documents;
|
||||
}
|
||||
|
||||
private List<String> split(String text) {
|
||||
List<String> windows = new ArrayList<>();
|
||||
int start = 0;
|
||||
while (start < text.length()) {
|
||||
int end = Math.min(start + TARGET_CHARS, text.length());
|
||||
windows.add(text.substring(start, end));
|
||||
if (end == text.length()) break;
|
||||
start = end - OVERLAP_CHARS;
|
||||
}
|
||||
return windows;
|
||||
}
|
||||
|
||||
private Map<String, Object> buildMetadata(SectionEntity section, String bookTitle,
|
||||
int index, int total, String chunkId) {
|
||||
Map<String, Object> m = new HashMap<>();
|
||||
m.put("type", "TEXT");
|
||||
m.put("book_id", section.getBookId().toString());
|
||||
m.put("book_title", bookTitle);
|
||||
m.put("chapter_id", section.getChapterId());
|
||||
m.put("section_id", section.getId());
|
||||
m.put("section_title", section.getTitle() != null ? section.getTitle() : "");
|
||||
m.put("page_start", section.getPageStart());
|
||||
m.put("page_end", section.getPageEnd());
|
||||
m.put("chunk_index", index);
|
||||
m.put("total_chunks", total);
|
||||
m.put("chunk_id", chunkId);
|
||||
return m;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,49 @@
|
||||
package com.aiteacher.document;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.ai.chat.client.ChatClient;
|
||||
import org.springframework.core.io.FileSystemResource;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.util.MimeTypeUtils;
|
||||
|
||||
import java.nio.file.Path;
|
||||
|
||||
/**
|
||||
* Generates a clinical text description for an extracted figure image
|
||||
* using the OpenAI vision model via Spring AI ChatClient.
|
||||
*/
|
||||
@Service
|
||||
public class VisionDescriptionService {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(VisionDescriptionService.class);
|
||||
|
||||
private static final String PROMPT =
|
||||
"You are a neurosurgery educator. Provide a brief 2-3 sentence clinical description of " +
|
||||
"this image. Focus on anatomical structures, surgical landmarks, labels, and clinical " +
|
||||
"significance. If text or labels are visible, include them verbatim.";
|
||||
|
||||
private final ChatClient chatClient;
|
||||
|
||||
public VisionDescriptionService(ChatClient chatClient) {
|
||||
this.chatClient = chatClient;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a description string. Falls back to the provided caption if vision fails.
|
||||
*/
|
||||
public String describe(Path imagePath, String captionFallback) {
|
||||
try {
|
||||
return chatClient.prompt()
|
||||
.user(u -> u
|
||||
.text(PROMPT)
|
||||
.media(MimeTypeUtils.IMAGE_PNG, new FileSystemResource(imagePath.toFile())))
|
||||
.call()
|
||||
.content();
|
||||
} catch (Exception ex) {
|
||||
log.warn("Vision description failed for {}: {} — using caption as fallback",
|
||||
imagePath.getFileName(), ex.getMessage());
|
||||
return captionFallback != null ? captionFallback : "Figure";
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,24 @@
|
||||
package com.aiteacher.figure;
|
||||
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.nio.file.Path;
|
||||
import java.util.UUID;
|
||||
|
||||
/**
 * Abstraction over the store that holds extracted figure images.
 */
public interface FigureStorageService {

    /**
     * Writes an extracted image into the figure store.
     *
     * @param bookId   book the figure belongs to
     * @param figureId identifier of the figure
     * @param image    image data to store
     * @return the path of the stored image relative to the configured base-path;
     *         this is the value kept in the database
     */
    String save(UUID bookId, String figureId, BufferedImage image);

    /**
     * Converts a stored relative path back into an absolute filesystem path.
     *
     * @param relativePath path previously returned by {@link #save}
     * @return the corresponding absolute path
     */
    Path resolve(String relativePath);

    /**
     * Removes every stored figure file of the given book.
     *
     * @param bookId book whose figure files are deleted
     */
    void deleteAll(UUID bookId);
}
|
||||
@@ -0,0 +1,59 @@
|
||||
package com.aiteacher.figure;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.UUID;
|
||||
|
||||
@Service
|
||||
public class LocalFigureStorageService implements FigureStorageService {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(LocalFigureStorageService.class);
|
||||
|
||||
private final Path basePath;
|
||||
|
||||
public LocalFigureStorageService(@Value("${app.figure-storage.base-path:./uploads}") String basePath) {
|
||||
this.basePath = Paths.get(basePath).toAbsolutePath().normalize();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String save(UUID bookId, String figureId, BufferedImage image) {
|
||||
try {
|
||||
Path dir = basePath.resolve("figures").resolve(bookId.toString());
|
||||
Files.createDirectories(dir);
|
||||
String filename = figureId + ".png";
|
||||
Path file = dir.resolve(filename);
|
||||
ImageIO.write(image, "PNG", file.toFile());
|
||||
// Return relative path for storage in DB
|
||||
return "figures/" + bookId + "/" + filename;
|
||||
} catch (IOException ex) {
|
||||
throw new RuntimeException("Failed to save figure " + figureId, ex);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Path resolve(String relativePath) {
|
||||
return basePath.resolve(relativePath);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void deleteAll(UUID bookId) {
|
||||
Path dir = basePath.resolve("figures").resolve(bookId.toString());
|
||||
if (!Files.exists(dir)) return;
|
||||
try (var walk = Files.walk(dir)) {
|
||||
walk.sorted(java.util.Comparator.reverseOrder())
|
||||
.map(Path::toFile)
|
||||
.forEach(java.io.File::delete);
|
||||
} catch (IOException ex) {
|
||||
log.warn("Could not fully delete figures for book {}: {}", bookId, ex.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,111 @@
|
||||
package com.aiteacher.retrieval;
|
||||
|
||||
import com.aiteacher.document.*;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.ai.document.Document;
|
||||
import org.springframework.ai.vectorstore.SearchRequest;
|
||||
import org.springframework.ai.vectorstore.VectorStore;
|
||||
import org.springframework.ai.vectorstore.filter.FilterExpressionBuilder;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Dual-modality retriever: searches text chunks and figure captions independently,
|
||||
* then expands text hits to their parent sections and merges linked figures.
|
||||
*/
|
||||
@Service
|
||||
public class NeurosurgeryRetriever {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(NeurosurgeryRetriever.class);
|
||||
|
||||
private static final int TEXT_TOP_K = 5;
|
||||
private static final int FIGURE_TOP_K = 3;
|
||||
|
||||
private final VectorStore vectorStore;
|
||||
private final SectionRepository sectionRepository;
|
||||
private final FigureRepository figureRepository;
|
||||
private final ChunkFigureRefRepository chunkFigureRefRepository;
|
||||
|
||||
public NeurosurgeryRetriever(VectorStore vectorStore,
|
||||
SectionRepository sectionRepository,
|
||||
FigureRepository figureRepository,
|
||||
ChunkFigureRefRepository chunkFigureRefRepository) {
|
||||
this.vectorStore = vectorStore;
|
||||
this.sectionRepository = sectionRepository;
|
||||
this.figureRepository = figureRepository;
|
||||
this.chunkFigureRefRepository = chunkFigureRefRepository;
|
||||
}
|
||||
|
||||
public RetrievalResult retrieve(String query, UUID bookId) {
|
||||
FilterExpressionBuilder b = new FilterExpressionBuilder();
|
||||
|
||||
// 1. Text chunk search
|
||||
List<Document> textHits = vectorStore.similaritySearch(
|
||||
SearchRequest.builder()
|
||||
.query(query)
|
||||
.topK(TEXT_TOP_K)
|
||||
.filterExpression(b.and(
|
||||
b.eq("type", "TEXT"),
|
||||
b.eq("book_id", bookId.toString())
|
||||
).build())
|
||||
.build()
|
||||
);
|
||||
|
||||
// 2. Figure caption search (independent topK)
|
||||
List<Document> figureHits = vectorStore.similaritySearch(
|
||||
SearchRequest.builder()
|
||||
.query(query)
|
||||
.topK(FIGURE_TOP_K)
|
||||
.filterExpression(b.and(
|
||||
b.eq("type", "FIGURE"),
|
||||
b.eq("book_id", bookId.toString())
|
||||
).build())
|
||||
.build()
|
||||
);
|
||||
|
||||
// 3. Expand text chunks to parent sections from Postgres
|
||||
List<String> sectionIds = textHits.stream()
|
||||
.map(d -> (String) d.getMetadata().get("section_id"))
|
||||
.filter(Objects::nonNull)
|
||||
.distinct()
|
||||
.toList();
|
||||
List<SectionEntity> sections = sectionIds.isEmpty()
|
||||
? List.of()
|
||||
: sectionRepository.findAllById(sectionIds);
|
||||
|
||||
// 4. Fetch figures explicitly linked to retrieved chunks
|
||||
List<UUID> chunkIds = textHits.stream()
|
||||
.map(d -> {
|
||||
try { return UUID.fromString(d.getId()); }
|
||||
catch (Exception e) { return null; }
|
||||
})
|
||||
.filter(Objects::nonNull)
|
||||
.toList();
|
||||
List<String> linkedFigureIds = chunkIds.isEmpty()
|
||||
? List.of()
|
||||
: chunkFigureRefRepository.findByChunkIdIn(chunkIds)
|
||||
.stream().map(ChunkFigureRefEntity::getFigureId).distinct().toList();
|
||||
List<FigureEntity> linkedFigures = linkedFigureIds.isEmpty()
|
||||
? List.of()
|
||||
: figureRepository.findAllById(linkedFigureIds);
|
||||
|
||||
// 5. Collect figures from semantic figure search
|
||||
List<String> semanticFigureIds = figureHits.stream()
|
||||
.map(d -> (String) d.getMetadata().get("figure_id"))
|
||||
.filter(Objects::nonNull)
|
||||
.toList();
|
||||
List<FigureEntity> semanticFigures = semanticFigureIds.isEmpty()
|
||||
? List.of()
|
||||
: figureRepository.findAllById(semanticFigureIds);
|
||||
|
||||
// 6. Merge and deduplicate figures by figureId (linked figures take precedence)
|
||||
Map<String, FigureEntity> merged = new LinkedHashMap<>();
|
||||
linkedFigures.forEach(f -> merged.put(f.getId(), f));
|
||||
semanticFigures.forEach(f -> merged.putIfAbsent(f.getId(), f));
|
||||
|
||||
log.debug("Retrieved {} sections, {} figures for query", sections.size(), merged.size());
|
||||
return new RetrievalResult(sections, new ArrayList<>(merged.values()));
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,11 @@
|
||||
package com.aiteacher.retrieval;
|
||||
|
||||
import com.aiteacher.document.FigureEntity;
|
||||
import com.aiteacher.document.SectionEntity;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public record RetrievalResult(
|
||||
List<SectionEntity> parentSections,
|
||||
List<FigureEntity> figures
|
||||
) {}
|
||||
Reference in New Issue
Block a user