enhance rag retrieval + summary

This commit is contained in:
Adrien
2026-04-07 22:39:28 +02:00
parent 0cf318f0a7
commit aee6a9dfba
34 changed files with 2306 additions and 279 deletions
@@ -92,7 +92,7 @@ public class BookEmbeddingService {
ChapterEntity chapter = new ChapterEntity(chapterId, bookId, 1, bookTitle, 1);
chapterRepository.save(chapter);
// Step 1: Parse with Marker — JSON (structured) + Markdown (per-page) in parallel
// Step 1: Parse with Marker — split into 100-page chunks, then merge results
ParsedBook parsed = markerPageParser.parse(pdfPath);
List<PageResult> pageResults = parsed.pages();
@@ -125,25 +125,32 @@ public class BookEmbeddingService {
log.info("Saved {} HTML pages to S3 for book {}", parsed.htmlByPage().size(), bookId);
// Step 5: Vision analysis (description + visible text) → embed figure chunks
for (FigureEntity figure : figures) {
byte[] imageBytes = figureStorageService.getBytes(figure.getImagePath());
VisionDescriptionService.ImageAnalysis analysis =
visionDescriptionService.analyze(imageBytes, figure.getCaption());
Map<String, SectionEntity> sectionById = new HashMap<>();
for (SectionEntity s : sections) sectionById.put(s.getId(), s);
for (FigureEntity figure : figures) {
// Prefer caption extracted from the linked section's full text
if (figure.getCaption() == null || figure.getCaption().isBlank()) {
figure.setCaption(analysis.description());
figureRepository.save(figure);
String sectionCaption = extractCaptionFromSection(sectionById.get(figure.getSectionId()));
if (sectionCaption != null) {
figure.setCaption(sectionCaption);
figureRepository.save(figure);
} else {
byte[] imageBytes = figureStorageService.getBytes(figure.getImagePath());
VisionDescriptionService.ImageAnalysis analysis =
visionDescriptionService.analyze(imageBytes, figure.getCaption());
figure.setCaption(analysis.description());
figureRepository.save(figure);
}
}
// Embedding content: description + caption + visible image text
String embeddingContent = analysis.description()
+ (figure.getCaption() != null ? "\n" + figure.getCaption() : "")
+ (analysis.imageText().isEmpty() ? "" : "\n" + analysis.imageText());
// Embedding content: description
String embeddingContent = (figure.getCaption() != null ? "\n" + figure.getCaption() : "");
String embeddingId = UUID.randomUUID().toString();
if (!skipEmbedding) {
Document figureDoc = new Document(embeddingId, embeddingContent,
buildFigureMetadata(figure, bookTitle, embeddingId, analysis.imageText()));
buildFigureMetadata(figure, bookTitle, embeddingId, ""));
vectorStore.add(List.of(figureDoc));
figure.setCaptionEmbeddingId(UUID.fromString(embeddingId));
}
@@ -163,7 +170,7 @@ public class BookEmbeddingService {
}
book.setStatus(BookStatus.READY);
book.setPageCount(sections.size());
book.setPageCount(parsed.htmlByPage().size());
book.setProcessedAt(Instant.now());
bookRepository.save(book);
@@ -210,7 +217,7 @@ public class BookEmbeddingService {
if (page.orderedText().isBlank()) continue;
String sectionId = bookId + "-p" + page.pageNumber();
String title = page.headingTitle() != null ? page.headingTitle() : "Page " + page.pageNumber();
String title = truncate(page.headingTitle() != null ? page.headingTitle() : "Page " + page.pageNumber(), 500);
SectionEntity section = new SectionEntity(
sectionId, chapterId, bookId,
@@ -271,6 +278,17 @@ public class BookEmbeddingService {
return html;
}
/**
 * Looks for a printed caption inside the text of the section a figure is linked to.
 *
 * <p>The section text is scanned line by line and the first line that, once stripped,
 * starts with {@code "Fig."}, {@code "Figure"} or {@code "Algorithm"} is returned.
 * The match is prefix-based only and is not tied to a particular figure, so every
 * figure linked to the same section receives the same line.
 *
 * @param section section the figure is linked to; may be {@code null}
 * @return the stripped caption-like line, or {@code null} when the section is
 *         {@code null}, has no text, or contains no such line
 */
private String extractCaptionFromSection(SectionEntity section) {
    if (section == null) {
        return null;
    }
    String fullText = section.getFullText();
    if (fullText == null) {
        // A section without text simply has no caption; do not abort the caller with an NPE.
        return null;
    }
    for (String line : fullText.split("\n")) {
        String candidate = line.strip();
        boolean looksLikeCaption = candidate.startsWith("Fig.")
                || candidate.startsWith("Figure")
                || candidate.startsWith("Algorithm");
        if (looksLikeCaption) {
            return candidate;
        }
    }
    return null;
}
private String truncate(String msg, int max) {
if (msg == null) return null;
return msg.length() <= max ? msg : msg.substring(0, max);
@@ -5,10 +5,11 @@ import com.aiteacher.book.BookStatus;
import com.aiteacher.book.NoKnowledgeSourceException;
import com.aiteacher.document.FigureEntity;
import com.aiteacher.document.SectionEntity;
import com.aiteacher.retrieval.CitationValidatorService;
import com.aiteacher.retrieval.LabelledContext;
import com.aiteacher.retrieval.NeurosurgeryRetriever;
import com.aiteacher.retrieval.QueryExpansionService;
import com.aiteacher.retrieval.RetrievalResult;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.ai.chat.client.ChatClient;
import org.springframework.stereotype.Service;
@@ -17,8 +18,6 @@ import java.util.*;
@Service
public class ChatService {
private static final Logger log = LoggerFactory.getLogger(ChatService.class);
private static final String SYSTEM_PROMPT = """
You are an expert neurosurgery educator assistant. Answer questions using the
medical textbook content provided to you as context.
@@ -29,8 +28,8 @@ public class ChatService {
- Build answers from what is present: procedures, conditions, techniques, and descriptions all contribute; combine them into a rich, structured response
- Use clear structure: headings, bullet points, or numbered steps where appropriate to maximize clarity
- Only say you cannot answer if the context is entirely unrelated to the question
- Cite sources for each major point (book title and page number from the context)
- When referencing diagrams or figures, cite them as [Fig. X, p.N]
- Cite sources for each major claim using the reference labels from the context (e.g. [S1], [F2]). Prefer these labels over inventing page numbers, but you may also describe the source naturally if needed.
- When referencing diagrams or figures, prefer their label from the context (e.g. [F1])
- Maintain continuity with the conversation history
- Never fabricate clinical information not present in the context
""";
@@ -40,17 +39,23 @@ public class ChatService {
private final ChatSessionRepository sessionRepository;
private final MessageRepository messageRepository;
private final NeurosurgeryRetriever retriever;
private final QueryExpansionService queryExpansionService;
private final CitationValidatorService citationValidatorService;
public ChatService(ChatClient chatClient,
BookRepository bookRepository,
ChatSessionRepository sessionRepository,
MessageRepository messageRepository,
NeurosurgeryRetriever retriever) {
NeurosurgeryRetriever retriever,
QueryExpansionService queryExpansionService,
CitationValidatorService citationValidatorService) {
this.chatClient = chatClient;
this.bookRepository = bookRepository;
this.sessionRepository = sessionRepository;
this.messageRepository = messageRepository;
this.retriever = retriever;
this.queryExpansionService = queryExpansionService;
this.citationValidatorService = citationValidatorService;
}
public ChatSession createSession(String topicId) {
@@ -85,25 +90,34 @@ public class ChatService {
List<Message> history = messageRepository.findBySessionIdOrderByCreatedAtAsc(sessionId);
String fullQuestion = buildQuestionWithHistory(history, userContent, session.getTopicId());
// Retrieve context from all ready books (aggregate across books)
// Expand only the current user question to clinical terminology for retrieval (US1).
// fullQuestion (which includes conversation history) is used for the LLM context prompt,
// but retrieval should be driven by a concise clinical rewrite of the actual question.
String retrievalQuery = queryExpansionService.expand(userContent).rewritten();
// Retrieve context from all ready books using the expanded query
List<SectionEntity> allSections = new ArrayList<>();
List<FigureEntity> allFigures = new ArrayList<>();
for (com.aiteacher.book.Book book : readyBooks) {
RetrievalResult result = retriever.retrieve(fullQuestion, book.getId());
RetrievalResult result = retriever.retrieve(retrievalQuery, book.getId());
allSections.addAll(result.parentSections());
allFigures.addAll(result.figures());
}
// Build LLM prompt with section full texts and figure references
String contextPrompt = buildContextPrompt(fullQuestion, allSections, allFigures);
// Build labelled context prompt (US2): assigns [S1]/[F1] labels to each source
LabelledContext ctx = buildContextPrompt(fullQuestion, allSections, allFigures);
String assistantContent = chatClient.prompt()
// Generate answer
String rawContent = chatClient.prompt()
.system(SYSTEM_PROMPT)
.user(contextPrompt)
.user(ctx.promptText())
.call()
.content();
// Build sources list with TEXT and FIGURE entries
// Strip any citation labels not present in the retrieved context (US2)
String assistantContent = citationValidatorService.validate(rawContent, ctx.allLabels());
// Attach sources with their ref-labels for frontend traceability
List<Map<String, Object>> sources = buildSources(allSections, allFigures);
Message assistantMessage = new Message(sessionId, MessageRole.ASSISTANT, assistantContent);
@@ -126,51 +140,71 @@ public class ChatService {
// Private helpers
// -------------------------------------------------------------------------
private String buildContextPrompt(String question,
List<SectionEntity> sections,
List<FigureEntity> figures) {
/**
* Builds the LLM context prompt, tagging each section as [S1], [S2]… and
* each figure as [F1], [F2]… so the model can cite only known sources.
*/
private LabelledContext buildContextPrompt(String question,
List<SectionEntity> sections,
List<FigureEntity> figures) {
Map<String, SectionEntity> sectionLabels = new LinkedHashMap<>();
Map<String, FigureEntity> figureLabels = new LinkedHashMap<>();
StringBuilder sb = new StringBuilder();
if (!sections.isEmpty()) {
sb.append("CONTEXT:\n\n");
for (SectionEntity section : sections) {
sb.append("[").append(section.getTitle())
.append(", p.").append(section.getPageStart()).append("]\n");
for (int i = 0; i < sections.size(); i++) {
SectionEntity section = sections.get(i);
String label = "S" + (i + 1);
sectionLabels.put(label, section);
sb.append("[").append(label).append("] ")
.append(section.getTitle())
.append(", p.").append(section.getPageStart()).append("\n");
sb.append(section.getFullText()).append("\n\n");
}
}
if (!figures.isEmpty()) {
sb.append("AVAILABLE FIGURES:\n");
for (FigureEntity figure : figures) {
sb.append("- ").append(figure.getLabel() != null ? figure.getLabel() : "Figure")
for (int i = 0; i < figures.size(); i++) {
FigureEntity figure = figures.get(i);
String label = "F" + (i + 1);
figureLabels.put(label, figure);
sb.append("[").append(label).append("] ")
.append(figure.getLabel() != null ? figure.getLabel() : "Figure")
.append(" (p.").append(figure.getPage()).append("): ")
.append(figure.getCaption() != null ? figure.getCaption() : "")
.append("\n");
}
sb.append("\nWhen referencing diagrams, cite them as [Fig. X, p.N].\n\n");
sb.append("\nWhen referencing diagrams, use their label from the context (e.g. [F1]).\n\n");
}
sb.append("QUESTION:\n").append(question);
return sb.toString();
return new LabelledContext(sectionLabels, figureLabels, sb.toString());
}
private List<Map<String, Object>> buildSources(List<SectionEntity> sections,
List<FigureEntity> figures) {
List<Map<String, Object>> sources = new ArrayList<>();
for (SectionEntity section : sections) {
for (int i = 0; i < sections.size(); i++) {
SectionEntity section = sections.get(i);
Map<String, Object> source = new LinkedHashMap<>();
source.put("type", "TEXT");
source.put("refLabel", "S" + (i + 1));
source.put("bookId", section.getBookId());
source.put("bookTitle", deriveTitleFromSection(section));
source.put("page", section.getPageStart());
source.put("chunkText", truncate(section.getFullText(), 500));
sources.add(source);
}
for (FigureEntity figure : figures) {
for (int i = 0; i < figures.size(); i++) {
FigureEntity figure = figures.get(i);
Map<String, Object> source = new LinkedHashMap<>();
source.put("type", "FIGURE");
source.put("refLabel", "F" + (i + 1));
source.put("bookId", figure.getBookId());
source.put("bookTitle", bookRepository.findById(figure.getBookId())
.map(com.aiteacher.book.Book::getTitle).orElse("Book"));
source.put("page", figure.getPage());
@@ -178,7 +212,6 @@ public class ChatService {
source.put("label", figure.getLabel() != null ? figure.getLabel() : "");
source.put("caption", figure.getCaption() != null ? figure.getCaption() : "");
source.put("figureType", figure.getFigureType().name());
// imageUrl assembled from relative path: figures/{bookId}/{filename}
String filename = figure.getImagePath().substring(
figure.getImagePath().lastIndexOf('/') + 1);
source.put("imageUrl", "/api/v1/figures/" + figure.getBookId() + "/" + filename);
@@ -17,6 +17,7 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
/**
* Parses a PDF with a single call to the Marker server using {@code output_format=json}.
*
@@ -46,19 +47,65 @@ public class MarkerPageParser {
);
private static final Set<String> FIGURE_BLOCK_TYPES = Set.of("Figure", "Picture", "FigureGroup", "PictureGroup");
private static final int CHUNK_SIZE = 100;
private static final ObjectMapper MAPPER = new ObjectMapper();
private final RestClient restClient;
private final PdfSplitterService pdfSplitterService;
public MarkerPageParser(@Qualifier("markerRestClient") RestClient restClient) {
public MarkerPageParser(@Qualifier("markerRestClient") RestClient restClient,
PdfSplitterService pdfSplitterService) {
this.restClient = restClient;
this.pdfSplitterService = pdfSplitterService;
}
public ParsedBook parse(Path pdfPath) {
log.info("Submitting {} to Marker (json)", pdfPath.getFileName());
/**
 * Parses a PDF by splitting it into {@value #CHUNK_SIZE}-page chunks, submitting each
 * chunk to Marker individually, and merging the results into a single {@link ParsedBook}.
 * Page numbers in the merged result are absolute (1-based across the whole document):
 * each chunk-relative page number is shifted by the chunk's 0-based page offset.
 *
 * <p>Temporary chunk files are deleted whether or not parsing succeeds. A chunk whose
 * path equals {@code pdfPath} is the original document and is left in place.
 *
 * @param pdfPath PDF document to parse
 * @return merged page results and per-page HTML keyed by absolute page number
 * @throws IOException declared because the splitting step may fail on I/O
 */
public ParsedBook parse(Path pdfPath) throws IOException {
    List<PdfSplitterService.PdfChunk> chunks = pdfSplitterService.split(pdfPath, CHUNK_SIZE);
    log.info("Processing {} chunk(s) for {}", chunks.size(), pdfPath.getFileName());

    List<PageResult> allPages = new ArrayList<>();
    Map<Integer, String> allHtml = new LinkedHashMap<>();

    try {
        for (int c = 0; c < chunks.size(); c++) {
            PdfSplitterService.PdfChunk chunk = chunks.get(c);
            int offset = chunk.pageOffset();
            log.info("Submitting chunk {}/{} to Marker (page offset {})", c + 1, chunks.size(), offset);

            ParsedBook chunkResult = submitChunk(chunk.tempFile());

            // Rebase page numbers from chunk-relative to document-absolute.
            // NOTE(review): figures are copied unchanged; if they carry their own page
            // numbers those would still be chunk-relative — confirm.
            for (PageResult page : chunkResult.pages()) {
                allPages.add(new PageResult(
                        offset + page.pageNumber(),
                        page.orderedText(),
                        page.headingTitle(),
                        page.figures()));
            }
            chunkResult.htmlByPage().forEach((chunkPage, html) ->
                    allHtml.put(offset + chunkPage, html));
        }
    } finally {
        // Delete temporary chunk files (skip if the chunk is the original PDF).
        for (PdfSplitterService.PdfChunk chunk : chunks) {
            if (chunk.tempFile().equals(pdfPath)) {
                continue;
            }
            try {
                Files.deleteIfExists(chunk.tempFile());
            } catch (IOException e) {
                // Best-effort cleanup: keep going, but record why the delete failed.
                log.warn("Could not delete temp chunk {}", chunk.tempFile(), e);
            }
        }
    }

    log.info("Marker produced {} non-empty pages from {} chunk(s) of {}",
            allPages.size(), chunks.size(), pdfPath.getFileName());
    return new ParsedBook(allPages, allHtml);
}
/** Submits a single PDF file to Marker and returns the parsed result with chunk-relative page numbers. */
private ParsedBook submitChunk(Path chunkPath) {
MultiValueMap<String, Object> body = new LinkedMultiValueMap<>();
body.add("file", new FileSystemResource(pdfPath));
body.add("file", new FileSystemResource(chunkPath));
body.add("output_format", "json");
JsonNode response = restClient.post()
@@ -76,28 +123,29 @@ public class MarkerPageParser {
List<JsonNode> pageNodes = extractPages(response);
if (pageNodes.isEmpty()) {
log.warn("Marker returned no pages for {}", pdfPath.getFileName());
log.warn("Marker returned no pages for chunk {}", chunkPath.getFileName());
return new ParsedBook(List.of(), Map.of());
}
log.info("Marker returned {} pages for {}", pageNodes.size(), pdfPath.getFileName());
List<PageResult> pages = new ArrayList<>();
Map<Integer, String> htmlByPage = new LinkedHashMap<>();
for (int i = 0; i < pageNodes.size(); i++) {
JsonNode pageNode = pageNodes.get(i);
int pageNumber = i + 1; // 1-based
int pageNumber = i + 1; // 1-based, chunk-relative
PageResult result = buildPageResult(pageNode, pageNumber);
String html = jsonToHtml(pageNode);
// Always save HTML so the reader can navigate to every page
htmlByPage.put(pageNumber, html);
// Only queue for embedding if the page has extractable content
if (!result.orderedText().isBlank() || !result.figures().isEmpty()) {
pages.add(result);
htmlByPage.put(pageNumber, html);
}
}
log.info("Marker produced {} non-empty pages from {}", pages.size(), pdfPath.getFileName());
return new ParsedBook(pages, htmlByPage);
}
@@ -0,0 +1,72 @@
package com.aiteacher.document;
import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
import org.apache.pdfbox.multipdf.Splitter;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
/**
 * Splits a PDF file into fixed-size chunks using PDFBox.
 * Each chunk is saved as a temporary file so it can be submitted independently to Marker.
 */
@Service
public class PdfSplitterService {

    private static final Logger log = LoggerFactory.getLogger(PdfSplitterService.class);

    /**
     * A chunk of a split PDF.
     *
     * @param tempFile   path to the temporary PDF file (caller must delete when done)
     * @param pageOffset 0-based index of the first page in this chunk within the original document
     */
    public record PdfChunk(Path tempFile, int pageOffset) {}

    /**
     * Splits {@code pdfPath} into chunks of at most {@code maxPagesPerChunk} pages.
     * Returns a single-element list pointing at the original file when the document
     * fits in one chunk.
     *
     * <p>If writing any chunk fails, the temporary files created so far are deleted and
     * the remaining in-memory chunk documents are closed before the exception propagates,
     * so a failed split leaves nothing behind.
     *
     * @param pdfPath          source PDF
     * @param maxPagesPerChunk maximum pages per chunk; must be at least 1
     * @return ordered list of chunks; caller is responsible for deleting {@code tempFile}s
     * @throws IllegalArgumentException if {@code maxPagesPerChunk} is smaller than 1
     * @throws IOException              if the PDF cannot be read or a chunk cannot be written
     */
    public List<PdfChunk> split(Path pdfPath, int maxPagesPerChunk) throws IOException {
        if (maxPagesPerChunk < 1) {
            throw new IllegalArgumentException(
                    "maxPagesPerChunk must be at least 1 but was " + maxPagesPerChunk);
        }

        try (PDDocument doc = new PDFParser(new RandomAccessReadBufferedFile(pdfPath.toFile())).parse()) {
            int totalPages = doc.getNumberOfPages();
            log.info("PDF {} has {} pages, splitting into chunks of {}", pdfPath.getFileName(), totalPages, maxPagesPerChunk);

            if (totalPages <= maxPagesPerChunk) {
                // No split needed — return the original file as a single virtual chunk
                return List.of(new PdfChunk(pdfPath, 0));
            }

            Splitter splitter = new Splitter();
            splitter.setSplitAtPage(maxPagesPerChunk);
            List<PDDocument> parts = splitter.split(doc);

            List<PdfChunk> chunks = new ArrayList<>(parts.size());
            int handled = 0;          // number of parts whose close() has already been attempted
            boolean succeeded = false;
            try {
                int offset = 0;
                for (PDDocument part : parts) {
                    handled++;
                    try {
                        Path tmp = Files.createTempFile("marker-chunk-", ".pdf");
                        // Register the file before writing so a failed save is still cleaned up.
                        chunks.add(new PdfChunk(tmp, offset));
                        part.save(tmp.toFile());
                        log.debug("Created chunk at {} (page offset {})", tmp, offset);
                        offset += part.getNumberOfPages();
                    } finally {
                        part.close();
                    }
                }
                succeeded = true;
                return chunks;
            } finally {
                if (!succeeded) {
                    closeQuietly(parts.subList(handled, parts.size()));
                    deleteQuietly(chunks);
                }
            }
        }
    }

    /** Closes the given documents, logging instead of throwing so cleanup always completes. */
    private void closeQuietly(List<PDDocument> documents) {
        for (PDDocument document : documents) {
            try {
                document.close();
            } catch (IOException e) {
                log.warn("Could not close unsaved PDF chunk document", e);
            }
        }
    }

    /** Deletes the temporary files of the given chunks, logging instead of throwing. */
    private void deleteQuietly(List<PdfChunk> chunks) {
        for (PdfChunk chunk : chunks) {
            try {
                Files.deleteIfExists(chunk.tempFile());
            } catch (IOException e) {
                log.warn("Could not delete temp chunk {}", chunk.tempFile(), e);
            }
        }
    }
}
@@ -1,6 +1,8 @@
package com.aiteacher.document;
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.data.jpa.repository.Query;
import org.springframework.data.repository.query.Param;
import java.util.List;
import java.util.UUID;
@@ -8,4 +10,10 @@ import java.util.UUID;
/**
* Spring Data JPA repository for {@link SectionEntity} rows, keyed by their String id.
*/
public interface SectionRepository extends JpaRepository<SectionEntity, String> {
/** Finder by book id; the query is derived by Spring Data from the method name. */
List<SectionEntity> findAllByBookId(UUID bookId);
/** Delete by book id; the query is derived by Spring Data from the method name. */
void deleteAllByBookId(UUID bookId);
/**
* Returns the sections of a book whose page range intersects the given page window,
* ordered by their first page. Both window bounds are inclusive: a section matches
* when {@code pageStart <= windowEnd} and {@code pageEnd >= windowStart}.
*
* @param bookId book to search in
* @param windowStart first page of the window (inclusive)
* @param windowEnd last page of the window (inclusive)
* @return overlapping sections sorted by {@code pageStart}
*/
@Query("SELECT s FROM SectionEntity s WHERE s.bookId = :bookId AND s.pageStart <= :windowEnd AND s.pageEnd >= :windowStart ORDER BY s.pageStart")
List<SectionEntity> findByBookIdAndPageOverlap(
@Param("bookId") UUID bookId,
@Param("windowStart") int windowStart,
@Param("windowEnd") int windowEnd);
}
@@ -3,6 +3,7 @@ package com.aiteacher.document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.ai.chat.client.ChatClient;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.core.io.ByteArrayResource;
import org.springframework.stereotype.Service;
import org.springframework.util.MimeTypeUtils;
@@ -32,10 +33,16 @@ public class VisionDescriptionService {
IMAGE_TEXT: <all visible text, labels, measurements, and annotations copied verbatim, comma-separated; write NONE if no text visible>
""";
/** Minimum ms between vision API calls. Configurable via app.vision.min-interval-ms. */
private final long minIntervalMs;
private final ChatClient chatClient;
private volatile long lastCallAt = 0;
public VisionDescriptionService(ChatClient chatClient) {
public VisionDescriptionService(
ChatClient chatClient,
@Value("${app.vision.min-interval-ms:2000}") long minIntervalMs) {
this.chatClient = chatClient;
this.minIntervalMs = minIntervalMs;
}
/**
@@ -55,6 +62,7 @@ public class VisionDescriptionService {
* @param captionFallback caption detected from surrounding text, may be null
*/
public ImageAnalysis analyze(byte[] imageBytes, String captionFallback) {
throttle();
try {
String raw = chatClient.prompt()
.user(u -> u
@@ -71,6 +79,15 @@ public class VisionDescriptionService {
}
}
/**
 * Spaces out vision API calls: blocks until at least {@code minIntervalMs} milliseconds
 * have elapsed since the timestamp recorded by the previous call, then records a new
 * timestamp. The method is {@code synchronized}, so concurrent callers are let through
 * one at a time.
 *
 * <p>The wait is capped at one full interval. Timestamps come from the wall clock, which
 * can be set backwards; without the cap such a jump would translate into an arbitrarily
 * long sleep while holding the monitor.
 *
 * <p>If the thread is interrupted while sleeping, the interrupt flag is restored and the
 * method returns without waiting for the rest of the interval.
 */
private synchronized void throttle() {
    long elapsed = System.currentTimeMillis() - lastCallAt;
    long wait = Math.min(minIntervalMs - elapsed, minIntervalMs);
    if (wait > 0) {
        try {
            Thread.sleep(wait);
        } catch (InterruptedException e) {
            // Preserve the interrupt for the caller instead of swallowing it.
            Thread.currentThread().interrupt();
        }
    }
    lastCallAt = System.currentTimeMillis();
}
private ImageAnalysis parse(String raw, String captionFallback) {
String description = captionFallback != null ? captionFallback : "Figure";
String imageText = "";
@@ -0,0 +1,59 @@
package com.aiteacher.retrieval;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Post-processes generated answers to strip citation labels that do not
 * correspond to any passage retrieved for the current query, preventing
 * hallucinated source references from reaching the user.
 */
@Service
public class CitationValidatorService {

    private static final Logger log = LoggerFactory.getLogger(CitationValidatorService.class);

    /** Matches citation labels of the form [S1], [F2], [S12], etc. */
    private static final Pattern CITATION_PATTERN = Pattern.compile("\\[(S|F)\\d+\\]");

    /**
     * Removes any {@code [Sx]} / {@code [Fx]} citation in {@code generatedAnswer}
     * whose label is not contained in {@code validLabels}. Known citations and all
     * other text are copied through unchanged; only the bracketed label itself is
     * removed, surrounding whitespace is left as is.
     *
     * @param generatedAnswer raw model output; {@code null} yields an empty string
     * @param validLabels     labels present in the retrieved context, without brackets
     *                        (e.g. {@code "S1"}); {@code null} is treated as "no valid
     *                        labels", so every citation is stripped
     * @return cleaned answer text with hallucinated citations removed
     */
    public String validate(String generatedAnswer, Set<String> validLabels) {
        if (generatedAnswer == null) {
            return "";
        }
        Set<String> known = validLabels != null ? validLabels : Set.of();

        Matcher matcher = CITATION_PATTERN.matcher(generatedAnswer);
        List<String> removed = new ArrayList<>();
        // StringBuilder: the buffer never leaves this method, so no synchronisation is needed.
        StringBuilder cleaned = new StringBuilder(generatedAnswer.length());

        while (matcher.find()) {
            String label = matcher.group();
            String inner = label.substring(1, label.length() - 1); // strip [ ]
            if (known.contains(inner)) {
                matcher.appendReplacement(cleaned, Matcher.quoteReplacement(label));
            } else {
                removed.add(inner);
                matcher.appendReplacement(cleaned, "");
            }
        }
        matcher.appendTail(cleaned);

        if (!removed.isEmpty()) {
            log.warn("Stripped hallucinated citations: {}", removed);
        }
        return cleaned.toString();
    }
}
@@ -0,0 +1,7 @@
package com.aiteacher.retrieval;
/**
* Value object holding the original user query alongside its clinically
* rewritten variant used for vector-store retrieval.
*
* @param original the query exactly as it was passed in for expansion
* @param rewritten the rephrased query to use for retrieval
*/
public record ExpandedQuery(String original, String rewritten) {}
@@ -0,0 +1,27 @@
package com.aiteacher.retrieval;
import com.aiteacher.document.FigureEntity;
import com.aiteacher.document.SectionEntity;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
/**
 * Result of building the LLM context prompt: the formatted prompt text plus the
 * lookup tables from short ref-labels to the entities they stand for
 * (S1, S2… for sections, F1, F2… for figures).
 *
 * @param sectionLabels section entities keyed by their ref-label
 * @param figureLabels  figure entities keyed by their ref-label
 * @param promptText    fully formatted prompt text
 */
public record LabelledContext(
        Map<String, SectionEntity> sectionLabels,
        Map<String, FigureEntity> figureLabels,
        String promptText) {

    /**
     * Collects every citation label known to this context.
     *
     * @return a new set holding the section labels and the figure labels
     */
    public Set<String> allLabels() {
        Set<String> union = new HashSet<>(sectionLabels.keySet());
        union.addAll(figureLabels.keySet());
        return union;
    }
}
@@ -0,0 +1,47 @@
package com.aiteacher.retrieval;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.ai.chat.client.ChatClient;
import org.springframework.stereotype.Service;
/**
 * Rewrites a user query into precise clinical/surgical terminology so that
 * vector-store retrieval can match textbook language even when the user's
 * phrasing differs from the documentation vocabulary.
 *
 * <p>Expansion is best-effort: whenever no usable rewrite is available the
 * original query is used unchanged.
 */
@Service
public class QueryExpansionService {

    private static final Logger log = LoggerFactory.getLogger(QueryExpansionService.class);

    private static final String EXPANSION_PROMPT = """
            Rewrite the following question using precise medical and surgical terminology \
            as it would appear in a neurosurgery textbook index. \
            Output only the rewritten question, nothing else.
            Question: %s""";

    private final ChatClient chatClient;

    public QueryExpansionService(ChatClient chatClient) {
        this.chatClient = chatClient;
    }

    /**
     * Returns an {@link ExpandedQuery} whose {@code rewritten} field contains
     * the clinically rephrased version of {@code query}.
     *
     * <p>The original query is returned as the rewrite when the query is {@code null}
     * or blank (no model call is made), when the model call throws, or when the model
     * returns nothing usable. A successful rewrite is stripped of surrounding whitespace.
     *
     * @param query the user's question
     * @return the original query paired with the query to use for retrieval
     */
    public ExpandedQuery expand(String query) {
        if (query == null || query.isBlank()) {
            // Nothing to rewrite; avoid a pointless model round-trip.
            return new ExpandedQuery(query, query);
        }

        String rewritten = null;
        try {
            rewritten = chatClient.prompt()
                    .user(EXPANSION_PROMPT.formatted(query))
                    .call()
                    .content();
        } catch (RuntimeException e) {
            // Expansion only improves recall; a failure here must not abort the caller.
            log.warn("Query expansion failed, falling back to the original query", e);
        }

        if (rewritten == null || rewritten.isBlank()) {
            rewritten = query;
        } else {
            rewritten = rewritten.strip();
        }

        log.debug("Query expanded: '{}' → '{}'", query, rewritten);
        return new ExpandedQuery(query, rewritten);
    }
}
@@ -0,0 +1,7 @@
package com.aiteacher.topic;
import java.time.Instant;
import java.util.UUID;
/**
* Compact list entry for a stored topic summary, as returned by the
* summaries listing endpoint of the topic controller.
*
* @param id identifier of the stored summary
* @param summaryNumber number of the summary (presumably its sequence within the topic — confirm)
* @param generatedAt instant the summary was generated
*/
public record SavedSummaryItem(UUID id, int summaryNumber, Instant generatedAt) {
}
@@ -5,6 +5,7 @@ import org.springframework.web.bind.annotation.*;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.UUID;
@RestController
@RequestMapping("/api/v1/topics")
@@ -32,4 +33,21 @@ public class TopicController {
TopicSummaryResponse response = topicSummaryService.generateSummary(topic);
return ResponseEntity.ok(response);
}
/**
 * Lists the stored summaries of a topic.
 *
 * @param id topic identifier from the URL
 * @return 200 with the summary items reported by the summary service
 * @throws NoSuchElementException if no topic with this id exists
 */
@GetMapping("/{id}/summaries")
public ResponseEntity<List<SavedSummaryItem>> listSummaries(@PathVariable String id) {
    // Reject unknown topics before querying the summary service.
    if (topicRepository.findById(id).isEmpty()) {
        throw new NoSuchElementException("Topic not found.");
    }
    List<SavedSummaryItem> items = topicSummaryService.listSummaries(id);
    return ResponseEntity.ok(items);
}
/**
 * Returns one stored summary of a topic.
 *
 * <p>The summary is loaded by its own id, so the response is additionally checked
 * against the topic id in the URL; a summary that belongs to a different topic is
 * reported as not found rather than served under the wrong topic.
 *
 * @param id        topic identifier from the URL
 * @param summaryId identifier of the stored summary
 * @return 200 with the stored summary
 * @throws NoSuchElementException if the topic does not exist, or the summary is
 *                                missing or belongs to another topic
 */
@GetMapping("/{id}/summaries/{summaryId}")
public ResponseEntity<TopicSummaryResponse> getSummary(@PathVariable String id,
                                                       @PathVariable UUID summaryId) {
    topicRepository.findById(id)
            .orElseThrow(() -> new NoSuchElementException("Topic not found."));

    TopicSummaryResponse response = topicSummaryService.getSummary(summaryId);
    // NOTE(review): assumes getSummary fills topicId from the stored summary — confirm.
    if (response == null || !id.equals(response.topicId())) {
        throw new NoSuchElementException("Summary not found.");
    }
    return ResponseEntity.ok(response);
}
}
@@ -0,0 +1,53 @@
package com.aiteacher.topic;
import jakarta.persistence.Column;
import jakarta.persistence.Entity;
import jakarta.persistence.GeneratedValue;
import jakarta.persistence.GenerationType;
import jakarta.persistence.Id;
import jakarta.persistence.Table;
import java.time.Instant;
import java.util.UUID;
/**
 * JPA entity mapped to the {@code topic_summary} table: one generated summary
 * text for a topic together with its number, serialized sources and timestamp.
 * Instances expose getters only; values are supplied through the constructor.
 */
@Entity
@Table(name = "topic_summary")
public class TopicSummaryEntity {

    /** Primary key, generated with the UUID strategy. */
    @Id
    @GeneratedValue(strategy = GenerationType.UUID)
    private UUID id;

    /** Identifier of the topic this summary was generated for. */
    @Column(name = "topic_id", nullable = false)
    private String topicId;

    /** Number assigned to this summary by the code that created it. */
    @Column(name = "summary_number", nullable = false)
    private int summaryNumber;

    /** Summary text. */
    @Column(nullable = false, columnDefinition = "TEXT")
    private String summary;

    /** Source references stored as a JSON string. */
    @Column(name = "sources_json", nullable = false, columnDefinition = "TEXT")
    private String sourcesJson;

    /** Instant the summary was generated. */
    @Column(name = "generated_at", nullable = false)
    private Instant generatedAt;

    /** No-argument constructor for the persistence provider. */
    protected TopicSummaryEntity() {
    }

    /**
     * Creates a new summary row; the id stays unset here and is left to the
     * configured generator.
     *
     * @param topicId       topic the summary belongs to
     * @param summaryNumber number of the summary
     * @param summary       summary text
     * @param sourcesJson   source references as JSON
     * @param generatedAt   generation timestamp
     */
    public TopicSummaryEntity(String topicId, int summaryNumber, String summary,
                              String sourcesJson, Instant generatedAt) {
        this.generatedAt = generatedAt;
        this.sourcesJson = sourcesJson;
        this.summary = summary;
        this.summaryNumber = summaryNumber;
        this.topicId = topicId;
    }

    public UUID getId() {
        return id;
    }

    public String getTopicId() {
        return topicId;
    }

    public int getSummaryNumber() {
        return summaryNumber;
    }

    public String getSummary() {
        return summary;
    }

    public String getSourcesJson() {
        return sourcesJson;
    }

    public Instant getGeneratedAt() {
        return generatedAt;
    }
}
@@ -0,0 +1,13 @@
package com.aiteacher.topic;
import org.springframework.data.jpa.repository.JpaRepository;
import java.util.List;
import java.util.UUID;
/**
* Spring Data JPA repository for {@link TopicSummaryEntity} rows, keyed by UUID.
* Both methods rely on Spring Data deriving the query from the method name.
*/
public interface TopicSummaryRepository extends JpaRepository<TopicSummaryEntity, UUID> {
/** Summaries of the given topic, ordered by ascending summary number. */
List<TopicSummaryEntity> findByTopicIdOrderBySummaryNumberAsc(String topicId);
/** Number of summaries stored for the given topic. */
long countByTopicId(String topicId);
}
@@ -2,8 +2,11 @@ package com.aiteacher.topic;
import java.time.Instant;
import java.util.List;
import java.util.UUID;
public record TopicSummaryResponse(
UUID id,
int summaryNumber,
String topicId,
String topicName,
String summary,
@@ -11,6 +14,7 @@ public record TopicSummaryResponse(
Instant generatedAt
) {
public record SourceReference(
String bookId,
String bookTitle,
Integer page
) {
@@ -1,21 +1,25 @@
package com.aiteacher.topic;
import com.aiteacher.book.Book;
import com.aiteacher.book.BookRepository;
import com.aiteacher.book.BookStatus;
import com.aiteacher.book.NoKnowledgeSourceException;
import com.aiteacher.document.FigureEntity;
import com.aiteacher.document.SectionEntity;
import com.aiteacher.retrieval.NeurosurgeryRetriever;
import com.aiteacher.retrieval.RetrievalResult;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.ai.chat.client.ChatClient;
import org.springframework.ai.chat.client.advisor.vectorstore.QuestionAnswerAdvisor;
import org.springframework.ai.chat.model.ChatResponse;
import org.springframework.ai.document.Document;
import org.springframework.ai.vectorstore.VectorStore;
import org.springframework.stereotype.Service;
import java.time.Instant;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.UUID;
@Service
public class TopicSummaryService {
@@ -29,80 +33,190 @@ public class TopicSummaryService {
When answering:
- Structure your response clearly with key points
- If the context mentions specific book titles and page numbers, reference them
- Cite claims using ONLY the reference labels provided in the context (e.g. [S1], [F2]).
Do not invent page numbers, section titles, or labels not present in the CONTEXT block.
- If the retrieved context does not contain sufficient information on the topic,
explicitly state: "The uploaded books do not contain sufficient information on this topic."
- Never hallucinate or fabricate clinical information
""";
private final ChatClient chatClient;
private final VectorStore vectorStore;
private final BookRepository bookRepository;
private final NeurosurgeryRetriever retriever;
private final TopicSummaryRepository summaryRepository;
private final ObjectMapper objectMapper;
public TopicSummaryService(ChatClient chatClient, VectorStore vectorStore,
BookRepository bookRepository) {
public TopicSummaryService(ChatClient chatClient,
BookRepository bookRepository,
NeurosurgeryRetriever retriever,
TopicSummaryRepository summaryRepository,
ObjectMapper objectMapper) {
this.chatClient = chatClient;
this.vectorStore = vectorStore;
this.bookRepository = bookRepository;
this.retriever = retriever;
this.summaryRepository = summaryRepository;
this.objectMapper = objectMapper;
}
/**
 * Generates a summary for the given topic, stores it and returns it.
 *
 * <p>Steps visible in this method: require at least one book with status READY, build the
 * question from the topic, retrieve sections and figures for every READY book, send the
 * assembled context prompt to the chat client, persist a {@code TopicSummaryEntity} and
 * return a {@code TopicSummaryResponse} carrying the saved entity's id.
 *
 * <p>NOTE(review): this span contains both an advisor-based flow ({@code existsByStatus},
 * {@code QuestionAnswerAdvisor}, {@code extractSources}, {@code Instant.now()} as last
 * argument) and a retriever-based flow. They look like the before/after lines of one change
 * shown without +/- markers; presumably only the retriever-based lines are current. Confirm.
 *
 * @param topic topic whose id, name and description are read
 * @return the generated summary with its sources and generation timestamp
 * @throws NoKnowledgeSourceException when no book has status READY
 */
public TopicSummaryResponse generateSummary(Topic topic) {
if (!bookRepository.existsByStatus(BookStatus.READY)) {
// Loads all books and keeps the READY ones; the list is reused for retrieval and for buildSources.
List<Book> readyBooks = bookRepository.findAll().stream()
.filter(b -> b.getStatus() == BookStatus.READY)
.toList();
if (readyBooks.isEmpty()) {
throw new NoKnowledgeSourceException(
"No books are available as knowledge sources. Please upload and process at least one book.");
}
String question = buildQuestion(topic);
ChatResponse response = chatClient.prompt()
.system(SYSTEM_PROMPT)
.advisors(QuestionAnswerAdvisor.builder(vectorStore).build())
.user(question)
.call()
.chatResponse();
// Retrieval runs once per READY book; results are concatenated in book order.
List<SectionEntity> allSections = new ArrayList<>();
List<FigureEntity> allFigures = new ArrayList<>();
for (Book book : readyBooks) {
RetrievalResult result = retriever.retrieve(question, book.getId());
allSections.addAll(result.parentSections());
allFigures.addAll(result.figures());
}
String summary = response.getResult().getOutput().getText();
List<TopicSummaryResponse.SourceReference> sources = extractSources(response);
log.debug("Topic summary for '{}': {} sections, {} figures retrieved",
topic.getName(), allSections.size(), allFigures.size());
// The model receives the retrieved text inline in the user message (see buildContextPrompt).
String contextPrompt = buildContextPrompt(question, allSections, allFigures);
String summary = chatClient.prompt()
.system(SYSTEM_PROMPT)
.user(contextPrompt)
.call()
.content();
List<TopicSummaryResponse.SourceReference> sources = buildSources(allSections, allFigures, readyBooks);
// One timestamp is captured here and used for both the stored entity and the response.
Instant generatedAt = Instant.now();
// NOTE(review): the number is derived as "existing count + 1" before the save; two
// concurrent generations for the same topic could presumably get the same number. Confirm
// whether that can happen and whether it matters.
int summaryNumber = (int) summaryRepository.countByTopicId(topic.getId()) + 1;
String sourcesJson = serializeSources(sources);
TopicSummaryEntity entity = new TopicSummaryEntity(
topic.getId(), summaryNumber, summary, sourcesJson, generatedAt);
// The saved instance is kept so that its id can be returned to the caller.
entity = summaryRepository.save(entity);
return new TopicSummaryResponse(
entity.getId(),
summaryNumber,
topic.getId(),
topic.getName(),
summary,
sources,
Instant.now()
generatedAt
);
}
/**
 * Lists the stored summaries of one topic as lightweight items.
 *
 * <p>Rows come from the repository query ordered by summary number (ascending) and are
 * mapped one-to-one, in that order, to id / summary number / generation time.
 *
 * @param topicId identifier of the topic whose summaries are listed
 * @return an unmodifiable list with one item per stored summary
 */
public List<SavedSummaryItem> listSummaries(String topicId) {
    List<SavedSummaryItem> items = new ArrayList<>();
    for (var row : summaryRepository.findByTopicIdOrderBySummaryNumberAsc(topicId)) {
        items.add(new SavedSummaryItem(row.getId(), row.getSummaryNumber(), row.getGeneratedAt()));
    }
    // Hand out an unmodifiable snapshot, as the stream-based form did.
    return List.copyOf(items);
}
/**
 * Loads a previously stored summary by its identifier.
 *
 * @param summaryId identifier of the stored summary
 * @return the stored summary, with its sources decoded from the stored JSON
 * @throws NoSuchElementException when no summary exists for {@code summaryId}
 */
public TopicSummaryResponse getSummary(UUID summaryId) {
    return summaryRepository.findById(summaryId)
            .map(this::toSummaryResponse)
            .orElseThrow(() -> new NoSuchElementException("Summary not found."));
}

/** Converts a stored summary row into the response shape returned to callers. */
private TopicSummaryResponse toSummaryResponse(TopicSummaryEntity stored) {
    List<TopicSummaryResponse.SourceReference> decodedSources =
            deserializeSources(stored.getSourcesJson());
    // NOTE(review): the topic id is passed twice. generateSummary passes topic.getName() in
    // the fourth position; no name is read from the entity here. Confirm this is intended.
    return new TopicSummaryResponse(
            stored.getId(),
            stored.getSummaryNumber(),
            stored.getTopicId(),
            stored.getTopicId(),
            stored.getSummary(),
            decodedSources,
            stored.getGeneratedAt());
}
/**
 * Builds the question text sent for retrieval and summarisation of a topic.
 *
 * <p>The topic name and description are substituted into the two {@code %s} placeholders.
 *
 * <p>NOTE(review): two alternative opening sentences ("Please provide ..." and "Provide ...")
 * are concatenated below. That looks like the before/after of a one-line change shown
 * without +/- markers; presumably only the second is current. Confirm.
 *
 * @param topic topic whose name and description are inserted
 * @return the formatted question
 */
private String buildQuestion(Topic topic) {
return String.format(
"Please provide a comprehensive educational summary of the following neurosurgery topic: " +
"Provide a comprehensive educational summary of the following neurosurgery topic: " +
"%s. Topic description: %s. " +
"Include key concepts, clinical considerations, and important details that a neurosurgeon should know.",
topic.getName(), topic.getDescription()
);
}
// NOTE(review): lines of two methods are interleaved in this span: extractSources(ChatResponse)
// and buildContextPrompt(...). It looks like a change view without +/- markers in which
// extractSources was presumably replaced by buildContextPrompt. Confirm against the real file.
private List<TopicSummaryResponse.SourceReference> extractSources(ChatResponse response) {
List<TopicSummaryResponse.SourceReference> sources = new ArrayList<>();
/**
 * Assembles the user message sent to the chat model.
 *
 * <p>Layout, in order: an optional "CONTEXT:" part with one labelled entry per section, an
 * optional "AVAILABLE FIGURES:" part with one labelled line per figure, then "QUESTION:"
 * followed by the question. A part is omitted when its list is empty.
 *
 * @param question question text appended last
 * @param sections sections whose title, start page and full text are included
 * @param figures  figures whose label, page and caption are included
 * @return the assembled prompt text
 */
private String buildContextPrompt(String question,
List<SectionEntity> sections,
List<FigureEntity> figures) {
StringBuilder sb = new StringBuilder();
// extractSources part: reads the documents stored in the response metadata under
// QuestionAnswerAdvisor.RETRIEVED_DOCUMENTS and turns their "book_title" / "page_number"
// metadata into source references; documents without a book title are skipped and a
// non-numeric page becomes null.
if (response.getMetadata() != null) {
Object retrieved = response.getMetadata().get(QuestionAnswerAdvisor.RETRIEVED_DOCUMENTS);
if (retrieved instanceof List<?> docs) {
for (Object docObj : docs) {
if (docObj instanceof Document doc) {
Map<String, Object> metadata = doc.getMetadata();
String bookTitle = (String) metadata.get("book_title");
Object pageObj = metadata.get("page_number");
Integer page = pageObj instanceof Number n ? n.intValue() : null;
if (bookTitle != null) {
sources.add(new TopicSummaryResponse.SourceReference(bookTitle, page));
}
}
}
// buildContextPrompt part: each section gets the label [S<n>], where n is its 1-based
// position in the list, followed by title and start page on one line and the full text below.
if (!sections.isEmpty()) {
sb.append("CONTEXT:\n\n");
for (int i = 0; i < sections.size(); i++) {
SectionEntity s = sections.get(i);
sb.append("[S").append(i + 1).append("] ")
.append(s.getTitle()).append(", p.").append(s.getPageStart()).append("\n");
sb.append(s.getFullText()).append("\n\n");
}
}
// Deduplicate by bookTitle + page
// Each figure gets the label [F<n>] (1-based position); a null label is rendered as
// "Figure" and a null caption as an empty string.
if (!figures.isEmpty()) {
sb.append("AVAILABLE FIGURES:\n");
for (int i = 0; i < figures.size(); i++) {
FigureEntity f = figures.get(i);
sb.append("[F").append(i + 1).append("] ")
.append(f.getLabel() != null ? f.getLabel() : "Figure")
.append(" (p.").append(f.getPage()).append("): ")
.append(f.getCaption() != null ? f.getCaption() : "")
.append("\n");
}
sb.append("\n");
}
// The question always comes last, after any context and figure parts.
sb.append("QUESTION:\n").append(question);
return sb.toString();
}
/**
 * Builds the list of source references for the retrieved sections and figures.
 *
 * <p>Every section contributes a reference with its start page and every figure one with
 * its page; sections come first. The owning book is looked up among {@code readyBooks} by
 * id. When no book matches, the title falls back to "Book" and the book id to {@code null}.
 * Duplicates are removed at the end using the reference type's own equality
 * (presumably value-based; confirm on {@code SourceReference}).
 *
 * @param sections   retrieved sections
 * @param figures    retrieved figures
 * @param readyBooks books that may own the sections and figures
 * @return the distinct references in first-seen order, as an unmodifiable list
 */
private List<TopicSummaryResponse.SourceReference> buildSources(List<SectionEntity> sections,
                                                                List<FigureEntity> figures,
                                                                List<Book> readyBooks) {
    List<TopicSummaryResponse.SourceReference> collected = new ArrayList<>();

    for (SectionEntity section : sections) {
        Book owner = firstBookWithId(readyBooks, section.getBookId());
        String ownerTitle = (owner == null) ? "Book" : owner.getTitle();
        String ownerId = (owner == null) ? null : owner.getId().toString();
        collected.add(new TopicSummaryResponse.SourceReference(ownerId, ownerTitle, section.getPageStart()));
    }

    for (FigureEntity figure : figures) {
        Book owner = firstBookWithId(readyBooks, figure.getBookId());
        String ownerTitle = (owner == null) ? "Book" : owner.getTitle();
        String ownerId = (owner == null) ? null : owner.getId().toString();
        collected.add(new TopicSummaryResponse.SourceReference(ownerId, ownerTitle, figure.getPage()));
    }

    return collected.stream().distinct().toList();
}

/** Returns the first book whose id equals {@code bookId}, or {@code null} when none does. */
private Book firstBookWithId(List<Book> candidates, Object bookId) {
    for (Book candidate : candidates) {
        if (candidate.getId().equals(bookId)) {
            return candidate;
        }
    }
    return null;
}
/**
 * Encodes the source references as a JSON string for storage.
 *
 * <p>Best effort: when encoding fails, the failure is logged at WARN level and the JSON
 * text of an empty array is returned instead, so the summary can still be saved.
 *
 * @param sources references to encode
 * @return the JSON text, or {@code "[]"} when encoding fails
 */
private String serializeSources(List<TopicSummaryResponse.SourceReference> sources) {
    String encoded;
    try {
        encoded = objectMapper.writeValueAsString(sources);
    } catch (JsonProcessingException e) {
        log.warn("Failed to serialize sources, storing empty array", e);
        encoded = "[]";
    }
    return encoded;
}
/**
 * Decodes stored JSON back into a list of source references.
 *
 * <p>Best effort: when the JSON cannot be processed, the failure is logged at WARN level
 * and an empty list is returned.
 *
 * @param json stored JSON text
 * @return the decoded references, or an empty list when decoding fails
 */
private List<TopicSummaryResponse.SourceReference> deserializeSources(String json) {
    // Describe the target as "List of SourceReference" so the element type is known to the mapper.
    var referenceListType = objectMapper.getTypeFactory()
            .constructCollectionType(List.class, TopicSummaryResponse.SourceReference.class);
    try {
        return objectMapper.readValue(json, referenceListType);
    } catch (JsonProcessingException e) {
        log.warn("Failed to deserialize sources from stored JSON", e);
        return List.of();
    }
}
}
+4 -2
View File
@@ -30,7 +30,7 @@ spring:
api-key: ${OPENAI_API_KEY}
chat:
options:
model: gpt-4o
model: gpt-4o-mini
embedding:
options:
model: "text-embedding-3-small"
@@ -68,6 +68,8 @@ app:
embedding:
batch-size: 20
batch-delay-ms: 2000
skip-embedding: true
skip-embedding: false
marker:
base-url: ${MARKER_BASE_URL:http://192.168.1.105:8000}
vision:
min-interval-ms: ${VISION_MIN_INTERVAL_MS:2000}
@@ -0,0 +1,10 @@
-- Stored topic summaries: one row per generated summary.
--   id             row identifier, generated by the database when not supplied
--   topic_id       identifier of the topic the summary belongs to
--   summary_number number of the summary within its topic (presumably assigned by the
--                  application; confirm against the writer)
--   summary        generated summary text
--   sources_json   source references as text (presumably a JSON array; confirm against the writer)
--   generated_at   time the summary was generated, with time zone
-- NOTE(review): gen_random_uuid() is built in from PostgreSQL 13; earlier versions need the
-- pgcrypto extension. Confirm the target database version.
CREATE TABLE topic_summary (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
topic_id VARCHAR(100) NOT NULL,
summary_number INT NOT NULL,
summary TEXT NOT NULL,
sources_json TEXT NOT NULL,
generated_at TIMESTAMPTZ NOT NULL
);
-- Composite index for per-topic lookups ordered by summary_number. It is not UNIQUE, so the
-- database does not prevent two rows with the same (topic_id, summary_number).
CREATE INDEX idx_topic_summary_topic_id ON topic_summary(topic_id, summary_number);