enhance rag retrieval + summary
This commit is contained in:
@@ -92,7 +92,7 @@ public class BookEmbeddingService {
|
||||
ChapterEntity chapter = new ChapterEntity(chapterId, bookId, 1, bookTitle, 1);
|
||||
chapterRepository.save(chapter);
|
||||
|
||||
// Step 1: Parse with Marker — JSON (structured) + Markdown (per-page) in parallel
|
||||
// Step 1: Parse with Marker — split into 100-page chunks, then merge results
|
||||
ParsedBook parsed = markerPageParser.parse(pdfPath);
|
||||
|
||||
List<PageResult> pageResults = parsed.pages();
|
||||
@@ -125,25 +125,32 @@ public class BookEmbeddingService {
|
||||
log.info("Saved {} HTML pages to S3 for book {}", parsed.htmlByPage().size(), bookId);
|
||||
|
||||
// Step 5: Vision analysis (description + visible text) → embed figure chunks
|
||||
for (FigureEntity figure : figures) {
|
||||
byte[] imageBytes = figureStorageService.getBytes(figure.getImagePath());
|
||||
VisionDescriptionService.ImageAnalysis analysis =
|
||||
visionDescriptionService.analyze(imageBytes, figure.getCaption());
|
||||
Map<String, SectionEntity> sectionById = new HashMap<>();
|
||||
for (SectionEntity s : sections) sectionById.put(s.getId(), s);
|
||||
|
||||
for (FigureEntity figure : figures) {
|
||||
// Prefer caption extracted from the linked section's full text
|
||||
if (figure.getCaption() == null || figure.getCaption().isBlank()) {
|
||||
figure.setCaption(analysis.description());
|
||||
figureRepository.save(figure);
|
||||
String sectionCaption = extractCaptionFromSection(sectionById.get(figure.getSectionId()));
|
||||
if (sectionCaption != null) {
|
||||
figure.setCaption(sectionCaption);
|
||||
figureRepository.save(figure);
|
||||
} else {
|
||||
byte[] imageBytes = figureStorageService.getBytes(figure.getImagePath());
|
||||
VisionDescriptionService.ImageAnalysis analysis =
|
||||
visionDescriptionService.analyze(imageBytes, figure.getCaption());
|
||||
figure.setCaption(analysis.description());
|
||||
figureRepository.save(figure);
|
||||
}
|
||||
}
|
||||
|
||||
// Embedding content: description + caption + visible image text
|
||||
String embeddingContent = analysis.description()
|
||||
+ (figure.getCaption() != null ? "\n" + figure.getCaption() : "")
|
||||
+ (analysis.imageText().isEmpty() ? "" : "\n" + analysis.imageText());
|
||||
// Embedding content: description
|
||||
String embeddingContent = (figure.getCaption() != null ? "\n" + figure.getCaption() : "");
|
||||
|
||||
String embeddingId = UUID.randomUUID().toString();
|
||||
if (!skipEmbedding) {
|
||||
Document figureDoc = new Document(embeddingId, embeddingContent,
|
||||
buildFigureMetadata(figure, bookTitle, embeddingId, analysis.imageText()));
|
||||
buildFigureMetadata(figure, bookTitle, embeddingId, ""));
|
||||
vectorStore.add(List.of(figureDoc));
|
||||
figure.setCaptionEmbeddingId(UUID.fromString(embeddingId));
|
||||
}
|
||||
@@ -163,7 +170,7 @@ public class BookEmbeddingService {
|
||||
}
|
||||
|
||||
book.setStatus(BookStatus.READY);
|
||||
book.setPageCount(sections.size());
|
||||
book.setPageCount(parsed.htmlByPage().size());
|
||||
book.setProcessedAt(Instant.now());
|
||||
bookRepository.save(book);
|
||||
|
||||
@@ -210,7 +217,7 @@ public class BookEmbeddingService {
|
||||
if (page.orderedText().isBlank()) continue;
|
||||
|
||||
String sectionId = bookId + "-p" + page.pageNumber();
|
||||
String title = page.headingTitle() != null ? page.headingTitle() : "Page " + page.pageNumber();
|
||||
String title = truncate(page.headingTitle() != null ? page.headingTitle() : "Page " + page.pageNumber(), 500);
|
||||
|
||||
SectionEntity section = new SectionEntity(
|
||||
sectionId, chapterId, bookId,
|
||||
@@ -271,6 +278,17 @@ public class BookEmbeddingService {
|
||||
return html;
|
||||
}
|
||||
|
||||
private String extractCaptionFromSection(SectionEntity section) {
|
||||
if (section == null) return null;
|
||||
for (String line : section.getFullText().split("\n")) {
|
||||
String trimmed = line.strip();
|
||||
if (trimmed.startsWith("Fig.") || trimmed.startsWith("Figure") || trimmed.startsWith("Algorithm")) {
|
||||
return trimmed;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private String truncate(String msg, int max) {
|
||||
if (msg == null) return null;
|
||||
return msg.length() <= max ? msg : msg.substring(0, max);
|
||||
|
||||
@@ -5,10 +5,11 @@ import com.aiteacher.book.BookStatus;
|
||||
import com.aiteacher.book.NoKnowledgeSourceException;
|
||||
import com.aiteacher.document.FigureEntity;
|
||||
import com.aiteacher.document.SectionEntity;
|
||||
import com.aiteacher.retrieval.CitationValidatorService;
|
||||
import com.aiteacher.retrieval.LabelledContext;
|
||||
import com.aiteacher.retrieval.NeurosurgeryRetriever;
|
||||
import com.aiteacher.retrieval.QueryExpansionService;
|
||||
import com.aiteacher.retrieval.RetrievalResult;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.ai.chat.client.ChatClient;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
@@ -17,8 +18,6 @@ import java.util.*;
|
||||
@Service
|
||||
public class ChatService {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(ChatService.class);
|
||||
|
||||
private static final String SYSTEM_PROMPT = """
|
||||
You are an expert neurosurgery educator assistant. Answer questions using the
|
||||
medical textbook content provided to you as context.
|
||||
@@ -29,8 +28,8 @@ public class ChatService {
|
||||
- Build answers from what is present: procedures, conditions, techniques, and descriptions all contribute; combine them into a rich, structured response
|
||||
- Use clear structure: headings, bullet points, or numbered steps where appropriate to maximize clarity
|
||||
- Only say you cannot answer if the context is entirely unrelated to the question
|
||||
- Cite sources for each major point (book title and page number from the context)
|
||||
- When referencing diagrams or figures, cite them as [Fig. X, p.N]
|
||||
- Cite sources for each major claim using the reference labels from the context (e.g. [S1], [F2]). Prefer these labels over inventing page numbers, but you may also describe the source naturally if needed.
|
||||
- When referencing diagrams or figures, prefer their label from the context (e.g. [F1])
|
||||
- Maintain continuity with the conversation history
|
||||
- Never fabricate clinical information not present in the context
|
||||
""";
|
||||
@@ -40,17 +39,23 @@ public class ChatService {
|
||||
private final ChatSessionRepository sessionRepository;
|
||||
private final MessageRepository messageRepository;
|
||||
private final NeurosurgeryRetriever retriever;
|
||||
private final QueryExpansionService queryExpansionService;
|
||||
private final CitationValidatorService citationValidatorService;
|
||||
|
||||
public ChatService(ChatClient chatClient,
|
||||
BookRepository bookRepository,
|
||||
ChatSessionRepository sessionRepository,
|
||||
MessageRepository messageRepository,
|
||||
NeurosurgeryRetriever retriever) {
|
||||
NeurosurgeryRetriever retriever,
|
||||
QueryExpansionService queryExpansionService,
|
||||
CitationValidatorService citationValidatorService) {
|
||||
this.chatClient = chatClient;
|
||||
this.bookRepository = bookRepository;
|
||||
this.sessionRepository = sessionRepository;
|
||||
this.messageRepository = messageRepository;
|
||||
this.retriever = retriever;
|
||||
this.queryExpansionService = queryExpansionService;
|
||||
this.citationValidatorService = citationValidatorService;
|
||||
}
|
||||
|
||||
public ChatSession createSession(String topicId) {
|
||||
@@ -85,25 +90,34 @@ public class ChatService {
|
||||
List<Message> history = messageRepository.findBySessionIdOrderByCreatedAtAsc(sessionId);
|
||||
String fullQuestion = buildQuestionWithHistory(history, userContent, session.getTopicId());
|
||||
|
||||
// Retrieve context from all ready books (aggregate across books)
|
||||
// Expand only the current user question to clinical terminology for retrieval (US1).
|
||||
// fullQuestion (which includes conversation history) is used for the LLM context prompt,
|
||||
// but retrieval should be driven by a concise clinical rewrite of the actual question.
|
||||
String retrievalQuery = queryExpansionService.expand(userContent).rewritten();
|
||||
|
||||
// Retrieve context from all ready books using the expanded query
|
||||
List<SectionEntity> allSections = new ArrayList<>();
|
||||
List<FigureEntity> allFigures = new ArrayList<>();
|
||||
for (com.aiteacher.book.Book book : readyBooks) {
|
||||
RetrievalResult result = retriever.retrieve(fullQuestion, book.getId());
|
||||
RetrievalResult result = retriever.retrieve(retrievalQuery, book.getId());
|
||||
allSections.addAll(result.parentSections());
|
||||
allFigures.addAll(result.figures());
|
||||
}
|
||||
|
||||
// Build LLM prompt with section full texts and figure references
|
||||
String contextPrompt = buildContextPrompt(fullQuestion, allSections, allFigures);
|
||||
// Build labelled context prompt (US2): assigns [S1]/[F1] labels to each source
|
||||
LabelledContext ctx = buildContextPrompt(fullQuestion, allSections, allFigures);
|
||||
|
||||
String assistantContent = chatClient.prompt()
|
||||
// Generate answer
|
||||
String rawContent = chatClient.prompt()
|
||||
.system(SYSTEM_PROMPT)
|
||||
.user(contextPrompt)
|
||||
.user(ctx.promptText())
|
||||
.call()
|
||||
.content();
|
||||
|
||||
// Build sources list with TEXT and FIGURE entries
|
||||
// Strip any citation labels not present in the retrieved context (US2)
|
||||
String assistantContent = citationValidatorService.validate(rawContent, ctx.allLabels());
|
||||
|
||||
// Attach sources with their ref-labels for frontend traceability
|
||||
List<Map<String, Object>> sources = buildSources(allSections, allFigures);
|
||||
|
||||
Message assistantMessage = new Message(sessionId, MessageRole.ASSISTANT, assistantContent);
|
||||
@@ -126,51 +140,71 @@ public class ChatService {
|
||||
// Private helpers
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
private String buildContextPrompt(String question,
|
||||
List<SectionEntity> sections,
|
||||
List<FigureEntity> figures) {
|
||||
/**
|
||||
* Builds the LLM context prompt, tagging each section as [S1], [S2]… and
|
||||
* each figure as [F1], [F2]… so the model can cite only known sources.
|
||||
*/
|
||||
private LabelledContext buildContextPrompt(String question,
|
||||
List<SectionEntity> sections,
|
||||
List<FigureEntity> figures) {
|
||||
Map<String, SectionEntity> sectionLabels = new LinkedHashMap<>();
|
||||
Map<String, FigureEntity> figureLabels = new LinkedHashMap<>();
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
if (!sections.isEmpty()) {
|
||||
sb.append("CONTEXT:\n\n");
|
||||
for (SectionEntity section : sections) {
|
||||
sb.append("[").append(section.getTitle())
|
||||
.append(", p.").append(section.getPageStart()).append("]\n");
|
||||
for (int i = 0; i < sections.size(); i++) {
|
||||
SectionEntity section = sections.get(i);
|
||||
String label = "S" + (i + 1);
|
||||
sectionLabels.put(label, section);
|
||||
sb.append("[").append(label).append("] ")
|
||||
.append(section.getTitle())
|
||||
.append(", p.").append(section.getPageStart()).append("\n");
|
||||
sb.append(section.getFullText()).append("\n\n");
|
||||
}
|
||||
}
|
||||
|
||||
if (!figures.isEmpty()) {
|
||||
sb.append("AVAILABLE FIGURES:\n");
|
||||
for (FigureEntity figure : figures) {
|
||||
sb.append("- ").append(figure.getLabel() != null ? figure.getLabel() : "Figure")
|
||||
for (int i = 0; i < figures.size(); i++) {
|
||||
FigureEntity figure = figures.get(i);
|
||||
String label = "F" + (i + 1);
|
||||
figureLabels.put(label, figure);
|
||||
sb.append("[").append(label).append("] ")
|
||||
.append(figure.getLabel() != null ? figure.getLabel() : "Figure")
|
||||
.append(" (p.").append(figure.getPage()).append("): ")
|
||||
.append(figure.getCaption() != null ? figure.getCaption() : "")
|
||||
.append("\n");
|
||||
}
|
||||
sb.append("\nWhen referencing diagrams, cite them as [Fig. X, p.N].\n\n");
|
||||
sb.append("\nWhen referencing diagrams, use their label from the context (e.g. [F1]).\n\n");
|
||||
}
|
||||
|
||||
sb.append("QUESTION:\n").append(question);
|
||||
return sb.toString();
|
||||
return new LabelledContext(sectionLabels, figureLabels, sb.toString());
|
||||
}
|
||||
|
||||
private List<Map<String, Object>> buildSources(List<SectionEntity> sections,
|
||||
List<FigureEntity> figures) {
|
||||
List<Map<String, Object>> sources = new ArrayList<>();
|
||||
|
||||
for (SectionEntity section : sections) {
|
||||
for (int i = 0; i < sections.size(); i++) {
|
||||
SectionEntity section = sections.get(i);
|
||||
Map<String, Object> source = new LinkedHashMap<>();
|
||||
source.put("type", "TEXT");
|
||||
source.put("refLabel", "S" + (i + 1));
|
||||
source.put("bookId", section.getBookId());
|
||||
source.put("bookTitle", deriveTitleFromSection(section));
|
||||
source.put("page", section.getPageStart());
|
||||
source.put("chunkText", truncate(section.getFullText(), 500));
|
||||
sources.add(source);
|
||||
}
|
||||
|
||||
for (FigureEntity figure : figures) {
|
||||
for (int i = 0; i < figures.size(); i++) {
|
||||
FigureEntity figure = figures.get(i);
|
||||
Map<String, Object> source = new LinkedHashMap<>();
|
||||
source.put("type", "FIGURE");
|
||||
source.put("refLabel", "F" + (i + 1));
|
||||
source.put("bookId", figure.getBookId());
|
||||
source.put("bookTitle", bookRepository.findById(figure.getBookId())
|
||||
.map(com.aiteacher.book.Book::getTitle).orElse("Book"));
|
||||
source.put("page", figure.getPage());
|
||||
@@ -178,7 +212,6 @@ public class ChatService {
|
||||
source.put("label", figure.getLabel() != null ? figure.getLabel() : "");
|
||||
source.put("caption", figure.getCaption() != null ? figure.getCaption() : "");
|
||||
source.put("figureType", figure.getFigureType().name());
|
||||
// imageUrl assembled from relative path: figures/{bookId}/{filename}
|
||||
String filename = figure.getImagePath().substring(
|
||||
figure.getImagePath().lastIndexOf('/') + 1);
|
||||
source.put("imageUrl", "/api/v1/figures/" + figure.getBookId() + "/" + filename);
|
||||
|
||||
@@ -17,6 +17,7 @@ import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
* Parses a PDF through the Marker server using {@code output_format=json}; documents longer than one chunk are split, submitted chunk by chunk, and merged.
|
||||
*
|
||||
@@ -46,19 +47,65 @@ public class MarkerPageParser {
|
||||
);
|
||||
private static final Set<String> FIGURE_BLOCK_TYPES = Set.of("Figure", "Picture", "FigureGroup", "PictureGroup");
|
||||
|
||||
private static final int CHUNK_SIZE = 100;
|
||||
|
||||
private static final ObjectMapper MAPPER = new ObjectMapper();
|
||||
|
||||
private final RestClient restClient;
|
||||
private final PdfSplitterService pdfSplitterService;
|
||||
|
||||
public MarkerPageParser(@Qualifier("markerRestClient") RestClient restClient) {
|
||||
/**
 * Wires the parser with its collaborators.
 *
 * @param restClient         REST client registered under the {@code markerRestClient} qualifier
 * @param pdfSplitterService service used to cut a PDF into chunks before submission
 */
public MarkerPageParser(
        @Qualifier("markerRestClient") RestClient restClient,
        PdfSplitterService pdfSplitterService) {
    this.pdfSplitterService = pdfSplitterService;
    this.restClient = restClient;
}
|
||||
|
||||
public ParsedBook parse(Path pdfPath) {
|
||||
log.info("Submitting {} to Marker (json)", pdfPath.getFileName());
|
||||
/**
|
||||
* Parses a PDF by splitting it into {@value #CHUNK_SIZE}-page chunks, submitting each
|
||||
* chunk to Marker individually, and merging the results into a single {@link ParsedBook}.
|
||||
* Page numbers in the merged result are absolute (1-based across the whole document).
|
||||
*/
|
||||
public ParsedBook parse(Path pdfPath) throws IOException {
|
||||
List<PdfSplitterService.PdfChunk> chunks = pdfSplitterService.split(pdfPath, CHUNK_SIZE);
|
||||
log.info("Processing {} chunk(s) for {}", chunks.size(), pdfPath.getFileName());
|
||||
|
||||
List<PageResult> allPages = new ArrayList<>();
|
||||
Map<Integer, String> allHtml = new LinkedHashMap<>();
|
||||
|
||||
try {
|
||||
for (int c = 0; c < chunks.size(); c++) {
|
||||
PdfSplitterService.PdfChunk chunk = chunks.get(c);
|
||||
log.info("Submitting chunk {}/{} to Marker (page offset {})", c + 1, chunks.size(), chunk.pageOffset());
|
||||
|
||||
ParsedBook chunkResult = submitChunk(chunk.tempFile());
|
||||
|
||||
// Rebase page numbers from chunk-relative to document-absolute
|
||||
for (PageResult page : chunkResult.pages()) {
|
||||
int absolutePage = chunk.pageOffset() + page.pageNumber();
|
||||
allPages.add(new PageResult(absolutePage, page.orderedText(), page.headingTitle(), page.figures()));
|
||||
}
|
||||
chunkResult.htmlByPage().forEach((chunkPage, html) ->
|
||||
allHtml.put(chunk.pageOffset() + chunkPage, html));
|
||||
}
|
||||
} finally {
|
||||
// Delete temporary chunk files (skip if the chunk is the original PDF)
|
||||
for (PdfSplitterService.PdfChunk chunk : chunks) {
|
||||
if (!chunk.tempFile().equals(pdfPath)) {
|
||||
try { Files.deleteIfExists(chunk.tempFile()); }
|
||||
catch (IOException e) { log.warn("Could not delete temp chunk {}", chunk.tempFile()); }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
log.info("Marker produced {} non-empty pages from {} chunk(s) of {}",
|
||||
allPages.size(), chunks.size(), pdfPath.getFileName());
|
||||
return new ParsedBook(allPages, allHtml);
|
||||
}
|
||||
|
||||
/** Submits a single PDF file to Marker and returns the parsed result with chunk-relative page numbers. */
|
||||
private ParsedBook submitChunk(Path chunkPath) {
|
||||
MultiValueMap<String, Object> body = new LinkedMultiValueMap<>();
|
||||
body.add("file", new FileSystemResource(pdfPath));
|
||||
body.add("file", new FileSystemResource(chunkPath));
|
||||
body.add("output_format", "json");
|
||||
|
||||
JsonNode response = restClient.post()
|
||||
@@ -76,28 +123,29 @@ public class MarkerPageParser {
|
||||
|
||||
List<JsonNode> pageNodes = extractPages(response);
|
||||
if (pageNodes.isEmpty()) {
|
||||
log.warn("Marker returned no pages for {}", pdfPath.getFileName());
|
||||
log.warn("Marker returned no pages for chunk {}", chunkPath.getFileName());
|
||||
return new ParsedBook(List.of(), Map.of());
|
||||
}
|
||||
log.info("Marker returned {} pages for {}", pageNodes.size(), pdfPath.getFileName());
|
||||
|
||||
List<PageResult> pages = new ArrayList<>();
|
||||
Map<Integer, String> htmlByPage = new LinkedHashMap<>();
|
||||
|
||||
for (int i = 0; i < pageNodes.size(); i++) {
|
||||
JsonNode pageNode = pageNodes.get(i);
|
||||
int pageNumber = i + 1; // 1-based
|
||||
int pageNumber = i + 1; // 1-based, chunk-relative
|
||||
|
||||
PageResult result = buildPageResult(pageNode, pageNumber);
|
||||
String html = jsonToHtml(pageNode);
|
||||
|
||||
// Always save HTML so the reader can navigate to every page
|
||||
htmlByPage.put(pageNumber, html);
|
||||
|
||||
// Only queue for embedding if the page has extractable content
|
||||
if (!result.orderedText().isBlank() || !result.figures().isEmpty()) {
|
||||
pages.add(result);
|
||||
htmlByPage.put(pageNumber, html);
|
||||
}
|
||||
}
|
||||
|
||||
log.info("Marker produced {} non-empty pages from {}", pages.size(), pdfPath.getFileName());
|
||||
return new ParsedBook(pages, htmlByPage);
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,72 @@
|
||||
package com.aiteacher.document;
|
||||
|
||||
import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
|
||||
import org.apache.pdfbox.multipdf.Splitter;
|
||||
import org.apache.pdfbox.pdfparser.PDFParser;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Splits a PDF file into fixed-size chunks using PDFBox.
|
||||
* Each chunk is saved as a temporary file so it can be submitted independently to Marker.
|
||||
*/
|
||||
@Service
|
||||
public class PdfSplitterService {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(PdfSplitterService.class);
|
||||
|
||||
/**
|
||||
* A chunk of a split PDF.
|
||||
*
|
||||
* @param tempFile path to the temporary PDF file (caller must delete when done)
|
||||
* @param pageOffset 0-based index of the first page in this chunk within the original document
|
||||
*/
|
||||
public record PdfChunk(Path tempFile, int pageOffset) {}
|
||||
|
||||
/**
|
||||
* Splits {@code pdfPath} into chunks of at most {@code maxPagesPerChunk} pages.
|
||||
* Returns a single-element list when the document fits in one chunk.
|
||||
*
|
||||
* @param pdfPath source PDF
|
||||
* @param maxPagesPerChunk maximum pages per chunk
|
||||
* @return ordered list of chunks; caller is responsible for deleting {@code tempFile}s
|
||||
*/
|
||||
public List<PdfChunk> split(Path pdfPath, int maxPagesPerChunk) throws IOException {
|
||||
try (PDDocument doc = new PDFParser(new RandomAccessReadBufferedFile(pdfPath.toFile())).parse()) {
|
||||
int totalPages = doc.getNumberOfPages();
|
||||
log.info("PDF {} has {} pages, splitting into chunks of {}", pdfPath.getFileName(), totalPages, maxPagesPerChunk);
|
||||
|
||||
if (totalPages <= maxPagesPerChunk) {
|
||||
// No split needed — return the original file as a single virtual chunk
|
||||
return List.of(new PdfChunk(pdfPath, 0));
|
||||
}
|
||||
|
||||
Splitter splitter = new Splitter();
|
||||
splitter.setSplitAtPage(maxPagesPerChunk);
|
||||
List<PDDocument> parts = splitter.split(doc);
|
||||
|
||||
List<PdfChunk> chunks = new ArrayList<>(parts.size());
|
||||
int offset = 0;
|
||||
for (PDDocument part : parts) {
|
||||
try {
|
||||
Path tmp = Files.createTempFile("marker-chunk-", ".pdf");
|
||||
part.save(tmp.toFile());
|
||||
chunks.add(new PdfChunk(tmp, offset));
|
||||
log.debug("Created chunk at {} (page offset {})", tmp, offset);
|
||||
offset += part.getNumberOfPages();
|
||||
} finally {
|
||||
part.close();
|
||||
}
|
||||
}
|
||||
return chunks;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,6 +1,8 @@
|
||||
package com.aiteacher.document;
|
||||
|
||||
import org.springframework.data.jpa.repository.JpaRepository;
|
||||
import org.springframework.data.jpa.repository.Query;
|
||||
import org.springframework.data.repository.query.Param;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
@@ -8,4 +10,10 @@ import java.util.UUID;
|
||||
public interface SectionRepository extends JpaRepository<SectionEntity, String> {
|
||||
List<SectionEntity> findAllByBookId(UUID bookId);
|
||||
void deleteAllByBookId(UUID bookId);
|
||||
|
||||
@Query("SELECT s FROM SectionEntity s WHERE s.bookId = :bookId AND s.pageStart <= :windowEnd AND s.pageEnd >= :windowStart ORDER BY s.pageStart")
|
||||
List<SectionEntity> findByBookIdAndPageOverlap(
|
||||
@Param("bookId") UUID bookId,
|
||||
@Param("windowStart") int windowStart,
|
||||
@Param("windowEnd") int windowEnd);
|
||||
}
|
||||
|
||||
@@ -3,6 +3,7 @@ package com.aiteacher.document;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.ai.chat.client.ChatClient;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.core.io.ByteArrayResource;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.util.MimeTypeUtils;
|
||||
@@ -32,10 +33,16 @@ public class VisionDescriptionService {
|
||||
IMAGE_TEXT: <all visible text, labels, measurements, and annotations copied verbatim, comma-separated; write NONE if no text visible>
|
||||
""";
|
||||
|
||||
/** Minimum ms between vision API calls. Configurable via app.vision.min-interval-ms. */
|
||||
private final long minIntervalMs;
|
||||
private final ChatClient chatClient;
|
||||
private volatile long lastCallAt = 0;
|
||||
|
||||
public VisionDescriptionService(ChatClient chatClient) {
|
||||
/**
 * Creates the service.
 *
 * @param chatClient    chat client used to send the vision prompt
 * @param minIntervalMs minimum milliseconds between vision calls, read from
 *                      {@code app.vision.min-interval-ms} (2000 when the property is absent)
 */
public VisionDescriptionService(
        ChatClient chatClient,
        @Value("${app.vision.min-interval-ms:2000}") long minIntervalMs) {
    this.minIntervalMs = minIntervalMs;
    this.chatClient = chatClient;
}
|
||||
|
||||
/**
|
||||
@@ -55,6 +62,7 @@ public class VisionDescriptionService {
|
||||
* @param captionFallback caption detected from surrounding text, may be null
|
||||
*/
|
||||
public ImageAnalysis analyze(byte[] imageBytes, String captionFallback) {
|
||||
throttle();
|
||||
try {
|
||||
String raw = chatClient.prompt()
|
||||
.user(u -> u
|
||||
@@ -71,6 +79,15 @@ public class VisionDescriptionService {
|
||||
}
|
||||
}
|
||||
|
||||
private synchronized void throttle() {
|
||||
long now = System.currentTimeMillis();
|
||||
long wait = minIntervalMs - (now - lastCallAt);
|
||||
if (wait > 0) {
|
||||
try { Thread.sleep(wait); } catch (InterruptedException e) { Thread.currentThread().interrupt(); }
|
||||
}
|
||||
lastCallAt = System.currentTimeMillis();
|
||||
}
|
||||
|
||||
private ImageAnalysis parse(String raw, String captionFallback) {
|
||||
String description = captionFallback != null ? captionFallback : "Figure";
|
||||
String imageText = "";
|
||||
|
||||
@@ -0,0 +1,59 @@
|
||||
package com.aiteacher.retrieval;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* Post-processes generated answers to strip citation labels that do not
|
||||
* correspond to any passage retrieved for the current query, preventing
|
||||
* hallucinated source references from reaching the user.
|
||||
*/
|
||||
@Service
|
||||
public class CitationValidatorService {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(CitationValidatorService.class);
|
||||
|
||||
/** Matches citation labels of the form [S1], [F2], [S12], etc. */
|
||||
private static final Pattern CITATION_PATTERN = Pattern.compile("\\[(S|F)\\d+\\]");
|
||||
|
||||
/**
|
||||
* Removes any {@code [Sx]} / {@code [Fx]} citation in {@code generatedAnswer}
|
||||
* whose label is not contained in {@code validLabels}.
|
||||
*
|
||||
* @param generatedAnswer raw model output
|
||||
* @param validLabels set of labels present in the retrieved context
|
||||
* @return cleaned answer text with hallucinated citations removed
|
||||
*/
|
||||
public String validate(String generatedAnswer, Set<String> validLabels) {
|
||||
if (generatedAnswer == null) return "";
|
||||
|
||||
Matcher matcher = CITATION_PATTERN.matcher(generatedAnswer);
|
||||
List<String> removed = new ArrayList<>();
|
||||
StringBuffer sb = new StringBuffer();
|
||||
|
||||
while (matcher.find()) {
|
||||
String label = matcher.group();
|
||||
String inner = label.substring(1, label.length() - 1); // strip [ ]
|
||||
if (validLabels.contains(inner)) {
|
||||
matcher.appendReplacement(sb, Matcher.quoteReplacement(label));
|
||||
} else {
|
||||
removed.add(inner);
|
||||
matcher.appendReplacement(sb, "");
|
||||
}
|
||||
}
|
||||
matcher.appendTail(sb);
|
||||
|
||||
if (!removed.isEmpty()) {
|
||||
log.warn("Stripped hallucinated citations: {}", removed);
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
package com.aiteacher.retrieval;
|
||||
|
||||
/**
|
||||
* Value object holding the original user query alongside its clinically
|
||||
* rewritten variant used for vector-store retrieval.
|
||||
*/
|
||||
public record ExpandedQuery(
        String original,    // the query exactly as it was handed to the expansion step
        String rewritten    // the variant intended for vector-store retrieval
) {
}
|
||||
@@ -0,0 +1,27 @@
|
||||
package com.aiteacher.retrieval;
|
||||
|
||||
import com.aiteacher.document.FigureEntity;
|
||||
import com.aiteacher.document.SectionEntity;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Value object produced when building the LLM context prompt.
|
||||
* Maps short ref-labels (S1, S2… / F1, F2…) to their source entities
|
||||
* and carries the fully formatted prompt text.
|
||||
*/
|
||||
public record LabelledContext(
|
||||
Map<String, SectionEntity> sectionLabels,
|
||||
Map<String, FigureEntity> figureLabels,
|
||||
String promptText) {
|
||||
|
||||
/** Returns the union of all valid citation labels for this context. */
|
||||
public Set<String> allLabels() {
|
||||
Set<String> labels = new HashSet<>();
|
||||
labels.addAll(sectionLabels.keySet());
|
||||
labels.addAll(figureLabels.keySet());
|
||||
return labels;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,47 @@
|
||||
package com.aiteacher.retrieval;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.ai.chat.client.ChatClient;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
/**
|
||||
* Rewrites a user query into precise clinical/surgical terminology so that
|
||||
* vector-store retrieval can match textbook language even when the user's
|
||||
* phrasing differs from the documentation vocabulary.
|
||||
*/
|
||||
@Service
|
||||
public class QueryExpansionService {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(QueryExpansionService.class);
|
||||
|
||||
private static final String EXPANSION_PROMPT = """
|
||||
Rewrite the following question using precise medical and surgical terminology \
|
||||
as it would appear in a neurosurgery textbook index. \
|
||||
Output only the rewritten question, nothing else.
|
||||
Question: %s""";
|
||||
|
||||
private final ChatClient chatClient;
|
||||
|
||||
public QueryExpansionService(ChatClient chatClient) {
|
||||
this.chatClient = chatClient;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an {@link ExpandedQuery} whose {@code rewritten} field contains
|
||||
* the clinically rephrased version of {@code query}.
|
||||
*/
|
||||
public ExpandedQuery expand(String query) {
|
||||
String rewritten = chatClient.prompt()
|
||||
.user(EXPANSION_PROMPT.formatted(query))
|
||||
.call()
|
||||
.content();
|
||||
|
||||
if (rewritten == null || rewritten.isBlank()) {
|
||||
rewritten = query;
|
||||
}
|
||||
|
||||
log.debug("Query expanded: '{}' → '{}'", query, rewritten);
|
||||
return new ExpandedQuery(query, rewritten);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
package com.aiteacher.topic;
|
||||
|
||||
import java.time.Instant;
|
||||
import java.util.UUID;
|
||||
|
||||
/**
 * Compact list entry for one saved topic summary.
 *
 * @param id            identifier of the saved summary
 * @param summaryNumber number assigned to the summary
 * @param generatedAt   instant recorded as the summary's generation time
 */
public record SavedSummaryItem(
        UUID id,
        int summaryNumber,
        Instant generatedAt) {}
|
||||
@@ -5,6 +5,7 @@ import org.springframework.web.bind.annotation.*;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.UUID;
|
||||
|
||||
@RestController
|
||||
@RequestMapping("/api/v1/topics")
|
||||
@@ -32,4 +33,21 @@ public class TopicController {
|
||||
TopicSummaryResponse response = topicSummaryService.generateSummary(topic);
|
||||
return ResponseEntity.ok(response);
|
||||
}
|
||||
|
||||
@GetMapping("/{id}/summaries")
|
||||
public ResponseEntity<List<SavedSummaryItem>> listSummaries(@PathVariable String id) {
|
||||
topicRepository.findById(id)
|
||||
.orElseThrow(() -> new NoSuchElementException("Topic not found."));
|
||||
|
||||
return ResponseEntity.ok(topicSummaryService.listSummaries(id));
|
||||
}
|
||||
|
||||
@GetMapping("/{id}/summaries/{summaryId}")
|
||||
public ResponseEntity<TopicSummaryResponse> getSummary(@PathVariable String id,
|
||||
@PathVariable UUID summaryId) {
|
||||
topicRepository.findById(id)
|
||||
.orElseThrow(() -> new NoSuchElementException("Topic not found."));
|
||||
|
||||
return ResponseEntity.ok(topicSummaryService.getSummary(summaryId));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,53 @@
|
||||
package com.aiteacher.topic;
|
||||
|
||||
import jakarta.persistence.Column;
|
||||
import jakarta.persistence.Entity;
|
||||
import jakarta.persistence.GeneratedValue;
|
||||
import jakarta.persistence.GenerationType;
|
||||
import jakarta.persistence.Id;
|
||||
import jakarta.persistence.Table;
|
||||
|
||||
import java.time.Instant;
|
||||
import java.util.UUID;
|
||||
|
||||
@Entity
|
||||
@Table(name = "topic_summary")
|
||||
public class TopicSummaryEntity {
|
||||
|
||||
@Id
|
||||
@GeneratedValue(strategy = GenerationType.UUID)
|
||||
private UUID id;
|
||||
|
||||
@Column(name = "topic_id", nullable = false)
|
||||
private String topicId;
|
||||
|
||||
@Column(name = "summary_number", nullable = false)
|
||||
private int summaryNumber;
|
||||
|
||||
@Column(nullable = false, columnDefinition = "TEXT")
|
||||
private String summary;
|
||||
|
||||
@Column(name = "sources_json", nullable = false, columnDefinition = "TEXT")
|
||||
private String sourcesJson;
|
||||
|
||||
@Column(name = "generated_at", nullable = false)
|
||||
private Instant generatedAt;
|
||||
|
||||
protected TopicSummaryEntity() {}
|
||||
|
||||
public TopicSummaryEntity(String topicId, int summaryNumber, String summary,
|
||||
String sourcesJson, Instant generatedAt) {
|
||||
this.topicId = topicId;
|
||||
this.summaryNumber = summaryNumber;
|
||||
this.summary = summary;
|
||||
this.sourcesJson = sourcesJson;
|
||||
this.generatedAt = generatedAt;
|
||||
}
|
||||
|
||||
public UUID getId() { return id; }
|
||||
public String getTopicId() { return topicId; }
|
||||
public int getSummaryNumber() { return summaryNumber; }
|
||||
public String getSummary() { return summary; }
|
||||
public String getSourcesJson() { return sourcesJson; }
|
||||
public Instant getGeneratedAt() { return generatedAt; }
|
||||
}
|
||||
@@ -0,0 +1,13 @@
|
||||
package com.aiteacher.topic;
|
||||
|
||||
import org.springframework.data.jpa.repository.JpaRepository;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
|
||||
public interface TopicSummaryRepository extends JpaRepository<TopicSummaryEntity, UUID> {
|
||||
|
||||
List<TopicSummaryEntity> findByTopicIdOrderBySummaryNumberAsc(String topicId);
|
||||
|
||||
long countByTopicId(String topicId);
|
||||
}
|
||||
@@ -2,8 +2,11 @@ package com.aiteacher.topic;
|
||||
|
||||
import java.time.Instant;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
|
||||
public record TopicSummaryResponse(
|
||||
UUID id,
|
||||
int summaryNumber,
|
||||
String topicId,
|
||||
String topicName,
|
||||
String summary,
|
||||
@@ -11,6 +14,7 @@ public record TopicSummaryResponse(
|
||||
Instant generatedAt
|
||||
) {
|
||||
public record SourceReference(
|
||||
String bookId,
|
||||
String bookTitle,
|
||||
Integer page
|
||||
) {
|
||||
|
||||
@@ -1,21 +1,25 @@
|
||||
package com.aiteacher.topic;
|
||||
|
||||
import com.aiteacher.book.Book;
|
||||
import com.aiteacher.book.BookRepository;
|
||||
import com.aiteacher.book.BookStatus;
|
||||
import com.aiteacher.book.NoKnowledgeSourceException;
|
||||
import com.aiteacher.document.FigureEntity;
|
||||
import com.aiteacher.document.SectionEntity;
|
||||
import com.aiteacher.retrieval.NeurosurgeryRetriever;
|
||||
import com.aiteacher.retrieval.RetrievalResult;
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.ai.chat.client.ChatClient;
|
||||
import org.springframework.ai.chat.client.advisor.vectorstore.QuestionAnswerAdvisor;
|
||||
import org.springframework.ai.chat.model.ChatResponse;
|
||||
import org.springframework.ai.document.Document;
|
||||
import org.springframework.ai.vectorstore.VectorStore;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.time.Instant;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.UUID;
|
||||
|
||||
@Service
|
||||
public class TopicSummaryService {
|
||||
@@ -29,80 +33,190 @@ public class TopicSummaryService {
|
||||
|
||||
When answering:
|
||||
- Structure your response clearly with key points
|
||||
- If the context mentions specific book titles and page numbers, reference them
|
||||
- Cite claims using ONLY the reference labels provided in the context (e.g. [S1], [F2]).
|
||||
Do not invent page numbers, section titles, or labels not present in the CONTEXT block.
|
||||
- If the retrieved context does not contain sufficient information on the topic,
|
||||
explicitly state: "The uploaded books do not contain sufficient information on this topic."
|
||||
- Never hallucinate or fabricate clinical information
|
||||
""";
|
||||
|
||||
private final ChatClient chatClient;
|
||||
private final VectorStore vectorStore;
|
||||
private final BookRepository bookRepository;
|
||||
private final NeurosurgeryRetriever retriever;
|
||||
private final TopicSummaryRepository summaryRepository;
|
||||
private final ObjectMapper objectMapper;
|
||||
|
||||
/**
 * Wires the collaborators used to retrieve context, generate summaries and persist them.
 *
 * @param chatClient        client used to call the chat model
 * @param bookRepository    source of the books that can serve as knowledge sources
 * @param retriever         retriever queried once per ready book
 * @param summaryRepository storage for generated summaries
 * @param objectMapper      mapper used to (de)serialize the stored source list
 */
public TopicSummaryService(ChatClient chatClient,
                           BookRepository bookRepository,
                           NeurosurgeryRetriever retriever,
                           TopicSummaryRepository summaryRepository,
                           ObjectMapper objectMapper) {
    this.objectMapper = objectMapper;
    this.summaryRepository = summaryRepository;
    this.retriever = retriever;
    this.bookRepository = bookRepository;
    this.chatClient = chatClient;
}
|
||||
|
||||
public TopicSummaryResponse generateSummary(Topic topic) {
|
||||
if (!bookRepository.existsByStatus(BookStatus.READY)) {
|
||||
List<Book> readyBooks = bookRepository.findAll().stream()
|
||||
.filter(b -> b.getStatus() == BookStatus.READY)
|
||||
.toList();
|
||||
|
||||
if (readyBooks.isEmpty()) {
|
||||
throw new NoKnowledgeSourceException(
|
||||
"No books are available as knowledge sources. Please upload and process at least one book.");
|
||||
}
|
||||
|
||||
String question = buildQuestion(topic);
|
||||
|
||||
ChatResponse response = chatClient.prompt()
|
||||
.system(SYSTEM_PROMPT)
|
||||
.advisors(QuestionAnswerAdvisor.builder(vectorStore).build())
|
||||
.user(question)
|
||||
.call()
|
||||
.chatResponse();
|
||||
List<SectionEntity> allSections = new ArrayList<>();
|
||||
List<FigureEntity> allFigures = new ArrayList<>();
|
||||
for (Book book : readyBooks) {
|
||||
RetrievalResult result = retriever.retrieve(question, book.getId());
|
||||
allSections.addAll(result.parentSections());
|
||||
allFigures.addAll(result.figures());
|
||||
}
|
||||
|
||||
String summary = response.getResult().getOutput().getText();
|
||||
List<TopicSummaryResponse.SourceReference> sources = extractSources(response);
|
||||
log.debug("Topic summary for '{}': {} sections, {} figures retrieved",
|
||||
topic.getName(), allSections.size(), allFigures.size());
|
||||
|
||||
String contextPrompt = buildContextPrompt(question, allSections, allFigures);
|
||||
String summary = chatClient.prompt()
|
||||
.system(SYSTEM_PROMPT)
|
||||
.user(contextPrompt)
|
||||
.call()
|
||||
.content();
|
||||
|
||||
List<TopicSummaryResponse.SourceReference> sources = buildSources(allSections, allFigures, readyBooks);
|
||||
Instant generatedAt = Instant.now();
|
||||
|
||||
int summaryNumber = (int) summaryRepository.countByTopicId(topic.getId()) + 1;
|
||||
String sourcesJson = serializeSources(sources);
|
||||
TopicSummaryEntity entity = new TopicSummaryEntity(
|
||||
topic.getId(), summaryNumber, summary, sourcesJson, generatedAt);
|
||||
entity = summaryRepository.save(entity);
|
||||
|
||||
return new TopicSummaryResponse(
|
||||
entity.getId(),
|
||||
summaryNumber,
|
||||
topic.getId(),
|
||||
topic.getName(),
|
||||
summary,
|
||||
sources,
|
||||
Instant.now()
|
||||
generatedAt
|
||||
);
|
||||
}
|
||||
|
||||
public List<SavedSummaryItem> listSummaries(String topicId) {
|
||||
return summaryRepository.findByTopicIdOrderBySummaryNumberAsc(topicId).stream()
|
||||
.map(e -> new SavedSummaryItem(e.getId(), e.getSummaryNumber(), e.getGeneratedAt()))
|
||||
.toList();
|
||||
}
|
||||
|
||||
public TopicSummaryResponse getSummary(UUID summaryId) {
|
||||
TopicSummaryEntity entity = summaryRepository.findById(summaryId)
|
||||
.orElseThrow(() -> new NoSuchElementException("Summary not found."));
|
||||
|
||||
List<TopicSummaryResponse.SourceReference> sources = deserializeSources(entity.getSourcesJson());
|
||||
|
||||
return new TopicSummaryResponse(
|
||||
entity.getId(),
|
||||
entity.getSummaryNumber(),
|
||||
entity.getTopicId(),
|
||||
entity.getTopicId(),
|
||||
entity.getSummary(),
|
||||
sources,
|
||||
entity.getGeneratedAt()
|
||||
);
|
||||
}
|
||||
|
||||
private String buildQuestion(Topic topic) {
|
||||
return String.format(
|
||||
"Please provide a comprehensive educational summary of the following neurosurgery topic: " +
|
||||
"Provide a comprehensive educational summary of the following neurosurgery topic: " +
|
||||
"%s. Topic description: %s. " +
|
||||
"Include key concepts, clinical considerations, and important details that a neurosurgeon should know.",
|
||||
topic.getName(), topic.getDescription()
|
||||
);
|
||||
}
|
||||
|
||||
private List<TopicSummaryResponse.SourceReference> extractSources(ChatResponse response) {
|
||||
List<TopicSummaryResponse.SourceReference> sources = new ArrayList<>();
|
||||
private String buildContextPrompt(String question,
|
||||
List<SectionEntity> sections,
|
||||
List<FigureEntity> figures) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
if (response.getMetadata() != null) {
|
||||
Object retrieved = response.getMetadata().get(QuestionAnswerAdvisor.RETRIEVED_DOCUMENTS);
|
||||
if (retrieved instanceof List<?> docs) {
|
||||
for (Object docObj : docs) {
|
||||
if (docObj instanceof Document doc) {
|
||||
Map<String, Object> metadata = doc.getMetadata();
|
||||
String bookTitle = (String) metadata.get("book_title");
|
||||
Object pageObj = metadata.get("page_number");
|
||||
Integer page = pageObj instanceof Number n ? n.intValue() : null;
|
||||
if (bookTitle != null) {
|
||||
sources.add(new TopicSummaryResponse.SourceReference(bookTitle, page));
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!sections.isEmpty()) {
|
||||
sb.append("CONTEXT:\n\n");
|
||||
for (int i = 0; i < sections.size(); i++) {
|
||||
SectionEntity s = sections.get(i);
|
||||
sb.append("[S").append(i + 1).append("] ")
|
||||
.append(s.getTitle()).append(", p.").append(s.getPageStart()).append("\n");
|
||||
sb.append(s.getFullText()).append("\n\n");
|
||||
}
|
||||
}
|
||||
|
||||
// Deduplicate by bookTitle + page
|
||||
if (!figures.isEmpty()) {
|
||||
sb.append("AVAILABLE FIGURES:\n");
|
||||
for (int i = 0; i < figures.size(); i++) {
|
||||
FigureEntity f = figures.get(i);
|
||||
sb.append("[F").append(i + 1).append("] ")
|
||||
.append(f.getLabel() != null ? f.getLabel() : "Figure")
|
||||
.append(" (p.").append(f.getPage()).append("): ")
|
||||
.append(f.getCaption() != null ? f.getCaption() : "")
|
||||
.append("\n");
|
||||
}
|
||||
sb.append("\n");
|
||||
}
|
||||
|
||||
sb.append("QUESTION:\n").append(question);
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
private List<TopicSummaryResponse.SourceReference> buildSources(List<SectionEntity> sections,
|
||||
List<FigureEntity> figures,
|
||||
List<Book> readyBooks) {
|
||||
List<TopicSummaryResponse.SourceReference> sources = new ArrayList<>();
|
||||
|
||||
for (SectionEntity s : sections) {
|
||||
Book book = readyBooks.stream()
|
||||
.filter(b -> b.getId().equals(s.getBookId()))
|
||||
.findFirst()
|
||||
.orElse(null);
|
||||
String title = book != null ? book.getTitle() : "Book";
|
||||
String bookId = book != null ? book.getId().toString() : null;
|
||||
sources.add(new TopicSummaryResponse.SourceReference(bookId, title, s.getPageStart()));
|
||||
}
|
||||
|
||||
for (FigureEntity f : figures) {
|
||||
Book book = readyBooks.stream()
|
||||
.filter(b -> b.getId().equals(f.getBookId()))
|
||||
.findFirst()
|
||||
.orElse(null);
|
||||
String title = book != null ? book.getTitle() : "Book";
|
||||
String bookId = book != null ? book.getId().toString() : null;
|
||||
sources.add(new TopicSummaryResponse.SourceReference(bookId, title, f.getPage()));
|
||||
}
|
||||
|
||||
return sources.stream().distinct().toList();
|
||||
}
|
||||
|
||||
private String serializeSources(List<TopicSummaryResponse.SourceReference> sources) {
|
||||
try {
|
||||
return objectMapper.writeValueAsString(sources);
|
||||
} catch (JsonProcessingException e) {
|
||||
log.warn("Failed to serialize sources, storing empty array", e);
|
||||
return "[]";
|
||||
}
|
||||
}
|
||||
|
||||
private List<TopicSummaryResponse.SourceReference> deserializeSources(String json) {
|
||||
try {
|
||||
return objectMapper.readValue(json,
|
||||
objectMapper.getTypeFactory().constructCollectionType(
|
||||
List.class, TopicSummaryResponse.SourceReference.class));
|
||||
} catch (JsonProcessingException e) {
|
||||
log.warn("Failed to deserialize sources from stored JSON", e);
|
||||
return List.of();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -30,7 +30,7 @@ spring:
|
||||
api-key: ${OPENAI_API_KEY}
|
||||
chat:
|
||||
options:
|
||||
model: gpt-4o
|
||||
model: gpt-4o-mini
|
||||
embedding:
|
||||
options:
|
||||
model: "text-embedding-3-small"
|
||||
@@ -68,6 +68,8 @@ app:
|
||||
embedding:
|
||||
batch-size: 20
|
||||
batch-delay-ms: 2000
|
||||
skip-embedding: true
|
||||
skip-embedding: false
|
||||
marker:
|
||||
base-url: ${MARKER_BASE_URL:http://192.168.1.105:8000}
|
||||
vision:
|
||||
min-interval-ms: ${VISION_MIN_INTERVAL_MS:2000}
|
||||
|
||||
@@ -0,0 +1,10 @@
|
||||
-- One row per generated summary of a topic; sources are kept as JSON text.
CREATE TABLE topic_summary (
    id             UUID         PRIMARY KEY DEFAULT gen_random_uuid(),
    topic_id       VARCHAR(100) NOT NULL,
    summary_number INT          NOT NULL,
    summary        TEXT         NOT NULL,
    sources_json   TEXT         NOT NULL,
    generated_at   TIMESTAMPTZ  NOT NULL
);

-- Covers lookups by topic ordered by summary number.
CREATE INDEX idx_topic_summary_topic_id
    ON topic_summary (topic_id, summary_number);
|
||||
Reference in New Issue
Block a user