first implementation - image/drawing integration

2026-04-04 12:56:56 +02:00
parent fc5b22fba1
commit 5acfdd33c1
42 changed files with 2854 additions and 151 deletions
@@ -1,5 +1,7 @@
 package com.aiteacher.book;

+import com.aiteacher.document.FigureEntity;
+import com.aiteacher.document.FigureRepository;
 import org.springframework.http.HttpStatus;
 import org.springframework.http.ResponseEntity;
 import org.springframework.web.bind.annotation.*;
@@ -15,9 +17,11 @@ import java.util.UUID;
 public class BookController {

    private final BookService bookService;
+    private final FigureRepository figureRepository;

-    public BookController(BookService bookService) {
+    /**
+     * Creates the controller and stores both collaborators in the fields of the same name.
+     *
+     * @param bookService      service used for book lookups and re-embedding
+     * @param figureRepository repository queried for the figures of a book
+     */
+    public BookController(BookService bookService, FigureRepository figureRepository) {
        this.bookService = bookService;
+        this.figureRepository = figureRepository;
    }

    @PostMapping(consumes = "multipart/form-data")
@@ -46,6 +50,36 @@ public class BookController {
        return ResponseEntity.noContent().build();
    }

+    /**
+     * Asks the book service to re-embed an existing book.
+     *
+     * @param id identifier of the book to re-embed
+     * @return 202 Accepted with the book id and the {@code PROCESSING} status name
+     */
+    @PostMapping("/{id}/reembed")
+    public ResponseEntity<Map<String, Object>> reembed(@PathVariable UUID id) {
+        UUID reembeddedId = bookService.reembed(id).getId();
+        Map<String, Object> payload = Map.of(
+            "bookId", reembeddedId,
+            "status", BookStatus.PROCESSING.name());
+        return ResponseEntity.accepted().body(payload);
+    }
+
+    /**
+     * Lists the figures stored for a book.
+     *
+     * @param id identifier of the book
+     * @return 200 OK with one {@link FigureResponse} per figure found for the book
+     */
+    @GetMapping("/{id}/figures")
+    public ResponseEntity<List<FigureResponse>> figures(@PathVariable UUID id) {
+        // Result is discarded on purpose: the lookup is only there to reject unknown ids (404).
+        bookService.getById(id);
+        return ResponseEntity.ok(
+            figureRepository.findAllByBookId(id)
+                .stream()
+                .map(figure -> toFigureResponse(id, figure))
+                .toList());
+    }
+
+    /**
+     * Maps a stored figure to its API representation.
+     *
+     * <p>The image URL is built from the book id and the last segment of the stored image path.
+     * Both {@code /} and {@code \} count as separators so that a path written with Windows
+     * separators still yields a bare file name. A figure without an image path gets a
+     * {@code null} URL instead of failing the whole listing with a {@link NullPointerException}.
+     *
+     * @param bookId book the figure belongs to; used as a URL path segment
+     * @param f      figure entity to convert
+     * @return the response DTO; its section title is always {@code null} here
+     */
+    private FigureResponse toFigureResponse(UUID bookId, FigureEntity f) {
+        String imagePath = f.getImagePath();
+        String imageUrl = null;
+        if (imagePath != null) {
+            // lastIndexOf returns -1 when absent, so "+ 1" keeps the whole string for bare names.
+            int lastSeparator = Math.max(imagePath.lastIndexOf('/'), imagePath.lastIndexOf('\\'));
+            imageUrl = "/api/v1/figures/" + bookId + "/" + imagePath.substring(lastSeparator + 1);
+        }
+        return new FigureResponse(
+            f.getId(), f.getLabel(), f.getCaption(),
+            f.getFigureType().name(), f.getPage(), imageUrl,
+            f.getSectionId(),
+            null // section title not eagerly loaded here
+        );
+    }
+
    private Map<String, Object> toSummaryResponse(Book book) {
        return Map.of(
            "id", book.getId(),
@@ -1,41 +1,75 @@
 package com.aiteacher.book;

+import com.aiteacher.document.*;
+import com.aiteacher.figure.FigureStorageService;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.springframework.ai.document.Document;
-import org.springframework.ai.reader.pdf.PagePdfDocumentReader;
-import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig;
 import org.springframework.ai.vectorstore.VectorStore;
 import org.springframework.ai.vectorstore.filter.FilterExpressionBuilder;
-import org.springframework.core.io.FileSystemResource;
+import org.springframework.beans.factory.annotation.Value;
 import org.springframework.scheduling.annotation.Async;
 import org.springframework.stereotype.Service;
+import org.springframework.transaction.annotation.Transactional;

 import java.nio.file.Path;
-import java.util.List;
-import java.util.UUID;
-import java.util.regex.Pattern;
+import java.time.Instant;
+import java.util.*;

@Service
 public class BookEmbeddingService {

    private static final Logger log = LoggerFactory.getLogger(BookEmbeddingService.class);

-    // Pattern to detect diagram/figure captions
-    private static final Pattern CAPTION_PATTERN =
-        Pattern.compile("^(Figure|Fig\\.|Table|Diagram)\\s+[\\d.]+", Pattern.CASE_INSENSITIVE);
-
    private final VectorStore vectorStore;
    private final BookRepository bookRepository;

-    public BookEmbeddingService(VectorStore vectorStore, BookRepository bookRepository) {
+    @Value("${app.embedding.batch-size:50}")
+    private int embeddingBatchSize;
+
+    @Value("${app.embedding.batch-delay-ms:1000}")
+    private long embeddingBatchDelayMs;
+    private final PdfStructureParser pdfStructureParser;
+    private final FigureExtractionService figureExtractionService;
+    private final VisionDescriptionService visionDescriptionService;
+    private final TextChunkingService textChunkingService;
+    private final ChunkFigureRefService chunkFigureRefService;
+    private final SectionRepository sectionRepository;
+    private final ChapterRepository chapterRepository;
+    private final FigureRepository figureRepository;
+    private final ChunkFigureRefRepository chunkFigureRefRepository;
+    private final FigureStorageService figureStorageService;
+
+    /**
+     * Creates the service; every argument is stored unchanged in the field of the same name.
+     * The batch size and batch delay are not constructor arguments; they are injected through
+     * the {@code @Value}-annotated fields.
+     */
+    public BookEmbeddingService(
+            VectorStore vectorStore,
+            BookRepository bookRepository,
+            PdfStructureParser pdfStructureParser,
+            FigureExtractionService figureExtractionService,
+            VisionDescriptionService visionDescriptionService,
+            TextChunkingService textChunkingService,
+            ChunkFigureRefService chunkFigureRefService,
+            SectionRepository sectionRepository,
+            ChapterRepository chapterRepository,
+            FigureRepository figureRepository,
+            ChunkFigureRefRepository chunkFigureRefRepository,
+            FigureStorageService figureStorageService) {
        this.vectorStore = vectorStore;
        this.bookRepository = bookRepository;
+        this.pdfStructureParser = pdfStructureParser;
+        this.figureExtractionService = figureExtractionService;
+        this.visionDescriptionService = visionDescriptionService;
+        this.textChunkingService = textChunkingService;
+        this.chunkFigureRefService = chunkFigureRefService;
+        this.sectionRepository = sectionRepository;
+        this.chapterRepository = chapterRepository;
+        this.figureRepository = figureRepository;
+        this.chunkFigureRefRepository = chunkFigureRefRepository;
+        this.figureStorageService = figureStorageService;
    }

    @Async
    public void embedBook(UUID bookId, String bookTitle, Path pdfPath) {
-        log.info("Starting embedding for book {} ({})", bookId, bookTitle);
+        log.info("Starting image-aware embedding for book {} ({})", bookId, bookTitle);

        Book book = bookRepository.findById(bookId).orElse(null);
        if (book == null) {
@@ -47,29 +81,68 @@ public class BookEmbeddingService {
            book.setStatus(BookStatus.PROCESSING);
            bookRepository.save(book);

-            PagePdfDocumentReader reader = new PagePdfDocumentReader(
-                new FileSystemResource(pdfPath.toFile()),
-                PdfDocumentReaderConfig.builder()
-                    .withPagesPerDocument(1)
-                    .build()
-            );
+            // Step 1: Parse PDF into page-level sections persisted in Postgres
+            List<SectionEntity> sections = pdfStructureParser.parse(bookId, bookTitle, pdfPath);
+            String chapterId = bookId + "-ch1";

-            List<Document> pages = reader.get();
-            int pageCount = pages.size();
+            // Step 2: Build and embed text chunks for all sections in batches
+            List<Document> allChunks = new ArrayList<>();
+            for (SectionEntity section : sections) {
+                List<Document> chunks = textChunkingService.chunk(section, bookTitle);
+                allChunks.addAll(chunks);
+            }
+            embedInBatches(allChunks, bookId);
+            log.info("Embedded {} text chunks for book {}", allChunks.size(), bookId);

-            // Enrich metadata and tag diagram captions
-            List<Document> enriched = pages.stream()
-                .map(doc -> enrichDocument(doc, bookId.toString(), bookTitle))
-                .toList();
+            // Step 3: Extract images from the PDF, save to file store, persist FigureEntity
+            List<FigureEntity> figures = figureExtractionService.extract(
+                bookId, chapterId, sections, pdfPath);

-            vectorStore.add(enriched);
+            // Step 4: For each figure, generate vision description and embed caption
+            for (FigureEntity figure : figures) {
+                Path imagePath = figureStorageService.resolve(figure.getImagePath());
+                String description = visionDescriptionService.describe(
+                    imagePath, figure.getCaption());
+
+                // Use description as caption fallback if no caption was detected
+                if (figure.getCaption() == null || figure.getCaption().isBlank()) {
+                    figure.setCaption(description);
+                    figureRepository.save(figure);
+                }
+
+                // Content for embedding = vision description + caption for maximum signal
+                String embeddingContent = description
+                    + (figure.getCaption() != null ? "\n" + figure.getCaption() : "");
+
+                String embeddingId = UUID.randomUUID().toString();
+                Map<String, Object> metadata = buildFigureMetadata(figure, bookTitle, embeddingId);
+                Document figureDoc = new Document(embeddingId, embeddingContent, metadata);
+                vectorStore.add(List.of(figureDoc));
+
+                figure.setCaptionEmbeddingId(UUID.fromString(embeddingId));
+                figureRepository.save(figure);
+            }
+            log.info("Embedded {} figure captions for book {}", figures.size(), bookId);
+
+            // Step 5: Link text chunks to figures via text references
+            for (SectionEntity section : sections) {
+                List<Document> sectionChunks = allChunks.stream()
+                    .filter(d -> section.getId().equals(d.getMetadata().get("section_id")))
+                    .toList();
+                List<FigureEntity> sectionFigures = figures.stream()
+                    .filter(f -> section.getId().equals(f.getSectionId()))
+                    .toList();
+                chunkFigureRefService.linkChunksToFigures(
+                    sectionChunks, sectionFigures, section.getPageStart());
+            }

            book.setStatus(BookStatus.READY);
-            book.setPageCount(pageCount);
-            book.setProcessedAt(java.time.Instant.now());
+            book.setPageCount(sections.size());
+            book.setProcessedAt(Instant.now());
            bookRepository.save(book);

-            log.info("Finished embedding book {} — {} pages", bookId, pageCount);
+            log.info("Finished embedding book {} — {} pages, {} figures",
+                bookId, sections.size(), figures.size());

        } catch (Exception ex) {
            log.error("Failed to embed book {}", bookId, ex);
@@ -79,40 +152,74 @@ public class BookEmbeddingService {
        }
    }

-    private Document enrichDocument(Document doc, String bookId, String bookTitle) {
-        String content = doc.getText();
-        String chunkType = detectChunkType(content);
+    /**
+     * Removes everything derived from a book: chunk-figure references, figure rows, figure
+     * files on disk, sections, chapters, and all vector-store entries tagged with the book id.
+     *
+     * <p>Cleanup is best-effort: any exception is caught and logged (with its stack trace) and
+     * never reaches the caller, so later steps are skipped once one step fails.
+     *
+     * <p>NOTE(review): because the exception is caught inside this method it does not propagate
+     * out of the {@code @Transactional} boundary; confirm that the resulting commit/rollback
+     * behaviour on partial failure is what is intended.
+     *
+     * @param bookId identifier of the book whose derived data is deleted
+     */
+    @Transactional
+    public void deleteBookChunks(UUID bookId) {
+        log.info("Deleting all data for book {}", bookId);
+        try {
+            // Delete chunk-figure refs (by figureId for this book)
+            List<String> figureIds = figureRepository.findAllByBookId(bookId)
+                .stream().map(FigureEntity::getId).toList();
+            if (!figureIds.isEmpty()) {
+                chunkFigureRefRepository.deleteByFigureIdIn(figureIds);
+            }

-        doc.getMetadata().put("book_id", bookId);
-        doc.getMetadata().put("book_title", bookTitle);
-        doc.getMetadata().put("chunk_type", chunkType);
+            // Delete figures from Postgres
+            figureRepository.deleteAllByBookId(bookId);

-        return doc;
+            // Delete figure files from disk
+            figureStorageService.deleteAll(bookId);
+
+            // Delete sections and chapters from Postgres
+            sectionRepository.deleteAllByBookId(bookId);
+            chapterRepository.deleteAllByBookId(bookId);
+
+            // Delete vector store entries (text chunks + figure embeddings)
+            FilterExpressionBuilder b = new FilterExpressionBuilder();
+            vectorStore.delete(b.eq("book_id", bookId.toString()).build());
+
+        } catch (Exception ex) {
+            // Trailing throwable argument makes SLF4J log the stack trace, not just the message.
+            log.warn("Error during cleanup for book {}: {}", bookId, ex.getMessage(), ex);
+        }
    }

-    private String detectChunkType(String content) {
-        if (content != null) {
-            for (String line : content.split("\\r?\\n")) {
-                if (CAPTION_PATTERN.matcher(line.trim()).find()) {
-                    return "diagram";
+    /**
+     * Adds the documents to the vector store in consecutive batches, pausing between batches.
+     *
+     * <p>Batch size and pause come from the {@code embeddingBatchSize} and
+     * {@code embeddingBatchDelayMs} fields. A batch size below 1 is treated as 1, because a
+     * step of zero would never advance the loop. A non-positive delay skips the pause, because
+     * {@link Thread#sleep(long)} rejects negative values.
+     *
+     * @param docs   documents to embed; an empty list results in no vector-store calls
+     * @param bookId book the documents belong to; used for logging only
+     */
+    private void embedInBatches(List<Document> docs, UUID bookId) {
+        int total = docs.size();
+        int batchSize = Math.max(1, embeddingBatchSize);
+        int totalBatches = (total - 1) / batchSize + 1;
+        for (int i = 0; i < total; i += batchSize) {
+            List<Document> batch = docs.subList(i, Math.min(i + batchSize, total));
+            vectorStore.add(batch);
+            int batchNum = i / batchSize + 1;
+            log.debug("Embedded batch {}/{} for book {}", batchNum, totalBatches, bookId);
+            // No pause after the final batch.
+            if (embeddingBatchDelayMs > 0 && i + batchSize < total) {
+                try {
+                    Thread.sleep(embeddingBatchDelayMs);
+                } catch (InterruptedException e) {
+                    // Restore the interrupt flag; the remaining batches are still embedded.
+                    Thread.currentThread().interrupt();
+                    log.warn("Embedding batch sleep interrupted for book {}", bookId);
                }
            }
        }
-        return "text";
    }

-    public void deleteBookChunks(UUID bookId) {
-        log.info("Deleting vector chunks for book {}", bookId);
-        try {
-            FilterExpressionBuilder b = new FilterExpressionBuilder();
-            vectorStore.delete(b.eq("book_id", bookId.toString()).build());
-        } catch (Exception ex) {
-            log.warn("Could not delete vector chunks for book {}: {}", bookId, ex.getMessage());
-        }
+    /**
+     * Assembles the metadata map stored alongside a figure's embedding document.
+     *
+     * <p>Chapter id, section id and label are replaced by an empty string when {@code null};
+     * all other values are stored as returned by the entity.
+     *
+     * @param figure      figure whose attributes are copied into the map
+     * @param bookTitle   title stored under {@code book_title}
+     * @param embeddingId id stored under {@code embedding_id}
+     * @return a new mutable map with the figure metadata
+     */
+    private Map<String, Object> buildFigureMetadata(FigureEntity figure, String bookTitle,
+                                                     String embeddingId) {
+        Map<String, Object> metadata = new HashMap<>();
+        metadata.put("type", "FIGURE");
+        metadata.put("book_id", figure.getBookId().toString());
+        metadata.put("book_title", bookTitle);
+        metadata.put("chapter_id", Objects.requireNonNullElse(figure.getChapterId(), ""));
+        metadata.put("section_id", Objects.requireNonNullElse(figure.getSectionId(), ""));
+        metadata.put("figure_id", figure.getId());
+        metadata.put("figure_type", figure.getFigureType().name());
+        metadata.put("image_path", figure.getImagePath());
+        metadata.put("label", Objects.requireNonNullElse(figure.getLabel(), ""));
+        metadata.put("page", figure.getPage());
+        metadata.put("embedding_id", embeddingId);
+        return metadata;
    }

-    private String truncate(String message, int maxLength) {
-        if (message == null) return null;
-        return message.length() <= maxLength ? message : message.substring(0, maxLength);
+    /** Returns {@code msg} cut to at most {@code max} characters; {@code null} stays {@code null}. */
+    private String truncate(String msg, int max) {
+        if (msg == null || msg.length() <= max) {
+            return msg;
+        }
+        return msg.substring(0, max);
    }
 }
@@ -1,11 +1,13 @@
 package com.aiteacher.book;

+import org.springframework.beans.factory.annotation.Value;
 import org.springframework.stereotype.Service;
 import org.springframework.web.multipart.MultipartFile;

 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.util.List;
 import java.util.NoSuchElementException;
 import java.util.UUID;
@@ -15,10 +17,15 @@ public class BookService {

    private final BookRepository bookRepository;
    private final BookEmbeddingService bookEmbeddingService;
+    private final Path bookStoragePath;

-    public BookService(BookRepository bookRepository, BookEmbeddingService bookEmbeddingService) {
+    /**
+     * Creates the service and derives the directory in which uploaded PDFs are kept.
+     *
+     * @param bookRepository       repository for {@link Book} rows
+     * @param bookEmbeddingService service that embeds a book and deletes its derived data
+     * @param basePath             value of {@code app.figure-storage.base-path} (default
+     *                             {@code ./uploads}); PDFs are stored in the {@code books}
+     *                             subdirectory of its absolute, normalized form
+     */
+    public BookService(
+            BookRepository bookRepository,
+            BookEmbeddingService bookEmbeddingService,
+            @Value("${app.figure-storage.base-path:./uploads}") String basePath) {
        this.bookRepository = bookRepository;
        this.bookEmbeddingService = bookEmbeddingService;
+        this.bookStoragePath = Paths.get(basePath).toAbsolutePath().normalize().resolve("books");
    }

    public Book upload(MultipartFile file) throws IOException {
@@ -28,20 +35,35 @@ public class BookService {
        }

        String title = deriveTitle(originalFilename);
-
        Book book = new Book(title, originalFilename, file.getSize());
        book = bookRepository.save(book);

-        // Write to a temp file so the async task can read it
-        Path tempFile = Files.createTempFile("aiteacher-", "-" + book.getId() + ".pdf");
-        file.transferTo(tempFile.toFile());
+        // Persist PDF in a stable location for potential re-embedding
+        Files.createDirectories(bookStoragePath);
+        Path pdfPath = bookStoragePath.resolve(book.getId() + ".pdf");
+        file.transferTo(pdfPath.toFile());

        UUID bookId = book.getId();
-        Path pdfPath = tempFile;
-        String bookTitle = title;
+        bookEmbeddingService.embedBook(bookId, title, pdfPath);
+        return book;
+    }

-        bookEmbeddingService.embedBook(bookId, bookTitle, pdfPath);
+    /**
+     * Discards a book's derived data and starts embedding it again from the stored PDF.
+     *
+     * <p>The book is marked {@code PROCESSING} and saved before the embedding call, so a
+     * repeated request is rejected by the status check sooner and the returned entity carries
+     * the status the caller is told about. This narrows, but does not fully close, the window
+     * for two concurrent re-embed requests.
+     *
+     * @param id identifier of the book to re-embed
+     * @return the saved book, with status {@code PROCESSING}
+     * @throws NoSuchElementException if no book with this id exists
+     * @throws IllegalStateException  if the book is already being processed or its PDF is
+     *                                missing from the book storage directory
+     */
+    public Book reembed(UUID id) {
+        Book book = bookRepository.findById(id)
+            .orElseThrow(() -> new NoSuchElementException("Book not found."));

+        if (book.getStatus() == BookStatus.PROCESSING) {
+            throw new IllegalStateException("Book is already being processed.");
+        }
+
+        Path pdfPath = bookStoragePath.resolve(id + ".pdf");
+        if (!Files.exists(pdfPath)) {
+            throw new IllegalStateException(
+                "Original PDF not found. Please re-upload the book before re-embedding.");
+        }
+
+        // Record the state change only after all preconditions have passed.
+        book.setStatus(BookStatus.PROCESSING);
+        book = bookRepository.save(book);
+
+        bookEmbeddingService.deleteBookChunks(id);
+        bookEmbeddingService.embedBook(id, book.getTitle(), pdfPath);
        return book;
    }

@@ -63,14 +85,21 @@ public class BookService {
        }

        bookEmbeddingService.deleteBookChunks(id);
+
+        // Delete the stored PDF
+        Path pdfPath = bookStoragePath.resolve(id + ".pdf");
+        try {
+            Files.deleteIfExists(pdfPath);
+        } catch (IOException ex) {
+            // Non-fatal — log only
+        }
+
        bookRepository.deleteById(id);
    }

    private String deriveTitle(String filename) {
-        // Strip .pdf extension and replace separators with spaces
        String name = filename.replaceAll("(?i)\\.pdf$", "");
        name = name.replaceAll("[-_]", " ");
-        // Capitalise first letter
        if (!name.isEmpty()) {
            name = Character.toUpperCase(name.charAt(0)) + name.substring(1);
        }
@@ -0,0 +1,12 @@
+package com.aiteacher.book;
+
+/**
+ * API representation of a single figure belonging to a book.
+ *
+ * @param figureId     identifier of the figure
+ * @param label        figure label
+ * @param caption      figure caption
+ * @param figureType   name of the figure's type constant
+ * @param page         page the figure is on
+ * @param imageUrl     URL path under {@code /api/v1/figures/} from which the image is served
+ * @param sectionId    identifier of the section the figure is associated with
+ * @param sectionTitle title of that section; NOTE(review): the only visible producer passes
+ *                     {@code null}, so clients should not rely on it being set
+ */
+public record FigureResponse(
+    String figureId,
+    String label,
+    String caption,
+    String figureType,
+    int page,
+    String imageUrl,
+    String sectionId,
+    String sectionTitle
+) {}