Add Marker to parse PDFs effectively

2026-04-04 21:30:18 +02:00
parent b154e29f2d
commit ea1276dc2e
25 changed files with 2318 additions and 285 deletions
@@ -108,7 +108,7 @@
      <artifactId>spring-ai-pdf-document-reader</artifactId>
    </dependency>

-    <!-- PDFBox — explicit for image extraction per page -->
+    <!-- PDFBox — page rendering and cropping for figure extraction -->
    <dependency>
      <groupId>org.apache.pdfbox</groupId>
      <artifactId>pdfbox</artifactId>
@@ -2,7 +2,9 @@ package com.aiteacher.book;

 import com.aiteacher.document.FigureEntity;
 import com.aiteacher.document.FigureRepository;
+import com.aiteacher.document.MarkdownStorageService;
 import org.springframework.http.HttpStatus;
+import org.springframework.http.MediaType;
 import org.springframework.http.ResponseEntity;
 import org.springframework.web.bind.annotation.*;
 import org.springframework.web.multipart.MultipartFile;
@@ -18,10 +20,13 @@ public class BookController {

    private final BookService bookService;
    private final FigureRepository figureRepository;
+    private final MarkdownStorageService markdownStorageService;

-    public BookController(BookService bookService, FigureRepository figureRepository) {
+    public BookController(BookService bookService, FigureRepository figureRepository,
+                          MarkdownStorageService markdownStorageService) {
        this.bookService = bookService;
        this.figureRepository = figureRepository;
+        this.markdownStorageService = markdownStorageService;
    }

    @PostMapping(consumes = "multipart/form-data")
@@ -59,6 +64,17 @@ public class BookController {
        ));
    }

+    /**
+     * Returns the stored markdown for one page of a book as {@code text/plain}.
+     *
+     * @param id         identifier of the book
+     * @param pageNumber page whose markdown is requested
+     * @return 200 with the markdown text, or 404 with an empty body when the lookup throws
+     */
+    @GetMapping(value = "/{id}/pages/{pageNumber}/markdown", produces = MediaType.TEXT_PLAIN_VALUE)
+    public ResponseEntity<String> getPageMarkdown(@PathVariable UUID id,
+                                                   @PathVariable int pageNumber) {
+        bookService.getById(id); // 404 if not found
+        try {
+            return ResponseEntity.ok(markdownStorageService.getText(id, pageNumber));
+        } catch (Exception e) {
+            // NOTE(review): every exception from the storage lookup is reported as 404 and the
+            // cause is discarded, so a storage failure is indistinguishable from a missing page.
+            // Consider narrowing the catch once the "not found" exception type is confirmed.
+            return ResponseEntity.notFound().build();
+        }
+    }
+
    @GetMapping("/{id}/figures")
    public ResponseEntity<List<FigureResponse>> figures(@PathVariable UUID id) {
        bookService.getById(id); // 404 if not found
@@ -2,6 +2,9 @@ package com.aiteacher.book;

 import com.aiteacher.document.*;
 import com.aiteacher.figure.FigureStorageService;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.springframework.ai.document.Document;
@@ -23,13 +26,7 @@ public class BookEmbeddingService {

    private final VectorStore vectorStore;
    private final BookRepository bookRepository;
-
-    @Value("${app.embedding.batch-size:50}")
-    private int embeddingBatchSize;
-
-    @Value("${app.embedding.batch-delay-ms:1000}")
-    private long embeddingBatchDelayMs;
-    private final PdfStructureParser pdfStructureParser;
+    private final MarkerPageParser markerPageParser;
    private final FigureExtractionService figureExtractionService;
    private final VisionDescriptionService visionDescriptionService;
    private final TextChunkingService textChunkingService;
@@ -39,11 +36,21 @@ public class BookEmbeddingService {
    private final FigureRepository figureRepository;
    private final ChunkFigureRefRepository chunkFigureRefRepository;
    private final FigureStorageService figureStorageService;
+    private final MarkdownStorageService markdownStorageService;
+
+    private static final Pattern MARKER_PLACEHOLDER =
+            Pattern.compile("!\\[([^\\]]*)\\]\\(marker://([^)]+)\\)");
+
+    @Value("${app.embedding.batch-size:50}")
+    private int embeddingBatchSize;
+
+    @Value("${app.embedding.batch-delay-ms:1000}")
+    private long embeddingBatchDelayMs;

    public BookEmbeddingService(
            VectorStore vectorStore,
            BookRepository bookRepository,
-            PdfStructureParser pdfStructureParser,
+            MarkerPageParser markerPageParser,
            FigureExtractionService figureExtractionService,
            VisionDescriptionService visionDescriptionService,
            TextChunkingService textChunkingService,
@@ -52,10 +59,11 @@ public class BookEmbeddingService {
            ChapterRepository chapterRepository,
            FigureRepository figureRepository,
            ChunkFigureRefRepository chunkFigureRefRepository,
-            FigureStorageService figureStorageService) {
+            FigureStorageService figureStorageService,
+            MarkdownStorageService markdownStorageService) {
        this.vectorStore = vectorStore;
        this.bookRepository = bookRepository;
-        this.pdfStructureParser = pdfStructureParser;
+        this.markerPageParser = markerPageParser;
        this.figureExtractionService = figureExtractionService;
        this.visionDescriptionService = visionDescriptionService;
        this.textChunkingService = textChunkingService;
@@ -65,11 +73,12 @@ public class BookEmbeddingService {
        this.figureRepository = figureRepository;
        this.chunkFigureRefRepository = chunkFigureRefRepository;
        this.figureStorageService = figureStorageService;
+        this.markdownStorageService = markdownStorageService;
    }

    @Async
    public void embedBook(UUID bookId, String bookTitle, Path pdfPath) {
-        log.info("Starting image-aware embedding for book {} ({})", bookId, bookTitle);
+        log.info("Starting Marker-powered embedding for book {} ({})", bookId, bookTitle);

        Book book = bookRepository.findById(bookId).orElse(null);
        if (book == null) {
@@ -81,59 +90,73 @@ public class BookEmbeddingService {
            book.setStatus(BookStatus.PROCESSING);
            bookRepository.save(book);

-            // Step 1: Parse PDF into page-level sections persisted in Postgres
-            List<SectionEntity> sections = pdfStructureParser.parse(bookId, bookTitle, pdfPath);
            String chapterId = bookId + "-ch1";
+            ChapterEntity chapter = new ChapterEntity(chapterId, bookId, 1, bookTitle, 1);
+            chapterRepository.save(chapter);

-            // Step 2: Build and embed text chunks for all sections in batches
+            // Step 1: Parse every page with Marker — correct reading order + pre-cropped figures
+            List<PageResult> pageResults = markerPageParser.parse(pdfPath);
+
+            // Step 2: Build SectionEntity per page and persist
+            List<SectionEntity> sections = buildAndSaveSections(bookId, bookTitle, chapterId, pageResults);
+
+            // Step 3: Chunk and embed text
            List<Document> allChunks = new ArrayList<>();
            for (SectionEntity section : sections) {
-                List<Document> chunks = textChunkingService.chunk(section, bookTitle);
-                allChunks.addAll(chunks);
+                allChunks.addAll(textChunkingService.chunk(section, bookTitle));
            }
            embedInBatches(allChunks, bookId);
            log.info("Embedded {} text chunks for book {}", allChunks.size(), bookId);

-            // Step 3: Extract images from the PDF, save to file store, persist FigureEntity
-            List<FigureEntity> figures = figureExtractionService.extract(
-                bookId, chapterId, sections, pdfPath);
+            // Step 4: Decode pre-cropped figures from Marker output
+            FigureExtractionService.ExtractionResult extraction =
+                    figureExtractionService.extract(bookId, chapterId, pageResults);
+            List<FigureEntity> figures = extraction.figures();

-            // Step 4: For each figure, generate vision description and embed caption
+            // Step 4b: Upload per-page markdown with resolved figure URLs to S3
+            for (PageResult page : pageResults) {
+                if (!page.markdown().isBlank()) {
+                    String resolved = resolvePlaceholders(page.markdown(), bookId,
+                            extraction.blockIdToFigureId());
+                    markdownStorageService.save(bookId, page.pageNumber(), resolved);
+                }
+            }
+
+            // Step 5: Vision analysis (description + visible text) → embed figure chunks
            for (FigureEntity figure : figures) {
                byte[] imageBytes = figureStorageService.getBytes(figure.getImagePath());
-                String description = visionDescriptionService.describe(
-                    imageBytes, figure.getCaption());
+                VisionDescriptionService.ImageAnalysis analysis =
+                        visionDescriptionService.analyze(imageBytes, figure.getCaption());

-                // Use description as caption fallback if no caption was detected
                if (figure.getCaption() == null || figure.getCaption().isBlank()) {
-                    figure.setCaption(description);
+                    figure.setCaption(analysis.description());
                    figureRepository.save(figure);
                }

-                // Content for embedding = vision description + caption for maximum signal
-                String embeddingContent = description
-                    + (figure.getCaption() != null ? "\n" + figure.getCaption() : "");
+                // Embedding content: description + caption + visible image text
+                String embeddingContent = analysis.description()
+                        + (figure.getCaption() != null ? "\n" + figure.getCaption() : "")
+                        + (analysis.imageText().isEmpty() ? "" : "\n" + analysis.imageText());

                String embeddingId = UUID.randomUUID().toString();
-                Map<String, Object> metadata = buildFigureMetadata(figure, bookTitle, embeddingId);
-                Document figureDoc = new Document(embeddingId, embeddingContent, metadata);
+                Document figureDoc = new Document(embeddingId, embeddingContent,
+                        buildFigureMetadata(figure, bookTitle, embeddingId, analysis.imageText()));
                vectorStore.add(List.of(figureDoc));

                figure.setCaptionEmbeddingId(UUID.fromString(embeddingId));
                figureRepository.save(figure);
            }
-            log.info("Embedded {} figure captions for book {}", figures.size(), bookId);
+            log.info("Embedded {} figure chunks for book {}", figures.size(), bookId);

-            // Step 5: Link text chunks to figures via text references
+            // Step 6: Link text chunks to figures via in-text references
            for (SectionEntity section : sections) {
                List<Document> sectionChunks = allChunks.stream()
-                    .filter(d -> section.getId().equals(d.getMetadata().get("section_id")))
-                    .toList();
+                        .filter(d -> section.getId().equals(d.getMetadata().get("section_id")))
+                        .toList();
                List<FigureEntity> sectionFigures = figures.stream()
-                    .filter(f -> section.getId().equals(f.getSectionId()))
-                    .toList();
-                chunkFigureRefService.linkChunksToFigures(
-                    sectionChunks, sectionFigures, section.getPageStart());
+                        .filter(f -> section.getId().equals(f.getSectionId()))
+                        .toList();
+                chunkFigureRefService.linkChunksToFigures(sectionChunks, sectionFigures, section.getPageStart());
            }

            book.setStatus(BookStatus.READY);
@@ -142,7 +165,7 @@ public class BookEmbeddingService {
            bookRepository.save(book);

            log.info("Finished embedding book {} — {} pages, {} figures",
-                bookId, sections.size(), figures.size());
+                    bookId, sections.size(), figures.size());

        } catch (Exception ex) {
            log.error("Failed to embed book {}", bookId, ex);
@@ -156,53 +179,63 @@ public class BookEmbeddingService {
    public void deleteBookChunks(UUID bookId) {
        log.info("Deleting all data for book {}", bookId);
        try {
-            // Delete chunk-figure refs (by figureId for this book)
            List<String> figureIds = figureRepository.findAllByBookId(bookId)
-                .stream().map(FigureEntity::getId).toList();
+                    .stream().map(FigureEntity::getId).toList();
            if (!figureIds.isEmpty()) {
                chunkFigureRefRepository.deleteByFigureIdIn(figureIds);
            }
-
-            // Delete figures from Postgres
            figureRepository.deleteAllByBookId(bookId);
-
-            // Delete figure files from disk
            figureStorageService.deleteAll(bookId);
-
-            // Delete sections and chapters from Postgres
+            markdownStorageService.deleteAll(bookId);
            sectionRepository.deleteAllByBookId(bookId);
            chapterRepository.deleteAllByBookId(bookId);

-            // Delete vector store entries (text chunks + figure embeddings)
            FilterExpressionBuilder b = new FilterExpressionBuilder();
            vectorStore.delete(b.eq("book_id", bookId.toString()).build());
-
        } catch (Exception ex) {
            log.warn("Error during cleanup for book {}: {}", bookId, ex.getMessage());
        }
    }

+    // --- Private helpers ---
+
+    /**
+     * Creates and persists one {@link SectionEntity} per parsed page that has text.
+     *
+     * <p>Pages whose ordered text is blank produce no section. The section id is
+     * {@code bookId + "-p" + pageNumber}; the title is the page's heading when present,
+     * otherwise {@code "Page " + pageNumber}.
+     *
+     * @param bookId      owning book
+     * @param bookTitle   title of the book (not used when building sections)
+     * @param chapterId   chapter the sections are attached to
+     * @param pageResults per-page parse output
+     * @return the saved sections, in page iteration order
+     */
+    private List<SectionEntity> buildAndSaveSections(UUID bookId, String bookTitle,
+                                                      String chapterId,
+                                                      List<PageResult> pageResults) {
+        List<SectionEntity> saved = new ArrayList<>();
+        for (PageResult page : pageResults) {
+            String text = page.orderedText();
+            if (text.isBlank()) {
+                continue;
+            }
+
+            int pageNo = page.pageNumber();
+            String heading = page.headingTitle();
+            if (heading == null) {
+                heading = "Page " + pageNo;
+            }
+
+            SectionEntity entity = new SectionEntity(
+                    bookId + "-p" + pageNo, chapterId, bookId,
+                    String.valueOf(pageNo),
+                    heading,
+                    pageNo, pageNo,
+                    text);
+            saved.add(sectionRepository.save(entity));
+        }
+        return saved;
+    }
+
    private void embedInBatches(List<Document> docs, UUID bookId) {
        int total = docs.size();
        for (int i = 0; i < total; i += embeddingBatchSize) {
            List<Document> batch = docs.subList(i, Math.min(i + embeddingBatchSize, total));
            vectorStore.add(batch);
-            int batchNum = i / embeddingBatchSize + 1;
-            int totalBatches = (total - 1) / embeddingBatchSize + 1;
-            log.debug("Embedded batch {}/{} for book {}", batchNum, totalBatches, bookId);
+            log.debug("Embedded batch {}/{} for book {}",
+                    i / embeddingBatchSize + 1, (total - 1) / embeddingBatchSize + 1, bookId);
            if (i + embeddingBatchSize < total) {
-                try {
-                    Thread.sleep(embeddingBatchDelayMs);
-                } catch (InterruptedException e) {
-                    Thread.currentThread().interrupt();
-                    log.warn("Embedding batch sleep interrupted for book {}", bookId);
-                }
+                try { Thread.sleep(embeddingBatchDelayMs); }
+                catch (InterruptedException e) { Thread.currentThread().interrupt(); }
            }
        }
    }

    private Map<String, Object> buildFigureMetadata(FigureEntity figure, String bookTitle,
-                                                     String embeddingId) {
+                                                     String embeddingId, String imageText) {
        Map<String, Object> m = new HashMap<>();
        m.put("type", "FIGURE");
        m.put("book_id", figure.getBookId().toString());
@@ -215,9 +248,31 @@ public class BookEmbeddingService {
        m.put("label", figure.getLabel() != null ? figure.getLabel() : "");
        m.put("page", figure.getPage());
        m.put("embedding_id", embeddingId);
+        m.put("image_text", imageText);  // verbatim text visible inside the image
        return m;
    }

+    /**
+     * Replaces {@code marker://{blockId}} image placeholders with resolved API URLs.
+     *
+     * <p>Each {@code ![alt](marker://blockId)} occurrence becomes
+     * {@code ![alt](/api/v1/figures/{bookId}/{figureId}.png)} when the block id is present in
+     * {@code blockIdToFigureId}; placeholders with no mapping are removed entirely.
+     *
+     * @param markdown          page markdown containing zero or more placeholders
+     * @param bookId            book whose figure URLs are generated
+     * @param blockIdToFigureId mapping from Marker block id to persisted figure id
+     * @return the markdown with placeholders resolved, stripped of leading/trailing whitespace
+     */
+    private String resolvePlaceholders(String markdown, UUID bookId,
+                                       Map<String, String> blockIdToFigureId) {
+        return MARKER_PLACEHOLDER.matcher(markdown)
+                .replaceAll(match -> {
+                    String figureId = blockIdToFigureId.get(match.group(2));
+                    if (figureId == null) {
+                        return ""; // figure was filtered out (too small, etc.)
+                    }
+                    String url = "/api/v1/figures/" + bookId + "/" + figureId + ".png";
+                    // Quote the whole replacement, not just the alt text: any '$' or '\' in the
+                    // alt text or the URL would otherwise be read as a group reference/escape.
+                    return Matcher.quoteReplacement("![" + match.group(1) + "](" + url + ")");
+                })
+                .strip();
+    }
+
    private String truncate(String msg, int max) {
        if (msg == null) return null;
        return msg.length() <= max ? msg : msg.substring(0, max);
@@ -0,0 +1,30 @@
+package com.aiteacher.config;
+
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.context.annotation.Bean;
+import org.springframework.context.annotation.Configuration;
+import org.springframework.http.client.JdkClientHttpRequestFactory;
+import org.springframework.web.client.RestClient;
+
+import java.net.http.HttpClient;
+
+@Configuration
+public class MarkerConfig {
+
+    /** Base URL of the Marker server; falls back to {@code http://localhost:8000}. */
+    @Value("${app.marker.base-url:http://localhost:8000}")
+    private String markerBaseUrl;
+
+    /**
+     * Builds the {@link RestClient} used to call Marker.
+     *
+     * <p>The client is backed by a default JDK {@link HttpClient} and no timeout is configured
+     * here, because Marker conversions can take several minutes.
+     */
+    @Bean
+    RestClient markerRestClient() {
+        JdkClientHttpRequestFactory requestFactory =
+                new JdkClientHttpRequestFactory(HttpClient.newHttpClient());
+
+        return RestClient.builder()
+                .requestFactory(requestFactory)
+                .baseUrl(markerBaseUrl)
+                .build();
+    }
+}
@@ -1,43 +1,43 @@
 package com.aiteacher.document;

 import com.aiteacher.figure.FigureStorageService;
-import org.apache.pdfbox.Loader;
-import org.apache.pdfbox.cos.COSName;
-import org.apache.pdfbox.pdmodel.PDDocument;
-import org.apache.pdfbox.pdmodel.PDPage;
-import org.apache.pdfbox.pdmodel.graphics.PDXObject;
-import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.springframework.beans.factory.annotation.Value;
 import org.springframework.stereotype.Service;

+import javax.imageio.ImageIO;
 import java.awt.image.BufferedImage;
+import java.io.ByteArrayInputStream;
 import java.io.IOException;
-import java.nio.file.Path;
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 import java.util.UUID;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

 /**
- * Extracts images from each PDF page using PDFBox.
- * Images below the configured minimum size are skipped.
- * Caption is detected by the "Fig." pattern in page text.
+ * Extracts figure images from {@link PageResult.FigureData} entries produced by
+ * {@link MarkerPageParser}.
+ *
+ * <p>Marker returns pre-cropped PNG bytes for each detected figure, so no PDFBox
+ * page rendering or bounding-box cropping is needed. This service:
+ * <ol>
+ *   <li>Decodes the PNG bytes to check dimensions (skip images below min size)</li>
+ *   <li>Classifies the figure type from caption and surrounding text keywords</li>
+ *   <li>Persists the image via {@link FigureStorageService}</li>
+ *   <li>Persists a {@link FigureEntity} to the database</li>
+ * </ol>
 */
@Service
 public class FigureExtractionService {

    private static final Logger log = LoggerFactory.getLogger(FigureExtractionService.class);

-    // Caption: line starting with "Fig." or "Figure" followed by a number
-    private static final Pattern CAPTION_PATTERN =
-        Pattern.compile("(?m)^(Fig\\.?\\s*\\d+[\\-.]?\\d*[^\\n]*)", Pattern.CASE_INSENSITIVE);
-
-    // Figure label: "Fig. 12-4" or "Fig. 12.4"
    private static final Pattern LABEL_PATTERN =
-        Pattern.compile("(?i)Fig\\.?\\s*(\\d+[\\-.\\d]*)");
+            Pattern.compile("(?i)Fig\\.?\\s*(\\d+[\\-.\\d]*)");

    private final FigureStorageService storageService;
    private final FigureRepository figureRepository;
@@ -52,65 +52,77 @@ public class FigureExtractionService {
        this.minImageSizePx = minImageSizePx;
    }

+    /**
+     * Holds the extraction output: persisted figures and a Marker blockId → DB figureId map.
+     *
+     * <p>The record stores the given collections as-is (no defensive copy).
+     *
+     * @param figures           figures that were saved to the repository
+     * @param blockIdToFigureId Marker block id of each saved figure mapped to its figure id
+     */
+    public record ExtractionResult(List<FigureEntity> figures, Map<String, String> blockIdToFigureId) {}
+
    /**
-     * Extracts all qualifying images from the PDF for the given book.
-     * Returns persisted FigureEntity list (without vision descriptions — set later).
+     * Extracts and persists figures for all pages described by {@code pageResults}.
+     *
+     * @param bookId      owning book
+     * @param chapterId   chapter bucket for these sections
+     * @param pageResults Marker parse output — each entry's {@code figures} list
+     *                    carries pre-cropped PNG bytes for that page
+     * @return {@link ExtractionResult} with persisted figures and blockId→figureId map
+     *         (used to resolve markdown image placeholders)
     */
-    public List<FigureEntity> extract(UUID bookId, String chapterId,
-                                      List<SectionEntity> sections, Path pdfPath) {
+    public ExtractionResult extract(UUID bookId, String chapterId,
+                                    List<PageResult> pageResults) {
        List<FigureEntity> figures = new ArrayList<>();
+        Map<String, String> blockIdToFigureId = new HashMap<>();
        int figureCounter = 0;

-        try (PDDocument doc = Loader.loadPDF(pdfPath.toFile())) {
-            for (SectionEntity section : sections) {
-                int pageIndex = section.getPageStart() - 1; // 0-based
-                if (pageIndex < 0 || pageIndex >= doc.getNumberOfPages()) continue;
-
-                PDPage page = doc.getPage(pageIndex);
-                String pageText = section.getFullText();
+        for (PageResult page : pageResults) {
+            if (page.figures().isEmpty()) continue;

+            for (PageResult.FigureData figureData : page.figures()) {
                try {
-                    for (COSName name : page.getResources().getXObjectNames()) {
-                        PDXObject xObject = page.getResources().getXObject(name);
-                        if (!(xObject instanceof PDImageXObject image)) continue;
-
-                        BufferedImage bufferedImage = image.getImage();
-                        if (bufferedImage.getWidth() < minImageSizePx
-                                || bufferedImage.getHeight() < minImageSizePx) {
-                            continue; // skip decorative images
-                        }
-
-                        figureCounter++;
-                        String figureId = bookId + "-fig-" + pageIndex + "-" + figureCounter;
-                        String caption = detectCaption(pageText);
-                        String label = detectLabel(caption, figureCounter);
-                        FigureType type = classifyType(caption, pageText);
-
-                        String imagePath = storageService.save(bookId, figureId, bufferedImage);
-
-                        FigureEntity figure = new FigureEntity(
-                            figureId, bookId, section.getId(), chapterId,
-                            label, caption, type, section.getPageStart(), imagePath
-                        );
-                        figures.add(figureRepository.save(figure));
+                    BufferedImage image = decodeImage(figureData.imageBytes());
+                    if (image == null) {
+                        log.debug("Could not decode image on page {} of book {} (block {})",
+                                page.pageNumber(), bookId, figureData.blockId());
+                        continue;
                    }
-                } catch (IOException ex) {
-                    log.warn("Failed to extract images from page {} of book {}: {}",
-                        section.getPageStart(), bookId, ex.getMessage());
+                    if (image.getWidth() < minImageSizePx || image.getHeight() < minImageSizePx) {
+                        log.debug("Skipping small figure on page {} ({}×{})",
+                                page.pageNumber(), image.getWidth(), image.getHeight());
+                        continue;
+                    }
+
+                    figureCounter++;
+                    String figureId = bookId + "-fig-" + page.pageNumber() + "-" + figureCounter;
+                    String caption = figureData.nearestCaption();
+                    String label = detectLabel(caption, figureCounter);
+                    FigureType type = classifyType(caption, page.orderedText());
+
+                    String sectionId = bookId + "-p" + page.pageNumber();
+                    String imagePath = storageService.save(bookId, figureId, image);
+
+                    FigureEntity figure = new FigureEntity(
+                            figureId, bookId, sectionId, chapterId,
+                            label, caption, type, page.pageNumber(), imagePath);
+                    figures.add(figureRepository.save(figure));
+                    blockIdToFigureId.put(figureData.blockId(), figureId);
+
+                } catch (Exception ex) {
+                    log.warn("Failed to extract figure on page {} of book {}: {}",
+                            page.pageNumber(), bookId, ex.getMessage());
                }
            }
-        } catch (IOException ex) {
-            log.error("Could not open PDF for image extraction, book {}", bookId, ex);
        }

        log.info("Extracted {} figures for book {}", figures.size(), bookId);
-        return figures;
+        return new ExtractionResult(figures, blockIdToFigureId);
    }

-    private String detectCaption(String pageText) {
-        if (pageText == null) return null;
-        Matcher m = CAPTION_PATTERN.matcher(pageText);
-        return m.find() ? m.group(1).trim() : null;
+    // --- Private helpers ---
+
+    /**
+     * Decodes encoded image bytes into a {@link BufferedImage}.
+     *
+     * @param imageBytes encoded image data; may be {@code null} or empty
+     * @return the decoded image, or {@code null} when the input is null/empty, no ImageIO
+     *         reader recognises the data, or reading fails
+     */
+    private BufferedImage decodeImage(byte[] imageBytes) {
+        if (imageBytes == null || imageBytes.length == 0) {
+            return null;
+        }
+        try {
+            // ImageIO.read returns null (it does not throw) when no reader can handle the bytes.
+            return ImageIO.read(new ByteArrayInputStream(imageBytes));
+        } catch (IOException ex) {
+            // Best-effort: the caller skips null images, but keep the cause for diagnosis.
+            log.debug("Image decoding failed", ex);
+            return null;
+        }
+    }

    private String detectLabel(String caption, int counter) {
@@ -122,14 +134,18 @@ public class FigureExtractionService {
    }

    private FigureType classifyType(String caption, String pageText) {
-        String combined = ((caption != null ? caption : "") + " " + (pageText != null ? pageText : "")).toLowerCase();
+        String combined = ((caption != null ? caption : "") + " " +
+                           (pageText != null ? pageText : "")).toLowerCase();
        if (combined.contains("mri") || combined.contains("ct ") || combined.contains("magnetic")
-                || combined.contains("tomography")) return FigureType.MRI_CT_SCAN;
-        if (combined.contains("intraoperative") || combined.contains("intra-op")) return FigureType.INTRAOPERATIVE_IMAGE;
-        if (caption != null && caption.toLowerCase().startsWith("table")) return FigureType.TABLE;
+                || combined.contains("tomography"))    return FigureType.MRI_CT_SCAN;
+        if (combined.contains("intraoperative") || combined.contains("intra-op"))
+                                                       return FigureType.INTRAOPERATIVE_IMAGE;
+        if (caption != null && caption.toLowerCase().startsWith("table"))
+                                                       return FigureType.TABLE;
        if (combined.contains("chart") || combined.contains("histogram") || combined.contains("graph"))
-            return FigureType.CHART;
-        if (combined.contains("photograph") || combined.contains("photo")) return FigureType.SURGICAL_PHOTOGRAPH;
+                                                       return FigureType.CHART;
+        if (combined.contains("photograph") || combined.contains("photo"))
+                                                       return FigureType.SURGICAL_PHOTOGRAPH;
        return FigureType.ANATOMICAL_DIAGRAM;
    }
 }
@@ -0,0 +1,14 @@
+package com.aiteacher.document;
+
+import java.util.UUID;
+
+/**
+ * Storage abstraction for per-page markdown of a book, addressed by book id and page number.
+ */
+public interface MarkdownStorageService {
+    /**
+     * Uploads the markdown content and returns the S3 key.
+     *
+     * @param bookId     owning book
+     * @param pageNumber page the markdown belongs to
+     * @param markdown   markdown text to store
+     * @return the S3 key under which the content was stored
+     */
+    String save(UUID bookId, int pageNumber, String markdown);
+
+    /**
+     * Downloads and returns the markdown content for the given book and page.
+     *
+     * <p>NOTE(review): behaviour for a page with no stored markdown is not specified here;
+     * presumably implementations throw — confirm and document the exception type.
+     *
+     * @param bookId     owning book
+     * @param pageNumber page to fetch
+     * @return the stored markdown text
+     */
+    String getText(UUID bookId, int pageNumber);
+
+    /**
+     * Deletes all markdown files for the given book.
+     *
+     * @param bookId book whose markdown files are removed
+     */
+    void deleteAll(UUID bookId);
+}
@@ -0,0 +1,273 @@
+package com.aiteacher.document;
+
+import tools.jackson.databind.JsonNode;
+import tools.jackson.databind.ObjectMapper;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Qualifier;
+import org.springframework.core.io.FileSystemResource;
+import org.springframework.http.MediaType;
+import org.springframework.stereotype.Service;
+import org.springframework.util.LinkedMultiValueMap;
+import org.springframework.util.MultiValueMap;
+import org.springframework.web.client.RestClient;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.*;
+
+/**
+ * Parses a PDF using the local Marker server ({@code POST /marker/upload}).
+ *
+ * <p>A single HTTP call returns:
+ * <ul>
+ *   <li>Reading-order text blocks — correct for multi-column and scanned pages</li>
+ *   <li>Section headings extracted from {@code SectionHeader} blocks</li>
+ *   <li>Pre-cropped figure images as base64-encoded PNG in each {@code Figure} block's
+ *       {@code images} map</li>
+ * </ul>
+ *
+ * <p>The response is mapped to one {@link PageResult} per page block.
+ */
+@Service
+public class MarkerPageParser {
+
+    private static final Logger log = LoggerFactory.getLogger(MarkerPageParser.class);
+
+    private static final Set<String> TEXT_BLOCK_TYPES = Set.of(
+            "Text", "TextInlineMath", "ListItem", "Table", "Code", "Equation",
+            "Footnote", "Caption", "PageHeader", "PageFooter", "Handwriting"
+    );
+    private static final Set<String> FIGURE_BLOCK_TYPES = Set.of("Figure", "Picture", "FigureGroup", "PictureGroup");
+
+    private final RestClient restClient;
+    private final ObjectMapper objectMapper;
+
+    /**
+     * Creates the parser.
+     *
+     * @param restClient   client used for the {@code /marker/upload} call; injected from the
+     *                     bean qualified as {@code markerRestClient}
+     * @param objectMapper mapper used to parse the JSON-encoded {@code output} field of the
+     *                     Marker response
+     */
+    public MarkerPageParser(@Qualifier("markerRestClient") RestClient restClient, ObjectMapper objectMapper) {
+        this.restClient = restClient;
+        this.objectMapper = objectMapper;
+    }
+
+    /**
+     * Parses the entire PDF and returns one {@link PageResult} per non-empty page.
+     *
+     * <p>The file is uploaded as multipart form data to {@code POST /marker/upload} with
+     * {@code output_format=json}. When the call yields no response body an empty list is
+     * returned. The raw response is written to a scratch file only when debug logging is
+     * enabled for this class.
+     *
+     * @param pdfPath path of the PDF file to upload
+     * @return page results in document order; empty when Marker returned nothing usable
+     */
+    public List<PageResult> parse(Path pdfPath) {
+        log.info("Submitting {} to Marker for parsing", pdfPath.getFileName());
+
+        MultiValueMap<String, Object> body = new LinkedMultiValueMap<>();
+        body.add("file", new FileSystemResource(pdfPath));
+        body.add("output_format", "json");
+
+        JsonNode response = restClient.post()
+                .uri("/marker/upload")
+                .contentType(MediaType.MULTIPART_FORM_DATA)
+                .body(body)
+                .retrieve()
+                .body(JsonNode.class);
+
+        if (response == null) {
+            // parseResponse tolerates a null response, but the debug dump used to dereference
+            // it first and failed with a NullPointerException.
+            log.warn("Marker returned no response body for {}", pdfPath.getFileName());
+            return List.of();
+        }
+
+        if (log.isDebugEnabled()) {
+            // The dump contains the full document content and always targets the same file,
+            // so it is limited to explicit debugging sessions.
+            dumpResponseForDebugging(response);
+        }
+
+        List<PageResult> results = parseResponse(response);
+        log.info("Marker produced {} page results from {}", results.size(), pdfPath.getFileName());
+        return results;
+    }
+
+    /** Writes the raw Marker response to a scratch file; an I/O failure is logged and ignored. */
+    private void dumpResponseForDebugging(JsonNode response) {
+        try {
+            Path debugFile = Path.of("/tmp/marker-response-md.json");
+            Files.writeString(debugFile, response.toPrettyString());
+            log.debug("Marker response saved to {}", debugFile);
+        } catch (IOException e) {
+            log.warn("Could not save Marker response to file", e);
+        }
+    }
+
+    // --- Private helpers ---
+
+    /**
+     * Maps the Marker response to page results.
+     *
+     * <p>The document root is located first: a textual {@code output} field is parsed as a
+     * JSON document, a non-textual {@code output} field is used directly, and when the field
+     * is absent the response itself is the root. Every child of the root whose
+     * {@code block_type} is {@code Page} is numbered consecutively starting at 1; pages with
+     * neither text nor figures are left out of the result (their number is still consumed).
+     *
+     * @param response the Marker response, may be {@code null}
+     * @return the non-empty pages in document order, or an empty list when nothing is usable
+     */
+    private List<PageResult> parseResponse(JsonNode response) {
+        if (response == null) {
+            return List.of();
+        }
+
+        JsonNode output = response.path("output");
+        JsonNode documentRoot;
+        if (output.isMissingNode()) {
+            // No "output" field at all: the response itself is the document.
+            documentRoot = response;
+        } else if (output.isTextual()) {
+            // "output" carries the document as a JSON-encoded string.
+            try {
+                documentRoot = objectMapper.readTree(output.asText());
+            } catch (tools.jackson.core.JacksonException e) {
+                log.warn("Could not parse Marker 'output' field as JSON", e);
+                return List.of();
+            }
+        } else {
+            documentRoot = output;
+        }
+
+        JsonNode pageBlocks = documentRoot.path("children");
+        boolean hasPageArray = !pageBlocks.isMissingNode() && pageBlocks.isArray();
+        if (!hasPageArray) {
+            log.warn("Marker response has no 'children' array — empty result");
+            return List.of();
+        }
+
+        List<PageResult> pages = new ArrayList<>();
+        int pageNumber = 0;
+        for (JsonNode block : pageBlocks) {
+            if (!"Page".equals(block.path("block_type").asText())) {
+                continue;
+            }
+            pageNumber++;
+
+            PageResult page = parsePage(block, pageNumber);
+            boolean empty = page.orderedText().isBlank() && page.figures().isEmpty();
+            if (!empty) {
+                pages.add(page);
+            }
+        }
+        return pages;
+    }
+
+    /**
+     * Builds the {@link PageResult} for one {@code Page} block.
+     *
+     * <p>Children are visited in order. {@code SectionHeader} blocks contribute to the text
+     * and become {@code ## } headings in the markdown; the first non-empty one is also the
+     * page's heading title. Blocks listed in {@code TEXT_BLOCK_TYPES} contribute their
+     * tag-stripped text. Figure blocks are handed to {@code extractFigures}. All other block
+     * types are ignored.
+     *
+     * @param pageBlock  the {@code Page} block node
+     * @param pageNumber page number to record in the result
+     * @return the page result; text and markdown are empty when the block has no children array
+     */
+    private PageResult parsePage(JsonNode pageBlock, int pageNumber) {
+        JsonNode children = pageBlock.path("children");
+        if (children.isMissingNode() || !children.isArray()) {
+            return new PageResult(pageNumber, "", null, List.of(), "");
+        }
+
+        StringBuilder textBuilder = new StringBuilder();
+        StringBuilder markdownBuilder = new StringBuilder();
+        String headingTitle = null;
+        List<PageResult.FigureData> figures = new ArrayList<>();
+        // Indices of Caption siblings already attached to a preceding figure; they must not
+        // be emitted a second time as ordinary text.
+        Set<Integer> consumed = new HashSet<>();
+
+        List<JsonNode> childList = new ArrayList<>();
+        children.forEach(childList::add);
+
+        for (int i = 0; i < childList.size(); i++) {
+            if (consumed.contains(i)) continue;
+
+            JsonNode child = childList.get(i);
+            String type = child.path("block_type").asText();
+
+            if ("SectionHeader".equals(type)) {
+                String heading = stripHtml(child.path("html").asText());
+                if (heading.isEmpty()) {
+                    // An empty header previously added a bare "##" line to the markdown.
+                    continue;
+                }
+                if (headingTitle == null) {
+                    headingTitle = heading;
+                }
+                appendText(textBuilder, heading);
+                appendMarkdown(markdownBuilder, "## " + heading);
+
+            } else if (TEXT_BLOCK_TYPES.contains(type)) {
+                String text = stripHtml(child.path("html").asText());
+                appendText(textBuilder, text);
+                appendMarkdown(markdownBuilder, text.strip());
+
+            } else if (FIGURE_BLOCK_TYPES.contains(type)) {
+                extractFigures(child, i, childList, figures, markdownBuilder, consumed);
+            }
+        }
+
+        return new PageResult(pageNumber, textBuilder.toString().strip(), headingTitle,
+                figures, markdownBuilder.toString().strip());
+    }
+
+    /**
+     * Collects the figure images of the figure/picture block found at {@code index} in
+     * {@code siblings}.
+     *
+     * <p>Leaf blocks carry their own image; when the sibling directly after the block is a
+     * {@code Caption}, its text becomes the figure caption and its index is added to
+     * {@code consumed} (only if the block actually yielded image bytes). Group blocks
+     * ({@code FigureGroup}, {@code PictureGroup}) hold the images in child
+     * {@code Figure}/{@code Picture} blocks, which all share the last non-empty
+     * {@code Caption} child of the group. For every extracted image an
+     * {@code ![caption](marker://{blockId})} reference is appended to {@code markdown}; the
+     * block ID is used as alt text when no caption exists.
+     */
+    private void extractFigures(JsonNode block, int index, List<JsonNode> siblings,
+                                 List<PageResult.FigureData> out, StringBuilder markdown,
+                                 Set<Integer> consumed) {
+        boolean groupBlock = block.path("block_type").asText().endsWith("Group");
+
+        if (!groupBlock) {
+            String leafId = block.path("id").asText();
+            byte[] leafImage = extractImageBytes(block, leafId);
+            if (leafImage == null) {
+                return;
+            }
+
+            String leafCaption = null;
+            int captionIndex = index + 1;
+            if (captionIndex < siblings.size()) {
+                JsonNode following = siblings.get(captionIndex);
+                if ("Caption".equals(following.path("block_type").asText())) {
+                    String candidate = stripHtml(following.path("html").asText()).strip();
+                    if (!candidate.isEmpty()) {
+                        leafCaption = candidate;
+                    }
+                    // The caption belongs to this figure even when it turned out to be empty.
+                    consumed.add(captionIndex);
+                }
+            }
+            recordFigure(leafImage, leafCaption, leafId, out, markdown);
+            return;
+        }
+
+        JsonNode members = block.path("children");
+        if (members.isMissingNode() || !members.isArray()) {
+            return;
+        }
+
+        // First pass: the last non-empty Caption child is shared by all images of the group.
+        String sharedCaption = null;
+        for (JsonNode member : members) {
+            if (!"Caption".equals(member.path("block_type").asText())) {
+                continue;
+            }
+            String candidate = stripHtml(member.path("html").asText()).strip();
+            if (!candidate.isEmpty()) {
+                sharedCaption = candidate;
+            }
+        }
+
+        // Second pass: one figure per Figure/Picture child that carries decodable image data.
+        for (JsonNode member : members) {
+            String memberType = member.path("block_type").asText();
+            if (!"Figure".equals(memberType) && !"Picture".equals(memberType)) {
+                continue;
+            }
+            String memberId = member.path("id").asText();
+            byte[] memberImage = extractImageBytes(member, memberId);
+            if (memberImage != null) {
+                recordFigure(memberImage, sharedCaption, memberId, out, markdown);
+            }
+        }
+    }
+
+    /** Adds one figure to {@code out} and appends its image reference to {@code markdown}. */
+    private void recordFigure(byte[] imageBytes, String caption, String blockId,
+                              List<PageResult.FigureData> out, StringBuilder markdown) {
+        out.add(new PageResult.FigureData(imageBytes, caption, blockId));
+        String altText = caption != null ? caption : blockId;
+        appendMarkdown(markdown, "![" + altText + "](marker://" + blockId + ")");
+    }
+
+    /**
+     * Returns the decoded image bytes stored in the block's {@code images} map.
+     *
+     * <p>The entry keyed by {@code blockId} is preferred; when it is absent the first entry of
+     * the map is used instead.
+     *
+     * @return the Base64-decoded bytes, or {@code null} when the block has no images, the
+     *         chosen entry is empty, or its content is not valid Base64 (logged as a warning)
+     */
+    private byte[] extractImageBytes(JsonNode block, String blockId) {
+        JsonNode imageMap = block.path("images");
+        if (imageMap.isMissingNode() || imageMap.isEmpty()) {
+            return null;
+        }
+
+        JsonNode ownEntry = imageMap.path(blockId);
+        JsonNode chosen = ownEntry;
+        if (ownEntry.isMissingNode()) {
+            // Not keyed by this block's ID: take whatever entry comes first.
+            chosen = imageMap.properties().stream()
+                    .findFirst()
+                    .map(Map.Entry::getValue)
+                    .orElse(ownEntry);
+        }
+
+        String encoded = chosen.asText();
+        if (encoded.isEmpty()) {
+            return null;
+        }
+
+        byte[] decoded;
+        try {
+            decoded = Base64.getDecoder().decode(encoded);
+        } catch (IllegalArgumentException ex) {
+            log.warn("Could not decode base64 image for block {}: {}", blockId, ex.getMessage());
+            decoded = null;
+        }
+        return decoded;
+    }
+
+    /** Appends the stripped {@code text} as a new paragraph; text that strips to nothing is ignored. */
+    private void appendText(StringBuilder sb, String text) {
+        String paragraph = text.strip();
+        if (!paragraph.isEmpty()) {
+            // Paragraphs are separated by a blank line, except before the very first one.
+            sb.append(sb.length() == 0 ? "" : "\n\n").append(paragraph);
+        }
+    }
+
+    /** Appends the stripped {@code text} as a new markdown paragraph; null or blank text is ignored. */
+    private void appendMarkdown(StringBuilder sb, String text) {
+        boolean nothingToAdd = text == null || text.isBlank();
+        if (nothingToAdd) {
+            return;
+        }
+        String separator = sb.length() > 0 ? "\n\n" : "";
+        sb.append(separator).append(text.strip());
+    }
+
+    /**
+     * Removes everything that looks like an HTML tag, collapses runs of two or more whitespace
+     * characters into a single space and strips the result. HTML entities are left as they are.
+     *
+     * @param html markup to clean, may be {@code null}
+     * @return the cleaned text; empty for {@code null} or empty input
+     */
+    private String stripHtml(String html) {
+        if (html == null || html.isEmpty()) {
+            return "";
+        }
+        String withoutTags = html.replaceAll("<[^>]*>", "");
+        String collapsed = withoutTags.replaceAll("\\s{2,}", " ");
+        return collapsed.strip();
+    }
+}
@@ -0,0 +1,26 @@
+package com.aiteacher.document;
+
+import java.util.List;
+
+/**
+ * Internal DTO produced by MarkerPageParser for one PDF page.
+ * Decouples the Marker HTTP API from downstream services.
+ *
+ * @param pageNumber   1-based page number, derived from the Marker page block index
+ * @param orderedText  full page text in reading order, blocks joined by a blank line
+ * @param headingTitle first SectionHeader block on the page, or {@code null}
+ * @param figures      extracted figure images, possibly empty
+ * @param markdown     markdown representation with {@code marker://{blockId}} image placeholders
+ */
+public record PageResult(
+        int pageNumber,           // 1-based, derived from Marker page block index
+        String orderedText,       // full page text in correct reading order (blocks joined by \n\n)
+        String headingTitle,      // first SectionHeader block on page, or null
+        List<FigureData> figures, // extracted figure images (may be empty)
+        String markdown           // markdown representation with marker://{blockId} image placeholders
+) {
+
+    /**
+     * A figure extracted from the page.
+     * Image bytes are PNG data decoded from the Marker JSON {@code images} map.
+     *
+     * <p>Note: because {@code imageBytes} is an array, the record's generated
+     * {@code equals}/{@code hashCode} compare it by reference, not by content.
+     *
+     * @param imageBytes     PNG image data (base64-decoded from the Marker response)
+     * @param nearestCaption text of the adjacent Caption block, or {@code null}
+     * @param blockId        Marker block ID (e.g. "/page/0/Figure/2") for traceability
+     */
+    public record FigureData(
+            byte[] imageBytes,       // PNG image data (base64-decoded from Marker response)
+            String nearestCaption,   // text of the adjacent Caption block, or null
+            String blockId           // Marker block ID (e.g. "/page/0/Figure/2") for traceability
+    ) {}
+}
@@ -1,13 +1,17 @@
 package com.aiteacher.document;

+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.common.PDRectangle;
+import org.apache.pdfbox.text.PDFTextStripperByArea;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import org.springframework.ai.reader.pdf.PagePdfDocumentReader;
-import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig;
-import org.springframework.core.io.FileSystemResource;
 import org.springframework.stereotype.Service;
 import org.springframework.transaction.annotation.Transactional;

+import java.awt.Rectangle;
+import java.io.IOException;
 import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.List;
@@ -15,13 +19,18 @@ import java.util.UUID;

 /**
 * Parses a PDF into page-level SectionEntity records stored in Postgres.
- * Each page becomes one section, grouped under a single chapter per book.
+ * Uses column-aware extraction via PDFTextStripperByArea: for two-column pages,
+ * left column is extracted first then right, preserving correct reading order.
+ * Text is also normalized (collapsed whitespace) before storage.
 */
@Service
 public class PdfStructureParser {

    private static final Logger log = LoggerFactory.getLogger(PdfStructureParser.class);

+    // Right column is considered empty (single-column page) if it has < 20% of left column's content
+    private static final double TWO_COLUMN_THRESHOLD = 0.2;
+
    private final ChapterRepository chapterRepository;
    private final SectionRepository sectionRepository;

@@ -35,37 +44,71 @@ public class PdfStructureParser {
    public List<SectionEntity> parse(UUID bookId, String bookTitle, Path pdfPath) {
        log.info("Parsing PDF structure for book {}", bookId);

-        // One chapter per book
        String chapterId = bookId + "-ch1";
        ChapterEntity chapter = new ChapterEntity(chapterId, bookId, 1, bookTitle, 1);
        chapterRepository.save(chapter);

-        // One section per page
-        PagePdfDocumentReader reader = new PagePdfDocumentReader(
-            new FileSystemResource(pdfPath.toFile()),
-            PdfDocumentReaderConfig.builder().withPagesPerDocument(1).build()
-        );
-
-        List<org.springframework.ai.document.Document> pages = reader.get();
        List<SectionEntity> sections = new ArrayList<>();

-        for (int i = 0; i < pages.size(); i++) {
-            int pageNum = i + 1;
-            String text = pages.get(i).getText();
-            if (text == null || text.isBlank()) continue;
+        // The document is closed by try-with-resources once all pages have been processed.
+        try (PDDocument doc = Loader.loadPDF(pdfPath.toFile())) {
+            List<PDPage> pages = new ArrayList<>();
+            doc.getPages().forEach(pages::add);

-            String sectionId = bookId + "-p" + pageNum;
-            SectionEntity section = new SectionEntity(
-                sectionId, chapterId, bookId,
-                String.valueOf(pageNum),
-                "Page " + pageNum,
-                pageNum, pageNum,
-                text
-            );
-            sections.add(sectionRepository.save(section));
+            // Walk every page of the document. A fixed upper bound would throw on PDFs with
+            // fewer pages and silently drop the remaining pages of longer ones.
+            for (int i = 0; i < pages.size(); i++) {
+                int pageNum = i + 1;
+                String text = normalizeWhitespace(extractPageText(pages.get(i)));
+                // Pages without extractable text produce no section.
+                if (text.isBlank()) continue;
+
+                String sectionId = bookId + "-p" + pageNum;
+                SectionEntity section = new SectionEntity(
+                    sectionId, chapterId, bookId,
+                    String.valueOf(pageNum),
+                    "Page " + pageNum,
+                    pageNum, pageNum,
+                    text
+                );
+                sections.add(sectionRepository.save(section));
+            }
+        } catch (IOException e) {
+            throw new RuntimeException("Failed to parse PDF for book " + bookId, e);
        }

        log.info("Parsed {} sections for book {}", sections.size(), bookId);
        return sections;
    }
+
+    /**
+     * Extracts the text of one page column by column.
+     *
+     * <p>The media box is cut into a left and a right half at the horizontal midpoint and
+     * both halves are extracted separately. When the right half holds fewer characters than
+     * {@code TWO_COLUMN_THRESHOLD} times the left half, the page counts as single-column and
+     * only the left text is returned (the right text only if the left is empty). Otherwise
+     * the left text is followed by the right text, separated by a blank line.
+     *
+     * @param page the page to read
+     * @return the extracted text, possibly empty
+     * @throws IOException if the region extraction fails
+     */
+    private String extractPageText(PDPage page) throws IOException {
+        PDRectangle box = page.getMediaBox();
+        int pageWidth = (int) box.getWidth();
+        int pageHeight = (int) box.getHeight();
+        int leftWidth = pageWidth / 2;
+        int rightWidth = pageWidth - leftWidth;
+
+        PDFTextStripperByArea areaStripper = new PDFTextStripperByArea();
+        areaStripper.setSortByPosition(true);
+        areaStripper.addRegion("left", new Rectangle(0, 0, leftWidth, pageHeight));
+        areaStripper.addRegion("right", new Rectangle(leftWidth, 0, rightWidth, pageHeight));
+        areaStripper.extractRegions(page);
+
+        String leftText = areaStripper.getTextForRegion("left").strip();
+        String rightText = areaStripper.getTextForRegion("right").strip();
+
+        boolean twoColumns = rightText.length() >= leftText.length() * TWO_COLUMN_THRESHOLD;
+        if (twoColumns) {
+            return leftText + "\n\n" + rightText;
+        }
+        // Single-column page: the left half is taken as the page text.
+        return leftText.isEmpty() ? rightText : leftText;
+    }
+
+    /**
+     * Replaces runs of two or more spaces/tabs with a single space, reduces three or more
+     * consecutive newlines to one blank line and trims the result.
+     */
+    private String normalizeWhitespace(String text) {
+        String singleSpaced = text.replaceAll("[ \t]{2,}", " ");
+        String compactParagraphs = singleSpaced.replaceAll("\n{3,}", "\n\n");
+        return compactParagraphs.trim();
+    }
 }
@@ -0,0 +1,97 @@
+package com.aiteacher.document;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.stereotype.Service;
+import software.amazon.awssdk.auth.credentials.AwsBasicCredentials;
+import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider;
+import software.amazon.awssdk.core.sync.RequestBody;
+import software.amazon.awssdk.regions.Region;
+import software.amazon.awssdk.services.s3.S3Client;
+import software.amazon.awssdk.services.s3.S3Configuration;
+import software.amazon.awssdk.services.s3.model.*;
+
+import java.net.URI;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.UUID;
+
+@Service
+public class S3MarkdownStorageService implements MarkdownStorageService {
+
+    private static final Logger log = LoggerFactory.getLogger(S3MarkdownStorageService.class);
+
+    private final S3Client s3;
+    private final String bucket;
+
+    /**
+     * Builds the S3 client from the {@code app.figure-storage.*} properties: a fixed endpoint,
+     * static credentials and path-style bucket addressing.
+     */
+    public S3MarkdownStorageService(
+            @Value("${app.figure-storage.endpoint}") String endpoint,
+            @Value("${app.figure-storage.region}") String region,
+            @Value("${app.figure-storage.bucket}") String bucket,
+            @Value("${app.figure-storage.access-key-id}") String accessKeyId,
+            @Value("${app.figure-storage.secret-access-key}") String secretKey) {
+        this.bucket = bucket;
+        this.s3 = S3Client.builder()
+                .endpointOverride(URI.create(endpoint))
+                .credentialsProvider(StaticCredentialsProvider.create(
+                        AwsBasicCredentials.create(accessKeyId, secretKey)))
+                .region(Region.of(region))
+                .serviceConfiguration(S3Configuration.builder()
+                        .pathStyleAccessEnabled(true)
+                        .build())
+                .build();
+    }
+
+    /**
+     * Stores the markdown as a UTF-8 object with content type
+     * {@code text/markdown; charset=utf-8} and returns its key.
+     */
+    @Override
+    public String save(UUID bookId, int pageNumber, String markdown) {
+        String objectKey = key(bookId, pageNumber);
+        byte[] payload = markdown.getBytes(StandardCharsets.UTF_8);
+        PutObjectRequest request = PutObjectRequest.builder()
+                .bucket(bucket)
+                .key(objectKey)
+                .contentType("text/markdown; charset=utf-8")
+                .contentLength((long) payload.length)
+                .build();
+        s3.putObject(request, RequestBody.fromBytes(payload));
+        return objectKey;
+    }
+
+    /** Fetches the object stored for the given book and page and decodes it as UTF-8. */
+    @Override
+    public String getText(UUID bookId, int pageNumber) {
+        GetObjectRequest request = GetObjectRequest.builder()
+                .bucket(bucket)
+                .key(key(bookId, pageNumber))
+                .build();
+        byte[] content = s3.getObjectAsBytes(request).asByteArray();
+        return new String(content, StandardCharsets.UTF_8);
+    }
+
+    /** Upper bound on the number of keys S3 accepts in a single DeleteObjects request. */
+    private static final int MAX_KEYS_PER_DELETE = 1000;
+
+    /**
+     * Deletes every object under the book's {@code markdown/{bookId}/} prefix.
+     *
+     * <p>Keys are collected across all listing pages and removed in batches of at most
+     * {@code MAX_KEYS_PER_DELETE}, because a single DeleteObjects request is limited to that
+     * many keys. Deletion is best effort: an {@link S3Exception} is logged as a warning and
+     * not propagated.
+     *
+     * @param bookId ID of the book whose markdown files are removed
+     */
+    @Override
+    public void deleteAll(UUID bookId) {
+        String prefix = "markdown/" + bookId + "/";
+        try {
+            List<ObjectIdentifier> toDelete = new ArrayList<>();
+            s3.listObjectsV2Paginator(ListObjectsV2Request.builder()
+                    .bucket(bucket).prefix(prefix).build()).stream()
+                    .flatMap(page -> page.contents().stream())
+                    .map(S3Object::key)
+                    .map(k -> ObjectIdentifier.builder().key(k).build())
+                    .forEach(toDelete::add);
+
+            if (toDelete.isEmpty()) return;
+
+            for (int from = 0; from < toDelete.size(); from += MAX_KEYS_PER_DELETE) {
+                int to = Math.min(from + MAX_KEYS_PER_DELETE, toDelete.size());
+                s3.deleteObjects(DeleteObjectsRequest.builder()
+                        .bucket(bucket)
+                        .delete(Delete.builder().objects(toDelete.subList(from, to)).build())
+                        .build());
+            }
+            log.info("Deleted {} markdown files from S3 for book {}", toDelete.size(), bookId);
+        } catch (S3Exception ex) {
+            log.warn("Could not fully delete markdown for book {} from S3: {}", bookId, ex.getMessage());
+        }
+    }
+
+    /** Builds the object key {@code markdown/{bookId}/page-{pageNumber}.md}. */
+    private static String key(UUID bookId, int pageNumber) {
+        StringBuilder objectKey = new StringBuilder("markdown/");
+        objectKey.append(bookId).append("/page-").append(pageNumber).append(".md");
+        return objectKey.toString();
+    }
+}
@@ -38,14 +38,52 @@ public class TextChunkingService {
        List<String> windows = new ArrayList<>();
        int start = 0;
        while (start < text.length()) {
-            int end = Math.min(start + TARGET_CHARS, text.length());
-            windows.add(text.substring(start, end));
-            if (end == text.length()) break;
-            start = end - OVERLAP_CHARS;
+            int hardEnd = Math.min(start + TARGET_CHARS, text.length());
+            if (hardEnd == text.length()) {
+                String last = text.substring(start).strip();
+                if (!last.isEmpty()) windows.add(last);
+                break;
+            }
+            int splitAt = findSplitPoint(text, start, hardEnd);
+            String chunk = text.substring(start, splitAt).strip();
+            if (!chunk.isEmpty()) windows.add(chunk);
+            // Overlap: back up from split point, align to a word start
+            int overlapStart = Math.max(start + 1, splitAt - OVERLAP_CHARS);
+            while (overlapStart < splitAt && text.charAt(overlapStart) != ' ') overlapStart++;
+            start = overlapStart < splitAt ? overlapStart + 1 : splitAt;
        }
        return windows;
    }

+    /**
+     * Picks where to end the chunk that begins at {@code start}, preferring (in order):
+     * paragraph boundary, sentence boundary, word boundary, hard cut at {@code hardEnd}.
+     *
+     * <p>Paragraph and sentence boundaries are only accepted within a lookback window of
+     * {@code min(400, (hardEnd - start) / 2)} characters before {@code hardEnd}; word
+     * boundaries within 100 characters. The paragraph search also accepts a blank line that
+     * begins at or just before {@code hardEnd}, so in that one case the returned index can be
+     * up to two characters past {@code hardEnd}.
+     *
+     * @return the index of the first character after the chosen boundary
+     */
+    private int findSplitPoint(String text, int start, int hardEnd) {
+        int window = Math.min(400, (hardEnd - start) / 2);
+        int windowFloor = Math.max(hardEnd - window, start);
+
+        // 1. Paragraph boundary: split right after the blank line.
+        int paragraphBreak = text.lastIndexOf("\n\n", hardEnd);
+        if (paragraphBreak > windowFloor) {
+            return paragraphBreak + 2;
+        }
+
+        // 2. Sentence boundary: '.', '!' or '?' directly followed by a space or newline.
+        int pos = hardEnd - 1;
+        while (pos > windowFloor) {
+            char c = text.charAt(pos);
+            boolean terminator = c == '.' || c == '!' || c == '?';
+            if (terminator && pos + 1 < text.length()) {
+                char following = text.charAt(pos + 1);
+                if (following == ' ' || following == '\n') {
+                    return pos + 1;
+                }
+            }
+            pos--;
+        }
+
+        // 3. Word boundary: split right after the nearest preceding space.
+        int wordFloor = Math.max(hardEnd - 100, start);
+        for (int idx = hardEnd - 1; idx > wordFloor; idx--) {
+            if (text.charAt(idx) == ' ') {
+                return idx + 1;
+            }
+        }
+
+        // 4. Nothing suitable found: hard cut.
+        return hardEnd;
+    }
+
    private Map<String, Object> buildMetadata(SectionEntity section, String bookTitle,
                                               int index, int total, String chunkId) {
        Map<String, Object> m = new HashMap<>();
@@ -8,18 +8,29 @@ import org.springframework.stereotype.Service;
 import org.springframework.util.MimeTypeUtils;

 /**
- * Generates a clinical text description for an extracted figure image
- * using the OpenAI vision model via Spring AI ChatClient.
+ * Analyses an extracted figure image using the OpenAI vision model.
+ *
+ * <p>Returns an {@link ImageAnalysis} record containing:
+ * <ul>
+ *   <li>{@code description} — 2-3 sentence clinical description of the image</li>
+ *   <li>{@code imageText} — all visible text, labels, and annotations copied verbatim
+ *       from the image (empty string when none present)</li>
+ * </ul>
+ *
+ * <p>Both fields are stored: {@code description} drives the embedding; {@code imageText}
+ * is added to chunk metadata so queries can match exact labels (e.g., "Circle of Willis").
 */
@Service
 public class VisionDescriptionService {

    private static final Logger log = LoggerFactory.getLogger(VisionDescriptionService.class);

-    private static final String PROMPT =
-        "You are a neurosurgery educator. Provide a brief 2-3 sentence clinical description of " +
-        "this image. Focus on anatomical structures, surgical landmarks, labels, and clinical " +
-        "significance. If text or labels are visible, include them verbatim.";
+    private static final String PROMPT = """
+            You are a neurosurgery educator analysing a medical image.
+            Respond in EXACTLY this format — no other text, no markdown:
+            DESCRIPTION: <2-3 sentence clinical description focusing on anatomical structures, surgical landmarks, and clinical significance>
+            IMAGE_TEXT: <all visible text, labels, measurements, and annotations copied verbatim, comma-separated; write NONE if no text visible>
+            """;

    private final ChatClient chatClient;

@@ -28,19 +39,53 @@ public class VisionDescriptionService {
    }

    /**
-     * Returns a description string. Falls back to the provided caption if vision fails.
+     * Holds the structured output of a vision model call on one figure image.
+     *
+     * @param description clinical description of the image content
+     * @param imageText   verbatim text visible inside the image; empty string if none
     */
-    public String describe(byte[] imageBytes, String captionFallback) {
+    public record ImageAnalysis(String description, String imageText) {}
+
+    /**
+     * Analyses the image bytes and returns an {@link ImageAnalysis}.
+     * Falls back gracefully: if the vision call fails, the caption is used as description
+     * and imageText is left empty. When no caption is available either, the description is
+     * the literal {@code "Figure"}.
+     *
+     * @param imageBytes    PNG bytes of the extracted figure
+     * @param captionFallback caption detected from surrounding text, may be null
+     * @return the analysis parsed from the model reply, or the caption-based fallback;
+     *         never {@code null}
+     */
+    public ImageAnalysis analyze(byte[] imageBytes, String captionFallback) {
        try {
-            return chatClient.prompt()
-                .user(u -> u
-                    .text(PROMPT)
-                    .media(MimeTypeUtils.IMAGE_PNG, new ByteArrayResource(imageBytes)))
-                .call()
-                .content();
+            // One multimodal user message: the fixed prompt plus the image as PNG media.
+            String raw = chatClient.prompt()
+                    .user(u -> u
+                            .text(PROMPT)
+                            .media(MimeTypeUtils.IMAGE_PNG, new ByteArrayResource(imageBytes)))
+                    .call()
+                    .content();
+            return parse(raw, captionFallback);
        } catch (Exception ex) {
-            log.warn("Vision description failed: {} — using caption as fallback", ex.getMessage());
-            return captionFallback != null ? captionFallback : "Figure";
+            // Broad catch: any failure of the call or of the parsing ends in the fallback.
+            log.warn("Vision analysis failed: {} — using caption as fallback", ex.getMessage());
+            return new ImageAnalysis(
+                    captionFallback != null ? captionFallback : "Figure",
+                    "");
        }
    }
+
+    /**
+     * Extracts the {@code DESCRIPTION:} and {@code IMAGE_TEXT:} fields from the raw model reply.
+     *
+     * <p>The reply is read line by line; each line is stripped first, so indentation and
+     * CRLF line endings do not prevent a field from being recognised. A missing or empty
+     * description falls back to the caption (or {@code "Figure"} when there is none). An image
+     * text that is missing, empty or {@code NONE} (case-insensitive) yields an empty string.
+     * When a field occurs more than once, the last non-empty occurrence wins.
+     *
+     * @param raw             raw model reply, may be {@code null}
+     * @param captionFallback caption used when no description can be extracted, may be {@code null}
+     * @return the extracted analysis; never {@code null}
+     */
+    private ImageAnalysis parse(String raw, String captionFallback) {
+        String description = captionFallback != null ? captionFallback : "Figure";
+        String imageText = "";
+
+        if (raw != null) {
+            for (String rawLine : raw.split("\n")) {
+                // Without stripping, an indented field line would be skipped silently.
+                String line = rawLine.strip();
+                if (line.startsWith("DESCRIPTION:")) {
+                    String val = line.substring("DESCRIPTION:".length()).strip();
+                    if (!val.isEmpty()) description = val;
+                } else if (line.startsWith("IMAGE_TEXT:")) {
+                    String val = line.substring("IMAGE_TEXT:".length()).strip();
+                    if (!val.isEmpty() && !"NONE".equalsIgnoreCase(val)) imageText = val;
+                }
+            }
+        }
+        return new ImageAnalysis(description, imageText);
+    }
 }
@@ -64,3 +64,5 @@ app:
  embedding:
    batch-size: 20
    batch-delay-ms: 2000
+  marker:
+    base-url: ${MARKER_BASE_URL:http://localhost:8000}