Add Marker to parse PDFs effectively

2026-04-04 21:30:18 +02:00
parent b154e29f2d
commit ea1276dc2e
25 changed files with 2318 additions and 285 deletions
@@ -1,43 +1,43 @@
 package com.aiteacher.document;

 import com.aiteacher.figure.FigureStorageService;
-import org.apache.pdfbox.Loader;
-import org.apache.pdfbox.cos.COSName;
-import org.apache.pdfbox.pdmodel.PDDocument;
-import org.apache.pdfbox.pdmodel.PDPage;
-import org.apache.pdfbox.pdmodel.graphics.PDXObject;
-import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.springframework.beans.factory.annotation.Value;
 import org.springframework.stereotype.Service;

+import javax.imageio.ImageIO;
 import java.awt.image.BufferedImage;
+import java.io.ByteArrayInputStream;
 import java.io.IOException;
-import java.nio.file.Path;
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 import java.util.UUID;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

 /**
- * Extracts images from each PDF page using PDFBox.
- * Images below the configured minimum size are skipped.
- * Caption is detected by the "Fig." pattern in page text.
+ * Extracts figure images from {@link PageResult.FigureData} entries produced by
+ * {@link MarkerPageParser}.
+ *
+ * <p>Marker returns pre-cropped PNG bytes for each detected figure, so no PDFBox
+ * page rendering or bounding-box cropping is needed. This service:
+ * <ol>
+ *   <li>Decodes the PNG bytes to check dimensions (skip images below min size)</li>
+ *   <li>Classifies the figure type from caption and surrounding text keywords</li>
+ *   <li>Persists the image via {@link FigureStorageService}</li>
+ *   <li>Persists a {@link FigureEntity} to the database</li>
+ * </ol>
 */
@Service
 public class FigureExtractionService {

    private static final Logger log = LoggerFactory.getLogger(FigureExtractionService.class);

-    // Caption: line starting with "Fig." or "Figure" followed by a number
-    private static final Pattern CAPTION_PATTERN =
-        Pattern.compile("(?m)^(Fig\\.?\\s*\\d+[\\-.]?\\d*[^\\n]*)", Pattern.CASE_INSENSITIVE);
-
-    // Figure label: "Fig. 12-4" or "Fig. 12.4"
    private static final Pattern LABEL_PATTERN =
-        Pattern.compile("(?i)Fig\\.?\\s*(\\d+[\\-.\\d]*)");
+            Pattern.compile("(?i)Fig\\.?\\s*(\\d+[\\-.\\d]*)");

    private final FigureStorageService storageService;
    private final FigureRepository figureRepository;
@@ -52,65 +52,77 @@ public class FigureExtractionService {
        this.minImageSizePx = minImageSizePx;
    }

+    /**
+     * Holds the extraction output: persisted figures and a Marker blockId → DB figureId map.
+     *
+     * @param figures           the figures that were persisted, in the order they were saved
+     * @param blockIdToFigureId maps the Marker block ID of each persisted figure to the
+     *                          figure ID it was stored under
+     */
+    public record ExtractionResult(List<FigureEntity> figures, Map<String, String> blockIdToFigureId) {}
+
    /**
-     * Extracts all qualifying images from the PDF for the given book.
-     * Returns persisted FigureEntity list (without vision descriptions — set later).
+     * Extracts and persists figures for all pages described by {@code pageResults}.
+     *
+     * <p>Extraction is best-effort: a figure that cannot be decoded, is smaller than the
+     * configured minimum size, or fails while being stored is logged and skipped, and
+     * processing continues with the next figure.
+     *
+     * @param bookId      owning book
+     * @param chapterId   chapter bucket for these sections
+     * @param pageResults Marker parse output — each entry's {@code figures} list
+     *                    carries pre-cropped PNG bytes for that page
+     * @return {@link ExtractionResult} with persisted figures and blockId→figureId map
+     *         (used to resolve markdown image placeholders)
     */
-    public List<FigureEntity> extract(UUID bookId, String chapterId,
-                                      List<SectionEntity> sections, Path pdfPath) {
+    public ExtractionResult extract(UUID bookId, String chapterId,
+                                    List<PageResult> pageResults) {
        List<FigureEntity> figures = new ArrayList<>();
+        Map<String, String> blockIdToFigureId = new HashMap<>();
        int figureCounter = 0;

-        try (PDDocument doc = Loader.loadPDF(pdfPath.toFile())) {
-            for (SectionEntity section : sections) {
-                int pageIndex = section.getPageStart() - 1; // 0-based
-                if (pageIndex < 0 || pageIndex >= doc.getNumberOfPages()) continue;
-
-                PDPage page = doc.getPage(pageIndex);
-                String pageText = section.getFullText();
+        for (PageResult page : pageResults) {
+            if (page.figures().isEmpty()) continue;

+            for (PageResult.FigureData figureData : page.figures()) {
                try {
-                    for (COSName name : page.getResources().getXObjectNames()) {
-                        PDXObject xObject = page.getResources().getXObject(name);
-                        if (!(xObject instanceof PDImageXObject image)) continue;
-
-                        BufferedImage bufferedImage = image.getImage();
-                        if (bufferedImage.getWidth() < minImageSizePx
-                                || bufferedImage.getHeight() < minImageSizePx) {
-                            continue; // skip decorative images
-                        }
-
-                        figureCounter++;
-                        String figureId = bookId + "-fig-" + pageIndex + "-" + figureCounter;
-                        String caption = detectCaption(pageText);
-                        String label = detectLabel(caption, figureCounter);
-                        FigureType type = classifyType(caption, pageText);
-
-                        String imagePath = storageService.save(bookId, figureId, bufferedImage);
-
-                        FigureEntity figure = new FigureEntity(
-                            figureId, bookId, section.getId(), chapterId,
-                            label, caption, type, section.getPageStart(), imagePath
-                        );
-                        figures.add(figureRepository.save(figure));
+                    BufferedImage image = decodeImage(figureData.imageBytes());
+                    if (image == null) {
+                        log.debug("Could not decode image on page {} of book {} (block {})",
+                                page.pageNumber(), bookId, figureData.blockId());
+                        continue;
                    }
-                } catch (IOException ex) {
-                    log.warn("Failed to extract images from page {} of book {}: {}",
-                        section.getPageStart(), bookId, ex.getMessage());
+                    if (image.getWidth() < minImageSizePx || image.getHeight() < minImageSizePx) {
+                        log.debug("Skipping small figure on page {} ({}×{})",
+                                page.pageNumber(), image.getWidth(), image.getHeight());
+                        continue;
+                    }
+
+                    // The counter is never reset per page, so it keeps figure IDs unique
+                    // across the whole call.
+                    figureCounter++;
+                    String figureId = bookId + "-fig-" + page.pageNumber() + "-" + figureCounter;
+                    String caption = figureData.nearestCaption();
+                    String label = detectLabel(caption, figureCounter);
+                    FigureType type = classifyType(caption, page.orderedText());
+
+                    // NOTE(review): assumes a section with this ID exists for the page —
+                    // confirm against the code that creates the sections.
+                    String sectionId = bookId + "-p" + page.pageNumber();
+                    String imagePath = storageService.save(bookId, figureId, image);
+
+                    FigureEntity figure = new FigureEntity(
+                            figureId, bookId, sectionId, chapterId,
+                            label, caption, type, page.pageNumber(), imagePath);
+                    figures.add(figureRepository.save(figure));
+                    blockIdToFigureId.put(figureData.blockId(), figureId);
+
+                } catch (Exception ex) {
+                    // Best-effort boundary: one bad figure must not abort the others.
+                    // The throwable is passed as the trailing argument so the stack trace
+                    // is logged, not just the message.
+                    log.warn("Failed to extract figure on page {} of book {}: {}",
+                            page.pageNumber(), bookId, ex.getMessage(), ex);
                }
            }
-        } catch (IOException ex) {
-            log.error("Could not open PDF for image extraction, book {}", bookId, ex);
        }

        log.info("Extracted {} figures for book {}", figures.size(), bookId);
-        return figures;
+        return new ExtractionResult(figures, blockIdToFigureId);
    }

-    private String detectCaption(String pageText) {
-        if (pageText == null) return null;
-        Matcher m = CAPTION_PATTERN.matcher(pageText);
-        return m.find() ? m.group(1).trim() : null;
+    // --- Private helpers ---
+
+    /**
+     * Decodes raw image bytes into a {@link BufferedImage}.
+     *
+     * @param imageBytes encoded image data; may be {@code null} or empty
+     * @return the decoded image, or {@code null} when there is nothing to decode, when
+     *         reading fails with an {@link IOException}, or when {@link ImageIO#read}
+     *         itself returns {@code null}
+     */
+    private BufferedImage decodeImage(byte[] imageBytes) {
+        if (imageBytes == null || imageBytes.length == 0) return null;
+        try {
+            return ImageIO.read(new ByteArrayInputStream(imageBytes));
+        } catch (IOException ex) {
+            // Deliberately not rethrown: the caller treats null as "skip this figure".
+            return null;
+        }
    }

    private String detectLabel(String caption, int counter) {
@@ -122,14 +134,18 @@ public class FigureExtractionService {
    }

    private FigureType classifyType(String caption, String pageText) {
-        String combined = ((caption != null ? caption : "") + " " + (pageText != null ? pageText : "")).toLowerCase();
+        String combined = ((caption != null ? caption : "") + " " +
+                           (pageText != null ? pageText : "")).toLowerCase();
        if (combined.contains("mri") || combined.contains("ct ") || combined.contains("magnetic")
-                || combined.contains("tomography")) return FigureType.MRI_CT_SCAN;
-        if (combined.contains("intraoperative") || combined.contains("intra-op")) return FigureType.INTRAOPERATIVE_IMAGE;
-        if (caption != null && caption.toLowerCase().startsWith("table")) return FigureType.TABLE;
+                || combined.contains("tomography"))    return FigureType.MRI_CT_SCAN;
+        if (combined.contains("intraoperative") || combined.contains("intra-op"))
+                                                       return FigureType.INTRAOPERATIVE_IMAGE;
+        if (caption != null && caption.toLowerCase().startsWith("table"))
+                                                       return FigureType.TABLE;
        if (combined.contains("chart") || combined.contains("histogram") || combined.contains("graph"))
-            return FigureType.CHART;
-        if (combined.contains("photograph") || combined.contains("photo")) return FigureType.SURGICAL_PHOTOGRAPH;
+                                                       return FigureType.CHART;
+        if (combined.contains("photograph") || combined.contains("photo"))
+                                                       return FigureType.SURGICAL_PHOTOGRAPH;
        return FigureType.ANATOMICAL_DIAGRAM;
    }
 }
@@ -0,0 +1,14 @@
+package com.aiteacher.document;
+
+import java.util.UUID;
+
+/**
+ * Stores, reads and removes the per-page markdown of a book.
+ */
+public interface MarkdownStorageService {
+    /**
+     * Uploads the markdown content and returns the S3 key.
+     *
+     * @param bookId     book the page belongs to
+     * @param pageNumber page the markdown belongs to
+     * @param markdown   markdown content to store
+     * @return the S3 key the content was stored under
+     */
+    String save(UUID bookId, int pageNumber, String markdown);
+
+    /**
+     * Downloads and returns the markdown content for the given book and page.
+     *
+     * @param bookId     book the page belongs to
+     * @param pageNumber page whose markdown is requested
+     * @return the stored markdown content
+     */
+    String getText(UUID bookId, int pageNumber);
+
+    /**
+     * Deletes all markdown files for the given book.
+     *
+     * @param bookId book whose markdown files are removed
+     */
+    void deleteAll(UUID bookId);
+}
@@ -0,0 +1,273 @@
+package com.aiteacher.document;
+
+import tools.jackson.databind.JsonNode;
+import tools.jackson.databind.ObjectMapper;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Qualifier;
+import org.springframework.core.io.FileSystemResource;
+import org.springframework.http.MediaType;
+import org.springframework.stereotype.Service;
+import org.springframework.util.LinkedMultiValueMap;
+import org.springframework.util.MultiValueMap;
+import org.springframework.web.client.RestClient;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.*;
+
+/**
+ * Parses a PDF using the local Marker server ({@code POST /marker/upload}).
+ *
+ * <p>A single HTTP call returns:
+ * <ul>
+ *   <li>Reading-order text blocks — correct for multi-column and scanned pages</li>
+ *   <li>Section headings extracted from {@code SectionHeader} blocks</li>
+ *   <li>Pre-cropped figure images as base64-encoded PNG in each {@code Figure} block's
+ *       {@code images} map</li>
+ * </ul>
+ *
+ * <p>The response is mapped to one {@link PageResult} per page block.
+ */
+@Service
+public class MarkerPageParser {
+
+    private static final Logger log = LoggerFactory.getLogger(MarkerPageParser.class);
+
+    private static final Set<String> TEXT_BLOCK_TYPES = Set.of(
+            "Text", "TextInlineMath", "ListItem", "Table", "Code", "Equation",
+            "Footnote", "Caption", "PageHeader", "PageFooter", "Handwriting"
+    );
+    private static final Set<String> FIGURE_BLOCK_TYPES = Set.of("Figure", "Picture", "FigureGroup", "PictureGroup");
+
+    private final RestClient restClient;
+    private final ObjectMapper objectMapper;
+
+    /**
+     * Creates the parser.
+     *
+     * @param restClient   client used to call the Marker server; the bean qualified as
+     *                     {@code markerRestClient} is injected
+     * @param objectMapper used to parse the response's {@code output} field when it
+     *                     arrives as a JSON-encoded string
+     */
+    public MarkerPageParser(@Qualifier("markerRestClient") RestClient restClient, ObjectMapper objectMapper) {
+        this.restClient = restClient;
+        this.objectMapper = objectMapper;
+    }
+
+    /**
+     * Parses the entire PDF and returns one {@link PageResult} per non-empty page.
+     *
+     * <p>The PDF is posted as multipart form data to {@code /marker/upload} with
+     * {@code output_format=json}. When debug logging is enabled, the raw response is
+     * also written to a fresh temporary file to help diagnose parsing problems.
+     *
+     * @param pdfPath path of the PDF to submit
+     * @return the page results in page order; empty when Marker returns no body
+     */
+    public List<PageResult> parse(Path pdfPath) {
+        log.info("Submitting {} to Marker for parsing", pdfPath.getFileName());
+
+        MultiValueMap<String, Object> body = new LinkedMultiValueMap<>();
+        body.add("file", new FileSystemResource(pdfPath));
+        body.add("output_format", "json");
+
+        JsonNode response = restClient.post()
+                .uri("/marker/upload")
+                .contentType(MediaType.MULTIPART_FORM_DATA)
+                .body(body)
+                .retrieve()
+                .body(JsonNode.class);
+
+        dumpResponseForDebugging(response);
+
+        List<PageResult> results = parseResponse(response);
+        log.info("Marker produced {} page results from {}", results.size(), pdfPath.getFileName());
+        return results;
+    }
+
+    /** Writes the raw Marker response to a temporary file, only when debug logging is on. */
+    private void dumpResponseForDebugging(JsonNode response) {
+        // A null body is handled by parseResponse; dereferencing it here would throw
+        // before that guard is ever reached.
+        if (response == null || !log.isDebugEnabled()) return;
+        try {
+            // A unique temp file instead of a fixed /tmp path: concurrent parses no
+            // longer overwrite each other and the location exists on every platform.
+            Path debugFile = Files.createTempFile("marker-response-", ".json");
+            Files.writeString(debugFile, response.toPrettyString());
+            log.debug("Marker response saved to {}", debugFile);
+        } catch (IOException e) {
+            log.warn("Could not save Marker response to file", e);
+        }
+    }
+
+    // --- Private helpers ---
+
+    /**
+     * Maps the Marker response to page results.
+     *
+     * <p>The document root is taken from the {@code output} field: parsed when it is a
+     * JSON-encoded string, used directly when it is any other node, and replaced by the
+     * whole response when the field is absent. Only top-level {@code Page} blocks are
+     * considered; pages with neither text nor figures are left out, but still count
+     * towards the page numbering.
+     *
+     * @param response response body, may be {@code null}
+     * @return page results in document order; empty when the response is {@code null},
+     *         the {@code output} string is not valid JSON, or there is no
+     *         {@code children} array
+     */
+    private List<PageResult> parseResponse(JsonNode response) {
+        if (response == null) {
+            return List.of();
+        }
+
+        JsonNode outputNode = response.path("output");
+        JsonNode root = response;
+        if (!outputNode.isMissingNode()) {
+            root = outputNode;
+            if (outputNode.isTextual()) {
+                // The field carries JSON encoded as a string: decode it first.
+                try {
+                    root = objectMapper.readTree(outputNode.asText());
+                } catch (tools.jackson.core.JacksonException e) {
+                    log.warn("Could not parse Marker 'output' field as JSON", e);
+                    return List.of();
+                }
+            }
+        }
+
+        JsonNode pageBlocks = root.path("children");
+        if (pageBlocks.isMissingNode() || !pageBlocks.isArray()) {
+            log.warn("Marker response has no 'children' array — empty result");
+            return List.of();
+        }
+
+        List<PageResult> results = new ArrayList<>();
+        int pageNumber = 0;
+        for (JsonNode pageBlock : pageBlocks) {
+            boolean isPage = "Page".equals(pageBlock.path("block_type").asText());
+            if (!isPage) {
+                continue;
+            }
+            pageNumber++; // 1-based, counted over Page blocks only
+
+            PageResult page = parsePage(pageBlock, pageNumber);
+            if (!page.orderedText().isBlank() || !page.figures().isEmpty()) {
+                results.add(page);
+            }
+        }
+        return results;
+    }
+
+    /**
+     * Converts one Marker {@code Page} block into a {@link PageResult}.
+     *
+     * <p>Direct children are visited in order: section headers and text-like blocks are
+     * appended to both the plain text and the markdown, figure blocks are handed to
+     * {@code extractFigures}. Block types in neither set are ignored. The first
+     * non-empty section header becomes the page's heading title.
+     *
+     * @param pageBlock  the {@code Page} block node
+     * @param pageNumber page number to record in the result
+     * @return the page result; empty text and no figures when the block has no
+     *         {@code children} array
+     */
+    private PageResult parsePage(JsonNode pageBlock, int pageNumber) {
+        JsonNode children = pageBlock.path("children");
+        if (children.isMissingNode() || !children.isArray()) {
+            return new PageResult(pageNumber, "", null, List.of(), "");
+        }
+
+        StringBuilder textBuilder = new StringBuilder();
+        StringBuilder markdownBuilder = new StringBuilder();
+        String headingTitle = null;
+        List<PageResult.FigureData> figures = new ArrayList<>();
+        Set<Integer> consumed = new HashSet<>(); // indices of Caption nodes consumed by a figure
+
+        // Indexed access is needed so a figure can look at its next sibling.
+        List<JsonNode> childList = new ArrayList<>();
+        children.forEach(childList::add);
+
+        for (int i = 0; i < childList.size(); i++) {
+            if (consumed.contains(i)) continue;
+
+            JsonNode child = childList.get(i);
+            String type = child.path("block_type").asText();
+
+            if ("SectionHeader".equals(type)) {
+                String heading = stripHtml(child.path("html").asText()).strip();
+                // Skip headers without text: "## " + "" would otherwise leave a bare
+                // "##" line in the markdown.
+                if (heading.isEmpty()) continue;
+                if (headingTitle == null) {
+                    headingTitle = heading;
+                }
+                appendText(textBuilder, heading);
+                appendMarkdown(markdownBuilder, "## " + heading);
+
+            } else if (TEXT_BLOCK_TYPES.contains(type)) {
+                String text = stripHtml(child.path("html").asText());
+                appendText(textBuilder, text);
+                appendMarkdown(markdownBuilder, text.strip());
+
+            } else if (FIGURE_BLOCK_TYPES.contains(type)) {
+                extractFigures(child, i, childList, figures, markdownBuilder, consumed);
+            }
+        }
+
+        return new PageResult(pageNumber, textBuilder.toString().strip(), headingTitle,
+                figures, markdownBuilder.toString().strip());
+    }
+
+    /**
+     * Handles a figure/picture block at {@code index} in {@code siblings}.
+     * For group blocks (FigureGroup, PictureGroup) the image lives in a child Picture/Figure,
+     * and the caption is a sibling Caption child inside the group.
+     * For leaf blocks the caption is the next sibling in the page child list.
+     * Image refs are appended to {@code markdown} as {@code ![caption](marker://{blockId})}.
+     * Consumed caption sibling indices are added to {@code consumed}.
+     *
+     * <p>Blocks for which no image bytes can be extracted produce neither a figure nor a
+     * markdown reference.
+     *
+     * @param block    the figure, picture or group block
+     * @param index    position of {@code block} in {@code siblings}
+     * @param siblings all direct children of the page, in order
+     * @param out      receives one entry per extracted image
+     * @param markdown receives the image placeholder for each extracted image
+     * @param consumed receives the index of a following Caption sibling used by a leaf block
+     */
+    private void extractFigures(JsonNode block, int index, List<JsonNode> siblings,
+                                 List<PageResult.FigureData> out, StringBuilder markdown,
+                                 Set<Integer> consumed) {
+        String type = block.path("block_type").asText();
+        boolean isGroup = type.endsWith("Group");
+
+        if (isGroup) {
+            JsonNode groupChildren = block.path("children");
+            if (groupChildren.isMissingNode() || !groupChildren.isArray()) return;
+
+            // First pass: find the caption. With several Caption children the last
+            // non-empty one wins, and it is shared by every image of the group.
+            String groupCaption = null;
+            for (JsonNode sub : groupChildren) {
+                if ("Caption".equals(sub.path("block_type").asText())) {
+                    String c = stripHtml(sub.path("html").asText()).strip();
+                    if (!c.isEmpty()) groupCaption = c;
+                }
+            }
+            // Second pass: emit one figure per direct Figure/Picture child.
+            for (JsonNode sub : groupChildren) {
+                String subType = sub.path("block_type").asText();
+                if ("Figure".equals(subType) || "Picture".equals(subType)) {
+                    String blockId = sub.path("id").asText();
+                    byte[] imageBytes = extractImageBytes(sub, blockId);
+                    if (imageBytes != null) {
+                        out.add(new PageResult.FigureData(imageBytes, groupCaption, blockId));
+                        // NOTE(review): alt text is inserted verbatim; a caption containing
+                        // ']' would break the image syntax — confirm whether escaping is needed.
+                        String altText = groupCaption != null ? groupCaption : blockId;
+                        appendMarkdown(markdown, "![" + altText + "](marker://" + blockId + ")");
+                    }
+                }
+            }
+        } else {
+            String blockId = block.path("id").asText();
+            byte[] imageBytes = extractImageBytes(block, blockId);
+            if (imageBytes != null) {
+                String caption = null;
+                if (index + 1 < siblings.size()) {
+                    JsonNode next = siblings.get(index + 1);
+                    if ("Caption".equals(next.path("block_type").asText())) {
+                        String c = stripHtml(next.path("html").asText()).strip();
+                        if (!c.isEmpty()) caption = c;
+                        // Marked as consumed even when empty, so the caller skips the
+                        // Caption block instead of appending it to the page text again.
+                        consumed.add(index + 1);
+                    }
+                }
+                out.add(new PageResult.FigureData(imageBytes, caption, blockId));
+                String altText = caption != null ? caption : blockId;
+                appendMarkdown(markdown, "![" + altText + "](marker://" + blockId + ")");
+            }
+        }
+    }
+
+    /**
+     * Returns the decoded image data stored on a block.
+     *
+     * <p>The base64 payload is looked up in the block's {@code images} map under the
+     * block's own ID; when that key is absent, the first entry of the map is used.
+     *
+     * @param block   block that may carry an {@code images} map
+     * @param blockId ID of the block, used as the preferred map key
+     * @return the decoded bytes, or {@code null} when the block has no images, the
+     *         payload is empty, or it is not valid base64
+     */
+    private byte[] extractImageBytes(JsonNode block, String blockId) {
+        JsonNode images = block.path("images");
+        if (images.isMissingNode() || images.isEmpty()) {
+            return null;
+        }
+
+        JsonNode imgNode = images.path(blockId);
+        if (imgNode.isMissingNode()) {
+            // Not keyed by this block's ID: take whatever entry comes first.
+            imgNode = images.properties().stream()
+                    .findFirst()
+                    .map(Map.Entry::getValue)
+                    .orElse(imgNode);
+        }
+
+        String encoded = imgNode.asText();
+        if (encoded.isEmpty()) {
+            return null;
+        }
+
+        byte[] decoded = null;
+        try {
+            decoded = Base64.getDecoder().decode(encoded);
+        } catch (IllegalArgumentException ex) {
+            log.warn("Could not decode base64 image for block {}: {}", blockId, ex.getMessage());
+        }
+        return decoded;
+    }
+
+    /** Appends the stripped text as a new paragraph; text that strips to empty is ignored. */
+    private void appendText(StringBuilder sb, String text) {
+        String content = text.strip();
+        if (!content.isEmpty()) {
+            // Paragraphs are separated by a blank line, except before the first one.
+            String separator = sb.length() > 0 ? "\n\n" : "";
+            sb.append(separator).append(content);
+        }
+    }
+
+    /** Appends the stripped text as a new markdown paragraph; null or blank text is ignored. */
+    private void appendMarkdown(StringBuilder sb, String text) {
+        boolean hasContent = text != null && !text.isBlank();
+        if (hasContent) {
+            // Paragraphs are separated by a blank line, except before the first one.
+            String separator = sb.length() > 0 ? "\n\n" : "";
+            sb.append(separator).append(text.strip());
+        }
+    }
+
+    /**
+     * Strips HTML tags, decodes the most common entities and normalises whitespace.
+     *
+     * <p>Line breaks and closing block-level or table-cell tags are turned into a space
+     * first, so neighbouring cells and paragraphs are not glued together
+     * ({@code <td>a</td><td>b</td>} becomes {@code "a b"}, not {@code "ab"}). Inline
+     * tags are removed without inserting a space.
+     *
+     * @param html HTML fragment, may be {@code null}
+     * @return the plain text, or an empty string for {@code null} or empty input
+     */
+    private String stripHtml(String html) {
+        if (html == null || html.isEmpty()) return "";
+        String text = html
+                .replaceAll("(?i)<\\s*(?:br\\s*/?|/\\s*(?:p|div|li|tr|td|th|h[1-6]))\\s*>", " ")
+                .replaceAll("<[^>]*>", "")
+                // NOTE(review): only the common named entities are decoded; other
+                // numeric or named entities pass through unchanged.
+                .replace("&nbsp;", " ")
+                .replace("&lt;", "<")
+                .replace("&gt;", ">")
+                .replace("&quot;", "\"")
+                .replace("&#39;", "'")
+                // "&amp;" goes last so that "&amp;lt;" ends up as "&lt;", not "<".
+                .replace("&amp;", "&");
+        return text.replaceAll("\\s{2,}", " ").strip();
+    }
+}
@@ -0,0 +1,26 @@
+package com.aiteacher.document;
+
+import java.util.List;
+
+/**
+ * Internal DTO produced by MarkerPageParser for one PDF page.
+ * Decouples the Marker HTTP API from downstream services.
+ *
+ * <p>The record stores the {@code figures} list exactly as it is passed in; no
+ * defensive copy is made.
+ */
+public record PageResult(
+        int pageNumber,           // 1-based, derived from Marker page block index
+        String orderedText,       // full page text in correct reading order (blocks joined by \n\n)
+        String headingTitle,      // first SectionHeader block on page, or null
+        List<FigureData> figures, // extracted figure images (may be empty)
+        String markdown           // markdown representation with marker://{blockId} image placeholders
+) {
+
+    /**
+     * A figure extracted from the page.
+     * Image bytes are PNG data decoded from the Marker JSON {@code images} map.
+     *
+     * <p>Because {@code imageBytes} is an array, the generated {@code equals} and
+     * {@code hashCode} compare it by identity rather than by content.
+     */
+    public record FigureData(
+            byte[] imageBytes,       // PNG image data (base64-decoded from Marker response)
+            String nearestCaption,   // text of the adjacent Caption block, or null
+            String blockId           // Marker block ID (e.g. "/page/0/Figure/2") for traceability
+    ) {}
+}
@@ -1,13 +1,17 @@
 package com.aiteacher.document;

+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.common.PDRectangle;
+import org.apache.pdfbox.text.PDFTextStripperByArea;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import org.springframework.ai.reader.pdf.PagePdfDocumentReader;
-import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig;
-import org.springframework.core.io.FileSystemResource;
 import org.springframework.stereotype.Service;
 import org.springframework.transaction.annotation.Transactional;

+import java.awt.Rectangle;
+import java.io.IOException;
 import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.List;
@@ -15,13 +19,18 @@ import java.util.UUID;

 /**
 * Parses a PDF into page-level SectionEntity records stored in Postgres.
- * Each page becomes one section, grouped under a single chapter per book.
+ * Uses column-aware extraction via PDFTextStripperByArea: for two-column pages,
+ * left column is extracted first then right, preserving correct reading order.
+ * Text is also normalized (collapsed whitespace) before storage.
 */
@Service
 public class PdfStructureParser {

    private static final Logger log = LoggerFactory.getLogger(PdfStructureParser.class);

+    // Right column is considered empty (single-column page) if it has < 20% of left column's content
+    private static final double TWO_COLUMN_THRESHOLD = 0.2;
+
    private final ChapterRepository chapterRepository;
    private final SectionRepository sectionRepository;

@@ -35,37 +44,71 @@ public class PdfStructureParser {
    public List<SectionEntity> parse(UUID bookId, String bookTitle, Path pdfPath) {
        log.info("Parsing PDF structure for book {}", bookId);

-        // One chapter per book
+        // The whole book is stored under a single chapter.
        String chapterId = bookId + "-ch1";
        ChapterEntity chapter = new ChapterEntity(chapterId, bookId, 1, bookTitle, 1);
        chapterRepository.save(chapter);

-        // One section per page
-        PagePdfDocumentReader reader = new PagePdfDocumentReader(
-            new FileSystemResource(pdfPath.toFile()),
-            PdfDocumentReaderConfig.builder().withPagesPerDocument(1).build()
-        );
-
-        List<org.springframework.ai.document.Document> pages = reader.get();
        List<SectionEntity> sections = new ArrayList<>();

-        for (int i = 0; i < pages.size(); i++) {
-            int pageNum = i + 1;
-            String text = pages.get(i).getText();
-            if (text == null || text.isBlank()) continue;
+        try (PDDocument doc = Loader.loadPDF(pdfPath.toFile())) {
+            List<PDPage> pages = new ArrayList<>();
+            doc.getPages().forEach(pages::add);

-            String sectionId = bookId + "-p" + pageNum;
-            SectionEntity section = new SectionEntity(
-                sectionId, chapterId, bookId,
-                String.valueOf(pageNum),
-                "Page " + pageNum,
-                pageNum, pageNum,
-                text
-            );
-            sections.add(sectionRepository.save(section));
+            // Walk every page of the document. A fixed upper bound would throw
+            // IndexOutOfBoundsException on shorter PDFs and silently drop all later
+            // pages of longer ones.
+            for (int i = 0; i < pages.size(); i++) {
+                int pageNum = i + 1;
+                String text = normalizeWhitespace(extractPageText(pages.get(i)));
+                // Pages without extractable text get no section.
+                if (text.isBlank()) continue;
+
+                // One section per page; the page number doubles as number and title.
+                String sectionId = bookId + "-p" + pageNum;
+                SectionEntity section = new SectionEntity(
+                    sectionId, chapterId, bookId,
+                    String.valueOf(pageNum),
+                    "Page " + pageNum,
+                    pageNum, pageNum,
+                    text
+                );
+                sections.add(sectionRepository.save(section));
+            }
+        } catch (IOException e) {
+            throw new RuntimeException("Failed to parse PDF for book " + bookId, e);
        }

        log.info("Parsed {} sections for book {}", sections.size(), bookId);
        return sections;
    }
+
+    /**
+     * Extracts text from a single page using column-aware region extraction.
+     * Splits the page at the horizontal midpoint. If the right region has fewer
+     * than 20% of the characters of the left region, treats the page as single-column
+     * and returns the text of the full-width region, so nothing right of the midpoint
+     * is lost. Otherwise the left column is returned first, then the right one.
+     *
+     * @param page page to extract text from
+     * @return the page text, stripped of leading and trailing whitespace per region
+     * @throws IOException if the page content cannot be processed
+     */
+    private String extractPageText(PDPage page) throws IOException {
+        PDRectangle mediaBox = page.getMediaBox();
+        int width  = (int) mediaBox.getWidth();
+        int height = (int) mediaBox.getHeight();
+        int mid    = width / 2;
+
+        PDFTextStripperByArea stripper = new PDFTextStripperByArea();
+        stripper.setSortByPosition(true);
+        stripper.addRegion("left",  new Rectangle(0,   0, mid,         height));
+        stripper.addRegion("right", new Rectangle(mid, 0, width - mid, height));
+        stripper.addRegion("full",  new Rectangle(0,   0, width,       height));
+        stripper.extractRegions(page);
+
+        String left  = stripper.getTextForRegion("left").strip();
+        String right = stripper.getTextForRegion("right").strip();
+
+        // NOTE(review): a single-column page whose lines span the full width presumably
+        // yields left and right regions of similar length and takes the two-column
+        // path below — confirm this is acceptable for the books being ingested.
+        if (right.length() < left.length() * TWO_COLUMN_THRESHOLD) {
+            // Single-column page: read the whole width so the small amount of text
+            // that does sit right of the midpoint is kept instead of being dropped.
+            return stripper.getTextForRegion("full").strip();
+        }
+        return left + "\n\n" + right;
+    }
+
+    /**
+     * Normalises whitespace: runs of two or more spaces/tabs become a single space,
+     * runs of three or more newlines become one blank line, and the result is trimmed.
+     */
+    private String normalizeWhitespace(String text) {
+        String singleSpaced = text.replaceAll("[ \t]{2,}", " ");
+        String compactParagraphs = singleSpaced.replaceAll("\n{3,}", "\n\n");
+        return compactParagraphs.trim();
+    }
 }
@@ -0,0 +1,97 @@
+package com.aiteacher.document;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.stereotype.Service;
+import software.amazon.awssdk.auth.credentials.AwsBasicCredentials;
+import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider;
+import software.amazon.awssdk.core.sync.RequestBody;
+import software.amazon.awssdk.regions.Region;
+import software.amazon.awssdk.services.s3.S3Client;
+import software.amazon.awssdk.services.s3.S3Configuration;
+import software.amazon.awssdk.services.s3.model.*;
+
+import java.net.URI;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.UUID;
+
+@Service
+public class S3MarkdownStorageService implements MarkdownStorageService {
+
+    private static final Logger log = LoggerFactory.getLogger(S3MarkdownStorageService.class);
+
+    private final S3Client s3;
+    private final String bucket;
+
+    /**
+     * Builds the S3 client from the {@code app.figure-storage.*} settings.
+     * The client uses static credentials, an explicit endpoint override and
+     * path-style bucket addressing.
+     */
+    public S3MarkdownStorageService(
+            @Value("${app.figure-storage.endpoint}") String endpoint,
+            @Value("${app.figure-storage.region}") String region,
+            @Value("${app.figure-storage.bucket}") String bucket,
+            @Value("${app.figure-storage.access-key-id}") String accessKeyId,
+            @Value("${app.figure-storage.secret-access-key}") String secretKey) {
+        this.bucket = bucket;
+        this.s3 = S3Client.builder()
+                .endpointOverride(URI.create(endpoint))
+                .region(Region.of(region))
+                .credentialsProvider(StaticCredentialsProvider.create(
+                        AwsBasicCredentials.create(accessKeyId, secretKey)))
+                .serviceConfiguration(S3Configuration.builder()
+                        .pathStyleAccessEnabled(true)
+                        .build())
+                .build();
+    }
+
+    /** Stores the markdown of one page as a UTF-8 object and returns the object key. */
+    @Override
+    public String save(UUID bookId, int pageNumber, String markdown) {
+        String objectKey = key(bookId, pageNumber);
+        byte[] payload = markdown.getBytes(StandardCharsets.UTF_8);
+
+        PutObjectRequest request = PutObjectRequest.builder()
+                .bucket(bucket)
+                .key(objectKey)
+                .contentType("text/markdown; charset=utf-8")
+                .contentLength((long) payload.length)
+                .build();
+        s3.putObject(request, RequestBody.fromBytes(payload));
+
+        return objectKey;
+    }
+
+    /** Downloads the markdown object of one page and decodes it as UTF-8. */
+    @Override
+    public String getText(UUID bookId, int pageNumber) {
+        GetObjectRequest request = GetObjectRequest.builder()
+                .bucket(bucket)
+                .key(key(bookId, pageNumber))
+                .build();
+        byte[] content = s3.getObjectAsBytes(request).asByteArray();
+        return new String(content, StandardCharsets.UTF_8);
+    }
+
+    // A single DeleteObjects request accepts at most 1000 keys.
+    private static final int MAX_KEYS_PER_DELETE = 1000;
+
+    /**
+     * Deletes every object stored under the book's {@code markdown/{bookId}/} prefix.
+     *
+     * <p>Keys are removed in batches so that books with more pages than one delete
+     * request can carry are still cleaned up completely. The operation is best-effort:
+     * an {@link S3Exception} and per-object failures are logged, not rethrown.
+     *
+     * @param bookId book whose markdown objects are removed
+     */
+    @Override
+    public void deleteAll(UUID bookId) {
+        String prefix = "markdown/" + bookId + "/";
+        try {
+            List<ObjectIdentifier> toDelete = s3.listObjectsV2Paginator(ListObjectsV2Request.builder()
+                    .bucket(bucket).prefix(prefix).build()).stream()
+                    .flatMap(page -> page.contents().stream())
+                    .map(S3Object::key)
+                    .map(k -> ObjectIdentifier.builder().key(k).build())
+                    .toList();
+
+            if (toDelete.isEmpty()) return;
+
+            int deleted = 0;
+            int failed = 0;
+            for (int from = 0; from < toDelete.size(); from += MAX_KEYS_PER_DELETE) {
+                int to = Math.min(from + MAX_KEYS_PER_DELETE, toDelete.size());
+                List<ObjectIdentifier> batch = toDelete.subList(from, to);
+
+                DeleteObjectsResponse response = s3.deleteObjects(DeleteObjectsRequest.builder()
+                        .bucket(bucket)
+                        .delete(Delete.builder().objects(batch).build())
+                        .build());
+
+                // The call can succeed as a whole while individual keys fail.
+                int batchFailures = response.hasErrors() ? response.errors().size() : 0;
+                failed += batchFailures;
+                deleted += batch.size() - batchFailures;
+            }
+
+            if (failed > 0) {
+                log.warn("Could not fully delete markdown for book {} from S3: {}",
+                        bookId, failed + " object(s) failed");
+            }
+            log.info("Deleted {} markdown files from S3 for book {}", deleted, bookId);
+        } catch (S3Exception ex) {
+            log.warn("Could not fully delete markdown for book {} from S3: {}", bookId, ex.getMessage());
+        }
+    }
+
+    /** Builds the object key of one page: {@code markdown/{bookId}/page-{pageNumber}.md}. */
+    private static String key(UUID bookId, int pageNumber) {
+        return new StringBuilder("markdown/")
+                .append(bookId)
+                .append("/page-")
+                .append(pageNumber)
+                .append(".md")
+                .toString();
+    }
+}
@@ -38,14 +38,52 @@ public class TextChunkingService {
        List<String> windows = new ArrayList<>();
        int start = 0;
        while (start < text.length()) {
-            int end = Math.min(start + TARGET_CHARS, text.length());
-            windows.add(text.substring(start, end));
-            if (end == text.length()) break;
-            start = end - OVERLAP_CHARS;
+            int hardEnd = Math.min(start + TARGET_CHARS, text.length());
+            if (hardEnd == text.length()) {
+                String last = text.substring(start).strip();
+                if (!last.isEmpty()) windows.add(last);
+                break;
+            }
+            int splitAt = findSplitPoint(text, start, hardEnd);
+            String chunk = text.substring(start, splitAt).strip();
+            if (!chunk.isEmpty()) windows.add(chunk);
+            // Overlap: back up from split point, align to a word start
+            int overlapStart = Math.max(start + 1, splitAt - OVERLAP_CHARS);
+            while (overlapStart < splitAt && text.charAt(overlapStart) != ' ') overlapStart++;
+            start = overlapStart < splitAt ? overlapStart + 1 : splitAt;
        }
        return windows;
    }

+    /**
+     * Finds the best split point at or before hardEnd, preferring (in order):
+     * paragraph boundary, sentence boundary, word boundary, hard cut.
+     *
+     * <p>Paragraph and sentence boundaries are only accepted inside a look-back window
+     * ending at {@code hardEnd} (at most 400 characters, and at most half of the span
+     * {@code hardEnd - start}); word boundaries are searched in the last 100 characters.
+     * A boundary at or before {@code start} is never used.
+     *
+     * @param text    full text being chunked
+     * @param start   index where the current chunk begins
+     * @param hardEnd largest index (exclusive) at which the chunk may end
+     * @return index at which to end the chunk; never greater than {@code hardEnd}
+     */
+    private int findSplitPoint(String text, int start, int hardEnd) {
+        int lookback = Math.min(400, (hardEnd - start) / 2);
+
+        // 1. Paragraph boundary. The search starts at hardEnd - 2 so that the split point
+        //    (just after the two newlines) can never land beyond hardEnd.
+        int paraIdx = text.lastIndexOf("\n\n", hardEnd - 2);
+        if (paraIdx > hardEnd - lookback && paraIdx > start) return paraIdx + 2;
+
+        // 2. Sentence boundary (. ! ?) followed by space or newline
+        for (int i = hardEnd - 1; i > hardEnd - lookback && i > start; i--) {
+            char c = text.charAt(i);
+            if ((c == '.' || c == '!' || c == '?') && i + 1 < text.length()) {
+                char next = text.charAt(i + 1);
+                if (next == ' ' || next == '\n') return i + 1;
+            }
+        }
+
+        // 3. Word boundary: split just after the last space so the next chunk starts on a word
+        for (int i = hardEnd - 1; i > hardEnd - 100 && i > start; i--) {
+            if (text.charAt(i) == ' ') return i + 1;
+        }
+
+        // 4. Hard cut
+        return hardEnd;
+    }
+
    private Map<String, Object> buildMetadata(SectionEntity section, String bookTitle,
                                               int index, int total, String chunkId) {
        Map<String, Object> m = new HashMap<>();
@@ -8,18 +8,29 @@ import org.springframework.stereotype.Service;
 import org.springframework.util.MimeTypeUtils;

 /**
- * Generates a clinical text description for an extracted figure image
- * using the OpenAI vision model via Spring AI ChatClient.
+ * Analyses an extracted figure image using the OpenAI vision model.
+ *
+ * <p>Returns an {@link ImageAnalysis} record containing:
+ * <ul>
+ *   <li>{@code description} — 2-3 sentence clinical description of the image</li>
+ *   <li>{@code imageText} — all visible text, labels, and annotations copied verbatim
+ *       from the image (empty string when none present)</li>
+ * </ul>
+ *
+ * <p>Both fields are stored: {@code description} drives the embedding; {@code imageText}
+ * is added to chunk metadata so queries can match exact labels (e.g., "Circle of Willis").
 */
@Service
 public class VisionDescriptionService {

    private static final Logger log = LoggerFactory.getLogger(VisionDescriptionService.class);

-    private static final String PROMPT =
-        "You are a neurosurgery educator. Provide a brief 2-3 sentence clinical description of " +
-        "this image. Focus on anatomical structures, surgical landmarks, labels, and clinical " +
-        "significance. If text or labels are visible, include them verbatim.";
+    // The DESCRIPTION: / IMAGE_TEXT: line prefixes requested here are the exact prefixes
+    // parse() looks for, and NONE is the marker it maps to an empty image text; keep the
+    // prompt and the parser in sync when editing either.
+    private static final String PROMPT = """
+            You are a neurosurgery educator analysing a medical image.
+            Respond in EXACTLY this format — no other text, no markdown:
+            DESCRIPTION: <2-3 sentence clinical description focusing on anatomical structures, surgical landmarks, and clinical significance>
+            IMAGE_TEXT: <all visible text, labels, measurements, and annotations copied verbatim, comma-separated; write NONE if no text visible>
+            """;

    private final ChatClient chatClient;

@@ -28,19 +39,53 @@ public class VisionDescriptionService {
    }

    /**
-     * Returns a description string. Falls back to the provided caption if vision fails.
+     * Holds the structured output of a vision model call on one figure image.
+     * Instances created by this service never carry {@code null} components: the
+     * description falls back to the caption or {@code "Figure"}, the image text to {@code ""}.
+     *
+     * @param description clinical description of the image content
+     * @param imageText   verbatim text visible inside the image; empty string if none
     */
-    public String describe(byte[] imageBytes, String captionFallback) {
+    public record ImageAnalysis(String description, String imageText) {}
+
+    /**
+     * Analyses the image bytes and returns an {@link ImageAnalysis}.
+     * Falls back gracefully: if the vision call fails, the caption is used as description
+     * and imageText is left empty.
+     *
+     * @param imageBytes    PNG bytes of the extracted figure
+     * @param captionFallback caption detected from surrounding text, may be null
+     */
+    public ImageAnalysis analyze(byte[] imageBytes, String captionFallback) {
        try {
-            return chatClient.prompt()
-                .user(u -> u
-                    .text(PROMPT)
-                    .media(MimeTypeUtils.IMAGE_PNG, new ByteArrayResource(imageBytes)))
-                .call()
-                .content();
+            // One user message carrying the fixed PROMPT text plus the image bytes attached as PNG media.
+            String raw = chatClient.prompt()
+                    .user(u -> u
+                            .text(PROMPT)
+                            .media(MimeTypeUtils.IMAGE_PNG, new ByteArrayResource(imageBytes)))
+                    .call()
+                    .content();
+            // parse() keeps the caption (or "Figure") as description when the reply has no usable DESCRIPTION line.
+            return parse(raw, captionFallback);
        } catch (Exception ex) {
-            log.warn("Vision description failed: {} — using caption as fallback", ex.getMessage());
-            return captionFallback != null ? captionFallback : "Figure";
+            // Any exception from the call or the parsing degrades to the caption instead of reaching the caller.
+            log.warn("Vision analysis failed: {} — using caption as fallback", ex.getMessage());
+            return new ImageAnalysis(
+                    captionFallback != null ? captionFallback : "Figure",
+                    "");
        }
    }
+
+    /**
+     * Extracts the {@code DESCRIPTION:} and {@code IMAGE_TEXT:} fields from the raw model reply.
+     *
+     * <p>The model does not always follow the requested format to the letter, so parsing is lenient:
+     * <ul>
+     *   <li>labels may be indented and are matched case-insensitively;</li>
+     *   <li>any line-break style ({@code \n}, {@code \r\n}, {@code \r}) is accepted;</li>
+     *   <li>non-blank lines directly following a label are appended to that field, so a value
+     *       wrapped over several lines is not truncated; a blank line ends the field;</li>
+     *   <li>when a label occurs more than once, the last occurrence with a value wins.</li>
+     * </ul>
+     *
+     * @param raw             reply text of the vision model, may be {@code null}
+     * @param captionFallback caption used as description when none can be parsed, may be {@code null}
+     * @return analysis whose description falls back to the caption (or {@code "Figure"}) and whose
+     *         image text is empty when the model answered {@code NONE} or gave no image text
+     */
+    private ImageAnalysis parse(String raw, String captionFallback) {
+        String description = captionFallback != null ? captionFallback : "Figure";
+        String imageText = "";
+        if (raw == null) {
+            return new ImageAnalysis(description, imageText);
+        }
+
+        StringBuilder descriptionBuf = new StringBuilder();
+        StringBuilder imageTextBuf = new StringBuilder();
+        StringBuilder current = null; // field that continuation lines are appended to
+
+        for (String rawLine : raw.split("\\R")) {
+            String line = rawLine.strip();
+            if (line.isEmpty()) {
+                current = null;
+                continue;
+            }
+
+            String value;
+            String labelled;
+            if ((labelled = valueAfterLabel(line, "DESCRIPTION:")) != null) {
+                current = descriptionBuf;
+                value = labelled;
+                if (!value.isEmpty()) current.setLength(0); // a later labelled value replaces an earlier one
+            } else if ((labelled = valueAfterLabel(line, "IMAGE_TEXT:")) != null) {
+                current = imageTextBuf;
+                value = labelled;
+                if (!value.isEmpty()) current.setLength(0);
+            } else if (current != null) {
+                value = line; // continuation of the field started on a previous line
+            } else {
+                continue; // text before any label is ignored
+            }
+
+            if (!value.isEmpty()) {
+                if (current.length() > 0) current.append(' ');
+                current.append(value);
+            }
+        }
+
+        if (descriptionBuf.length() > 0) description = descriptionBuf.toString();
+        String text = imageTextBuf.toString();
+        if (!text.isEmpty() && !"NONE".equalsIgnoreCase(text)) imageText = text;
+        return new ImageAnalysis(description, imageText);
+    }
+
+    /** Returns the stripped text after {@code label} when {@code line} starts with it (ignoring case), else {@code null}. */
+    private static String valueAfterLabel(String line, String label) {
+        return line.regionMatches(true, 0, label, 0, label.length())
+                ? line.substring(label.length()).strip()
+                : null;
+    }
 }