blockIdToFigureId = new HashMap<>();
int figureCounter = 0;
- try (PDDocument doc = Loader.loadPDF(pdfPath.toFile())) {
- for (SectionEntity section : sections) {
- int pageIndex = section.getPageStart() - 1; // 0-based
- if (pageIndex < 0 || pageIndex >= doc.getNumberOfPages()) continue;
-
- PDPage page = doc.getPage(pageIndex);
- String pageText = section.getFullText();
+ for (PageResult page : pageResults) {
+ if (page.figures().isEmpty()) continue;
+ for (PageResult.FigureData figureData : page.figures()) {
try {
- for (COSName name : page.getResources().getXObjectNames()) {
- PDXObject xObject = page.getResources().getXObject(name);
- if (!(xObject instanceof PDImageXObject image)) continue;
-
- BufferedImage bufferedImage = image.getImage();
- if (bufferedImage.getWidth() < minImageSizePx
- || bufferedImage.getHeight() < minImageSizePx) {
- continue; // skip decorative images
- }
-
- figureCounter++;
- String figureId = bookId + "-fig-" + pageIndex + "-" + figureCounter;
- String caption = detectCaption(pageText);
- String label = detectLabel(caption, figureCounter);
- FigureType type = classifyType(caption, pageText);
-
- String imagePath = storageService.save(bookId, figureId, bufferedImage);
-
- FigureEntity figure = new FigureEntity(
- figureId, bookId, section.getId(), chapterId,
- label, caption, type, section.getPageStart(), imagePath
- );
- figures.add(figureRepository.save(figure));
+ BufferedImage image = decodeImage(figureData.imageBytes());
+ if (image == null) {
+ log.debug("Could not decode image on page {} of book {} (block {})",
+ page.pageNumber(), bookId, figureData.blockId());
+ continue;
}
- } catch (IOException ex) {
- log.warn("Failed to extract images from page {} of book {}: {}",
- section.getPageStart(), bookId, ex.getMessage());
+ if (image.getWidth() < minImageSizePx || image.getHeight() < minImageSizePx) {
+ log.debug("Skipping small figure on page {} ({}×{})",
+ page.pageNumber(), image.getWidth(), image.getHeight());
+ continue;
+ }
+
+ figureCounter++;
+ String figureId = bookId + "-fig-" + page.pageNumber() + "-" + figureCounter;
+ String caption = figureData.nearestCaption();
+ String label = detectLabel(caption, figureCounter);
+ FigureType type = classifyType(caption, page.orderedText());
+
+ String sectionId = bookId + "-p" + page.pageNumber();
+ String imagePath = storageService.save(bookId, figureId, image);
+
+ FigureEntity figure = new FigureEntity(
+ figureId, bookId, sectionId, chapterId,
+ label, caption, type, page.pageNumber(), imagePath);
+ figures.add(figureRepository.save(figure));
+ blockIdToFigureId.put(figureData.blockId(), figureId);
+
+ } catch (Exception ex) {
+ log.warn("Failed to extract figure on page {} of book {}: {}",
+ page.pageNumber(), bookId, ex.getMessage());
}
}
- } catch (IOException ex) {
- log.error("Could not open PDF for image extraction, book {}", bookId, ex);
}
log.info("Extracted {} figures for book {}", figures.size(), bookId);
- return figures;
+ return new ExtractionResult(figures, blockIdToFigureId);
}
- private String detectCaption(String pageText) {
- if (pageText == null) return null;
- Matcher m = CAPTION_PATTERN.matcher(pageText);
- return m.find() ? m.group(1).trim() : null;
+ // --- Private helpers ---
+
+ private BufferedImage decodeImage(byte[] imageBytes) { // returns null whenever no image can be produced
+ boolean hasData = imageBytes != null && imageBytes.length > 0;
+ try {
+ return hasData ? ImageIO.read(new ByteArrayInputStream(imageBytes)) : null;
+ } catch (IOException unreadable) {
+ return null; // an unreadable stream is reported the same way as missing data
+ }
}
private String detectLabel(String caption, int counter) {
@@ -122,14 +134,18 @@ public class FigureExtractionService {
}
private FigureType classifyType(String caption, String pageText) {
- String combined = ((caption != null ? caption : "") + " " + (pageText != null ? pageText : "")).toLowerCase();
+ String combined = ((caption != null ? caption : "") + " " +
+ (pageText != null ? pageText : "")).toLowerCase();
if (combined.contains("mri") || combined.contains("ct ") || combined.contains("magnetic")
- || combined.contains("tomography")) return FigureType.MRI_CT_SCAN;
- if (combined.contains("intraoperative") || combined.contains("intra-op")) return FigureType.INTRAOPERATIVE_IMAGE;
- if (caption != null && caption.toLowerCase().startsWith("table")) return FigureType.TABLE;
+ || combined.contains("tomography")) return FigureType.MRI_CT_SCAN;
+ if (combined.contains("intraoperative") || combined.contains("intra-op"))
+ return FigureType.INTRAOPERATIVE_IMAGE;
+ if (caption != null && caption.toLowerCase().startsWith("table"))
+ return FigureType.TABLE;
if (combined.contains("chart") || combined.contains("histogram") || combined.contains("graph"))
- return FigureType.CHART;
- if (combined.contains("photograph") || combined.contains("photo")) return FigureType.SURGICAL_PHOTOGRAPH;
+ return FigureType.CHART;
+ if (combined.contains("photograph") || combined.contains("photo"))
+ return FigureType.SURGICAL_PHOTOGRAPH;
return FigureType.ANATOMICAL_DIAGRAM;
}
}
diff --git a/backend/src/main/java/com/aiteacher/document/MarkdownStorageService.java b/backend/src/main/java/com/aiteacher/document/MarkdownStorageService.java
new file mode 100644
index 0000000..7359978
--- /dev/null
+++ b/backend/src/main/java/com/aiteacher/document/MarkdownStorageService.java
@@ -0,0 +1,14 @@
+package com.aiteacher.document;
+
+import java.util.UUID;
+
+public interface MarkdownStorageService {
+ /** Uploads the markdown content and returns the S3 key. */
+ String save(UUID bookId, int pageNumber, String markdown);
+
+ /** Downloads and returns the markdown content for the given book and page. */
+ String getText(UUID bookId, int pageNumber);
+
+ /** Deletes all markdown files for the given book. */
+ void deleteAll(UUID bookId);
+}
diff --git a/backend/src/main/java/com/aiteacher/document/MarkerPageParser.java b/backend/src/main/java/com/aiteacher/document/MarkerPageParser.java
new file mode 100644
index 0000000..d75806b
--- /dev/null
+++ b/backend/src/main/java/com/aiteacher/document/MarkerPageParser.java
@@ -0,0 +1,273 @@
+package com.aiteacher.document;
+
+import tools.jackson.databind.JsonNode;
+import tools.jackson.databind.ObjectMapper;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Qualifier;
+import org.springframework.core.io.FileSystemResource;
+import org.springframework.http.MediaType;
+import org.springframework.stereotype.Service;
+import org.springframework.util.LinkedMultiValueMap;
+import org.springframework.util.MultiValueMap;
+import org.springframework.web.client.RestClient;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.*;
+
+/**
+ * Parses a PDF using the local Marker server ({@code POST /marker/upload}).
+ *
+ * <p>A single HTTP call returns:
+ * <ul>
+ * <li>Reading-order text blocks — correct for multi-column and scanned pages</li>
+ * <li>Section headings extracted from {@code SectionHeader} blocks</li>
+ * <li>Pre-cropped figure images as base64-encoded PNG in each {@code Figure} block's
+ * {@code images} map</li>
+ * </ul>
+ *
+ * <p>The response is mapped to one {@link PageResult} per page block.
+ */
+@Service
+public class MarkerPageParser {
+
+ private static final Logger log = LoggerFactory.getLogger(MarkerPageParser.class);
+
+ private static final Set TEXT_BLOCK_TYPES = Set.of(
+ "Text", "TextInlineMath", "ListItem", "Table", "Code", "Equation",
+ "Footnote", "Caption", "PageHeader", "PageFooter", "Handwriting"
+ );
+ private static final Set FIGURE_BLOCK_TYPES = Set.of("Figure", "Picture", "FigureGroup", "PictureGroup");
+
+ private final RestClient restClient;
+ private final ObjectMapper objectMapper;
+
+ public MarkerPageParser(@Qualifier("markerRestClient") RestClient restClient, ObjectMapper objectMapper) {
+ this.restClient = restClient;
+ this.objectMapper = objectMapper;
+ }
+
+ /**
+ * Uploads the PDF to Marker, dumps the raw response to a debug file (best effort), and returns one {@link PageResult} per non-empty page; an empty list when Marker sends no body.
+ */
+ public List<PageResult> parse(Path pdfPath) {
+ log.info("Submitting {} to Marker for parsing", pdfPath.getFileName());
+
+ MultiValueMap<String, Object> body = new LinkedMultiValueMap<>();
+ body.add("file", new FileSystemResource(pdfPath));
+ body.add("output_format", "json");
+
+ JsonNode response = restClient.post()
+ .uri("/marker/upload")
+ .contentType(MediaType.MULTIPART_FORM_DATA)
+ .body(body)
+ .retrieve()
+ .body(JsonNode.class);
+
+ try {
+ Path debugFile = Path.of("/tmp/marker-response-md.json"); // NOTE(review): fixed path, overwritten by every parse
+ Files.writeString(debugFile, response == null ? "null" : response.toPrettyString()); // response may be null; the dump must not throw
+ log.info("Marker response saved to {}", debugFile);
+ } catch (IOException e) {
+ log.warn("Could not save Marker response to file", e);
+ }
+
+ List<PageResult> results = parseResponse(response);
+ log.info("Marker produced {} page results from {}", results.size(), pdfPath.getFileName());
+ return results;
+ }
+
+ // --- Private helpers ---
+
+ private List parseResponse(JsonNode response) { // maps the Marker JSON tree to page results; empty list on any unusable input
+ if (response == null) return List.of();
+
+ // The "output" field is a JSON-encoded string — parse it first.
+ // Fall back to treating the whole response as the root if "output" is absent.
+ JsonNode root;
+ JsonNode outputNode = response.path("output");
+ if (!outputNode.isMissingNode() && outputNode.isTextual()) {
+ try {
+ root = objectMapper.readTree(outputNode.asText());
+ } catch (tools.jackson.core.JacksonException e) {
+ log.warn("Could not parse Marker 'output' field as JSON", e);
+ return List.of();
+ }
+ } else if (!outputNode.isMissingNode()) {
+ root = outputNode; // "output" is present but not a string: use it as the tree directly
+ } else {
+ root = response;
+ }
+
+ JsonNode children = root.path("children");
+ if (children.isMissingNode() || !children.isArray()) {
+ log.warn("Marker response has no 'children' array — empty result");
+ return List.of();
+ }
+
+ List results = new ArrayList<>();
+ int pageIndex = 0; // counts only "Page" blocks, so other top-level block types do not shift page numbers
+ for (JsonNode pageBlock : children) {
+ String blockType = pageBlock.path("block_type").asText();
+ if (!"Page".equals(blockType)) continue;
+
+ int pageNumber = pageIndex + 1; // 1-based page number
+ pageIndex++;
+
+ PageResult result = parsePage(pageBlock, pageNumber);
+ if (!result.orderedText().isBlank() || !result.figures().isEmpty()) { // pages with neither text nor figures are dropped
+ results.add(result);
+ }
+ }
+ return results;
+ }
+
+ private PageResult parsePage(JsonNode pageBlock, int pageNumber) { // builds ordered text, markdown and figures for one Page block
+ JsonNode children = pageBlock.path("children");
+ if (children.isMissingNode() || !children.isArray()) {
+ return new PageResult(pageNumber, "", null, List.of(), "");
+ }
+
+ StringBuilder textBuilder = new StringBuilder();
+ StringBuilder markdownBuilder = new StringBuilder();
+ String headingTitle = null; // first non-empty SectionHeader on the page, if any
+ List<PageResult.FigureData> figures = new ArrayList<>();
+ Set<Integer> consumed = new HashSet<>(); // indices of Caption nodes consumed by a figure
+
+ List<JsonNode> childList = new ArrayList<>(); // indexable copy: figure handling looks at the next sibling
+ children.forEach(childList::add);
+
+ for (int i = 0; i < childList.size(); i++) {
+ if (consumed.contains(i)) continue;
+
+ JsonNode child = childList.get(i);
+ String type = child.path("block_type").asText();
+
+ if ("SectionHeader".equals(type)) {
+ String heading = stripHtml(child.path("html").asText()).strip();
+ if (!heading.isEmpty() && headingTitle == null) {
+ headingTitle = heading;
+ }
+ appendText(textBuilder, heading);
+ if (!heading.isEmpty()) appendMarkdown(markdownBuilder, "## " + heading); // an empty header would otherwise emit a bare "##"
+
+ } else if (TEXT_BLOCK_TYPES.contains(type)) {
+ String text = stripHtml(child.path("html").asText());
+ appendText(textBuilder, text);
+ appendMarkdown(markdownBuilder, text.strip());
+
+ } else if (FIGURE_BLOCK_TYPES.contains(type)) {
+ extractFigures(child, i, childList, figures, markdownBuilder, consumed);
+ }
+ }
+
+ return new PageResult(pageNumber, textBuilder.toString().strip(), headingTitle,
+ figures, markdownBuilder.toString().strip());
+ }
+
+ /**
+ * Handles a figure/picture block at {@code index} in {@code siblings}.
+ * For group blocks (FigureGroup, PictureGroup) the image lives in a child Picture/Figure,
+ * and the caption is a sibling Caption child inside the group.
+ * For leaf blocks the caption is the next sibling in the page child list.
+ * Image refs are appended to {@code markdown} as {@code ![altText](marker://blockId)}; alt text is the caption, else the block ID.
+ * Consumed caption sibling indices are added to {@code consumed}.
+ */
+ private void extractFigures(JsonNode block, int index, List<JsonNode> siblings,
+ List<PageResult.FigureData> out, StringBuilder markdown,
+ Set<Integer> consumed) {
+ String type = block.path("block_type").asText();
+ boolean isGroup = type.endsWith("Group");
+
+ if (isGroup) {
+ JsonNode groupChildren = block.path("children");
+ if (groupChildren.isMissingNode() || !groupChildren.isArray()) return;
+
+ String groupCaption = null; // last non-empty Caption child wins and is shared by every image in the group
+ for (JsonNode sub : groupChildren) {
+ if ("Caption".equals(sub.path("block_type").asText())) {
+ String c = stripHtml(sub.path("html").asText()).strip();
+ if (!c.isEmpty()) groupCaption = c;
+ }
+ }
+ for (JsonNode sub : groupChildren) {
+ String subType = sub.path("block_type").asText();
+ if ("Figure".equals(subType) || "Picture".equals(subType)) {
+ String blockId = sub.path("id").asText();
+ byte[] imageBytes = extractImageBytes(sub, blockId);
+ if (imageBytes != null) {
+ out.add(new PageResult.FigureData(imageBytes, groupCaption, blockId));
+ String altText = groupCaption != null ? groupCaption : blockId;
+ appendMarkdown(markdown, "![" + altText + "](marker://" + blockId + ")");
+ }
+ }
+ }
+ } else {
+ String blockId = block.path("id").asText();
+ byte[] imageBytes = extractImageBytes(block, blockId);
+ if (imageBytes != null) {
+ String caption = null;
+ if (index + 1 < siblings.size()) {
+ JsonNode next = siblings.get(index + 1);
+ if ("Caption".equals(next.path("block_type").asText())) {
+ String c = stripHtml(next.path("html").asText()).strip();
+ if (!c.isEmpty()) caption = c;
+ consumed.add(index + 1); // the caller skips this Caption so it is not emitted again as text
+ }
+ }
+ out.add(new PageResult.FigureData(imageBytes, caption, blockId));
+ String altText = caption != null ? caption : blockId;
+ appendMarkdown(markdown, "![" + altText + "](marker://" + blockId + ")");
+ }
+ }
+ }
+
+ /**
+ * Extracts and base64-decodes the image bytes for this block; returns null when no usable image entry exists.
+ * Marker stores images in the block's {@code images} map keyed by block ID.
+ */
+ private byte[] extractImageBytes(JsonNode block, String blockId) {
+ JsonNode images = block.path("images");
+ if (images.isMissingNode() || images.isEmpty()) return null;
+
+ // Try the block's own ID first, then fall back to the first entry
+ JsonNode imgNode = images.path(blockId);
+ if (imgNode.isMissingNode()) {
+ imgNode = images.properties().stream()
+ .findFirst()
+ .map(e -> e.getValue())
+ .orElse(imgNode); // no entries at all: keep the missing node
+ }
+
+ String base64 = imgNode.asText();
+ if (base64.isEmpty()) return null;
+
+ try {
+ return Base64.getDecoder().decode(base64);
+ } catch (IllegalArgumentException ex) { // thrown by the decoder for input that is not valid base64
+ log.warn("Could not decode base64 image for block {}: {}", blockId, ex.getMessage());
+ return null;
+ }
+ }
+
+ private void appendText(StringBuilder sb, String text) { // appends one stripped block, separated by a blank line
+ String body = text.strip();
+ if (!body.isEmpty()) {
+ sb.append(sb.length() == 0 ? "" : "\n\n").append(body);
+ }
+ }
+
+ private void appendMarkdown(StringBuilder sb, String text) { // null or blank fragments are ignored
+ boolean hasContent = text != null && !text.isBlank();
+ if (hasContent && sb.length() > 0) sb.append("\n\n"); // blank line between fragments
+ if (hasContent) sb.append(text.strip());
+ }
+
+ /** Removes every {@code <...>} tag, collapses runs of two or more whitespace characters to one space, and strips both ends. */
+ private String stripHtml(String html) {
+ String withoutTags = (html == null || html.isEmpty()) ? "" : html.replaceAll("<[^>]*>", "");
+ return withoutTags.replaceAll("\\s{2,}", " ").strip();
+ }
+}
diff --git a/backend/src/main/java/com/aiteacher/document/PageResult.java b/backend/src/main/java/com/aiteacher/document/PageResult.java
new file mode 100644
index 0000000..cb9989d
--- /dev/null
+++ b/backend/src/main/java/com/aiteacher/document/PageResult.java
@@ -0,0 +1,26 @@
+package com.aiteacher.document;
+
+import java.util.List;
+
+/**
+ * Internal DTO produced by MarkerPageParser for one PDF page.
+ * Decouples the Marker HTTP API from downstream services.
+ */
+public record PageResult(
+ int pageNumber, // 1-based, derived from Marker page block index
+ String orderedText, // full page text in correct reading order (blocks joined by \n\n)
+ String headingTitle, // first SectionHeader block on page, or null
+ List<FigureData> figures, // extracted figure images (may be empty); typed so callers can iterate FigureData directly
+ String markdown // markdown representation with marker://{blockId} image placeholders
+) {
+
+ /**
+ * A figure extracted from the page.
+ * Image bytes are PNG data decoded from the Marker JSON {@code images} map.
+ */
+ public record FigureData(
+ byte[] imageBytes, // PNG image data (base64-decoded from Marker response)
+ String nearestCaption, // text of the adjacent Caption block, or null
+ String blockId // Marker block ID (e.g. "/page/0/Figure/2") for traceability
+ ) {}
+}
diff --git a/backend/src/main/java/com/aiteacher/document/PdfStructureParser.java b/backend/src/main/java/com/aiteacher/document/PdfStructureParser.java
index 930914f..bae4176 100644
--- a/backend/src/main/java/com/aiteacher/document/PdfStructureParser.java
+++ b/backend/src/main/java/com/aiteacher/document/PdfStructureParser.java
@@ -1,13 +1,17 @@
package com.aiteacher.document;
+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.common.PDRectangle;
+import org.apache.pdfbox.text.PDFTextStripperByArea;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import org.springframework.ai.reader.pdf.PagePdfDocumentReader;
-import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig;
-import org.springframework.core.io.FileSystemResource;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
+import java.awt.Rectangle;
+import java.io.IOException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
@@ -15,13 +19,18 @@ import java.util.UUID;
/**
* Parses a PDF into page-level SectionEntity records stored in Postgres.
- * Each page becomes one section, grouped under a single chapter per book.
+ * Uses column-aware extraction via PDFTextStripperByArea: for two-column pages,
+ * left column is extracted first then right, preserving correct reading order.
+ * Text is also normalized (collapsed whitespace) before storage.
*/
@Service
public class PdfStructureParser {
private static final Logger log = LoggerFactory.getLogger(PdfStructureParser.class);
+ // Right column is considered empty (single-column page) if it has < 20% of left column's content
+ private static final double TWO_COLUMN_THRESHOLD = 0.2;
+
private final ChapterRepository chapterRepository;
private final SectionRepository sectionRepository;
@@ -35,37 +44,71 @@ public class PdfStructureParser {
public List parse(UUID bookId, String bookTitle, Path pdfPath) {
log.info("Parsing PDF structure for book {}", bookId);
- // One chapter per book
String chapterId = bookId + "-ch1";
ChapterEntity chapter = new ChapterEntity(chapterId, bookId, 1, bookTitle, 1);
chapterRepository.save(chapter);
- // One section per page
- PagePdfDocumentReader reader = new PagePdfDocumentReader(
- new FileSystemResource(pdfPath.toFile()),
- PdfDocumentReaderConfig.builder().withPagesPerDocument(1).build()
- );
-
- List pages = reader.get();
List sections = new ArrayList<>();
- for (int i = 0; i < pages.size(); i++) {
- int pageNum = i + 1;
- String text = pages.get(i).getText();
- if (text == null || text.isBlank()) continue;
+ try (PDDocument doc = Loader.loadPDF(pdfPath.toFile())) {
+ List<PDPage> pages = new ArrayList<>();
+ doc.getPages().forEach(pages::add);
- String sectionId = bookId + "-p" + pageNum;
- SectionEntity section = new SectionEntity(
- sectionId, chapterId, bookId,
- String.valueOf(pageNum),
- "Page " + pageNum,
- pageNum, pageNum,
- text
- );
- sections.add(sectionRepository.save(section));
+ for (int i = 0; i < pages.size(); i++) { // iterate every page of the document, whatever its length
+ int pageNum = i + 1; // sections are numbered by 1-based page
+ String text = normalizeWhitespace(extractPageText(pages.get(i)));
+ if (text.isBlank()) continue; // pages without extractable text produce no section
+
+ String sectionId = bookId + "-p" + pageNum;
+ SectionEntity section = new SectionEntity(
+ sectionId, chapterId, bookId,
+ String.valueOf(pageNum),
+ "Page " + pageNum,
+ pageNum, pageNum,
+ text
+ );
+ sections.add(sectionRepository.save(section));
+ }
+ } catch (IOException e) {
+ throw new RuntimeException("Failed to parse PDF for book " + bookId, e);
}
log.info("Parsed {} sections for book {}", sections.size(), bookId);
return sections;
}
+
+ /**
+ * Extracts text from a single page via two fixed regions: the left and right halves of the media box.
+ * If the right region has fewer than 20% of the characters of the left region, only one region's text is returned.
+ * NOTE(review): a full-width line presumably lands partly in each region, so wide single-column pages may come out split — confirm.
+ */
+ private String extractPageText(PDPage page) throws IOException {
+ PDRectangle mediaBox = page.getMediaBox();
+ int width = (int) mediaBox.getWidth(); // fractional points are truncated
+ int height = (int) mediaBox.getHeight();
+ int mid = width / 2;
+
+ PDFTextStripperByArea stripper = new PDFTextStripperByArea();
+ stripper.setSortByPosition(true);
+ stripper.addRegion("left", new Rectangle(0, 0, mid, height));
+ stripper.addRegion("right", new Rectangle(mid, 0, width - mid, height));
+ stripper.extractRegions(page);
+
+ String left = stripper.getTextForRegion("left").strip();
+ String right = stripper.getTextForRegion("right").strip();
+
+ if (right.length() < left.length() * TWO_COLUMN_THRESHOLD) {
+ // Treated as single-column: a non-empty left wins and any right-region text is discarded
+ return left.isEmpty() ? right : left;
+ }
+ return left + "\n\n" + right;
+ }
+
+ /** Squeezes runs of two or more spaces/tabs to one space and three or more newlines to two, then trims. */
+ private String normalizeWhitespace(String text) {
+ String singleSpaced = text.replaceAll("[ \t]{2,}", " ");
+ String compactParagraphs = singleSpaced.replaceAll("\n{3,}", "\n\n");
+ // trim() rather than strip(): only characters <= U+0020 are removed at the ends
+ return compactParagraphs.trim();
+ }
}
diff --git a/backend/src/main/java/com/aiteacher/document/S3MarkdownStorageService.java b/backend/src/main/java/com/aiteacher/document/S3MarkdownStorageService.java
new file mode 100644
index 0000000..478f0cc
--- /dev/null
+++ b/backend/src/main/java/com/aiteacher/document/S3MarkdownStorageService.java
@@ -0,0 +1,97 @@
+package com.aiteacher.document;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.stereotype.Service;
+import software.amazon.awssdk.auth.credentials.AwsBasicCredentials;
+import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider;
+import software.amazon.awssdk.core.sync.RequestBody;
+import software.amazon.awssdk.regions.Region;
+import software.amazon.awssdk.services.s3.S3Client;
+import software.amazon.awssdk.services.s3.S3Configuration;
+import software.amazon.awssdk.services.s3.model.*;
+
+import java.net.URI;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.UUID;
+
+@Service
+public class S3MarkdownStorageService implements MarkdownStorageService {
+
+ private static final Logger log = LoggerFactory.getLogger(S3MarkdownStorageService.class);
+
+ private final S3Client s3;
+ private final String bucket;
+
+ public S3MarkdownStorageService(
+ @Value("${app.figure-storage.endpoint}") String endpoint,
+ @Value("${app.figure-storage.region}") String region,
+ @Value("${app.figure-storage.bucket}") String bucket,
+ @Value("${app.figure-storage.access-key-id}") String accessKeyId,
+ @Value("${app.figure-storage.secret-access-key}") String secretKey) {
+ this.bucket = bucket;
+ URI endpointUri = URI.create(endpoint);
+ StaticCredentialsProvider credentials = StaticCredentialsProvider.create(
+ AwsBasicCredentials.create(accessKeyId, secretKey));
+ Region awsRegion = Region.of(region);
+ S3Configuration s3Config = S3Configuration.builder().pathStyleAccessEnabled(true).build();
+
+ this.s3 = S3Client.builder()
+ .endpointOverride(endpointUri)
+ .region(awsRegion)
+ .credentialsProvider(credentials)
+ .serviceConfiguration(s3Config)
+ .build();
+ }
+
+ @Override
+ public String save(UUID bookId, int pageNumber, String markdown) { // uploads as UTF-8 and returns the object key
+ byte[] payload = markdown.getBytes(StandardCharsets.UTF_8);
+ String objectKey = key(bookId, pageNumber);
+ PutObjectRequest request = PutObjectRequest.builder()
+ .bucket(bucket).key(objectKey)
+ .contentType("text/markdown; charset=utf-8")
+ .contentLength((long) payload.length).build();
+ s3.putObject(request, RequestBody.fromBytes(payload));
+ return objectKey;
+ }
+
+ @Override
+ public String getText(UUID bookId, int pageNumber) { // downloads the page object and decodes it as UTF-8
+ GetObjectRequest request = GetObjectRequest.builder()
+ .bucket(bucket).key(key(bookId, pageNumber)).build();
+ byte[] content = s3.getObjectAsBytes(request).asByteArray(); // no local error handling here
+ return new String(content, StandardCharsets.UTF_8);
+ }
+
+ @Override
+ public void deleteAll(UUID bookId) { // removes every object under markdown/{bookId}/; best effort, S3 errors are only logged
+ String prefix = "markdown/" + bookId + "/";
+ try {
+ int deleted = 0;
+ // DeleteObjects accepts at most 1000 keys and a listing page holds at most 1000, so delete page by page.
+ for (ListObjectsV2Response page : s3.listObjectsV2Paginator(ListObjectsV2Request.builder()
+ .bucket(bucket).prefix(prefix).build())) {
+ List<ObjectIdentifier> toDelete = page.contents().stream()
+ .map(obj -> ObjectIdentifier.builder().key(obj.key()).build())
+ .toList();
+ if (toDelete.isEmpty()) continue;
+ s3.deleteObjects(DeleteObjectsRequest.builder()
+ .bucket(bucket)
+ .delete(Delete.builder().objects(toDelete).build())
+ .build());
+ deleted += toDelete.size();
+ }
+ if (deleted > 0) log.info("Deleted {} markdown files from S3 for book {}", deleted, bookId);
+ } catch (S3Exception ex) {
+ log.warn("Could not fully delete markdown for book {} from S3: {}", bookId, ex.getMessage());
+ }
+ }
+
+ private static String key(UUID bookId, int pageNumber) { // object key layout: markdown/{bookId}/page-{n}.md
+ return new StringBuilder("markdown/").append(bookId).append("/page-").append(pageNumber).append(".md").toString();
+ }
+}
diff --git a/backend/src/main/java/com/aiteacher/document/TextChunkingService.java b/backend/src/main/java/com/aiteacher/document/TextChunkingService.java
index 776b28b..7eaf021 100644
--- a/backend/src/main/java/com/aiteacher/document/TextChunkingService.java
+++ b/backend/src/main/java/com/aiteacher/document/TextChunkingService.java
@@ -38,14 +38,52 @@ public class TextChunkingService {
List windows = new ArrayList<>();
int start = 0;
while (start < text.length()) {
- int end = Math.min(start + TARGET_CHARS, text.length());
- windows.add(text.substring(start, end));
- if (end == text.length()) break;
- start = end - OVERLAP_CHARS;
+ int hardEnd = Math.min(start + TARGET_CHARS, text.length());
+ if (hardEnd == text.length()) {
+ String last = text.substring(start).strip();
+ if (!last.isEmpty()) windows.add(last);
+ break;
+ }
+ int splitAt = findSplitPoint(text, start, hardEnd);
+ String chunk = text.substring(start, splitAt).strip();
+ if (!chunk.isEmpty()) windows.add(chunk);
+ // Overlap: back up from split point, align to a word start
+ int overlapStart = Math.max(start + 1, splitAt - OVERLAP_CHARS);
+ while (overlapStart < splitAt && text.charAt(overlapStart) != ' ') overlapStart++;
+ start = overlapStart < splitAt ? overlapStart + 1 : splitAt;
}
return windows;
}
+ /**
+ * Finds the best split point at or before hardEnd (never beyond it), preferring (in order):
+ * paragraph boundary, sentence boundary, word boundary, hard cut. Returns the index where the next chunk begins.
+ */
+ private int findSplitPoint(String text, int start, int hardEnd) {
+ int lookback = Math.min(400, (hardEnd - start) / 2); // how far back a paragraph/sentence break may lie
+
+ // 1. Paragraph boundary
+ int paraIdx = text.lastIndexOf("\n\n", hardEnd - 2); // - 2 so that paraIdx + 2 can never exceed hardEnd
+ if (paraIdx > hardEnd - lookback && paraIdx > start) return paraIdx + 2;
+
+ // 2. Sentence boundary (. ! ?) followed by space or newline
+ for (int i = hardEnd - 1; i > hardEnd - lookback && i > start; i--) {
+ char c = text.charAt(i);
+ if ((c == '.' || c == '!' || c == '?') && i + 1 < text.length()) {
+ char next = text.charAt(i + 1);
+ if (next == ' ' || next == '\n') return i + 1;
+ }
+ }
+
+ // 3. Word boundary
+ for (int i = hardEnd - 1; i > hardEnd - 100 && i > start; i--) {
+ if (text.charAt(i) == ' ') return i + 1;
+ }
+
+ // 4. Hard cut
+ return hardEnd;
+ }
+
private Map buildMetadata(SectionEntity section, String bookTitle,
int index, int total, String chunkId) {
Map m = new HashMap<>();
diff --git a/backend/src/main/java/com/aiteacher/document/VisionDescriptionService.java b/backend/src/main/java/com/aiteacher/document/VisionDescriptionService.java
index 86380a3..5b40b2b 100644
--- a/backend/src/main/java/com/aiteacher/document/VisionDescriptionService.java
+++ b/backend/src/main/java/com/aiteacher/document/VisionDescriptionService.java
@@ -8,18 +8,29 @@ import org.springframework.stereotype.Service;
import org.springframework.util.MimeTypeUtils;
/**
- * Generates a clinical text description for an extracted figure image
- * using the OpenAI vision model via Spring AI ChatClient.
+ * Analyses an extracted figure image using the OpenAI vision model.
+ *
+ * <p>Returns an {@link ImageAnalysis} record containing:
+ * <ul>
+ * <li>{@code description} — 2-3 sentence clinical description of the image</li>
+ * <li>{@code imageText} — all visible text, labels, and annotations copied verbatim
+ * from the image (empty string when none present)</li>
+ * </ul>
+ *
+ * <p>Both fields are stored: {@code description} drives the embedding; {@code imageText}
+ * is added to chunk metadata so queries can match exact labels (e.g., "Circle of Willis").
*/
@Service
public class VisionDescriptionService {
private static final Logger log = LoggerFactory.getLogger(VisionDescriptionService.class);
- private static final String PROMPT =
- "You are a neurosurgery educator. Provide a brief 2-3 sentence clinical description of " +
- "this image. Focus on anatomical structures, surgical landmarks, labels, and clinical " +
- "significance. If text or labels are visible, include them verbatim.";
+ private static final String PROMPT = """
+ You are a neurosurgery educator analysing a medical image.
+ Respond in EXACTLY this format — no other text, no markdown:
+ DESCRIPTION: <2-3 sentence clinical description focusing on anatomical structures, surgical landmarks, and clinical significance>
+ IMAGE_TEXT:
+ """;
private final ChatClient chatClient;
@@ -28,19 +39,53 @@ public class VisionDescriptionService {
}
/**
- * Returns a description string. Falls back to the provided caption if vision fails.
+ * Holds the structured output of a vision model call on one figure image.
+ *
+ * @param description clinical description of the image content
+ * @param imageText verbatim text visible inside the image; empty string if none
*/
- public String describe(byte[] imageBytes, String captionFallback) {
+ public record ImageAnalysis(String description, String imageText) {}
+
+ /**
+ * Analyses the image bytes and returns an {@link ImageAnalysis}.
+ * Falls back gracefully: if the vision call fails, the caption is used as description
+ * and imageText is left empty.
+ *
+ * @param imageBytes PNG bytes of the extracted figure
+ * @param captionFallback caption detected from surrounding text, may be null
+ */
+ public ImageAnalysis analyze(byte[] imageBytes, String captionFallback) {
try {
- return chatClient.prompt()
- .user(u -> u
- .text(PROMPT)
- .media(MimeTypeUtils.IMAGE_PNG, new ByteArrayResource(imageBytes)))
- .call()
- .content();
+ String raw = chatClient.prompt()
+ .user(u -> u
+ .text(PROMPT)
+ .media(MimeTypeUtils.IMAGE_PNG, new ByteArrayResource(imageBytes)))
+ .call()
+ .content();
+ return parse(raw, captionFallback);
} catch (Exception ex) {
- log.warn("Vision description failed: {} — using caption as fallback", ex.getMessage());
- return captionFallback != null ? captionFallback : "Figure";
+ log.warn("Vision analysis failed: {} — using caption as fallback", ex.getMessage());
+ return new ImageAnalysis(
+ captionFallback != null ? captionFallback : "Figure",
+ "");
}
}
+
+ private ImageAnalysis parse(String raw, String captionFallback) { // maps the "DESCRIPTION:" / "IMAGE_TEXT:" reply to a record
+ String description = captionFallback != null ? captionFallback : "Figure"; // kept when no DESCRIPTION line is found
+ String imageText = "";
+
+ if (raw != null) {
+ for (String line : raw.lines().map(String::strip).toList()) { // strip first: indented lines must still match
+ if (line.startsWith("DESCRIPTION:")) {
+ String val = line.substring("DESCRIPTION:".length()).strip();
+ if (!val.isEmpty()) description = val;
+ } else if (line.startsWith("IMAGE_TEXT:")) {
+ String val = line.substring("IMAGE_TEXT:".length()).strip();
+ if (!val.isEmpty() && !"NONE".equalsIgnoreCase(val)) imageText = val; // "NONE" means no text in the image
+ }
+ }
+ }
+ return new ImageAnalysis(description, imageText);
+ }
}
diff --git a/backend/src/main/resources/application.yaml b/backend/src/main/resources/application.yaml
index f045ac8..5cf70c0 100644
--- a/backend/src/main/resources/application.yaml
+++ b/backend/src/main/resources/application.yaml
@@ -64,3 +64,5 @@ app:
embedding:
batch-size: 20
batch-delay-ms: 2000
+ marker:
+ base-url: ${MARKER_BASE_URL:http://localhost:8000}
diff --git a/frontend/src/components/BookCard.vue b/frontend/src/components/BookCard.vue
index 3755cbf..067ac32 100644
--- a/frontend/src/components/BookCard.vue
+++ b/frontend/src/components/BookCard.vue
@@ -33,6 +33,13 @@
+
+ Read
+
diff --git a/frontend/src/router/index.ts b/frontend/src/router/index.ts
index 0caef38..e6c2912 100644
--- a/frontend/src/router/index.ts
+++ b/frontend/src/router/index.ts
@@ -2,6 +2,7 @@ import { createRouter, createWebHistory } from 'vue-router'
import UploadView from '@/views/UploadView.vue'
import TopicsView from '@/views/TopicsView.vue'
import ChatView from '@/views/ChatView.vue'
+import BookReaderView from '@/views/BookReaderView.vue'
const router = createRouter({
history: createWebHistory(import.meta.env.BASE_URL),
@@ -20,6 +21,11 @@ const router = createRouter({
path: '/chat',
name: 'chat',
component: ChatView
+ },
+ {
+ path: '/books/:id/read',
+ name: 'book-reader',
+ component: BookReaderView
}
]
})
diff --git a/frontend/src/views/BookReaderView.vue b/frontend/src/views/BookReaderView.vue
new file mode 100644
index 0000000..1ecc12a
--- /dev/null
+++ b/frontend/src/views/BookReaderView.vue
@@ -0,0 +1,322 @@
+
+
+
+
+
+
+
+
+
+
Loading page {{ currentPage }}…
+
+
+
+ Could not load page {{ currentPage }}
+ {{ error }}
+
+
+
+
+
+
+
+
+
+
diff --git a/specs/002-image-aware-embedding/contracts/document-ai-page-result.md b/specs/002-image-aware-embedding/contracts/document-ai-page-result.md
new file mode 100644
index 0000000..0d46ff5
--- /dev/null
+++ b/specs/002-image-aware-embedding/contracts/document-ai-page-result.md
@@ -0,0 +1,79 @@
+# Internal Contract: DocumentAiPageParser → FigureExtractionService
+
+**Branch**: `002-image-aware-embedding` | **Date**: 2026-04-04
+**Type**: Internal Java DTO (not an HTTP contract)
+
+---
+
+## Purpose
+
+`PageResult` is the internal data transfer object produced by `DocumentAiPageParser` for each
+PDF page. It decouples the Google Document AI SDK types from the rest of the pipeline so that
+`PdfStructureParser` can be replaced without cascading changes.
+
+---
+
+## Java Record
+
+```java
+package com.aiteacher.document;
+
+import java.util.List;
+
+/**
+ * Internal DTO produced by DocumentAiPageParser for one PDF page.
+ * Decouples the Document AI SDK types from downstream services.
+ */
+public record PageResult(
+ int pageNumber, // 1-based, matches Document.Page.getPageNumber()
+ String orderedText, // full page text in correct reading order (blocks joined by \n\n)
+ String headingTitle, // first HEADING block on page, or null
+ List<FigureBbox> figures // detected figure regions (may be empty)
+) {
+
+ /**
+ * Normalized bounding box for a detected figure region.
+ * Coordinates are in the [0.0, 1.0] range relative to page dimensions.
+ */
+ public record FigureBbox(
+ float x, // left edge (normalized)
+ float y, // top edge (normalized)
+ float width, // width (normalized)
+ float height, // height (normalized)
+ String nearestCaption // text of adjacent paragraph block, or null
+ ) {}
+}
+```
+
+---
+
+## Production Rules
+
+| Field | Rule |
+|-------|------|
+| `orderedText` | Concatenation of all `PARAGRAPH` and `HEADING_*` blocks, joined with `\n\n`. Tables are represented as tab-separated text. |
+| `headingTitle` | First block whose `blockType` is `HEADING_1` through `HEADING_6`. `null` if no heading detected. |
+| `figures` | One entry per `VisualElement` with `type == "figure"` and `confidence ≥ 0.5`. Sorted top-to-bottom by `y`. |
+| `nearestCaption` | The `PARAGRAPH` block immediately following the figure bbox (by Y coordinate). May be `null` if no paragraph follows within 10% of page height. |
+
+---
+
+## Mapping from Document AI Proto
+
+```
+Document.Page.Block → orderedText (concatenated)
+Document.Page.Block (HEADING_*) → headingTitle (first match)
+Document.Page.VisualElement → FigureBbox
+ └─ layout.bounding_poly.normalized_vertices[0] → (x, y) top-left
+ └─ normalized_vertices[2] → (x+w, y+h) bottom-right
+```
+
+---
+
+## Consumers
+
+| Consumer | What It Uses |
+|----------|-------------|
+| `BookEmbeddingService` | `orderedText` → `SectionEntity.fullText`; `headingTitle` → `SectionEntity.title` |
+| `FigureExtractionService` | `figures` list → renders page via PDFBox, crops each bbox to `BufferedImage` |
+| `TextChunkingService` | Receives `SectionEntity` (indirectly uses `orderedText`) — **unchanged** |
diff --git a/specs/002-image-aware-embedding/contracts/marker-page-result.md b/specs/002-image-aware-embedding/contracts/marker-page-result.md
new file mode 100644
index 0000000..8300de5
--- /dev/null
+++ b/specs/002-image-aware-embedding/contracts/marker-page-result.md
@@ -0,0 +1,84 @@
+# Internal Contract: MarkerPageParser → FigureExtractionService / BookEmbeddingService
+
+**Branch**: `002-image-aware-embedding` | **Date**: 2026-04-04
+**Type**: Internal Java DTO (not an HTTP contract)
+
+---
+
+## Purpose
+
+`PageResult` is the internal data transfer object produced by `MarkerPageParser` for each
+PDF page. It decouples the Marker HTTP API from the rest of the pipeline. Downstream consumers
+(`BookEmbeddingService`, `FigureExtractionService`, `TextChunkingService`) are unaware of
+Marker and depend only on this DTO.
+
+---
+
+## Java Record
+
+```java
+package com.aiteacher.document;
+
+import java.util.List;
+
+/**
+ * Internal DTO produced by MarkerPageParser for one PDF page.
+ * Decouples the Marker HTTP API from downstream services.
+ */
+public record PageResult(
+ int pageNumber, // 1-based, derived from Marker page block index
+ String orderedText, // full page text in correct reading order (blocks joined by \n\n)
+ String headingTitle, // first SectionHeader block on page, or null
+ List<FigureData> figures // extracted figure images (may be empty)
+) {
+
+ /**
+ * A figure extracted from the page.
+ * Image bytes are PNG data decoded from the Marker JSON `images` map.
+ */
+ public record FigureData(
+ byte[] imageBytes, // PNG image data (base64-decoded from Marker response)
+ String nearestCaption, // text of the adjacent Caption block, or null
+ String blockId // Marker block ID (e.g. "/page/0/Figure/2") for traceability
+ ) {}
+}
+```
+
+---
+
+## Production Rules
+
+| Field | Rule |
+|-------|------|
+| `pageNumber` | 1-based index derived from the Marker page block's position in the `children` array (index + 1). |
+| `orderedText` | HTML-stripped text from all `Text`, `TextInlineMath`, `SectionHeader`, `ListItem`, and `Table` blocks, joined with `\n\n`. Marker already returns them in reading order. |
+| `headingTitle` | Plain text of the first `SectionHeader` block on the page. `null` if no heading detected. |
+| `figures` | One `FigureData` per `Figure` or `Picture` block that has a non-empty `images` entry. Blocks with no image data are skipped. |
+| `imageBytes` | Base64-decoded bytes from `block.images[blockId]`. Marker returns PNG. |
+| `nearestCaption` | Plain text of the first `Caption` block that is a sibling appearing immediately after the figure block. `null` if absent. |
+
+---
+
+## Mapping from Marker JSON
+
+```
+Marker JSON → PageResult
+
+Page block ("/page/N/Page/M") → PageResult(pageNumber = N + 1)
+ SectionHeader child → headingTitle (first match, HTML-stripped)
+ Text / TextInlineMath children → orderedText (HTML-stripped, joined \n\n)
+ Figure / Picture child → FigureData
+ images[blockId] → FigureData.imageBytes (base64-decoded)
+ next Caption sibling → FigureData.nearestCaption (HTML-stripped)
+ blockId → FigureData.blockId
+```
+
+---
+
+## Consumers
+
+| Consumer | What It Uses |
+|----------|-------------|
+| `BookEmbeddingService` | `orderedText` → `SectionEntity.fullText`; `headingTitle` → `SectionEntity.title` |
+| `FigureExtractionService` | `figures` list → decodes `imageBytes`, checks min size, saves to S3 |
+| `TextChunkingService` | Receives `SectionEntity` (uses `orderedText` indirectly) — **unchanged** |
diff --git a/specs/002-image-aware-embedding/plan.md b/specs/002-image-aware-embedding/plan.md
index 477375e..f263861 100644
--- a/specs/002-image-aware-embedding/plan.md
+++ b/specs/002-image-aware-embedding/plan.md
@@ -1,40 +1,42 @@
# Implementation Plan: Enhanced Embedding with Image Parsing and Metadata
-**Branch**: `002-image-aware-embedding` | **Date**: 2026-04-03 | **Spec**: [spec.md](spec.md)
+**Branch**: `002-image-aware-embedding` | **Date**: 2026-04-04 | **Spec**: [spec.md](spec.md)
**Input**: Feature specification from `/specs/002-image-aware-embedding/spec.md`
## Summary
-Enhance the book embedding pipeline to extract images from every PDF page, generate descriptive
-text for each image, and store all content (text chunks + figure captions) with rich, consistent
-metadata in the vector store. A new document hierarchy (Book → Chapter → Section → TextChunk +
-Figure) is introduced. Postgres holds the full-text sections and figure metadata; the vector
-store holds chunk and figure caption embeddings; the local file store holds extracted image files.
-At query time, both the text-chunk store and figure-caption store are searched in parallel and
-results are merged before being sent to the LLM.
+Enhance the PDF embedding pipeline to extract figures and generate AI descriptions for them,
+making image content semantically searchable alongside text. PDF parsing and figure extraction
+are delegated to a local **Marker** server (`http://localhost:8000/marker/upload`), which
+returns reading-order text and pre-cropped figure images (base64) in a single JSON response,
+eliminating the need for PDFBox column heuristics and figure bbox rendering.
## Technical Context
**Language/Version**: Java 25 (backend), TypeScript / Node 20 (frontend)
-**Primary Dependencies**: Spring Boot 4.0.5, Spring AI 2.0.0-M4, OpenAI API (embeddings + chat), PDFBox (via Spring AI PDF reader dependency)
-**Storage**: PostgreSQL (JPA + Flyway), pgvector (Spring AI `VectorStore`), local file system (extracted images — `/uploads/figures/`)
-**Testing**: Spring Boot Test, JUnit 5, Mockito
-**Target Platform**: Linux server (Docker Compose)
-**Project Type**: Web application — backend REST API + Vue 3 frontend
-**Performance Goals**: Full book (up to 500 pages with images) processed in ≤ 30 minutes; query response unchanged from existing baseline
-**Constraints**: No new deployable units; all changes within the existing `backend/` module; image storage on local disk (S3 migration is a future concern, behind an interface)
-**Scale/Scope**: POC — <10 concurrent users; single shared book library
+**Primary Dependencies**: Spring Boot 4.0.5, Spring AI 2.0.0-M4, OpenAI API (embeddings +
+GPT-4o vision), PDFBox 3.0.3 (via `spring-ai-pdf-document-reader` — retained transitively,
+no longer used directly), Marker local HTTP API (`http://localhost:8000/marker/upload`)
+**Storage**: PostgreSQL (JPA + Flyway), pgvector (Spring AI `VectorStore`), S3-compatible
+object store (figure images via `FigureStorageService`)
+**Testing**: Maven / JUnit 5 (`spring-boot-starter-test`)
+**Target Platform**: Linux server
+**Project Type**: Web application (backend API + frontend client)
+**Performance Goals**: SC-003 — book processing time ≤ 3× text-only for ≤ 500 pages
+**Constraints**: REST API only (Constitution III); Marker server must be running locally;
+S3-compatible storage configured via env vars
+**Scale/Scope**: POC — handful of books, <10 users
## Constitution Check
-*GATE: Must pass before Phase 0 research. Re-check after Phase 1 design.*
+*GATE: Must pass before Phase 0 research. Re-checked after Phase 1 design.*
| Principle | Status | Notes |
|-----------|--------|-------|
-| I — KISS | ⚠️ Justified violation — see Complexity Tracking | Hierarchical model + dual search adds complexity; justified by precision requirement |
-| II — Easy to Change | ✅ | Figure storage wrapped behind `FigureStorageService` interface; can swap local disk for S3 |
-| III — Web-First | ✅ | All new capabilities exposed via existing REST API; no new deployable units |
-| IV — Docs as Architecture | ⚠️ Required | README Mermaid diagram MUST be updated in this PR to show new storage tiers |
+| **I. KISS** | ✅ Justified | Marker replaces a bespoke PDFBox column heuristic + Google Cloud SDK with one HTTP call. Net complexity reduction vs. the Document AI approach. |
+| **II. Easy to Change** | ✅ | `MarkerPageParser` is the only class that knows about Marker; swap the implementation to replace Marker with any other parser. `PageResult` DTO remains unchanged. |
+| **III. Web-First** | ✅ | Internal pipeline change; no public API contract change. |
+| **IV. Documentation** | ✅ | README must be updated to show Marker as a local external service. |
## Project Structure
@@ -46,60 +48,38 @@ specs/002-image-aware-embedding/
├── research.md # Phase 0 output
├── data-model.md # Phase 1 output
├── quickstart.md # Phase 1 output
-├── contracts/ # Phase 1 output
-└── tasks.md # Phase 2 output (/speckit.tasks)
+├── contracts/
+│ ├── api.md # HTTP API contracts (unchanged from initial plan)
+│ └── marker-page-result.md # Internal DTO contract (MarkerPageParser → downstream)
+└── tasks.md # Phase 2 output (/speckit.tasks — not created here)
```
-### Source Code (repository root)
+### Source Code
```text
backend/
├── src/main/java/com/aiteacher/
+│ ├── config/
+│ │ └── MarkerConfig.java # NEW: RestClient bean + base-url property
+│ ├── document/
+│ │ ├── MarkerPageParser.java # NEW: replaces DocumentAiPageParser + PdfStructureParser
+│ │ ├── PageResult.java # UPDATED: FigureBbox → FigureData (bytes not bbox)
+│ │ ├── FigureExtractionService.java # UPDATED: no PDFBox render; decode bytes directly
+│ │ ├── TextChunkingService.java # UNCHANGED
+│ │ ├── VisionDescriptionService.java # UNCHANGED
+│ │ └── [removed] DocumentAiPageParser.java
│ ├── book/
-│ │ ├── Book.java (existing)
-│ │ ├── BookController.java (existing)
-│ │ ├── BookService.java (existing)
-│ │ ├── BookRepository.java (existing)
-│ │ ├── BookStatus.java (existing)
-│ │ ├── BookEmbeddingService.java (existing — enhanced)
-│ │ └── NoKnowledgeSourceException.java (existing)
-│ ├── document/ (new package)
-│ │ ├── BookNode.java
-│ │ ├── ChapterNode.java
-│ │ ├── SectionNode.java
-│ │ ├── SectionRepository.java
-│ │ ├── TextChunkNode.java
-│ │ ├── FigureNode.java
-│ │ ├── FigureRepository.java
-│ │ ├── FigureType.java
-│ │ ├── ChunkFigureRef.java
-│ │ └── ChunkFigureRefRepository.java
-│ ├── figure/ (new package)
-│ │ ├── FigureStorageService.java (interface)
-│ │ └── LocalFigureStorageService.java (implementation)
-│ ├── retrieval/ (new package)
-│ │ └── NeurosurgeryRetriever.java
-│ ├── chat/
-│ │ └── ChatService.java (updated — uses NeurosurgeryRetriever)
-│ └── config/
-│ └── FigureStorageConfig.java (new — configures upload dir)
-└── src/main/resources/
- └── db/migration/
- ├── V4__document_hierarchy.sql (new)
- └── V5__figures_and_refs.sql (new)
-
-uploads/
-└── figures/ (runtime — extracted images; gitignored)
+│ │ └── BookEmbeddingService.java # MINOR UPDATE: inject MarkerPageParser, drop DocumentAiPageParser
+│ └── [removed] config/DocumentAiConfig.java
+├── src/main/resources/
+│ └── application.yaml # UPDATED: remove document-ai.*, add marker.base-url
+└── pom.xml # UPDATED: remove google-cloud-document-ai
```
-**Structure Decision**: Option 2 (Web Application) confirmed. All backend changes stay within
-`backend/`. Two new packages (`document/`, `retrieval/`) plus one interface package (`figure/`)
-keep concerns separated without adding a deployable unit.
+**Structure Decision**: Option 2 (backend + frontend) per constitution Technology Constraints.
+Frontend changes are display-only (render figure citations inline).
## Complexity Tracking
-| Violation | Why Needed | Simpler Alternative Rejected Because |
-|-----------|------------|-------------------------------------|
-| Document hierarchy (BookNode → ChapterNode → SectionNode) | Parent-child retrieval: chunks reference their parent section so the LLM receives full section context, not just the matching fragment. This is the established solution for RAG precision. | Flat page-per-doc model (current) loses inter-sentence context; chunk-only retrieval produces incomplete answers for multi-paragraph clinical questions |
-| Dual vector search (text chunks + figure captions) | Figure captions must be independently searchable — a query about "cavernous sinus anatomy" must surface the diagram even if no text chunk scores highly | Single vector store search would miss figures whose captions don't happen to be the highest-similarity hit; this is the core deliverable of the feature |
-| Third storage tier (local file store for images) | Extracted images cannot live in Postgres (binary blobs degrade query performance) or the vector store (only vectors). A file-per-image approach is standard. | Storing images as base64 in Postgres JSONB would bloat the DB and complicate backup/restore; the `FigureStorageService` interface keeps the implementation swappable |
+> No constitution violations — Marker reduces complexity compared to the previous
+> Google Document AI approach (fewer dependencies, no GCP credentials, no 15-page batching).
diff --git a/specs/002-image-aware-embedding/quickstart.md b/specs/002-image-aware-embedding/quickstart.md
index 277b1cd..1eb8bff 100644
--- a/specs/002-image-aware-embedding/quickstart.md
+++ b/specs/002-image-aware-embedding/quickstart.md
@@ -1,34 +1,67 @@
# Quickstart: Enhanced Embedding with Image Parsing and Metadata
-**Branch**: `002-image-aware-embedding` | **Date**: 2026-04-03
+**Branch**: `002-image-aware-embedding` | **Date**: 2026-04-04 (updated: Marker replaces Google Document AI)
---
## Prerequisites
- Docker Compose running (PostgreSQL + pgvector)
-- OpenAI API key set in `backend/src/main/resources/application.properties` or as env var `OPENAI_API_KEY`
+- OpenAI API key set as env var `OPENAI_API_KEY`
- Java 25 + Maven on PATH
+- **Marker server running** on `http://localhost:8000` (see setup below)
+- S3-compatible bucket configured (existing setup)
---
-## New Configuration
+## Marker Server Setup (one-time)
-Add to `backend/src/main/resources/application.properties`:
+Marker is a local Python service — no cloud credentials required.
-```properties
-# Figure storage
-app.figure-storage.base-path=./uploads
-app.figure-storage.min-image-size-px=100
+```bash
+# Install (Python 3.10+ required)
+pip install marker-pdf
+
+# Start the server on port 8000
+marker_server --port 8000
```
-The `uploads/figures/` directory is created automatically on first use. Add it to `.gitignore`.
+The server is ready when you see:
+```
+INFO: Uvicorn running on http://0.0.0.0:8000
+```
+
+Keep the server running in the background (or use a process manager like `systemd` or `screen`).
+
+---
+
+## Backend Configuration
+
+Add or update `backend/src/main/resources/application.yaml`:
+
+```yaml
+app:
+ figure-storage:
+ endpoint: https://your-s3-endpoint
+ region: your-region
+ bucket: ${S3_BUCKET:aiteacher}
+ access-key-id: ${S3_ACCESS_KEY_ID}
+ secret-access-key: ${S3_SECRET_ACCESS_KEY}
+ min-image-size-px: 100 # skip decorative images smaller than 100×100 px
+ marker:
+ base-url: ${MARKER_BASE_URL:http://localhost:8000}
+ embedding:
+ batch-size: 20
+ batch-delay-ms: 2000
+```
+
+No GCP credentials or project IDs are needed.
---
## Database Migration
-Two new Flyway migrations run automatically on startup:
+Two Flyway migrations run automatically on startup:
- `V4__document_hierarchy.sql` — adds `chapter` and `section` tables
- `V5__figures_and_refs.sql` — adds `figure` and `chunk_figure_ref` tables
@@ -54,10 +87,11 @@ image-aware pipeline runs. Status can be polled via `GET /api/v1/books`.
## Verifying Image Extraction
-1. Upload a PDF with diagrams: `POST /api/v1/books/upload`
-2. Wait for `status: "READY"` via `GET /api/v1/books`
-3. List figures: `GET /api/v1/books/{id}/figures` — should return at least one entry per image page
-4. Ask a diagram-specific question in chat — response `sources` should include a `type: "FIGURE"` entry
+1. Ensure Marker is running: `curl http://localhost:8000` should respond.
+2. Upload a PDF with diagrams: `POST /api/v1/books/upload`
+3. Wait for `status: "READY"` via `GET /api/v1/books`
+4. List figures: `GET /api/v1/books/{id}/figures` — should return at least one entry per image page
+5. Ask a diagram-specific question in chat — response `sources` should include a `type: "FIGURE"` entry
---
@@ -80,7 +114,8 @@ mvn test
```
Key new test classes:
-- `FigureExtractionServiceTest` — unit tests for image extraction and classification
+- `MarkerPageParserTest` — unit tests for JSON parsing and block-to-PageResult mapping
+- `FigureExtractionServiceTest` — unit tests for PNG decode, size filtering, classification
- `NeurosurgeryRetrieverTest` — unit tests for dual-search merge and deduplication
- `BookEmbeddingServiceIntegrationTest` — integration test: upload PDF with known figures,
verify figures appear in `GET /api/v1/books/{id}/figures`
diff --git a/specs/002-image-aware-embedding/research.md b/specs/002-image-aware-embedding/research.md
index 6115813..b30c0c7 100644
--- a/specs/002-image-aware-embedding/research.md
+++ b/specs/002-image-aware-embedding/research.md
@@ -1,10 +1,10 @@
# Research: Enhanced Embedding with Image Parsing and Metadata
-**Branch**: `002-image-aware-embedding` | **Date**: 2026-04-03
+**Branch**: `002-image-aware-embedding` | **Date**: 2026-04-04 (updated: Marker replaces Google Document AI)
-This document resolves all technical unknowns identified during planning. The primary source for
-decisions is the detailed architecture provided directly by the project owner, supplemented by
-Spring AI 2.0.0-M4 API specifics.
+This document resolves all technical unknowns identified during planning. Decisions 1–10 cover
+the core pipeline. The **Marker Study** section at the bottom explains why Marker was chosen
+over Google Document AI to drive PDF parsing and figure extraction.
---
@@ -28,19 +28,29 @@ association explicit and queryable.
---
-## Decision 2: Image Extraction Strategy
+## Decision 2: Document Parsing Strategy
-**Decision**: Use PDFBox (already on classpath via `spring-ai-pdf-document-reader`) to extract
-images per page. Each image is tagged with `page`, `figure_id` (derived from caption, e.g.
-"Fig. 12-4"), and the parent `sectionId`. Images are saved to local disk under
-`/uploads/figures/{bookId}/`.
+**Decision**: Use **Marker** (local HTTP server, `http://localhost:8000/marker/upload`) as the
+single entry point for PDF parsing. A single `POST` with `output_format=json` returns:
+- Reading-order text blocks (headings, paragraphs) — no column-split heuristic needed
+- Pre-cropped figure images as base64-encoded PNG in the `images` map of each `Figure` block
+- Table, equation, and code blocks as structured HTML
-**Rationale**: PDFBox is already present (Spring AI bundles it). No new dependency needed.
-Per-page extraction ensures every image is captured regardless of PDF structure.
+`MarkerPageParser` translates the Marker JSON response into `List<PageResult>`, which is the
+same internal DTO used by the rest of the pipeline.
+
+**Rationale**: Marker handles column reordering, scanned-page OCR, and figure cropping in one
+call, eliminating the PDFBox column heuristic (`PdfStructureParser`) and the PDFBox
+render+crop loop in `FigureExtractionService`. Net result: fewer classes, no cloud dependency,
+no GCP credentials.
**Alternatives considered**:
-- iText / iText7 → additional commercial dependency; overkill for extraction
-- Screenshot each page as PNG, then OCR → far slower; loses vector quality
+- PDFBox column heuristic (previous approach) → rejected: 50/50 split fails on asymmetric
+ columns and scanned pages
+- Google Document AI Layout Parser → rejected: adds GCP credentials, per-page billing, 15-page
+ batch limit, and still requires PDFBox to render+crop figure regions from bounding boxes.
+ See Marker Study below for detailed comparison.
+- Screenshot each page + OCR → far slower; loses digital text quality
---
@@ -103,18 +113,19 @@ search. This is the higher-recall path; dual search (Decision 4) is the higher-p
## Decision 6: Image Storage
-**Decision**: Extracted images are saved as PNG files to a local directory
-(`${app.figure-storage.base-path}`, defaults to `./uploads/figures/{bookId}/`). The path is
-stored in `figure.image_path` in Postgres. A `FigureStorageService` interface wraps all disk
-I/O so the implementation can be swapped to S3 or another object store without changing
-callers.
+**Decision**: Marker returns figure images as base64-encoded PNG bytes in the JSON response.
+`FigureExtractionService` decodes these bytes and passes them to `FigureStorageService`, which
+persists them to an S3-compatible bucket (`${app.figure-storage.bucket}`). The image path/URL
+is stored in `figure.image_path` in Postgres.
-**Rationale**: Local disk is the simplest viable option for a POC with <10 users. The interface
-boundary satisfies Constitution Principle II (Easy to Change).
+The `FigureStorageService` interface is unchanged; only the caller changes (from PDFBox crop
+to base64 decode).
+
+**Rationale**: Marker's pre-cropped images remove the need for PDFBox rendering.
+`FigureStorageService` interface boundary satisfies Constitution Principle II (Easy to Change).
**Alternatives considered**:
-- S3 from day 1 → operational overhead not justified at POC scale
-- Base64 in Postgres JSONB → bloats DB; complicates backup; query performance degrades
+- Store base64 in Postgres JSONB → bloats DB; complicates backup; query performance degrades
---
@@ -123,7 +134,8 @@ boundary satisfies Constitution Principle II (Easy to Change).
**Decision**: Use the enum `FigureType { ANATOMICAL_DIAGRAM, SURGICAL_PHOTOGRAPH, MRI_CT_SCAN,
TABLE, CHART, INTRAOPERATIVE_IMAGE }`. Classification is derived from:
1. Caption keywords ("MRI", "CT", "Fig.", "Table") — heuristic, no model needed
-2. Fall back to `ANATOMICAL_DIAGRAM` if unclassifiable
+2. Marker `block_type` hint (`"Table"` → TABLE, `"Figure"` / `"Picture"` → ANATOMICAL_DIAGRAM default)
+3. Fall back to `ANATOMICAL_DIAGRAM` if unclassifiable
**Rationale**: Allows the frontend to render different icon/label per type (e.g., "MRI" badge).
Heuristic classification avoids a separate model call per image at extraction time.
@@ -175,14 +187,225 @@ the process fails mid-way. An explicit, idempotent trigger is safer and more obs
## Decision 10: Minimum Image Size Threshold
-**Decision**: Images smaller than 100×100 pixels are discarded and no chunk is created. This
-threshold filters out decorative elements (bullets, dividers, publisher logos) without a
-classification model.
+**Decision**: Images smaller than 100×100 pixels are discarded and no chunk is created. Marker
+returns PNG bytes; `FigureExtractionService` decodes to `BufferedImage` solely to check
+dimensions. This threshold filters out decorative elements without a classification model.
**Rationale**: Neurosurgery textbook diagrams and MRI scans are never smaller than 100×100 px.
-The threshold is configurable via `app.figure-storage.min-image-size-px` in
-`application.properties`.
+The threshold is configurable via `app.figure-storage.min-image-size-px`.
**Alternatives considered**:
- No threshold → decorative icons pollute the figure index
- ML-based classification → accurate but adds model dependency; not needed at POC scale
+
+---
+
+# Marker Study — Why Marker Replaces Google Document AI
+
+*Added 2026-04-04.*
+
+## What Marker Offers
+
+Marker is an open-source, locally-runnable PDF-to-structured-content converter that uses a
+pipeline of deep-learning models (surya for OCR + layout detection, texify for equations).
+Key capabilities relevant to this project:
+
+| Capability | Marker | Google Document AI |
+|-----------|--------|--------------------|
+| Multi-column reading order | ✅ | ✅ |
+| OCR on scanned pages | ✅ | ✅ |
+| Figure detection | ✅ returns pre-cropped images | ⚠️ returns bbox only; PDFBox still needed |
+| Table extraction | ✅ HTML tables | ✅ |
+| JSON output with image bytes | ✅ base64 in `images` map | ❌ |
+| No cloud credentials | ✅ | ❌ GCP service account required |
+| No per-page billing | ✅ | ❌ ~$10/1,000 pages |
+| Batch size limits | None (local) | 15 pages / 20 MB per sync call |
+| Setup | `pip install marker-pdf && marker_server` | GCP project + processor + IAM |
+
+---
+
+## Does Marker Solve the Current Pain Points?
+
+### Pain Point 1: Naive 50/50 Column Split
+
+**Answer: Yes, Marker fixes this completely.**
+
+`PdfStructureParser.extractPageText()` splits pages at the horizontal midpoint with a 20%
+threshold. This fails on asymmetric columns and scanned pages. Marker's surya layout model
+returns blocks in natural reading order — no heuristic needed.
+
+### Pain Point 2: Figure Detection Misses Rasterized Figures
+
+**Answer: Yes, Marker fixes this for most cases.**
+
+`FigureExtractionService` previously iterated PDF XObjects (only finds embedded XObject images,
+misses rasterized figures and vector-path drawings). Marker's layout model detects visual
+elements by type and returns the cropped image bytes directly — no PDFBox page rendering needed.
+
+### Pain Point 3: OCR on Scanned Pages
+
+**Answer: Yes, Marker handles scanned pages transparently via surya OCR.**
+
+### Pain Point 4: Caption Detection
+
+**Answer: Improved — Marker groups caption blocks with their figure block.**
+
+The `block_type = "Caption"` block appears as a sibling or child adjacent to the `"Figure"`
+block in the Marker JSON, making caption association structural rather than regex-based.
+
+---
+
+## Marker API Integration
+
+### Local Server Setup
+
+```bash
+pip install marker-pdf
+marker_server --port 8000
+```
+
+The server exposes `POST /marker/upload` (the user's configured endpoint).
+
+### Request
+
+```
+POST http://localhost:8000/marker/upload
+Content-Type: multipart/form-data
+
+file=@document.pdf
+output_format=json
+```
+
+### Response (abbreviated)
+
+```json
+{
+ "output_format": "json",
+ "output": {
+ "block_type": "Document",
+ "children": [
+ {
+ "block_type": "Page",
+ "id": "/page/0/Page/0",
+ "children": [
+ {
+ "block_type": "SectionHeader",
+ "id": "/page/0/SectionHeader/0",
+ "html": "<h1>Cavernous Sinus Anatomy</h1>"
+ },
+ {
+ "block_type": "Text",
+ "id": "/page/0/Text/1",
+ "html": "<p>The cavernous sinus contains...</p>"
+ },
+ {
+ "block_type": "Figure",
+ "id": "/page/0/Figure/2",
+ "html": " ",
+ "images": {
+ "/page/0/Figure/2": "iVBORw0KGgo..."
+ }
+ },
+ {
+ "block_type": "Caption",
+ "id": "/page/0/Caption/3",
+ "html": "<p>Fig. 12-4. Coronal cross-section...</p>"
+ }
+ ]
+ }
+ ],
+ "metadata": { "page_stats": [...] }
+ }
+}
+```
+
+### Java Integration Pattern
+
+```java
+// MarkerPageParser — core call
+MultiValueMap<String, Object> body = new LinkedMultiValueMap<>();
+body.add("file", new FileSystemResource(pdfPath));
+body.add("output_format", "json");
+
+JsonNode response = restClient.post()
+ .uri(baseUrl + "/marker/upload")
+ .contentType(MediaType.MULTIPART_FORM_DATA)
+ .body(body)
+ .retrieve()
+ .body(JsonNode.class);
+
+JsonNode document = response.get("output");
+```
+
+### Mapping Marker Blocks to PageResult
+
+```
+Page block (id "/page/N/Page/M") → PageResult(pageNumber = N+1)
+ SectionHeader children → headingTitle (first match)
+ Text, TextInlineMath children → orderedText (HTML stripped, joined \n\n)
+ Figure children with images map → FigureData(imageBytes = base64decode(images[id]))
+ Caption sibling of Figure → FigureData.nearestCaption
+```
+
+---
+
+## Architecture Change
+
+```
+Before (Document AI — removed):
+ DocumentAiPageParser
+ → Google Document AI API (GCP, 15-page batches, credentials)
+ → returns text blocks + figure bboxes
+ PdfStructureParser (PDFBox column heuristic)
+ FigureExtractionService
+ → renders page via PDFBox at 150 DPI
+ → crops bbox region
+
+After (Marker):
+ MarkerPageParser
+ → POST PDF to http://localhost:8000/marker/upload (output_format=json)
+ → returns text blocks (correct reading order) + Figure blocks with base64 images
+ → produces List<PageResult> (same DTO, FigureData carries bytes not bbox)
+ FigureExtractionService (simplified)
+ → decodes the PNG bytes in PageResult.FigureData (already base64-decoded by MarkerPageParser)
+ → checks min size (ImageIO.read → getWidth/getHeight)
+ → saves to S3 via FigureStorageService (UNCHANGED)
+ VisionDescriptionService (UNCHANGED)
+ BookEmbeddingService orchestration (MINOR: inject MarkerPageParser)
+```
+
+**What is removed**:
+- `DocumentAiPageParser` — replaced by `MarkerPageParser`
+- `DocumentAiConfig` — replaced by `MarkerConfig`
+- `PdfStructureParser` — Marker handles reading order
+- `google-cloud-document-ai` Maven dependency
+- `app.document-ai.*` configuration properties
+
+**What stays the same**:
+- `PageResult` DTO structure (fields renamed, not restructured)
+- `FigureExtractionService` public interface
+- `TextChunkingService`, `VisionDescriptionService`, `BookEmbeddingService` orchestration
+- All JPA entities, repositories, vector store, S3 storage
+
+---
+
+## Constitution Compliance
+
+| Principle | Assessment |
+|-----------|------------|
+| **I. KISS** | ✅ Simpler than Document AI — one HTTP call replaces GCP SDK + PDFBox render loop. No new dependency beyond an HTTP client (Spring RestClient, already available). |
+| **II. Easy to Change** | ✅ `MarkerPageParser` is the only Marker-aware class. Swap it to use any other parser. `PageResult` DTO unchanged in contract. |
+| **III. Web-First** | ✅ Internal pipeline change; no API contract change. |
+| **IV. Documentation** | ✅ README must show Marker as a local external service dependency. |
+
+---
+
+## Risks & Mitigations
+
+| Risk | Likelihood | Mitigation |
+|------|-----------|------------|
+| Marker server not running when book is uploaded | Medium | `BookEmbeddingService` catches exception from `MarkerPageParser`, marks book as `FAILED`, logs full error. |
+| Marker misses some figures (complex PDFs) | Medium | `app.figure-storage.min-image-size-px` threshold can be tuned. Add fallback: if Marker returns 0 figures for a page with known images, log a warning. |
+| SC-003 (≤ 3× processing time) violated | Low | Marker runs locally (no network latency to cloud). Benchmark with a real 500-page book early. |
+| Large PDF upload to Marker (>100MB) | Low | Marker server handles the full file; no batching needed. Multipart upload limit configurable. |
+| Marker image quality vs PDFBox crop | Low | Marker crops at native resolution; quality is equivalent or better than 150 DPI PDFBox render. |
diff --git a/specs/002-image-aware-embedding/tasks.md b/specs/002-image-aware-embedding/tasks.md
index e71d4e3..7d15aa5 100644
--- a/specs/002-image-aware-embedding/tasks.md
+++ b/specs/002-image-aware-embedding/tasks.md
@@ -48,12 +48,13 @@
**Independent Test**: Upload a PDF containing at least one page with a labelled anatomical diagram. After status shows `READY`, call `GET /api/v1/books/{id}/figures` — response must contain at least one entry with `figureType`, `caption`, `page`, and `imageUrl` populated. Verify the PNG file exists at the path in `imagePath`.
-- [X] T013 [US2] Create `PdfStructureParser` service in `backend/src/main/java/com/aiteacher/document/PdfStructureParser.java` — uses Spring AI's `PagePdfDocumentReader` to extract per-page text; groups pages into `SectionEntity` records using heading-detection heuristics (lines matching `^\d+(\.\d+)*\s+[A-Z]`); groups sections into `ChapterEntity` records; persists both to Postgres via `ChapterRepository` and `SectionRepository`; returns `List` for the book
-- [X] T014 [US2] Create `FigureExtractionService` in `backend/src/main/java/com/aiteacher/document/FigureExtractionService.java` — opens PDF with PDFBox `PDDocument`; iterates pages; extracts `PDImageXObject` instances; skips images whose width or height are below `min-image-size-px`; classifies `FigureType` using the keyword-matching table from data-model.md §FigureType; parses caption from the nearest text line matching `CAPTION_PATTERN`; saves PNG via `FigureStorageService`; persists `FigureEntity` to `FigureRepository`; returns `List` per book
+- [X] T013 [US2] ~~Create `PdfStructureParser`~~ → **SUPERSEDED**: PDF parsing is handled by `MarkerPageParser` (see T013b). `PdfStructureParser` exists but is not wired into the pipeline.
+- [X] T013b [US2] Create `MarkerPageParser` in `backend/src/main/java/com/aiteacher/document/MarkerPageParser.java` — POSTs PDF to `http://localhost:8000/marker/upload?output_format=json` via Spring `RestClient`; parses JSON response into `List<PageResult>` (one per page block); extracts heading, ordered text, and pre-cropped figure PNG bytes per page
+- [X] T014 [US2] Update `FigureExtractionService` in `backend/src/main/java/com/aiteacher/document/FigureExtractionService.java` — **Marker migration**: removed PDFBox rendering + bbox-crop loop; decodes PNG bytes from `PageResult.FigureData` via `ImageIO.read()`; skips images below `min-image-size-px`; classifies `FigureType`; saves via `FigureStorageService`; persists `FigureEntity`
- [X] T015 [US2] Create `VisionDescriptionService` in `backend/src/main/java/com/aiteacher/document/VisionDescriptionService.java` — accepts a `Path` to a PNG and a caption String; calls the OpenAI vision model (via Spring AI `ChatClient` with image media type) to generate a 2–4 sentence clinical description; returns the generated description string; handles API failures by returning the caption as fallback
- [X] T016 [US2] Create `TextChunkingService` in `backend/src/main/java/com/aiteacher/document/TextChunkingService.java` — accepts a `SectionEntity`; splits `fullText` into overlapping 400–600 token windows (20-token overlap); wraps each window in a Spring AI `Document` with the flat metadata map defined in data-model.md §Text chunk document; returns `List`
- [X] T017 [US2] Create `ChunkFigureRefService` in `backend/src/main/java/com/aiteacher/document/ChunkFigureRefService.java` — accepts a Spring AI `Document` (with its `id` as `chunkId`) and a `List` for the book; scans chunk text for patterns `Fig\.\s*\d+[\-\.]\d+` and `Figure\s+\d+[\-\.]\d+`; matches against figure labels; persists `ChunkFigureRefEntity` rows via `ChunkFigureRefRepository`
-- [X] T018 [US2] Rewrite `BookEmbeddingService.embedBook()` in `backend/src/main/java/com/aiteacher/book/BookEmbeddingService.java` to orchestrate the full pipeline: (1) `PdfStructureParser` → sections; (2) parallel: `FigureExtractionService` + `TextChunkingService` for each section; (3) `VisionDescriptionService` for each figure; (4) embed figure captions+descriptions as `Document`s (metadata per data-model.md §Figure caption document) into `vectorStore`; (5) embed text chunks into `vectorStore`; (6) `ChunkFigureRefService` for each chunk; update `captionEmbeddingId` on `FigureEntity` after embedding
+- [X] T018 [US2] Update `BookEmbeddingService.embedBook()` — **Marker migration**: injected `MarkerPageParser` replacing `DocumentAiPageParser`; updated `figureExtractionService.extract()` call (removed `pdfPath` arg); updated log message. Pipeline: (1) `MarkerPageParser` → `List<PageResult>`; (2) `buildAndSaveSections()` → sections; (3) `TextChunkingService` → chunks → embed; (4) `FigureExtractionService.extract()` → figures; (5) `VisionDescriptionService` → embed figure chunks; (6) `ChunkFigureRefService` → refs
- [X] T019 [US2] Extend `BookEmbeddingService.deleteBookChunks()` to also delete: all `ChunkFigureRefEntity` rows (via `findByFigureIdIn`), all `FigureEntity` rows (via `deleteAllByBookId`), all figure PNG files (via `FigureStorageService.delete(bookId)`), all `SectionEntity` and `ChapterEntity` rows for the book
- [X] T020 [US2] Add `POST /api/v1/books/{id}/reembed` endpoint to `BookController` in `backend/src/main/java/com/aiteacher/book/BookController.java` — returns `202` with `{ bookId, status: "PROCESSING" }`; returns `404` if not found; returns `409` if already `PROCESSING`; calls `deleteBookChunks()` then `embedBook()` asynchronously