From 5c641f4bcc53d6ff756a188d53be4e4a2066e2cf Mon Sep 17 00:00:00 2001 From: Adrien Date: Sun, 5 Apr 2026 21:55:30 +0200 Subject: [PATCH] enhance page parsing using json output and html --- .../com/aiteacher/book/BookController.java | 6 +- .../aiteacher/book/BookEmbeddingService.java | 78 ++-- .../aiteacher/document/MarkerPageParser.java | 406 +++++++++--------- .../com/aiteacher/document/PageResult.java | 3 +- .../com/aiteacher/document/ParsedBook.java | 16 + .../document/S3MarkdownStorageService.java | 6 +- backend/src/main/resources/application.yaml | 3 +- frontend/src/App.vue | 7 +- frontend/src/views/BookReaderView.vue | 25 +- 9 files changed, 292 insertions(+), 258 deletions(-) create mode 100644 backend/src/main/java/com/aiteacher/document/ParsedBook.java diff --git a/backend/src/main/java/com/aiteacher/book/BookController.java b/backend/src/main/java/com/aiteacher/book/BookController.java index 3f4ed5f..6be7ef6 100644 --- a/backend/src/main/java/com/aiteacher/book/BookController.java +++ b/backend/src/main/java/com/aiteacher/book/BookController.java @@ -64,9 +64,9 @@ public class BookController { )); } - @GetMapping(value = "/{id}/pages/{pageNumber}/markdown", produces = MediaType.TEXT_PLAIN_VALUE) - public ResponseEntity getPageMarkdown(@PathVariable UUID id, - @PathVariable int pageNumber) { + @GetMapping(value = "/{id}/pages/{pageNumber}/html", produces = MediaType.TEXT_HTML_VALUE) + public ResponseEntity getPageHtml(@PathVariable UUID id, + @PathVariable int pageNumber) { bookService.getById(id); // 404 if not found try { return ResponseEntity.ok(markdownStorageService.getText(id, pageNumber)); diff --git a/backend/src/main/java/com/aiteacher/book/BookEmbeddingService.java b/backend/src/main/java/com/aiteacher/book/BookEmbeddingService.java index 1cedf36..a45375d 100644 --- a/backend/src/main/java/com/aiteacher/book/BookEmbeddingService.java +++ b/backend/src/main/java/com/aiteacher/book/BookEmbeddingService.java @@ -3,8 +3,6 @@ package 
com.aiteacher.book; import com.aiteacher.document.*; import com.aiteacher.figure.FigureStorageService; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.ai.document.Document; @@ -38,15 +36,15 @@ public class BookEmbeddingService { private final FigureStorageService figureStorageService; private final MarkdownStorageService markdownStorageService; - private static final Pattern MARKER_PLACEHOLDER = - Pattern.compile("!\\[([^\\]]*)\\]\\(marker://([^)]+)\\)"); - @Value("${app.embedding.batch-size:50}") private int embeddingBatchSize; @Value("${app.embedding.batch-delay-ms:1000}") private long embeddingBatchDelayMs; + @Value("${app.embedding.skip-embedding:false}") + private boolean skipEmbedding; + public BookEmbeddingService( VectorStore vectorStore, BookRepository bookRepository, @@ -94,8 +92,10 @@ public class BookEmbeddingService { ChapterEntity chapter = new ChapterEntity(chapterId, bookId, 1, bookTitle, 1); chapterRepository.save(chapter); - // Step 1: Parse every page with Marker — correct reading order + pre-cropped figures - List pageResults = markerPageParser.parse(pdfPath); + // Step 1: Parse with Marker — JSON (structured) + Markdown (per-page) in parallel + ParsedBook parsed = markerPageParser.parse(pdfPath); + + List pageResults = parsed.pages(); // Step 2: Build SectionEntity per page and persist List sections = buildAndSaveSections(bookId, bookTitle, chapterId, pageResults); @@ -105,22 +105,24 @@ public class BookEmbeddingService { for (SectionEntity section : sections) { allChunks.addAll(textChunkingService.chunk(section, bookTitle)); } - embedInBatches(allChunks, bookId); - log.info("Embedded {} text chunks for book {}", allChunks.size(), bookId); + if (skipEmbedding) { + log.info("skip-embedding=true — skipping text embedding for book {}", bookId); + } else { + embedInBatches(allChunks, bookId); + log.info("Embedded {} text chunks for book {}", 
allChunks.size(), bookId); + } // Step 4: Decode pre-cropped figures from Marker output FigureExtractionService.ExtractionResult extraction = figureExtractionService.extract(bookId, chapterId, pageResults); List figures = extraction.figures(); - // Step 4b: Upload per-page markdown with resolved figure URLs to S3 - for (PageResult page : pageResults) { - if (!page.markdown().isBlank()) { - String resolved = resolvePlaceholders(page.markdown(), bookId, - extraction.blockIdToFigureId()); - markdownStorageService.save(bookId, page.pageNumber(), resolved); - } - } + // Step 4b: Save per-page HTML to S3, replacing Marker image src with API URLs + parsed.htmlByPage().forEach((pageNumber, html) -> { + String resolved = resolveImageSrcs(html, bookId, extraction.blockIdToFigureId()); + markdownStorageService.save(bookId, pageNumber, resolved); + }); + log.info("Saved {} HTML pages to S3 for book {}", parsed.htmlByPage().size(), bookId); // Step 5: Vision analysis (description + visible text) → embed figure chunks for (FigureEntity figure : figures) { @@ -139,11 +141,12 @@ public class BookEmbeddingService { + (analysis.imageText().isEmpty() ? 
"" : "\n" + analysis.imageText()); String embeddingId = UUID.randomUUID().toString(); - Document figureDoc = new Document(embeddingId, embeddingContent, - buildFigureMetadata(figure, bookTitle, embeddingId, analysis.imageText())); - vectorStore.add(List.of(figureDoc)); - - figure.setCaptionEmbeddingId(UUID.fromString(embeddingId)); + if (!skipEmbedding) { + Document figureDoc = new Document(embeddingId, embeddingContent, + buildFigureMetadata(figure, bookTitle, embeddingId, analysis.imageText())); + vectorStore.add(List.of(figureDoc)); + figure.setCaptionEmbeddingId(UUID.fromString(embeddingId)); + } figureRepository.save(figure); } log.info("Embedded {} figure chunks for book {}", figures.size(), bookId); @@ -252,25 +255,20 @@ public class BookEmbeddingService { return m; } - /** Replaces {@code marker://{blockId}} placeholders with resolved API URLs. */ - private String resolvePlaceholders(String markdown, UUID bookId, - Map blockIdToFigureId) { - Matcher m = MARKER_PLACEHOLDER.matcher(markdown); - StringBuilder sb = new StringBuilder(); - while (m.find()) { - String altText = m.group(1); - String blockId = m.group(2); - String figureId = blockIdToFigureId.get(blockId); - if (figureId != null) { - String url = "/api/v1/figures/" + bookId + "/" + figureId + ".png"; - m.appendReplacement(sb, "![" + altText.replace("\\", "\\\\") - .replace("$", "\\$") + "](" + url + ")"); - } else { - m.appendReplacement(sb, ""); // figure was filtered out (too small, etc.) - } + /** + * Replaces Marker's {@code src='{blockId}'} image attributes with resolved API URLs. + * Block IDs look like {@code /page/0/Figure/2}. 
+ */ + private String resolveImageSrcs(String html, UUID bookId, Map blockIdToFigureId) { + for (Map.Entry entry : blockIdToFigureId.entrySet()) { + String blockId = entry.getKey(); + String figureId = entry.getValue(); + String apiUrl = "/api/v1/figures/" + bookId + "/" + figureId + ".png"; + // Marker emits both single and double-quoted src attributes + html = html.replace("src='" + blockId + "'", "src='" + apiUrl + "'"); + html = html.replace("src=\"" + blockId + "\"", "src=\"" + apiUrl + "\""); } - m.appendTail(sb); - return sb.toString().strip(); + return html; } private String truncate(String msg, int max) { diff --git a/backend/src/main/java/com/aiteacher/document/MarkerPageParser.java b/backend/src/main/java/com/aiteacher/document/MarkerPageParser.java index d75806b..84e61b4 100644 --- a/backend/src/main/java/com/aiteacher/document/MarkerPageParser.java +++ b/backend/src/main/java/com/aiteacher/document/MarkerPageParser.java @@ -18,17 +18,22 @@ import java.nio.file.Path; import java.util.*; /** - * Parses a PDF using the local Marker server ({@code POST /marker/upload}). + * Parses a PDF with a single call to the Marker server using {@code output_format=json}. * - *

A single HTTP call returns: + *

The JSON response contains an {@code output} field that is itself a JSON string with a + * tree structure: the root has a {@code children} array where each item is a {@code Page} block. + * Each block carries an {@code html} field with {@code <content-ref src='...'>} placeholders + * that reference its {@code children} by ID. + * + *

{@link #jsonToHtml} mirrors the Marker Python {@code json_to_html} utility: it walks the + * tree recursively and resolves every {@code content-ref} with the rendered HTML of the + * referenced child block. + * + *

Returns a {@link ParsedBook} with: *

    - *
  • Reading-order text blocks — correct for multi-column and scanned pages
  • - *
  • Section headings extracted from {@code SectionHeader} blocks
  • - *
  • Pre-cropped figure images as base64-encoded PNG in each {@code Figure} block's - * {@code images} map
  • + *
  • {@code pages} — one {@link PageResult} per non-empty page (drives embeddings)
  • + *
  • {@code htmlByPage} — full resolved HTML per page (saved to S3 for the reader)
  • *
- * - *

The response is mapped to one {@link PageResult} per page block. */ @Service public class MarkerPageParser { @@ -36,24 +41,21 @@ public class MarkerPageParser { private static final Logger log = LoggerFactory.getLogger(MarkerPageParser.class); private static final Set TEXT_BLOCK_TYPES = Set.of( - "Text", "TextInlineMath", "ListItem", "Table", "Code", "Equation", + "Text", "TextInlineMath", "ListItem", "Table", "TableOfContents", "Code", "Equation", "Footnote", "Caption", "PageHeader", "PageFooter", "Handwriting" ); private static final Set FIGURE_BLOCK_TYPES = Set.of("Figure", "Picture", "FigureGroup", "PictureGroup"); - private final RestClient restClient; - private final ObjectMapper objectMapper; + private static final ObjectMapper MAPPER = new ObjectMapper(); - public MarkerPageParser(@Qualifier("markerRestClient") RestClient restClient, ObjectMapper objectMapper) { + private final RestClient restClient; + + public MarkerPageParser(@Qualifier("markerRestClient") RestClient restClient) { this.restClient = restClient; - this.objectMapper = objectMapper; } - /** - * Parses the entire PDF and returns one {@link PageResult} per non-empty page. 
- */ - public List parse(Path pdfPath) { - log.info("Submitting {} to Marker for parsing", pdfPath.getFileName()); + public ParsedBook parse(Path pdfPath) { + log.info("Submitting {} to Marker (json)", pdfPath.getFileName()); MultiValueMap body = new LinkedMultiValueMap<>(); body.add("file", new FileSystemResource(pdfPath)); @@ -67,207 +69,219 @@ public class MarkerPageParser { .body(JsonNode.class); try { - Path debugFile = Path.of("/tmp/marker-response-md.json"); - Files.writeString(debugFile, response.toPrettyString()); - log.info("Marker response saved to {}", debugFile); + Files.writeString(Path.of("/tmp/marker-response-json.json"), response.toPrettyString()); } catch (IOException e) { - log.warn("Could not save Marker response to file", e); + log.warn("Could not save Marker response to /tmp/marker-response-json.json", e); } - List results = parseResponse(response); - log.info("Marker produced {} page results from {}", results.size(), pdfPath.getFileName()); - return results; + List pageNodes = extractPages(response); + if (pageNodes.isEmpty()) { + log.warn("Marker returned no pages for {}", pdfPath.getFileName()); + return new ParsedBook(List.of(), Map.of()); + } + log.info("Marker returned {} pages for {}", pageNodes.size(), pdfPath.getFileName()); + + List pages = new ArrayList<>(); + Map htmlByPage = new LinkedHashMap<>(); + + for (int i = 0; i < pageNodes.size(); i++) { + JsonNode pageNode = pageNodes.get(i); + int pageNumber = i + 1; // 1-based + + PageResult result = buildPageResult(pageNode, pageNumber); + String html = jsonToHtml(pageNode); + + if (!result.orderedText().isBlank() || !result.figures().isEmpty()) { + pages.add(result); + htmlByPage.put(pageNumber, html); + } + } + + log.info("Marker produced {} non-empty pages from {}", pages.size(), pdfPath.getFileName()); + return new ParsedBook(pages, htmlByPage); } - // --- Private helpers --- + // ── Page extraction ─────────────────────────────────────────────────────── - private List 
parseResponse(JsonNode response) { + /** + * Parses the {@code output} JSON string and returns the list of page nodes + * (the top-level {@code children} of the document root). + */ + private List extractPages(JsonNode response) { if (response == null) return List.of(); - - // The "output" field is a JSON-encoded string — parse it first. - // Fall back to treating the whole response as the root if "output" is absent. - JsonNode root; JsonNode outputNode = response.path("output"); - if (!outputNode.isMissingNode() && outputNode.isTextual()) { - try { - root = objectMapper.readTree(outputNode.asText()); - } catch (tools.jackson.core.JacksonException e) { - log.warn("Could not parse Marker 'output' field as JSON", e); - return List.of(); - } - } else if (!outputNode.isMissingNode()) { - root = outputNode; - } else { - root = response; - } - - JsonNode children = root.path("children"); - if (children.isMissingNode() || !children.isArray()) { - log.warn("Marker response has no 'children' array — empty result"); + if (outputNode.isMissingNode()) { + log.warn("Marker response has no 'output' field"); return List.of(); } - - List results = new ArrayList<>(); - int pageIndex = 0; - for (JsonNode pageBlock : children) { - String blockType = pageBlock.path("block_type").asText(); - if (!"Page".equals(blockType)) continue; - - int pageNumber = pageIndex + 1; - pageIndex++; - - PageResult result = parsePage(pageBlock, pageNumber); - if (!result.orderedText().isBlank() || !result.figures().isEmpty()) { - results.add(result); - } - } - return results; - } - - private PageResult parsePage(JsonNode pageBlock, int pageNumber) { - JsonNode children = pageBlock.path("children"); - if (children.isMissingNode() || !children.isArray()) { - return new PageResult(pageNumber, "", null, List.of(), ""); - } - - StringBuilder textBuilder = new StringBuilder(); - StringBuilder markdownBuilder = new StringBuilder(); - String headingTitle = null; - List figures = new ArrayList<>(); - Set consumed 
= new HashSet<>(); // indices of Caption nodes consumed by a figure - - List childList = new ArrayList<>(); - children.forEach(childList::add); - - for (int i = 0; i < childList.size(); i++) { - if (consumed.contains(i)) continue; - - JsonNode child = childList.get(i); - String type = child.path("block_type").asText(); - - if ("SectionHeader".equals(type)) { - String heading = stripHtml(child.path("html").asText()).strip(); - if (!heading.isEmpty() && headingTitle == null) { - headingTitle = heading; - } - appendText(textBuilder, heading); - appendMarkdown(markdownBuilder, "## " + heading); - - } else if (TEXT_BLOCK_TYPES.contains(type)) { - String text = stripHtml(child.path("html").asText()); - appendText(textBuilder, text); - appendMarkdown(markdownBuilder, text.strip()); - - } else if (FIGURE_BLOCK_TYPES.contains(type)) { - extractFigures(child, i, childList, figures, markdownBuilder, consumed); - } - } - - return new PageResult(pageNumber, textBuilder.toString().strip(), headingTitle, - figures, markdownBuilder.toString().strip()); - } - - /** - * Handles a figure/picture block at {@code index} in {@code siblings}. - * For group blocks (FigureGroup, PictureGroup) the image lives in a child Picture/Figure, - * and the caption is a sibling Caption child inside the group. - * For leaf blocks the caption is the next sibling in the page child list. - * Image refs are appended to {@code markdown} as {@code ![caption](marker://{blockId})}. - * Consumed caption sibling indices are added to {@code consumed}. 
- */ - private void extractFigures(JsonNode block, int index, List siblings, - List out, StringBuilder markdown, - Set consumed) { - String type = block.path("block_type").asText(); - boolean isGroup = type.endsWith("Group"); - - if (isGroup) { - JsonNode groupChildren = block.path("children"); - if (groupChildren.isMissingNode() || !groupChildren.isArray()) return; - - String groupCaption = null; - for (JsonNode sub : groupChildren) { - if ("Caption".equals(sub.path("block_type").asText())) { - String c = stripHtml(sub.path("html").asText()).strip(); - if (!c.isEmpty()) groupCaption = c; - } - } - for (JsonNode sub : groupChildren) { - String subType = sub.path("block_type").asText(); - if ("Figure".equals(subType) || "Picture".equals(subType)) { - String blockId = sub.path("id").asText(); - byte[] imageBytes = extractImageBytes(sub, blockId); - if (imageBytes != null) { - out.add(new PageResult.FigureData(imageBytes, groupCaption, blockId)); - String altText = groupCaption != null ? groupCaption : blockId; - appendMarkdown(markdown, "![" + altText + "](marker://" + blockId + ")"); - } - } - } - } else { - String blockId = block.path("id").asText(); - byte[] imageBytes = extractImageBytes(block, blockId); - if (imageBytes != null) { - String caption = null; - if (index + 1 < siblings.size()) { - JsonNode next = siblings.get(index + 1); - if ("Caption".equals(next.path("block_type").asText())) { - String c = stripHtml(next.path("html").asText()).strip(); - if (!c.isEmpty()) caption = c; - consumed.add(index + 1); - } - } - out.add(new PageResult.FigureData(imageBytes, caption, blockId)); - String altText = caption != null ? caption : blockId; - appendMarkdown(markdown, "![" + altText + "](marker://" + blockId + ")"); - } - } - } - - /** - * Extracts and base64-decodes the image bytes for this block. - * Marker stores images in the block's {@code images} map keyed by block ID. 
- */ - private byte[] extractImageBytes(JsonNode block, String blockId) { - JsonNode images = block.path("images"); - if (images.isMissingNode() || images.isEmpty()) return null; - - // Try the block's own ID first, then fall back to the first entry - JsonNode imgNode = images.path(blockId); - if (imgNode.isMissingNode()) { - imgNode = images.properties().stream() - .findFirst() - .map(e -> e.getValue()) - .orElse(imgNode); - } - - String base64 = imgNode.asText(); - if (base64.isEmpty()) return null; - try { - return Base64.getDecoder().decode(base64); - } catch (IllegalArgumentException ex) { - log.warn("Could not decode base64 image for block {}: {}", blockId, ex.getMessage()); - return null; + JsonNode root = MAPPER.readTree(outputNode.stringValue()); + JsonNode children = root.path("children"); + if (children.isMissingNode() || !children.isArray()) { + log.warn("Marker output root has no 'children' array"); + return List.of(); + } + List result = new ArrayList<>(); + children.forEach(result::add); + return result; + } catch (Exception e) { + log.warn("Could not parse Marker 'output' string as JSON: {}", e.getMessage()); + return List.of(); } } + // ── HTML rendering ──────────────────────────────────────────────────────── + + /** + * Java equivalent of the Marker Python {@code json_to_html} utility. + * + *

Algorithm: + *

    + *
  1. If the block has no children, return its {@code html} as-is (leaf node).
  2. + *
  3. Otherwise recursively render each child, then replace every + * {@code <content-ref src='...'>} placeholder in the block's own {@code html} + * with the rendered child HTML.
  4. + *
+ */ + String jsonToHtml(JsonNode block) { + String html = str(block.path("html")); + + // If the block carries image data, inject data-URI tags. + // Marker stores base64 image bytes in block.images keyed by block ID. + // Picture/Figure leaf blocks have empty html, so this is the only way to + // get the image into the rendered output. + JsonNode images = block.path("images"); + if (!images.isMissingNode() && !images.isNull() && !images.isEmpty()) { + StringBuilder imgTags = new StringBuilder(); + images.properties().forEach(entry -> { + String base64 = str(entry.getValue()); + if (!base64.isEmpty()) { + String mime = detectImageMime(base64); + imgTags.append(""); + } + }); + if (!imgTags.isEmpty()) { + html = html + imgTags; + } + } + + JsonNode children = block.path("children"); + if (children.isMissingNode() || children.isNull() || !children.isArray() || children.isEmpty()) { + return html; // leaf node + } + + // Build id → rendered-html map for all direct children + Map childHtml = new LinkedHashMap<>(); + for (JsonNode child : children) { + String id = str(child.path("id")); + childHtml.put(id, jsonToHtml(child)); + } + + // Replace every with the child's HTML + for (Map.Entry entry : childHtml.entrySet()) { + String ref = ""; + html = html.replace(ref, entry.getValue()); + } + + return html; + } + + // ── PageResult (text + figures for embeddings) ──────────────────────────── + + private PageResult buildPageResult(JsonNode pageBlock, int pageNumber) { + StringBuilder text = new StringBuilder(); + String[] headingTitle = {null}; + List figures = new ArrayList<>(); + + walkBlock(pageBlock, text, headingTitle, figures); + return new PageResult(pageNumber, text.toString().strip(), headingTitle[0], figures); + } + + /** Recursively walks the block tree, collecting text and figures in reading order. 
*/ + private void walkBlock(JsonNode block, StringBuilder text, String[] headingTitle, + List figures) { + String type = str(block.path("block_type")); + + if ("SectionHeader".equals(type)) { + String heading = stripHtml(str(block.path("html"))).strip(); + if (!heading.isEmpty() && headingTitle[0] == null) headingTitle[0] = heading; + appendText(text, heading); + + } else if (TEXT_BLOCK_TYPES.contains(type)) { + appendText(text, stripHtml(str(block.path("html")))); + + } else if (FIGURE_BLOCK_TYPES.contains(type)) { + String caption = findCaption(block); + extractFigures(block, caption, figures); + } + + // Recurse into children (content-ref ordering is implicit via tree order) + JsonNode children = block.path("children"); + if (!children.isMissingNode() && !children.isNull() && children.isArray()) { + for (JsonNode child : children) { + walkBlock(child, text, headingTitle, figures); + } + } + } + + /** Finds the first Caption child inside a figure block, if any. */ + private String findCaption(JsonNode figureBlock) { + JsonNode children = figureBlock.path("children"); + if (children.isMissingNode() || !children.isArray()) return null; + for (JsonNode child : children) { + if ("Caption".equals(str(child.path("block_type")))) { + String caption = stripHtml(str(child.path("html"))).strip(); + return caption.isEmpty() ? 
null : caption; + } + } + return null; + } + + private void extractFigures(JsonNode block, String caption, List out) { + JsonNode images = block.path("images"); + if (images.isMissingNode() || images.isEmpty()) return; + + images.properties().forEach(entry -> { + String blockId = entry.getKey(); + String base64 = str(entry.getValue()); + if (base64.isEmpty()) return; + try { + byte[] bytes = Base64.getDecoder().decode(base64); + out.add(new PageResult.FigureData(bytes, caption, blockId)); + } catch (IllegalArgumentException ex) { + log.warn("Could not decode base64 image for block {}: {}", blockId, ex.getMessage()); + } + }); + } + + // ── Utilities ───────────────────────────────────────────────────────────── + private void appendText(StringBuilder sb, String text) { + if (text == null) return; String stripped = text.strip(); if (stripped.isEmpty()) return; if (sb.length() > 0) sb.append("\n\n"); sb.append(stripped); } - private void appendMarkdown(StringBuilder sb, String text) { - if (text == null || text.isBlank()) return; - if (sb.length() > 0) sb.append("\n\n"); - sb.append(text.strip()); - } - - /** Strips HTML tags and normalises whitespace. */ private String stripHtml(String html) { if (html == null || html.isEmpty()) return ""; return html.replaceAll("<[^>]*>", "").replaceAll("\\s{2,}", " ").strip(); } + + /** Detects MIME type from the first characters of a base64-encoded image. */ + private static String detectImageMime(String base64) { + if (base64.startsWith("/9j/")) return "image/jpeg"; + if (base64.startsWith("iVBOR")) return "image/png"; + if (base64.startsWith("R0lGO")) return "image/gif"; + if (base64.startsWith("UklGR")) return "image/webp"; + return "image/png"; // safe fallback + } + + /** Null-safe string extraction from a JsonNode (Jackson 3: stringValue() returns null for non-strings). */ + private static String str(JsonNode node) { + String v = node.stringValue(); + return v != null ? 
v : ""; + } } diff --git a/backend/src/main/java/com/aiteacher/document/PageResult.java b/backend/src/main/java/com/aiteacher/document/PageResult.java index cb9989d..682a8f3 100644 --- a/backend/src/main/java/com/aiteacher/document/PageResult.java +++ b/backend/src/main/java/com/aiteacher/document/PageResult.java @@ -10,8 +10,7 @@ public record PageResult( int pageNumber, // 1-based, derived from Marker page block index String orderedText, // full page text in correct reading order (blocks joined by \n\n) String headingTitle, // first SectionHeader block on page, or null - List figures, // extracted figure images (may be empty) - String markdown // markdown representation with marker://{blockId} image placeholders + List figures // extracted figure images (may be empty) ) { /** diff --git a/backend/src/main/java/com/aiteacher/document/ParsedBook.java b/backend/src/main/java/com/aiteacher/document/ParsedBook.java new file mode 100644 index 0000000..d35dab4 --- /dev/null +++ b/backend/src/main/java/com/aiteacher/document/ParsedBook.java @@ -0,0 +1,16 @@ +package com.aiteacher.document; + +import java.util.List; +import java.util.Map; + +/** + * Result of a full Marker parse: structured page data (from JSON) plus + * per-page HTML rendered from the same JSON block tree. 
+ * + * @param pages one entry per non-empty page, derived from the chunks response + * @param htmlByPage concatenated block HTML keyed by 1-based page number + */ +public record ParsedBook( + List pages, + Map htmlByPage +) {} diff --git a/backend/src/main/java/com/aiteacher/document/S3MarkdownStorageService.java b/backend/src/main/java/com/aiteacher/document/S3MarkdownStorageService.java index 478f0cc..b064480 100644 --- a/backend/src/main/java/com/aiteacher/document/S3MarkdownStorageService.java +++ b/backend/src/main/java/com/aiteacher/document/S3MarkdownStorageService.java @@ -53,7 +53,7 @@ public class S3MarkdownStorageService implements MarkdownStorageService { byte[] bytes = markdown.getBytes(StandardCharsets.UTF_8); s3.putObject( PutObjectRequest.builder().bucket(bucket).key(key) - .contentType("text/markdown; charset=utf-8") + .contentType("text/html; charset=utf-8") .contentLength((long) bytes.length).build(), RequestBody.fromBytes(bytes)); return key; @@ -69,7 +69,7 @@ public class S3MarkdownStorageService implements MarkdownStorageService { @Override public void deleteAll(UUID bookId) { - String prefix = "markdown/" + bookId + "/"; + String prefix = "html/" + bookId + "/"; try { List toDelete = new ArrayList<>(); s3.listObjectsV2Paginator(ListObjectsV2Request.builder() @@ -92,6 +92,6 @@ public class S3MarkdownStorageService implements MarkdownStorageService { } private static String key(UUID bookId, int pageNumber) { - return "markdown/" + bookId + "/page-" + pageNumber + ".md"; + return "html/" + bookId + "/page-" + pageNumber + ".html"; } } diff --git a/backend/src/main/resources/application.yaml b/backend/src/main/resources/application.yaml index 5cf70c0..bf00383 100644 --- a/backend/src/main/resources/application.yaml +++ b/backend/src/main/resources/application.yaml @@ -64,5 +64,6 @@ app: embedding: batch-size: 20 batch-delay-ms: 2000 + skip-embedding: true marker: - base-url: ${MARKER_BASE_URL:http://localhost:8000} + base-url: 
${MARKER_BASE_URL:http://192.168.1.105:8000} diff --git a/frontend/src/App.vue b/frontend/src/App.vue index ef6713c..725faa3 100644 --- a/frontend/src/App.vue +++ b/frontend/src/App.vue @@ -64,11 +64,11 @@ body { Ubuntu, Cantarell, 'Fira Sans', 'Droid Sans', 'Helvetica Neue', sans-serif; background: #f0f4f8; color: #2d3748; - min-height: 100vh; + height: 100vh; } #app { - min-height: 100vh; + height: 100vh; display: flex; flex-direction: column; } @@ -133,6 +133,9 @@ body { .main-content { flex: 1; + min-height: 0; + display: flex; + flex-direction: column; padding: 2rem; max-width: 1200px; margin: 0 auto; diff --git a/frontend/src/views/BookReaderView.vue b/frontend/src/views/BookReaderView.vue index 1ecc12a..e3484ad 100644 --- a/frontend/src/views/BookReaderView.vue +++ b/frontend/src/views/BookReaderView.vue @@ -44,7 +44,6 @@