diff --git a/CLAUDE.md b/CLAUDE.md index d964a04..d3b2b2a 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,10 +1,13 @@ # ai-teacher Development Guidelines -Auto-generated from all feature plans. Last updated: 2026-04-03 +Auto-generated from all feature plans. Last updated: 2026-04-04 ## Active Technologies - Java 25 (backend), TypeScript / Node 20 (frontend) + Spring Boot 4.0.5, Spring AI 2.0.0-M4, OpenAI API (embeddings + chat), PDFBox (via Spring AI PDF reader dependency) (002-image-aware-embedding) - PostgreSQL (JPA + Flyway), pgvector (Spring AI `VectorStore`), local file system (extracted images — `/uploads/figures/`) (002-image-aware-embedding) +- Java 25 (backend), TypeScript / Node 20 (frontend) + Spring Boot 4.0.5, Spring AI 2.0.0-M4, OpenAI API, PDFBox (rendering only), `com.google.cloud:google-cloud-documentai` (~2.40.x) (002-image-aware-embedding) +- PostgreSQL (JPA + Flyway), pgvector (Spring AI VectorStore), S3 / local filesystem (figure images) (002-image-aware-embedding) +- PostgreSQL (JPA + Flyway), pgvector (Spring AI `VectorStore`), S3-compatible (002-image-aware-embedding) - Java 21 (backend), TypeScript / Node 20 (frontend) (001-neuro-rag-learning) @@ -24,9 +27,10 @@ npm test && npm run lint Java 21 (backend), TypeScript / Node 20 (frontend): Follow standard conventions ## Recent Changes +- 002-image-aware-embedding: Added Java 25 (backend), TypeScript / Node 20 (frontend) + Spring Boot 4.0.5, Spring AI 2.0.0-M4, OpenAI API (embeddings + +- 002-image-aware-embedding: Added Java 25 (backend), TypeScript / Node 20 (frontend) + Spring Boot 4.0.5, Spring AI 2.0.0-M4, OpenAI API, PDFBox (rendering only), `com.google.cloud:google-cloud-documentai` (~2.40.x) - 002-image-aware-embedding: Added Java 25 (backend), TypeScript / Node 20 (frontend) + Spring Boot 4.0.5, Spring AI 2.0.0-M4, OpenAI API (embeddings + chat), PDFBox (via Spring AI PDF reader dependency) -- 001-neuro-rag-learning: Added Java 21 (backend), TypeScript / Node 20 (frontend) diff --git 
a/README-marker.md b/README-marker.md new file mode 100644 index 0000000..c32133c --- /dev/null +++ b/README-marker.md @@ -0,0 +1,566 @@ +# Marker + +Marker converts documents to markdown, JSON, chunks, and HTML quickly and accurately. + +- Converts PDF, image, PPTX, DOCX, XLSX, HTML, EPUB files in all languages +- Formats tables, forms, equations, inline math, links, references, and code blocks +- Extracts and saves images +- Removes headers/footers/other artifacts +- Extensible with your own formatting and logic +- Does structured extraction, given a JSON schema (beta) +- Optionally boost accuracy with LLMs (and your own prompt) +- Works on GPU, CPU, or MPS + +For our managed API or on-prem document intelligence solution, check out [our platform here](https://datalab.to?utm_source=gh-marker). + +## Performance + + + +Marker benchmarks favorably compared to cloud services like Llamaparse and Mathpix, as well as other open source tools. + +The above results are running single PDF pages serially. Marker is significantly faster when running in batch mode, with a projected throughput of 25 pages/second on an H100. + +See [below](#benchmarks) for detailed speed and accuracy benchmarks, and instructions on how to run your own benchmarks. + +## Hybrid Mode + +For the highest accuracy, pass the `--use_llm` flag to use an LLM alongside marker. This will do things like merge tables across pages, handle inline math, format tables properly, and extract values from forms. It can use any gemini or ollama model. By default, it uses `gemini-2.0-flash`. See [below](#llm-services) for details. + +Here is a table benchmark comparing marker, gemini flash alone, and marker with use_llm: + + + +As you can see, the use_llm mode offers higher accuracy than marker or gemini alone. 
+ +## Examples + +| PDF | File type | Markdown | JSON | +|-----|-----------|------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------| +| [Think Python](https://greenteapress.com/thinkpython/thinkpython.pdf) | Textbook | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/markdown/thinkpython/thinkpython.md) | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/json/thinkpython.json) | +| [Switch Transformers](https://arxiv.org/pdf/2101.03961.pdf) | arXiv paper | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/markdown/switch_transformers/switch_trans.md) | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/json/switch_trans.json) | +| [Multi-column CNN](https://arxiv.org/pdf/1804.07821.pdf) | arXiv paper | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/markdown/multicolcnn/multicolcnn.md) | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/json/multicolcnn.json) | + +# Commercial usage + +Our model weights use a modified AI Pubs Open Rail-M license (free for research, personal use, and startups under $2M funding/revenue) and our code is GPL. For broader commercial licensing or to remove GPL requirements, visit our pricing page [here](https://www.datalab.to/pricing?utm_source=gh-marker). + +# Hosted API & On-prem + +There's a [hosted API](https://www.datalab.to?utm_source=gh-marker) and [painless on-prem solution](https://www.datalab.to/blog/self-serve-on-prem-licensing) for marker - it's free to sign up, and we'll throw in credits for you to test it out. 
+ +The API: +- Supports PDF, image, PPT, PPTX, DOC, DOCX, XLS, XLSX, HTML, EPUB files +- Is 1/4th the price of leading cloud-based competitors +- Fast - ~15s for a 250 page PDF +- Supports LLM mode +- High uptime (99.99%) + +# Community + +[Discord](https://discord.gg/KuZwXNGnfH) is where we discuss future development. + +# Installation + +You'll need python 3.10+ and [PyTorch](https://pytorch.org/get-started/locally/). + +Install with: + +```shell +pip install marker-pdf +``` + +If you want to use marker on documents other than PDFs, you will need to install additional dependencies with: + +```shell +pip install marker-pdf[full] +``` + +# Usage + +First, some configuration: + +- Your torch device will be automatically detected, but you can override this. For example, `TORCH_DEVICE=cuda`. +- Some PDFs, even digital ones, have bad text in them. Set `--force_ocr` to force OCR on all lines, or set `--strip_existing_ocr` to keep all digital text and strip out any existing OCR text. +- If you care about inline math, set `force_ocr` to convert inline math to LaTeX. + +## Interactive App + +I've included a streamlit app that lets you interactively try marker with some basic options. Run it with: + +```shell +pip install streamlit streamlit-ace +marker_gui +``` + +## Convert a single file + +```shell +marker_single /path/to/file.pdf +``` + +You can pass in PDFs or images. + +Options: +- `--page_range TEXT`: Specify which pages to process. Accepts comma-separated page numbers and ranges. Example: `--page_range "0,5-10,20"` will process pages 0, 5 through 10, and page 20. +- `--output_format [markdown|json|html|chunks]`: Specify the format for the output results. +- `--output_dir PATH`: Directory where output files will be saved. Defaults to the value specified in settings.OUTPUT_DIR. +- `--paginate_output`: Paginates the output, using `\n\n{PAGE_NUMBER}` followed by `-` * 48, then `\n\n` +- `--use_llm`: Uses an LLM to improve accuracy.
You will need to configure the LLM backend - see [below](#llm-services). +- `--force_ocr`: Force OCR processing on the entire document, even for pages that might contain extractable text. This will also format inline math properly. +- `--block_correction_prompt`: if LLM mode is active, an optional prompt that will be used to correct the output of marker. This is useful for custom formatting or logic that you want to apply to the output. +- `--strip_existing_ocr`: Remove all existing OCR text in the document and re-OCR with surya. +- `--redo_inline_math`: If you want the absolute highest quality inline math conversion, use this along with `--use_llm`. +- `--disable_image_extraction`: Don't extract images from the PDF. If you also specify `--use_llm`, then images will be replaced with a description. +- `--debug`: Enable debug mode for additional logging and diagnostic information. +- `--processors TEXT`: Override the default processors by providing their full module paths, separated by commas. Example: `--processors "module1.processor1,module2.processor2"` +- `--config_json PATH`: Path to a JSON configuration file containing additional settings. +- `config --help`: List all available builders, processors, and converters, and their associated configuration. These values can be used to build a JSON configuration file for additional tweaking of marker defaults. +- `--converter_cls`: One of `marker.converters.pdf.PdfConverter` (default) or `marker.converters.table.TableConverter`. The `PdfConverter` will convert the whole PDF, the `TableConverter` will only extract and convert tables. +- `--llm_service`: Which llm service to use if `--use_llm` is passed. This defaults to `marker.services.gemini.GoogleGeminiService`. +- `--help`: see all of the flags that can be passed into marker. 
(it supports many more options than are listed above) + +The list of supported languages for surya OCR is [here](https://github.com/VikParuchuri/surya/blob/master/surya/recognition/languages.py). If you don't need OCR, marker can work with any language. + +## Convert multiple files + +```shell +marker /path/to/input/folder +``` + +- `marker` supports all the same options from `marker_single` above. +- `--workers` is the number of conversion workers to run simultaneously. This is automatically set by default, but you can increase it to increase throughput, at the cost of more CPU/GPU usage. Marker will use 5GB of VRAM per worker at the peak, and 3.5GB average. + +## Convert multiple files on multiple GPUs + +```shell +NUM_DEVICES=4 NUM_WORKERS=15 marker_chunk_convert ../pdf_in ../md_out +``` + +- `NUM_DEVICES` is the number of GPUs to use. Should be `2` or greater. +- `NUM_WORKERS` is the number of parallel processes to run on each GPU. + +## Use from python + +See the `PdfConverter` class in `marker/converters/pdf.py` for additional arguments that can be passed. + +```python +from marker.converters.pdf import PdfConverter +from marker.models import create_model_dict +from marker.output import text_from_rendered + +converter = PdfConverter( + artifact_dict=create_model_dict(), +) +rendered = converter("FILEPATH") +text, _, images = text_from_rendered(rendered) +``` + +`rendered` will be a pydantic basemodel with different properties depending on the output type requested. With markdown output (default), you'll have the properties `markdown`, `metadata`, and `images`. For json output, you'll have `children`, `block_type`, and `metadata`. + +### Custom configuration + +You can pass configuration using the `ConfigParser`. To see all available options, do `marker_single --help`.
+ +```python +from marker.converters.pdf import PdfConverter +from marker.models import create_model_dict +from marker.config.parser import ConfigParser + +config = { + "output_format": "json", + "ADDITIONAL_KEY": "VALUE" +} +config_parser = ConfigParser(config) + +converter = PdfConverter( + config=config_parser.generate_config_dict(), + artifact_dict=create_model_dict(), + processor_list=config_parser.get_processors(), + renderer=config_parser.get_renderer(), + llm_service=config_parser.get_llm_service() +) +rendered = converter("FILEPATH") +``` + +### Extract blocks + +Each document consists of one or more pages. Pages contain blocks, which can themselves contain other blocks. It's possible to programmatically manipulate these blocks. + +Here's an example of extracting all forms from a document: + +```python +from marker.converters.pdf import PdfConverter +from marker.models import create_model_dict +from marker.schema import BlockTypes + +converter = PdfConverter( + artifact_dict=create_model_dict(), +) +document = converter.build_document("FILEPATH") +forms = document.contained_blocks((BlockTypes.Form,)) +``` + +Look at the processors for more examples of extracting and manipulating blocks. + +## Other converters + +You can also use other converters that define different conversion pipelines: + +### Extract tables + +The `TableConverter` will only convert and extract tables: + +```python +from marker.converters.table import TableConverter +from marker.models import create_model_dict +from marker.output import text_from_rendered + +converter = TableConverter( + artifact_dict=create_model_dict(), +) +rendered = converter("FILEPATH") +text, _, images = text_from_rendered(rendered) +``` + +This takes all the same configuration as the PdfConverter. You can specify the configuration `force_layout_block=Table` to avoid layout detection and instead assume every page is a table. Set `output_format=json` to also get cell bounding boxes. 
+ +You can also run this via the CLI with +```shell +marker_single FILENAME --use_llm --force_layout_block Table --converter_cls marker.converters.table.TableConverter --output_format json +``` + +### OCR Only + +If you only want to run OCR, you can also do that through the `OCRConverter`. Set `--keep_chars` to keep individual characters and bounding boxes. + +```python +from marker.converters.ocr import OCRConverter +from marker.models import create_model_dict + +converter = OCRConverter( + artifact_dict=create_model_dict(), +) +rendered = converter("FILEPATH") +``` + +This takes all the same configuration as the PdfConverter. + +You can also run this via the CLI with +```shell +marker_single FILENAME --converter_cls marker.converters.ocr.OCRConverter +``` + +### Structured Extraction (beta) + +You can run structured extraction via the `ExtractionConverter`. This requires an llm service to be setup first (see [here](#llm-services) for details). You'll get a JSON output with the extracted values. + +```python +from marker.converters.extraction import ExtractionConverter +from marker.models import create_model_dict +from marker.config.parser import ConfigParser +from pydantic import BaseModel + +class Links(BaseModel): + links: list[str] + +schema = Links.model_json_schema() +config_parser = ConfigParser({ + "page_schema": schema +}) + +converter = ExtractionConverter( + artifact_dict=create_model_dict(), + config=config_parser.generate_config_dict(), + llm_service=config_parser.get_llm_service(), +) +rendered = converter("FILEPATH") +``` + +Rendered will have an `original_markdown` field. If you pass this back in next time you run the converter, as the `existing_markdown` config key, you can skip re-parsing the document. 
+ +# Output Formats + +## Markdown + +Markdown output will include: + +- image links (images will be saved in the same folder) +- formatted tables +- embedded LaTeX equations (fenced with `$$`) +- Code is fenced with triple backticks +- Superscripts for footnotes + +## HTML + +HTML output is similar to markdown output: + +- Images are included via `img` tags +- equations are fenced with `<math>` tags +- code is in `pre` tags + +## JSON + +JSON output will be organized in a tree-like structure, with the leaf nodes being blocks. Examples of leaf nodes are a single list item, a paragraph of text, or an image. + +The output will be a list, with each list item representing a page. Each page is considered a block in the internal marker schema. There are different types of blocks to represent different elements. + +Pages have the keys: + +- `id` - unique id for the block. +- `block_type` - the type of block. The possible block types can be seen in `marker/schema/__init__.py`. As of this writing, they are ["Line", "Span", "FigureGroup", "TableGroup", "ListGroup", "PictureGroup", "Page", "Caption", "Code", "Figure", "Footnote", "Form", "Equation", "Handwriting", "TextInlineMath", "ListItem", "PageFooter", "PageHeader", "Picture", "SectionHeader", "Table", "Text", "TableOfContents", "Document"] +- `html` - the HTML for the page. Note that this will have recursive references to children. The `content-ref` tags must be replaced with the child content if you want the full html. You can see an example of this at `marker/output.py:json_to_html`. That function will take in a single block from the json output, and turn it into HTML. +- `polygon` - the 4-corner polygon of the page, in (x1,y1), (x2,y2), (x3, y3), (x4, y4) format. (x1,y1) is the top left, and coordinates go clockwise. +- `children` - the child blocks. + +The child blocks have two additional keys: + +- `section_hierarchy` - indicates the sections that the block is part of. `1` indicates an h1 tag, `2` an h2, and so on.
+- `images` - base64 encoded images. The key will be the block id, and the data will be the encoded image. + +Note that child blocks of pages can have their own children as well (a tree structure). + +```json +{ + "id": "/page/10/Page/366", + "block_type": "Page", + "html": "", + "polygon": [[0.0, 0.0], [612.0, 0.0], [612.0, 792.0], [0.0, 792.0]], + "children": [ + { + "id": "/page/10/SectionHeader/0", + "block_type": "SectionHeader", + "html": "

Supplementary Material for Subspace Adversarial Training

", + "polygon": [ + [217.845703125, 80.630859375], [374.73046875, 80.630859375], + [374.73046875, 107.0], + [217.845703125, 107.0] + ], + "children": null, + "section_hierarchy": { + "1": "/page/10/SectionHeader/1" + }, + "images": {} + }, + ... + ] + } + + +``` + +## Chunks + +Chunks format is similar to JSON, but flattens everything into a single list instead of a tree. Only the top level blocks from each page show up. It also has the full HTML of each block inside, so you don't need to crawl the tree to reconstruct it. This enable flexible and easy chunking for RAG. + +## Metadata + +All output formats will return a metadata dictionary, with the following fields: + +```json +{ + "table_of_contents": [ + { + "title": "Introduction", + "heading_level": 1, + "page_id": 0, + "polygon": [...] + } + ], // computed PDF table of contents + "page_stats": [ + { + "page_id": 0, + "text_extraction_method": "pdftext", + "block_counts": [("Span", 200), ...] + }, + ... + ] +} +``` + +# LLM Services + +When running with the `--use_llm` flag, you have a choice of services you can use: + +- `Gemini` - this will use the Gemini developer API by default. You'll need to pass `--gemini_api_key` to configuration. +- `Google Vertex` - this will use vertex, which can be more reliable. You'll need to pass `--vertex_project_id`. To use it, set `--llm_service=marker.services.vertex.GoogleVertexService`. +- `Ollama` - this will use local models. You can configure `--ollama_base_url` and `--ollama_model`. To use it, set `--llm_service=marker.services.ollama.OllamaService`. +- `Claude` - this will use the anthropic API. You can configure `--claude_api_key`, and `--claude_model_name`. To use it, set `--llm_service=marker.services.claude.ClaudeService`. +- `OpenAI` - this supports any openai-like endpoint. You can configure `--openai_api_key`, `--openai_model`, and `--openai_base_url`. To use it, set `--llm_service=marker.services.openai.OpenAIService`. 
+- `Azure OpenAI` - this uses the Azure OpenAI service. You can configure `--azure_endpoint`, `--azure_api_key`, and `--deployment_name`. To use it, set `--llm_service=marker.services.azure_openai.AzureOpenAIService`. + +These services may have additional optional configuration as well - you can see it by viewing the classes. + +# Internals + +Marker is easy to extend. The core units of marker are: + +- `Providers`, at `marker/providers`. These provide information from a source file, like a PDF. +- `Builders`, at `marker/builders`. These generate the initial document blocks and fill in text, using info from the providers. +- `Processors`, at `marker/processors`. These process specific blocks, for example the table formatter is a processor. +- `Renderers`, at `marker/renderers`. These use the blocks to render output. +- `Schema`, at `marker/schema`. The classes for all the block types. +- `Converters`, at `marker/converters`. They run the whole end to end pipeline. + +To customize processing behavior, override the `processors`. To add new output formats, write a new `renderer`. For additional input formats, write a new `provider`. + +Processors and renderers can be directly passed into the base `PdfConverter`, so you can specify your own custom processing easily. + +## API server + +There is a very simple API server you can run like this: + +```shell +pip install -U uvicorn fastapi python-multipart +marker_server --port 8001 +``` + +This will start a fastapi server that you can access at `localhost:8001`. You can go to `localhost:8001/docs` to see the endpoint options. + +You can send requests like this: + +``` +import requests +import json + +post_data = { + 'filepath': 'FILEPATH', + # Add other params here +} + +requests.post("http://localhost:8001/marker", data=json.dumps(post_data)).json() +``` + +Note that this is not a very robust API, and is only intended for small-scale use.
If you want to use this server, but want a more robust conversion option, you can use the hosted [Datalab API](https://www.datalab.to/plans). + +# Troubleshooting + +There are some settings that you may find useful if things aren't working the way you expect: + +- If you have issues with accuracy, try setting `--use_llm` to use an LLM to improve quality. You must set `GOOGLE_API_KEY` to a Gemini API key for this to work. +- Make sure to set `force_ocr` if you see garbled text - this will re-OCR the document. +- `TORCH_DEVICE` - set this to force marker to use a given torch device for inference. +- If you're getting out of memory errors, decrease worker count. You can also try splitting up long PDFs into multiple files. + +## Debugging + +Pass the `debug` option to activate debug mode. This will save images of each page with detected layout and text, as well as output a json file with additional bounding box information. + +# Benchmarks + +## Overall PDF Conversion + +We created a [benchmark set](https://huggingface.co/datasets/datalab-to/marker_benchmark) by extracting single PDF pages from common crawl. We scored based on a heuristic that aligns text with ground truth text segments, and an LLM as a judge scoring method. + +| Method | Avg Time | Heuristic Score | LLM Score | +|------------|----------|-----------------|-----------| +| marker | 2.83837 | 95.6709 | 4.23916 | +| llamaparse | 23.348 | 84.2442 | 3.97619 | +| mathpix | 6.36223 | 86.4281 | 4.15626 | +| docling | 3.69949 | 86.7073 | 3.70429 | + +Benchmarks were run on an H100 for marker and docling - llamaparse and mathpix used their cloud services.
We can also look at it by document type: + + + +| Document Type | Marker heuristic | Marker LLM | Llamaparse Heuristic | Llamaparse LLM | Mathpix Heuristic | Mathpix LLM | Docling Heuristic | Docling LLM | +|----------------------|------------------|------------|----------------------|----------------|-------------------|-------------|-------------------|-------------| +| Scientific paper | 96.6737 | 4.34899 | 87.1651 | 3.96421 | 91.2267 | 4.46861 | 92.135 | 3.72422 | +| Book page | 97.1846 | 4.16168 | 90.9532 | 4.07186 | 93.8886 | 4.35329 | 90.0556 | 3.64671 | +| Other | 95.1632 | 4.25076 | 81.1385 | 4.01835 | 79.6231 | 4.00306 | 83.8223 | 3.76147 | +| Form | 88.0147 | 3.84663 | 66.3081 | 3.68712 | 64.7512 | 3.33129 | 68.3857 | 3.40491 | +| Presentation | 95.1562 | 4.13669 | 81.2261 | 4 | 83.6737 | 3.95683 | 84.8405 | 3.86331 | +| Financial document | 95.3697 | 4.39106 | 82.5812 | 4.16111 | 81.3115 | 4.05556 | 86.3882 | 3.8 | +| Letter | 98.4021 | 4.5 | 93.4477 | 4.28125 | 96.0383 | 4.45312 | 92.0952 | 4.09375 | +| Engineering document | 93.9244 | 4.04412 | 77.4854 | 3.72059 | 80.3319 | 3.88235 | 79.6807 | 3.42647 | +| Legal document | 96.689 | 4.27759 | 86.9769 | 3.87584 | 91.601 | 4.20805 | 87.8383 | 3.65552 | +| Newspaper page | 98.8733 | 4.25806 | 84.7492 | 3.90323 | 96.9963 | 4.45161 | 92.6496 | 3.51613 | +| Magazine page | 98.2145 | 4.38776 | 87.2902 | 3.97959 | 93.5934 | 4.16327 | 93.0892 | 4.02041 | + +## Throughput + +We benchmarked throughput using a [single long PDF](https://www.greenteapress.com/thinkpython/thinkpython.pdf). + +| Method | Time per page | Time per document | VRAM used | +|---------|---------------|-------------------|---------- | +| marker | 0.18 | 43.42 | 3.17GB | + +The projected throughput is 122 pages per second on an H100 - we can run 22 individual processes given the VRAM used. + +## Table Conversion + +Marker can extract tables from PDFs using `marker.converters.table.TableConverter`. 
The table extraction performance is measured by comparing the extracted HTML representation of tables against the original HTML representations using the test split of [FinTabNet](https://developer.ibm.com/exchanges/data/all/fintabnet/). The HTML representations are compared using a tree edit distance based metric to judge both structure and content. Marker detects and identifies the structure of all tables in a PDF page and achieves these scores: + +| Method | Avg score | Total tables | +|------------------|-----------|--------------| +| marker | 0.816 | 99 | +| marker w/use_llm | 0.907 | 99 | +| gemini | 0.829 | 99 | + +The `--use_llm` flag can significantly improve table recognition performance, as you can see. + +We filter out tables that we cannot align with the ground truth, since fintabnet and our layout model have slightly different detection methods (this results in some tables being split/merged). + +## Running your own benchmarks + +You can benchmark the performance of marker on your machine. Install marker manually with: + +```shell +git clone https://github.com/VikParuchuri/marker.git +poetry install +``` + +### Overall PDF Conversion + +Download the benchmark data [here](https://drive.google.com/file/d/1ZSeWDo2g1y0BRLT7KnbmytV2bjWARWba/view?usp=sharing) and unzip. Then run the overall benchmark like this: + +```shell +python benchmarks/overall.py --methods marker --scores heuristic,llm +``` + +Options: + +- `--use_llm` use an llm to improve the marker results. +- `--max_rows` how many rows to process for the benchmark. +- `--methods` can be `llamaparse`, `mathpix`, `docling`, `marker`. Comma separated. +- `--scores` which scoring functions to use, can be `llm`, `heuristic`. Comma separated. + +### Table Conversion +The processed FinTabNet dataset is hosted [here](https://huggingface.co/datasets/datalab-to/fintabnet-test) and is automatically downloaded. 
Run the benchmark with: + +```shell +python benchmarks/table/table.py --max_rows 100 +``` + +Options: + +- `--use_llm` uses an llm with marker to improve accuracy. +- `--use_gemini` also benchmarks gemini 2.0 flash. + +# How it works + +Marker is a pipeline of deep learning models: + +- Extract text, OCR if necessary (heuristics, [surya](https://github.com/VikParuchuri/surya)) +- Detect page layout and find reading order ([surya](https://github.com/VikParuchuri/surya)) +- Clean and format each block (heuristics, [texify](https://github.com/VikParuchuri/texify), [surya](https://github.com/VikParuchuri/surya)) +- Optionally use an LLM to improve quality +- Combine blocks and postprocess complete text + +It only uses models where necessary, which improves speed and accuracy. + +# Limitations + +PDF is a tricky format, so marker will not always work perfectly. Here are some known limitations that are on the roadmap to address: + +- Very complex layouts, with nested tables and forms, may not work +- Forms may not be rendered well + +Note: Passing the `--use_llm` and `--force_ocr` flags will mostly solve these issues. + +# Usage and Deployment Examples + +You can always run `marker` locally, but if you wanted to expose it as an API, we have a few options: +- Our platform API which is powered by `marker` and `surya` and is easy to test out - it's free to sign up, and we'll include credits, [try it out here](https://datalab.to) +- Our painless on-prem solution for commercial use, which you can [read about here](https://www.datalab.to/blog/self-serve-on-prem-licensing) and gives you privacy guarantees with high throughput inference optimizations. +- [Deployment example with Modal](./examples/README_MODAL.md) that shows you how to deploy and access `marker` through a web endpoint using [`Modal`](https://modal.com). Modal is an AI compute platform that enables developers to deploy and scale models on GPUs in minutes. 
diff --git a/README.md b/README.md index e1cade0..55a9c70 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,76 @@ graph TD end ``` +## Marker API Response Structure + +The PDF parsing pipeline calls a local [Marker](https://github.com/VikParuchuri/marker) server (`POST /marker/upload`). + +### Top-level envelope + +```json +{ + "format": "json", + "output": "" +} +``` + +`output` is a **JSON-encoded string** (not a nested object) and must be parsed a second time to get the document tree. + +### Parsed `output` shape + +``` +{ + "children": [ , ... ] +} +``` + +### Block types + +Every block shares these fields: + +| Field | Type | Notes | +|------------------|-------------------|--------------------------------------------| +| `id` | string | e.g. `/page/0/Picture/2` | +| `block_type` | string | see table below | +| `html` | string | rendered HTML; may contain `` | +| `bbox` | `[x0,y0,x1,y1]` | bounding box in page coordinates | +| `children` | array or null | nested blocks | +| `images` | object or null | base64 PNG map (leaf image blocks only) | +| `section_hierarchy` | object | heading ancestry | + +#### Known `block_type` values + +| block_type | Category | Notes | +|------------------|----------|-------------------------------------------------------| +| `Page` | structure | Top-level; direct children are the page content | +| `SectionHeader` | text | Section / chapter heading | +| `Text` | text | | +| `TextInlineMath` | text | | +| `ListItem` | text | | +| `Table` | text | | +| `Code` | text | | +| `Equation` | text | | +| `Footnote` | text | | +| `Caption` | text | Usually a child of a `*Group` block | +| `PageHeader` | text | | +| `PageFooter` | text | | +| `Handwriting` | text | | +| `Picture` | image | Leaf block; `images` map holds base64 PNG keyed by ID | +| `Figure` | image | Leaf block; same as `Picture` | +| `PictureGroup` | container | Wraps one `Picture` + one `Caption` child | +| `FigureGroup` | container | Wraps one `Figure` + one `Caption` 
child | + +### Image extraction + +Images are only present on **leaf** image blocks (`Picture`, `Figure`). +Group blocks (`PictureGroup`, `FigureGroup`) have `images: null` — the base64 PNG lives on the child leaf block. + +``` +PictureGroup +├── Picture ← images: { "/page/0/Picture/2": "" } +└── Caption ← html: "

Figure 1 — ...

" +``` + ## Stack - **Backend**: Spring Boot 4.0.5 + Spring AI 2.0.0-M4, Java 21, Maven diff --git a/backend/pom.xml b/backend/pom.xml index cc5cd35..1ca8141 100644 --- a/backend/pom.xml +++ b/backend/pom.xml @@ -108,7 +108,7 @@ spring-ai-pdf-document-reader - + org.apache.pdfbox pdfbox diff --git a/backend/src/main/java/com/aiteacher/book/BookController.java b/backend/src/main/java/com/aiteacher/book/BookController.java index c063152..3f4ed5f 100644 --- a/backend/src/main/java/com/aiteacher/book/BookController.java +++ b/backend/src/main/java/com/aiteacher/book/BookController.java @@ -2,7 +2,9 @@ package com.aiteacher.book; import com.aiteacher.document.FigureEntity; import com.aiteacher.document.FigureRepository; +import com.aiteacher.document.MarkdownStorageService; import org.springframework.http.HttpStatus; +import org.springframework.http.MediaType; import org.springframework.http.ResponseEntity; import org.springframework.web.bind.annotation.*; import org.springframework.web.multipart.MultipartFile; @@ -18,10 +20,13 @@ public class BookController { private final BookService bookService; private final FigureRepository figureRepository; + private final MarkdownStorageService markdownStorageService; - public BookController(BookService bookService, FigureRepository figureRepository) { + public BookController(BookService bookService, FigureRepository figureRepository, + MarkdownStorageService markdownStorageService) { this.bookService = bookService; this.figureRepository = figureRepository; + this.markdownStorageService = markdownStorageService; } @PostMapping(consumes = "multipart/form-data") @@ -59,6 +64,17 @@ public class BookController { )); } + @GetMapping(value = "/{id}/pages/{pageNumber}/markdown", produces = MediaType.TEXT_PLAIN_VALUE) + public ResponseEntity getPageMarkdown(@PathVariable UUID id, + @PathVariable int pageNumber) { + bookService.getById(id); // 404 if not found + try { + return ResponseEntity.ok(markdownStorageService.getText(id, 
pageNumber)); + } catch (Exception e) { + return ResponseEntity.notFound().build(); + } + } + @GetMapping("/{id}/figures") public ResponseEntity> figures(@PathVariable UUID id) { bookService.getById(id); // 404 if not found diff --git a/backend/src/main/java/com/aiteacher/book/BookEmbeddingService.java b/backend/src/main/java/com/aiteacher/book/BookEmbeddingService.java index 834c0d2..1cedf36 100644 --- a/backend/src/main/java/com/aiteacher/book/BookEmbeddingService.java +++ b/backend/src/main/java/com/aiteacher/book/BookEmbeddingService.java @@ -2,6 +2,9 @@ package com.aiteacher.book; import com.aiteacher.document.*; import com.aiteacher.figure.FigureStorageService; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.ai.document.Document; @@ -23,13 +26,7 @@ public class BookEmbeddingService { private final VectorStore vectorStore; private final BookRepository bookRepository; - - @Value("${app.embedding.batch-size:50}") - private int embeddingBatchSize; - - @Value("${app.embedding.batch-delay-ms:1000}") - private long embeddingBatchDelayMs; - private final PdfStructureParser pdfStructureParser; + private final MarkerPageParser markerPageParser; private final FigureExtractionService figureExtractionService; private final VisionDescriptionService visionDescriptionService; private final TextChunkingService textChunkingService; @@ -39,11 +36,21 @@ public class BookEmbeddingService { private final FigureRepository figureRepository; private final ChunkFigureRefRepository chunkFigureRefRepository; private final FigureStorageService figureStorageService; + private final MarkdownStorageService markdownStorageService; + + private static final Pattern MARKER_PLACEHOLDER = + Pattern.compile("!\\[([^\\]]*)\\]\\(marker://([^)]+)\\)"); + + @Value("${app.embedding.batch-size:50}") + private int embeddingBatchSize; + + @Value("${app.embedding.batch-delay-ms:1000}") + private long 
embeddingBatchDelayMs; public BookEmbeddingService( VectorStore vectorStore, BookRepository bookRepository, - PdfStructureParser pdfStructureParser, + MarkerPageParser markerPageParser, FigureExtractionService figureExtractionService, VisionDescriptionService visionDescriptionService, TextChunkingService textChunkingService, @@ -52,10 +59,11 @@ public class BookEmbeddingService { ChapterRepository chapterRepository, FigureRepository figureRepository, ChunkFigureRefRepository chunkFigureRefRepository, - FigureStorageService figureStorageService) { + FigureStorageService figureStorageService, + MarkdownStorageService markdownStorageService) { this.vectorStore = vectorStore; this.bookRepository = bookRepository; - this.pdfStructureParser = pdfStructureParser; + this.markerPageParser = markerPageParser; this.figureExtractionService = figureExtractionService; this.visionDescriptionService = visionDescriptionService; this.textChunkingService = textChunkingService; @@ -65,11 +73,12 @@ public class BookEmbeddingService { this.figureRepository = figureRepository; this.chunkFigureRefRepository = chunkFigureRefRepository; this.figureStorageService = figureStorageService; + this.markdownStorageService = markdownStorageService; } @Async public void embedBook(UUID bookId, String bookTitle, Path pdfPath) { - log.info("Starting image-aware embedding for book {} ({})", bookId, bookTitle); + log.info("Starting Marker-powered embedding for book {} ({})", bookId, bookTitle); Book book = bookRepository.findById(bookId).orElse(null); if (book == null) { @@ -81,59 +90,73 @@ public class BookEmbeddingService { book.setStatus(BookStatus.PROCESSING); bookRepository.save(book); - // Step 1: Parse PDF into page-level sections persisted in Postgres - List sections = pdfStructureParser.parse(bookId, bookTitle, pdfPath); String chapterId = bookId + "-ch1"; + ChapterEntity chapter = new ChapterEntity(chapterId, bookId, 1, bookTitle, 1); + chapterRepository.save(chapter); - // Step 2: Build and 
embed text chunks for all sections in batches + // Step 1: Parse every page with Marker — correct reading order + pre-cropped figures + List pageResults = markerPageParser.parse(pdfPath); + + // Step 2: Build SectionEntity per page and persist + List sections = buildAndSaveSections(bookId, bookTitle, chapterId, pageResults); + + // Step 3: Chunk and embed text List allChunks = new ArrayList<>(); for (SectionEntity section : sections) { - List chunks = textChunkingService.chunk(section, bookTitle); - allChunks.addAll(chunks); + allChunks.addAll(textChunkingService.chunk(section, bookTitle)); } embedInBatches(allChunks, bookId); log.info("Embedded {} text chunks for book {}", allChunks.size(), bookId); - // Step 3: Extract images from the PDF, save to file store, persist FigureEntity - List figures = figureExtractionService.extract( - bookId, chapterId, sections, pdfPath); + // Step 4: Decode pre-cropped figures from Marker output + FigureExtractionService.ExtractionResult extraction = + figureExtractionService.extract(bookId, chapterId, pageResults); + List figures = extraction.figures(); - // Step 4: For each figure, generate vision description and embed caption + // Step 4b: Upload per-page markdown with resolved figure URLs to S3 + for (PageResult page : pageResults) { + if (!page.markdown().isBlank()) { + String resolved = resolvePlaceholders(page.markdown(), bookId, + extraction.blockIdToFigureId()); + markdownStorageService.save(bookId, page.pageNumber(), resolved); + } + } + + // Step 5: Vision analysis (description + visible text) → embed figure chunks for (FigureEntity figure : figures) { byte[] imageBytes = figureStorageService.getBytes(figure.getImagePath()); - String description = visionDescriptionService.describe( - imageBytes, figure.getCaption()); + VisionDescriptionService.ImageAnalysis analysis = + visionDescriptionService.analyze(imageBytes, figure.getCaption()); - // Use description as caption fallback if no caption was detected if 
(figure.getCaption() == null || figure.getCaption().isBlank()) { - figure.setCaption(description); + figure.setCaption(analysis.description()); figureRepository.save(figure); } - // Content for embedding = vision description + caption for maximum signal - String embeddingContent = description - + (figure.getCaption() != null ? "\n" + figure.getCaption() : ""); + // Embedding content: description + caption + visible image text + String embeddingContent = analysis.description() + + (figure.getCaption() != null ? "\n" + figure.getCaption() : "") + + (analysis.imageText().isEmpty() ? "" : "\n" + analysis.imageText()); String embeddingId = UUID.randomUUID().toString(); - Map metadata = buildFigureMetadata(figure, bookTitle, embeddingId); - Document figureDoc = new Document(embeddingId, embeddingContent, metadata); + Document figureDoc = new Document(embeddingId, embeddingContent, + buildFigureMetadata(figure, bookTitle, embeddingId, analysis.imageText())); vectorStore.add(List.of(figureDoc)); figure.setCaptionEmbeddingId(UUID.fromString(embeddingId)); figureRepository.save(figure); } - log.info("Embedded {} figure captions for book {}", figures.size(), bookId); + log.info("Embedded {} figure chunks for book {}", figures.size(), bookId); - // Step 5: Link text chunks to figures via text references + // Step 6: Link text chunks to figures via in-text references for (SectionEntity section : sections) { List sectionChunks = allChunks.stream() - .filter(d -> section.getId().equals(d.getMetadata().get("section_id"))) - .toList(); + .filter(d -> section.getId().equals(d.getMetadata().get("section_id"))) + .toList(); List sectionFigures = figures.stream() - .filter(f -> section.getId().equals(f.getSectionId())) - .toList(); - chunkFigureRefService.linkChunksToFigures( - sectionChunks, sectionFigures, section.getPageStart()); + .filter(f -> section.getId().equals(f.getSectionId())) + .toList(); + chunkFigureRefService.linkChunksToFigures(sectionChunks, sectionFigures, 
section.getPageStart()); } book.setStatus(BookStatus.READY); @@ -142,7 +165,7 @@ public class BookEmbeddingService { bookRepository.save(book); log.info("Finished embedding book {} — {} pages, {} figures", - bookId, sections.size(), figures.size()); + bookId, sections.size(), figures.size()); } catch (Exception ex) { log.error("Failed to embed book {}", bookId, ex); @@ -156,53 +179,63 @@ public class BookEmbeddingService { public void deleteBookChunks(UUID bookId) { log.info("Deleting all data for book {}", bookId); try { - // Delete chunk-figure refs (by figureId for this book) List figureIds = figureRepository.findAllByBookId(bookId) - .stream().map(FigureEntity::getId).toList(); + .stream().map(FigureEntity::getId).toList(); if (!figureIds.isEmpty()) { chunkFigureRefRepository.deleteByFigureIdIn(figureIds); } - - // Delete figures from Postgres figureRepository.deleteAllByBookId(bookId); - - // Delete figure files from disk figureStorageService.deleteAll(bookId); - - // Delete sections and chapters from Postgres + markdownStorageService.deleteAll(bookId); sectionRepository.deleteAllByBookId(bookId); chapterRepository.deleteAllByBookId(bookId); - // Delete vector store entries (text chunks + figure embeddings) FilterExpressionBuilder b = new FilterExpressionBuilder(); vectorStore.delete(b.eq("book_id", bookId.toString()).build()); - } catch (Exception ex) { log.warn("Error during cleanup for book {}: {}", bookId, ex.getMessage()); } } + // --- Private helpers --- + + private List buildAndSaveSections(UUID bookId, String bookTitle, + String chapterId, + List pageResults) { + List sections = new ArrayList<>(); + for (PageResult page : pageResults) { + if (page.orderedText().isBlank()) continue; + + String sectionId = bookId + "-p" + page.pageNumber(); + String title = page.headingTitle() != null ? 
page.headingTitle() : "Page " + page.pageNumber(); + + SectionEntity section = new SectionEntity( + sectionId, chapterId, bookId, + String.valueOf(page.pageNumber()), + title, + page.pageNumber(), page.pageNumber(), + page.orderedText()); + sections.add(sectionRepository.save(section)); + } + return sections; + } + private void embedInBatches(List docs, UUID bookId) { int total = docs.size(); for (int i = 0; i < total; i += embeddingBatchSize) { List batch = docs.subList(i, Math.min(i + embeddingBatchSize, total)); vectorStore.add(batch); - int batchNum = i / embeddingBatchSize + 1; - int totalBatches = (total - 1) / embeddingBatchSize + 1; - log.debug("Embedded batch {}/{} for book {}", batchNum, totalBatches, bookId); + log.debug("Embedded batch {}/{} for book {}", + i / embeddingBatchSize + 1, (total - 1) / embeddingBatchSize + 1, bookId); if (i + embeddingBatchSize < total) { - try { - Thread.sleep(embeddingBatchDelayMs); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - log.warn("Embedding batch sleep interrupted for book {}", bookId); - } + try { Thread.sleep(embeddingBatchDelayMs); } + catch (InterruptedException e) { Thread.currentThread().interrupt(); } } } } private Map buildFigureMetadata(FigureEntity figure, String bookTitle, - String embeddingId) { + String embeddingId, String imageText) { Map m = new HashMap<>(); m.put("type", "FIGURE"); m.put("book_id", figure.getBookId().toString()); @@ -215,9 +248,31 @@ public class BookEmbeddingService { m.put("label", figure.getLabel() != null ? figure.getLabel() : ""); m.put("page", figure.getPage()); m.put("embedding_id", embeddingId); + m.put("image_text", imageText); // verbatim text visible inside the image return m; } + /** Replaces {@code marker://{blockId}} placeholders with resolved API URLs. 
*/ + private String resolvePlaceholders(String markdown, UUID bookId, + Map blockIdToFigureId) { + Matcher m = MARKER_PLACEHOLDER.matcher(markdown); + StringBuilder sb = new StringBuilder(); + while (m.find()) { + String altText = m.group(1); + String blockId = m.group(2); + String figureId = blockIdToFigureId.get(blockId); + if (figureId != null) { + String url = "/api/v1/figures/" + bookId + "/" + figureId + ".png"; + m.appendReplacement(sb, "![" + altText.replace("\\", "\\\\") + .replace("$", "\\$") + "](" + url + ")"); + } else { + m.appendReplacement(sb, ""); // figure was filtered out (too small, etc.) + } + } + m.appendTail(sb); + return sb.toString().strip(); + } + private String truncate(String msg, int max) { if (msg == null) return null; return msg.length() <= max ? msg : msg.substring(0, max); diff --git a/backend/src/main/java/com/aiteacher/config/MarkerConfig.java b/backend/src/main/java/com/aiteacher/config/MarkerConfig.java new file mode 100644 index 0000000..28c9a57 --- /dev/null +++ b/backend/src/main/java/com/aiteacher/config/MarkerConfig.java @@ -0,0 +1,30 @@ +package com.aiteacher.config; + +import org.springframework.beans.factory.annotation.Value; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; +import org.springframework.http.client.JdkClientHttpRequestFactory; +import org.springframework.web.client.RestClient; + +import java.net.http.HttpClient; + +@Configuration +public class MarkerConfig { + + @Value("${app.marker.base-url:http://localhost:8000}") + private String markerBaseUrl; + + @Bean + RestClient markerRestClient() { + // Use the JDK HTTP client with no timeout — Marker conversions can take several minutes. + HttpClient httpClient = HttpClient.newBuilder() + .build(); + JdkClientHttpRequestFactory factory = new JdkClientHttpRequestFactory(httpClient); + // No read timeout set: JDK HTTP client defaults to no deadline. 
+ + return RestClient.builder() + .baseUrl(markerBaseUrl) + .requestFactory(factory) + .build(); + } +} diff --git a/backend/src/main/java/com/aiteacher/document/FigureExtractionService.java b/backend/src/main/java/com/aiteacher/document/FigureExtractionService.java index a80ed52..25620e2 100644 --- a/backend/src/main/java/com/aiteacher/document/FigureExtractionService.java +++ b/backend/src/main/java/com/aiteacher/document/FigureExtractionService.java @@ -1,43 +1,43 @@ package com.aiteacher.document; import com.aiteacher.figure.FigureStorageService; -import org.apache.pdfbox.Loader; -import org.apache.pdfbox.cos.COSName; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.PDPage; -import org.apache.pdfbox.pdmodel.graphics.PDXObject; -import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Value; import org.springframework.stereotype.Service; +import javax.imageio.ImageIO; import java.awt.image.BufferedImage; +import java.io.ByteArrayInputStream; import java.io.IOException; -import java.nio.file.Path; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.UUID; import java.util.regex.Matcher; import java.util.regex.Pattern; /** - * Extracts images from each PDF page using PDFBox. - * Images below the configured minimum size are skipped. - * Caption is detected by the "Fig." pattern in page text. + * Extracts figure images from {@link PageResult.FigureData} entries produced by + * {@link MarkerPageParser}. + * + *

<p>Marker returns pre-cropped PNG bytes for each detected figure, so no PDFBox + * page rendering or bounding-box cropping is needed. This service: + * <ol>
+ *   <li>Decodes the PNG bytes to check dimensions (skip images below min size)</li>
+ *   <li>Classifies the figure type from caption and surrounding text keywords</li>
+ *   <li>Persists the image via {@link FigureStorageService}</li>
+ *   <li>Persists a {@link FigureEntity} to the database</li>
+ * </ol>
*/ @Service public class FigureExtractionService { private static final Logger log = LoggerFactory.getLogger(FigureExtractionService.class); - // Caption: line starting with "Fig." or "Figure" followed by a number - private static final Pattern CAPTION_PATTERN = - Pattern.compile("(?m)^(Fig\\.?\\s*\\d+[\\-.]?\\d*[^\\n]*)", Pattern.CASE_INSENSITIVE); - - // Figure label: "Fig. 12-4" or "Fig. 12.4" private static final Pattern LABEL_PATTERN = - Pattern.compile("(?i)Fig\\.?\\s*(\\d+[\\-.\\d]*)"); + Pattern.compile("(?i)Fig\\.?\\s*(\\d+[\\-.\\d]*)"); private final FigureStorageService storageService; private final FigureRepository figureRepository; @@ -52,65 +52,77 @@ public class FigureExtractionService { this.minImageSizePx = minImageSizePx; } + /** Holds the extraction output: persisted figures and a Marker blockId → DB figureId map. */ + public record ExtractionResult(List figures, Map blockIdToFigureId) {} + /** - * Extracts all qualifying images from the PDF for the given book. - * Returns persisted FigureEntity list (without vision descriptions — set later). + * Extracts and persists figures for all pages described by {@code pageResults}. 
+ * + * @param bookId owning book + * @param chapterId chapter bucket for these sections + * @param pageResults Marker parse output — each entry's {@code figures} list + * carries pre-cropped PNG bytes for that page + * @return {@link ExtractionResult} with persisted figures and blockId→figureId map + * (used to resolve markdown image placeholders) */ - public List extract(UUID bookId, String chapterId, - List sections, Path pdfPath) { + public ExtractionResult extract(UUID bookId, String chapterId, + List pageResults) { List figures = new ArrayList<>(); + Map blockIdToFigureId = new HashMap<>(); int figureCounter = 0; - try (PDDocument doc = Loader.loadPDF(pdfPath.toFile())) { - for (SectionEntity section : sections) { - int pageIndex = section.getPageStart() - 1; // 0-based - if (pageIndex < 0 || pageIndex >= doc.getNumberOfPages()) continue; - - PDPage page = doc.getPage(pageIndex); - String pageText = section.getFullText(); + for (PageResult page : pageResults) { + if (page.figures().isEmpty()) continue; + for (PageResult.FigureData figureData : page.figures()) { try { - for (COSName name : page.getResources().getXObjectNames()) { - PDXObject xObject = page.getResources().getXObject(name); - if (!(xObject instanceof PDImageXObject image)) continue; - - BufferedImage bufferedImage = image.getImage(); - if (bufferedImage.getWidth() < minImageSizePx - || bufferedImage.getHeight() < minImageSizePx) { - continue; // skip decorative images - } - - figureCounter++; - String figureId = bookId + "-fig-" + pageIndex + "-" + figureCounter; - String caption = detectCaption(pageText); - String label = detectLabel(caption, figureCounter); - FigureType type = classifyType(caption, pageText); - - String imagePath = storageService.save(bookId, figureId, bufferedImage); - - FigureEntity figure = new FigureEntity( - figureId, bookId, section.getId(), chapterId, - label, caption, type, section.getPageStart(), imagePath - ); - figures.add(figureRepository.save(figure)); + 
BufferedImage image = decodeImage(figureData.imageBytes()); + if (image == null) { + log.debug("Could not decode image on page {} of book {} (block {})", + page.pageNumber(), bookId, figureData.blockId()); + continue; } - } catch (IOException ex) { - log.warn("Failed to extract images from page {} of book {}: {}", - section.getPageStart(), bookId, ex.getMessage()); + if (image.getWidth() < minImageSizePx || image.getHeight() < minImageSizePx) { + log.debug("Skipping small figure on page {} ({}×{})", + page.pageNumber(), image.getWidth(), image.getHeight()); + continue; + } + + figureCounter++; + String figureId = bookId + "-fig-" + page.pageNumber() + "-" + figureCounter; + String caption = figureData.nearestCaption(); + String label = detectLabel(caption, figureCounter); + FigureType type = classifyType(caption, page.orderedText()); + + String sectionId = bookId + "-p" + page.pageNumber(); + String imagePath = storageService.save(bookId, figureId, image); + + FigureEntity figure = new FigureEntity( + figureId, bookId, sectionId, chapterId, + label, caption, type, page.pageNumber(), imagePath); + figures.add(figureRepository.save(figure)); + blockIdToFigureId.put(figureData.blockId(), figureId); + + } catch (Exception ex) { + log.warn("Failed to extract figure on page {} of book {}: {}", + page.pageNumber(), bookId, ex.getMessage()); } } - } catch (IOException ex) { - log.error("Could not open PDF for image extraction, book {}", bookId, ex); } log.info("Extracted {} figures for book {}", figures.size(), bookId); - return figures; + return new ExtractionResult(figures, blockIdToFigureId); } - private String detectCaption(String pageText) { - if (pageText == null) return null; - Matcher m = CAPTION_PATTERN.matcher(pageText); - return m.find() ? 
m.group(1).trim() : null; + // --- Private helpers --- + + private BufferedImage decodeImage(byte[] imageBytes) { + if (imageBytes == null || imageBytes.length == 0) return null; + try { + return ImageIO.read(new ByteArrayInputStream(imageBytes)); + } catch (IOException ex) { + return null; + } } private String detectLabel(String caption, int counter) { @@ -122,14 +134,18 @@ public class FigureExtractionService { } private FigureType classifyType(String caption, String pageText) { - String combined = ((caption != null ? caption : "") + " " + (pageText != null ? pageText : "")).toLowerCase(); + String combined = ((caption != null ? caption : "") + " " + + (pageText != null ? pageText : "")).toLowerCase(); if (combined.contains("mri") || combined.contains("ct ") || combined.contains("magnetic") - || combined.contains("tomography")) return FigureType.MRI_CT_SCAN; - if (combined.contains("intraoperative") || combined.contains("intra-op")) return FigureType.INTRAOPERATIVE_IMAGE; - if (caption != null && caption.toLowerCase().startsWith("table")) return FigureType.TABLE; + || combined.contains("tomography")) return FigureType.MRI_CT_SCAN; + if (combined.contains("intraoperative") || combined.contains("intra-op")) + return FigureType.INTRAOPERATIVE_IMAGE; + if (caption != null && caption.toLowerCase().startsWith("table")) + return FigureType.TABLE; if (combined.contains("chart") || combined.contains("histogram") || combined.contains("graph")) - return FigureType.CHART; - if (combined.contains("photograph") || combined.contains("photo")) return FigureType.SURGICAL_PHOTOGRAPH; + return FigureType.CHART; + if (combined.contains("photograph") || combined.contains("photo")) + return FigureType.SURGICAL_PHOTOGRAPH; return FigureType.ANATOMICAL_DIAGRAM; } } diff --git a/backend/src/main/java/com/aiteacher/document/MarkdownStorageService.java b/backend/src/main/java/com/aiteacher/document/MarkdownStorageService.java new file mode 100644 index 0000000..7359978 --- /dev/null +++ 
b/backend/src/main/java/com/aiteacher/document/MarkdownStorageService.java @@ -0,0 +1,14 @@ +package com.aiteacher.document; + +import java.util.UUID; + +public interface MarkdownStorageService { + /** Uploads the markdown content and returns the S3 key. */ + String save(UUID bookId, int pageNumber, String markdown); + + /** Downloads and returns the markdown content for the given book and page. */ + String getText(UUID bookId, int pageNumber); + + /** Deletes all markdown files for the given book. */ + void deleteAll(UUID bookId); +} diff --git a/backend/src/main/java/com/aiteacher/document/MarkerPageParser.java b/backend/src/main/java/com/aiteacher/document/MarkerPageParser.java new file mode 100644 index 0000000..d75806b --- /dev/null +++ b/backend/src/main/java/com/aiteacher/document/MarkerPageParser.java @@ -0,0 +1,273 @@ +package com.aiteacher.document; + +import tools.jackson.databind.JsonNode; +import tools.jackson.databind.ObjectMapper; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Qualifier; +import org.springframework.core.io.FileSystemResource; +import org.springframework.http.MediaType; +import org.springframework.stereotype.Service; +import org.springframework.util.LinkedMultiValueMap; +import org.springframework.util.MultiValueMap; +import org.springframework.web.client.RestClient; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.*; + +/** + * Parses a PDF using the local Marker server ({@code POST /marker/upload}). + * + *

<p>A single HTTP call returns: + * <ul>
+ *   <li>Reading-order text blocks — correct for multi-column and scanned pages</li>
+ *   <li>Section headings extracted from {@code SectionHeader} blocks</li>
+ *   <li>Pre-cropped figure images as base64-encoded PNG in each {@code Figure} block's + * {@code images} map</li>
+ * </ul>
+ *
+ * <p>
The response is mapped to one {@link PageResult} per page block. + */ +@Service +public class MarkerPageParser { + + private static final Logger log = LoggerFactory.getLogger(MarkerPageParser.class); + + private static final Set TEXT_BLOCK_TYPES = Set.of( + "Text", "TextInlineMath", "ListItem", "Table", "Code", "Equation", + "Footnote", "Caption", "PageHeader", "PageFooter", "Handwriting" + ); + private static final Set FIGURE_BLOCK_TYPES = Set.of("Figure", "Picture", "FigureGroup", "PictureGroup"); + + private final RestClient restClient; + private final ObjectMapper objectMapper; + + public MarkerPageParser(@Qualifier("markerRestClient") RestClient restClient, ObjectMapper objectMapper) { + this.restClient = restClient; + this.objectMapper = objectMapper; + } + + /** + * Parses the entire PDF and returns one {@link PageResult} per non-empty page. + */ + public List parse(Path pdfPath) { + log.info("Submitting {} to Marker for parsing", pdfPath.getFileName()); + + MultiValueMap body = new LinkedMultiValueMap<>(); + body.add("file", new FileSystemResource(pdfPath)); + body.add("output_format", "json"); + + JsonNode response = restClient.post() + .uri("/marker/upload") + .contentType(MediaType.MULTIPART_FORM_DATA) + .body(body) + .retrieve() + .body(JsonNode.class); + + try { + Path debugFile = Path.of("/tmp/marker-response-md.json"); + Files.writeString(debugFile, response.toPrettyString()); + log.info("Marker response saved to {}", debugFile); + } catch (IOException e) { + log.warn("Could not save Marker response to file", e); + } + + List results = parseResponse(response); + log.info("Marker produced {} page results from {}", results.size(), pdfPath.getFileName()); + return results; + } + + // --- Private helpers --- + + private List parseResponse(JsonNode response) { + if (response == null) return List.of(); + + // The "output" field is a JSON-encoded string — parse it first. + // Fall back to treating the whole response as the root if "output" is absent. 
+ JsonNode root; + JsonNode outputNode = response.path("output"); + if (!outputNode.isMissingNode() && outputNode.isTextual()) { + try { + root = objectMapper.readTree(outputNode.asText()); + } catch (tools.jackson.core.JacksonException e) { + log.warn("Could not parse Marker 'output' field as JSON", e); + return List.of(); + } + } else if (!outputNode.isMissingNode()) { + root = outputNode; + } else { + root = response; + } + + JsonNode children = root.path("children"); + if (children.isMissingNode() || !children.isArray()) { + log.warn("Marker response has no 'children' array — empty result"); + return List.of(); + } + + List results = new ArrayList<>(); + int pageIndex = 0; + for (JsonNode pageBlock : children) { + String blockType = pageBlock.path("block_type").asText(); + if (!"Page".equals(blockType)) continue; + + int pageNumber = pageIndex + 1; + pageIndex++; + + PageResult result = parsePage(pageBlock, pageNumber); + if (!result.orderedText().isBlank() || !result.figures().isEmpty()) { + results.add(result); + } + } + return results; + } + + private PageResult parsePage(JsonNode pageBlock, int pageNumber) { + JsonNode children = pageBlock.path("children"); + if (children.isMissingNode() || !children.isArray()) { + return new PageResult(pageNumber, "", null, List.of(), ""); + } + + StringBuilder textBuilder = new StringBuilder(); + StringBuilder markdownBuilder = new StringBuilder(); + String headingTitle = null; + List figures = new ArrayList<>(); + Set consumed = new HashSet<>(); // indices of Caption nodes consumed by a figure + + List childList = new ArrayList<>(); + children.forEach(childList::add); + + for (int i = 0; i < childList.size(); i++) { + if (consumed.contains(i)) continue; + + JsonNode child = childList.get(i); + String type = child.path("block_type").asText(); + + if ("SectionHeader".equals(type)) { + String heading = stripHtml(child.path("html").asText()).strip(); + if (!heading.isEmpty() && headingTitle == null) { + headingTitle = 
heading; + } + appendText(textBuilder, heading); + appendMarkdown(markdownBuilder, "## " + heading); + + } else if (TEXT_BLOCK_TYPES.contains(type)) { + String text = stripHtml(child.path("html").asText()); + appendText(textBuilder, text); + appendMarkdown(markdownBuilder, text.strip()); + + } else if (FIGURE_BLOCK_TYPES.contains(type)) { + extractFigures(child, i, childList, figures, markdownBuilder, consumed); + } + } + + return new PageResult(pageNumber, textBuilder.toString().strip(), headingTitle, + figures, markdownBuilder.toString().strip()); + } + + /** + * Handles a figure/picture block at {@code index} in {@code siblings}. + * For group blocks (FigureGroup, PictureGroup) the image lives in a child Picture/Figure, + * and the caption is a sibling Caption child inside the group. + * For leaf blocks the caption is the next sibling in the page child list. + * Image refs are appended to {@code markdown} as {@code ![caption](marker://{blockId})}. + * Consumed caption sibling indices are added to {@code consumed}. 
+ */ + private void extractFigures(JsonNode block, int index, List siblings, + List out, StringBuilder markdown, + Set consumed) { + String type = block.path("block_type").asText(); + boolean isGroup = type.endsWith("Group"); + + if (isGroup) { + JsonNode groupChildren = block.path("children"); + if (groupChildren.isMissingNode() || !groupChildren.isArray()) return; + + String groupCaption = null; + for (JsonNode sub : groupChildren) { + if ("Caption".equals(sub.path("block_type").asText())) { + String c = stripHtml(sub.path("html").asText()).strip(); + if (!c.isEmpty()) groupCaption = c; + } + } + for (JsonNode sub : groupChildren) { + String subType = sub.path("block_type").asText(); + if ("Figure".equals(subType) || "Picture".equals(subType)) { + String blockId = sub.path("id").asText(); + byte[] imageBytes = extractImageBytes(sub, blockId); + if (imageBytes != null) { + out.add(new PageResult.FigureData(imageBytes, groupCaption, blockId)); + String altText = groupCaption != null ? groupCaption : blockId; + appendMarkdown(markdown, "![" + altText + "](marker://" + blockId + ")"); + } + } + } + } else { + String blockId = block.path("id").asText(); + byte[] imageBytes = extractImageBytes(block, blockId); + if (imageBytes != null) { + String caption = null; + if (index + 1 < siblings.size()) { + JsonNode next = siblings.get(index + 1); + if ("Caption".equals(next.path("block_type").asText())) { + String c = stripHtml(next.path("html").asText()).strip(); + if (!c.isEmpty()) caption = c; + consumed.add(index + 1); + } + } + out.add(new PageResult.FigureData(imageBytes, caption, blockId)); + String altText = caption != null ? caption : blockId; + appendMarkdown(markdown, "![" + altText + "](marker://" + blockId + ")"); + } + } + } + + /** + * Extracts and base64-decodes the image bytes for this block. + * Marker stores images in the block's {@code images} map keyed by block ID. 
+ */ + private byte[] extractImageBytes(JsonNode block, String blockId) { + JsonNode images = block.path("images"); + if (images.isMissingNode() || images.isEmpty()) return null; + + // Try the block's own ID first, then fall back to the first entry + JsonNode imgNode = images.path(blockId); + if (imgNode.isMissingNode()) { + imgNode = images.properties().stream() + .findFirst() + .map(e -> e.getValue()) + .orElse(imgNode); + } + + String base64 = imgNode.asText(); + if (base64.isEmpty()) return null; + + try { + return Base64.getDecoder().decode(base64); + } catch (IllegalArgumentException ex) { + log.warn("Could not decode base64 image for block {}: {}", blockId, ex.getMessage()); + return null; + } + } + + private void appendText(StringBuilder sb, String text) { + String stripped = text.strip(); + if (stripped.isEmpty()) return; + if (sb.length() > 0) sb.append("\n\n"); + sb.append(stripped); + } + + private void appendMarkdown(StringBuilder sb, String text) { + if (text == null || text.isBlank()) return; + if (sb.length() > 0) sb.append("\n\n"); + sb.append(text.strip()); + } + + /** Strips HTML tags and normalises whitespace. */ + private String stripHtml(String html) { + if (html == null || html.isEmpty()) return ""; + return html.replaceAll("<[^>]*>", "").replaceAll("\\s{2,}", " ").strip(); + } +} diff --git a/backend/src/main/java/com/aiteacher/document/PageResult.java b/backend/src/main/java/com/aiteacher/document/PageResult.java new file mode 100644 index 0000000..cb9989d --- /dev/null +++ b/backend/src/main/java/com/aiteacher/document/PageResult.java @@ -0,0 +1,26 @@ +package com.aiteacher.document; + +import java.util.List; + +/** + * Internal DTO produced by MarkerPageParser for one PDF page. + * Decouples the Marker HTTP API from downstream services. 
+ */
+public record PageResult(
+        int pageNumber,             // 1-based, derived from Marker page block index
+        String orderedText,         // full page text in correct reading order (blocks joined by \n\n)
+        String headingTitle,        // first SectionHeader block on page, or null
+        List<FigureData> figures,   // extracted figure images (may be empty)
+        String markdown             // markdown representation with marker://{blockId} image placeholders
+) {
+
+    /**
+     * A figure extracted from the page.
+     * Image bytes are PNG data decoded from the Marker JSON {@code images} map.
+     */
+    public record FigureData(
+            byte[] imageBytes,      // PNG image data (base64-decoded from Marker response); compared by reference in equals()
+            String nearestCaption,  // text of the adjacent Caption block, or null
+            String blockId          // Marker block ID (e.g. "/page/0/Figure/2") for traceability
+    ) {}
+}
diff --git a/backend/src/main/java/com/aiteacher/document/PdfStructureParser.java b/backend/src/main/java/com/aiteacher/document/PdfStructureParser.java
index 930914f..bae4176 100644
--- a/backend/src/main/java/com/aiteacher/document/PdfStructureParser.java
+++ b/backend/src/main/java/com/aiteacher/document/PdfStructureParser.java
@@ -1,13 +1,17 @@
 package com.aiteacher.document;
 
+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.common.PDRectangle;
+import org.apache.pdfbox.text.PDFTextStripperByArea;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import org.springframework.ai.reader.pdf.PagePdfDocumentReader;
-import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig;
-import org.springframework.core.io.FileSystemResource;
 import org.springframework.stereotype.Service;
 import org.springframework.transaction.annotation.Transactional;
 
+import java.awt.Rectangle;
+import java.io.IOException;
 import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.List;
@@ -15,13 +19,18 @@ import java.util.UUID;
 
 /**
  * Parses a PDF into page-level
SectionEntity records stored in Postgres.
- * Each page becomes one section, grouped under a single chapter per book.
+ * Uses column-aware extraction via PDFTextStripperByArea: for two-column pages,
+ * left column is extracted first then right, preserving correct reading order.
+ * Text is also normalized (collapsed whitespace) before storage.
  */
 @Service
 public class PdfStructureParser {
 
     private static final Logger log = LoggerFactory.getLogger(PdfStructureParser.class);
 
+    // Right column is considered empty (single-column page) if it has < 20% of left column's content
+    private static final double TWO_COLUMN_THRESHOLD = 0.2;
+
     private final ChapterRepository chapterRepository;
     private final SectionRepository sectionRepository;
 
@@ -35,37 +44,71 @@ public class PdfStructureParser {
     public List<SectionEntity> parse(UUID bookId, String bookTitle, Path pdfPath) {
         log.info("Parsing PDF structure for book {}", bookId);
 
-        // One chapter per book
         String chapterId = bookId + "-ch1";
         ChapterEntity chapter = new ChapterEntity(chapterId, bookId, 1, bookTitle, 1);
         chapterRepository.save(chapter);
 
-        // One section per page
-        PagePdfDocumentReader reader = new PagePdfDocumentReader(
-                new FileSystemResource(pdfPath.toFile()),
-                PdfDocumentReaderConfig.builder().withPagesPerDocument(1).build()
-        );
-
-        List<Document> pages = reader.get();
         List<SectionEntity> sections = new ArrayList<>();
 
-        for (int i = 0; i < pages.size(); i++) {
-            int pageNum = i + 1;
-            String text = pages.get(i).getText();
-            if (text == null || text.isBlank()) continue;
+        try (PDDocument doc = Loader.loadPDF(pdfPath.toFile())) {
+            List<PDPage> pages = new ArrayList<>();
+            doc.getPages().forEach(pages::add);
 
-            String sectionId = bookId + "-p" + pageNum;
-            SectionEntity section = new SectionEntity(
-                    sectionId, chapterId, bookId,
-                    String.valueOf(pageNum),
-                    "Page " + pageNum,
-                    pageNum, pageNum,
-                    text
-            );
-            sections.add(sectionRepository.save(section));
+            for (int i = 0; i < pages.size(); i++) { // every page of the document, whatever its length
+                int pageNum = i + 1;
+                String text = normalizeWhitespace(extractPageText(pages.get(i)));
+                if (text.isBlank()) continue; // pages without extractable text produce no section
+
+                String sectionId = bookId + "-p" + pageNum;
+                SectionEntity section = new SectionEntity(
+                        sectionId, chapterId, bookId,
+                        String.valueOf(pageNum),
+                        "Page " + pageNum,
+                        pageNum, pageNum,
+                        text
+                );
+                sections.add(sectionRepository.save(section));
+            }
+        } catch (IOException e) {
+            throw new RuntimeException("Failed to parse PDF for book " + bookId, e);
         }
 
         log.info("Parsed {} sections for book {}", sections.size(), bookId);
         return sections;
     }
+
+    /**
+     * Extracts text from a single page using column-aware region extraction.
+     * Splits the page at the horizontal midpoint. If the right region has fewer
+     * than 20% of the characters of the left region, treats the page as single-column.
+     */
+    private String extractPageText(PDPage page) throws IOException {
+        PDRectangle mediaBox = page.getMediaBox();
+        int width = (int) mediaBox.getWidth();
+        int height = (int) mediaBox.getHeight();
+        int mid = width / 2;
+
+        PDFTextStripperByArea stripper = new PDFTextStripperByArea();
+        stripper.setSortByPosition(true);
+        stripper.addRegion("left", new Rectangle(0, 0, mid, height));
+        stripper.addRegion("right", new Rectangle(mid, 0, width - mid, height));
+        stripper.extractRegions(page);
+
+        String left = stripper.getTextForRegion("left").strip();
+        String right = stripper.getTextForRegion("right").strip();
+
+        if (right.length() < left.length() * TWO_COLUMN_THRESHOLD) {
+            // Single-column page — left holds all (or nearly all) content. NOTE(review): any short right-region text is discarded here — confirm intended
+            return left.isEmpty() ? right : left;
+        }
+        return left + "\n\n" + right;
+    }
+
+    /** Collapses multi-space/tab runs and excessive blank lines.
*/
+    private String normalizeWhitespace(String text) {
+        return text
+                .replaceAll("[ \t]{2,}", " ")
+                .replaceAll("\n{3,}", "\n\n")
+                .trim();
+    }
 }
diff --git a/backend/src/main/java/com/aiteacher/document/S3MarkdownStorageService.java b/backend/src/main/java/com/aiteacher/document/S3MarkdownStorageService.java
new file mode 100644
index 0000000..478f0cc
--- /dev/null
+++ b/backend/src/main/java/com/aiteacher/document/S3MarkdownStorageService.java
@@ -0,0 +1,97 @@
+package com.aiteacher.document;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.stereotype.Service;
+import software.amazon.awssdk.auth.credentials.AwsBasicCredentials;
+import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider;
+import software.amazon.awssdk.core.sync.RequestBody;
+import software.amazon.awssdk.regions.Region;
+import software.amazon.awssdk.services.s3.S3Client;
+import software.amazon.awssdk.services.s3.S3Configuration;
+import software.amazon.awssdk.services.s3.model.*;
+
+import java.net.URI;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.UUID;
+
+@Service
+public class S3MarkdownStorageService implements MarkdownStorageService {
+
+    private static final Logger log = LoggerFactory.getLogger(S3MarkdownStorageService.class);
+
+    private final S3Client s3;
+    private final String bucket;
+
+    public S3MarkdownStorageService(
+            @Value("${app.figure-storage.endpoint}") String endpoint,
+            @Value("${app.figure-storage.region}") String region,
+            @Value("${app.figure-storage.bucket}") String bucket,
+            @Value("${app.figure-storage.access-key-id}") String accessKeyId,
+            @Value("${app.figure-storage.secret-access-key}") String secretKey) {
+        this.bucket = bucket;
+        URI endpointUri = URI.create(endpoint);
+        StaticCredentialsProvider credentials = StaticCredentialsProvider.create(
+                AwsBasicCredentials.create(accessKeyId, secretKey));
+        Region awsRegion = Region.of(region);
+        S3Configuration s3Config = S3Configuration.builder().pathStyleAccessEnabled(true).build();
+
+        this.s3 = S3Client.builder()
+                .endpointOverride(endpointUri)
+                .region(awsRegion)
+                .credentialsProvider(credentials)
+                .serviceConfiguration(s3Config)
+                .build();
+    }
+
+    @Override
+    public String save(UUID bookId, int pageNumber, String markdown) {
+        String key = key(bookId, pageNumber);
+        byte[] bytes = markdown.getBytes(StandardCharsets.UTF_8);
+        s3.putObject(
+                PutObjectRequest.builder().bucket(bucket).key(key)
+                        .contentType("text/markdown; charset=utf-8")
+                        .contentLength((long) bytes.length).build(),
+                RequestBody.fromBytes(bytes));
+        return key; // the object key, not a URL
+    }
+
+    @Override
+    public String getText(UUID bookId, int pageNumber) {
+        byte[] bytes = s3.getObjectAsBytes(
+                GetObjectRequest.builder().bucket(bucket).key(key(bookId, pageNumber)).build()
+        ).asByteArray();
+        return new String(bytes, StandardCharsets.UTF_8);
+    }
+
+    @Override
+    public void deleteAll(UUID bookId) {
+        String prefix = "markdown/" + bookId + "/";
+        try {
+            int deleted = 0;
+            // DeleteObjects accepts at most 1000 keys per request, as does one listing page, so delete page by page.
+            for (ListObjectsV2Response page : s3.listObjectsV2Paginator(
+                    ListObjectsV2Request.builder().bucket(bucket).prefix(prefix).build())) {
+                List<ObjectIdentifier> batch = page.contents().stream()
+                        .map(obj -> ObjectIdentifier.builder().key(obj.key()).build())
+                        .toList();
+                if (batch.isEmpty()) continue;
+                s3.deleteObjects(DeleteObjectsRequest.builder()
+                        .bucket(bucket)
+                        .delete(Delete.builder().objects(batch).build())
+                        .build());
+                deleted += batch.size();
+            }
+            if (deleted > 0) log.info("Deleted {} markdown files from S3 for book {}", deleted, bookId);
+        } catch (S3Exception ex) { // best-effort cleanup: log and continue rather than fail the caller
+            log.warn("Could not fully delete markdown for book {} from S3: {}", bookId, ex.getMessage());
+        }
+    }
+
+    private static String key(UUID bookId, int pageNumber) {
+        return "markdown/" + bookId + "/page-" + pageNumber + ".md";
+    }
+}
diff --git
a/backend/src/main/java/com/aiteacher/document/TextChunkingService.java b/backend/src/main/java/com/aiteacher/document/TextChunkingService.java
index 776b28b..7eaf021 100644
--- a/backend/src/main/java/com/aiteacher/document/TextChunkingService.java
+++ b/backend/src/main/java/com/aiteacher/document/TextChunkingService.java
@@ -38,14 +38,52 @@ public class TextChunkingService {
         List<String> windows = new ArrayList<>();
         int start = 0;
         while (start < text.length()) {
-            int end = Math.min(start + TARGET_CHARS, text.length());
-            windows.add(text.substring(start, end));
-            if (end == text.length()) break;
-            start = end - OVERLAP_CHARS;
+            int hardEnd = Math.min(start + TARGET_CHARS, text.length());
+            if (hardEnd == text.length()) { // remaining text fits in one window: emit it and stop
+                String last = text.substring(start).strip();
+                if (!last.isEmpty()) windows.add(last);
+                break;
+            }
+            int splitAt = findSplitPoint(text, start, hardEnd);
+            String chunk = text.substring(start, splitAt).strip();
+            if (!chunk.isEmpty()) windows.add(chunk);
+            // Overlap: back up from split point, align to a word start (start + 1 guarantees forward progress)
+            int overlapStart = Math.max(start + 1, splitAt - OVERLAP_CHARS);
+            while (overlapStart < splitAt && text.charAt(overlapStart) != ' ') overlapStart++;
+            start = overlapStart < splitAt ? overlapStart + 1 : splitAt;
         }
         return windows;
     }
 
+    /**
+     * Finds the best split point at or before hardEnd, preferring (in order):
+     * paragraph boundary, sentence boundary, word boundary, hard cut.
+     */
+    private int findSplitPoint(String text, int start, int hardEnd) {
+        int lookback = Math.min(400, (hardEnd - start) / 2);
+
+        // 1. Paragraph boundary (search from hardEnd - 2 so the returned index never exceeds hardEnd)
+        int paraIdx = text.lastIndexOf("\n\n", hardEnd - 2);
+        if (paraIdx > hardEnd - lookback && paraIdx > start) return paraIdx + 2;
+
+        // 2. Sentence boundary (. ! ?) followed by space or newline
+        for (int i = hardEnd - 1; i > hardEnd - lookback && i > start; i--) {
+            char c = text.charAt(i);
+            if ((c == '.' || c == '!' || c == '?') && i + 1 < text.length()) {
+                char next = text.charAt(i + 1);
+                if (next == ' ' || next == '\n') return i + 1;
+            }
+        }
+
+        // 3. Word boundary (space within the last 100 characters)
+        for (int i = hardEnd - 1; i > hardEnd - 100 && i > start; i--) {
+            if (text.charAt(i) == ' ') return i + 1;
+        }
+
+        // 4. Hard cut
+        return hardEnd;
+    }
+
     private Map buildMetadata(SectionEntity section, String bookTitle,
                               int index, int total, String chunkId) {
         Map m = new HashMap<>();
diff --git a/backend/src/main/java/com/aiteacher/document/VisionDescriptionService.java b/backend/src/main/java/com/aiteacher/document/VisionDescriptionService.java
index 86380a3..5b40b2b 100644
--- a/backend/src/main/java/com/aiteacher/document/VisionDescriptionService.java
+++ b/backend/src/main/java/com/aiteacher/document/VisionDescriptionService.java
@@ -8,18 +8,29 @@ import org.springframework.stereotype.Service;
 import org.springframework.util.MimeTypeUtils;
 
 /**
- * Generates a clinical text description for an extracted figure image
- * using the OpenAI vision model via Spring AI ChatClient.
+ * Analyses an extracted figure image using the OpenAI vision model.
+ *
+ *

<p>Returns an {@link ImageAnalysis} record containing:
+ * <ul>
+ *   <li>{@code description} — 2-3 sentence clinical description of the image</li>
+ *   <li>{@code imageText} — all visible text, labels, and annotations copied verbatim
+ *       from the image (empty string when none present)</li>
+ * </ul>
+ *
+ * <p>Both fields are stored: {@code description} drives the embedding; {@code imageText}
+ * is added to chunk metadata so queries can match exact labels (e.g., "Circle of Willis").
  */
 @Service
 public class VisionDescriptionService {
 
     private static final Logger log = LoggerFactory.getLogger(VisionDescriptionService.class);
 
-    private static final String PROMPT =
-            "You are a neurosurgery educator. Provide a brief 2-3 sentence clinical description of " +
-            "this image. Focus on anatomical structures, surgical landmarks, labels, and clinical " +
-            "significance. If text or labels are visible, include them verbatim.";
+    private static final String PROMPT = """
+            You are a neurosurgery educator analysing a medical image.
+            Respond in EXACTLY this format — no other text, no markdown:
+            DESCRIPTION: <2-3 sentence clinical description focusing on anatomical structures, surgical landmarks, and clinical significance>
+            IMAGE_TEXT: <all visible text, labels, and annotations copied verbatim, or NONE if the image contains no text>
+            """; // NOTE(review): the IMAGE_TEXT placeholder was lost in extraction and is reconstructed here — confirm wording
 
     private final ChatClient chatClient;
 
@@ -28,19 +39,53 @@ public class VisionDescriptionService {
     }
 
     /**
-     * Returns a description string. Falls back to the provided caption if vision fails.
+     * Holds the structured output of a vision model call on one figure image.
+     *
+     * @param description clinical description of the image content
+     * @param imageText   verbatim text visible inside the image; empty string if none
      */
-    public String describe(byte[] imageBytes, String captionFallback) {
+    public record ImageAnalysis(String description, String imageText) {}
+
+    /**
+     * Analyses the image bytes and returns an {@link ImageAnalysis}.
+     * Falls back gracefully: if the vision call fails, the caption is used as description
+     * and imageText is left empty.
+     *
+     * @param imageBytes      PNG bytes of the extracted figure
+     * @param captionFallback caption detected from surrounding text, may be null
+     */
+    public ImageAnalysis analyze(byte[] imageBytes, String captionFallback) {
         try {
-            return chatClient.prompt()
-                .user(u -> u
-                    .text(PROMPT)
-                    .media(MimeTypeUtils.IMAGE_PNG, new ByteArrayResource(imageBytes)))
-                .call()
-                .content();
+            String raw = chatClient.prompt()
+                    .user(u -> u
+                            .text(PROMPT)
+                            .media(MimeTypeUtils.IMAGE_PNG, new ByteArrayResource(imageBytes)))
+                    .call()
+                    .content();
+            return parse(raw, captionFallback);
         } catch (Exception ex) {
-            log.warn("Vision description failed: {} — using caption as fallback", ex.getMessage());
-            return captionFallback != null ? captionFallback : "Figure";
+            log.warn("Vision analysis failed: {} — using caption as fallback", ex.getMessage());
+            return new ImageAnalysis(
+                    captionFallback != null ? captionFallback : "Figure",
+                    "");
         }
     }
+
+    private ImageAnalysis parse(String raw, String captionFallback) { // line-oriented parse of the DESCRIPTION:/IMAGE_TEXT: reply
+        String description = captionFallback != null ? captionFallback : "Figure"; // kept when no DESCRIPTION line is found
+        String imageText = "";
+
+        if (raw != null) {
+            for (String line : raw.lines().map(String::strip).toList()) { // tolerate CRLF endings and indented labels
+                if (line.startsWith("DESCRIPTION:")) {
+                    String val = line.substring("DESCRIPTION:".length()).strip();
+                    if (!val.isEmpty()) description = val;
+                } else if (line.startsWith("IMAGE_TEXT:")) { // only text on the label's own line is captured
+                    String val = line.substring("IMAGE_TEXT:".length()).strip();
+                    if (!val.isEmpty() && !"NONE".equalsIgnoreCase(val)) imageText = val;
+                }
+            }
+        }
+        return new ImageAnalysis(description, imageText);
+    }
 }
diff --git a/backend/src/main/resources/application.yaml b/backend/src/main/resources/application.yaml
index f045ac8..5cf70c0 100644
--- a/backend/src/main/resources/application.yaml
+++ b/backend/src/main/resources/application.yaml
@@ -64,3 +64,5 @@ app:
   embedding:
     batch-size: 20
     batch-delay-ms: 2000
+  marker:
+    base-url: ${MARKER_BASE_URL:http://localhost:8000}
diff --git a/frontend/src/components/BookCard.vue b/frontend/src/components/BookCard.vue
index 3755cbf..067ac32 100644
--- a/frontend/src/components/BookCard.vue
+++ b/frontend/src/components/BookCard.vue
@@ -33,6 +33,13 @@

+ + Read +