first implementation - image/drawing integration
This commit is contained in:
@@ -0,0 +1,305 @@
|
||||
# Data Model: Enhanced Embedding with Image Parsing and Metadata
|
||||
|
||||
**Branch**: `002-image-aware-embedding` | **Date**: 2026-04-03
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
Three storage tiers work in concert:
|
||||
|
||||
```
|
||||
┌──────────────────────────────────────────────────────────────────┐
|
||||
│ PDF Upload │
|
||||
│ │ │
|
||||
│ ▼ │
|
||||
│ Parsing Pipeline │
|
||||
│ │ │ │
|
||||
│ ▼ ▼ │
|
||||
│ Postgres (source of truth) pgvector (search index) │
|
||||
│ - book - vector_store (text chunks) │
|
||||
│ - chapter - vector_store (figure captions) │
|
||||
│ - section (+ fullText) File Store (images) │
|
||||
│ - figure (metadata) - uploads/figures/{bookId}/*.png │
|
||||
│ - chunk_figure_ref │
|
||||
└──────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Postgres Schema
|
||||
|
||||
### Existing tables (unchanged)
|
||||
|
||||
- `book` — status, metadata, page count (V1)
|
||||
- `chat_session`, `message` — conversation (V1)
|
||||
- `vector_store` — managed by Spring AI pgvector starter (V2)
|
||||
- `topic` — predefined topics (V3)
|
||||
|
||||
### New tables (Flyway V4)
|
||||
|
||||
```sql
|
||||
-- V4: Document hierarchy
|
||||
|
||||
CREATE TABLE chapter (
|
||||
id VARCHAR(200) PRIMARY KEY, -- "{bookId}-ch{N}"
|
||||
book_id UUID NOT NULL REFERENCES book(id) ON DELETE CASCADE,
|
||||
number INT NOT NULL,
|
||||
title VARCHAR(500),
|
||||
page_start INT,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT now()
|
||||
);
|
||||
|
||||
CREATE TABLE section (
|
||||
id VARCHAR(200) PRIMARY KEY, -- "{bookId}-ch{N}-s{X}-{Y}"
|
||||
chapter_id VARCHAR(200) NOT NULL REFERENCES chapter(id) ON DELETE CASCADE,
|
||||
book_id UUID NOT NULL REFERENCES book(id) ON DELETE CASCADE,
|
||||
number VARCHAR(50), -- "2.3" or "12.2.3"
|
||||
title VARCHAR(500),
|
||||
page_start INT NOT NULL,
|
||||
page_end INT NOT NULL,
|
||||
full_text TEXT NOT NULL, -- NOT in vector store
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT now()
|
||||
);
|
||||
|
||||
CREATE INDEX idx_section_book ON section(book_id);
|
||||
CREATE INDEX idx_section_chapter ON section(chapter_id);
|
||||
```
|
||||
|
||||
### New tables (Flyway V5)
|
||||
|
||||
```sql
|
||||
-- V5: Figures and chunk→figure links
|
||||
|
||||
CREATE TABLE figure (
|
||||
id VARCHAR(200) PRIMARY KEY, -- "{bookId}-fig-{label}"
|
||||
book_id UUID NOT NULL REFERENCES book(id) ON DELETE CASCADE,
|
||||
section_id VARCHAR(200) REFERENCES section(id) ON DELETE SET NULL,
|
||||
chapter_id VARCHAR(200) REFERENCES chapter(id) ON DELETE SET NULL,
|
||||
label VARCHAR(100), -- "Fig. 12-4"
|
||||
caption TEXT,
|
||||
figure_type VARCHAR(50) NOT NULL, -- FigureType enum name
|
||||
page INT NOT NULL,
|
||||
image_path VARCHAR(1000) NOT NULL, -- relative path on disk
|
||||
caption_embedding_id UUID, -- ID in vector_store
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT now()
|
||||
);
|
||||
|
||||
CREATE TABLE chunk_figure_ref (
|
||||
chunk_id UUID NOT NULL, -- vector_store document ID
|
||||
figure_id VARCHAR(200) NOT NULL REFERENCES figure(id) ON DELETE CASCADE,
|
||||
mention_page INT,
|
||||
PRIMARY KEY (chunk_id, figure_id)
|
||||
);
|
||||
|
||||
CREATE INDEX idx_figure_book ON figure(book_id);
|
||||
CREATE INDEX idx_cfr_chunk ON chunk_figure_ref(chunk_id);
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Java Domain Records
|
||||
|
||||
### Document hierarchy (new package `com.aiteacher.document`)
|
||||
|
||||
```java
|
||||
// Root — in-memory only, not a JPA entity
|
||||
public record BookNode(
|
||||
String bookId,
|
||||
String title,
|
||||
String isbn,
|
||||
String edition,
|
||||
List<String> authors,
|
||||
List<ChapterNode> chapters
|
||||
) {}
|
||||
|
||||
// Chapter — maps to `chapter` table
|
||||
public record ChapterNode(
|
||||
String chapterId,
|
||||
String bookId,
|
||||
int number,
|
||||
String title,
|
||||
Integer pageStart, // nullable: `chapter.page_start` has no NOT NULL constraint
|
||||
List<SectionNode> sections
|
||||
) {}
|
||||
|
||||
// Section — maps to `section` table; fullText stays in Postgres
|
||||
public record SectionNode(
|
||||
String sectionId,
|
||||
String chapterId,
|
||||
String bookId,
|
||||
String number,
|
||||
String title,
|
||||
int pageStart,
|
||||
int pageEnd,
|
||||
String fullText,
|
||||
List<TextChunkNode> chunks,
|
||||
List<FigureNode> figures
|
||||
) {}
|
||||
|
||||
// Text chunk — embedded into vector_store; references its parent section
|
||||
public record TextChunkNode(
|
||||
String chunkId, // UUID → becomes vector_store document ID
|
||||
String sectionId,
|
||||
String chapterId,
|
||||
String bookId,
|
||||
String text,
|
||||
int chunkIndex,
|
||||
int totalChunksInSection,
|
||||
int pageStart,
|
||||
int pageEnd,
|
||||
Map<String, Object> metadata // flattened for Spring AI filtering
|
||||
) {
|
||||
public Map<String, Object> toMetadata() {
|
||||
return Map.of(
|
||||
"type", "TEXT",
|
||||
"book_id", bookId,
|
||||
"chapter_id", chapterId,
|
||||
"section_id", sectionId,
|
||||
"section_title", metadata.getOrDefault("section_title", ""), // assumes the parent SectionNode's title was placed in `metadata` — TODO confirm
|
||||
"page_start", pageStart,
|
||||
"page_end", pageEnd,
|
||||
"chunk_index", chunkIndex,
|
||||
"total_chunks", totalChunksInSection
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Figure — maps to `figure` table; caption embedded into vector_store
|
||||
public record FigureNode(
|
||||
String figureId,
|
||||
String sectionId,
|
||||
String chapterId,
|
||||
String bookId,
|
||||
String label, // "Fig. 12-4"
|
||||
String caption,
|
||||
FigureType type,
|
||||
int page,
|
||||
String imagePath, // relative: "figures/{bookId}/{figureId}.png"
|
||||
UUID captionEmbeddingId // ID in vector_store
|
||||
) {}
|
||||
```
|
||||
|
||||
### Figure type enum
|
||||
|
||||
```java
|
||||
public enum FigureType {
|
||||
ANATOMICAL_DIAGRAM,
|
||||
SURGICAL_PHOTOGRAPH,
|
||||
MRI_CT_SCAN,
|
||||
TABLE,
|
||||
CHART,
|
||||
INTRAOPERATIVE_IMAGE
|
||||
}
|
||||
```
|
||||
|
||||
Classification heuristic (applied to caption + surrounding text):
|
||||
|
||||
| Keyword(s) | FigureType |
|
||||
|-----------|-----------|
|
||||
| `MRI`, `CT`, `magnetic`, `resonance`, `tomography` | `MRI_CT_SCAN` |
|
||||
| `intraoperative`, `intra-op` | `INTRAOPERATIVE_IMAGE` |
|
||||
| `table`, `Table` (at line start) | `TABLE` |
|
||||
| `chart`, `graph`, `histogram` | `CHART` |
|
||||
| `photograph`, `photo` | `SURGICAL_PHOTOGRAPH` |
|
||||
| (default) | `ANATOMICAL_DIAGRAM` |
|
||||
|
||||
### Chunk–figure join record
|
||||
|
||||
```java
|
||||
// Maps to `chunk_figure_ref` table
|
||||
public record ChunkFigureRef(
|
||||
UUID chunkId,
|
||||
String figureId,
|
||||
Integer mentionPage // nullable: `chunk_figure_ref.mention_page` has no NOT NULL constraint
|
||||
) {}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Vector Store Documents
|
||||
|
||||
Every row in `vector_store` has a `metadata` JSON column; its `type` field (`"TEXT"` or `"FIGURE"`) is used to filter by document kind.
|
||||
|
||||
### Text chunk document
|
||||
|
||||
| Field | Value |
|
||||
|-------|-------|
|
||||
| `content` | chunk text (400–600 tokens) |
|
||||
| `metadata.type` | `"TEXT"` |
|
||||
| `metadata.book_id` | book UUID |
|
||||
| `metadata.book_title` | book title string |
|
||||
| `metadata.chapter_id` | chapter ID string |
|
||||
| `metadata.section_id` | section ID string |
|
||||
| `metadata.section_title` | section title string |
|
||||
| `metadata.page_start` | int |
|
||||
| `metadata.page_end` | int |
|
||||
| `metadata.chunk_index` | int (0-based) |
|
||||
| `metadata.total_chunks` | int |
|
||||
|
||||
### Figure caption document
|
||||
|
||||
| Field | Value |
|
||||
|-------|-------|
|
||||
| `content` | vision-generated description + caption text |
|
||||
| `metadata.type` | `"FIGURE"` |
|
||||
| `metadata.book_id` | book UUID |
|
||||
| `metadata.book_title` | book title string |
|
||||
| `metadata.chapter_id` | chapter ID string |
|
||||
| `metadata.section_id` | section ID string |
|
||||
| `metadata.figure_id` | figure ID string |
|
||||
| `metadata.figure_type` | enum name string |
|
||||
| `metadata.image_path` | relative file path |
|
||||
| `metadata.label` | caption label e.g. `"Fig. 12-4"` |
|
||||
| `metadata.page` | int |
|
||||
|
||||
---
|
||||
|
||||
## File Store Layout
|
||||
|
||||
```
|
||||
uploads/
|
||||
└── figures/
|
||||
└── {bookId}/
|
||||
├── {figureId}.png
|
||||
└── ...
|
||||
```
|
||||
|
||||
- Base path configurable via `app.figure-storage.base-path` (default: `./uploads`)
|
||||
- Files are served via `GET /api/v1/figures/{bookId}/{filename}` (static resource mapping)
|
||||
- Gitignored; not version-controlled
|
||||
|
||||
---
|
||||
|
||||
## State Transitions
|
||||
|
||||
Book processing extends the existing `BookStatus` state machine:
|
||||
|
||||
```
|
||||
PENDING → PROCESSING → READY
|
||||
↘ FAILED
|
||||
```
|
||||
|
||||
During `PROCESSING`:
|
||||
1. Parse PDF structure → extract chapters/sections → persist to Postgres
|
||||
2. Split sections into text chunks → embed → write to vector_store
|
||||
3. Extract images per page → filter by min size → save PNG → generate vision description → embed caption → write figure to Postgres + vector_store
|
||||
4. Write `chunk_figure_ref` rows for all detected figure references in text
|
||||
|
||||
Failure at step 3 (individual page) → log + skip that page's images; continue.
|
||||
Failure at any other step → set `BookStatus.FAILED`.
|
||||
|
||||
---
|
||||
|
||||
## Retrieval Result Structure
|
||||
|
||||
```java
|
||||
public record RetrievalResult(
|
||||
List<SectionNode> parentSections, // expanded full-text context
|
||||
List<Document> figureVectorHits, // semantic figure matches
|
||||
List<FigureNode> linkedFigures // figures explicitly referenced in text chunks
|
||||
) {}
|
||||
```
|
||||
|
||||
The `NeurosurgeryRetriever` service deduplicates figures across `figureVectorHits` and `linkedFigures` before passing
|
||||
the result to the LLM prompt builder.
|
||||
Reference in New Issue
Block a user