first implementation - image/drawing integration
This commit is contained in:
@@ -0,0 +1,65 @@
|
||||
package com.aiteacher.document;
|
||||
|
||||
import org.springframework.ai.document.Document;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.UUID;
|
||||
|
||||
/**
|
||||
* Splits a SectionEntity's full text into overlapping chunks for vector embedding.
|
||||
* Target size: ~1800 characters (~450 tokens); overlap: 200 characters.
|
||||
*/
|
||||
@Service
|
||||
public class TextChunkingService {
|
||||
|
||||
private static final int TARGET_CHARS = 1800;
|
||||
private static final int OVERLAP_CHARS = 200;
|
||||
|
||||
public List<Document> chunk(SectionEntity section, String bookTitle) {
|
||||
String text = section.getFullText();
|
||||
if (text == null || text.isBlank()) return List.of();
|
||||
|
||||
List<String> windows = split(text);
|
||||
List<Document> documents = new ArrayList<>();
|
||||
|
||||
for (int i = 0; i < windows.size(); i++) {
|
||||
String chunkId = UUID.randomUUID().toString();
|
||||
Map<String, Object> metadata = buildMetadata(section, bookTitle, i, windows.size(), chunkId);
|
||||
documents.add(new Document(chunkId, windows.get(i), metadata));
|
||||
}
|
||||
return documents;
|
||||
}
|
||||
|
||||
private List<String> split(String text) {
|
||||
List<String> windows = new ArrayList<>();
|
||||
int start = 0;
|
||||
while (start < text.length()) {
|
||||
int end = Math.min(start + TARGET_CHARS, text.length());
|
||||
windows.add(text.substring(start, end));
|
||||
if (end == text.length()) break;
|
||||
start = end - OVERLAP_CHARS;
|
||||
}
|
||||
return windows;
|
||||
}
|
||||
|
||||
private Map<String, Object> buildMetadata(SectionEntity section, String bookTitle,
|
||||
int index, int total, String chunkId) {
|
||||
Map<String, Object> m = new HashMap<>();
|
||||
m.put("type", "TEXT");
|
||||
m.put("book_id", section.getBookId().toString());
|
||||
m.put("book_title", bookTitle);
|
||||
m.put("chapter_id", section.getChapterId());
|
||||
m.put("section_id", section.getId());
|
||||
m.put("section_title", section.getTitle() != null ? section.getTitle() : "");
|
||||
m.put("page_start", section.getPageStart());
|
||||
m.put("page_end", section.getPageEnd());
|
||||
m.put("chunk_index", index);
|
||||
m.put("total_chunks", total);
|
||||
m.put("chunk_id", chunkId);
|
||||
return m;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user