104 lines
3.9 KiB
Java
104 lines
3.9 KiB
Java
package com.aiteacher.document;
|
|
|
|
import org.springframework.ai.document.Document;
|
|
import org.springframework.stereotype.Service;
|
|
|
|
import java.util.ArrayList;
|
|
import java.util.HashMap;
|
|
import java.util.List;
|
|
import java.util.Map;
|
|
import java.util.UUID;
|
|
|
|
/**
|
|
* Splits a SectionEntity's full text into overlapping chunks for vector embedding.
|
|
* Target size: ~1800 characters (~450 tokens); overlap: 200 characters.
|
|
*/
|
|
@Service
|
|
public class TextChunkingService {
|
|
|
|
private static final int TARGET_CHARS = 1800;
|
|
private static final int OVERLAP_CHARS = 200;
|
|
|
|
public List<Document> chunk(SectionEntity section, String bookTitle) {
|
|
String text = section.getFullText();
|
|
if (text == null || text.isBlank()) return List.of();
|
|
|
|
List<String> windows = split(text);
|
|
List<Document> documents = new ArrayList<>();
|
|
|
|
for (int i = 0; i < windows.size(); i++) {
|
|
String chunkId = UUID.randomUUID().toString();
|
|
Map<String, Object> metadata = buildMetadata(section, bookTitle, i, windows.size(), chunkId);
|
|
documents.add(new Document(chunkId, windows.get(i), metadata));
|
|
}
|
|
return documents;
|
|
}
|
|
|
|
private List<String> split(String text) {
|
|
List<String> windows = new ArrayList<>();
|
|
int start = 0;
|
|
while (start < text.length()) {
|
|
int hardEnd = Math.min(start + TARGET_CHARS, text.length());
|
|
if (hardEnd == text.length()) {
|
|
String last = text.substring(start).strip();
|
|
if (!last.isEmpty()) windows.add(last);
|
|
break;
|
|
}
|
|
int splitAt = findSplitPoint(text, start, hardEnd);
|
|
String chunk = text.substring(start, splitAt).strip();
|
|
if (!chunk.isEmpty()) windows.add(chunk);
|
|
// Overlap: back up from split point, align to a word start
|
|
int overlapStart = Math.max(start + 1, splitAt - OVERLAP_CHARS);
|
|
while (overlapStart < splitAt && text.charAt(overlapStart) != ' ') overlapStart++;
|
|
start = overlapStart < splitAt ? overlapStart + 1 : splitAt;
|
|
}
|
|
return windows;
|
|
}
|
|
|
|
/**
|
|
* Finds the best split point at or before hardEnd, preferring (in order):
|
|
* paragraph boundary, sentence boundary, word boundary, hard cut.
|
|
*/
|
|
private int findSplitPoint(String text, int start, int hardEnd) {
|
|
int lookback = Math.min(400, (hardEnd - start) / 2);
|
|
|
|
// 1. Paragraph boundary
|
|
int paraIdx = text.lastIndexOf("\n\n", hardEnd);
|
|
if (paraIdx > hardEnd - lookback && paraIdx > start) return paraIdx + 2;
|
|
|
|
// 2. Sentence boundary (. ! ?) followed by space or newline
|
|
for (int i = hardEnd - 1; i > hardEnd - lookback && i > start; i--) {
|
|
char c = text.charAt(i);
|
|
if ((c == '.' || c == '!' || c == '?') && i + 1 < text.length()) {
|
|
char next = text.charAt(i + 1);
|
|
if (next == ' ' || next == '\n') return i + 1;
|
|
}
|
|
}
|
|
|
|
// 3. Word boundary
|
|
for (int i = hardEnd - 1; i > hardEnd - 100 && i > start; i--) {
|
|
if (text.charAt(i) == ' ') return i + 1;
|
|
}
|
|
|
|
// 4. Hard cut
|
|
return hardEnd;
|
|
}
|
|
|
|
private Map<String, Object> buildMetadata(SectionEntity section, String bookTitle,
|
|
int index, int total, String chunkId) {
|
|
Map<String, Object> m = new HashMap<>();
|
|
m.put("type", "TEXT");
|
|
m.put("book_id", section.getBookId().toString());
|
|
m.put("book_title", bookTitle);
|
|
m.put("chapter_id", section.getChapterId());
|
|
m.put("section_id", section.getId());
|
|
m.put("section_title", section.getTitle() != null ? section.getTitle() : "");
|
|
m.put("page_start", section.getPageStart());
|
|
m.put("page_end", section.getPageEnd());
|
|
m.put("chunk_index", index);
|
|
m.put("total_chunks", total);
|
|
m.put("chunk_id", chunkId);
|
|
return m;
|
|
}
|
|
}
|