package com.aiteacher.document;

import org.springframework.ai.document.Document;
import org.springframework.stereotype.Service;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;

/**
 * Splits a SectionEntity's full text into overlapping chunks for vector embedding.
 * Target size: ~1800 characters (~450 tokens); overlap: 200 characters.
 */
@Service
public class TextChunkingService {

    /** Upper bound, in characters, on the window examined for each chunk. */
    private static final int TARGET_CHARS = 1800;

    /** Approximate number of trailing characters repeated at the start of the next chunk. */
    private static final int OVERLAP_CHARS = 200;

    /** Maximum distance before the hard end searched for a paragraph or sentence boundary. */
    private static final int MAX_LOOKBACK_CHARS = 400;

    /** Maximum distance before the hard end searched for a word boundary. */
    private static final int WORD_LOOKBACK_CHARS = 100;

    /**
     * Splits the section's full text into chunks and wraps each one in a {@link Document}.
     *
     * <p>Every document receives a freshly generated random UUID as its id; the same value is
     * also stored in the metadata under {@code chunk_id}.
     *
     * @param section   the section whose full text is chunked and whose ids, title and page
     *                  range are copied into each chunk's metadata
     * @param bookTitle the title stored under the {@code book_title} metadata key
     * @return one document per chunk, in text order; an empty list when the section text is
     *         {@code null} or blank
     */
    public List<Document> chunk(SectionEntity section, String bookTitle) {
        String text = section.getFullText();
        if (text == null || text.isBlank()) {
            return List.of();
        }

        List<String> windows = split(text);
        List<Document> documents = new ArrayList<>(windows.size());
        for (int i = 0; i < windows.size(); i++) {
            String chunkId = UUID.randomUUID().toString();
            Map<String, Object> metadata = buildMetadata(section, bookTitle, i, windows.size(), chunkId);
            documents.add(new Document(chunkId, windows.get(i), metadata));
        }
        return documents;
    }

    /**
     * Cuts {@code text} into stripped, non-empty windows of roughly {@link #TARGET_CHARS}
     * characters, where each window after the first starts up to {@link #OVERLAP_CHARS}
     * characters before the previous split point.
     *
     * @param text the non-null text to split
     * @return the windows in text order
     */
    private List<String> split(String text) {
        List<String> windows = new ArrayList<>();
        int start = 0;
        while (start < text.length()) {
            int hardEnd = Math.min(start + TARGET_CHARS, text.length());
            if (hardEnd == text.length()) {
                // Everything that is left fits in one window.
                String last = text.substring(start).strip();
                if (!last.isEmpty()) {
                    windows.add(last);
                }
                break;
            }

            int splitAt = findSplitPoint(text, start, hardEnd);
            String chunk = text.substring(start, splitAt).strip();
            if (!chunk.isEmpty()) {
                windows.add(chunk);
            }

            // Nothing but whitespace remains after the split point. Stop here: rewinding for
            // the overlap would otherwise emit a final chunk that only repeats the tail of
            // the chunk just added.
            if (isBlankFrom(text, splitAt)) {
                break;
            }

            // Overlap: back up from the split point, then move forward to the next space so
            // the following chunk starts at the beginning of a word. The lower bound of
            // start + 1 guarantees the loop always makes progress.
            int overlapStart = Math.max(start + 1, splitAt - OVERLAP_CHARS);
            while (overlapStart < splitAt && text.charAt(overlapStart) != ' ') {
                overlapStart++;
            }
            // No space found inside the overlap window: continue without overlap.
            start = overlapStart < splitAt ? overlapStart + 1 : splitAt;
        }
        return windows;
    }

    /**
     * Reports whether {@code text} contains only whitespace from index {@code from} to its end.
     *
     * @param text the text to inspect
     * @param from the first index to inspect; an index at or past the end yields {@code true}
     * @return {@code true} if no non-whitespace character exists at or after {@code from}
     */
    private static boolean isBlankFrom(String text, int from) {
        for (int i = from; i < text.length(); i++) {
            if (!Character.isWhitespace(text.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Finds the best split point near {@code hardEnd}, preferring (in order):
     * paragraph boundary, sentence boundary, word boundary, hard cut.
     *
     * <p>Sentence, word and hard-cut results are always at or before {@code hardEnd}. The
     * paragraph rule returns the index just after a blank line whose first newline sits at or
     * before {@code hardEnd}, so its result can be as large as {@code hardEnd + 2}.
     *
     * @param text    the text being split
     * @param start   index where the current chunk begins; the result is always greater
     * @param hardEnd exclusive upper limit of the current window; callers pass a value
     *                strictly less than {@code text.length()}
     * @return the exclusive end index of the current chunk
     */
    private int findSplitPoint(String text, int start, int hardEnd) {
        int lookback = Math.min(MAX_LOOKBACK_CHARS, (hardEnd - start) / 2);

        // 1. Paragraph boundary
        int paraIdx = text.lastIndexOf("\n\n", hardEnd);
        if (paraIdx > hardEnd - lookback && paraIdx > start) {
            return paraIdx + 2;
        }

        // 2. Sentence boundary (. ! ?) followed by space or newline
        for (int i = hardEnd - 1; i > hardEnd - lookback && i > start; i--) {
            char c = text.charAt(i);
            if ((c == '.' || c == '!' || c == '?') && i + 1 < text.length()) {
                char next = text.charAt(i + 1);
                if (next == ' ' || next == '\n') {
                    return i + 1;
                }
            }
        }

        // 3. Word boundary
        for (int i = hardEnd - 1; i > hardEnd - WORD_LOOKBACK_CHARS && i > start; i--) {
            if (text.charAt(i) == ' ') {
                return i + 1;
            }
        }

        // 4. Hard cut
        return hardEnd;
    }

    /**
     * Builds the metadata map attached to a single chunk.
     *
     * @param section   source of the book, chapter and section ids, the title and page range
     * @param bookTitle value stored under {@code book_title}
     * @param index     zero-based position of the chunk within the section
     * @param total     number of chunks produced for the section
     * @param chunkId   id of the chunk, stored under {@code chunk_id}
     * @return a new mutable map; a {@code null} section title is stored as an empty string
     */
    private Map<String, Object> buildMetadata(SectionEntity section, String bookTitle,
                                              int index, int total, String chunkId) {
        Map<String, Object> m = new HashMap<>();
        m.put("type", "TEXT");
        m.put("book_id", section.getBookId().toString());
        m.put("book_title", bookTitle);
        m.put("chapter_id", section.getChapterId());
        m.put("section_id", section.getId());
        m.put("section_title", section.getTitle() != null ? section.getTitle() : "");
        m.put("page_start", section.getPageStart());
        m.put("page_end", section.getPageEnd());
        m.put("chunk_index", index);
        m.put("total_chunks", total);
        m.put("chunk_id", chunkId);
        return m;
    }
}