Add Marker to parse PDFs effectively
This commit is contained in:
@@ -38,14 +38,52 @@ public class TextChunkingService {
|
||||
List<String> windows = new ArrayList<>();
|
||||
int start = 0;
|
||||
while (start < text.length()) {
|
||||
int end = Math.min(start + TARGET_CHARS, text.length());
|
||||
windows.add(text.substring(start, end));
|
||||
if (end == text.length()) break;
|
||||
start = end - OVERLAP_CHARS;
|
||||
int hardEnd = Math.min(start + TARGET_CHARS, text.length());
|
||||
if (hardEnd == text.length()) {
|
||||
String last = text.substring(start).strip();
|
||||
if (!last.isEmpty()) windows.add(last);
|
||||
break;
|
||||
}
|
||||
int splitAt = findSplitPoint(text, start, hardEnd);
|
||||
String chunk = text.substring(start, splitAt).strip();
|
||||
if (!chunk.isEmpty()) windows.add(chunk);
|
||||
// Overlap: back up from split point, align to a word start
|
||||
int overlapStart = Math.max(start + 1, splitAt - OVERLAP_CHARS);
|
||||
while (overlapStart < splitAt && text.charAt(overlapStart) != ' ') overlapStart++;
|
||||
start = overlapStart < splitAt ? overlapStart + 1 : splitAt;
|
||||
}
|
||||
return windows;
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds the best split point at or before hardEnd, preferring (in order):
|
||||
* paragraph boundary, sentence boundary, word boundary, hard cut.
|
||||
*/
|
||||
private int findSplitPoint(String text, int start, int hardEnd) {
|
||||
int lookback = Math.min(400, (hardEnd - start) / 2);
|
||||
|
||||
// 1. Paragraph boundary
|
||||
int paraIdx = text.lastIndexOf("\n\n", hardEnd);
|
||||
if (paraIdx > hardEnd - lookback && paraIdx > start) return paraIdx + 2;
|
||||
|
||||
// 2. Sentence boundary (. ! ?) followed by space or newline
|
||||
for (int i = hardEnd - 1; i > hardEnd - lookback && i > start; i--) {
|
||||
char c = text.charAt(i);
|
||||
if ((c == '.' || c == '!' || c == '?') && i + 1 < text.length()) {
|
||||
char next = text.charAt(i + 1);
|
||||
if (next == ' ' || next == '\n') return i + 1;
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Word boundary
|
||||
for (int i = hardEnd - 1; i > hardEnd - 100 && i > start; i--) {
|
||||
if (text.charAt(i) == ' ') return i + 1;
|
||||
}
|
||||
|
||||
// 4. Hard cut
|
||||
return hardEnd;
|
||||
}
|
||||
|
||||
private Map<String, Object> buildMetadata(SectionEntity section, String bookTitle,
|
||||
int index, int total, String chunkId) {
|
||||
Map<String, Object> m = new HashMap<>();
|
||||
|
||||
Reference in New Issue
Block a user