73 lines
2.7 KiB
Java
73 lines
2.7 KiB
Java
package com.aiteacher.document;
|
|
|
|
import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
|
|
import org.apache.pdfbox.multipdf.Splitter;
|
|
import org.apache.pdfbox.pdfparser.PDFParser;
|
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
|
import org.slf4j.Logger;
|
|
import org.slf4j.LoggerFactory;
|
|
import org.springframework.stereotype.Service;
|
|
|
|
import java.io.IOException;
|
|
import java.nio.file.Files;
|
|
import java.nio.file.Path;
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
|
|
/**
|
|
* Splits a PDF file into fixed-size chunks using PDFBox.
|
|
* Each chunk is saved as a temporary file so it can be submitted independently to Marker.
|
|
*/
|
|
@Service
|
|
public class PdfSplitterService {
|
|
|
|
private static final Logger log = LoggerFactory.getLogger(PdfSplitterService.class);
|
|
|
|
/**
|
|
* A chunk of a split PDF.
|
|
*
|
|
* @param tempFile path to the temporary PDF file (caller must delete when done)
|
|
* @param pageOffset 0-based index of the first page in this chunk within the original document
|
|
*/
|
|
public record PdfChunk(Path tempFile, int pageOffset) {}
|
|
|
|
/**
|
|
* Splits {@code pdfPath} into chunks of at most {@code maxPagesPerChunk} pages.
|
|
* Returns a single-element list when the document fits in one chunk.
|
|
*
|
|
* @param pdfPath source PDF
|
|
* @param maxPagesPerChunk maximum pages per chunk
|
|
* @return ordered list of chunks; caller is responsible for deleting {@code tempFile}s
|
|
*/
|
|
public List<PdfChunk> split(Path pdfPath, int maxPagesPerChunk) throws IOException {
|
|
try (PDDocument doc = new PDFParser(new RandomAccessReadBufferedFile(pdfPath.toFile())).parse()) {
|
|
int totalPages = doc.getNumberOfPages();
|
|
log.info("PDF {} has {} pages, splitting into chunks of {}", pdfPath.getFileName(), totalPages, maxPagesPerChunk);
|
|
|
|
if (totalPages <= maxPagesPerChunk) {
|
|
// No split needed — return the original file as a single virtual chunk
|
|
return List.of(new PdfChunk(pdfPath, 0));
|
|
}
|
|
|
|
Splitter splitter = new Splitter();
|
|
splitter.setSplitAtPage(maxPagesPerChunk);
|
|
List<PDDocument> parts = splitter.split(doc);
|
|
|
|
List<PdfChunk> chunks = new ArrayList<>(parts.size());
|
|
int offset = 0;
|
|
for (PDDocument part : parts) {
|
|
try {
|
|
Path tmp = Files.createTempFile("marker-chunk-", ".pdf");
|
|
part.save(tmp.toFile());
|
|
chunks.add(new PdfChunk(tmp, offset));
|
|
log.debug("Created chunk at {} (page offset {})", tmp, offset);
|
|
offset += part.getNumberOfPages();
|
|
} finally {
|
|
part.close();
|
|
}
|
|
}
|
|
return chunks;
|
|
}
|
|
}
|
|
}
|