package com.aiteacher.document; import org.apache.pdfbox.io.RandomAccessReadBufferedFile; import org.apache.pdfbox.multipdf.Splitter; import org.apache.pdfbox.pdfparser.PDFParser; import org.apache.pdfbox.pdmodel.PDDocument; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.stereotype.Service; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; import java.util.List; /** * Splits a PDF file into fixed-size chunks using PDFBox. * Each chunk is saved as a temporary file so it can be submitted independently to Marker. */ @Service public class PdfSplitterService { private static final Logger log = LoggerFactory.getLogger(PdfSplitterService.class); /** * A chunk of a split PDF. * * @param tempFile path to the temporary PDF file (caller must delete when done) * @param pageOffset 0-based index of the first page in this chunk within the original document */ public record PdfChunk(Path tempFile, int pageOffset) {} /** * Splits {@code pdfPath} into chunks of at most {@code maxPagesPerChunk} pages. * Returns a single-element list when the document fits in one chunk. * * @param pdfPath source PDF * @param maxPagesPerChunk maximum pages per chunk * @return ordered list of chunks; caller is responsible for deleting {@code tempFile}s */ public List split(Path pdfPath, int maxPagesPerChunk) throws IOException { try (PDDocument doc = new PDFParser(new RandomAccessReadBufferedFile(pdfPath.toFile())).parse()) { int totalPages = doc.getNumberOfPages(); log.info("PDF {} has {} pages, splitting into chunks of {}", pdfPath.getFileName(), totalPages, maxPagesPerChunk); if (totalPages <= maxPagesPerChunk) { // No split needed — return the original file as a single virtual chunk return List.of(new PdfChunk(pdfPath, 0)); } Splitter splitter = new Splitter(); splitter.setSplitAtPage(maxPagesPerChunk); List parts = splitter.split(doc); List chunks = new ArrayList<>(parts.size()); int offset = 0; for (PDDocument part : parts) { try { Path tmp = Files.createTempFile("marker-chunk-", ".pdf"); part.save(tmp.toFile()); chunks.add(new PdfChunk(tmp, offset)); log.debug("Created chunk at {} (page offset {})", tmp, offset); offset += part.getNumberOfPages(); } finally { part.close(); } } return chunks; } } }