Files
ai-teacher/backend/src/main/java/com/aiteacher/document/PdfStructureParser.java
T
2026-04-04 21:30:18 +02:00

115 lines
4.3 KiB
Java

package com.aiteacher.document;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.text.PDFTextStripperByArea;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import java.awt.Rectangle;
import java.io.IOException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;
/**
 * Parses a PDF into page-level SectionEntity records persisted through
 * {@link SectionRepository}.
 *
 * <p>Uses column-aware extraction via PDFTextStripperByArea: each page is split at its
 * horizontal midpoint and the left half is emitted before the right half, which keeps
 * the reading order of two-column layouts. Text is also normalized (collapsed
 * whitespace) before storage.
 */
@Service
public class PdfStructureParser {

    private static final Logger log = LoggerFactory.getLogger(PdfStructureParser.class);

    /**
     * The right half of a page is treated as empty (single-column page) when it holds
     * fewer than this fraction of the left half's character count.
     */
    private static final double TWO_COLUMN_THRESHOLD = 0.2;

    private final ChapterRepository chapterRepository;
    private final SectionRepository sectionRepository;

    /**
     * @param chapterRepository repository used to save the chapter created per book
     * @param sectionRepository repository used to save one section per non-blank page
     */
    public PdfStructureParser(ChapterRepository chapterRepository,
                              SectionRepository sectionRepository) {
        this.chapterRepository = chapterRepository;
        this.sectionRepository = sectionRepository;
    }

    /**
     * Parses every page of the given PDF and saves one section per non-blank page.
     *
     * <p>A single chapter with id {@code <bookId>-ch1} is saved first; every section is
     * attached to it. Section ids have the form {@code <bookId>-p<pageNumber>} where the
     * page number is 1-based. Pages whose normalized text is blank produce no section
     * but still consume their page number.
     *
     * @param bookId    identifier of the book, used to derive chapter and section ids
     * @param bookTitle title passed to the chapter entity
     * @param pdfPath   location of the PDF file to read
     * @return the saved sections, in page order
     * @throws RuntimeException if the PDF cannot be loaded or read; the underlying
     *                          {@link IOException} is preserved as the cause
     */
    @Transactional
    public List<SectionEntity> parse(UUID bookId, String bookTitle, Path pdfPath) {
        log.info("Parsing PDF structure for book {}", bookId);

        String chapterId = bookId + "-ch1";
        ChapterEntity chapter = new ChapterEntity(chapterId, bookId, 1, bookTitle, 1);
        chapterRepository.save(chapter);

        List<SectionEntity> sections = new ArrayList<>();
        try (PDDocument doc = Loader.loadPDF(pdfPath.toFile())) {
            // Walk the document's actual pages. A fixed page count would throw on
            // shorter documents and silently drop the tail of longer ones.
            int pageNum = 0;
            for (PDPage page : doc.getPages()) {
                pageNum++; // advance before the blank check so numbering tracks the PDF
                String text = normalizeWhitespace(extractPageText(page));
                if (text.isBlank()) {
                    continue;
                }
                String sectionId = bookId + "-p" + pageNum;
                SectionEntity section = new SectionEntity(
                        sectionId, chapterId, bookId,
                        String.valueOf(pageNum),
                        "Page " + pageNum,
                        pageNum, pageNum,
                        text
                );
                sections.add(sectionRepository.save(section));
            }
        } catch (IOException e) {
            throw new RuntimeException("Failed to parse PDF for book " + bookId, e);
        }

        log.info("Parsed {} sections for book {}", sections.size(), bookId);
        return sections;
    }

    /**
     * Extracts text from a single page using column-aware region extraction.
     *
     * <p>Splits the page's media box at the horizontal midpoint into a "left" and a
     * "right" region. If the right region has fewer than 20% of the characters of the
     * left region, the page is treated as single-column and only the left region's text
     * is returned (whatever little the right region held is discarded). Otherwise the
     * left text is followed by a blank line and the right text.
     *
     * @param page the page to extract
     * @return the extracted text; may be empty or whitespace-only for a page without text
     * @throws IOException if PDFBox fails while extracting the regions
     */
    private String extractPageText(PDPage page) throws IOException {
        PDRectangle mediaBox = page.getMediaBox();
        // Region rectangles are integer-based, so fractional page sizes are truncated.
        int width = (int) mediaBox.getWidth();
        int height = (int) mediaBox.getHeight();
        int mid = width / 2;

        PDFTextStripperByArea stripper = new PDFTextStripperByArea();
        stripper.setSortByPosition(true);
        stripper.addRegion("left", new Rectangle(0, 0, mid, height));
        // width - mid (not mid) so an odd width still covers the last pixel column.
        stripper.addRegion("right", new Rectangle(mid, 0, width - mid, height));
        stripper.extractRegions(page);

        String left = stripper.getTextForRegion("left").strip();
        String right = stripper.getTextForRegion("right").strip();

        if (right.length() < left.length() * TWO_COLUMN_THRESHOLD) {
            // Single-column page: left holds all (or nearly all) content.
            // left cannot be empty here: with left.length() == 0 the condition would
            // require right.length() < 0, so no fallback to right is needed.
            return left;
        }
        return left + "\n\n" + right;
    }

    /**
     * Collapses runs of two or more spaces/tabs into a single space, reduces runs of
     * three or more newlines to exactly two, and trims the result.
     *
     * @param text raw extracted text
     * @return the normalized text
     */
    private String normalizeWhitespace(String text) {
        return text
                .replaceAll("[ \t]{2,}", " ")
                .replaceAll("\n{3,}", "\n\n")
                .trim();
    }
}