115 lines
4.3 KiB
Java
115 lines
4.3 KiB
Java
package com.aiteacher.document;
|
|
|
|
import org.apache.pdfbox.Loader;
|
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
|
import org.apache.pdfbox.pdmodel.PDPage;
|
|
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
|
import org.apache.pdfbox.text.PDFTextStripperByArea;
|
|
import org.slf4j.Logger;
|
|
import org.slf4j.LoggerFactory;
|
|
import org.springframework.stereotype.Service;
|
|
import org.springframework.transaction.annotation.Transactional;
|
|
|
|
import java.awt.Rectangle;
|
|
import java.io.IOException;
|
|
import java.nio.file.Path;
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
import java.util.UUID;
|
|
|
|
/**
|
|
* Parses a PDF into page-level SectionEntity records stored in Postgres.
|
|
* Uses column-aware extraction via PDFTextStripperByArea: for two-column pages,
|
|
* left column is extracted first then right, preserving correct reading order.
|
|
* Text is also normalized (collapsed whitespace) before storage.
|
|
*/
|
|
@Service
|
|
public class PdfStructureParser {
|
|
|
|
private static final Logger log = LoggerFactory.getLogger(PdfStructureParser.class);
|
|
|
|
// Right column is considered empty (single-column page) if it has < 20% of left column's content
|
|
private static final double TWO_COLUMN_THRESHOLD = 0.2;
|
|
|
|
private final ChapterRepository chapterRepository;
|
|
private final SectionRepository sectionRepository;
|
|
|
|
public PdfStructureParser(ChapterRepository chapterRepository,
|
|
SectionRepository sectionRepository) {
|
|
this.chapterRepository = chapterRepository;
|
|
this.sectionRepository = sectionRepository;
|
|
}
|
|
|
|
@Transactional
|
|
public List<SectionEntity> parse(UUID bookId, String bookTitle, Path pdfPath) {
|
|
log.info("Parsing PDF structure for book {}", bookId);
|
|
|
|
String chapterId = bookId + "-ch1";
|
|
ChapterEntity chapter = new ChapterEntity(chapterId, bookId, 1, bookTitle, 1);
|
|
chapterRepository.save(chapter);
|
|
|
|
List<SectionEntity> sections = new ArrayList<>();
|
|
|
|
try (PDDocument doc = Loader.loadPDF(pdfPath.toFile())) {
|
|
List<PDPage> pages = new ArrayList<>();
|
|
doc.getPages().forEach(pages::add);
|
|
|
|
for (int i = 0; i < 25; i++) {
|
|
int pageNum = i + 1;
|
|
String text = normalizeWhitespace(extractPageText(pages.get(i)));
|
|
if (text.isBlank()) continue;
|
|
|
|
String sectionId = bookId + "-p" + pageNum;
|
|
SectionEntity section = new SectionEntity(
|
|
sectionId, chapterId, bookId,
|
|
String.valueOf(pageNum),
|
|
"Page " + pageNum,
|
|
pageNum, pageNum,
|
|
text
|
|
);
|
|
sections.add(sectionRepository.save(section));
|
|
}
|
|
} catch (IOException e) {
|
|
throw new RuntimeException("Failed to parse PDF for book " + bookId, e);
|
|
}
|
|
|
|
log.info("Parsed {} sections for book {}", sections.size(), bookId);
|
|
return sections;
|
|
}
|
|
|
|
/**
|
|
* Extracts text from a single page using column-aware region extraction.
|
|
* Splits the page at the horizontal midpoint. If the right region has fewer
|
|
* than 20% of the characters of the left region, treats the page as single-column.
|
|
*/
|
|
private String extractPageText(PDPage page) throws IOException {
|
|
PDRectangle mediaBox = page.getMediaBox();
|
|
int width = (int) mediaBox.getWidth();
|
|
int height = (int) mediaBox.getHeight();
|
|
int mid = width / 2;
|
|
|
|
PDFTextStripperByArea stripper = new PDFTextStripperByArea();
|
|
stripper.setSortByPosition(true);
|
|
stripper.addRegion("left", new Rectangle(0, 0, mid, height));
|
|
stripper.addRegion("right", new Rectangle(mid, 0, width - mid, height));
|
|
stripper.extractRegions(page);
|
|
|
|
String left = stripper.getTextForRegion("left").strip();
|
|
String right = stripper.getTextForRegion("right").strip();
|
|
|
|
if (right.length() < left.length() * TWO_COLUMN_THRESHOLD) {
|
|
// Single-column page — left holds all (or nearly all) content
|
|
return left.isEmpty() ? right : left;
|
|
}
|
|
return left + "\n\n" + right;
|
|
}
|
|
|
|
/** Collapses multi-space/tab runs and excessive blank lines. */
|
|
private String normalizeWhitespace(String text) {
|
|
return text
|
|
.replaceAll("[ \t]{2,}", " ")
|
|
.replaceAll("\n{3,}", "\n\n")
|
|
.trim();
|
|
}
|
|
}
|