Add Marker to parse PDFs effectively
This commit is contained in:
@@ -1,13 +1,17 @@
|
||||
package com.aiteacher.document;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.apache.pdfbox.text.PDFTextStripperByArea;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.ai.reader.pdf.PagePdfDocumentReader;
|
||||
import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig;
|
||||
import org.springframework.core.io.FileSystemResource;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.transaction.annotation.Transactional;
|
||||
|
||||
import java.awt.Rectangle;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
@@ -15,13 +19,18 @@ import java.util.UUID;
|
||||
|
||||
/**
|
||||
* Parses a PDF into page-level SectionEntity records stored in Postgres.
|
||||
* Each page becomes one section, grouped under a single chapter per book.
|
||||
* Uses column-aware extraction via PDFTextStripperByArea: for two-column pages,
|
||||
* left column is extracted first then right, preserving correct reading order.
|
||||
* Text is also normalized (collapsed whitespace) before storage.
|
||||
*/
|
||||
@Service
|
||||
public class PdfStructureParser {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(PdfStructureParser.class);
|
||||
|
||||
// Right column is considered empty (single-column page) if it has < 20% of left column's content
|
||||
private static final double TWO_COLUMN_THRESHOLD = 0.2;
|
||||
|
||||
private final ChapterRepository chapterRepository;
|
||||
private final SectionRepository sectionRepository;
|
||||
|
||||
@@ -35,37 +44,71 @@ public class PdfStructureParser {
|
||||
public List<SectionEntity> parse(UUID bookId, String bookTitle, Path pdfPath) {
|
||||
log.info("Parsing PDF structure for book {}", bookId);
|
||||
|
||||
// One chapter per book
|
||||
String chapterId = bookId + "-ch1";
|
||||
ChapterEntity chapter = new ChapterEntity(chapterId, bookId, 1, bookTitle, 1);
|
||||
chapterRepository.save(chapter);
|
||||
|
||||
// One section per page
|
||||
PagePdfDocumentReader reader = new PagePdfDocumentReader(
|
||||
new FileSystemResource(pdfPath.toFile()),
|
||||
PdfDocumentReaderConfig.builder().withPagesPerDocument(1).build()
|
||||
);
|
||||
|
||||
List<org.springframework.ai.document.Document> pages = reader.get();
|
||||
List<SectionEntity> sections = new ArrayList<>();
|
||||
|
||||
for (int i = 0; i < pages.size(); i++) {
|
||||
int pageNum = i + 1;
|
||||
String text = pages.get(i).getText();
|
||||
if (text == null || text.isBlank()) continue;
|
||||
try (PDDocument doc = Loader.loadPDF(pdfPath.toFile())) {
|
||||
List<PDPage> pages = new ArrayList<>();
|
||||
doc.getPages().forEach(pages::add);
|
||||
|
||||
String sectionId = bookId + "-p" + pageNum;
|
||||
SectionEntity section = new SectionEntity(
|
||||
sectionId, chapterId, bookId,
|
||||
String.valueOf(pageNum),
|
||||
"Page " + pageNum,
|
||||
pageNum, pageNum,
|
||||
text
|
||||
);
|
||||
sections.add(sectionRepository.save(section));
|
||||
for (int i = 0; i < 25; i++) {
|
||||
int pageNum = i + 1;
|
||||
String text = normalizeWhitespace(extractPageText(pages.get(i)));
|
||||
if (text.isBlank()) continue;
|
||||
|
||||
String sectionId = bookId + "-p" + pageNum;
|
||||
SectionEntity section = new SectionEntity(
|
||||
sectionId, chapterId, bookId,
|
||||
String.valueOf(pageNum),
|
||||
"Page " + pageNum,
|
||||
pageNum, pageNum,
|
||||
text
|
||||
);
|
||||
sections.add(sectionRepository.save(section));
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Failed to parse PDF for book " + bookId, e);
|
||||
}
|
||||
|
||||
log.info("Parsed {} sections for book {}", sections.size(), bookId);
|
||||
return sections;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extracts text from a single page using column-aware region extraction.
|
||||
* Splits the page at the horizontal midpoint. If the right region has fewer
|
||||
* than 20% of the characters of the left region, treats the page as single-column.
|
||||
*/
|
||||
private String extractPageText(PDPage page) throws IOException {
|
||||
PDRectangle mediaBox = page.getMediaBox();
|
||||
int width = (int) mediaBox.getWidth();
|
||||
int height = (int) mediaBox.getHeight();
|
||||
int mid = width / 2;
|
||||
|
||||
PDFTextStripperByArea stripper = new PDFTextStripperByArea();
|
||||
stripper.setSortByPosition(true);
|
||||
stripper.addRegion("left", new Rectangle(0, 0, mid, height));
|
||||
stripper.addRegion("right", new Rectangle(mid, 0, width - mid, height));
|
||||
stripper.extractRegions(page);
|
||||
|
||||
String left = stripper.getTextForRegion("left").strip();
|
||||
String right = stripper.getTextForRegion("right").strip();
|
||||
|
||||
if (right.length() < left.length() * TWO_COLUMN_THRESHOLD) {
|
||||
// Single-column page — left holds all (or nearly all) content
|
||||
return left.isEmpty() ? right : left;
|
||||
}
|
||||
return left + "\n\n" + right;
|
||||
}
|
||||
|
||||
/** Collapses multi-space/tab runs and excessive blank lines. */
|
||||
private String normalizeWhitespace(String text) {
|
||||
return text
|
||||
.replaceAll("[ \t]{2,}", " ")
|
||||
.replaceAll("\n{3,}", "\n\n")
|
||||
.trim();
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user