first implementation - image/drawing integration
This commit is contained in:
@@ -0,0 +1,71 @@
|
||||
package com.aiteacher.document;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.ai.reader.pdf.PagePdfDocumentReader;
|
||||
import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig;
|
||||
import org.springframework.core.io.FileSystemResource;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.transaction.annotation.Transactional;
|
||||
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
|
||||
/**
|
||||
* Parses a PDF into page-level SectionEntity records stored in Postgres.
|
||||
* Each page becomes one section, grouped under a single chapter per book.
|
||||
*/
|
||||
@Service
|
||||
public class PdfStructureParser {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(PdfStructureParser.class);
|
||||
|
||||
private final ChapterRepository chapterRepository;
|
||||
private final SectionRepository sectionRepository;
|
||||
|
||||
public PdfStructureParser(ChapterRepository chapterRepository,
|
||||
SectionRepository sectionRepository) {
|
||||
this.chapterRepository = chapterRepository;
|
||||
this.sectionRepository = sectionRepository;
|
||||
}
|
||||
|
||||
@Transactional
|
||||
public List<SectionEntity> parse(UUID bookId, String bookTitle, Path pdfPath) {
|
||||
log.info("Parsing PDF structure for book {}", bookId);
|
||||
|
||||
// One chapter per book
|
||||
String chapterId = bookId + "-ch1";
|
||||
ChapterEntity chapter = new ChapterEntity(chapterId, bookId, 1, bookTitle, 1);
|
||||
chapterRepository.save(chapter);
|
||||
|
||||
// One section per page
|
||||
PagePdfDocumentReader reader = new PagePdfDocumentReader(
|
||||
new FileSystemResource(pdfPath.toFile()),
|
||||
PdfDocumentReaderConfig.builder().withPagesPerDocument(1).build()
|
||||
);
|
||||
|
||||
List<org.springframework.ai.document.Document> pages = reader.get();
|
||||
List<SectionEntity> sections = new ArrayList<>();
|
||||
|
||||
for (int i = 0; i < pages.size(); i++) {
|
||||
int pageNum = i + 1;
|
||||
String text = pages.get(i).getText();
|
||||
if (text == null || text.isBlank()) continue;
|
||||
|
||||
String sectionId = bookId + "-p" + pageNum;
|
||||
SectionEntity section = new SectionEntity(
|
||||
sectionId, chapterId, bookId,
|
||||
String.valueOf(pageNum),
|
||||
"Page " + pageNum,
|
||||
pageNum, pageNum,
|
||||
text
|
||||
);
|
||||
sections.add(sectionRepository.save(section));
|
||||
}
|
||||
|
||||
log.info("Parsed {} sections for book {}", sections.size(), bookId);
|
||||
return sections;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user