Parsing Questions from PDFs using Python
pip install pypdf
pip install spacy
python -m spacy download en_core_web_sm
Script
# Extract every sentence containing a question mark from the first pages of a
# PDF: pypdf pulls the raw text, spaCy splits it into sentences.
from pypdf import PdfReader
import spacy

# Upper bound on how many leading pages of the PDF are scanned.
MAX_PAGES = 100

reader = PdfReader("example.pdf")
number_of_pages = len(reader.pages)

# Join the text of the first MAX_PAGES pages (fewer if the PDF is shorter, so
# short documents no longer fail on an out-of-range page index).
# `or ""` guards against pages that yield no extractable text.
first_100_pages = "\n".join(
    reader.pages[index].extract_text() or ""
    for index in range(min(number_of_pages, MAX_PAGES))
)

# Load the English language model
nlp = spacy.load("en_core_web_sm")

# Input text
# For a quick check without a PDF, substitute a literal such as:
# text = "This is a sample text. It contains multiple sentences. Do you want to extract questions from it? If so, how should I proceed?"
text = first_100_pages

# Parse the text
# NOTE: spaCy rejects texts longer than `nlp.max_length` (1,000,000 characters
# by default); lower MAX_PAGES or raise that limit for very dense PDFs.
doc = nlp(text)

# Extract sentences and questions (segment once, then filter the result)
sentences = [sent.text for sent in doc.sents]
questions = [sentence for sentence in sentences if "?" in sentence]

# Print sentences (uncomment to inspect the full segmentation)
# print("Sentences:")
# for sentence in sentences:
#     print(sentence)

# Print questions
print("\nQuestions:")
for question in questions:
    print(question)
Links
Backlinks