Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
AI-Contract-Reader-Project/pdfReader.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
72 lines (52 sloc)
2.88 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from PyPDF2 import PdfReader

import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Shared spaCy pipeline; loaded once at import time because model loading is slow.
nlp = spacy.load("en_core_web_sm")
# Contract Processing
def ContractProcessing(cont_):
    """Run the contract text through spaCy and return sentences and entities.

    Parameters:
        cont_: the full contract text as a single string.

    Returns:
        (sentences, entities): a list of sentence strings from spaCy's
        sentence segmentation, and a list of (entity_text, entity_label)
        tuples from spaCy's named-entity recognizer.
    """
    contract = nlp(cont_)  # spaCy parses the contract once
    # Named entities (names, dates, amounts, ...) that could be points of interest.
    entities = [(ent.text, ent.label_) for ent in contract.ents]
    # Sentence segmentation — more reliable than a naive split() on punctuation.
    sentences = [sent.text for sent in contract.sents]
    # NOTE: the original also built token, POS-tag and dependency lists but
    # never used or returned them; that wasted work has been removed.
    return sentences, entities
# Question Processing
def QuestionProcessing(Q):
    """Extract named entities and content keywords from a user question.

    Parameters:
        Q: the question as a string.

    Returns:
        (entities, keywords): (entity_text, entity_label) tuples that can be
        matched against the contract, and a list of lemmatized non-stopword
        tokens used as matching keywords.
    """
    Qu = nlp(Q)
    entities = [(ent.text, ent.label_) for ent in Qu.ents]
    # Lemmatize and drop stop words so matching focuses on content words.
    keywords = [token.lemma_ for token in Qu if not token.is_stop]
    # NOTE: an unused token list from the original has been removed.
    return entities, keywords
def Comparison(c_sentences, Q_keywords):
    """Return the contract sentence most similar to the question keywords.

    Uses TF-IDF vectors and cosine similarity: the joined keywords form one
    document, each contract sentence another, and the sentence with the
    highest cosine similarity to the keyword document wins.

    Parameters:
        c_sentences: list of contract sentence strings.
        Q_keywords: list of keyword strings extracted from the question.

    Returns:
        The best-matching sentence, or "" when either input is empty
        (previously this path raised ValueError("empty vocabulary") inside
        TfidfVectorizer.fit_transform).
    """
    # Guard: TfidfVectorizer cannot fit on an empty corpus/vocabulary.
    if not c_sentences or not Q_keywords:
        return ""
    # Combine question keywords into a single pseudo-document.
    question_text = " ".join(Q_keywords)
    vectorizer = TfidfVectorizer()
    # Row 0 is the question; rows 1..n are the contract sentences.
    tfidf_matrix = vectorizer.fit_transform([question_text] + c_sentences)
    # Cosine similarity of the question against every contract sentence.
    similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
    # argmax picks the first best match on ties.
    return c_sentences[similarities.argmax()]
# Variable Instantiation
flagExit = False
path = "example1.pdf"

# Reading PDF file: concatenate the extracted text of every page into one
# string holding the entire contract. join() avoids quadratic += concatenation.
reader = PdfReader(path)
# extract_text() may yield None/empty for pages with no extractable text
# (behavior varies across PyPDF2 versions) — "or ''" keeps the join safe.
text1 = "".join(page.extract_text() or "" for page in reader.pages)
C_sentences, C_entities = ContractProcessing(text1)

# Question Processing: answer questions until the user types "Exit".
while not flagExit:
    Q = input("What would you like to know about? \n")
    if Q == "Exit":
        flagExit = True
    else:
        Q_entities, Q_keywords = QuestionProcessing(Q)
        print(Comparison(C_sentences, Q_keywords))