Skip to content
Permalink
main
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
from PyPDF2 import PdfReader
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Load the spaCy "en_core_web_sm" English pipeline once at module level;
# ContractProcessing and QuestionProcessing both reuse this shared `nlp` object.
nlp = spacy.load("en_core_web_sm")
#Contract Processing
def ContractProcessing(cont_):
    """Run the contract text through spaCy and extract what the matcher needs.

    Args:
        cont_: Full contract text as a single string.

    Returns:
        A ``(sentences, entities)`` tuple where ``sentences`` is a list of
        sentence strings produced by spaCy's sentence segmentation and
        ``entities`` is a list of ``(text, label)`` pairs for each named
        entity spaCy found in the contract.
    """
    contract = nlp(cont_)  # spaCy reads the contract

    # Only the values that are actually returned are built; the token, POS
    # and dependency lists that used to be computed here were never used.
    sentences = [sent.text for sent in contract.sents]  # sentence segmentation
    entities = [(ent.text, ent.label_) for ent in contract.ents]  # named entities of interest
    return sentences, entities
#QuestionProcessing
def QuestionProcessing(Q):
    """Extract named entities and content keywords from a user question.

    Args:
        Q: The question text typed by the user.

    Returns:
        An ``(entities, keywords)`` tuple where ``entities`` is a list of
        ``(text, label)`` pairs for the named entities in the question and
        ``keywords`` is a list of lemmas of the remaining content words.
    """
    Qu = nlp(Q)

    # Named entities that could be matched against the contract.
    entities = [(ent.text, ent.label_) for ent in Qu.ents]

    # Keep lemmas of meaningful words only: stop words carry no signal, and
    # punctuation / whitespace tokens (e.g. a trailing "?") are not keywords.
    keywords = [
        token.lemma_
        for token in Qu
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return entities, keywords
def Comparison(c_sentences, Q_keywords):
    """Return the contract sentence most similar to the question keywords.

    The keywords are joined into one pseudo-document, TF-IDF vectors are
    fitted over that document plus every contract sentence, and the sentence
    whose vector has the highest cosine similarity to the question is
    returned.

    Args:
        c_sentences: List of contract sentences (strings).
        Q_keywords: Iterable of keyword strings extracted from the question.

    Returns:
        The best-matching sentence, or an empty string when there are no
        sentences to search or no vocabulary could be built from the text.
    """
    # Nothing to search: return early rather than asking scikit-learn to
    # compare the question against zero rows.
    if not c_sentences:
        return ""

    # Combine question keywords into a single string
    question_text = " ".join(Q_keywords)

    # Fit TF-IDF over the question (row 0) followed by every contract sentence
    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform([question_text, *c_sentences])
    except ValueError:
        # scikit-learn raises ValueError for an empty vocabulary, i.e. when
        # none of the texts contains a usable token.
        return ""

    # Cosine similarity between the question and each contract sentence
    similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()

    # NOTE: when the question shares no terms with any sentence every score
    # is 0 and argmax selects the first sentence.
    best_match_idx = similarities.argmax()
    return c_sentences[best_match_idx]
#Variable Instantiation
flagExit = False
path = "example1.pdf"

#Reading PDF file
reader = PdfReader(path)  # opens the PDF; its pages are exposed as reader.pages
# Build the full contract text with a single join instead of growing a string
# page by page; `or ""` keeps the join safe if a page yields no text.
text1 = "".join(page.extract_text() or "" for page in reader.pages)
C_sentences, C_entities = ContractProcessing(text1)

#Question Processing
# Keep answering questions until the user types "Exit" or input runs out.
while not flagExit:
    try:
        Q = input("What would you like to know about? \n")
    except EOFError:
        # stdin was closed (e.g. piped input ended): treat it as an exit request
        Q = "Exit"
    if Q == "Exit":
        flagExit = True
    else:
        Q_entities, Q_keywords = QuestionProcessing(Q)
        print(Comparison(C_sentences, Q_keywords))