Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
AI-Contract-Reader-Project/pdfReader.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
72 lines (52 sloc)
2.88 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from PyPDF2 import PdfReader

import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Shared spaCy pipeline; loaded once at import time because model loading is slow.
nlp = spacy.load("en_core_web_sm")
# Contract Processing
def ContractProcessing(cont_):
    """Run the contract text through spaCy and return sentences and entities.

    Parameters:
        cont_: the full contract text as a single string.

    Returns:
        (sentences, entities): a list of sentence strings from spaCy's
        sentence segmentation, and a list of (entity_text, entity_label)
        tuples from spaCy's named-entity recognizer.
    """
    contract = nlp(cont_)  # spaCy parses the contract once
    # Named entities (names, dates, amounts, ...) that could be points of interest.
    entities = [(ent.text, ent.label_) for ent in contract.ents]
    # Sentence segmentation — more reliable than a naive split() on punctuation.
    sentences = [sent.text for sent in contract.sents]
    # NOTE: the original also built token, POS-tag and dependency lists but
    # never used or returned them; that wasted work has been removed.
    return sentences, entities
# Question Processing
def QuestionProcessing(Q):
    """Extract named entities and content keywords from a user question.

    Parameters:
        Q: the question as a string.

    Returns:
        (entities, keywords): (entity_text, entity_label) tuples that can be
        matched against the contract, and a list of lemmatized non-stopword
        tokens used as matching keywords.
    """
    Qu = nlp(Q)
    entities = [(ent.text, ent.label_) for ent in Qu.ents]
    # Lemmatize and drop stop words so matching focuses on content words.
    keywords = [token.lemma_ for token in Qu if not token.is_stop]
    # NOTE: an unused token list from the original has been removed.
    return entities, keywords
def Comparison(c_sentences, Q_keywords):
    """Return the contract sentence most similar to the question keywords.

    Uses TF-IDF vectors and cosine similarity: the joined keywords form one
    document, each contract sentence another, and the sentence with the
    highest cosine similarity to the keyword document wins.

    Parameters:
        c_sentences: list of contract sentence strings.
        Q_keywords: list of keyword strings extracted from the question.

    Returns:
        The best-matching sentence, or "" when either input is empty
        (previously this path raised ValueError("empty vocabulary") inside
        TfidfVectorizer.fit_transform).
    """
    # Guard: TfidfVectorizer cannot fit on an empty corpus/vocabulary.
    if not c_sentences or not Q_keywords:
        return ""
    # Combine question keywords into a single pseudo-document.
    question_text = " ".join(Q_keywords)
    vectorizer = TfidfVectorizer()
    # Row 0 is the question; rows 1..n are the contract sentences.
    tfidf_matrix = vectorizer.fit_transform([question_text] + c_sentences)
    # Cosine similarity of the question against every contract sentence.
    similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
    # argmax picks the first best match on ties.
    return c_sentences[similarities.argmax()]
# Variable Instantiation
flagExit = False
path = "example1.pdf"

# Reading PDF file: concatenate the extracted text of every page into one
# string holding the entire contract. join() avoids quadratic += concatenation.
reader = PdfReader(path)
# extract_text() may yield None/empty for pages with no extractable text
# (behavior varies across PyPDF2 versions) — "or ''" keeps the join safe.
text1 = "".join(page.extract_text() or "" for page in reader.pages)
C_sentences, C_entities = ContractProcessing(text1)

# Question Processing: answer questions until the user types "Exit".
while not flagExit:
    Q = input("What would you like to know about? \n")
    if Q == "Exit":
        flagExit = True
    else:
        Q_entities, Q_keywords = QuestionProcessing(Q)
        print(Comparison(C_sentences, Q_keywords))