Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
AdvancedALgorithmsComplete/ADV_3.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
66 lines (60 sloc)
5.85 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import newspaper | |
from newspaper import Article | |
import concurrent.futures | |
# Front pages of the news sites whose headlines are fetched.
# Order matters: output is reported in this order.
URLs = [
    'http://www.foxnews.com/',
    'http://www.cnn.com/',
    'http://www.derspiegel.de/',
    'http://www.bbc.co.uk/',
    'https://theguardian.com',
]
def get_headlines():
    """Print the first five headlines of every site in ``URLs``, one site at a time.

    Sites are processed sequentially: for each URL a newspaper source is
    built, then up to five of its articles are downloaded, parsed and their
    titles printed.

    Takes no arguments and returns ``None``; all output goes to stdout.
    """
    for url in URLs:
        # memoize_articles=False turns off newspaper's cache of already-seen
        # articles, so repeated runs (e.g. under timeit) get the full list.
        source = newspaper.build(url, memoize_articles=False)
        print('\nThe headlines from %s are' % url, '\n')
        # Slice rather than index 1..5: this starts at the very first article
        # (as the contract promises) and cannot raise IndexError when a site
        # exposes fewer than five articles.
        for article in source.articles[:5]:
            article.download()
            article.parse()
            print(article.title)
def get_headlines_thread(thread):
    """Fetch the first five headlines of a single news site.

    Intended to be run as a worker task, one call per site.

    Args:
        thread: URL of the site to fetch (despite the name, this is a URL
            string; the name is kept so existing callers keep working).

    Returns:
        A ``(text, url)`` tuple. ``text`` is a header line naming the site
        followed by one headline per line; ``url`` is ``thread`` unchanged so
        the caller can match the result to the request that produced it.
    """
    # memoize_articles=False turns off newspaper's cache of already-seen
    # articles, so repeated runs get the full list.
    source = newspaper.build(thread, memoize_articles=False)
    parts = ['\nThe headlines from %s are\n\n' % thread]
    # Slice rather than index 1..5: this starts at the very first article and
    # cannot raise IndexError when a site exposes fewer than five articles.
    for article in source.articles[:5]:
        article.download()
        article.parse()
        parts.append(article.title + "\n")
    return "".join(parts), thread
def concurent_get_headlines():
    """Fetch the headlines of every site in ``URLs`` concurrently and print them.

    Each URL is handed to ``get_headlines_thread`` on a pool of at most five
    worker threads. Once every site has been fetched, the per-site texts are
    printed in the same order as ``URLs``, separated by newlines.

    Takes no arguments and returns ``None``; all output goes to stdout.
    Any exception raised by a worker propagates to the caller.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        # executor.map yields results in input order no matter which worker
        # finishes first. This replaces the URLs.index() lookup, which mapped
        # every duplicate URL to the first slot and left a None behind that
        # made the join below raise TypeError.
        results = list(executor.map(get_headlines_thread, URLs))
    print("\n".join(text for text, _url in results))
if __name__ == '__main__':
    import timeit

    # Each variant is run twice and the total is halved, giving the mean
    # wall-clock time of one run.
    print("--------------------NON-CONCURENT--------------------")
    elapsed_time = timeit.timeit(
        stmt="get_headlines()",
        setup="from __main__ import get_headlines",
        number=2,
    ) / 2

    print("--------------------CONCURENT--------------------")
    elapsed_time2 = timeit.timeit(
        stmt="concurent_get_headlines()",
        setup="from __main__ import concurent_get_headlines",
        number=2,
    ) / 2

    # Side-by-side summary of the two averages, in seconds.
    print("non_concurent: ", elapsed_time, " concurent: ", elapsed_time2)