Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
import newspaper
from newspaper import Article
import concurrent.futures
# Front pages of the news sites whose headlines are fetched.
URLs = [
    'http://www.foxnews.com/',
    'http://www.cnn.com/',
    'http://www.derspiegel.de/',
    'http://www.bbc.co.uk/',
    'https://theguardian.com',
]
def get_headlines():
    '''
    Print the first 5 headlines of every news site in the URLs list.

    The sites are processed one after another; every article is downloaded
    and parsed before its title is printed. A site that yields fewer than
    5 articles simply prints fewer headlines.

    Input: none
    Output: none, the headlines are printed instead
    '''
    for url in URLs:
        # memoize_articles=False: per the newspaper documentation this turns
        # off the cache of already-seen articles, so repeated runs (e.g. under
        # timeit) still get the full article list.
        result = newspaper.build(url, memoize_articles=False)
        print('\n''The headlines from %s are' % url, '\n')
        # Slice from index 0 so the *first* five articles are used, and so a
        # short article list cannot raise IndexError.
        for art in result.articles[:5]:
            art.download()  # the article must be downloaded before parsing
            art.parse()     # parsing fills in art.title
            print(art.title)
def get_headlines_thread(thread):
    '''
    Fetch the first 5 headlines of a single news website.

    A site that yields fewer than 5 articles simply contributes fewer
    headlines.

    Input: string. url of the website
    Output: tuple (headlines, url). headlines is a string made of a header
            line naming the website followed by one headline per line; url
            is the input value returned unchanged, so a caller can tell
            which website the headlines belong to
    '''
    # memoize_articles=False: per the newspaper documentation this turns off
    # the cache of already-seen articles, so repeated runs still get the full
    # article list.
    result = newspaper.build(thread, memoize_articles=False)
    # Collect the pieces in a list and join once at the end.
    parts = ['\n''The headlines from %s are\n\n' % thread]
    # Slice from index 0 so the *first* five articles are used, and so a
    # short article list cannot raise IndexError.
    for art in result.articles[:5]:
        art.download()  # the article must be downloaded before parsing
        art.parse()     # parsing fills in art.title
        parts.append(art.title + "\n")
    return "".join(parts), thread
def concurent_get_headlines():
    '''
    Fetch the headlines of every website in the URLs list concurrently and
    print them in the same order as the URLs list.

    Each website is handled by get_headlines_thread in a pool of at most
    5 worker threads. An exception raised while fetching a website is
    re-raised here when its result is read.

    Input: no Input
    Output: no output, the function ends with a print statement instead
    '''
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        # Executor.map returns the results in the order of the inputs, no
        # matter which thread finishes first. That keeps the output ordered
        # without looking each url up in URLs again, which also broke when
        # the same url was listed twice.
        threaded_message = [
            headlines
            for headlines, _url in executor.map(get_headlines_thread, URLs)
        ]
    print("\n".join(threaded_message))
if __name__ == '__main__':
    import timeit

    # Each variant is executed this many times; the total time is divided by
    # the same number to get the average duration of a single run.
    runs = 2

    print("--------------------NON-CONCURENT--------------------")
    elapsed_time = timeit.timeit(get_headlines, number=runs) / runs

    print("--------------------CONCURENT--------------------")
    elapsed_time2 = timeit.timeit(concurent_get_headlines, number=runs) / runs

    # Average execution time of the sequential and the concurrent variant.
    print("non_concurent: ", elapsed_time, " concurent: ", elapsed_time2)