# ADV_3.py

import newspaper
from newspaper import Article
import concurrent.futures

# News sites whose headlines are fetched by the functions below.
URLs = [
    'http://www.foxnews.com/',
    'http://www.cnn.com/',
    'http://www.derspiegel.de/',
    'http://www.bbc.co.uk/',
    'https://theguardian.com',
]


def get_headlines():
    '''
    Print the first 5 headlines from each news site listed in URLs.

    The sites are processed one after another (no concurrency).

    Input: none
    Output: none, the headlines are printed instead
    '''
    for url in URLs:
        # memoize_articles=False: per the newspaper docs this disables the
        # cache of already-seen articles, so repeated runs do the same work.
        site = newspaper.build(url, memoize_articles=False)
        print('\n''The headlines from %s are' % url, '\n')
        # Slice rather than index 1..5: this starts at the first article and
        # does not raise IndexError when fewer than 5 articles are available.
        for art in site.articles[:5]:
            art.download()
            art.parse()
            print(art.title)

def get_headlines_thread(thread):
    '''
    Fetch the first 5 headlines of a single news website.

    Input: string. url of the website
    Output: tuple (text, url). text is a header line followed by one
            headline per line; url is the input value, returned unchanged
            so the caller can tell which site the text belongs to
    '''
    # memoize_articles=False: per the newspaper docs this disables the cache
    # of already-seen articles, so repeated runs do the same work.
    site = newspaper.build(thread, memoize_articles=False)
    parts = ['\n''The headlines from %s are\n\n' % thread]
    # Slice rather than index 1..5: this starts at the first article and
    # does not raise IndexError when fewer than 5 articles are available.
    for art in site.articles[:5]:
        art.download()
        art.parse()
        parts.append(art.title + "\n")
    return "".join(parts), thread

def concurent_get_headlines():
    '''
    Concurrently fetch 5 headlines from every site in URLs and print them.

    get_headlines_thread is run once per url on a thread pool; the texts
    are printed in the same order as the urls appear in URLs.

    Input: no Input
    Output: no output, the function ends with a print statement instead
    '''
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        # executor.map yields results in input order, so no lookup by url is
        # needed afterwards. A URLs.index() lookup would send duplicate urls
        # to the same slot and leave a None behind that breaks the join.
        # Iterating the results re-raises any exception from a worker.
        results = executor.map(get_headlines_thread, URLs)
        threaded_message = [text for text, _url in results]
        print("\n".join(threaded_message))

if __name__ == '__main__':
    import timeit

    # Each variant is executed this many times; the mean time is reported.
    runs = 2

    print("--------------------NON-CONCURENT--------------------")
    sequential_total = timeit.timeit(
        stmt="get_headlines()",
        setup="from __main__ import get_headlines",
        number=runs,
    )
    elapsed_time = sequential_total / runs

    print("--------------------CONCURENT--------------------")
    concurrent_total = timeit.timeit(
        stmt="concurent_get_headlines()",
        setup="from __main__ import concurent_get_headlines",
        number=runs,
    )
    elapsed_time2 = concurrent_total / runs

    # Mean wall-clock seconds per run for both variants.
    print("non_concurent: ",elapsed_time," concurent: ",elapsed_time2)
	import newspaper
	from newspaper import Article
	import concurrent.futures

	# NOTE(review): this tab-indented region repeats the definitions found at
	# the top of the file; it looks like a paste artifact - confirm and remove.
	# News sites whose headlines are fetched by the functions below.
	URLs = [
	    'http://www.foxnews.com/',
	    'http://www.cnn.com/',
	    'http://www.derspiegel.de/',
	    'http://www.bbc.co.uk/',
	    'https://theguardian.com',
	]


	def get_headlines():
	    '''
	    Print the first 5 headlines from each news site listed in URLs.

	    The sites are processed one after another (no concurrency).

	    Input: none
	    Output: none, the headlines are printed instead
	    '''
	    for url in URLs:
	        # memoize_articles=False: per the newspaper docs this disables the
	        # cache of already-seen articles, so repeated runs do the same work.
	        site = newspaper.build(url, memoize_articles=False)
	        print('\n''The headlines from %s are' % url, '\n')
	        # Slice rather than index 1..5: this starts at the first article
	        # and does not raise IndexError when fewer than 5 are available.
	        for art in site.articles[:5]:
	            art.download()
	            art.parse()
	            print(art.title)

	def get_headlines_thread(thread):
	    '''
	    Fetch the first 5 headlines of a single news website.

	    Input: string. url of the website
	    Output: tuple (text, url). text is a header line followed by one
	            headline per line; url is the input value, returned unchanged
	            so the caller can tell which site the text belongs to
	    '''
	    # memoize_articles=False: per the newspaper docs this disables the
	    # cache of already-seen articles, so repeated runs do the same work.
	    site = newspaper.build(thread, memoize_articles=False)
	    parts = ['\n''The headlines from %s are\n\n' % thread]
	    # Slice rather than index 1..5: this starts at the first article and
	    # does not raise IndexError when fewer than 5 articles are available.
	    for art in site.articles[:5]:
	        art.download()
	        art.parse()
	        parts.append(art.title + "\n")
	    return "".join(parts), thread

	def concurent_get_headlines():
	    '''
	    Concurrently fetch 5 headlines from every site in URLs and print them.

	    get_headlines_thread is run once per url on a thread pool; the texts
	    are printed in the same order as the urls appear in URLs.

	    Input: no Input
	    Output: no output, the function ends with a print statement instead
	    '''
	    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
	        # executor.map yields results in input order, so no lookup by url
	        # is needed afterwards. A URLs.index() lookup would send duplicate
	        # urls to the same slot and leave a None that breaks the join.
	        # Iterating the results re-raises any exception from a worker.
	        results = executor.map(get_headlines_thread, URLs)
	        threaded_message = [text for text, _url in results]
	        print("\n".join(threaded_message))

	if __name__ == '__main__':
	    import timeit

	    # Time two executions of the sequential variant and keep the mean.
	    print("--------------------NON-CONCURENT--------------------")
	    elapsed_time = timeit.timeit("get_headlines()", setup="from __main__ import get_headlines", number=2)/2

	    # Time two executions of the thread-pool variant and keep the mean.
	    print("--------------------CONCURENT--------------------")
	    elapsed_time2 = timeit.timeit("concurent_get_headlines()", setup="from __main__ import concurent_get_headlines", number=2)/2

	    # Mean wall-clock seconds per run for both variants.
	    print("non_concurent: ",elapsed_time," concurent: ",elapsed_time2)