Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Python-Chatbot/wikipedia.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
185 lines (120 sloc)
5.06 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

import discord
import requests
from bs4 import BeautifulSoup
""" | |
Libraries: | |
- requests (http://docs.python-requests.org/en/master/) by Kenneth Reitz for web requests | |
- bs4 (https://www.crummy.com/software/BeautifulSoup/) by Leonard Richardson for parsing HTML pages | |
- discord.py (https://github.com/Rapptz/discord.py/) by Rapptz used to connect to Discord. | |
""" | |
""" | |
I've learned how to use Beautiful Soup by following its documentation: | |
https://www.crummy.com/software/BeautifulSoup/bs4/doc/ | |
and this tutorial: | |
https://www.digitalocean.com/community/tutorials/how-to-scrape-web-pages-with-beautiful-soup-and-python-3 | |
by Lisa Tagliaferri | |
""" | |
async def wikipedia(client, message):
    """Reply to "!wiki <query>" by posting the matching Wikipedia article's
    first paragraph, first image and URL to the Discord channel.

    When the query lands on a disambiguation page ("most commonly refers
    to:"), the available options are listed and the user's numeric reply
    selects one.

    Parameters:
        client:  the Discord client used to send/receive messages
                 (pre-1.0 discord.py API: send_message / wait_for_message
                 -- TODO confirm library version against the caller).
        message: the triggering Discord message; content is "!wiki <query>".
    """
    # message.content at this point is: "!wiki user_query"
    title = "_".join(message.content[6:].lower().split(' '))
    URL = "https://en.wikipedia.org/wiki/" + title

    def pageContentFun(url):
        """Take URL as string and return its parsed article body."""
        ######################################################
        # Based on Lisa Tagliaferri's tutorial
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'html.parser')
        body = soup.find(class_='mw-parser-output')
        ######################################################
        # Delete parts of the HTML that sometimes make the function
        # find the wrong body.
        for empty in body.find_all(class_='mw-empty-elt'):
            empty.decompose()
        # Drop the first two tables (infobox/metadata). Guard against pages
        # with fewer than two tables -- the unguarded version crashed there.
        for _ in range(2):
            table = body.find('table')
            if table is None:
                break
            table.decompose()
        return body

    def findImage(url):
        """Take url as string; return the src of the first image in the body,
        or None when the page has no usable image."""
        ######################################################
        # Based on Lisa Tagliaferri's tutorial
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'html.parser')
        image = soup.find(class_='image')
        ######################################################
        if image is None or image.find('img') is None:
            return None
        imgURL = image.find('img')['src']
        # Specific case: the "question book" maintenance icon is found
        # instead of the target image -- delete it and retry once.
        if 'Question_book' in imgURL:
            image.decompose()
            image = soup.find(class_='image')
            if image is None or image.find('img') is None:
                return None
            imgURL = image.find('img')['src']
        return imgURL

    def findP(body):
        """Take the article body found earlier; return its first real
        paragraph (empty string when the page has no paragraphs)."""
        paragraphs = body.find_all('p')
        if not paragraphs:
            return ''
        # Some pages put a "Coordinates" line in the first <p>; skip to the
        # next paragraph when one exists.
        if 'Coordinates' in paragraphs[0].text and len(paragraphs) > 1:
            return paragraphs[1].text
        return paragraphs[0].text

    # Find the page's first paragraph and first image.
    pageContent = pageContentFun(URL)
    cleanText = findP(pageContent)
    imgURL = findImage(URL)

    # Some wiki pages give you a few options to choose from (try searching
    # wiki for e.g. "New York"). Detect those and let the user pick.
    if "most commonly refers to:" in cleanText:
        optionList = pageContent.find('ul')
        names = []   # numbered option labels shown to the user
        links = []   # full article URLs, parallel to `names`
        if optionList is not None:
            for index, link in enumerate(optionList.find_all('a'), start=1):
                href = link['href']
                links.append('https://en.wikipedia.org' + href)
                # href looks like "/wiki/Some_Title" -> "1. Some Title"
                names.append(str(index) + '. ' + href[6:].replace('_', ' '))
        await client.send_message(message.channel, 'Which one exactly?')
        # Display the list of options.
        for item in names:
            await client.send_message(message.channel, item)
        # Wait for the user's reply: a 1-based index into `links`.
        reply = await client.wait_for_message(author=message.author)
        choice = reply.content.strip()
        # Validate instead of crashing on non-numeric or out-of-range input.
        if not choice.isdigit() or not 1 <= int(choice) <= len(links):
            await client.send_message(message.channel, 'That is not one of the listed options.')
            return
        URL = links[int(choice) - 1]
        pageContent = pageContentFun(URL)
        cleanText = findP(pageContent)
        imgURL = findImage(URL)

    # Clean the text of citation markers like [1], [23] or [note 4] often
    # found on Wikipedia (the regex also covers numbers the old 1..44 /
    # 1..9 loops missed), plus the inline pronunciation link.
    cleanText = re.sub(r'\[(?:note )?\d+\]', '', cleanText)
    cleanText = cleanText.replace('( listen)', '')

    URLnolink = '<' + URL + '>'  # angle brackets suppress Discord's URL preview
    if imgURL is not None:
        # When an image is posted to Discord chat, its URL is posted as well.
        # Creating an embed message allows posting images without the URL.
        em = discord.Embed()
        em.set_image(url='https:' + imgURL)
        await client.send_message(message.channel, embed=em)
    await client.send_message(message.channel, cleanText)
    await client.send_message(message.channel, 'Wikipedia: ' + URLnolink)