Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Python-Chatbot/wikipedia.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
185 lines (120 sloc)
5.06 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

import discord
import requests
from bs4 import BeautifulSoup
""" | |
Libraries: | |
- requests (http://docs.python-requests.org/en/master/) by Kenneth Reitz for web requests | |
- bs4 (https://www.crummy.com/software/BeautifulSoup/) by Leonard Richardson for parsing HTML pages | |
- discord.py (https://github.com/Rapptz/discord.py/) by Rapptz used to connect to Discord. | |
""" | |
""" | |
I've learned how to use Beautiful Soup by following its documentation: | |
https://www.crummy.com/software/BeautifulSoup/bs4/doc/ | |
and this tutorial: | |
https://www.digitalocean.com/community/tutorials/how-to-scrape-web-pages-with-beautiful-soup-and-python-3 | |
by Lisa Tagliaferri | |
""" | |
async def wikipedia(client, message):
    """Reply to "!wiki <query>" by posting the matching Wikipedia article's
    first paragraph, first image and URL to the Discord channel.

    When the query lands on a disambiguation page ("most commonly refers
    to:"), the available options are listed and the user's numeric reply
    selects one.

    Parameters:
        client:  the Discord client used to send/receive messages
                 (pre-1.0 discord.py API: send_message / wait_for_message
                 -- TODO confirm library version against the caller).
        message: the triggering Discord message; content is "!wiki <query>".
    """
    # message.content at this point is: "!wiki user_query"
    title = "_".join(message.content[6:].lower().split(' '))
    URL = "https://en.wikipedia.org/wiki/" + title

    def pageContentFun(url):
        """Take URL as string and return its parsed article body."""
        ######################################################
        # Based on Lisa Tagliaferri's tutorial
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'html.parser')
        body = soup.find(class_='mw-parser-output')
        ######################################################
        # Delete parts of the HTML that sometimes make the function
        # find the wrong body.
        for empty in body.find_all(class_='mw-empty-elt'):
            empty.decompose()
        # Drop the first two tables (infobox/metadata). Guard against pages
        # with fewer than two tables -- the unguarded version crashed there.
        for _ in range(2):
            table = body.find('table')
            if table is None:
                break
            table.decompose()
        return body

    def findImage(url):
        """Take url as string; return the src of the first image in the body,
        or None when the page has no usable image."""
        ######################################################
        # Based on Lisa Tagliaferri's tutorial
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'html.parser')
        image = soup.find(class_='image')
        ######################################################
        if image is None or image.find('img') is None:
            return None
        imgURL = image.find('img')['src']
        # Specific case: the "question book" maintenance icon is found
        # instead of the target image -- delete it and retry once.
        if 'Question_book' in imgURL:
            image.decompose()
            image = soup.find(class_='image')
            if image is None or image.find('img') is None:
                return None
            imgURL = image.find('img')['src']
        return imgURL

    def findP(body):
        """Take the article body found earlier; return its first real
        paragraph (empty string when the page has no paragraphs)."""
        paragraphs = body.find_all('p')
        if not paragraphs:
            return ''
        # Some pages put a "Coordinates" line in the first <p>; skip to the
        # next paragraph when one exists.
        if 'Coordinates' in paragraphs[0].text and len(paragraphs) > 1:
            return paragraphs[1].text
        return paragraphs[0].text

    # Find the page's first paragraph and first image.
    pageContent = pageContentFun(URL)
    cleanText = findP(pageContent)
    imgURL = findImage(URL)

    # Some wiki pages give you a few options to choose from (try searching
    # wiki for e.g. "New York"). Detect those and let the user pick.
    if "most commonly refers to:" in cleanText:
        optionList = pageContent.find('ul')
        names = []   # numbered option labels shown to the user
        links = []   # full article URLs, parallel to `names`
        if optionList is not None:
            for index, link in enumerate(optionList.find_all('a'), start=1):
                href = link['href']
                links.append('https://en.wikipedia.org' + href)
                # href looks like "/wiki/Some_Title" -> "1. Some Title"
                names.append(str(index) + '. ' + href[6:].replace('_', ' '))
        await client.send_message(message.channel, 'Which one exactly?')
        # Display the list of options.
        for item in names:
            await client.send_message(message.channel, item)
        # Wait for the user's reply: a 1-based index into `links`.
        reply = await client.wait_for_message(author=message.author)
        choice = reply.content.strip()
        # Validate instead of crashing on non-numeric or out-of-range input.
        if not choice.isdigit() or not 1 <= int(choice) <= len(links):
            await client.send_message(message.channel, 'That is not one of the listed options.')
            return
        URL = links[int(choice) - 1]
        pageContent = pageContentFun(URL)
        cleanText = findP(pageContent)
        imgURL = findImage(URL)

    # Clean the text of citation markers like [1], [23] or [note 4] often
    # found on Wikipedia (the regex also covers numbers the old 1..44 /
    # 1..9 loops missed), plus the inline pronunciation link.
    cleanText = re.sub(r'\[(?:note )?\d+\]', '', cleanText)
    cleanText = cleanText.replace('( listen)', '')

    URLnolink = '<' + URL + '>'  # angle brackets suppress Discord's URL preview
    if imgURL is not None:
        # When an image is posted to Discord chat, its URL is posted as well.
        # Creating an embed message allows posting images without the URL.
        em = discord.Embed()
        em.set_image(url='https:' + imgURL)
        await client.send_message(message.channel, embed=em)
    await client.send_message(message.channel, cleanText)
    await client.send_message(message.channel, 'Wikipedia: ' + URLnolink)