# Discord bot command: look up a Wikipedia article and post its lead image and first paragraph.
import asyncio
import re

import discord
import requests
from bs4 import BeautifulSoup
"""
Libraries:
- requests (http://docs.python-requests.org/en/master/) by Kenneth Reitz for web requests
- bs4 (https://www.crummy.com/software/BeautifulSoup/) by Leonard Richardson for parsing HTML pages
- discord.py (https://github.com/Rapptz/discord.py/) by Rapptz used to connect to Discord.
"""
"""
I've learned how to use Beautiful Soup by following its documentation:
https://www.crummy.com/software/BeautifulSoup/bs4/doc/
and this tutorial:
https://www.digitalocean.com/community/tutorials/how-to-scrape-web-pages-with-beautiful-soup-and-python-3
by Lisa Tagliaferri
"""
_WIKI_BASE = "https://en.wikipedia.org"
_WIKI_PATH = "/wiki/"
# Inline reference markers such as [1], [23] or [note 4].
_CITATION_RE = re.compile(r"\[(?:note )?\d+\]")
_REQUEST_TIMEOUT = 10        # seconds to wait for Wikipedia to answer
_REPLY_TIMEOUT = 60          # seconds to wait for the user to pick an option
_DISCORD_MESSAGE_LIMIT = 2000  # Discord rejects longer message bodies


def _fetch_soup(url):
    """Download *url* and return it parsed, or None if it cannot be loaded."""
    # Fetch-and-parse pattern based on Lisa Tagliaferri's tutorial.
    try:
        page = requests.get(url, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException:
        return None
    if page.status_code != 200:
        return None
    return BeautifulSoup(page.text, 'html.parser')


def _find_image_url(soup):
    """Return the absolute URL of the first usable image, or None."""
    for wrapper in soup.find_all(class_='image'):
        img = wrapper.find('img')
        src = img.get('src') if img is not None else None
        if not src:
            continue
        # The "question book" maintenance icon is not the article's image.
        if 'Question_book' in src:
            continue
        # Image sources are usually protocol-relative ("//upload...").
        return 'https:' + src if src.startswith('//') else src
    return None


def _article_body(soup):
    """Return the article container with leading clutter removed, or None."""
    body = soup.find(class_='mw-parser-output')
    if body is None:
        return None
    # Empty elements and the first tables sometimes make the paragraph
    # search pick up the wrong text, so drop them first.
    for empty in body.find_all(class_='mw-empty-elt'):
        empty.decompose()
    for _ in range(2):
        table = body.find('table')
        if table is None:
            break
        table.decompose()
    return body


def _first_paragraph(body):
    """Return the text of the first real paragraph ('' if there is none)."""
    texts = [p.text for p in body.find_all('p')]
    # Some pages start with a coordinates paragraph; skip it.
    if texts and 'Coordinates' in texts[0]:
        texts = texts[1:]
    return texts[0] if texts else ''


def _load_article(url):
    """Fetch *url* once and return (body, first_paragraph, image_url).

    Returns None when the page cannot be downloaded or has no article body.
    This function blocks on network I/O and must not run on the event loop.
    """
    soup = _fetch_soup(url)
    if soup is None:
        return None
    # Look for the image before tables are stripped from the tree: the
    # image is searched for in the whole page, not only the cleaned body.
    image_url = _find_image_url(soup)
    body = _article_body(soup)
    if body is None:
        return None
    return body, _first_paragraph(body), image_url


def _disambiguation_options(body):
    """Return (label, url) pairs for the article links in the first list."""
    listing = body.find('ul')
    if listing is None:
        return []
    options = []
    for link in listing.find_all('a'):
        href = link.get('href', '')
        # Ignore anchors, external links and <a> tags without an href.
        if not href.startswith(_WIKI_PATH):
            continue
        label = href[len(_WIKI_PATH):].replace('_', ' ')
        options.append((label, _WIKI_BASE + href))
    return options


def _strip_reference_markers(text):
    """Remove [n] / [note n] markers and the '( listen)' audio stub."""
    return _CITATION_RE.sub('', text).replace('( listen)', '').strip()


async def wikipedia(client, message):
    """Answer a "!wiki <query>" message with the article's image and intro.

    Args:
        client: Discord client; its ``send_message`` and
            ``wait_for_message`` coroutines are used to talk to the user.
        message: The triggering message. ``message.content`` is expected
            to be "!wiki <query>"; replies go to ``message.channel``.

    When the article offers several meanings ("most commonly refers to:"),
    the options are listed and the author is asked to pick one by number.
    Problems (empty query, page not loadable, invalid choice, no reply)
    are reported in the channel; they do not raise.
    """
    # message.content at this point is: "!wiki user_query"
    # NOTE(review): the query is lower-cased as before; titles that differ
    # only by case presumably rely on Wikipedia redirects -- confirm.
    words = message.content[6:].lower().split()
    if not words:
        await client.send_message(message.channel,
                                  'Usage: !wiki <article title>')
        return
    url = _WIKI_BASE + _WIKI_PATH + "_".join(words)

    # requests is blocking, so run the download in a worker thread to keep
    # the bot responsive while Wikipedia answers.
    loop = asyncio.get_event_loop()
    article = await loop.run_in_executor(None, _load_article, url)
    if article is None:
        await client.send_message(
            message.channel,
            "Sorry, I couldn't load a Wikipedia article for that query.")
        return
    body, text, image_url = article

    # Some pages only offer a few options to choose from (try "New York").
    if "most commonly refers to:" in text:
        options = _disambiguation_options(body)
        if options:
            await client.send_message(message.channel, 'Which one exactly?')
            listing = '\n'.join(
                '{}. {}'.format(index, label)
                for index, (label, _) in enumerate(options, 1))
            await client.send_message(message.channel, listing)

            reply = await client.wait_for_message(
                timeout=_REPLY_TIMEOUT,
                author=message.author,
                channel=message.channel)
            if reply is None:
                await client.send_message(
                    message.channel, 'No choice received, cancelling.')
                return
            try:
                choice = int(reply.content)
            except ValueError:
                choice = 0
            if not 1 <= choice <= len(options):
                await client.send_message(
                    message.channel,
                    'Please answer with a number between 1 and {}.'.format(
                        len(options)))
                return

            url = options[choice - 1][1]
            article = await loop.run_in_executor(None, _load_article, url)
            if article is None:
                await client.send_message(
                    message.channel,
                    "Sorry, I couldn't load that Wikipedia article.")
                return
            body, text, image_url = article

    text = _strip_reference_markers(text)

    # When an image is posted to Discord chat its URL is posted as well;
    # an embed shows the image without the URL.
    if image_url:
        em = discord.Embed()
        em.set_image(url=image_url)
        await client.send_message(message.channel, embed=em)
    if text:
        await client.send_message(message.channel,
                                  text[:_DISCORD_MESSAGE_LIMIT])
    # Angle brackets stop Discord from generating a link preview.
    await client.send_message(message.channel, 'Wikipedia: <' + url + '>')