Add files via upload
josej14 committed Nov 21, 2019
1 parent 9f816d5 commit 25ce945ba88f229302db36bf2ac2a86d81b7b0fd
Showing 3 changed files with 160 additions and 0 deletions.
@@ -0,0 +1,41 @@
# This whole code was copied from https://www.thepythoncode.com/article/access-wikipedia-python?fbclid=IwAR0oEeD19gpwYc99fvv0bPkWOHHZOP1Hk_j8JMw0J07Z6taDBGhRiZJ8uZ8

import wikipedia
# print the summary of what python is
print(wikipedia.summary("Python Programming Language"))
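# (My own note, not part of the copied tutorial: summary() also accepts a
# 'sentences' argument, e.g. wikipedia.summary("Python Programming Language", sentences=2),
# which trims the output to the first two sentences.)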

# search for a term
result = wikipedia.search("Neural networks")
print("Result search of 'Neaural networks':", result)

# get the page: Neural network
page = wikipedia.page(result[0])

# get the title of the page
title = page.title

# get the categories of the page
categories = page.categories

# get the whole wikipedia page text (content)
content = page.content

# get all the links in the page
links = page.links

# get the page references
references = page.references

# get the page summary
summary = page.summary

# print info
print("Page content:\n", content, "\n")
print("Page title:", title, "\n")
print("Categories:", categories, "\n")
print("Links:", links, "\n")
print("References:", references, "\n")
print("Summary:", summary, "\n")



@@ -0,0 +1,30 @@
#This whole section of code is the weather feature
#code copied from: https://www.youtube.com/watch?v=gOWm5rF_qdc&list=PLtN7kQKfzoJOJYt_yhn_xHWsORCMsEOXq&index=4&t=194s

# pyowm is a third-party module
# It needs to be installed first (e.g. pip install pyowm) for the code to run
import pyowm

# The API key inside the brackets was obtained from: https://openweathermap.org/
own = pyowm.OWM('7050f6a47e2d8369b80ae88aafb6c357')

# The user inputs the name of the location
# The variable 'inputLoc' is my own addition
inputLoc = input("Tell me your location: ")
location = own.weather_at_place(inputLoc)
weather = location.get_weather()

temp = weather.get_temperature('celsius')
humidity = weather.get_humidity()

# print the temperature (in Celsius) and humidity of the chosen location
print(temp)
print(humidity)
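# Optional sketch (my own addition, not from the tutorial video): in pyowm 2.x
# get_temperature() returns a dict, so the current reading can be pulled out and
# printed with a label. The 'temp' key is an assumption based on that format.
print("Temperature (Celsius):", temp.get('temp'))
print("Humidity (%):", humidity)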








@@ -0,0 +1,89 @@
# Whole code copied from https://www.thepythoncode.com/article/extract-all-website-links-python?fbclid=IwAR1v_yosVk4OIlyWkJgbwqO9HRfEnmdOzNySP8HFogJF5PJ0gM9tmQMwvGA

import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama

# init the colorama module
colorama.init()
GREEN = colorama.Fore.GREEN
GRAY = colorama.Fore.LIGHTBLACK_EX
RESET = colorama.Fore.RESET

# initialize the set of links (unique links)
internal_urls = set()
external_urls = set()

def is_valid(url):
    """
    Checks whether 'url' is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)
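# Quick illustration (my own note, not from the copied article): for
# "https://example.com/page?id=1", urlparse() returns scheme="https" and
# netloc="example.com", so is_valid() only accepts absolute URLs that carry
# both a scheme and a domain.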

def get_all_website_links(url):
    """
    Returns all URLs found on 'url' that belong to the same website.
    """
    # all URLs of 'url'
    urls = set()
    # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    soup = BeautifulSoup(requests.get(url).content, "html.parser")

    for a_tag in soup.findAll("a"):
        href = a_tag.attrs.get("href")
        if href == "" or href is None:
            # href empty tag
            continue

        # join the URL if it's relative (not an absolute link)
        href = urljoin(url, href)

        parsed_href = urlparse(href)

        # remove URL GET parameters, URL fragments, etc.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path

        if not is_valid(href):
            # not a valid URL
            continue
        if href in internal_urls:
            # already in the set
            continue
        if domain_name not in href:
            # external link
            if href not in external_urls:
                print(f"{GRAY}[!] External link: {href}{RESET}")
                external_urls.add(href)
            continue
        print(f"{GREEN}[*] Internal link: {href}{RESET}")
        urls.add(href)
        internal_urls.add(href)
    return urls

# number of urls visited so far will be stored here
total_urls_visited = 0

def crawl(url, max_urls=50):
    """
    Crawls a web page and extracts all links.
    You'll find all links in the 'external_urls' and 'internal_urls' global set variables.
    params:
        max_urls (int): maximum number of URLs to crawl, default is 50.
    """
    global total_urls_visited
    total_urls_visited += 1
    links = get_all_website_links(url)
    for link in links:
        if total_urls_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)

if __name__ == "__main__":
    crawl("https://www.thepythoncode.com")
    print("[+] Total internal links:", len(internal_urls))
    print("[+] Total external links:", len(external_urls))
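    # Optional sketch (my own addition, not from the copied article): dump the two
    # global sets to text files so the crawl results survive the run. The file
    # names below are assumptions chosen for illustration.
    with open("internal_links.txt", "w") as f:
        for link in internal_urls:
            f.write(link + "\n")
    with open("external_links.txt", "w") as f:
        for link in external_urls:
            f.write(link + "\n")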
