From 25ce945ba88f229302db36bf2ac2a86d81b7b0fd Mon Sep 17 00:00:00 2001
From: "Joshua Jose (josej14)"
Date: Thu, 21 Nov 2019 11:04:53 +0000
Subject: [PATCH] Add files via upload

---
 Wikipedia.py     | 41 ++++++++++++++++++++++
 weather.py       | 30 ++++++++++++++++
 website links.py | 89 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 160 insertions(+)
 create mode 100644 Wikipedia.py
 create mode 100644 weather.py
 create mode 100644 website links.py

diff --git a/Wikipedia.py b/Wikipedia.py
new file mode 100644
index 0000000..61e8c93
--- /dev/null
+++ b/Wikipedia.py
@@ -0,0 +1,41 @@
+# This whole code was copied from https://www.thepythoncode.com/article/access-wikipedia-python?fbclid=IwAR0oEeD19gpwYc99fvv0bPkWOHHZOP1Hk_j8JMw0J07Z6taDBGhRiZJ8uZ8
+
+import wikipedia
+# print the summary of what Python is
+print(wikipedia.summary("Python Programming Language"))
+
+# search for a term
+result = wikipedia.search("Neural networks")
+print("Result of search for 'Neural networks':", result)
+
+# get the page: Neural network
+page = wikipedia.page(result[0])
+
+# get the title of the page
+title = page.title
+
+# get the categories of the page
+categories = page.categories
+
+# get the whole wikipedia page text (content)
+content = page.content
+
+# get all the links in the page
+links = page.links
+
+# get the page references
+references = page.references
+
+# get the page summary
+summary = page.summary
+
+# print info
+print("Page content:\n", content, "\n")
+print("Page title:", title, "\n")
+print("Categories:", categories, "\n")
+print("Links:", links, "\n")
+print("References:", references, "\n")
+print("Summary:", summary, "\n")
+
+
+
diff --git a/weather.py b/weather.py
new file mode 100644
index 0000000..c3b144a
--- /dev/null
+++ b/weather.py
@@ -0,0 +1,30 @@
+# This whole section of code is the weather feature
+# code copied from: https://www.youtube.com/watch?v=gOWm5rF_qdc&list=PLtN7kQKfzoJOJYt_yhn_xHWsORCMsEOXq&index=4&t=194s
+
+# pyowm is a module
+# It needs to be installed first (pip install pyowm) for the code to run
+import pyowm
+
+# The API key within the brackets was obtained from: https://openweathermap.org/
+owm = pyowm.OWM('7050f6a47e2d8369b80ae88aafb6c357')
+
+# The user inputs the name of the location
+# The variable 'inputLoc' was created on my own
+inputLoc = input("tell me your location: ")
+location = owm.weather_at_place(inputLoc)
+weather = location.get_weather()
+
+temp = weather.get_temperature('celsius')
+humidity = weather.get_humidity()
+
+# print the temperature (in Celsius) and humidity of the chosen location
+print(temp)
+print(humidity)
+
+
+
+
+
+
+
+
diff --git a/website links.py b/website links.py
new file mode 100644
index 0000000..a06a795
--- /dev/null
+++ b/website links.py
@@ -0,0 +1,89 @@
+# Whole code copied from https://www.thepythoncode.com/article/extract-all-website-links-python?fbclid=IwAR1v_yosVk4OIlyWkJgbwqO9HRfEnmdOzNySP8HFogJF5PJ0gM9tmQMwvGA
+
+import requests
+from urllib.parse import urlparse, urljoin
+from bs4 import BeautifulSoup
+import colorama
+
+# init the colorama module
+colorama.init()
+GREEN = colorama.Fore.GREEN
+GRAY = colorama.Fore.LIGHTBLACK_EX
+RESET = colorama.Fore.RESET
+
+# initialize the sets of links (unique links)
+internal_urls = set()
+external_urls = set()
+
+def is_valid(url):
+    """
+    Checks whether 'url' is a valid URL.
+    """
+    parsed = urlparse(url)
+    return bool(parsed.netloc) and bool(parsed.scheme)
+
+def get_all_website_links(url):
+    """
+    Returns all URLs that are found on 'url' and belong to the same website
+    """
+
+    # all URLs of 'url'
+    urls = set()
+    # domain name of the URL without the protocol
+    domain_name = urlparse(url).netloc
+    soup = BeautifulSoup(requests.get(url).content, "html.parser")
+
+    for a_tag in soup.findAll("a"):
+        href = a_tag.attrs.get("href")
+        if href == "" or href is None:
+            # href is an empty tag
+            continue
+
+        # join the URL if it's relative (not an absolute link)
+        href = urljoin(url, href)
+
+        parsed_href = urlparse(href)
+
+        # remove URL GET parameters, URL fragments, etc.
+        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
+
+        if not is_valid(href):
+            # not a valid URL
+            continue
+        if href in internal_urls:
+            # already in the set
+            continue
+        if domain_name not in href:
+            # external link
+            if href not in external_urls:
+                print(f"{GRAY}[!] External link: {href}{RESET}")
+                external_urls.add(href)
+            continue
+        print(f"{GREEN}[*] Internal link: {href}{RESET}")
+        urls.add(href)
+        internal_urls.add(href)
+    return urls
+
+# number of urls visited so far will be stored here
+total_urls_visited = 0
+
+def crawl(url, max_urls=50):
+    """
+    Crawls a web page and extracts all links.
+    You'll find all links in the 'external_urls' and 'internal_urls' global set variables.
+    params:
+        max_urls (int): maximum number of URLs to crawl, default is 50.
+
+    """
+    global total_urls_visited
+    total_urls_visited += 1
+    links = get_all_website_links(url)
+    for link in links:
+        if total_urls_visited > max_urls:
+            break
+        crawl(link, max_urls=max_urls)
+
+if __name__ == "__main__":
+    crawl("https://www.thepythoncode.com")
+    print("[+] Total internal links:", len(internal_urls))
+    print("[+] Total external links:", len(external_urls))
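
Note on weather.py: the patch embeds a live OpenWeatherMap API key directly in the source. Below is a minimal sketch of the same lookup reading the key from an environment variable instead; the variable name OWM_API_KEY is an assumption (nothing in these files defines it), and the pyowm calls are the same 2.x-style ones weather.py already uses.

# sketch: keep the OpenWeatherMap key out of the source
# assumes OWM_API_KEY has been set in the shell, e.g. export OWM_API_KEY=your-key-here
import os

import pyowm

api_key = os.environ.get("OWM_API_KEY")
if not api_key:
    raise SystemExit("OWM_API_KEY is not set")

owm = pyowm.OWM(api_key)
observation = owm.weather_at_place(input("tell me your location: "))
weather = observation.get_weather()
print(weather.get_temperature('celsius'))
print(weather.get_humidity())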