Chatbot/website links.py
# Whole code copied from https://www.thepythoncode.com/article/extract-all-website-links-python
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama

# init the colorama module
colorama.init()
GREEN = colorama.Fore.GREEN
GRAY = colorama.Fore.LIGHTBLACK_EX
RESET = colorama.Fore.RESET

# initialize the sets of links (unique links)
internal_urls = set()
external_urls = set()
def is_valid(url):
    """
    Checks whether 'url' is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)
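# For example (illustrative values): is_valid("https://example.com/page") is True,
# while is_valid("/just/a/path") is False, since a bare path has no scheme or netloc.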
def get_all_website_links(url):
    """
    Returns all URLs found on 'url' that belong to the same website.
    """
    # all URLs of 'url'
    urls = set()
    # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    for a_tag in soup.find_all("a"):
        href = a_tag.attrs.get("href")
        if href == "" or href is None:
            # empty href attribute
            continue
        # join the URL if it's relative (not an absolute link)
        href = urljoin(url, href)
        parsed_href = urlparse(href)
        # remove URL GET parameters, URL fragments, etc.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
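        # e.g. "https://example.com/page?id=3#section" (illustrative) is reduced
        # to "https://example.com/page" -- the query string and fragment are dropped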
        if not is_valid(href):
            # not a valid URL
            continue
        if href in internal_urls:
            # already in the set
            continue
        if domain_name not in href:
            # external link
            if href not in external_urls:
                print(f"{GRAY}[!] External link: {href}{RESET}")
                external_urls.add(href)
            continue
        print(f"{GREEN}[*] Internal link: {href}{RESET}")
        urls.add(href)
        internal_urls.add(href)
    return urls
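# Note: get_all_website_links() only collects links from a single page; crawl()
# below recurses over those links to cover the rest of the site.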
# number of URLs visited so far will be stored here
total_urls_visited = 0

def crawl(url, max_urls=50):
    """
    Crawls a web page and extracts all links.
    You'll find all links in the 'external_urls' and 'internal_urls' global set variables.
    params:
        max_urls (int): maximum number of URLs to crawl, default is 50.
    """
    global total_urls_visited
    total_urls_visited += 1
    links = get_all_website_links(url)
    for link in links:
        if total_urls_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)
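# Example (illustrative): crawl("https://example.com", max_urls=10) visits pages
# depth-first until roughly 10 URLs have been fetched, filling internal_urls and
# external_urls along the way.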
if __name__ == "__main__":
    crawl("https://www.thepythoncode.com")
    print("[+] Total Internal links:", len(internal_urls))
    print("[+] Total External links:", len(external_urls))
    print("[+] Total URLs:", len(external_urls) + len(internal_urls))
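    # Optional sketch (not in the original; filenames are assumed): persist the
    # collected sets so the crawl results survive the process.
    with open("internal_links.txt", "w") as f:
        f.write("\n".join(internal_urls))
    with open("external_links.txt", "w") as f:
        f.write("\n".join(external_urls))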