# Whole code copied from https://www.thepythoncode.com/article/extract-all-website-links-python?fbclid=IwAR1v_yosVk4OIlyWkJgbwqO9HRfEnmdOzNySP8HFogJF5PJ0gM9tmQMwvGA
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama
# init the colorama module
colorama.init()
GREEN = colorama.Fore.GREEN
GRAY = colorama.Fore.LIGHTBLACK_EX
RESET = colorama.Fore.RESET
# initialize the set of links (unique links)
internal_urls = set()
external_urls = set()
def is_valid(url):
    """
    Checks whether `url` is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)
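# Illustrative examples (not part of the original script): is_valid() requires both
# a scheme and a network location, so absolute URLs pass while relative paths fail.
#   is_valid("https://www.thepythoncode.com/about")  -> True
#   is_valid("/about")                               -> False (no scheme, no netloc)
#   is_valid("mailto:user@example.com")              -> False (no netloc)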
def get_all_website_links(url):
    """
    Returns all URLs found on `url` that belong to the same website.
    """
    # all URLs found on `url`
    urls = set()
    # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    for a_tag in soup.find_all("a"):
        href = a_tag.attrs.get("href")
        if href == "" or href is None:
            # empty href attribute
            continue
        # join the URL if it's relative (not an absolute link)
        href = urljoin(url, href)
        parsed_href = urlparse(href)
        # remove URL GET parameters, URL fragments, etc.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if not is_valid(href):
            # not a valid URL
            continue
        if href in internal_urls:
            # already in the set
            continue
        if domain_name not in href:
            # external link
            if href not in external_urls:
                print(f"{GRAY}[!] External link: {href}{RESET}")
                external_urls.add(href)
            continue
        print(f"{GREEN}[*] Internal link: {href}{RESET}")
        urls.add(href)
        internal_urls.add(href)
    return urls
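# Usage sketch (illustrative, not from the original article):
#   page_links = get_all_website_links("https://www.thepythoncode.com")
# Each discovered href is normalized before being stored, e.g. (assumed example URL)
#   "https://www.thepythoncode.com/article/foo?utm_source=x#section"
# becomes
#   "https://www.thepythoncode.com/article/foo"
# because the query string and fragment from urlparse() are dropped when the URL is
# rebuilt from scheme, netloc, and path.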
# number of urls visited so far will be stored here
total_urls_visited = 0
def crawl(url, max_urls=50):
    """
    Crawls a web page and recursively extracts all links.
    You'll find all links in the `internal_urls` and `external_urls` global set variables.
    params:
        max_urls (int): maximum number of URLs to crawl, default is 50.
    """
    global total_urls_visited
    total_urls_visited += 1
    links = get_all_website_links(url)
    for link in links:
        if total_urls_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)
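# Note: total_urls_visited counts calls to crawl(), so with max_urls=50 the crawler
# stops following new internal links once roughly 50 pages have been visited; all
# links discovered so far remain available in internal_urls / external_urls.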
if __name__ == "__main__":
    crawl("https://www.thepythoncode.com")
    # print a summary of the collected links
    print("[+] Total internal links:", len(internal_urls))
    print("[+] Total external links:", len(external_urls))
    print("[+] Total URLs:", len(internal_urls) + len(external_urls))