Of course! A Python link checker is a fantastic project that combines web scraping, HTTP requests, and data processing. I'll guide you through building one, starting from a simple script and evolving to a more robust, command-line tool.

What is a Link Checker?
A link checker is a tool that scans a webpage (or a website) to find all the hyperlinks (<a href="...">) and then tests each one to see if it's valid. It checks for the following (a small code sketch after the list shows how these status ranges map to categories):
- Working Links (2xx status codes): The page loads successfully.
- Redirects (3xx status codes): The link leads to another page. We should usually follow these.
- Client Errors (4xx status codes): The link is broken (e.g., 404 Not Found, 403 Forbidden).
- Server Errors (5xx status codes): The target server has a problem (e.g., 500 Internal Server Error).
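Those ranges translate directly into code. Here is a minimal classification sketch (classify_status is just an illustrative name, not a library function):

```python
def classify_status(status_code: int) -> str:
    """Map an HTTP status code to a human-readable category."""
    if 200 <= status_code < 300:
        return "OK"
    elif 300 <= status_code < 400:
        return "Redirect"
    elif 400 <= status_code < 500:
        return "Client Error (broken link)"
    elif 500 <= status_code < 600:
        return "Server Error"
    return "Unknown"

print(classify_status(404))  # -> Client Error (broken link)
```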
Method 1: The Simple & Direct Approach (Using requests and BeautifulSoup)
This is the best way to understand the core logic. We'll write a single script that takes a URL, finds all links, and checks them.
Step 1: Install Necessary Libraries
You'll need requests to make HTTP calls and beautifulsoup4 to parse the HTML.
pip install requests beautifulsoup4
Step 2: The Python Script (simple_checker.py)
This script defines a function to check a single URL and another function to find all links on a page.

```python
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

# Set a User-Agent header to identify our bot
HEADERS = {
    'User-Agent': 'My-Python-Link-Checker/1.0 (https://mywebsite.com/bot-info)'
}


def check_url(url, session):
    """
    Checks a single URL and returns its status.
    Follows redirects by default.
    """
    try:
        response = session.get(url, headers=HEADERS, timeout=10)
        # If a redirect occurred, response.url is the final destination
        final_url = response.url
        # Report the redirect status if the link was redirected,
        # otherwise report the final status code
        status_code = response.history[-1].status_code if response.history else response.status_code
        if 200 <= status_code < 300:
            return url, final_url, "OK", None
        elif 300 <= status_code < 400:
            return url, final_url, f"Redirect ({status_code})", None
        elif 400 <= status_code < 500:
            return url, final_url, f"Client Error ({status_code})", None
        elif 500 <= status_code < 600:
            return url, final_url, f"Server Error ({status_code})", None
        else:
            return url, final_url, f"Unknown Status ({status_code})", None
    except requests.exceptions.RequestException as e:
        # Catch connection errors, timeouts, invalid URLs, etc.
        return url, None, "Failed", str(e)


def find_links(base_url, session):
    """
    Finds all unique links on a given page.
    """
    try:
        response = session.get(base_url, headers=HEADERS, timeout=10)
        response.raise_for_status()  # Raise an exception for 4xx/5xx responses
    except requests.exceptions.RequestException as e:
        print(f"Could not retrieve {base_url}: {e}")
        return []
    soup = BeautifulSoup(response.text, 'html.parser')
    links = set()
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        # Fragments and query strings could be stripped here if desired;
        # for now we keep them, since they are part of the URL:
        # href = href.split('?')[0].split('#')[0]
        # Convert relative URLs to absolute URLs
        absolute_url = urljoin(base_url, href)
        # Normalize the URL (e.g., remove a trailing slash)
        absolute_url = absolute_url.rstrip('/')
        links.add(absolute_url)
    return sorted(links)


def main(start_url):
    """
    Main function to orchestrate the link checking process.
    """
    print(f"Starting link check for: {start_url}\n")
    # Use a Session object for connection pooling and better performance
    with requests.Session() as session:
        all_links = find_links(start_url, session)
        if not all_links:
            print("No links found on the page.")
            return
        print(f"Found {len(all_links)} links. Checking them now...\n")
        results = []
        for link in all_links:
            original_url, final_url, status, error = check_url(link, session)
            results.append((original_url, final_url, status, error))
        # --- Report Generation ---
        print("--- Link Check Report ---")
        for original, final, status, error in results:
            print(f"Original: {original}")
            print(f"Final: {final if final else 'N/A'}")
            print(f"Status: {status}")
            if error:
                print(f"Error: {error}")
            print("-" * 20)


if __name__ == "__main__":
    # Example usage: replace this with any URL you want to check
    target_url = "https://www.python.org"
    main(target_url)
```
How to Run It
Save the code as simple_checker.py and run it from your terminal:
python simple_checker.py
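If you want to try the checking logic on a single URL before scanning a whole page, you can also import check_url and call it directly (a quick sketch, assuming the script above is saved as simple_checker.py):

```python
import requests
from simple_checker import check_url

with requests.Session() as session:
    original, final, status, error = check_url("https://www.python.org", session)
    print(status)  # e.g. "OK" if the site is reachable, "Failed" on a connection error
```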
Limitations of the Simple Approach
- No Concurrency: It checks links one by one. For a large site, this will be very slow.
- No Depth Control: It only checks links on the initial page. It doesn't follow links to other pages on the same site.
- No Configuration: It's hard-coded to check a single URL (a quick fix for this is sketched right after this list).
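The configuration point is the easiest to address even without a CLI framework: read the target URL from the command line. A minimal sketch using sys.argv that replaces the if __name__ == "__main__": block in simple_checker.py (argparse would work just as well):

```python
import sys

if __name__ == "__main__":
    # Expect exactly one argument: the URL to check
    if len(sys.argv) != 2:
        print("Usage: python simple_checker.py <url>")
        sys.exit(1)
    main(sys.argv[1])
```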
Method 2: A More Advanced & Scalable Approach (Using click and concurrent.futures)
Let's build a proper command-line tool that can handle multiple URLs, check entire websites with a configurable depth, and use concurrency for speed.
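Before diving into the full tool, here is the concurrency pattern in isolation: submit one check_url call per link to a thread pool and collect the results as they complete. A stripped-down sketch (check_all is an illustrative helper name; check_url is the function from Method 1):

```python
import concurrent.futures
import requests

def check_all(links, max_workers=10):
    """Check many URLs concurrently and return a list of result tuples."""
    results = []
    with requests.Session() as session, \
         concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map each future back to the link it is checking
        futures = {executor.submit(check_url, link, session): link for link in links}
        for future in concurrent.futures.as_completed(futures):
            results.append(future.result())
    return results
```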
Step 1: Install Additional Libraries
We'll use click for creating a professional command-line interface and tqdm for a nice progress bar.
pip install click tqdm
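If you have not used these two libraries before, this throwaway example shows the basic shape of both (it is not part of the link checker itself):

```python
import time

import click
from tqdm import tqdm

@click.command()
@click.argument('name')
@click.option('--count', '-c', default=3, help='Number of steps to simulate.')
def demo(name, count):
    """Greet NAME and show a progress bar over COUNT steps."""
    click.echo(f"Hello, {name}!")
    for _ in tqdm(range(count), desc="Working"):
        time.sleep(0.2)  # pretend to do some work

if __name__ == '__main__':
    demo()
```

Run it with, for example: python demo.py Alice --count 5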
Step 2: The Advanced Script (linkchecker.py)
This script is structured as a command-line tool.

```python
import concurrent.futures
from urllib.parse import urljoin, urlparse

import click
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

HEADERS = {
    'User-Agent': 'Advanced-Python-Link-Checker/1.0 (https://mywebsite.com/bot-info)'
}


def is_valid_url(url, base_domain):
    """Check whether a URL belongs to the same domain as the base URL."""
    try:
        return urlparse(url).netloc == urlparse(base_domain).netloc
    except ValueError:
        return False


def check_url(url, session):
    """Checks a single URL and returns a (url, final_url, status, error) tuple."""
    try:
        # Use a HEAD request for faster checks
        response = session.head(url, headers=HEADERS, timeout=10, allow_redirects=True)
        final_url = response.url
        status_code = response.status_code
    except requests.exceptions.RequestException:
        # HEAD can fail, so try GET as a fallback
        try:
            response = session.get(url, headers=HEADERS, timeout=10, allow_redirects=True)
            final_url = response.url
            status_code = response.status_code
        except requests.exceptions.RequestException as e:
            return (url, None, "Failed", str(e))
    if 200 <= status_code < 300:
        return (url, final_url, "OK", None)
    elif 300 <= status_code < 400:
        return (url, final_url, f"Redirect ({status_code})", None)
    elif 400 <= status_code < 500:
        return (url, final_url, f"Client Error ({status_code})", None)
    elif 500 <= status_code < 600:
        return (url, final_url, f"Server Error ({status_code})", None)
    else:
        return (url, final_url, f"Unknown ({status_code})", None)


def find_links(base_url, session, domain):
    """Finds all links on a page that belong to the specified domain."""
    try:
        response = session.get(base_url, headers=HEADERS, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        return []
    soup = BeautifulSoup(response.text, 'html.parser')
    links = set()
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href'].strip()
        # Skip fragments, javascript links, and mail/phone links
        if not href or href.startswith(('#', 'javascript:', 'mailto:', 'tel:')):
            continue
        absolute_url = urljoin(base_url, href)
        absolute_url = absolute_url.rstrip('/')
        if is_valid_url(absolute_url, domain):
            links.add(absolute_url)
    return sorted(links)


@click.command()
@click.argument('url')
@click.option('--depth', '-d', default=1, help='Crawl depth (1 = check only the links found on the starting page).')
@click.option('--threads', '-t', default=10, help='Number of concurrent threads to use.')
def main(url, depth, threads):
    """A fast, concurrent link checker for a given URL and its sub-pages."""
    # Normalize the input URL
    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url
    url = url.rstrip('/')
    base_domain = urlparse(url).netloc
    print(f"Checking links for: {url}")
    print(f"Domain: {base_domain}")
    print(f"Crawl depth: {depth}")
    print(f"Using {threads} threads.\n")
    urls_to_check = {url}
    urls_checked = set()
    all_results = []
    # Use a ThreadPoolExecutor for concurrent requests
    with requests.Session() as session, concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        for current_depth in range(depth + 1):
            pending = urls_to_check - urls_checked
            if not pending:
                break
            print(f"--- Depth {current_depth}: checking {len(pending)} pages ---")
            # Submit all checks for the current depth
            future_to_url = {
                executor.submit(check_url, u, session): u
                for u in pending
            }
            # Use tqdm for a progress bar
            with tqdm(total=len(future_to_url), desc="Checking URLs") as pbar:
                for future in concurrent.futures.as_completed(future_to_url):
                    original_url = future_to_url[future]
                    urls_checked.add(original_url)
                    try:
                        result = future.result()
                        all_results.append(result)
                        # Only crawl a page for new links if it loaded successfully
                        # and we haven't reached the final depth yet
                        if current_depth < depth and result[2] == "OK":
                            new_links = find_links(original_url, session, base_domain)
                            urls_to_check.update(new_links)
                    except Exception as e:
                        all_results.append((original_url, None, "Exception", str(e)))
                    finally:
                        pbar.update(1)
    # --- Report Generation ---
    print("\n--- Final Report ---")
    broken_links = [r for r in all_results
                    if "Error" in r[2] or "Failed" in r[2] or "Exception" in r[2]]
    if broken_links:
        click.echo(click.style(f"Found {len(broken_links)} broken links:", fg="red"))
        for original, final, status, error in broken_links:
            click.echo(f"  - {original} -> {status} ({error})")
    else:
        click.echo(click.style("All links are working!", fg="green"))
    click.echo(f"\nTotal links checked: {len(all_results)}")


if __name__ == '__main__':
    main()
```
How to Run the Advanced Tool
Save the code as linkchecker.py. You can now run it with various options:
Check only the links on the main page of example.com:
python linkchecker.py https://www.example.com
Check links up to 3 pages deep on python.org:
python linkchecker.py https://www.python.org --depth 3
Use 20 threads for faster checking:
python linkchecker.py https://www.wikipedia.org --depth 2 --threads 20
Get help:
python linkchecker.py --help
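One caveat about the HEAD-first strategy in check_url: some servers respond to HEAD with 405 Method Not Allowed (or another error) even though the page loads fine with GET, and the tool would then report those pages as broken. If you hit this, a small variant is to retry with GET whenever HEAD returns an error status. A sketch of the idea (it assumes the same HEADERS and the same status-classification branches as the original function):

```python
def check_url(url, session):
    """Variant of check_url that retries with GET when a server rejects HEAD."""
    try:
        response = session.head(url, headers=HEADERS, timeout=10, allow_redirects=True)
        if response.status_code >= 400:
            # Many servers answer HEAD with 405 but serve the page fine via GET
            response = session.get(url, headers=HEADERS, timeout=10, allow_redirects=True)
    except requests.exceptions.RequestException as e:
        return (url, None, "Failed", str(e))
    # Classify response.status_code with the same 2xx/3xx/4xx/5xx branches as before
    status_code = response.status_code
    if 200 <= status_code < 300:
        return (url, response.url, "OK", None)
    return (url, response.url, f"Status {status_code}", None)
```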
Further Improvements
- Rate Limiting: Be a good web citizen. Add a small delay (e.g., time.sleep(0.5)) between requests to avoid overwhelming the server.
- Output Formats: Save the report to a file (e.g., CSV, JSON, or Markdown); see the sketch below.
- Authentication: Add support for checking links behind a login (this is much more complex and might involve tools like selenium).
- Robustness: Handle more edge cases, such as infinite loops (a page linking to itself) or very large pages.
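As an example of the output-formats idea, here is a small sketch that writes the all_results list from linkchecker.py to a CSV file (write_csv is an illustrative name; you could call it at the end of main):

```python
import csv

def write_csv(results, path="link_report.csv"):
    """Write (original, final, status, error) result tuples to a CSV report."""
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["original_url", "final_url", "status", "error"])
        for original, final, status, error in results:
            writer.writerow([original, final or "", status, error or ""])
```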
