-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcheck-links.py
More file actions
executable file
·128 lines (97 loc) · 3.61 KB
/
check-links.py
File metadata and controls
executable file
·128 lines (97 loc) · 3.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/env python -B
# vi: set syntax=python ts=4 sw=4 sts=4 et ff=unix ai si :
#
# check-links uses several Python modules: docopt, lxml, requests, and beautifulsoup4.
#
# (c) Steven Scholnick <scholnicks@gmail.com>
# The check-links source code is published under a MIT license.
"""
check-links - validate links in one or more HTML files. The HTML files can exist locally or be accessible via a URL.
Usage:
check-links [options] <filesOrURLs> ...
Options:
-e, --external Validate external links only
-h, --help Show this help screen
-i, --image Turns on image checking
-o, --ok Shows the good links
-r, --root=<root_directory> Sets the root web directory
-v, --verbose Verbose Mode
--version Prints the version
"""
import os
import sys
import requests
from bs4 import BeautifulSoup
# Layout of one report row: field 0 (the link or path) is padded to 80
# characters and truncated to at most 70; field 1 (the status text) is padded
# to 10 characters.
FORMAT = "{0:80.70s} {1:10s}"
def main(html_files):
    """Check every file path or URL in *html_files*.

    Entries beginning with "http" are handled as remote pages; all other
    entries are handled as local HTML files. A keyboard interrupt ends the
    run with exit status 0.
    """
    try:
        for target in html_files:
            handler = processRemoteFile if target.startswith("http") else processFile
            handler(target)
    except KeyboardInterrupt:
        # finish the interrupted terminal line before leaving
        print()
        sys.exit(0)
def processRemoteFile(url):
    """Fetch the page at *url* and check every anchor (<a>) link on it.

    Relative links are resolved against *url*. Anchors without an href,
    same-page fragments ("#...") and links that do not resolve to an
    http/https URL (mailto:, javascript:, ...) are skipped.

    Prints "Cannot access URL: <url>" and returns when the page itself cannot
    be fetched or does not answer with status 200.
    """
    # local import keeps this function self-contained
    from urllib.parse import urljoin

    try:
        r = requests.get(url)
    except requests.exceptions.RequestException:
        print("Cannot access URL: " + url)
        return

    if r.status_code != 200:
        print("Cannot access URL: " + url)
        return

    # name the parser explicitly, matching processFile
    soup = BeautifulSoup(r.text, "lxml")
    for tag in soup.find_all("a"):
        # .get() instead of tag["href"]: anchors such as <a name="top"> have no href
        href = tag.get("href")
        if not href or href.startswith("#"):
            continue

        # urljoin resolves "page.html", "/abs/path", "../up" and "//host/path"
        # correctly; plain string concatenation with the directory did not
        link = urljoin(url, href)
        if link.startswith("http"):
            checkRemote(link)
def processFile(filePath):
    """Parse a local HTML file and check the links it contains.

    Checks <link href>, <a href> and <script src>; <img src> is checked only
    when --image is set. Prints "Cannot read file: <path>" and returns when
    the file cannot be opened, so the remaining inputs are still processed.
    """
    if arguments["--verbose"]:
        print("Processing file {0}".format(filePath))

    try:
        # read bytes so BeautifulSoup detects the document encoding itself
        # instead of depending on the platform's default text encoding
        with open(filePath, "rb") as fp:
            soup = BeautifulSoup(fp, "lxml")
    except OSError:
        print("Cannot read file: " + filePath)
        return

    checkLinks(soup.find_all("link"), "href")
    checkLinks(soup.find_all("a"), "href")
    if arguments["--image"]:
        checkLinks(soup.find_all("img"), "src")
    checkLinks(soup.find_all("script"), "src")
def checkLinks(tags, externalAttributeName):
    """Check the link stored in *externalAttributeName* of each tag in *tags*.

    Values starting with "http" are passed to checkRemote, all others to
    checkLocal. Tags that do not carry the attribute are skipped.
    """
    for t in tags:
        # look the attribute up without try/except so that a KeyError raised
        # inside checkRemote/checkLocal is not silently swallowed
        link = t.attrs.get(externalAttributeName)
        if link is None:
            # no src or href, just move on to the next tag
            continue

        if link.startswith("http"):
            checkRemote(link)
        else:
            checkLocal(link)
def checkRemote(link, timeout=30):
    """Check a remote (http/https) link and report its status.

    Prints the link with its HTTP status code when the status is not 200, or
    always when --ok is set. When the request itself fails, prints the
    exception text in verbose mode and "Cannot contact host" otherwise.

    *timeout* is the number of seconds to wait for the server before the
    request is abandoned and reported as a failure.
    """
    try:
        # without a timeout a server that never answers would stall the run
        response = requests.get(link, timeout=timeout)
    except requests.exceptions.RequestException as e:
        # str(e) rather than e.message: Python 3 exceptions have no .message
        print(FORMAT.format(link, str(e) if arguments["--verbose"] else "Cannot contact host"))
        return

    if response.status_code != 200 or arguments["--ok"]:
        print(FORMAT.format(link, str(response.status_code)))
def checkLocal(path):
    """Check that the target of a local link exists on disk.

    Nothing is checked when --external is set, for non-file schemes
    (mailto:, javascript:, tel:, data:), or for links that consist only of a
    "#fragment" or "?query". A path starting with "/" is resolved against
    --root when that option is given.

    Prints the path with "Missing" when it does not exist, and with "Good"
    when it exists and --ok is set.
    """
    if arguments["--external"]:
        return

    # these schemes never name a file on disk; match the scheme prefix only so
    # that a file such as "docs/mailto-help.html" is still checked
    if path.lower().startswith(("mailto:", "javascript:", "tel:", "data:")):
        return

    # drop any "#fragment" / "?query" suffix: only the file part lives on disk
    path = path.split("#", 1)[0].split("?", 1)[0]
    if not path:
        # empty href/src, or a pure fragment/query link
        return

    root = arguments["--root"]
    if path.startswith("/") and root:
        # avoid a doubled slash where root and path meet
        path = root.rstrip("/") + path

    exists = os.path.exists(path)
    if not exists or arguments["--ok"]:
        print(FORMAT.format(path, "Good" if exists else "Missing"))
if __name__ == "__main__":
    from docopt import docopt

    # the module docstring doubles as the docopt usage specification
    arguments = docopt(__doc__, version="2.0.0")

    # verbose output also lists the links that checked out fine
    if arguments["--verbose"]:
        arguments["--ok"] = True

    # expand a leading "~" in the web root given on the command line
    web_root = arguments["--root"]
    if web_root:
        arguments["--root"] = os.path.expanduser(web_root)

    main(arguments["<filesOrURLs>"])