#!/usr/bin/env ruby

require 'net/http'
require 'uri'

# --------------------------------------------------------------
# SETTINGS
# --------------------------------------------------------------
# Skip checking external links entirely
skip_external = false

# Safety first! Limit the max number of links to check.
# Includes external links. Adjust to taste.
limit = 5000

# How long would you like to wait for a connection and request?
# (Connection and request will get this timeout *each*, so the
# theoretical maximum delay could be twice this amount per link.)
timeout_seconds = 10
# --------------------------------------------------------------
# --------------------------------------------------------------

if ARGV.empty?
  STDERR.puts "Usage: chklnks 'http://example.com' ['/no-follow1/ /no-follow2/']"
  exit 1
end
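
# Typical invocation (the HTML report goes to stdout, progress messages
# go to stderr); the report filename here is just an example:
#
#   ./chklnks.rb 'http://example.com' '/no-follow1/ /no-follow2/' > report.html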

# Get the start URI from the command line.
start_uri = URI.parse(ARGV[0])
start_uri.normalize!
STDERR.puts "Starting at #{start_uri}..."

$no_follows = nil

if ARGV[1]
  $no_follows = ARGV[1].split
end
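
# Note on matching: each no-follow entry is compared as a plain substring
# against every absolute URI discovered during the crawl (see add_link
# below); matching links are still HEAD-checked, but their pages are
# never fetched and scanned for further links.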
# The Link struct stores facts about each unique URI.
Link = Struct.new(
  :uri,
  :from_uris,
  :to_uri,
  :check_only,
  :external,
  :yuck_invalid_link,
  :visited,
  :connected,
  :result_headers,
  :result_http_code,
  :result_ok,
  :redirected,
  :redirects_to,
  :redirect_result_headers,
  :redirect_result_http_code,
  :redirect_result_ok,
  :redirect_redirected
)
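
# Field groups, roughly:
#   uri / from_uris / to_uri         - identity, and where the link was seen
#   check_only / external            - crawl policy for this URI
#   yuck_invalid_link                - the raw href, if it failed to parse
#   visited / connected              - progress flags set during the crawl
#   result_*                         - outcome of the HEAD request
#   redirects_to / redirect_result_* - outcome of following one redirect hop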

# The links hash stores unique absolute URIs collected as pages
# are scanned for links.
@links = {}

# Links are stored in a FIFO queue to be visited. The @links hash
# above prevents duplicate entries.
@to_visit = Queue.new

# Add our initial URI to the master link hash. :check_only must be
# false, or else we won't follow the links on the page!
@links[start_uri.to_s] = Link.new(
  start_uri, # uri
  [],        # from_uris
  start_uri, # to_uri
  false,     # check_only
  false      # external
)

# Prime the to_visit queue with our start URI.
@to_visit.push @links[start_uri.to_s]
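
# The crawl below is effectively breadth-first: URIs are visited in the
# order they were discovered, and the @links hash guarantees each absolute
# URI is queued at most once.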


# Returns an absolute URI for the given +to_uri+ URI relative to
# the +from_uri+. The +to_uri+ may be relative or absolute.
# Relative:
#   from = URI.parse("http://example.com/foo/")
#   from_to(from, URI.parse("page"))   # http://example.com/foo/page
#   from_to(from, URI.parse("../bar")) # http://example.com/bar
#   from_to(from, URI.parse("/bar"))   # http://example.com/bar
# Absolute:
#   from_to(from, URI.parse("http://ruby-doc.org/")) # http://ruby-doc.org/
def from_to(from_uri, to_uri)
  to_uri.normalize! # hTtP://eXAMple.com --> http://example.com

  if from_uri.relative?
    # This should never happen: the "from" URI is the page we just
    # retrieved, so it should always be absolute.
    raise URI::BadURIError, "from_to(): from_uri is relative: #{from_uri}"
  end

  if to_uri.absolute?
    return to_uri # an absolute URI is ready to go!
  end

  return from_uri.merge(to_uri)
end

def add_link(from_uri, link_str)
  begin
    to_uri = URI.parse link_str
  rescue
    # Parse failed, so the href must have been malformed. Key the entry
    # by the raw href string, since we have no usable URI.
    @links[link_str] = Link.new(
      nil,        # Link.uri
      [from_uri], # Link.from_uris
      nil,        # Link.to_uri
      nil,        # Link.check_only
      nil,        # Link.external
      link_str    # Link.yuck_invalid_link
    )
    return
  end

  to_uri.fragment = nil # clear any fragment portion
  to_uri.normalize!

  absolute_uri = from_to(from_uri, to_uri)

  if @links.key? absolute_uri.to_s
    # Already in hash. Add the from_uri to the Link.
    @links[absolute_uri.to_s].from_uris.push from_uri
  else
    # Create a new Link for this URI and store it in the hash.
    @links[absolute_uri.to_s] = Link.new(
      absolute_uri,     # uri
      [from_uri],       # array literal with link's first "from"
      to_uri,           # the semi-unmodified link uri
      to_uri.absolute?, # Link.check_only
      to_uri.absolute?  # Link.external
    )

    # Set to check_only if a no_follows pattern matches
    if $no_follows
      $no_follows.each do |nf|
        if absolute_uri.to_s.include?(nf)
          @links[absolute_uri.to_s].check_only = true
        end
      end
    end

    # Hax: don't attempt to extract links from non-page content which has
    # been linked in an anchor tag. (In case Content-Type fails us.)
    if link_str.downcase.end_with?(
        ".jpg",
        ".gif",
        ".jpeg",
        ".pdf",
        ".png",
        ".svg",
        ".css")
      @links[absolute_uri.to_s].check_only = true
    end

    # Add it to the to_visit queue
    @to_visit.push @links[absolute_uri.to_s]
  end
end
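
# Roughly what add_link does for a few representative hrefs found on a
# page (the hrefs below are illustrative only):
#   add_link(page, "page2.html")           -> internal, queued for a full scan
#   add_link(page, "http://ruby-doc.org/") -> external, HEAD-checked only
#   add_link(page, "images/photo.jpg")     -> check-only (extension hack above)
#   add_link(page, "http://bad uri")       -> recorded as yuck_invalid_link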


# Do The Crawl!
visited_count = 0

while !@to_visit.empty? and visited_count < limit
  # Shift off the next queued URI to visit
  link = @to_visit.shift
  visited_count += 1
  truncated_uri = link.uri.to_s.length > 50 ? link.uri.to_s[0..50] + "..." : link.uri.to_s
  STDERR.puts "#{visited_count} done, #{@to_visit.length} left | #{truncated_uri}"

  if link.external and skip_external
    STDERR.puts "(Skipping external link!)"
    next
  end

  if link.yuck_invalid_link
    STDERR.puts "(Skipping invalid URI!)"
    next
  end

  link.visited = true

  ################
  # HEAD request #
  ################
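  # Strategy: every queued URI first gets a cheap HEAD request. Only links
  # that are internal, not check-only, and come back 200 OK as text/html
  # get a follow-up GET below so their bodies can be scanned for more links.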
  host = link.uri.hostname
  port = link.uri.port
  # Use request_uri (path + query, "/" when the path is empty) for http(s)
  # URIs; fall back to plain path for anything else.
  path = link.uri.respond_to?(:request_uri) ? link.uri.request_uri : link.uri.path
  ssl  = link.uri.scheme == 'https'
  begin
    http = Net::HTTP.start(host, port,
      use_ssl: ssl,
      read_timeout: timeout_seconds,
      open_timeout: timeout_seconds
    )
  rescue
    STDERR.puts "* Could not connect to host '#{host}'!"
    next
  end
  begin
    res = http.request_head(path)
  rescue Net::ReadTimeout
    STDERR.puts "* Connected to '#{host}', but HEAD timed out."
    next
  rescue
    STDERR.puts "* Error while performing HEAD request: #{$!}"
    next
  end
  link.connected = true

  # Stow the results
  link.result_headers = res.each_capitalized.map { |k,v| [k, v] }.to_h
  link.result_http_code = res.code
  link.result_ok = res.instance_of? Net::HTTPOK
  link.redirected = res.is_a? Net::HTTPRedirection

  # Compare only the media type; the header is often "text/html; charset=...".
  content_type = link.result_headers["Content-Type"]
  if content_type and !content_type.start_with?("text/html")
    link.check_only = true
  end
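
  # At this point anything that isn't served as text/html (images, PDFs,
  # stylesheets, feeds, ...) has been downgraded to check-only: its status
  # is recorded in the report, but its body is never scanned for links.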

  # Handle redirection
  if link.redirected
    link.redirects_to = link.result_headers["Location"]

    # Try a single level of redirection (could just add the
    # link to the queue, but I think it would be a pain to
    # track the results that way...)
    begin
      redir_result = http.request_head(link.redirects_to)
      link.redirect_result_headers = redir_result.each_capitalized.map { |k,v| [k, v] }.to_h
      link.redirect_result_http_code = redir_result.code
      link.redirect_result_ok = redir_result.instance_of? Net::HTTPOK
      link.redirect_redirected = redir_result.is_a? Net::HTTPRedirection
    rescue
      STDERR.puts "* Error while following redirect to '#{link.redirects_to}': #{$!}"
      link.redirect_result_headers = {} # keep the report loop happy
    end
  end
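
  # Caveat: the Location header may point at a different host or scheme,
  # but the HEAD above reuses the connection to the original host, so
  # cross-host redirect results should be treated as best-effort only.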

  # Continue with a page scan?
  next if link.check_only or !link.result_ok

  ###############
  # GET request # (To scan page for links.)
  ###############
  begin
    res = Net::HTTP.get_response(link.uri)
  rescue
    STDERR.puts "* Error while performing GET request: #{$!}"
    next
  end

  # Scan result body for <a href="sweet links!">...</a>
  # Note: I _could_ also check for a <base href=""> tag here.
  res.body.scan(/<a[^>]* href="([^"]+)"/i) do |href|
    # href[0] because scan nests capture group results [['foo'],['bar']]
    path = href[0]
    add_link(link.uri, path)
  end
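
  # The regex above only catches double-quoted href attributes inside <a>
  # tags; single-quoted or unquoted hrefs (and <base href>) are ignored,
  # which is good enough for a quick report but not a full HTML parse.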
end

if visited_count >= limit
  STDERR.puts "*NOTE* Link limit of #{limit} was reached!"
end

# Counters for stat results
links_total = 0
links_ok = 0
links_redirect = 0
links_other = 0
links_internal = 0
links_external = 0

@links.each do |key, link|
  # Stat counts
  links_total += 1
  links_ok += 1 if link.result_ok
  links_redirect += 1 if link.redirected
  links_other += 1 if link.visited and !link.result_ok and !link.redirected
  links_external += 1 if link.external
  links_internal += 1 unless link.external

  if link.redirected and !link.redirect_result_ok
    links_other += 1 # this is also an error
  end
end

puts %{<!DOCTYPE html>
<html><head>
<title>chklnks - #{start_uri}</title>
<meta charset="utf-8">
<style>
code { background: #fff; padding: 5px; }
.link { border: 1px solid #444; }
.link.unvisited { background: #aaa; }
.link.redirect { background: #ffb; }
.link.error { background: #f88; }
.link.bad-uri { background: #aaa; }
.link.ok { background: #8f8; }
.tag { color: #FFF; width: 100px; font-size: small; display: inline-block;
  padding: 2px; text-align: center; }
.tag.unvisited { background: #666; }
.tag.redirect { background: #562; }
.tag.error { background: #900; }
.tag.bad-uri { background: #900; }
.tag.ok { background: #083; }
.tag.external { background: #04B; }
.tag.internal { background: #0B6; }
.show_details { background: #FFF; color: #000; text-decoration: none; }
</style>
</head>
<body>
<h1>chklnks - #{start_uri}</h1>
<table>
<tr><td>Total links<td>#{links_total}
<tr><td>OK links<td>#{links_ok}
<tr><td>Redirected links<td>#{links_redirect}
<tr><td>Error links<td>#{links_other}
<tr><td>Internal links<td>#{links_internal}
<tr><td>External links<td>#{links_external}
</table>
}

# Yup, we loop through a second time. Cost: virtually nothing.
jsid = 0
@links.each do |key, link|
  redirect_result_tag = ""

  if link.yuck_invalid_link
    result_tag = "bad-uri"
    link.uri = link.yuck_invalid_link
  elsif not link.visited
    result_tag = "unvisited"
  elsif link.result_ok
    result_tag = "ok"
  elsif link.redirected
    result_tag = "redirect"
    rr = link.redirect_result_ok ? 'ok' : 'error'
    redirect_result_tag = %{<b class="tag #{rr}">➞#{rr} #{link.redirect_result_http_code}</b>}
  else
    result_tag = "error"
  end

  ternal_tag = link.external ? 'external' : 'internal'
  jsid += 1

  puts %{<div class="link #{result_tag}">
<b class="tag #{result_tag}">#{result_tag} #{link.result_http_code}</b>
#{redirect_result_tag}
<b class="tag #{ternal_tag}">#{ternal_tag}</b>
<a href="#" data-jsid="details_#{jsid}" class="show_details">Details▼</a>
<a href="#{link.uri}">#{link.uri}</a>
<div id="details_#{jsid}" class="details" style="display: none">
Linked from:
<ul>
}
  link.from_uris.each {|from|
    puts %{<li><a href="#{from}">#{from}</a></li>}
  }
  puts "</ul>"

  if link.yuck_invalid_link
    puts "Unable to parse this href: <code>#{link.yuck_invalid_link}</code>"
  end

  if link.visited and link.connected
    puts "Headers: <ul>"
    link.result_headers.each {|k,v|
      puts "<li>#{k}: #{v}</li>"
    }
    puts "</ul>"
  elsif link.visited and !link.connected
    puts "<p><b>Unable to connect to host!</b></p>"
  else
    puts "<p><b>Link not visited!</b></p>"
  end

  # If it's a redirection, nest the results of the redirection here
  if link.redirected
    puts %{ <div class="redirect_results">
#{redirect_result_tag}
<a href="#{link.redirects_to}">#{link.redirects_to}</a><br>}
    puts "Headers from redirected URI: <ul>"
    link.redirect_result_headers.each {|k,v|
      puts "<li>#{k}: #{v}</li>"
    }
    puts "</ul> <!-- end redirect headers list -->"
    puts "</div> <!-- end redirect nested results -->"
  end

  puts %{
</div> <!-- link details -->
</div> <!-- link -->
}
end

puts %{
<script>
// Add a click handler to toggle visibility of the details sections.
document.querySelectorAll('.show_details').forEach( function(e){
  e.addEventListener('click', function(event){
    event.preventDefault();
    var details = document.getElementById(this.dataset.jsid);
    details.style.display = (details.style.display === 'none') ? 'block' : 'none';
  });
});
</script>
Generated by chklnks.rb on #{Time.now}.
</body>
</html>
}