colorful rat Ratfactor.com > Dave's Repos

chklnks.rb

Website link crawler and checker written in Ruby
git clone http://ratfactor.com/repos/chklnks.rb/chklnks.rb.git

chklnks.rb/chklnks

Download raw file: chklnks

#!/usr/bin/env ruby

require 'net/http'

# --------------------------------------------------------------
# SETTINGS
# --------------------------------------------------------------
# Skip checking external links entirely
skip_external = false

# Safety first! Limit the max number of links to check.
# Includes external links. Adjust to taste.
limit = 5000

# How long would you like to wait for a connection and request?
# (Connection and request will get this timeout *each*, so the
# theoretical maximum delay could be twice this amount per link.)
timeout_seconds = 10
# --------------------------------------------------------------
# --------------------------------------------------------------

if ARGV.empty?
  STDERR.puts "Usage: chklnks 'http://example.com' ['/no-follow1/ /no-follow2/']"
  exit 1
end

# Get the start URI from the command line. Fail early (with a readable
# message) if it cannot be parsed or is not something we can crawl
# over HTTP.
begin
  start_uri = URI.parse(ARGV[0])
rescue URI::InvalidURIError => e
  STDERR.puts "Invalid start URI '#{ARGV[0]}': #{e.message}"
  exit 1
end

unless start_uri.is_a?(URI::HTTP) && start_uri.host
  STDERR.puts "Start URI must be an absolute http:// or https:// URL: #{ARGV[0]}"
  exit 1
end

# Normalize once and keep the normalized URI: its string form is the key
# used for the start page in the link hash, and discovered links are
# normalized the same way before being looked up.
start_uri.normalize!
STDERR.puts "Starting at #{start_uri}..."

# Optional second argument: whitespace-separated substrings. Any link
# whose absolute URI contains one of them is checked but not scanned.
$no_follows = nil

if ARGV[1]
  $no_follows = ARGV[1].split
end

# The Link struct stores fAcTz about each unique URI.
Link = Struct.new(
  :uri,                       # absolute URI of the link
  :from_uris,                 # array of page URIs on which the link was found
  :to_uri,                    # the link URI as it appeared on the page
  :check_only,                # true: request it, but don't scan it for links
  :external,                  # true: link is not part of the crawled site
  :yuck_invalid_link,         # raw href text when it could not be parsed
  :visited,                   # true once the crawl loop has processed it
  :connected,                 # true once the HEAD request has succeeded
  :result_headers,            # hash of response headers from the HEAD request
  :result_http_code,          # HTTP status code string of the HEAD request
  :result_ok,                 # true if the HEAD response was 200 OK
  :redirected,                # true if the HEAD response was a redirection
  :redirects_to,              # Location header value of the redirection
  :redirect_result_headers,   # same as the result_* fields above, but for
  :redirect_result_http_code, # the HEAD request made against the redirect
  :redirect_result_ok,        # target
  :redirect_redirected
)

# The links hash stores unique absolute URIs collected as pages
# are scanned for links.
@links = {}

# Links are stored in a FIFO queue to be visited. The @links hash
# above prevents duplicate entries.
@to_visit = Queue.new

# Add our initial uri to the master link hash. Setting property
# :check_only to false or else we won't follow the links on the page!
@links[start_uri.to_s] = Link.new(
  start_uri, # uri
  [],        # from_uris
  start_uri, # to_uri
  false,     # check_only
  false      # external
)

# Prime the to_visit queue with our start URI.
@to_visit.push @links[start_uri.to_s]


# Link targets with these endings are never scanned for further links.
NON_PAGE_EXTENSIONS = %w[.jpg .gif .jpeg .pdf .png .svg .css].freeze

# Returns an absolute URI for the given +to_uri+ URI relative to
# the +from_uri+. The +to_uri+ may be relative or absolute.
# Note that +to_uri+ is normalized in place.
# Relative:
#   from = URI.parse("http://example.com/foo/")
#   from_to(from, URI.parse("page"))    # http://example.com/foo/page
#   from_to(from, URI.parse("../bar"))  # http://example.com/bar
#   from_to(from, URI.parse("/bar"))    # http://example.com/bar
# Absolute:
#   from_to(from, URI.parse("http://ruby-doc.org/")) # http://ruby-doc.org/
# Raises URI::BadURIError if +from_uri+ is relative.
def from_to(from_uri, to_uri)
  to_uri.normalize! # hTtP://eXAMple.com --> http://example.com

  if from_uri.relative?
    # This should never have happened - the "from" URI should have
    # been absolute - this is the page we retrieved from which the
    # links are coming. How did this happen!?
    raise URI::BadURIError, "from_to(): from_uri is relative: #{from_uri}"
  end

  return to_uri if to_uri.absolute? # absolute URI is ready to go!

  from_uri.merge(to_uri)
end

# Records the link +link_str+ (an href value) found on the page +from_uri+.
# A new, unique link is stored in @links and queued in @to_visit; a link we
# already know about just gets +from_uri+ added to its from_uris list.
# An href which cannot be parsed is stored (keyed by its raw text) with
# yuck_invalid_link set and is NOT queued.
def add_link(from_uri, link_str)
  begin
    to_uri = URI.parse link_str
  rescue StandardError
    # parse failed, so the href must have been malformed. Key the entry on
    # the raw href so that distinct bad links don't overwrite each other.
    bad_link = (@links[link_str] ||= Link.new(
      nil,     # Link.uri
      [],      # Link.from_uris
      nil,     # Link.to_uri
      nil,     # Link.check_only
      nil,     # Link.external
      link_str # Link.yuck_invalid_link
    ))
    bad_link.from_uris.push from_uri
    return
  end

  to_uri.fragment = nil # clear any fragment portion
  to_uri.normalize!

  absolute_uri = from_to(from_uri, to_uri)
  key = absolute_uri.to_s

  if @links.key? key
    # Already in hash. Add the from_uri to the Link
    @links[key].from_uris.push from_uri
    return
  end

  # A link written out in absolute form is only external when it points
  # somewhere other than the host of the page it was found on. (Only pages
  # of the crawled site are ever scanned, so from_uri is always internal.)
  same_site = absolute_uri.is_a?(URI::HTTP) && absolute_uri.host == from_uri.host
  external = to_uri.absolute? && !same_site

  # Create new Link for this Link.
  link = Link.new(
    absolute_uri, # uri
    [from_uri],   # array literal with link's first "from"
    to_uri,       # the semi-unmodified link uri
    external,     # Link.check_only
    external      # Link.external
  )

  # Set to check_only if a no_follows pattern matches
  if $no_follows && $no_follows.any? { |nf| key.include?(nf) }
    link.check_only = true
  end

  # Hax: don't attempt to extract links from non-page content which has
  # been linked in an anchor tag. (In case Content-Type fails us.)
  link.check_only = true if link_str.end_with?(*NON_PAGE_EXTENSIONS)

  # Store in hash and add it to the to_visit queue
  @links[key] = link
  @to_visit.push link
end

# Net::HTTP.start options for connecting to +uri+ with our timeouts.
def connection_options(uri, timeout_seconds)
  {
    :use_ssl => uri.scheme == 'https',
    :read_timeout => timeout_seconds,
    :open_timeout => timeout_seconds
  }
end

# Response headers as a plain hash with capitalized ("Content-Type") keys.
def response_headers(res)
  res.each_capitalized.map { |k, v| [k, v] }.to_h
end

# Tries a single level of redirection for +link+ (could just add the
# link to the queue, but I think it would be a pain to track the results
# that way...) and stows the outcome in the link's redirect_* fields.
# Never raises: a redirect that cannot be followed is left marked not-ok.
def follow_redirect(link, timeout_seconds)
  link.redirects_to = link.result_headers["Location"]

  # Pessimistic defaults so the report always has something to print.
  link.redirect_result_headers = {}
  link.redirect_result_ok = false
  link.redirect_redirected = false

  if link.redirects_to.nil?
    STDERR.puts "* Redirected, but no Location header was sent!"
    return
  end

  begin
    # Location may be relative, or may point at a completely different
    # host, so resolve it and talk to whichever host it names.
    target = link.uri.merge(link.redirects_to)
    unless target.is_a?(URI::HTTP)
      STDERR.puts "* Not following non-HTTP redirect to '#{target}'."
      return
    end

    redir_result = Net::HTTP.start(target.hostname, target.port,
                                   connection_options(target, timeout_seconds)) do |conn|
      conn.request_head(target.request_uri)
    end
  rescue StandardError => e
    STDERR.puts "* Error while following redirect to '#{link.redirects_to}': #{e}"
    return
  end

  link.redirect_result_headers = response_headers(redir_result)
  link.redirect_result_http_code = redir_result.code
  link.redirect_result_ok = redir_result.is_a?(Net::HTTPOK)
  link.redirect_redirected = redir_result.is_a?(Net::HTTPRedirection)
end

# GETs the page at +link+ and feeds every <a href="sweet links!">...</a>
# found in the body to add_link.
# Note: I _could_ also check for a <base href=""> tag here.
def scan_for_links(link, timeout_seconds)
  uri = link.uri

  begin
    page = Net::HTTP.start(uri.hostname, uri.port,
                           connection_options(uri, timeout_seconds)) do |conn|
      conn.request_get(uri.request_uri)
    end
  rescue StandardError => e
    STDERR.puts "* Error while performing GET request: #{e}"
    return
  end

  body = page.body
  return unless body

  body.scan(/<a[^>]* href="([^"]+)"/i) do |href|
    # href[0] because scan nests capture group results [['foo'],['bar']]
    add_link(uri, href[0])
  end
end


# Do The Crawl!
visited_count = 0

while !@to_visit.empty? && visited_count < limit
  # Shift off the next queued URI to visit
  link = @to_visit.shift
  visited_count += 1
  uri_str = link.uri.to_s
  truncated_uri = uri_str.length > 50 ? uri_str[0..50] + "..." : uri_str
  STDERR.puts "#{visited_count} done, #{@to_visit.length} left | #{truncated_uri}"

  if link.external && skip_external
    STDERR.puts "(Skipping external link!)"
    next
  end

  if link.yuck_invalid_link
    STDERR.puts "(Skipping invalid URI!)"
    next
  end

  # mailto:, tel:, javascript: and friends can't be checked over HTTP.
  # Leave them unvisited rather than reporting a bogus connection error.
  unless link.uri.is_a?(URI::HTTP)
    STDERR.puts "(Skipping non-HTTP link!)"
    next
  end

  link.visited = true

  ################
  # HEAD request #
  ################
  host = link.uri.hostname
  port = link.uri.port

  begin
    http = Net::HTTP.start(host, port, connection_options(link.uri, timeout_seconds))
  rescue StandardError
    STDERR.puts "* Could not connect to host '#{host}'!"
    next
  end

  begin
    # request_uri (not path) so that any query string is sent as well
    res = http.request_head(link.uri.request_uri)
  rescue Net::ReadTimeout
    STDERR.puts "* Connected to '#{host}', but HEAD timed out."
    next
  rescue StandardError => e
    STDERR.puts "* Error while performing HEAD request: #{e}"
    next
  ensure
    # Always release the connection, even when skipping to the next link
    http.finish if http.started?
  end
  link.connected = true

  # Stow the results
  link.result_headers = response_headers(res)
  link.result_http_code = res.code
  link.result_ok = res.is_a?(Net::HTTPOK)
  link.redirected = res.is_a?(Net::HTTPRedirection)

  # Only HTML pages get scanned. The header usually carries parameters
  # ("text/html; charset=utf-8"), so compare just the leading media type.
  content_type = link.result_headers["Content-Type"]
  if content_type && !content_type.downcase.start_with?("text/html")
    link.check_only = true
  end

  # Handle redirection
  follow_redirect(link, timeout_seconds) if link.redirected

  # Continue with a page scan?
  next if link.check_only || !link.result_ok

  ###############
  # GET request # (To scan page for links.)
  ###############
  scan_for_links(link, timeout_seconds)
end

# Links still queued at this point were cut off by the limit.
if visited_count >= limit && !@to_visit.empty?
  STDERR.puts "*NOTE* Link limit of #{limit} was reached!"
end

# Counters for stat results
links_total = 0
links_ok = 0
links_redirect = 0
links_other = 0
links_internal = 0
links_external = 0

@links.each_value do |link|
  # Stat counts
  links_total += 1
  links_ok += 1 if link.result_ok
  links_redirect += 1 if link.redirected
  links_other += 1 if link.visited && !link.result_ok && !link.redirected

  if link.external
    links_external += 1
  else
    links_internal += 1
  end

  if link.redirected && !link.redirect_result_ok
    links_other += 1 # this is also an error
  end
end

puts %{<!DOCTYPE html>
<html><head>
<title>chklnks - #{start_uri}</title>
<meta charset="utf-8">
<style>
code { background: #fff; padding: 5px; }
.link { border: 1px solid #444; }
.link.unvisited { background: #aaa; }
.link.redirect { background: #ffb; }
.link.error { background: #f88; }
.link.bad-uri { background: #aaa; }
.link.ok { background: #8f8; }
.tag { color: #FFF; width: 100px; font-size: small; display: inline-block;
    padding: 2px; text-align: center; }
.tag.unvisited { background: #666; }
.tag.redirect { background: #562; }
.tag.error { background: #900; }
.tag.bad-uri { background: #900; }
.tag.ok { background: #083; }
.tag.external { background: #04B; }
.tag.internal { background: #0B6; }
.show_details { background: #FFF; color: #000; text-decoration: none; }
</style>
</head>
<body>
<h1>chklnks - #{start_uri}</h1>
<table>
<tr><td>Total links<td>#{links_total}
<tr><td>OK links<td>#{links_ok}
<tr><td>Redirected links<td>#{links_redirect}
<tr><td>Error links<td>#{links_other}
<tr><td>Internal links<td>#{links_internal}
<tr><td>External links<td>#{links_external}
</table>
}

# Yup, we loop through a second time. Cost: virtually nothing.
# Everything printed below that came from a crawled page or a remote
# server (hrefs, header names/values, Location targets) is untrusted
# text, so it is HTML-escaped before being written into the report.
require 'cgi/escape'

# Returns +value+ as a string which is safe to place in HTML text or
# inside a double-quoted attribute. nil becomes the empty string.
def h(value)
  CGI.escapeHTML(value.to_s)
end

# Writes one collapsible <div> per collected link: status tags, the link
# itself, the pages it was linked from, the response headers and (for
# redirects) the result of requesting the redirect target.
jsid = 0
@links.each do |_key, link|
  redirect_result_tag = ""
  display_uri = link.uri

  if link.yuck_invalid_link
    result_tag = "bad-uri"
    # No parsed URI exists for these, so show the raw href instead
    display_uri = link.yuck_invalid_link
  elsif !link.visited
    result_tag = "unvisited"
  elsif link.result_ok
    result_tag = "ok"
  elsif link.redirected
    result_tag = "redirect"
    rr = link.redirect_result_ok ? 'ok' : 'error'
    redirect_result_tag = %{<b class="tag #{rr}">&#10142;#{rr} #{h(link.redirect_result_http_code)}</b>}
  else
    result_tag = "error"
  end

  ternal_tag = link.external ? 'external' : 'internal'
  jsid += 1

  puts %{<div class="link #{result_tag}">
  <b class="tag #{result_tag}">#{result_tag} #{h(link.result_http_code)}</b>
  #{redirect_result_tag}
  <b class="tag #{ternal_tag}">#{ternal_tag}</b>
  <a href="#" data-jsid="details_#{jsid}" class="show_details">Details&#9660;</a>
  <a href="#{h(display_uri)}">#{h(display_uri)}</a>
  <div id="details_#{jsid}" class="details" style="display: none">
    Linked from:
    <ul>
  }
  link.from_uris.each do |from|
    puts %{<li><a href="#{h(from)}">#{h(from)}</a></li>}
  end
  puts "</ul>"

  if link.yuck_invalid_link
    puts "Unable to parse this href: <code>#{h(link.yuck_invalid_link)}</code>"
  end

  if link.visited && link.connected
    puts "Headers: <ul>"
    (link.result_headers || {}).each do |k, v|
      puts "<li>#{h(k)}: #{h(v)}</li>"
    end
    puts "</ul>"
  elsif link.visited && !link.connected
    puts "<p><b>Unable to connect to host!</b></p>"
  else
    puts "<p><b>Link not visited!</b></p>"
  end

  # If it's a redirection, nest the results of the redirection here
  if link.redirected
    puts %{ <div class="redirect_results">
      #{redirect_result_tag}
      <a href="#{h(link.redirects_to)}">#{h(link.redirects_to)}</a><br>}
    puts "Headers from redirected URI: <ul>"
    # The headers may be missing if the redirect target was never reached
    (link.redirect_result_headers || {}).each do |k, v|
      puts "<li>#{h(k)}: #{h(v)}</li>"
    end
    puts "</ul> <!-- end redirect headers list -->"
    puts "</div> <!-- end redirect nested results -->"
  end

  puts %{
    </div> <!-- link details -->
  </div> <!-- link -->
  }
end

puts %{
<script>
  // Add click handler to toggle visibility of details sections.
  document.querySelectorAll('.show_details').forEach( function(e){
    e.addEventListener('click', function(event){
      event.preventDefault();
      var details = document.getElementById(this.dataset.jsid);
      details.style.display = (details.style.display === 'none') ? 'block' : 'none';
    });
  });
</script>
Generated by chklnks.rb on #{Time.now}.
</body>
</html>
}