#!/usr/bin/env ruby

require 'uri'
require 'net/http'

# --------------------------------------------------------------
# SETTINGS
# --------------------------------------------------------------

# Skip checking external links entirely
skip_external = false

# Safety first! Limit the max number of links to check.
# Includes external links. Adjust to taste.
limit = 5000

# How long would you like to wait for a connection and request?
# (Connection and request will get this timeout *each*, so the
# theoretical maximum delay could be twice this amount per link.)
timeout_seconds = 10

# --------------------------------------------------------------
# --------------------------------------------------------------

if ARGV.empty?
  STDERR.puts "Usage: chklnks 'http://example.com' ['/no-follow1/ /no-follow2/']"
  exit 1
end

# Get the start URI from the command line.
start_uri = URI.parse(ARGV[0])
start_uri.normalize!

STDERR.puts "Starting at #{start_uri}..."

# Optional second argument: whitespace-separated substrings. Any URI
# containing one of them is checked but not crawled.
$no_follows = nil
if ARGV[1]
  $no_follows = ARGV[1].split
end
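# For example (hypothetical site and paths), this checks example.com but
# only HEAD-checks - never crawls into - anything under /tag/ or /search/:
#
#   ./chklnks.rb 'http://example.com' '/tag/ /search/'
#
# Each pattern is matched as a plain substring of the absolute URI
# (see add_link below), so no regex syntax is needed.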
# The Link struct stores facts about each unique URI.
Link = Struct.new(
  :uri,
  :from_uris,
  :to_uri,
  :check_only,
  :external,
  :yuck_invalid_link,
  :visited,
  :connected,
  :result_headers,
  :result_http_code,
  :result_ok,
  :redirected,
  :redirects_to,
  :redirect_result_headers,
  :redirect_result_http_code,
  :redirect_result_ok,
  :redirect_redirected
)

# The links hash stores unique absolute URIs collected as pages
# are scanned for links.
@links = {}

# Links are stored in a FIFO queue to be visited. The @links hash
# above prevents duplicate entries.
@to_visit = Queue.new

# Add our initial uri to the master link hash. Setting property
# :check_only to false or else we won't follow the links on the page!
@links[start_uri.to_s] = Link.new(
  start_uri, # uri
  [],        # from_uris
  start_uri, # to_uri
  false,     # check_only
  false      # external
)

# Prime the to_visit queue with our start URI.
@to_visit.push @links[start_uri.to_s]

# Returns an absolute URI for the given +to_uri+ URI relative to
# the +from_uri+. The +to_uri+ may be relative or absolute.
#
# Relative:
#   from = URI.parse("http://example.com/foo/")
#   from_to(from, URI.parse("page"))   # http://example.com/foo/page
#   from_to(from, URI.parse("../bar")) # http://example.com/bar
#   from_to(from, URI.parse("/bar"))   # http://example.com/bar
#
# Absolute:
#   from_to(from, URI.parse("http://ruby-doc.org/")) # http://ruby-doc.org/
def from_to(from_uri, to_uri)
  to_uri.normalize! # hTtP://eXAMple.com --> http://example.com

  if from_uri.relative?
    # This should never happen - the "from" URI is the page we
    # retrieved, from which the links are coming, so it must be
    # absolute. How did this happen!?
    raise URI::BadURIError, "from_to(): from_uri is relative: #{from_uri}"
  end

  if to_uri.absolute?
    return to_uri # absolute URI is ready to go!
  end

  return from_uri.merge(to_uri)
end

def add_link(from_uri, link_str)
  begin
    to_uri = URI.parse link_str
  rescue URI::InvalidURIError
    # Parse failed, so the href must have been malformed. (to_uri was
    # never assigned, so key the entry on the raw href string.)
    @links[link_str] = Link.new(
      nil,        # Link.uri
      [from_uri], # Link.from_uris
      nil,        # Link.to_uri
      nil,        # Link.check_only
      nil,        # Link.external
      link_str    # Link.yuck_invalid_link
    )
    return
  end

  to_uri.fragment = nil # clear any fragment portion
  to_uri.normalize!

  absolute_uri = from_to(from_uri, to_uri)

  if @links.key? absolute_uri.to_s
    # Already in hash. Add the from_uri to the Link.
    @links[absolute_uri.to_s].from_uris.push from_uri
  else
    # Create a new Link for this URI. Store in hash.
    @links[absolute_uri.to_s] = Link.new(
      absolute_uri,     # uri
      [from_uri],       # array literal with link's first "from"
      to_uri,           # the semi-unmodified link uri
      to_uri.absolute?, # Link.check_only
      to_uri.absolute?  # Link.external
    )

    # Set to check_only if a no_follows pattern matches
    if $no_follows
      $no_follows.each do |nf|
        if absolute_uri.to_s.include?(nf)
          @links[absolute_uri.to_s].check_only = true
        end
      end
    end

    # Hax: don't attempt to extract links from non-page content which has
    # been linked in an anchor tag. (In case Content-Type fails us.)
    if link_str.end_with?(
        ".jpg", ".gif", ".jpeg", ".pdf", ".png", ".svg", ".css")
      @links[absolute_uri.to_s].check_only = true
    end

    # Add it to the to_visit queue
    @to_visit.push @links[absolute_uri.to_s]
  end
end

# Do The Crawl!
visited_count = 0

while !@to_visit.empty? and visited_count < limit
  # Shift off the next queued URI to visit
  link = @to_visit.shift
  visited_count += 1

  truncated_uri =
    link.uri.to_s.length > 50 ? link.uri.to_s[0..50] + "..." : link.uri.to_s
  STDERR.puts "#{visited_count} done, #{@to_visit.length} left | #{truncated_uri}"

  if link.external and skip_external
    STDERR.puts "(Skipping external link!)"
    next
  end

  if link.yuck_invalid_link
    STDERR.puts "(Skipping invalid URI!)"
    next
  end

  link.visited = true

  ################
  # HEAD request #
  ################

  host = link.uri.hostname
  port = link.uri.port
  path = link.uri.request_uri # path + query; "/" when the path is empty
  ssl  = link.uri.scheme == 'https'

  begin
    http = Net::HTTP.start(
      host, port,
      :use_ssl      => ssl,
      :read_timeout => timeout_seconds,
      :open_timeout => timeout_seconds
    )
  rescue
    STDERR.puts "* Could not connect to host '#{host}'!"
    next
  end

  begin
    res = http.request_head(path)
  rescue Net::ReadTimeout
    STDERR.puts "* Connected to '#{host}', but HEAD timed out."
    next
  rescue
    STDERR.puts "* Error while performing HEAD request: #{$!}"
    next
  end

  link.connected = true

  # Stow the results
  link.result_headers = res.each_capitalized.map { |k, v| [k, v] }.to_h
  link.result_http_code = res.code
  link.result_ok = res.instance_of? Net::HTTPOK
  link.redirected = res.is_a? Net::HTTPRedirection

  # Content-Type usually carries a charset ("text/html; charset=utf-8"),
  # so test the prefix rather than the whole value.
  if link.result_headers.key? "Content-Type" and
     !link.result_headers["Content-Type"].start_with?("text/html")
    link.check_only = true
  end

  # Handle redirection
  if link.redirected
    link.redirects_to = link.result_headers["Location"]

    # Try a single level of redirection (could just add the
    # link to the queue, but I think it would be a pain to
    # track the results that way...) Note: this reuses the same
    # connection, so it only truly works for same-host redirects.
    redir_result = http.request_head(link.redirects_to)
    link.redirect_result_headers =
      redir_result.each_capitalized.map { |k, v| [k, v] }.to_h
    link.redirect_result_http_code = redir_result.code
    link.redirect_result_ok = redir_result.instance_of? Net::HTTPOK
    link.redirect_redirected = redir_result.is_a? Net::HTTPRedirection
  end

  # Continue with a page scan?
  next if link.check_only or !link.result_ok

  ###############
  # GET request #
  # (To scan page for links.)
  ###############

  res = Net::HTTP.get_response(link.uri)

  # Scan result body for <a href="..."> links.
  # Note: I _could_ also check for a <base> tag here.
  res.body.scan(/<a[^>]* href="([^"]+)"/i) do |href|
    # href[0] because scan nests capture group results [['foo'],['bar']]
    path = href[0]
    add_link(link.uri, path)
  end
end

if visited_count >= limit
  STDERR.puts "*NOTE* Link limit of #{limit} was reached!"
end
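# A quick sanity check of the href regex above. Note (by design, and by
# laziness) that single-quoted and unquoted href values are not matched:
#
#   html = %{<a class="x" href="/a">A</a> <a href='/b'>B</a>}
#   html.scan(/<a[^>]* href="([^"]+)"/i) # => [["/a"]]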
# Counters for stat results
links_total = 0
links_ok = 0
links_redirect = 0
links_other = 0
links_internal = 0
links_external = 0

@links.each do |key, link|
  # Stat counts
  links_total += 1
  links_ok += 1 if link.result_ok
  links_redirect += 1 if link.redirected
  links_other += 1 if link.visited and !link.result_ok and !link.redirected
  links_external += 1 if link.external
  links_internal += 1 if not link.external
  if link.redirected and !link.redirect_result_ok
    links_other += 1 # this is also an error
  end
end
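# Struct members that were never assigned default to nil, which is what
# keeps the counts above honest for links we never got to:
#
#   Link.new(URI.parse("http://example.com")).visited # => nil (falsey)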

# Emit the report as a simple HTML page on stdout.
puts %{<!DOCTYPE html>
<html>
<head><title>chklnks - #{start_uri}</title></head>
<body>
<h1>chklnks - #{start_uri}</h1>
<table>
<tr><td>Total links</td><td>#{links_total}</td></tr>
<tr><td>OK links</td><td>#{links_ok}</td></tr>
<tr><td>Redirected links</td><td>#{links_redirect}</td></tr>
<tr><td>Error links</td><td>#{links_other}</td></tr>
<tr><td>Internal links</td><td>#{links_internal}</td></tr>
<tr><td>External links</td><td>#{links_external}</td></tr>
</table>
<table>
}
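# The per-link rows below interpolate URIs into the HTML as-is. For a
# report on pages you don't control, you might first pipe each string
# through CGI.escapeHTML:
#
#   require 'cgi'
#   CGI.escapeHTML(%{<a href="x">}) # => "&lt;a href=&quot;x&quot;&gt;"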
# Yup, we loop through a second time. Cost: virtually nothing.
jsid = 0
@links.each do |key, link|
  redirect_result_tag = ""

  if link.yuck_invalid_link
    result_tag = "bad-uri"
    link.uri = link.yuck_invalid_link
  elsif not link.visited
    result_tag = "unvisited"
  elsif link.result_ok
    result_tag = "ok"
  elsif link.redirected
    result_tag = "redirect"
    rr = link.redirect_result_ok ? 'ok' : 'error'
    redirect_result_tag =
      %{➞ <span class="#{rr}">#{rr} #{link.redirect_result_http_code}</span>}
  else
    result_tag = "error"
  end

  ternal_tag = link.external ? 'external' : 'internal'

  # One row per link: status, internal/external, the URI itself, and a
  # client-side toggle (keyed by jsid) for the list of referring pages.
  jsid += 1
  puts %{<tr class="#{result_tag}">
<td>#{result_tag} #{link.result_http_code} #{redirect_result_tag}</td>
<td>#{ternal_tag}</td>
<td><a href="#{link.uri}">#{link.uri}</a>
<a href="#" onclick="document.getElementById('from#{jsid}').style.display='block'; return false;">(#{link.from_uris.length} from)</a>
<div id="from#{jsid}" style="display: none;">#{link.from_uris.join('<br>')}</div></td>
</tr>}
end

puts %{</table>
<p>Generated by chklnks.rb on #{Time.now}.</p>
</body>
</html>}
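# A typical run (hypothetical host): progress goes to stderr, so the
# HTML report can be captured cleanly from stdout:
#
#   ./chklnks.rb 'http://example.com' > report.html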