#!/usr/bin/env ruby
require 'net/http'
# --------------------------------------------------------------
# SETTINGS
# --------------------------------------------------------------
# When true, links flagged as external are never requested.
skip_external = false
# Upper bound on how many links are processed in one run. External
# links count toward this total as well. Tune as needed.
limit = 5_000
# Seconds allowed for opening a connection and, separately, for
# reading the response. Because each phase gets the full amount,
# a single link can stall for up to twice this long.
timeout_seconds = 10
# --------------------------------------------------------------
# --------------------------------------------------------------
# A start URL is mandatory; print a usage hint and stop without one.
if ARGV.empty?
  STDERR.puts "Usage: chklnks 'http://example.com' ['/no-follow1/ /no-follow2/']"
  exit 1
end

# Get the start URI from the command line. It is parsed exactly once
# and then normalized in place, so that its string form has the same
# shape as the normalized link keys add_link produces (for example
# "http://example.com" becomes "http://example.com/").
begin
  start_uri = URI.parse(ARGV[0])
rescue URI::InvalidURIError => e
  STDERR.puts "Invalid start URI '#{ARGV[0]}': #{e.message}"
  exit 1
end

# The crawl talks to the start host over Net::HTTP, so anything that
# is not an absolute http(s) URL with a host cannot be crawled.
if !start_uri.is_a?(URI::HTTP) || start_uri.host.to_s.empty?
  STDERR.puts "Start URI must be an absolute http(s) URL: '#{ARGV[0]}'"
  exit 1
end

start_uri.normalize!
STDERR.puts "Starting at #{start_uri}..."

# Optional second argument: whitespace-separated substrings. Links
# whose absolute URI contains one of them are checked but not scanned
# for further links. Stays nil when the argument is absent.
$no_follows = ARGV[1] ? ARGV[1].split : nil
# Link holds what is known about one unique URI. Members, in
# positional order:
#   uri, from_uris, to_uri   - the absolute URI, the pages that link
#                              to it, and the link roughly as written
#   check_only, external     - request the link but do not scan it for
#                              more links / link is treated as external
#   yuck_invalid_link        - raw href text when it could not be parsed
#   visited, connected       - progress flags set during the crawl
#   result_*                 - outcome of the HEAD request
#   redirected, redirects_to,
#   redirect_*               - outcome of following a single redirect
Link = Struct.new(*%i[
  uri from_uris to_uri
  check_only external
  yuck_invalid_link
  visited connected
  result_headers result_http_code result_ok
  redirected redirects_to
  redirect_result_headers redirect_result_http_code
  redirect_result_ok redirect_redirected
])
# Every unique absolute URI discovered so far, keyed by its string
# form. Filled in as pages are scanned for links.
@links = {}
# FIFO queue of links still waiting to be visited. Membership in
# @links is what keeps a URI from being queued more than once.
@to_visit = Queue.new
# Register the start page. Positional arguments are: uri, from_uris,
# to_uri, check_only, external. check_only must be false here, or
# the links on the start page would never be followed.
@links[start_uri.to_s] = Link.new(start_uri, [], start_uri, false, false)
# Seed the queue with that same Link object.
@to_visit << @links.fetch(start_uri.to_s)
# Returns an absolute URI for the link +to_uri+ found on the page
# +from_uri+.
#
# from_uri - URI of the page the link came from; must be absolute.
# to_uri   - URI of the link; may be relative or absolute. It is
#            normalized in place (hTtP://eXAMple.com --> http://example.com).
#
# Relative:
#   from = URI.parse("http://example.com/foo/")
#   from_to(from, URI.parse("page"))   # http://example.com/foo/page
#   from_to(from, URI.parse("../bar")) # http://example.com/bar
#   from_to(from, URI.parse("/bar"))   # http://example.com/bar
# Absolute:
#   from_to(from, URI.parse("http://ruby-doc.org/")) # http://ruby-doc.org/
#
# Returns +to_uri+ itself when it is already absolute, otherwise a new
# URI produced by merging it onto +from_uri+.
# Raises URI::BadURIError if +from_uri+ is relative.
def from_to(from_uri, to_uri)
  to_uri.normalize!

  if from_uri.relative?
    # The "from" URI is the page we retrieved the links from, so it
    # should always be absolute. The error class is namespaced under
    # URI; a bare BadURIError is not defined at top level.
    raise URI::BadURIError, "from_to(): from_uri is relative: #{from_uri}"
  end

  # An absolute link needs no resolution.
  return to_uri if to_uri.absolute?

  from_uri.merge(to_uri)
end
# Lower-case file extensions that mark a link as non-page content.
# Such links are still requested, but never scanned for more links.
NON_PAGE_EXTENSIONS = %w[.jpg .gif .jpeg .pdf .png .svg .css].freeze

# Records a link found on a page, queueing it for a visit the first
# time its absolute URI is seen.
#
# from_uri - absolute URI of the page the link was found on.
# link_str - the raw link text (href) as found on that page.
#
# A link that cannot be parsed is stored in @links under its raw text
# with only from_uris and yuck_invalid_link set; it is not queued.
# A link already in @links just gains +from_uri+ in its from_uris.
# Otherwise a new Link is stored under the absolute URI string and
# pushed onto @to_visit.
def add_link(from_uri, link_str)
  begin
    to_uri = URI.parse(link_str)
  rescue URI::Error
    # Parse failed, so the href must have been malformed. Key the
    # entry by the raw href: to_uri is still nil here, and using it
    # as the key would make every bad link overwrite the previous one.
    invalid = (@links[link_str] ||= Link.new(nil, [], nil, nil, nil, link_str))
    invalid.from_uris.push(from_uri)
    return
  end

  to_uri.fragment = nil # "page#a" and "page#b" are the same resource
  to_uri.normalize!
  absolute_uri = from_to(from_uri, to_uri)
  key = absolute_uri.to_s

  if @links.key?(key)
    # Already known: just remember one more page that links to it.
    @links[key].from_uris.push(from_uri)
  else
    # NOTE: any link written in absolute form is treated as external
    # (and check-only), including absolute links back to the same site.
    link = Link.new(
      absolute_uri,     # uri
      [from_uri],       # from_uris, starting with the first referrer
      to_uri,           # to_uri, the link roughly as written
      to_uri.absolute?, # check_only
      to_uri.absolute?  # external
    )

    # Check only (never scan) when a no-follow pattern matches.
    if $no_follows && $no_follows.any? { |pattern| key.include?(pattern) }
      link.check_only = true
    end

    # Don't try to extract links from non-page content, in case the
    # Content-Type header fails us. Testing the lower-cased path as
    # well as the raw href catches "LOGO.PNG" and "style.css?v=2".
    candidates = [link_str, absolute_uri.path.to_s]
    if candidates.any? { |s| s.downcase.end_with?(*NON_PAGE_EXTENSIONS) }
      link.check_only = true
    end

    @links[key] = link
    @to_visit.push(link)
  end
end
# Do The Crawl!
visited_count = 0
while !@to_visit.empty? and visited_count < limit
# Shift off the next queued URI to visit
link = @to_visit.shift
visited_count += 1
truncated_uri = link.uri.to_s.length > 50 ? link.uri.to_s[0..50]+"..." : link.uri.to_s
STDERR.puts "#{visited_count} done, #{@to_visit.length} left | #{truncated_uri}"
if link.external and skip_external
STDERR.puts "(Skipping external link!)"
next
end
if link.yuck_invalid_link
STDERR.puts "(Skipping invalid URI!)"
next
end
link.visited = true
################
# HEAD request #
################
host = link.uri.hostname
port = link.uri.port
path = link.uri.path
ssl = link.uri.scheme == 'https'
begin
http = Net::HTTP.start( host, port, {
:use_ssl => ssl,
:read_timeout => timeout_seconds,
:open_timeout => timeout_seconds
})
rescue
STDERR.puts "* Could not connect to host '#{host}'!"
next
end
begin
res = http.request_head(path)
rescue Net::ReadTimeout
STDERR.puts "* Connected to '#{host}', but HEAD timed out."
next
rescue
STDERR.puts "* Error while performing HEAD request: #{$!}"
next
end
link.connected = true;
# Stow the results
link.result_headers = res.each_capitalized.map { |k,v| [k, v] }.to_h
link.result_http_code = res.code
link.result_ok = res.instance_of? Net::HTTPOK
link.redirected = res.is_a? Net::HTTPRedirection
if link.result_headers.key? "Content-Type" and
link.result_headers["Content-Type"] != "text/html"
link.check_only = true
end
# Handle redirection
if link.redirected
link.redirects_to = link.result_headers["Location"]
# Try a single level of redirection (could just add the
# link to the queue, but I think it would be a pain to
# track the results that way...)
redir_result = http.request_head(link.redirects_to)
link.redirect_result_headers = redir_result.each_capitalized.map { |k,v| [k, v] }.to_h
link.redirect_result_http_code = redir_result.code
link.redirect_result_ok = redir_result.instance_of? Net::HTTPOK
link.redirect_redirected = redir_result.is_a? Net::HTTPRedirection
end
# Continue with a page scan?
next if link.check_only or !link.result_ok
###############
# GET request # (To scan page for links.)
###############
res = Net::HTTP.get_response(link.uri)
# Scan result body for ...
# Note: I _could_ also check for a chklnks - #{start_uri}
Total links | #{links_total} |
OK links | #{links_ok} |
Redirected links | #{links_redirect} |
Error links | #{links_other} |
Internal links | #{links_internal} |
External links | #{links_external} |