module Spidr::Links

Public Instance Methods

each()
Alias for: each_url
each_meta_redirect() { |redirect| ... } click to toggle source

Enumerates over the meta-redirect links in the page.

@yield [link]

If a block is given, it will be passed every meta-redirect link
from the page.

@yieldparam [String] link

A meta-redirect link from the page.

@return [Enumerator]

If no block is given, an enumerator object will be returned.

@since 0.3.0

# File lib/spidr/links.rb, line 23
# Enumerates over the meta-redirect links in the page.
#
# Only `<meta>` elements that carry both an `http-equiv` and a `content`
# attribute are inspected, and only when `html?` and `doc` are truthy.
#
# @yield [link]
#   If a block is given, it will be passed every meta-redirect link
#   from the page.
#
# @yieldparam [String] link
#   The non-whitespace text that follows `url=` at the end of the
#   `content` attribute of a refresh `<meta>` element.
#
# @return [Enumerator]
#   If no block is given, an enumerator object will be returned.
#
# @since 0.3.0
def each_meta_redirect
  return enum_for(:each_meta_redirect) unless block_given?
  return unless html? && doc

  search('//meta[@http-equiv and @content]').each do |node|
    next unless node.get_attribute('http-equiv') =~ /refresh/i

    # The "url=" key is matched case-insensitively, just like the
    # http-equiv value above, so that e.g. "0; URL=http://host/" is
    # recognised and a lowercase "url=" inside the query string is not
    # mistaken for the redirect target.
    redirect = node.get_attribute('content').match(/url=(\S+)$/i)

    yield redirect[1] if redirect
  end
end
each_redirect() { |location| ... } click to toggle source

Enumerates over every HTTP or meta-redirect link in the page.

@yield [link]

The given block will be passed every redirection link from the page.

@yieldparam [String] link

An HTTP or meta-redirect link from the page.

@return [Enumerator]

If no block is given, an enumerator object will be returned.

@since 0.3.0

# File lib/spidr/links.rb, line 76
# Enumerates over every HTTP or meta-redirect link in the page.
#
# The `location` entry of `headers` takes precedence; the page-level
# meta redirects are only consulted when that entry is nil.
#
# @yield [link]
#   The given block will be passed every redirection link from the page.
#
# @yieldparam [String] link
#   An HTTP or meta-redirect link from the page.
#
# @return [Enumerator]
#   If no block is given, an enumerator object will be returned.
#
# @since 0.3.0
def each_redirect(&block)
  return enum_for(:each_redirect) unless block

  case (target = headers['location'])
  when nil
    # no location header: fall back to the page-level meta redirects
    each_meta_redirect(&block)
  when Array
    # several location values: pass each one to the block
    target.each(&block)
  else
    # the common case: a single location value
    yield target
  end
end
each_url() { |url| ... } click to toggle source

Enumerates over every absolute URL in the page.

@yield [url]

The given block will be passed every URL in the page.

@yieldparam [URI::HTTP] url

An absolute URL in the page.

@return [Enumerator]

If no block is given, an enumerator object will be returned.

@since 0.3.0

# File lib/spidr/links.rb, line 174
# Enumerates over every absolute URL in the page.
#
# Each link produced by `each_link` is passed through `to_absolute`;
# links for which `to_absolute` returns a falsy value are skipped.
#
# @yield [url]
#   The given block will be passed every URL in the page.
#
# @yieldparam [URI::HTTP] url
#   An absolute URL in the page.
#
# @return [Enumerator]
#   If no block is given, an enumerator object will be returned.
#
# @since 0.3.0
def each_url
  return enum_for(:each_url) unless block_given?

  each_link do |href|
    absolute = to_absolute(href)

    yield absolute if absolute
  end
end
Also aliased as: each
meta_redirect?() click to toggle source

Returns a boolean indicating whether or not page-level meta redirects are present in this page.

@return [Boolean]

Specifies whether the page includes page-level redirects.
# File lib/spidr/links.rb, line 46
# Reports whether the page contains at least one page-level meta redirect.
#
# @return [Boolean]
#   `true` when `each_meta_redirect` produces a non-nil first element,
#   `false` otherwise.
def meta_redirect?
  first_link = each_meta_redirect.first

  !first_link.nil?
end
meta_redirects() click to toggle source

The meta-redirect links of the page.

@return [Array<String>]

All meta-redirect links in the page.

@since 0.3.0

# File lib/spidr/links.rb, line 58
# The meta-redirect links of the page.
#
# @return [Array<String>]
#   Every link produced by `each_meta_redirect`, collected into an Array.
#
# @since 0.3.0
def meta_redirects
  each_meta_redirect.entries
end
redirects_to() click to toggle source

URLs that this document redirects to.

@return [Array<String>]

The links that this page redirects to (usually found in a
location header or by way of a page-level meta redirect).
# File lib/spidr/links.rb, line 99
# URLs that this document redirects to.
#
# @return [Array<String>]
#   Every link produced by `each_redirect`, collected into an Array.
def redirects_to
  targets = each_redirect

  targets.to_a
end
to_absolute(link) click to toggle source

Normalizes and expands a given link into a proper URI.

@param [String] link

The link to normalize and expand.

@return [URI::HTTP, nil]

The normalized URI, or nil if the link could not be merged with the page's URL.
# File lib/spidr/links.rb, line 205
# Normalizes and expands a given link into a proper URI.
#
# @param [String] link
#   The link to normalize and expand. It is converted with `to_s`
#   before being merged with the page's `url`.
#
# @return [URI::HTTP, nil]
#   The normalized URI, or `nil` if merging the link raised an error.
def to_absolute(link)
  new_url = begin
              url.merge(link.to_s)
            rescue StandardError
              # Links that cannot be merged are skipped. Only
              # StandardError is rescued so that interrupts, exits and
              # other fatal exceptions still propagate to the caller.
              return nil
            end

  path = new_url.path
  return new_url unless path

  # ensure that paths begin with a leading '/' for URI::FTP
  # (built as a new String so the URI's own path is never mutated in place)
  path = "/#{path}" if new_url.scheme == 'ftp' && !path.start_with?('/')

  # make sure the path does not contain any .. or . directories,
  # since URI::Generic#merge cannot normalize paths such as
  # "/stuff/../"
  new_url.path = URI.expand_path(path)

  new_url
end
urls() click to toggle source

Absolute URIs from within the page.

@return [Array<URI::HTTP>]

The links from within the page, converted to absolute URIs.
# File lib/spidr/links.rb, line 192
# Absolute URIs from within the page.
#
# @return [Array<URI::HTTP>]
#   Every URL produced by `each_url`, collected into an Array.
def urls
  each_url.entries
end