class Socialinvestigator::Client::NetClient

Public Instance Methods

check_regex( mashed_regex, value ) click to toggle source
# File lib/socialinvestigator/client/net.rb, line 474
# Test +value+ against a pattern of the form "regex" or "regex\;template".
#
# The part before the first "\;" is compiled as a regular expression. When a
# second part is present it is used as a result template in which "\1" and
# "\2" are replaced by the corresponding capture groups of the match.
#
# mashed_regex - String pattern, optionally followed by "\;" and a template.
#                An empty (or nil) pattern is treated as the empty regex,
#                which matches any non-nil value.
# value        - String to test; nil never matches.
#
# Returns false when there is no match, true when the pattern matched and has
# no template, and the filled-in template String otherwise.
def check_regex( mashed_regex, value )
  # "".split returns [], which would leave regex nil and make Regexp.new
  # raise a TypeError; fall back to the empty pattern instead.
  regex, result = mashed_regex.to_s.split( /\\;/ )
  md = Regexp.new( regex || "" ).match( value )

  return false unless md
  return true unless result

  # Block form of gsub: the captured text is inserted literally, so a
  # backslash inside the capture is not re-read as a back-reference, and a
  # single pass keeps inserted text from being substituted a second time.
  result.gsub( /\\([12])/ ) { md[Regexp.last_match( 1 ).to_i] || "" }
end
find_id_path( links, regex ) click to toggle source
# File lib/socialinvestigator/client/net.rb, line 459
# Pull identifiers out of a list of link strings.
#
# Every link that matches +regex+ contributes its first capture group, or the
# whole link when the regex captured nothing. Candidates containing "share"
# are dropped, and duplicates are removed while keeping first-seen order.
#
# links - Enumerable of link Strings.
# regex - Regexp applied to each link.
#
# Returns an Array of unique identifier Strings.
def find_id_path( links, regex )
  found = []

  links.each do |link|
    match = regex.match( link )
    next unless match

    candidate = match[1] || link
    next if candidate =~ /share/

    found << candidate
  end

  found.uniq
end
get_knowledge( url, noreverse = false, debug = false ) click to toggle source

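Looks up the domain behind the given URL and records what can be learned about it: WHOIS registration details, the server's IP address and location (unless noreverse is set), page metadata, social-media links, and detected technologies.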

# File lib/socialinvestigator/client/net.rb, line 130
# Gather what can be learned about the site behind +url+.
#
# The lookup runs in stages, each recording its findings on a
# Socialinvestigator::Client::Net::PageKnowledge instance:
#
# 1. WHOIS data for the domain (registrar, dates, contacts).
# 2. Unless +noreverse+ is set: IP address, reverse DNS name, geo-location
#    and the organizations named in the IP address's WHOIS record.
# 3. The HTTP response: server header, page title, feeds, meta tags,
#    Twitter Card and Open Graph data.
# 4. Links in the page body: Twitter, email, LinkedIn, Instagram, Facebook,
#    Google Plus and GitHub.
# 5. Technology detection, when an apps definition is configured.
#
# This method performs network requests (WHOIS, DNS, HTTP).
#
# url       - String URL of the page to investigate.
# noreverse - when true, skip stage 2 (default: false).
# debug     - passed through to PageKnowledge.new (default: false).
#
# Returns the populated PageKnowledge instance.
def get_knowledge( url, noreverse = false, debug = false )
  data = Socialinvestigator::Client::Net::PageKnowledge.new( debug )
  dns = Socialinvestigator::Client::Net::DNS.new

  uri = URI( url )

  data.remember( :hostname, uri.hostname )

  domain = dns.find_domain(uri.hostname)

  data.remember( :domain, domain )

  # Look at the domain info

  whois = Whois.lookup( domain )

  data.remember( :registered?, whois.registered? )
  if whois.registrar
    data.remember( :registrar_name, whois.registrar.name )
    data.remember( :registrar_url, whois.registrar.url )
  end

  data.remember( :created_on, whois.created_on.strftime( "%Y-%m-%d") ) if whois.created_on
  data.remember( :expires_on, whois.expires_on.strftime( "%Y-%m-%d") ) if whois.expires_on
  data.remember( :updated_on, whois.updated_on.strftime( "%Y-%m-%d") ) if whois.updated_on

  whois.contacts.each do |c|
    data.another( :emails, c.email.downcase ) if c.email
    case c.type
    when Whois::Record::Contact::TYPE_REGISTRANT
      data.remember( :registrant_contact, c )
    when Whois::Record::Contact::TYPE_ADMINISTRATIVE
      data.remember( :admin_contact, c )
    when Whois::Record::Contact::TYPE_TECHNICAL
      data.remember( :technical_contact, c )
    end
  end

  # Keep the raw body of any WHOIS part that only the Blank parser handles.
  whois.parts.each do |p|
    if Whois::Record::Parser.parser_for(p).is_a? Whois::Record::Parser::Blank
      puts "Couldn't find a parser for #{p.host}:"
      data.another( :unparsed_whois, p.body )
    end
  end


  if !noreverse
    ip_address = Dnsruby::Resolv.getaddress uri.host

    if ip_address
      data.remember :ip_address, ip_address
      begin
        data.remember :server_name, Dnsruby::Resolv.getname( ip_address )
      rescue Dnsruby::NXDomain
        # Couldn't do the reverse lookup
      end

      # NOTE(review): confirm that this geo-location endpoint is still
      # available; the fields read below assume its JSON response format.
      location_info = HTTParty.get('http://freegeoip.net/json/' + ip_address)

      data.remember :server_country, location_info['country_name']
      data.remember :server_location, [location_info['city'], location_info['region_name']].collect { |x| (x != nil && x != "") ? x : nil }.select { |x| x }.join( ", ")
      data.remember :server_latitude, location_info['latitude']
      data.remember :server_longitude, location_info['longitude']

      ip_whois = Whois.lookup ip_address

      ip_whois.to_s.each_line.select { |x| x=~/Organization/ }.each do |org|
        if org =~ /Organization:\s*(.*)\n/
          data.another :server_ip_owner, $1
        end
      end
    end
  end

  # Load up the response

  # client = HTTPClient.new
  # client.ssl_config.verify_mode = OpenSSL::SSL::VERIFY_NONE
  # response = client.get( url )
  #       # @ssl = p.peer_cert

  response = HTTParty.get url

  # require 'pp'
  # pp response.headers

  data.remember( :server, response.headers['server'] )


  # Parse the HTML

  parsed = Nokogiri.parse response.body

  data.remember( :page_title, parsed.title )

  # RSS Feed (a <link> without an href is ignored rather than dereferenced):
  rss_link = parsed.css( 'link[type="application/rss+xml"]' ).first
  if rss_link && rss_link.attributes['href']
    data.remember( :rss_feed, rss_link.attributes['href'].value )
  end

  # Atom Feed:
  atom_link = parsed.css( 'link[type="application/atom+xml"]' ).first
  if atom_link && atom_link.attributes['href']
    data.remember( :atom_feed, atom_link.attributes['href'].value )
  end



  # Meta tags

  meta = {}
  parsed.css( "meta[name]" ).each do |t|
    meta[t.attributes["name"].value] = t.attributes["content"].value if t.attributes["content"]
  end

  # Same guard as above: a <meta property=...> tag may lack a content attribute.
  parsed.css( "meta[property]" ).each do |t|
    meta[t.attributes["property"].value] = t.attributes["content"].value if t.attributes["content"]
  end

  # require 'pp'
  # pp meta

  data.remember( :author, meta['author'] )
  data.remember( :description, meta['description'] )
  data.remember( :keywords, meta['keywords'] )
  data.remember( :generator, meta['generator'])

  data.remember( :responsive, true )  if meta["viewport"] =~ /width=device-width/


  # Check Twitter Card:

  data.remember( :twitter_title, meta["twitter:title"] )
  data.remember( :twitter_creator, meta["twitter:creator"] )
  if /@(.*)/.match( meta["twitter:creator"] )
    data.another( :twitter_ids, $1 )
  end
  data.remember( :twitter_site_author, meta["twitter:site"] )
  if /@(.*)/.match( meta["twitter:site"] )
    data.another( :twitter_ids, $1 )
  end
  data.remember( :twitter_image, meta["twitter:image"] )
  data.remember( :twitter_description, meta["twitter:description"] )

  # Open Graph

  data.remember( :og_title, meta["og:title"] )
  data.remember( :og_description, meta["og:description"] )
  data.remember( :og_type, meta["og:type"] )
  data.remember( :og_image, meta["og:image"] )


  # Look inside the body:


  # Twitter

  # Look for twitter links
  twitter_links = hrefs( matching_links( parsed, /twitter.com\/[^\/]*$/ ), true )
  data.remember( :twitter_links, twitter_links )

  find_id_path( twitter_links, /twitter.com\/([^\/]*$)/  ).each do |id|
    data.another( :twitter_ids, id )
  end

  # Look for twitter shared links

  twitter_shared = matching_links( parsed, /twitter.com\/share/ )

  twitter_shared.each do |l|
    text = l['data-text']

    # See if there's a "by @user" in the text
    if /by\s*@([^\s]*)/.match text
      data.another( :twitter_ids, $1 )
      data.remember( :twitter_by, $1 )
    end

    # Look for all "@usernames" in the text
    # NOTE(review): this selects any word containing "@" and then drops its
    # first character, so a word such as "a@b" is recorded as "@b" - confirm
    # whether only words starting with "@" were intended.
    if text
      text.split.select { |x| x =~ /@\s*/ }.each do |id|
        data.another( :twitter_ids, id.slice( 1,100 ) ) # We don't want the @
      end
    end

    # See if there's a via link on the anchor tag
    if l['data-via']
      data.another( :twitter_ids, l['data-via'])
    end


    # Spaces are not valid inside a URI, so encode them before parsing (the
    # intent links below get the same treatment). URI.decode was removed in
    # Ruby 3.0; the default parser's unescape does the same percent-decoding.
    share_query = URI( l['href'].to_s.gsub( / /, "+" ) ).query || ""
    possible_via = URI::DEFAULT_PARSER.unescape( share_query ).split( /&/ ).collect { |x| x.split( /=/  ) }.select { |x| x[0] == 'via' }
    if possible_via.size > 0
      data.another( :twitter_ids, possible_via[0][1] )
    end
  end

  # Look for intent

  twitter_intent = hrefs( matching_links( parsed, /twitter.com\/intent/ ) )

  twitter_intent.each do |t|
    # An intent link without a query string has a nil query; treat it as empty.
    intent_query = URI( t.gsub( / /, "+" ) ).query || ""
    URI::DEFAULT_PARSER.unescape( intent_query ).split( /&/ ).select do |x|
      x =~ /via/
    end.collect do |x|
      x.gsub( /via=/, "" )
    end.each do |via|
      data.another( :twitter_ids, via )
    end
  end
  # Look for email

  email_links = hrefs( matching_links( parsed, /mailto:/ ) )
  find_id_path( email_links, /mailto:(.*@.*\..*)/ ).each do |email|
    data.another( :emails, email )
  end

  # Linkedin

  linkedin_links = hrefs( matching_links( parsed, /linkedin.com/ ), true )
  data.remember( :linkedin_links, linkedin_links )

  # Instagram

  instagram_links = hrefs( matching_links( parsed, /instagram.com/ ) )
  data.remember( :instagram_links, instagram_links )

  # Facebook

  facebook_links = hrefs( matching_links( parsed, /facebook.com\/[^\/]*$/ ) )
  data.remember( :facebook_links, facebook_links )

  # Google plus

  googleplus_links = hrefs( matching_links( parsed, /plus.google.com\/[^\/]*$/ ) )
  data.remember( :googleplus_links, googleplus_links )

  # Github

  github_links = hrefs( matching_links( parsed, /github.com\/[^\/]*$/ ) )
  data.remember( :github_links, github_links )


  # Bonus!

  # Get this file from https://github.com/ElbertF/Wappalyzer/tree/master/share

  # Each app entry may carry 'html', 'meta', 'headers' and 'script' checks;
  # any check that matches records the app and whatever it 'implies'.
  apps = Socialinvestigator::Config.config.apps_json
  if apps
    scripts = parsed.css( "script" ).collect { |x| x['src'] }.select { |x| x }
    # puts scripts

    apps['apps'].each do |app,checks|
      if checks['html']
        html_array = checks['html']
        html_array = [checks['html']] if html_array.is_a? String

        html_array.each do |html|
          result = check_regex( html, response.body )
          if result
            data.another :technologies, app
            data.another :technologies, checks['implies']
          end
        end
      end

      if checks['meta']
        checks['meta'].each do |k,code|
          result = check_regex( code, meta[k] )
          if result
            data.another :technologies, app
            data.another :technologies, checks['implies']
          end
        end
      end

      if checks['headers']
        checks['headers'].each do |k,code|
          result = check_regex( code, response.headers[k] )
          if result
            data.another :technologies, app
            data.another :technologies, checks['implies']
          end
        end
      end

      if checks['script']
        script_array = checks['script']
        script_array = [checks['script']] if script_array.is_a? String
        script_array.each do |script_regex|
          scripts.each do |script|
            result = check_regex( script_regex, script)
            if result
              data.another :technologies, app
              data.another :technologies, checks['implies']
            end
          end
        end
      end
    end
  end
  data
end
hrefs( links, filter_shared = false ) click to toggle source
# File lib/socialinvestigator/client/net.rb, line 447
# Collect the distinct href values of a set of link elements.
#
# links         - Enumerable of objects that answer ['href'].
# filter_shared - when true, hrefs containing "share" are left out
#                 (default: false).
#
# Returns an Array of unique href values in first-seen order.
def hrefs( links, filter_shared = false )
  urls = links.map { |link| link['href'] }
  urls = urls.reject { |url| url =~ /share/ } if filter_shared
  urls.uniq
end