#! /usr/bin/python

"""

 A modular version of webcrawler.py that uses two separate functions
 
 [canonical_URL, page_contents] = get_webpage(URL)
 links = scoop_hrefs(page_contents)

 to replace the old get_webinfo() function, which did too many things. 
 The advantage of breaking this up into two parts is that we can see which
 pages causes the program to break.  (Also, retriveing a web page from the
 net is very different than parsing a local html string for href links.)

 So before we would write

 [canonical_URL, sha1_hash, links] = get_webinfo(URL, Permission, RegExp)

 we write

  [canonical_URL, page_contents] = get_webpage(ULR)
  print canonical_URL
  links = scoop_hrefs(page_contents)
  for href in links
     print href
     etc.

 
 The function get_webpage() also deals with the proper 'robots.txt'
 permission issues (basically, it politely follows any restrictions)
 so we don't need to worry about it here.
""" 


from datetime import datetime
import sha, sys
import os.path
from urlparse import urlparse


#########################
#<<<<< get_webpage module 
#########################

from robotparser import RobotFileParser
from urlparse import urlparse
from urlparse import urljoin
import url_tools
import urllib

# Global constants (within get_webpage module)

invalid_URL =       "[-- invalid url --]"
not_readable_URL =  "[-- unreadable url --]"
url_errors = set([invalid_URL, not_readable_URL])
Permissions = {}

#  True, if url is allowed by robots.txt permission.  False, otherwise
#
def can_read(url):

  domain = domain_name(url)
  if domain not in Permissions :
         rp = RobotFileParser()
         rp.set_url(urljoin('http://' + domain, 'robots.txt'))
         try :
            rp.read()
         except:
            return False
         
         Permissions[domain] = rp

  return Permissions[domain].can_fetch("*", url)


def get_webpage(url):

  #print "get_webpage(" + url + ")"
  if not can_read(url)  :
      return url, not_readable_URL

  #  try to open url, if unsuccessful, return default info and exit
  #
  try :
    #print "Opening: " + url
    f = urllib.urlopen(url) 
    #print "Opened: " + url
  #
  # changed IOError to "anything" , since urlopen sometimes throws
  # httplib.BadStatusLine() exception, which apparently is not
  # covered under IOError.
  #
  except IOError :
  #except :
    return url,  invalid_URL
    
  #print "Sucess: " + url
  return f.geturl(), f.read()


################################
#>>>> end of get_webpage module
################################


################################
#<<<<  scoop_hrefs module
################################

from BeautifulSoup import BeautifulSoup
import re

def scoop_hrefs_beautiful_soup(html_page_contents):
  links = []
  try :
     b = BeautifulSoup(html_page_contents)
  except UnicodeDecodeError:
     pass
  else: 
     for tag in b.findAll('a', href=True):
       links.append(tag['href'])


  return links


#Alternate method using (precompiled) regular expressions:


href_regexp = re.compile('<a\s+href\s*?="\s*?(.*?)"', \
        re.IGNORECASE | re.MULTILINE)

def scoop_hrefs_regexp(html_page_contents):
  return href_regexp.findall(html_page_contents)


# use both methods and combine the results
#
def scoop_hrefs(html_page_contents):
    
    return set.union(set(scoop_hrefs_beautiful_soup(html_page_contents)), \
      set(scoop_hrefs_regexp(html_page_contents)))


################################
#>>>>>>  scoop_hrefs module
################################


################################
#<<<<<<  url_tools module
################################

from urlparse import urlparse
from urlparse import urljoin
from urlparse import urlunsplit

def domain_name(url):
    return urlparse(url)[1]


#  Takes "http://foo.com/page1" and "../page2" to
#  create "http://foo.com/page2"
# 
def href2url(originating_page, href):

    # strip out leading and trailing white space
    # and convert to lower-case

    href = href.strip().lower()   
    href = href.replace("%7","~")

    # Parse out the query and anchor section of url
    # and make relative urls (e.g. "../foo.html") into
    # absolute (complete with "http:")

    pieces = urlparse( urljoin(originating_page, href))
    url_scheme = pieces[0]
    url_location = pieces[1]
    url_path = pieces[2]
    return  urlunsplit((url_scheme, url_location, url_path, '',''))


################################
#>>>>>  url_tools module
################################


# returns True if this url is not to be followedl
# i.e. is a jpg, gif, pdf, zip, or other document
#

def file_extension(filename) :
    (base, ext) = os.path.splitext(filename)
    return ext 


# Global constants
#


# terminal URL extensions not added to the frontier (links-to-visit)
#
terminal_extensions  =  set(['.jpg', '.pdf', '.gif', '.pdf', \
                     '.ps',  '.gz',  '.tar', '.tgz', \
                     '.zip', '.ppt', '.txt', '.doc', \
                     '.mp3', '.wav', '.mpg', '.mov', \
                     '.avi', '.exe', '.qt',  '.jar'  \
                     '.Z',   '.mat', '.wrl'          \
                     ])


def main():

  if len(sys.argv) <= 1  :
    print "usage is url  [max-num-edges] "
    print "          | "
    print "          ^ "
    print " (both starting-point must be a valid URL and begin with http://...)"
    print " Ex:  http://math.nist.gov 100 "
    print " "
    sys.exit(2)

  starting_url = sys.argv[1]

  links_to_visit_set = set([])
  links_to_visit_set.add(starting_url)

  links_already_visited_set = set([])
  hash_codes_already_visited_set = set([])


  #
  # domain_name looks like http://foo.bar.com
  #
  StartingDomainName =  domain_name(starting_url)

  
  # if no max number of edges specified, do not impose a limit
  #
  max_num_edges = 0
  if len(sys.argv) > 2 :
        max_num_edges = int(sys.argv[2])

  num_edges = 0
  link_number = 0

  while (len(links_to_visit_set) > 0) and \
      ((max_num_edges < 1) or (num_edges < max_num_edges)):

    current_link = links_to_visit_set.pop()

    if current_link in links_already_visited_set: continue

    canonical_url, page_contents = get_webpage(current_link)
    
    link_number = link_number + 1
    print "* ", str(link_number)+":"+   \
        datetime.now().strftime("%Y-%m%-%d:%H:%M:%S"), current_link
    sys.stdout.flush()  #make link is printed if program breaks

    if (page_contents in url_errors):
       print page_contents
       print ""
       links_already_visited_set.add(current_link)
       continue

    sha1_hash =   sha.new(page_contents).hexdigest()

   
    #   there are four cases here when processing a new URL
    #   a) actual URL matches canonical URL, 
    #   b) hash-code is new or not.
    #  
    #   In the case of a) we record both in the output as
    #  
    #   113 * canonical_URL
    #   # actual URL
    #   
    #   and in b) we don't follow the page links with an old hash code,
    #   so we write a #! on the line after the hash code, e.g.
    #  
    #   113 * canonical_URL
    #   # actual URL
    #   hash-code
    #   #!
    #  


    if (current_link != canonical_url) :
      print "# ", canonical_url


    #  if this is an old hash-code, we already know the links
    #  so just stamp a #! to denote this has page been alread processed
    #  and continue to the next page.
    # 
    if (sha1_hash in hash_codes_already_visited_set) :
      print "!" + sha1_hash
      print ""
      continue
    
    print sha1_hash
  

    # now extract outgoing html links (which may be relative, e.g.
    #  "../index.html")

    links_on_page = scoop_hrefs(page_contents)
    
    
    # only follow outgoing links if this originates within domain
    #
    if (domain_name(canonical_url) == StartingDomainName) :
       for link in links_on_page:
         link = href2url(canonical_url, link)
         if ( not link.startswith("http")) : continue
         print link
         sys.stdout.flush()  #make sure last link is printed if program breaks
         if (link not in links_to_visit_set) and \
             (file_extension(link) not in terminal_extensions) :
                 links_to_visit_set.add(link)
                 # print "to visit ", link
         num_edges = num_edges + 1
   
    links_already_visited_set.add(current_link)
    if (current_link != canonical_url) :
      links_already_visited_set.add(canonical_url)
    hash_codes_already_visited_set.add(sha1_hash)

    print ""
      

  print "\n[-- DONE --]\n"

  print "read ", link_number,  " pages."
  print "number of edges : ", num_edges

  print " "
  print "[-- Frontier --]" 
  for edge in links_to_visit_set:
      print edge
  print "[-- Frontier end --]" 
    
if __name__ == "__main__":
  main()