#! /usr/bin/python
"""
A modular version of webcrawler.py that uses two separate functions

    [canonical_URL, page_contents] = get_webpage(URL)
    links = scoop_hrefs(page_contents)

to replace the old get_webinfo() function, which did too many things.

The advantage of breaking this up into two parts is that we can see
which page causes the program to break.  (Also, retrieving a web page
from the net is very different from parsing a local html string for
href links.)

So where before we would write

    [canonical_URL, sha1_hash, links] = get_webinfo(URL, Permission, RegExp)

we now write

    [canonical_URL, page_contents] = get_webpage(URL)
    print canonical_URL
    links = scoop_hrefs(page_contents)
    for href in links:
        print href

etc.

The function get_webpage() also deals with the proper 'robots.txt'
permission issues (basically, it politely follows any restrictions)
so we don't need to worry about them here.
"""

from datetime import datetime
import sha, sys
import os.path
from urlparse import urlparse

#########################
#<<<<< get_webpage module
#########################

from robotparser import RobotFileParser
from urlparse import urlparse
from urlparse import urljoin
#import url_tools      # not needed here: the url_tools functions are defined below
import urllib

# Global constants (within get_webpage module)
#
invalid_URL = "[-- invalid url --]"
not_readable_URL = "[-- unreadable url --]"
url_errors = set([invalid_URL, not_readable_URL])

# cache of RobotFileParser objects, keyed by domain name
#
Permissions = {}

# Returns True if url is allowed by robots.txt permissions, False otherwise.
#
def can_read(url):
    domain = domain_name(url)
    if domain not in Permissions:
        rp = RobotFileParser()
        rp.set_url(urljoin('http://' + domain, 'robots.txt'))
        try:
            rp.read()
        except:
            return False
        Permissions[domain] = rp
    return Permissions[domain].can_fetch("*", url)


def get_webpage(url):
    #print "get_webpage(" + url + ")"
    if not can_read(url):
        return url, not_readable_URL

    # try to open url; if unsuccessful, return default info and exit
    #
    try:
        #print "Opening: " + url
        f = urllib.urlopen(url)
        #print "Opened: " + url
    #
    # changed IOError to "anything", since urlopen sometimes throws an
    # httplib.BadStatusLine() exception, which apparently is not
    # covered under IOError.
    #
    except IOError:
    #except:
        return url, invalid_URL

    #print "Success: " + url
    return f.geturl(), f.read()

################################
#>>>> end of get_webpage module
################################


################################
#<<<< scoop_hrefs module
################################

from BeautifulSoup import BeautifulSoup
import re

def scoop_hrefs_beautiful_soup(html_page_contents):
    links = []
    try:
        b = BeautifulSoup(html_page_contents)
    except UnicodeDecodeError:
        pass
    else:
        for tag in b.findAll('a', href=True):
            links.append(tag['href'])
    return links

# Alternate method using (precompiled) regular expressions:
#
href_regexp = re.compile(r'<a\s[^>]*href\s*=\s*"([^"]*)"', re.IGNORECASE)

def scoop_hrefs_regexp(html_page_contents):
    return href_regexp.findall(html_page_contents)

# scoop_hrefs() is the name used by the rest of the program; here it is
# bound to the BeautifulSoup version.
#
scoop_hrefs = scoop_hrefs_beautiful_soup

################################
#>>>>> scoop_hrefs module
################################
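
# Below is a minimal usage sketch of the two functions above, following the
# pattern shown in the module docstring.  The function name
# print_page_links() is illustrative only (it is not part of the original
# design) and is never called by main().
#
def print_page_links(url):
    canonical_URL, page_contents = get_webpage(url)
    print canonical_URL
    if page_contents in url_errors:
        return          # url was invalid or restricted by robots.txt
    for href in scoop_hrefs(page_contents):
        print href
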
"../foo.html") into # absolute (complete with "http:") pieces = urlparse( urljoin(originating_page, href)) url_scheme = pieces[0] url_location = pieces[1] url_path = pieces[2] return urlunsplit((url_scheme, url_location, url_path, '','')) ################################ #>>>>> url_tools module ################################ # returns True if this url is not to be followedl # i.e. is a jpg, gif, pdf, zip, or other document # def file_extension(filename) : (base, ext) = os.path.splitext(filename) return ext # Global constants # # terminal URL extensions not added to the frontier (links-to-visit) # terminal_extensions = set(['.jpg', '.pdf', '.gif', '.pdf', \ '.ps', '.gz', '.tar', '.tgz', \ '.zip', '.ppt', '.txt', '.doc', \ '.mp3', '.wav', '.mpg', '.mov', \ '.avi', '.exe', '.qt', '.jar' \ '.Z', '.mat', '.wrl' \ ]) def main(): if len(sys.argv) <= 1 : print "usage is url [max-num-edges] " print " | " print " ^ " print " (both starting-point must be a valid URL and begin with http://...)" print " Ex: http://math.nist.gov 100 " print " " sys.exit(2) starting_url = sys.argv[1] links_to_visit_set = set([]) links_to_visit_set.add(starting_url) links_already_visited_set = set([]) hash_codes_already_visited_set = set([]) # # domain_name looks like http://foo.bar.com # StartingDomainName = domain_name(starting_url) # if no max number of edges specified, do not impose a limit # max_num_edges = 0 if len(sys.argv) > 2 : max_num_edges = int(sys.argv[2]) num_edges = 0 link_number = 0 while (len(links_to_visit_set) > 0) and \ ((max_num_edges < 1) or (num_edges < max_num_edges)): current_link = links_to_visit_set.pop() if current_link in links_already_visited_set: continue canonical_url, page_contents = get_webpage(current_link) link_number = link_number + 1 print "* ", str(link_number)+":"+ \ datetime.now().strftime("%Y-%m%-%d:%H:%M:%S"), current_link sys.stdout.flush() #make link is printed if program breaks if (page_contents in url_errors): print page_contents print "" links_already_visited_set.add(current_link) continue sha1_hash = sha.new(page_contents).hexdigest() # there are four cases here when processing a new URL # a) actual URL matches canonical URL, # b) hash-code is new or not. # # In the case of a) we record both in the output as # # 113 * canonical_URL # # actual URL # # and in b) we don't follow the page links with an old hash code, # so we write a #! on the line after the hash code, e.g. # # 113 * canonical_URL # # actual URL # hash-code # #! # if (current_link != canonical_url) : print "# ", canonical_url # if this is an old hash-code, we already know the links # so just stamp a #! to denote this has page been alread processed # and continue to the next page. # if (sha1_hash in hash_codes_already_visited_set) : print "!" + sha1_hash print "" continue print sha1_hash # now extract outgoing html links (which may be relative, e.g. 
# "../index.html") links_on_page = scoop_hrefs(page_contents) # only follow outgoing links if this originates within domain # if (domain_name(canonical_url) == StartingDomainName) : for link in links_on_page: link = href2url(canonical_url, link) if ( not link.startswith("http")) : continue print link sys.stdout.flush() #make sure last link is printed if program breaks if (link not in links_to_visit_set) and \ (file_extension(link) not in terminal_extensions) : links_to_visit_set.add(link) # print "to visit ", link num_edges = num_edges + 1 links_already_visited_set.add(current_link) if (current_link != canonical_url) : links_already_visited_set.add(canonical_url) hash_codes_already_visited_set.add(sha1_hash) print "" print "\n[-- DONE --]\n" print "read ", link_number, " pages." print "number of edges : ", num_edges print " " print "[-- Frontier --]" for edge in links_to_visit_set: print edge print "[-- Frontier end --]" if __name__ == "__main__": main()