from sys import argv
from os import makedirs, unlink, sep
from os.path import dirname, exists, isdir, splitext
from string import replace, find, lower
from htmllib import HTMLParser
from urllib import urlretrieve
from urlparse import urlparse, urljoin
from formatter import DumbWriter, AbstractFormatter
from cStringIO import StringIO


class Downloader(object):
    """Downloads one URL to a local file and extracts the links it contains."""

    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, defFile='index.htm'):
        # Map the URL to a local path, creating directories as needed.
        parsedUrl = urlparse(url, 'http:', 0)
        path = parsedUrl[1] + parsedUrl[2]
        ext = splitext(path)
        if ext[1] == '':                # no file extension -- use default file
            if path[-1] == '/':
                path += defFile
            else:
                path += '/' + defFile
        localDir = dirname(path)
        if sep != '/':                  # convert to OS-specific path separator
            localDir = replace(localDir, '/', sep)
        if not isdir(localDir):
            if exists(localDir):
                unlink(localDir)
            makedirs(localDir)
        return path

    def download(self):
        # Fetch the page; on failure return an error string instead of a tuple.
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url)
        return retval

    def parseAndGetLinks(self):
        # Parse the saved file and return the list of anchors found in it.
        self.parser = HTMLParser(AbstractFormatter(
            DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist


class NetCrawler(object):
    count = 0                           # number of pages downloaded

    def __init__(self, url):
        self.queue = [url]              # URLs still to be processed
        self.seen = []                  # URLs already processed
        self.dom = urlparse(url)[1]     # restrict the crawl to this domain

    def getPage(self, url):
        dl = Downloader(url)
        retval = dl.download()
        if retval[0] == '*':            # error string, not a (file, headers) tuple
            print retval, '... skipping parse'
            return
        NetCrawler.count += 1
        print '\n(', NetCrawler.count, ')'
        print 'URL:', url
        print 'FILE:', retval[0]
        self.seen.append(url)

        links = dl.parseAndGetLinks()
        for eachLink in links:
            # Make relative links absolute before examining them.
            if eachLink[:4] != 'http' and \
                    find(eachLink, '://') == -1:
                eachLink = urljoin(url, eachLink)
                print '*', eachLink

            if find(lower(eachLink), 'mailto:') != -1:
                print '... discarded, mailto link'
                continue

            if eachLink not in self.seen:
                if find(eachLink, self.dom) == -1:
                    print '... discarded, not in domain'
                else:
                    if eachLink not in self.queue:
                        self.queue.append(eachLink)
                        print '... new, added to queue'
                    else:
                        print '... discarded, already in queue'
            else:
                print '... discarded, already processed'

    def run(self):
        # Keep downloading until the queue of pending URLs is empty.
        while self.queue:
            url = self.queue.pop()
            self.getPage(url)

def main():
    if len(argv) > 1:
        url = argv[1]
    else:
        try:
            url = raw_input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url:
        return
    netCrawler = NetCrawler(url)
    netCrawler.run()

if __name__ == '__main__':
    main()
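
# Usage sketch (the URL and script name below are only illustrative, assuming
# the file is saved as crawl.py and run under Python 2.x):
#
#   python crawl.py http://www.example.com/
#
# Alternatively, run the script with no arguments and type the starting URL at
# the prompt. The crawler stays within the seed URL's domain and saves each
# downloaded page under a local directory tree mirroring the URL path.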
 