from sys import argv
from os import makedirs, unlink, sep
from os.path import dirname, exists, isdir, splitext
from string import replace, find, lower
from htmllib import HTMLParser
from urllib import urlretrieve
from urlparse import urlparse, urljoin
from formatter import DumbWriter, AbstractFormatter
from cStringIO import StringIO
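
# Simple single-domain web crawler (Python 2).
#
# Downloader fetches a single page with urlretrieve() and saves it on disk
# under a path that mirrors the URL; NetCrawler keeps a queue of URLs to
# visit, parses each saved page for anchors with htmllib.HTMLParser, and
# only follows links that stay within the starting domain.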


class Downloader(object):

    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)
 
    def filename(self, url, defFile='index.htm'):
        parsedUrl = urlparse(url, 'http:', 0)
        path = parsedUrl[1] + parsedUrl[2]
        ext = splitext(path)
        if ext[1] == '':                    # no file part, use default name
            if path[-1] == '/':
                path += defFile
            else:
                path += '/' + defFile
        localDir = dirname(path)
        if sep != '/':                      # convert to OS-specific separator
            localDir = replace(localDir, '/', sep)
        if not isdir(localDir):             # create archive dir if necessary
            if exists(localDir):
                unlink(localDir)
            makedirs(localDir)
        return path
 
    def download(self):
        # fetch the page; on failure return a string starting with '*'
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = '*** ERROR: invalid URL "%s"' % self.url
        return retval
 
    def parseAndGetLinks(self):
        # parse the saved HTML and return the list of anchors found
        self.parser = HTMLParser(AbstractFormatter(
            DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
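

# NetCrawler drives the crawl: it pops URLs off a queue (LIFO, so the crawl
# proceeds depth-first), records every processed URL in self.seen, and
# discards mailto: links, off-domain links, and duplicates.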
class NetCrawler(object):

    count = 0                           # class-wide count of downloaded pages

    def __init__(self, url):
        self.queue = [url]              # URLs still to visit
        self.seen = []                  # URLs already processed
        self.dom = urlparse(url)[1]     # domain of the starting URL
 
    def getPage(self, url):
        dl = Downloader(url)
        retval = dl.download()
        if retval[0] == '*':            # error: nothing to parse
            print retval, '... skipping parse'
            return
        NetCrawler.count += 1
        print '\n(', NetCrawler.count, ')'
        print 'Url:', url
        print 'File:', retval[0]
        self.seen.append(url)

        links = dl.parseAndGetLinks()
        for eachLink in links:
            # resolve relative links against the current page
            if eachLink[:4] != 'http' and \
                    find(eachLink, '://') == -1:
                eachLink = urljoin(url, eachLink)
            print '*', eachLink,

            if find(lower(eachLink), 'mailto:') != -1:
                print '... discarded, mailto link'
                continue

            if eachLink not in self.seen:
                if find(eachLink, self.dom) == -1:
                    print '... discarded, not in domain'
                else:
                    if eachLink not in self.queue:
                        self.queue.append(eachLink)
                        print '... new, added to queue'
                    else:
                        print '... discarded, already in queue'
            else:
                print '... discarded, already processed'
 
    def run(self):
        while self.queue:
            url = self.queue.pop()
            self.getPage(url)
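
# main() takes the starting URL from the command line if one is given,
# otherwise prompts for it interactively.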
 
def main():
    if len(argv) > 1:
        url = argv[1]
    else:
        try:
            url = raw_input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url:
        return
    netCrawler = NetCrawler(url)
    netCrawler.run()
 
if __name__ == '__main__':
    main()
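
# Example invocation (the script filename "crawl.py" here is illustrative,
# not part of the original listing):
#
#     $ python crawl.py http://www.example.com/
#
# Pages are written beneath a local directory named after the host, e.g.
# www.example.com/index.htm, and the crawl stays within that domain.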
 