Index: calcium/trunk/calcium.py =================================================================== --- calcium/trunk/calcium.py (revision 158) +++ calcium/trunk/calcium.py (revision 159) @@ -1,3 +1,18 @@ #!/usr/bin/env python + +""" +Calcium: A script to seed the CoralCDN with pages from various "new links" +feeds. Calcium reads these feeds through the FeedTree proxy running on +localhost, to ensure prompt discovery of new links (without placing undue +stress on the webserver hosting the feed). + +The idea is that these "new links" will (presumably) show up on these feeds +(and therefore be discovered by Calcium) before they become "hot" (and +consequently unavailable, due to the Digg/Reddit/Slashdot effect). + +See also: + * FeedTree - http://feedtree.net + * CoralCDN - http://coralcdn.org +""" import sys ; sys.path.append('lib') @@ -8,23 +23,13 @@ import coralcache +from extractors import * +from utils import * + class Feed: - def __init__(self, url, requires_crawl=False): - self.url = url ; self.requires_crawl = requires_crawl - -USER_AGENT = 'Calcium/1.0 +http://feedtree.net/ (FeedTree + CoralCDN)' - -def urlfetch(url, length=-1): - req = urllib2.Request(url) - req.add_header('User-Agent', USER_AGENT) - opener = urllib2.build_opener() - return opener.open(req).read(length) - -def coral_seed(url): - sys.stdout.write('[%s]' % coralcache.coralize(url)) - page = urlfetch(coral_url, 1) - return True + def __init__(self, url, link_extractor=DefaultExtractor): + self.url = url ; self.link_extractor = link_extractor() CRAWL_FEEDS = [ - Feed('http://digg.com/rss/indexdig.xml', requires_crawl=True), + Feed('http://digg.com/rss/indexdig.xml', link_extractor=DiggExtractor), Feed('http://reddit.com/new.rss'), Feed('http://del.icio.us/rss/popular/'), @@ -34,63 +39,66 @@ URL_CACHE_FILE = 'feeds.shelf' - -links_seen = shelve.open(URL_CACHE_FILE,'c') - -print "Calcium: loaded %d old URLs" % len(links_seen) + +def coral_seed(url): + sys.stdout.write('[%s]' % 
coralcache.coralize(url)) + page = urlfetch(url, 1) + return True -def feedtree_fetch(url): - return urlfetch(PROXY_PREFIX + url) +def main(): + links_seen = shelve.open(URL_CACHE_FILE,'c') -try: - for feedinfo in CRAWL_FEEDS: - try: - sys.stdout.write("Fetching from FeedTree: " + feedinfo.url) - page = feedtree_fetch(feedinfo.url) - sys.stdout.write(" (%d b)\n" % len(page)) - #print page[0:80] - - doc = feedparser.parse(page) - #print doc - # print " - Title: %s" % doc.title - print " - Items: %d" % len(doc.entries) + print "Calcium: loaded %d old URLs" % len(links_seen) - for e in doc.entries: - link = str(e.link) - if not link in links_seen: - sys.stdout.write(" + Examining feed URL: " + link) - if not feedinfo.requires_crawl: - coral_url = link - else: - page = urlfetch(link) - soup = BeautifulSoup(page) -#

- title = soup.first('h3') - coral_url = title.a['href'] - - sys.stdout.write("\n => Coralizing new URL: %s " - % coral_url) - coral_seed(coral_url) - sys.stdout.write(" (OK)\n") + def feedtree_fetch(url): + return urlfetch(PROXY_PREFIX + url) - links_seen[link] = True - - except IOError, e: - print "\nIO exception: " + `e` - #raise e - except urllib2.HTTPError, e: - print "\nHTTP exception: " + `e` - # raise e - except KeyboardInterrupt: - print "Interrupted..." - links_seen.close() - sys.exit(1) + try: + for feedinfo in CRAWL_FEEDS: + try: + sys.stdout.write("Fetching from FeedTree: " + feedinfo.url) + page = feedtree_fetch(feedinfo.url) + sys.stdout.write(" (%d b)\n" % len(page)) + #print page[0:80] + + doc = feedparser.parse(page) + #print doc + # print " - Title: %s" % doc.title + print " - Items: %d" % len(doc.entries) -except KeyboardInterrupt: - print "Interrupted..." + for e in doc.entries: + link = str(e.link) + if not link in links_seen: + sys.stdout.write(" + Examining feed URL: " + link) + coral_url = feedinfo.link_extractor.get_link(link) + + sys.stdout.write("\n => Coralizing new URL: %s " + % coral_url) + + coral_seed(coral_url) + sys.stdout.write(" (OK)\n") + + links_seen[link] = True + + except IOError, e: + print "\nIO exception: " + `e` + #raise e + except urllib2.HTTPError, e: + print "\nHTTP exception: " + `e` + # raise e + except KeyboardInterrupt: + print "Interrupted..." + links_seen.close() + sys.exit(1) + + except KeyboardInterrupt: + print "Interrupted..." 
+ links_seen.close() + sys.exit(1) + links_seen.close() - sys.exit(1) + sys.exit(0) -links_seen.close() -sys.exit(0) +if __name__ == '__main__': + main() Index: calcium/trunk/extractors.py =================================================================== --- calcium/trunk/extractors.py (revision 159) +++ calcium/trunk/extractors.py (revision 159) @@ -0,0 +1,17 @@ +import sys ; sys.path.append('lib') +from BeautifulSoup import BeautifulSoup +from utils import * + +class DefaultExtractor: + def __init__(self): pass + def get_link(self,link): + return link + +class DiggExtractor(DefaultExtractor): + def get_link(self,link): + page = urlfetch(link) + soup = BeautifulSoup(page) +#