| | 2 | |
|---|
| | 3 | """ |
|---|
| | 4 | Calcium: A script to seed the CoralCDN with pages from various "new links" |
|---|
| | 5 | feeds. Calcium reads these feeds through the FeedTree proxy running on |
|---|
| | 6 | localhost, to ensure prompt discovery of new links (without placing undue |
|---|
| | 7 | stress on the webserver hosting the feed). |
|---|
| | 8 | |
|---|
| | 9 | The idea is that these "new links" will (presumably) show up on these feeds |
|---|
| | 10 | (and therefore be discovered by Calcium) before they become "hot" (and |
|---|
| | 11 | consequently unavailable, due to the Digg/Reddit/Slashdot effect). |
|---|
| | 12 | |
|---|
| | 13 | See also: |
|---|
| | 14 | * FeedTree - http://feedtree.net |
|---|
| | 15 | * CoralCDN - http://coralcdn.org |
|---|
| | 16 | """ |
|---|
def __init__(self, url, requires_crawl=False):
    """Store `url` and the `requires_crawl` flag on the instance."""
    self.url = url
    self.requires_crawl = requires_crawl
|---|
| 13 | | |
|---|
# Value of the User-Agent header sent with every request made by urlfetch().
USER_AGENT = 'Calcium/1.0 +http://feedtree.net/ (FeedTree + CoralCDN)'


def urlfetch(url, length=-1):
    """Fetch `url` and return the response body as a string.

    url    -- the URL to request; sent with the Calcium User-Agent header.
    length -- maximum number of bytes to read from the response; the
              default of -1 reads the whole body.

    Errors raised by urllib2 while opening or reading the URL propagate
    to the caller.
    """
    req = urllib2.Request(url)
    req.add_header('User-Agent', USER_AGENT)
    response = urllib2.build_opener().open(req)
    try:
        return response.read(length)
    finally:
        # Release the connection even for partial reads or read errors,
        # instead of waiting for the response to be garbage collected.
        response.close()
|---|
| 21 | | |
|---|
def coral_seed(url):
    """Request `url` through its coralized address and return True.

    The coralized form of `url` (as produced by coralcache.coralize) is
    written to stdout in square brackets and then fetched. Any exception
    raised by urlfetch() propagates to the caller.
    """
    # Compute the coralized address locally. Previously this function read
    # a module-level `coral_url`, which holds the *un-coralized* link set by
    # the crawl loop, so the request never went through CoralCDN.
    coral_url = coralcache.coralize(url)
    sys.stdout.write('[%s]' % coral_url)
    # Only a single byte of the response is read; the content is not used.
    urlfetch(coral_url, 1)
    return True
|---|
def __init__(self, url, link_extractor=DefaultExtractor):
    """Store `url` and a freshly created link extractor on the instance."""
    self.url = url
    # `link_extractor` is called with no arguments; the object it returns
    # is what gets stored, not the callable itself.
    self.link_extractor = link_extractor()
|---|
| 77 | | links_seen[link] = True |
|---|
| 78 | | |
|---|
| 79 | | except IOError, e: |
|---|
| 80 | | print "\nIO exception: " + `e` |
|---|
| 81 | | #raise e |
|---|
| 82 | | except urllib2.HTTPError, e: |
|---|
| 83 | | print "\nHTTP exception: " + `e` |
|---|
| 84 | | # raise e |
|---|
| 85 | | except KeyboardInterrupt: |
|---|
| 86 | | print "Interrupted..." |
|---|
| 87 | | links_seen.close() |
|---|
| 88 | | sys.exit(1) |
|---|
# Main crawl: fetch every feed in CRAWL_FEEDS via feedtree_fetch(), and for
# each entry link not yet recorded in links_seen, extract the target URL and
# seed it with coral_seed().
try:
    for feedinfo in CRAWL_FEEDS:
        try:
            sys.stdout.write("Fetching from FeedTree: " + feedinfo.url)
            page = feedtree_fetch(feedinfo.url)
            sys.stdout.write(" (%d b)\n" % len(page))

            doc = feedparser.parse(page)
            print(" - Items: %d" % len(doc.entries))

            for entry in doc.entries:
                link = str(entry.link)
                if link in links_seen:
                    continue

                sys.stdout.write(" + Examining feed URL: " + link)
                coral_url = feedinfo.link_extractor.get_link(link)

                sys.stdout.write("\n => Coralizing new URL: %s "
                                 % coral_url)

                coral_seed(coral_url)
                sys.stdout.write(" (OK)\n")

                # Recorded only after seeding succeeded, so a link whose
                # fetch raised is examined again on a later run.
                links_seen[link] = True

        # urllib2.HTTPError is a subclass of IOError (via URLError), so it
        # must be listed first; otherwise this handler is never reached.
        except urllib2.HTTPError as e:
            print("\nHTTP exception: " + repr(e))
        except IOError as e:
            # An error abandons the rest of this feed; the outer loop
            # continues with the next feed.
            print("\nIO exception: " + repr(e))

# A KeyboardInterrupt raised anywhere above lands here; close links_seen
# before exiting with a non-zero status.
except KeyboardInterrupt:
    print("Interrupted...")
    links_seen.close()
    sys.exit(1)
|---|
| | 99 | |
|---|