root/calcium/trunk/calcium.py

Revision 159, 3.0 kB (checked in by dsandler, 2 years ago)

The refactoring monster was here. Now Digg's link-extraction code is factored out, so other sites can be added.

  • Property svn:executable set to
Line 
1 #!/usr/bin/env python
2
3 """
4 Calcium: A script to seed the CoralCDN with pages from various "new links"
5 feeds.  Calcium reads these feeds through the FeedTree proxy running on
6 localhost, to ensure prompt discovery of new links (without placing undue
7 stress on the webserver hosting the feed).
8
9 The idea is that these "new links" will (presumably) show up on these feeds
10 (and therefore be discovered by Calcium) before they become "hot" (and
11 consequently unavailable, due to the Digg/Reddit/Slashdot effect).
12
13 See also:
14     * FeedTree - http://feedtree.net
15     * CoralCDN - http://coralcdn.org
16 """
17
18 import sys ; sys.path.append('lib')
19
20 import shelve, urllib2, urlparse
21 from BeautifulSoup import BeautifulSoup
22 import feedparser
23 import coralcache
24
25 from extractors import *
26 from utils import *
27
class Feed:
    """A feed URL paired with the extractor used on that feed's item links.

    ``link_extractor`` is a class (or any zero-argument callable); it is
    called once here and the resulting object is stored on the feed.
    """

    def __init__(self, url, link_extractor=DefaultExtractor):
        # Address of the feed itself.
        self.url = url
        # Instantiate the extractor so every feed owns its own instance.
        self.link_extractor = link_extractor()
31
# File name of the shelve database that remembers feed links already handled.
URL_CACHE_FILE = 'feeds.shelf'

# Prefix prepended to a feed URL so it is fetched through the FeedTree proxy
# listening on localhost.
PROXY_PREFIX = 'http://127.0.0.1:8500/cache/'

# Feeds to crawl.  A feed with no explicit extractor uses Feed's default.
CRAWL_FEEDS = [
    Feed('http://digg.com/rss/indexdig.xml', link_extractor=DiggExtractor),
    Feed('http://reddit.com/new.rss'),
    Feed('http://del.icio.us/rss/popular/'),
]
41    
def coral_seed(url):
    """Announce the coralized form of *url* on stdout and fetch *url*.

    The fetched content is discarded; the request is made only for its
    side effect.  Any exception raised by ``urlfetch`` propagates to the
    caller.

    Returns True when the fetch completed without raising.
    """
    sys.stdout.write('[%s]' % coralcache.coralize(url))
    # NOTE(review): the meaning of the second argument (1) is defined by
    # utils.urlfetch, which is not visible here -- presumably it requests
    # the fetch through CoralCDN; confirm against utils.
    urlfetch(url, 1)
    return True
46
47
def _feedtree_fetch(url):
    """Fetch *url* through the FeedTree proxy and return the fetched data."""
    return urlfetch(PROXY_PREFIX + url)


def _crawl_feed(feedinfo, links_seen):
    """Seed every not-yet-seen link of one feed, recording each in *links_seen*.

    feedinfo   -- a Feed; its ``url`` is fetched and its ``link_extractor``
                  maps each entry link to the URL that gets seeded.
    links_seen -- open shelf keyed by entry link; updated in place.

    Exceptions from fetching, parsing or seeding propagate to the caller,
    which abandons the remainder of this feed.
    """
    sys.stdout.write("Fetching from FeedTree: " + feedinfo.url)
    page = _feedtree_fetch(feedinfo.url)
    sys.stdout.write(" (%d b)\n" % len(page))

    doc = feedparser.parse(page)
    print("  - Items: %d" % len(doc.entries))

    for entry in doc.entries:
        link = str(entry.link)
        if link in links_seen:
            continue

        sys.stdout.write(" + Examining feed URL: " + link)
        coral_url = feedinfo.link_extractor.get_link(link)

        sys.stdout.write("\n   => Coralizing new URL: %s " % coral_url)
        coral_seed(coral_url)
        sys.stdout.write(" (OK)\n")

        # Record the link only after it was seeded successfully, so a
        # failed attempt is retried on the next run.
        links_seen[link] = True


def main():
    """Crawl every feed in CRAWL_FEEDS and seed the links not seen before.

    Links already handled are remembered in the shelf URL_CACHE_FILE.
    I/O and HTTP errors are reported and end processing of the current
    feed only; the next feed is still crawled.  The shelf is closed on
    every exit path.

    Exits the process with status 0 on completion, or 1 when interrupted
    from the keyboard.
    """
    links_seen = shelve.open(URL_CACHE_FILE, 'c')
    exit_code = 0

    try:
        print("Calcium: loaded %d old URLs" % len(links_seen))

        for feedinfo in CRAWL_FEEDS:
            try:
                _crawl_feed(feedinfo, links_seen)
            # HTTPError is a subclass of IOError (via URLError), so it has
            # to be tested first or this branch can never be reached.
            except urllib2.HTTPError as e:
                print("\nHTTP exception: " + repr(e))
            except IOError as e:
                print("\nIO exception: " + repr(e))
    except KeyboardInterrupt:
        print("Interrupted...")
        exit_code = 1
    finally:
        # Always flush and release the shelf, including on unexpected
        # errors that are allowed to propagate.
        links_seen.close()

    sys.exit(exit_code)
102
# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Note: See TracBrowser for help on using the browser.