root/calcium/trunk/extractors.py

Revision 159, 0.6 kB (checked in by dsandler, 2 years ago)

The refactoring monster was here. Now Digg's link-extraction code is factored out, so other sites can be added.

Line 
1 import sys ; sys.path.append('lib')
2 from BeautifulSoup import BeautifulSoup
3 from utils import *
4
class DefaultExtractor:
    """Pass-through extractor used for sites that need no special handling.

    Subclasses override ``get_link`` to map a submitted link to a
    different URL.
    """

    def __init__(self):
        pass

    def get_link(self, link):
        """Return ``link`` unchanged."""
        return link
9
class DiggExtractor(DefaultExtractor):
    """Extractor that pulls the linked story URL out of a fetched page."""

    def get_link(self, link):
        """Return the href of the anchor inside the page's first ``<h3>``.

        The page at ``link`` is fetched with ``urlfetch`` and parsed with
        BeautifulSoup.  If the page has no ``<h3>``, the heading has no
        ``<a>``, or the anchor has no ``href``, ``link`` itself is returned
        (the same result ``DefaultExtractor`` gives) instead of failing on a
        ``None`` dereference.
        """
        page = urlfetch(link)
        soup = BeautifulSoup(page)
        # Markup this relies on (sample from a story page):
        # <div class="news-full" id="main0" style="z-index:1000"><div class="news-body"><h3 id="title1"><a href="http://www.chron.com/disp/story.mpl/ap/politics/3654155.html" >
        heading = soup.first('h3')
        if heading is None:
            return link
        anchor = heading.a
        if anchor is None:
            return link
        try:
            return anchor['href']
        except KeyError:
            # Anchor present but without an href attribute.
            return link
Note: See TracBrowser for help on using the browser.