root/calcium/trunk/lib/feedparser.py

Revision 156, 140.0 kB (checked in by dsandler, 3 years ago)

Import of Calcium, a script to automatically scrape feeds and send their links to the CoralCDN. By reading feeds through a FeedTreeProxy, Calcium has excellent update resolution (read: new links faster).

Calcium can be pointed at feeds of links that have a high probability of becoming oversubscribed to ensure that Coral has a copy of the page before the server is crushed.

Examples of such feeds include:

  • Property svn:executable set to
Line 
1 #!/usr/bin/env python
2 """Universal feed parser
3
4 Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds
5
6 Visit http://feedparser.org/ for the latest version
7 Visit http://feedparser.org/docs/ for the latest documentation
8
9 Required: Python 2.1 or later
10 Recommended: Python 2.3 or later
11 Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
12 """
13
14 __version__ = "4.2-pre-" + "$Revision: 1.95 $"[11:15] + "-cvs"
15 __license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
16
17 Redistribution and use in source and binary forms, with or without modification,
18 are permitted provided that the following conditions are met:
19
20 * Redistributions of source code must retain the above copyright notice,
21   this list of conditions and the following disclaimer.
22 * Redistributions in binary form must reproduce the above copyright notice,
23   this list of conditions and the following disclaimer in the documentation
24   and/or other materials provided with the distribution.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE."""
37 __author__ = "Mark Pilgrim <http://diveintomark.org/>"
38 __contributors__ = ["Jason Diamond <http://injektilo.org/>",
39                     "John Beimler <http://john.beimler.org/>",
40                     "Fazal Majid <http://www.majid.info/mylos/weblog/>",
41                     "Aaron Swartz <http://aaronsw.com/>",
42                     "Kevin Marks <http://epeus.blogspot.com/>"]
43 _debug = 0
44
45 # HTTP "User-Agent" header to send to servers when downloading feeds.
46 # If you are embedding feedparser in a larger application, you should
47 # change this to your application name and URL.
48 USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__
49
50 # HTTP "Accept" header to send to servers when downloading feeds.  If you don't
51 # want to send an Accept header, set this to None.
52 ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
53
54 # List of preferred XML parsers, by SAX driver name.  These will be tried first,
55 # but if they're not installed, Python will keep searching through its own list
56 # of pre-installed parsers until it finds one that supports everything we need.
57 PREFERRED_XML_PARSERS = ["drv_libxml2"]
58
59 # If you want feedparser to automatically run HTML markup through HTML Tidy, set
60 # this to 1.  Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
61 # or utidylib <http://utidylib.berlios.de/>.
62 TIDY_MARKUP = 0
63
64 # List of Python interfaces for HTML Tidy, in order of preference.  Only useful
65 # if TIDY_MARKUP = 1
66 PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
67
68 # ---------- required modules (should come with any Python distribution) ----------
69 import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
70 try:
71     from cStringIO import StringIO as _StringIO
72 except:
73     from StringIO import StringIO as _StringIO
74
75 # ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
76
77 # gzip is included with most Python distributions, but may not be available if you compiled your own
78 try:
79     import gzip
80 except:
81     gzip = None
82 try:
83     import zlib
84 except:
85     zlib = None
86
87 # If a real XML parser is available, feedparser will attempt to use it.  feedparser has
88 # been tested with the built-in SAX parser, PyXML, and libxml2.  On platforms where the
89 # Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
90 # versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
91 try:
92     import xml.sax
93     xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
94     from xml.sax.saxutils import escape as _xmlescape
95     _XML_AVAILABLE = 1
96 except:
97     _XML_AVAILABLE = 0
98     def _xmlescape(data):
99         data = data.replace('&', '&amp;')
100         data = data.replace('>', '&gt;')
101         data = data.replace('<', '&lt;')
102         return data
103
104 # base64 support for Atom feeds that contain embedded binary data
105 try:
106     import base64, binascii
107 except:
108     base64 = binascii = None
109
110 # cjkcodecs and iconv_codec provide support for more character encodings.
111 # Both are available from http://cjkpython.i18n.org/
112 try:
113     import cjkcodecs.aliases
114 except:
115     pass
116 try:
117     import iconv_codec
118 except:
119     pass
120
121 # chardet library auto-detects character encodings
122 # Download from http://chardet.feedparser.org/
123 try:
124     import chardet
125     if _debug:
126         import chardet.constants
127         chardet.constants._debug = 1
128 except:
129     chardet = None
130
131 # BeautifulSoup parser used for parsing microformats from embedded HTML content
132 # http://www.crummy.com/software/BeautifulSoup/
133 try:
134     import BeautifulSoup
135 except:
136     BeautifulSoup = None
137
138 # ---------- don't touch these ----------
class ThingsNobodyCaresAboutButMe(Exception):
    """Common base class for the three exception types defined right below."""


class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe):
    """Marker exception; carries no state of its own."""


class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe):
    """Marker exception; carries no state of its own."""


class NonXMLContentType(ThingsNobodyCaresAboutButMe):
    """Marker exception; carries no state of its own."""


class UndeclaredNamespace(Exception):
    """Marker exception deriving directly from Exception (not from the
    ThingsNobodyCaresAboutButMe family)."""
144
145 sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
146 sgmllib.special = re.compile('<!')
147 sgmllib.charref = re.compile('&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]')
148
149 SUPPORTED_VERSIONS = {'': 'unknown',
150                       'rss090': 'RSS 0.90',
151                       'rss091n': 'RSS 0.91 (Netscape)',
152                       'rss091u': 'RSS 0.91 (Userland)',
153                       'rss092': 'RSS 0.92',
154                       'rss093': 'RSS 0.93',
155                       'rss094': 'RSS 0.94',
156                       'rss20': 'RSS 2.0',
157                       'rss10': 'RSS 1.0',
158                       'rss': 'RSS (unknown version)',
159                       'atom01': 'Atom 0.1',
160                       'atom02': 'Atom 0.2',
161                       'atom03': 'Atom 0.3',
162                       'atom10': 'Atom 1.0',
163                       'atom': 'Atom (unknown version)',
164                       'cdf': 'CDF',
165                       'hotrss': 'Hot RSS'
166                       }
167
168 try:
169     UserDict = dict
170 except NameError:
171     # Python 2.1 does not have dict
172     from UserDict import UserDict
173     def dict(aList):
174         rc = {}
175         for k, v in aList:
176             rc[k] = v
177         return rc
178
class FeedParserDict(UserDict):
    """Dictionary whose entries can also be read and written as attributes
    and which understands a set of legacy key names.

    ``keymap`` maps a legacy name to the name that is actually stored.  When
    the mapped value is a list, writes go to the first name in the list and
    reads return the first of the listed names that is present.

    Two computed keys are read-only views over the stored 'tags' list:
    'category' (term of the first tag) and 'categories' (a list of
    (scheme, term) tuples).
    """
    keymap = {'channel': 'feed',
              'items': 'entries',
              'guid': 'id',
              'date': 'updated',
              'date_parsed': 'updated_parsed',
              'description': ['subtitle', 'summary'],
              'url': ['href'],
              'modified': 'updated',
              'modified_parsed': 'updated_parsed',
              'issued': 'published',
              'issued_parsed': 'published_parsed',
              'copyright': 'rights',
              'copyright_detail': 'rights_detail',
              'tagline': 'subtitle',
              'tagline_detail': 'subtitle_detail'}

    def __getitem__(self, key):
        """Return the value for key, honouring computed keys and aliases.

        Raises KeyError when neither the key nor any of its aliases is
        stored ('category' on an empty tag list raises IndexError).
        """
        if key == 'category':
            return UserDict.__getitem__(self, 'tags')[0]['term']
        if key == 'categories':
            return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')]
        realkey = self.keymap.get(key, key)
        if isinstance(realkey, list):
            for k in realkey:
                try:
                    return UserDict.__getitem__(self, k)
                except KeyError:
                    pass
            # None of the aliases is stored: fall back to the literal key so
            # that a miss surfaces as KeyError(key) rather than as a
            # TypeError from looking up the (unhashable) alias list.
            return UserDict.__getitem__(self, key)
        # A literally stored key wins over its alias.
        try:
            return UserDict.__getitem__(self, key)
        except KeyError:
            pass
        return UserDict.__getitem__(self, realkey)

    def __setitem__(self, key, value):
        """Store value under the canonical name for key (first alias if the
        keymap entry is a list)."""
        realkey = self.keymap.get(key, key)
        if isinstance(realkey, list):
            realkey = realkey[0]
        return UserDict.__setitem__(self, realkey, value)

    def get(self, key, default=None):
        """Return self[key], or default when the lookup fails."""
        try:
            return self[key]
        except (KeyError, IndexError, TypeError):
            return default

    def setdefault(self, key, value):
        """Store value under key unless the key already resolves; return the
        value the key resolves to afterwards."""
        if not self.has_key(key):
            self[key] = value
        return self[key]

    def has_key(self, key):
        """Return True exactly when self[key] would succeed.

        Membership is decided by an actual lookup, so names that merely
        happen to be attributes of the object ('keys', 'items', 'keymap',
        ...) are not reported as present.
        """
        try:
            self[key]
        except (KeyError, IndexError, TypeError):
            return False
        return True

    def __getattr__(self, key):
        """Attribute-style read access; only called when normal attribute
        lookup has failed.

        Names starting with '_' are never resolved through the mapping.
        Raises AttributeError when the name cannot be resolved.
        """
        try:
            return self.__dict__[key]
        except KeyError:
            pass
        if not key.startswith('_'):
            try:
                return self.__getitem__(key)
            except (KeyError, IndexError, TypeError):
                pass
        raise AttributeError("object has no attribute '%s'" % key)

    def __setattr__(self, key, value):
        """Attribute-style write access.

        Names starting with '_' and the name 'data' are stored as real
        instance attributes; every other name becomes a mapping entry.
        """
        if key.startswith('_') or key == 'data':
            self.__dict__[key] = value
        else:
            return self.__setitem__(key, value)

    def __contains__(self, key):
        """Support ``key in d`` with the same semantics as has_key()."""
        return self.has_key(key)
253
def zopeCompatibilityHack():
    """Swap the module-level FeedParserDict class for a function that
    returns plain dictionaries.

    After this call, FeedParserDict(aDict) returns a new ordinary dict that
    holds a copy of aDict's items (or an empty dict when aDict is omitted
    or empty).
    """
    global FeedParserDict
    # drop the existing global binding before installing the replacement
    del FeedParserDict

    def FeedParserDict(aDict=None):
        plain = {}
        plain.update(aDict or {})
        return plain
262
263 _ebcdic_to_ascii_map = None
264 def _ebcdic_to_ascii(s):
265     global _ebcdic_to_ascii_map
266     if not _ebcdic_to_ascii_map:
267         emap = (
268             0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
269             16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
270             128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
271             144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
272             32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
273             38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
274             45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
275             186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
276             195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
277             202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
278             209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
279             216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
280             123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
281             125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
282             92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
283             48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
284             )
285         import string
286         _ebcdic_to_ascii_map = string.maketrans( \
287             ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
288     return s.translate(_ebcdic_to_ascii_map)
289
290 _urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
291 def _urljoin(base, uri):
292     uri = _urifixer.sub(r'\1\3', uri)
293     return urlparse.urljoin(base, uri)
294
295 class _FeedParserMixin:
296     namespaces = {'': '',
297                   'http://backend.userland.com/rss': '',
298                   'http://blogs.law.harvard.edu/tech/rss': '',
299                   'http://purl.org/rss/1.0/': '',
300                   'http://my.netscape.com/rdf/simple/0.9/': '',
301                   'http://example.com/newformat#': '',
302                   'http://example.com/necho': '',
303                   'http://purl.org/echo/': '',
304                   'uri/of/echo/namespace#': '',
305                   'http://purl.org/pie/': '',
306                   'http://purl.org/atom/ns#': '',
307                   'http://www.w3.org/2005/Atom': '',
308                   'http://purl.org/rss/1.0/modules/rss091#': '',
309                  
310                   'http://webns.net/mvcb/':                               'admin',
311                   'http://purl.org/rss/1.0/modules/aggregation/':         'ag',
312                   'http://purl.org/rss/1.0/modules/annotate/':            'annotate',
313                   'http://media.tangent.org/rss/1.0/':                    'audio',
314                   'http://backend.userland.com/blogChannelModule':        'blogChannel',
315                   'http://web.resource.org/cc/':                          'cc',
316                   'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons',
317                   'http://purl.org/rss/1.0/modules/company':              'co',
318                   'http://purl.org/rss/1.0/modules/content/':             'content',
319                   'http://my.theinfo.org/changed/1.0/rss/':               'cp',
320                   'http://purl.org/dc/elements/1.1/':                     'dc',
321                   'http://purl.org/dc/terms/':                            'dcterms',
322                   'http://purl.org/rss/1.0/modules/email/':               'email',
323                   'http://purl.org/rss/1.0/modules/event/':               'ev',
324                   'http://rssnamespace.org/feedburner/ext/1.0':           'feedburner',
325                   'http://freshmeat.net/rss/fm/':                         'fm',
326                   'http://xmlns.com/foaf/0.1/':                           'foaf',
327                   'http://www.w3.org/2003/01/geo/wgs84_pos#':             'geo',
328                   'http://postneo.com/icbm/':                             'icbm',
329                   'http://purl.org/rss/1.0/modules/image/':               'image',
330                   'http://www.itunes.com/DTDs/PodCast-1.0.dtd':           'itunes',
331                   'http://example.com/DTDs/PodCast-1.0.dtd':              'itunes',
332                   'http://purl.org/rss/1.0/modules/link/':                'l',
333                   'http://search.yahoo.com/mrss':                         'media',
334                   'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
335                   'http://prismstandard.org/namespaces/1.2/basic/':       'prism',
336                   'http://www.w3.org/1999/02/22-rdf-syntax-ns#':          'rdf',
337                   'http://www.w3.org/2000/01/rdf-schema#':                'rdfs',
338                   'http://purl.org/rss/1.0/modules/reference/':           'ref',
339                   'http://purl.org/rss/1.0/modules/richequiv/':           'reqv',
340                   'http://purl.org/rss/1.0/modules/search/':              'search',
341                   'http://purl.org/rss/1.0/modules/slash/':               'slash',
342                   'http://schemas.xmlsoap.org/soap/envelope/':            'soap',
343                   'http://purl.org/rss/1.0/modules/servicestatus/':       'ss',
344                   'http://hacks.benhammersley.com/rss/streaming/':        'str',
345                   'http://purl.org/rss/1.0/modules/subscription/':        'sub',
346                   'http://purl.org/rss/1.0/modules/syndication/':         'sy',
347                   'http://purl.org/rss/1.0/modules/taxonomy/':            'taxo',
348                   'http://purl.org/rss/1.0/modules/threading/':           'thr',
349                   'http://purl.org/rss/1.0/modules/textinput/':           'ti',
350                   'http://madskills.com/public/xml/rss/module/trackback/':'trackback',
351                   'http://wellformedweb.org/commentAPI/':                 'wfw',
352                   'http://purl.org/rss/1.0/modules/wiki/':                'wiki',
353                   'http://www.w3.org/1999/xhtml':                         'xhtml',
354                   'http://www.w3.org/XML/1998/namespace':                 'xml',
355                   'http://schemas.pocketsoap.com/rss/myDescModule/':      'szf'
356 }
357     _matchnamespaces = {}
358
359     can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo']
360     can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
361     can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
362     html_types = ['text/html', 'application/xhtml+xml']
363    
364     def __init__(self, baseuri=None, baselang=None, encoding='utf-8'):
365         if _debug: sys.stderr.write('initializing FeedParser\n')
366         if not self._matchnamespaces:
367             for k, v in self.namespaces.items():
368                 self._matchnamespaces[k.lower()] = v
369         self.feeddata = FeedParserDict() # feed-level data
370         self.encoding = encoding # character encoding
371         self.entries = [] # list of entry-level data
372         self.version = '' # feed type/version, see SUPPORTED_VERSIONS
373         self.namespacesInUse = {} # dictionary of namespaces defined by the feed
374
375         # the following are used internally to track state;
376         # this is really out of control and should be refactored
377         self.infeed = 0
378         self.inentry = 0
379         self.incontent = 0
380         self.intextinput = 0
381         self.inimage = 0
382         self.inauthor = 0
383         self.incontributor = 0
384         self.inpublisher = 0
385         self.insource = 0
386         self.sourcedata = FeedParserDict()
387         self.contentparams = FeedParserDict()
388         self._summaryKey = None
389         self.namespacemap = {}
390         self.elementstack = []
391         self.basestack = []
392         self.langstack = []
393         self.baseuri = baseuri or ''
394         self.lang = baselang or None
395         if baselang:
396             self.feeddata['language'] = baselang
397
398     def unknown_starttag(self, tag, attrs):
399         if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
400         # normalize attrs
401         attrs = [(k.lower(), v) for k, v in attrs]
402         attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
403        
404         # track xml:base and xml:lang
405         attrsD = dict(attrs)
406         baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
407         self.baseuri = _urljoin(self.baseuri, baseuri)
408         lang = attrsD.get('xml:lang', attrsD.get('lang'))
409         if lang == '':
410             # xml:lang could be explicitly set to '', we need to capture that
411             lang = None
412         elif lang is None:
413             # if no xml:lang is specified, use parent lang
414             lang = self.lang
415         if lang:
416             if tag in ('feed', 'rss', 'rdf:RDF'):
417                 self.feeddata['language'] = lang
418         self.lang = lang
419         self.basestack.append(self.baseuri)
420         self.langstack.append(lang)
421        
422         # track namespaces
423         for prefix, uri in attrs:
424             if prefix.startswith('xmlns:'):
425                 self.trackNamespace(prefix[6:], uri)
426             elif prefix == 'xmlns':
427                 self.trackNamespace(None, uri)
428
429         # track inline content
430         if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
431             # element declared itself as escaped markup, but it isn't really
432             self.contentparams['type'] = 'application/xhtml+xml'
433         if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
434             # Note: probably shouldn't simply recreate localname here, but
435             # our namespace handling isn't actually 100% correct in cases where
436             # the feed redefines the default namespace (which is actually
437             # the usual case for inline content, thanks Sam), so here we
438             # cheat and just reconstruct the element based on localname
439             # because that compensates for the bugs in our namespace handling.
440             # This will horribly munge inline content with non-empty qnames,
441             # but nobody actually does that, so I'm not fixing it.
442             tag = tag.split(':')[-1]
443             return self.handle_data('<%s%s>' % (tag, ''.join([' %s="%s"' % t for t in attrs])), escape=0)
444
445         # match namespaces
446         if tag.find(':') <> -1:
447             prefix, suffix = tag.split(':', 1)
448         else:
449             prefix, suffix = '', tag
450         prefix = self.namespacemap.get(prefix, prefix)
451         if prefix:
452             prefix = prefix + '_'
453
454         # special hack for better tracking of empty textinput/image elements in illformed feeds
455         if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
456             self.intextinput = 0
457         if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
458             self.inimage = 0
459        
460         # call special handler (if defined) or default handler
461         methodname = '_start_' + prefix + suffix
462         try:
463             method = getattr(self, methodname)
464             return method(attrsD)
465         except AttributeError:
466             return self.push(prefix + suffix, 1)
467
468     def unknown_endtag(self, tag):
469         if _debug: sys.stderr.write('end %s\n' % tag)
470         # match namespaces
471         if tag.find(':') <> -1:
472             prefix, suffix = tag.split(':', 1)
473         else:
474             prefix, suffix = '', tag
475         prefix = self.namespacemap.get(prefix, prefix)
476         if prefix:
477             prefix = prefix + '_'
478
479         # call special handler (if defined) or default handler
480         methodname = '_end_' + prefix + suffix
481         try:
482             method = getattr(self, methodname)
483             method()
484         except AttributeError:
485             self.pop(prefix + suffix)
486
487         # track inline content
488         if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
489             # element declared itself as escaped markup, but it isn't really
490             self.contentparams['type'] = 'application/xhtml+xml'
491         if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
492             tag = tag.split(':')[-1]
493             self.handle_data('</%s>' % tag, escape=0)
494
495         # track xml:base and xml:lang going out of scope
496         if self.basestack:
497             self.basestack.pop()
498             if self.basestack and self.basestack[-1]:
499                 self.baseuri = self.basestack[-1]
500         if self.langstack:
501             self.langstack.pop()
502             if self.langstack: # and (self.langstack[-1] is not None):
503                 self.lang = self.langstack[-1]
504
505     def handle_charref(self, ref):
506         # called for each character reference, e.g. for '&#160;', ref will be '160'
507         if not self.elementstack: return
508         ref = ref.lower()
509         if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
510             text = '&#%s;' % ref
511         else:
512             if ref[0] == 'x':
513                 c = int(ref[1:], 16)
514             else:
515                 c = int(ref)
516             text = unichr(c).encode('utf-8')
517         self.elementstack[-1][2].append(text)
518
519     def handle_entityref(self, ref):
520         # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
521         if not self.elementstack: return
522         if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref)
523         if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
524             text = '&%s;' % ref
525         else:
526             # entity resolution graciously donated by Aaron Swartz
527             def name2cp(k):
528                 import htmlentitydefs
529                 if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3
530                     return htmlentitydefs.name2codepoint[k]
531                 k = htmlentitydefs.entitydefs[k]
532                 if k.startswith('&#') and k.endswith(';'):
533                     return int(k[2:-1]) # not in latin-1
534                 return ord(k)
535             try: name2cp(ref)
536             except KeyError: text = '&%s;' % ref
537             else: text = unichr(name2cp(ref)).encode('utf-8')
538         self.elementstack[-1][2].append(text)
539
540     def handle_data(self, text, escape=1):
541         # called for each block of plain text, i.e. outside of any tag and
542         # not containing any character or entity references
543         if not self.elementstack: return
544         if escape and self.contentparams.get('type') == 'application/xhtml+xml':
545             text = _xmlescape(text)
546         self.elementstack[-1][2].append(text)
547
548     def handle_comment(self, text):
549         # called for each comment, e.g. <!-- insert message here -->
550         pass
551
552     def handle_pi(self, text):
553         # called for each processing instruction, e.g. <?instruction>
554         pass
555
556     def handle_decl(self, text):
557         pass
558
559     def parse_declaration(self, i):
560         # override internal declaration handler to handle CDATA blocks
561         if _debug: sys.stderr.write('entering parse_declaration\n')
562         if self.rawdata[i:i+9] == '<![CDATA[':
563             k = self.rawdata.find(']]>', i)
564             if k == -1: k = len(self.rawdata)
565             self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
566             return k+3
567         else:
568             k = self.rawdata.find('>', i)
569             return k+1
570
571     def mapContentType(self, contentType):
572         contentType = contentType.lower()
573         if contentType == 'text':
574             contentType = 'text/plain'
575         elif contentType == 'html':
576             contentType = 'text/html'
577         elif contentType == 'xhtml':
578             contentType = 'application/xhtml+xml'
579         return contentType
580    
581     def trackNamespace(self, prefix, uri):
582         loweruri = uri.lower()
583         if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
584             self.version = 'rss090'
585         if loweruri == 'http://purl.org/rss/1.0/' and not self.version:
586             self.version = 'rss10'
587         if loweruri == 'http://www.w3.org/2005/atom' and not self.version:
588             self.version = 'atom10'
589         if loweruri.find('backend.userland.com/rss') <> -1:
590             # match any backend.userland.com namespace
591             uri = 'http://backend.userland.com/rss'
592             loweruri = uri
593         if self._matchnamespaces.has_key(loweruri):
594             self.namespacemap[prefix] = self._matchnamespaces[loweruri]
595             self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
596         else:
597             self.namespacesInUse[prefix or ''] = uri
598
599     def resolveURI(self, uri):
600         return _urljoin(self.baseuri or '', uri)
601    
602     def decodeEntities(self, element, data):
603         return data
604
605     def push(self, element, expectingText):
606         self.elementstack.append([element, expectingText, []])
607
608     def pop(self, element, stripWhitespace=1):
609         if not self.elementstack: return
610         if self.elementstack[-1][0] != element: return
611        
612         element, expectingText, pieces = self.elementstack.pop()
613         output = ''.join(pieces)
614         if stripWhitespace:
615             output = output.strip()
616         if not expectingText: return output
617
618         # decode base64 content
619         if base64 and self.contentparams.get('base64', 0):
620             try:
621                 output = base64.decodestring(output)
622             except binascii.Error:
623                 pass
624             except binascii.Incomplete:
625                 pass
626                
627         # resolve relative URIs
628         if (element in self.can_be_relative_uri) and output:
629             output = self.resolveURI(output)
630        
631         # decode entities within embedded markup
632         if not self.contentparams.get('base64', 0):
633             output = self.decodeEntities(element, output)
634
635         # remove temporary cruft from contentparams
636         try:
637             del self.contentparams['mode']
638