root/calcium/trunk/lib/BeautifulSoup.py

Revision 156, 39.9 kB (checked in by dsandler, 3 years ago)

Import of Calcium, a script to automatically scrape feeds and send their links to the CoralCDN. By reading feeds through a FeedTreeProxy, Calcium has excellent update resolution (read: new links faster).

Calcium can be pointed at feeds whose links are likely to become oversubscribed, ensuring that Coral has a copy of each page before its server is crushed.

Examples of such feeds include:

Line 
1 """Beautiful Soup
2 Elixir and Tonic
3 "The Screen-Scraper's Friend"
4 v2.1.1
5 http://www.crummy.com/software/BeautifulSoup/
6
7 Beautiful Soup parses arbitrarily invalid XML- or HTML-like substance
8 into a tree representation. It provides methods and Pythonic idioms
9 that make it easy to search and modify the tree.
10
11 A well-formed XML/HTML document will yield a well-formed data
12 structure. An ill-formed XML/HTML document will yield a
13 correspondingly ill-formed data structure. If your document is only
14 locally well-formed, you can use this library to find and process the
15 well-formed part of it. The BeautifulSoup class has heuristics for
16 obtaining a sensible parse tree in the face of common HTML errors.
17
18 Beautiful Soup has no external dependencies. It works with Python 2.2
19 and up.
20
21 Beautiful Soup defines classes for four different parsing strategies:
22
23  * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
24    language that kind of looks like XML.
25
26  * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
27    or invalid.
28
29  * ICantBelieveItsBeautifulSoup, for parsing valid but bizarre HTML
30    that trips up BeautifulSoup.
31
32  * BeautifulSOAP, for making it easier to parse XML documents that use
33    lots of subelements containing a single string, where you'd prefer
34    they put that string into an attribute (such as SOAP messages).
35
36 You can subclass BeautifulStoneSoup or BeautifulSoup to create a
37 parsing strategy specific to an XML schema or a particular bizarre
38 HTML document. Typically your subclass would just override
39 SELF_CLOSING_TAGS and/or NESTABLE_TAGS.
40 """
41 from __future__ import generators
42
43 __author__ = "Leonard Richardson (leonardr@segfault.org)"
44 __version__ = "2.1.1"
45 __date__ = "$Date: 2004/10/18 00:14:20 $"
46 __copyright__ = "Copyright (c) 2004-2005 Leonard Richardson"
47 __license__ = "PSF"
48
49 from sgmllib import SGMLParser, SGMLParseError
50 import types
51 import re
52 import sgmllib
53
#This code makes Beautiful Soup able to parse XML with namespaces.
#It swaps sgmllib's module-level 'tagfind' pattern for one that also
#accepts ':' inside a name. Because the attribute lives on the sgmllib
#module itself, the replacement is visible to any other code in the
#process that uses sgmllib, not just to Beautiful Soup.
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
56
class NullType(object):

    """A None-like type with a single instance, 'Null', that absorbs
    any message sent to it and answers with itself instead of raising.

    Null is empty and therefore false in a boolean context (its length
    is 0), contains nothing, and iterates over nothing.

    Examples:
    >>> Null("send", "a", "message")("and one more",
    ...      "and what you get still") is Null
    True
    """

    def __new__(cls):
        #There is only one Null: "constructing" another NullType just
        #hands back the singleton.
        return Null

    def __call__(self, *args, **kwargs):
        return Null

    def __getattr__(self, attr):
        return Null

    def __getitem__(self, item):
        return Null

    def __setattr__(self, attr, value):
        #Assignments are silently discarded.
        pass

    def __setitem__(self, item, value):
        #Item assignments are silently discarded.
        pass

    def __len__(self):
        return 0

    def __iter__(self):
        #Without an explicit __iter__, ``for x in Null`` would fall
        #back to calling __getitem__ with 0, 1, 2, ... and, since
        #__getitem__ never raises IndexError, would never terminate.
        return iter([])

    def __contains__(self, item):
        return False

    def __repr__(self):
        return "Null"


#The singleton is built through object.__new__ because NullType.__new__
#itself needs 'Null' to exist already.
Null = object.__new__(NullType)
82
class PageElement:
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text).

    Each element keeps five links: 'parent', 'previous' and 'next',
    and 'previousSibling' and 'nextSibling'. A link that has nothing
    to point at holds Null. The find*/fetch* methods walk those links
    and filter what they encounter."""

    def setup(self, parent=Null, previous=Null):
        """Sets up the initial relations between this element and
        other elements.

        If the parent already has contents, this element is linked in
        as the next sibling of the parent's current last child. This
        method does not itself add the element to parent.contents."""
        self.parent = parent
        self.previous = previous
        self.next = Null
        self.previousSibling = Null
        self.nextSibling = Null
        if self.parent and self.parent.contents:
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def findNext(self, name=None, attrs={}, text=None):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document, or Null."""
        return self._first(self.fetchNext, name, attrs, text)
    firstNext = findNext

    def fetchNext(self, name=None, attrs={}, text=None, limit=None):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        return self._fetch(name, attrs, text, limit, self.nextGenerator)

    def findNextSibling(self, name=None, attrs={}, text=None):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document,
        or Null."""
        return self._first(self.fetchNextSiblings, name, attrs, text)
    firstNextSibling = findNextSibling

    def fetchNextSiblings(self, name=None, attrs={}, text=None, limit=None):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._fetch(name, attrs, text, limit, self.nextSiblingGenerator)

    def findPrevious(self, name=None, attrs={}, text=None):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document, or Null."""
        return self._first(self.fetchPrevious, name, attrs, text)
    firstPrevious = findPrevious

    def fetchPrevious(self, name=None, attrs={}, text=None, limit=None):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._fetch(name, attrs, text, limit, self.previousGenerator)

    def findPreviousSibling(self, name=None, attrs={}, text=None):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document,
        or Null."""
        return self._first(self.fetchPreviousSiblings, name, attrs, text)
    firstPreviousSibling = findPreviousSibling

    def fetchPreviousSiblings(self, name=None, attrs={}, text=None,
                              limit=None):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._fetch(name, attrs, text, limit,
                           self.previousSiblingGenerator)

    def findParent(self, name=None, attrs={}):
        """Returns the closest parent of this Tag that matches the given
        criteria, or Null."""
        r = Null
        l = self.fetchParents(name, attrs, 1)
        if l:
            r = l[0]
        return r
    firstParent = findParent

    def fetchParents(self, name=None, attrs={}, limit=None):
        """Returns the parents of this Tag that match the given
        criteria."""
        return self._fetch(name, attrs, None, limit, self.parentGenerator)

    #These methods do the real heavy lifting.

    def _first(self, method, name, attrs, text):
        """Calls the given fetch method with a limit of 1 and returns
        the single result, or Null if nothing matched."""
        r = Null
        l = method(name, attrs, text, 1)
        if l:
            r = l[0]
        return r

    def _fetch(self, name, attrs, text, limit, generator):
        """Iterates over a generator looking for things that match.

        'generator' is a callable returning an iterator of candidate
        elements. If 'attrs' is not a map it is taken as a value to
        match against the 'class' attribute. When 'text' is given only
        non-Tag elements are considered and they are matched against
        'text'; otherwise only Tags are considered and they must match
        'name' (if given) and every entry in 'attrs'. Collection stops
        once 'limit' results have been found (if limit is set)."""
        if not hasattr(attrs, 'items'):
            attrs = {'class' : attrs}

        results = []
        for i in generator():
            found = None
            if isinstance(i, Tag):
                if not text:
                    if not name or self._matches(i, name):
                        match = True
                        for attr, matchAgainst in attrs.items():
                            check = i.get(attr)
                            if not self._matches(check, matchAgainst):
                                match = False
                                break
                        if match:
                            found = i
            elif text:
                if self._matches(i, text):
                    found = i
            if found:
                results.append(found)
                if limit and len(results) >= limit:
                    break
        return results

    #Generators that can be used to navigate starting from both
    #NavigableTexts and Tags. Each one follows a single kind of link
    #until it reaches a false value; note that the terminating value
    #itself is the last thing yielded.
    def nextGenerator(self):
        i = self
        while i:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        i = self
        while i:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        i = self
        while i:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        i = self
        while i:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        i = self
        while i:
            i = i.parent
            yield i

    def _matches(self, chunk, howToMatch):
        """Returns a true value iff 'chunk' satisfies 'howToMatch'.

        'chunk' is a Tag (matched by name), a string, an attribute
        value (None when the attribute is absent), or a list of
        elements. 'howToMatch' may be a callable, a regular expression
        object, a list, a map (matched against its keys) or a plain
        value that is compared as a string."""
        #print 'looking for %s in %s' % (howToMatch, chunk)
        #
        # If given a list of items, return true if the list contains a
        # text element that matches.
        if isList(chunk) and not isinstance(chunk, Tag):
            for tag in chunk:
                if isinstance(tag, NavigableText) and self._matches(tag, howToMatch):
                    return True
            return False
        if callable(howToMatch):
            return howToMatch(chunk)
        if chunk is None:
            #A missing attribute must not be turned into the string
            #'None' and compared: that made criteria such as 'None' or
            #a regexp like 'N' match tags that lack the attribute
            #altogether. It only satisfies an explicit request for None.
            return howToMatch is None
        if isinstance(chunk, Tag):
            #Custom match methods take the tag as an argument, but all other
            #ways of matching match the tag name as a string
            chunk = chunk.name
        #Now we know that chunk is a string
        if not isinstance(chunk, basestring):
            chunk = str(chunk)
        if hasattr(howToMatch, 'match'):
            # It's a regexp object.
            return howToMatch.search(chunk)
        if isList(howToMatch):
            return chunk in howToMatch
        if hasattr(howToMatch, 'items'):
            return howToMatch.has_key(chunk)
        #It's just a string. Only coerce non-strings: calling str() on
        #a unicode criterion raises UnicodeEncodeError as soon as it
        #contains a non-ASCII character.
        if not isinstance(howToMatch, basestring):
            howToMatch = str(howToMatch)
        return howToMatch == chunk
262
class NavigableText(PageElement):

    """A piece of text in the document: a PageElement that is not a
    Tag."""

    def __getattr__(self, attr):
        "For backwards compatibility, text.string gives you text"
        if attr != 'string':
            #Anything else is a genuinely missing attribute.
            message = "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
            raise AttributeError(message)
        return self
271        
class NavigableString(str, NavigableText):
    """A byte string that also carries NavigableText's navigation
    behaviour. Adds nothing of its own."""
274
class NavigableUnicodeString(unicode, NavigableText):
    """A Unicode string that also carries NavigableText's navigation
    behaviour. Adds nothing of its own."""
277
class Tag(PageElement):

    """Represents a found HTML tag with its attributes and contents.

    'attrs' is a list of (key, value) pairs in document order, and
    'contents' is the list of child elements. A dictionary view of the
    attributes is built lazily by _getAttrMap() and kept in 'attrMap'.
    A tag whose 'hidden' flag is set renders only its contents."""

    def __init__(self, name, attrs=None, parent=Null, previous=Null):
        "Basic constructor."
        self.name = name
        if attrs is None:
            attrs = []
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False

    def get(self, key, default=None):
        """Returns the value of the 'key' attribute for the tag, or
        the value given for 'default' if it doesn't have that
        attribute."""
        return self._getAttrMap().get(key, default)

    def __getitem__(self, key):
        """tag[key] returns the value of the 'key' attribute for the tag,
        and throws an exception if it's not there."""
        return self._getAttrMap()[key]

    def __iter__(self):
        "Iterating over a tag iterates over its contents."
        return iter(self.contents)

    def __len__(self):
        "The length of a tag is the length of its list of contents."
        return len(self.contents)

    def __contains__(self, x):
        "'x in tag' is true iff x is one of the tag's direct contents."
        return x in self.contents

    def __nonzero__(self):
        "A tag is non-None even if it has no contents."
        return True

    def __setitem__(self, key, value):
        """Setting tag[key] sets the value of the 'key' attribute for the
        tag. Every existing occurrence of the attribute is overwritten;
        if there is none, the attribute is appended."""
        attrMap = self._getAttrMap()
        attrMap[key] = value
        found = False
        for i in range(0, len(self.attrs)):
            if self.attrs[i][0] == key:
                self.attrs[i] = (key, value)
                found = True
        if not found:
            self.attrs.append((key, value))

    def __delitem__(self, key):
        "Deleting tag[key] deletes all 'key' attributes for the tag."
        #Bad HTML can define the same attribute multiple times, so
        #every occurrence has to go. The list is filtered rather than
        #edited while being iterated: removing during iteration skips
        #the element that follows each removal. Slice assignment keeps
        #the same list object.
        self.attrs[:] = [item for item in self.attrs if item[0] != key]
        attrMap = self._getAttrMap()
        if key in attrMap:
            del attrMap[key]

    def __call__(self, *args, **kwargs):
        """Calling a tag like a function is the same as calling its
        fetch() method. Eg. tag('a') returns a list of all the A tags
        found within this tag."""
        return self.fetch(*args, **kwargs)

    def __getattr__(self, tag):
        """tag.foo and tag.fooTag both return the first tag named 'foo'
        found within this tag (Null if there is none).

        Names starting with a double underscore are not treated as tag
        names: they raise AttributeError, so that hasattr() and the
        interpreter's own protocol lookups get a truthful answer
        instead of None."""
        if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
            return self.first(tag[:-3])
        elif tag.find('__') != 0:
            return self.first(tag)
        raise AttributeError("'%s' object has no attribute '%s'"
                             % (self.__class__.__name__, tag))

    def __eq__(self, other):
        """Returns true iff this tag has the same name, the same attributes,
        and the same contents (recursively) as the given tag.

        NOTE: right now this will return false if two tags have the
        same attributes in a different order. Should this be fixed?"""
        if not hasattr(other, 'name') or not hasattr(other, 'attrs') \
           or not hasattr(other, 'contents') \
           or self.name != other.name or self.attrs != other.attrs \
           or len(self) != len(other):
            return False
        for i in range(0, len(self.contents)):
            if self.contents[i] != other.contents[i]:
                return False
        return True

    def __ne__(self, other):
        """Returns true iff this tag is not identical to the other tag,
        as defined in __eq__."""
        return not self == other

    def __repr__(self):
        """Renders this tag as a string."""
        return str(self)

    def __unicode__(self):
        "Renders this tag as a Unicode string."
        return self.__str__(1)

    def __str__(self, needUnicode=None, showStructureIndent=None):
        """Returns a string or Unicode representation of this tag and
        its contents.

        'needUnicode' forces the result to unicode when true and to
        str when it is False; None leaves the type as it comes out.
        'showStructureIndent', when not None, is the indentation depth
        used for pretty-printing.

        NOTE: since Python's HTML parser consumes whitespace, this
        method is not certain to reproduce the whitespace present in
        the original string."""

        attrs = []
        if self.attrs:
            for key, val in self.attrs:
                attrs.append('%s="%s"' % (key, val))
        close = ''
        closeTag = ''
        if self.isSelfClosing():
            close = ' /'
        else:
            closeTag = '</%s>' % self.name
        indentIncrement = None
        #'space' is bound whenever pretty-printing is requested, so a
        #depth of 0 can no longer reach the closing-tag branch below
        #with the name undefined.
        space = ''
        if showStructureIndent is not None:
            indentIncrement = showStructureIndent
            if not self.hidden:
                indentIncrement += 1
            space = '\n%s' % (' ' * showStructureIndent)
        contents = self.renderContents(indentIncrement, needUnicode=needUnicode)
        if self.hidden:
            s = contents
        else:
            s = []
            attributeString = ''
            if attrs:
                attributeString = ' ' + ' '.join(attrs)
            if showStructureIndent:
                s.append(space)
            s.append('<%s%s%s>' % (self.name, attributeString, close))
            s.append(contents)
            if closeTag and showStructureIndent is not None:
                s.append(space)
            s.append(closeTag)
            s = ''.join(s)
        isUnicode = type(s) == types.UnicodeType
        if needUnicode and not isUnicode:
            s = unicode(s)
        elif isUnicode and needUnicode==False:
            s = str(s)
        return s

    def prettify(self, needUnicode=None):
        "Renders this tag with its structure shown through indentation."
        return self.__str__(needUnicode, showStructureIndent=True)

    def renderContents(self, showStructureIndent=None, needUnicode=None):
        """Renders the contents of this tag as a (possibly Unicode)
        string. When pretty-printing, one trailing newline is dropped
        from each piece of text."""
        s=[]
        for c in self:
            text = None
            if isinstance(c, NavigableUnicodeString) or type(c) == types.UnicodeType:
                text = unicode(c)
            elif isinstance(c, Tag):
                s.append(c.__str__(needUnicode, showStructureIndent))
            elif needUnicode:
                text = unicode(c)
            else:
                text = str(c)
            if text:
                if showStructureIndent is not None:
                    if text[-1] == '\n':
                        text = text[:-1]
                s.append(text)
        return ''.join(s)

    #Soup methods

    def firstText(self, text, recursive=True):
        """Convenience method to retrieve the first piece of text matching the
        given criteria. 'text' can be a string, a regular expression object,
        a callable that takes a string and returns whether or not the
        string 'matches', etc."""
        return self.first(recursive=recursive, text=text)

    def fetchText(self, text, recursive=True, limit=None):
        """Convenience method to retrieve all pieces of text matching the
        given criteria. 'text' can be a string, a regular expression object,
        a callable that takes a string and returns whether or not the
        string 'matches', etc."""
        return self.fetch(recursive=recursive, text=text, limit=limit)

    def first(self, name=None, attrs={}, recursive=True, text=None):
        """Return only the first child of this
        Tag matching the given criteria, or Null if there is none."""
        r = Null
        l = self.fetch(name, attrs, recursive, text, 1)
        if l:
            r = l[0]
        return r
    findChild = first

    def fetch(self, name=None, attrs={}, recursive=True, text=None,
              limit=None):
        """Extracts a list of Tag objects that match the given
        criteria.  You can specify the name of the Tag and any
        attributes you want the Tag to have.

        The value of a key-value pair in the 'attrs' map can be a
        string, a list of strings, a regular expression object, or a
        callable that takes a string and returns whether or not the
        string matches for some custom definition of 'matches'. The
        same is true of the tag name.

        With 'recursive' false only direct children are examined."""
        generator = self.recursiveChildGenerator
        if not recursive:
            generator = self.childGenerator
        return self._fetch(name, attrs, text, limit, generator)
    fetchChildren = fetch

    #Utility methods

    def isSelfClosing(self):
        """Returns true iff this is a self-closing tag as defined in the HTML
        standard.

        TODO: This is specific to BeautifulSoup and its subclasses, but it's
        used by __str__"""
        return self.name in BeautifulSoup.SELF_CLOSING_TAGS

    def append(self, tag):
        """Appends the given tag to the contents of this tag."""
        self.contents.append(tag)

    #Private methods

    def _getAttrMap(self):
        """Initializes a map representation of this tag's attributes,
        if not already initialized, and returns it."""
        #Look in the instance dictionary directly. A plain
        #getattr(self, 'attrMap') on a tag that has no map yet falls
        #through to __getattr__, which answers with the first child
        #tag named 'attrMap' if the document happens to contain one.
        attrMap = self.__dict__.get('attrMap')
        if not attrMap:
            #An empty map is rebuilt on every call, so a tag that had
            #no attributes when first asked still picks up later ones.
            attrMap = {}
            for (key, value) in self.attrs:
                attrMap[key] = value
            self.attrMap = attrMap
        return attrMap

    #Generator methods
    #
    #Both simply fall off the end when exhausted: an explicit
    #'raise StopIteration' inside a generator body is redundant and is
    #reported as a RuntimeError by interpreters implementing PEP 479.
    def childGenerator(self):
        "Yields the direct children of this tag, in order."
        for i in range(0, len(self.contents)):
            yield self.contents[i]

    def recursiveChildGenerator(self):
        """Yields everything beneath this tag, depth first, in document
        order, using an explicit stack of (tag, resume index) pairs
        instead of recursion."""
        stack = [(self, 0)]
        while stack:
            tag, start = stack.pop()
            if isinstance(tag, Tag):
                for i in range(start, len(tag.contents)):
                    a = tag.contents[i]
                    yield a
                    if isinstance(a, Tag) and tag.contents:
                        #Remember where to resume in this tag, then
                        #descend into the child first.
                        if i < len(tag.contents) - 1:
                            stack.append((tag, i+1))
                        stack.append((a, 0))
                        break
540
541
def isList(l):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is listlike.

    Anything that exposes __iter__ counts as listlike; otherwise only
    objects whose exact type is list or tuple do."""
    if hasattr(l, '__iter__'):
        return True
    return type(l) in (types.ListType, types.TupleType)
547
def buildTagMap(default, *args):
    """Turns a list of maps, lists, or scalars into a single map.
    Used to build the SELF_CLOSING_TAGS and NESTABLE_TAGS maps out
    of lists and partial maps.

    Maps are merged in as they are; every item of a list, and every
    scalar, becomes a key mapped to 'default'. Later arguments
    override earlier ones."""
    tagMap = {}
    for chunk in args:
        if hasattr(chunk, 'items'):
            #It's a map. Merge it.
            for key, value in chunk.items():
                tagMap[key] = value
            continue
        #A list contributes each of its items; a scalar contributes
        #just itself.
        if isList(chunk):
            keys = chunk
        else:
            keys = [chunk]
        for key in keys:
            tagMap[key] = default
    return tagMap
566
567 class BeautifulStoneSoup(Tag, SGMLParser):
568
569     """This class contains the basic parser and fetch code. It defines
570     a parser that knows nothing about tag behavior except for the
571     following:
572   
573       You can't close a tag without closing all the tags it encloses.
574       That is, "<foo><bar></foo>" actually means
575       "<foo><bar></bar></foo>".
576
577     [Another possible explanation is "<foo><bar /></foo>", but since
578     this class defines no SELF_CLOSING_TAGS, it will never use that
579     explanation.]
580
581     This class is useful for parsing XML or made-up markup languages,
582     or when BeautifulSoup makes an assumption counter to what you were
583     expecting."""
584
585     SELF_CLOSING_TAGS = {}
586     NESTABLE_TAGS = {}
587     RESET_NESTING_TAGS = {}
588     QUOTE_TAGS = {}
589
590     #As a public service we will by default silently replace MS smart quotes
591     #and similar characters with their HTML or ASCII equivalents.
592     MS_CHARS = { '\x80' : '&euro;',
593                  '\x81' : ' ',
594                  '\x82' : '&sbquo;',
595                  '\x83' : '&fnof;',
596                  '\x84' : '&bdquo;',
597                  '\x85' : '&hellip;',
598                  '\x86' : '&dagger;',
599                  '\x87' : '&Dagger;',
600                  '\x88' : '&caret;',
601                  '\x89' : '%',
602                  '\x8A' : '&Scaron;',
603                  '\x8B' : '&lt;',
604                  '\x8C' : '&OElig;',
605                  '\x8D' : '?',
606                  '\x8E' : 'Z',
607                  '\x8F' : '?',
608                  '\x90' : '?',
609                  '\x91' : '&lsquo;',
610                  '\x92' : '&rsquo;',
611                  '\x93' : '&ldquo;',
612                  '\x94' : '&rdquo;',
613                  '\x95' : '&bull;',
614                  '\x96' : '&ndash;',
615                  '\x97' : '&mdash;',
616                  '\x98' : '&tilde;',
617                  '\x99' : '&trade;',
618                  '\x9a' : '&scaron;',
619                  '\x9b' : '&gt;',
620                  '\x9c' : '&oelig;',
621                  '\x9d' : '?',
622                  '\x9e' : 'z',
623                  '\x9f' : '&Yuml;',}
624
625     PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'),
626                        lambda(x):x.group(1) + ' />'),
627                       (re.compile('<!\s+([^<>]*)>'),
628                        lambda(x):'<!' + x.group(1) + '>'),
629                       (re.compile("([\x80-\x9f])"),
630                        lambda(x): BeautifulStoneSoup.MS_CHARS.get(x.group(1)))
631                       ]
632
633     ROOT_TAG_NAME = '[document]'
634
635     def __init__(self, text=None, avoidParserProblems=True,
636                  initialTextIsEverything=True):
637         """Initialize this as the 'root tag' and feed in any text to
638         the parser.
639
640         NOTE about avoidParserProblems: sgmllib will process most bad
641         HTML, and BeautifulSoup has tricks for dealing with some HTML
642         that kills sgmllib, but Beautiful Soup can nonetheless choke
643         or lose data if your data uses self-closing tags or
644         declarations incorrectly. By default, Beautiful Soup sanitizes
645         its input to avoid the vast majority of these problems. The
646         problems are relatively rare, even in bad HTML, so feel free
647         to pass in False to avoidParserProblems if they don't apply to
648         you, and you'll get better performance. The only reason I have
649         this turned on by default is so I don't get so many tech
650         support questions.
651
652         The two most common instances of invalid HTML that will choke
653         sgmllib are fixed by the default parser massage techniques:
654
655          <br/> (No space between name of closing tag and tag close)
656          <! --Comment--> (Extraneous whitespace in declaration)
657
658         You can pass in a custom list of (RE object, replace method)
659         tuples to get Beautiful Soup to scrub your input the way you
660         want."""
661         Tag.__init__(self, self.ROOT_TAG_NAME)
662         if avoidParserProblems \
663            and not isList(avoidParserProblems):
664             avoidParserProblems = self.PARSER_MASSAGE           
665         self.avoidParserProblems = avoidParserProblems
666         SGMLParser.__init__(self)
667         self.quoteStack = []
668         self.hidden = 1
669         self.reset()
670         if hasattr(text, 'read'):
671             #It's a file-type object.
672             text = text.read()
673         if text:
674             self.feed(text)
675         if initialTextIsEverything:
676             self.done()
677
678     def __getattr__(self, methodName):
679         """This method routes method call requests to either the SGMLParser
680         superclass or the Tag superclass, depending on the method name."""
681         if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
682                or methodName.find('do_') == 0:
683             return SGMLParser.__getattr__(self, methodName)
684         elif methodName.find('__') != 0:
685             return Tag.__getattr__(self, methodName)
686         else:
687             raise AttributeError
688
689     def feed(self, text):
690         if self.avoidParserProblems:
691             for fix, m in self.avoidParserProblems:
692                 text = fix.sub(m, text)
693         SGMLParser.feed(self, text)
694
695     def done(self):
696         """Called when you're done parsing, so that the unclosed tags can be
697         correctly processed."""
698         self.endData() #NEW
699         while self.currentTag.name != self.ROOT_TAG_NAME:
700             self.popTag()
701            
702     def reset(self):
703         SGMLParser.reset(self)
704         self.currentData = []
705         self.currentTag = None
706         self.tagStack = []
707         self.pushTag(self)       
708    
709     def popTag(self):
710         tag = self.tagStack.pop()
711         # Tags with just one string-owning child get the child as a
712         # 'string' property, so that soup.tag.string is shorthand for
713         # soup.tag.contents[0]
714         if len(self.currentTag.contents) == 1 and \
715            isinstance(self.currentTag.contents[0], NavigableText):
716             self.currentTag.string = self.currentTag.contents[0]
717
718         #print "Pop", tag.name
719         if self.tagStack:
720             self.currentTag = self.tagStack[-1]
721         return self.currentTag
722
723     def pushTag(self, tag):
724