| 1 |
"""Beautiful Soup |
|---|
| 2 |
Elixir and Tonic |
|---|
| 3 |
"The Screen-Scraper's Friend" |
|---|
| 4 |
v2.1.1 |
|---|
| 5 |
http://www.crummy.com/software/BeautifulSoup/ |
|---|
| 6 |
|
|---|
| 7 |
Beautiful Soup parses arbitrarily invalid XML- or HTML-like substance |
|---|
| 8 |
into a tree representation. It provides methods and Pythonic idioms |
|---|
| 9 |
that make it easy to search and modify the tree. |
|---|
| 10 |
|
|---|
| 11 |
A well-formed XML/HTML document will yield a well-formed data |
|---|
| 12 |
structure. An ill-formed XML/HTML document will yield a |
|---|
| 13 |
correspondingly ill-formed data structure. If your document is only |
|---|
| 14 |
locally well-formed, you can use this library to find and process the |
|---|
| 15 |
well-formed part of it. The BeautifulSoup class has heuristics for |
|---|
| 16 |
obtaining a sensible parse tree in the face of common HTML errors. |
|---|
| 17 |
|
|---|
| 18 |
Beautiful Soup has no external dependencies. It works with Python 2.2 |
|---|
| 19 |
and up. |
|---|
| 20 |
|
|---|
| 21 |
Beautiful Soup defines classes for four different parsing strategies: |
|---|
| 22 |
|
|---|
| 23 |
* BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific |
|---|
| 24 |
language that kind of looks like XML. |
|---|
| 25 |
|
|---|
| 26 |
* BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid |
|---|
| 27 |
or invalid. |
|---|
| 28 |
|
|---|
| 29 |
* ICantBelieveItsBeautifulSoup, for parsing valid but bizarre HTML |
|---|
| 30 |
that trips up BeautifulSoup. |
|---|
| 31 |
|
|---|
| 32 |
* BeautifulSOAP, for making it easier to parse XML documents that use |
|---|
| 33 |
lots of subelements containing a single string, where you'd prefer |
|---|
| 34 |
they put that string into an attribute (such as SOAP messages). |
|---|
| 35 |
|
|---|
| 36 |
You can subclass BeautifulStoneSoup or BeautifulSoup to create a |
|---|
| 37 |
parsing strategy specific to an XML schema or a particular bizarre |
|---|
| 38 |
HTML document. Typically your subclass would just override |
|---|
| 39 |
SELF_CLOSING_TAGS and/or NESTABLE_TAGS. |
|---|
| 40 |
""" |
|---|
| 41 |
from __future__ import generators |
|---|
| 42 |
|
|---|
| 43 |
__author__ = "Leonard Richardson (leonardr@segfault.org)" |
|---|
| 44 |
__version__ = "2.1.1" |
|---|
| 45 |
__date__ = "$Date: 2004/10/18 00:14:20 $" |
|---|
| 46 |
__copyright__ = "Copyright (c) 2004-2005 Leonard Richardson" |
|---|
| 47 |
__license__ = "PSF" |
|---|
| 48 |
|
|---|
| 49 |
from sgmllib import SGMLParser, SGMLParseError |
|---|
| 50 |
import types |
|---|
| 51 |
import re |
|---|
| 52 |
import sgmllib |
|---|
| 53 |
|
|---|
| 54 |
|
|---|
| 55 |
# Replace sgmllib's module-level tag-name pattern: a name must start with
# a letter and may continue with letters, digits, '-', '_', '.' or ':'.
# NOTE(review): presumably this is what lets namespace-prefixed names such
# as "ns:tag" through -- confirm against sgmllib's default pattern.
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
|---|
| 56 |
|
|---|
| 57 |
class NullType(object):
    """Type of the 'Null' singleton, a None-like object that absorbs
    whatever is done to it.

    Calling Null, reading an attribute or item from it, and constructing
    NullType() all hand back Null itself; attribute and item assignment
    are silently discarded.  Null has length zero (so it is falsy),
    iterates as empty, contains nothing and prints as "Null".

    Examples:
    >>> Null("send", "a", "message")("and one more",
    ...      "and what you get still") is Null
    True
    """

    def __new__(cls):
        # Never build a second instance; always hand back the singleton.
        return Null

    # --- reads: everything evaluates to Null again ---

    def __getattr__(self, attr):
        return Null

    def __getitem__(self, item):
        return Null

    def __call__(self, *args, **kwargs):
        return Null

    # --- writes: accepted and ignored ---

    def __setattr__(self, attr, value):
        pass

    def __setitem__(self, item, value):
        pass

    # --- container protocol: behaves like an empty collection ---

    def __len__(self):
        return 0

    def __iter__(self):
        # Explicit empty iterator so that ``for x in Null`` ends at once
        # instead of falling back to __getitem__, which never fails.
        return iter([])

    def __contains__(self, item):
        return False

    def __repr__(self):
        return "Null"


# The one and only instance.  It is created through object.__new__
# because NullType.__new__ itself just returns this name.
Null = object.__new__(NullType)
|---|
| 82 |
|
|---|
| 83 |
class PageElement:
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text).

    setup() gives every element five links: 'parent', 'previous', 'next',
    'previousSibling' and 'nextSibling'.  A link that is not set holds the
    falsy Null singleton rather than None.

    The find*/fetch* methods all share the same matching rules (see
    _matches): a criterion may be a string, a list of strings, a map, a
    compiled regular expression or a callable.
    """

    def setup(self, parent=Null, previous=Null):
        """Sets up the initial relations between this element and
        other elements.

        If 'parent' already has contents, its current last child becomes
        this element's previous sibling, and that child's nextSibling is
        pointed at this element.  The element is NOT appended to
        parent.contents here.
        """
        self.parent = parent
        self.previous = previous
        self.next = Null
        self.previousSibling = Null
        self.nextSibling = Null
        if self.parent and self.parent.contents:
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def findNext(self, name=None, attrs={}, text=None):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document, or Null if there is none."""
        return self._first(self.fetchNext, name, attrs, text)
    firstNext = findNext

    def fetchNext(self, name=None, attrs={}, text=None, limit=None):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        return self._fetch(name, attrs, text, limit, self.nextGenerator)

    def findNextSibling(self, name=None, attrs={}, text=None):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document, or
        Null if there is none."""
        return self._first(self.fetchNextSiblings, name, attrs, text)
    firstNextSibling = findNextSibling

    def fetchNextSiblings(self, name=None, attrs={}, text=None, limit=None):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._fetch(name, attrs, text, limit, self.nextSiblingGenerator)

    def findPrevious(self, name=None, attrs={}, text=None):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document, or Null if there is
        none."""
        return self._first(self.fetchPrevious, name, attrs, text)

    def fetchPrevious(self, name=None, attrs={}, text=None, limit=None):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._fetch(name, attrs, text, limit, self.previousGenerator)
    firstPrevious = findPrevious

    def findPreviousSibling(self, name=None, attrs={}, text=None):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document, or
        Null if there is none."""
        return self._first(self.fetchPreviousSiblings, name, attrs, text)
    firstPreviousSibling = findPreviousSibling

    def fetchPreviousSiblings(self, name=None, attrs={}, text=None,
                              limit=None):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._fetch(name, attrs, text, limit,
                           self.previousSiblingGenerator)

    def findParent(self, name=None, attrs={}):
        """Returns the closest parent of this Tag that matches the given
        criteria, or Null if there is none."""
        parents = self.fetchParents(name, attrs, 1)
        if parents:
            return parents[0]
        return Null
    firstParent = findParent

    def fetchParents(self, name=None, attrs={}, limit=None):
        """Returns the parents of this Tag that match the given
        criteria."""
        return self._fetch(name, attrs, None, limit, self.parentGenerator)

    # Utility methods

    def _first(self, method, name, attrs, text):
        """Calls the given fetch method with a limit of 1 and returns its
        single result, or Null if nothing matched."""
        results = method(name, attrs, text, 1)
        if results:
            return results[0]
        return Null

    def _fetch(self, name, attrs, text, limit, generator):
        """Iterates over a generator looking for things that match.

        'attrs' is normally a map of attribute name to criterion; any
        other value is taken as a criterion for the 'class' attribute.
        When 'text' is given only non-Tag items are candidates (matched
        against 'text'); otherwise only Tags are candidates (matched
        against 'name' and 'attrs').  Collection stops once 'limit'
        results have been gathered.
        """
        if not hasattr(attrs, 'items'):
            attrs = {'class' : attrs}

        results = []
        # A plain for loop drives the generator, which avoids the
        # Python-2-only explicit call to its next() method.
        for element in generator():
            if isinstance(element, Tag):
                matched = not text and self._tagMatches(element, name, attrs)
            else:
                matched = text and self._matches(element, text)
            if matched:
                # Append on the match itself rather than on the truth
                # value of the element, so a matching element that
                # happens to be falsy is not dropped.
                results.append(element)
                if limit and len(results) >= limit:
                    break
        return results

    def _tagMatches(self, tag, name, attrs):
        """Returns true iff 'tag' satisfies the name criterion (if any)
        and every attribute criterion in the 'attrs' map."""
        if name and not self._matches(tag, name):
            return False
        for attr, matchAgainst in attrs.items():
            if not self._matches(tag.get(attr), matchAgainst):
                return False
        return True

    # Generators that can be used to navigate starting from both
    # NavigableTexts and Tags.
    #
    # Each one follows a single link until it reaches a falsy element.
    # The falsy terminator itself (normally Null) is yielded as the last
    # item, so callers must be ready for it.
    # NOTE(review): NavigableString subclasses str, so an empty string in
    # the chain would also end the walk early -- confirm the parser never
    # stores empty text nodes.

    def nextGenerator(self):
        i = self
        while i:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        i = self
        while i:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        i = self
        while i:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        i = self
        while i:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        i = self
        while i:
            i = i.parent
            yield i

    def _matches(self, chunk, howToMatch):
        """Returns true iff 'chunk' satisfies the criterion 'howToMatch'.

        chunk may be a Tag (compared by name), a string, an attribute
        value, None (for an attribute the tag lacks) or a list of page
        elements.  howToMatch may be:

         * a callable: called with the chunk, its result is returned
           as-is (it receives None for a missing attribute);
         * a compiled regular expression: searched for in the chunk;
         * a list or a map: matches if the chunk is a member/key;
         * None: matches only a chunk that is None;
         * anything else: compared for equality as a string.
        """
        if isList(chunk) and not isinstance(chunk, Tag):
            # A list of elements matches if any text item in it matches.
            for tag in chunk:
                if isinstance(tag, NavigableText) and self._matches(tag, howToMatch):
                    return True
            return False
        if callable(howToMatch):
            return howToMatch(chunk)
        if isinstance(chunk, Tag):
            # Tags are matched by name.
            chunk = chunk.name
        if chunk is None or howToMatch is None:
            # None stands for "no such attribute".  It must not be turned
            # into the string 'None', or a criterion such as 'None',
            # ['None'] or re.compile('o') would match tags that simply
            # lack the attribute.
            return chunk is howToMatch
        if not isinstance(chunk, basestring):
            chunk = str(chunk)
        if hasattr(howToMatch, 'match'):
            # It's a regexp object.
            return howToMatch.search(chunk) is not None
        if isList(howToMatch):
            return chunk in howToMatch
        if hasattr(howToMatch, 'items'):
            return chunk in howToMatch
        # It's better to compare strings as they are: calling str() on a
        # unicode criterion holding non-ASCII characters would raise.
        if not isinstance(howToMatch, basestring):
            howToMatch = str(howToMatch)
        return howToMatch == chunk
|---|
| 262 |
|
|---|
| 263 |
class NavigableText(PageElement):
    """A piece of document text that carries PageElement navigation.

    This is the common base of NavigableString and
    NavigableUnicodeString; code elsewhere tests for text nodes with
    isinstance(x, NavigableText).
    """

    def __getattr__(self, attr):
        """For backwards compatibility, text.string gives you text.

        Any other attribute that normal lookup did not find raises
        AttributeError.
        """
        if attr == 'string':
            return self
        # Call-form raise: identical in meaning to the old
        # "raise Class, message" statement, but valid in every Python
        # version.
        raise AttributeError("'%s' object has no attribute '%s'"
                             % (self.__class__.__name__, attr))
|---|
| 271 |
|
|---|
| 272 |
class NavigableString(str, NavigableText):
    """A byte string (str) that is also a NavigableText page element.

    Adds no behavior of its own: the value comes from str and the
    navigation links and 'string' shortcut from NavigableText.
    """
|---|
| 274 |
|
|---|
| 275 |
class NavigableUnicodeString(unicode, NavigableText):
    """A unicode string that is also a NavigableText page element.

    Adds no behavior of its own: the value comes from unicode and the
    navigation links and 'string' shortcut from NavigableText.
    """
|---|
| 277 |
|
|---|
| 278 |
class Tag(PageElement):

    """Represents a found HTML tag with its attributes and contents.

    'attrs' is a list of (key, value) pairs in document order and
    'contents' is the list of child elements.  A map view of the
    attributes is built lazily by _getAttrMap() and cached in 'attrMap'.
    A tag whose 'hidden' flag is true renders only its contents.
    """

    def __init__(self, name, attrs=None, parent=Null, previous=Null):
        """Basic constructor.

        name -- the tag name
        attrs -- list of (key, value) pairs; a fresh empty list if omitted
        parent, previous -- navigation links, passed on to setup()
        """
        self.name = name
        if attrs is None:
            attrs = []
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False

    def get(self, key, default=None):
        """Returns the value of the 'key' attribute for the tag, or
        the value given for 'default' if it doesn't have that
        attribute."""
        return self._getAttrMap().get(key, default)

    def __getitem__(self, key):
        """tag[key] returns the value of the 'key' attribute for the tag,
        and throws an exception if it's not there."""
        return self._getAttrMap()[key]

    def __iter__(self):
        "Iterating over a tag iterates over its contents."
        return iter(self.contents)

    def __len__(self):
        "The length of a tag is the length of its list of contents."
        return len(self.contents)

    def __contains__(self, x):
        "'x in tag' tests whether x is one of the tag's direct contents."
        return x in self.contents

    def __nonzero__(self):
        "A tag is non-None even if it has no contents."
        return True

    def __setitem__(self, key, value):
        """Setting tag[key] sets the value of the 'key' attribute for the
        tag.

        Every existing (key, ...) pair in 'attrs' is overwritten; if
        there is none, a new pair is appended.  The attribute map is kept
        in step.
        """
        found = False
        for i in range(0, len(self.attrs)):
            if self.attrs[i][0] == key:
                self.attrs[i] = (key, value)
                found = True
        if not found:
            self.attrs.append((key, value))
        self._getAttrMap()[key] = value

    def __delitem__(self, key):
        "Deleting tag[key] deletes all 'key' attributes for the tag."
        # Rebuild the list in place.  Removing items from a list while
        # iterating over it skips the element after each removal, which
        # left every second adjacent duplicate behind.
        self.attrs[:] = [item for item in self.attrs if item[0] != key]
        attrMap = self._getAttrMap()
        if key in attrMap:
            del attrMap[key]

    def __call__(self, *args, **kwargs):
        """Calling a tag like a function is the same as calling its
        fetch() method. Eg. tag('a') returns a list of all the A tags
        found within this tag."""
        return self.fetch(*args, **kwargs)

    def __getattr__(self, tag):
        """Treats an unknown attribute as a search for a child tag.

        tag.fooTag looks for the first 'foo' tag and tag.foo for the
        first 'foo' tag; both give Null when there is none.  Names that
        start with a double underscore are never treated as tag names
        and raise AttributeError.
        """
        if len(tag) > 3 and tag.endswith('Tag'):
            return self.first(tag[:-3])
        elif not tag.startswith('__'):
            return self.first(tag)
        # Falling off the end here used to return None, which made every
        # special-method probe (e.g. hasattr(tag, '__copy__')) look
        # successful and then fail when the None was called.
        raise AttributeError("'%s' object has no attribute '%s'"
                             % (self.__class__.__name__, tag))

    def __eq__(self, other):
        """Returns true iff this tag has the same name, the same attributes,
        and the same contents (recursively) as the given tag.

        NOTE: right now this will return false if two tags have the
        same attributes in a different order. Should this be fixed?"""
        if (not hasattr(other, 'name') or not hasattr(other, 'attrs')
            or not hasattr(other, 'contents')
            or self.name != other.name or self.attrs != other.attrs
            or len(self) != len(other)):
            return False
        for i in range(0, len(self.contents)):
            if self.contents[i] != other.contents[i]:
                return False
        return True

    def __ne__(self, other):
        """Returns true iff this tag is not identical to the other tag,
        as defined in __eq__."""
        return not self == other

    def __repr__(self):
        """Renders this tag as a string."""
        return str(self)

    def __unicode__(self):
        "Renders this tag as a Unicode string."
        return self.__str__(1)

    def __str__(self, needUnicode=None, showStructureIndent=None):
        """Returns a string or Unicode representation of this tag and
        its contents.

        needUnicode -- true forces a unicode result, False forces str,
            None leaves the result as it comes out
        showStructureIndent -- None for plain output; otherwise the
            indentation depth used to put this tag on its own line

        NOTE: since Python's HTML parser consumes whitespace, this
        method is not certain to reproduce the whitespace present in
        the original string."""

        attrs = []
        if self.attrs:
            for key, val in self.attrs:
                attrs.append('%s="%s"' % (key, val))
        close = ''
        closeTag = ''
        if self.isSelfClosing():
            close = ' /'
        else:
            closeTag = '</%s>' % self.name
        indentIncrement = None
        space = ''
        if showStructureIndent is not None:
            # Children are rendered one level deeper, unless this tag is
            # hidden and therefore adds no level of its own.
            indentIncrement = showStructureIndent
            if not self.hidden:
                indentIncrement += 1
            # Computed for every non-None depth, including 0: the closing
            # tag below uses it whenever the depth is not None, and it
            # used to be unbound when the depth was 0.
            space = '\n%s' % (' ' * showStructureIndent)
        contents = self.renderContents(indentIncrement, needUnicode=needUnicode)
        if self.hidden:
            s = contents
        else:
            s = []
            attributeString = ''
            if attrs:
                attributeString = ' ' + ' '.join(attrs)
            if showStructureIndent:
                s.append(space)
            s.append('<%s%s%s>' % (self.name, attributeString, close))
            s.append(contents)
            if closeTag and showStructureIndent is not None:
                s.append(space)
            s.append(closeTag)
            s = ''.join(s)
        isUnicode = type(s) == types.UnicodeType
        if needUnicode and not isUnicode:
            s = unicode(s)
        elif isUnicode and needUnicode==False:
            s = str(s)
        return s

    def prettify(self, needUnicode=None):
        "Renders this tag with structural indentation switched on."
        return self.__str__(needUnicode, showStructureIndent=True)

    def renderContents(self, showStructureIndent=None, needUnicode=None):
        """Renders the contents of this tag as a (possibly Unicode)
        string.

        Child tags are rendered through their own __str__; text children
        are converted with unicode() or str().  When structural
        indentation is on, one trailing newline is trimmed from each
        piece of text.  Empty text is left out.
        """
        s=[]
        for c in self:
            text = None
            if isinstance(c, NavigableUnicodeString) or type(c) == types.UnicodeType:
                text = unicode(c)
            elif isinstance(c, Tag):
                s.append(c.__str__(needUnicode, showStructureIndent))
            elif needUnicode:
                text = unicode(c)
            else:
                text = str(c)
            if text:
                if showStructureIndent is not None:
                    if text[-1] == '\n':
                        text = text[:-1]
                s.append(text)
        return ''.join(s)

    #Soup methods

    def firstText(self, text, recursive=True):
        """Convenience method to retrieve the first piece of text matching the
        given criteria. 'text' can be a string, a regular expression object,
        a callable that takes a string and returns whether or not the
        string 'matches', etc."""
        return self.first(recursive=recursive, text=text)

    def fetchText(self, text, recursive=True, limit=None):
        """Convenience method to retrieve all pieces of text matching the
        given criteria. 'text' can be a string, a regular expression object,
        a callable that takes a string and returns whether or not the
        string 'matches', etc."""
        return self.fetch(recursive=recursive, text=text, limit=limit)

    def first(self, name=None, attrs={}, recursive=True, text=None):
        """Return only the first child of this
        Tag matching the given criteria, or Null if there is none."""
        matches = self.fetch(name, attrs, recursive, text, 1)
        if matches:
            return matches[0]
        return Null
    findChild = first

    def fetch(self, name=None, attrs={}, recursive=True, text=None,
              limit=None):
        """Extracts a list of Tag objects that match the given
        criteria.  You can specify the name of the Tag and any
        attributes you want the Tag to have.

        The value of a key-value pair in the 'attrs' map can be a
        string, a list of strings, a regular expression object, or a
        callable that takes a string and returns whether or not the
        string matches for some custom definition of 'matches'. The
        same is true of the tag name.

        With recursive false only direct children are examined."""
        generator = self.recursiveChildGenerator
        if not recursive:
            generator = self.childGenerator
        return self._fetch(name, attrs, text, limit, generator)
    fetchChildren = fetch

    #Utility methods

    def isSelfClosing(self):
        """Returns true iff this is a self-closing tag as defined in the HTML
        standard.

        TODO: This is specific to BeautifulSoup and its subclasses, but it's
        used by __str__"""
        return self.name in BeautifulSoup.SELF_CLOSING_TAGS

    def append(self, tag):
        """Appends the given tag to the contents of this tag."""
        self.contents.append(tag)

    #Private methods

    def _getAttrMap(self):
        """Initializes a map representation of this tag's attributes,
        if not already initialized, and returns it.

        An empty cached map is rebuilt from 'attrs' on every call.
        """
        # Look in the instance dictionary directly.  getattr() would fall
        # through to __getattr__ when no map is cached yet, which runs a
        # full recursive search for a child tag named 'attrMap' -- slow,
        # and wrong if such a tag exists.
        attrMap = self.__dict__.get('attrMap')
        if not attrMap:
            attrMap = {}
            for (key, value) in self.attrs:
                attrMap[key] = value
            self.attrMap = attrMap
        return attrMap

    #Generator methods

    def childGenerator(self):
        "Yields the direct children of this tag, in order."
        for i in range(0, len(self.contents)):
            yield self.contents[i]
        # Simply running off the end finishes the generator; an explicit
        # 'raise StopIteration' becomes a RuntimeError under PEP 479.

    def recursiveChildGenerator(self):
        """Yields every descendant of this tag in document order
        (depth-first, parents before their children)."""
        # Explicit stack of (tag, index of the next child to visit).
        stack = [(self, 0)]
        while stack:
            tag, start = stack.pop()
            if isinstance(tag, Tag):
                for i in range(start, len(tag.contents)):
                    a = tag.contents[i]
                    yield a
                    if isinstance(a, Tag) and tag.contents:
                        # Descend into the child tag now; remember where
                        # to resume in the current tag afterwards.
                        if i < len(tag.contents) - 1:
                            stack.append((tag, i+1))
                        stack.append((a, 0))
                        break
|---|
| 540 |
|
|---|
| 541 |
|
|---|
| 542 |
def isList(l):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is listlike.

    Anything exposing an __iter__ attribute counts as listlike, and so
    does any object whose exact type is list or tuple.
    """
    if hasattr(l, '__iter__'):
        return True
    # list and tuple are the same objects as types.ListType and
    # types.TupleType.
    return type(l) in (list, tuple)
|---|
| 547 |
|
|---|
| 548 |
def buildTagMap(default, *args):
    """Turns a list of maps, lists, or scalars into a single map.
    Used to build the SELF_CLOSING_TAGS and NESTABLE_TAGS maps out
    of lists and partial maps.

    Each argument after 'default' is folded in, in order:
     * a map (anything with 'items') contributes its own key/value pairs;
     * a listlike contributes each of its items as a key for 'default';
     * anything else is itself used as a key for 'default'.
    Later arguments overwrite earlier ones on the same key.
    """
    tagMap = {}
    for portion in args:
        if hasattr(portion, 'items'):
            # It's a map. Merge it.
            for key, value in portion.items():
                tagMap[key] = value
            continue
        if isList(portion):
            # It's a list. Map each item to the default.
            names = portion
        else:
            # It's a scalar. Map it to the default.
            names = [portion]
        for key in names:
            tagMap[key] = default
    return tagMap
|---|
| 566 |
|
|---|
| 567 |
class BeautifulStoneSoup(Tag, SGMLParser): |
|---|
| 568 |
|
|---|
| 569 |
"""This class contains the basic parser and fetch code. It defines |
|---|
| 570 |
a parser that knows nothing about tag behavior except for the |
|---|
| 571 |
following: |
|---|
| 572 |
|
|---|
| 573 |
You can't close a tag without closing all the tags it encloses. |
|---|
| 574 |
That is, "<foo><bar></foo>" actually means |
|---|
| 575 |
"<foo><bar></bar></foo>". |
|---|
| 576 |
|
|---|
| 577 |
[Another possible explanation is "<foo><bar /></foo>", but since |
|---|
| 578 |
this class defines no SELF_CLOSING_TAGS, it will never use that |
|---|
| 579 |
explanation.] |
|---|
| 580 |
|
|---|
| 581 |
This class is useful for parsing XML or made-up markup languages, |
|---|
| 582 |
or when BeautifulSoup makes an assumption counter to what you were |
|---|
| 583 |
expecting.""" |
|---|
| 584 |
|
|---|
| 585 |
SELF_CLOSING_TAGS = {} |
|---|
| 586 |
NESTABLE_TAGS = {} |
|---|
| 587 |
RESET_NESTING_TAGS = {} |
|---|
| 588 |
QUOTE_TAGS = {} |
|---|
| 589 |
|
|---|
| 590 |
|
|---|
| 591 |
|
|---|
| 592 |
# Replacement text for each byte in the range \x80-\x9f, looked up by the
# PARSER_MASSAGE rule that matches "([\x80-\x9f])".
# NOTE(review): presumably these are the Windows-1252 punctuation bytes
# -- confirm.
#
# The values are spelled as ASCII entity references on purpose:
#  * the text is substituted into the markup BEFORE it is parsed, so a
#    bare '<' or '>' (for \x8B / \x9b) would be read as a tag delimiter;
#  * raw non-ASCII characters in a string literal need a source-encoding
#    declaration, which this file does not have.
MS_CHARS = { '\x80' : '&euro;',
             '\x81' : ' ',
             '\x82' : '&sbquo;',
             '\x83' : '&fnof;',
             '\x84' : '&bdquo;',
             '\x85' : '&hellip;',
             '\x86' : '&dagger;',
             '\x87' : '&Dagger;',
             '\x88' : '&caret;',
             '\x89' : '%',
             '\x8A' : '&Scaron;',
             '\x8B' : '&lt;',
             '\x8C' : '&OElig;',
             '\x8D' : '?',
             '\x8E' : 'Z',
             '\x8F' : '?',
             '\x90' : '?',
             '\x91' : '&lsquo;',
             '\x92' : '&rsquo;',
             '\x93' : '&ldquo;',
             '\x94' : '&rdquo;',
             '\x95' : '&bull;',
             '\x96' : '&ndash;',
             '\x97' : '&mdash;',
             '\x98' : '&tilde;',
             '\x99' : '&trade;',
             '\x9a' : '&scaron;',
             '\x9b' : '&gt;',
             '\x9c' : '&oelig;',
             '\x9d' : '?',
             '\x9e' : 'z',
             '\x9f' : '&Yuml;',}
|---|
| 624 |
|
|---|
| 625 |
# Default input clean-up rules, applied in order by feed().  Each entry is
# a (compiled pattern, replacement callable) pair; the callable receives
# the match object and returns the replacement text.
#   1. "<br/>"  -> "<br />"   (put a space before a self-closing "/>")
#   2. "<! x>"  -> "<!x>"     (drop whitespace right after "<!")
#   3. any byte in \x80-\x9f -> its MS_CHARS replacement
# The callables are written "lambda x:"; the parenthesised "lambda(x):"
# spelling means the same thing but is a syntax error on Python 3.
PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                   lambda x: x.group(1) + ' />'),
                  (re.compile(r'<!\s+([^<>]*)>'),
                   lambda x: '<!' + x.group(1) + '>'),
                  (re.compile("([\x80-\x9f])"),
                   lambda x: BeautifulStoneSoup.MS_CHARS.get(x.group(1)))
                  ]
|---|
| 632 |
|
|---|
| 633 |
ROOT_TAG_NAME = '[document]' |
|---|
| 634 |
|
|---|
| 635 |
def __init__(self, text=None, avoidParserProblems=True,
             initialTextIsEverything=True):
    """Initialize this as the 'root tag' and feed in any text to
    the parser.

    text -- the markup to parse: a string, or an object with a read()
        method whose result is parsed.  Nothing is fed if it is empty.
    avoidParserProblems -- controls the clean-up applied to the input
        before sgmllib sees it (see below).
    initialTextIsEverything -- if true, done() is called straight away
        so that tags still open are closed.

    NOTE about avoidParserProblems: sgmllib will process most bad
    HTML, and BeautifulSoup has tricks for dealing with some HTML
    that kills sgmllib, but Beautiful Soup can nonetheless choke
    or lose data if your data uses self-closing tags or
    declarations incorrectly. By default, Beautiful Soup sanitizes
    its input to avoid the vast majority of these problems. The
    problems are relatively rare, even in bad HTML, so feel free
    to pass in False to avoidParserProblems if they don't apply to
    you, and you'll get better performance. The only reason I have
    this turned on by default is so I don't get so many tech
    support questions.

    The two most common instances of invalid HTML that will choke
    sgmllib are fixed by the default parser massage techniques:

     <br/> (No space between name of closing tag and tag close)
     <! --Comment--> (Extraneous whitespace in declaration)

    You can pass in a custom list of (RE object, replace method)
    tuples to get Beautiful Soup to scrub your input the way you
    want."""
    Tag.__init__(self, self.ROOT_TAG_NAME)

    # Any true value that is not itself a list of rules selects the
    # default rule set; a listlike value is used as given, and a false
    # value switches the clean-up off.
    wantsDefaultRules = avoidParserProblems and not isList(avoidParserProblems)
    if wantsDefaultRules:
        avoidParserProblems = self.PARSER_MASSAGE
    self.avoidParserProblems = avoidParserProblems

    SGMLParser.__init__(self)
    self.quoteStack = []
    # The root renders only its contents (see Tag.__str__).
    self.hidden = 1
    self.reset()

    if hasattr(text, 'read'):
        # It's a file-type object.
        text = text.read()
    if text:
        self.feed(text)
    if initialTextIsEverything:
        self.done()
|---|
| 677 |
|
|---|
| 678 |
def __getattr__(self, methodName):
    """This method routes method call requests to either the SGMLParser
    superclass or the Tag superclass, depending on the method name.

    Names beginning with 'start_', 'end_' or 'do_' go to SGMLParser;
    other names not beginning with a double underscore go to Tag (where
    they are treated as child-tag lookups); everything else raises
    AttributeError.
    """
    isHandlerName = (methodName.startswith('start_')
                     or methodName.startswith('end_')
                     or methodName.startswith('do_'))
    if isHandlerName:
        return SGMLParser.__getattr__(self, methodName)
    if not methodName.startswith('__'):
        return Tag.__getattr__(self, methodName)
    raise AttributeError
|---|
| 688 |
|
|---|
| 689 |
def feed(self, text):
    """Cleans up 'text' and hands it to the SGML parser.

    If clean-up rules are configured in self.avoidParserProblems, each
    (pattern, replacement) rule is applied in turn with pattern.sub();
    the resulting text is then passed to SGMLParser.feed.
    """
    rules = self.avoidParserProblems
    if rules:
        for pattern, replacement in rules:
            text = pattern.sub(replacement, text)
    SGMLParser.feed(self, text)
|---|
| 694 |
|
|---|
| 695 |
def done(self):
    """Called when you're done parsing, so that the unclosed tags can be
    correctly processed.

    Calls endData(), then pops tags until the current tag is the one
    named ROOT_TAG_NAME.
    """
    self.endData()
    rootName = self.ROOT_TAG_NAME
    while self.currentTag.name != rootName:
        self.popTag()
|---|
| 701 |
|
|---|
| 702 |
def reset(self):
    """Resets the parser and this object's own parsing state.

    After the SGMLParser reset, the tag stack, the current tag and the
    buffered text are cleared, and this object is pushed as the first
    tag.
    """
    SGMLParser.reset(self)
    self.tagStack = []
    self.currentTag = None
    self.currentData = []
    self.pushTag(self)
|---|
| 708 |
|
|---|
| 709 |
def popTag(self):
    """Pops the top of the tag stack and returns the new current tag.

    The current tag becomes the new top of the stack; if the stack is
    now empty, the current tag is left unchanged.
    """
    self.tagStack.pop()

    # Tags with just one string-owning child get the child as a
    # 'string' property, so that tag.string is shorthand for
    # tag.contents[0].
    closing = self.currentTag
    children = closing.contents
    if len(children) == 1 and isinstance(children[0], NavigableText):
        closing.string = children[0]

    if self.tagStack:
        self.currentTag = self.tagStack[-1]
    return self.currentTag
|---|
| 722 |
|
|---|
| 723 |
def pushTag(self, tag): |
|---|
| 724 |
|
|---|