PYTHON CODE

profileHussain2018
Newfolder21.rar

New folder (2)/miniproject.rar

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/AUTHORS.txt

Behold, mortal, the origins of Beautiful Soup... ================================================ Leonard Richardson is the primary programmer. Aaron DeVore is awesome. Mark Pilgrim provided the encoding detection code that forms the base of UnicodeDammit. Thomas Kluyver and Ezio Melotti finished the work of getting Beautiful Soup 4 working under Python 3. Simon Willison wrote soupselect, which was used to make Beautiful Soup support CSS selectors. Sam Ruby helped with a lot of edge cases. Jonathan Ellis was awarded the prestigious Beau Potage D'Or for his work in solving the nestable tags conundrum. An incomplete list of people who have contributed patches to Beautiful Soup: Istvan Albert, Andrew Lin, Anthony Baxter, Andrew Boyko, Tony Chang, Zephyr Fang, Fuzzy, Roman Gaufman, Yoni Gilad, Richie Hindle, Peteris Krumins, Kent Johnson, Ben Last, Robert Leftwich, Staffan Malmgren, Ksenia Marasanova, JP Moins, Adam Monsen, John Nagle, "Jon", Ed Oskiewicz, Greg Phillips, Giles Radford, Arthur Rudolph, Marko Samastur, Jouni Seppänen, Alexander Schmolck, Andy Theyers, Glyn Webster, Paul Wright, Danny Yoo An incomplete list of people who made suggestions or found bugs or found ways to break Beautiful Soup: Hanno Böck, Matteo Bertini, Chris Curvey, Simon Cusack, Bruce Eckel, Matt Ernst, Michael Foord, Tom Harris, Bill de hOra, Donald Howes, Matt Patterson, Scott Roberts, Steve Strassmann, Mike Williams, warchild at redho dot com, Sami Kuisma, Carlos Rocha, Bob Hutchison, Joren Mc, Michal Migurski, John Kleven, Tim Heaney, Tripp Lilley, Ed Summers, Dennis Sutch, Chris Smith, Aaron Sweep^W Swartz, Stuart Turner, Greg Edwards, Kevin J Kalupson, Nikos Kouremenos, Artur de Sousa Rocha, Yichun Wei, Per Vognsen

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/bs4/builder/_html5lib.py

__all__ = [
    'HTML5TreeBuilder',
    ]

import warnings
from bs4.builder import (
    PERMISSIVE,
    HTML,
    HTML_5,
    HTMLTreeBuilder,
    )
from bs4.element import NamespacedAttribute
import html5lib
from html5lib.constants import namespaces
from bs4.element import (
    Comment,
    Doctype,
    NavigableString,
    Tag,
    )


class HTML5TreeBuilder(HTMLTreeBuilder):
    """Use html5lib to build a tree."""

    features = ['html5lib', PERMISSIVE, HTML_5, HTML]

    def prepare_markup(self, markup, user_specified_encoding):
        """Remember the requested encoding and return the markup untouched.

        :return: A 4-tuple (markup, None, None, False).
        """
        # Store the user-specified encoding for use later on.
        self.user_specified_encoding = user_specified_encoding
        return markup, None, None, False

    # These methods are defined by Beautiful Soup.
    def feed(self, markup):
        """Parse `markup` with html5lib, building the tree into self.soup."""
        if self.soup.parse_only is not None:
            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
        doc = parser.parse(markup, encoding=self.user_specified_encoding)

        # Set the character encoding detected by the tokenizer.
        if isinstance(markup, unicode):
            # We need to special-case this because html5lib sets
            # charEncoding to UTF-8 if it gets Unicode input.
            doc.original_encoding = None
        else:
            doc.original_encoding = parser.tokenizer.stream.charEncoding[0]

    def create_treebuilder(self, namespaceHTMLElements):
        """Factory handed to html5lib; keeps a reference to the builder."""
        self.underlying_builder = TreeBuilderForHtml5lib(
            self.soup, namespaceHTMLElements)
        return self.underlying_builder

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return u'%s' % fragment


class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
    """html5lib tree builder that writes into a Beautiful Soup object."""

    def __init__(self, soup, namespaceHTMLElements):
        self.soup = soup
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

    def documentClass(self):
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token):
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name, namespace):
        tag = self.soup.new_tag(name, namespace)
        return Element(tag, self.soup, namespace)

    def commentClass(self, data):
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        # BeautifulSoup is not imported at module level (bs4 imports this
        # module while it is itself being set up), so import it lazily
        # here. Without this import the call below raises NameError.
        from bs4 import BeautifulSoup
        self.soup = BeautifulSoup("")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)

    def appendChild(self, node):
        # XXX This code is not covered by the BS4 tests.
        self.soup.append(node.element)

    def getDocument(self):
        return self.soup

    def getFragment(self):
        return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element


class AttrList(object):
    """Dict-like view of a tag's attributes.

    Reads come from a snapshot of the attributes taken at construction
    time; writes go straight to the underlying element.
    """

    def __init__(self, element):
        self.element = element
        self.attrs = dict(self.element.attrs)

    def __iter__(self):
        # Iterates over (name, value) pairs, not just names.
        return iter(list(self.attrs.items()))

    def __setitem__(self, name, value):
        self.element[name] = value

    def items(self):
        return list(self.attrs.items())

    def keys(self):
        return list(self.attrs.keys())

    def __len__(self):
        return len(self.attrs)

    def __getitem__(self, name):
        return self.attrs[name]

    def __contains__(self, name):
        return name in self.attrs


class Element(html5lib.treebuilders._base.Node):
    """html5lib node wrapping a Beautiful Soup tag (or the soup itself)."""

    def __init__(self, element, soup, namespace):
        html5lib.treebuilders._base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace

    def appendChild(self, node):
        """Append `node`, merging it into a trailing plain-text child."""
        if (node.element.__class__ == NavigableString and self.element.contents
            and self.element.contents[-1].__class__ == NavigableString):
            # Concatenate new text onto old text node
            # XXX This has O(n^2) performance, for input like
            # "a</a>a</a>a</a>..."
            old_element = self.element.contents[-1]
            new_element = self.soup.new_string(old_element + node.element)
            old_element.replace_with(new_element)
        else:
            self.element.append(node.element)
            node.parent = self

    def getAttributes(self):
        return AttrList(self.element)

    def setAttributes(self, attributes):
        """Copy `attributes` onto the wrapped element.

        Tuple-valued names are converted to NamespacedAttribute objects and
        multi-valued attributes are split by the builder before assignment.
        Note that `attributes` is modified in place.
        """
        if attributes is None or len(attributes) == 0:
            return

        # Iterate over a copy, since the mapping is modified in the loop.
        for name, value in list(attributes.items()):
            if isinstance(name, tuple):
                new_name = NamespacedAttribute(*name)
                del attributes[name]
                attributes[new_name] = value

        self.soup.builder._replace_cdata_list_attribute_values(
            self.name, attributes)
        for name, value in attributes.items():
            self.element[name] = value

        # The attributes may contain variables that need substitution.
        # Call set_up_substitutions manually.
        #
        # The Tag constructor called this method when the Tag was created,
        # but we just set/changed the attributes, so call it again.
        self.soup.builder.set_up_substitutions(self.element)
    attributes = property(getAttributes, setAttributes)

    def insertText(self, data, insertBefore=None):
        text = TextNode(self.soup.new_string(data), self.soup)
        if insertBefore:
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(text)

    def insertBefore(self, node, refNode):
        """Insert `node` before `refNode`, merging adjacent plain text."""
        index = self.element.index(refNode.element)
        # Only look at the preceding sibling when there is one: with
        # index == 0, contents[index-1] would wrap around to the *last*
        # child and the new text would be merged into the wrong node.
        if (node.element.__class__ == NavigableString and index > 0
            and self.element.contents[index-1].__class__ == NavigableString):
            # (See comments in appendChild)
            old_node = self.element.contents[index-1]
            new_str = self.soup.new_string(old_node + node.element)
            old_node.replace_with(new_str)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node):
        node.element.extract()

    def reparentChildren(self, newParent):
        """Move every child of this element to the end of `newParent`."""
        while self.element.contents:
            child = self.element.contents[0]
            child.extract()
            if isinstance(child, Tag):
                newParent.appendChild(
                    Element(child, self.soup, namespaces["html"]))
            else:
                newParent.appendChild(
                    TextNode(child, self.soup))

    def cloneNode(self):
        """Return a childless copy of this element with the same attributes."""
        tag = self.soup.new_tag(self.element.name, self.namespace)
        node = Element(tag, self.soup, self.namespace)
        # AttrList iterates over (name, value) pairs.
        for key, value in self.attributes:
            node.attributes[key] = value
        return node

    def hasContent(self):
        return self.element.contents

    def getNameTuple(self):
        if self.namespace is None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name

    nameTuple = property(getNameTuple)


class TextNode(Element):
    """html5lib node wrapping a string-like Beautiful Soup object."""

    def __init__(self, element, soup):
        html5lib.treebuilders._base.Node.__init__(self, None)
        self.element = element
        self.soup = soup

    def cloneNode(self):
        raise NotImplementedError

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/bs4/builder/_htmlparser.py

"""Use the HTMLParser library to parse HTML files that aren't too bad."""

__all__ = [
    'HTMLParserTreeBuilder',
    ]

from HTMLParser import (
    HTMLParser,
    HTMLParseError,
    )
import sys
import warnings

# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
# argument, which we'd like to set to False. Unfortunately,
# http://bugs.python.org/issue13273 makes strict=True a better bet
# before Python 3.2.3.
#
# At the end of this file, we monkeypatch HTMLParser so that
# strict=True works well on Python 3.2.2.
major, minor, release = sys.version_info[:3]
CONSTRUCTOR_TAKES_STRICT = (
    major > 3
    or (major == 3 and minor > 2)
    or (major == 3 and minor == 2 and release >= 3))

from bs4.element import (
    CData,
    Comment,
    Declaration,
    Doctype,
    ProcessingInstruction,
    )
from bs4.dammit import EntitySubstitution, UnicodeDammit

from bs4.builder import (
    HTML,
    HTMLTreeBuilder,
    STRICT,
    )


HTMLPARSER = 'html.parser'


class BeautifulSoupHTMLParser(HTMLParser):
    """HTMLParser subclass that forwards parse events to `self.soup`.

    The `soup` attribute is assigned by HTMLParserTreeBuilder.feed before
    any markup is fed in.
    """

    def handle_starttag(self, name, attrs):
        # XXX namespace
        self.soup.handle_starttag(name, None, None, dict(attrs))

    def handle_endtag(self, name):
        self.soup.handle_endtag(name)

    def handle_data(self, data):
        self.soup.handle_data(data)

    def handle_charref(self, name):
        """Turn a numeric character reference into a character.

        `name` is the reference without the leading "&#", e.g. "233" or
        "xe9". Code points that cannot be turned into a character become
        U+FFFD REPLACEMENT CHARACTER.
        """
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed.
        #
        # The hex marker may be upper- or lowercase ("&#xE9;" or
        # "&#XE9;"); handling only 'x' would pass "XE9" to int() below
        # and raise an uncaught ValueError.
        if name.startswith(('x', 'X')):
            real_name = int(name.lstrip('xX'), 16)
        else:
            real_name = int(name)

        try:
            data = unichr(real_name)
        except (ValueError, OverflowError):
            data = u"\N{REPLACEMENT CHARACTER}"

        self.handle_data(data)

    def handle_entityref(self, name):
        """Replace a known named entity; pass unknown ones through as text."""
        character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
        if character is not None:
            data = character
        else:
            data = "&%s;" % name
        self.handle_data(data)

    def handle_comment(self, data):
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(Comment)

    def handle_decl(self, data):
        self.soup.endData()
        if data.startswith("DOCTYPE "):
            data = data[len("DOCTYPE "):]
        self.soup.handle_data(data)
        self.soup.endData(Doctype)

    def unknown_decl(self, data):
        if data.upper().startswith('CDATA['):
            cls = CData
            data = data[len('CDATA['):]
        else:
            cls = Declaration
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(cls)

    def handle_pi(self, data):
        self.soup.endData()
        if data.endswith("?") and data.lower().startswith("xml"):
            # "An XHTML processing instruction using the trailing '?'
            # will cause the '?' to be included in data." - HTMLParser
            # docs.
            #
            # Strip the question mark so we don't end up with two
            # question marks.
            data = data[:-1]
        self.soup.handle_data(data)
        self.soup.endData(ProcessingInstruction)


class HTMLParserTreeBuilder(HTMLTreeBuilder):
    """Tree builder backed by the standard library's HTMLParser."""

    is_xml = False
    features = [HTML, STRICT, HTMLPARSER]

    def __init__(self, *args, **kwargs):
        """Store the arguments used to build a parser for each feed()."""
        if CONSTRUCTOR_TAKES_STRICT:
            kwargs['strict'] = False
        self.parser_args = (args, kwargs)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """
        :return: A 4-tuple (markup, original encoding, encoding
        declared within markup, whether any characters had to be
        replaced with REPLACEMENT CHARACTER).
        """
        if isinstance(markup, unicode):
            return markup, None, None, False

        try_encodings = [user_specified_encoding, document_declared_encoding]
        dammit = UnicodeDammit(markup, try_encodings, is_html=True)
        return (dammit.markup, dammit.original_encoding,
                dammit.declared_html_encoding,
                dammit.contains_replacement_characters)

    def feed(self, markup):
        """Run `markup` through a fresh BeautifulSoupHTMLParser.

        :raises HTMLParseError: re-raised, after a RuntimeWarning, when
            HTMLParser cannot handle the document.
        """
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            # Bare raise keeps the original traceback.
            raise

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
    import re
    attrfind_tolerant = re.compile(
        r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
        r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
    HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant

    locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
    BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend

    from html.parser import tagfind, attrfind

    def parse_starttag(self, i):
        # NOTE(review): this function is defined at module level, so
        # self.__starttag_text is not name-mangled the way it is inside
        # the HTMLParser class body -- confirm nothing relies on the
        # mangled attribute being updated here.
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()
        while k < endpos:
            if self.strict:
                m = attrfind.match(rawdata, k)
            else:
                m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    def set_cdata_mode(self, elem):
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    BeautifulSoupHTMLParser.parse_starttag = parse_starttag
    BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode

    # With the patches above in place, strict=False can be passed to the
    # parser constructor (see HTMLParserTreeBuilder.__init__).
    CONSTRUCTOR_TAKES_STRICT = True

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/bs4/builder/_lxml.py

__all__ = [
    'LXMLTreeBuilderForXML',
    'LXMLTreeBuilder',
    ]

from StringIO import StringIO
import collections
from lxml import etree
from bs4.element import Comment, Doctype, NamespacedAttribute
from bs4.builder import (
    FAST,
    HTML,
    HTMLTreeBuilder,
    PERMISSIVE,
    TreeBuilder,
    XML)
from bs4.dammit import UnicodeDammit

LXML = 'lxml'


class LXMLTreeBuilderForXML(TreeBuilder):
    """Tree builder that acts as the target object of an lxml parser.

    The builder passes itself as `target` to the lxml parser; the
    start/end/data/comment/doctype/pi/close methods below receive the
    parse events and forward them to `self.soup`.
    """

    DEFAULT_PARSER_CLASS = etree.XMLParser

    is_xml = True

    # Well, it's permissive by XML parser standards.
    features = [LXML, XML, FAST, PERMISSIVE]

    # Number of characters read from the markup per parser.feed() call.
    CHUNK_SIZE = 512

    @property
    def default_parser(self):
        # This can either return a parser object or a class, which
        # will be instantiated with default arguments.
        return etree.XMLParser(target=self, strip_cdata=False, recover=True)

    def __init__(self, parser=None, empty_element_tags=None):
        """
        :param parser: A parser object, or a callable that is invoked
            with `target=self, strip_cdata=False` to create one. Defaults
            to `self.default_parser`.
        :param empty_element_tags: Optional iterable of tag names that
            overrides the class-level `empty_element_tags`.
        """
        if empty_element_tags is not None:
            self.empty_element_tags = set(empty_element_tags)
        if parser is None:
            # Use the default parser.
            parser = self.default_parser
        if isinstance(parser, collections.Callable):
            # Instantiate the parser with default arguments
            parser = parser(target=self, strip_cdata=False)
        self.parser = parser
        self.soup = None
        # Stack of inverted (namespace URL -> prefix) maps, one entry per
        # open tag, or None while no namespaces are in play.
        self.nsmaps = None

    def _getNsTag(self, tag):
        # Split the namespace URL out of a fully-qualified lxml tag
        # name. Copied from lxml's src/lxml/sax.py.
        if tag[0] == '{':
            return tuple(tag[1:].split('}', 1))
        else:
            return (None, tag)

    def _prefix_for_namespace(self, namespace):
        """Return the innermost prefix bound to `namespace`, or None."""
        if namespace is None or self.nsmaps is None:
            return None
        for inverted_nsmap in reversed(self.nsmaps):
            if inverted_nsmap is not None and namespace in inverted_nsmap:
                return inverted_nsmap[namespace]
        return None

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """
        :return: A 4-tuple (markup, original encoding, encoding
        declared within markup, whether any characters had to be
        replaced with REPLACEMENT CHARACTER).
        """
        if isinstance(markup, unicode):
            return markup, None, None, False

        try_encodings = [user_specified_encoding, document_declared_encoding]
        dammit = UnicodeDammit(markup, try_encodings, is_html=True)
        return (dammit.markup, dammit.original_encoding,
                dammit.declared_html_encoding,
                dammit.contains_replacement_characters)

    def feed(self, markup):
        """Feed `markup` (a string or file-like object) to the parser."""
        if isinstance(markup, basestring):
            markup = StringIO(markup)
        # Call feed() at least once, even if the markup is empty,
        # or the parser won't be initialized.
        data = markup.read(self.CHUNK_SIZE)
        self.parser.feed(data)
        while data != '':
            # Now call feed() on the rest of the data, chunk by chunk.
            data = markup.read(self.CHUNK_SIZE)
            if data != '':
                self.parser.feed(data)
        self.parser.close()

    def close(self):
        self.nsmaps = None

    def start(self, name, attrs, nsmap={}):
        """Handle a start-tag event.

        `nsmap` holds the prefix -> namespace URL mappings declared on
        this tag; it is only read, never modified.
        """
        # Make sure attrs is a mutable dict--lxml may send an immutable
        # dictproxy. This also means the caller's mapping is never
        # modified below.
        attrs = dict(attrs)

        # Invert each namespace map as it comes in.
        if len(nsmap) == 0 and self.nsmaps is not None:
            # There are no new namespaces for this tag, but namespaces
            # are in play, so we need a separate tag stack to know
            # when they end.
            self.nsmaps.append(None)
        elif len(nsmap) > 0:
            # A new namespace mapping has come into play.
            if self.nsmaps is None:
                self.nsmaps = []
            inverted_nsmap = dict((value, key) for key, value in nsmap.items())
            self.nsmaps.append(inverted_nsmap)
            # Also treat the namespace mapping as a set of attributes on the
            # tag, so we can recreate it later.
            for prefix, namespace in nsmap.items():
                attribute = NamespacedAttribute(
                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
                attrs[attribute] = namespace

        namespace, name = self._getNsTag(name)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_starttag(name, namespace, nsprefix, attrs)

    def end(self, name):
        """Handle an end-tag event and unwind the namespace stack."""
        self.soup.endData()
        namespace, name = self._getNsTag(name)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_endtag(name, nsprefix)
        if self.nsmaps is not None:
            # This tag, or one of its parents, introduced a namespace
            # mapping, so pop it off the stack.
            self.nsmaps.pop()
            if len(self.nsmaps) == 0:
                # Namespaces are no longer in play, so don't bother keeping
                # track of the namespace stack.
                self.nsmaps = None

    def pi(self, target, data):
        # Processing instructions are ignored.
        pass

    def data(self, content):
        self.soup.handle_data(content)

    def doctype(self, name, pubid, system):
        self.soup.endData()
        doctype = Doctype.for_name_and_ids(name, pubid, system)
        self.soup.object_was_parsed(doctype)

    def comment(self, content):
        "Handle comments as Comment objects."
        self.soup.endData()
        self.soup.handle_data(content)
        self.soup.endData(Comment)

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment


class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
    """HTML flavour of the lxml tree builder."""

    features = [LXML, HTML, FAST, PERMISSIVE]
    is_xml = False

    @property
    def default_parser(self):
        # Returns the class; __init__ instantiates it.
        return etree.HTMLParser

    def feed(self, markup):
        self.parser.feed(markup)
        self.parser.close()

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return u'<html><body>%s</body></html>' % fragment

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/bs4/builder/__init__.py

from collections import defaultdict
import itertools
import sys
from bs4.element import (
    CharsetMetaAttributeValue,
    ContentMetaAttributeValue,
    whitespace_re
    )

__all__ = [
    'HTMLTreeBuilder',
    'SAXTreeBuilder',
    'TreeBuilder',
    'TreeBuilderRegistry',
    ]

# Some useful features for a TreeBuilder to have.
FAST = 'fast'
PERMISSIVE = 'permissive'
STRICT = 'strict'
XML = 'xml'
HTML = 'html'
HTML_5 = 'html5'


class TreeBuilderRegistry(object):
    """Keeps track of tree builder classes and the features they offer."""

    def __init__(self):
        # feature name -> builder classes, most recently registered first.
        self.builders_for_feature = defaultdict(list)
        # All builder classes, most recently registered first.
        self.builders = []

    def register(self, treebuilder_class):
        """Register a treebuilder based on its advertised features."""
        for feature in treebuilder_class.features:
            self.builders_for_feature[feature].insert(0, treebuilder_class)
        self.builders.insert(0, treebuilder_class)

    def lookup(self, *features):
        """Find the most recently registered builder with the given features.

        Features that no registered builder advertises are ignored rather
        than treated as a mismatch.

        :return: A builder class, or None if nothing matches.
        """
        if not self.builders:
            # There are no builders at all.
            return None

        if not features:
            # They didn't ask for any features. Give them the most
            # recently registered builder.
            return self.builders[0]

        # Go down the list of features in order, and eliminate any builders
        # that don't match every feature.
        candidates = None
        candidate_set = None
        for feature in features:
            we_have_the_feature = self.builders_for_feature.get(feature, [])
            if not we_have_the_feature:
                continue
            if candidates is None:
                candidates = we_have_the_feature
                candidate_set = set(candidates)
            else:
                # Eliminate any candidates that don't have this feature.
                candidate_set = candidate_set.intersection(
                    set(we_have_the_feature))

        # The only valid candidates are the ones in candidate_set.
        # Go through the original list of candidates and pick the first one
        # that's in candidate_set.
        if candidate_set is None:
            return None
        for candidate in candidates:
            if candidate in candidate_set:
                return candidate
        return None

# The BeautifulSoup class will take feature lists from developers and use them
# to look up builders in this registry.
builder_registry = TreeBuilderRegistry()


class TreeBuilder(object):
    """Turn a document into a Beautiful Soup object tree."""

    features = []

    is_xml = False
    preserve_whitespace_tags = set()
    empty_element_tags = None # A tag will be considered an empty-element
                              # tag when and only when it has no contents.

    # A value for these tag/attribute combinations is a space- or
    # comma-separated list of CDATA, rather than a single CDATA.
    cdata_list_attributes = {}

    def __init__(self):
        self.soup = None

    def reset(self):
        pass

    def can_be_empty_element(self, tag_name):
        """Might a tag with this name be an empty-element tag?

        The final markup may or may not actually present this tag as
        self-closing.

        For instance: an HTMLBuilder does not consider a <p> tag to be
        an empty-element tag (it's not in
        HTMLBuilder.empty_element_tags). This means an empty <p> tag
        will be presented as "<p></p>", not "<p />".

        The default implementation has no opinion about which tags are
        empty-element tags, so a tag will be presented as an
        empty-element tag if and only if it has no contents.
        "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
        be left alone.
        """
        if self.empty_element_tags is None:
            return True
        return tag_name in self.empty_element_tags

    def feed(self, markup):
        raise NotImplementedError()

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """Return the markup unchanged as a 4-tuple (markup, None, None, False)."""
        return markup, None, None, False

    def test_fragment_to_document(self, fragment):
        """Wrap an HTML fragment to make it look like a document.

        Different parsers do this differently. For instance, lxml
        introduces an empty <head> tag, and html5lib
        doesn't. Abstracting this away lets us write simple tests
        which run HTML fragments through the parser and compare the
        results against other HTML fragments.

        This method should not be used outside of tests.
        """
        return fragment

    def set_up_substitutions(self, tag):
        return False

    def _replace_cdata_list_attribute_values(self, tag_name, attrs):
        """Replaces class="foo bar" with class=["foo", "bar"]

        Modifies its input in place.

        :param tag_name: Name of the tag the attributes belong to.
        :param attrs: Mapping of attribute name to value.
        :return: The same `attrs` object.
        """
        if self.cdata_list_attributes:
            universal = self.cdata_list_attributes.get('*', [])
            tag_specific = self.cdata_list_attributes.get(
                tag_name.lower(), [])
            for cdata_list_attr in itertools.chain(universal, tag_specific):
                if cdata_list_attr in attrs:
                    # Basically, we have a "class" attribute whose
                    # value is a whitespace-separated list of CSS
                    # classes. Split it into a list.
                    value = attrs[cdata_list_attr]
                    values = whitespace_re.split(value)
                    attrs[cdata_list_attr] = values
        return attrs


class SAXTreeBuilder(TreeBuilder):
    """A Beautiful Soup treebuilder that listens for SAX events."""

    def feed(self, markup):
        raise NotImplementedError()

    def close(self):
        pass

    def startElement(self, name, attrs):
        # Keys are indexed with [1] -- presumably (namespace, localname)
        # pairs; only the second item is kept.
        attrs = dict((key[1], value) for key, value in list(attrs.items()))
        #print "Start %s, %r" % (name, attrs)
        # The other tree builders call handle_starttag with
        # (name, namespace, nsprefix, attrs); namespaces are thrown away
        # here, so pass None for both.
        self.soup.handle_starttag(name, None, None, attrs)

    def endElement(self, name):
        #print "End %s" % name
        self.soup.handle_endtag(name)

    def startElementNS(self, nsTuple, nodeName, attrs):
        # Throw away (ns, nodeName) for now.
        self.startElement(nodeName, attrs)

    def endElementNS(self, nsTuple, nodeName):
        # Throw away (ns, nodeName) for now.
        self.endElement(nodeName)
        #handler.endElementNS((ns, node.nodeName), node.nodeName)

    def startPrefixMapping(self, prefix, nodeValue):
        # Ignore the prefix for now.
        pass

    def endPrefixMapping(self, prefix):
        # Ignore the prefix for now.
        # handler.endPrefixMapping(prefix)
        pass

    def characters(self, content):
        self.soup.handle_data(content)

    def startDocument(self):
        pass

    def endDocument(self):
        pass


class HTMLTreeBuilder(TreeBuilder):
    """This TreeBuilder knows facts about HTML.

    Such as which tags are empty-element tags.
    """

    preserve_whitespace_tags = set(['pre', 'textarea'])
    empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
                              'spacer', 'link', 'frame', 'base'])

    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
    # class="foo bar" means that the 'class' attribute has two values,
    # 'foo' and 'bar', not the single value 'foo bar'.  When we
    # encounter one of these attributes, we will parse its value into
    # a list of values if possible. Upon output, the list will be
    # converted back into a string.
    cdata_list_attributes = {
        "*" : ['class', 'accesskey', 'dropzone'],
        "a" : ['rel', 'rev'],
        "link" :  ['rel', 'rev'],
        "td" : ["headers"],
        "th" : ["headers"],
        "form" : ["accept-charset"],
        "object" : ["archive"],

        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
        "area" : ["rel"],
        "icon" : ["sizes"],
        "iframe" : ["sandbox"],
        "output" : ["for"],
        }

    def set_up_substitutions(self, tag):
        """Replace encoding declarations on a <meta> tag with stand-ins.

        :return: True if the tag has a `charset` attribute, False
            otherwise.
        """
        # We are only interested in <meta> tags
        if tag.name != 'meta':
            return False

        http_equiv = tag.get('http-equiv')
        content = tag.get('content')
        charset = tag.get('charset')

        # We are interested in <meta> tags that say what encoding the
        # document was originally in. This means HTML 5-style <meta>
        # tags that provide the "charset" attribute. It also means
        # HTML 4-style <meta> tags that provide the "content"
        # attribute and have "http-equiv" set to "content-type".
        #
        # In both cases we will replace the value of the appropriate
        # attribute with a standin object that can take on any
        # encoding.
        meta_encoding = None
        if charset is not None:
            # HTML 5 style:
            # <meta charset="utf8">
            meta_encoding = charset
            tag['charset'] = CharsetMetaAttributeValue(charset)

        elif (content is not None and http_equiv is not None
              and http_equiv.lower() == 'content-type'):
            # HTML 4 style:
            # <meta http-equiv="content-type" content="text/html; charset=utf8">
            #
            # NOTE(review): meta_encoding is not set in this branch, so
            # the method returns False even though a substitution was
            # made -- confirm whether callers rely on the return value.
            tag['content'] = ContentMetaAttributeValue(content)

        return (meta_encoding is not None)


def register_treebuilders_from(module):
    """Copy TreeBuilders from the given module into this module."""
    # I'm fairly sure this is not the best way to do this.
    this_module = sys.modules['bs4.builder']
    for name in module.__all__:
        obj = getattr(module, name)

        if issubclass(obj, TreeBuilder):
            setattr(this_module, name, obj)
            this_module.__all__.append(name)
            # Register the builder while we're at it.
            this_module.builder_registry.register(obj)

# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last result.
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:
    from . import _html5lib
    register_treebuilders_from(_html5lib)
except ImportError:
    # They don't have html5lib installed.
    pass
try:
    from . import _lxml
    register_treebuilders_from(_lxml)
except ImportError:
    # They don't have lxml installed.
    pass

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/bs4/dammit.py

# -*- coding: utf-8 -*-
"""Beautiful Soup bonus library: Unicode, Dammit

This class forces XML data into a standard format (usually to UTF-8 or
Unicode).  It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It does not rewrite the XML or HTML to reflect a new
encoding; that's the tree builder's job.
"""

import codecs
import re
import warnings

try:
    from htmlentitydefs import codepoint2name
except ImportError:  # Python 3 renamed the module.
    from html.entities import codepoint2name

try:
    unicode
except NameError:  # Python 3: text is ``str`` and ``chr`` covers Unicode.
    unicode = str
    unichr = chr

# Autodetects character encodings. Very useful.
# Download from http://chardet.feedparser.org/
# or 'apt-get install python-chardet'
# or 'easy_install chardet'
try:
    import chardet
except ImportError:
    chardet = None

# Available from http://cjkpython.i18n.org/.
# Imported only for its side effect of registering extra codecs.
try:
    import iconv_codec
except ImportError:
    pass

# Both patterns are applied to undecoded bytes, so they are bytes patterns.
xml_encoding_re = re.compile(
    br'^<\?.*encoding=[\'"](.*?)[\'"].*\?>', re.I)
html_meta_re = re.compile(
    br'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]', re.I)


def _windows_1252_to_utf8_table():
    """Map every defined Windows-1252 high byte to its UTF-8 encoding.

    The table is derived from the codec rather than typed by hand so
    that a single mistyped entry cannot produce invalid UTF-8.
    """
    table = {}
    for byte in range(0x80, 0x100):
        try:
            character = bytes(bytearray([byte])).decode("windows-1252")
        except UnicodeDecodeError:
            # \x81, \x8d, \x8f, \x90, and \x9d are undefined in
            # Windows-1252.
            continue
        table[byte] = character.encode("utf-8")
    return table


class EntitySubstitution(object):

    """Substitute XML or HTML entities for the corresponding characters."""

    def _populate_class_variables():
        """Build the character <-> HTML entity name lookup tables."""
        lookup = {}
        reverse_lookup = {}
        characters_for_re = []
        for codepoint, name in codepoint2name.items():
            character = unichr(codepoint)
            if codepoint != 34:
                # There's no point in turning the quotation mark into
                # &quot;, unless it happens within an attribute value, which
                # is handled elsewhere.
                characters_for_re.append(character)
                lookup[character] = name
            # But we do want to turn &quot; into the quotation mark.
            reverse_lookup[name] = character
        re_definition = "[%s]" % "".join(characters_for_re)
        return lookup, reverse_lookup, re.compile(re_definition)
    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
    }

    # Matches an angle bracket, or an ampersand that does not already
    # start a numeric or named entity reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile(
        r"([<>]|"
        r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
        r")")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Regex callback: replace a character with its named HTML entity."""
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def quoted_attribute_value(cls, value):
        """Make a value into a quoted XML attribute, possibly escaping it.

        Most strings will be quoted using double quotes.

         Bob's Bar -> "Bob's Bar"

        If a string contains double quotes, it will be quoted using
        single quotes.

         Welcome to "my bar" -> 'Welcome to "my bar"'

        If a string contains both single and double quotes, the
        double quotes will be escaped, and the string will be quoted
        using double quotes.

         Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot;"
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes.  Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "&quot;" whether this is HTML or XML.  If we
                # quoted the single quotes, we'd have to decide
                # between &apos; and &squot;.
                value = value.replace('"', "&quot;")
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)
        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        This differs from data.encode(encoding, 'xmlcharrefreplace')
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)


class UnicodeDammit(object):
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = {"macintosh": "mac-roman",
                       "x-sjis": "shift-jis"}

    ENCODINGS_WITH_SMART_QUOTES = [
        "windows-1252",
        "iso-8859-1",
        "iso-8859-2",
    ]

    # Bytes that are smart quotes (and friends) in Windows-1252.
    _SMART_QUOTES_RE = re.compile(b"([\x80-\x9f])")

    def __init__(self, markup, override_encodings=None,
                 smart_quotes_to=None, is_html=False):
        """Detect the encoding of `markup` and decode it.

        :param markup: The document, as a bytestring or Unicode string.
        :param override_encodings: Encodings to try before any encoding
         declared in or sniffed from the document.
        :param smart_quotes_to: None to leave Windows-1252 smart quotes
         alone, or one of 'ascii', 'xml', 'html' to replace them.
        :param is_html: If True, a <meta> charset declaration is also
         consulted.

        On return `unicode_markup` holds the decoded text (None if every
        attempt failed) and `original_encoding` the codec that worked.
        """
        override_encodings = list(override_encodings or [])
        self.declared_html_encoding = None
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False

        if isinstance(markup, unicode) or markup == b'':
            # Nothing to detect. An empty bytestring must not be passed
            # through unicode(), which on Python 3 would yield "b''".
            self.markup = markup
            if isinstance(markup, unicode):
                self.unicode_markup = unicode(markup)
            else:
                self.unicode_markup = unicode()
            self.original_encoding = None
            return

        new_markup, document_encoding, sniffed_encoding = \
            self._detectEncoding(markup, is_html)
        self.markup = new_markup

        # Throughout, "u is None" means "not decoded yet". An empty
        # string is a perfectly good result (e.g. a document consisting
        # only of a byte-order mark).
        u = None
        if new_markup != markup:
            # _detectEncoding modified the markup, then converted it to
            # Unicode and then to UTF-8. So convert it from UTF-8.
            u = self._convert_from("utf8")
            self.original_encoding = sniffed_encoding

        if u is None:
            for proposed_encoding in (
                    override_encodings
                    + [document_encoding, sniffed_encoding]):
                if proposed_encoding is not None:
                    u = self._convert_from(proposed_encoding)
                    if u is not None:
                        break

        # If no luck and we have auto-detection library, try that:
        if (u is None and chardet
                and not isinstance(self.markup, unicode)):
            u = self._convert_from(chardet.detect(self.markup)['encoding'])

        # As a last resort, try utf-8 and windows-1252:
        if u is None:
            for proposed_encoding in ("utf-8", "windows-1252"):
                u = self._convert_from(proposed_encoding)
                if u is not None:
                    break

        # As an absolute last resort, try the encodings again with
        # character replacement.
        if u is None:
            for proposed_encoding in (
                    override_encodings + [
                        document_encoding, sniffed_encoding,
                        "utf-8", "windows-1252"]):
                if proposed_encoding == "ascii":
                    continue
                u = self._convert_from(proposed_encoding, "replace")
                if u is not None:
                    warnings.warn(
                        UnicodeWarning(
                            "Some characters could not be decoded, and were "
                            "replaced with REPLACEMENT CHARACTER."))
                    self.contains_replacement_characters = True
                    break

        # We could at this point force it to ASCII, but that would
        # destroy so much data that I think giving up is better
        self.unicode_markup = u
        if u is None:
            self.original_encoding = None

    def _sub_ms_char(self, match):
        """Changes a MS smart quote character to an XML or HTML
        entity, or an ASCII character."""
        orig = match.group(1)
        if self.smart_quotes_to == 'ascii':
            return self.MS_CHARS_TO_ASCII.get(orig).encode()
        sub = self.MS_CHARS.get(orig)
        if isinstance(sub, tuple):
            # (HTML entity name, hexadecimal code point)
            if self.smart_quotes_to == 'xml':
                return b'&#x' + sub[1].encode() + b';'
            return b'&' + sub[0].encode() + b';'
        return sub.encode()

    def _convert_from(self, proposed, errors="strict"):
        """Try to decode self.markup using the `proposed` encoding.

        Returns the decoded text on success (also stored in self.markup,
        with self.original_encoding updated), or None if the encoding is
        unknown, was already tried with the same error mode, or fails.
        """
        proposed = self.find_codec(proposed)
        if not proposed or (proposed, errors) in self.tried_encodings:
            return None
        self.tried_encodings.append((proposed, errors))
        markup = self.markup

        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if (self.smart_quotes_to is not None
                and proposed.lower() in self.ENCODINGS_WITH_SMART_QUOTES):
            markup = self._SMART_QUOTES_RE.sub(self._sub_ms_char, markup)

        try:
            u = self._to_unicode(markup, proposed, errors)
        except Exception:
            # Deliberately broad: a failed guess (bad bytes, unknown
            # codec, non-text codec) just means "try the next encoding".
            return None
        self.markup = u
        self.original_encoding = proposed
        return self.markup

    def _to_unicode(self, data, encoding, errors="strict"):
        """Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases"""

        # strip Byte Order Mark (if present)
        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
               and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
                 and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == b'\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == b'\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        return unicode(data, encoding, errors)

    def _detectEncoding(self, xml_data, is_html=False):
        """Given a document, tries to detect its XML encoding.

        Returns (markup, declared_encoding, sniffed_encoding). The
        markup is re-encoded as UTF-8 (BOM stripped) when a UTF-16,
        UTF-32 or UTF-8 signature is recognized, or translated to ASCII
        when it looks like EBCDIC.
        """
        xml_encoding = sniffed_xml_encoding = None
        try:
            if xml_data[:4] == b'\x4c\x6f\xa7\x94':
                # EBCDIC
                xml_data = self._ebcdic_to_ascii(xml_data)
            elif xml_data[:4] == b'\x00\x3c\x00\x3f':
                # UTF-16BE
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xfe\xff') \
                     and (xml_data[2:4] != b'\x00\x00'):
                # UTF-16BE with BOM
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
            elif xml_data[:4] == b'\x3c\x00\x3f\x00':
                # UTF-16LE
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xff\xfe') and \
                     (xml_data[2:4] != b'\x00\x00'):
                # UTF-16LE with BOM
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
            elif xml_data[:4] == b'\x00\x00\x00\x3c':
                # UTF-32BE
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
            elif xml_data[:4] == b'\x3c\x00\x00\x00':
                # UTF-32LE
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
            elif xml_data[:4] == b'\x00\x00\xfe\xff':
                # UTF-32BE with BOM
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
            elif xml_data[:4] == b'\xff\xfe\x00\x00':
                # UTF-32LE with BOM
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
            elif xml_data[:3] == b'\xef\xbb\xbf':
                # UTF-8 with BOM
                sniffed_xml_encoding = 'utf-8'
                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
            else:
                sniffed_xml_encoding = 'ascii'
        except Exception:
            # Sniffing is best-effort: if the data does not decode with
            # the encoding its signature suggests, carry on with the
            # original bytes and let the caller try other encodings.
            pass

        xml_encoding_match = xml_encoding_re.match(xml_data)
        if not xml_encoding_match and is_html:
            xml_encoding_match = html_meta_re.search(xml_data)
        if xml_encoding_match is not None:
            # 'replace' keeps a garbage (non-ASCII) declaration from
            # raising; it simply won't name a usable codec.
            xml_encoding = xml_encoding_match.groups()[0].decode(
                'ascii', 'replace').lower()
            if is_html:
                self.declared_html_encoding = xml_encoding
            if sniffed_xml_encoding and \
               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
                                 'utf16', 'u16')):
                # The declaration doesn't say which byte order; the
                # sniffed encoding does.
                xml_encoding = sniffed_xml_encoding
        return xml_data, xml_encoding, sniffed_xml_encoding

    def find_codec(self, charset):
        """Return a codec name for `charset`, trying known aliases and
        variants without dashes or with underscores. Falls back to
        `charset` itself when nothing matches."""
        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
               or (charset and self._codec(charset.replace("-", ""))) \
               or (charset and self._codec(charset.replace("-", "_"))) \
               or charset

    def _codec(self, charset):
        """Return `charset` if Python has a codec by that name, else None."""
        if not charset:
            return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec

    # Lazily-built 256-byte translation table; see _ebcdic_to_ascii.
    EBCDIC_TO_ASCII_MAP = None

    def _ebcdic_to_ascii(self, s):
        """Translate an EBCDIC bytestring to its ASCII/Latin-1 equivalent."""
        c = self.__class__
        if not c.EBCDIC_TO_ASCII_MAP:
            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                    250,251,252,253,254,255)
            # A translation table is just the 256 target bytes in source
            # byte order, so it can be built directly from emap.
            c.EBCDIC_TO_ASCII_MAP = bytes(bytearray(emap))
        return s.translate(c.EBCDIC_TO_ASCII_MAP)

    # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
    # Tuple values are (HTML entity name, hexadecimal code point).
    MS_CHARS = {b'\x80': ('euro', '20AC'),
                b'\x81': ' ',
                b'\x82': ('sbquo', '201A'),
                b'\x83': ('fnof', '192'),
                b'\x84': ('bdquo', '201E'),
                b'\x85': ('hellip', '2026'),
                b'\x86': ('dagger', '2020'),
                b'\x87': ('Dagger', '2021'),
                b'\x88': ('circ', '2C6'),
                b'\x89': ('permil', '2030'),
                b'\x8A': ('Scaron', '160'),
                b'\x8B': ('lsaquo', '2039'),
                b'\x8C': ('OElig', '152'),
                b'\x8D': '?',
                b'\x8E': ('#x17D', '17D'),
                b'\x8F': '?',
                b'\x90': '?',
                b'\x91': ('lsquo', '2018'),
                b'\x92': ('rsquo', '2019'),
                b'\x93': ('ldquo', '201C'),
                b'\x94': ('rdquo', '201D'),
                b'\x95': ('bull', '2022'),
                b'\x96': ('ndash', '2013'),
                b'\x97': ('mdash', '2014'),
                b'\x98': ('tilde', '2DC'),
                b'\x99': ('trade', '2122'),
                b'\x9a': ('scaron', '161'),
                b'\x9b': ('rsaquo', '203A'),
                b'\x9c': ('oelig', '153'),
                b'\x9d': '?',
                b'\x9e': ('#x17E', '17E'),
                b'\x9f': ('Yuml', '178'),}

    # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
    # horrors like stripping diacritical marks to turn á into a, but also
    # contains non-horrors like turning “ into ".
    #
    # NOTE(review): the \xb4 entry is a tuple, unlike every other value.
    # It is never reached from _sub_ms_char, which only sees \x80-\x9f.
    MS_CHARS_TO_ASCII = {
        b'\x80' : 'EUR',
        b'\x81' : ' ',
        b'\x82' : ',',
        b'\x83' : 'f',
        b'\x84' : ',,',
        b'\x85' : '...',
        b'\x86' : '+',
        b'\x87' : '++',
        b'\x88' : '^',
        b'\x89' : '%',
        b'\x8a' : 'S',
        b'\x8b' : '<',
        b'\x8c' : 'OE',
        b'\x8d' : '?',
        b'\x8e' : 'Z',
        b'\x8f' : '?',
        b'\x90' : '?',
        b'\x91' : "'",
        b'\x92' : "'",
        b'\x93' : '"',
        b'\x94' : '"',
        b'\x95' : '*',
        b'\x96' : '-',
        b'\x97' : '--',
        b'\x98' : '~',
        b'\x99' : '(TM)',
        b'\x9a' : 's',
        b'\x9b' : '>',
        b'\x9c' : 'oe',
        b'\x9d' : '?',
        b'\x9e' : 'z',
        b'\x9f' : 'Y',
        b'\xa0' : ' ',
        b'\xa1' : '!',
        b'\xa2' : 'c',
        b'\xa3' : 'GBP',
        b'\xa4' : '$', #This approximation is especially parochial--this is the
                       #generic currency symbol.
        b'\xa5' : 'YEN',
        b'\xa6' : '|',
        b'\xa7' : 'S',
        b'\xa8' : '..',
        b'\xa9' : '',
        b'\xaa' : '(th)',
        b'\xab' : '<<',
        b'\xac' : '!',
        b'\xad' : ' ',
        b'\xae' : '(R)',
        b'\xaf' : '-',
        b'\xb0' : 'o',
        b'\xb1' : '+-',
        b'\xb2' : '2',
        b'\xb3' : '3',
        b'\xb4' : ("'", 'acute'),
        b'\xb5' : 'u',
        b'\xb6' : 'P',
        b'\xb7' : '*',
        b'\xb8' : ',',
        b'\xb9' : '1',
        b'\xba' : '(th)',
        b'\xbb' : '>>',
        b'\xbc' : '1/4',
        b'\xbd' : '1/2',
        b'\xbe' : '3/4',
        b'\xbf' : '?',
        b'\xc0' : 'A',
        b'\xc1' : 'A',
        b'\xc2' : 'A',
        b'\xc3' : 'A',
        b'\xc4' : 'A',
        b'\xc5' : 'A',
        b'\xc6' : 'AE',
        b'\xc7' : 'C',
        b'\xc8' : 'E',
        b'\xc9' : 'E',
        b'\xca' : 'E',
        b'\xcb' : 'E',
        b'\xcc' : 'I',
        b'\xcd' : 'I',
        b'\xce' : 'I',
        b'\xcf' : 'I',
        b'\xd0' : 'D',
        b'\xd1' : 'N',
        b'\xd2' : 'O',
        b'\xd3' : 'O',
        b'\xd4' : 'O',
        b'\xd5' : 'O',
        b'\xd6' : 'O',
        b'\xd7' : '*',
        b'\xd8' : 'O',
        b'\xd9' : 'U',
        b'\xda' : 'U',
        b'\xdb' : 'U',
        b'\xdc' : 'U',
        b'\xdd' : 'Y',
        b'\xde' : 'b',
        b'\xdf' : 'B',
        b'\xe0' : 'a',
        b'\xe1' : 'a',
        b'\xe2' : 'a',
        b'\xe3' : 'a',
        b'\xe4' : 'a',
        b'\xe5' : 'a',
        b'\xe6' : 'ae',
        b'\xe7' : 'c',
        b'\xe8' : 'e',
        b'\xe9' : 'e',
        b'\xea' : 'e',
        b'\xeb' : 'e',
        b'\xec' : 'i',
        b'\xed' : 'i',
        b'\xee' : 'i',
        b'\xef' : 'i',
        b'\xf0' : 'o',
        b'\xf1' : 'n',
        b'\xf2' : 'o',
        b'\xf3' : 'o',
        b'\xf4' : 'o',
        b'\xf5' : 'o',
        b'\xf6' : 'o',
        b'\xf7' : '/',
        b'\xf8' : 'o',
        b'\xf9' : 'u',
        b'\xfa' : 'u',
        b'\xfb' : 'u',
        b'\xfc' : 'u',
        b'\xfd' : 'y',
        b'\xfe' : 'b',
        b'\xff' : 'y',
        }

    # A map used when removing rogue Windows-1252/ISO-8859-1
    # characters in otherwise UTF-8 documents. Keys are byte values
    # (ints), values are the UTF-8 bytes of the same character.
    #
    # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
    # Windows-1252 and therefore have no entry.
    WINDOWS_1252_TO_UTF8 = _windows_1252_to_utf8_table()

    MULTIBYTE_MARKERS_AND_SIZES = [
        (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
        (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
        (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
        ]

    FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
    LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]

    @classmethod
    def detwingle(cls, in_bytes, main_encoding="utf8",
                  embedded_encoding="windows-1252"):
        """Fix characters from one encoding embedded in some other encoding.

        Currently the only situation supported is Windows-1252 (or its
        subset ISO-8859-1), embedded in UTF-8.

        The input must be a bytestring. If you've already converted
        the document to Unicode, you're too late.

        The output is a bytestring in which `embedded_encoding`
        characters have been converted to their `main_encoding`
        equivalents.

        Raises NotImplementedError for any other pair of encodings.
        """
        if embedded_encoding.replace('_', '-').lower() not in (
                'windows-1252', 'iso-8859-1'):
            raise NotImplementedError(
                "Windows-1252 and ISO-8859-1 are the only currently supported "
                "embedded encodings.")

        if main_encoding.lower() not in ('utf8', 'utf-8'):
            raise NotImplementedError(
                "UTF-8 is the only currently supported main encoding.")

        byte_chunks = []

        chunk_start = 0
        pos = 0
        while pos < len(in_bytes):
            byte = in_bytes[pos]
            if not isinstance(byte, int):
                # Python 2.x
                byte = ord(byte)
            if (byte >= cls.FIRST_MULTIBYTE_MARKER
                    and byte <= cls.LAST_MULTIBYTE_MARKER):
                # This is the start of a UTF-8 multibyte character. Skip
                # to the end.
                for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                    if byte >= start and byte <= end:
                        pos += size
                        break
            elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
                # We found a Windows-1252 character!
                # Save the string up to this point as a chunk.
                byte_chunks.append(in_bytes[chunk_start:pos])

                # Now translate the Windows-1252 character into UTF-8
                # and add it as another chunk.
                byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
                pos += 1
                chunk_start = pos
            else:
                # Go on to the next character.
                pos += 1
        if chunk_start == 0:
            # The string is unchanged.
            return in_bytes
        # Store the final chunk.
        byte_chunks.append(in_bytes[chunk_start:])
        return b''.join(byte_chunks)

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/bs4/element.py

import collections
import re
import sys
import warnings
from bs4.dammit import EntitySubstitution

DEFAULT_OUTPUT_ENCODING = "utf-8"
PY3K = (sys.version_info[0] > 2)

whitespace_re = re.compile(r"\s+")


def _alias(attr):
    """Alias one attribute name to another for backward compatibility"""
    @property
    def alias(self):
        return getattr(self, attr)

    @alias.setter
    def alias(self, value):
        # A property setter receives the assigned value; forward it to
        # the real attribute.
        return setattr(self, attr, value)
    return alias


class NamespacedAttribute(unicode):
    """A string of the form "prefix:name" that also remembers its
    prefix, local name and namespace."""

    def __new__(cls, prefix, name, namespace=None):
        if name is None:
            obj = unicode.__new__(cls, prefix)
        else:
            obj = unicode.__new__(cls, prefix + ":" + name)
        obj.prefix = prefix
        obj.name = name
        obj.namespace = namespace
        return obj


class AttributeValueWithCharsetSubstitution(unicode):
    """A stand-in object for a character encoding specified in HTML."""


class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """A generic stand-in for the value of a meta tag's 'charset' attribute.

    When Beautiful Soup parses the markup '<meta charset="utf8">', the
    value of the 'charset' attribute will be one of these objects.
    """

    def __new__(cls, original_value):
        obj = unicode.__new__(cls, original_value)
        obj.original_value = original_value
        return obj

    def encode(self, encoding):
        """Return the name of the encoding the document is written in."""
        return encoding


class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """A generic stand-in for the value of a meta tag's 'content' attribute.

    When Beautiful Soup parses the markup:
     <meta http-equiv="content-type" content="text/html; charset=utf8">

    The value of the 'content' attribute will be one of these objects.
    """

    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value):
        match = cls.CHARSET_RE.search(original_value)
        if match is None:
            # No substitution necessary.
            return unicode.__new__(unicode, original_value)

        obj = unicode.__new__(cls, original_value)
        obj.original_value = original_value
        return obj

    def encode(self, encoding):
        """Return the original value with its charset replaced by
        `encoding`."""
        def rewrite(match):
            return match.group(1) + encoding
        return self.CHARSET_RE.sub(rewrite, self.original_value)


class PageElement(object):
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text)"""

    # There are five possible values for the "formatter" argument passed in
    # to methods like encode() and prettify():
    #
    # "html" - All Unicode characters with corresponding HTML entities
    #   are converted to those entities on output.
    # "minimal" - Bare ampersands and angle brackets are converted to
    #   XML entities: &amp; &lt; &gt;
    # None - The null formatter. Unicode characters are never
    #   converted to entities.  This is not recommended, but it's
    #   faster than "minimal".
    # A function - This function will be called on every string that
    #  needs to undergo entity substitution
    FORMATTERS = {
        "html" : EntitySubstitution.substitute_html,
        "minimal" : EntitySubstitution.substitute_xml,
        None : None
        }

    @classmethod
    def format_string(cls, s, formatter='minimal'):
        """Format the given string using the given formatter."""
        if not callable(formatter):
            formatter = cls.FORMATTERS.get(
                formatter, EntitySubstitution.substitute_xml)
        if formatter is None:
            output = s
        else:
            output = formatter(s)
        return output

    def setup(self, parent=None, previous_element=None):
        """Sets up the initial relations between this element and
        other elements."""
        self.parent = parent
        self.previous_element = previous_element
        if previous_element is not None:
            self.previous_element.next_element = self
        self.next_element = None
        self.previous_sibling = None
        self.next_sibling = None
        if self.parent is not None and self.parent.contents:
            self.previous_sibling = self.parent.contents[-1]
            self.previous_sibling.next_sibling = self

    nextSibling = _alias("next_sibling")  # BS3
    previousSibling = _alias("previous_sibling")  # BS3

    def replace_with(self, replace_with):
        """Replace this element in the tree with `replace_with`.

        Returns this (now detached) element.
        """
        if replace_with is self:
            return
        if replace_with is self.parent:
            raise ValueError("Cannot replace a Tag with its parent.")
        old_parent = self.parent
        my_index = self.parent.index(self)
        self.extract()
        old_parent.insert(my_index, replace_with)
        return self
    replaceWith = replace_with  # BS3

    def unwrap(self):
        """Replace this element with its own children. Returns this
        (now detached) element."""
        my_parent = self.parent
        my_index = self.parent.index(self)
        self.extract()
        # Inserting in reverse at a fixed index preserves child order.
        for child in reversed(self.contents[:]):
            my_parent.insert(my_index, child)
        return self
    replace_with_children = unwrap
    replaceWithChildren = unwrap  # BS3

    def wrap(self, wrap_inside):
        """Put this element inside `wrap_inside`, which takes its place
        in the tree. Returns `wrap_inside`."""
        me = self.replace_with(wrap_inside)
        wrap_inside.append(me)
        return wrap_inside

    def extract(self):
        """Destructively rips this element out of the tree."""
        if self.parent is not None:
            del self.parent.contents[self.parent.index(self)]

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        last_child = self._last_descendant()
        next_element = last_child.next_element

        if self.previous_element is not None:
            self.previous_element.next_element = next_element
        if next_element is not None:
            next_element.previous_element = self.previous_element
        self.previous_element = None
        last_child.next_element = None

        self.parent = None
        if self.previous_sibling is not None:
            self.previous_sibling.next_sibling = self.next_sibling
        if self.next_sibling is not None:
            self.next_sibling.previous_sibling = self.previous_sibling
        self.previous_sibling = self.next_sibling = None
        return self

    def _last_descendant(self):
        "Finds the last element beneath this object to be parsed."
        last_child = self
        while hasattr(last_child, 'contents') and last_child.contents:
            last_child = last_child.contents[-1]
        return last_child
    # BS3: Not part of the API!
    _lastRecursiveChild = _last_descendant

    def insert(self, position, new_child):
        """Insert `new_child` into this element's contents at `position`,
        relinking sibling and document-order pointers around it."""
        if new_child is self:
            raise ValueError("Cannot insert a tag into itself.")
        if (isinstance(new_child, basestring)
            and not isinstance(new_child, NavigableString)):
            new_child = NavigableString(new_child)

        position = min(position, len(self.contents))
        if hasattr(new_child, 'parent') and new_child.parent is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if new_child.parent is self:
                current_index = self.index(new_child)
                if current_index < position:
                    # We're moving this element further down the list
                    # of this object's children. That means that when
                    # we extract this element, our target index will
                    # jump down one.
                    position -= 1
            new_child.extract()

        new_child.parent = self
        previous_child = None
        if position == 0:
            new_child.previous_sibling = None
            new_child.previous_element = self
        else:
            previous_child = self.contents[position - 1]
            new_child.previous_sibling = previous_child
            new_child.previous_sibling.next_sibling = new_child
            new_child.previous_element = previous_child._last_descendant()
        if new_child.previous_element is not None:
            new_child.previous_element.next_element = new_child

        new_childs_last_element = new_child._last_descendant()

        if position >= len(self.contents):
            new_child.next_sibling = None

            # Walk up the tree to find whatever follows this tag in
            # document order.
            parent = self
            parents_next_sibling = None
            while parents_next_sibling is None and parent is not None:
                parents_next_sibling = parent.next_sibling
                parent = parent.parent
                if parents_next_sibling is not None:
                    # We found the element that comes next in the document.
                    break
            if parents_next_sibling is not None:
                new_childs_last_element.next_element = parents_next_sibling
            else:
                # The last element of this tag is the last element in
                # the document.
                new_childs_last_element.next_element = None
        else:
            next_child = self.contents[position]
            new_child.next_sibling = next_child
            if new_child.next_sibling is not None:
                new_child.next_sibling.previous_sibling = new_child
            new_childs_last_element.next_element = next_child

        if new_childs_last_element.next_element is not None:
            new_childs_last_element.next_element.previous_element = \
                new_childs_last_element
        self.contents.insert(position, new_child)

    def append(self, tag):
        """Appends the given tag to the contents of this tag."""
        self.insert(len(self.contents), tag)

    def insert_before(self, predecessor):
        """Makes the given element the immediate predecessor of this one.

        The two elements will have the same parent, and the given element
        will be immediately before this one.
        """
        if self is predecessor:
            raise ValueError("Can't insert an element before itself.")
        parent = self.parent
        if parent is None:
            raise ValueError(
                "Element has no parent, so 'before' has no meaning.")
        # Extract first so that the index won't be screwed up if they
        # are siblings.
        if isinstance(predecessor, PageElement):
            predecessor.extract()
        index = parent.index(self)
        parent.insert(index, predecessor)

    def insert_after(self, successor):
        """Makes the given element the immediate successor of this one.

        The two elements will have the same parent, and the given element
        will be immediately after this one.
        """
        if self is successor:
            raise ValueError("Can't insert an element after itself.")
        parent = self.parent
        if parent is None:
            raise ValueError(
                "Element has no parent, so 'after' has no meaning.")
        # Extract first so that the index won't be screwed up if they
        # are siblings.
        if isinstance(successor, PageElement):
            successor.extract()
        index = parent.index(self)
        parent.insert(index+1, successor)

    def find_next(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._find_one(self.find_all_next, name, attrs, text, **kwargs)
    findNext = find_next  # BS3

    def find_all_next(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        return self._find_all(name, attrs, text, limit, self.next_elements,
                             **kwargs)
    findAllNext = find_all_next  # BS3

    def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._find_one(self.find_next_siblings, name, attrs, text,
                             **kwargs)
    findNextSibling = find_next_sibling  # BS3

    def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
                           **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._find_all(name, attrs, text, limit,
                              self.next_siblings, **kwargs)
    findNextSiblings = find_next_siblings   # BS3
    fetchNextSiblings = find_next_siblings  # BS2

    def find_previous(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._find_one(
            self.find_all_previous, name, attrs, text, **kwargs)
    findPrevious = find_previous  # BS3

    def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._find_all(name, attrs, text, limit, self.previous_elements,
                           **kwargs)
    findAllPrevious = find_all_previous  # BS3
    fetchPrevious = find_all_previous    # BS2

    def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._find_one(self.find_previous_siblings, name, attrs, text,
                             **kwargs)
    findPreviousSibling = find_previous_sibling  # BS3

    def find_previous_siblings(self, name=None, attrs={}, text=None,
                               limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._find_all(name, attrs, text, limit,
                              self.previous_siblings, **kwargs)
    findPreviousSiblings = find_previous_siblings   # BS3
    fetchPreviousSiblings = find_previous_siblings  # BS2

    def find_parent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria."""
        # NOTE: We can't use _find_one because findParents takes a different
        # set of arguments.
        r = None
        l = self.find_parents(name, attrs, 1, **kwargs)
        if l:
            r = l[0]
        return r
    findParent = find_parent  # BS3

    def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""

        return self._find_all(name, attrs, None, limit, self.parents,
                             **kwargs)
    findParents = find_parents   # BS3
    fetchParents = find_parents  # BS2

    @property
    def next(self):
        return self.next_element

    @property
    def previous(self):
        return self.previous_element

    #These methods do the real heavy lifting.

    def _find_one(self, method, name, attrs, text, **kwargs):
        """Call `method` with a limit of 1 and return its single result,
        or None if there was none."""
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _find_all(self, name, attrs, text, limit, generator, **kwargs):
        "Iterates over a generator looking for things that match."

        if isinstance(name, SoupStrainer):
            strainer = name
        elif text is None and not limit and not attrs and not kwargs:
            # Optimization to find all tags.
            if name is True or name is None:
                return [element for element in generator
                        if isinstance(element, Tag)]
            # Optimization to find all tags with a given name.
            elif isinstance(name, basestring):
                return [element for element in generator
                        if isinstance(element, Tag) and element.name == name]
            else:
                strainer = SoupStrainer(name, attrs, text, **kwargs)
        else:
            # Build a SoupStrainer
            strainer = SoupStrainer(name, attrs, text, **kwargs)
        results = ResultSet(strainer)
        for i in generator:
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    #These generators can be used to navigate starting from both
    #NavigableStrings and Tags.
    @property
    def next_elements(self):
        i = self.next_element
        while i is not None:
            yield i
            i = i.next_element

    @property
    def next_siblings(self):
        i = self.next_sibling
        while i is not None:
            yield i
            i = i.next_sibling

    @property
    def previous_elements(self):
        i = self.previous_element
        while i is not None:
            yield i
            i = i.previous_element

    @property
    def previous_siblings(self):
        i = self.previous_sibling
        while i is not None:
            yield i
            i = i.previous_sibling

    @property
    def parents(self):
        i = self.parent
        while i is not None:
            yield i
            i = i.parent

    # Methods for supporting CSS selectors.

    tag_name_re = re.compile(r'^[a-z0-9]+$')

    # /^(\w+)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
    #   \---/  \---/\-------------/    \-------/
    #     |      |         |               |
    #     |      |         |           The value
    #     |      |    ~,|,^,$,* or =
    #     |   Attribute
    #    Tag
    attribselect_re = re.compile(
        r'^(?P<tag>\w+)?\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)' +
        r'=?"?(?P<value>[^\]"]*)"?\]$'
        )

    def _attr_value_as_string(self, value, default=None):
        """Force an attribute value into a string representation.

        A multi-valued attribute will be converted into a
        space-separated string.
        """
        value = self.get(value, default)
        if isinstance(value, (list, tuple)):
            value = " ".join(value)
        return value

    def _attribute_checker(self, operator, attribute, value=''):
        """Create a function that performs a CSS selector operation.

        Takes an operator, attribute and optional value. Returns a
        function that will return True for elements that match that
        combination.
        """
        if operator == '=':
            # string representation of `attribute` is equal to `value`
            return lambda el: el._attr_value_as_string(attribute) == value
        elif operator == '~':
            # space-separated list representation of `attribute`
            # contains `value`
            def _includes_value(element):
                attribute_value = element.get(attribute, [])
                if not isinstance(attribute_value, list):
                    attribute_value = attribute_value.split()
                return value in attribute_value
            return _includes_value
        elif operator == '^':
            # string representation of `attribute` starts with `value`
            return lambda el: el._attr_value_as_string(
                attribute, '').startswith(value)
        elif operator == '$':
            # string representation of `attribute` ends with `value`
            return lambda el: el._attr_value_as_string(
                attribute, '').endswith(value)
        elif operator == '*':
            # string representation of `attribute` contains `value`
            return lambda el: value in el._attr_value_as_string(attribute, '')
        elif operator == '|':
            # string representation of `attribute` is either exactly
            # `value` or starts with `value` and then a dash.
            def _is_or_starts_with_dash(element):
                attribute_value = element._attr_value_as_string(attribute, '')
                return (attribute_value == value or attribute_value.startswith(
                        value + '-'))
            return _is_or_starts_with_dash
        else:
            return lambda el: el.has_attr(attribute)

    def select(self, selector):
        """Perform a CSS selection operation on the current element.

        Returns a list of matching elements; an unsupported or malformed
        selector yields an empty list.
        """
        tokens = selector.split()
        current_context = [self]
        for index, token in enumerate(tokens):
            # The index > 0 guard matters: tokens[-1] is the LAST token,
            # not "the token before the first one".
            if index > 0 and tokens[index - 1] == '>':
                # already found direct descendants in last step. skip this
                # step.
                continue
            m = self.attribselect_re.match(token)
            if m is not None:
                # Attribute selector
                tag, attribute, operator, value = m.groups()
                if not tag:
                    tag = True
                checker = self._attribute_checker(operator, attribute, value)
                found = []
                for context in current_context:
                    found.extend(
                        [el for el in context.find_all(tag) if checker(el)])
                current_context = found
                continue

            if '#' in token:
                # ID selector
                tag, id_ = token.split('#', 1)
                if tag == "":
                    tag = True
                # Look beneath every context element; the first hit wins
                # since IDs are meant to be unique.
                el = None
                for context in current_context:
                    el = context.find(tag, {'id': id_})
                    if el is not None:
                        break
                if el is None:
                    return [] # No match
                current_context = [el]
                continue

            if '.' in token:
                # Class selector
                tag_name, klass = token.split('.', 1)
                if not tag_name:
                    tag_name = True
                classes = set(klass.split('.'))
                found = []
                def classes_match(tag):
                    if tag_name is not True and tag.name != tag_name:
                        return False
                    if not tag.has_attr('class'):
                        return False
                    return classes.issubset(tag['class'])
                for context in current_context:
                    found.extend(context.find_all(classes_match))
                current_context = found
                continue

            if token == '*':
                # Star selector
                found = []
                for context in current_context:
                    found.extend(context.find_all(True))
                current_context = found
                continue

            if token == '>':
                # Child selector
                if index + 1 >= len(tokens):
                    # A dangling '>' has nothing to select.
                    return []
                tag = tokens[index + 1]
                if not tag:
                    tag = True

                found = []
                for context in current_context:
                    found.extend(context.find_all(tag, recursive=False))
                current_context = found
                continue

            # Here we should just have a regular tag
            if not self.tag_name_re.match(token):
                return []
            found = []
            for context in current_context:
                found.extend(context.find_all(token))
            current_context = found
        return current_context

    # Old non-property versions of the generators, for backwards
    # compatibility with BS3.
# NOTE(review): the five *Generator defs below take `self` and appear to be
# the tail of the PageElement class body, whose header lies before this
# chunk -- indent them under that class when the preceding lines are
# restored. TODO confirm against the full file.
def nextGenerator(self):
    """BS3-era method form of the `next_elements` property."""
    return self.next_elements


def nextSiblingGenerator(self):
    """BS3-era method form of the `next_siblings` property."""
    return self.next_siblings


def previousGenerator(self):
    """BS3-era method form of the `previous_elements` property."""
    return self.previous_elements


def previousSiblingGenerator(self):
    """BS3-era method form of the `previous_siblings` property."""
    return self.previous_siblings


def parentGenerator(self):
    """BS3-era method form of the `parents` property."""
    return self.parents


class NavigableString(unicode, PageElement):
    """A Unicode string that is also a node of the parse tree.

    PREFIX and SUFFIX are wrapped around the formatted text by
    output_ready(); subclasses override them to emit markup such as
    comment or CDATA delimiters.
    """

    PREFIX = ''
    SUFFIX = ''

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.
        """
        if isinstance(value, unicode):
            return unicode.__new__(cls, value)
        return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)

    def __getnewargs__(self):
        """Return the constructor arguments used when unpickling."""
        return (unicode(self),)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper.

        Any other missing attribute raises AttributeError.
        """
        if attr == 'string':
            return self
        raise AttributeError(
            "'%s' object has no attribute '%s'" % (
                self.__class__.__name__, attr))

    def output_ready(self, formatter="minimal"):
        """Return the formatted string wrapped in PREFIX and SUFFIX."""
        output = self.format_string(self, formatter)
        return self.PREFIX + output + self.SUFFIX


class PreformattedString(NavigableString):
    """A NavigableString not subject to the normal formatting rules.

    The string will be passed into the formatter (to trigger side effects),
    but the return value will be ignored.
    """

    def output_ready(self, formatter="minimal"):
        """CData strings are passed into the formatter.
        But the return value is ignored."""
        self.format_string(self, formatter)
        return self.PREFIX + self + self.SUFFIX


class CData(PreformattedString):
    PREFIX = u'<![CDATA['
    SUFFIX = u']]>'


class ProcessingInstruction(PreformattedString):
    PREFIX = u'<?'
    SUFFIX = u'?>'


class Comment(PreformattedString):
    PREFIX = u'<!--'
    SUFFIX = u'-->'


class Declaration(PreformattedString):
    PREFIX = u'<!'
    SUFFIX = u'!>'


class Doctype(PreformattedString):
    """A document type declaration, rendered as '<!DOCTYPE ...>' plus a
    trailing newline."""

    @classmethod
    def for_name_and_ids(cls, name, pub_id, system_id):
        """Build a Doctype from a name and optional public/system IDs.

        A public ID yields ' PUBLIC "<pub_id>"', followed by the quoted
        system ID when one is also given. A system ID on its own yields
        ' SYSTEM "<system_id>"'.
        """
        # A missing name used to make the `+=` below raise TypeError.
        value = name or ''
        if pub_id is not None:
            value += ' PUBLIC "%s"' % pub_id
            if system_id is not None:
                value += ' "%s"' % system_id
        elif system_id is not None:
            value += ' SYSTEM "%s"' % system_id

        return Doctype(value)

    PREFIX = u'<!DOCTYPE '
    SUFFIX = u'>\n'


class Tag(PageElement):
    """Represents a found HTML tag with its attributes and contents."""

    def __init__(self, parser=None, builder=None, name=None, namespace=None,
                 prefix=None, attrs=None, parent=None, previous=None):
        """Basic constructor.

        Raises ValueError when no name is given. When a builder is
        supplied it is asked to rewrite multi-valued attributes, to set up
        substitutions, and to say whether this tag may be an empty-element
        tag; without a builder the attributes are copied as-is and the tag
        is never treated as an empty-element tag.
        """
        if parser is None:
            self.parser_class = None
        else:
            # We don't actually store the parser object: that lets extracted
            # chunks be garbage-collected.
            self.parser_class = parser.__class__
        if name is None:
            raise ValueError("No value provided for new tag's name.")
        self.name = name
        self.namespace = namespace
        self.prefix = prefix
        if attrs is None:
            attrs = {}
        elif builder is not None and builder.cdata_list_attributes:
            # The `builder is not None` guard matters: a Tag built by hand
            # with attrs but no builder used to die with AttributeError.
            attrs = builder._replace_cdata_list_attribute_values(
                self.name, attrs)
        else:
            attrs = dict(attrs)
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False

        # Set up any substitutions, such as the charset in a META tag.
        if builder is not None:
            builder.set_up_substitutions(self)
            self.can_be_empty_element = builder.can_be_empty_element(name)
        else:
            self.can_be_empty_element = False

    parserClass = _alias("parser_class")  # BS3

    @property
    def is_empty_element(self):
        """Is this tag an empty-element tag? (aka a self-closing tag)

        A tag that has contents is never an empty-element tag.

        A tag that has no contents may or may not be an empty-element
        tag. It depends on the builder used to create the tag. If the
        builder has a designated list of empty-element tags, then only
        a tag whose name shows up in that list is considered an
        empty-element tag.

        If the builder has no designated list of empty-element tags,
        then any tag with no contents is an empty-element tag.
        """
        return len(self.contents) == 0 and self.can_be_empty_element
    isSelfClosing = is_empty_element  # BS3

    @property
    def string(self):
        """Convenience property to get the single string within this tag.

        :Return: If this tag has a single string child, return value
         is that string. If this tag has no children, or more than one
         child, return value is None. If this tag has one child tag,
         return value is the 'string' attribute of the child tag,
         recursively.
        """
        if len(self.contents) != 1:
            return None
        child = self.contents[0]
        if isinstance(child, NavigableString):
            return child
        return child.string

    @string.setter
    def string(self, string):
        """Replace the tag's contents with a copy of the given string."""
        self.clear()
        self.append(string.__class__(string))

    def _all_strings(self, strip=False):
        """Yield all child strings, possibly stripping them.

        With strip=True, strings that are empty after stripping are
        skipped.
        """
        for descendant in self.descendants:
            if not isinstance(descendant, NavigableString):
                continue
            if strip:
                descendant = descendant.strip()
                if len(descendant) == 0:
                    continue
            yield descendant
    strings = property(_all_strings)

    @property
    def stripped_strings(self):
        """Yield every descendant string, stripped, skipping empty ones."""
        for string in self._all_strings(True):
            yield string

    def get_text(self, separator="", strip=False):
        """
        Get all child strings, concatenated using the given separator.
        """
        return separator.join([s for s in self._all_strings(strip)])
    getText = get_text
    text = property(get_text)

    def decompose(self):
        """Recursively destroys the contents of this tree."""
        self.extract()
        i = self
        while i is not None:
            # Grab the successor first: clearing __dict__ erases the link.
            following = i.next_element
            i.__dict__.clear()
            i = following

    def clear(self, decompose=False):
        """
        Extract all children. If decompose is True, decompose instead.
        """
        # Iterate over a copy: extracting mutates self.contents.
        if decompose:
            for element in self.contents[:]:
                if isinstance(element, Tag):
                    element.decompose()
                else:
                    element.extract()
        else:
            for element in self.contents[:]:
                element.extract()

    def index(self, element):
        """
        Find the index of a child by identity, not value. Avoids issues with
        tag.contents.index(element) getting the index of equal elements.

        Raises ValueError if the element is not a direct child.
        """
        for i, child in enumerate(self.contents):
            if child is element:
                return i
        raise ValueError("Tag.index: element not in tag")

    def get(self, key, default=None):
        """Returns the value of the 'key' attribute for the tag, or
        the value given for 'default' if it doesn't have that
        attribute."""
        return self.attrs.get(key, default)

    def has_attr(self, key):
        """Return True if the tag has an attribute named `key`."""
        return key in self.attrs

    def __hash__(self):
        """Hash the tag by its rendered string form."""
        return str(self).__hash__()

    def __getitem__(self, key):
        """tag[key] returns the value of the 'key' attribute for the tag,
        and throws an exception if it's not there."""
        return self.attrs[key]

    def __iter__(self):
        "Iterating over a tag iterates over its contents."
        return iter(self.contents)

    def __len__(self):
        "The length of a tag is the length of its list of contents."
        return len(self.contents)

    def __contains__(self, x):
        """`x in tag` tests membership in the tag's direct contents."""
        return x in self.contents

    def __nonzero__(self):
        "A tag is non-None even if it has no contents."
        return True

    def __setitem__(self, key, value):
        """Setting tag[key] sets the value of the 'key' attribute for the
        tag."""
        self.attrs[key] = value

    def __delitem__(self, key):
        "Deleting tag[key] deletes all 'key' attributes for the tag."
        # pop() with a default: deleting an absent attribute is a no-op.
        self.attrs.pop(key, None)

    def __call__(self, *args, **kwargs):
        """Calling a tag like a function is the same as calling its
        find_all() method. Eg. tag('a') returns a list of all the A tags
        found within this tag."""
        return self.find_all(*args, **kwargs)

    def __getattr__(self, tag):
        """Treat an unknown attribute as a search for a child tag.

        `tag.fooTag` (BS3 spelling) warns and returns find("foo"); any
        other name that is not dunder-style and not "contents" returns
        find(name). Everything else raises AttributeError.
        """
        if len(tag) > 3 and tag.endswith('Tag'):
            # BS3: soup.aTag -> "soup.find("a")
            tag_name = tag[:-3]
            warnings.warn(
                '.%sTag is deprecated, use .find("%s") instead.' % (
                    tag_name, tag_name))
            return self.find(tag_name)
        # We special case contents to avoid recursion.
        elif not tag.startswith("__") and not tag == "contents":
            return self.find(tag)
        raise AttributeError(
            "'%s' object has no attribute '%s'" % (self.__class__, tag))

    def __eq__(self, other):
        """Returns true iff this tag has the same name, the same attributes,
        and the same contents (recursively) as the given tag."""
        if self is other:
            return True
        if (not hasattr(other, 'name') or
                not hasattr(other, 'attrs') or
                not hasattr(other, 'contents') or
                self.name != other.name or
                self.attrs != other.attrs or
                len(self) != len(other)):
            return False
        for i, my_child in enumerate(self.contents):
            if my_child != other.contents[i]:
                return False
        return True

    def __ne__(self, other):
        """Returns true iff this tag is not identical to the other tag,
        as defined in __eq__."""
        return not self == other

    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders this tag as a string."""
        return self.encode(encoding)

    def __unicode__(self):
        return self.decode()

    def __str__(self):
        return self.encode()

    if PY3K:
        __str__ = __repr__ = __unicode__

    def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
               indent_level=None, formatter="minimal",
               errors="xmlcharrefreplace"):
        """Render this tag and its contents as a bytestring."""
        # Turn the data structure into Unicode, then encode the
        # Unicode.
        u = self.decode(indent_level, encoding, formatter)
        return u.encode(encoding, errors)

    def decode(self, indent_level=None,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Returns a Unicode representation of this tag and its contents.

        :param indent_level: None for compact output; otherwise the
         nesting depth used for pretty-printing.

        :param eventual_encoding: The tag is destined to be
         encoded into this encoding. This method is _not_
         responsible for performing that encoding. This information
         is passed in so that it can be substituted in if the
         document contains a <META> tag that mentions the document's
         encoding.

        :param formatter: Passed through to format_string().
        """
        attrs = []
        if self.attrs:
            # Sorted so that attribute order in the output is stable.
            for key, val in sorted(self.attrs.items()):
                if val is None:
                    decoded = key
                else:
                    if isinstance(val, (list, tuple)):
                        val = ' '.join(val)
                    elif not isinstance(val, basestring):
                        val = str(val)
                    elif (
                            isinstance(
                                val, AttributeValueWithCharsetSubstitution)
                            and eventual_encoding is not None):
                        val = val.encode(eventual_encoding)

                    text = self.format_string(val, formatter)
                    decoded = (
                        str(key) + '=' +
                        EntitySubstitution.quoted_attribute_value(text))
                attrs.append(decoded)

        prefix = ''
        if self.prefix:
            prefix = self.prefix + ":"

        close = ''
        closeTag = ''
        if self.is_empty_element:
            close = '/'
        else:
            # The closing tag carries the same namespace prefix as the
            # opening tag; it used to be emitted without it.
            closeTag = '</%s%s>' % (prefix, self.name)

        pretty_print = (indent_level is not None)
        if pretty_print:
            space = (' ' * (indent_level - 1))
            indent_contents = indent_level + 1
        else:
            space = ''
            indent_contents = None
        contents = self.decode_contents(
            indent_contents, eventual_encoding, formatter)

        if self.hidden:
            # This is the 'document root' object.
            s = contents
        else:
            s = []
            attribute_string = ''
            if attrs:
                attribute_string = ' ' + ' '.join(attrs)
            if pretty_print:
                s.append(space)
            s.append('<%s%s%s%s>' % (
                prefix, self.name, attribute_string, close))
            if pretty_print:
                s.append("\n")
            s.append(contents)
            if pretty_print and contents and contents[-1] != "\n":
                s.append("\n")
            if pretty_print and closeTag:
                s.append(space)
            s.append(closeTag)
            if pretty_print and closeTag and self.next_sibling:
                s.append("\n")
            s = ''.join(s)
        return s

    def prettify(self, encoding=None, formatter="minimal"):
        """Pretty-print the tag: Unicode when `encoding` is None, otherwise
        a bytestring in that encoding."""
        # True is passed as the indent level, i.e. an indent level of 1.
        if encoding is None:
            return self.decode(True, formatter=formatter)
        else:
            return self.encode(encoding, True, formatter=formatter)

    def decode_contents(self, indent_level=None,
                        eventual_encoding=DEFAULT_OUTPUT_ENCODING,
                        formatter="minimal"):
        """Renders the contents of this tag as a Unicode string.

        :param eventual_encoding: The tag is destined to be
         encoded into this encoding. This method is _not_
         responsible for performing that encoding. This information
         is passed in so that it can be substituted in if the
         document contains a <META> tag that mentions the document's
         encoding.
        """
        pretty_print = (indent_level is not None)
        s = []
        for c in self:
            text = None
            if isinstance(c, NavigableString):
                text = c.output_ready(formatter)
            elif isinstance(c, Tag):
                s.append(c.decode(indent_level, eventual_encoding,
                                  formatter))
            if text and indent_level:
                text = text.strip()
            if text:
                if pretty_print:
                    s.append(" " * (indent_level - 1))
                s.append(text)
                if pretty_print:
                    s.append("\n")
        return ''.join(s)

    def encode_contents(
            self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
            formatter="minimal"):
        """Renders the contents of this tag as a bytestring."""
        contents = self.decode_contents(indent_level, encoding, formatter)
        return contents.encode(encoding)

    # Old method for BS3 compatibility
    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                       prettyPrint=False, indentLevel=0):
        """BS3 spelling of encode_contents(); `indentLevel` is only used
        when `prettyPrint` is true."""
        if not prettyPrint:
            indentLevel = None
        return self.encode_contents(
            indent_level=indentLevel, encoding=encoding)

    #Soup methods

    # `attrs={}` is part of the public signature; it is passed along and
    # never mutated in this method.
    def find(self, name=None, attrs={}, recursive=True, text=None,
             **kwargs):
        """Return only the first child of this Tag matching the given
        criteria."""
        r = None
        l = self.find_all(name, attrs, recursive, text, 1, **kwargs)
        if l:
            r = l[0]
        return r
    findChild = find

    def find_all(self, name=None, attrs={}, recursive=True, text=None,
                 limit=None, **kwargs):
        """Extracts a list of Tag objects that match the given
        criteria.  You can specify the name of the Tag and any
        attributes you want the Tag to have.

        The value of a key-value pair in the 'attrs' map can be a
        string, a list of strings, a regular expression object, or a
        callable that takes a string and returns whether or not the
        string matches for some custom definition of 'matches'. The
        same is true of the tag name."""
        generator = self.descendants
        if not recursive:
            generator = self.children
        return self._find_all(name, attrs, text, limit, generator, **kwargs)
    findAll = find_all       # BS3
    findChildren = find_all  # BS2

    #Generator methods
    @property
    def children(self):
        """Iterate over the tag's direct children."""
        # return iter() to make the purpose of the method clear
        return iter(self.contents)  # XXX This seems to be untested.

    @property
    def descendants(self):
        """Yield every node below this tag, in document order."""
        if not len(self.contents):
            return
        # Stop at whatever follows this tag's last descendant.
        stopNode = self._last_descendant().next_element
        current = self.contents[0]
        while current is not stopNode:
            yield current
            current = current.next_element

    # Old names for backwards compatibility
    def childGenerator(self):
        return self.children

    def recursiveChildGenerator(self):
        return self.descendants

    # This was kind of misleading because has_key() (attributes) was
    # different from __in__ (contents). has_key() is gone in Python 3,
    # anyway.
    has_key = has_attr


# Next, a couple classes to represent queries and their results.
class SoupStrainer(object):
    """Encapsulates a number of ways of matching a markup element (tag or
    text)."""

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        """Store normalized name, attribute and text criteria.

        A non-dict `attrs` is treated as a value to match against the
        'class' attribute. Keyword arguments are merged into a copy of
        `attrs`, so the caller's dict (and the shared default) is never
        mutated.
        """
        self.name = self._normalize_search_value(name)
        if not isinstance(attrs, dict):
            # Treat a non-dict value for attrs as a search for the 'class'
            # attribute.
            kwargs['class'] = attrs
            attrs = None

        if kwargs:
            if attrs:
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs
        normalized_attrs = {}
        for key, value in attrs.items():
            normalized_attrs[key] = self._normalize_search_value(value)

        self.attrs = normalized_attrs
        self.text = self._normalize_search_value(text)

    def _normalize_search_value(self, value):
        """Coerce a search criterion to a form _matches() understands."""
        # Leave it alone if it's a Unicode string, a callable, a
        # regular expression, a boolean, or None.
        if (isinstance(value, unicode) or callable(value)
                or hasattr(value, 'match')
                or isinstance(value, bool) or value is None):
            return value

        # If it's a bytestring, convert it to Unicode, treating it as UTF-8.
        if isinstance(value, bytes):
            return value.decode("utf8")

        # If it's listlike, convert it into a list of strings.
        if hasattr(value, '__iter__'):
            new_value = []
            for v in value:
                if (hasattr(v, '__iter__') and not isinstance(v, bytes)
                        and not isinstance(v, unicode)):
                    # This is almost certainly the user's mistake. In the
                    # interests of avoiding infinite loops, we'll let
                    # it through as-is rather than doing a recursive call.
                    new_value.append(v)
                else:
                    new_value.append(self._normalize_search_value(v))
            return new_value

        # Otherwise, convert it into a Unicode string.
        # The unicode(str()) thing is so this will do the same thing on
        # Python 2 and Python 3.
        return unicode(str(value))

    def __str__(self):
        if self.text:
            return self.text
        else:
            return "%s|%s" % (self.name, self.attrs)

    # `markup_attrs={}` is part of the public signature; it is only read.
    def search_tag(self, markup_name=None, markup_attrs={}):
        """Check a tag, or a (name, attrs) pair, against this strainer.

        Returns the Tag (or the name that was passed in) on a match and
        None otherwise.
        """
        found = None
        markup = None
        if isinstance(markup_name, Tag):
            markup = markup_name
            markup_attrs = markup
        # callable() is the same test _normalize_search_value() uses.
        call_function_with_tag_data = (
            callable(self.name)
            and not isinstance(markup_name, Tag))

        if ((not self.name)
                or call_function_with_tag_data
                or (markup and self._matches(markup, self.name))
                or (not markup and self._matches(markup_name, self.name))):
            if call_function_with_tag_data:
                match = self.name(markup_name, markup_attrs)
            else:
                match = True
                markup_attr_map = None
                for attr, match_against in list(self.attrs.items()):
                    if not markup_attr_map:
                        if hasattr(markup_attrs, 'get'):
                            markup_attr_map = markup_attrs
                        else:
                            # A sequence of (key, value) pairs.
                            markup_attr_map = {}
                            for k, v in markup_attrs:
                                markup_attr_map[k] = v
                    attr_value = markup_attr_map.get(attr)
                    if not self._matches(attr_value, match_against):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markup_name
        if (found and self.text
                and not self._matches(found.string, self.text)):
            found = None
        return found
    searchTag = search_tag

    def search(self, markup):
        """Return `markup` (or the matching element of an iterable) if it
        satisfies this strainer, else None.

        Raises Exception for a value that is neither iterable, a Tag,
        nor a string.
        """
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if (hasattr(markup, '__iter__')
                and not isinstance(markup, (Tag, basestring))):
            for element in markup:
                if (isinstance(element, NavigableString)
                        and self.search(element)):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text or self.name or self.attrs:
                found = self.search_tag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, (NavigableString, basestring)):
            if (not self.name and not self.attrs
                    and self._matches(markup, self.text)):
                found = markup
        else:
            raise Exception(
                "I don't know how to match against a %s" % markup.__class__)
        return found

    def _matches(self, markup, match_against):
        """Return a truthy value if `markup` satisfies `match_against`.

        `match_against` may be True, a callable, a Unicode string, an
        object with a `match` attribute (regular expression), or an
        iterable of acceptable values.
        """
        if isinstance(markup, (list, tuple)):
            # This should only happen when searching a multi-valued attribute
            # like 'class'.
            if (isinstance(match_against, unicode)
                    and ' ' in match_against):
                # A bit of a special case. If they try to match "foo
                # bar" on a multivalue attribute's value, only accept
                # the literal value "foo bar"
                #
                # XXX This is going to be pretty slow because we keep
                # splitting match_against. But it shouldn't come up
                # too often.
                # list() so that a tuple value compares by content.
                return (whitespace_re.split(match_against) == list(markup))
            else:
                for item in markup:
                    if self._matches(item, match_against):
                        return True
                return False

        if match_against is True:
            # True matches any non-None value.
            return markup is not None

        if callable(match_against):
            return match_against(markup)

        # Custom callables take the tag as an argument, but all
        # other ways of matching match the tag name as a string.
        if isinstance(markup, Tag):
            markup = markup.name

        # Ensure that `markup` is either a Unicode string, or None.
        markup = self._normalize_search_value(markup)

        if markup is None:
            # None matches None, False, an empty string, an empty list,
            # and so on.
            return not match_against

        if isinstance(match_against, unicode):
            # Exact string match
            return markup == match_against

        if hasattr(match_against, 'match'):
            # Regexp match
            return match_against.search(markup)

        if hasattr(match_against, '__iter__'):
            # The markup must be an exact match against something
            # in the iterable.
            return markup in match_against

        # No recognised kind of criterion: explicit no-match (this used
        # to fall off the end and return None).
        return False


class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""

    def __init__(self, source):
        # Initialise this list; the old code initialised a throwaway [].
        list.__init__(self)
        self.source = source

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/bs4/testing.py

"""Helper classes for tests.""" import copy import functools import unittest from unittest import TestCase from bs4 import BeautifulSoup from bs4.element import ( CharsetMetaAttributeValue, Comment, ContentMetaAttributeValue, Doctype, SoupStrainer, ) from bs4.builder import HTMLParserTreeBuilder default_builder = HTMLParserTreeBuilder class SoupTest(unittest.TestCase): @property def default_builder(self): return default_builder() def soup(self, markup, **kwargs): """Build a Beautiful Soup object from markup.""" builder = kwargs.pop('builder', self.default_builder) return BeautifulSoup(markup, builder=builder, **kwargs) def document_for(self, markup): """Turn an HTML fragment into a document. The details depend on the builder. """ return self.default_builder.test_fragment_to_document(markup) def assertSoupEquals(self, to_parse, compare_parsed_to=None): builder = self.default_builder obj = BeautifulSoup(to_parse, builder=builder) if compare_parsed_to is None: compare_parsed_to = to_parse self.assertEqual(obj.decode(), self.document_for(compare_parsed_to)) class HTMLTreeBuilderSmokeTest(object): """A basic test of a treebuilder's competence. Any HTML treebuilder, present or future, should be able to pass these tests. With invalid markup, there's room for interpretation, and different parsers can handle it differently. But with the markup in these tests, there's not much room for interpretation. """ def assertDoctypeHandled(self, doctype_fragment): """Assert that a given doctype string is handled correctly.""" doctype_str, soup = self._document_with_doctype(doctype_fragment) # Make sure a Doctype object was created. doctype = soup.contents[0] self.assertEqual(doctype.__class__, Doctype) self.assertEqual(doctype, doctype_fragment) self.assertEqual(str(soup)[:len(doctype_str)], doctype_str) # Make sure that the doctype was correctly associated with the # parse tree and that the rest of the document parsed. 
self.assertEqual(soup.p.contents[0], 'foo') def _document_with_doctype(self, doctype_fragment): """Generate and parse a document with the given doctype.""" doctype = '' % doctype_fragment markup = doctype + '\n

foo

' soup = self.soup(markup) return doctype, soup def test_normal_doctypes(self): """Make sure normal, everyday HTML doctypes are handled correctly.""" self.assertDoctypeHandled("html") self.assertDoctypeHandled( 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"') def test_public_doctype_with_url(self): doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"' self.assertDoctypeHandled(doctype) def test_system_doctype(self): self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"') def test_namespaced_system_doctype(self): # We can handle a namespaced doctype with a system ID. self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"') def test_namespaced_public_doctype(self): # Test a namespaced doctype with a public id. self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"') def test_real_xhtml_document(self): """A real XHTML document should come out more or less the same as it went in.""" markup = b""" Goodbye.""" soup = self.soup(markup) self.assertEqual( soup.encode("utf-8").replace(b"\n", b""), markup.replace(b"\n", b"")) def test_deepcopy(self): """Make sure you can copy the tree builder. This is important because the builder is part of a BeautifulSoup object, and we want to be able to copy that. """ copy.deepcopy(self.default_builder) def test_p_tag_is_never_empty_element(self): """A

tag is never designated as an empty-element tag. Even if the markup shows it as an empty-element tag, it shouldn't be presented that way. """ soup = self.soup("

") self.assertFalse(soup.p.is_empty_element) self.assertEqual(str(soup.p), "

") def test_unclosed_tags_get_closed(self): """A tag that's not closed by the end of the document should be closed. This applies to all tags except empty-element tags. """ self.assertSoupEquals("

", "

") self.assertSoupEquals("", "") self.assertSoupEquals(" ", " ") def test_br_is_always_empty_element_tag(self): """A tag is designated as an empty-element tag. Some parsers treat as one tag, some parsers as two tags, but it should always be an empty-element tag. """ soup = self.soup(" ") self.assertTrue(soup.br.is_empty_element) self.assertEqual(str(soup.br), " ") def test_nested_formatting_elements(self): self.assertSoupEquals("") def test_comment(self): # Comments are represented as Comment objects. markup = "

foobaz

" self.assertSoupEquals(markup) soup = self.soup(markup) comment = soup.find(text="foobar") self.assertEqual(comment.__class__, Comment) def test_preserved_whitespace_in_pre_and_textarea(self): """Whitespace must be preserved in
 and  tags."""
        self.assertSoupEquals("
   
") self.assertSoupEquals(" woo ") def test_nested_inline_elements(self): """Inline elements can be nested indefinitely.""" b_tag = "Inside a B tag" self.assertSoupEquals(b_tag) nested_b_tag = "

A nested tag

" self.assertSoupEquals(nested_b_tag) double_nested_b_tag = "

A doubly nested tag

" self.assertSoupEquals(nested_b_tag) def test_nested_block_level_elements(self): """Block elements can be nested.""" soup = self.soup('

Foo

') blockquote = soup.blockquote self.assertEqual(blockquote.p.b.string, 'Foo') self.assertEqual(blockquote.b.string, 'Foo') def test_correctly_nested_tables(self): """One table can go inside another one.""" markup = (' ' '
' "
Here's another table:" ' ' '
foo
' '') self.assertSoupEquals( markup, '
Here\'s another table:' '
foo
' '
') self.assertSoupEquals( "
Foo
" "
Bar
" "
Baz
") def test_angle_brackets_in_attribute_values_are_escaped(self): self.assertSoupEquals('', '') def test_entities_in_attributes_converted_to_unicode(self): expect = u'

' self.assertSoupEquals('

', expect) self.assertSoupEquals('

', expect) self.assertSoupEquals('

', expect) def test_entities_in_text_converted_to_unicode(self): expect = u'

pi\N{LATIN SMALL LETTER N WITH TILDE}ata

' self.assertSoupEquals("

piñata

", expect) self.assertSoupEquals("

piñata

", expect) self.assertSoupEquals("

piñata

", expect) def test_quot_entity_converted_to_quotation_mark(self): self.assertSoupEquals("

I said "good day!"

", '

I said "good day!"

') def test_out_of_range_entity(self): expect = u"\N{REPLACEMENT CHARACTER}" self.assertSoupEquals("&#10000000000000;", expect) self.assertSoupEquals("&#x10000000000000;", expect) self.assertSoupEquals("빲�", expect) def test_basic_namespaces(self): """Parsers don't need to *understand* namespaces, but at the very least they should not choke on namespaces or lose data.""" markup = b'4' soup = self.soup(markup) self.assertEqual(markup, soup.encode()) html = soup.html self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns']) self.assertEqual( 'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml']) self.assertEqual( 'http://www.w3.org/2000/svg', soup.html['xmlns:svg']) def test_multivalued_attribute_value_becomes_list(self): markup = b'
' soup = self.soup(markup) self.assertEqual(['foo', 'bar'], soup.a['class']) # # Generally speaking, tests below this point are more tests of # Beautiful Soup than tests of the tree builders. But parsers are # weird, so we run these tests separately for every tree builder # to detect any differences between them. # def test_soupstrainer(self): """Parsers should be able to work with SoupStrainers.""" strainer = SoupStrainer("b") soup = self.soup("A bold statement", parse_only=strainer) self.assertEqual(soup.decode(), "bold") def test_single_quote_attribute_values_become_double_quotes(self): self.assertSoupEquals("", '') def test_attribute_values_with_nested_quotes_are_left_alone(self): text = """a""" self.assertSoupEquals(text) def test_attribute_values_with_double_nested_quotes_get_quoted(self): text = """a""" soup = self.soup(text) soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"' self.assertSoupEquals( soup.foo.decode(), """a""") def test_ampersand_in_attribute_value_gets_escaped(self): self.assertSoupEquals('', '') self.assertSoupEquals( 'foo', 'foo') def test_escaped_ampersand_in_attribute_value_is_left_alone(self): self.assertSoupEquals('') def test_entities_in_strings_converted_during_parsing(self): # Both XML and HTML entities are converted to Unicode characters # during parsing. text = "

<<sacré bleu!>>

" expected = u"

<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>

" self.assertSoupEquals(text, expected) def test_smart_quotes_converted_on_the_way_in(self): # Microsoft smart quotes are converted to Unicode characters during # parsing. quote = b"

\x91Foo\x92

" soup = self.soup(quote) self.assertEqual( soup.p.string, u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") def test_non_breaking_spaces_converted_on_the_way_in(self): soup = self.soup("
  ") self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2) def test_entities_converted_on_the_way_out(self): text = "

<<sacré bleu!>>

" expected = u"

<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>

".encode("utf-8") soup = self.soup(text) self.assertEqual(soup.p.encode("utf-8"), expected) def test_real_iso_latin_document(self): # Smoke test of interrelated functionality, using an # easy-to-understand document. # Here it is in Unicode. Note that it claims to be in ISO-Latin-1. unicode_html = u'

Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!

' # That's because we're going to encode it into ISO-Latin-1, and use # that to test. iso_latin_html = unicode_html.encode("iso-8859-1") # Parse the ISO-Latin-1 HTML. soup = self.soup(iso_latin_html) # Encode it to UTF-8. result = soup.encode("utf-8") # What do we expect the result to look like? Well, it would # look like unicode_html, except that the META tag would say # UTF-8 instead of ISO-Latin-1. expected = unicode_html.replace("ISO-Latin-1", "utf-8") # And, of course, it would be in UTF-8, not Unicode. expected = expected.encode("utf-8") # Ta-da! self.assertEqual(result, expected) def test_real_shift_jis_document(self): # Smoke test to make sure the parser can handle a document in # Shift-JIS encoding, without choking. shift_jis_html = ( b'
'
            b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
            b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
            b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
            b'
') unicode_html = shift_jis_html.decode("shift-jis") soup = self.soup(unicode_html) # Make sure the parse tree is correctly encoded to various # encodings. self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8")) self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp")) def test_real_hebrew_document(self): # A real-world test to make sure we can convert ISO-8859-9 (a # Hebrew encoding) to UTF-8. hebrew_document = b'

Hebrew (ISO 8859-8) in Visual Directionality

\xed\xe5\xec\xf9' soup = self.soup( hebrew_document, from_encoding="iso8859-8") self.assertEqual(soup.original_encoding, 'iso8859-8') self.assertEqual( soup.encode('utf-8'), hebrew_document.decode("iso8859-8").encode("utf-8")) def test_meta_tag_reflects_current_encoding(self): # Here's the tag saying that a document is # encoded in Shift-JIS. meta_tag = ('') # Here's a document incorporating that meta tag. shift_jis_html = ( '\n%s\n' '' 'Shift-JIS markup goes here.') % meta_tag soup = self.soup(shift_jis_html) # Parse the document, and the charset is seemingly unaffected. parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'}) content = parsed_meta['content'] self.assertEqual('text/html; charset=x-sjis', content) # But that value is actually a ContentMetaAttributeValue object. self.assertTrue(isinstance(content, ContentMetaAttributeValue)) # And it will take on a value that reflects its current # encoding. self.assertEqual('text/html; charset=utf8', content.encode("utf8")) # For the rest of the story, see TestSubstitutions in # test_tree.py. def test_html5_style_meta_tag_reflects_current_encoding(self): # Here's the tag saying that a document is # encoded in Shift-JIS. meta_tag = ('') # Here's a document incorporating that meta tag. shift_jis_html = ( '\n%s\n' '' 'Shift-JIS markup goes here.') % meta_tag soup = self.soup(shift_jis_html) # Parse the document, and the charset is seemingly unaffected. parsed_meta = soup.find('meta', id="encoding") charset = parsed_meta['charset'] self.assertEqual('x-sjis', charset) # But that value is actually a CharsetMetaAttributeValue object. self.assertTrue(isinstance(charset, CharsetMetaAttributeValue)) # And it will take on a value that reflects its current # encoding. 
self.assertEqual('utf8', charset.encode("utf8")) def test_tag_with_no_attributes_can_have_attributes_added(self): data = self.soup("text") data.a['foo'] = 'bar' self.assertEqual('text', data.a.decode()) class XMLTreeBuilderSmokeTest(object): def test_docstring_generated(self): soup = self.soup("") self.assertEqual( soup.encode(), b'\n') def test_real_xhtml_document(self): """A real XHTML document should come out *exactly* the same as it went in.""" markup = b""" Goodbye.""" soup = self.soup(markup) self.assertEqual( soup.encode("utf-8"), markup) def test_docstring_includes_correct_encoding(self): soup = self.soup("") self.assertEqual( soup.encode("latin1"), b'\n') def test_large_xml_document(self): """A large XML document should come out the same as it went in.""" markup = (b'\n' + b'0' * (2**12) + b'') soup = self.soup(markup) self.assertEqual(soup.encode("utf-8"), markup) def test_tags_are_empty_element_if_and_only_if_they_are_empty(self): self.assertSoupEquals("

", "

") self.assertSoupEquals("

foo

") def test_namespaces_are_preserved(self): markup = 'This tag is in the a namespaceThis tag is in the b namespace' soup = self.soup(markup) root = soup.root self.assertEqual("http://example.com/", root['xmlns:a']) self.assertEqual("http://example.net/", root['xmlns:b']) class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): """Smoke test for a tree builder that supports HTML5.""" def test_real_xhtml_document(self): # Since XHTML is not HTML5, HTML5 parsers are not tested to handle # XHTML documents in any particular way. pass def test_html_tags_have_namespace(self): markup = "" soup = self.soup(markup) self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace) def test_svg_tags_have_namespace(self): markup = '' soup = self.soup(markup) namespace = "http://www.w3.org/2000/svg" self.assertEqual(namespace, soup.svg.namespace) self.assertEqual(namespace, soup.circle.namespace) def test_mathml_tags_have_namespace(self): markup = '5' soup = self.soup(markup) namespace = 'http://www.w3.org/1998/Math/MathML' self.assertEqual(namespace, soup.math.namespace) self.assertEqual(namespace, soup.msqrt.namespace) def skipIf(condition, reason): def nothing(test, *args, **kwargs): return None def decorator(test_item): if condition: return nothing else: return test_item return decorator

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/bs4/tests/test_builder_registry.py

"""Tests of the builder registry.""" import unittest from bs4 import BeautifulSoup from bs4.builder import ( builder_registry as registry, HTMLParserTreeBuilder, TreeBuilderRegistry, ) try: from bs4.builder import HTML5TreeBuilder HTML5LIB_PRESENT = True except ImportError: HTML5LIB_PRESENT = False try: from bs4.builder import ( LXMLTreeBuilderForXML, LXMLTreeBuilder, ) LXML_PRESENT = True except ImportError: LXML_PRESENT = False class BuiltInRegistryTest(unittest.TestCase): """Test the built-in registry with the default builders registered.""" def test_combination(self): if LXML_PRESENT: self.assertEqual(registry.lookup('fast', 'html'), LXMLTreeBuilder) if LXML_PRESENT: self.assertEqual(registry.lookup('permissive', 'xml'), LXMLTreeBuilderForXML) self.assertEqual(registry.lookup('strict', 'html'), HTMLParserTreeBuilder) if HTML5LIB_PRESENT: self.assertEqual(registry.lookup('html5lib', 'html'), HTML5TreeBuilder) def test_lookup_by_markup_type(self): if LXML_PRESENT: self.assertEqual(registry.lookup('html'), LXMLTreeBuilder) self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML) else: self.assertEqual(registry.lookup('xml'), None) if HTML5LIB_PRESENT: self.assertEqual(registry.lookup('html'), HTML5TreeBuilder) else: self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder) def test_named_library(self): if LXML_PRESENT: self.assertEqual(registry.lookup('lxml', 'xml'), LXMLTreeBuilderForXML) self.assertEqual(registry.lookup('lxml', 'html'), LXMLTreeBuilder) if HTML5LIB_PRESENT: self.assertEqual(registry.lookup('html5lib'), HTML5TreeBuilder) self.assertEqual(registry.lookup('html.parser'), HTMLParserTreeBuilder) def test_beautifulsoup_constructor_does_lookup(self): # You can pass in a string. BeautifulSoup("", features="html") # Or a list of strings. BeautifulSoup("", features=["html", "fast"]) # You'll get an exception if BS can't find an appropriate # builder. 
self.assertRaises(ValueError, BeautifulSoup, "", features="no-such-feature") class RegistryTest(unittest.TestCase): """Test the TreeBuilderRegistry class in general.""" def setUp(self): self.registry = TreeBuilderRegistry() def builder_for_features(self, *feature_list): cls = type('Builder_' + '_'.join(feature_list), (object,), {'features' : feature_list}) self.registry.register(cls) return cls def test_register_with_no_features(self): builder = self.builder_for_features() # Since the builder advertises no features, you can't find it # by looking up features. self.assertEqual(self.registry.lookup('foo'), None) # But you can find it by doing a lookup with no features, if # this happens to be the only registered builder. self.assertEqual(self.registry.lookup(), builder) def test_register_with_features_makes_lookup_succeed(self): builder = self.builder_for_features('foo', 'bar') self.assertEqual(self.registry.lookup('foo'), builder) self.assertEqual(self.registry.lookup('bar'), builder) def test_lookup_fails_when_no_builder_implements_feature(self): builder = self.builder_for_features('foo', 'bar') self.assertEqual(self.registry.lookup('baz'), None) def test_lookup_gets_most_recent_registration_when_no_feature_specified(self): builder1 = self.builder_for_features('foo') builder2 = self.builder_for_features('bar') self.assertEqual(self.registry.lookup(), builder2) def test_lookup_fails_when_no_tree_builders_registered(self): self.assertEqual(self.registry.lookup(), None) def test_lookup_gets_most_recent_builder_supporting_all_features(self): has_one = self.builder_for_features('foo') has_the_other = self.builder_for_features('bar') has_both_early = self.builder_for_features('foo', 'bar', 'baz') has_both_late = self.builder_for_features('foo', 'bar', 'quux') lacks_one = self.builder_for_features('bar') has_the_other = self.builder_for_features('foo') # There are two builders featuring 'foo' and 'bar', but # the one that also features 'quux' was registered later. 
self.assertEqual(self.registry.lookup('foo', 'bar'), has_both_late) # There is only one builder featuring 'foo', 'bar', and 'baz'. self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'), has_both_early) def test_lookup_fails_when_cannot_reconcile_requested_features(self): builder1 = self.builder_for_features('foo', 'bar') builder2 = self.builder_for_features('foo', 'baz') self.assertEqual(self.registry.lookup('bar', 'baz'), None)

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/bs4/tests/test_docs.py

"Test harness for doctests." # pylint: disable-msg=E0611,W0142 __metaclass__ = type __all__ = [ 'additional_tests', ] import atexit import doctest import os #from pkg_resources import ( # resource_filename, resource_exists, resource_listdir, cleanup_resources) import unittest DOCTEST_FLAGS = ( doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF) # def additional_tests(): # "Run the doc tests (README.txt and docs/*, if any exist)" # doctest_files = [ # os.path.abspath(resource_filename('bs4', 'README.txt'))] # if resource_exists('bs4', 'docs'): # for name in resource_listdir('bs4', 'docs'): # if name.endswith('.txt'): # doctest_files.append( # os.path.abspath( # resource_filename('bs4', 'docs/%s' % name))) # kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS) # atexit.register(cleanup_resources) # return unittest.TestSuite(( # doctest.DocFileSuite(*doctest_files, **kwargs)))

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/bs4/tests/test_html5lib.py

"""Tests to ensure that the html5lib tree builder generates good trees.""" import warnings try: from bs4.builder import HTML5TreeBuilder HTML5LIB_PRESENT = True except ImportError, e: HTML5LIB_PRESENT = False from bs4.element import SoupStrainer from bs4.testing import ( HTML5TreeBuilderSmokeTest, SoupTest, skipIf, ) @skipIf( not HTML5LIB_PRESENT, "html5lib seems not to be present, not testing its tree builder.") class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): """See ``HTML5TreeBuilderSmokeTest``.""" @property def default_builder(self): return HTML5TreeBuilder() def test_soupstrainer(self): # The html5lib tree builder does not support SoupStrainers. strainer = SoupStrainer("b") markup = "<p>A <b>bold</b> statement.</p>" with warnings.catch_warnings(record=True) as w: soup = self.soup(markup, parse_only=strainer) self.assertEqual( soup.decode(), self.document_for(markup)) self.assertTrue( "the html5lib tree builder doesn't support parse_only" in str(w[0].message)) def test_correctly_nested_tables(self): """html5lib inserts <tbody> tags where other parsers don't.""" markup = ('<table id="1">' '<tr>' "<td>Here's another table:" '<table id="2">' '<tr><td>foo</td></tr>' '</table></td>') self.assertSoupEquals( markup, '<table id="1"><tbody><tr><td>Here\'s another table:' '<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>' '</td></tr></tbody></table>') self.assertSoupEquals( "<table><thead><tr><td>Foo</td></tr></thead>" "<tbody><tr><td>Bar</td></tr></tbody>" "<tfoot><tr><td>Baz</td></tr></tfoot></table>")

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/bs4/tests/test_htmlparser.py

"""Tests to ensure that the html.parser tree builder generates good trees.""" from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest from bs4.builder import HTMLParserTreeBuilder class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): @property def default_builder(self): return HTMLParserTreeBuilder() def test_namespaced_system_doctype(self): # html.parser can't handle namespaced doctypes, so skip this one. pass def test_namespaced_public_doctype(self): # html.parser can't handle namespaced doctypes, so skip this one. pass

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/bs4/tests/test_lxml.py

"""Tests to ensure that the lxml tree builder generates good trees.""" import re import warnings try: from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML LXML_PRESENT = True except ImportError, e: LXML_PRESENT = False from bs4 import ( BeautifulSoup, BeautifulStoneSoup, ) from bs4.element import Comment, Doctype, SoupStrainer from bs4.testing import skipIf from bs4.tests import test_htmlparser from bs4.testing import ( HTMLTreeBuilderSmokeTest, XMLTreeBuilderSmokeTest, SoupTest, skipIf, ) @skipIf( not LXML_PRESENT, "lxml seems not to be present, not testing its tree builder.") class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): """See ``HTMLTreeBuilderSmokeTest``.""" @property def default_builder(self): return LXMLTreeBuilder() def test_out_of_range_entity(self): self.assertSoupEquals( "

foo&#10000000000000;bar

", "

foobar

") self.assertSoupEquals( "

foo&#x10000000000000;bar

", "

foobar

") self.assertSoupEquals( "

foo빲�bar

", "

foobar

") def test_beautifulstonesoup_is_xml_parser(self): # Make sure that the deprecated BSS class uses an xml builder # if one is installed. with warnings.catch_warnings(record=False) as w: soup = BeautifulStoneSoup("") self.assertEqual(u"", unicode(soup.b)) def test_real_xhtml_document(self): """lxml strips the XML definition from an XHTML doc, which is fine.""" markup = b""" Goodbye.""" soup = self.soup(markup) self.assertEqual( soup.encode("utf-8").replace(b"\n", b''), markup.replace(b'\n', b'').replace( b'', b'')) @skipIf( not LXML_PRESENT, "lxml seems not to be present, not testing its XML tree builder.") class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest): """See ``HTMLTreeBuilderSmokeTest``.""" @property def default_builder(self): return LXMLTreeBuilderForXML()

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/bs4/tests/test_soup.py

# -*- coding: utf-8 -*-
"""Tests of Beautiful Soup as a whole."""

import unittest
from bs4 import (
    BeautifulSoup,
    BeautifulStoneSoup,
)
from bs4.element import (
    CharsetMetaAttributeValue,
    ContentMetaAttributeValue,
    SoupStrainer,
    NamespacedAttribute,
)
import bs4.dammit
from bs4.dammit import EntitySubstitution, UnicodeDammit
from bs4.testing import (
    SoupTest,
    skipIf,
)
import warnings

# lxml is optional.  ``except ImportError:`` (no ``, e``) parses on both
# Python 2 and Python 3 and avoids binding a name nobody reads.
try:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
    LXML_PRESENT = True
except ImportError:
    LXML_PRESENT = False


class TestDeprecatedConstructorArguments(SoupTest):
    """Old keyword spellings still work but produce a warning."""

    def test_parseOnlyThese_renamed_to_parse_only(self):
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("", parseOnlyThese=SoupStrainer("b"))
        # The first recorded warning must name both spellings.
        msg = str(w[0].message)
        self.assertTrue("parseOnlyThese" in msg)
        self.assertTrue("parse_only" in msg)
        self.assertEqual(b"", soup.encode())

    def test_fromEncoding_renamed_to_from_encoding(self):
        with warnings.catch_warnings(record=True) as w:
            utf8 = b"\xc3\xa9"
            soup = self.soup(utf8, fromEncoding="utf8")
        # The first recorded warning must name both spellings.
        msg = str(w[0].message)
        self.assertTrue("fromEncoding" in msg)
        self.assertTrue("from_encoding" in msg)
        self.assertEqual("utf8", soup.original_encoding)

    def test_unrecognized_keyword_argument(self):
        self.assertRaises(
            TypeError, self.soup, "", no_such_argument=True)

    @skipIf(
        not LXML_PRESENT,
        "lxml not present, not testing BeautifulStoneSoup.")
    def test_beautifulstonesoup(self):
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulStoneSoup("")
        self.assertTrue(isinstance(soup, BeautifulSoup))
        # NOTE(review): assertTrue on a non-empty string literal always
        # passes, and ``w`` is never inspected.  Presumably this was meant
        # to check the recorded warning text -- confirm the message and
        # that the warning is reliably recorded here before tightening.
        self.assertTrue("BeautifulStoneSoup class is deprecated")


class TestSelectiveParsing(SoupTest):

    def test_parse_with_soupstrainer(self):
        markup = "NoYes NoYes Yes"
        strainer = SoupStrainer("b")
        soup = self.soup(markup, parse_only=strainer)
        self.assertEqual(soup.encode(), b"YesYes Yes")


class TestEntitySubstitution(unittest.TestCase):
    """Standalone tests of the EntitySubstitution class."""

    # NOTE(review): in several tests below the expected value is identical
    # to the input (or contains characters the input lacks) although the
    # test name describes escaping.  Confirm these literals against the
    # upstream file; they look as if entities or tags were decoded or
    # stripped somewhere in transit.

    def setUp(self):
        # The class itself is stored; the tests call its methods through it.
        self.sub = EntitySubstitution

    def test_simple_html_substitution(self):
        # Unicode characters corresponding to named HTML entities
        # are substituted, and no others.
        s = u"foo\u2200\N{SNOWMAN}\u00f5bar"
        self.assertEqual(self.sub.substitute_html(s),
                         u"foo∀\N{SNOWMAN}õbar")

    def test_smart_quote_substitution(self):
        # MS smart quotes are a common source of frustration, so we
        # give them a special test.
        quotes = b"\x91\x92foo\x93\x94"
        dammit = UnicodeDammit(quotes)
        self.assertEqual(self.sub.substitute_html(dammit.markup),
                         "‘’foo“”")

    def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
        s = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(s, False), s)

    def test_xml_attribute_quoting_normally_uses_double_quotes(self):
        self.assertEqual(self.sub.substitute_xml("Welcome", True),
                         '"Welcome"')
        self.assertEqual(self.sub.substitute_xml("Bob's Bar", True),
                         '"Bob\'s Bar"')

    def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
        s = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(s, True),
                         "'Welcome to \"my bar\"'")

    def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
        s = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(
            self.sub.substitute_xml(s, True),
            '"Welcome to "Bob\'s Bar""')

    def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
        quoted = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(self.sub.substitute_xml(quoted), quoted)

    def test_xml_quoting_handles_angle_brackets(self):
        self.assertEqual(
            self.sub.substitute_xml("foo"),
            "foo<bar>")

    def test_xml_quoting_handles_ampersands(self):
        self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&T")

    def test_xml_quoting_ignores_ampersands_when_they_are_part_of_an_entity(self):
        self.assertEqual(
            self.sub.substitute_xml("ÁT&T"),
            "ÁT&T")

    def test_quotes_not_html_substituted(self):
        """There's no need to do this except inside attribute values."""
        text = 'Bob\'s "bar"'
        self.assertEqual(self.sub.substitute_html(text), text)

# NOTE(review): several fixtures and expected values below look as if
# markup (tags, entity references) was stripped from them before this
# copy of the file was produced -- e.g. tests read ``soup.foo`` but the
# fixture contains no tag. The literals are kept exactly as found;
# confirm them against a pristine copy of the test suite.

class TestEncodingConversion(SoupTest):
    # Test Beautiful Soup's ability to decode and encode from various
    # encodings.

    def setUp(self):
        super(TestEncodingConversion, self).setUp()
        self.unicode_data = u"Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!"
        self.utf8_data = self.unicode_data.encode("utf-8")
        # Just so you know what it looks like.
        self.assertEqual(
            self.utf8_data,
            b"Sacr\xc3\xa9 bleu!")

    def test_ascii_in_unicode_out(self):
        # ASCII input is converted to Unicode. The original_encoding
        # attribute is set.
        ascii = b"a"
        soup_from_ascii = self.soup(ascii)
        unicode_output = soup_from_ascii.decode()
        self.assertTrue(isinstance(unicode_output, unicode))
        self.assertEqual(unicode_output, self.document_for(ascii.decode()))
        self.assertEqual(soup_from_ascii.original_encoding, "ascii")

    def test_unicode_in_unicode_out(self):
        # Unicode input is left alone. The original_encoding attribute
        # is not set.
        soup_from_unicode = self.soup(self.unicode_data)
        self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
        self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!')
        self.assertEqual(soup_from_unicode.original_encoding, None)

    def test_utf8_in_unicode_out(self):
        # UTF-8 input is converted to Unicode. The original_encoding
        # attribute is set.
        soup_from_utf8 = self.soup(self.utf8_data)
        self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
        self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!')

    def test_utf8_out(self):
        # The internal data structures can be encoded as UTF-8.
        soup_from_unicode = self.soup(self.unicode_data)
        self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data)


class TestUnicodeDammit(unittest.TestCase):
    """Standalone tests of Unicode, Dammit."""

    def test_smart_quotes_to_unicode(self):
        markup = b"\x91\x92\x93\x94"
        dammit = UnicodeDammit(markup)
        self.assertEqual(
            dammit.unicode_markup, u"\u2018\u2019\u201c\u201d")

    def test_smart_quotes_to_xml_entities(self):
        markup = b"\x91\x92\x93\x94"
        dammit = UnicodeDammit(markup, smart_quotes_to="xml")
        # NOTE(review): expected value presumably held entity
        # references originally -- confirm.
        self.assertEqual(
            dammit.unicode_markup, "‘’“”")

    def test_smart_quotes_to_html_entities(self):
        markup = b"\x91\x92\x93\x94"
        dammit = UnicodeDammit(markup, smart_quotes_to="html")
        # NOTE(review): expected value presumably held entity
        # references originally -- confirm.
        self.assertEqual(
            dammit.unicode_markup, "‘’“”")

    def test_smart_quotes_to_ascii(self):
        markup = b"\x91\x92\x93\x94"
        dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
        # NOTE(review): as written, the triple-quoted literal closes
        # right after the two apostrophes and is followed by an empty
        # string literal; this looks like stripped markup -- confirm.
        self.assertEqual(
            dammit.unicode_markup, """''""""")

    def test_detect_utf8(self):
        utf8 = b"\xc3\xa9"
        dammit = UnicodeDammit(utf8)
        self.assertEqual(dammit.unicode_markup, u'\xe9')
        self.assertEqual(dammit.original_encoding, 'utf-8')

    def test_convert_hebrew(self):
        hebrew = b"\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding, 'iso-8859-8')
        self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9')

    def test_dont_see_smart_quotes_where_there_are_none(self):
        utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
        dammit = UnicodeDammit(utf_8)
        self.assertEqual(dammit.original_encoding, 'utf-8')
        self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)

    def test_ignore_inappropriate_codecs(self):
        utf8_data = u"Räksmörgås".encode("utf-8")
        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding, 'utf-8')

    def test_ignore_invalid_codecs(self):
        utf8_data = u"Räksmörgås".encode("utf-8")
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            dammit = UnicodeDammit(utf8_data, [bad_encoding])
            self.assertEqual(dammit.original_encoding, 'utf-8')

    def test_detect_html5_style_meta_tag(self):
        # NOTE(review): all four fixtures are empty here although the
        # test expects "euc-jp" to be detected; presumably each was a
        # document carrying a charset declaration -- confirm.
        for data in (
            b'',
            b"",
            b"",
            b""):
            dammit = UnicodeDammit(data, is_html=True)
            self.assertEqual(
                "euc-jp", dammit.original_encoding)

    def test_last_ditch_entity_replacement(self):
        # This is a UTF-8 document that contains bytestrings
        # completely incompatible with UTF-8 (ie. encoded with some other
        # encoding).
        #
        # Since there is no consistent encoding for the document,
        # Unicode, Dammit will eventually encode the document as UTF-8
        # and encode the incompatible characters as REPLACEMENT
        # CHARACTER.
        #
        # If chardet is installed, it will detect that the document
        # can be converted into ISO-8859-1 without errors. This happens
        # to be the wrong encoding, but it is a consistent encoding, so the
        # code we're testing here won't run.
        #
        # So we temporarily disable chardet if it's present.
        doc = b"""\357\273\277 \330\250\330\252\330\261 \310\322\321\220\312\321\355\344"""
        chardet = bs4.dammit.chardet
        try:
            bs4.dammit.chardet = None
            with warnings.catch_warnings(record=True) as w:
                dammit = UnicodeDammit(doc)
                self.assertEqual(True, dammit.contains_replacement_characters)
                self.assertTrue(u"\ufffd" in dammit.unicode_markup)

                soup = BeautifulSoup(doc, "html.parser")
                self.assertTrue(soup.contains_replacement_characters)

                msg = w[0].message
                self.assertTrue(isinstance(msg, UnicodeWarning))
                self.assertTrue("Some characters could not be decoded" in str(msg))
        finally:
            # Always restore chardet, even if an assertion failed.
            bs4.dammit.chardet = chardet

    def test_sniffed_xml_encoding(self):
        # A document written in UTF-16LE will be converted by a different
        # code path that sniffs the byte order markers.
        data = b'\xff\xfe\x00\xe1\x00\xe9\x00\x00'
        dammit = UnicodeDammit(data)
        self.assertEqual(u"áé", dammit.unicode_markup)
        self.assertEqual("utf-16le", dammit.original_encoding)

    def test_detwingle(self):
        # Here's a UTF8 document.
        utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8")

        # Here's a Windows-1252 document.
        windows_1252 = (
            u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
            u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")

        # Through some unholy alchemy, they've been stuck together.
        doc = utf8 + windows_1252 + utf8

        # The document can't be turned into UTF-8:
        self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")

        # Unicode, Dammit thinks the whole document is Windows-1252,
        # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"

        # But if we run it through fix_embedded_windows_1252, it's fixed:
        fixed = UnicodeDammit.detwingle(doc)
        self.assertEqual(
            u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))

    def test_detwingle_ignores_multibyte_characters(self):
        # Each of these characters has a UTF-8 representation ending
        # in \x93. \x93 is a smart quote if interpreted as
        # Windows-1252. But our code knows to skip over multibyte
        # UTF-8 characters, so they'll survive the process unscathed.
        for tricky_unicode_char in (
            u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
            u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
            u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
            ):
            # Named ``encoded`` rather than ``input`` so the builtin
            # is not shadowed.
            encoded = tricky_unicode_char.encode("utf8")
            self.assertTrue(encoded.endswith(b'\x93'))
            output = UnicodeDammit.detwingle(encoded)
            self.assertEqual(output, encoded)


class TestNamedspacedAttribute(SoupTest):

    def test_name_may_be_none(self):
        a = NamespacedAttribute("xmlns", None)
        self.assertEqual(a, "xmlns")

    def test_attribute_is_equivalent_to_colon_separated_string(self):
        a = NamespacedAttribute("a", "b")
        self.assertEqual("a:b", a)

    def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
        a = NamespacedAttribute("a", "b", "c")
        b = NamespacedAttribute("a", "b", "c")
        self.assertEqual(a, b)

        # The actual namespace is not considered.
        c = NamespacedAttribute("a", "b", None)
        self.assertEqual(a, c)

        # But name and prefix are important.
        d = NamespacedAttribute("a", "z", "c")
        self.assertNotEqual(a, d)

        e = NamespacedAttribute("z", "b", "c")
        self.assertNotEqual(a, e)


class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):

    # This test used to share its name with the ContentMetaAttributeValue
    # test below, which replaced it in the class namespace so it never
    # ran. It now has its own name.
    def test_charset_meta_attribute_value(self):
        value = CharsetMetaAttributeValue("euc-jp")
        self.assertEqual("euc-jp", value)
        self.assertEqual("euc-jp", value.original_value)
        self.assertEqual("utf8", value.encode("utf8"))

    def test_content_meta_attribute_value(self):
        value = ContentMetaAttributeValue("text/html; charset=euc-jp")
        self.assertEqual("text/html; charset=euc-jp", value)
        self.assertEqual("text/html; charset=euc-jp", value.original_value)
        self.assertEqual("text/html; charset=utf8", value.encode("utf8"))

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/bs4/tests/test_tree.py

# -*- coding: utf-8 -*- """Tests for Beautiful Soup's tree traversal methods. The tree traversal methods are the main advantage of using Beautiful Soup over just using a parser. Different parsers will build different Beautiful Soup trees given the same markup, but all Beautiful Soup trees can be traversed with the methods tested here. """ import copy import pickle import re import warnings from bs4 import BeautifulSoup from bs4.builder import ( builder_registry, HTMLParserTreeBuilder, ) from bs4.element import ( CData, Doctype, NavigableString, SoupStrainer, Tag, ) from bs4.testing import ( SoupTest, skipIf, ) XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None) LXML_PRESENT = (builder_registry.lookup("lxml") is not None) class TreeTest(SoupTest): def assertSelects(self, tags, should_match): """Make sure that the given tags have the correct text. This is used in tests that define a bunch of tags, each containing a single string, and then select certain strings by some mechanism. """ self.assertEqual([tag.string for tag in tags], should_match) def assertSelectsIDs(self, tags, should_match): """Make sure that the given tags have the correct IDs. This is used in tests that define a bunch of tags, each containing a single string, and then select certain strings by some mechanism. """ self.assertEqual([tag['id'] for tag in tags], should_match) class TestFind(TreeTest): """Basic tests of the find() method. find() just calls find_all() with limit=1, so it's not tested all that thouroughly here. """ def test_find_tag(self): soup = self.soup("1234") self.assertEqual(soup.find("b").string, "2") def test_unicode_text_find(self): soup = self.soup(u'

Räksmörgås

') self.assertEqual(soup.find(text=u'Räksmörgås'), u'Räksmörgås') class TestFindAll(TreeTest): """Basic tests of the find_all() method.""" def test_find_all_text_nodes(self): """You can search the tree for text nodes.""" soup = self.soup("Foobar\xbb") # Exact match. self.assertEqual(soup.find_all(text="bar"), [u"bar"]) # Match any of a number of strings. self.assertEqual( soup.find_all(text=["Foo", "bar"]), [u"Foo", u"bar"]) # Match a regular expression. self.assertEqual(soup.find_all(text=re.compile('.*')), [u"Foo", u"bar", u'\xbb']) # Match anything. self.assertEqual(soup.find_all(text=True), [u"Foo", u"bar", u'\xbb']) def test_find_all_limit(self): """You can limit the number of items returned by find_all.""" soup = self.soup("1 2 3 4 5") self.assertSelects(soup.find_all('a', limit=3), ["1", "2", "3"]) self.assertSelects(soup.find_all('a', limit=1), ["1"]) self.assertSelects( soup.find_all('a', limit=10), ["1", "2", "3", "4", "5"]) # A limit of 0 means no limit. self.assertSelects( soup.find_all('a', limit=0), ["1", "2", "3", "4", "5"]) def test_calling_a_tag_is_calling_findall(self): soup = self.soup("123") self.assertSelects(soup('a', limit=1), ["1"]) self.assertSelects(soup.b(id="foo"), ["3"]) def test_find_all_with_self_referential_data_structure_does_not_cause_infinite_recursion(self): soup = self.soup("") # Create a self-referential list. l = [] l.append(l) # Without special code in _normalize_search_value, this would cause infinite # recursion. self.assertEqual([], soup.find_all(l)) class TestFindAllBasicNamespaces(TreeTest): def test_find_by_namespaced_name(self): soup = self.soup('4') self.assertEqual("4", soup.find("mathml:msqrt").string) self.assertEqual("a", soup.find(attrs= { "svg:fill" : "red" }).name) class TestFindAllByName(TreeTest): """Test ways of finding tags by tag name.""" def setUp(self): super(TreeTest, self).setUp() self.tree = self.soup(""" First tag. Second tag. Third Nested tag. 
tag.""") def test_find_all_by_tag_name(self): # Find all the tags. self.assertSelects( self.tree.find_all('a'), ['First tag.', 'Nested tag.']) def test_find_all_by_name_and_text(self): self.assertSelects( self.tree.find_all('a', text='First tag.'), ['First tag.']) self.assertSelects( self.tree.find_all('a', text=True), ['First tag.', 'Nested tag.']) self.assertSelects( self.tree.find_all('a', text=re.compile("tag")), ['First tag.', 'Nested tag.']) def test_find_all_on_non_root_element(self): # You can call find_all on any node, not just the root. self.assertSelects(self.tree.c.find_all('a'), ['Nested tag.']) def test_calling_element_invokes_find_all(self): self.assertSelects(self.tree('a'), ['First tag.', 'Nested tag.']) def test_find_all_by_tag_strainer(self): self.assertSelects( self.tree.find_all(SoupStrainer('a')), ['First tag.', 'Nested tag.']) def test_find_all_by_tag_names(self): self.assertSelects( self.tree.find_all(['a', 'b']), ['First tag.', 'Second tag.', 'Nested tag.']) def test_find_all_by_tag_dict(self): self.assertSelects( self.tree.find_all({'a' : True, 'b' : True}), ['First tag.', 'Second tag.', 'Nested tag.']) def test_find_all_by_tag_re(self): self.assertSelects( self.tree.find_all(re.compile('^[ab]$')), ['First tag.', 'Second tag.', 'Nested tag.']) def test_find_all_with_tags_matching_method(self): # You can define an oracle method that determines whether # a tag matches the search. def id_matches_name(tag): return tag.name == tag.get('id') tree = self.soup(""" Match 1. Does not match. Match 2.""") self.assertSelects( tree.find_all(id_matches_name), ["Match 1.", "Match 2."]) class TestFindAllByAttribute(TreeTest): def test_find_all_by_attribute_name(self): # You can pass in keyword arguments to find_all to search by # attribute. tree = self.soup(""" Matching a. Non-matching Matching b.a. 
""") self.assertSelects(tree.find_all(id='first'), ["Matching a.", "Matching b."]) def test_find_all_by_utf8_attribute_value(self): peace = u"םולש".encode("utf8") data = u''.encode("utf8") soup = self.soup(data) self.assertEqual([soup.a], soup.find_all(title=peace)) self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8"))) self.assertEqual([soup.a], soup.find_all(title=[peace, "something else"])) def test_find_all_by_attribute_dict(self): # You can pass in a dictionary as the argument 'attrs'. This # lets you search for attributes like 'name' (a fixed argument # to find_all) and 'class' (a reserved word in Python.) tree = self.soup(""" Name match. Class match. Non-match. A tag called 'name1'. """) # This doesn't do what you want. self.assertSelects(tree.find_all(name='name1'), ["A tag called 'name1'."]) # This does what you want. self.assertSelects(tree.find_all(attrs={'name' : 'name1'}), ["Name match."]) # Passing class='class2' would cause a syntax error. self.assertSelects(tree.find_all(attrs={'class' : 'class2'}), ["Class match."]) def test_find_all_by_class(self): # Passing in a string to 'attrs' will search the CSS class. tree = self.soup(""" Class 1. Class 2. Class 1. Class 3 and 4. """) self.assertSelects(tree.find_all('a', '1'), ['Class 1.']) self.assertSelects(tree.find_all(attrs='1'), ['Class 1.', 'Class 1.']) self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.']) self.assertSelects(tree.find_all('c', '4'), ['Class 3 and 4.']) def test_find_by_class_when_multiple_classes_present(self): tree = self.soup("Found it") attrs = { 'class' : re.compile("o") } f = tree.find_all("gar", attrs=attrs) self.assertSelects(f, ["Found it"]) f = tree.find_all("gar", re.compile("a")) self.assertSelects(f, ["Found it"]) # Since the class is not the string "foo bar", but the two # strings "foo" and "bar", this will not find anything. 
attrs = { 'class' : re.compile("o b") } f = tree.find_all("gar", attrs=attrs) self.assertSelects(f, []) def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self): soup = self.soup("Found it") self.assertSelects(soup.find_all("a", re.compile("ba")), ["Found it"]) def big_attribute_value(value): return len(value) > 3 self.assertSelects(soup.find_all("a", big_attribute_value), []) def small_attribute_value(value): return len(value) ') a, a2 = soup.find_all("a") self.assertEqual([a, a2], soup.find_all("a", "foo")) self.assertEqual([a], soup.find_all("a", "bar")) # If you specify the attribute as a string that contains a # space, only that specific value will be found. self.assertEqual([a], soup.find_all("a", "foo bar")) self.assertEqual([], soup.find_all("a", "bar foo")) def test_find_all_by_attribute_soupstrainer(self): tree = self.soup(""" Match. Non-match.""") strainer = SoupStrainer(attrs={'id' : 'first'}) self.assertSelects(tree.find_all(strainer), ['Match.']) def test_find_all_with_missing_atribute(self): # You can pass in None as the value of an attribute to find_all. # This will match tags that do not have that attribute set. tree = self.soup("""ID present. No ID present. ID is empty.""") self.assertSelects(tree.find_all('a', id=None), ["No ID present."]) def test_find_all_with_defined_attribute(self): # You can pass in None as the value of an attribute to find_all. # This will match tags that have that attribute set to any value. tree = self.soup("""ID present. No ID present. ID is empty.""") self.assertSelects( tree.find_all(id=True), ["ID present.", "ID is empty."]) def test_find_all_with_numeric_attribute(self): # If you search for a number, it's treated as a string. tree = self.soup("""Unquoted attribute. 
Quoted attribute.""") expected = ["Unquoted attribute.", "Quoted attribute."] self.assertSelects(tree.find_all(id=1), expected) self.assertSelects(tree.find_all(id="1"), expected) def test_find_all_with_list_attribute_values(self): # You can pass a list of attribute values instead of just one, # and you'll get tags that match any of the values. tree = self.soup("""1 2 3 No ID.""") self.assertSelects(tree.find_all(id=["1", "3", "4"]), ["1", "3"]) def test_find_all_with_regular_expression_attribute_value(self): # You can pass a regular expression as an attribute value, and # you'll get tags whose values for that attribute match the # regular expression. tree = self.soup("""One a. Two as. Mixed as and bs. One b. No ID.""") self.assertSelects(tree.find_all(id=re.compile("^a+$")), ["One a.", "Two as."]) def test_find_by_name_and_containing_string(self): soup = self.soup("foobarfoo") a = soup.a self.assertEqual([a], soup.find_all("a", text="foo")) self.assertEqual([], soup.find_all("a", text="bar")) self.assertEqual([], soup.find_all("a", text="bar")) def test_find_by_name_and_containing_string_when_string_is_buried(self): soup = self.soup("foo foo") self.assertEqual(soup.find_all("a"), soup.find_all("a", text="foo")) def test_find_by_attribute_and_containing_string(self): soup = self.soup('foofoo') a = soup.a self.assertEqual([a], soup.find_all(id=2, text="foo")) self.assertEqual([], soup.find_all(id=1, text="bar")) class TestIndex(TreeTest): """Test Tag.index""" def test_index(self): tree = self.soup(""" Identical Not identical Identical Identical with child Also not identical Identical with child """) div = tree.div for i, element in enumerate(div.contents): self.assertEqual(i, div.index(element)) self.assertRaises(ValueError, tree.index, 1) class TestParentOperations(TreeTest): """Test navigation and searching through an element's parents.""" def setUp(self): super(TestParentOperations, self).setUp() self.tree = self.soup('''

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/bs4/tests/__init__.py

"The beautifulsoup tests."

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/bs4/__init__.py

"""Beautiful Soup
Elixir and Tonic
"The Screen-Scraper's Friend"
http://www.crummy.com/software/BeautifulSoup/

Beautiful Soup uses a pluggable XML or HTML parser to parse a
(possibly invalid) document into a tree representation. Beautiful Soup
provides methods and Pythonic idioms that make it easy to navigate,
search, and modify the parse tree.

Beautiful Soup works with Python 2.6 and up. It works better if lxml
and/or html5lib is installed.

For more than you ever wanted to know about Beautiful Soup, see the
documentation:
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""

# NOTE(review): the address inside __author__ reads like an e-mail
# obfuscation placeholder; confirm against a pristine copy.
__author__ = "Leonard Richardson ([email protected])"
__version__ = "4.1.0"
__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
__license__ = "MIT"

__all__ = ['BeautifulSoup']

import re
import warnings

from .builder import builder_registry
from .dammit import UnicodeDammit
from .element import (
    CData,
    Comment,
    DEFAULT_OUTPUT_ENCODING,
    Declaration,
    Doctype,
    NavigableString,
    PageElement,
    ProcessingInstruction,
    ResultSet,
    SoupStrainer,
    Tag,
    )

# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'


class BeautifulSoup(Tag):
    """
    This class defines the basic interface called by the tree builders.

    These methods will be called by the parser:
      reset()
      feed(markup)

    The tree builder may call these methods from its feed() implementation:
      handle_starttag(name, attrs) # See note about return value
      handle_endtag(name)
      handle_data(data) # Appends to the current data node
      endData(containerClass=NavigableString) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """
    ROOT_TAG_NAME = u'[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    # Used when determining whether a text node is all whitespace and
    # can be replaced with a single space. A text node that contains
    # fancy Unicode spaces (usually non-breaking) should be left
    # alone.
    STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, }

    # Constructor keyword arguments that are accepted but ignored,
    # each paired with the warning issued when it is passed in.
    # Order matters only for the order of the warnings.
    _IGNORED_ARGUMENTS = (
        ('convertEntities',
         "BS4 does not respect the convertEntities argument to the "
         "BeautifulSoup constructor. Entities are always converted "
         "to Unicode characters."),
        ('markupMassage',
         "BS4 does not respect the markupMassage argument to the "
         "BeautifulSoup constructor. The tree builder is responsible "
         "for any necessary markup massage."),
        ('smartQuotesTo',
         "BS4 does not respect the smartQuotesTo argument to the "
         "BeautifulSoup constructor. Smart quotes are always converted "
         "to Unicode characters."),
        ('selfClosingTags',
         "BS4 does not respect the selfClosingTags argument to the "
         "BeautifulSoup constructor. The tree builder is responsible "
         "for understanding self-closing tags."),
        ('isHTML',
         "BS4 does not respect the isHTML argument to the "
         "BeautifulSoup constructor. You can pass in features='html' "
         "or features='xml' to get a builder capable of handling "
         "one or the other."),
        )

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, **kwargs):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        :param markup: A string, or an object with a read() method.
        :param features: A feature name or list of feature names used
            to look up a tree builder when `builder` is not given.
        :param builder: A tree builder instance to use as-is.
        :param parse_only: Stored on the soup and consulted by
            endData() and handle_starttag() to skip top-level content.
        :param from_encoding: Passed to the builder's prepare_markup().
        :param kwargs: Only the ignored and renamed arguments handled
            below are tolerated.
        :raises TypeError: If an unknown keyword argument is given.
        :raises ValueError: If no tree builder has the requested
            features.
        """
        # Every ignored argument is removed from kwargs once it has
        # been warned about. Previously 'convertEntities' was left in
        # place, so passing it produced the warning and then the
        # "unexpected keyword argument" TypeError below.
        for old_name, message in self._IGNORED_ARGUMENTS:
            if old_name in kwargs:
                del kwargs[old_name]
                warnings.warn(message)

        def deprecated_argument(old_name, new_name):
            # Warn about a renamed argument and hand back its value.
            if old_name not in kwargs:
                return None
            warnings.warn(
                'The "%s" argument to the BeautifulSoup constructor '
                'has been renamed to "%s."' % (old_name, new_name))
            return kwargs.pop(old_name)

        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        if kwargs:
            # list() keeps this working both before and after 2to3.
            arg = list(kwargs.keys()).pop()
            raise TypeError(
                "__init__() got an unexpected keyword argument '%s'" % arg)

        if builder is None:
            if isinstance(features, basestring):
                features = [features]
            if features is None or len(features) == 0:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise ValueError(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))
            builder = builder_class()
        self.builder = builder
        self.is_xml = builder.is_xml
        self.builder.soup = self

        self.parse_only = parse_only

        self.reset()

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        (self.markup, self.original_encoding, self.declared_html_encoding,
         self.contains_replacement_characters) = (
            self.builder.prepare_markup(markup, from_encoding))

        try:
            self._feed()
        except StopParsing:
            pass

        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
        self.builder.soup = None

    def _feed(self):
        """Run the builder over self.markup and close any open tags."""
        # Convert the document to Unicode.
        self.builder.reset()

        self.builder.feed(self.markup)

        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def reset(self):
        """Reinitialize this object as an empty root tag."""
        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.pushTag(self)

    def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
        """Create a new tag associated with this soup."""
        return Tag(None, self.builder, name, namespace, nsprefix, attrs)

    def new_string(self, s):
        """Create a new NavigableString associated with this soup."""
        navigable = NavigableString(s)
        navigable.setup()
        return navigable

    def insert_before(self, successor):
        """Always raises ValueError."""
        raise ValueError("BeautifulSoup objects don't support insert_before().")

    def insert_after(self, successor):
        """Always raises ValueError."""
        raise ValueError("BeautifulSoup objects don't support insert_after().")

    def popTag(self):
        """Pop the top of the tag stack.

        Returns the tag that is current *after* the pop, not the tag
        that was removed.
        """
        tag = self.tagStack.pop()
        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Append `tag` to the current tag and make it the current tag."""
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]

    def endData(self, containerClass=NavigableString):
        """Turn the accumulated data into a `containerClass` node.

        Does nothing if no data has been accumulated.
        """
        if not self.currentData:
            return

        currentData = u''.join(self.currentData)
        # A string made only of ASCII whitespace is collapsed to a
        # single newline or space, unless an open tag preserves
        # whitespace.
        if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
            not set(tag.name for tag in self.tagStack).intersection(
                self.builder.preserve_whitespace_tags)):
            if '\n' in currentData:
                currentData = '\n'
            else:
                currentData = ' '
        self.currentData = []

        # At the top level, drop the string if parse_only doesn't
        # want it.
        if self.parse_only and len(self.tagStack) <= 1 and \
               (not self.parse_only.text or \
                not self.parse_only.search(currentData)):
            return

        o = containerClass(currentData)
        self.object_was_parsed(o)

    def object_was_parsed(self, o):
        """Add an object to the parse tree."""
        o.setup(self.currentTag, self.previous_element)
        if self.previous_element:
            self.previous_element.next_element = o
        self.previous_element = o
        self.currentTag.contents.append(o)

    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag."""
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            return

        numPops = 0
        mostRecentTag = None

        # Search from the top of the stack down; index 0 (the root)
        # is never a candidate.
        for i in range(len(self.tagStack) - 1, 0, -1):
            candidate = self.tagStack[i]
            if name == candidate.name and nsprefix == candidate.nsprefix:
                numPops = len(self.tagStack) - i
                break
        if not inclusivePop:
            numPops = numPops - 1

        for i in range(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag

    def handle_starttag(self, name, namespace, nsprefix, attrs):
        """Push a start tag on to the stack.

        If this method returns None, the tag was rejected by the
        SoupStrainer. You should proceed as if the tag had not occurred
        in the document. For instance, if this was a self-closing tag,
        don't call handle_endtag.
        """

        # print "Start tag %s: %s" % (name, attrs)
        self.endData()

        if (self.parse_only and len(self.tagStack) <= 1
            and (self.parse_only.text
                 or not self.parse_only.search_tag(name, attrs))):
            return None

        tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
                  self.currentTag, self.previous_element)
        if tag is None:
            return tag
        if self.previous_element:
            self.previous_element.next_element = tag
        self.previous_element = tag
        self.pushTag(tag)
        return tag

    def handle_endtag(self, name, nsprefix=None):
        """Flush pending data and pop the stack through the named tag."""
        #print "End tag: " + name
        self.endData()
        self._popToTag(name, nsprefix)

    def handle_data(self, data):
        """Append `data` to the current data node."""
        self.currentData.append(data)

    def decode(self, pretty_print=False,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Returns a string or Unicode representation of this document.
        To get Unicode, pass None for encoding."""

        if self.is_xml:
            # Print the XML declaration
            encoding_part = ''
            if eventual_encoding is not None:
                encoding_part = ' encoding="%s"' % eventual_encoding
            prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
        else:
            prefix = u''
        if not pretty_print:
            indent_level = None
        else:
            indent_level = 0
        return prefix + super(BeautifulSoup, self).decode(
            indent_level, eventual_encoding, formatter)


class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser."""

    def __init__(self, *args, **kwargs):
        kwargs['features'] = 'xml'
        warnings.warn(
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.')
        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)


class StopParsing(Exception):
    pass


#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin)
    # Parenthesized so the line is valid both as a Python 2 print
    # statement and as a Python 3 print() call.
    print(soup.prettify())

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/build/lib/bs4/builder/_html5lib.py

__all__ = [
    'HTML5TreeBuilder',
    ]

import warnings
from bs4.builder import (
    PERMISSIVE,
    HTML,
    HTML_5,
    HTMLTreeBuilder,
    )
from bs4.element import NamespacedAttribute
import html5lib
from html5lib.constants import namespaces
from bs4.element import (
    Comment,
    Doctype,
    NavigableString,
    Tag,
    )


class HTML5TreeBuilder(HTMLTreeBuilder):
    """Use html5lib to build a tree."""

    features = ['html5lib', PERMISSIVE, HTML_5, HTML]

    def prepare_markup(self, markup, user_specified_encoding):
        """Remember the requested encoding and return the markup unchanged."""
        # Store the user-specified encoding for use later on.
        self.user_specified_encoding = user_specified_encoding
        return markup, None, None, False

    # These methods are defined by Beautiful Soup.
    def feed(self, markup):
        """Parse `markup` with html5lib and record the encoding it used."""
        if self.soup.parse_only is not None:
            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
        doc = parser.parse(markup, encoding=self.user_specified_encoding)

        # Set the character encoding detected by the tokenizer.
        if isinstance(markup, str):
            # We need to special-case this because html5lib sets
            # charEncoding to UTF-8 if it gets Unicode input.
            doc.original_encoding = None
        else:
            doc.original_encoding = parser.tokenizer.stream.charEncoding[0]

    def create_treebuilder(self, namespaceHTMLElements):
        """Create, remember and return the html5lib-facing tree builder."""
        self.underlying_builder = TreeBuilderForHtml5lib(
            self.soup, namespaceHTMLElements)
        return self.underlying_builder

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        # NOTE(review): the format string is a bare '%s' here; markup
        # that wraps the fragment in a full document was presumably
        # lost from this copy -- confirm.
        return '%s' % fragment


class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
    """html5lib tree builder that builds into a BeautifulSoup object."""

    def __init__(self, soup, namespaceHTMLElements):
        self.soup = soup
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

    def documentClass(self):
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token):
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name, namespace):
        tag = self.soup.new_tag(name, namespace)
        return Element(tag, self.soup, namespace)

    def commentClass(self, data):
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        # BeautifulSoup is not imported at module level, so this
        # method used to fail with NameError. Import it here, at call
        # time, rather than at the top of the module: bs4 itself
        # imports bs4.builder (see the imports above).
        from bs4 import BeautifulSoup
        self.soup = BeautifulSoup("")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)

    def appendChild(self, node):
        # XXX This code is not covered by the BS4 tests.
        self.soup.append(node.element)

    def getDocument(self):
        return self.soup

    def getFragment(self):
        return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element


class AttrList(object):
    """Mapping-like view of a tag's attributes.

    Reads go to a dict copied from the element's attributes at
    construction time; writes go to the element itself.
    """

    def __init__(self, element):
        self.element = element
        self.attrs = dict(self.element.attrs)

    def __iter__(self):
        # Iterates over (name, value) pairs, not just names.
        return iter(list(self.attrs.items()))

    def __setitem__(self, name, value):
        # A leftover no-op debugging expression was removed from here.
        self.element[name] = value

    def items(self):
        return list(self.attrs.items())

    def keys(self):
        return list(self.attrs.keys())

    def __len__(self):
        return len(self.attrs)

    def __getitem__(self, name):
        return self.attrs[name]

    def __contains__(self, name):
        return name in self.attrs


class Element(html5lib.treebuilders._base.Node):
    """html5lib node wrapping a Beautiful Soup element."""

    def __init__(self, element, soup, namespace):
        html5lib.treebuilders._base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace

    def appendChild(self, node):
        # The class comparison is exact on purpose: only plain
        # NavigableString objects are merged, not subclasses.
        if (node.element.__class__ == NavigableString
            and self.element.contents
            and self.element.contents[-1].__class__ == NavigableString):
            # Concatenate new text onto old text node
            # XXX This has O(n^2) performance when many text
            # fragments are appended one after another.
            old_element = self.element.contents[-1]
            new_element = self.soup.new_string(old_element + node.element)
            old_element.replace_with(new_element)
        else:
            self.element.append(node.element)
            node.parent = self

    def getAttributes(self):
        return AttrList(self.element)

    def setAttributes(self, attributes):
        """Copy `attributes` onto the wrapped element.

        Does nothing when `attributes` is None or empty. Tuple keys
        are replaced in `attributes` by NamespacedAttribute objects.
        """
        if attributes is None or len(attributes) == 0:
            return

        for name, value in list(attributes.items()):
            if isinstance(name, tuple):
                new_name = NamespacedAttribute(*name)
                del attributes[name]
                attributes[new_name] = value

        self.soup.builder._replace_cdata_list_attribute_values(
            self.name, attributes)
        for name, value in list(attributes.items()):
            self.element[name] = value

        # The attributes may contain variables that need substitution.
        # Call set_up_substitutions manually.
        #
        # The Tag constructor called this method when the Tag was created,
        # but we just set/changed the attributes, so call it again.
        self.soup.builder.set_up_substitutions(self.element)
    attributes = property(getAttributes, setAttributes)

    def insertText(self, data, insertBefore=None):
        text = TextNode(self.soup.new_string(data), self.soup)
        if insertBefore:
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(text)

    def insertBefore(self, node, refNode):
        index = self.element.index(refNode.element)
        # index > 0: when refNode is the first child there is no
        # previous sibling, and contents[index-1] would wrap around to
        # the *last* child.
        if (node.element.__class__ == NavigableString
            and index > 0
            and self.element.contents[index-1].__class__ == NavigableString):
            # Merge the new text into the text node just before refNode
            # (same approach as appendChild).
            old_node = self.element.contents[index-1]
            new_str = self.soup.new_string(old_node + node.element)
            old_node.replace_with(new_str)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node):
        node.element.extract()

    def reparentChildren(self, newParent):
        """Move every child of this element, in order, to `newParent`."""
        while self.element.contents:
            child = self.element.contents[0]
            child.extract()
            if isinstance(child, Tag):
                newParent.appendChild(
                    Element(child, self.soup, namespaces["html"]))
            else:
                newParent.appendChild(
                    TextNode(child, self.soup))

    def cloneNode(self):
        """Return a new Element with the same name, namespace and attributes."""
        tag = self.soup.new_tag(self.element.name, self.namespace)
        node = Element(tag, self.soup, self.namespace)
        for key, value in self.attributes:
            node.attributes[key] = value
        return node

    def hasContent(self):
        # Returns the contents list itself; callers get its truthiness.
        return self.element.contents

    def getNameTuple(self):
        if self.namespace is None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name

    nameTuple = property(getNameTuple)


class TextNode(Element):
    """html5lib node wrapping a Beautiful Soup string object."""

    def __init__(self, element, soup):
        html5lib.treebuilders._base.Node.__init__(self, None)
        self.element = element
        self.soup = soup

    def cloneNode(self):
        raise NotImplementedError

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/build/lib/bs4/builder/_htmlparser.py

"""Use the HTMLParser library to parse HTML files that aren't too bad."""

__all__ = [
    'HTMLParserTreeBuilder',
    ]

from html.parser import HTMLParser
try:
    from html.parser import HTMLParseError
except ImportError:
    # HTMLParseError was removed from html.parser in Python 3.5. Define
    # a stand-in so the 'except HTMLParseError' clause in feed() below
    # stays valid; it simply never fires on those versions.
    class HTMLParseError(Exception):
        pass
import sys
import warnings

# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
# argument, which we'd like to set to False. Unfortunately,
# http://bugs.python.org/issue13273 makes strict=True a better bet
# before Python 3.2.3.
#
# At the end of this file, we monkeypatch HTMLParser so that
# strict=True works well on Python 3.2.2.
#
# The 'strict' argument was removed again in Python 3.5, so passing it
# there raises TypeError; only pass it on versions that accept it.
major, minor, release = sys.version_info[:3]
CONSTRUCTOR_TAKES_STRICT = (3, 2, 3) <= (major, minor, release) < (3, 5, 0)

from bs4.element import (
    CData,
    Comment,
    Declaration,
    Doctype,
    ProcessingInstruction,
    )
from bs4.dammit import EntitySubstitution, UnicodeDammit

from bs4.builder import (
    HTML,
    HTMLTreeBuilder,
    STRICT,
    )


HTMLPARSER = 'html.parser'


class BeautifulSoupHTMLParser(HTMLParser):
    """HTMLParser subclass that forwards parse events to self.soup.

    The `soup` attribute is assigned by HTMLParserTreeBuilder.feed()
    before any markup is fed in.
    """

    def handle_starttag(self, name, attrs):
        # XXX namespace
        self.soup.handle_starttag(name, None, None, dict(attrs))

    def handle_endtag(self, name):
        self.soup.handle_endtag(name)

    def handle_data(self, data):
        self.soup.handle_data(data)

    def handle_charref(self, name):
        """Turn a numeric character reference into the character itself.

        `name` is the reference without '&#' and ';', e.g. '233' or
        'xe9'. Anything that cannot be converted becomes U+FFFD.
        """
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed.
        try:
            # The hexadecimal marker may be written in either case.
            if name.startswith(('x', 'X')):
                real_name = int(name.lstrip('xX'), 16)
            else:
                real_name = int(name)
            data = chr(real_name)
        except (ValueError, OverflowError):
            data = "\N{REPLACEMENT CHARACTER}"

        self.handle_data(data)

    def handle_entityref(self, name):
        character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
        if character is not None:
            data = character
        else:
            # Unknown entity: pass it through unchanged.
            data = "&%s;" % name
        self.handle_data(data)

    def handle_comment(self, data):
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(Comment)

    def handle_decl(self, data):
        self.soup.endData()
        if data.startswith("DOCTYPE "):
            data = data[len("DOCTYPE "):]
        self.soup.handle_data(data)
        self.soup.endData(Doctype)

    def unknown_decl(self, data):
        if data.upper().startswith('CDATA['):
            cls = CData
            data = data[len('CDATA['):]
        else:
            cls = Declaration
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(cls)

    def handle_pi(self, data):
        self.soup.endData()
        if data.endswith("?") and data.lower().startswith("xml"):
            # "An XHTML processing instruction using the trailing '?'
            # will cause the '?' to be included in data." - HTMLParser
            # docs.
            #
            # Strip the question mark so we don't end up with two
            # question marks.
            data = data[:-1]
        self.soup.handle_data(data)
        self.soup.endData(ProcessingInstruction)


class HTMLParserTreeBuilder(HTMLTreeBuilder):
    """Tree builder backed by the standard library's HTMLParser."""

    is_xml = False
    features = [HTML, STRICT, HTMLPARSER]

    def __init__(self, *args, **kwargs):
        # The arguments are stored and passed to a fresh
        # BeautifulSoupHTMLParser on every feed() call.
        if CONSTRUCTOR_TAKES_STRICT:
            kwargs['strict'] = False
        self.parser_args = (args, kwargs)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """
        :return: A 4-tuple (markup, original encoding, encoding
        declared within markup, whether any characters had to be
        replaced with REPLACEMENT CHARACTER).
        """
        if isinstance(markup, str):
            return markup, None, None, False

        try_encodings = [user_specified_encoding, document_declared_encoding]
        dammit = UnicodeDammit(markup, try_encodings, is_html=True)
        return (dammit.markup, dammit.original_encoding,
                dammit.declared_html_encoding,
                dammit.contains_replacement_characters)

    def feed(self, markup):
        """Parse `markup`, sending events to self.soup.

        :raises HTMLParseError: re-raised, after a RuntimeWarning, when
            HTMLParser cannot handle the document.
        """
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
    import re
    attrfind_tolerant = re.compile(
        r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
        r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
    HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant

    locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
    BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend

    from html.parser import tagfind, attrfind

    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()
        while k < endpos:
            if self.strict:
                m = attrfind.match(rawdata, k)
            else:
                m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    def set_cdata_mode(self, elem):
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    BeautifulSoupHTMLParser.parse_starttag = parse_starttag
    BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode

    # With the patches applied, strict=False is safe on these versions.
    CONSTRUCTOR_TAKES_STRICT = True

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/build/lib/bs4/builder/_lxml.py

"""Tree builders that let Beautiful Soup parse XML and HTML with lxml."""

__all__ = [
    'LXMLTreeBuilderForXML',
    'LXMLTreeBuilder',
    ]

from io import BytesIO, StringIO
import collections
from lxml import etree
from bs4.element import Comment, Doctype, NamespacedAttribute
from bs4.builder import (
    FAST,
    HTML,
    HTMLTreeBuilder,
    PERMISSIVE,
    TreeBuilder,
    XML)
from bs4.dammit import UnicodeDammit

LXML = 'lxml'


class LXMLTreeBuilderForXML(TreeBuilder):
    """Parse XML with lxml, acting as the parser's event target."""

    DEFAULT_PARSER_CLASS = etree.XMLParser

    is_xml = True

    # Well, it's permissive by XML parser standards.
    features = [LXML, XML, FAST, PERMISSIVE]

    # Number of characters (or bytes) handed to the parser per feed() call.
    CHUNK_SIZE = 512

    @property
    def default_parser(self):
        # This can either return a parser object or a class, which
        # will be instantiated with default arguments.
        return etree.XMLParser(target=self, strip_cdata=False, recover=True)

    def __init__(self, parser=None, empty_element_tags=None):
        """
        :param parser: An lxml parser object, or a callable that creates
            one; defaults to `default_parser`.
        :param empty_element_tags: Optional iterable of tag names that
            overrides the class-level `empty_element_tags`.
        """
        if empty_element_tags is not None:
            self.empty_element_tags = set(empty_element_tags)
        if parser is None:
            # Use the default parser.
            parser = self.default_parser
        # collections.Callable no longer exists on Python 3.10+; the
        # callable() builtin performs the same check.
        if callable(parser):
            # Instantiate the parser with default arguments
            parser = parser(target=self, strip_cdata=False)
        self.parser = parser
        self.soup = None
        # Stack of inverted (URI -> prefix) namespace maps, one entry
        # per open tag; None while no namespaces are in play.
        self.nsmaps = None

    def _getNsTag(self, tag):
        # Split the namespace URL out of a fully-qualified lxml tag
        # name. Copied from lxml's src/lxml/sax.py.
        if tag[0] == '{':
            return tuple(tag[1:].split('}', 1))
        else:
            return (None, tag)

    def _prefix_for_namespace(self, namespace):
        """Find the prefix currently bound to a namespace URI, if any."""
        if namespace is None or self.nsmaps is None:
            return None
        # The innermost mapping wins.
        for inverted_nsmap in reversed(self.nsmaps):
            if inverted_nsmap is not None and namespace in inverted_nsmap:
                return inverted_nsmap[namespace]
        return None

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """
        :return: A 4-tuple (markup, original encoding, encoding
        declared within markup, whether any characters had to be
        replaced with REPLACEMENT CHARACTER).
        """
        if isinstance(markup, str):
            return markup, None, None, False

        try_encodings = [user_specified_encoding, document_declared_encoding]
        dammit = UnicodeDammit(markup, try_encodings, is_html=True)
        return (dammit.markup, dammit.original_encoding,
                dammit.declared_html_encoding,
                dammit.contains_replacement_characters)

    def feed(self, markup):
        """Feed `markup` to the parser in CHUNK_SIZE pieces.

        :param markup: A str, a bytes object, or an object with a
            read(size) method.
        """
        if isinstance(markup, bytes):
            markup = BytesIO(markup)
        elif isinstance(markup, str):
            markup = StringIO(markup)
        # Call feed() at least once, even if the markup is empty,
        # or the parser won't be initialized.
        data = markup.read(self.CHUNK_SIZE)
        self.parser.feed(data)
        # Test the length rather than comparing with '': an empty bytes
        # chunk never equals '', which would make this loop endless.
        while len(data) != 0:
            # Now call feed() on the rest of the data, chunk by chunk.
            data = markup.read(self.CHUNK_SIZE)
            if len(data) != 0:
                self.parser.feed(data)
        self.parser.close()

    def close(self):
        self.nsmaps = None

    def start(self, name, attrs, nsmap=None):
        """Handle a start tag reported by lxml.

        :param name: Tag name, possibly in '{namespace-uri}local' form.
        :param attrs: Mapping of attribute names to values.
        :param nsmap: Mapping of prefix -> namespace URI for namespaces
            introduced by this tag; may be omitted or empty.
        """
        if nsmap is None:
            nsmap = {}
        # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
        attrs = dict(attrs)

        # Invert each namespace map as it comes in.
        if len(nsmap) == 0 and self.nsmaps is not None:
            # There are no new namespaces for this tag, but namespaces
            # are in play, so we need a separate tag stack to know
            # when they end.
            self.nsmaps.append(None)
        elif len(nsmap) > 0:
            # A new namespace mapping has come into play.
            if self.nsmaps is None:
                self.nsmaps = []
            inverted_nsmap = dict((value, key) for key, value in nsmap.items())
            self.nsmaps.append(inverted_nsmap)
            # Also treat the namespace mapping as a set of attributes on the
            # tag, so we can recreate it later.
            for prefix, namespace in nsmap.items():
                attribute = NamespacedAttribute(
                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
                attrs[attribute] = namespace
        namespace, name = self._getNsTag(name)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_starttag(name, namespace, nsprefix, attrs)

    def end(self, name):
        """Handle an end tag reported by lxml."""
        self.soup.endData()
        namespace, name = self._getNsTag(name)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_endtag(name, nsprefix)
        if self.nsmaps is not None:
            # This tag, or one of its parents, introduced a namespace
            # mapping, so pop it off the stack.
            self.nsmaps.pop()
            if len(self.nsmaps) == 0:
                # Namespaces are no longer in play, so don't bother keeping
                # track of the namespace stack.
                self.nsmaps = None

    def pi(self, target, data):
        # Processing instructions are ignored.
        pass

    def data(self, content):
        self.soup.handle_data(content)

    def doctype(self, name, pubid, system):
        self.soup.endData()
        doctype = Doctype.for_name_and_ids(name, pubid, system)
        self.soup.object_was_parsed(doctype)

    def comment(self, content):
        "Handle comments as Comment objects."
        self.soup.endData()
        self.soup.handle_data(content)
        self.soup.endData(Comment)

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        # An XML document is the XML declaration, a newline, then the
        # fragment itself.
        return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment


class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
    """Parse HTML with lxml's HTML parser."""

    features = [LXML, HTML, FAST, PERMISSIVE]
    is_xml = False

    @property
    def default_parser(self):
        # Returns the class; __init__ instantiates it with this builder
        # as the target.
        return etree.HTMLParser

    def feed(self, markup):
        self.parser.feed(markup)
        self.parser.close()

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        # lxml's HTML parser supplies <html> and <body> but, unlike
        # html5lib, no empty <head>.
        return '<html><body>%s</body></html>' % fragment

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/build/lib/bs4/builder/__init__.py

"""Tree builder base classes and the registry that picks between them."""

from collections import defaultdict
import itertools
import sys
from bs4.element import (
    CharsetMetaAttributeValue,
    ContentMetaAttributeValue,
    whitespace_re
    )

__all__ = [
    'HTMLTreeBuilder',
    'SAXTreeBuilder',
    'TreeBuilder',
    'TreeBuilderRegistry',
    ]

# Some useful features for a TreeBuilder to have.
FAST = 'fast'
PERMISSIVE = 'permissive'
STRICT = 'strict'
XML = 'xml'
HTML = 'html'
HTML_5 = 'html5'


class TreeBuilderRegistry(object):
    """Keeps track of tree builder classes and the features they offer."""

    def __init__(self):
        # feature name -> builder classes, most recently registered first
        self.builders_for_feature = defaultdict(list)
        # every registered builder class, most recently registered first
        self.builders = []

    def register(self, treebuilder_class):
        """Register a treebuilder based on its advertised features."""
        for feature in treebuilder_class.features:
            self.builders_for_feature[feature].insert(0, treebuilder_class)
        self.builders.insert(0, treebuilder_class)

    def lookup(self, *features):
        """Find the most recently registered builder with the features.

        A feature that no registered builder advertises is ignored
        rather than ruling every builder out.

        :return: A builder class, or None if nothing matches or no
            builder is registered at all.
        """
        if len(self.builders) == 0:
            # There are no builders at all.
            return None

        if len(features) == 0:
            # They didn't ask for any features. Give them the most
            # recently registered builder.
            return self.builders[0]

        # Go down the list of features in order, and eliminate any builders
        # that don't match every feature.
        candidates = None
        candidate_set = None
        for feature in features:
            we_have_the_feature = self.builders_for_feature.get(feature, [])
            if len(we_have_the_feature) > 0:
                if candidates is None:
                    candidates = we_have_the_feature
                    candidate_set = set(candidates)
                else:
                    # Eliminate any candidates that don't have this feature.
                    candidate_set = candidate_set.intersection(
                        set(we_have_the_feature))

        # The only valid candidates are the ones in candidate_set.
        # Go through the original list of candidates and pick the first one
        # that's in candidate_set.
        if candidate_set is None:
            return None
        for candidate in candidates:
            if candidate in candidate_set:
                return candidate
        return None

# The BeautifulSoup class will take feature lists from developers and use them
# to look up builders in this registry.
builder_registry = TreeBuilderRegistry()


class TreeBuilder(object):
    """Turn a document into a Beautiful Soup object tree."""

    features = []

    is_xml = False
    preserve_whitespace_tags = set()
    empty_element_tags = None  # A tag will be considered an empty-element
                               # tag when and only when it has no contents.

    # A value for these tag/attribute combinations is a space- or
    # comma-separated list of CDATA, rather than a single CDATA.
    cdata_list_attributes = {}

    def __init__(self):
        self.soup = None

    def reset(self):
        pass

    def can_be_empty_element(self, tag_name):
        """Might a tag with this name be an empty-element tag?

        The final markup may or may not actually present this tag as
        self-closing.

        For instance: an HTMLBuilder does not consider a <p> tag to be
        an empty-element tag (it's not in
        HTMLBuilder.empty_element_tags). This means an empty <p> tag
        will be presented as "<p></p>", not "<p />".

        The default implementation has no opinion about which tags are
        empty-element tags, so a tag will be presented as an
        empty-element tag if and only if it has no contents.
        "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
        be left alone.
        """
        if self.empty_element_tags is None:
            return True
        return tag_name in self.empty_element_tags

    def feed(self, markup):
        raise NotImplementedError()

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """Return (markup, None, None, False): no conversion by default."""
        return markup, None, None, False

    def test_fragment_to_document(self, fragment):
        """Wrap an HTML fragment to make it look like a document.

        Different parsers do this differently. For instance, lxml
        introduces an empty <head> tag, and html5lib
        doesn't. Abstracting this away lets us write simple tests
        which run HTML fragments through the parser and compare the
        results against other HTML fragments.

        This method should not be used outside of tests.
        """
        return fragment

    def set_up_substitutions(self, tag):
        return False

    def _replace_cdata_list_attribute_values(self, tag_name, attrs):
        """Replaces class="foo bar" with class=["foo", "bar"]

        Modifies its input in place.

        :param tag_name: Name of the tag the attributes belong to.
        :param attrs: Mapping of attribute names to values.
        :return: The same `attrs` object.
        """
        if self.cdata_list_attributes:
            universal = self.cdata_list_attributes.get('*', [])
            tag_specific = self.cdata_list_attributes.get(
                tag_name.lower(), [])
            for cdata_list_attr in itertools.chain(universal, tag_specific):
                if cdata_list_attr in attrs:
                    # Basically, we have a "class" attribute whose
                    # value is a whitespace-separated list of CSS
                    # classes. Split it into a list.
                    value = attrs[cdata_list_attr]
                    values = whitespace_re.split(value)
                    attrs[cdata_list_attr] = values
        return attrs


class SAXTreeBuilder(TreeBuilder):
    """A Beautiful Soup treebuilder that listens for SAX events."""

    def feed(self, markup):
        raise NotImplementedError()

    def close(self):
        pass

    def startElement(self, name, attrs):
        # Attribute keys are (namespace, name) pairs; keep only the name.
        attrs = {key[1]: value for key, value in attrs.items()}
        # handle_starttag takes (name, namespace, nsprefix, attrs), as
        # the other tree builders call it; namespaces are thrown away
        # here, so pass None for both.
        self.soup.handle_starttag(name, None, None, attrs)

    def endElement(self, name):
        self.soup.handle_endtag(name)

    def startElementNS(self, nsTuple, nodeName, attrs):
        # Throw away (ns, nodeName) for now.
        self.startElement(nodeName, attrs)

    def endElementNS(self, nsTuple, nodeName):
        # Throw away (ns, nodeName) for now.
        self.endElement(nodeName)

    def startPrefixMapping(self, prefix, nodeValue):
        # Ignore the prefix for now.
        pass

    def endPrefixMapping(self, prefix):
        # Ignore the prefix for now.
        pass

    def characters(self, content):
        self.soup.handle_data(content)

    def startDocument(self):
        pass

    def endDocument(self):
        pass


class HTMLTreeBuilder(TreeBuilder):
    """This TreeBuilder knows facts about HTML.

    Such as which tags are empty-element tags.
    """

    preserve_whitespace_tags = set(['pre', 'textarea'])
    empty_element_tags = set(['br', 'hr', 'input', 'img', 'meta',
                              'spacer', 'link', 'frame', 'base'])

    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
    # class="foo bar" means that the 'class' attribute has two values,
    # 'foo' and 'bar', not the single value 'foo bar'.  When we
    # encounter one of these attributes, we will parse its value into
    # a list of values if possible. Upon output, the list will be
    # converted back into a string.
    cdata_list_attributes = {
        "*" : ['class', 'accesskey', 'dropzone'],
        "a" : ['rel', 'rev'],
        "link" : ['rel', 'rev'],
        "td" : ["headers"],
        "th" : ["headers"],
        "form" : ["accept-charset"],
        "object" : ["archive"],

        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
        "area" : ["rel"],
        "icon" : ["sizes"],
        "iframe" : ["sandbox"],
        "output" : ["for"],
        }

    def set_up_substitutions(self, tag):
        """Replace a <meta> tag's encoding declaration with a stand-in.

        :return: True if the tag has a "charset" attribute, else False.
        """
        # We are only interested in <meta> tags
        if tag.name != 'meta':
            return False

        http_equiv = tag.get('http-equiv')
        content = tag.get('content')
        charset = tag.get('charset')

        # We are interested in <meta> tags that say what encoding the
        # document was originally in. This means HTML 5-style <meta>
        # tags that provide the "charset" attribute. It also means
        # HTML 4-style <meta> tags that provide the "content"
        # attribute and have "http-equiv" set to "content-type".
        #
        # In both cases we will replace the value of the appropriate
        # attribute with a standin object that can take on any
        # encoding.
        meta_encoding = None
        if charset is not None:
            # HTML 5 style:
            # <meta charset="utf8">
            meta_encoding = charset
            tag['charset'] = CharsetMetaAttributeValue(charset)

        elif (content is not None and http_equiv is not None
              and http_equiv.lower() == 'content-type'):
            # HTML 4 style:
            # <meta http-equiv="content-type" content="text/html; charset=utf8">
            # NOTE(review): this branch installs a stand-in but leaves
            # meta_encoding unset, so the method still returns False --
            # confirm against callers whether that is intended.
            tag['content'] = ContentMetaAttributeValue(content)

        return (meta_encoding is not None)


def register_treebuilders_from(module):
    """Copy TreeBuilders from the given module into this module."""
    # I'm fairly sure this is not the best way to do this.
    this_module = sys.modules['bs4.builder']
    for name in module.__all__:
        obj = getattr(module, name)

        if issubclass(obj, TreeBuilder):
            setattr(this_module, name, obj)
            this_module.__all__.append(name)
            # Register the builder while we're at it.
            this_module.builder_registry.register(obj)

# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last result.
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:
    from . import _html5lib
    register_treebuilders_from(_html5lib)
except ImportError:
    # They don't have html5lib installed.
    pass
try:
    from . import _lxml
    register_treebuilders_from(_lxml)
except ImportError:
    # They don't have lxml installed.
    pass

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/build/lib/bs4/dammit.py

# -*- coding: utf-8 -*- """Beautiful Soup bonus library: Unicode, Dammit This class forces XML data into a standard format (usually to UTF-8 or Unicode). It is heavily based on code from Mark Pilgrim's Universal Feed Parser. It does not rewrite the XML or HTML to reflect a new encoding; that's the tree builder's job. """ import codecs from html.entities import codepoint2name import re import warnings # Autodetects character encodings. Very useful. # Download from http://chardet.feedparser.org/ # or 'apt-get install python-chardet' # or 'easy_install chardet' try: import chardet #import chardet.constants #chardet.constants._debug = 1 except ImportError: chardet = None # Available from http://cjkpython.i18n.org/. try: import iconv_codec except ImportError: pass xml_encoding_re = re.compile( '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I) html_meta_re = re.compile( '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I) class EntitySubstitution(object): """Substitute XML or HTML entities for the corresponding characters.""" def _populate_class_variables(): lookup = {} reverse_lookup = {} characters_for_re = [] for codepoint, name in list(codepoint2name.items()): character = chr(codepoint) if codepoint != 34: # There's no point in turning the quotation mark into # &quot;, unless it happens within an attribute value, which # is handled elsewhere. characters_for_re.append(character) lookup[character] = name # But we do want to turn &quot; into the quotation mark. 
reverse_lookup[name] = character re_definition = "[%s]" % "".join(characters_for_re) return lookup, reverse_lookup, re.compile(re_definition) (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER, CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables() CHARACTER_TO_XML_ENTITY = { "'": "apos", '"': "quot", "&": "amp", "<": "lt", ">": "gt", } BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" ")") @classmethod def _substitute_html_entity(cls, matchobj): entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0)) return "&%s;" % entity @classmethod def _substitute_xml_entity(cls, matchobj): """Used with a regular expression to substitute the appropriate XML entity for an XML special character.""" entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)] return "&%s;" % entity @classmethod def quoted_attribute_value(self, value): """Make a value into a quoted XML attribute, possibly escaping it. Most strings will be quoted using double quotes. Bob's Bar -> "Bob's Bar" If a string contains double quotes, it will be quoted using single quotes. Welcome to "my bar" -> 'Welcome to "my bar"' If a string contains both single and double quotes, the double quotes will be escaped, and the string will be quoted using double quotes. Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot; """ quote_with = '"' if '"' in value: if "'" in value: # The string contains both single and double # quotes. Turn the double quotes into # entities. We quote the double quotes rather than # the single quotes because the entity name is # "&quot;" whether this is HTML or XML. If we # quoted the single quotes, we'd have to decide # between &apos; and &squot;. replace_with = "&quot;" value = value.replace('"', replace_with) else: # There are double quotes but no single quotes. # We can use single quotes to quote the attribute. 
quote_with = "'" return quote_with + value + quote_with @classmethod def substitute_xml(cls, value, make_quoted_attribute=False): """Substitute XML entities for special XML characters. :param value: A string to be substituted. The less-than sign will become &lt;, the greater-than sign will become &gt;, and any ampersands that are not part of an entity defition will become &amp;. :param make_quoted_attribute: If True, then the string will be quoted, as befits an attribute value. """ # Escape angle brackets, and ampersands that aren't part of # entities. value = cls.BARE_AMPERSAND_OR_BRACKET.sub( cls._substitute_xml_entity, value) if make_quoted_attribute: value = cls.quoted_attribute_value(value) return value @classmethod def substitute_html(cls, s): """Replace certain Unicode characters with named HTML entities. This differs from data.encode(encoding, 'xmlcharrefreplace') in that the goal is to make the result more readable (to those with ASCII displays) rather than to recover from errors. There's absolutely nothing wrong with a UTF-8 string containg a LATIN SMALL LETTER E WITH ACUTE, but replacing that character with "&eacute;" will make it more readable to some people. """ return cls.CHARACTER_TO_HTML_ENTITY_RE.sub( cls._substitute_html_entity, s) class UnicodeDammit: """A class for detecting the encoding of a *ML document and converting it to a Unicode string. If the source encoding is windows-1252, can replace MS smart quotes with their HTML or XML equivalents.""" # This dictionary maps commonly seen values for "charset" in HTML # meta tags to the corresponding Python codec names. It only covers # values that aren't in Python's aliases and can't be determined # by the heuristics in find_codec. 
# NOTE(review): everything below takes `self` and reads like the body of a
# class whose header precedes this chunk -- confirm, and indent one level
# under that class when the file's line structure is restored.

# Charset names that Python's codec registry knows under a different name.
CHARSET_ALIASES = {"macintosh": "mac-roman",
                   "x-sjis": "shift-jis"}

# Encodings for which _convert_from rewrites bytes 0x80-0x9f (smart quotes
# and friends) before decoding, when smart_quotes_to is set.
ENCODINGS_WITH_SMART_QUOTES = [
    "windows-1252",
    "iso-8859-1",
    "iso-8859-2",
]


def __init__(self, markup, override_encodings=[],
             smart_quotes_to=None, is_html=False):
    """Convert `markup` to Unicode, trying a series of encodings.

    The result is stored in `self.unicode_markup` (None if every attempt
    failed) and the encoding that worked in `self.original_encoding`.

    :param markup: The document, as a bytestring or an already-decoded str.
    :param override_encodings: Encodings to try before the ones declared
        in or sniffed from the document. None is treated as "no overrides".
    :param smart_quotes_to: None to leave bytes 0x80-0x9f alone, 'ascii'
        or 'xml' to pick that substitution; any other value selects named
        HTML entities (see _sub_ms_char).
    :param is_html: If true, also look for an encoding declared in an
        HTML <meta> tag.
    """
    # Work on a private copy: never alias the shared default list, and
    # accept None or a tuple as well as a list.
    override_encodings = list(override_encodings or [])

    self.declared_html_encoding = None
    self.smart_quotes_to = smart_quotes_to
    self.tried_encodings = []
    self.contains_replacement_characters = False

    if markup == '' or isinstance(markup, str):
        # Already Unicode: nothing to detect.
        self.markup = markup
        self.unicode_markup = str(markup)
        self.original_encoding = None
        return

    new_markup, document_encoding, sniffed_encoding = \
        self._detectEncoding(markup, is_html)
    self.markup = new_markup

    u = None
    if new_markup != markup:
        # _detectEncoding modified the markup, then converted it to
        # Unicode and then to UTF-8. So convert it from UTF-8.
        u = self._convert_from("utf8")
        self.original_encoding = sniffed_encoding

    if not u:
        for proposed_encoding in (
                override_encodings + [document_encoding, sniffed_encoding]):
            if proposed_encoding is not None:
                u = self._convert_from(proposed_encoding)
                if u:
                    break

    # If no luck and we have auto-detection library, try that:
    if not u and chardet and not isinstance(self.markup, str):
        u = self._convert_from(chardet.detect(self.markup)['encoding'])

    # As a last resort, try utf-8 and windows-1252:
    if not u:
        for proposed_encoding in ("utf-8", "windows-1252"):
            u = self._convert_from(proposed_encoding)
            if u:
                break

    # As an absolute last resort, try the encodings again with
    # character replacement.
    if not u:
        for proposed_encoding in (
                override_encodings + [
                    document_encoding, sniffed_encoding,
                    "utf-8", "windows-1252"]):
            if proposed_encoding != "ascii":
                u = self._convert_from(proposed_encoding, "replace")
            if u is not None:
                warnings.warn(
                    UnicodeWarning(
                        "Some characters could not be decoded, and were "
                        "replaced with REPLACEMENT CHARACTER."))
                self.contains_replacement_characters = True
                break

    # We could at this point force it to ASCII, but that would
    # destroy so much data that I think giving up is better
    self.unicode_markup = u
    if not u:
        self.original_encoding = None


def _sub_ms_char(self, match):
    """Changes a MS smart quote character to an XML or HTML
    entity, or an ASCII character.

    `match` is a regex match whose group 1 is the single offending byte;
    the replacement is returned as bytes.
    """
    orig = match.group(1)
    if self.smart_quotes_to == 'ascii':
        return self.MS_CHARS_TO_ASCII.get(orig).encode()

    sub = self.MS_CHARS.get(orig)
    if isinstance(sub, tuple):
        # (entity name, hex code point)
        if self.smart_quotes_to == 'xml':
            return b'&#x' + sub[1].encode() + b';'
        return b'&' + sub[0].encode() + b';'
    # Plain-string entries are emitted as-is.
    return sub.encode()


def _convert_from(self, proposed, errors="strict"):
    """Try to decode self.markup using the `proposed` encoding.

    Returns the decoded markup on success (also storing it in
    self.markup and recording self.original_encoding), or None if the
    codec is unknown, was already tried with the same `errors` mode, or
    the decode failed.
    """
    proposed = self.find_codec(proposed)
    if not proposed or (proposed, errors) in self.tried_encodings:
        return None
    self.tried_encodings.append((proposed, errors))
    markup = self.markup

    # Convert smart quotes to HTML if coming from an encoding
    # that might have them.
    if (self.smart_quotes_to is not None
            and proposed.lower() in self.ENCODINGS_WITH_SMART_QUOTES):
        smart_quotes_compiled = re.compile(b"([\x80-\x9f])")
        markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)

    try:
        u = self._to_unicode(markup, proposed, errors)
        self.markup = u
        self.original_encoding = proposed
    except Exception:
        # Deliberately broad: any failure just means "this encoding
        # didn't work", and the caller moves on to the next candidate.
        return None
    return self.markup


def _to_unicode(self, data, encoding, errors="strict"):
    '''Given a bytestring and its encoding, decodes the string into Unicode.
    %encoding is a string recognized by encodings.aliases

    A leading byte order mark is stripped and overrides `encoding`.
    '''
    # strip Byte Order Mark (if present). The markers must be bytes
    # literals: comparing bytes data to str never matches on Python 3.
    if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
            and (data[2:4] != b'\x00\x00'):
        encoding = 'utf-16be'
        data = data[2:]
    elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
            and (data[2:4] != b'\x00\x00'):
        encoding = 'utf-16le'
        data = data[2:]
    elif data[:3] == b'\xef\xbb\xbf':
        encoding = 'utf-8'
        data = data[3:]
    elif data[:4] == b'\x00\x00\xfe\xff':
        encoding = 'utf-32be'
        data = data[4:]
    elif data[:4] == b'\xff\xfe\x00\x00':
        encoding = 'utf-32le'
        data = data[4:]
    return str(data, encoding, errors)


def _detectEncoding(self, xml_data, is_html=False):
    """Given a document, tries to detect its XML encoding.

    Returns a 3-tuple (xml_data, xml_encoding, sniffed_xml_encoding):
    the (possibly re-encoded to UTF-8) data, the encoding declared in
    the document if any, and the encoding guessed from its first bytes.
    """
    xml_encoding = sniffed_xml_encoding = None
    try:
        if xml_data[:4] == b'\x4c\x6f\xa7\x94':
            # EBCDIC
            xml_data = self._ebcdic_to_ascii(xml_data)
        elif xml_data[:4] == b'\x00\x3c\x00\x3f':
            # UTF-16BE
            sniffed_xml_encoding = 'utf-16be'
            xml_data = str(xml_data, 'utf-16be').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xfe\xff') \
                and (xml_data[2:4] != b'\x00\x00'):
            # UTF-16BE with BOM
            sniffed_xml_encoding = 'utf-16be'
            xml_data = str(xml_data[2:], 'utf-16be').encode('utf-8')
        elif xml_data[:4] == b'\x3c\x00\x3f\x00':
            # UTF-16LE
            sniffed_xml_encoding = 'utf-16le'
            xml_data = str(xml_data, 'utf-16le').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xff\xfe') and \
                (xml_data[2:4] != b'\x00\x00'):
            # UTF-16LE with BOM
            sniffed_xml_encoding = 'utf-16le'
            xml_data = str(xml_data[2:], 'utf-16le').encode('utf-8')
        elif xml_data[:4] == b'\x00\x00\x00\x3c':
            # UTF-32BE
            sniffed_xml_encoding = 'utf-32be'
            xml_data = str(xml_data, 'utf-32be').encode('utf-8')
        elif xml_data[:4] == b'\x3c\x00\x00\x00':
            # UTF-32LE
            sniffed_xml_encoding = 'utf-32le'
            xml_data = str(xml_data, 'utf-32le').encode('utf-8')
        elif xml_data[:4] == b'\x00\x00\xfe\xff':
            # UTF-32BE with BOM
            sniffed_xml_encoding = 'utf-32be'
            xml_data = str(xml_data[4:], 'utf-32be').encode('utf-8')
        elif xml_data[:4] == b'\xff\xfe\x00\x00':
            # UTF-32LE with BOM
            sniffed_xml_encoding = 'utf-32le'
            xml_data = str(xml_data[4:], 'utf-32le').encode('utf-8')
        elif xml_data[:3] == b'\xef\xbb\xbf':
            # UTF-8 with BOM
            sniffed_xml_encoding = 'utf-8'
            xml_data = str(xml_data[3:], 'utf-8').encode('utf-8')
        else:
            sniffed_xml_encoding = 'ascii'
    except Exception:
        # Best effort: if the sniffed encoding fails to decode, carry on
        # with the data unchanged (but don't swallow KeyboardInterrupt
        # and friends the way a bare except would).
        pass

    xml_encoding_match = xml_encoding_re.match(xml_data)
    if not xml_encoding_match and is_html:
        xml_encoding_match = html_meta_re.search(xml_data)
    if xml_encoding_match is not None:
        xml_encoding = xml_encoding_match.groups()[0].decode(
            'ascii').lower()
        if is_html:
            self.declared_html_encoding = xml_encoding
        # A declared multi-byte family name is less specific than what
        # the first bytes told us, so prefer the sniffed value.
        if sniffed_xml_encoding and \
                (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                                  'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                                  'utf-16', 'utf-32', 'utf_16', 'utf_32',
                                  'utf16', 'u16')):
            xml_encoding = sniffed_xml_encoding
    return xml_data, xml_encoding, sniffed_xml_encoding


def find_codec(self, charset):
    """Map `charset` to a name the codec registry accepts.

    Tries the alias table, then the name with dashes removed, then with
    dashes turned into underscores; falls back to `charset` unchanged.
    """
    return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
        or (charset and self._codec(charset.replace("-", ""))) \
        or (charset and self._codec(charset.replace("-", "_"))) \
        or charset


def _codec(self, charset):
    """Return `charset` if codecs.lookup knows it, else None.

    A falsy `charset` is returned unchanged.
    """
    if not charset:
        return charset
    codec = None
    try:
        codecs.lookup(charset)
        codec = charset
    except (LookupError, ValueError):
        pass
    return codec


# Translation table built lazily by _ebcdic_to_ascii and cached on the class.
EBCDIC_TO_ASCII_MAP = None


def _ebcdic_to_ascii(self, s):
    """Translate the EBCDIC bytestring `s` to its ASCII equivalent."""
    c = self.__class__
    if not c.EBCDIC_TO_ASCII_MAP:
        emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                250,251,252,253,254,255)
        # bytes.maketrans is the Python 3 replacement for the removed
        # string.maketrans; it needs two 256-byte sequences.
        c.EBCDIC_TO_ASCII_MAP = bytes.maketrans(
            bytes(range(256)), bytes(emap))
    return s.translate(c.EBCDIC_TO_ASCII_MAP)


# A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
# Tuple values are (HTML entity name, hex code point); plain strings are
# substituted literally.
MS_CHARS = {b'\x80': ('euro', '20AC'),
            b'\x81': ' ',
            b'\x82': ('sbquo', '201A'),
            b'\x83': ('fnof', '192'),
            b'\x84': ('bdquo', '201E'),
            b'\x85': ('hellip', '2026'),
            b'\x86': ('dagger', '2020'),
            b'\x87': ('Dagger', '2021'),
            b'\x88': ('circ', '2C6'),
            b'\x89': ('permil', '2030'),
            b'\x8A': ('Scaron', '160'),
            b'\x8B': ('lsaquo', '2039'),
            b'\x8C': ('OElig', '152'),
            b'\x8D': '?',
            b'\x8E': ('#x17D', '17D'),
            b'\x8F': '?',
            b'\x90': '?',
            b'\x91': ('lsquo', '2018'),
            b'\x92': ('rsquo', '2019'),
            b'\x93': ('ldquo', '201C'),
            b'\x94': ('rdquo', '201D'),
            b'\x95': ('bull', '2022'),
            b'\x96': ('ndash', '2013'),
            b'\x97': ('mdash', '2014'),
            b'\x98': ('tilde', '2DC'),
            b'\x99': ('trade', '2122'),
            b'\x9a': ('scaron', '161'),
            b'\x9b': ('rsaquo', '203A'),
            b'\x9c': ('oelig', '153'),
            b'\x9d': '?',
            b'\x9e': ('#x17E', '17E'),
            # U+0178; an empty code point here produced the invalid
            # reference "&#x;" in XML mode.
            b'\x9f': ('Yuml', '178'),}

# A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
# horrors like stripping diacritical marks to turn á into a, but also
# contains non-horrors like turning “ into ".
# NOTE(review): these definitions read like members of a class whose header
# precedes this chunk (detwingle is a classmethod reading cls.* tables) --
# confirm, and indent one level under that class when the file's line
# structure is restored.
MS_CHARS_TO_ASCII = {
    b'\x80' : 'EUR', b'\x81' : ' ', b'\x82' : ',', b'\x83' : 'f',
    b'\x84' : ',,', b'\x85' : '...', b'\x86' : '+', b'\x87' : '++',
    b'\x88' : '^', b'\x89' : '%', b'\x8a' : 'S', b'\x8b' : '<',
    b'\x8c' : 'OE', b'\x8d' : '?', b'\x8e' : 'Z', b'\x8f' : '?',
    b'\x90' : '?', b'\x91' : "'", b'\x92' : "'", b'\x93' : '"',
    b'\x94' : '"', b'\x95' : '*', b'\x96' : '-', b'\x97' : '--',
    b'\x98' : '~', b'\x99' : '(TM)', b'\x9a' : 's', b'\x9b' : '>',
    b'\x9c' : 'oe', b'\x9d' : '?', b'\x9e' : 'z', b'\x9f' : 'Y',
    b'\xa0' : ' ', b'\xa1' : '!', b'\xa2' : 'c', b'\xa3' : 'GBP',
    b'\xa4' : '$', #This approximation is especially parochial--this is the
                   #generic currency symbol.
    b'\xa5' : 'YEN', b'\xa6' : '|', b'\xa7' : 'S', b'\xa8' : '..',
    b'\xa9' : '', b'\xaa' : '(th)', b'\xab' : '<<', b'\xac' : '!',
    b'\xad' : ' ', b'\xae' : '(R)', b'\xaf' : '-', b'\xb0' : 'o',
    b'\xb1' : '+-', b'\xb2' : '2', b'\xb3' : '3',
    # NOTE(review): the only tuple value in this table; every other
    # entry is a plain string -- confirm whether that is intended.
    b'\xb4' : ("'", 'acute'),
    b'\xb5' : 'u', b'\xb6' : 'P', b'\xb7' : '*', b'\xb8' : ',',
    b'\xb9' : '1', b'\xba' : '(th)', b'\xbb' : '>>', b'\xbc' : '1/4',
    b'\xbd' : '1/2', b'\xbe' : '3/4', b'\xbf' : '?',
    b'\xc0' : 'A', b'\xc1' : 'A', b'\xc2' : 'A', b'\xc3' : 'A',
    b'\xc4' : 'A', b'\xc5' : 'A', b'\xc6' : 'AE', b'\xc7' : 'C',
    b'\xc8' : 'E', b'\xc9' : 'E', b'\xca' : 'E', b'\xcb' : 'E',
    b'\xcc' : 'I', b'\xcd' : 'I', b'\xce' : 'I', b'\xcf' : 'I',
    b'\xd0' : 'D', b'\xd1' : 'N', b'\xd2' : 'O', b'\xd3' : 'O',
    b'\xd4' : 'O', b'\xd5' : 'O', b'\xd6' : 'O', b'\xd7' : '*',
    b'\xd8' : 'O', b'\xd9' : 'U', b'\xda' : 'U', b'\xdb' : 'U',
    b'\xdc' : 'U', b'\xdd' : 'Y', b'\xde' : 'b', b'\xdf' : 'B',
    b'\xe0' : 'a', b'\xe1' : 'a', b'\xe2' : 'a', b'\xe3' : 'a',
    b'\xe4' : 'a', b'\xe5' : 'a', b'\xe6' : 'ae', b'\xe7' : 'c',
    b'\xe8' : 'e', b'\xe9' : 'e', b'\xea' : 'e', b'\xeb' : 'e',
    b'\xec' : 'i', b'\xed' : 'i', b'\xee' : 'i', b'\xef' : 'i',
    b'\xf0' : 'o', b'\xf1' : 'n', b'\xf2' : 'o', b'\xf3' : 'o',
    b'\xf4' : 'o', b'\xf5' : 'o', b'\xf6' : 'o', b'\xf7' : '/',
    b'\xf8' : 'o', b'\xf9' : 'u', b'\xfa' : 'u', b'\xfb' : 'u',
    b'\xfc' : 'u', b'\xfd' : 'y', b'\xfe' : 'b', b'\xff' : 'y',
    }

# A map used when removing rogue Windows-1252/ISO-8859-1
# characters in otherwise UTF-8 documents.
#
# Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
# Windows-1252.
WINDOWS_1252_TO_UTF8 = {
    0x80 : b'\xe2\x82\xac', # €
    0x82 : b'\xe2\x80\x9a', # ‚
    0x83 : b'\xc6\x92',     # ƒ
    0x84 : b'\xe2\x80\x9e', # „
    0x85 : b'\xe2\x80\xa6', # …
    0x86 : b'\xe2\x80\xa0', # †
    0x87 : b'\xe2\x80\xa1', # ‡
    0x88 : b'\xcb\x86',     # ˆ
    0x89 : b'\xe2\x80\xb0', # ‰
    0x8a : b'\xc5\xa0',     # Š
    0x8b : b'\xe2\x80\xb9', # ‹
    0x8c : b'\xc5\x92',     # Œ
    0x8e : b'\xc5\xbd',     # Ž
    0x91 : b'\xe2\x80\x98', # ‘
    0x92 : b'\xe2\x80\x99', # ’
    0x93 : b'\xe2\x80\x9c', # “
    0x94 : b'\xe2\x80\x9d', # ”
    0x95 : b'\xe2\x80\xa2', # •
    0x96 : b'\xe2\x80\x93', # –
    0x97 : b'\xe2\x80\x94', # —
    0x98 : b'\xcb\x9c',     # ˜
    0x99 : b'\xe2\x84\xa2', # ™
    0x9a : b'\xc5\xa1',     # š
    0x9b : b'\xe2\x80\xba', # ›
    0x9c : b'\xc5\x93',     # œ
    0x9e : b'\xc5\xbe',     # ž
    0x9f : b'\xc5\xb8',     # Ÿ
    0xa0 : b'\xc2\xa0',     # (non-breaking space)
    0xa1 : b'\xc2\xa1',     # ¡
    0xa2 : b'\xc2\xa2',     # ¢
    0xa3 : b'\xc2\xa3',     # £
    0xa4 : b'\xc2\xa4',     # ¤
    0xa5 : b'\xc2\xa5',     # ¥
    0xa6 : b'\xc2\xa6',     # ¦
    0xa7 : b'\xc2\xa7',     # §
    0xa8 : b'\xc2\xa8',     # ¨
    0xa9 : b'\xc2\xa9',     # ©
    0xaa : b'\xc2\xaa',     # ª
    0xab : b'\xc2\xab',     # «
    0xac : b'\xc2\xac',     # ¬
    0xad : b'\xc2\xad',     # (soft hyphen)
    0xae : b'\xc2\xae',     # ®
    0xaf : b'\xc2\xaf',     # ¯
    0xb0 : b'\xc2\xb0',     # °
    0xb1 : b'\xc2\xb1',     # ±
    0xb2 : b'\xc2\xb2',     # ²
    0xb3 : b'\xc2\xb3',     # ³
    0xb4 : b'\xc2\xb4',     # ´
    0xb5 : b'\xc2\xb5',     # µ
    0xb6 : b'\xc2\xb6',     # ¶
    0xb7 : b'\xc2\xb7',     # ·
    0xb8 : b'\xc2\xb8',     # ¸
    0xb9 : b'\xc2\xb9',     # ¹
    0xba : b'\xc2\xba',     # º
    0xbb : b'\xc2\xbb',     # »
    0xbc : b'\xc2\xbc',     # ¼
    0xbd : b'\xc2\xbd',     # ½
    0xbe : b'\xc2\xbe',     # ¾
    0xbf : b'\xc2\xbf',     # ¿
    0xc0 : b'\xc3\x80',     # À
    0xc1 : b'\xc3\x81',     # Á
    0xc2 : b'\xc3\x82',     # Â
    0xc3 : b'\xc3\x83',     # Ã
    0xc4 : b'\xc3\x84',     # Ä
    0xc5 : b'\xc3\x85',     # Å
    0xc6 : b'\xc3\x86',     # Æ
    0xc7 : b'\xc3\x87',     # Ç
    0xc8 : b'\xc3\x88',     # È
    0xc9 : b'\xc3\x89',     # É
    0xca : b'\xc3\x8a',     # Ê
    0xcb : b'\xc3\x8b',     # Ë
    0xcc : b'\xc3\x8c',     # Ì
    0xcd : b'\xc3\x8d',     # Í
    0xce : b'\xc3\x8e',     # Î
    0xcf : b'\xc3\x8f',     # Ï
    0xd0 : b'\xc3\x90',     # Ð
    0xd1 : b'\xc3\x91',     # Ñ
    0xd2 : b'\xc3\x92',     # Ò
    0xd3 : b'\xc3\x93',     # Ó
    0xd4 : b'\xc3\x94',     # Ô
    0xd5 : b'\xc3\x95',     # Õ
    0xd6 : b'\xc3\x96',     # Ö
    0xd7 : b'\xc3\x97',     # ×
    0xd8 : b'\xc3\x98',     # Ø
    0xd9 : b'\xc3\x99',     # Ù
    0xda : b'\xc3\x9a',     # Ú
    0xdb : b'\xc3\x9b',     # Û
    0xdc : b'\xc3\x9c',     # Ü
    0xdd : b'\xc3\x9d',     # Ý
    0xde : b'\xc3\x9e',     # Þ
    0xdf : b'\xc3\x9f',     # ß
    0xe0 : b'\xc3\xa0',     # à
    0xe1 : b'\xc3\xa1',     # á  (was b'\xa1': lead byte missing)
    0xe2 : b'\xc3\xa2',     # â
    0xe3 : b'\xc3\xa3',     # ã
    0xe4 : b'\xc3\xa4',     # ä
    0xe5 : b'\xc3\xa5',     # å
    0xe6 : b'\xc3\xa6',     # æ
    0xe7 : b'\xc3\xa7',     # ç
    0xe8 : b'\xc3\xa8',     # è
    0xe9 : b'\xc3\xa9',     # é
    0xea : b'\xc3\xaa',     # ê
    0xeb : b'\xc3\xab',     # ë
    0xec : b'\xc3\xac',     # ì
    0xed : b'\xc3\xad',     # í
    0xee : b'\xc3\xae',     # î
    0xef : b'\xc3\xaf',     # ï
    0xf0 : b'\xc3\xb0',     # ð
    0xf1 : b'\xc3\xb1',     # ñ
    0xf2 : b'\xc3\xb2',     # ò
    0xf3 : b'\xc3\xb3',     # ó
    0xf4 : b'\xc3\xb4',     # ô
    0xf5 : b'\xc3\xb5',     # õ
    0xf6 : b'\xc3\xb6',     # ö
    0xf7 : b'\xc3\xb7',     # ÷
    0xf8 : b'\xc3\xb8',     # ø
    0xf9 : b'\xc3\xb9',     # ù
    0xfa : b'\xc3\xba',     # ú
    0xfb : b'\xc3\xbb',     # û
    0xfc : b'\xc3\xbc',     # ü
    0xfd : b'\xc3\xbd',     # ý
    0xfe : b'\xc3\xbe',     # þ
    }

MULTIBYTE_MARKERS_AND_SIZES = [
    (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
    (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
    (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
    ]

FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]


@classmethod
def detwingle(cls, in_bytes, main_encoding="utf8",
              embedded_encoding="windows-1252"):
    """Fix characters from one encoding embedded in some other encoding.

    Currently the only situation supported is Windows-1252 (or its
    subset ISO-8859-1), embedded in UTF-8.

    The input must be a bytestring. If you've already converted
    the document to Unicode, you're too late.

    The output is a bytestring in which `embedded_encoding`
    characters have been converted to their `main_encoding`
    equivalents.

    Raises NotImplementedError for any other encoding pair.
    """
    if embedded_encoding.replace('_', '-').lower() not in (
            'windows-1252', 'windows_1252'):
        raise NotImplementedError(
            "Windows-1252 and ISO-8859-1 are the only currently supported "
            "embedded encodings.")

    if main_encoding.lower() not in ('utf8', 'utf-8'):
        raise NotImplementedError(
            "UTF-8 is the only currently supported main encoding.")

    byte_chunks = []

    chunk_start = 0
    pos = 0
    while pos < len(in_bytes):
        byte = in_bytes[pos]
        if not isinstance(byte, int):
            # Python 2.x
            byte = ord(byte)
        if (byte >= cls.FIRST_MULTIBYTE_MARKER
                and byte <= cls.LAST_MULTIBYTE_MARKER):
            # This is the start of a UTF-8 multibyte character. Skip
            # to the end.
            for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                if byte >= start and byte <= end:
                    pos += size
                    break
        elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
            # We found a Windows-1252 character!
            # Save the string up to this point as a chunk.
            byte_chunks.append(in_bytes[chunk_start:pos])

            # Now translate the Windows-1252 character into UTF-8
            # and add it as another, one-byte chunk.
            byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
            pos += 1
            chunk_start = pos
        else:
            # Go on to the next character.
            pos += 1
    if chunk_start == 0:
        # The string is unchanged.
        return in_bytes
    # Store the final chunk.
    byte_chunks.append(in_bytes[chunk_start:])
    return b''.join(byte_chunks)

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/build/lib/bs4/element.py

import collections
import re
import sys
import warnings
from bs4.dammit import EntitySubstitution

DEFAULT_OUTPUT_ENCODING = "utf-8"
PY3K = (sys.version_info[0] > 2)

whitespace_re = re.compile(r"\s+")


def _alias(attr):
    """Alias one attribute name to another for backward compatibility.

    Returns a property that reads and writes the attribute named `attr`.
    """
    @property
    def alias(self):
        return getattr(self, attr)

    @alias.setter
    def alias(self, value):
        # The setter must accept the assigned value; without it every
        # assignment through the alias raised TypeError.
        setattr(self, attr, value)
    return alias


class NamespacedAttribute(str):
    """A string of the form "prefix:name" that remembers its parts."""

    def __new__(cls, prefix, name, namespace=None):
        if name is None:
            obj = str.__new__(cls, prefix)
        else:
            obj = str.__new__(cls, prefix + ":" + name)
        obj.prefix = prefix
        obj.name = name
        obj.namespace = namespace
        return obj


class AttributeValueWithCharsetSubstitution(str):
    """A stand-in object for a character encoding specified in HTML."""


class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """A generic stand-in for the value of a meta tag's 'charset' attribute.

    When Beautiful Soup parses the markup '<meta charset="utf8">', the
    value of the 'charset' attribute will be one of these objects.
    """

    def __new__(cls, original_value):
        obj = str.__new__(cls, original_value)
        obj.original_value = original_value
        return obj

    def encode(self, encoding):
        # The whole value is the charset, so it becomes the new encoding.
        return encoding


class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """A generic stand-in for the value of a meta tag's 'content' attribute.

    When Beautiful Soup parses the markup:
     <meta http-equiv="content-type" content="text/html; charset=utf8">

    The value of the 'content' attribute will be one of these objects.
    """

    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value):
        match = cls.CHARSET_RE.search(original_value)
        if match is None:
            # No substitution necessary.
            return str.__new__(str, original_value)

        obj = str.__new__(cls, original_value)
        obj.original_value = original_value
        return obj

    def encode(self, encoding):
        def rewrite(match):
            return match.group(1) + encoding
        return self.CHARSET_RE.sub(rewrite, self.original_value)


class PageElement(object):
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text)"""

    # There are four possible values for the "formatter" argument passed in
    # to methods like encode() and prettify():
    #
    # "html" - All Unicode characters with corresponding HTML entities
    #   are converted to those entities on output.
    # "minimal" - Bare ampersands and angle brackets are converted to
    #   XML entities: &amp; &lt; &gt;
    # None - The null formatter. Unicode characters are never
    #   converted to entities.  This is not recommended, but it's
    #   faster than "minimal".
    # A function - This function will be called on every string that
    #  needs to undergo entity substitution
    FORMATTERS = {
        "html" : EntitySubstitution.substitute_html,
        "minimal" : EntitySubstitution.substitute_xml,
        None : None
        }

    @classmethod
    def format_string(self, s, formatter='minimal'):
        """Format the given string using the given formatter."""
        if not callable(formatter):
            formatter = self.FORMATTERS.get(
                formatter, EntitySubstitution.substitute_xml)
        if formatter is None:
            output = s
        else:
            output = formatter(s)
        return output

    def setup(self, parent=None, previous_element=None):
        """Sets up the initial relations between this element and
        other elements."""
        self.parent = parent
        self.previous_element = previous_element
        if previous_element is not None:
            self.previous_element.next_element = self
        self.next_element = None
        self.previous_sibling = None
        self.next_sibling = None
        if self.parent is not None and self.parent.contents:
            self.previous_sibling = self.parent.contents[-1]
            self.previous_sibling.next_sibling = self

    nextSibling = _alias("next_sibling")  # BS3
    previousSibling = _alias("previous_sibling")  # BS3

    def replace_with(self, replace_with):
        """Put `replace_with` where this element is and return this
        (now extracted) element. Replacing an element with itself is a
        no-op that returns None."""
        if replace_with is self:
            return
        if replace_with is self.parent:
            raise ValueError("Cannot replace a Tag with its parent.")
        old_parent = self.parent
        my_index = self.parent.index(self)
        self.extract()
        old_parent.insert(my_index, replace_with)
        return self
    replaceWith = replace_with  # BS3

    def unwrap(self):
        """Replace this element with its own children and return it."""
        my_parent = self.parent
        my_index = self.parent.index(self)
        self.extract()
        # Insert back to front so that every child lands at my_index and
        # the original order is preserved.
        for child in reversed(self.contents[:]):
            my_parent.insert(my_index, child)
        return self
    replace_with_children = unwrap
    replaceWithChildren = unwrap  # BS3

    def wrap(self, wrap_inside):
        """Put this element inside `wrap_inside`, which takes its place
        in the tree. Returns `wrap_inside`."""
        me = self.replace_with(wrap_inside)
        wrap_inside.append(me)
        return wrap_inside

    def extract(self):
        """Destructively rips this element out of the tree."""
        if self.parent is not None:
            del self.parent.contents[self.parent.index(self)]

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        last_child = self._last_descendant()
        next_element = last_child.next_element

        if self.previous_element is not None:
            self.previous_element.next_element = next_element
        if next_element is not None:
            next_element.previous_element = self.previous_element
        self.previous_element = None
        last_child.next_element = None

        self.parent = None
        if self.previous_sibling is not None:
            self.previous_sibling.next_sibling = self.next_sibling
        if self.next_sibling is not None:
            self.next_sibling.previous_sibling = self.previous_sibling
        self.previous_sibling = self.next_sibling = None
        return self

    def _last_descendant(self):
        "Finds the last element beneath this object to be parsed."
        last_child = self
        while hasattr(last_child, 'contents') and last_child.contents:
            last_child = last_child.contents[-1]
        return last_child
    # BS3: Not part of the API!
    _lastRecursiveChild = _last_descendant

    def insert(self, position, new_child):
        """Insert `new_child` into this element's contents at `position`,
        re-linking sibling and document-order pointers around it.

        A plain str is wrapped in a NavigableString first. Raises
        ValueError if asked to insert an element into itself.
        """
        if new_child is self:
            raise ValueError("Cannot insert a tag into itself.")
        if (isinstance(new_child, str)
                and not isinstance(new_child, NavigableString)):
            new_child = NavigableString(new_child)

        position = min(position, len(self.contents))
        if hasattr(new_child, 'parent') and new_child.parent is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if new_child.parent is self:
                current_index = self.index(new_child)
                if current_index < position:
                    # We're moving this element further down the list
                    # of this object's children. That means that when
                    # we extract this element, our target index will
                    # jump down one.
                    position -= 1
            new_child.extract()

        new_child.parent = self
        previous_child = None
        if position == 0:
            new_child.previous_sibling = None
            new_child.previous_element = self
        else:
            previous_child = self.contents[position - 1]
            new_child.previous_sibling = previous_child
            new_child.previous_sibling.next_sibling = new_child
            new_child.previous_element = previous_child._last_descendant()
        if new_child.previous_element is not None:
            new_child.previous_element.next_element = new_child

        new_childs_last_element = new_child._last_descendant()

        if position >= len(self.contents):
            new_child.next_sibling = None

            # Walk up the tree until some ancestor has a next sibling:
            # that sibling is what follows the new child in the document.
            parent = self
            parents_next_sibling = None
            while parents_next_sibling is None and parent is not None:
                parents_next_sibling = parent.next_sibling
                parent = parent.parent
                if parents_next_sibling is not None:
                    # We found the element that comes next in the document.
                    break
            if parents_next_sibling is not None:
                new_childs_last_element.next_element = parents_next_sibling
            else:
                # The last element of this tag is the last element in
                # the document.
                new_childs_last_element.next_element = None
        else:
            next_child = self.contents[position]
            new_child.next_sibling = next_child
            if new_child.next_sibling is not None:
                new_child.next_sibling.previous_sibling = new_child
            new_childs_last_element.next_element = next_child

        if new_childs_last_element.next_element is not None:
            new_childs_last_element.next_element.previous_element = \
                new_childs_last_element
        self.contents.insert(position, new_child)

    def append(self, tag):
        """Appends the given tag to the contents of this tag."""
        self.insert(len(self.contents), tag)

    def insert_before(self, predecessor):
        """Makes the given element the immediate predecessor of this one.

        The two elements will have the same parent, and the given element
        will be immediately before this one.
        """
        if self is predecessor:
            raise ValueError("Can't insert an element before itself.")
        parent = self.parent
        if parent is None:
            raise ValueError(
                "Element has no parent, so 'before' has no meaning.")
        # Extract first so that the index won't be screwed up if they
        # are siblings.
        if isinstance(predecessor, PageElement):
            predecessor.extract()
        index = parent.index(self)
        parent.insert(index, predecessor)

    def insert_after(self, successor):
        """Makes the given element the immediate successor of this one.

        The two elements will have the same parent, and the given element
        will be immediately after this one.
        """
        if self is successor:
            raise ValueError("Can't insert an element after itself.")
        parent = self.parent
        if parent is None:
            raise ValueError(
                "Element has no parent, so 'after' has no meaning.")
        # Extract first so that the index won't be screwed up if they
        # are siblings.
        if isinstance(successor, PageElement):
            successor.extract()
        index = parent.index(self)
        parent.insert(index+1, successor)

    def find_next(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._find_one(self.find_all_next, name, attrs, text, **kwargs)
    findNext = find_next  # BS3

    def find_all_next(self, name=None, attrs={}, text=None, limit=None,
                      **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        return self._find_all(name, attrs, text, limit, self.next_elements,
                              **kwargs)
    findAllNext = find_all_next  # BS3

    def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._find_one(self.find_next_siblings, name, attrs, text,
                              **kwargs)
    findNextSibling = find_next_sibling  # BS3

    def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
                           **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._find_all(name, attrs, text, limit,
                              self.next_siblings, **kwargs)
    findNextSiblings = find_next_siblings   # BS3
    fetchNextSiblings = find_next_siblings  # BS2

    def find_previous(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._find_one(
            self.find_all_previous, name, attrs, text, **kwargs)
    findPrevious = find_previous  # BS3

    def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
                          **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._find_all(name, attrs, text, limit,
                              self.previous_elements, **kwargs)
    findAllPrevious = find_all_previous  # BS3
    fetchPrevious = find_all_previous    # BS2

    def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._find_one(self.find_previous_siblings, name, attrs, text,
                              **kwargs)
    findPreviousSibling = find_previous_sibling  # BS3

    def find_previous_siblings(self, name=None, attrs={}, text=None,
                               limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._find_all(name, attrs, text, limit,
                              self.previous_siblings, **kwargs)
    findPreviousSiblings = find_previous_siblings   # BS3
    fetchPreviousSiblings = find_previous_siblings  # BS2

    def find_parent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria."""
        # NOTE: We can't use _find_one because findParents takes a different
        # set of arguments.
        r = None
        # Forward the keyword filters too; they used to be silently dropped.
        l = self.find_parents(name, attrs, 1, **kwargs)
        if l:
            r = l[0]
        return r
    findParent = find_parent  # BS3

    def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""

        return self._find_all(name, attrs, None, limit, self.parents,
                              **kwargs)
    findParents = find_parents   # BS3
    fetchParents = find_parents  # BS2

    @property
    def next(self):
        return self.next_element

    @property
    def previous(self):
        return self.previous_element

    #These methods do the real heavy lifting.

    def _find_one(self, method, name, attrs, text, **kwargs):
        """Call `method` with a limit of 1 and return its single result,
        or None if there was no match."""
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _find_all(self, name, attrs, text, limit, generator, **kwargs):
        "Iterates over a generator looking for things that match."

        if isinstance(name, SoupStrainer):
            strainer = name
        elif text is None and not limit and not attrs and not kwargs:
            # Optimization to find all tags.
            if name is True or name is None:
                return [element for element in generator
                        if isinstance(element, Tag)]
            # Optimization to find all tags with a given name.
            elif isinstance(name, str):
                return [element for element in generator
                        if isinstance(element, Tag) and element.name == name]
            else:
                strainer = SoupStrainer(name, attrs, text, **kwargs)
        else:
            # Build a SoupStrainer
            strainer = SoupStrainer(name, attrs, text, **kwargs)
        results = ResultSet(strainer)
        for i in generator:
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    #These generators can be used to navigate starting from both
    #NavigableStrings and Tags.
    @property
    def next_elements(self):
        i = self.next_element
        while i is not None:
            yield i
            i = i.next_element

    @property
    def next_siblings(self):
        i = self.next_sibling
        while i is not None:
            yield i
            i = i.next_sibling

    @property
    def previous_elements(self):
        i = self.previous_element
        while i is not None:
            yield i
            i = i.previous_element

    @property
    def previous_siblings(self):
        i = self.previous_sibling
        while i is not None:
            yield i
            i = i.previous_sibling

    @property
    def parents(self):
        i = self.parent
        while i is not None:
            yield i
            i = i.parent

    # Methods for supporting CSS selectors.

    tag_name_re = re.compile('^[a-z0-9]+$')

    # /^(\w+)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
    #   \---/  \---/\-------------/    \-------/
    #     |      |         |               |
    #     |      |         |           The value
    #     |      |    ~,|,^,$,* or =
    #     |   Attribute
    #    Tag
    attribselect_re = re.compile(
        r'^(?P<tag>\w+)?\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)' +
        r'=?"?(?P<value>[^\]"]*)"?\]$'
        )

    def _attr_value_as_string(self, value, default=None):
        """Force an attribute value into a string representation.

        A multi-valued attribute will be converted into a
        space-separated string.
        """
        value = self.get(value, default)
        if isinstance(value, list) or isinstance(value, tuple):
            value = " ".join(value)
        return value

    def _attribute_checker(self, operator, attribute, value=''):
        """Create a function that performs a CSS selector operation.

        Takes an operator, attribute and optional value. Returns a
        function that will return True for elements that match that
        combination.
        """
        if operator == '=':
            # string representation of `attribute` is equal to `value`
            return lambda el: el._attr_value_as_string(attribute) == value
        elif operator == '~':
            # space-separated list representation of `attribute`
            # contains `value`
            def _includes_value(element):
                attribute_value = element.get(attribute, [])
                if not isinstance(attribute_value, list):
                    attribute_value = attribute_value.split()
                return value in attribute_value
            return _includes_value
        elif operator == '^':
            # string representation of `attribute` starts with `value`
            return lambda el: el._attr_value_as_string(
                attribute, '').startswith(value)
        elif operator == '$':
            # string representation of `attribute` ends with `value`
            return lambda el: el._attr_value_as_string(
                attribute, '').endswith(value)
        elif operator == '*':
            # string representation of `attribute` contains `value`
            return lambda el: value in el._attr_value_as_string(attribute, '')
        elif operator == '|':
            # string representation of `attribute` is either exactly
            # `value` or starts with `value` and then a dash.
            def _is_or_starts_with_dash(element):
                attribute_value = element._attr_value_as_string(attribute, '')
                return (attribute_value == value or attribute_value.startswith(
                        value + '-'))
            return _is_or_starts_with_dash
        else:
            return lambda el: el.has_attr(attribute)

    def select(self, selector):
        """Perform a CSS selection operation on the current element.

        Returns the list of matching elements, or an empty list when
        nothing matches or the selector is not understood.
        """
        tokens = selector.split()
        current_context = [self]
        for index, token in enumerate(tokens):
            # Guard index 0: tokens[-1] would wrap around to the last
            # token and could wrongly skip the first one.
            if index > 0 and tokens[index - 1] == '>':
                # already found direct descendants in last step. skip this
                # step.
                continue
            m = self.attribselect_re.match(token)
            if m is not None:
                # Attribute selector
                tag, attribute, operator, value = m.groups()
                if not tag:
                    tag = True
                checker = self._attribute_checker(operator, attribute, value)
                found = []
                for context in current_context:
                    found.extend(
                        [el for el in context.find_all(tag) if checker(el)])
                current_context = found
                continue

            if '#' in token:
                # ID selector
                tag, element_id = token.split('#', 1)
                if tag == "":
                    tag = True
                if not current_context:
                    # An earlier step matched nothing.
                    return []
                el = current_context[0].find(tag, {'id': element_id})
                if el is None:
                    return []  # No match
                current_context = [el]
                continue

            if '.' in token:
                # Class selector
                tag_name, klass = token.split('.', 1)
                if not tag_name:
                    tag_name = True
                classes = set(klass.split('.'))
                found = []

                def classes_match(tag):
                    if tag_name is not True and tag.name != tag_name:
                        return False
                    if not tag.has_attr('class'):
                        return False
                    return classes.issubset(tag['class'])
                for context in current_context:
                    found.extend(context.find_all(classes_match))
                current_context = found
                continue

            if token == '*':
                # Star selector
                found = []
                for context in current_context:
                    found.extend(context.findAll(True))
                current_context = found
                continue

            if token == '>':
                # Child selector
                if index + 1 >= len(tokens):
                    # A trailing '>' has nothing to select.
                    return []
                tag = tokens[index + 1]
                if not tag:
                    tag = True

                found = []
                for context in current_context:
                    found.extend(context.find_all(tag, recursive=False))
                current_context = found
                continue

            # Here we should just have a regular tag
            if not self.tag_name_re.match(token):
                return []
            found = []
            for context in current_context:
                found.extend(context.findAll(token))
            current_context = found
        return current_context

    # Old non-property versions of the generators, for backwards
    # compatibility with BS3.
    def nextGenerator(self):
        return self.next_elements

    def nextSiblingGenerator(self):
        return self.next_siblings

    def previousGenerator(self):
        return self.previous_elements

    def previousSiblingGenerator(self):
        return self.previous_siblings

    def parentGenerator(self):
        return self.parents


class NavigableString(str, PageElement):
    """A piece of text that also carries PageElement navigation links."""

    PREFIX = ''
    SUFFIX = ''

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.
        """
        if isinstance(value, str):
            return str.__new__(cls, value)
        return str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)

    def __getnewargs__(self):
        return (str(self),)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr == 'string':
            return self
        else:
            raise AttributeError(
                "'%s' object has no attribute '%s'" % (
                    self.__class__.__name__, attr))

    def output_ready(self, formatter="minimal"):
        """Return the formatted string wrapped in PREFIX and SUFFIX."""
        output = self.format_string(self, formatter)
        return self.PREFIX + output + self.SUFFIX


class PreformattedString(NavigableString):
    """A NavigableString not subject to the normal formatting rules.

    The string will be passed into the formatter (to trigger side effects),
    but the return value will be ignored.
    """

    def output_ready(self, formatter="minimal"):
        """CData strings are passed into the formatter.
        But the return value is ignored."""
        self.format_string(self, formatter)
        return self.PREFIX + self + self.SUFFIX


class CData(PreformattedString):

    PREFIX = '<![CDATA['
    SUFFIX = ']]>'


class ProcessingInstruction(PreformattedString):

    PREFIX = '<?'
    SUFFIX = '?>'


class Comment(PreformattedString):

    PREFIX = '<!--'
    SUFFIX = '-->'


class Declaration(PreformattedString):
    PREFIX = '<!'
SUFFIX = '!>' class Doctype(PreformattedString): @classmethod def for_name_and_ids(cls, name, pub_id, system_id): value = name if pub_id is not None: value += ' PUBLIC "%s"' % pub_id if system_id is not None: value += ' "%s"' % system_id elif system_id is not None: value += ' SYSTEM "%s"' % system_id return Doctype(value) PREFIX = '<!DOCTYPE ' SUFFIX = '>\n' class Tag(PageElement): """Represents a found HTML tag with its attributes and contents.""" def __init__(self, parser=None, builder=None, name=None, namespace=None, prefix=None, attrs=None, parent=None, previous=None): "Basic constructor." if parser is None: self.parser_class = None else: # We don't actually store the parser object: that lets extracted # chunks be garbage-collected. self.parser_class = parser.__class__ if name is None: raise ValueError("No value provided for new tag's name.") self.name = name self.namespace = namespace self.prefix = prefix if attrs is None: attrs = {} elif builder.cdata_list_attributes: attrs = builder._replace_cdata_list_attribute_values( self.name, attrs) else: attrs = dict(attrs) self.attrs = attrs self.contents = [] self.setup(parent, previous) self.hidden = False # Set up any substitutions, such as the charset in a META tag. if builder is not None: builder.set_up_substitutions(self) self.can_be_empty_element = builder.can_be_empty_element(name) else: self.can_be_empty_element = False parserClass = _alias("parser_class") # BS3 @property def is_empty_element(self): """Is this tag an empty-element tag? (aka a self-closing tag) A tag that has contents is never an empty-element tag. A tag that has no contents may or may not be an empty-element tag. It depends on the builder used to create the tag. If the builder has a designated list of empty-element tags, then only a tag whose name shows up in that list is considered an empty-element tag. If the builder has no designated list of empty-element tags, then any tag with no contents is an empty-element tag. 
""" return len(self.contents) == 0 and self.can_be_empty_element isSelfClosing = is_empty_element # BS3 @property def string(self): """Convenience property to get the single string within this tag. :Return: If this tag has a single string child, return value is that string. If this tag has no children, or more than one child, return value is None. If this tag has one child tag, return value is the 'string' attribute of the child tag, recursively. """ if len(self.contents) != 1: return None child = self.contents[0] if isinstance(child, NavigableString): return child return child.string @string.setter def string(self, string): self.clear() self.append(string.__class__(string)) def _all_strings(self, strip=False): """Yield all child strings, possibly stripping them.""" for descendant in self.descendants: if not isinstance(descendant, NavigableString): continue if strip: descendant = descendant.strip() if len(descendant) == 0: continue yield descendant strings = property(_all_strings) @property def stripped_strings(self): for string in self._all_strings(True): yield string def get_text(self, separator="", strip=False): """ Get all child strings, concatenated using the given separator. """ return separator.join([s for s in self._all_strings(strip)]) getText = get_text text = property(get_text) def decompose(self): """Recursively destroys the contents of this tree.""" self.extract() i = self while i is not None: next = i.next_element i.__dict__.clear() i = next def clear(self, decompose=False): """ Extract all children. If decompose is True, decompose instead. """ if decompose: for element in self.contents[:]: if isinstance(element, Tag): element.decompose() else: element.extract() else: for element in self.contents[:]: element.extract() def index(self, element): """ Find the index of a child by identity, not value. Avoids issues with tag.contents.index(element) getting the index of equal elements. 
""" for i, child in enumerate(self.contents): if child is element: return i raise ValueError("Tag.index: element not in tag") def get(self, key, default=None): """Returns the value of the 'key' attribute for the tag, or the value given for 'default' if it doesn't have that attribute.""" return self.attrs.get(key, default) def has_attr(self, key): return key in self.attrs def __hash__(self): return str(self).__hash__() def __getitem__(self, key): """tag[key] returns the value of the 'key' attribute for the tag, and throws an exception if it's not there.""" return self.attrs[key] def __iter__(self): "Iterating over a tag iterates over its contents." return iter(self.contents) def __len__(self): "The length of a tag is the length of its list of contents." return len(self.contents) def __contains__(self, x): return x in self.contents def __bool__(self): "A tag is non-None even if it has no contents." return True def __setitem__(self, key, value): """Setting tag[key] sets the value of the 'key' attribute for the tag.""" self.attrs[key] = value def __delitem__(self, key): "Deleting tag[key] deletes all 'key' attributes for the tag." self.attrs.pop(key, None) def __call__(self, *args, **kwargs): """Calling a tag like a function is the same as calling its find_all() method. Eg. tag('a') returns a list of all the A tags found within this tag.""" return self.find_all(*args, **kwargs) def __getattr__(self, tag): #print "Getattr %s.%s" % (self.__class__, tag) if len(tag) > 3 and tag.endswith('Tag'): # BS3: soup.aTag -> "soup.find("a") tag_name = tag[:-3] warnings.warn( '.%sTag is deprecated, use .find("%s") instead.' % ( tag_name, tag_name)) return self.find(tag_name) # We special case contents to avoid recursion. 
elif not tag.startswith("__") and not tag=="contents": return self.find(tag) raise AttributeError( "'%s' object has no attribute '%s'" % (self.__class__, tag)) def __eq__(self, other): """Returns true iff this tag has the same name, the same attributes, and the same contents (recursively) as the given tag.""" if self is other: return True if (not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other)): return False for i, my_child in enumerate(self.contents): if my_child != other.contents[i]: return False return True def __ne__(self, other): """Returns true iff this tag is not identical to the other tag, as defined in __eq__.""" return not self == other def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): """Renders this tag as a string.""" return self.encode(encoding) def __unicode__(self): return self.decode() def __str__(self): return self.encode() if PY3K: __str__ = __repr__ = __unicode__ def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, indent_level=None, formatter="minimal", errors="xmlcharrefreplace"): # Turn the data structure into Unicode, then encode the # Unicode. u = self.decode(indent_level, encoding, formatter) return u.encode(encoding, errors) def decode(self, indent_level=None, eventual_encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"): """Returns a Unicode representation of this tag and its contents. :param eventual_encoding: The tag is destined to be encoded into this encoding. This method is _not_ responsible for performing that encoding. This information is passed in so that it can be substituted in if the document contains a <META> tag that mentions the document's encoding. 
""" attrs = [] if self.attrs: for key, val in sorted(self.attrs.items()): if val is None: decoded = key else: if isinstance(val, list) or isinstance(val, tuple): val = ' '.join(val) elif not isinstance(val, str): val = str(val) elif ( isinstance(val, AttributeValueWithCharsetSubstitution) and eventual_encoding is not None): val = val.encode(eventual_encoding) text = self.format_string(val, formatter) decoded = ( str(key) + '=' + EntitySubstitution.quoted_attribute_value(text)) attrs.append(decoded) close = '' closeTag = '' if self.is_empty_element: close = '/' else: closeTag = '</%s>' % self.name prefix = '' if self.prefix: prefix = self.prefix + ":" pretty_print = (indent_level is not None) if pretty_print: space = (' ' * (indent_level - 1)) indent_contents = indent_level + 1 else: space = '' indent_contents = None contents = self.decode_contents( indent_contents, eventual_encoding, formatter) if self.hidden: # This is the 'document root' object. s = contents else: s = [] attribute_string = '' if attrs: attribute_string = ' ' + ' '.join(attrs) if pretty_print: s.append(space) s.append('<%s%s%s%s>' % ( prefix, self.name, attribute_string, close)) if pretty_print: s.append("\n") s.append(contents) if pretty_print and contents and contents[-1] != "\n": s.append("\n") if pretty_print and closeTag: s.append(space) s.append(closeTag) if pretty_print and closeTag and self.next_sibling: s.append("\n") s = ''.join(s) return s def prettify(self, encoding=None, formatter="minimal"): if encoding is None: return self.decode(True, formatter=formatter) else: return self.encode(encoding, True, formatter=formatter) def decode_contents(self, indent_level=None, eventual_encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"): """Renders the contents of this tag as a Unicode string. :param eventual_encoding: The tag is destined to be encoded into this encoding. This method is _not_ responsible for performing that encoding. 
This information is passed in so that it can be substituted in if the document contains a <META> tag that mentions the document's encoding. """ pretty_print = (indent_level is not None) s = [] for c in self: text = None if isinstance(c, NavigableString): text = c.output_ready(formatter) elif isinstance(c, Tag): s.append(c.decode(indent_level, eventual_encoding, formatter)) if text and indent_level: text = text.strip() if text: if pretty_print: s.append(" " * (indent_level - 1)) s.append(text) if pretty_print: s.append("\n") return ''.join(s) def encode_contents( self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"): """Renders the contents of this tag as a bytestring.""" contents = self.decode_contents(indent_level, encoding, formatter) return contents.encode(encoding) # Old method for BS3 compatibility def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, prettyPrint=False, indentLevel=0): if not prettyPrint: indentLevel = None return self.encode_contents( indent_level=indentLevel, encoding=encoding) #Soup methods def find(self, name=None, attrs={}, recursive=True, text=None, **kwargs): """Return only the first child of this Tag matching the given criteria.""" r = None l = self.find_all(name, attrs, recursive, text, 1, **kwargs) if l: r = l[0] return r findChild = find def find_all(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs): """Extracts a list of Tag objects that match the given criteria. You can specify the name of the Tag and any attributes you want the Tag to have. The value of a key-value pair in the 'attrs' map can be a string, a list of strings, a regular expression object, or a callable that takes a string and returns whether or not the string matches for some custom definition of 'matches'. 
The same is true of the tag name.""" generator = self.descendants if not recursive: generator = self.children return self._find_all(name, attrs, text, limit, generator, **kwargs) findAll = find_all # BS3 findChildren = find_all # BS2 #Generator methods @property def children(self): # return iter() to make the purpose of the method clear return iter(self.contents) # XXX This seems to be untested. @property def descendants(self): if not len(self.contents): return stopNode = self._last_descendant().next_element current = self.contents[0] while current is not stopNode: yield current current = current.next_element # Old names for backwards compatibility def childGenerator(self): return self.children def recursiveChildGenerator(self): return self.descendants # This was kind of misleading because has_key() (attributes) was # different from __in__ (contents). has_key() is gone in Python 3, # anyway. has_key = has_attr # Next, a couple classes to represent queries and their results. class SoupStrainer(object): """Encapsulates a number of ways of matching a markup element (tag or text).""" def __init__(self, name=None, attrs={}, text=None, **kwargs): self.name = self._normalize_search_value(name) if not isinstance(attrs, dict): # Treat a non-dict value for attrs as a search for the 'class' # attribute. kwargs['class'] = attrs attrs = None if kwargs: if attrs: attrs = attrs.copy() attrs.update(kwargs) else: attrs = kwargs normalized_attrs = {} for key, value in list(attrs.items()): normalized_attrs[key] = self._normalize_search_value(value) self.attrs = normalized_attrs self.text = self._normalize_search_value(text) def _normalize_search_value(self, value): # Leave it alone if it's a Unicode string, a callable, a # regular expression, a boolean, or None. if (isinstance(value, str) or callable(value) or hasattr(value, 'match') or isinstance(value, bool) or value is None): return value # If it's a bytestring, convert it to Unicode, treating it as UTF-8. 
if isinstance(value, bytes): return value.decode("utf8") # If it's listlike, convert it into a list of strings. if hasattr(value, '__iter__'): new_value = [] for v in value: if (hasattr(v, '__iter__') and not isinstance(v, bytes) and not isinstance(v, str)): # This is almost certainly the user's mistake. In the # interests of avoiding infinite loops, we'll let # it through as-is rather than doing a recursive call. new_value.append(v) else: new_value.append(self._normalize_search_value(v)) return new_value # Otherwise, convert it into a Unicode string. # The unicode(str()) thing is so this will do the same thing on Python 2 # and Python 3. return str(str(value)) def __str__(self): if self.text: return self.text else: return "%s|%s" % (self.name, self.attrs) def search_tag(self, markup_name=None, markup_attrs={}): found = None markup = None if isinstance(markup_name, Tag): markup = markup_name markup_attrs = markup call_function_with_tag_data = ( isinstance(self.name, collections.Callable) and not isinstance(markup_name, Tag)) if ((not self.name) or call_function_with_tag_data or (markup and self._matches(markup, self.name)) or (not markup and self._matches(markup_name, self.name))): if call_function_with_tag_data: match = self.name(markup_name, markup_attrs) else: match = True markup_attr_map = None for attr, match_against in list(self.attrs.items()): if not markup_attr_map: if hasattr(markup_attrs, 'get'): markup_attr_map = markup_attrs else: markup_attr_map = {} for k, v in markup_attrs: markup_attr_map[k] = v attr_value = markup_attr_map.get(attr) if not self._matches(attr_value, match_against): match = False break if match: if markup: found = markup else: found = markup_name if found and self.text and not self._matches(found.string, self.text): found = None return found searchTag = search_tag def search(self, markup): # print 'looking for %s in %s' % (self, markup) found = None # If given a list of items, scan it for a text element that # matches. 
if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)): for element in markup: if isinstance(element, NavigableString) \ and self.search(element): found = element break # If it's a Tag, make sure its name or attributes match. # Don't bother with Tags if we're searching for text. elif isinstance(markup, Tag): if not self.text or self.name or self.attrs: found = self.search_tag(markup) # If it's text, make sure the text matches. elif isinstance(markup, NavigableString) or \ isinstance(markup, str): if not self.name and not self.attrs and self._matches(markup, self.text): found = markup else: raise Exception( "I don't know how to match against a %s" % markup.__class__) return found def _matches(self, markup, match_against): # print u"Matching %s against %s" % (markup, match_against) result = False if isinstance(markup, list) or isinstance(markup, tuple): # This should only happen when searching a multi-valued attribute # like 'class'. if (isinstance(match_against, str) and ' ' in match_against): # A bit of a special case. If they try to match "foo # bar" on a multivalue attribute's value, only accept # the literal value "foo bar" # # XXX This is going to be pretty slow because we keep # splitting match_against. But it shouldn't come up # too often. return (whitespace_re.split(match_against) == markup) else: for item in markup: if self._matches(item, match_against): return True return False if match_against is True: # True matches any non-None value. return markup is not None if isinstance(match_against, collections.Callable): return match_against(markup) # Custom callables take the tag as an argument, but all # other ways of matching match the tag name as a string. if isinstance(markup, Tag): markup = markup.name # Ensure that `markup` is either a Unicode string, or None. markup = self._normalize_search_value(markup) if markup is None: # None matches None, False, an empty string, an empty list, and so on. 
return not match_against if isinstance(match_against, str): # Exact string match return markup == match_against if hasattr(match_against, 'match'): # Regexp match return match_against.search(markup) if hasattr(match_against, '__iter__'): # The markup must be an exact match against something # in the iterable. return markup in match_against class ResultSet(list): """A ResultSet is just a list that keeps track of the SoupStrainer that created it.""" def __init__(self, source): list.__init__([]) self.source = source

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/build/lib/bs4/testing.py

"""Helper classes for tests.""" import copy import functools import unittest from unittest import TestCase from bs4 import BeautifulSoup from bs4.element import ( CharsetMetaAttributeValue, Comment, ContentMetaAttributeValue, Doctype, SoupStrainer, ) from bs4.builder import HTMLParserTreeBuilder default_builder = HTMLParserTreeBuilder class SoupTest(unittest.TestCase): @property def default_builder(self): return default_builder() def soup(self, markup, **kwargs): """Build a Beautiful Soup object from markup.""" builder = kwargs.pop('builder', self.default_builder) return BeautifulSoup(markup, builder=builder, **kwargs) def document_for(self, markup): """Turn an HTML fragment into a document. The details depend on the builder. """ return self.default_builder.test_fragment_to_document(markup) def assertSoupEquals(self, to_parse, compare_parsed_to=None): builder = self.default_builder obj = BeautifulSoup(to_parse, builder=builder) if compare_parsed_to is None: compare_parsed_to = to_parse self.assertEqual(obj.decode(), self.document_for(compare_parsed_to)) class HTMLTreeBuilderSmokeTest(object): """A basic test of a treebuilder's competence. Any HTML treebuilder, present or future, should be able to pass these tests. With invalid markup, there's room for interpretation, and different parsers can handle it differently. But with the markup in these tests, there's not much room for interpretation. """ def assertDoctypeHandled(self, doctype_fragment): """Assert that a given doctype string is handled correctly.""" doctype_str, soup = self._document_with_doctype(doctype_fragment) # Make sure a Doctype object was created. doctype = soup.contents[0] self.assertEqual(doctype.__class__, Doctype) self.assertEqual(doctype, doctype_fragment) self.assertEqual(str(soup)[:len(doctype_str)], doctype_str) # Make sure that the doctype was correctly associated with the # parse tree and that the rest of the document parsed. 
self.assertEqual(soup.p.contents[0], 'foo') def _document_with_doctype(self, doctype_fragment): """Generate and parse a document with the given doctype.""" doctype = '' % doctype_fragment markup = doctype + '\n

foo

' soup = self.soup(markup) return doctype, soup def test_normal_doctypes(self): """Make sure normal, everyday HTML doctypes are handled correctly.""" self.assertDoctypeHandled("html") self.assertDoctypeHandled( 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"') def test_public_doctype_with_url(self): doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"' self.assertDoctypeHandled(doctype) def test_system_doctype(self): self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"') def test_namespaced_system_doctype(self): # We can handle a namespaced doctype with a system ID. self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"') def test_namespaced_public_doctype(self): # Test a namespaced doctype with a public id. self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"') def test_real_xhtml_document(self): """A real XHTML document should come out more or less the same as it went in.""" markup = b""" Goodbye.""" soup = self.soup(markup) self.assertEqual( soup.encode("utf-8").replace(b"\n", b""), markup.replace(b"\n", b"")) def test_deepcopy(self): """Make sure you can copy the tree builder. This is important because the builder is part of a BeautifulSoup object, and we want to be able to copy that. """ copy.deepcopy(self.default_builder) def test_p_tag_is_never_empty_element(self): """A

tag is never designated as an empty-element tag. Even if the markup shows it as an empty-element tag, it shouldn't be presented that way. """ soup = self.soup("

") self.assertFalse(soup.p.is_empty_element) self.assertEqual(str(soup.p), "

") def test_unclosed_tags_get_closed(self): """A tag that's not closed by the end of the document should be closed. This applies to all tags except empty-element tags. """ self.assertSoupEquals("

", "

") self.assertSoupEquals("", "") self.assertSoupEquals(" ", " ") def test_br_is_always_empty_element_tag(self): """A tag is designated as an empty-element tag. Some parsers treat as one tag, some parsers as two tags, but it should always be an empty-element tag. """ soup = self.soup(" ") self.assertTrue(soup.br.is_empty_element) self.assertEqual(str(soup.br), " ") def test_nested_formatting_elements(self): self.assertSoupEquals("") def test_comment(self): # Comments are represented as Comment objects. markup = "

foobaz

" self.assertSoupEquals(markup) soup = self.soup(markup) comment = soup.find(text="foobar") self.assertEqual(comment.__class__, Comment) def test_preserved_whitespace_in_pre_and_textarea(self): """Whitespace must be preserved in
 and  tags."""
        self.assertSoupEquals("
   
") self.assertSoupEquals(" woo ") def test_nested_inline_elements(self): """Inline elements can be nested indefinitely.""" b_tag = "Inside a B tag" self.assertSoupEquals(b_tag) nested_b_tag = "

A nested tag

" self.assertSoupEquals(nested_b_tag) double_nested_b_tag = "

A doubly nested tag

" self.assertSoupEquals(nested_b_tag) def test_nested_block_level_elements(self): """Block elements can be nested.""" soup = self.soup('

Foo

') blockquote = soup.blockquote self.assertEqual(blockquote.p.b.string, 'Foo') self.assertEqual(blockquote.b.string, 'Foo') def test_correctly_nested_tables(self): """One table can go inside another one.""" markup = (' ' '
' "
Here's another table:" ' ' '
foo
' '') self.assertSoupEquals( markup, '
Here\'s another table:' '
foo
' '
') self.assertSoupEquals( "
Foo
" "
Bar
" "
Baz
") def test_angle_brackets_in_attribute_values_are_escaped(self): self.assertSoupEquals('', '') def test_entities_in_attributes_converted_to_unicode(self): expect = '

' self.assertSoupEquals('

', expect) self.assertSoupEquals('

', expect) self.assertSoupEquals('

', expect) def test_entities_in_text_converted_to_unicode(self): expect = '

pi\N{LATIN SMALL LETTER N WITH TILDE}ata

' self.assertSoupEquals("

piñata

", expect) self.assertSoupEquals("

piñata

", expect) self.assertSoupEquals("

piñata

", expect) def test_quot_entity_converted_to_quotation_mark(self): self.assertSoupEquals("

I said "good day!"

", '

I said "good day!"

') def test_out_of_range_entity(self): expect = "\N{REPLACEMENT CHARACTER}" self.assertSoupEquals("&#10000000000000;", expect) self.assertSoupEquals("&#x10000000000000;", expect) self.assertSoupEquals("빲�", expect) def test_basic_namespaces(self): """Parsers don't need to *understand* namespaces, but at the very least they should not choke on namespaces or lose data.""" markup = b'4' soup = self.soup(markup) self.assertEqual(markup, soup.encode()) html = soup.html self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns']) self.assertEqual( 'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml']) self.assertEqual( 'http://www.w3.org/2000/svg', soup.html['xmlns:svg']) def test_multivalued_attribute_value_becomes_list(self): markup = b'
' soup = self.soup(markup) self.assertEqual(['foo', 'bar'], soup.a['class']) # # Generally speaking, tests below this point are more tests of # Beautiful Soup than tests of the tree builders. But parsers are # weird, so we run these tests separately for every tree builder # to detect any differences between them. # def test_soupstrainer(self): """Parsers should be able to work with SoupStrainers.""" strainer = SoupStrainer("b") soup = self.soup("A bold statement", parse_only=strainer) self.assertEqual(soup.decode(), "bold") def test_single_quote_attribute_values_become_double_quotes(self): self.assertSoupEquals("", '') def test_attribute_values_with_nested_quotes_are_left_alone(self): text = """a""" self.assertSoupEquals(text) def test_attribute_values_with_double_nested_quotes_get_quoted(self): text = """a""" soup = self.soup(text) soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"' self.assertSoupEquals( soup.foo.decode(), """a""") def test_ampersand_in_attribute_value_gets_escaped(self): self.assertSoupEquals('', '') self.assertSoupEquals( 'foo', 'foo') def test_escaped_ampersand_in_attribute_value_is_left_alone(self): self.assertSoupEquals('') def test_entities_in_strings_converted_during_parsing(self): # Both XML and HTML entities are converted to Unicode characters # during parsing. text = "

<<sacré bleu!>>

" expected = "

<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>

" self.assertSoupEquals(text, expected) def test_smart_quotes_converted_on_the_way_in(self): # Microsoft smart quotes are converted to Unicode characters during # parsing. quote = b"

\x91Foo\x92

" soup = self.soup(quote) self.assertEqual( soup.p.string, "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") def test_non_breaking_spaces_converted_on_the_way_in(self): soup = self.soup("
  ") self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2) def test_entities_converted_on_the_way_out(self): text = "

<<sacré bleu!>>

" expected = "

<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>

".encode("utf-8") soup = self.soup(text) self.assertEqual(soup.p.encode("utf-8"), expected) def test_real_iso_latin_document(self): # Smoke test of interrelated functionality, using an # easy-to-understand document. # Here it is in Unicode. Note that it claims to be in ISO-Latin-1. unicode_html = '

Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!

' # That's because we're going to encode it into ISO-Latin-1, and use # that to test. iso_latin_html = unicode_html.encode("iso-8859-1") # Parse the ISO-Latin-1 HTML. soup = self.soup(iso_latin_html) # Encode it to UTF-8. result = soup.encode("utf-8") # What do we expect the result to look like? Well, it would # look like unicode_html, except that the META tag would say # UTF-8 instead of ISO-Latin-1. expected = unicode_html.replace("ISO-Latin-1", "utf-8") # And, of course, it would be in UTF-8, not Unicode. expected = expected.encode("utf-8") # Ta-da! self.assertEqual(result, expected) def test_real_shift_jis_document(self): # Smoke test to make sure the parser can handle a document in # Shift-JIS encoding, without choking. shift_jis_html = ( b'
'
            b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
            b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
            b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
            b'
') unicode_html = shift_jis_html.decode("shift-jis") soup = self.soup(unicode_html) # Make sure the parse tree is correctly encoded to various # encodings. self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8")) self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp")) def test_real_hebrew_document(self): # A real-world test to make sure we can convert ISO-8859-8 (a # Hebrew encoding) to UTF-8. hebrew_document = b'

Hebrew (ISO 8859-8) in Visual Directionality

\xed\xe5\xec\xf9' soup = self.soup( hebrew_document, from_encoding="iso8859-8") self.assertEqual(soup.original_encoding, 'iso8859-8') self.assertEqual( soup.encode('utf-8'), hebrew_document.decode("iso8859-8").encode("utf-8")) def test_meta_tag_reflects_current_encoding(self): # Here's the tag saying that a document is # encoded in Shift-JIS. meta_tag = ('') # Here's a document incorporating that meta tag. shift_jis_html = ( '\n%s\n' '' 'Shift-JIS markup goes here.') % meta_tag soup = self.soup(shift_jis_html) # Parse the document, and the charset is seemingly unaffected. parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'}) content = parsed_meta['content'] self.assertEqual('text/html; charset=x-sjis', content) # But that value is actually a ContentMetaAttributeValue object. self.assertTrue(isinstance(content, ContentMetaAttributeValue)) # And it will take on a value that reflects its current # encoding. self.assertEqual('text/html; charset=utf8', content.encode("utf8")) # For the rest of the story, see TestSubstitutions in # test_tree.py. def test_html5_style_meta_tag_reflects_current_encoding(self): # Here's the tag saying that a document is # encoded in Shift-JIS. meta_tag = ('') # Here's a document incorporating that meta tag. shift_jis_html = ( '\n%s\n' '' 'Shift-JIS markup goes here.') % meta_tag soup = self.soup(shift_jis_html) # Parse the document, and the charset is seemingly unaffected. parsed_meta = soup.find('meta', id="encoding") charset = parsed_meta['charset'] self.assertEqual('x-sjis', charset) # But that value is actually a CharsetMetaAttributeValue object. self.assertTrue(isinstance(charset, CharsetMetaAttributeValue)) # And it will take on a value that reflects its current # encoding. 
self.assertEqual('utf8', charset.encode("utf8")) def test_tag_with_no_attributes_can_have_attributes_added(self): data = self.soup("text") data.a['foo'] = 'bar' self.assertEqual('text', data.a.decode()) class XMLTreeBuilderSmokeTest(object): def test_docstring_generated(self): soup = self.soup("") self.assertEqual( soup.encode(), b'\n') def test_real_xhtml_document(self): """A real XHTML document should come out *exactly* the same as it went in.""" markup = b""" Goodbye.""" soup = self.soup(markup) self.assertEqual( soup.encode("utf-8"), markup) def test_docstring_includes_correct_encoding(self): soup = self.soup("") self.assertEqual( soup.encode("latin1"), b'\n') def test_large_xml_document(self): """A large XML document should come out the same as it went in.""" markup = (b'\n' + b'0' * (2**12) + b'') soup = self.soup(markup) self.assertEqual(soup.encode("utf-8"), markup) def test_tags_are_empty_element_if_and_only_if_they_are_empty(self): self.assertSoupEquals("

", "

") self.assertSoupEquals("

foo

") def test_namespaces_are_preserved(self): markup = 'This tag is in the a namespaceThis tag is in the b namespace' soup = self.soup(markup) root = soup.root self.assertEqual("http://example.com/", root['xmlns:a']) self.assertEqual("http://example.net/", root['xmlns:b']) class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): """Smoke test for a tree builder that supports HTML5.""" def test_real_xhtml_document(self): # Since XHTML is not HTML5, HTML5 parsers are not tested to handle # XHTML documents in any particular way. pass def test_html_tags_have_namespace(self): markup = "" soup = self.soup(markup) self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace) def test_svg_tags_have_namespace(self): markup = '' soup = self.soup(markup) namespace = "http://www.w3.org/2000/svg" self.assertEqual(namespace, soup.svg.namespace) self.assertEqual(namespace, soup.circle.namespace) def test_mathml_tags_have_namespace(self): markup = '5' soup = self.soup(markup) namespace = 'http://www.w3.org/1998/Math/MathML' self.assertEqual(namespace, soup.math.namespace) self.assertEqual(namespace, soup.msqrt.namespace) def skipIf(condition, reason): def nothing(test, *args, **kwargs): return None def decorator(test_item): if condition: return nothing else: return test_item return decorator

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/build/lib/bs4/tests/test_builder_registry.py

"""Tests of the builder registry."""

import unittest

from bs4 import BeautifulSoup
from bs4.builder import (
    builder_registry as registry,
    HTMLParserTreeBuilder,
    TreeBuilderRegistry,
)

# The html5lib- and lxml-backed builders may fail to import; the flags
# below record whether each one is available so the tests can adapt.
try:
    from bs4.builder import HTML5TreeBuilder
    HTML5LIB_PRESENT = True
except ImportError:
    HTML5LIB_PRESENT = False

try:
    from bs4.builder import (
        LXMLTreeBuilderForXML,
        LXMLTreeBuilder,
    )
    LXML_PRESENT = True
except ImportError:
    LXML_PRESENT = False


class BuiltInRegistryTest(unittest.TestCase):
    """Test the built-in registry with the default builders registered."""

    def test_combination(self):
        # Looking up several features at once returns the builder that
        # advertises all of them.
        if LXML_PRESENT:
            self.assertEqual(registry.lookup('fast', 'html'),
                             LXMLTreeBuilder)
            self.assertEqual(registry.lookup('permissive', 'xml'),
                             LXMLTreeBuilderForXML)
        self.assertEqual(registry.lookup('strict', 'html'),
                         HTMLParserTreeBuilder)
        if HTML5LIB_PRESENT:
            self.assertEqual(registry.lookup('html5lib', 'html'),
                             HTML5TreeBuilder)

    def test_lookup_by_markup_type(self):
        if LXML_PRESENT:
            self.assertEqual(registry.lookup('html'), LXMLTreeBuilder)
            self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML)
        else:
            self.assertIsNone(registry.lookup('xml'))
            # These expectations only make sense when lxml is absent:
            # with lxml present, lookup('html') is already asserted to
            # be LXMLTreeBuilder above.
            if HTML5LIB_PRESENT:
                self.assertEqual(registry.lookup('html'), HTML5TreeBuilder)
            else:
                self.assertEqual(registry.lookup('html'),
                                 HTMLParserTreeBuilder)

    def test_named_library(self):
        # A builder can be looked up by the name of its backing library.
        if LXML_PRESENT:
            self.assertEqual(registry.lookup('lxml', 'xml'),
                             LXMLTreeBuilderForXML)
            self.assertEqual(registry.lookup('lxml', 'html'),
                             LXMLTreeBuilder)
        if HTML5LIB_PRESENT:
            self.assertEqual(registry.lookup('html5lib'),
                             HTML5TreeBuilder)

        self.assertEqual(registry.lookup('html.parser'),
                         HTMLParserTreeBuilder)

    def test_beautifulsoup_constructor_does_lookup(self):
        # You can pass in a string.
        BeautifulSoup("", features="html")
        # Or a list of strings.
        BeautifulSoup("", features=["html", "fast"])

        # You'll get an exception if BS can't find an appropriate
        # builder.
        self.assertRaises(ValueError, BeautifulSoup,
                          "", features="no-such-feature")


class RegistryTest(unittest.TestCase):
    """Test the TreeBuilderRegistry class in general."""

    def setUp(self):
        # Every test starts from an empty, private registry.
        self.registry = TreeBuilderRegistry()

    def builder_for_features(self, *feature_list):
        """Create a stub builder class, register it, and return it.

        The stub is a bare class whose only attribute is ``features``,
        set to the tuple of feature names passed in.
        """
        cls = type('Builder_' + '_'.join(feature_list),
                   (object,), {'features' : feature_list})

        self.registry.register(cls)
        return cls

    def test_register_with_no_features(self):
        builder = self.builder_for_features()

        # Since the builder advertises no features, you can't find it
        # by looking up features.
        self.assertIsNone(self.registry.lookup('foo'))

        # But you can find it by doing a lookup with no features, if
        # this happens to be the only registered builder.
        self.assertEqual(self.registry.lookup(), builder)

    def test_register_with_features_makes_lookup_succeed(self):
        builder = self.builder_for_features('foo', 'bar')
        self.assertEqual(self.registry.lookup('foo'), builder)
        self.assertEqual(self.registry.lookup('bar'), builder)

    def test_lookup_fails_when_no_builder_implements_feature(self):
        self.builder_for_features('foo', 'bar')
        self.assertIsNone(self.registry.lookup('baz'))

    def test_lookup_gets_most_recent_registration_when_no_feature_specified(self):
        self.builder_for_features('foo')
        builder2 = self.builder_for_features('bar')
        self.assertEqual(self.registry.lookup(), builder2)

    def test_lookup_fails_when_no_tree_builders_registered(self):
        self.assertIsNone(self.registry.lookup())

    def test_lookup_gets_most_recent_builder_supporting_all_features(self):
        # Registration order matters here, so the calls stay in their
        # original sequence; only the two builders the assertions use
        # are bound to names.
        self.builder_for_features('foo')
        self.builder_for_features('bar')
        has_both_early = self.builder_for_features('foo', 'bar', 'baz')
        has_both_late = self.builder_for_features('foo', 'bar', 'quux')
        self.builder_for_features('bar')
        self.builder_for_features('foo')

        # There are two builders featuring 'foo' and 'bar', but
        # the one that also features 'quux' was registered later.
        self.assertEqual(self.registry.lookup('foo', 'bar'),
                         has_both_late)

        # There is only one builder featuring 'foo', 'bar', and 'baz'.
        self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'),
                         has_both_early)

    def test_lookup_fails_when_cannot_reconcile_requested_features(self):
        # No single builder offers both 'bar' and 'baz'.
        self.builder_for_features('foo', 'bar')
        self.builder_for_features('foo', 'baz')
        self.assertIsNone(self.registry.lookup('bar', 'baz'))

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/build/lib/bs4/tests/test_docs.py

"Test harness for doctests."

# pylint: disable-msg=E0611,W0142

__metaclass__ = type

# ``additional_tests`` exists only as commented-out code below, so it must
# not be advertised here: listing an undefined name in ``__all__`` makes
# ``from <this module> import *`` raise AttributeError.
__all__ = []

import atexit
import doctest
import os
#from pkg_resources import (
#    resource_filename, resource_exists, resource_listdir, cleanup_resources)
import unittest

# Option flags intended for the doctest suite built by the disabled
# ``additional_tests`` function below.
DOCTEST_FLAGS = (
    doctest.ELLIPSIS |
    doctest.NORMALIZE_WHITESPACE |
    doctest.REPORT_NDIFF)


# NOTE(review): disabled together with the pkg_resources import above.
# If this is re-enabled, add 'additional_tests' back to __all__.
#
# def additional_tests():
#     "Run the doc tests (README.txt and docs/*, if any exist)"
#     doctest_files = [
#         os.path.abspath(resource_filename('bs4', 'README.txt'))]
#     if resource_exists('bs4', 'docs'):
#         for name in resource_listdir('bs4', 'docs'):
#             if name.endswith('.txt'):
#                 doctest_files.append(
#                     os.path.abspath(
#                         resource_filename('bs4', 'docs/%s' % name)))
#     kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS)
#     atexit.register(cleanup_resources)
#     return unittest.TestSuite((
#         doctest.DocFileSuite(*doctest_files, **kwargs)))

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/build/lib/bs4/tests/test_html5lib.py

"""Tests to ensure that the html5lib tree builder generates good trees."""

import warnings

# Record whether the html5lib-backed builder can be imported; the smoke
# test class below is skipped when it cannot.
try:
    from bs4.builder import HTML5TreeBuilder
except ImportError:
    HTML5LIB_PRESENT = False
else:
    HTML5LIB_PRESENT = True

from bs4.element import SoupStrainer
from bs4.testing import (
    HTML5TreeBuilderSmokeTest,
    SoupTest,
    skipIf,
)


@skipIf(
    not HTML5LIB_PRESENT,
    "html5lib seems not to be present, not testing its tree builder.")
class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
    """See ``HTML5TreeBuilderSmokeTest``."""

    @property
    def default_builder(self):
        # A fresh builder instance on every access.
        return HTML5TreeBuilder()

    def test_soupstrainer(self):
        # Passing parse_only to this builder is expected to have no
        # filtering effect: the whole document comes back and a warning
        # naming the limitation is recorded.
        markup = "<p>A <b>bold</b> statement.</p>"
        with warnings.catch_warnings(record=True) as recorded:
            soup = self.soup(markup, parse_only=SoupStrainer("b"))

        self.assertEqual(soup.decode(), self.document_for(markup))

        warning_text = str(recorded[0].message)
        self.assertIn(
            "the html5lib tree builder doesn't support parse_only",
            warning_text)

    def test_correctly_nested_tables(self):
        """html5lib inserts <tbody> tags where other parsers don't."""
        nested_markup = (
            '<table id="1">'
            '<tr>'
            "<td>Here's another table:"
            '<table id="2">'
            '<tr><td>foo</td></tr>'
            '</table></td>')
        nested_expected = (
            '<table id="1"><tbody><tr><td>Here\'s another table:'
            '<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
            '</td></tr></tbody></table>')
        self.assertSoupEquals(nested_markup, nested_expected)

        # A table that already spells out thead/tbody/tfoot is passed
        # with no second argument.
        explicit_sections = (
            "<table><thead><tr><td>Foo</td></tr></thead>"
            "<tbody><tr><td>Bar</td></tr></tbody>"
            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")
        self.assertSoupEquals(explicit_sections)

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/build/lib/bs4/tests/test_htmlparser.py

"""Tests to ensure that the html.parser tree builder generates good trees."""

from bs4.builder import HTMLParserTreeBuilder
from bs4.testing import (
    HTMLTreeBuilderSmokeTest,
    SoupTest,
)


class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
    # Runs the shared HTML smoke tests against HTMLParserTreeBuilder,
    # overriding the two doctype tests below with no-ops.

    @property
    def default_builder(self):
        # A fresh builder instance on every access.
        return HTMLParserTreeBuilder()

    def test_namespaced_system_doctype(self):
        # html.parser can't handle namespaced doctypes, so this inherited
        # test is replaced with a no-op.
        return None

    def test_namespaced_public_doctype(self):
        # html.parser can't handle namespaced doctypes, so this inherited
        # test is replaced with a no-op.
        return None

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/build/lib/bs4/tests/test_lxml.py

"""Tests to ensure that the lxml tree builder generates good trees."""

# NOTE(review): several markup literals in this module look truncated
# (empty strings where a tag is then inspected, text wrapped in blank
# lines) -- presumably tags were stripped when this text was extracted.
# The values are reproduced as found; confirm against upstream before
# relying on these tests.

import re
import warnings

# Record whether the lxml-backed builders can be imported; both smoke
# test classes below are skipped when they cannot.
try:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
    LXML_PRESENT = True
except ImportError:
    LXML_PRESENT = False

from bs4 import (
    BeautifulSoup,
    BeautifulStoneSoup,
    )
from bs4.element import Comment, Doctype, SoupStrainer
from bs4.tests import test_htmlparser
from bs4.testing import (
    HTMLTreeBuilderSmokeTest,
    XMLTreeBuilderSmokeTest,
    SoupTest,
    skipIf,
)


@skipIf(
    not LXML_PRESENT,
    "lxml seems not to be present, not testing its tree builder.")
class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
    """See ``HTMLTreeBuilderSmokeTest``."""

    @property
    def default_builder(self):
        # A fresh builder instance on every access.
        return LXMLTreeBuilder()

    def test_out_of_range_entity(self):
        # Each input carries a numeric character reference between
        # "foo" and "bar"; the expected output has nothing between them.
        self.assertSoupEquals(
            "\n\nfoo&#10000000000000;bar\n\n", "\n\nfoobar\n\n")
        self.assertSoupEquals(
            "\n\nfoo&#x10000000000000;bar\n\n", "\n\nfoobar\n\n")
        # NOTE(review): the characters between "foo" and "bar" below look
        # like an already-decoded entity -- confirm the intended input.
        self.assertSoupEquals(
            "\n\nfoo빲�bar\n\n", "\n\nfoobar\n\n")

    def test_beautifulstonesoup_is_xml_parser(self):
        # Make sure that the deprecated BSS class uses an xml builder
        # if one is installed.
        #
        # record=False yields no warning list, so nothing is bound here;
        # the context manager only saves and restores the warning state.
        with warnings.catch_warnings(record=False):
            soup = BeautifulStoneSoup("")
            self.assertEqual("", str(soup.b))

    def test_real_xhtml_document(self):
        """lxml strips the XML definition from an XHTML doc, which is fine."""
        markup = b""" Goodbye."""
        soup = self.soup(markup)
        # Compare with newlines removed from both sides and the given
        # byte sequence removed from the input.
        self.assertEqual(
            soup.encode("utf-8").replace(b"\n", b''),
            markup.replace(b'\n', b'').replace(
                b'', b''))


@skipIf(
    not LXML_PRESENT,
    "lxml seems not to be present, not testing its XML tree builder.")
class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
    """See ``XMLTreeBuilderSmokeTest``."""

    @property
    def default_builder(self):
        # A fresh builder instance on every access.
        return LXMLTreeBuilderForXML()

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/build/lib/bs4/tests/test_soup.py

# -*- coding: utf-8 -*-
"""Tests of Beautiful Soup as a whole."""

# NOTE(review): several markup literals in this module look truncated
# (e.g. ``.foo.string`` is read from data that contains no <foo> tag, and
# some "substituted" expectations equal their inputs) -- presumably tags
# and entities were stripped or decoded when this text was extracted.
# The values are reproduced as found; confirm against upstream before
# relying on these tests.

import unittest
from bs4 import (
    BeautifulSoup,
    BeautifulStoneSoup,
)
from bs4.element import (
    CharsetMetaAttributeValue,
    ContentMetaAttributeValue,
    SoupStrainer,
    NamespacedAttribute,
    )
import bs4.dammit
from bs4.dammit import EntitySubstitution, UnicodeDammit
from bs4.testing import (
    SoupTest,
    skipIf,
)
import warnings

# Record whether the lxml-backed builders can be imported; the
# BeautifulStoneSoup test below is skipped when they cannot.
try:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
    LXML_PRESENT = True
except ImportError:
    LXML_PRESENT = False


class TestDeprecatedConstructorArguments(SoupTest):
    # Old-style keyword arguments should still work but emit a warning
    # naming both the old and the new spelling.

    def test_parseOnlyThese_renamed_to_parse_only(self):
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("", parseOnlyThese=SoupStrainer("b"))
        msg = str(w[0].message)
        self.assertTrue("parseOnlyThese" in msg)
        self.assertTrue("parse_only" in msg)
        self.assertEqual(b"", soup.encode())

    def test_fromEncoding_renamed_to_from_encoding(self):
        with warnings.catch_warnings(record=True) as w:
            utf8 = b"\xc3\xa9"
            soup = self.soup(utf8, fromEncoding="utf8")
        msg = str(w[0].message)
        self.assertTrue("fromEncoding" in msg)
        self.assertTrue("from_encoding" in msg)
        self.assertEqual("utf8", soup.original_encoding)

    def test_unrecognized_keyword_argument(self):
        self.assertRaises(
            TypeError, self.soup, "", no_such_argument=True)

    @skipIf(
        not LXML_PRESENT,
        "lxml not present, not testing BeautifulStoneSoup.")
    def test_beautifulstonesoup(self):
        with warnings.catch_warnings(record=True) as w:
            # Record the warning even if it already fired elsewhere in
            # this process; otherwise w[0] below may not exist.
            warnings.simplefilter("always")
            soup = BeautifulStoneSoup("")
        self.assertTrue(isinstance(soup, BeautifulSoup))
        # The previous form passed the bare string to assertTrue, which
        # is always true; check the recorded warning text instead.
        self.assertTrue(
            "BeautifulStoneSoup class is deprecated" in str(w[0].message))


class TestSelectiveParsing(SoupTest):

    def test_parse_with_soupstrainer(self):
        markup = "NoYes NoYes Yes"
        strainer = SoupStrainer("b")
        soup = self.soup(markup, parse_only=strainer)
        self.assertEqual(soup.encode(), b"YesYes Yes")


class TestEntitySubstitution(unittest.TestCase):
    """Standalone tests of the EntitySubstitution class."""

    def setUp(self):
        # The class itself is used, not an instance.
        self.sub = EntitySubstitution

    def test_simple_html_substitution(self):
        # Unicode characters corresponding to named HTML entites
        # are substituted, and no others.
        s = "foo\u2200\N{SNOWMAN}\u00f5bar"
        self.assertEqual(self.sub.substitute_html(s),
                         "foo∀\N{SNOWMAN}õbar")

    def test_smart_quote_substitution(self):
        # MS smart quotes are a common source of frustration, so we
        # give them a special test.
        quotes = b"\x91\x92foo\x93\x94"
        dammit = UnicodeDammit(quotes)
        self.assertEqual(self.sub.substitute_html(dammit.markup),
                         "‘’foo“”")

    def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
        s = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(s, False), s)

    def test_xml_attribute_quoting_normally_uses_double_quotes(self):
        self.assertEqual(self.sub.substitute_xml("Welcome", True),
                         '"Welcome"')
        self.assertEqual(self.sub.substitute_xml("Bob's Bar", True),
                         '"Bob\'s Bar"')

    def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
        s = 'Welcome to "my bar"'
        self.assertEqual(
            self.sub.substitute_xml(s, True),
            "'Welcome to \"my bar\"'")

    def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
        s = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(
            self.sub.substitute_xml(s, True),
            '"Welcome to "Bob\'s Bar""')

    def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
        quoted = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(self.sub.substitute_xml(quoted), quoted)

    def test_xml_quoting_handles_angle_brackets(self):
        self.assertEqual(
            self.sub.substitute_xml("foo"),
            "foo<bar>")

    def test_xml_quoting_handles_ampersands(self):
        self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&T")

    def test_xml_quoting_ignores_ampersands_when_they_are_part_of_an_entity(self):
        self.assertEqual(
            self.sub.substitute_xml("ÁT&T"),
            "ÁT&T")

    def test_quotes_not_html_substituted(self):
        """There's no need to do this except inside attribute values."""
        text = 'Bob\'s "bar"'
        self.assertEqual(self.sub.substitute_html(text), text)


class TestEncodingConversion(SoupTest):
    # Test Beautiful Soup's ability to decode and encode from various
    # encodings.

    def setUp(self):
        super(TestEncodingConversion, self).setUp()
        self.unicode_data = "Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!"
        self.utf8_data = self.unicode_data.encode("utf-8")
        # Just so you know what it looks like.
        self.assertEqual(
            self.utf8_data,
            b"Sacr\xc3\xa9 bleu!")

    def test_ascii_in_unicode_out(self):
        # ASCII input is converted to Unicode. The original_encoding
        # attribute is set.
        ascii_markup = b"a"
        soup_from_ascii = self.soup(ascii_markup)
        unicode_output = soup_from_ascii.decode()
        self.assertTrue(isinstance(unicode_output, str))
        self.assertEqual(unicode_output,
                         self.document_for(ascii_markup.decode()))
        self.assertEqual(soup_from_ascii.original_encoding, "ascii")

    def test_unicode_in_unicode_out(self):
        # Unicode input is left alone. The original_encoding attribute
        # is not set.
        soup_from_unicode = self.soup(self.unicode_data)
        self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
        self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!')
        self.assertEqual(soup_from_unicode.original_encoding, None)

    def test_utf8_in_unicode_out(self):
        # UTF-8 input is converted to Unicode. The original_encoding
        # attribute is set.
        soup_from_utf8 = self.soup(self.utf8_data)
        self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
        self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!')

    def test_utf8_out(self):
        # The internal data structures can be encoded as UTF-8.
        soup_from_unicode = self.soup(self.unicode_data)
        self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data)


class TestUnicodeDammit(unittest.TestCase):
    """Standalone tests of Unicode, Dammit."""

    def test_smart_quotes_to_unicode(self):
        markup = b"\x91\x92\x93\x94"
        dammit = UnicodeDammit(markup)
        self.assertEqual(
            dammit.unicode_markup, "\u2018\u2019\u201c\u201d")

    def test_smart_quotes_to_xml_entities(self):
        markup = b"\x91\x92\x93\x94"
        dammit = UnicodeDammit(markup, smart_quotes_to="xml")
        self.assertEqual(
            dammit.unicode_markup, "‘’“”")

    def test_smart_quotes_to_html_entities(self):
        markup = b"\x91\x92\x93\x94"
        dammit = UnicodeDammit(markup, smart_quotes_to="html")
        self.assertEqual(
            dammit.unicode_markup, "‘’“”")

    def test_smart_quotes_to_ascii(self):
        markup = b"\x91\x92\x93\x94"
        dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
        self.assertEqual(
            dammit.unicode_markup, """''""""")

    def test_detect_utf8(self):
        utf8 = b"\xc3\xa9"
        dammit = UnicodeDammit(utf8)
        self.assertEqual(dammit.unicode_markup, '\xe9')
        self.assertEqual(dammit.original_encoding, 'utf-8')

    def test_convert_hebrew(self):
        hebrew = b"\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding, 'iso-8859-8')
        self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')

    def test_dont_see_smart_quotes_where_there_are_none(self):
        utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
        dammit = UnicodeDammit(utf_8)
        self.assertEqual(dammit.original_encoding, 'utf-8')
        self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)

    def test_ignore_inappropriate_codecs(self):
        utf8_data = "Räksmörgås".encode("utf-8")
        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding, 'utf-8')

    def test_ignore_invalid_codecs(self):
        utf8_data = "Räksmörgås".encode("utf-8")
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            dammit = UnicodeDammit(utf8_data, [bad_encoding])
            self.assertEqual(dammit.original_encoding, 'utf-8')

    def test_detect_html5_style_meta_tag(self):
        for data in (
            b'',
            b"",
            b"",
            b""):
            dammit = UnicodeDammit(data, is_html=True)
            self.assertEqual(
                "euc-jp", dammit.original_encoding)

    def test_last_ditch_entity_replacement(self):
        # This is a UTF-8 document that contains bytestrings
        # completely incompatible with UTF-8 (ie. encoded with some other
        # encoding).
        #
        # Since there is no consistent encoding for the document,
        # Unicode, Dammit will eventually encode the document as UTF-8
        # and encode the incompatible characters as REPLACEMENT
        # CHARACTER.
        #
        # If chardet is installed, it will detect that the document
        # can be converted into ISO-8859-1 without errors. This happens
        # to be the wrong encoding, but it is a consistent encoding, so the
        # code we're testing here won't run.
        #
        # So we temporarily disable chardet if it's present.
        doc = b"""\357\273\277 \330\250\330\252\330\261 \310\322\321\220\312\321\355\344"""
        chardet = bs4.dammit.chardet
        try:
            bs4.dammit.chardet = None
            with warnings.catch_warnings(record=True) as w:
                dammit = UnicodeDammit(doc)
                self.assertEqual(True, dammit.contains_replacement_characters)
                self.assertTrue("\ufffd" in dammit.unicode_markup)

                soup = BeautifulSoup(doc, "html.parser")
                self.assertTrue(soup.contains_replacement_characters)

                msg = w[0].message
                self.assertTrue(isinstance(msg, UnicodeWarning))
                self.assertTrue("Some characters could not be decoded" in str(msg))
        finally:
            # Always restore the module attribute, even on failure.
            bs4.dammit.chardet = chardet

    def test_sniffed_xml_encoding(self):
        # A document written in UTF-16LE will be converted by a different
        # code path that sniffs the byte order markers.
        data = b'\xff\xfe\x00\xe1\x00\xe9\x00\x00'
        dammit = UnicodeDammit(data)
        self.assertEqual("áé", dammit.unicode_markup)
        self.assertEqual("utf-16le", dammit.original_encoding)

    def test_detwingle(self):
        # Here's a UTF8 document.
        utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")

        # Here's a Windows-1252 document.
        windows_1252 = (
            "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
            "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")

        # Through some unholy alchemy, they've been stuck together.
        doc = utf8 + windows_1252 + utf8

        # The document can't be turned into UTF-8:
        self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")

        # Unicode, Dammit thinks the whole document is Windows-1252,
        # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"

        # But if we run it through fix_embedded_windows_1252, it's fixed:

        fixed = UnicodeDammit.detwingle(doc)
        self.assertEqual(
            "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))

    def test_detwingle_ignores_multibyte_characters(self):
        # Each of these characters has a UTF-8 representation ending
        # in \x93. \x93 is a smart quote if interpreted as
        # Windows-1252. But our code knows to skip over multibyte
        # UTF-8 characters, so they'll survive the process unscathed.
        for tricky_unicode_char in (
            "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
            "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
            "\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
            ):
            encoded = tricky_unicode_char.encode("utf8")
            self.assertTrue(encoded.endswith(b'\x93'))
            output = UnicodeDammit.detwingle(encoded)
            self.assertEqual(output, encoded)


class TestNamedspacedAttribute(SoupTest):

    def test_name_may_be_none(self):
        a = NamespacedAttribute("xmlns", None)
        self.assertEqual(a, "xmlns")

    def test_attribute_is_equivalent_to_colon_separated_string(self):
        a = NamespacedAttribute("a", "b")
        self.assertEqual("a:b", a)

    def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
        a = NamespacedAttribute("a", "b", "c")
        b = NamespacedAttribute("a", "b", "c")
        self.assertEqual(a, b)

        # The actual namespace is not considered.
        c = NamespacedAttribute("a", "b", None)
        self.assertEqual(a, c)

        # But name and prefix are important.
        d = NamespacedAttribute("a", "z", "c")
        self.assertNotEqual(a, d)

        e = NamespacedAttribute("z", "b", "c")
        self.assertNotEqual(a, e)


class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):

    # This method used to share its name with the one below, so the
    # later definition replaced it and this test never ran.
    def test_charset_meta_attribute_value(self):
        value = CharsetMetaAttributeValue("euc-jp")
        self.assertEqual("euc-jp", value)
        self.assertEqual("euc-jp", value.original_value)
        self.assertEqual("utf8", value.encode("utf8"))

    def test_content_meta_attribute_value(self):
        value = ContentMetaAttributeValue("text/html; charset=euc-jp")
        self.assertEqual("text/html; charset=euc-jp", value)
        self.assertEqual("text/html; charset=euc-jp", value.original_value)
        self.assertEqual("text/html; charset=utf8", value.encode("utf8"))

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/build/lib/bs4/tests/test_tree.py

# -*- coding: utf-8 -*- """Tests for Beautiful Soup's tree traversal methods. The tree traversal methods are the main advantage of using Beautiful Soup over just using a parser. Different parsers will build different Beautiful Soup trees given the same markup, but all Beautiful Soup trees can be traversed with the methods tested here. """ import copy import pickle import re import warnings from bs4 import BeautifulSoup from bs4.builder import ( builder_registry, HTMLParserTreeBuilder, ) from bs4.element import ( CData, Doctype, NavigableString, SoupStrainer, Tag, ) from bs4.testing import ( SoupTest, skipIf, ) XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None) LXML_PRESENT = (builder_registry.lookup("lxml") is not None) class TreeTest(SoupTest): def assertSelects(self, tags, should_match): """Make sure that the given tags have the correct text. This is used in tests that define a bunch of tags, each containing a single string, and then select certain strings by some mechanism. """ self.assertEqual([tag.string for tag in tags], should_match) def assertSelectsIDs(self, tags, should_match): """Make sure that the given tags have the correct IDs. This is used in tests that define a bunch of tags, each containing a single string, and then select certain strings by some mechanism. """ self.assertEqual([tag['id'] for tag in tags], should_match) class TestFind(TreeTest): """Basic tests of the find() method. find() just calls find_all() with limit=1, so it's not tested all that thouroughly here. """ def test_find_tag(self): soup = self.soup("1234") self.assertEqual(soup.find("b").string, "2") def test_unicode_text_find(self): soup = self.soup('

Räksmörgås

') self.assertEqual(soup.find(text='Räksmörgås'), 'Räksmörgås') class TestFindAll(TreeTest): """Basic tests of the find_all() method.""" def test_find_all_text_nodes(self): """You can search the tree for text nodes.""" soup = self.soup("Foobar\xbb") # Exact match. self.assertEqual(soup.find_all(text="bar"), ["bar"]) # Match any of a number of strings. self.assertEqual( soup.find_all(text=["Foo", "bar"]), ["Foo", "bar"]) # Match a regular expression. self.assertEqual(soup.find_all(text=re.compile('.*')), ["Foo", "bar", '\xbb']) # Match anything. self.assertEqual(soup.find_all(text=True), ["Foo", "bar", '\xbb']) def test_find_all_limit(self): """You can limit the number of items returned by find_all.""" soup = self.soup("1 2 3 4 5") self.assertSelects(soup.find_all('a', limit=3), ["1", "2", "3"]) self.assertSelects(soup.find_all('a', limit=1), ["1"]) self.assertSelects( soup.find_all('a', limit=10), ["1", "2", "3", "4", "5"]) # A limit of 0 means no limit. self.assertSelects( soup.find_all('a', limit=0), ["1", "2", "3", "4", "5"]) def test_calling_a_tag_is_calling_findall(self): soup = self.soup("123") self.assertSelects(soup('a', limit=1), ["1"]) self.assertSelects(soup.b(id="foo"), ["3"]) def test_find_all_with_self_referential_data_structure_does_not_cause_infinite_recursion(self): soup = self.soup("") # Create a self-referential list. l = [] l.append(l) # Without special code in _normalize_search_value, this would cause infinite # recursion. self.assertEqual([], soup.find_all(l)) class TestFindAllBasicNamespaces(TreeTest): def test_find_by_namespaced_name(self): soup = self.soup('4') self.assertEqual("4", soup.find("mathml:msqrt").string) self.assertEqual("a", soup.find(attrs= { "svg:fill" : "red" }).name) class TestFindAllByName(TreeTest): """Test ways of finding tags by tag name.""" def setUp(self): super(TreeTest, self).setUp() self.tree = self.soup(""" First tag. Second tag. Third Nested tag. tag.""") def test_find_all_by_tag_name(self): # Find all the tags. 
self.assertSelects( self.tree.find_all('a'), ['First tag.', 'Nested tag.']) def test_find_all_by_name_and_text(self): self.assertSelects( self.tree.find_all('a', text='First tag.'), ['First tag.']) self.assertSelects( self.tree.find_all('a', text=True), ['First tag.', 'Nested tag.']) self.assertSelects( self.tree.find_all('a', text=re.compile("tag")), ['First tag.', 'Nested tag.']) def test_find_all_on_non_root_element(self): # You can call find_all on any node, not just the root. self.assertSelects(self.tree.c.find_all('a'), ['Nested tag.']) def test_calling_element_invokes_find_all(self): self.assertSelects(self.tree('a'), ['First tag.', 'Nested tag.']) def test_find_all_by_tag_strainer(self): self.assertSelects( self.tree.find_all(SoupStrainer('a')), ['First tag.', 'Nested tag.']) def test_find_all_by_tag_names(self): self.assertSelects( self.tree.find_all(['a', 'b']), ['First tag.', 'Second tag.', 'Nested tag.']) def test_find_all_by_tag_dict(self): self.assertSelects( self.tree.find_all({'a' : True, 'b' : True}), ['First tag.', 'Second tag.', 'Nested tag.']) def test_find_all_by_tag_re(self): self.assertSelects( self.tree.find_all(re.compile('^[ab]$')), ['First tag.', 'Second tag.', 'Nested tag.']) def test_find_all_with_tags_matching_method(self): # You can define an oracle method that determines whether # a tag matches the search. def id_matches_name(tag): return tag.name == tag.get('id') tree = self.soup(""" Match 1. Does not match. Match 2.""") self.assertSelects( tree.find_all(id_matches_name), ["Match 1.", "Match 2."]) class TestFindAllByAttribute(TreeTest): def test_find_all_by_attribute_name(self): # You can pass in keyword arguments to find_all to search by # attribute. tree = self.soup(""" Matching a. Non-matching Matching b.a. 
""") self.assertSelects(tree.find_all(id='first'), ["Matching a.", "Matching b."]) def test_find_all_by_utf8_attribute_value(self): peace = "םולש".encode("utf8") data = ''.encode("utf8") soup = self.soup(data) self.assertEqual([soup.a], soup.find_all(title=peace)) self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8"))) self.assertEqual([soup.a], soup.find_all(title=[peace, "something else"])) def test_find_all_by_attribute_dict(self): # You can pass in a dictionary as the argument 'attrs'. This # lets you search for attributes like 'name' (a fixed argument # to find_all) and 'class' (a reserved word in Python.) tree = self.soup(""" Name match. Class match. Non-match. A tag called 'name1'. """) # This doesn't do what you want. self.assertSelects(tree.find_all(name='name1'), ["A tag called 'name1'."]) # This does what you want. self.assertSelects(tree.find_all(attrs={'name' : 'name1'}), ["Name match."]) # Passing class='class2' would cause a syntax error. self.assertSelects(tree.find_all(attrs={'class' : 'class2'}), ["Class match."]) def test_find_all_by_class(self): # Passing in a string to 'attrs' will search the CSS class. tree = self.soup(""" Class 1. Class 2. Class 1. Class 3 and 4. """) self.assertSelects(tree.find_all('a', '1'), ['Class 1.']) self.assertSelects(tree.find_all(attrs='1'), ['Class 1.', 'Class 1.']) self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.']) self.assertSelects(tree.find_all('c', '4'), ['Class 3 and 4.']) def test_find_by_class_when_multiple_classes_present(self): tree = self.soup("Found it") attrs = { 'class' : re.compile("o") } f = tree.find_all("gar", attrs=attrs) self.assertSelects(f, ["Found it"]) f = tree.find_all("gar", re.compile("a")) self.assertSelects(f, ["Found it"]) # Since the class is not the string "foo bar", but the two # strings "foo" and "bar", this will not find anything. 
attrs = { 'class' : re.compile("o b") } f = tree.find_all("gar", attrs=attrs) self.assertSelects(f, []) def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self): soup = self.soup("Found it") self.assertSelects(soup.find_all("a", re.compile("ba")), ["Found it"]) def big_attribute_value(value): return len(value) > 3 self.assertSelects(soup.find_all("a", big_attribute_value), []) def small_attribute_value(value): return len(value) ') a, a2 = soup.find_all("a") self.assertEqual([a, a2], soup.find_all("a", "foo")) self.assertEqual([a], soup.find_all("a", "bar")) # If you specify the attribute as a string that contains a # space, only that specific value will be found. self.assertEqual([a], soup.find_all("a", "foo bar")) self.assertEqual([], soup.find_all("a", "bar foo")) def test_find_all_by_attribute_soupstrainer(self): tree = self.soup(""" Match. Non-match.""") strainer = SoupStrainer(attrs={'id' : 'first'}) self.assertSelects(tree.find_all(strainer), ['Match.']) def test_find_all_with_missing_atribute(self): # You can pass in None as the value of an attribute to find_all. # This will match tags that do not have that attribute set. tree = self.soup("""ID present. No ID present. ID is empty.""") self.assertSelects(tree.find_all('a', id=None), ["No ID present."]) def test_find_all_with_defined_attribute(self): # You can pass in None as the value of an attribute to find_all. # This will match tags that have that attribute set to any value. tree = self.soup("""ID present. No ID present. ID is empty.""") self.assertSelects( tree.find_all(id=True), ["ID present.", "ID is empty."]) def test_find_all_with_numeric_attribute(self): # If you search for a number, it's treated as a string. tree = self.soup("""Unquoted attribute. 
Quoted attribute.""") expected = ["Unquoted attribute.", "Quoted attribute."] self.assertSelects(tree.find_all(id=1), expected) self.assertSelects(tree.find_all(id="1"), expected) def test_find_all_with_list_attribute_values(self): # You can pass a list of attribute values instead of just one, # and you'll get tags that match any of the values. tree = self.soup("""1 2 3 No ID.""") self.assertSelects(tree.find_all(id=["1", "3", "4"]), ["1", "3"]) def test_find_all_with_regular_expression_attribute_value(self): # You can pass a regular expression as an attribute value, and # you'll get tags whose values for that attribute match the # regular expression. tree = self.soup("""One a. Two as. Mixed as and bs. One b. No ID.""") self.assertSelects(tree.find_all(id=re.compile("^a+$")), ["One a.", "Two as."]) def test_find_by_name_and_containing_string(self): soup = self.soup("foobarfoo") a = soup.a self.assertEqual([a], soup.find_all("a", text="foo")) self.assertEqual([], soup.find_all("a", text="bar")) self.assertEqual([], soup.find_all("a", text="bar")) def test_find_by_name_and_containing_string_when_string_is_buried(self): soup = self.soup("foo foo") self.assertEqual(soup.find_all("a"), soup.find_all("a", text="foo")) def test_find_by_attribute_and_containing_string(self): soup = self.soup('foofoo') a = soup.a self.assertEqual([a], soup.find_all(id=2, text="foo")) self.assertEqual([], soup.find_all(id=1, text="bar")) class TestIndex(TreeTest): """Test Tag.index""" def test_index(self): tree = self.soup(""" Identical Not identical Identical Identical with child Also not identical Identical with child """) div = tree.div for i, element in enumerate(div.contents): self.assertEqual(i, div.index(element)) self.assertRaises(ValueError, tree.index, 1) class TestParentOperations(TreeTest): """Test navigation and searching through an element's parents.""" def setUp(self): super(TestParentOperations, self).setUp() self.tree = self.soup('''

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/build/lib/bs4/tests/__init__.py

"The beautifulsoup tests."

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/build/lib/bs4/__init__.py

"""Beautiful Soup
Elixir and Tonic
"The Screen-Scraper's Friend"
http://www.crummy.com/software/BeautifulSoup/

Beautiful Soup uses a pluggable XML or HTML parser to parse a
(possibly invalid) document into a tree representation. Beautiful Soup
provides methods and Pythonic idioms that make it easy to navigate,
search, and modify the parse tree.

Beautiful Soup works with Python 2.6 and up. It works better if lxml
and/or html5lib is installed.

For more than you ever wanted to know about Beautiful Soup, see the
documentation:
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""

# NOTE(review): the e-mail address below looks like it was redacted by
# whatever tool extracted this file -- restore it from the upstream
# release before publishing.
__author__ = "Leonard Richardson ([email protected])"
__version__ = "4.1.0"
__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
__license__ = "MIT"

__all__ = ['BeautifulSoup']

import re
import warnings

from .builder import builder_registry
from .dammit import UnicodeDammit
from .element import (
    CData,
    Comment,
    DEFAULT_OUTPUT_ENCODING,
    Declaration,
    Doctype,
    NavigableString,
    PageElement,
    ProcessingInstruction,
    ResultSet,
    SoupStrainer,
    Tag,
    )

# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
syntax_error = 'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'


class BeautifulSoup(Tag):
    """
    This class defines the basic interface called by the tree builders.

    These methods will be called by the parser:
      reset()
      feed(markup)

    The tree builder may call these methods from its feed() implementation:
      handle_starttag(name, attrs) # See note about return value
      handle_endtag(name)
      handle_data(data) # Appends to the current data node
      endData(containerClass=NavigableString) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """
    ROOT_TAG_NAME = '[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    # Used when determining whether a text node is all whitespace and
    # can be replaced with a single space. A text node that contains
    # fancy Unicode spaces (usually non-breaking) should be left
    # alone.
    STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, }

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, **kwargs):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        :param markup: A string, or an object with a ``read()`` method
            whose return value is used as the markup.
        :param features: A feature name or a list of feature names used
            to look up a tree builder in the builder registry. Ignored
            when ``builder`` is given.
        :param builder: A tree builder instance to use instead of
            looking one up by ``features``.
        :param parse_only: Stored on the soup and consulted by
            endData() and handle_starttag() to decide which top-level
            nodes are kept.
        :param from_encoding: Passed to the builder's prepare_markup().
        :param kwargs: Only the legacy Beautiful Soup 3 argument names
            handled below are accepted; each produces a warning.
        :raises TypeError: If any other keyword argument is passed.
        :raises ValueError: If no tree builder offers the requested
            features.
        """
        if 'convertEntities' in kwargs:
            # Remove the argument just like the other ignored BS3
            # options below. Leaving it in kwargs made the
            # unexpected-keyword check further down raise TypeError
            # right after issuing this warning.
            del kwargs['convertEntities']
            warnings.warn(
                "BS4 does not respect the convertEntities argument to the "
                "BeautifulSoup constructor. Entities are always converted "
                "to Unicode characters.")

        if 'markupMassage' in kwargs:
            del kwargs['markupMassage']
            warnings.warn(
                "BS4 does not respect the markupMassage argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for any necessary markup massage.")

        if 'smartQuotesTo' in kwargs:
            del kwargs['smartQuotesTo']
            warnings.warn(
                "BS4 does not respect the smartQuotesTo argument to the "
                "BeautifulSoup constructor. Smart quotes are always converted "
                "to Unicode characters.")

        if 'selfClosingTags' in kwargs:
            del kwargs['selfClosingTags']
            warnings.warn(
                "BS4 does not respect the selfClosingTags argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for understanding self-closing tags.")

        if 'isHTML' in kwargs:
            del kwargs['isHTML']
            warnings.warn(
                "BS4 does not respect the isHTML argument to the "
                "BeautifulSoup constructor. You can pass in features='html' "
                "or features='xml' to get a builder capable of handling "
                "one or the other.")

        def deprecated_argument(old_name, new_name):
            # Pop a renamed legacy keyword argument, warning about the
            # rename. Returns None when the old name was not supplied.
            if old_name in kwargs:
                warnings.warn(
                    'The "%s" argument to the BeautifulSoup constructor '
                    'has been renamed to "%s."' % (old_name, new_name))
                value = kwargs[old_name]
                del kwargs[old_name]
                return value
            return None

        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        # Anything still left in kwargs is an argument we don't know.
        if kwargs:
            arg = list(kwargs.keys()).pop()
            raise TypeError(
                "__init__() got an unexpected keyword argument '%s'" % arg)

        if builder is None:
            if isinstance(features, str):
                features = [features]
            if features is None or len(features) == 0:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise ValueError(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))
            builder = builder_class()
        self.builder = builder
        self.is_xml = builder.is_xml
        self.builder.soup = self

        self.parse_only = parse_only

        self.reset()

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        (self.markup, self.original_encoding, self.declared_html_encoding,
         self.contains_replacement_characters) = (
            self.builder.prepare_markup(markup, from_encoding))

        try:
            self._feed()
        except StopParsing:
            pass

        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
        self.builder.soup = None

    def _feed(self):
        """Run the prepared markup through the builder, then close any
        text node and tags that are still open."""
        # Convert the document to Unicode.
        self.builder.reset()

        self.builder.feed(self.markup)

        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def reset(self):
        """Re-initialize this object as the root tag and clear the
        parsing state (pending text, current tag and tag stack)."""
        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.pushTag(self)

    def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
        """Create a new tag associated with this soup."""
        return Tag(None, self.builder, name, namespace, nsprefix, attrs)

    def new_string(self, s):
        """Create a new NavigableString associated with this soup."""
        navigable = NavigableString(s)
        navigable.setup()
        return navigable

    def insert_before(self, successor):
        """Always raises ValueError; not supported on the soup object."""
        raise ValueError("BeautifulSoup objects don't support insert_before().")

    def insert_after(self, successor):
        """Always raises ValueError; not supported on the soup object."""
        raise ValueError("BeautifulSoup objects don't support insert_after().")

    def popTag(self):
        """Pop the top of the tag stack and return the new current tag.

        If the stack becomes empty, the current tag is left unchanged.
        """
        self.tagStack.pop()
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Append `tag` to the current tag's contents (if there is a
        current tag) and make it the new current tag."""
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]

    def endData(self, containerClass=NavigableString):
        """End the current data node.

        Joins the text collected by handle_data(), wraps it in
        `containerClass` and adds it to the tree. Does nothing when no
        text is pending.
        """
        if self.currentData:
            currentData = ''.join(self.currentData)
            # Collapse an all-ASCII-whitespace node to a single newline
            # or space, unless an open tag asks to preserve whitespace.
            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
                not set(tag.name for tag in self.tagStack).intersection(
                    self.builder.preserve_whitespace_tags)):
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            self.currentData = []
            # At the top level, drop text the parse_only filter does
            # not ask for.
            if self.parse_only and len(self.tagStack) <= 1 and \
                   (not self.parse_only.text or \
                    not self.parse_only.search(currentData)):
                return
            o = containerClass(currentData)
            self.object_was_parsed(o)

    def object_was_parsed(self, o):
        """Add an object to the parse tree."""
        o.setup(self.currentTag, self.previous_element)
        if self.previous_element:
            self.previous_element.next_element = o
        self.previous_element = o
        self.currentTag.contents.append(o)

    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag.

        Returns the value of the last popTag() call, or None when
        nothing was popped.
        """
        if name == self.ROOT_TAG_NAME:
            return

        numPops = 0
        mostRecentTag = None
        # Search from the top of the stack down; index 0 is the root
        # and is never popped.
        for i in range(len(self.tagStack) - 1, 0, -1):
            if (name == self.tagStack[i].name
                and nsprefix == self.tagStack[i].nsprefix):
                numPops = len(self.tagStack) - i
                break
        if not inclusivePop:
            numPops = numPops - 1

        for i in range(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag

    def handle_starttag(self, name, namespace, nsprefix, attrs):
        """Push a start tag on to the stack.

        If this method returns None, the tag was rejected by the
        SoupStrainer. You should proceed as if the tag had not occurred
        in the document. For instance, if this was a self-closing tag,
        don't call handle_endtag.
        """
        self.endData()

        if (self.parse_only and len(self.tagStack) <= 1
            and (self.parse_only.text
                 or not self.parse_only.search_tag(name, attrs))):
            return None

        tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
                  self.currentTag, self.previous_element)
        if self.previous_element:
            self.previous_element.next_element = tag
        self.previous_element = tag
        self.pushTag(tag)
        return tag

    def handle_endtag(self, name, nsprefix=None):
        """Close any pending text node, then pop the stack up to and
        including the most recent open tag matching name/nsprefix."""
        self.endData()
        self._popToTag(name, nsprefix)

    def handle_data(self, data):
        """Append `data` to the text node currently being collected."""
        self.currentData.append(data)

    def decode(self, pretty_print=False,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Returns a string or Unicode representation of this document.
        To get Unicode, pass None for encoding."""

        if self.is_xml:
            # Print the XML declaration
            encoding_part = ''
            if eventual_encoding is not None:
                encoding_part = ' encoding="%s"' % eventual_encoding
            prefix = '<?xml version="1.0"%s?>\n' % encoding_part
        else:
            prefix = ''
        if not pretty_print:
            indent_level = None
        else:
            indent_level = 0
        return prefix + super(BeautifulSoup, self).decode(
            indent_level, eventual_encoding, formatter)


class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser."""

    def __init__(self, *args, **kwargs):
        """Force features='xml', warn about the deprecation, and
        delegate to BeautifulSoup.__init__."""
        kwargs['features'] = 'xml'
        warnings.warn(
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.')
        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)


class StopParsing(Exception):
    """Caught by BeautifulSoup.__init__ around _feed() to end parsing
    early without reporting an error."""
    pass


#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin)
    print(soup.prettify())

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/COPYING.txt

Beautiful Soup is made available under the MIT license: Copyright (c) 2004-2012 Leonard Richardson Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE, DAMMIT. Beautiful Soup incorporates code from the html5lib library, which is also made available under the MIT license.

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/doc/Makefile

# Makefile for Sphinx documentation
#
# Every target below runs sphinx-build with the matching builder (-b)
# and writes its output to $(BUILDDIR)/<target>.

# You can set these variables from the command line.
SPHINXOPTS    =
SPHINXBUILD   = sphinx-build
PAPER         =
BUILDDIR      = build

# Internal variables.
PAPEROPT_a4     = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source

.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest

help:
	@echo "Please use \`make <target>' where <target> is one of"
	@echo "  html       to make standalone HTML files"
	@echo "  dirhtml    to make HTML files named index.html in directories"
	@echo "  singlehtml to make a single large HTML file"
	@echo "  pickle     to make pickle files"
	@echo "  json       to make JSON files"
	@echo "  htmlhelp   to make HTML files and a HTML help project"
	@echo "  qthelp     to make HTML files and a qthelp project"
	@echo "  devhelp    to make HTML files and a Devhelp project"
	@echo "  epub       to make an epub"
	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
	@echo "  text       to make text files"
	@echo "  man        to make manual pages"
	@echo "  changes    to make an overview of all changed/added/deprecated items"
	@echo "  linkcheck  to check all external links for integrity"
	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"

clean:
	-rm -rf $(BUILDDIR)/*

html:
	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."

dirhtml:
	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."

singlehtml:
	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
	@echo
	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."

pickle:
	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
	@echo
	@echo "Build finished; now you can process the pickle files."

json:
	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
	@echo
	@echo "Build finished; now you can process the JSON files."

htmlhelp:
	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
	@echo
	@echo "Build finished; now you can run HTML Help Workshop with the" \
	      ".hhp project file in $(BUILDDIR)/htmlhelp."

qthelp:
	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
	@echo
	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/BeautifulSoup.qhcp"
	@echo "To view the help file:"
	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/BeautifulSoup.qhc"

devhelp:
	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
	@echo
	@echo "Build finished."
	@echo "To view the help file:"
	@echo "# mkdir -p $$HOME/.local/share/devhelp/BeautifulSoup"
	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/BeautifulSoup"
	@echo "# devhelp"

epub:
	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
	@echo
	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."

latex:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo
	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
	@echo "Run \`make' in that directory to run these through (pdf)latex" \
	      "(use \`make latexpdf' here to do that automatically)."

# Use $(MAKE) rather than a literal `make` for the sub-make, so the same
# make program is used and command-line flags (-j, -n, -k, ...) are
# passed down to the LaTeX build.
latexpdf:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through pdflatex..."
	$(MAKE) -C $(BUILDDIR)/latex all-pdf
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."

text:
	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
	@echo
	@echo "Build finished. The text files are in $(BUILDDIR)/text."

man:
	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
	@echo
	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."

changes:
	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
	@echo
	@echo "The overview file is in $(BUILDDIR)/changes."

linkcheck:
	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
	@echo
	@echo "Link check complete; look for any errors in the above output " \
	      "or in $(BUILDDIR)/linkcheck/output.txt."

doctest:
	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
	@echo "Testing of doctests in the sources finished, look at the " \
	      "results in $(BUILDDIR)/doctest/output.txt."

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/doc/source/6.1.jpg

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/doc/source/conf.py

# -*- coding: utf-8 -*-
#
# Beautiful Soup documentation build configuration file, created by
# sphinx-quickstart on Thu Jan 26 11:22:55 2012.
#
# This file is execfile()d with the current directory set to its containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.

import os
import sys

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#sys.path.insert(0, os.path.abspath('.'))

# -- General configuration -----------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be extensions
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
extensions = []

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# The suffix of source filenames.
source_suffix = '.rst'

# The encoding of source files.
#source_encoding = 'utf-8-sig'

# The master toctree document.
master_doc = 'index'

# General information about the project.
project = u'Beautiful Soup'
copyright = u'2012, Leonard Richardson'

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = '4'
# The full version, including alpha/beta/rc tags.
# Kept in step with bs4.__version__ for this release; it previously
# still read '4.0.0'.
release = '4.1.0'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#language = None

# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = []

# The reST default role (used for this markup: `text`) to use for all documents.
#default_role = None

# If true, '()' will be appended to :func: etc. cross-reference text.
#add_function_parentheses = True

# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#add_module_names = True

# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#show_authors = False

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'

# A list of ignored prefixes for module index sorting.
#modindex_common_prefix = []


# -- Options for HTML output ---------------------------------------------------

# The theme to use for HTML and HTML Help pages.  See the documentation for
# a list of builtin themes.
html_theme = 'default'

# Theme options are theme-specific and customize the look and feel of a theme
# further.  For a list of options available for each theme, see the
# documentation.
#html_theme_options = {}

# Add any paths that contain custom themes here, relative to this directory.
#html_theme_path = []

# The name for this set of Sphinx documents.  If None, it defaults to
# "<project> v<release> documentation".
#html_title = None

# A shorter title for the navigation bar.  Default is the same as html_title.
#html_short_title = None

# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
#html_logo = None

# The name of an image file (within the static path) to use as favicon of the
# docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
#html_favicon = None

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']

# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
#html_last_updated_fmt = '%b %d, %Y'

# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
#html_use_smartypants = True

# Custom sidebar templates, maps document names to template names.
#html_sidebars = {}

# Additional templates that should be rendered to pages, maps page names to
# template names.
#html_additional_pages = {}

# If false, no module index is generated.
#html_domain_indices = True

# If false, no index is generated.
#html_use_index = True

# If true, the index is split into individual pages for each letter.
#html_split_index = False

# If true, links to the reST sources are added to the pages.
#html_show_sourcelink = True

# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
#html_show_sphinx = True

# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
#html_show_copyright = True

# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it.  The value of this option must be the
# base URL from which the finished HTML is served.
#html_use_opensearch = ''

# This is the file name suffix for HTML files (e.g. ".xhtml").
#html_file_suffix = None

# Output file base name for HTML help builder.
htmlhelp_basename = 'BeautifulSoupdoc'


# -- Options for LaTeX output --------------------------------------------------

# The paper size ('letter' or 'a4').
#latex_paper_size = 'letter'

# The font size ('10pt', '11pt' or '12pt').
#latex_font_size = '10pt'

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title, author, documentclass [howto/manual]).
latex_documents = [
  ('index', 'BeautifulSoup.tex', u'Beautiful Soup Documentation',
   u'Leonard Richardson', 'manual'),
]

# The name of an image file (relative to this directory) to place at the top of
# the title page.
#latex_logo = None

# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
#latex_use_parts = False

# If true, show page references after internal links.
#latex_show_pagerefs = False

# If true, show URL addresses after external links.
#latex_show_urls = False

# Additional stuff for the LaTeX preamble.
#latex_preamble = ''

# Documents to append as an appendix to all manuals.
#latex_appendices = []

# If false, no module index is generated.
#latex_domain_indices = True


# -- Options for manual page output --------------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
    ('index', 'beautifulsoup', u'Beautiful Soup Documentation',
     [u'Leonard Richardson'], 1)
]


# -- Options for Epub output ---------------------------------------------------

# Bibliographic Dublin Core info.
epub_title = u'Beautiful Soup'
epub_author = u'Leonard Richardson'
epub_publisher = u'Leonard Richardson'
epub_copyright = u'2012, Leonard Richardson'

# The language of the text. It defaults to the language option
# or en if the language is not set.
#epub_language = ''

# The scheme of the identifier. Typical schemes are ISBN or URL.
#epub_scheme = ''

# The unique identifier of the text. This can be a ISBN number
# or the project homepage.
#epub_identifier = ''

# A unique identification for the text.
#epub_uid = ''

# HTML files that should be inserted before the pages created by sphinx.
# The format is a list of tuples containing the path and title.
#epub_pre_files = []

# HTML files that should be inserted after the pages created by sphinx.
# The format is a list of tuples containing the path and title.
#epub_post_files = []

# A list of files that should not be packed into the epub file.
#epub_exclude_files = []

# The depth of the table of contents in toc.ncx.
#epub_tocdepth = 3

# Allow duplicate toc entries.
#epub_tocdup = True

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/doc/source/index.rst

Beautiful Soup Documentation
============================

.. image:: 6.1.jpg
   :align: right
   :alt: "The Fish-Footman began by producing from under his arm a great letter, nearly as large as himself."

`Beautiful Soup <http://www.crummy.com/software/BeautifulSoup/>`_ is a
Python library for pulling data out of HTML and XML files. It works
with your favorite parser to provide idiomatic ways of navigating,
searching, and modifying the parse tree. It commonly saves programmers
hours or days of work.

These instructions illustrate all major features of Beautiful Soup 4,
with examples. I show you what the library is good for, how it works,
how to use it, how to make it do what you want, and what to do when it
violates your expectations.

The examples in this documentation should work the same way in Python
2.7 and Python 3.2.

You might be looking for the documentation for `Beautiful Soup 3
<http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_.
If you want to learn about the differences between Beautiful Soup 3
and Beautiful Soup 4, see `Porting code to BS4`_.

Getting help
------------

If you have questions about Beautiful Soup, or run into problems,
`send mail to the discussion group
<https://groups.google.com/forum/?fromgroups#!forum/beautifulsoup>`_.

Quick Start
===========

Here's an HTML document I'll be using as an example throughout this
document. It's part of a story from `Alice in Wonderland`::

 html_doc = """
 <html><head><title>The Dormouse's story</title></head>

 <p class="title"><b>The Dormouse's story</b></p>

 <p class="story">Once upon a time there were three little sisters; and their names were
 <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
 <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
 <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>

 <p class="story">...</p>

 """

Running the "three sisters" document through Beautiful Soup gives us a
``BeautifulSoup`` object, which represents the document as a nested
data structure::

 from bs4 import BeautifulSoup
 soup = BeautifulSoup(html_doc)

 print(soup.prettify())
 # <html>
 #  <head>
 #   <title>
 #    The Dormouse's story
 #   </title>
 #  </head>
 #  <body>
 #   <p class="title">
 #    <b>
 #     The Dormouse's story
 #    </b>
 #   </p>
 #   <p class="story">
 #    Once upon a time there were three little sisters; and their names were
 #    <a class="sister" href="http://example.com/elsie" id="link1">
 #     Elsie
 #    </a>
 #    ,
 #    <a class="sister" href="http://example.com/lacie" id="link2">
 #     Lacie
 #    </a>
 #    and
 #    <a class="sister" href="http://example.com/tillie" id="link3">
 #     Tillie
 #    </a>
 #    ; and they lived at the bottom of a well.
 #   </p>
 #   <p class="story">
 #    ...
 #   </p>
 #  </body>
 # </html>

Here are some simple ways to navigate that data structure::

 soup.title
 # <title>The Dormouse's story</title>

 soup.title.name
 # u'title'

 soup.title.string
 # u'The Dormouse's story'

 soup.title.parent.name
 # u'head'

 soup.p
 # <p class="title"><b>The Dormouse's story</b></p>

 soup.p['class']
 # u'title'

 soup.a
 # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

 soup.find_all('a')
 # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

 soup.find(id="link3")
 # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

One common task is extracting all the URLs found within a page's <a> tags::

 for link in soup.find_all('a'):
     print(link.get('href'))
 # http://example.com/elsie
 # http://example.com/lacie
 # http://example.com/tillie

Another common task is extracting all the text from a page::

 print(soup.get_text())
 # The Dormouse's story
 #
 # The Dormouse's story
 #
 # Once upon a time there were three little sisters; and their names were
 # Elsie,
 # Lacie and
 # Tillie;
 # and they lived at the bottom of a well.
 #
 # ...

Does this look like what you need? If so, read on.

Installing Beautiful Soup
=========================

If you're using a recent version of Debian or Ubuntu Linux, you can
install Beautiful Soup with the system package manager:

:kbd:`$ apt-get install python-beautifulsoup4`

Beautiful Soup 4 is published through PyPi, so if you can't install it
with the system packager, you can install it with ``easy_install`` or
``pip``. The package name is ``beautifulsoup4``, and the same package
works on Python 2 and Python 3.

:kbd:`$ easy_install beautifulsoup4`

:kbd:`$ pip install beautifulsoup4`

(The ``BeautifulSoup`` package is probably `not` what you want. That's
the previous major release, `Beautiful Soup 3`_. Lots of software uses
BS3, so it's still available, but if you're writing new code you
should install ``beautifulsoup4``.)

If you don't have ``easy_install`` or ``pip`` installed, you can
`download the Beautiful Soup 4 source tarball
<http://www.crummy.com/software/BeautifulSoup/download/4.x/>`_ and
install it with ``setup.py``.

:kbd:`$ python setup.py install`

If all else fails, the license for Beautiful Soup allows you to
package the entire library with your application. You can download the
tarball, copy its ``bs4`` directory into your application's codebase,
and use Beautiful Soup without installing it at all.

I use Python 2.7 and Python 3.2 to develop Beautiful Soup, but it
should work with other recent versions.

Problems after installation
---------------------------

Beautiful Soup is packaged as Python 2 code. When you install it for
use with Python 3, it's automatically converted to Python 3 code. If
you don't install the package, the code won't be converted. There have
also been reports on Windows machines of the wrong version being
installed.

If you get the ``ImportError`` "No module named HTMLParser", your
problem is that you're running the Python 2 version of the code under
Python 3.

If you get the ``ImportError`` "No module named html.parser", your
problem is that you're running the Python 3 version of the code under
Python 2.

In both cases, your best bet is to completely remove the Beautiful
Soup installation from your system (including any directory created
when you unzipped the tarball) and try the installation again.

If you get the ``SyntaxError`` "Invalid syntax" on the line
``ROOT_TAG_NAME = u'[document]'``, you need to convert the Python 2
code to Python 3. You can do this either by installing the package:

:kbd:`$ python3 setup.py install`

or by manually running Python's ``2to3`` conversion script on the
``bs4`` directory:

:kbd:`$ 2to3-3.2 -w bs4`

.. _parser-installation:


Installing a parser
-------------------

Beautiful Soup supports the HTML parser included in Python's standard
library, but it also supports a number of third-party Python parsers.
One is the `lxml parser <http://lxml.de/>`_. Depending on your setup,
you might install lxml with one of these commands:

:kbd:`$ apt-get install python-lxml`

:kbd:`$ easy_install lxml`

:kbd:`$ pip install lxml`

If you're using Python 2, another alternative is the pure-Python
`html5lib parser <http://code.google.com/p/html5lib/>`_, which parses
HTML the way a web browser does.
Depending on your setup, you might install html5lib with one of these commands: :kbd:`$ apt-get install python-html5lib` :kbd:`$ easy_install html5lib` :kbd:`$ pip install html5lib` This table summarizes the advantages and disadvantages of each parser library: +----------------------+--------------------------------------------+--------------------------------+--------------------------+ | Parser | Typical usage | Advantages | Disadvantages | +----------------------+--------------------------------------------+--------------------------------+--------------------------+ | Python's html.parser | ``BeautifulSoup(markup, "html.parser")`` | * Batteries included | * Not very lenient | | | | * Decent speed | (before Python 2.7.3 | | | | * Lenient (as of Python 2.7.3 | or 3.2.2) | | | | and 3.2.) | | +----------------------+--------------------------------------------+--------------------------------+--------------------------+ | lxml's HTML parser | ``BeautifulSoup(markup, "lxml")`` | * Very fast | * External C dependency | | | | * Lenient | | +----------------------+--------------------------------------------+--------------------------------+--------------------------+ | lxml's XML parser | ``BeautifulSoup(markup, ["lxml", "xml"])`` | * Very fast | * External C dependency | | | ``BeautifulSoup(markup, "xml")`` | * The only currently supported | | | | | XML parser | | +----------------------+--------------------------------------------+--------------------------------+--------------------------+ | html5lib | ``BeautifulSoup(markup, html5lib)`` | * Extremely lenient | * Very slow | | | | * Parses pages the same way a | * External Python | | | | web browser does | dependency | | | | * Creates valid HTML5 | * Python 2 only | +----------------------+--------------------------------------------+--------------------------------+--------------------------+ If you can, I recommend you install and use lxml for speed. 
If you're using a version of Python 2 earlier than 2.7.3, or a version of Python 3 earlier than 3.2.2, it's `essential` that you install lxml or html5lib--Python's built-in HTML parser is just not very good in older versions. Note that if a document is invalid, different parsers will generate different Beautiful Soup trees for it. See `Differences between parsers`_ for details. Making the soup =============== To parse a document, pass it into the ``BeautifulSoup`` constructor. You can pass in a string or an open filehandle:: from bs4 import BeautifulSoup soup = BeautifulSoup(open("index.html")) soup = BeautifulSoup("data") First, the document is converted to Unicode, and HTML entities are converted to Unicode characters:: BeautifulSoup("Sacré bleu!") Sacré bleu! Beautiful Soup then parses the document using the best available parser. It will use an HTML parser unless you specifically tell it to use an XML parser. (See `Parsing XML`_.) Kinds of objects ================ Beautiful Soup transforms a complex HTML document into a complex tree of Python objects. But you'll only ever have to deal with about four `kinds` of objects. .. _Tag: ``Tag`` ------- A ``Tag`` object corresponds to an XML or HTML tag in the original document:: soup = BeautifulSoup('Extremely bold') tag = soup.b type(tag) # Tags have a lot of attributes and methods, and I'll cover most of them in `Navigating the tree`_ and `Searching the tree`_. For now, the most important features of a tag are its name and attributes. Name ^^^^ Every tag has a name, accessible as ``.name``:: tag.name # u'b' If you change a tag's name, the change will be reflected in any HTML markup generated by Beautiful Soup:: tag.name = "blockquote" tag #
Extremely bold
Attributes ^^^^^^^^^^ A tag may have any number of attributes. The tag ```` has an attribute "class" whose value is "boldest". You can access a tag's attributes by treating the tag like a dictionary:: tag['class'] # u'boldest' You can access that dictionary directly as ``.attrs``:: tag.attrs # {u'class': u'boldest'} You can add, remove, and modify a tag's attributes. Again, this is done by treating the tag as a dictionary:: tag['class'] = 'verybold' tag['id'] = 1 tag #
Extremely bold
del tag['class'] del tag['id'] tag #
Extremely bold
tag['class'] # KeyError: 'class' print(tag.get('class')) # None .. _multivalue: Multi-valued attributes &&&&&&&&&&&&&&&&&&&&&&& HTML 4 defines a few attributes that can have multiple values. HTML 5 removes a couple of them, but defines a few more. The most common multi-valued attribute is ``class`` (that is, a tag can have more than one CSS class). Others include ``rel``, ``rev``, ``accept-charset``, ``headers``, and ``accesskey``. Beautiful Soup presents the value(s) of a multi-valued attribute as a list:: css_soup = BeautifulSoup('

') css_soup.p['class'] # ["body", "strikeout"] css_soup = BeautifulSoup('

') css_soup.p['class'] # ["body"] If an attribute `looks` like it has more than one value, but it's not a multi-valued attribute as defined by any version of the HTML standard, Beautiful Soup will leave the attribute alone:: id_soup = BeautifulSoup('

') id_soup.p['id'] # 'my id' When you turn a tag back into a string, multiple attribute values are consolidated:: rel_soup = BeautifulSoup('

Back to the homepage

') rel_soup.a['rel'] # ['index'] rel_soup.a['rel'] = ['index', 'contents'] print(rel_soup.p) #

Back to the homepage

If you parse a document as XML, there are no multi-valued attributes:: xml_soup = BeautifulSoup('

', 'xml') xml_soup.p['class'] # u'body strikeout' ``NavigableString`` ------------------- A string corresponds to a bit of text within a tag. Beautiful Soup uses the ``NavigableString`` class to contain these bits of text:: tag.string # u'Extremely bold' type(tag.string) # A ``NavigableString`` is just like a Python Unicode string, except that it also supports some of the features described in `Navigating the tree`_ and `Searching the tree`_. You can convert a ``NavigableString`` to a Unicode string with ``unicode()``:: unicode_string = unicode(tag.string) unicode_string # u'Extremely bold' type(unicode_string) # You can't edit a string in place, but you can replace one string with another, using :ref:`replace_with`:: tag.string.replace_with("No longer bold") tag #

No longer bold
``NavigableString`` supports most of the features described in `Navigating the tree`_ and `Searching the tree`_, but not all of them. In particular, since a string can't contain anything (the way a tag may contain a string or another tag), strings don't support the ``.contents`` or ``.string`` attributes, or the ``find()`` method. ``BeautifulSoup`` ----------------- The ``BeautifulSoup`` object itself represents the document as a whole. For most purposes, you can treat it as a :ref:`Tag` object. This means it supports most of the methods described in `Navigating the tree`_ and `Searching the tree`_. Since the ``BeautifulSoup`` object doesn't correspond to an actual HTML or XML tag, it has no name and no attributes. But sometimes it's useful to look at its ``.name``, so it's been given the special ``.name`` "[document]":: soup.name # u'[document]' Comments and other special strings ---------------------------------- ``Tag``, ``NavigableString``, and ``BeautifulSoup`` cover almost everything you'll see in an HTML or XML file, but there are a few leftover bits. The only one you'll probably ever need to worry about is the comment:: markup = "" soup = BeautifulSoup(markup) comment = soup.b.string type(comment) # The ``Comment`` object is just a special type of ``NavigableString``:: comment # u'Hey, buddy. Want to buy a used parser' But when it appears as part of an HTML document, a ``Comment`` is displayed with special formatting:: print(soup.b.prettify()) # # # Beautiful Soup defines classes for anything else that might show up in an XML document: ``CData``, ``ProcessingInstruction``, ``Declaration``, and ``Doctype``. Just like ``Comment``, these classes are subclasses of ``NavigableString`` that add something extra to the string. 
Here's an example that replaces the comment with a CDATA block:: from bs4 import CData cdata = CData("A CDATA block") comment.replace_with(cdata) print(soup.b.prettify()) # # A CDATA block # Navigating the tree =================== Here's the "Three sisters" HTML document again:: html_doc = """

The Dormouse's story

Once upon a time there were three little sisters; and their names were Elsie, Lacie and Tillie; and they lived at the bottom of a well.

...

""" from bs4 import BeautifulSoup soup = BeautifulSoup(html_doc) I'll use this as an example to show you how to move from one part of a document to another. Going down ---------- Tags may contain strings and other tags. These elements are the tag's `children`. Beautiful Soup provides a lot of different attributes for navigating and iterating over a tag's children. Note that Beautiful Soup strings don't support any of these attributes, because a string can't have children. Navigating using tag names ^^^^^^^^^^^^^^^^^^^^^^^^^^ The simplest way to navigate the parse tree is to say the name of the tag you want. If you want the <head> tag, just say ``soup.head``:: soup.head # soup.title # You can use this trick again and again to zoom in on a certain part of the parse tree. This code gets the first <b> tag beneath the <body> tag:: soup.body.b # The Dormouse's story Using a tag name as an attribute will give you only the `first` tag by that name:: soup.a # Elsie If you need to get `all` the <a> tags, or anything more complicated than the first tag with a certain name, you'll need to use one of the methods described in `Searching the tree`_, such as `find_all()`:: soup.find_all('a') # [ Elsie, # Lacie, # Tillie] ``.contents`` and ``.children`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ A tag's children are available in a list called ``.contents``:: head_tag = soup.head head_tag # head_tag.contents [] title_tag = head_tag.contents[0] title_tag # title_tag.contents # [u'The Dormouse's story'] The ``BeautifulSoup`` object itself has children. 
In this case, the tag is the child of the ``BeautifulSoup`` object.:: len(soup.contents) # 1 soup.contents[0].name # u'html' A string does not have ``.contents``, because it can't contain anything:: text = title_tag.contents[0] text.contents # AttributeError: 'NavigableString' object has no attribute 'contents' Instead of getting them as a list, you can iterate over a tag's children using the ``.children`` generator:: for child in title_tag.children: print(child) # The Dormouse's story ``.descendants`` ^^^^^^^^^^^^^^^^ The ``.contents`` and ``.children`` attributes only consider a tag's `direct` children. For instance, the tag has a single direct child--the ] But the tag. The ``.descendants`` attribute lets you iterate over `all` of a tag's children, recursively: its direct children, the children of its direct children, and so on:: for child in head_tag.descendants: print(child) # # The Dormouse's story The tag has only one child, but it has two descendants: the tag), but it has a whole lot of descendants:: len(list(soup.children)) # 1 len(list(soup.descendants)) # 25 .. _.string: ``.string`` ^^^^^^^^^^^ If a tag has only one child, and that child is a ``NavigableString``, the child is made available as ``.string``:: title_tag.string # u'The Dormouse's story' If a tag's only child is another tag, and `that` tag has a ``.string``, then the parent tag is considered to have the same ``.string`` as its child:: head_tag.contents # [] head_tag.string # u'The Dormouse's story' If a tag contains more than one thing, then it's not clear what ``.string`` should refer to, so ``.string`` is defined to be ``None``:: print(soup.html.string) # None .. _string-generators: ``.strings`` and ``stripped_strings`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ If there's more than one thing inside a tag, you can still look at just the strings. 
Use the ``.strings`` generator:: for string in soup.strings: print(repr(string)) # u"The Dormouse's story" # u'\n\n' # u"The Dormouse's story" # u'\n\n' # u'Once upon a time there were three little sisters; and their names were\n' # u'Elsie' # u',\n' # u'Lacie' # u' and\n' # u'Tillie' # u';\nand they lived at the bottom of a well.' # u'\n\n' # u'...' # u'\n' These strings tend to have a lot of extra whitespace, which you can remove by using the ``.stripped_strings`` generator instead:: for string in soup.stripped_strings: print(repr(string)) # u"The Dormouse's story" # u"The Dormouse's story" # u'Once upon a time there were three little sisters; and their names were' # u'Elsie' # u',' # u'Lacie' # u'and' # u'Tillie' # u';\nand they lived at the bottom of a well.' # u'...' Here, strings consisting entirely of whitespace are ignored, and whitespace at the beginning and end of strings is removed. Going up -------- Continuing the "family tree" analogy, every tag and every string has a `parent`: the tag that contains it. .. _.parent: ``.parent`` ^^^^^^^^^^^ You can access an element's parent with the ``.parent`` attribute. In the example "three sisters" document, the tag is the parent of the title_tag.parent # The title string itself has a parent: the The parent of a top-level tag like is the ``BeautifulSoup`` object itself:: html_tag = soup.html type(html_tag.parent) # And the ``.parent`` of a ``BeautifulSoup`` object is defined as None:: print(soup.parent) # None .. _.parents: ``.parents`` ^^^^^^^^^^^^ You can iterate over all of an element's parents with ``.parents``. 
This example uses ``.parents`` to travel from an tag buried deep within the document, to the very top of the document:: link = soup.a link # Elsie for parent in link.parents: if parent is None: print(parent) else: print(parent.name) # p # body # html # [document] # None Going sideways -------------- Consider a simple document like this:: sibling_soup = BeautifulSoup("text1text2") print(sibling_soup.prettify()) # # # # # text1 # # # text2 # # # # The tag and the tag are at the same level: they're both direct children of the same tag. We call them `siblings`. When a document is pretty-printed, siblings show up at the same indentation level. You can also use this relationship in the code you write. ``.next_sibling`` and ``.previous_sibling`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ You can use ``.next_sibling`` and ``.previous_sibling`` to navigate between page elements that are on the same level of the parse tree:: sibling_soup.b.next_sibling # text2 sibling_soup.c.previous_sibling # text1 The tag has a ``.next_sibling``, but no ``.previous_sibling``, because there's nothing before the tag `on the same level of the tree`. For the same reason, the tag has a ``.previous_sibling`` but no ``.next_sibling``:: print(sibling_soup.b.previous_sibling) # None print(sibling_soup.c.next_sibling) # None The strings "text1" and "text2" are `not` siblings, because they don't have the same parent:: sibling_soup.b.string # u'text1' print(sibling_soup.b.string.next_sibling) # None In real documents, the ``.next_sibling`` or ``.previous_sibling`` of a tag will usually be a string containing whitespace. Going back to the "three sisters" document:: Elsie Lacie Tillie You might think that the ``.next_sibling`` of the first tag would be the second tag. 
But actually, it's a string: the comma and newline that separate the first tag from the second:: link = soup.a link # Elsie link.next_sibling # u',\n' The second tag is actually the ``.next_sibling`` of the comma:: link.next_sibling.next_sibling # Lacie .. _sibling-generators: ``.next_siblings`` and ``.previous_siblings`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ You can iterate over a tag's siblings with ``.next_siblings`` or ``.previous_siblings``:: for sibling in soup.a.next_siblings: print(repr(sibling)) # u',\n' # Lacie # u' and\n' # Tillie # u'; and they lived at the bottom of a well.' # None for sibling in soup.find(id="link3").previous_siblings: print(repr(sibling)) # ' and\n' # Lacie # u',\n' # Elsie # u'Once upon a time there were three little sisters; and their names were\n' # None Going back and forth -------------------- Take a look at the beginning of the "three sisters" document::

The Dormouse's story

An HTML parser takes this string of characters and turns it into a series of events: "open an tag", "open a tag", "open a

tag", and so on. Beautiful Soup offers tools for reconstructing the initial parse of the document. .. _element-generators: ``.next_element`` and ``.previous_element`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The ``.next_element`` attribute of a string or tag points to whatever was parsed immediately afterwards. It might be the same as ``.next_sibling``, but it's usually drastically different. Here's the final tag in the "three sisters" document. Its ``.next_sibling`` is a string: the conclusion of the sentence that was interrupted by the start of the tag.:: last_a_tag = soup.find("a", id="link3") last_a_tag # Tillie last_a_tag.next_sibling # '; and they lived at the bottom of a well.' But the ``.next_element`` of that tag, the thing that was parsed immediately after the tag, is `not` the rest of that sentence: it's the word "Tillie":: last_a_tag.next_element # u'Tillie' That's because in the original markup, the word "Tillie" appeared before that semicolon. The parser encountered an tag, then the word "Tillie", then the closing tag, then the semicolon and rest of the sentence. The semicolon is on the same level as the tag, but the word "Tillie" was encountered first. The ``.previous_element`` attribute is the exact opposite of ``.next_element``. It points to whatever element was parsed immediately before this one:: last_a_tag.previous_element # u' and\n' last_a_tag.previous_element.next_element # Tillie ``.next_elements`` and ``.previous_elements`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ You should get the idea by now. You can use these iterators to move forward or backward in the document as it was parsed:: for element in last_a_tag.next_elements: print(repr(element)) # u'Tillie' # u';\nand they lived at the bottom of a well.' # u'\n\n' #

...

# u'...' # u'\n' # None Searching the tree ================== Beautiful Soup defines a lot of methods for searching the parse tree, but they're all very similar. I'm going to spend a lot of time explaining the two most popular methods: ``find()`` and ``find_all()``. The other methods take almost exactly the same arguments, so I'll just cover them briefly. Once again, I'll be using the "three sisters" document as an example:: html_doc = """

The Dormouse's story

Once upon a time there were three little sisters; and their names were Elsie, Lacie and Tillie; and they lived at the bottom of a well.

...

""" from bs4 import BeautifulSoup soup = BeautifulSoup(html_doc) By passing in a filter to a method like ``find_all()``, you can isolate whatever parts of the document you're interested in. Kinds of filters ---------------- Before talking in detail about ``find_all()`` and similar methods, I want to show examples of different filters you can pass into these methods. These filters show up again and again, throughout the search API. You can use them to filter based on a tag's name, on its attributes, on the text of a string, or on some combination of these. .. _a string: A string ^^^^^^^^ The simplest filter is a string. Pass a string to a search method and Beautiful Soup will perform a match against that exact string. This code finds all the <b> tags in the document:: soup.find_all('b') # [The Dormouse's story] If you pass in a byte string, Beautiful Soup will assume the string is encoded as UTF-8. You can avoid this by passing in a Unicode string instead. .. _a regular expression: A regular expression ^^^^^^^^^^^^^^^^^^^^ If you pass in a regular expression object, Beautiful Soup will filter against that regular expression. This code finds all the tags whose names start with the letter "b"; in this case, the <body> tag and the <b> tag:: import re for tag in soup.find_all(re.compile("b.*")): print(tag.name) # body # b .. _a list: A list ^^^^^^ If you pass in a list, Beautiful Soup will allow a string match against `any` item in that list. This code finds all the <a> tags `and` all the <b> tags:: soup.find_all(["a", "b"]) # [The Dormouse's story, # Elsie, # Lacie, # Tillie] .. _the value True: ``True`` ^^^^^^^^ The value ``True`` matches everything it can. This code finds `all` the tags in the document, but none of the text strings:: for tag in soup.find_all(True): print(tag.name) # html # head # title # body # p # b # p # a # a # a # p .. _a function: A function ^^^^^^^^^^ If none of the other matches work for you, define a function that takes an element as its only argument. 
The function should return ``True`` if the argument matches, and ``False`` otherwise. Here's a function that returns ``True`` if a tag defines the "class" attribute but doesn't define the "id" attribute:: def has_class_but_no_id(tag): return tag.has_key('class') and not tag.has_key('id') Pass this function into ``find_all()`` and you'll pick up all the

tags:: soup.find_all(has_class_but_no_id) # [

The Dormouse's story

, #

Once upon a time there were...

, #

...

] This function only picks up the

tags. It doesn't pick up the tags, because those tags define both "class" and "id". It doesn't pick up tags like

and ] soup.find_all("p", "title") # [

The Dormouse's story

] soup.find_all("a") # [Elsie, # Lacie, # Tillie] soup.find_all(id="link2") # [Lacie] import re soup.find(text=re.compile("sisters")) # u'Once upon a time there were three little sisters; and their names were\n' Some of these should look familiar, but others are new. What does it mean to pass in a value for ``text``, or ``id``? Why does ``find_all("p", "title")`` find a

tag with the CSS class "title"? Let's look at the arguments to ``find_all()``. .. _name: The ``name`` argument ^^^^^^^^^^^^^^^^^^^^^ Pass in a value for ``name`` and you'll tell Beautiful Soup to only consider tags with certain names. Text strings will be ignored, as will tags whose names don't match. This is the simplest usage:: soup.find_all("title") # [

] Recall from `Kinds of filters`_ that the value to ``name`` can be `a string`_, `a regular expression`_, `a list`_, `a function`_, or `the value True`_. .. _kwargs: The keyword arguments ^^^^^^^^^^^^^^^^^^^^^ Any argument that's not recognized will be turned into a filter on one of a tag's attributes. If you pass in a value for an argument called ``id``, Beautiful Soup will filter against each tag's 'id' attribute:: soup.find_all(id='link2') # [Lacie] If you pass in a value for ``href``, Beautiful Soup will filter against each tag's 'href' attribute:: soup.find_all(href=re.compile("elsie")) # [Elsie] You can filter an attribute based on `a string`_, `a regular expression`_, `a list`_, `a function`_, or `the value True`_. This code finds all tags that have an ``id`` attribute, regardless of what the value is:: soup.find_all(id=True) # [Elsie, # Lacie, # Tillie] You can filter multiple attributes at once by passing in more than one keyword argument:: soup.find_all(href=re.compile("elsie"), id='link1') # [three] .. _attrs: Searching by CSS class ^^^^^^^^^^^^^^^^^^^^^^ Instead of using keyword arguments, you can filter tags based on their attributes by passing a dictionary in for ``attrs``. These two lines of code are equivalent:: soup.find_all(href=re.compile("elsie"), id='link1') soup.find_all(attrs={'href' : re.compile("elsie"), 'id': 'link1'}) The ``attrs`` argument would be a pretty obscure feature were it not for one thing: CSS. It's very useful to search for a tag that has a certain CSS class, but the name of the CSS attribute, "class", is also a Python reserved word. You can use ``attrs`` to search by CSS class:: soup.find_all("a", { "class" : "sister" }) # [Elsie, # Lacie, # Tillie] But that's a lot of code for such a common operation. Instead, you can pass a string `attrs` instead of a dictionary. 
The string will be used to restrict the CSS class:: soup.find_all("a", "sister") # [Elsie, # Lacie, # Tillie] You can also pass in a regular expression, a function or True. Anything you pass in for ``attrs`` that's not a dictionary will be used to search against the CSS class:: soup.find_all(attrs=re.compile("itl")) # [

The Dormouse's story

] def has_six_characters(css_class): return css_class is not None and len(css_class) == 6 soup.find_all(attrs=has_six_characters) # [Elsie, # Lacie, # Tillie] :ref:`Remember ` that a single tag can have multiple values for its "class" attribute. When you search for a tag that matches a certain CSS class, you're matching against `any` of its CSS classes:: css_soup = BeautifulSoup('

') css_soup.find_all("p", "strikeout") # [

] css_soup.find_all("p", "body") # [

] Searching for the string value of the ``class`` attribute won't work:: css_soup.find_all("p", "body strikeout") # [] .. _text: The ``text`` argument ^^^^^^^^^^^^^^^^^^^^^ With ``text`` you can search for strings instead of tags. As with ``name`` and the keyword arguments, you can pass in `a string`_, `a regular expression`_, `a list`_, `a function`_, or `the value True`_. Here are some examples:: soup.find_all(text="Elsie") # [u'Elsie'] soup.find_all(text=["Tillie", "Elsie", "Lacie"]) # [u'Elsie', u'Lacie', u'Tillie'] soup.find_all(text=re.compile("Dormouse")) # [u"The Dormouse's story", u"The Dormouse's story"] def is_the_only_string_within_a_tag(s): """Return True if this string is the only child of its parent tag.""" return (s == s.parent.string) soup.find_all(text=is_the_only_string_within_a_tag) # [u"The Dormouse's story", u"The Dormouse's story", u'Elsie', u'Lacie', u'Tillie', u'...'] Although ``text`` is for finding strings, you can combine it with arguments for finding tags: Beautiful Soup will find all tags whose ``.string`` matches your value for ``text``. This code finds the <a> tags whose ``.string`` is "Elsie":: soup.find_all("a", text="Elsie") # [ Elsie] .. _limit: The ``limit`` argument ^^^^^^^^^^^^^^^^^^^^^^ ``find_all()`` returns all the tags and strings that match your filters. This can take a while if the document is large. If you don't need `all` the results, you can pass in a number for ``limit``. This works just like the LIMIT keyword in SQL. It tells Beautiful Soup to stop gathering results after it's found a certain number. There are three links in the "three sisters" document, but this code only finds the first two:: soup.find_all("a", limit=2) # [Elsie, # Lacie] .. _recursive: The ``recursive`` argument ^^^^^^^^^^^^^^^^^^^^^^^^^^ If you call ``mytag.find_all()``, Beautiful Soup will examine all the descendants of ``mytag``: its children, its children's children, and so on. 
If you only want Beautiful Soup to consider direct children, you can pass in ``recursive=False``. See the difference here:: soup.html.find_all("title") # [] soup.html.find_all("title", recursive=False) # [] Here's that part of the document:: ... The tag, but it's not `directly` beneath the tag: the tag is in the way. Beautiful Soup finds the tag, but when ``recursive=False`` restricts it to the tag's immediate children, it finds nothing. Beautiful Soup offers a lot of tree-searching methods (covered below), and they mostly take the same arguments as ``find_all()``: ``name``, ``attrs``, ``text``, ``limit``, and the keyword arguments. But the ``recursive`` argument is different: ``find_all()`` and ``find()`` are the only methods that support it. Passing ``recursive=False`` into a method like ``find_parents()`` wouldn't be very useful. Calling a tag is like calling ``find_all()`` -------------------------------------------- Because ``find_all()`` is the most popular method in the Beautiful Soup search API, you can use a shortcut for it. If you treat the ``BeautifulSoup`` object or a ``Tag`` object as though it were a function, then it's the same as calling ``find_all()`` on that object. These two lines of code are equivalent:: soup.find_all("a") soup("a") These two lines are also equivalent:: soup.title.find_all(text=True) soup.title(text=True) ``find()`` ---------- Signature: find(:ref:`name `, :ref:`attrs `, :ref:`recursive `, :ref:`text `, :ref:`**kwargs `) The ``find_all()`` method scans the entire document looking for results, but sometimes you only want to find one result. If you know a document only has one tag, it's a waste of time to scan the entire document looking for more. Rather than passing in ``limit=1`` every time you call ``find_all``, you can use the ``find()`` method. 
These two lines of code are `nearly` equivalent:: soup.find_all('title', limit=1) # [] soup.find('title') # The only difference is that ``find_all()`` returns a list containing the single result, and ``find()`` just returns the result. If ``find_all()`` can't find anything, it returns an empty list. If ``find()`` can't find anything, it returns ``None``:: print(soup.find("nosuchtag")) # None Remember the ``soup.head.title`` trick from `Navigating using tag names`_? That trick works by repeatedly calling ``find()``:: soup.head.title # soup.find("head").find("title") # ``find_parents()`` and ``find_parent()`` ---------------------------------------- Signature: find_parents(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`limit `, :ref:`**kwargs `) Signature: find_parent(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`**kwargs `) I spent a lot of time above covering ``find_all()`` and ``find()``. The Beautiful Soup API defines ten other methods for searching the tree, but don't be afraid. Five of these methods are basically the same as ``find_all()``, and the other five are basically the same as ``find()``. The only differences are in what parts of the tree they search. First let's consider ``find_parents()`` and ``find_parent()``. Remember that ``find_all()`` and ``find()`` work their way down the tree, looking at a tag's descendants. These methods do the opposite: they work their way `up` the tree, looking at a tag's (or a string's) parents. Let's try them out, starting from a string buried deep in the "three sisters" document:: a_string = soup.find(text="Lacie") a_string # u'Lacie' a_string.find_parents("a") # [Lacie] a_string.find_parent("p") #

Once upon a time there were three little sisters; and their names were # Elsie, # Lacie and # Tillie; # and they lived at the bottom of a well.

a_string.find_parents("p", "title") # [] One of the three <a> tags is the direct parent of the string in question, so our search finds it. One of the three

tags is an indirect parent of the string, and our search finds that as well. There's a

tag with the CSS class "title" `somewhere` in the document, but it's not one of this string's parents, so we can't find it with ``find_parents()``. You may have made the connection between ``find_parent()`` and ``find_parents()``, and the `.parent`_ and `.parents`_ attributes mentioned earlier. The connection is very strong. These search methods actually use ``.parents`` to iterate over all the parents, and check each one against the provided filter to see if it matches. ``find_next_siblings()`` and ``find_next_sibling()`` ---------------------------------------------------- Signature: find_next_siblings(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`limit `, :ref:`**kwargs `) Signature: find_next_sibling(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`**kwargs `) These methods use :ref:`.next_siblings ` to iterate over the rest of an element's siblings in the tree. The ``find_next_siblings()`` method returns all the siblings that match, and ``find_next_sibling()`` only returns the first one:: first_link = soup.a first_link # Elsie first_link.find_next_siblings("a") # [Lacie, # Tillie] first_story_paragraph = soup.find("p", "story") first_story_paragraph.find_next_sibling("p") #

...

``find_previous_siblings()`` and ``find_previous_sibling()`` ------------------------------------------------------------ Signature: find_previous_siblings(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`limit `, :ref:`**kwargs `) Signature: find_previous_sibling(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`**kwargs `) These methods use :ref:`.previous_siblings ` to iterate over an element's siblings that precede it in the tree. The ``find_previous_siblings()`` method returns all the siblings that match, and ``find_previous_sibling()`` only returns the first one:: last_link = soup.find("a", id="link3") last_link # Tillie last_link.find_previous_siblings("a") # [Lacie, # Elsie] first_story_paragraph = soup.find("p", "story") first_story_paragraph.find_previous_sibling("p") #

The Dormouse's story

``find_all_next()`` and ``find_next()`` --------------------------------------- Signature: find_all_next(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`limit `, :ref:`**kwargs `) Signature: find_next(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`**kwargs `) These methods use :ref:`.next_elements ` to iterate over whatever tags and strings that come after it in the document. The ``find_all_next()`` method returns all matches, and ``find_next()`` only returns the first match:: first_link = soup.a first_link # Elsie first_link.find_all_next(text=True) # [u'Elsie', u',\n', u'Lacie', u' and\n', u'Tillie', # u';\nand they lived at the bottom of a well.', u'\n\n', u'...', u'\n'] first_link.find_next("p") #

...

In the first example, the string "Elsie" showed up, even though it was contained within the tag we started from. In the second example, the last

tag in the document showed up, even though it's not in the same part of the tree as the tag we started from. For these methods, all that matters is that an element match the filter, and show up later in the document than the starting element. ``find_all_previous()`` and ``find_previous()`` ----------------------------------------------- Signature: find_all_previous(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`limit `, :ref:`**kwargs `) Signature: find_previous(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`**kwargs `) These methods use :ref:`.previous_elements ` to iterate over the tags and strings that came before it in the document. The ``find_all_previous()`` method returns all matches, and ``find_previous()`` only returns the first match:: first_link = soup.a first_link # Elsie first_link.find_all_previous("p") # [

Once upon a time there were three little sisters; ...

, #

The Dormouse's story

] first_link.find_previous("title") # The call to ``find_all_previous("p")`` found the first paragraph in the document (the one with class="title"), but it also finds the second paragraph, the

tag that contains the tag we started with. This shouldn't be too surprising: we're looking at all the tags that show up earlier in the document than the one we started with. A

tag that contains an tag must have shown up before the tag it contains. CSS selectors ------------- Beautiful Soup supports a subset of the `CSS selector standard `_. Just construct the selector as a string and pass it into the ``.select()`` method of a ``Tag`` or the ``BeautifulSoup`` object itself. You can find tags:: soup.select("title") # [

] Find tags beneath other tags:: soup.select("body a") # [Elsie, # Lacie, # Tillie] soup.select("html head title") # [] Find tags `directly` beneath other tags:: soup.select("head > title") # [] soup.select("p > a") # [Elsie, # Lacie, # Tillie] soup.select("body > a") # [] Find tags by CSS class:: soup.select(".sister") # [Elsie, # Lacie, # Tillie] soup.select("[class~=sister]") # [Elsie, # Lacie, # Tillie] Find tags by ID:: soup.select("#link1") # [Elsie] soup.select("a#link2") # [Lacie] Test for the existence of an attribute:: soup.select('a[href]') # [Elsie, # Lacie, # Tillie] Find tags by attribute value:: soup.select('a[href="http://example.com/elsie"]') # [Elsie] soup.select('a[href^="http://example.com/"]') # [Elsie, # Lacie, # Tillie] soup.select('a[href$="tillie"]') # [Tillie] soup.select('a[href*=".com/el"]') # [Elsie] Match language codes:: multilingual_markup = """

Hello

Howdy, y'all

Pip-pip, old fruit

Bonjour mes amis

""" multilingual_soup = BeautifulSoup(multilingual_markup) multilingual_soup.select('p[lang|=en]') # [

Hello

, #

Howdy, y'all

, #

Pip-pip, old fruit

] This is a convenience for users who know the CSS selector syntax. You can do all this stuff with the Beautiful Soup API. And if CSS selectors are all you need, you might as well use lxml directly, because it's faster. But this lets you `combine` simple CSS selectors with the Beautiful Soup API. Modifying the tree ================== Beautiful Soup's main strength is in searching the parse tree, but you can also modify the tree and write your changes as a new HTML or XML document. Changing tag names and attributes --------------------------------- I covered this earlier, in `Attributes`_, but it bears repeating. You can rename a tag, change the values of its attributes, add new attributes, and delete attributes:: soup = BeautifulSoup('Extremely bold') tag = soup.b tag.name = "blockquote" tag['class'] = 'verybold' tag['id'] = 1 tag #
Extremely bold
del tag['class'] del tag['id'] tag #
Extremely bold
Modifying ``.string`` --------------------- If you set a tag's ``.string`` attribute, the tag's contents are replaced with the string you give:: markup = 'I linked to example.com' soup = BeautifulSoup(markup) tag = soup.a tag.string = "New link text." tag # New link text. Be careful: if the tag contained other tags, they and all their contents will be destroyed. ``append()`` ------------ You can add to a tag's contents with ``Tag.append()``. It works just like calling ``.append()`` on a Python list:: soup = BeautifulSoup("Foo") soup.a.append("Bar") soup # FooBar soup.a.contents # [u'Foo', u'Bar'] ``BeautifulSoup.new_string()`` and ``.new_tag()`` ------------------------------------------------- If you need to add a string to a document, no problem--you can pass a Python string in to ``append()``, or you can call the factory method ``BeautifulSoup.new_string()``:: soup = BeautifulSoup("") tag = soup.b tag.append("Hello") new_string = soup.new_string(" there") tag.append(new_string) tag # Hello there. tag.contents # [u'Hello', u' there'] What if you need to create a whole new tag? The best solution is to call the factory method ``BeautifulSoup.new_tag()``:: soup = BeautifulSoup("") original_tag = soup.b new_tag = soup.new_tag("a", href="http://www.example.com") original_tag.append(new_tag) original_tag # new_tag.string = "Link text." original_tag # Link text. Only the first argument, the tag name, is required. ``insert()`` ------------ ``Tag.insert()`` is just like ``Tag.append()``, except the new element doesn't necessarily go at the end of its parent's ``.contents``. It'll be inserted at whatever numeric position you say. 
It works just like ``.insert()`` on a Python list:: markup = 'I linked to example.com' soup = BeautifulSoup(markup) tag = soup.a tag.insert(1, "but did not endorse ") tag # I linked to but did not endorse example.com tag.contents # [u'I linked to ', u'but did not endorse', example.com] ``insert_before()`` and ``insert_after()`` ------------------------------------------ The ``insert_before()`` method inserts a tag or string immediately before something else in the parse tree:: soup = BeautifulSoup("stop") tag = soup.new_tag("i") tag.string = "Don't" soup.b.string.insert_before(tag) soup.b # Don'tstop The ``insert_after()`` method moves a tag or string so that it immediately follows something else in the parse tree:: soup.b.i.insert_after(soup.new_string(" ever ")) soup.b # Don't ever stop soup.b.contents # [Don't, u' ever ', u'stop'] ``clear()`` ----------- ``Tag.clear()`` removes the contents of a tag:: markup = 'I linked to example.com' soup = BeautifulSoup(markup) tag = soup.a tag.clear() tag # ``extract()`` ------------- ``PageElement.extract()`` removes a tag or string from the tree. It returns the tag or string that was extracted:: markup = 'I linked to example.com' soup = BeautifulSoup(markup) a_tag = soup.a i_tag = soup.i.extract() a_tag # I linked to i_tag # example.com print(i_tag.parent) None At this point you effectively have two parse trees: one rooted at the ``BeautifulSoup`` object you used to parse the document, and one rooted at the tag that was extracted. You can go on to call ``extract`` on a child of the element you extracted:: my_string = i_tag.string.extract() my_string # u'example.com' print(my_string.parent) # None i_tag # ``decompose()`` --------------- ``Tag.decompose()`` removes a tag from the tree, then `completely destroys it and its contents`:: markup = 'I linked to example.com' soup = BeautifulSoup(markup) a_tag = soup.a soup.i.decompose() a_tag # I linked to .. 
_replace_with: ``replace_with()`` ------------------ ``PageElement.replace_with()`` removes a tag or string from the tree, and replaces it with the tag or string of your choice:: markup = 'I linked to example.com' soup = BeautifulSoup(markup) a_tag = soup.a new_tag = soup.new_tag("b") new_tag.string = "example.net" a_tag.i.replace_with(new_tag) a_tag # I linked to example.net ``replace_with()`` returns the tag or string that was replaced, so that you can examine it or add it back to another part of the tree. ``wrap()`` ---------- ``PageElement.wrap()`` wraps an element in the tag you specify. It returns the new wrapper:: soup = BeautifulSoup("

I wish I was bold.

") soup.p.string.wrap(soup.new_tag("b")) # I wish I was bold. soup.p.wrap(soup.new_tag("div")) #

I wish I was bold.

This method is new in Beautiful Soup 4.0.5. ``unwrap()`` --------------------------- ``Tag.unwrap()`` is the opposite of ``wrap()``. It replaces a tag with whatever's inside that tag. It's good for stripping out markup:: markup = 'I linked to example.com' soup = BeautifulSoup(markup) a_tag = soup.a a_tag.i.unwrap() a_tag # I linked to example.com Like ``replace_with()``, ``unwrap()`` returns the tag that was replaced. (In earlier versions of Beautiful Soup, ``unwrap()`` was called ``replace_with_children()``, and that name will still work.) Output ====== Pretty-printing --------------- The ``prettify()`` method will turn a Beautiful Soup parse tree into a nicely formatted bytestring, with each HTML/XML tag on its own line:: markup = 'I linked to example.com' soup = BeautifulSoup(markup) soup.prettify() # '\n \n \n \n \n...' print(soup.prettify()) # # # # # # I linked to # # example.com # # # # You can call ``prettify()`` on the top-level ``BeautifulSoup`` object, or on any of its ``Tag`` objects:: print(soup.a.prettify()) # # I linked to # # example.com # # Non-pretty printing ------------------- If you just want a string, with no fancy formatting, you can call ``unicode()`` or ``str()`` on a ``BeautifulSoup`` object, or a ``Tag`` within it:: str(soup) # 'I linked to example.com' unicode(soup.a) # u'I linked to example.com' The ``str()`` function returns a string encoded in UTF-8. See `Encodings`_ for other options. You can also call ``encode()`` to get a bytestring, and ``decode()`` to get Unicode. .. _output_formatters: Output formatters ----------------- If you give Beautiful Soup a document that contains HTML entities like "&ldquo;", they'll be converted to Unicode characters:: soup = BeautifulSoup("&ldquo;Dammit!&rdquo; he said.") unicode(soup) # u'\u201cDammit!\u201d he said.' If you then convert the document to a string, the Unicode characters will be encoded as UTF-8. You won't get the HTML entities back:: str(soup) # '\xe2\x80\x9cDammit!\xe2\x80\x9d he said.' 
By default, the only characters that are escaped upon output are bare ampersands and angle brackets. These get turned into "&amp;", "&lt;", and "&gt;", so that Beautiful Soup doesn't inadvertently generate invalid HTML or XML:: soup = BeautifulSoup("

The law firm of Dewey, Cheatem, & Howe

") soup.p #

The law firm of Dewey, Cheatem, &amp; Howe

soup = BeautifulSoup('A link') soup.a # A link You can change this behavior by providing a value for the ``formatter`` argument to ``prettify()``, ``encode()``, or ``decode()``. Beautiful Soup recognizes four possible values for ``formatter``. The default is ``formatter="minimal"``. Strings will only be processed enough to ensure that Beautiful Soup generates valid HTML/XML:: french = "

Il a dit &lt;&lt;Sacr&eacute; bleu!&gt;&gt;

" soup = BeautifulSoup(french) print(soup.prettify(formatter="minimal")) # # #

# Il a dit &lt;&lt;Sacré bleu!&gt;&gt; #

# # If you pass in ``formatter="html"``, Beautiful Soup will convert Unicode characters to HTML entities whenever possible:: print(soup.prettify(formatter="html")) # # #

# Il a dit &lt;&lt;Sacr&eacute; bleu!&gt;&gt; #

# # If you pass in ``formatter=None``, Beautiful Soup will not modify strings at all on output. This is the fastest option, but it may lead to Beautiful Soup generating invalid HTML/XML, as in these examples:: print(soup.prettify(formatter=None)) # # #

# Il a dit <<Sacré bleu!>> #

# # link_soup = BeautifulSoup('A link') print(link_soup.a.encode(formatter=None)) # A link Finally, if you pass in a function for ``formatter``, Beautiful Soup will call that function once for every string and attribute value in the document. You can do whatever you want in this function. Here's a formatter that converts strings to uppercase and does absolutely nothing else:: def uppercase(str): return str.upper() print(soup.prettify(formatter=uppercase)) # # #

# IL A DIT <<SACRÉ BLEU!>> #

# # print(link_soup.a.prettify(formatter=uppercase)) # # A LINK # If you're writing your own function, you should know about the ``EntitySubstitution`` class in the ``bs4.dammit`` module. This class implements Beautiful Soup's standard formatters as class methods: the "html" formatter is ``EntitySubstitution.substitute_html``, and the "minimal" formatter is ``EntitySubstitution.substitute_xml``. You can use these functions to simulate ``formatter=html`` or ``formatter=minimal``, but then do something extra. Here's an example that replaces Unicode characters with HTML entities whenever possible, but `also` converts all strings to uppercase:: from bs4.dammit import EntitySubstitution def uppercase_and_substitute_html_entities(str): return EntitySubstitution.substitute_html(str.upper()) print(soup.prettify(formatter=uppercase_and_substitute_html_entities)) # # #

# IL A DIT &lt;&lt;SACR&Eacute; BLEU!&gt;&gt; #

# # One last caveat: if you create a ``CData`` object, the text inside that object is always presented `exactly as it appears, with no formatting`. Beautiful Soup will call the formatter method, just in case you've written a custom method that counts all the strings in the document or something, but it will ignore the return value. from bs4.element import CData soup = BeautifulSoup("") soup.a.string = CData("one < three") print(soup.a.prettify(formatter="xml")) # # one < three # ``get_text()`` -------------- If you only want the text part of a document or tag, you can use the ``get_text()`` method. It returns all the text in a document or beneath a tag, as a single Unicode string:: markup = '\nI linked to example.com\n' soup = BeautifulSoup(markup) soup.get_text() u'\nI linked to example.com\n' soup.i.get_text() u'example.com' You can specify a string to be used to join the bits of text together:: # soup.get_text("|") u'\nI linked to |example.com|\n' You can tell Beautiful Soup to strip whitespace from the beginning and end of each bit of text:: # soup.get_text("|", strip=True) u'I linked to|example.com' But at that point you might want to use the :ref:`.stripped_strings ` generator instead, and process the text yourself:: [text for text in soup.stripped_strings] # [u'I linked to', u'example.com'] Specifying the parser to use ============================ If you just need to parse some HTML, you can dump the markup into the ``BeautifulSoup`` constructor, and it'll probably be fine. Beautiful Soup will pick a parser for you and parse the data. But there are a few additional arguments you can pass in to the constructor to change which parser is used. The first argument to the ``BeautifulSoup`` constructor is a string or an open filehandle--the markup you want parsed. The second argument is `how` you'd like the markup parsed. If you don't specify anything, you'll get the best HTML parser that's installed. 
Beautiful Soup ranks lxml's parser as being the best, then html5lib's, then Python's built-in parser. You can override this by specifying one of the following: * What type of markup you want to parse. Currently supported are "html", "xml", and "html5". * The name of the parser library you want to use. Currently supported options are "lxml", "html5lib", and "html.parser" (Python's built-in HTML parser). The section `Installing a parser`_ contrasts the supported parsers. If you don't have an appropriate parser installed, Beautiful Soup will ignore your request and pick a different parser. Right now, the only supported XML parser is lxml. If you don't have lxml installed, asking for an XML parser won't give you one, and asking for "lxml" won't work either. Differences between parsers --------------------------- Beautiful Soup presents the same interface to a number of different parsers, but each parser is different. Different parsers will create different parse trees from the same document. The biggest differences are between the HTML parsers and the XML parsers. Here's a short document, parsed as HTML:: BeautifulSoup("") # Since an empty tag is not valid HTML, the parser turns it into a tag pair. Here's the same document parsed as XML (running this requires that you have lxml installed). Note that the empty tag is left alone, and that the document is given an XML declaration instead of being put into an tag.:: BeautifulSoup("", "xml") # # There are also differences between HTML parsers. If you give Beautiful Soup a perfectly-formed HTML document, these differences won't matter. One parser will be faster than another, but they'll all give you a data structure that looks exactly like the original HTML document. But if the document is not perfectly-formed, different parsers will give different results. Here's a short, invalid document parsed using lxml's HTML parser. 
Note that the dangling tag is simply ignored:: BeautifulSoup("", "lxml") # Here's the same document parsed using html5lib:: BeautifulSoup("", "html5lib") #

Instead of ignoring the dangling tag, html5lib pairs it with an opening

tag. This parser also adds an empty

tag to the document. Here's the same document parsed with Python's built-in HTML parser:: BeautifulSoup("
", "html.parser") # Like lxml, this parser ignores the closing tag. Unlike html5lib, this parser makes no attempt to create a well-formed HTML document by adding a tag. Unlike lxml, it doesn't even bother to add an tag. Since the document "" is invalid, none of these techniques is the "correct" way to handle it. The html5lib parser uses techniques that are part of the HTML5 standard, so it has the best claim on being the "correct" way, but all three techniques are legitimate. Differences between parsers can affect your script. If you're planning on distributing your script to other people, or running it on multiple machines, you should specify a parser in the ``BeautifulSoup`` constructor. That will reduce the chances that your users parse a document differently from the way you parse it. Encodings ========= Any HTML or XML document is written in a specific encoding like ASCII or UTF-8. But when you load that document into Beautiful Soup, you'll discover it's been converted to Unicode:: markup = "

Sacr\xc3\xa9 bleu!

" soup = BeautifulSoup(markup) soup.h1 #

Sacré bleu!

soup.h1.string # u'Sacr\xe9 bleu!' It's not magic. (That sure would be nice.) Beautiful Soup uses a sub-library called `Unicode, Dammit`_ to detect a document's encoding and convert it to Unicode. The autodetected encoding is available as the ``.original_encoding`` attribute of the ``BeautifulSoup`` object:: soup.original_encoding 'utf-8' Unicode, Dammit guesses correctly most of the time, but sometimes it makes mistakes. Sometimes it guesses correctly, but only after a byte-by-byte search of the document that takes a very long time. If you happen to know a document's encoding ahead of time, you can avoid mistakes and delays by passing it to the ``BeautifulSoup`` constructor as ``from_encoding``. Here's a document written in ISO-8859-8. The document is so short that Unicode, Dammit can't get a good lock on it, and misidentifies it as ISO-8859-7:: markup = b"

\xed\xe5\xec\xf9

" soup = BeautifulSoup(markup) soup.h1

νεμω

soup.original_encoding 'ISO-8859-7' We can fix this by passing in the correct ``from_encoding``:: soup = BeautifulSoup(markup, from_encoding="iso-8859-8") soup.h1

םולש

soup.original_encoding 'iso8859-8' In rare cases (usually when a UTF-8 document contains text written in a completely different encoding), the only way to get Unicode may be to replace some characters with the special Unicode character "REPLACEMENT CHARACTER" (U+FFFD, �). If Unicode, Dammit needs to do this, it will set the ``.contains_replacement_characters`` attribute to ``True`` on the ``UnicodeDammit`` or ``BeautifulSoup`` object. This lets you know that the Unicode representation is not an exact representation of the original--some data was lost. If a document contains �, but ``.contains_replacement_characters`` is ``False``, you'll know that the � was there originally (as it is in this paragraph) and doesn't stand in for missing data. Output encoding --------------- When you write out a document from Beautiful Soup, you get a UTF-8 document, even if the document wasn't in UTF-8 to begin with. Here's a document written in the Latin-1 encoding:: markup = b'''

Sacr\xe9 bleu!

''' soup = BeautifulSoup(markup) print(soup.prettify()) # # # # # #

# Sacré bleu! #

# # Note that the tag has been rewritten to reflect the fact that the document is now in UTF-8. If you don't want UTF-8, you can pass an encoding into ``prettify()``:: print(soup.prettify("latin-1")) # # # # ... You can also call encode() on the ``BeautifulSoup`` object, or any element in the soup, just as if it were a Python string:: soup.p.encode("latin-1") # '

Sacr\xe9 bleu!

' soup.p.encode("utf-8") # '

Sacr\xc3\xa9 bleu!

' Any characters that can't be represented in your chosen encoding will be converted into numeric XML entity references. Here's a document that includes the Unicode character SNOWMAN:: markup = u"\N{SNOWMAN}" snowman_soup = BeautifulSoup(markup) tag = snowman_soup.b The SNOWMAN character can be part of a UTF-8 document (it looks like ☃), but there's no representation for that character in ISO-Latin-1 or ASCII, so it's converted into "☃" for those encodings:: print(tag.encode("utf-8")) # ☃ print tag.encode("latin-1") # ☃ print tag.encode("ascii") # ☃ Unicode, Dammit --------------- You can use Unicode, Dammit without using Beautiful Soup. It's useful whenever you have data in an unknown encoding and you just want it to become Unicode:: from bs4 import UnicodeDammit dammit = UnicodeDammit("Sacr\xc3\xa9 bleu!") print(dammit.unicode_markup) # Sacré bleu! dammit.original_encoding # 'utf-8' The more data you give Unicode, Dammit, the more accurately it will guess. If you have your own suspicions as to what the encoding might be, you can pass them in as a list:: dammit = UnicodeDammit("Sacr\xe9 bleu!", ["latin-1", "iso-8859-1"]) print(dammit.unicode_markup) # Sacré bleu! dammit.original_encoding # 'latin-1' Unicode, Dammit has two special features that Beautiful Soup doesn't use. Smart quotes ^^^^^^^^^^^^ You can use Unicode, Dammit to convert Microsoft smart quotes to HTML or XML entities:: markup = b"

I just \x93love\x94 Microsoft Word\x92s smart quotes

" UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="html").unicode_markup # u'

I just “love” Microsoft Word’s smart quotes

' UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="xml").unicode_markup # u'

I just “love” Microsoft Word’s smart quotes

' You can also convert Microsoft smart quotes to ASCII quotes:: UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="ascii").unicode_markup # u'

I just "love" Microsoft Word\'s smart quotes

' Hopefully you'll find this feature useful, but Beautiful Soup doesn't use it. Beautiful Soup prefers the default behavior, which is to convert Microsoft smart quotes to Unicode characters along with everything else:: UnicodeDammit(markup, ["windows-1252"]).unicode_markup # u'

I just \u201clove\u201d Microsoft Word\u2019s smart quotes

' Inconsistent encodings ^^^^^^^^^^^^^^^^^^^^^^ Sometimes a document is mostly in UTF-8, but contains Windows-1252 characters such as (again) Microsoft smart quotes. This can happen when a website includes data from multiple sources. You can use ``UnicodeDammit.detwingle()`` to turn such a document into pure UTF-8. Here's a simple example:: snowmen = (u"\N{SNOWMAN}" * 3) quote = (u"\N{LEFT DOUBLE QUOTATION MARK}I like snowmen!\N{RIGHT DOUBLE QUOTATION MARK}") doc = snowmen.encode("utf8") + quote.encode("windows_1252") This document is a mess. The snowmen are in UTF-8 and the quotes are in Windows-1252. You can display the snowmen or the quotes, but not both:: print(doc) # ☃☃☃�I like snowmen!� print(doc.decode("windows-1252")) # ☃☃☃“I like snowmen!” Decoding the document as UTF-8 raises a ``UnicodeDecodeError``, and decoding it as Windows-1252 gives you gibberish. Fortunately, ``UnicodeDammit.detwingle()`` will convert the string to pure UTF-8, allowing you to decode it to Unicode and display the snowmen and quote marks simultaneously:: new_doc = UnicodeDammit.detwingle(doc) print(new_doc.decode("utf8")) # ☃☃☃“I like snowmen!” ``UnicodeDammit.detwingle()`` only knows how to handle Windows-1252 embedded in UTF-8 (or vice versa, I suppose), but this is the most common case. Note that you must know to call ``UnicodeDammit.detwingle()`` on your data before passing it into ``BeautifulSoup`` or the ``UnicodeDammit`` constructor. Beautiful Soup assumes that a document has a single encoding, whatever it might be. If you pass it a document that contains both UTF-8 and Windows-1252, it's likely to think the whole document is Windows-1252, and the document will come out looking like ` ☃☃☃“I like snowmen!”`. ``UnicodeDammit.detwingle()`` is new in Beautiful Soup 4.1.0. Parsing only part of a document =============================== Let's say you want to use Beautiful Soup look at a document's tags. 
It's a waste of time and memory to parse the entire document and then go over it again looking for tags. It would be much faster to ignore everything that wasn't an tag in the first place. The ``SoupStrainer`` class allows you to choose which parts of an incoming document are parsed. You just create a ``SoupStrainer`` and pass it in to the ``BeautifulSoup`` constructor as the ``parse_only`` argument. (Note that *this feature won't work if you're using the html5lib parser*. If you use html5lib, the whole document will be parsed, no matter what. This is because html5lib constantly rearranges the parse tree as it works, and if some part of the document didn't actually make it into the parse tree, it'll crash. To avoid confusion, in the examples below I'll be forcing Beautiful Soup to use Python's built-in parser.) ``SoupStrainer`` ---------------- The ``SoupStrainer`` class takes the same arguments as a typical method from `Searching the tree`_: :ref:`name `, :ref:`attrs `, :ref:`text `, and :ref:`**kwargs `. Here are three ``SoupStrainer`` objects:: from bs4 import SoupStrainer only_a_tags = SoupStrainer("a") only_tags_with_id_link2 = SoupStrainer(id="link2") def is_short_string(string): return len(string) < 10 only_short_strings = SoupStrainer(text=is_short_string) I'm going to bring back the "three sisters" document one more time, and we'll see what the document looks like when it's parsed with these three ``SoupStrainer`` objects:: html_doc = """

The Dormouse's story

Once upon a time there were three little sisters; and their names were Elsie, Lacie and Tillie; and they lived at the bottom of a well.

...

""" print(BeautifulSoup(html_doc, "html.parser", parse_only=only_a_tags).prettify()) # # Elsie # # # Lacie # # # Tillie # print(BeautifulSoup(html_doc, "html.parser", parse_only=only_tags_with_id_link2).prettify()) # # Lacie # print(BeautifulSoup(html_doc, "html.parser", parse_only=only_short_strings).prettify()) # Elsie # , # Lacie # and # Tillie # ... # You can also pass a ``SoupStrainer`` into any of the methods covered in `Searching the tree`_. This probably isn't terribly useful, but I thought I'd mention it:: soup = BeautifulSoup(html_doc) soup.find_all(only_short_strings) # [u'\n\n', u'\n\n', u'Elsie', u',\n', u'Lacie', u' and\n', u'Tillie', # u'\n\n', u'...', u'\n'] Troubleshooting =============== Version mismatch problems ------------------------- * ``SyntaxError: Invalid syntax`` (on the line ``ROOT_TAG_NAME = u'[document]'``): Caused by running the Python 2 version of Beautiful Soup under Python 3, without converting the code. * ``ImportError: No module named HTMLParser`` - Caused by running the Python 2 version of Beautiful Soup under Python 3. * ``ImportError: No module named html.parser`` - Caused by running the Python 3 version of Beautiful Soup under Python 2. * ``ImportError: No module named BeautifulSoup`` - Caused by running Beautiful Soup 3 code on a system that doesn't have BS3 installed. Or, by writing Beautiful Soup 4 code without knowing that the package name has changed to ``bs4``. * ``ImportError: No module named bs4`` - Caused by running Beautiful Soup 4 code on a system that doesn't have BS4 installed. Parsing XML ----------- By default, Beautiful Soup parses documents as HTML. To parse a document as XML, pass in "xml" as the second argument to the ``BeautifulSoup`` constructor:: soup = BeautifulSoup(markup, "xml") You'll need to :ref:`have lxml installed `. Other parser problems --------------------- * If your script works on one computer but not another, it's probably because the two computers have different parser libraries available. 
For example, you may have developed the script on a computer that has lxml installed, and then tried to run it on a computer that only has html5lib installed. See `Differences between parsers`_ for why this matters, and fix the problem by mentioning a specific parser library in the ``BeautifulSoup`` constructor. * ``HTMLParser.HTMLParseError: malformed start tag`` or ``HTMLParser.HTMLParseError: bad end tag`` - Caused by giving Python's built-in HTML parser a document it can't handle. Any other ``HTMLParseError`` is probably the same problem. Solution: :ref:`Install lxml or html5lib. ` * If you can't find a tag that you know is in the document (that is, ``find_all()`` returned ``[]`` or ``find()`` returned ``None``), you're probably using Python's built-in HTML parser, which sometimes skips tags it doesn't understand. Solution: :ref:`Install lxml or html5lib. ` Miscellaneous ------------- * ``KeyError: [attr]`` - Caused by accessing ``tag['attr']`` when the tag in question doesn't define the ``attr`` attribute. The most common errors are ``KeyError: 'href'`` and ``KeyError: 'class'``. Use ``tag.get('attr')`` if you're not sure ``attr`` is defined, just as you would with a Python dictionary. * ``UnicodeEncodeError: 'charmap' codec can't encode character u'\xfoo' in position bar`` (or just about any other ``UnicodeEncodeError``) - This is not a problem with Beautiful Soup: you're trying to print a Unicode character that your console doesn't know how to display. See `this page on the Python wiki `_ for help. One easy solution is to write the text to a file and then look at the file. Improving Performance --------------------- Beautiful Soup will never be as fast as the parsers it sits on top of. If response time is critical, if you're paying for computer time by the hour, or if there's any other reason why computer time is more valuable than programmer time, you should forget about Beautiful Soup and work directly atop `lxml `_. 
That said, there are things you can do to speed up Beautiful Soup. If you're not using lxml as the underlying parser, my advice is to :ref:`start `. Beautiful Soup parses documents significantly faster using lxml than using html.parser or html5lib. Sometimes `Unicode, Dammit`_ can only detect the encoding of a file by doing a byte-by-byte examination of the file. This slows Beautiful Soup to a crawl. My tests indicate that this only happened on 2.x versions of Python, and that it happened most often with documents using Russian or Chinese encodings. If this is happening to you, you can fix it by using Python 3 for your script. Or, if you happen to know a document's encoding, you can pass it into the ``BeautifulSoup`` constructor as ``from_encoding``. `Parsing only part of a document`_ won't save you much time parsing the document, but it can save a lot of memory, and it'll make `searching` the document much faster. Beautiful Soup 3 ================ Beautiful Soup 3 is the previous release series, and is no longer being actively developed. It's currently packaged with all major Linux distributions: :kbd:`$ apt-get install python-beautifulsoup` It's also published through PyPi as ``BeautifulSoup``.: :kbd:`$ easy_install BeautifulSoup` :kbd:`$ pip install BeautifulSoup` You can also `download a tarball of Beautiful Soup 3.2.0 `_. If you ran ``easy_install beautifulsoup`` or ``easy_install BeautifulSoup``, but your code doesn't work, you installed Beautiful Soup 3 by mistake. You need to run ``easy_install beautifulsoup4``. `The documentation for Beautiful Soup 3 is archived online `_. If your first language is Chinese, it might be easier for you to read `the Chinese translation of the Beautiful Soup 3 documentation `_, then read this document to find out about the changes made in Beautiful Soup 4. Porting code to BS4 ------------------- Most code written against Beautiful Soup 3 will work against Beautiful Soup 4 with one simple change. 
All you should have to do is change the package name from ``BeautifulSoup`` to ``bs4``. So this:: from BeautifulSoup import BeautifulSoup becomes this:: from bs4 import BeautifulSoup * If you get the ``ImportError`` "No module named BeautifulSoup", your problem is that you're trying to run Beautiful Soup 3 code, but you only have Beautiful Soup 4 installed. * If you get the ``ImportError`` "No module named bs4", your problem is that you're trying to run Beautiful Soup 4 code, but you only have Beautiful Soup 3 installed. Although BS4 is mostly backwards-compatible with BS3, most of its methods have been deprecated and given new names for `PEP 8 compliance `_. There are numerous other renames and changes, and a few of them break backwards compatibility. Here's what you'll need to know to convert your BS3 code and habits to BS4: You need a parser ^^^^^^^^^^^^^^^^^ Beautiful Soup 3 used Python's ``SGMLParser``, a module that was deprecated and removed in Python 3.0. Beautiful Soup 4 uses ``html.parser`` by default, but you can plug in lxml or html5lib and use that instead. See `Installing a parser`_ for a comparison. Since ``html.parser`` is not the same parser as ``SGMLParser``, it will treat invalid markup differently. Usually the "difference" is that ``html.parser`` crashes. In that case, you'll need to install another parser. But sometimes ``html.parser`` just creates a different parse tree than ``SGMLParser`` would. If this happens, you may need to update your BS3 scraping code to deal with the new tree. 
Method names ^^^^^^^^^^^^ * ``renderContents`` -> ``encode_contents`` * ``replaceWith`` -> ``replace_with`` * ``replaceWithChildren`` -> ``unwrap`` * ``findAll`` -> ``find_all`` * ``findAllNext`` -> ``find_all_next`` * ``findAllPrevious`` -> ``find_all_previous`` * ``findNext`` -> ``find_next`` * ``findNextSibling`` -> ``find_next_sibling`` * ``findNextSiblings`` -> ``find_next_siblings`` * ``findParent`` -> ``find_parent`` * ``findParents`` -> ``find_parents`` * ``findPrevious`` -> ``find_previous`` * ``findPreviousSibling`` -> ``find_previous_sibling`` * ``findPreviousSiblings`` -> ``find_previous_siblings`` * ``nextSibling`` -> ``next_sibling`` * ``previousSibling`` -> ``previous_sibling`` Some arguments to the Beautiful Soup constructor were renamed for the same reasons: * ``BeautifulSoup(parseOnlyThese=...)`` -> ``BeautifulSoup(parse_only=...)`` * ``BeautifulSoup(fromEncoding=...)`` -> ``BeautifulSoup(from_encoding=...)`` I renamed one method for compatibility with Python 3: * ``Tag.has_key()`` -> ``Tag.has_attr()`` I renamed one attribute to use more accurate terminology: * ``Tag.isSelfClosing`` -> ``Tag.is_empty_element`` I renamed three attributes to avoid using words that have special meaning to Python. Unlike the others, these changes are *not backwards compatible.* If you used these attributes in BS3, your code will break on BS4 until you change them. 
* ``UnicodeDammit.unicode`` -> ``UnicodeDammit.unicode_markup`` * ``Tag.next`` -> ``Tag.next_element`` * ``Tag.previous`` -> ``Tag.previous_element`` Generators ^^^^^^^^^^ I gave the generators PEP 8-compliant names, and transformed them into properties: * ``childGenerator()`` -> ``children`` * ``nextGenerator()`` -> ``next_elements`` * ``nextSiblingGenerator()`` -> ``next_siblings`` * ``previousGenerator()`` -> ``previous_elements`` * ``previousSiblingGenerator()`` -> ``previous_siblings`` * ``recursiveChildGenerator()`` -> ``descendants`` * ``parentGenerator()`` -> ``parents`` So instead of this:: for parent in tag.parentGenerator(): ... You can write this:: for parent in tag.parents: ... (But the old code will still work.) Some of the generators used to yield ``None`` after they were done, and then stop. That was a bug. Now the generators just stop. There are two new generators, :ref:`.strings and .stripped_strings `. ``.strings`` yields NavigableString objects, and ``.stripped_strings`` yields Python strings that have had whitespace stripped. XML ^^^ There is no longer a ``BeautifulStoneSoup`` class for parsing XML. To parse XML you pass in "xml" as the second argument to the ``BeautifulSoup`` constructor. For the same reason, the ``BeautifulSoup`` constructor no longer recognizes the ``isHTML`` argument. Beautiful Soup's handling of empty-element XML tags has been improved. Previously when you parsed XML you had to explicitly say which tags were considered empty-element tags. The ``selfClosingTags`` argument to the constructor is no longer recognized. Instead, Beautiful Soup considers any empty tag to be an empty-element tag. If you add a child to an empty-element tag, it stops being an empty-element tag. Entities ^^^^^^^^ An incoming HTML or XML entity is always converted into the corresponding Unicode character. Beautiful Soup 3 had a number of overlapping ways of dealing with entities, which have been removed. 
The ``BeautifulSoup`` constructor no longer recognizes the ``smartQuotesTo`` or ``convertEntities`` arguments. (`Unicode, Dammit`_ still has ``smart_quotes_to``, but its default is now to turn smart quotes into Unicode.) If you want to turn those Unicode characters back into HTML entities on output, rather than turning them into UTF-8 characters, you need to use an :ref:`output formatter `. Miscellaneous ^^^^^^^^^^^^^ :ref:`Tag.string ` now operates recursively. If tag A contains a single tag B and nothing else, then A.string is the same as B.string. (Previously, it was None.) `Multi-valued attributes`_ like ``class`` have lists of strings as their values, not strings. This may affect the way you search by CSS class. If you pass one of the ``find*`` methods both :ref:`text ` `and` a tag-specific argument like :ref:`name `, Beautiful Soup will search for tags that match your tag-specific criteria and whose :ref:`Tag.string ` matches your value for :ref:`text `. It will `not` find the strings themselves. Previously, Beautiful Soup ignored the tag-specific arguments and looked for strings. The ``BeautifulSoup`` constructor no longer recognizes the `markupMassage` argument. It's now the parser's responsibility to handle markup correctly. The rarely-used alternate parser classes like ``ICantBelieveItsBeautifulSoup`` and ``BeautifulSOAP`` have been removed. It's now the parser's decision how to handle ambiguous markup.

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/NEWS.txt

= 4.1.0 (20120529) = * Added experimental support for fixing Windows-1252 characters embedded in UTF-8 documents. (UnicodeDammit.detwingle()) * Fixed the handling of &quot; with the built-in parser. [bug=993871] * Comments, processing instructions, document type declarations, and markup declarations are now treated as preformatted strings, the way CData blocks are. [bug=1001025] * Fixed a bug with the lxml treebuilder that prevented the user from adding attributes to a tag that didn't originally have attributes. [bug=1002378] Thanks to Oliver Beattie for the patch. * Fixed some edge-case bugs having to do with inserting an element into a tag it's already inside, and replacing one of a tag's children with another. [bug=997529] * Added the ability to search for attribute values specified in UTF-8. [bug=1003974] This caused a major refactoring of the search code. All the tests pass, but it's possible that some searches will behave differently. = 4.0.5 (20120427) = * Added a new method, wrap(), which wraps an element in a tag. * Renamed replace_with_children() to unwrap(), which is easier to understand and also the jQuery name of the function. * Made encoding substitution in <meta> tags completely transparent (no more %SOUP-ENCODING%). * Fixed a bug in decoding data that contained a byte-order mark, such as data encoded in UTF-16LE. [bug=988980] * Fixed a bug that made the HTMLParser treebuilder generate XML definitions ending with two question marks instead of one. [bug=984258] * Upon document generation, CData objects are no longer run through the formatter. [bug=988905] * The test suite now passes when lxml is not installed, whether or not html5lib is installed. [bug=987004] * Print a warning on HTMLParseErrors to let people know they should install a better parser library. = 4.0.4 (20120416) = * Fixed a bug that sometimes created disconnected trees. * Fixed a bug with the string setter that moved a string around the tree instead of copying it. 
[bug=983050] * Attribute values are now run through the provided output formatter. Previously they were always run through the 'minimal' formatter. In the future I may make it possible to specify different formatters for attribute values and strings, but for now, consistent behavior is better than inconsistent behavior. [bug=980237] * Added the missing renderContents method from Beautiful Soup 3. Also added an encode_contents() method to go along with decode_contents(). * Give a more useful error when the user tries to run the Python 2 version of BS under Python 3. * UnicodeDammit can now convert Microsoft smart quotes to ASCII with UnicodeDammit(markup, smart_quotes_to="ascii"). = 4.0.3 (20120403) = * Fixed a typo that caused some versions of Python 3 to convert the Beautiful Soup codebase incorrectly. * Got rid of the 4.0.2 workaround for HTML documents--it was unnecessary and the workaround was triggering a (possibly different, but related) bug in lxml. [bug=972466] = 4.0.2 (20120326) = * Worked around a possible bug in lxml that prevents non-tiny XML documents from being parsed. [bug=963880, bug=963936] * Fixed a bug where specifying `text` while also searching for a tag only worked if `text` wanted an exact string match. [bug=955942] = 4.0.1 (20120314) = * This is the first official release of Beautiful Soup 4. There is no 4.0.0 release, to eliminate any possibility that packaging software might treat "4.0.0" as being an earlier version than "4.0.0b10". * Brought BS up to date with the latest release of soupselect, adding CSS selector support for direct descendant matches and multiple CSS class matches. = 4.0.0b10 (20120302) = * Added support for simple CSS selectors, taken from the soupselect project. * Fixed a crash when using html5lib. [bug=943246] * In HTML5-style <meta charset="foo"> tags, the value of the "charset" attribute is now replaced with the appropriate encoding on output. 
[bug=942714] * Fixed a bug that caused calling a tag to sometimes call find_all() with the wrong arguments. [bug=944426] * For backwards compatibility, brought back the BeautifulStoneSoup class as a deprecated wrapper around BeautifulSoup. = 4.0.0b9 (20120228) = * Fixed the string representation of DOCTYPEs that have both a public ID and a system ID. * Fixed the generated XML declaration. * Renamed Tag.nsprefix to Tag.prefix, for consistency with NamespacedAttribute. * Fixed a test failure that occurred on Python 3.x when chardet was installed. * Made prettify() return Unicode by default, so it will look nice on Python 3 when passed into print(). = 4.0.0b8 (20120224) = * All tree builders now preserve namespace information in the documents they parse. If you use the html5lib parser or lxml's XML parser, you can access the namespace URL for a tag as tag.namespace. However, there is no special support for namespace-oriented searching or tree manipulation. When you search the tree, you need to use namespace prefixes exactly as they're used in the original document. * The string representation of a DOCTYPE always ends in a newline. * Issue a warning if the user tries to use a SoupStrainer in conjunction with the html5lib tree builder, which doesn't support them. = 4.0.0b7 (20120223) = * Upon decoding to string, any characters that can't be represented in your chosen encoding will be converted into numeric XML entity references. * Issue a warning if characters were replaced with REPLACEMENT CHARACTER during Unicode conversion. * Restored compatibility with Python 2.6. * The install process no longer installs docs or auxiliary text files. * It's now possible to deepcopy a BeautifulSoup object created with Python's built-in HTML parser. * About 100 unit tests that "test" the behavior of various parsers on invalid markup have been removed. 
Legitimate changes to those parsers caused these tests to fail, indicating that perhaps Beautiful Soup should not test the behavior of foreign libraries. The problematic unit tests have been reformulated as informational comparisons generated by the script scripts/demonstrate_parser_differences.py. This makes Beautiful Soup compatible with html5lib version 0.95 and future versions of HTMLParser. = 4.0.0b6 (20120216) = * Multi-valued attributes like "class" always have a list of values, even if there's only one value in the list. * Added a number of multi-valued attributes defined in HTML5. * Stopped generating a space before the slash that closes an empty-element tag. This may come back if I add a special XHTML mode (http://www.w3.org/TR/xhtml1/#C_2), but right now it's pretty useless. * Passing text along with tag-specific arguments to a find* method: find("a", text="Click here") will find tags that contain the given text as their .string. Previously, the tag-specific arguments were ignored and only strings were searched. * Fixed a bug that caused the html5lib tree builder to build a partially disconnected tree. Generally cleaned up the html5lib tree builder. * If you restrict a multi-valued attribute like "class" to a string that contains spaces, Beautiful Soup will only consider it a match if the values correspond to that specific string. = 4.0.0b5 (20120209) = * Rationalized Beautiful Soup's treatment of CSS class. A tag belonging to multiple CSS classes is treated as having a list of values for the 'class' attribute. Searching for a CSS class will match *any* of the CSS classes. This actually affects all attributes that the HTML standard defines as taking multiple values (class, rel, rev, archive, accept-charset, and headers), but 'class' is by far the most common. [bug=41034] * If you pass anything other than a dictionary as the second argument to one of the find* methods, it'll assume you want to use that object to search against a tag's CSS classes. 
Previously this only worked if you passed in a string. * Fixed a bug that caused a crash when you passed a dictionary as an attribute value (possibly because you mistyped "attrs"). [bug=842419] * Unicode, Dammit now detects the encoding in HTML 5-style <meta> tags like <meta charset="utf-8" />. [bug=837268] * If Unicode, Dammit can't figure out a consistent encoding for a page, it will try each of its guesses again, with errors="replace" instead of errors="strict". This may mean that some data gets replaced with REPLACEMENT CHARACTER, but at least most of it will get turned into Unicode. [bug=754903] * Patched over a bug in html5lib (?) that was crashing Beautiful Soup on certain kinds of markup. [bug=838800] * Fixed a bug that wrecked the tree if you replaced an element with an empty string. [bug=728697] * Improved Unicode, Dammit's behavior when you give it Unicode to begin with. = 4.0.0b4 (20120208) = * Added BeautifulSoup.new_string() to go along with BeautifulSoup.new_tag() * BeautifulSoup.new_tag() will follow the rules of whatever tree-builder was used to create the original BeautifulSoup object. A new <p> tag will look like "<p />" if the soup object was created to parse XML, but it will look like "<p></p>" if the soup object was created to parse HTML. * We pass in strict=False to html.parser on Python 3, greatly improving html.parser's ability to handle bad HTML. * We also monkeypatch a serious bug in html.parser that made strict=False disastrous on Python 3.2.2. * Replaced the "substitute_html_entities" argument with the more general "formatter" argument. * Bare ampersands and angle brackets are always converted to XML entities unless the user prevents it. * Added PageElement.insert_before() and PageElement.insert_after(), which let you put an element into the parse tree with respect to some other element. * Raise an exception when the user tries to do something nonsensical like insert a tag into itself. 
= 4.0.0b3 (20120203) = Beautiful Soup 4 is a nearly-complete rewrite that removes Beautiful Soup's custom HTML parser in favor of a system that lets you write a little glue code and plug in any HTML or XML parser you want. Beautiful Soup 4.0 comes with glue code for four parsers: * Python's standard HTMLParser (html.parser in Python 3) * lxml's HTML and XML parsers * html5lib's HTML parser HTMLParser is the default, but I recommend you install lxml if you can. For complete documentation, see the Sphinx documentation in bs4/doc/source/. What follows is a summary of the changes from Beautiful Soup 3. === The module name has changed === Previously you imported the BeautifulSoup class from a module also called BeautifulSoup. To save keystrokes and make it clear which version of the API is in use, the module is now called 'bs4': >>> from bs4 import BeautifulSoup === It works with Python 3 === Beautiful Soup 3.1.0 worked with Python 3, but the parser it used was so bad that it barely worked at all. Beautiful Soup 4 works with Python 3, and since its parser is pluggable, you don't sacrifice quality. Special thanks to Thomas Kluyver and Ezio Melotti for getting Python 3 support to the finish line. Ezio Melotti is also to thank for greatly improving the HTML parser that comes with Python 3.2. === CDATA sections are normal text, if they're understood at all. === Currently, the lxml and html5lib HTML parsers ignore CDATA sections in markup: <p><![CDATA[foo]]></p> => <p></p> A future version of html5lib will turn CDATA sections into text nodes, but only within tags like <svg> and <math>: <svg><![CDATA[foo]]></svg> => <p>foo</p> The default XML parser (which uses lxml behind the scenes) turns CDATA sections into ordinary text elements: <p><![CDATA[foo]]></p> => <p>foo</p> In theory it's possible to preserve the CDATA sections when using the XML parser, but I don't see how to get it to work in practice. 
=== Miscellaneous other stuff === If the BeautifulSoup instance has .is_xml set to True, an appropriate XML declaration will be emitted when the tree is transformed into a string: <?xml version="1.0" encoding="utf-8"?> <markup> ... </markup> The ['lxml', 'xml'] tree builder sets .is_xml to True; the other tree builders set it to False. If you want to parse XHTML with an HTML parser, you can set it manually. = 3.2.0 = The 3.1 series wasn't very useful, so I renamed the 3.0 series to 3.2 to make it obvious which one you should use. = 3.1.0 = A hybrid version that supports 2.4 and can be automatically converted to run under Python 3.0. There are three backwards-incompatible changes you should be aware of, but no new features or deliberate behavior changes. 1. str() may no longer do what you want. This is because the meaning of str() inverts between Python 2 and 3; in Python 2 it gives you a byte string, in Python 3 it gives you a Unicode string. The effect of this is that you can't pass an encoding to .__str__ anymore. Use encode() to get a string and decode() to get Unicode, and you'll be ready (well, readier) for Python 3. 2. Beautiful Soup is now based on HTMLParser rather than SGMLParser, which is gone in Python 3. There's some bad HTML that SGMLParser handled but HTMLParser doesn't, usually to do with attribute values that aren't closed or have brackets inside them: <a href="foo</a>, </a><a href="bar">baz</a> <a b="<a>">', '<a b="&lt;a&gt;"></a><a>"></a> A later version of Beautiful Soup will allow you to plug in different parsers to make tradeoffs between speed and the ability to handle bad HTML. 3. In Python 3 (but not Python 2), HTMLParser converts entities within attributes to the corresponding Unicode characters. In Python 2 it's possible to parse this string and leave the &eacute; intact. <a href="http://crummy.com?sacr&eacute;&bleu"> In Python 3, the &eacute; is always converted to \xe9 during parsing. 
= 3.0.7a = Added an import that makes BS work in Python 2.3. = 3.0.7 = Fixed a UnicodeDecodeError when unpickling documents that contain non-ASCII characters. Fixed a TypeError that occurred in some circumstances when a tag contained no text. Jump through hoops to avoid the use of chardet, which can be extremely slow in some circumstances. UTF-8 documents should never trigger the use of chardet. Whitespace is preserved inside <pre> and <textarea> tags that contain nothing but whitespace. Beautiful Soup can now parse a doctype that's scoped to an XML namespace. = 3.0.6 = Got rid of a very old debug line that prevented chardet from working. Added a Tag.decompose() method that completely disconnects a tree or a subset of a tree, breaking it up into bite-sized pieces that are easy for the garbage collector to collect. Tag.extract() now returns the tag that was extracted. Tag.findNext() now does something with the keyword arguments you pass it instead of dropping them on the floor. Fixed a Unicode conversion bug. Fixed a bug that garbled some <meta> tags when rewriting them. = 3.0.5 = Soup objects can now be pickled, and copied with copy.deepcopy. Tag.append now works properly on existing BS objects. (It wasn't originally intended for outside use, but it can be now.) (Giles Radford) Passing in a nonexistent encoding will no longer crash the parser on Python 2.4 (John Nagle). Fixed an underlying bug in SGMLParser that thinks ASCII has 255 characters instead of 127 (John Nagle). Entities are converted more consistently to Unicode characters. Entity references in attribute values are now converted to Unicode characters when appropriate. Numeric entities are always converted, because SGMLParser always converts them outside of attribute values. ALL_ENTITIES happens to just be the XHTML entities, so I renamed it to XHTML_ENTITIES. The regular expression for bare ampersands was too loose. In some cases ampersands were not being escaped. (Sam Ruby?) 
Non-breaking spaces and other special Unicode space characters are no longer folded to ASCII spaces. (Robert Leftwich) Information inside a TEXTAREA tag is now parsed literally, not as HTML tags. TEXTAREA now works exactly the same way as SCRIPT. (Zephyr Fang) = 3.0.4 = Fixed a bug that crashed Unicode conversion in some cases. Fixed a bug that prevented UnicodeDammit from being used as a general-purpose data scrubber. Fixed some unit test failures when running against Python 2.5. When considering whether to convert smart quotes, UnicodeDammit now looks at the original encoding in a case-insensitive way. = 3.0.3 (20060606) = Beautiful Soup is now usable as a way to clean up invalid XML/HTML (be sure to pass in an appropriate value for convertEntities, or XML/HTML entities might stick around that aren't valid in HTML/XML). The result may not validate, but it should be good enough to not choke a real-world XML parser. Specifically, the output of a properly constructed soup object should always be valid as part of an XML document, but parts may be missing if they were missing in the original. As always, if the input is valid XML, the output will also be valid. = 3.0.2 (20060602) = Previously, Beautiful Soup correctly handled attribute values that contained embedded quotes (sometimes by escaping), but not other kinds of XML character. Now, it correctly handles or escapes all special XML characters in attribute values. I aliased methods to the 2.x names (fetch, find, findText, etc.) for backwards compatibility purposes. Those names are deprecated and if I ever do a 4.0 I will remove them. I will, I tell you! Fixed a bug where the findAll method wasn't passing along any keyword arguments. When run from the command line, Beautiful Soup now acts as an HTML pretty-printer, not an XML pretty-printer. = 3.0.1 (20060530) = Reintroduced the "fetch by CSS class" shortcut. I thought keyword arguments would replace it, but they don't. 
You can't call soup('a', class='foo') because class is a Python keyword. If Beautiful Soup encounters a meta tag that declares the encoding, but a SoupStrainer tells it not to parse that tag, Beautiful Soup will no longer try to rewrite the meta tag to mention the new encoding. Basically, this makes SoupStrainers work in real-world applications instead of crashing the parser. = 3.0.0 "Who would not give all else for two p" (20060528) = This release is not backward-compatible with previous releases. If you've got code written with a previous version of the library, go ahead and keep using it, unless one of the features mentioned here really makes your life easier. Since the library is self-contained, you can include an old copy of the library in your old applications, and use the new version for everything else. The documentation has been rewritten and greatly expanded with many more examples. Beautiful Soup autodetects the encoding of a document (or uses the one you specify), and converts it from its native encoding to Unicode. Internally, it only deals with Unicode strings. When you print out the document, it converts to UTF-8 (or another encoding you specify). [Doc reference] It's now easy to make large-scale changes to the parse tree without screwing up the navigation members. The methods are extract, replaceWith, and insert. [Doc reference. See also Improving Memory Usage with extract] Passing True in as an attribute value gives you tags that have any value for that attribute. You don't have to create a regular expression. Passing None for an attribute value gives you tags that don't have that attribute at all. Tag objects now know whether or not they're self-closing. This avoids the problem where Beautiful Soup thought that tags like <BR /> were self-closing even in XML documents. You can customize the self-closing tags for a parser object by passing them in as a list of selfClosingTags: you don't have to subclass anymore. 
There's a new built-in parser, MinimalSoup, which has most of BeautifulSoup's HTML-specific rules, but no tag nesting rules. [Doc reference] You can use a SoupStrainer to tell Beautiful Soup to parse only part of a document. This saves time and memory, often making Beautiful Soup about as fast as a custom-built SGMLParser subclass. [Doc reference, SoupStrainer reference] You can (usually) use keyword arguments instead of passing a dictionary of attributes to a search method. That is, you can replace soup(args={"id" : "5"}) with soup(id="5"). You can still use args if (for instance) you need to find an attribute whose name clashes with the name of an argument to findAll. [Doc reference: **kwargs attrs] The method names have changed to the better method names used in Rubyful Soup. Instead of find methods and fetch methods, there are only find methods. Instead of a scheme where you can't remember which method finds one element and which one finds them all, we have find and findAll. In general, if the method name mentions All or a plural noun (eg. findNextSiblings), then it finds many elements. Otherwise, it only finds one element. [Doc reference] Some of the argument names have been renamed for clarity. For instance avoidParserProblems is now parserMassage. Beautiful Soup no longer implements a feed method. You need to pass a string or a filehandle into the soup constructor, not with feed after the soup has been created. There is still a feed method, but it's the feed method implemented by SGMLParser and calling it will bypass Beautiful Soup and cause problems. The NavigableText class has been renamed to NavigableString. There is no NavigableUnicodeString anymore, because every string inside a Beautiful Soup parse tree is a Unicode string. findText and fetchText are gone. Just pass a text argument into find or findAll. Null was more trouble than it was worth, so I got rid of it. Anything that used to return Null now returns None. 
Special XML constructs like comments and CDATA now have their own NavigableString subclasses, instead of being treated as oddly-formed data. If you parse a document that contains CDATA and write it back out, the CDATA will still be there. When you're parsing a document, you can get Beautiful Soup to convert XML or HTML entities into the corresponding Unicode characters. [Doc reference] = 2.1.1 (20050918) = Fixed a serious performance bug in BeautifulStoneSoup which was causing parsing to be incredibly slow. Corrected several entities that were previously being incorrectly translated from Microsoft smart-quote-like characters. Fixed a bug that was breaking text fetch. Fixed a bug that crashed the parser when text chunks that look like HTML tag names showed up within a SCRIPT tag. THEAD, TBODY, and TFOOT tags are now nestable within TABLE tags. Nested tables should parse more sensibly now. BASE is now considered a self-closing tag. = 2.1.0 "Game, or any other dish?" (20050504) = Added a wide variety of new search methods which, given a starting point inside the tree, follow a particular navigation member (like nextSibling) over and over again, looking for Tag and NavigableText objects that match certain criteria. The new methods are findNext, fetchNext, findPrevious, fetchPrevious, findNextSibling, fetchNextSiblings, findPreviousSibling, fetchPreviousSiblings, findParent, and fetchParents. All of these use the same basic code used by first and fetch, so you can pass your weird ways of matching things into these methods. The fetch method and its derivatives now accept a limit argument. You can now pass keyword arguments when calling a Tag object as though it were a method. Fixed a bug that caused all hand-created tags to share a single set of attributes. = 2.0.3 (20050501) = Fixed Python 2.2 support for iterators. Fixed a bug that gave the wrong representation to tags within quote tags like <script>. 
Took some code from Mark Pilgrim that treats CDATA declarations as data instead of ignoring them. Beautiful Soup's setup.py will now do an install even if the unit tests fail. It won't build a source distribution if the unit tests fail, so I can't release a new version unless they pass. = 2.0.2 (20050416) = Added the unit tests in a separate module, and packaged it with distutils. Fixed a bug that sometimes caused renderContents() to return a Unicode string even if there was no Unicode in the original string. Added the done() method, which closes all of the parser's open tags. It gets called automatically when you pass in some text to the constructor of a parser class; otherwise you must call it yourself. Reinstated some backwards compatibility with 1.x versions: referencing the string member of a NavigableText object returns the NavigableText object instead of throwing an error. = 2.0.1 (20050412) = Fixed a bug that caused bad results when you tried to reference a tag name shorter than 3 characters as a member of a Tag, eg. tag.table.td. Made sure all Tags have the 'hidden' attribute so that an attempt to access tag.hidden doesn't spawn an attempt to find a tag named 'hidden'. Fixed a bug in the comparison operator. = 2.0.0 "Who cares for fish?" (20050410) Beautiful Soup version 1 was very useful but also pretty stupid. I originally wrote it without noticing any of the problems inherent in trying to build a parse tree out of ambiguous HTML tags. This version solves all of those problems to my satisfaction. It also adds many new clever things to make up for the removal of the stupid things. == Parsing == The parser logic has been greatly improved, and the BeautifulSoup class should much more reliably yield a parse tree that looks like what the page author intended. For a particular class of odd edge cases that now causes problems, there is a new class, ICantBelieveItsBeautifulSoup. 
By default, Beautiful Soup now performs some cleanup operations on text before parsing it. This is to avoid common problems with bad definitions and self-closing tags that crash SGMLParser. You can provide your own set of cleanup operations, or turn it off altogether. The cleanup operations include fixing self-closing tags that don't close, and replacing Microsoft smart quotes and similar characters with their HTML entity equivalents. You can now get a pretty-print version of parsed HTML to get a visual picture of how Beautiful Soup parses it, with the Tag.prettify() method. == Strings and Unicode == There are separate NavigableText subclasses for ASCII and Unicode strings. These classes directly subclass the corresponding base data types. This means you can treat NavigableText objects as strings instead of having to call methods on them to get the strings. str() on a Tag always returns a string, and unicode() always returns Unicode. Previously it was inconsistent. == Tree traversal == In a first() or fetch() call, the tag name or the desired value of an attribute can now be any of the following: * A string (matches that specific tag or that specific attribute value) * A list of strings (matches any tag or attribute value in the list) * A compiled regular expression object (matches any tag or attribute value that matches the regular expression) * A callable object that takes the Tag object or attribute value as a string. It returns None/false/empty string if the given string doesn't match, and any other value if it does. This is much easier to use than SQL-style wildcards (see, regular expressions are good for something). Because of this, I took out SQL-style wildcards. I'll put them back if someone complains, but their removal simplifies the code a lot. You can use fetch() and first() to search for text in the parse tree, not just tags. There are new alias methods fetchText() and firstText() designed for this purpose. 
As with searching for tags, you can pass in a string, a regular expression object, or a method to match your text. If you pass in something besides a map to the attrs argument of fetch() or first(), Beautiful Soup will assume you want to match that thing against the "class" attribute. When you're scraping well-structured HTML, this makes your code a lot cleaner. 1.x and 2.x both let you call a Tag object as a shorthand for fetch(). For instance, foo("bar") is a shorthand for foo.fetch("bar"). In 2.x, you can also access a specially-named member of a Tag object as a shorthand for first(). For instance, foo.barTag is a shorthand for foo.first("bar"). By chaining these shortcuts you traverse a tree in very little code: for header in soup.bodyTag.pTag.tableTag('th'): If an element relationship (like parent or next) doesn't apply to a tag, it'll now show up Null instead of None. first() will also return Null if you ask it for a nonexistent tag. Null is an object that's just like None, except you can do whatever you want to it and it'll give you Null instead of throwing an error. This lets you do tree traversals like soup.htmlTag.headTag.titleTag without having to worry if the intermediate stages are actually there. Previously, if there was no 'head' tag in the document, headTag in that instance would have been None, and accessing its 'titleTag' member would have thrown an AttributeError. Now, you can get what you want when it exists, and get Null when it doesn't, without having to do a lot of conditionals checking to see if every stage is None. There are two new relations between page elements: previousSibling and nextSibling. They reference the previous and next element at the same level of the parse tree. For instance, if you have HTML like this: <p><ul><li>Foo<br /><li>Bar</ul> The first 'li' tag has a previousSibling of Null and its nextSibling is the second 'li' tag. The second 'li' tag has a nextSibling of Null and its previousSibling is the first 'li' tag. 
The previousSibling of the 'ul' tag is the first 'p' tag. The nextSibling of 'Foo' is the 'br' tag. I took out the ability to use fetch() to find tags that have a specific list of contents. See, I can't even explain it well. It was really difficult to use, I never used it, and I don't think anyone else ever used it. To the extent anyone did, they can probably use fetchText() instead. If it turns out someone needs it I'll think of another solution. == Tree manipulation == You can add new attributes to a tag, and delete attributes from a tag. In 1.x you could only change a tag's existing attributes. == Porting Considerations == There are three changes in 2.0 that break old code: In the post-1.2 release you could pass in a function into fetch(). The function took a string, the tag name. In 2.0, the function takes the actual Tag object. It's no longer possible to pass in SQL-style wildcards to fetch(). Use a regular expression instead. The different parsing algorithm means the parse tree may not be shaped like you expect. This will only actually affect you if your code uses one of the affected parts. I haven't run into this problem yet while porting my code. = Between 1.2 and 2.0 = This is the release to get if you want Python 1.5 compatibility. The desired value of an attribute can now be any of the following: * A string * A string with SQL-style wildcards * A compiled RE object * A callable that returns None/false/empty string if the given value doesn't match, and any other value otherwise. This is much easier to use than SQL-style wildcards (see, regular expressions are good for something). Because of this, I no longer recommend you use SQL-style wildcards. They may go away in a future release to clean up the code. Made Beautiful Soup handle processing instructions as text instead of ignoring them. Applied patch from Richie Hindle (richie at entrian dot com) that makes tag.string a shorthand for tag.contents[0].string when the tag has only one string-owning child.
Added still more nestable tags. The nestable tags thing won't work in a lot of cases and needs to be rethought. Fixed an edge case where searching for "%foo" would match any string shorter than "foo". = 1.2 "Who for such dainties would not stoop?" (20040708) = Applied patch from Ben Last (ben at benlast dot com) that made Tag.renderContents() correctly handle Unicode. Made BeautifulStoneSoup even dumber by making it not implicitly close a tag when another tag of the same type is encountered; only when an actual closing tag is encountered. This change courtesy of Fuzzy (mike at pcblokes dot com). BeautifulSoup still works as before. = 1.1 "Swimming in a hot tureen" = Added more 'nestable' tags. Changed popping semantics so that when a nestable tag is encountered, tags are popped up to the previously encountered nestable tag (of whatever kind). I will revert this if enough people complain, but it should make more people's lives easier than harder. This enhancement was suggested by Anthony Baxter (anthony at interlink dot com dot au). = 1.0 "So rich and green" (20040420) = Initial release.

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/PKG-INFO

Metadata-Version: 1.1 Name: beautifulsoup4 Version: 4.1.0 Summary: UNKNOWN Home-page: http://www.crummy.com/software/BeautifulSoup/bs4/ Author: Leonard Richardson Author-email: [email protected] License: MIT Download-URL: http://www.crummy.com/software/BeautifulSoup/bs4/download/ Description: Beautiful Soup sits atop an HTML or XML parser, providing Pythonic idioms for iterating, searching, and modifying the parse tree. Platform: UNKNOWN Classifier: Development Status :: 4 - Beta Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: MIT License Classifier: Programming Language :: Python Classifier: Programming Language :: Python :: 3 Classifier: Topic :: Text Processing :: Markup :: HTML Classifier: Topic :: Text Processing :: Markup :: XML Classifier: Topic :: Text Processing :: Markup :: SGML Classifier: Topic :: Software Development :: Libraries :: Python Modules

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/README.txt

= Introduction =

>>> from bs4 import BeautifulSoup
>>> soup = BeautifulSoup("<p>Some<b>bad<i>HTML")
>>> print soup.prettify()
<html>
 <body>
  <p>
   Some
   <b>
    bad
    <i>
     HTML
    </i>
   </b>
  </p>
 </body>
</html>
>>> soup.find(text="bad")
u'bad'
>>> soup.i
<i>HTML</i>
>>> soup = BeautifulSoup("<tag1>Some<tag2/>bad<tag3>XML", "xml")
>>> print soup.prettify()
<?xml version="1.0" encoding="utf-8"?>
<tag1>
 Some
 <tag2/>
 bad
 <tag3>
  XML
 </tag3>
</tag1>

= Full documentation = The bs4/doc/ directory contains full documentation in Sphinx format. Run "make html" in that directory to create HTML documentation. = Running the unit tests = Beautiful Soup supports unit test discovery from the project root directory: $ nosetests $ python -m unittest discover -s bs4 # Python 2.7 and up If you checked out the source tree, you should see a script in the home directory called test-all-versions. This script will run the unit tests under Python 2.7, then create a temporary Python 3 conversion of the source and run the unit tests again under Python 3. = Links = Homepage: http://www.crummy.com/software/BeautifulSoup/bs4/ Documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/ http://readthedocs.org/docs/beautiful-soup-4/ Discussion group: http://groups.google.com/group/beautifulsoup/ Development: https://code.launchpad.net/beautifulsoup/ Bug tracker: https://bugs.launchpad.net/beautifulsoup/

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/scripts/demonstrate_parser_differences.py

"""Demonstrate how different parsers parse the same markup. Beautiful Soup can use any of a number of different parsers. Every parser should behave more or less the same on valid markup, and Beautiful Soup's unit tests make sure this is the case. But every parser handles invalid markup differently. Even different versions of the same parser handle invalid markup differently. So instead of unit tests I've created this educational demonstration script. The file demonstration_markup.txt contains many lines of HTML. This script tests each line of markup against every parser you have installed, and prints out how each parser sees that markup. This may help you choose a parser, or understand why Beautiful Soup presents your document the way it does. """ import os import sys from bs4 import BeautifulSoup parsers = ['html.parser'] try: from bs4.builder import _lxml parsers.append('lxml') except ImportError, e: pass try: from bs4.builder import _html5lib parsers.append('html5lib') except ImportError, e: pass class Demonstration(object): def __init__(self, markup): self.results = {} self.markup = markup def run_against(self, *parser_names): uniform_results = True previous_output = None for parser in parser_names: try: soup = BeautifulSoup(self.markup, parser) if markup.startswith("<div>"): # Extract the interesting part output = soup.div else: output = soup except Exception, e: output = "[EXCEPTION] %s" % str(e) self.results[parser] = output if previous_output is None: previous_output = output elif previous_output != output: uniform_results = False return uniform_results def dump(self): print "%s: %s" % ("Markup".rjust(13), self.markup.encode("utf8")) for parser, output in self.results.items(): print "%s: %s" % (parser.rjust(13), output.encode("utf8")) different_results = [] uniform_results = [] print "= Testing the following parsers: %s =" % ", ".join(parsers) print input_file = sys.stdin if sys.stdin.isatty(): for filename in [ "demonstration_markup.txt", 
os.path.join("scripts", "demonstration_markup.txt")]: if os.path.exists(filename): input_file = open(filename) for markup in input_file: demo = Demonstration(markup.decode("utf8").strip().replace("\\n", "\n")) is_uniform = demo.run_against(*parsers) if is_uniform: uniform_results.append(demo) else: different_results.append(demo) print "== Markup that's handled the same in every parser ==" print for demo in uniform_results: demo.dump() print print "== Markup that's not handled the same in every parser ==" print for demo in different_results: demo.dump() print

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/scripts/demonstration_markup.txt

A bare string <!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd"> <!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd"> <div><![CDATA[A CDATA section where it doesn't belong]]></div> <div><svg><![CDATA[HTML5 does allow CDATA sections in SVG]]></svg></div> <div>A <meta> tag</div> <div>A <br> tag that supposedly has contents.</br></div> <div>AT&T</div> <div><textarea>Within a textarea, markup like <b> tags and <&<&amp; should be treated as literal</textarea></div> <div><script>if (i < 2) { alert("<b>Markup within script tags should be treated as literal.</b>"); }</script></div> <div>This numeric entity is missing the final semicolon: <x t="pi&#241ata"></div> <div><a href="http://example.com/</a> that attribute value never got closed</div> <div><a href="foo</a>, </a><a href="bar">that attribute value was closed by the subsequent tag</a></div> <! This document starts with a bogus declaration ><div>a</div> <div>This document contains <!an incomplete declaration <div>(do you see it?)</div> <div>This document ends with <!an incomplete declaration <div><a style={height:21px;}>That attribute value was bogus</a></div> <! 
DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">The doctype is invalid because it contains extra whitespace <div><table><td nowrap>That boolean attribute had no value</td></table></div> <div>Here's a nonexistent entity: &#foo; (do you see it?)</div> <div>This document ends before the entity finishes: &gt <div><p>Paragraphs shouldn't contain block display elements, but this one does: <dl><dt>you see?</dt></p> <b b="20" a="1" b="10" a="2" a="3" a="4">Multiple values for the same attribute.</b> <div><table><tr><td>Here's a table</td></tr></table></div> <div><table id="1"><tr><td>Here's a nested table:<table id="2"><tr><td>foo</td></tr></table></td></div> <div>This tag contains nothing but whitespace: <b> </b></div> <div><blockquote><p><b>This p tag is cut off by</blockquote></p>the end of the blockquote tag</div> <div><table><div>This table contains bare markup</div></table></div> <div><div id="1">\n <a href="link1">This link is never closed.\n</div>\n<div id="2">\n <div id="3">\n <a href="link2">This link is closed.</a>\n </div>\n</div></div> <div>This document contains a <!DOCTYPE surprise>surprise doctype</div> <div><a><B><Cd><EFG>Mixed case tags are folded to lowercase</efg></CD></b></A></div> <div><our☃>Tag name contains Unicode characters</our☃></div> <div><a ☃="snowman">Attribute name contains Unicode characters</a></div> <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/setup.py

"""Distutils build/install script for the beautifulsoup4 package."""
from distutils.core import setup

try:
    # Prefer the build command that converts the sources with 2to3.
    from distutils.command.build_py import build_py_2to3 as build_py
except ImportError:
    # 2.x
    from distutils.command.build_py import build_py

_LONG_DESCRIPTION = """Beautiful Soup sits atop an HTML or XML parser, providing Pythonic idioms for iterating, searching, and modifying the parse tree."""

_CLASSIFIERS = [
    "Development Status :: 4 - Beta",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python",
    'Programming Language :: Python :: 3',
    "Topic :: Text Processing :: Markup :: HTML",
    "Topic :: Text Processing :: Markup :: XML",
    "Topic :: Text Processing :: Markup :: SGML",
    "Topic :: Software Development :: Libraries :: Python Modules",
]

setup(
    name="beautifulsoup4",
    version="4.1.0",
    author="Leonard Richardson",
    author_email='[email protected]',
    url="http://www.crummy.com/software/BeautifulSoup/bs4/",
    download_url="http://www.crummy.com/software/BeautifulSoup/bs4/download/",
    long_description=_LONG_DESCRIPTION,
    license="MIT",
    packages=['bs4', 'bs4.builder', 'bs4.tests'],
    cmdclass={'build_py': build_py},
    classifiers=_CLASSIFIERS,
)

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0/TODO.txt

Additions --------- More of the jQuery API: nextUntil? Optimizations ------------- The html5lib tree builder doesn't use the standard tree-building API, which worries me and has resulted in a number of bugs. markup_attr_map can be optimized since it's always a map now. Upon encountering UTF-16LE data or some other uncommon serialization of Unicode, UnicodeDammit will convert the data to Unicode, then encode it as UTF-8. This is wasteful because it will just get decoded back to Unicode. CDATA ----- The elementtree XMLParser has a strip_cdata argument that, when set to False, should allow Beautiful Soup to preserve CDATA sections instead of treating them as text. Except it doesn't. (This argument is also present for HTMLParser, and also does nothing there.) Currently, html5lib converts CDATA sections into comments. An as-yet-unreleased version of html5lib changes the parser's handling of CDATA sections to allow CDATA sections in tags like <svg> and <math>. The HTML5TreeBuilder will need to be updated to create CData objects instead of Comment objects in this situation.

beautifulsoup4-4.1.0.tar/dist/beautifulsoup4-4.1.0.tar

beautifulsoup4-4.1.0/README.txt

= Introduction =

>>> from bs4 import BeautifulSoup
>>> soup = BeautifulSoup("<p>Some<b>bad<i>HTML")
>>> print soup.prettify()
<html>
 <body>
  <p>
   Some
   <b>
    bad
    <i>
     HTML
    </i>
   </b>
  </p>
 </body>
</html>
>>> soup.find(text="bad")
u'bad'
>>> soup.i
<i>HTML</i>
>>> soup = BeautifulSoup("<tag1>Some<tag2/>bad<tag3>XML", "xml")
>>> print soup.prettify()
<?xml version="1.0" encoding="utf-8"?>
<tag1>
 Some
 <tag2/>
 bad
 <tag3>
  XML
 </tag3>
</tag1>

= Full documentation = The bs4/doc/ directory contains full documentation in Sphinx format. Run "make html" in that directory to create HTML documentation. = Running the unit tests = Beautiful Soup supports unit test discovery from the project root directory: $ nosetests $ python -m unittest discover -s bs4 # Python 2.7 and up If you checked out the source tree, you should see a script in the home directory called test-all-versions. This script will run the unit tests under Python 2.7, then create a temporary Python 3 conversion of the source and run the unit tests again under Python 3. = Links = Homepage: http://www.crummy.com/software/BeautifulSoup/bs4/ Documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/ http://readthedocs.org/docs/beautiful-soup-4/ Discussion group: http://groups.google.com/group/beautifulsoup/ Development: https://code.launchpad.net/beautifulsoup/ Bug tracker: https://bugs.launchpad.net/beautifulsoup/

beautifulsoup4-4.1.0/bs4/__init__.py

"""Beautiful Soup Elixir and Tonic "The Screen-Scraper's Friend" http://www.crummy.com/software/BeautifulSoup/ Beautiful Soup uses a pluggable XML or HTML parser to parse a (possibly invalid) document into a tree representation. Beautiful Soup provides provides methods and Pythonic idioms that make it easy to navigate, search, and modify the parse tree. Beautiful Soup works with Python 2.6 and up. It works better if lxml and/or html5lib is installed. For more than you ever wanted to know about Beautiful Soup, see the documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/ """ __author__ = "Leonard Richardson ([email protected])" __version__ = "4.1.0" __copyright__ = "Copyright (c) 2004-2012 Leonard Richardson" __license__ = "MIT" __all__ = ['BeautifulSoup'] import re import warnings from .builder import builder_registry from .dammit import UnicodeDammit from .element import ( CData, Comment, DEFAULT_OUTPUT_ENCODING, Declaration, Doctype, NavigableString, PageElement, ProcessingInstruction, ResultSet, SoupStrainer, Tag, ) # The very first thing we do is give a useful error if someone is # running this code under Python 3 without converting it. syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' class BeautifulSoup(Tag): """ This class defines the basic interface called by the tree builders. 
These methods will be called by the parser: reset() feed(markup) The tree builder may call these methods from its feed() implementation: handle_starttag(name, attrs) # See note about return value handle_endtag(name) handle_data(data) # Appends to the current data node endData(containerClass=NavigableString) # Ends the current data node No matter how complicated the underlying parser is, you should be able to build a tree using 'start tag' events, 'end tag' events, 'data' events, and "done with data" events. If you encounter an empty-element tag (aka a self-closing tag, like HTML's <br> tag), call handle_starttag and then handle_endtag. """ ROOT_TAG_NAME = u'[document]' # If the end-user gives no indication which tree builder they # want, look for one with these features. DEFAULT_BUILDER_FEATURES = ['html', 'fast'] # Used when determining whether a text node is all whitespace and # can be replaced with a single space. A text node that contains # fancy Unicode spaces (usually non-breaking) should be left # alone. STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, } def __init__(self, markup="", features=None, builder=None, parse_only=None, from_encoding=None, **kwargs): """The Soup object is initialized as the 'root tag', and the provided markup (which can be a string or a file-like object) is fed into the underlying parser.""" if 'convertEntities' in kwargs: warnings.warn( "BS4 does not respect the convertEntities argument to the " "BeautifulSoup constructor. Entities are always converted " "to Unicode characters.") if 'markupMassage' in kwargs: del kwargs['markupMassage'] warnings.warn( "BS4 does not respect the markupMassage argument to the " "BeautifulSoup constructor. The tree builder is responsible " "for any necessary markup massage.") if 'smartQuotesTo' in kwargs: del kwargs['smartQuotesTo'] warnings.warn( "BS4 does not respect the smartQuotesTo argument to the " "BeautifulSoup constructor. 
Smart quotes are always converted " "to Unicode characters.") if 'selfClosingTags' in kwargs: del kwargs['selfClosingTags'] warnings.warn( "BS4 does not respect the selfClosingTags argument to the " "BeautifulSoup constructor. The tree builder is responsible " "for understanding self-closing tags.") if 'isHTML' in kwargs: del kwargs['isHTML'] warnings.warn( "BS4 does not respect the isHTML argument to the " "BeautifulSoup constructor. You can pass in features='html' " "or features='xml' to get a builder capable of handling " "one or the other.") def deprecated_argument(old_name, new_name): if old_name in kwargs: warnings.warn( 'The "%s" argument to the BeautifulSoup constructor ' 'has been renamed to "%s."' % (old_name, new_name)) value = kwargs[old_name] del kwargs[old_name] return value return None parse_only = parse_only or deprecated_argument( "parseOnlyThese", "parse_only") from_encoding = from_encoding or deprecated_argument( "fromEncoding", "from_encoding") if len(kwargs) > 0: arg = kwargs.keys().pop() raise TypeError( "__init__() got an unexpected keyword argument '%s'" % arg) if builder is None: if isinstance(features, basestring): features = [features] if features is None or len(features) == 0: features = self.DEFAULT_BUILDER_FEATURES builder_class = builder_registry.lookup(*features) if builder_class is None: raise ValueError( "Couldn't find a tree builder with the features you " "requested: %s. Do you need to install a parser library?" % ",".join(features)) builder = builder_class() self.builder = builder self.is_xml = builder.is_xml self.builder.soup = self self.parse_only = parse_only self.reset() if hasattr(markup, 'read'): # It's a file-type object. 
markup = markup.read() (self.markup, self.original_encoding, self.declared_html_encoding, self.contains_replacement_characters) = ( self.builder.prepare_markup(markup, from_encoding)) try: self._feed() except StopParsing: pass # Clear out the markup and remove the builder's circular # reference to this object. self.markup = None self.builder.soup = None def _feed(self): # Convert the document to Unicode. self.builder.reset() self.builder.feed(self.markup) # Close out any unfinished strings and close all the open tags. self.endData() while self.currentTag.name != self.ROOT_TAG_NAME: self.popTag() def reset(self): Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME) self.hidden = 1 self.builder.reset() self.currentData = [] self.currentTag = None self.tagStack = [] self.pushTag(self) def new_tag(self, name, namespace=None, nsprefix=None, **attrs): """Create a new tag associated with this soup.""" return Tag(None, self.builder, name, namespace, nsprefix, attrs) def new_string(self, s): """Create a new NavigableString associated with this soup.""" navigable = NavigableString(s) navigable.setup() return navigable def insert_before(self, successor): raise ValueError("BeautifulSoup objects don't support insert_before().") def insert_after(self, successor): raise ValueError("BeautifulSoup objects don't support insert_after().") def popTag(self): tag = self.tagStack.pop() #print "Pop", tag.name if self.tagStack: self.currentTag = self.tagStack[-1] return self.currentTag def pushTag(self, tag): #print "Push", tag.name if self.currentTag: self.currentTag.contents.append(tag) self.tagStack.append(tag) self.currentTag = self.tagStack[-1] def endData(self, containerClass=NavigableString): if self.currentData: currentData = u''.join(self.currentData) if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and not set([tag.name for tag in self.tagStack]).intersection( self.builder.preserve_whitespace_tags)): if '\n' in currentData: currentData = '\n' else: currentData = ' ' 
self.currentData = [] if self.parse_only and len(self.tagStack) <= 1 and \ (not self.parse_only.text or \ not self.parse_only.search(currentData)): return o = containerClass(currentData) self.object_was_parsed(o) def object_was_parsed(self, o): """Add an object to the parse tree.""" o.setup(self.currentTag, self.previous_element) if self.previous_element: self.previous_element.next_element = o self.previous_element = o self.currentTag.contents.append(o) def _popToTag(self, name, nsprefix=None, inclusivePop=True): """Pops the tag stack up to and including the most recent instance of the given tag. If inclusivePop is false, pops the tag stack up to but *not* including the most recent instqance of the given tag.""" #print "Popping to %s" % name if name == self.ROOT_TAG_NAME: return numPops = 0 mostRecentTag = None for i in range(len(self.tagStack) - 1, 0, -1): if (name == self.tagStack[i].name and nsprefix == self.tagStack[i].nsprefix == nsprefix): numPops = len(self.tagStack) - i break if not inclusivePop: numPops = numPops - 1 for i in range(0, numPops): mostRecentTag = self.popTag() return mostRecentTag def handle_starttag(self, name, namespace, nsprefix, attrs): """Push a start tag on to the stack. If this method returns None, the tag was rejected by the SoupStrainer. You should proceed as if the tag had not occured in the document. For instance, if this was a self-closing tag, don't call handle_endtag. 
""" # print "Start tag %s: %s" % (name, attrs) self.endData() if (self.parse_only and len(self.tagStack) <= 1 and (self.parse_only.text or not self.parse_only.search_tag(name, attrs))): return None tag = Tag(self, self.builder, name, namespace, nsprefix, attrs, self.currentTag, self.previous_element) if tag is None: return tag if self.previous_element: self.previous_element.next_element = tag self.previous_element = tag self.pushTag(tag) return tag def handle_endtag(self, name, nsprefix=None): #print "End tag: " + name self.endData() self._popToTag(name, nsprefix) def handle_data(self, data): self.currentData.append(data) def decode(self, pretty_print=False, eventual_encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"): """Returns a string or Unicode representation of this document. To get Unicode, pass None for encoding.""" if self.is_xml: # Print the XML declaration encoding_part = '' if eventual_encoding != None: encoding_part = ' encoding="%s"' % eventual_encoding prefix = u'<?xml version="1.0"%s?>\n' % encoding_part else: prefix = u'' if not pretty_print: indent_level = None else: indent_level = 0 return prefix + super(BeautifulSoup, self).decode( indent_level, eventual_encoding, formatter) class BeautifulStoneSoup(BeautifulSoup): """Deprecated interface to an XML parser.""" def __init__(self, *args, **kwargs): kwargs['features'] = 'xml' warnings.warn( 'The BeautifulStoneSoup class is deprecated. Instead of using ' 'it, pass features="xml" into the BeautifulSoup constructor.') super(BeautifulStoneSoup, self).__init__(*args, **kwargs) class StopParsing(Exception): pass #By default, act as an HTML pretty-printer. if __name__ == '__main__': import sys soup = BeautifulSoup(sys.stdin) print soup.prettify()

beautifulsoup4-4.1.0/bs4/element.py

import collections
import re
import sys
import warnings
from bs4.dammit import EntitySubstitution

DEFAULT_OUTPUT_ENCODING = "utf-8"
PY3K = (sys.version_info[0] > 2)

whitespace_re = re.compile(r"\s+")


def _alias(attr):
    """Alias one attribute name to another for backward compatibility

    Returns a property that reads from and writes to the attribute
    named `attr` on the same object.
    """
    @property
    def alias(self):
        return getattr(self, attr)

    @alias.setter
    def alias(self, value):
        # A property setter receives the assigned value; forward it to
        # the real attribute.
        setattr(self, attr, value)
    return alias


class NamespacedAttribute(unicode):
    """A unicode string of the form "prefix:name" (or just "prefix"
    when `name` is None) that also records its `prefix`, `name` and
    `namespace` as attributes."""

    def __new__(cls, prefix, name, namespace=None):
        if name is None:
            obj = unicode.__new__(cls, prefix)
        else:
            obj = unicode.__new__(cls, prefix + ":" + name)
        obj.prefix = prefix
        obj.name = name
        obj.namespace = namespace
        return obj


class AttributeValueWithCharsetSubstitution(unicode):
    """A stand-in object for a character encoding specified in HTML."""


class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """A generic stand-in for the value of a meta tag's 'charset' attribute.

    When Beautiful Soup parses the markup '<meta charset="utf8">', the
    value of the 'charset' attribute will be one of these objects.
    """

    def __new__(cls, original_value):
        obj = unicode.__new__(cls, original_value)
        obj.original_value = original_value
        return obj

    def encode(self, encoding):
        """Return `encoding` itself in place of the stored value."""
        return encoding


class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """A generic stand-in for the value of a meta tag's 'content' attribute.

    When Beautiful Soup parses the markup:
     <meta http-equiv="content-type" content="text/html; charset=utf8">

    The value of the 'content' attribute will be one of these objects.
    """

    # Group 1 is everything up to and including "charset=", group 3 is
    # the charset name that encode() replaces.
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value):
        match = cls.CHARSET_RE.search(original_value)
        if match is None:
            # No substitution necessary.
            return unicode.__new__(unicode, original_value)

        obj = unicode.__new__(cls, original_value)
        obj.original_value = original_value
        return obj

    def encode(self, encoding):
        """Return the original value with its charset name replaced by
        `encoding`."""
        def rewrite(match):
            return match.group(1) + encoding
        return self.CHARSET_RE.sub(rewrite, self.original_value)


class PageElement(object):
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text)"""

    # There are five possible values for the "formatter" argument passed in
    # to methods like encode() and prettify():
    #
    # "html" - All Unicode characters with corresponding HTML entities
    #   are converted to those entities on output.
    # "minimal" - Bare ampersands and angle brackets are converted to
    #   XML entities: &amp; &lt; &gt;
    # None - The null formatter. Unicode characters are never
    #   converted to entities.  This is not recommended, but it's
    #   faster than "minimal".
    # A function - This function will be called on every string that
    #  needs to undergo entity substitution
    FORMATTERS = {
        "html" : EntitySubstitution.substitute_html,
        "minimal" : EntitySubstitution.substitute_xml,
        None : None
        }

    @classmethod
    def format_string(cls, s, formatter='minimal'):
        """Format the given string using the given formatter.

        `formatter` is either a callable or a key of FORMATTERS; an
        unknown key falls back to XML entity substitution. A formatter
        of None returns `s` unchanged.
        """
        if not callable(formatter):
            formatter = cls.FORMATTERS.get(
                formatter, EntitySubstitution.substitute_xml)
        if formatter is None:
            output = s
        else:
            output = formatter(s)
        return output

    def setup(self, parent=None, previous_element=None):
        """Sets up the initial relations between this element and
        other elements."""
        self.parent = parent
        self.previous_element = previous_element
        if previous_element is not None:
            self.previous_element.next_element = self
        self.next_element = None
        self.previous_sibling = None
        self.next_sibling = None
        # The parent's current last child, if any, becomes our previous
        # sibling.
        if self.parent is not None and self.parent.contents:
            self.previous_sibling = self.parent.contents[-1]
            self.previous_sibling.next_sibling = self

    nextSibling = _alias("next_sibling")  # BS3
    previousSibling = _alias("previous_sibling")  # BS3

    def replace_with(self, replace_with):
        """Put `replace_with` where this element currently sits in its
        parent and detach this element from the tree.

        Returns this (now extracted) element, or None when asked to
        replace an element with itself. Raises ValueError if this
        element has no parent or if `replace_with` is its parent.
        """
        if replace_with is self:
            return
        if self.parent is None:
            raise ValueError(
                "Cannot replace one element with another when the "
                "element to be replaced is not part of a tree.")
        if replace_with is self.parent:
            raise ValueError("Cannot replace a Tag with its parent.")
        old_parent = self.parent
        my_index = self.parent.index(self)
        self.extract()
        old_parent.insert(my_index, replace_with)
        return self
    replaceWith = replace_with  # BS3

    def unwrap(self):
        """Replace this element with its own children and return the
        extracted element.

        Raises ValueError if this element has no parent.
        """
        if self.parent is None:
            raise ValueError(
                "Cannot replace an element with its contents when that "
                "element is not part of a tree.")
        my_parent = self.parent
        my_index = self.parent.index(self)
        self.extract()
        # Insert back-to-front at a fixed index so the children keep
        # their original order; iterate over a copy because insert()
        # removes each child from self.contents.
        for child in reversed(self.contents[:]):
            my_parent.insert(my_index, child)
        return self
    replace_with_children = unwrap
    replaceWithChildren = unwrap  # BS3

    def wrap(self, wrap_inside):
        """Put `wrap_inside` in this element's place, append this
        element to it, and return `wrap_inside`."""
        me = self.replace_with(wrap_inside)
        wrap_inside.append(me)
        return wrap_inside

    def extract(self):
        """Destructively rips this element out of the tree."""
        if self.parent is not None:
            del self.parent.contents[self.parent.index(self)]

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        last_child = self._last_descendant()
        next_element = last_child.next_element

        if self.previous_element is not None:
            self.previous_element.next_element = next_element
        if next_element is not None:
            next_element.previous_element = self.previous_element
        self.previous_element = None
        last_child.next_element = None

        self.parent = None
        if self.previous_sibling is not None:
            self.previous_sibling.next_sibling = self.next_sibling
        if self.next_sibling is not None:
            self.next_sibling.previous_sibling = self.previous_sibling
        self.previous_sibling = self.next_sibling = None
        return self

    def _last_descendant(self):
        "Finds the last element beneath this object to be parsed."
        last_child = self
        while hasattr(last_child, 'contents') and last_child.contents:
            last_child = last_child.contents[-1]
        return last_child
    # BS3: Not part of the API!
_lastRecursiveChild = _last_descendant def insert(self, position, new_child): if new_child is self: raise ValueError("Cannot insert a tag into itself.") if (isinstance(new_child, basestring) and not isinstance(new_child, NavigableString)): new_child = NavigableString(new_child) position = min(position, len(self.contents)) if hasattr(new_child, 'parent') and new_child.parent is not None: # We're 'inserting' an element that's already one # of this object's children. if new_child.parent is self: current_index = self.index(new_child) if current_index < position: # We're moving this element further down the list # of this object's children. That means that when # we extract this element, our target index will # jump down one. position -= 1 new_child.extract() new_child.parent = self previous_child = None if position == 0: new_child.previous_sibling = None new_child.previous_element = self else: previous_child = self.contents[position - 1] new_child.previous_sibling = previous_child new_child.previous_sibling.next_sibling = new_child new_child.previous_element = previous_child._last_descendant() if new_child.previous_element is not None: new_child.previous_element.next_element = new_child new_childs_last_element = new_child._last_descendant() if position >= len(self.contents): new_child.next_sibling = None parent = self parents_next_sibling = None while parents_next_sibling is None and parent is not None: parents_next_sibling = parent.next_sibling parent = parent.parent if parents_next_sibling is not None: # We found the element that comes next in the document. break if parents_next_sibling is not None: new_childs_last_element.next_element = parents_next_sibling else: # The last element of this tag is the last element in # the document. 
new_childs_last_element.next_element = None else: next_child = self.contents[position] new_child.next_sibling = next_child if new_child.next_sibling is not None: new_child.next_sibling.previous_sibling = new_child new_childs_last_element.next_element = next_child if new_childs_last_element.next_element is not None: new_childs_last_element.next_element.previous_element = new_childs_last_element self.contents.insert(position, new_child) def append(self, tag): """Appends the given tag to the contents of this tag.""" self.insert(len(self.contents), tag) def insert_before(self, predecessor): """Makes the given element the immediate predecessor of this one. The two elements will have the same parent, and the given element will be immediately before this one. """ if self is predecessor: raise ValueError("Can't insert an element before itself.") parent = self.parent if parent is None: raise ValueError( "Element has no parent, so 'before' has no meaning.") # Extract first so that the index won't be screwed up if they # are siblings. if isinstance(predecessor, PageElement): predecessor.extract() index = parent.index(self) parent.insert(index, predecessor) def insert_after(self, successor): """Makes the given element the immediate successor of this one. The two elements will have the same parent, and the given element will be immediately after this one. """ if self is successor: raise ValueError("Can't insert an element after itself.") parent = self.parent if parent is None: raise ValueError( "Element has no parent, so 'after' has no meaning.") # Extract first so that the index won't be screwed up if they # are siblings. 
if isinstance(successor, PageElement): successor.extract() index = parent.index(self) parent.insert(index+1, successor) def find_next(self, name=None, attrs={}, text=None, **kwargs): """Returns the first item that matches the given criteria and appears after this Tag in the document.""" return self._find_one(self.find_all_next, name, attrs, text, **kwargs) findNext = find_next # BS3 def find_all_next(self, name=None, attrs={}, text=None, limit=None, **kwargs): """Returns all items that match the given criteria and appear after this Tag in the document.""" return self._find_all(name, attrs, text, limit, self.next_elements, **kwargs) findAllNext = find_all_next # BS3 def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs): """Returns the closest sibling to this Tag that matches the given criteria and appears after this Tag in the document.""" return self._find_one(self.find_next_siblings, name, attrs, text, **kwargs) findNextSibling = find_next_sibling # BS3 def find_next_siblings(self, name=None, attrs={}, text=None, limit=None, **kwargs): """Returns the siblings of this Tag that match the given criteria and appear after this Tag in the document.""" return self._find_all(name, attrs, text, limit, self.next_siblings, **kwargs) findNextSiblings = find_next_siblings # BS3 fetchNextSiblings = find_next_siblings # BS2 def find_previous(self, name=None, attrs={}, text=None, **kwargs): """Returns the first item that matches the given criteria and appears before this Tag in the document.""" return self._find_one( self.find_all_previous, name, attrs, text, **kwargs) findPrevious = find_previous # BS3 def find_all_previous(self, name=None, attrs={}, text=None, limit=None, **kwargs): """Returns all items that match the given criteria and appear before this Tag in the document.""" return self._find_all(name, attrs, text, limit, self.previous_elements, **kwargs) findAllPrevious = find_all_previous # BS3 fetchPrevious = find_all_previous # BS2 def 
find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs): """Returns the closest sibling to this Tag that matches the given criteria and appears before this Tag in the document.""" return self._find_one(self.find_previous_siblings, name, attrs, text, **kwargs) findPreviousSibling = find_previous_sibling # BS3 def find_previous_siblings(self, name=None, attrs={}, text=None, limit=None, **kwargs): """Returns the siblings of this Tag that match the given criteria and appear before this Tag in the document.""" return self._find_all(name, attrs, text, limit, self.previous_siblings, **kwargs) findPreviousSiblings = find_previous_siblings # BS3 fetchPreviousSiblings = find_previous_siblings # BS2 def find_parent(self, name=None, attrs={}, **kwargs): """Returns the closest parent of this Tag that matches the given criteria.""" # NOTE: We can't use _find_one because findParents takes a different # set of arguments. r = None l = self.find_parents(name, attrs, 1) if l: r = l[0] return r findParent = find_parent # BS3 def find_parents(self, name=None, attrs={}, limit=None, **kwargs): """Returns the parents of this Tag that match the given criteria.""" return self._find_all(name, attrs, None, limit, self.parents, **kwargs) findParents = find_parents # BS3 fetchParents = find_parents # BS2 @property def next(self): return self.next_element @property def previous(self): return self.previous_element #These methods do the real heavy lifting. def _find_one(self, method, name, attrs, text, **kwargs): r = None l = method(name, attrs, text, 1, **kwargs) if l: r = l[0] return r def _find_all(self, name, attrs, text, limit, generator, **kwargs): "Iterates over a generator looking for things that match." if isinstance(name, SoupStrainer): strainer = name elif text is None and not limit and not attrs and not kwargs: # Optimization to find all tags. 
if name is True or name is None: return [element for element in generator if isinstance(element, Tag)] # Optimization to find all tags with a given name. elif isinstance(name, basestring): return [element for element in generator if isinstance(element, Tag) and element.name == name] else: strainer = SoupStrainer(name, attrs, text, **kwargs) else: # Build a SoupStrainer strainer = SoupStrainer(name, attrs, text, **kwargs) results = ResultSet(strainer) while True: try: i = next(generator) except StopIteration: break if i: found = strainer.search(i) if found: results.append(found) if limit and len(results) >= limit: break return results #These generators can be used to navigate starting from both #NavigableStrings and Tags. @property def next_elements(self): i = self.next_element while i is not None: yield i i = i.next_element @property def next_siblings(self): i = self.next_sibling while i is not None: yield i i = i.next_sibling @property def previous_elements(self): i = self.previous_element while i is not None: yield i i = i.previous_element @property def previous_siblings(self): i = self.previous_sibling while i is not None: yield i i = i.previous_sibling @property def parents(self): i = self.parent while i is not None: yield i i = i.parent # Methods for supporting CSS selectors. tag_name_re = re.compile('^[a-z0-9]+$') # /^(\w+)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/ # \---/ \---/\-------------/ \-------/ # | | | | # | | | The value # | | ~,|,^,$,* or = # | Attribute # Tag attribselect_re = re.compile( r'^(?P<tag>\w+)?\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)' + r'=?"?(?P<value>[^\]"]*)"?\]$' ) def _attr_value_as_string(self, value, default=None): """Force an attribute value into a string representation. A multi-valued attribute will be converted into a space-separated stirng. 
""" value = self.get(value, default) if isinstance(value, list) or isinstance(value, tuple): value =" ".join(value) return value def _attribute_checker(self, operator, attribute, value=''): """Create a function that performs a CSS selector operation. Takes an operator, attribute and optional value. Returns a function that will return True for elements that match that combination. """ if operator == '=': # string representation of `attribute` is equal to `value` return lambda el: el._attr_value_as_string(attribute) == value elif operator == '~': # space-separated list representation of `attribute` # contains `value` def _includes_value(element): attribute_value = element.get(attribute, []) if not isinstance(attribute_value, list): attribute_value = attribute_value.split() return value in attribute_value return _includes_value elif operator == '^': # string representation of `attribute` starts with `value` return lambda el: el._attr_value_as_string( attribute, '').startswith(value) elif operator == '$': # string represenation of `attribute` ends with `value` return lambda el: el._attr_value_as_string( attribute, '').endswith(value) elif operator == '*': # string representation of `attribute` contains `value` return lambda el: value in el._attr_value_as_string(attribute, '') elif operator == '|': # string representation of `attribute` is either exactly # `value` or starts with `value` and then a dash. def _is_or_starts_with_dash(element): attribute_value = element._attr_value_as_string(attribute, '') return (attribute_value == value or attribute_value.startswith( value + '-')) return _is_or_starts_with_dash else: return lambda el: el.has_attr(attribute) def select(self, selector): """Perform a CSS selection operation on the current element.""" tokens = selector.split() current_context = [self] for index, token in enumerate(tokens): if tokens[index - 1] == '>': # already found direct descendants in last step. skip this # step. 
continue m = self.attribselect_re.match(token) if m is not None: # Attribute selector tag, attribute, operator, value = m.groups() if not tag: tag = True checker = self._attribute_checker(operator, attribute, value) found = [] for context in current_context: found.extend( [el for el in context.find_all(tag) if checker(el)]) current_context = found continue if '#' in token: # ID selector tag, id = token.split('#', 1) if tag == "": tag = True el = current_context[0].find(tag, {'id': id}) if el is None: return [] # No match current_context = [el] continue if '.' in token: # Class selector tag_name, klass = token.split('.', 1) if not tag_name: tag_name = True classes = set(klass.split('.')) found = [] def classes_match(tag): if tag_name is not True and tag.name != tag_name: return False if not tag.has_attr('class'): return False return classes.issubset(tag['class']) for context in current_context: found.extend(context.find_all(classes_match)) current_context = found continue if token == '*': # Star selector found = [] for context in current_context: found.extend(context.findAll(True)) current_context = found continue if token == '>': # Child selector tag = tokens[index + 1] if not tag: tag = True found = [] for context in current_context: found.extend(context.find_all(tag, recursive=False)) current_context = found continue # Here we should just have a regular tag if not self.tag_name_re.match(token): return [] found = [] for context in current_context: found.extend(context.findAll(token)) current_context = found return current_context # Old non-property versions of the generators, for backwards # compatibility with BS3. 
def nextGenerator(self): return self.next_elements def nextSiblingGenerator(self): return self.next_siblings def previousGenerator(self): return self.previous_elements def previousSiblingGenerator(self): return self.previous_siblings def parentGenerator(self): return self.parents class NavigableString(unicode, PageElement): PREFIX = '' SUFFIX = '' def __new__(cls, value): """Create a new NavigableString. When unpickling a NavigableString, this method is called with the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be passed in to the superclass's __new__ or the superclass won't know how to handle non-ASCII characters. """ if isinstance(value, unicode): return unicode.__new__(cls, value) return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) def __getnewargs__(self): return (unicode(self),) def __getattr__(self, attr): """text.string gives you text. This is for backwards compatibility for Navigable*String, but for CData* it lets you get the string without the CData wrapper.""" if attr == 'string': return self else: raise AttributeError( "'%s' object has no attribute '%s'" % ( self.__class__.__name__, attr)) def output_ready(self, formatter="minimal"): output = self.format_string(self, formatter) return self.PREFIX + output + self.SUFFIX class PreformattedString(NavigableString): """A NavigableString not subject to the normal formatting rules. The string will be passed into the formatter (to trigger side effects), but the return value will be ignored. """ def output_ready(self, formatter="minimal"): """CData strings are passed into the formatter. But the return value is ignored.""" self.format_string(self, formatter) return self.PREFIX + self + self.SUFFIX class CData(PreformattedString): PREFIX = u'<![CDATA[' SUFFIX = u']]>' class ProcessingInstruction(PreformattedString): PREFIX = u'<?' SUFFIX = u'?>' class Comment(PreformattedString): PREFIX = u'<!--' SUFFIX = u'-->' class Declaration(PreformattedString): PREFIX = u'<!' 
SUFFIX = u'!>' class Doctype(PreformattedString): @classmethod def for_name_and_ids(cls, name, pub_id, system_id): value = name if pub_id is not None: value += ' PUBLIC "%s"' % pub_id if system_id is not None: value += ' "%s"' % system_id elif system_id is not None: value += ' SYSTEM "%s"' % system_id return Doctype(value) PREFIX = u'<!DOCTYPE ' SUFFIX = u'>\n' class Tag(PageElement): """Represents a found HTML tag with its attributes and contents.""" def __init__(self, parser=None, builder=None, name=None, namespace=None, prefix=None, attrs=None, parent=None, previous=None): "Basic constructor." if parser is None: self.parser_class = None else: # We don't actually store the parser object: that lets extracted # chunks be garbage-collected. self.parser_class = parser.__class__ if name is None: raise ValueError("No value provided for new tag's name.") self.name = name self.namespace = namespace self.prefix = prefix if attrs is None: attrs = {} elif builder.cdata_list_attributes: attrs = builder._replace_cdata_list_attribute_values( self.name, attrs) else: attrs = dict(attrs) self.attrs = attrs self.contents = [] self.setup(parent, previous) self.hidden = False # Set up any substitutions, such as the charset in a META tag. if builder is not None: builder.set_up_substitutions(self) self.can_be_empty_element = builder.can_be_empty_element(name) else: self.can_be_empty_element = False parserClass = _alias("parser_class") # BS3 @property def is_empty_element(self): """Is this tag an empty-element tag? (aka a self-closing tag) A tag that has contents is never an empty-element tag. A tag that has no contents may or may not be an empty-element tag. It depends on the builder used to create the tag. If the builder has a designated list of empty-element tags, then only a tag whose name shows up in that list is considered an empty-element tag. If the builder has no designated list of empty-element tags, then any tag with no contents is an empty-element tag. 
""" return len(self.contents) == 0 and self.can_be_empty_element isSelfClosing = is_empty_element # BS3 @property def string(self): """Convenience property to get the single string within this tag. :Return: If this tag has a single string child, return value is that string. If this tag has no children, or more than one child, return value is None. If this tag has one child tag, return value is the 'string' attribute of the child tag, recursively. """ if len(self.contents) != 1: return None child = self.contents[0] if isinstance(child, NavigableString): return child return child.string @string.setter def string(self, string): self.clear() self.append(string.__class__(string)) def _all_strings(self, strip=False): """Yield all child strings, possibly stripping them.""" for descendant in self.descendants: if not isinstance(descendant, NavigableString): continue if strip: descendant = descendant.strip() if len(descendant) == 0: continue yield descendant strings = property(_all_strings) @property def stripped_strings(self): for string in self._all_strings(True): yield string def get_text(self, separator="", strip=False): """ Get all child strings, concatenated using the given separator. """ return separator.join([s for s in self._all_strings(strip)]) getText = get_text text = property(get_text) def decompose(self): """Recursively destroys the contents of this tree.""" self.extract() i = self while i is not None: next = i.next_element i.__dict__.clear() i = next def clear(self, decompose=False): """ Extract all children. If decompose is True, decompose instead. """ if decompose: for element in self.contents[:]: if isinstance(element, Tag): element.decompose() else: element.extract() else: for element in self.contents[:]: element.extract() def index(self, element): """ Find the index of a child by identity, not value. Avoids issues with tag.contents.index(element) getting the index of equal elements. 
""" for i, child in enumerate(self.contents): if child is element: return i raise ValueError("Tag.index: element not in tag") def get(self, key, default=None): """Returns the value of the 'key' attribute for the tag, or the value given for 'default' if it doesn't have that attribute.""" return self.attrs.get(key, default) def has_attr(self, key): return key in self.attrs def __hash__(self): return str(self).__hash__() def __getitem__(self, key): """tag[key] returns the value of the 'key' attribute for the tag, and throws an exception if it's not there.""" return self.attrs[key] def __iter__(self): "Iterating over a tag iterates over its contents." return iter(self.contents) def __len__(self): "The length of a tag is the length of its list of contents." return len(self.contents) def __contains__(self, x): return x in self.contents def __nonzero__(self): "A tag is non-None even if it has no contents." return True def __setitem__(self, key, value): """Setting tag[key] sets the value of the 'key' attribute for the tag.""" self.attrs[key] = value def __delitem__(self, key): "Deleting tag[key] deletes all 'key' attributes for the tag." self.attrs.pop(key, None) def __call__(self, *args, **kwargs): """Calling a tag like a function is the same as calling its find_all() method. Eg. tag('a') returns a list of all the A tags found within this tag.""" return self.find_all(*args, **kwargs) def __getattr__(self, tag): #print "Getattr %s.%s" % (self.__class__, tag) if len(tag) > 3 and tag.endswith('Tag'): # BS3: soup.aTag -> "soup.find("a") tag_name = tag[:-3] warnings.warn( '.%sTag is deprecated, use .find("%s") instead.' % ( tag_name, tag_name)) return self.find(tag_name) # We special case contents to avoid recursion. 
elif not tag.startswith("__") and not tag=="contents": return self.find(tag) raise AttributeError( "'%s' object has no attribute '%s'" % (self.__class__, tag)) def __eq__(self, other): """Returns true iff this tag has the same name, the same attributes, and the same contents (recursively) as the given tag.""" if self is other: return True if (not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other)): return False for i, my_child in enumerate(self.contents): if my_child != other.contents[i]: return False return True def __ne__(self, other): """Returns true iff this tag is not identical to the other tag, as defined in __eq__.""" return not self == other def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): """Renders this tag as a string.""" return self.encode(encoding) def __unicode__(self): return self.decode() def __str__(self): return self.encode() if PY3K: __str__ = __repr__ = __unicode__ def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, indent_level=None, formatter="minimal", errors="xmlcharrefreplace"): # Turn the data structure into Unicode, then encode the # Unicode. u = self.decode(indent_level, encoding, formatter) return u.encode(encoding, errors) def decode(self, indent_level=None, eventual_encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"): """Returns a Unicode representation of this tag and its contents. :param eventual_encoding: The tag is destined to be encoded into this encoding. This method is _not_ responsible for performing that encoding. This information is passed in so that it can be substituted in if the document contains a <META> tag that mentions the document's encoding. 
""" attrs = [] if self.attrs: for key, val in sorted(self.attrs.items()): if val is None: decoded = key else: if isinstance(val, list) or isinstance(val, tuple): val = ' '.join(val) elif not isinstance(val, basestring): val = str(val) elif ( isinstance(val, AttributeValueWithCharsetSubstitution) and eventual_encoding is not None): val = val.encode(eventual_encoding) text = self.format_string(val, formatter) decoded = ( str(key) + '=' + EntitySubstitution.quoted_attribute_value(text)) attrs.append(decoded) close = '' closeTag = '' if self.is_empty_element: close = '/' else: closeTag = '</%s>' % self.name prefix = '' if self.prefix: prefix = self.prefix + ":" pretty_print = (indent_level is not None) if pretty_print: space = (' ' * (indent_level - 1)) indent_contents = indent_level + 1 else: space = '' indent_contents = None contents = self.decode_contents( indent_contents, eventual_encoding, formatter) if self.hidden: # This is the 'document root' object. s = contents else: s = [] attribute_string = '' if attrs: attribute_string = ' ' + ' '.join(attrs) if pretty_print: s.append(space) s.append('<%s%s%s%s>' % ( prefix, self.name, attribute_string, close)) if pretty_print: s.append("\n") s.append(contents) if pretty_print and contents and contents[-1] != "\n": s.append("\n") if pretty_print and closeTag: s.append(space) s.append(closeTag) if pretty_print and closeTag and self.next_sibling: s.append("\n") s = ''.join(s) return s def prettify(self, encoding=None, formatter="minimal"): if encoding is None: return self.decode(True, formatter=formatter) else: return self.encode(encoding, True, formatter=formatter) def decode_contents(self, indent_level=None, eventual_encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"): """Renders the contents of this tag as a Unicode string. :param eventual_encoding: The tag is destined to be encoded into this encoding. This method is _not_ responsible for performing that encoding. 
This information is passed in so that it can be substituted in if the document contains a <META> tag that mentions the document's encoding. """ pretty_print = (indent_level is not None) s = [] for c in self: text = None if isinstance(c, NavigableString): text = c.output_ready(formatter) elif isinstance(c, Tag): s.append(c.decode(indent_level, eventual_encoding, formatter)) if text and indent_level: text = text.strip() if text: if pretty_print: s.append(" " * (indent_level - 1)) s.append(text) if pretty_print: s.append("\n") return ''.join(s) def encode_contents( self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"): """Renders the contents of this tag as a bytestring.""" contents = self.decode_contents(indent_level, encoding, formatter) return contents.encode(encoding) # Old method for BS3 compatibility def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, prettyPrint=False, indentLevel=0): if not prettyPrint: indentLevel = None return self.encode_contents( indent_level=indentLevel, encoding=encoding) #Soup methods def find(self, name=None, attrs={}, recursive=True, text=None, **kwargs): """Return only the first child of this Tag matching the given criteria.""" r = None l = self.find_all(name, attrs, recursive, text, 1, **kwargs) if l: r = l[0] return r findChild = find def find_all(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs): """Extracts a list of Tag objects that match the given criteria. You can specify the name of the Tag and any attributes you want the Tag to have. The value of a key-value pair in the 'attrs' map can be a string, a list of strings, a regular expression object, or a callable that takes a string and returns whether or not the string matches for some custom definition of 'matches'. 
The same is true of the tag name.""" generator = self.descendants if not recursive: generator = self.children return self._find_all(name, attrs, text, limit, generator, **kwargs) findAll = find_all # BS3 findChildren = find_all # BS2 #Generator methods @property def children(self): # return iter() to make the purpose of the method clear return iter(self.contents) # XXX This seems to be untested. @property def descendants(self): if not len(self.contents): return stopNode = self._last_descendant().next_element current = self.contents[0] while current is not stopNode: yield current current = current.next_element # Old names for backwards compatibility def childGenerator(self): return self.children def recursiveChildGenerator(self): return self.descendants # This was kind of misleading because has_key() (attributes) was # different from __in__ (contents). has_key() is gone in Python 3, # anyway. has_key = has_attr # Next, a couple classes to represent queries and their results. class SoupStrainer(object): """Encapsulates a number of ways of matching a markup element (tag or text).""" def __init__(self, name=None, attrs={}, text=None, **kwargs): self.name = self._normalize_search_value(name) if not isinstance(attrs, dict): # Treat a non-dict value for attrs as a search for the 'class' # attribute. kwargs['class'] = attrs attrs = None if kwargs: if attrs: attrs = attrs.copy() attrs.update(kwargs) else: attrs = kwargs normalized_attrs = {} for key, value in attrs.items(): normalized_attrs[key] = self._normalize_search_value(value) self.attrs = normalized_attrs self.text = self._normalize_search_value(text) def _normalize_search_value(self, value): # Leave it alone if it's a Unicode string, a callable, a # regular expression, a boolean, or None. if (isinstance(value, unicode) or callable(value) or hasattr(value, 'match') or isinstance(value, bool) or value is None): return value # If it's a bytestring, convert it to Unicode, treating it as UTF-8. 
if isinstance(value, bytes): return value.decode("utf8") # If it's listlike, convert it into a list of strings. if hasattr(value, '__iter__'): new_value = [] for v in value: if (hasattr(v, '__iter__') and not isinstance(v, bytes) and not isinstance(v, unicode)): # This is almost certainly the user's mistake. In the # interests of avoiding infinite loops, we'll let # it through as-is rather than doing a recursive call. new_value.append(v) else: new_value.append(self._normalize_search_value(v)) return new_value # Otherwise, convert it into a Unicode string. # The unicode(str()) thing is so this will do the same thing on Python 2 # and Python 3. return unicode(str(value)) def __str__(self): if self.text: return self.text else: return "%s|%s" % (self.name, self.attrs) def search_tag(self, markup_name=None, markup_attrs={}): found = None markup = None if isinstance(markup_name, Tag): markup = markup_name markup_attrs = markup call_function_with_tag_data = ( isinstance(self.name, collections.Callable) and not isinstance(markup_name, Tag)) if ((not self.name) or call_function_with_tag_data or (markup and self._matches(markup, self.name)) or (not markup and self._matches(markup_name, self.name))): if call_function_with_tag_data: match = self.name(markup_name, markup_attrs) else: match = True markup_attr_map = None for attr, match_against in list(self.attrs.items()): if not markup_attr_map: if hasattr(markup_attrs, 'get'): markup_attr_map = markup_attrs else: markup_attr_map = {} for k, v in markup_attrs: markup_attr_map[k] = v attr_value = markup_attr_map.get(attr) if not self._matches(attr_value, match_against): match = False break if match: if markup: found = markup else: found = markup_name if found and self.text and not self._matches(found.string, self.text): found = None return found searchTag = search_tag def search(self, markup): # print 'looking for %s in %s' % (self, markup) found = None # If given a list of items, scan it for a text element that # matches. 
if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, basestring)): for element in markup: if isinstance(element, NavigableString) \ and self.search(element): found = element break # If it's a Tag, make sure its name or attributes match. # Don't bother with Tags if we're searching for text. elif isinstance(markup, Tag): if not self.text or self.name or self.attrs: found = self.search_tag(markup) # If it's text, make sure the text matches. elif isinstance(markup, NavigableString) or \ isinstance(markup, basestring): if not self.name and not self.attrs and self._matches(markup, self.text): found = markup else: raise Exception( "I don't know how to match against a %s" % markup.__class__) return found def _matches(self, markup, match_against): # print u"Matching %s against %s" % (markup, match_against) result = False if isinstance(markup, list) or isinstance(markup, tuple): # This should only happen when searching a multi-valued attribute # like 'class'. if (isinstance(match_against, unicode) and ' ' in match_against): # A bit of a special case. If they try to match "foo # bar" on a multivalue attribute's value, only accept # the literal value "foo bar" # # XXX This is going to be pretty slow because we keep # splitting match_against. But it shouldn't come up # too often. return (whitespace_re.split(match_against) == markup) else: for item in markup: if self._matches(item, match_against): return True return False if match_against is True: # True matches any non-None value. return markup is not None if isinstance(match_against, collections.Callable): return match_against(markup) # Custom callables take the tag as an argument, but all # other ways of matching match the tag name as a string. if isinstance(markup, Tag): markup = markup.name # Ensure that `markup` is either a Unicode string, or None. markup = self._normalize_search_value(markup) if markup is None: # None matches None, False, an empty string, an empty list, and so on. 
return not match_against if isinstance(match_against, unicode): # Exact string match return markup == match_against if hasattr(match_against, 'match'): # Regexp match return match_against.search(markup) if hasattr(match_against, '__iter__'): # The markup must be an exact match against something # in the iterable. return markup in match_against class ResultSet(list): """A ResultSet is just a list that keeps track of the SoupStrainer that created it.""" def __init__(self, source): list.__init__([]) self.source = source

beautifulsoup4-4.1.0/bs4/dammit.py

# -*- coding: utf-8 -*-
"""Beautiful Soup bonus library: Unicode, Dammit

This class forces XML data into a standard format (usually to UTF-8 or
Unicode).  It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It does not rewrite the XML or HTML to reflect a new
encoding; that's the tree builder's job.
"""

import codecs
try:
    from htmlentitydefs import codepoint2name
except ImportError:
    # Python 3 moved the entity tables.
    from html.entities import codepoint2name
import re
import warnings

# Autodetects character encodings. Very useful.
# Download from http://chardet.feedparser.org/
#  or 'apt-get install python-chardet'
#  or 'easy_install chardet'
try:
    import chardet
except ImportError:
    chardet = None

# Available from http://cjkpython.i18n.org/.
try:
    import iconv_codec
except ImportError:
    pass

# The patterns are raw strings so that regex escapes such as \? and \s
# are not read as (invalid) string escapes. They are encoded because
# they are applied to bytestrings.
xml_encoding_re = re.compile(
    r'^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
html_meta_re = re.compile(
    r'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)


class EntitySubstitution(object):

    """Substitute XML or HTML entities for the corresponding characters."""

    def _populate_class_variables():
        """Build the character<->entity tables and the regular expression
        that matches every character with a named HTML entity.

        Called once, while the class body is being executed.
        """
        try:
            to_character = unichr  # Python 2
        except NameError:
            to_character = chr  # Python 3
        lookup = {}
        reverse_lookup = {}
        characters_for_re = []
        for codepoint, name in list(codepoint2name.items()):
            character = to_character(codepoint)
            if codepoint != 34:
                # There's no point in turning the quotation mark into
                # &quot;, unless it happens within an attribute value, which
                # is handled elsewhere.
                characters_for_re.append(character)
                lookup[character] = name
            # But we do want to turn &quot; into the quotation mark.
            reverse_lookup[name] = character
        re_definition = "[%s]" % "".join(characters_for_re)
        return lookup, reverse_lookup, re.compile(re_definition)
    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
        }

    BARE_AMPERSAND_OR_BRACKET = re.compile(r"([<>]|"
                                           r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                           r")")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate named HTML entity for a character."""
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def quoted_attribute_value(cls, value):
        """Make a value into a quoted XML attribute, possibly escaping it.

         Most strings will be quoted using double quotes.

          Bob's Bar -> "Bob's Bar"

         If a string contains double quotes, it will be quoted using
         single quotes.

          Welcome to "my bar" -> 'Welcome to "my bar"'

         If a string contains both single and double quotes, the
         double quotes will be escaped, and the string will be quoted
         using double quotes.

          Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's Bar&quot;"
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes.  Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "&quot;" whether this is HTML or XML.  If we
                # quoted the single quotes, we'd have to decide
                # between &apos; and &squot;.
                replace_with = "&quot;"
                value = value.replace('"', replace_with)
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        This differs from data.encode(encoding, 'xmlcharrefreplace')
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)


class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
# NOTE(review): everything in this block is a member of a class whose header
# precedes this chunk (the tests call it ``UnicodeDammit``).  The extraction
# flattened the indentation; re-indent one level when restoring the class.

# Charset names that Python knows under a different name.
CHARSET_ALIASES = {"macintosh": "mac-roman",
                   "x-sjis": "shift-jis"}

# Encodings in which the bytes 0x80-0x9f may be Microsoft "smart" characters.
ENCODINGS_WITH_SMART_QUOTES = [
    "windows-1252",
    "iso-8859-1",
    "iso-8859-2",
]


def __init__(self, markup, override_encodings=[],
             smart_quotes_to=None, is_html=False):
    """Convert `markup` to Unicode, trying a series of encodings.

    :param markup: The document, as a bytestring or a Unicode string.
    :param override_encodings: Encodings to try before the declared and
        sniffed ones.  The list is only read, never mutated.
    :param smart_quotes_to: None to leave smart quotes alone, or one of
        'ascii', 'xml' or anything else (HTML named entities); see
        `_sub_ms_char`.
    :param is_html: If true, a <meta> declaration is also consulted.

    The result is stored in `unicode_markup` (None when every attempt
    failed) and the winning encoding in `original_encoding`.
    """
    self.declared_html_encoding = None
    self.smart_quotes_to = smart_quotes_to
    self.tried_encodings = []
    self.contains_replacement_characters = False

    if markup == '' or isinstance(markup, unicode):
        # Nothing to decode.
        self.markup = markup
        self.unicode_markup = unicode(markup)
        self.original_encoding = None
        return

    new_markup, document_encoding, sniffed_encoding = \
        self._detectEncoding(markup, is_html)
    self.markup = new_markup

    u = None
    if new_markup != markup:
        # _detectEncoding modified the markup, then converted it to
        # Unicode and then to UTF-8. So convert it from UTF-8.
        u = self._convert_from("utf8")
        self.original_encoding = sniffed_encoding

    if not u:
        for proposed_encoding in (
                override_encodings + [document_encoding, sniffed_encoding]):
            if proposed_encoding is not None:
                u = self._convert_from(proposed_encoding)
                if u:
                    break

    # If no luck and we have auto-detection library, try that:
    if not u and chardet and not isinstance(self.markup, unicode):
        u = self._convert_from(chardet.detect(self.markup)['encoding'])

    # As a last resort, try utf-8 and windows-1252:
    if not u:
        for proposed_encoding in ("utf-8", "windows-1252"):
            u = self._convert_from(proposed_encoding)
            if u:
                break

    # As an absolute last resort, try the encodings again with
    # character replacement.
    if not u:
        for proposed_encoding in (
                override_encodings + [
                    document_encoding, sniffed_encoding,
                    "utf-8", "windows-1252"]):
            if proposed_encoding != "ascii":
                u = self._convert_from(proposed_encoding, "replace")
            if u is not None:
                warnings.warn(
                    UnicodeWarning(
                        "Some characters could not be decoded, and were "
                        "replaced with REPLACEMENT CHARACTER."))
                self.contains_replacement_characters = True
                break

    # We could at this point force it to ASCII, but that would
    # destroy so much data that I think giving up is better
    self.unicode_markup = u
    if not u:
        self.original_encoding = None


def _sub_ms_char(self, match):
    """Changes a MS smart quote character to an XML or HTML
    entity, or an ASCII character.

    `match` is a regex match whose group 1 is the single offending byte.
    Returns the replacement as an encoded bytestring.
    """
    orig = match.group(1)
    if self.smart_quotes_to == 'ascii':
        sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
    else:
        sub = self.MS_CHARS.get(orig)
        if isinstance(sub, tuple):
            # (entity name, hex code point)
            if self.smart_quotes_to == 'xml':
                sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
            else:
                sub = '&'.encode() + sub[0].encode() + ';'.encode()
        else:
            sub = sub.encode()
    return sub


def _convert_from(self, proposed, errors="strict"):
    """Try to decode `self.markup` using the `proposed` encoding.

    Returns the decoded markup on success (also stored in `self.markup`,
    with `self.original_encoding` set), or None if the codec is unknown,
    the (codec, errors) pair was already tried, or decoding failed.
    """
    proposed = self.find_codec(proposed)
    if not proposed or (proposed, errors) in self.tried_encodings:
        return None
    self.tried_encodings.append((proposed, errors))
    markup = self.markup

    # Convert smart quotes to HTML if coming from an encoding
    # that might have them.
    if (self.smart_quotes_to is not None
            and proposed.lower() in self.ENCODINGS_WITH_SMART_QUOTES):
        smart_quotes_re = b"([\x80-\x9f])"
        smart_quotes_compiled = re.compile(smart_quotes_re)
        markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)

    try:
        u = self._to_unicode(markup, proposed, errors)
        self.markup = u
        self.original_encoding = proposed
    except Exception:
        # Any failure just means "this encoding is not the one".
        return None
    return self.markup


def _to_unicode(self, data, encoding, errors="strict"):
    '''Given a string and its encoding, decodes the string into Unicode.
    %encoding is a string recognized by encodings.aliases

    A leading byte order mark overrides `encoding` and is stripped.
    '''
    # strip Byte Order Mark (if present).  The markers are bytes literals,
    # matching the ones used by _detectEncoding.
    if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
            and (data[2:4] != b'\x00\x00'):
        encoding = 'utf-16be'
        data = data[2:]
    elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
            and (data[2:4] != b'\x00\x00'):
        encoding = 'utf-16le'
        data = data[2:]
    elif data[:3] == b'\xef\xbb\xbf':
        encoding = 'utf-8'
        data = data[3:]
    elif data[:4] == b'\x00\x00\xfe\xff':
        encoding = 'utf-32be'
        data = data[4:]
    elif data[:4] == b'\xff\xfe\x00\x00':
        encoding = 'utf-32le'
        data = data[4:]
    newdata = unicode(data, encoding, errors)
    return newdata


def _detectEncoding(self, xml_data, is_html=False):
    """Given a document, tries to detect its XML encoding.

    Returns a tuple (xml_data, declared_encoding, sniffed_encoding).  When
    a multi-byte or EBCDIC signature is recognised, the returned data has
    been re-encoded (to UTF-8) or translated (EBCDIC to ASCII).
    """
    xml_encoding = sniffed_xml_encoding = None
    try:
        if xml_data[:4] == b'\x4c\x6f\xa7\x94':
            # EBCDIC
            xml_data = self._ebcdic_to_ascii(xml_data)
        elif xml_data[:4] == b'\x00\x3c\x00\x3f':
            # UTF-16BE
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xfe\xff') \
                and (xml_data[2:4] != b'\x00\x00'):
            # UTF-16BE with BOM
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
        elif xml_data[:4] == b'\x3c\x00\x3f\x00':
            # UTF-16LE
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xff\xfe') and \
                (xml_data[2:4] != b'\x00\x00'):
            # UTF-16LE with BOM
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
        elif xml_data[:4] == b'\x00\x00\x00\x3c':
            # UTF-32BE
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
        elif xml_data[:4] == b'\x3c\x00\x00\x00':
            # UTF-32LE
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
        elif xml_data[:4] == b'\x00\x00\xfe\xff':
            # UTF-32BE with BOM
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
        elif xml_data[:4] == b'\xff\xfe\x00\x00':
            # UTF-32LE with BOM
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
        elif xml_data[:3] == b'\xef\xbb\xbf':
            # UTF-8 with BOM
            sniffed_xml_encoding = 'utf-8'
            xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
        else:
            sniffed_xml_encoding = 'ascii'
    except Exception:
        # Sniffing is best-effort: if re-encoding fails we fall through and
        # look for a declaration in whatever data we have.  (Narrowed from
        # a bare ``except`` so KeyboardInterrupt/SystemExit propagate.)
        pass
    xml_encoding_match = xml_encoding_re.match(xml_data)
    if not xml_encoding_match and is_html:
        xml_encoding_match = html_meta_re.search(xml_data)
    if xml_encoding_match is not None:
        xml_encoding = xml_encoding_match.groups()[0].decode(
            'ascii').lower()
        if is_html:
            self.declared_html_encoding = xml_encoding
        # A generic "utf-16"/"ucs-4"-style declaration says nothing about
        # byte order; prefer what the byte signature told us.
        if sniffed_xml_encoding and \
                (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                                  'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                                  'utf-16', 'utf-32', 'utf_16', 'utf_32',
                                  'utf16', 'u16')):
            xml_encoding = sniffed_xml_encoding
    return xml_data, xml_encoding, sniffed_xml_encoding


def find_codec(self, charset):
    """Return a name for `charset` that `codecs.lookup` accepts.

    Tries the alias table, then the name with hyphens removed, then with
    hyphens turned into underscores; falls back to `charset` unchanged.
    """
    return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
        or (charset and self._codec(charset.replace("-", ""))) \
        or (charset and self._codec(charset.replace("-", "_"))) \
        or charset


def _codec(self, charset):
    """Return `charset` if Python has a codec for it, else None.

    A false `charset` (None, '') is returned as-is.
    """
    if not charset:
        return charset
    codec = None
    try:
        codecs.lookup(charset)
        codec = charset
    except (LookupError, ValueError):
        pass
    return codec


# Translation table for _ebcdic_to_ascii, built lazily on first use.
EBCDIC_TO_ASCII_MAP = None


def _ebcdic_to_ascii(self, s):
    """Translate the EBCDIC bytestring `s` to ASCII.

    The 256-entry translation table is built once and cached on the class.
    """
    c = self.__class__
    if not c.EBCDIC_TO_ASCII_MAP:
        emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                250,251,252,253,254,255)
        import string
        c.EBCDIC_TO_ASCII_MAP = string.maketrans(
            ''.join(map(chr, list(range(256)))), ''.join(map(chr, emap)))
    return s.translate(c.EBCDIC_TO_ASCII_MAP)


# A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
# Tuple values are (HTML entity name, hex code point); plain strings are
# used verbatim.
MS_CHARS = {b'\x80': ('euro', '20AC'),
            b'\x81': ' ',
            b'\x82': ('sbquo', '201A'),
            b'\x83': ('fnof', '192'),
            b'\x84': ('bdquo', '201E'),
            b'\x85': ('hellip', '2026'),
            b'\x86': ('dagger', '2020'),
            b'\x87': ('Dagger', '2021'),
            b'\x88': ('circ', '2C6'),
            b'\x89': ('permil', '2030'),
            b'\x8A': ('Scaron', '160'),
            b'\x8B': ('lsaquo', '2039'),
            b'\x8C': ('OElig', '152'),
            b'\x8D': '?',
            b'\x8E': ('#x17D', '17D'),
            b'\x8F': '?',
            b'\x90': '?',
            b'\x91': ('lsquo', '2018'),
            b'\x92': ('rsquo', '2019'),
            b'\x93': ('ldquo', '201C'),
            b'\x94': ('rdquo', '201D'),
            b'\x95': ('bull', '2022'),
            b'\x96': ('ndash', '2013'),
            b'\x97': ('mdash', '2014'),
            b'\x98': ('tilde', '2DC'),
            b'\x99': ('trade', '2122'),
            b'\x9a': ('scaron', '161'),
            b'\x9b': ('rsaquo', '203A'),
            b'\x9c': ('oelig', '153'),
            b'\x9d': '?',
            b'\x9e': ('#x17E', '17E'),
            # U+0178; an empty code point here produced "&#x;".
            b'\x9f': ('Yuml', '178'),}

# A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
# horrors like stripping diacritical marks to turn á into a, but also
# contains non-horrors like turning “ into ".
MS_CHARS_TO_ASCII = {
    b'\x80' : 'EUR', b'\x81' : ' ', b'\x82' : ',', b'\x83' : 'f',
    b'\x84' : ',,', b'\x85' : '...', b'\x86' : '+', b'\x87' : '++',
    b'\x88' : '^', b'\x89' : '%', b'\x8a' : 'S', b'\x8b' : '<',
    b'\x8c' : 'OE', b'\x8d' : '?', b'\x8e' : 'Z', b'\x8f' : '?',
    b'\x90' : '?', b'\x91' : "'", b'\x92' : "'", b'\x93' : '"',
    b'\x94' : '"', b'\x95' : '*', b'\x96' : '-', b'\x97' : '--',
    b'\x98' : '~', b'\x99' : '(TM)', b'\x9a' : 's', b'\x9b' : '>',
    b'\x9c' : 'oe', b'\x9d' : '?', b'\x9e' : 'z', b'\x9f' : 'Y',
    b'\xa0' : ' ', b'\xa1' : '!', b'\xa2' : 'c', b'\xa3' : 'GBP',
    b'\xa4' : '$',  # This approximation is especially parochial--this is
                    # the generic currency symbol.
    b'\xa5' : 'YEN', b'\xa6' : '|', b'\xa7' : 'S', b'\xa8' : '..',
    b'\xa9' : '', b'\xaa' : '(th)', b'\xab' : '<<', b'\xac' : '!',
    b'\xad' : ' ', b'\xae' : '(R)', b'\xaf' : '-', b'\xb0' : 'o',
    b'\xb1' : '+-', b'\xb2' : '2', b'\xb3' : '3',
    b'\xb4' : "'",  # was the tuple ("'", 'acute'); every value is a string
    b'\xb5' : 'u', b'\xb6' : 'P', b'\xb7' : '*', b'\xb8' : ',',
    b'\xb9' : '1', b'\xba' : '(th)', b'\xbb' : '>>', b'\xbc' : '1/4',
    b'\xbd' : '1/2', b'\xbe' : '3/4', b'\xbf' : '?',
    b'\xc0' : 'A', b'\xc1' : 'A', b'\xc2' : 'A', b'\xc3' : 'A',
    b'\xc4' : 'A', b'\xc5' : 'A', b'\xc6' : 'AE', b'\xc7' : 'C',
    b'\xc8' : 'E', b'\xc9' : 'E', b'\xca' : 'E', b'\xcb' : 'E',
    b'\xcc' : 'I', b'\xcd' : 'I', b'\xce' : 'I', b'\xcf' : 'I',
    b'\xd0' : 'D', b'\xd1' : 'N', b'\xd2' : 'O', b'\xd3' : 'O',
    b'\xd4' : 'O', b'\xd5' : 'O', b'\xd6' : 'O', b'\xd7' : '*',
    b'\xd8' : 'O', b'\xd9' : 'U', b'\xda' : 'U', b'\xdb' : 'U',
    b'\xdc' : 'U', b'\xdd' : 'Y', b'\xde' : 'b', b'\xdf' : 'B',
    b'\xe0' : 'a', b'\xe1' : 'a', b'\xe2' : 'a', b'\xe3' : 'a',
    b'\xe4' : 'a', b'\xe5' : 'a', b'\xe6' : 'ae', b'\xe7' : 'c',
    b'\xe8' : 'e', b'\xe9' : 'e', b'\xea' : 'e', b'\xeb' : 'e',
    b'\xec' : 'i', b'\xed' : 'i', b'\xee' : 'i', b'\xef' : 'i',
    b'\xf0' : 'o', b'\xf1' : 'n', b'\xf2' : 'o', b'\xf3' : 'o',
    b'\xf4' : 'o', b'\xf5' : 'o', b'\xf6' : 'o', b'\xf7' : '/',
    b'\xf8' : 'o', b'\xf9' : 'u', b'\xfa' : 'u', b'\xfb' : 'u',
    b'\xfc' : 'u', b'\xfd' : 'y', b'\xfe' : 'b', b'\xff' : 'y',
}

# A map used when removing rogue Windows-1252/ISO-8859-1
# characters in otherwise UTF-8 documents.
#
# Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
# Windows-1252.
WINDOWS_1252_TO_UTF8 = {
    0x80 : b'\xe2\x82\xac', # €
    0x82 : b'\xe2\x80\x9a', # ‚
    0x83 : b'\xc6\x92',     # ƒ
    0x84 : b'\xe2\x80\x9e', # „
    0x85 : b'\xe2\x80\xa6', # …
    0x86 : b'\xe2\x80\xa0', # †
    0x87 : b'\xe2\x80\xa1', # ‡
    0x88 : b'\xcb\x86',     # ˆ
    0x89 : b'\xe2\x80\xb0', # ‰
    0x8a : b'\xc5\xa0',     # Š
    0x8b : b'\xe2\x80\xb9', # ‹
    0x8c : b'\xc5\x92',     # Œ
    0x8e : b'\xc5\xbd',     # Ž
    0x91 : b'\xe2\x80\x98', # ‘
    0x92 : b'\xe2\x80\x99', # ’
    0x93 : b'\xe2\x80\x9c', # “
    0x94 : b'\xe2\x80\x9d', # ”
    0x95 : b'\xe2\x80\xa2', # •
    0x96 : b'\xe2\x80\x93', # –
    0x97 : b'\xe2\x80\x94', # —
    0x98 : b'\xcb\x9c',     # ˜
    0x99 : b'\xe2\x84\xa2', # ™
    0x9a : b'\xc5\xa1',     # š
    0x9b : b'\xe2\x80\xba', # ›
    0x9c : b'\xc5\x93',     # œ
    0x9e : b'\xc5\xbe',     # ž
    0x9f : b'\xc5\xb8',     # Ÿ
    0xa0 : b'\xc2\xa0',     # (no-break space)
    0xa1 : b'\xc2\xa1',     # ¡
    0xa2 : b'\xc2\xa2',     # ¢
    0xa3 : b'\xc2\xa3',     # £
    0xa4 : b'\xc2\xa4',     # ¤
    0xa5 : b'\xc2\xa5',     # ¥
    0xa6 : b'\xc2\xa6',     # ¦
    0xa7 : b'\xc2\xa7',     # §
    0xa8 : b'\xc2\xa8',     # ¨
    0xa9 : b'\xc2\xa9',     # ©
    0xaa : b'\xc2\xaa',     # ª
    0xab : b'\xc2\xab',     # «
    0xac : b'\xc2\xac',     # ¬
    0xad : b'\xc2\xad',     # (soft hyphen)
    0xae : b'\xc2\xae',     # ®
    0xaf : b'\xc2\xaf',     # ¯
    0xb0 : b'\xc2\xb0',     # °
    0xb1 : b'\xc2\xb1',     # ±
    0xb2 : b'\xc2\xb2',     # ²
    0xb3 : b'\xc2\xb3',     # ³
    0xb4 : b'\xc2\xb4',     # ´
    0xb5 : b'\xc2\xb5',     # µ
    0xb6 : b'\xc2\xb6',     # ¶
    0xb7 : b'\xc2\xb7',     # ·
    0xb8 : b'\xc2\xb8',     # ¸
    0xb9 : b'\xc2\xb9',     # ¹
    0xba : b'\xc2\xba',     # º
    0xbb : b'\xc2\xbb',     # »
    0xbc : b'\xc2\xbc',     # ¼
    0xbd : b'\xc2\xbd',     # ½
    0xbe : b'\xc2\xbe',     # ¾
    0xbf : b'\xc2\xbf',     # ¿
    0xc0 : b'\xc3\x80',     # À
    0xc1 : b'\xc3\x81',     # Á
    0xc2 : b'\xc3\x82',     # Â
    0xc3 : b'\xc3\x83',     # Ã
    0xc4 : b'\xc3\x84',     # Ä
    0xc5 : b'\xc3\x85',     # Å
    0xc6 : b'\xc3\x86',     # Æ
    0xc7 : b'\xc3\x87',     # Ç
    0xc8 : b'\xc3\x88',     # È
    0xc9 : b'\xc3\x89',     # É
    0xca : b'\xc3\x8a',     # Ê
    0xcb : b'\xc3\x8b',     # Ë
    0xcc : b'\xc3\x8c',     # Ì
    0xcd : b'\xc3\x8d',     # Í
    0xce : b'\xc3\x8e',     # Î
    0xcf : b'\xc3\x8f',     # Ï
    0xd0 : b'\xc3\x90',     # Ð
    0xd1 : b'\xc3\x91',     # Ñ
    0xd2 : b'\xc3\x92',     # Ò
    0xd3 : b'\xc3\x93',     # Ó
    0xd4 : b'\xc3\x94',     # Ô
    0xd5 : b'\xc3\x95',     # Õ
    0xd6 : b'\xc3\x96',     # Ö
    0xd7 : b'\xc3\x97',     # ×
    0xd8 : b'\xc3\x98',     # Ø
    0xd9 : b'\xc3\x99',     # Ù
    0xda : b'\xc3\x9a',     # Ú
    0xdb : b'\xc3\x9b',     # Û
    0xdc : b'\xc3\x9c',     # Ü
    0xdd : b'\xc3\x9d',     # Ý
    0xde : b'\xc3\x9e',     # Þ
    0xdf : b'\xc3\x9f',     # ß
    0xe0 : b'\xc3\xa0',     # à
    0xe1 : b'\xc3\xa1',     # á  (was b'\xa1', a lone continuation byte)
    0xe2 : b'\xc3\xa2',     # â
    0xe3 : b'\xc3\xa3',     # ã
    0xe4 : b'\xc3\xa4',     # ä
    0xe5 : b'\xc3\xa5',     # å
    0xe6 : b'\xc3\xa6',     # æ
    0xe7 : b'\xc3\xa7',     # ç
    0xe8 : b'\xc3\xa8',     # è
    0xe9 : b'\xc3\xa9',     # é
    0xea : b'\xc3\xaa',     # ê
    0xeb : b'\xc3\xab',     # ë
    0xec : b'\xc3\xac',     # ì
    0xed : b'\xc3\xad',     # í
    0xee : b'\xc3\xae',     # î
    0xef : b'\xc3\xaf',     # ï
    0xf0 : b'\xc3\xb0',     # ð
    0xf1 : b'\xc3\xb1',     # ñ
    0xf2 : b'\xc3\xb2',     # ò
    0xf3 : b'\xc3\xb3',     # ó
    0xf4 : b'\xc3\xb4',     # ô
    0xf5 : b'\xc3\xb5',     # õ
    0xf6 : b'\xc3\xb6',     # ö
    0xf7 : b'\xc3\xb7',     # ÷
    0xf8 : b'\xc3\xb8',     # ø
    0xf9 : b'\xc3\xb9',     # ù
    0xfa : b'\xc3\xba',     # ú
    0xfb : b'\xc3\xbb',     # û
    0xfc : b'\xc3\xbc',     # ü
    0xfd : b'\xc3\xbd',     # ý
    0xfe : b'\xc3\xbe',     # þ
    0xff : b'\xc3\xbf',     # ÿ  (previously missing)
    }

MULTIBYTE_MARKERS_AND_SIZES = [
    (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
    (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
    (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
    ]

FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]


@classmethod
def detwingle(cls, in_bytes, main_encoding="utf8",
              embedded_encoding="windows-1252"):
    """Fix characters from one encoding embedded in some other encoding.

    Currently the only situation supported is Windows-1252 embedded in
    UTF-8: `embedded_encoding` must normalise to "windows-1252" and
    `main_encoding` must be "utf8" or "utf-8", otherwise
    NotImplementedError is raised.

    The input must be a bytestring. If you've already converted
    the document to Unicode, you're too late.

    The output is a bytestring in which `embedded_encoding`
    characters have been converted to their `main_encoding`
    equivalents.  If nothing needed converting, the input object itself
    is returned.

    Any byte in the range C2-F4 is taken to start a UTF-8 sequence and the
    whole nominal sequence is skipped without checking its continuation
    bytes, so Windows-1252 characters in that range are not converted.
    """
    if embedded_encoding.replace('_', '-').lower() not in (
            'windows-1252', 'windows_1252'):
        raise NotImplementedError(
            "Windows-1252 and ISO-8859-1 are the only currently supported "
            "embedded encodings.")

    if main_encoding.lower() not in ('utf8', 'utf-8'):
        raise NotImplementedError(
            "UTF-8 is the only currently supported main encoding.")

    byte_chunks = []

    chunk_start = 0
    pos = 0
    while pos < len(in_bytes):
        byte = in_bytes[pos]
        if not isinstance(byte, int):
            # Python 2.x
            byte = ord(byte)
        if (byte >= cls.FIRST_MULTIBYTE_MARKER
                and byte <= cls.LAST_MULTIBYTE_MARKER):
            # This is the start of a UTF-8 multibyte character. Skip
            # to the end.
            for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                if byte >= start and byte <= end:
                    pos += size
                    break
        elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
            # We found a Windows-1252 character!
            # Save the string up to this point as a chunk.
            byte_chunks.append(in_bytes[chunk_start:pos])

            # Now translate the Windows-1252 character into UTF-8
            # and add it as another, one-byte chunk.
            byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
            pos += 1
            chunk_start = pos
        else:
            # Go on to the next character.
            pos += 1
    if chunk_start == 0:
        # The string is unchanged.
        return in_bytes
    else:
        # Store the final chunk.
        byte_chunks.append(in_bytes[chunk_start:])
    return b''.join(byte_chunks)

beautifulsoup4-4.1.0/bs4/tests/__init__.py

"The beautifulsoup tests."

beautifulsoup4-4.1.0/bs4/tests/test_htmlparser.py

"""Tests to ensure that the html.parser tree builder generates good trees."""

from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
from bs4.builder import HTMLParserTreeBuilder


class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
    # Runs the inherited smoke tests with HTMLParserTreeBuilder as the
    # builder, overriding the two doctype tests so they do nothing.

    @property
    def default_builder(self):
        # A fresh builder instance on every access.
        return HTMLParserTreeBuilder()

    def test_namespaced_system_doctype(self):
        # html.parser can't handle namespaced doctypes, so skip this one.
        return None

    def test_namespaced_public_doctype(self):
        # html.parser can't handle namespaced doctypes, so skip this one.
        return None

beautifulsoup4-4.1.0/bs4/tests/test_soup.py

# -*- coding: utf-8 -*-
"""Tests of Beautiful Soup as a whole."""

# NOTE(review): this file came through an HTML-stripping extraction.  Markup
# literals that now look empty or tag-less (e.g. ``self.soup("")``,
# ``b"a"``, ``"NoYes NoYes Yes"``) have presumably lost their tags and are
# reproduced as found -- restore them from the upstream release.  Expected
# values that had only been entity-decoded are restored below where the
# test's own name/comment or the dammit tables determine them.

import unittest
from bs4 import (
    BeautifulSoup,
    BeautifulStoneSoup,
)
from bs4.element import (
    CharsetMetaAttributeValue,
    ContentMetaAttributeValue,
    SoupStrainer,
    NamespacedAttribute,
)
import bs4.dammit
from bs4.dammit import EntitySubstitution, UnicodeDammit
from bs4.testing import (
    SoupTest,
    skipIf,
)
import warnings

try:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
    LXML_PRESENT = True
except ImportError:
    LXML_PRESENT = False


class TestDeprecatedConstructorArguments(SoupTest):
    """Old keyword arguments still work but emit a warning."""

    def test_parseOnlyThese_renamed_to_parse_only(self):
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("", parseOnlyThese=SoupStrainer("b"))
        msg = str(w[0].message)
        self.assertTrue("parseOnlyThese" in msg)
        self.assertTrue("parse_only" in msg)
        self.assertEqual(b"", soup.encode())

    def test_fromEncoding_renamed_to_from_encoding(self):
        with warnings.catch_warnings(record=True) as w:
            utf8 = b"\xc3\xa9"
            soup = self.soup(utf8, fromEncoding="utf8")
        msg = str(w[0].message)
        self.assertTrue("fromEncoding" in msg)
        self.assertTrue("from_encoding" in msg)
        self.assertEqual("utf8", soup.original_encoding)

    def test_unrecognized_keyword_argument(self):
        self.assertRaises(
            TypeError, self.soup, "", no_such_argument=True)

    @skipIf(
        not LXML_PRESENT,
        "lxml not present, not testing BeautifulStoneSoup.")
    def test_beautifulstonesoup(self):
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulStoneSoup("")
        self.assertTrue(isinstance(soup, BeautifulSoup))
        # The original asserted on the bare string literal, which is always
        # true; check the recorded warning instead.
        msg = str(w[0].message)
        self.assertTrue("BeautifulStoneSoup class is deprecated" in msg)


class TestSelectiveParsing(SoupTest):

    def test_parse_with_soupstrainer(self):
        markup = "NoYes NoYes Yes"
        strainer = SoupStrainer("b")
        soup = self.soup(markup, parse_only=strainer)
        self.assertEqual(soup.encode(), b"YesYes Yes")


class TestEntitySubstitution(unittest.TestCase):
    """Standalone tests of the EntitySubstitution class."""

    def setUp(self):
        self.sub = EntitySubstitution

    def test_simple_html_substitution(self):
        # Unicode characters corresponding to named HTML entites
        # are substituted, and no others.
        s = u"foo\u2200\N{SNOWMAN}\u00f5bar"
        self.assertEqual(self.sub.substitute_html(s),
                         u"foo&forall;\N{SNOWMAN}&otilde;bar")

    def test_smart_quote_substitution(self):
        # MS smart quotes are a common source of frustration, so we
        # give them a special test.
        quotes = b"\x91\x92foo\x93\x94"
        dammit = UnicodeDammit(quotes)
        self.assertEqual(self.sub.substitute_html(dammit.markup),
                         "&lsquo;&rsquo;foo&ldquo;&rdquo;")

    def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
        s = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(s, False), s)

    def test_xml_attribute_quoting_normally_uses_double_quotes(self):
        self.assertEqual(self.sub.substitute_xml("Welcome", True),
                         '"Welcome"')
        self.assertEqual(self.sub.substitute_xml("Bob's Bar", True),
                         '"Bob\'s Bar"')

    def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
        s = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(s, True),
                         "'Welcome to \"my bar\"'")

    def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
        s = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(
            self.sub.substitute_xml(s, True),
            '"Welcome to &quot;Bob\'s Bar&quot;"')

    def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
        quoted = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(self.sub.substitute_xml(quoted), quoted)

    def test_xml_quoting_handles_angle_brackets(self):
        self.assertEqual(
            self.sub.substitute_xml("foo<bar>"),
            "foo&lt;bar&gt;")

    def test_xml_quoting_handles_ampersands(self):
        self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&amp;T")

    def test_xml_quoting_ignores_ampersands_when_they_are_part_of_an_entity(self):
        self.assertEqual(
            self.sub.substitute_xml("&Aacute;T&T"),
            "&Aacute;T&T")

    def test_quotes_not_html_substituted(self):
        """There's no need to do this except inside attribute values."""
        text = 'Bob\'s "bar"'
        self.assertEqual(self.sub.substitute_html(text), text)


class TestEncodingConversion(SoupTest):
    # Test Beautiful Soup's ability to decode and encode from various
    # encodings.

    def setUp(self):
        super(TestEncodingConversion, self).setUp()
        self.unicode_data = u"Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!"
        self.utf8_data = self.unicode_data.encode("utf-8")
        # Just so you know what it looks like.
        self.assertEqual(
            self.utf8_data,
            b"Sacr\xc3\xa9 bleu!")

    def test_ascii_in_unicode_out(self):
        # ASCII input is converted to Unicode. The original_encoding
        # attribute is set.
        ascii = b"a"
        soup_from_ascii = self.soup(ascii)
        unicode_output = soup_from_ascii.decode()
        self.assertTrue(isinstance(unicode_output, unicode))
        self.assertEqual(unicode_output, self.document_for(ascii.decode()))
        self.assertEqual(soup_from_ascii.original_encoding, "ascii")

    def test_unicode_in_unicode_out(self):
        # Unicode input is left alone. The original_encoding attribute
        # is not set.
        soup_from_unicode = self.soup(self.unicode_data)
        self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
        self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!')
        self.assertEqual(soup_from_unicode.original_encoding, None)

    def test_utf8_in_unicode_out(self):
        # UTF-8 input is converted to Unicode. The original_encoding
        # attribute is set.
        soup_from_utf8 = self.soup(self.utf8_data)
        self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
        self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!')

    def test_utf8_out(self):
        # The internal data structures can be encoded as UTF-8.
        soup_from_unicode = self.soup(self.unicode_data)
        self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data)


class TestUnicodeDammit(unittest.TestCase):
    """Standalone tests of Unicode, Dammit."""

    def test_smart_quotes_to_unicode(self):
        markup = b"\x91\x92\x93\x94"
        dammit = UnicodeDammit(markup)
        self.assertEqual(
            dammit.unicode_markup, u"\u2018\u2019\u201c\u201d")

    def test_smart_quotes_to_xml_entities(self):
        markup = b"\x91\x92\x93\x94"
        dammit = UnicodeDammit(markup, smart_quotes_to="xml")
        self.assertEqual(
            dammit.unicode_markup, "&#x2018;&#x2019;&#x201C;&#x201D;")

    def test_smart_quotes_to_html_entities(self):
        markup = b"\x91\x92\x93\x94"
        dammit = UnicodeDammit(markup, smart_quotes_to="html")
        self.assertEqual(
            dammit.unicode_markup, "&lsquo;&rsquo;&ldquo;&rdquo;")

    def test_smart_quotes_to_ascii(self):
        markup = b"\x91\x92\x93\x94"
        dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
        self.assertEqual(
            dammit.unicode_markup, """''""""")

    def test_detect_utf8(self):
        utf8 = b"\xc3\xa9"
        dammit = UnicodeDammit(utf8)
        self.assertEqual(dammit.unicode_markup, u'\xe9')
        self.assertEqual(dammit.original_encoding, 'utf-8')

    def test_convert_hebrew(self):
        hebrew = b"\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding, 'iso-8859-8')
        self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9')

    def test_dont_see_smart_quotes_where_there_are_none(self):
        utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
        dammit = UnicodeDammit(utf_8)
        self.assertEqual(dammit.original_encoding, 'utf-8')
        self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)

    def test_ignore_inappropriate_codecs(self):
        utf8_data = u"Räksmörgås".encode("utf-8")
        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding, 'utf-8')

    def test_ignore_invalid_codecs(self):
        utf8_data = u"Räksmörgås".encode("utf-8")
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            dammit = UnicodeDammit(utf8_data, [bad_encoding])
            self.assertEqual(dammit.original_encoding, 'utf-8')

    def test_detect_html5_style_meta_tag(self):
        for data in (
                b'',
                b"",
                b"",
                b""):
            dammit = UnicodeDammit(data, is_html=True)
            self.assertEqual(
                "euc-jp", dammit.original_encoding)

    def test_last_ditch_entity_replacement(self):
        # This is a UTF-8 document that contains bytestrings
        # completely incompatible with UTF-8 (ie. encoded with some other
        # encoding).
        #
        # Since there is no consistent encoding for the document,
        # Unicode, Dammit will eventually encode the document as UTF-8
        # and encode the incompatible characters as REPLACEMENT
        # CHARACTER.
        #
        # If chardet is installed, it will detect that the document
        # can be converted into ISO-8859-1 without errors. This happens
        # to be the wrong encoding, but it is a consistent encoding, so the
        # code we're testing here won't run.
        #
        # So we temporarily disable chardet if it's present.
        doc = b"""\357\273\277 \330\250\330\252\330\261 \310\322\321\220\312\321\355\344"""
        chardet = bs4.dammit.chardet
        try:
            bs4.dammit.chardet = None
            with warnings.catch_warnings(record=True) as w:
                dammit = UnicodeDammit(doc)
                self.assertEqual(True, dammit.contains_replacement_characters)
                self.assertTrue(u"\ufffd" in dammit.unicode_markup)

                soup = BeautifulSoup(doc, "html.parser")
                self.assertTrue(soup.contains_replacement_characters)

                msg = w[0].message
                self.assertTrue(isinstance(msg, UnicodeWarning))
                self.assertTrue("Some characters could not be decoded" in str(msg))
        finally:
            bs4.dammit.chardet = chardet

    def test_sniffed_xml_encoding(self):
        # A document written in UTF-16LE will be converted by a different
        # code path that sniffs the byte order markers.
        data = b'\xff\xfe\x00\xe1\x00\xe9\x00\x00'
        dammit = UnicodeDammit(data)
        self.assertEqual(u"áé", dammit.unicode_markup)
        self.assertEqual("utf-16le", dammit.original_encoding)

    def test_detwingle(self):
        # Here's a UTF8 document.
        utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8")

        # Here's a Windows-1252 document.
        windows_1252 = (
            u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
            u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")

        # Through some unholy alchemy, they've been stuck together.
        doc = utf8 + windows_1252 + utf8

        # The document can't be turned into UTF-8:
        self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")

        # Unicode, Dammit thinks the whole document is Windows-1252,
        # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"

        # But if we run it through fix_embedded_windows_1252, it's fixed:

        fixed = UnicodeDammit.detwingle(doc)
        self.assertEqual(
            u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))

    def test_detwingle_ignores_multibyte_characters(self):
        # Each of these characters has a UTF-8 representation ending
        # in \x93. \x93 is a smart quote if interpreted as
        # Windows-1252. But our code knows to skip over multibyte
        # UTF-8 characters, so they'll survive the process unscathed.
        for tricky_unicode_char in (
                u"\N{LATIN SMALL LIGATURE OE}",  # 2-byte char '\xc5\x93'
                u"\N{LATIN SUBSCRIPT SMALL LETTER X}",  # 3-byte char '\xe2\x82\x93'
                # 4-byte char.  Decoding the bytes gives the single
                # intended character; the original text literal was four
                # separate Latin-1 characters and never exercised the
                # 4-byte path.
                b"\xf0\x90\x90\x93".decode("utf8"),
        ):
            input = tricky_unicode_char.encode("utf8")
            self.assertTrue(input.endswith(b'\x93'))
            output = UnicodeDammit.detwingle(input)
            self.assertEqual(output, input)


class TestNamedspacedAttribute(SoupTest):

    def test_name_may_be_none(self):
        a = NamespacedAttribute("xmlns", None)
        self.assertEqual(a, "xmlns")

    def test_attribute_is_equivalent_to_colon_separated_string(self):
        a = NamespacedAttribute("a", "b")
        self.assertEqual("a:b", a)

    def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
        a = NamespacedAttribute("a", "b", "c")
        b = NamespacedAttribute("a", "b", "c")
        self.assertEqual(a, b)

        # The actual namespace is not considered.
        c = NamespacedAttribute("a", "b", None)
        self.assertEqual(a, c)

        # But name and prefix are important.
        d = NamespacedAttribute("a", "z", "c")
        self.assertNotEqual(a, d)

        e = NamespacedAttribute("z", "b", "c")
        self.assertNotEqual(a, e)


class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):

    # Renamed from test_content_meta_attribute_value: the method below had
    # the same name and silently replaced this one, so it never ran.
    def test_charset_meta_attribute_value(self):
        value = CharsetMetaAttributeValue("euc-jp")
        self.assertEqual("euc-jp", value)
        self.assertEqual("euc-jp", value.original_value)
        self.assertEqual("utf8", value.encode("utf8"))

    def test_content_meta_attribute_value(self):
        value = ContentMetaAttributeValue("text/html; charset=euc-jp")
        self.assertEqual("text/html; charset=euc-jp", value)
        self.assertEqual("text/html; charset=euc-jp", value.original_value)
        self.assertEqual("text/html; charset=utf8", value.encode("utf8"))

beautifulsoup4-4.1.0/bs4/tests/test_lxml.py

"""Tests to ensure that the lxml tree builder generates good trees."""

import re
import warnings

try:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
    LXML_PRESENT = True
except ImportError:
    LXML_PRESENT = False

from bs4 import (
    BeautifulSoup,
    BeautifulStoneSoup,
)
from bs4.element import Comment, Doctype, SoupStrainer
from bs4.testing import skipIf
from bs4.tests import test_htmlparser
from bs4.testing import (
    HTMLTreeBuilderSmokeTest,
    XMLTreeBuilderSmokeTest,
    SoupTest,
    skipIf,
)


@skipIf(
    not LXML_PRESENT,
    "lxml seems not to be present, not testing its tree builder.")
class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
    """See ``HTMLTreeBuilderSmokeTest``."""

    @property
    def default_builder(self):
        return LXMLTreeBuilder()

    def test_out_of_range_entity(self):
        # NOTE(review): the extraction rendered the wrapping element as
        # paragraph breaks and garbled the third entity.  ``<p>`` and
        # ``&#1000000000;`` are reconstructions -- confirm against the
        # upstream release.
        self.assertSoupEquals(
            "<p>foo&#10000000000000;bar</p>", "<p>foobar</p>")
        self.assertSoupEquals(
            "<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>")
        self.assertSoupEquals(
            "<p>foo&#1000000000;bar</p>", "<p>foobar</p>")

    def test_beautifulstonesoup_is_xml_parser(self):
        # Make sure that the deprecated BSS class uses an xml builder
        # if one is installed.
        # NOTE(review): the markup literals in this test and the next look
        # tag-stripped by the extraction; reproduced as found.
        with warnings.catch_warnings(record=False) as w:
            soup = BeautifulStoneSoup("")
        self.assertEqual(u"", unicode(soup.b))

    def test_real_xhtml_document(self):
        """lxml strips the XML definition from an XHTML doc, which is fine."""
        markup = b""" Goodbye."""
        soup = self.soup(markup)
        self.assertEqual(
            soup.encode("utf-8").replace(b"\n", b''),
            markup.replace(b'\n', b'').replace(
                b'', b''))


@skipIf(
    not LXML_PRESENT,
    "lxml seems not to be present, not testing its XML tree builder.")
class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
    """See ``HTMLTreeBuilderSmokeTest``."""

    @property
    def default_builder(self):
        return LXMLTreeBuilderForXML()

beautifulsoup4-4.1.0/bs4/tests/test_builder_registry.py

"""Tests of the builder registry."""

import unittest

from bs4 import BeautifulSoup
from bs4.builder import (
    builder_registry as registry,
    HTMLParserTreeBuilder,
    TreeBuilderRegistry,
)

try:
    from bs4.builder import HTML5TreeBuilder
    HTML5LIB_PRESENT = True
except ImportError:
    HTML5LIB_PRESENT = False

try:
    from bs4.builder import (
        LXMLTreeBuilderForXML,
        LXMLTreeBuilder,
    )
    LXML_PRESENT = True
except ImportError:
    LXML_PRESENT = False


class BuiltInRegistryTest(unittest.TestCase):
    """Test the built-in registry with the default builders registered."""

    def test_combination(self):
        # Feature pairs resolve to whichever optional builder is installed.
        if LXML_PRESENT:
            self.assertEqual(registry.lookup('fast', 'html'),
                             LXMLTreeBuilder)

        if LXML_PRESENT:
            self.assertEqual(registry.lookup('permissive', 'xml'),
                             LXMLTreeBuilderForXML)
        self.assertEqual(registry.lookup('strict', 'html'),
                         HTMLParserTreeBuilder)
        if HTML5LIB_PRESENT:
            self.assertEqual(registry.lookup('html5lib', 'html'),
                             HTML5TreeBuilder)

    def test_lookup_by_markup_type(self):
        # Lookup by markup type alone: lxml first, then html5lib, then
        # html.parser for HTML; no lxml means no XML builder.
        if not LXML_PRESENT:
            self.assertEqual(registry.lookup('xml'), None)
            if HTML5LIB_PRESENT:
                self.assertEqual(registry.lookup('html'), HTML5TreeBuilder)
            else:
                self.assertEqual(registry.lookup('html'),
                                 HTMLParserTreeBuilder)
            return
        self.assertEqual(registry.lookup('html'), LXMLTreeBuilder)
        self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML)

    def test_named_library(self):
        # A library name is itself a feature.
        if LXML_PRESENT:
            self.assertEqual(registry.lookup('lxml', 'xml'),
                             LXMLTreeBuilderForXML)
            self.assertEqual(registry.lookup('lxml', 'html'),
                             LXMLTreeBuilder)
        if HTML5LIB_PRESENT:
            self.assertEqual(registry.lookup('html5lib'),
                             HTML5TreeBuilder)

        self.assertEqual(registry.lookup('html.parser'),
                         HTMLParserTreeBuilder)

    def test_beautifulsoup_constructor_does_lookup(self):
        # You can pass in a string.
        BeautifulSoup("", features="html")
        # Or a list of strings.
        BeautifulSoup("", features=["html", "fast"])

        # You'll get an exception if BS can't find an appropriate
        # builder.
        self.assertRaises(ValueError, BeautifulSoup,
                          "", features="no-such-feature")


class RegistryTest(unittest.TestCase):
    """Test the TreeBuilderRegistry class in general."""

    def setUp(self):
        self.registry = TreeBuilderRegistry()

    def builder_for_features(self, *feature_list):
        # Create a throwaway class advertising `feature_list`, register it
        # and hand it back.
        class_name = 'Builder_' + '_'.join(feature_list)
        builder_cls = type(class_name, (object,),
                           {'features' : feature_list})
        self.registry.register(builder_cls)
        return builder_cls

    def test_register_with_no_features(self):
        featureless = self.builder_for_features()

        # Since the builder advertises no features, you can't find it
        # by looking up features.
        self.assertEqual(self.registry.lookup('foo'), None)

        # But you can find it by doing a lookup with no features, if
        # this happens to be the only registered builder.
        self.assertEqual(self.registry.lookup(), featureless)

    def test_register_with_features_makes_lookup_succeed(self):
        registered = self.builder_for_features('foo', 'bar')
        for feature in ('foo', 'bar'):
            self.assertEqual(self.registry.lookup(feature), registered)

    def test_lookup_fails_when_no_builder_implements_feature(self):
        self.builder_for_features('foo', 'bar')
        self.assertEqual(self.registry.lookup('baz'), None)

    def test_lookup_gets_most_recent_registration_when_no_feature_specified(self):
        self.builder_for_features('foo')
        newest = self.builder_for_features('bar')
        self.assertEqual(self.registry.lookup(), newest)

    def test_lookup_fails_when_no_tree_builders_registered(self):
        self.assertEqual(self.registry.lookup(), None)

    def test_lookup_gets_most_recent_builder_supporting_all_features(self):
        # Registration order matters here, so keep it exactly as is.
        self.builder_for_features('foo')
        self.builder_for_features('bar')
        has_both_early = self.builder_for_features('foo', 'bar', 'baz')
        has_both_late = self.builder_for_features('foo', 'bar', 'quux')
        self.builder_for_features('bar')
        self.builder_for_features('foo')

        # There are two builders featuring 'foo' and 'bar', but
        # the one that also features 'quux' was registered later.
        self.assertEqual(self.registry.lookup('foo', 'bar'),
                         has_both_late)

        # There is only one builder featuring 'foo', 'bar', and 'baz'.
        self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'),
                         has_both_early)

    def test_lookup_fails_when_cannot_reconcile_requested_features(self):
        self.builder_for_features('foo', 'bar')
        self.builder_for_features('foo', 'baz')
        self.assertEqual(self.registry.lookup('bar', 'baz'), None)

beautifulsoup4-4.1.0/bs4/tests/test_html5lib.py

"""Tests to ensure that the html5lib tree builder generates good trees."""

import warnings

try:
    from bs4.builder import HTML5TreeBuilder
    HTML5LIB_PRESENT = True
except ImportError:
    # The exception object itself is not needed; dropping the binding also
    # keeps this statement valid on both Python 2 and Python 3.
    HTML5LIB_PRESENT = False
from bs4.element import SoupStrainer
from bs4.testing import (
    HTML5TreeBuilderSmokeTest,
    SoupTest,
    skipIf,
)


@skipIf(
    not HTML5LIB_PRESENT,
    "html5lib seems not to be present, not testing its tree builder.")
class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
    """See ``HTML5TreeBuilderSmokeTest``."""

    @property
    def default_builder(self):
        """Return a fresh html5lib-backed builder for each soup."""
        return HTML5TreeBuilder()

    def test_soupstrainer(self):
        # The html5lib tree builder does not support SoupStrainers.
        strainer = SoupStrainer("b")
        markup = "<p>A <b>bold</b> statement.</p>"
        with warnings.catch_warnings(record=True) as w:
            # Force the warning to be recorded even if an earlier test
            # already triggered it; otherwise the default filter may
            # suppress the repeat and w[0] would raise IndexError.
            warnings.simplefilter("always")
            soup = self.soup(markup, parse_only=strainer)
        self.assertEqual(
            soup.decode(), self.document_for(markup))

        self.assertTrue(
            "the html5lib tree builder doesn't support parse_only" in
            str(w[0].message))

    def test_correctly_nested_tables(self):
        """html5lib inserts <tbody> tags where other parsers don't."""
        markup = ('<table id="1">'
                  '<tr>'
                  "<td>Here's another table:"
                  '<table id="2">'
                  '<tr><td>foo</td></tr>'
                  '</table></td>')

        self.assertSoupEquals(
            markup,
            '<table id="1"><tbody><tr><td>Here\'s another table:'
            '<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
            '</td></tr></tbody></table>')

        self.assertSoupEquals(
            "<table><thead><tr><td>Foo</td></tr></thead>"
            "<tbody><tr><td>Bar</td></tr></tbody>"
            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")

beautifulsoup4-4.1.0/bs4/tests/test_docs.py

"Test harness for doctests."

# pylint: disable-msg=E0611,W0142

__metaclass__ = type

# additional_tests() is commented out below, so this module currently
# exports nothing.  Listing the missing name here would make
# "from ... import *" raise AttributeError.  Restore 'additional_tests'
# if the function is ever re-enabled.
__all__ = []

import atexit
import doctest
import os
#from pkg_resources import (
#    resource_filename, resource_exists, resource_listdir, cleanup_resources)
import unittest

# Option flags intended for the doc-file suite built by additional_tests().
DOCTEST_FLAGS = (
    doctest.ELLIPSIS |
    doctest.NORMALIZE_WHITESPACE |
    doctest.REPORT_NDIFF)


# def additional_tests():
#     "Run the doc tests (README.txt and docs/*, if any exist)"
#     doctest_files = [
#         os.path.abspath(resource_filename('bs4', 'README.txt'))]
#     if resource_exists('bs4', 'docs'):
#         for name in resource_listdir('bs4', 'docs'):
#             if name.endswith('.txt'):
#                 doctest_files.append(
#                     os.path.abspath(
#                         resource_filename('bs4', 'docs/%s' % name)))
#     kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS)
#     atexit.register(cleanup_resources)
#     return unittest.TestSuite((
#         doctest.DocFileSuite(*doctest_files, **kwargs)))

beautifulsoup4-4.1.0/bs4/tests/test_tree.py

# -*- coding: utf-8 -*- """Tests for Beautiful Soup's tree traversal methods. The tree traversal methods are the main advantage of using Beautiful Soup over just using a parser. Different parsers will build different Beautiful Soup trees given the same markup, but all Beautiful Soup trees can be traversed with the methods tested here. """ import copy import pickle import re import warnings from bs4 import BeautifulSoup from bs4.builder import ( builder_registry, HTMLParserTreeBuilder, ) from bs4.element import ( CData, Doctype, NavigableString, SoupStrainer, Tag, ) from bs4.testing import ( SoupTest, skipIf, ) XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None) LXML_PRESENT = (builder_registry.lookup("lxml") is not None) class TreeTest(SoupTest): def assertSelects(self, tags, should_match): """Make sure that the given tags have the correct text. This is used in tests that define a bunch of tags, each containing a single string, and then select certain strings by some mechanism. """ self.assertEqual([tag.string for tag in tags], should_match) def assertSelectsIDs(self, tags, should_match): """Make sure that the given tags have the correct IDs. This is used in tests that define a bunch of tags, each containing a single string, and then select certain strings by some mechanism. """ self.assertEqual([tag['id'] for tag in tags], should_match) class TestFind(TreeTest): """Basic tests of the find() method. find() just calls find_all() with limit=1, so it's not tested all that thouroughly here. """ def test_find_tag(self): soup = self.soup("1234") self.assertEqual(soup.find("b").string, "2") def test_unicode_text_find(self): soup = self.soup(u'

Räksmörgås

') self.assertEqual(soup.find(text=u'Räksmörgås'), u'Räksmörgås') class TestFindAll(TreeTest): """Basic tests of the find_all() method.""" def test_find_all_text_nodes(self): """You can search the tree for text nodes.""" soup = self.soup("Foobar\xbb") # Exact match. self.assertEqual(soup.find_all(text="bar"), [u"bar"]) # Match any of a number of strings. self.assertEqual( soup.find_all(text=["Foo", "bar"]), [u"Foo", u"bar"]) # Match a regular expression. self.assertEqual(soup.find_all(text=re.compile('.*')), [u"Foo", u"bar", u'\xbb']) # Match anything. self.assertEqual(soup.find_all(text=True), [u"Foo", u"bar", u'\xbb']) def test_find_all_limit(self): """You can limit the number of items returned by find_all.""" soup = self.soup("1 2 3 4 5") self.assertSelects(soup.find_all('a', limit=3), ["1", "2", "3"]) self.assertSelects(soup.find_all('a', limit=1), ["1"]) self.assertSelects( soup.find_all('a', limit=10), ["1", "2", "3", "4", "5"]) # A limit of 0 means no limit. self.assertSelects( soup.find_all('a', limit=0), ["1", "2", "3", "4", "5"]) def test_calling_a_tag_is_calling_findall(self): soup = self.soup("123") self.assertSelects(soup('a', limit=1), ["1"]) self.assertSelects(soup.b(id="foo"), ["3"]) def test_find_all_with_self_referential_data_structure_does_not_cause_infinite_recursion(self): soup = self.soup("") # Create a self-referential list. l = [] l.append(l) # Without special code in _normalize_search_value, this would cause infinite # recursion. self.assertEqual([], soup.find_all(l)) class TestFindAllBasicNamespaces(TreeTest): def test_find_by_namespaced_name(self): soup = self.soup('4') self.assertEqual("4", soup.find("mathml:msqrt").string) self.assertEqual("a", soup.find(attrs= { "svg:fill" : "red" }).name) class TestFindAllByName(TreeTest): """Test ways of finding tags by tag name.""" def setUp(self): super(TreeTest, self).setUp() self.tree = self.soup(""" First tag. Second tag. Third Nested tag. 
tag.""") def test_find_all_by_tag_name(self): # Find all the tags. self.assertSelects( self.tree.find_all('a'), ['First tag.', 'Nested tag.']) def test_find_all_by_name_and_text(self): self.assertSelects( self.tree.find_all('a', text='First tag.'), ['First tag.']) self.assertSelects( self.tree.find_all('a', text=True), ['First tag.', 'Nested tag.']) self.assertSelects( self.tree.find_all('a', text=re.compile("tag")), ['First tag.', 'Nested tag.']) def test_find_all_on_non_root_element(self): # You can call find_all on any node, not just the root. self.assertSelects(self.tree.c.find_all('a'), ['Nested tag.']) def test_calling_element_invokes_find_all(self): self.assertSelects(self.tree('a'), ['First tag.', 'Nested tag.']) def test_find_all_by_tag_strainer(self): self.assertSelects( self.tree.find_all(SoupStrainer('a')), ['First tag.', 'Nested tag.']) def test_find_all_by_tag_names(self): self.assertSelects( self.tree.find_all(['a', 'b']), ['First tag.', 'Second tag.', 'Nested tag.']) def test_find_all_by_tag_dict(self): self.assertSelects( self.tree.find_all({'a' : True, 'b' : True}), ['First tag.', 'Second tag.', 'Nested tag.']) def test_find_all_by_tag_re(self): self.assertSelects( self.tree.find_all(re.compile('^[ab]$')), ['First tag.', 'Second tag.', 'Nested tag.']) def test_find_all_with_tags_matching_method(self): # You can define an oracle method that determines whether # a tag matches the search. def id_matches_name(tag): return tag.name == tag.get('id') tree = self.soup(""" Match 1. Does not match. Match 2.""") self.assertSelects( tree.find_all(id_matches_name), ["Match 1.", "Match 2."]) class TestFindAllByAttribute(TreeTest): def test_find_all_by_attribute_name(self): # You can pass in keyword arguments to find_all to search by # attribute. tree = self.soup(""" Matching a. Non-matching Matching b.a. 
""") self.assertSelects(tree.find_all(id='first'), ["Matching a.", "Matching b."]) def test_find_all_by_utf8_attribute_value(self): peace = u"םולש".encode("utf8") data = u''.encode("utf8") soup = self.soup(data) self.assertEqual([soup.a], soup.find_all(title=peace)) self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8"))) self.assertEqual([soup.a], soup.find_all(title=[peace, "something else"])) def test_find_all_by_attribute_dict(self): # You can pass in a dictionary as the argument 'attrs'. This # lets you search for attributes like 'name' (a fixed argument # to find_all) and 'class' (a reserved word in Python.) tree = self.soup(""" Name match. Class match. Non-match. A tag called 'name1'. """) # This doesn't do what you want. self.assertSelects(tree.find_all(name='name1'), ["A tag called 'name1'."]) # This does what you want. self.assertSelects(tree.find_all(attrs={'name' : 'name1'}), ["Name match."]) # Passing class='class2' would cause a syntax error. self.assertSelects(tree.find_all(attrs={'class' : 'class2'}), ["Class match."]) def test_find_all_by_class(self): # Passing in a string to 'attrs' will search the CSS class. tree = self.soup(""" Class 1. Class 2. Class 1. Class 3 and 4. """) self.assertSelects(tree.find_all('a', '1'), ['Class 1.']) self.assertSelects(tree.find_all(attrs='1'), ['Class 1.', 'Class 1.']) self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.']) self.assertSelects(tree.find_all('c', '4'), ['Class 3 and 4.']) def test_find_by_class_when_multiple_classes_present(self): tree = self.soup("Found it") attrs = { 'class' : re.compile("o") } f = tree.find_all("gar", attrs=attrs) self.assertSelects(f, ["Found it"]) f = tree.find_all("gar", re.compile("a")) self.assertSelects(f, ["Found it"]) # Since the class is not the string "foo bar", but the two # strings "foo" and "bar", this will not find anything. 
attrs = { 'class' : re.compile("o b") } f = tree.find_all("gar", attrs=attrs) self.assertSelects(f, []) def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self): soup = self.soup("Found it") self.assertSelects(soup.find_all("a", re.compile("ba")), ["Found it"]) def big_attribute_value(value): return len(value) > 3 self.assertSelects(soup.find_all("a", big_attribute_value), []) def small_attribute_value(value): return len(value) ') a, a2 = soup.find_all("a") self.assertEqual([a, a2], soup.find_all("a", "foo")) self.assertEqual([a], soup.find_all("a", "bar")) # If you specify the attribute as a string that contains a # space, only that specific value will be found. self.assertEqual([a], soup.find_all("a", "foo bar")) self.assertEqual([], soup.find_all("a", "bar foo")) def test_find_all_by_attribute_soupstrainer(self): tree = self.soup(""" Match. Non-match.""") strainer = SoupStrainer(attrs={'id' : 'first'}) self.assertSelects(tree.find_all(strainer), ['Match.']) def test_find_all_with_missing_atribute(self): # You can pass in None as the value of an attribute to find_all. # This will match tags that do not have that attribute set. tree = self.soup("""ID present. No ID present. ID is empty.""") self.assertSelects(tree.find_all('a', id=None), ["No ID present."]) def test_find_all_with_defined_attribute(self): # You can pass in None as the value of an attribute to find_all. # This will match tags that have that attribute set to any value. tree = self.soup("""ID present. No ID present. ID is empty.""") self.assertSelects( tree.find_all(id=True), ["ID present.", "ID is empty."]) def test_find_all_with_numeric_attribute(self): # If you search for a number, it's treated as a string. tree = self.soup("""Unquoted attribute. 
Quoted attribute.""") expected = ["Unquoted attribute.", "Quoted attribute."] self.assertSelects(tree.find_all(id=1), expected) self.assertSelects(tree.find_all(id="1"), expected) def test_find_all_with_list_attribute_values(self): # You can pass a list of attribute values instead of just one, # and you'll get tags that match any of the values. tree = self.soup("""1 2 3 No ID.""") self.assertSelects(tree.find_all(id=["1", "3", "4"]), ["1", "3"]) def test_find_all_with_regular_expression_attribute_value(self): # You can pass a regular expression as an attribute value, and # you'll get tags whose values for that attribute match the # regular expression. tree = self.soup("""One a. Two as. Mixed as and bs. One b. No ID.""") self.assertSelects(tree.find_all(id=re.compile("^a+$")), ["One a.", "Two as."]) def test_find_by_name_and_containing_string(self): soup = self.soup("foobarfoo") a = soup.a self.assertEqual([a], soup.find_all("a", text="foo")) self.assertEqual([], soup.find_all("a", text="bar")) self.assertEqual([], soup.find_all("a", text="bar")) def test_find_by_name_and_containing_string_when_string_is_buried(self): soup = self.soup("foo foo") self.assertEqual(soup.find_all("a"), soup.find_all("a", text="foo")) def test_find_by_attribute_and_containing_string(self): soup = self.soup('foofoo') a = soup.a self.assertEqual([a], soup.find_all(id=2, text="foo")) self.assertEqual([], soup.find_all(id=1, text="bar")) class TestIndex(TreeTest): """Test Tag.index""" def test_index(self): tree = self.soup(""" Identical Not identical Identical Identical with child Also not identical Identical with child """) div = tree.div for i, element in enumerate(div.contents): self.assertEqual(i, div.index(element)) self.assertRaises(ValueError, tree.index, 1) class TestParentOperations(TreeTest): """Test navigation and searching through an element's parents.""" def setUp(self): super(TestParentOperations, self).setUp() self.tree = self.soup('''

beautifulsoup4-4.1.0/bs4/builder/__init__.py

from collections import defaultdict
import itertools
import sys
from bs4.element import (
    CharsetMetaAttributeValue,
    ContentMetaAttributeValue,
    whitespace_re
    )

__all__ = [
    'HTMLTreeBuilder',
    'SAXTreeBuilder',
    'TreeBuilder',
    'TreeBuilderRegistry',
    ]

# Some useful features for a TreeBuilder to have.
FAST = 'fast'
PERMISSIVE = 'permissive'
STRICT = 'strict'
XML = 'xml'
HTML = 'html'
HTML_5 = 'html5'


class TreeBuilderRegistry(object):
    """Keeps track of tree builder classes and the features they advertise."""

    def __init__(self):
        # feature name -> builder classes, most recently registered first.
        self.builders_for_feature = defaultdict(list)
        # Every registered builder class, most recently registered first.
        self.builders = []

    def register(self, treebuilder_class):
        """Register a treebuilder based on its advertised features."""
        for advertised in treebuilder_class.features:
            # Front insertion gives later registrations priority.
            self.builders_for_feature[advertised].insert(0, treebuilder_class)
        self.builders.insert(0, treebuilder_class)

    def lookup(self, *features):
        """Find the best registered builder for the requested features.

        With no features, the most recently registered builder is
        returned.  Otherwise the result is the highest-priority builder
        of the first implemented feature that also advertises every
        other implemented feature.  Features that no registered builder
        advertises are ignored.  Returns None when nothing qualifies.
        """
        if not self.builders:
            # There are no builders at all.
            return None

        if not features:
            # They didn't ask for any features. Give them the most
            # recently registered builder.
            return self.builders[0]

        # Walk the features in the order given, narrowing the set of
        # builders that advertise every implemented feature seen so far.
        ordered_candidates = None
        surviving = None
        for wanted in features:
            implementers = self.builders_for_feature.get(wanted, [])
            if not implementers:
                # Nobody implements this feature; it does not constrain
                # the result.
                continue
            if ordered_candidates is None:
                ordered_candidates = implementers
                surviving = set(implementers)
            else:
                # Eliminate any candidates that don't have this feature.
                surviving = surviving.intersection(implementers)

        if surviving is None:
            # None of the requested features is implemented by anyone.
            return None

        # The only valid candidates are the survivors.  Pick the first
        # one in the priority order of the original candidate list.
        for builder in ordered_candidates:
            if builder in surviving:
                return builder
        return None

# The BeautifulSoup class will take feature lists from developers and use them
# to look up builders in this registry.
builder_registry = TreeBuilderRegistry()


class TreeBuilder(object):
    """Turn a document into a Beautiful Soup object tree."""

    features = []

    is_xml = False
    preserve_whitespace_tags = set()
    empty_element_tags = None # A tag will be considered an empty-element
                              # tag when and only when it has no contents.

    # A value for these tag/attribute combinations is a space- or
    # comma-separated list of CDATA, rather than a single CDATA.
    cdata_list_attributes = {}

    def __init__(self):
        # Set later by whoever drives the builder.
        self.soup = None

    def reset(self):
        """Hook for subclasses; the base builder holds no state to reset."""
        pass

    def can_be_empty_element(self, tag_name):
        """Might a tag with this name be an empty-element tag?

        The final markup may or may not actually present this tag as
        self-closing.

        For instance: an HTMLBuilder does not consider a <p> tag to be
        an empty-element tag (it's not in
        HTMLBuilder.empty_element_tags). This means an empty <p> tag
        will be presented as "<p></p>", not "<p />".

        The default implementation has no opinion about which tags are
        empty-element tags, so a tag will be presented as an
        empty-element tag if and only if it has no contents.
        "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
        be left alone.
        """
        if self.empty_element_tags is None:
            return True
        return tag_name in self.empty_element_tags

    def feed(self, markup):
        """Parse `markup`; must be implemented by subclasses."""
        raise NotImplementedError()

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """Return a 4-tuple starting with the markup to parse.

        The base implementation passes the markup through untouched and
        reports (None, None, False) for the remaining three slots.
        """
        return markup, None, None, False

    def test_fragment_to_document(self, fragment):
        """Wrap an HTML fragment to make it look like a document.

        Different parsers do this differently. For instance, lxml
        introduces an empty <head> tag, and html5lib
        doesn't. Abstracting this away lets us write simple tests
        which run HTML fragments through the parser and compare the
        results against other HTML fragments.

        This method should not be used outside of tests.
        """
        return fragment

    def set_up_substitutions(self, tag):
        """Hook for subclasses; the base builder never substitutes anything."""
        return False

    def _replace_cdata_list_attribute_values(self, tag_name, attrs):
        """Replaces class="foo bar" with class=["foo", "bar"]

        Modifies its input in place.

        :param tag_name: Name of the tag the attributes belong to.
        :param attrs: Mapping of attribute name to attribute value.
        :return: The same `attrs` object that was passed in.
        """
        if self.cdata_list_attributes:
            universal = self.cdata_list_attributes.get('*', [])
            tag_specific = self.cdata_list_attributes.get(
                tag_name.lower(), [])
            for cdata_list_attr in itertools.chain(universal, tag_specific):
                # Test membership on the mapping directly rather than
                # copying it into a new dict for every attribute.
                if cdata_list_attr in attrs:
                    # Basically, we have a "class" attribute whose
                    # value is a whitespace-separated list of CSS
                    # classes. Split it into a list.
                    value = attrs[cdata_list_attr]
                    values = whitespace_re.split(value)
                    attrs[cdata_list_attr] = values
        return attrs


class SAXTreeBuilder(TreeBuilder):
    """A Beautiful Soup treebuilder that listens for SAX events."""

    def feed(self, markup):
        raise NotImplementedError()

    def close(self):
        pass

    def startElement(self, name, attrs):
        # Attribute keys are indexed with [1]; presumably they arrive as
        # (namespace, local name) pairs -- TODO confirm.
        attrs = dict((key[1], value) for key, value in list(attrs.items()))
        #print "Start %s, %r" % (name, attrs)
        # NOTE(review): the other builders in this package call
        # handle_starttag with (name, namespace, nsprefix, attrs); confirm
        # that this two-argument call still matches its signature.
        self.soup.handle_starttag(name, attrs)

    def endElement(self, name):
        #print "End %s" % name
        self.soup.handle_endtag(name)

    def startElementNS(self, nsTuple, nodeName, attrs):
        # Throw away (ns, nodeName) for now.
        self.startElement(nodeName, attrs)

    def endElementNS(self, nsTuple, nodeName):
        # Throw away (ns, nodeName) for now.
        self.endElement(nodeName)
        #handler.endElementNS((ns, node.nodeName), node.nodeName)

    def startPrefixMapping(self, prefix, nodeValue):
        # Ignore the prefix for now.
        pass

    def endPrefixMapping(self, prefix):
        # Ignore the prefix for now.
        # handler.endPrefixMapping(prefix)
        pass

    def characters(self, content):
        self.soup.handle_data(content)

    def startDocument(self):
        pass

    def endDocument(self):
        pass


class HTMLTreeBuilder(TreeBuilder):
    """This TreeBuilder knows facts about HTML.

    Such as which tags are empty-element tags.
    """

    preserve_whitespace_tags = set(['pre', 'textarea'])
    empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
                              'spacer', 'link', 'frame', 'base'])

    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
    # class="foo bar" means that the 'class' attribute has two values,
    # 'foo' and 'bar', not the single value 'foo bar'.  When we
    # encounter one of these attributes, we will parse its value into
    # a list of values if possible. Upon output, the list will be
    # converted back into a string.
    cdata_list_attributes = {
        "*" : ['class', 'accesskey', 'dropzone'],
        "a" : ['rel', 'rev'],
        "link" :  ['rel', 'rev'],
        "td" : ["headers"],
        "th" : ["headers"],
        "form" : ["accept-charset"],
        "object" : ["archive"],

        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
        "area" : ["rel"],
        "icon" : ["sizes"],
        "iframe" : ["sandbox"],
        "output" : ["for"],
        }

    def set_up_substitutions(self, tag):
        """Replace encoding declarations in a <meta> tag with stand-ins.

        :param tag: The tag to inspect; it is modified in place.
        :return: True only when an HTML 5 style ``charset`` attribute
            was replaced.  The HTML 4 ``content`` replacement does not
            affect the return value.
        """
        # We are only interested in <meta> tags
        if tag.name != 'meta':
            return False

        http_equiv = tag.get('http-equiv')
        content = tag.get('content')
        charset = tag.get('charset')

        # We are interested in <meta> tags that say what encoding the
        # document was originally in. This means HTML 5-style <meta>
        # tags that provide the "charset" attribute. It also means
        # HTML 4-style <meta> tags that provide the "content"
        # attribute and have "http-equiv" set to "content-type".
        #
        # In both cases we will replace the value of the appropriate
        # attribute with a standin object that can take on any
        # encoding.
        meta_encoding = None
        if charset is not None:
            # HTML 5 style:
            # <meta charset="utf8">
            meta_encoding = charset
            tag['charset'] = CharsetMetaAttributeValue(charset)

        elif (content is not None and http_equiv is not None
              and http_equiv.lower() == 'content-type'):
            # HTML 4 style:
            # <meta http-equiv="content-type" content="text/html; charset=utf8">
            # NOTE(review): meta_encoding stays None on this path, so the
            # method returns False despite the substitution -- confirm
            # that callers do not rely on the return value here.
            tag['content'] = ContentMetaAttributeValue(content)

        return (meta_encoding is not None)


def register_treebuilders_from(module):
    """Copy TreeBuilders from the given module into this module.

    Every name in ``module.__all__`` that is a TreeBuilder subclass is
    re-exported from bs4.builder and registered with builder_registry.
    Names bound to anything else are skipped.
    """
    # I'm fairly sure this is not the best way to do this.
    this_module = sys.modules['bs4.builder']
    for name in module.__all__:
        obj = getattr(module, name)

        # issubclass() raises TypeError for non-class objects, so check
        # that we actually have a class first.
        if isinstance(obj, type) and issubclass(obj, TreeBuilder):
            setattr(this_module, name, obj)
            this_module.__all__.append(name)
            # Register the builder while we're at it.
            this_module.builder_registry.register(obj)

# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last result.
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:
    from . import _html5lib
    register_treebuilders_from(_html5lib)
except ImportError:
    # They don't have html5lib installed.
    pass
try:
    from . import _lxml
    register_treebuilders_from(_lxml)
except ImportError:
    # They don't have lxml installed.
    pass

beautifulsoup4-4.1.0/bs4/builder/_html5lib.py

__all__ = [
    'HTML5TreeBuilder',
    ]

import warnings
from bs4.builder import (
    PERMISSIVE,
    HTML,
    HTML_5,
    HTMLTreeBuilder,
    )
from bs4.element import NamespacedAttribute
import html5lib
from html5lib.constants import namespaces
from bs4.element import (
    Comment,
    Doctype,
    NavigableString,
    Tag,
    )


class HTML5TreeBuilder(HTMLTreeBuilder):
    """Use html5lib to build a tree."""

    features = ['html5lib', PERMISSIVE, HTML_5, HTML]

    def prepare_markup(self, markup, user_specified_encoding):
        """Remember the requested encoding and pass the markup through."""
        # Store the user-specified encoding for use later on.
        self.user_specified_encoding = user_specified_encoding
        return markup, None, None, False

    # These methods are defined by Beautiful Soup.
    def feed(self, markup):
        """Parse `markup` with html5lib, building the tree in self.soup."""
        if self.soup.parse_only is not None:
            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
        doc = parser.parse(markup, encoding=self.user_specified_encoding)

        # Set the character encoding detected by the tokenizer.
        if isinstance(markup, unicode):
            # We need to special-case this because html5lib sets
            # charEncoding to UTF-8 if it gets Unicode input.
            doc.original_encoding = None
        else:
            doc.original_encoding = parser.tokenizer.stream.charEncoding[0]

    def create_treebuilder(self, namespaceHTMLElements):
        """Factory handed to html5lib; builds the adapter around self.soup."""
        self.underlying_builder = TreeBuilderForHtml5lib(
            self.soup, namespaceHTMLElements)
        return self.underlying_builder

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        # NOTE(review): the format string contains only the placeholder;
        # confirm that no surrounding markup was lost from it.
        return u'%s' % fragment


class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
    """html5lib tree builder that creates Beautiful Soup objects."""

    def __init__(self, soup, namespaceHTMLElements):
        self.soup = soup
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

    def documentClass(self):
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token):
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name, namespace):
        tag = self.soup.new_tag(name, namespace)
        return Element(tag, self.soup, namespace)

    def commentClass(self, data):
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        # Imported here rather than at module level: the name was never
        # imported in this module, so this method used to raise NameError.
        from bs4 import BeautifulSoup
        self.soup = BeautifulSoup("")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)

    def appendChild(self, node):
        # XXX This code is not covered by the BS4 tests.
        self.soup.append(node.element)

    def getDocument(self):
        return self.soup

    def getFragment(self):
        return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element


class AttrList(object):
    """Mapping-like view of a tag's attributes.

    Reads are served from a snapshot taken at construction time; writes
    go to the underlying element and do not update the snapshot.
    """

    def __init__(self, element):
        self.element = element
        self.attrs = dict(self.element.attrs)

    def __iter__(self):
        # Iterates over (name, value) pairs, not just names.
        return list(self.attrs.items()).__iter__()

    def __setitem__(self, name, value):
        self.element[name] = value

    def items(self):
        return list(self.attrs.items())

    def keys(self):
        return list(self.attrs.keys())

    def __len__(self):
        return len(self.attrs)

    def __getitem__(self, name):
        return self.attrs[name]

    def __contains__(self, name):
        return name in self.attrs


class Element(html5lib.treebuilders._base.Node):
    """html5lib node wrapping a Beautiful Soup element."""

    def __init__(self, element, soup, namespace):
        html5lib.treebuilders._base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace

    def appendChild(self, node):
        # The comparisons use the exact class, so only objects that are
        # precisely NavigableString are merged with a preceding one.
        if (node.element.__class__ == NavigableString and self.element.contents
            and self.element.contents[-1].__class__ == NavigableString):
            # Concatenate new text onto old text node
            # XXX This has O(n^2) performance, for input like
            # "a</a>a</a>a</a>..."
            old_element = self.element.contents[-1]
            new_element = self.soup.new_string(old_element + node.element)
            old_element.replace_with(new_element)
        else:
            self.element.append(node.element)
            node.parent = self

    def getAttributes(self):
        return AttrList(self.element)

    def setAttributes(self, attributes):
        if attributes is not None and len(attributes) > 0:

            # Iterate over a copy because the dictionary is modified
            # while namespaced names are converted.
            for name, value in list(attributes.items()):
                if isinstance(name, tuple):
                    new_name = NamespacedAttribute(*name)
                    del attributes[name]
                    attributes[new_name] = value

            self.soup.builder._replace_cdata_list_attribute_values(
                self.name, attributes)
            for name, value in attributes.items():
                self.element[name] = value

            # The attributes may contain variables that need substitution.
            # Call set_up_substitutions manually.
            #
            # The Tag constructor called this method when the Tag was created,
            # but we just set/changed the attributes, so call it again.
            self.soup.builder.set_up_substitutions(self.element)
    attributes = property(getAttributes, setAttributes)

    def insertText(self, data, insertBefore=None):
        text = TextNode(self.soup.new_string(data), self.soup)
        if insertBefore:
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(text)

    def insertBefore(self, node, refNode):
        index = self.element.index(refNode.element)
        # Only merge with a preceding string when there is one.  Without
        # the index check, index 0 made contents[index-1] wrap around to
        # the LAST child and the text was merged at the wrong position.
        if (node.element.__class__ == NavigableString and index > 0
            and self.element.contents[index-1].__class__ == NavigableString):
            # (See comments in appendChild)
            old_node = self.element.contents[index-1]
            new_str = self.soup.new_string(old_node + node.element)
            old_node.replace_with(new_str)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node):
        node.element.extract()

    def reparentChildren(self, newParent):
        # Detach children one at a time from the front until none remain.
        while self.element.contents:
            child = self.element.contents[0]
            child.extract()
            if isinstance(child, Tag):
                newParent.appendChild(
                    Element(child, self.soup, namespaces["html"]))
            else:
                newParent.appendChild(
                    TextNode(child, self.soup))

    def cloneNode(self):
        tag = self.soup.new_tag(self.element.name, self.namespace)
        node = Element(tag, self.soup, self.namespace)
        # Iterating the attributes property yields (name, value) pairs.
        for key,value in self.attributes:
            node.attributes[key] = value
        return node

    def hasContent(self):
        # Returns the contents list itself; callers get its truthiness.
        return self.element.contents

    def getNameTuple(self):
        if self.namespace is None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name

    nameTuple = property(getNameTuple)


class TextNode(Element):
    """Node wrapping a string-like element such as text or a comment."""

    def __init__(self, element, soup):
        html5lib.treebuilders._base.Node.__init__(self, None)
        self.element = element
        self.soup = soup

    def cloneNode(self):
        raise NotImplementedError

beautifulsoup4-4.1.0/bs4/builder/_lxml.py

__all__ = [
    'LXMLTreeBuilderForXML',
    'LXMLTreeBuilder',
    ]

from StringIO import StringIO
import collections
from lxml import etree
from bs4.element import Comment, Doctype, NamespacedAttribute
from bs4.builder import (
    FAST,
    HTML,
    HTMLTreeBuilder,
    PERMISSIVE,
    TreeBuilder,
    XML)
from bs4.dammit import UnicodeDammit

LXML = 'lxml'


class LXMLTreeBuilderForXML(TreeBuilder):
    """Tree builder that uses lxml's XML parser, acting as its target."""

    DEFAULT_PARSER_CLASS = etree.XMLParser

    is_xml = True

    # Well, it's permissive by XML parser standards.
    features = [LXML, XML, FAST, PERMISSIVE]

    # Number of characters handed to the parser per feed() call.
    CHUNK_SIZE = 512

    @property
    def default_parser(self):
        # This can either return a parser object or a class, which
        # will be instantiated with default arguments.
        return etree.XMLParser(target=self, strip_cdata=False, recover=True)

    def __init__(self, parser=None, empty_element_tags=None):
        """Set up the builder.

        :param parser: A parser object, or a callable that is invoked
            with ``target=self, strip_cdata=False`` to create one.
            Defaults to `default_parser`.
        :param empty_element_tags: Optional iterable of tag names that
            overrides the class-level `empty_element_tags`.
        """
        if empty_element_tags is not None:
            self.empty_element_tags = set(empty_element_tags)
        if parser is None:
            # Use the default parser.
            parser = self.default_parser
        if isinstance(parser, collections.Callable):
            # Instantiate the parser with default arguments
            parser = parser(target=self, strip_cdata=False)
        self.parser = parser
        self.soup = None
        # Stack of inverted namespace maps (URL -> prefix); None while no
        # namespaces are in play.
        self.nsmaps = None

    def _getNsTag(self, tag):
        # Split the namespace URL out of a fully-qualified lxml tag
        # name. Copied from lxml's src/lxml/sax.py.
        if tag[0] == '{':
            return tuple(tag[1:].split('}', 1))
        else:
            return (None, tag)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """
        :return: A 4-tuple (markup, original encoding, encoding
        declared within markup, whether any characters had to be
        replaced with REPLACEMENT CHARACTER).
        """
        if isinstance(markup, unicode):
            # Already decoded; there is no encoding to detect.
            return markup, None, None, False

        try_encodings = [user_specified_encoding, document_declared_encoding]
        dammit = UnicodeDammit(markup, try_encodings, is_html=True)
        return (dammit.markup, dammit.original_encoding,
                dammit.declared_html_encoding,
                dammit.contains_replacement_characters)

    def feed(self, markup):
        """Feed `markup` (a string or file-like object) to the parser."""
        if isinstance(markup, basestring):
            markup = StringIO(markup)
        # Call feed() at least once, even if the markup is empty,
        # or the parser won't be initialized.
        data = markup.read(self.CHUNK_SIZE)
        self.parser.feed(data)
        while data != '':
            # Now call feed() on the rest of the data, chunk by chunk.
            data = markup.read(self.CHUNK_SIZE)
            if data != '':
                self.parser.feed(data)
        self.parser.close()

    def close(self):
        self.nsmaps = None

    def start(self, name, attrs, nsmap=None):
        """Parser-target callback for an opening tag.

        :param name: Tag name, possibly in lxml's ``{namespace}name`` form.
        :param attrs: Mapping of attribute names to values.
        :param nsmap: Mapping of prefix -> namespace URL introduced by
            this tag; omitted or empty when there is none.
        """
        # Avoid a shared mutable default argument.
        if nsmap is None:
            nsmap = {}
        # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
        attrs = dict(attrs)

        nsprefix = None
        # Invert each namespace map as it comes in.
        if len(nsmap) == 0 and self.nsmaps is not None:
            # There are no new namespaces for this tag, but namespaces
            # are in play, so we need a separate tag stack to know
            # when they end.
            self.nsmaps.append(None)
        elif len(nsmap) > 0:
            # A new namespace mapping has come into play.
            if self.nsmaps is None:
                self.nsmaps = []
            inverted_nsmap = dict((value, key) for key, value in nsmap.items())
            self.nsmaps.append(inverted_nsmap)
            # Also treat the namespace mapping as a set of attributes on the
            # tag, so we can recreate it later.  (attrs is already a
            # private copy at this point.)
            for prefix, namespace in nsmap.items():
                attribute = NamespacedAttribute(
                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
                attrs[attribute] = namespace
        namespace, name = self._getNsTag(name)
        if namespace is not None:
            # Search the innermost mappings first.
            for inverted_nsmap in reversed(self.nsmaps):
                if inverted_nsmap is not None and namespace in inverted_nsmap:
                    nsprefix = inverted_nsmap[namespace]
                    break
        self.soup.handle_starttag(name, namespace, nsprefix, attrs)

    def end(self, name):
        """Parser-target callback for a closing tag."""
        self.soup.endData()
        namespace, name = self._getNsTag(name)
        nsprefix = None
        if namespace is not None:
            for inverted_nsmap in reversed(self.nsmaps):
                if inverted_nsmap is not None and namespace in inverted_nsmap:
                    nsprefix = inverted_nsmap[namespace]
                    break
        self.soup.handle_endtag(name, nsprefix)
        if self.nsmaps is not None:
            # This tag, or one of its parents, introduced a namespace
            # mapping, so pop it off the stack.
            self.nsmaps.pop()
            if len(self.nsmaps) == 0:
                # Namespaces are no longer in play, so don't bother keeping
                # track of the namespace stack.
                self.nsmaps = None

    def pi(self, target, data):
        # Processing instructions are ignored.
        pass

    def data(self, content):
        self.soup.handle_data(content)

    def doctype(self, name, pubid, system):
        self.soup.endData()
        doctype = Doctype.for_name_and_ids(name, pubid, system)
        self.soup.object_was_parsed(doctype)

    def comment(self, content):
        "Handle comments as Comment objects."
        self.soup.endData()
        self.soup.handle_data(content)
        self.soup.endData(Comment)

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        # NOTE(review): the format string is just a newline plus the
        # placeholder; confirm that no leading markup was lost from it.
        return u'\n%s' % fragment


class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
    """Tree builder that uses lxml's HTML parser."""

    features = [LXML, HTML, FAST, PERMISSIVE]
    is_xml = False

    @property
    def default_parser(self):
        # Returns the class; __init__ instantiates it with this builder
        # as the parser target.
        return etree.HTMLParser

    def feed(self, markup):
        # The whole document is handed over in a single call.
        self.parser.feed(markup)
        self.parser.close()

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        # NOTE(review): the format string contains only the placeholder;
        # confirm that no surrounding markup was lost from it.
        return u'%s' % fragment

beautifulsoup4-4.1.0/bs4/builder/_htmlparser.py

"""Use the HTMLParser library to parse HTML files that aren't too bad.""" __all__ = [ 'HTMLParserTreeBuilder', ] from HTMLParser import ( HTMLParser, HTMLParseError, ) import sys import warnings # Starting in Python 3.2, the HTMLParser constructor takes a 'strict' # argument, which we'd like to set to False. Unfortunately, # http://bugs.python.org/issue13273 makes strict=True a better bet # before Python 3.2.3. # # At the end of this file, we monkeypatch HTMLParser so that # strict=True works well on Python 3.2.2. major, minor, release = sys.version_info[:3] CONSTRUCTOR_TAKES_STRICT = ( major > 3 or (major == 3 and minor > 2) or (major == 3 and minor == 2 and release >= 3)) from bs4.element import ( CData, Comment, Declaration, Doctype, ProcessingInstruction, ) from bs4.dammit import EntitySubstitution, UnicodeDammit from bs4.builder import ( HTML, HTMLTreeBuilder, STRICT, ) HTMLPARSER = 'html.parser' class BeautifulSoupHTMLParser(HTMLParser): def handle_starttag(self, name, attrs): # XXX namespace self.soup.handle_starttag(name, None, None, dict(attrs)) def handle_endtag(self, name): self.soup.handle_endtag(name) def handle_data(self, data): self.soup.handle_data(data) def handle_charref(self, name): # XXX workaround for a bug in HTMLParser. Remove this once # it's fixed. 
if name.startswith('x'): real_name = int(name.lstrip('x'), 16) else: real_name = int(name) try: data = unichr(real_name) except (ValueError, OverflowError), e: data = u"\N{REPLACEMENT CHARACTER}" self.handle_data(data) def handle_entityref(self, name): character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) if character is not None: data = character else: data = "&%s;" % name self.handle_data(data) def handle_comment(self, data): self.soup.endData() self.soup.handle_data(data) self.soup.endData(Comment) def handle_decl(self, data): self.soup.endData() if data.startswith("DOCTYPE "): data = data[len("DOCTYPE "):] self.soup.handle_data(data) self.soup.endData(Doctype) def unknown_decl(self, data): if data.upper().startswith('CDATA['): cls = CData data = data[len('CDATA['):] else: cls = Declaration self.soup.endData() self.soup.handle_data(data) self.soup.endData(cls) def handle_pi(self, data): self.soup.endData() if data.endswith("?") and data.lower().startswith("xml"): # "An XHTML processing instruction using the trailing '?' # will cause the '?' to be included in data." - HTMLParser # docs. # # Strip the question mark so we don't end up with two # question marks. data = data[:-1] self.soup.handle_data(data) self.soup.endData(ProcessingInstruction) class HTMLParserTreeBuilder(HTMLTreeBuilder): is_xml = False features = [HTML, STRICT, HTMLPARSER] def __init__(self, *args, **kwargs): if CONSTRUCTOR_TAKES_STRICT: kwargs['strict'] = False self.parser_args = (args, kwargs) def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None): """ :return: A 4-tuple (markup, original encoding, encoding declared within markup, whether any characters had to be replaced with REPLACEMENT CHARACTER). 
""" if isinstance(markup, unicode): return markup, None, None, False try_encodings = [user_specified_encoding, document_declared_encoding] dammit = UnicodeDammit(markup, try_encodings, is_html=True) return (dammit.markup, dammit.original_encoding, dammit.declared_html_encoding, dammit.contains_replacement_characters) def feed(self, markup): args, kwargs = self.parser_args parser = BeautifulSoupHTMLParser(*args, **kwargs) parser.soup = self.soup try: parser.feed(markup) except HTMLParseError, e: warnings.warn(RuntimeWarning( "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) raise e # Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some # 3.2.3 code. This ensures they don't treat markup like <p></p> as a # string. # # XXX This code can be removed once most Python 3 users are on 3.2.3. if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: import re attrfind_tolerant = re.compile( r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant locatestarttagend = re.compile(r""" <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name (?:\s+ # whitespace before attribute name (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name (?:\s*=\s* # value indicator (?:'[^']*' # LITA-enclosed value |\"[^\"]*\" # LIT-enclosed value |[^'\">\s]+ # bare value ) )? 
) )* \s* # trailing whitespace """, re.VERBOSE) BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend from html.parser import tagfind, attrfind def parse_starttag(self, i): self.__starttag_text = None endpos = self.check_for_whole_start_tag(i) if endpos < 0: return endpos rawdata = self.rawdata self.__starttag_text = rawdata[i:endpos] # Now parse the data between i+1 and j into a tag and attrs attrs = [] match = tagfind.match(rawdata, i+1) assert match, 'unexpected call to parse_starttag()' k = match.end() self.lasttag = tag = rawdata[i+1:k].lower() while k < endpos: if self.strict: m = attrfind.match(rawdata, k) else: m = attrfind_tolerant.match(rawdata, k) if not m: break attrname, rest, attrvalue = m.group(1, 2, 3) if not rest: attrvalue = None elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ attrvalue[:1] == '"' == attrvalue[-1:]: attrvalue = attrvalue[1:-1] if attrvalue: attrvalue = self.unescape(attrvalue) attrs.append((attrname.lower(), attrvalue)) k = m.end() end = rawdata[k:endpos].strip() if end not in (">", "/>"): lineno, offset = self.getpos() if "\n" in self.__starttag_text: lineno = lineno + self.__starttag_text.count("\n") offset = len(self.__starttag_text) \ - self.__starttag_text.rfind("\n") else: offset = offset + len(self.__starttag_text) if self.strict: self.error("junk characters in start tag: %r" % (rawdata[k:endpos][:20],)) self.handle_data(rawdata[i:endpos]) return endpos if end.endswith('/>'): # XHTML-style empty tag: <span attr="value" /> self.handle_startendtag(tag, attrs) else: self.handle_starttag(tag, attrs) if tag in self.CDATA_CONTENT_ELEMENTS: self.set_cdata_mode(tag) return endpos def set_cdata_mode(self, elem): self.cdata_elem = elem.lower() self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I) BeautifulSoupHTMLParser.parse_starttag = parse_starttag BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode CONSTRUCTOR_TAKES_STRICT = True

beautifulsoup4-4.1.0/bs4/testing.py

"""Helper classes for tests.""" import copy import functools import unittest from unittest import TestCase from bs4 import BeautifulSoup from bs4.element import ( CharsetMetaAttributeValue, Comment, ContentMetaAttributeValue, Doctype, SoupStrainer, ) from bs4.builder import HTMLParserTreeBuilder default_builder = HTMLParserTreeBuilder class SoupTest(unittest.TestCase): @property def default_builder(self): return default_builder() def soup(self, markup, **kwargs): """Build a Beautiful Soup object from markup.""" builder = kwargs.pop('builder', self.default_builder) return BeautifulSoup(markup, builder=builder, **kwargs) def document_for(self, markup): """Turn an HTML fragment into a document. The details depend on the builder. """ return self.default_builder.test_fragment_to_document(markup) def assertSoupEquals(self, to_parse, compare_parsed_to=None): builder = self.default_builder obj = BeautifulSoup(to_parse, builder=builder) if compare_parsed_to is None: compare_parsed_to = to_parse self.assertEqual(obj.decode(), self.document_for(compare_parsed_to)) class HTMLTreeBuilderSmokeTest(object): """A basic test of a treebuilder's competence. Any HTML treebuilder, present or future, should be able to pass these tests. With invalid markup, there's room for interpretation, and different parsers can handle it differently. But with the markup in these tests, there's not much room for interpretation. """ def assertDoctypeHandled(self, doctype_fragment): """Assert that a given doctype string is handled correctly.""" doctype_str, soup = self._document_with_doctype(doctype_fragment) # Make sure a Doctype object was created. doctype = soup.contents[0] self.assertEqual(doctype.__class__, Doctype) self.assertEqual(doctype, doctype_fragment) self.assertEqual(str(soup)[:len(doctype_str)], doctype_str) # Make sure that the doctype was correctly associated with the # parse tree and that the rest of the document parsed. 
self.assertEqual(soup.p.contents[0], 'foo') def _document_with_doctype(self, doctype_fragment): """Generate and parse a document with the given doctype.""" doctype = '' % doctype_fragment markup = doctype + '\n

foo

' soup = self.soup(markup) return doctype, soup def test_normal_doctypes(self): """Make sure normal, everyday HTML doctypes are handled correctly.""" self.assertDoctypeHandled("html") self.assertDoctypeHandled( 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"') def test_public_doctype_with_url(self): doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"' self.assertDoctypeHandled(doctype) def test_system_doctype(self): self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"') def test_namespaced_system_doctype(self): # We can handle a namespaced doctype with a system ID. self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"') def test_namespaced_public_doctype(self): # Test a namespaced doctype with a public id. self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"') def test_real_xhtml_document(self): """A real XHTML document should come out more or less the same as it went in.""" markup = b""" Goodbye.""" soup = self.soup(markup) self.assertEqual( soup.encode("utf-8").replace(b"\n", b""), markup.replace(b"\n", b"")) def test_deepcopy(self): """Make sure you can copy the tree builder. This is important because the builder is part of a BeautifulSoup object, and we want to be able to copy that. """ copy.deepcopy(self.default_builder) def test_p_tag_is_never_empty_element(self): """A

tag is never designated as an empty-element tag. Even if the markup shows it as an empty-element tag, it shouldn't be presented that way. """ soup = self.soup("

") self.assertFalse(soup.p.is_empty_element) self.assertEqual(str(soup.p), "

") def test_unclosed_tags_get_closed(self): """A tag that's not closed by the end of the document should be closed. This applies to all tags except empty-element tags. """ self.assertSoupEquals("

", "

") self.assertSoupEquals("", "") self.assertSoupEquals(" ", " ") def test_br_is_always_empty_element_tag(self): """A tag is designated as an empty-element tag. Some parsers treat as one tag, some parsers as two tags, but it should always be an empty-element tag. """ soup = self.soup(" ") self.assertTrue(soup.br.is_empty_element) self.assertEqual(str(soup.br), " ") def test_nested_formatting_elements(self): self.assertSoupEquals("") def test_comment(self): # Comments are represented as Comment objects. markup = "

foobaz

" self.assertSoupEquals(markup) soup = self.soup(markup) comment = soup.find(text="foobar") self.assertEqual(comment.__class__, Comment) def test_preserved_whitespace_in_pre_and_textarea(self): """Whitespace must be preserved in
 and  tags."""
        self.assertSoupEquals("
   
") self.assertSoupEquals(" woo ") def test_nested_inline_elements(self): """Inline elements can be nested indefinitely.""" b_tag = "Inside a B tag" self.assertSoupEquals(b_tag) nested_b_tag = "

A nested tag

" self.assertSoupEquals(nested_b_tag) double_nested_b_tag = "

A doubly nested tag

" self.assertSoupEquals(nested_b_tag) def test_nested_block_level_elements(self): """Block elements can be nested.""" soup = self.soup('

Foo

') blockquote = soup.blockquote self.assertEqual(blockquote.p.b.string, 'Foo') self.assertEqual(blockquote.b.string, 'Foo') def test_correctly_nested_tables(self): """One table can go inside another one.""" markup = (' ' '
' "
Here's another table:" ' ' '
foo
' '') self.assertSoupEquals( markup, '
Here\'s another table:' '
foo
' '
') self.assertSoupEquals( "
Foo
" "
Bar
" "
Baz
") def test_angle_brackets_in_attribute_values_are_escaped(self): self.assertSoupEquals('', '') def test_entities_in_attributes_converted_to_unicode(self): expect = u'

' self.assertSoupEquals('

', expect) self.assertSoupEquals('

', expect) self.assertSoupEquals('

', expect) def test_entities_in_text_converted_to_unicode(self): expect = u'

pi\N{LATIN SMALL LETTER N WITH TILDE}ata

' self.assertSoupEquals("

piñata

", expect) self.assertSoupEquals("

piñata

", expect) self.assertSoupEquals("

piñata

", expect) def test_quot_entity_converted_to_quotation_mark(self): self.assertSoupEquals("

I said "good day!"

", '

I said "good day!"

') def test_out_of_range_entity(self): expect = u"\N{REPLACEMENT CHARACTER}" self.assertSoupEquals("&#10000000000000;", expect) self.assertSoupEquals("&#x10000000000000;", expect) self.assertSoupEquals("빲�", expect) def test_basic_namespaces(self): """Parsers don't need to *understand* namespaces, but at the very least they should not choke on namespaces or lose data.""" markup = b'4' soup = self.soup(markup) self.assertEqual(markup, soup.encode()) html = soup.html self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns']) self.assertEqual( 'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml']) self.assertEqual( 'http://www.w3.org/2000/svg', soup.html['xmlns:svg']) def test_multivalued_attribute_value_becomes_list(self): markup = b'
' soup = self.soup(markup) self.assertEqual(['foo', 'bar'], soup.a['class']) # # Generally speaking, tests below this point are more tests of # Beautiful Soup than tests of the tree builders. But parsers are # weird, so we run these tests separately for every tree builder # to detect any differences between them. # def test_soupstrainer(self): """Parsers should be able to work with SoupStrainers.""" strainer = SoupStrainer("b") soup = self.soup("A bold statement", parse_only=strainer) self.assertEqual(soup.decode(), "bold") def test_single_quote_attribute_values_become_double_quotes(self): self.assertSoupEquals("", '') def test_attribute_values_with_nested_quotes_are_left_alone(self): text = """a""" self.assertSoupEquals(text) def test_attribute_values_with_double_nested_quotes_get_quoted(self): text = """a""" soup = self.soup(text) soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"' self.assertSoupEquals( soup.foo.decode(), """a""") def test_ampersand_in_attribute_value_gets_escaped(self): self.assertSoupEquals('', '') self.assertSoupEquals( 'foo', 'foo') def test_escaped_ampersand_in_attribute_value_is_left_alone(self): self.assertSoupEquals('') def test_entities_in_strings_converted_during_parsing(self): # Both XML and HTML entities are converted to Unicode characters # during parsing. text = "

<<sacré bleu!>>

" expected = u"

<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>

" self.assertSoupEquals(text, expected) def test_smart_quotes_converted_on_the_way_in(self): # Microsoft smart quotes are converted to Unicode characters during # parsing. quote = b"

\x91Foo\x92

" soup = self.soup(quote) self.assertEqual( soup.p.string, u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") def test_non_breaking_spaces_converted_on_the_way_in(self): soup = self.soup("
  ") self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2) def test_entities_converted_on_the_way_out(self): text = "

<<sacré bleu!>>

" expected = u"

<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>

".encode("utf-8") soup = self.soup(text) self.assertEqual(soup.p.encode("utf-8"), expected) def test_real_iso_latin_document(self): # Smoke test of interrelated functionality, using an # easy-to-understand document. # Here it is in Unicode. Note that it claims to be in ISO-Latin-1. unicode_html = u'

Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!

' # That's because we're going to encode it into ISO-Latin-1, and use # that to test. iso_latin_html = unicode_html.encode("iso-8859-1") # Parse the ISO-Latin-1 HTML. soup = self.soup(iso_latin_html) # Encode it to UTF-8. result = soup.encode("utf-8") # What do we expect the result to look like? Well, it would # look like unicode_html, except that the META tag would say # UTF-8 instead of ISO-Latin-1. expected = unicode_html.replace("ISO-Latin-1", "utf-8") # And, of course, it would be in UTF-8, not Unicode. expected = expected.encode("utf-8") # Ta-da! self.assertEqual(result, expected) def test_real_shift_jis_document(self): # Smoke test to make sure the parser can handle a document in # Shift-JIS encoding, without choking. shift_jis_html = ( b'
'
            b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
            b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
            b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
            b'
') unicode_html = shift_jis_html.decode("shift-jis") soup = self.soup(unicode_html) # Make sure the parse tree is correctly encoded to various # encodings. self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8")) self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp")) def test_real_hebrew_document(self): # A real-world test to make sure we can convert ISO-8859-9 (a # Hebrew encoding) to UTF-8. hebrew_document = b'

Hebrew (ISO 8859-8) in Visual Directionality

\xed\xe5\xec\xf9' soup = self.soup( hebrew_document, from_encoding="iso8859-8") self.assertEqual(soup.original_encoding, 'iso8859-8') self.assertEqual( soup.encode('utf-8'), hebrew_document.decode("iso8859-8").encode("utf-8")) def test_meta_tag_reflects_current_encoding(self): # Here's the tag saying that a document is # encoded in Shift-JIS. meta_tag = ('') # Here's a document incorporating that meta tag. shift_jis_html = ( '\n%s\n' '' 'Shift-JIS markup goes here.') % meta_tag soup = self.soup(shift_jis_html) # Parse the document, and the charset is seemingly unaffected. parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'}) content = parsed_meta['content'] self.assertEqual('text/html; charset=x-sjis', content) # But that value is actually a ContentMetaAttributeValue object. self.assertTrue(isinstance(content, ContentMetaAttributeValue)) # And it will take on a value that reflects its current # encoding. self.assertEqual('text/html; charset=utf8', content.encode("utf8")) # For the rest of the story, see TestSubstitutions in # test_tree.py. def test_html5_style_meta_tag_reflects_current_encoding(self): # Here's the tag saying that a document is # encoded in Shift-JIS. meta_tag = ('') # Here's a document incorporating that meta tag. shift_jis_html = ( '\n%s\n' '' 'Shift-JIS markup goes here.') % meta_tag soup = self.soup(shift_jis_html) # Parse the document, and the charset is seemingly unaffected. parsed_meta = soup.find('meta', id="encoding") charset = parsed_meta['charset'] self.assertEqual('x-sjis', charset) # But that value is actually a CharsetMetaAttributeValue object. self.assertTrue(isinstance(charset, CharsetMetaAttributeValue)) # And it will take on a value that reflects its current # encoding. 
self.assertEqual('utf8', charset.encode("utf8")) def test_tag_with_no_attributes_can_have_attributes_added(self): data = self.soup("text") data.a['foo'] = 'bar' self.assertEqual('text', data.a.decode()) class XMLTreeBuilderSmokeTest(object): def test_docstring_generated(self): soup = self.soup("") self.assertEqual( soup.encode(), b'\n') def test_real_xhtml_document(self): """A real XHTML document should come out *exactly* the same as it went in.""" markup = b""" Goodbye.""" soup = self.soup(markup) self.assertEqual( soup.encode("utf-8"), markup) def test_docstring_includes_correct_encoding(self): soup = self.soup("") self.assertEqual( soup.encode("latin1"), b'\n') def test_large_xml_document(self): """A large XML document should come out the same as it went in.""" markup = (b'\n' + b'0' * (2**12) + b'') soup = self.soup(markup) self.assertEqual(soup.encode("utf-8"), markup) def test_tags_are_empty_element_if_and_only_if_they_are_empty(self): self.assertSoupEquals("

", "

") self.assertSoupEquals("

foo

") def test_namespaces_are_preserved(self): markup = 'This tag is in the a namespaceThis tag is in the b namespace' soup = self.soup(markup) root = soup.root self.assertEqual("http://example.com/", root['xmlns:a']) self.assertEqual("http://example.net/", root['xmlns:b']) class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): """Smoke test for a tree builder that supports HTML5.""" def test_real_xhtml_document(self): # Since XHTML is not HTML5, HTML5 parsers are not tested to handle # XHTML documents in any particular way. pass def test_html_tags_have_namespace(self): markup = "" soup = self.soup(markup) self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace) def test_svg_tags_have_namespace(self): markup = '' soup = self.soup(markup) namespace = "http://www.w3.org/2000/svg" self.assertEqual(namespace, soup.svg.namespace) self.assertEqual(namespace, soup.circle.namespace) def test_mathml_tags_have_namespace(self): markup = '5' soup = self.soup(markup) namespace = 'http://www.w3.org/1998/Math/MathML' self.assertEqual(namespace, soup.math.namespace) self.assertEqual(namespace, soup.msqrt.namespace) def skipIf(condition, reason): def nothing(test, *args, **kwargs): return None def decorator(test_item): if condition: return nothing else: return test_item return decorator

beautifulsoup4-4.1.0/scripts/demonstrate_parser_differences.py

"""Demonstrate how different parsers parse the same markup. Beautiful Soup can use any of a number of different parsers. Every parser should behave more or less the same on valid markup, and Beautiful Soup's unit tests make sure this is the case. But every parser handles invalid markup differently. Even different versions of the same parser handle invalid markup differently. So instead of unit tests I've created this educational demonstration script. The file demonstration_markup.txt contains many lines of HTML. This script tests each line of markup against every parser you have installed, and prints out how each parser sees that markup. This may help you choose a parser, or understand why Beautiful Soup presents your document the way it does. """ import os import sys from bs4 import BeautifulSoup parsers = ['html.parser'] try: from bs4.builder import _lxml parsers.append('lxml') except ImportError, e: pass try: from bs4.builder import _html5lib parsers.append('html5lib') except ImportError, e: pass class Demonstration(object): def __init__(self, markup): self.results = {} self.markup = markup def run_against(self, *parser_names): uniform_results = True previous_output = None for parser in parser_names: try: soup = BeautifulSoup(self.markup, parser) if markup.startswith("<div>"): # Extract the interesting part output = soup.div else: output = soup except Exception, e: output = "[EXCEPTION] %s" % str(e) self.results[parser] = output if previous_output is None: previous_output = output elif previous_output != output: uniform_results = False return uniform_results def dump(self): print "%s: %s" % ("Markup".rjust(13), self.markup.encode("utf8")) for parser, output in self.results.items(): print "%s: %s" % (parser.rjust(13), output.encode("utf8")) different_results = [] uniform_results = [] print "= Testing the following parsers: %s =" % ", ".join(parsers) print input_file = sys.stdin if sys.stdin.isatty(): for filename in [ "demonstration_markup.txt", 
os.path.join("scripts", "demonstration_markup.txt")]: if os.path.exists(filename): input_file = open(filename) for markup in input_file: demo = Demonstration(markup.decode("utf8").strip().replace("\\n", "\n")) is_uniform = demo.run_against(*parsers) if is_uniform: uniform_results.append(demo) else: different_results.append(demo) print "== Markup that's handled the same in every parser ==" print for demo in uniform_results: demo.dump() print print "== Markup that's not handled the same in every parser ==" print for demo in different_results: demo.dump() print

beautifulsoup4-4.1.0/scripts/demonstration_markup.txt

A bare string <!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd"> <!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd"> <div><![CDATA[A CDATA section where it doesn't belong]]></div> <div><svg><![CDATA[HTML5 does allow CDATA sections in SVG]]></svg></div> <div>A <meta> tag</div> <div>A <br> tag that supposedly has contents.</br></div> <div>AT&T</div> <div><textarea>Within a textarea, markup like <b> tags and <&<&amp; should be treated as literal</textarea></div> <div><script>if (i < 2) { alert("<b>Markup within script tags should be treated as literal.</b>"); }</script></div> <div>This numeric entity is missing the final semicolon: <x t="pi&#241ata"></div> <div><a href="http://example.com/</a> that attribute value never got closed</div> <div><a href="foo</a>, </a><a href="bar">that attribute value was closed by the subsequent tag</a></div> <! This document starts with a bogus declaration ><div>a</div> <div>This document contains <!an incomplete declaration <div>(do you see it?)</div> <div>This document ends with <!an incomplete declaration <div><a style={height:21px;}>That attribute value was bogus</a></div> <! 
DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">The doctype is invalid because it contains extra whitespace <div><table><td nowrap>That boolean attribute had no value</td></table></div> <div>Here's a nonexistent entity: &#foo; (do you see it?)</div> <div>This document ends before the entity finishes: &gt <div><p>Paragraphs shouldn't contain block display elements, but this one does: <dl><dt>you see?</dt></p> <b b="20" a="1" b="10" a="2" a="3" a="4">Multiple values for the same attribute.</b> <div><table><tr><td>Here's a table</td></tr></table></div> <div><table id="1"><tr><td>Here's a nested table:<table id="2"><tr><td>foo</td></tr></table></td></div> <div>This tag contains nothing but whitespace: <b> </b></div> <div><blockquote><p><b>This p tag is cut off by</blockquote></p>the end of the blockquote tag</div> <div><table><div>This table contains bare markup</div></table></div> <div><div id="1">\n <a href="link1">This link is never closed.\n</div>\n<div id="2">\n <div id="3">\n <a href="link2">This link is closed.</a>\n </div>\n</div></div> <div>This document contains a <!DOCTYPE surprise>surprise doctype</div> <div><a><B><Cd><EFG>Mixed case tags are folded to lowercase</efg></CD></b></A></div> <div><our☃>Tag name contains Unicode characters</our☃></div> <div><a ☃="snowman">Attribute name contains Unicode characters</a></div> <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

beautifulsoup4-4.1.0/doc/Makefile

# Makefile for Sphinx documentation
#
# Each builder target below runs $(SPHINXBUILD) with the matching -b
# builder name and writes its output under $(BUILDDIR)/<builder>.

# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
PAPER =
BUILDDIR = build

# Internal variables.
PAPEROPT_a4 = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
# Options shared by every build: the doctree directory, the paper-size
# option selected by PAPER (empty when PAPER is unset), any user
# supplied SPHINXOPTS, and the source directory.
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source

.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest

# First target, so a bare `make` prints this list.
help:
	@echo "Please use \`make <target>' where <target> is one of"
	@echo " html to make standalone HTML files"
	@echo " dirhtml to make HTML files named index.html in directories"
	@echo " singlehtml to make a single large HTML file"
	@echo " pickle to make pickle files"
	@echo " json to make JSON files"
	@echo " htmlhelp to make HTML files and a HTML help project"
	@echo " qthelp to make HTML files and a qthelp project"
	@echo " devhelp to make HTML files and a Devhelp project"
	@echo " epub to make an epub"
	@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
	@echo " latexpdf to make LaTeX files and run them through pdflatex"
	@echo " text to make text files"
	@echo " man to make manual pages"
	@echo " changes to make an overview of all changed/added/deprecated items"
	@echo " linkcheck to check all external links for integrity"
	@echo " doctest to run all doctests embedded in the documentation (if enabled)"

# The leading '-' lets make continue even if rm reports an error.
clean:
	-rm -rf $(BUILDDIR)/*

html:
	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."

dirhtml:
	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."

singlehtml:
	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
	@echo
	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."

pickle:
	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
	@echo
	@echo "Build finished; now you can process the pickle files."

json:
	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
	@echo
	@echo "Build finished; now you can process the JSON files."

htmlhelp:
	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
	@echo
	@echo "Build finished; now you can run HTML Help Workshop with the" \
	      ".hhp project file in $(BUILDDIR)/htmlhelp."

qthelp:
	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
	@echo
	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/BeautifulSoup.qhcp"
	@echo "To view the help file:"
	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/BeautifulSoup.qhc"

# $$HOME is escaped so that the shell, not make, sees $HOME.
devhelp:
	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
	@echo
	@echo "Build finished."
	@echo "To view the help file:"
	@echo "# mkdir -p $$HOME/.local/share/devhelp/BeautifulSoup"
	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/BeautifulSoup"
	@echo "# devhelp"

epub:
	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
	@echo
	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."

latex:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo
	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
	@echo "Run \`make' in that directory to run these through (pdf)latex" \
	      "(use \`make latexpdf' here to do that automatically)."

# Same as `latex`, then runs the all-pdf target of the generated
# Makefile in $(BUILDDIR)/latex.
latexpdf:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through pdflatex..."
	make -C $(BUILDDIR)/latex all-pdf
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."

text:
	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
	@echo
	@echo "Build finished. The text files are in $(BUILDDIR)/text."

man:
	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
	@echo
	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."

changes:
	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
	@echo
	@echo "The overview file is in $(BUILDDIR)/changes."

linkcheck:
	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
	@echo
	@echo "Link check complete; look for any errors in the above output " \
	      "or in $(BUILDDIR)/linkcheck/output.txt."

doctest:
	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
	@echo "Testing of doctests in the sources finished, look at the " \
	      "results in $(BUILDDIR)/doctest/output.txt."

beautifulsoup4-4.1.0/doc/source/6.1.jpg

beautifulsoup4-4.1.0/doc/source/index.rst

Beautiful Soup Documentation ============================ .. image:: 6.1.jpg :align: right :alt: "The Fish-Footman began by producing from under his arm a great letter, nearly as large as himself." `Beautiful Soup `_ is a Python library for pulling data out of HTML and XML files. It works with your favorite parser to provide idiomatic ways of navigating, searching, and modifying the parse tree. It commonly saves programmers hours or days of work. These instructions illustrate all major features of Beautiful Soup 4, with examples. I show you what the library is good for, how it works, how to use it, how to make it do what you want, and what to do when it violates your expectations. The examples in this documentation should work the same way in Python 2.7 and Python 3.2. You might be looking for the documentation for `Beautiful Soup 3 `_. If you want to learn about the differences between Beautiful Soup 3 and Beautiful Soup 4, see `Porting code to BS4`_. Getting help ------------ If you have questions about Beautiful Soup, or run into problems, `send mail to the discussion group `_. Quick Start =========== Here's an HTML document I'll be using as an example throughout this document. It's part of a story from `Alice in Wonderland`:: html_doc = """

The Dormouse's story

Once upon a time there were three little sisters; and their names were Elsie, Lacie and Tillie; and they lived at the bottom of a well.

...

""" Running the "three sisters" document through Beautiful Soup gives us a ``BeautifulSoup`` object, which represents the document as a nested data structure:: from bs4 import BeautifulSoup soup = BeautifulSoup(html_doc) print(soup.prettify()) # # # # # #

# # The Dormouse's story # #

#

# Once upon a time there were three little sisters; and their names were # # Elsie # # , # # Lacie # # and # # Tillie # # ; and they lived at the bottom of a well. #

#

# ... #

# # Here are some simple ways to navigate that data structure:: soup.title # soup.title.name # u'title' soup.title.string # u'The Dormouse's story' soup.title.parent.name # u'head' soup.p #

The Dormouse's story

soup.p['class'] # u'title' soup.a # Elsie soup.find_all('a') # [Elsie, # Lacie, # Tillie] soup.find(id="link3") # Tillie One common task is extracting all the URLs found within a page's tags:: for link in soup.find_all('a'): print(link.get('href')) # http://example.com/elsie # http://example.com/lacie # http://example.com/tillie Another common task is extracting all the text from a page:: print(soup.get_text()) # The Dormouse's story # # The Dormouse's story # # Once upon a time there were three little sisters; and their names were # Elsie, # Lacie and # Tillie; # and they lived at the bottom of a well. # # ... Does this look like what you need? If so, read on. Installing Beautiful Soup ========================= If you're using a recent version of Debian or Ubuntu Linux, you can install Beautiful Soup with the system package manager: :kbd:`$ apt-get install python-beautifulsoup4` Beautiful Soup 4 is published through PyPi, so if you can't install it with the system packager, you can install it with ``easy_install`` or ``pip``. The package name is ``beautifulsoup4``, and the same package works on Python 2 and Python 3. :kbd:`$ easy_install beautifulsoup4` :kbd:`$ pip install beautifulsoup4` (The ``BeautifulSoup`` package is probably `not` what you want. That's the previous major release, `Beautiful Soup 3`_. Lots of software uses BS3, so it's still available, but if you're writing new code you should install ``beautifulsoup4``.) If you don't have ``easy_install`` or ``pip`` installed, you can `download the Beautiful Soup 4 source tarball `_ and install it with ``setup.py``. :kbd:`$ python setup.py install` If all else fails, the license for Beautiful Soup allows you to package the entire library with your application. You can download the tarball, copy its ``bs4`` directory into your application's codebase, and use Beautiful Soup without installing it at all. I use Python 2.7 and Python 3.2 to develop Beautiful Soup, but it should work with other recent versions. 
Problems after installation --------------------------- Beautiful Soup is packaged as Python 2 code. When you install it for use with Python 3, it's automatically converted to Python 3 code. If you don't install the package, the code won't be converted. There have also been reports on Windows machines of the wrong version being installed. If you get the ``ImportError`` "No module named HTMLParser", your problem is that you're running the Python 2 version of the code under Python 3. If you get the ``ImportError`` "No module named html.parser", your problem is that you're running the Python 3 version of the code under Python 2. In both cases, your best bet is to completely remove the Beautiful Soup installation from your system (including any directory created when you unzipped the tarball) and try the installation again. If you get the ``SyntaxError`` "Invalid syntax" on the line ``ROOT_TAG_NAME = u'[document]'``, you need to convert the Python 2 code to Python 3. You can do this either by installing the package: :kbd:`$ python3 setup.py install` or by manually running Python's ``2to3`` conversion script on the ``bs4`` directory: :kbd:`$ 2to3-3.2 -w bs4` .. _parser-installation: Installing a parser ------------------- Beautiful Soup supports the HTML parser included in Python's standard library, but it also supports a number of third-party Python parsers. One is the `lxml parser `_. Depending on your setup, you might install lxml with one of these commands: :kbd:`$ apt-get install python-lxml` :kbd:`$ easy_install lxml` :kbd:`$ pip install lxml` If you're using Python 2, another alternative is the pure-Python `html5lib parser `_, which parses HTML the way a web browser does. 
Depending on your setup, you might install html5lib with one of these commands: :kbd:`$ apt-get install python-html5lib` :kbd:`$ easy_install html5lib` :kbd:`$ pip install html5lib` This table summarizes the advantages and disadvantages of each parser library: +----------------------+--------------------------------------------+--------------------------------+--------------------------+ | Parser | Typical usage | Advantages | Disadvantages | +----------------------+--------------------------------------------+--------------------------------+--------------------------+ | Python's html.parser | ``BeautifulSoup(markup, "html.parser")`` | * Batteries included | * Not very lenient | | | | * Decent speed | (before Python 2.7.3 | | | | * Lenient (as of Python 2.7.3 | or 3.2.2) | | | | and 3.2.) | | +----------------------+--------------------------------------------+--------------------------------+--------------------------+ | lxml's HTML parser | ``BeautifulSoup(markup, "lxml")`` | * Very fast | * External C dependency | | | | * Lenient | | +----------------------+--------------------------------------------+--------------------------------+--------------------------+ | lxml's XML parser | ``BeautifulSoup(markup, ["lxml", "xml"])`` | * Very fast | * External C dependency | | | ``BeautifulSoup(markup, "xml")`` | * The only currently supported | | | | | XML parser | | +----------------------+--------------------------------------------+--------------------------------+--------------------------+ | html5lib | ``BeautifulSoup(markup, html5lib)`` | * Extremely lenient | * Very slow | | | | * Parses pages the same way a | * External Python | | | | web browser does | dependency | | | | * Creates valid HTML5 | * Python 2 only | +----------------------+--------------------------------------------+--------------------------------+--------------------------+ If you can, I recommend you install and use lxml for speed. 
If you're using a version of Python 2 earlier than 2.7.3, or a version of Python 3 earlier than 3.2.2, it's `essential` that you install lxml or html5lib--Python's built-in HTML parser is just not very good in older versions. Note that if a document is invalid, different parsers will generate different Beautiful Soup trees for it. See `Differences between parsers`_ for details. Making the soup =============== To parse a document, pass it into the ``BeautifulSoup`` constructor. You can pass in a string or an open filehandle:: from bs4 import BeautifulSoup soup = BeautifulSoup(open("index.html")) soup = BeautifulSoup("data") First, the document is converted to Unicode, and HTML entities are converted to Unicode characters:: BeautifulSoup("Sacré bleu!") Sacré bleu! Beautiful Soup then parses the document using the best available parser. It will use an HTML parser unless you specifically tell it to use an XML parser. (See `Parsing XML`_.) Kinds of objects ================ Beautiful Soup transforms a complex HTML document into a complex tree of Python objects. But you'll only ever have to deal with about four `kinds` of objects. .. _Tag: ``Tag`` ------- A ``Tag`` object corresponds to an XML or HTML tag in the original document:: soup = BeautifulSoup('Extremely bold') tag = soup.b type(tag) # Tags have a lot of attributes and methods, and I'll cover most of them in `Navigating the tree`_ and `Searching the tree`_. For now, the most important features of a tag are its name and attributes. Name ^^^^ Every tag has a name, accessible as ``.name``:: tag.name # u'b' If you change a tag's name, the change will be reflected in any HTML markup generated by Beautiful Soup:: tag.name = "blockquote" tag #
Extremely bold
Attributes ^^^^^^^^^^ A tag may have any number of attributes. The tag ```` has an attribute "class" whose value is "boldest". You can access a tag's attributes by treating the tag like a dictionary:: tag['class'] # u'boldest' You can access that dictionary directly as ``.attrs``:: tag.attrs # {u'class': u'boldest'} You can add, remove, and modify a tag's attributes. Again, this is done by treating the tag as a dictionary:: tag['class'] = 'verybold' tag['id'] = 1 tag #
Extremely bold
del tag['class'] del tag['id'] tag #
Extremely bold
tag['class'] # KeyError: 'class' print(tag.get('class')) # None .. _multivalue: Multi-valued attributes &&&&&&&&&&&&&&&&&&&&&&& HTML 4 defines a few attributes that can have multiple values. HTML 5 removes a couple of them, but defines a few more. The most common multi-valued attribute is ``class`` (that is, a tag can have more than one CSS class). Others include ``rel``, ``rev``, ``accept-charset``, ``headers``, and ``accesskey``. Beautiful Soup presents the value(s) of a multi-valued attribute as a list:: css_soup = BeautifulSoup('

') css_soup.p['class'] # ["body", "strikeout"] css_soup = BeautifulSoup('

') css_soup.p['class'] # ["body"] If an attribute `looks` like it has more than one value, but it's not a multi-valued attribute as defined by any version of the HTML standard, Beautiful Soup will leave the attribute alone:: id_soup = BeautifulSoup('

') id_soup.p['id'] # 'my id' When you turn a tag back into a string, multiple attribute values are consolidated:: rel_soup = BeautifulSoup('

Back to the homepage

') rel_soup.a['rel'] # ['index'] rel_soup.a['rel'] = ['index', 'contents'] print(rel_soup.p) #

Back to the homepage

If you parse a document as XML, there are no multi-valued attributes:: xml_soup = BeautifulSoup('

', 'xml') xml_soup.p['class'] # u'body strikeout' ``NavigableString`` ------------------- A string corresponds to a bit of text within a tag. Beautiful Soup uses the ``NavigableString`` class to contain these bits of text:: tag.string # u'Extremely bold' type(tag.string) # A ``NavigableString`` is just like a Python Unicode string, except that it also supports some of the features described in `Navigating the tree`_ and `Searching the tree`_. You can convert a ``NavigableString`` to a Unicode string with ``unicode()``:: unicode_string = unicode(tag.string) unicode_string # u'Extremely bold' type(unicode_string) # You can't edit a string in place, but you can replace one string with another, using :ref:`replace_with`:: tag.string.replace_with("No longer bold") tag #

No longer bold
``NavigableString`` supports most of the features described in `Navigating the tree`_ and `Searching the tree`_, but not all of them. In particular, since a string can't contain anything (the way a tag may contain a string or another tag), strings don't support the ``.contents`` or ``.string`` attributes, or the ``find()`` method. ``BeautifulSoup`` ----------------- The ``BeautifulSoup`` object itself represents the document as a whole. For most purposes, you can treat it as a :ref:`Tag` object. This means it supports most of the methods described in `Navigating the tree`_ and `Searching the tree`_. Since the ``BeautifulSoup`` object doesn't correspond to an actual HTML or XML tag, it has no name and no attributes. But sometimes it's useful to look at its ``.name``, so it's been given the special ``.name`` "[document]":: soup.name # u'[document]' Comments and other special strings ---------------------------------- ``Tag``, ``NavigableString``, and ``BeautifulSoup`` cover almost everything you'll see in an HTML or XML file, but there are a few leftover bits. The only one you'll probably ever need to worry about is the comment:: markup = "" soup = BeautifulSoup(markup) comment = soup.b.string type(comment) # The ``Comment`` object is just a special type of ``NavigableString``:: comment # u'Hey, buddy. Want to buy a used parser' But when it appears as part of an HTML document, a ``Comment`` is displayed with special formatting:: print(soup.b.prettify()) # # # Beautiful Soup defines classes for anything else that might show up in an XML document: ``CData``, ``ProcessingInstruction``, ``Declaration``, and ``Doctype``. Just like ``Comment``, these classes are subclasses of ``NavigableString`` that add something extra to the string. 
Here's an example that replaces the comment with a CDATA block:: from bs4 import CData cdata = CData("A CDATA block") comment.replace_with(cdata) print(soup.b.prettify()) # # A CDATA block # Navigating the tree =================== Here's the "Three sisters" HTML document again:: html_doc = """

The Dormouse's story

Once upon a time there were three little sisters; and their names were Elsie, Lacie and Tillie; and they lived at the bottom of a well.

...

""" from bs4 import BeautifulSoup soup = BeautifulSoup(html_doc) I'll use this as an example to show you how to move from one part of a document to another. Going down ---------- Tags may contain strings and other tags. These elements are the tag's `children`. Beautiful Soup provides a lot of different attributes for navigating and iterating over a tag's children. Note that Beautiful Soup strings don't support any of these attributes, because a string can't have children. Navigating using tag names ^^^^^^^^^^^^^^^^^^^^^^^^^^ The simplest way to navigate the parse tree is to say the name of the tag you want. If you want the <head> tag, just say ``soup.head``:: soup.head # soup.title # You can use this trick again and again to zoom in on a certain part of the parse tree. This code gets the first <b> tag beneath the <body> tag:: soup.body.b # The Dormouse's story Using a tag name as an attribute will give you only the `first` tag by that name:: soup.a # Elsie If you need to get `all` the <a> tags, or anything more complicated than the first tag with a certain name, you'll need to use one of the methods described in `Searching the tree`_, such as ``find_all()``:: soup.find_all('a') # [ Elsie, # Lacie, # Tillie] ``.contents`` and ``.children`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ A tag's children are available in a list called ``.contents``:: head_tag = soup.head head_tag # head_tag.contents [] title_tag = head_tag.contents[0] title_tag # title_tag.contents # [u'The Dormouse's story'] The ``BeautifulSoup`` object itself has children. 
In this case, the tag is the child of the ``BeautifulSoup`` object.:: len(soup.contents) # 1 soup.contents[0].name # u'html' A string does not have ``.contents``, because it can't contain anything:: text = title_tag.contents[0] text.contents # AttributeError: 'NavigableString' object has no attribute 'contents' Instead of getting them as a list, you can iterate over a tag's children using the ``.children`` generator:: for child in title_tag.children: print(child) # The Dormouse's story ``.descendants`` ^^^^^^^^^^^^^^^^ The ``.contents`` and ``.children`` attributes only consider a tag's `direct` children. For instance, the tag has a single direct child--the ] But the tag. The ``.descendants`` attribute lets you iterate over `all` of a tag's children, recursively: its direct children, the children of its direct children, and so on:: for child in head_tag.descendants: print(child) # # The Dormouse's story The tag has only one child, but it has two descendants: the tag), but it has a whole lot of descendants:: len(list(soup.children)) # 1 len(list(soup.descendants)) # 25 .. _.string: ``.string`` ^^^^^^^^^^^ If a tag has only one child, and that child is a ``NavigableString``, the child is made available as ``.string``:: title_tag.string # u'The Dormouse's story' If a tag's only child is another tag, and `that` tag has a ``.string``, then the parent tag is considered to have the same ``.string`` as its child:: head_tag.contents # [] head_tag.string # u'The Dormouse's story' If a tag contains more than one thing, then it's not clear what ``.string`` should refer to, so ``.string`` is defined to be ``None``:: print(soup.html.string) # None .. _string-generators: ``.strings`` and ``stripped_strings`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ If there's more than one thing inside a tag, you can still look at just the strings. 
Use the ``.strings`` generator:: for string in soup.strings: print(repr(string)) # u"The Dormouse's story" # u'\n\n' # u"The Dormouse's story" # u'\n\n' # u'Once upon a time there were three little sisters; and their names were\n' # u'Elsie' # u',\n' # u'Lacie' # u' and\n' # u'Tillie' # u';\nand they lived at the bottom of a well.' # u'\n\n' # u'...' # u'\n' These strings tend to have a lot of extra whitespace, which you can remove by using the ``.stripped_strings`` generator instead:: for string in soup.stripped_strings: print(repr(string)) # u"The Dormouse's story" # u"The Dormouse's story" # u'Once upon a time there were three little sisters; and their names were' # u'Elsie' # u',' # u'Lacie' # u'and' # u'Tillie' # u';\nand they lived at the bottom of a well.' # u'...' Here, strings consisting entirely of whitespace are ignored, and whitespace at the beginning and end of strings is removed. Going up -------- Continuing the "family tree" analogy, every tag and every string has a `parent`: the tag that contains it. .. _.parent: ``.parent`` ^^^^^^^^^^^ You can access an element's parent with the ``.parent`` attribute. In the example "three sisters" document, the tag is the parent of the title_tag.parent # The title string itself has a parent: the The parent of a top-level tag like is the ``BeautifulSoup`` object itself:: html_tag = soup.html type(html_tag.parent) # And the ``.parent`` of a ``BeautifulSoup`` object is defined as None:: print(soup.parent) # None .. _.parents: ``.parents`` ^^^^^^^^^^^^ You can iterate over all of an element's parents with ``.parents``. 
This example uses ``.parents`` to travel from an tag buried deep within the document, to the very top of the document:: link = soup.a link # Elsie for parent in link.parents: if parent is None: print(parent) else: print(parent.name) # p # body # html # [document] # None Going sideways -------------- Consider a simple document like this:: sibling_soup = BeautifulSoup("text1text2") print(sibling_soup.prettify()) # # # # # text1 # # # text2 # # # # The tag and the tag are at the same level: they're both direct children of the same tag. We call them `siblings`. When a document is pretty-printed, siblings show up at the same indentation level. You can also use this relationship in the code you write. ``.next_sibling`` and ``.previous_sibling`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ You can use ``.next_sibling`` and ``.previous_sibling`` to navigate between page elements that are on the same level of the parse tree:: sibling_soup.b.next_sibling # text2 sibling_soup.c.previous_sibling # text1 The tag has a ``.next_sibling``, but no ``.previous_sibling``, because there's nothing before the tag `on the same level of the tree`. For the same reason, the tag has a ``.previous_sibling`` but no ``.next_sibling``:: print(sibling_soup.b.previous_sibling) # None print(sibling_soup.c.next_sibling) # None The strings "text1" and "text2" are `not` siblings, because they don't have the same parent:: sibling_soup.b.string # u'text1' print(sibling_soup.b.string.next_sibling) # None In real documents, the ``.next_sibling`` or ``.previous_sibling`` of a tag will usually be a string containing whitespace. Going back to the "three sisters" document:: Elsie Lacie Tillie You might think that the ``.next_sibling`` of the first tag would be the second tag. 
But actually, it's a string: the comma and newline that separate the first tag from the second:: link = soup.a link # Elsie link.next_sibling # u',\n' The second tag is actually the ``.next_sibling`` of the comma:: link.next_sibling.next_sibling # Lacie .. _sibling-generators: ``.next_siblings`` and ``.previous_siblings`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ You can iterate over a tag's siblings with ``.next_siblings`` or ``.previous_siblings``:: for sibling in soup.a.next_siblings: print(repr(sibling)) # u',\n' # Lacie # u' and\n' # Tillie # u'; and they lived at the bottom of a well.' # None for sibling in soup.find(id="link3").previous_siblings: print(repr(sibling)) # ' and\n' # Lacie # u',\n' # Elsie # u'Once upon a time there were three little sisters; and their names were\n' # None Going back and forth -------------------- Take a look at the beginning of the "three sisters" document::

The Dormouse's story

An HTML parser takes this string of characters and turns it into a series of events: "open an tag", "open a tag", "open a

tag", and so on. Beautiful Soup offers tools for reconstructing the initial parse of the document. .. _element-generators: ``.next_element`` and ``.previous_element`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The ``.next_element`` attribute of a string or tag points to whatever was parsed immediately afterwards. It might be the same as ``.next_sibling``, but it's usually drastically different. Here's the final tag in the "three sisters" document. Its ``.next_sibling`` is a string: the conclusion of the sentence that was interrupted by the start of the tag.:: last_a_tag = soup.find("a", id="link3") last_a_tag # Tillie last_a_tag.next_sibling # '; and they lived at the bottom of a well.' But the ``.next_element`` of that tag, the thing that was parsed immediately after the tag, is `not` the rest of that sentence: it's the word "Tillie":: last_a_tag.next_element # u'Tillie' That's because in the original markup, the word "Tillie" appeared before that semicolon. The parser encountered an tag, then the word "Tillie", then the closing tag, then the semicolon and rest of the sentence. The semicolon is on the same level as the tag, but the word "Tillie" was encountered first. The ``.previous_element`` attribute is the exact opposite of ``.next_element``. It points to whatever element was parsed immediately before this one:: last_a_tag.previous_element # u' and\n' last_a_tag.previous_element.next_element # Tillie ``.next_elements`` and ``.previous_elements`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ You should get the idea by now. You can use these iterators to move forward or backward in the document as it was parsed:: for element in last_a_tag.next_elements: print(repr(element)) # u'Tillie' # u';\nand they lived at the bottom of a well.' # u'\n\n' #

...

# u'...' # u'\n' # None Searching the tree ================== Beautiful Soup defines a lot of methods for searching the parse tree, but they're all very similar. I'm going to spend a lot of time explaining the two most popular methods: ``find()`` and ``find_all()``. The other methods take almost exactly the same arguments, so I'll just cover them briefly. Once again, I'll be using the "three sisters" document as an example:: html_doc = """

The Dormouse's story

Once upon a time there were three little sisters; and their names were Elsie, Lacie and Tillie; and they lived at the bottom of a well.

...

""" from bs4 import BeautifulSoup soup = BeautifulSoup(html_doc) By passing in a filter to a method like ``find_all()``, you can isolate whatever parts of the document you're interested in. Kinds of filters ---------------- Before talking in detail about ``find_all()`` and similar methods, I want to show examples of different filters you can pass into these methods. These filters show up again and again, throughout the search API. You can use them to filter based on a tag's name, on its attributes, on the text of a string, or on some combination of these. .. _a string: A string ^^^^^^^^ The simplest filter is a string. Pass a string to a search method and Beautiful Soup will perform a match against that exact string. This code finds all the <b> tags in the document:: soup.find_all('b') # [The Dormouse's story] If you pass in a byte string, Beautiful Soup will assume the string is encoded as UTF-8. You can avoid this by passing in a Unicode string instead. .. _a regular expression: A regular expression ^^^^^^^^^^^^^^^^^^^^ If you pass in a regular expression object, Beautiful Soup will filter against that regular expression. This code finds all the tags whose names start with the letter "b"; in this case, the <body> tag and the <b> tag:: import re for tag in soup.find_all(re.compile("b.*")): print(tag.name) # body # b .. _a list: A list ^^^^^^ If you pass in a list, Beautiful Soup will allow a string match against `any` item in that list. This code finds all the <a> tags `and` all the <b> tags:: soup.find_all(["a", "b"]) # [The Dormouse's story, # Elsie, # Lacie, # Tillie] .. _the value True: ``True`` ^^^^^^^^ The value ``True`` matches everything it can. This code finds `all` the tags in the document, but none of the text strings:: for tag in soup.find_all(True): print(tag.name) # html # head # title # body # p # b # p # a # a # a # p .. _a function: A function ^^^^^^^^^^ If none of the other matches work for you, define a function that takes an element as its only argument. 
The function should return ``True`` if the argument matches, and ``False`` otherwise. Here's a function that returns ``True`` if a tag defines the "class" attribute but doesn't define the "id" attribute:: def has_class_but_no_id(tag): return tag.has_key('class') and not tag.has_key('id') Pass this function into ``find_all()`` and you'll pick up all the

tags:: soup.find_all(has_class_but_no_id) # [

The Dormouse's story

, #

Once upon a time there were...

, #

...

] This function only picks up the

tags. It doesn't pick up the tags, because those tags define both "class" and "id". It doesn't pick up tags like

and ] soup.find_all("p", "title") # [

The Dormouse's story

] soup.find_all("a") # [Elsie, # Lacie, # Tillie] soup.find_all(id="link2") # [Lacie] import re soup.find(text=re.compile("sisters")) # u'Once upon a time there were three little sisters; and their names were\n' Some of these should look familiar, but others are new. What does it mean to pass in a value for ``text``, or ``id``? Why does ``find_all("p", "title")`` find a

tag with the CSS class "title"? Let's look at the arguments to ``find_all()``. .. _name: The ``name`` argument ^^^^^^^^^^^^^^^^^^^^^ Pass in a value for ``name`` and you'll tell Beautiful Soup to only consider tags with certain names. Text strings will be ignored, as will tags whose names don't match. This is the simplest usage:: soup.find_all("title") # [

] Recall from `Kinds of filters`_ that the value to ``name`` can be `a string`_, `a regular expression`_, `a list`_, `a function`_, or `the value True`_. .. _kwargs: The keyword arguments ^^^^^^^^^^^^^^^^^^^^^ Any argument that's not recognized will be turned into a filter on one of a tag's attributes. If you pass in a value for an argument called ``id``, Beautiful Soup will filter against each tag's 'id' attribute:: soup.find_all(id='link2') # [Lacie] If you pass in a value for ``href``, Beautiful Soup will filter against each tag's 'href' attribute:: soup.find_all(href=re.compile("elsie")) # [Elsie] You can filter an attribute based on `a string`_, `a regular expression`_, `a list`_, `a function`_, or `the value True`_. This code finds all tags that have an ``id`` attribute, regardless of what the value is:: soup.find_all(id=True) # [Elsie, # Lacie, # Tillie] You can filter multiple attributes at once by passing in more than one keyword argument:: soup.find_all(href=re.compile("elsie"), id='link1') # [Elsie] .. _attrs: Searching by CSS class ^^^^^^^^^^^^^^^^^^^^^^ Instead of using keyword arguments, you can filter tags based on their attributes by passing a dictionary in for ``attrs``. These two lines of code are equivalent:: soup.find_all(href=re.compile("elsie"), id='link1') soup.find_all(attrs={'href' : re.compile("elsie"), 'id': 'link1'}) The ``attrs`` argument would be a pretty obscure feature were it not for one thing: CSS. It's very useful to search for a tag that has a certain CSS class, but the name of the CSS attribute, "class", is also a Python reserved word. You can use ``attrs`` to search by CSS class:: soup.find_all("a", { "class" : "sister" }) # [Elsie, # Lacie, # Tillie] But that's a lot of code for such a common operation. Instead, you can pass a string for ``attrs`` rather than a dictionary. 
The string will be used to restrict the CSS class:: soup.find_all("a", "sister") # [Elsie, # Lacie, # Tillie] You can also pass in a regular expression, a function or True. Anything you pass in for ``attrs`` that's not a dictionary will be used to search against the CSS class:: soup.find_all(attrs=re.compile("itl")) # [

The Dormouse's story

] def has_six_characters(css_class): return css_class is not None and len(css_class) == 6 soup.find_all(attrs=has_six_characters) # [Elsie, # Lacie, # Tillie] :ref:`Remember ` that a single tag can have multiple values for its "class" attribute. When you search for a tag that matches a certain CSS class, you're matching against `any` of its CSS classes:: css_soup = BeautifulSoup('

') css_soup.find_all("p", "strikeout") # [

] css_soup.find_all("p", "body") # [

] Searching for the string value of the ``class`` attribute won't work:: css_soup.find_all("p", "body strikeout") # [] .. _text: The ``text`` argument ^^^^^^^^^^^^^^^^^^^^^ With ``text`` you can search for strings instead of tags. As with ``name`` and the keyword arguments, you can pass in `a string`_, `a regular expression`_, `a list`_, `a function`_, or `the value True`_. Here are some examples:: soup.find_all(text="Elsie") # [u'Elsie'] soup.find_all(text=["Tillie", "Elsie", "Lacie"]) # [u'Elsie', u'Lacie', u'Tillie'] soup.find_all(text=re.compile("Dormouse")) # [u"The Dormouse's story", u"The Dormouse's story"] def is_the_only_string_within_a_tag(s): """Return True if this string is the only child of its parent tag.""" return (s == s.parent.string) soup.find_all(text=is_the_only_string_within_a_tag) # [u"The Dormouse's story", u"The Dormouse's story", u'Elsie', u'Lacie', u'Tillie', u'...'] Although ``text`` is for finding strings, you can combine it with arguments for finding tags: Beautiful Soup will find all tags whose ``.string`` matches your value for ``text``. This code finds the <a> tags whose ``.string`` is "Elsie":: soup.find_all("a", text="Elsie") # [ Elsie] .. _limit: The ``limit`` argument ^^^^^^^^^^^^^^^^^^^^^^ ``find_all()`` returns all the tags and strings that match your filters. This can take a while if the document is large. If you don't need `all` the results, you can pass in a number for ``limit``. This works just like the LIMIT keyword in SQL. It tells Beautiful Soup to stop gathering results after it's found a certain number. There are three links in the "three sisters" document, but this code only finds the first two:: soup.find_all("a", limit=2) # [Elsie, # Lacie] .. _recursive: The ``recursive`` argument ^^^^^^^^^^^^^^^^^^^^^^^^^^ If you call ``mytag.find_all()``, Beautiful Soup will examine all the descendants of ``mytag``: its children, its children's children, and so on. 
If you only want Beautiful Soup to consider direct children, you can pass in ``recursive=False``. See the difference here:: soup.html.find_all("title") # [] soup.html.find_all("title", recursive=False) # [] Here's that part of the document:: ... The tag, but it's not `directly` beneath the tag: the tag is in the way. Beautiful Soup finds the tag, but when ``recursive=False`` restricts it to the tag's immediate children, it finds nothing. Beautiful Soup offers a lot of tree-searching methods (covered below), and they mostly take the same arguments as ``find_all()``: ``name``, ``attrs``, ``text``, ``limit``, and the keyword arguments. But the ``recursive`` argument is different: ``find_all()`` and ``find()`` are the only methods that support it. Passing ``recursive=False`` into a method like ``find_parents()`` wouldn't be very useful. Calling a tag is like calling ``find_all()`` -------------------------------------------- Because ``find_all()`` is the most popular method in the Beautiful Soup search API, you can use a shortcut for it. If you treat the ``BeautifulSoup`` object or a ``Tag`` object as though it were a function, then it's the same as calling ``find_all()`` on that object. These two lines of code are equivalent:: soup.find_all("a") soup("a") These two lines are also equivalent:: soup.title.find_all(text=True) soup.title(text=True) ``find()`` ---------- Signature: find(:ref:`name `, :ref:`attrs `, :ref:`recursive `, :ref:`text `, :ref:`**kwargs `) The ``find_all()`` method scans the entire document looking for results, but sometimes you only want to find one result. If you know a document only has one tag, it's a waste of time to scan the entire document looking for more. Rather than passing in ``limit=1`` every time you call ``find_all``, you can use the ``find()`` method. 
These two lines of code are `nearly` equivalent:: soup.find_all('title', limit=1) # [] soup.find('title') # The only difference is that ``find_all()`` returns a list containing the single result, and ``find()`` just returns the result. If ``find_all()`` can't find anything, it returns an empty list. If ``find()`` can't find anything, it returns ``None``:: print(soup.find("nosuchtag")) # None Remember the ``soup.head.title`` trick from `Navigating using tag names`_? That trick works by repeatedly calling ``find()``:: soup.head.title # soup.find("head").find("title") # ``find_parents()`` and ``find_parent()`` ---------------------------------------- Signature: find_parents(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`limit `, :ref:`**kwargs `) Signature: find_parent(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`**kwargs `) I spent a lot of time above covering ``find_all()`` and ``find()``. The Beautiful Soup API defines ten other methods for searching the tree, but don't be afraid. Five of these methods are basically the same as ``find_all()``, and the other five are basically the same as ``find()``. The only differences are in what parts of the tree they search. First let's consider ``find_parents()`` and ``find_parent()``. Remember that ``find_all()`` and ``find()`` work their way down the tree, looking at tag's descendants. These methods do the opposite: they work their way `up` the tree, looking at a tag's (or a string's) parents. Let's try them out, starting from a string buried deep in the "three daughters" document:: a_string = soup.find(text="Lacie") a_string # u'Lacie' a_string.find_parents("a") # [Lacie] a_string.find_parent("p") #

Once upon a time there were three little sisters; and their names were # Elsie, # Lacie and # Tillie; # and they lived at the bottom of a well.

a_string.find_parents("p", "title") # [] One of the three <a> tags is the direct parent of the string in question, so our search finds it. One of the three

tags is an indirect parent of the string, and our search finds that as well. There's a

tag with the CSS class "title" `somewhere` in the document, but it's not one of this string's parents, so we can't find it with ``find_parents()``. You may have made the connection between ``find_parent()`` and ``find_parents()``, and the `.parent`_ and `.parents`_ attributes mentioned earlier. The connection is very strong. These search methods actually use ``.parents`` to iterate over all the parents, and check each one against the provided filter to see if it matches. ``find_next_siblings()`` and ``find_next_sibling()`` ---------------------------------------------------- Signature: find_next_siblings(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`limit `, :ref:`**kwargs `) Signature: find_next_sibling(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`**kwargs `) These methods use :ref:`.next_siblings ` to iterate over the rest of an element's siblings in the tree. The ``find_next_siblings()`` method returns all the siblings that match, and ``find_next_sibling()`` only returns the first one:: first_link = soup.a first_link # Elsie first_link.find_next_siblings("a") # [Lacie, # Tillie] first_story_paragraph = soup.find("p", "story") first_story_paragraph.find_next_sibling("p") #

...

``find_previous_siblings()`` and ``find_previous_sibling()`` ------------------------------------------------------------ Signature: find_previous_siblings(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`limit `, :ref:`**kwargs `) Signature: find_previous_sibling(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`**kwargs `) These methods use :ref:`.previous_siblings ` to iterate over an element's siblings that precede it in the tree. The ``find_previous_siblings()`` method returns all the siblings that match, and ``find_previous_sibling()`` only returns the first one:: last_link = soup.find("a", id="link3") last_link # Tillie last_link.find_previous_siblings("a") # [Lacie, # Elsie] first_story_paragraph = soup.find("p", "story") first_story_paragraph.find_previous_sibling("p") #

The Dormouse's story

``find_all_next()`` and ``find_next()`` --------------------------------------- Signature: find_all_next(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`limit `, :ref:`**kwargs `) Signature: find_next(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`**kwargs `) These methods use :ref:`.next_elements ` to iterate over whatever tags and strings that come after it in the document. The ``find_all_next()`` method returns all matches, and ``find_next()`` only returns the first match:: first_link = soup.a first_link # Elsie first_link.find_all_next(text=True) # [u'Elsie', u',\n', u'Lacie', u' and\n', u'Tillie', # u';\nand they lived at the bottom of a well.', u'\n\n', u'...', u'\n'] first_link.find_next("p") #

...

In the first example, the string "Elsie" showed up, even though it was contained within the tag we started from. In the second example, the last

tag in the document showed up, even though it's not in the same part of the tree as the tag we started from. For these methods, all that matters is that an element match the filter, and show up later in the document than the starting element. ``find_all_previous()`` and ``find_previous()`` ----------------------------------------------- Signature: find_all_previous(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`limit `, :ref:`**kwargs `) Signature: find_previous(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`**kwargs `) These methods use :ref:`.previous_elements ` to iterate over the tags and strings that came before it in the document. The ``find_all_previous()`` method returns all matches, and ``find_previous()`` only returns the first match:: first_link = soup.a first_link # Elsie first_link.find_all_previous("p") # [

Once upon a time there were three little sisters; ...

, #

The Dormouse's story

] first_link.find_previous("title") # The call to ``find_all_previous("p")`` found the first paragraph in the document (the one with class="title"), but it also finds the second paragraph, the

tag that contains the tag we started with. This shouldn't be too surprising: we're looking at all the tags that show up earlier in the document than the one we started with. A

tag that contains an tag must have shown up before the tag it contains. CSS selectors ------------- Beautiful Soup supports a subset of the `CSS selector standard `_. Just construct the selector as a string and pass it into the ``.select()`` method of a ``Tag`` or the ``BeautifulSoup`` object itself. You can find tags:: soup.select("title") # [

] Find tags beneath other tags:: soup.select("body a") # [Elsie, # Lacie, # Tillie] soup.select("html head title") # [] Find tags `directly` beneath other tags:: soup.select("head > title") # [] soup.select("p > a") # [Elsie, # Lacie, # Tillie] soup.select("body > a") # [] Find tags by CSS class:: soup.select(".sister") # [Elsie, # Lacie, # Tillie] soup.select("[class~=sister]") # [Elsie, # Lacie, # Tillie] Find tags by ID:: soup.select("#link1") # [Elsie] soup.select("a#link2") # [Lacie] Test for the existence of an attribute:: soup.select('a[href]') # [Elsie, # Lacie, # Tillie] Find tags by attribute value:: soup.select('a[href="http://example.com/elsie"]') # [Elsie] soup.select('a[href^="http://example.com/"]') # [Elsie, # Lacie, # Tillie] soup.select('a[href$="tillie"]') # [Tillie] soup.select('a[href*=".com/el"]') # [Elsie] Match language codes:: multilingual_markup = """

Hello

Howdy, y'all

Pip-pip, old fruit

Bonjour mes amis

""" multilingual_soup = BeautifulSoup(multilingual_markup) multilingual_soup.select('p[lang|=en]') # [

Hello

, #

Howdy, y'all

, #

Pip-pip, old fruit

] This is a convenience for users who know the CSS selector syntax. You can do all this stuff with the Beautiful Soup API. And if CSS selectors are all you need, you might as well use lxml directly, because it's faster. But this lets you `combine` simple CSS selectors with the Beautiful Soup API. Modifying the tree ================== Beautiful Soup's main strength is in searching the parse tree, but you can also modify the tree and write your changes as a new HTML or XML document. Changing tag names and attributes --------------------------------- I covered this earlier, in `Attributes`_, but it bears repeating. You can rename a tag, change the values of its attributes, add new attributes, and delete attributes:: soup = BeautifulSoup('Extremely bold') tag = soup.b tag.name = "blockquote" tag['class'] = 'verybold' tag['id'] = 1 tag #
Extremely bold
del tag['class'] del tag['id'] tag #
Extremely bold
Modifying ``.string`` --------------------- If you set a tag's ``.string`` attribute, the tag's contents are replaced with the string you give:: markup = 'I linked to example.com' soup = BeautifulSoup(markup) tag = soup.a tag.string = "New link text." tag # New link text. Be careful: if the tag contained other tags, they and all their contents will be destroyed. ``append()`` ------------ You can add to a tag's contents with ``Tag.append()``. It works just like calling ``.append()`` on a Python list:: soup = BeautifulSoup("Foo") soup.a.append("Bar") soup # FooBar soup.a.contents # [u'Foo', u'Bar'] ``BeautifulSoup.new_string()`` and ``.new_tag()`` ------------------------------------------------- If you need to add a string to a document, no problem--you can pass a Python string in to ``append()``, or you can call the factory method ``BeautifulSoup.new_string()``:: soup = BeautifulSoup("") tag = soup.b tag.append("Hello") new_string = soup.new_string(" there") tag.append(new_string) tag # Hello there. tag.contents # [u'Hello', u' there'] What if you need to create a whole new tag? The best solution is to call the factory method ``BeautifulSoup.new_tag()``:: soup = BeautifulSoup("") original_tag = soup.b new_tag = soup.new_tag("a", href="http://www.example.com") original_tag.append(new_tag) original_tag # new_tag.string = "Link text." original_tag # Link text. Only the first argument, the tag name, is required. ``insert()`` ------------ ``Tag.insert()`` is just like ``Tag.append()``, except the new element doesn't necessarily go at the end of its parent's ``.contents``. It'll be inserted at whatever numeric position you say. 
It works just like ``.insert()`` on a Python list:: markup = 'I linked to example.com' soup = BeautifulSoup(markup) tag = soup.a tag.insert(1, "but did not endorse ") tag # I linked to but did not endorse example.com tag.contents # [u'I linked to ', u'but did not endorse', example.com] ``insert_before()`` and ``insert_after()`` ------------------------------------------ The ``insert_before()`` method inserts a tag or string immediately before something else in the parse tree:: soup = BeautifulSoup("stop") tag = soup.new_tag("i") tag.string = "Don't" soup.b.string.insert_before(tag) soup.b # Don'tstop The ``insert_after()`` method moves a tag or string so that it immediately follows something else in the parse tree:: soup.b.i.insert_after(soup.new_string(" ever ")) soup.b # Don't ever stop soup.b.contents # [Don't, u' ever ', u'stop'] ``clear()`` ----------- ``Tag.clear()`` removes the contents of a tag:: markup = 'I linked to example.com' soup = BeautifulSoup(markup) tag = soup.a tag.clear() tag # ``extract()`` ------------- ``PageElement.extract()`` removes a tag or string from the tree. It returns the tag or string that was extracted:: markup = 'I linked to example.com' soup = BeautifulSoup(markup) a_tag = soup.a i_tag = soup.i.extract() a_tag # I linked to i_tag # example.com print(i_tag.parent) None At this point you effectively have two parse trees: one rooted at the ``BeautifulSoup`` object you used to parse the document, and one rooted at the tag that was extracted. You can go on to call ``extract`` on a child of the element you extracted:: my_string = i_tag.string.extract() my_string # u'example.com' print(my_string.parent) # None i_tag # ``decompose()`` --------------- ``Tag.decompose()`` removes a tag from the tree, then `completely destroys it and its contents`:: markup = 'I linked to example.com' soup = BeautifulSoup(markup) a_tag = soup.a soup.i.decompose() a_tag # I linked to .. 
_replace_with: ``replace_with()`` ------------------ ``PageElement.replace_with()`` removes a tag or string from the tree, and replaces it with the tag or string of your choice:: markup = 'I linked to example.com' soup = BeautifulSoup(markup) a_tag = soup.a new_tag = soup.new_tag("b") new_tag.string = "example.net" a_tag.i.replace_with(new_tag) a_tag # I linked to example.net ``replace_with()`` returns the tag or string that was replaced, so that you can examine it or add it back to another part of the tree. ``wrap()`` ---------- ``PageElement.wrap()`` wraps an element in the tag you specify. It returns the new wrapper:: soup = BeautifulSoup("

I wish I was bold.

") soup.p.string.wrap(soup.new_tag("b")) # I wish I was bold. soup.p.wrap(soup.new_tag("div")) #

I wish I was bold.

This method is new in Beautiful Soup 4.0.5. ``unwrap()`` --------------------------- ``Tag.unwrap()`` is the opposite of ``wrap()``. It replaces a tag with whatever's inside that tag. It's good for stripping out markup:: markup = 'I linked to example.com' soup = BeautifulSoup(markup) a_tag = soup.a a_tag.i.unwrap() a_tag # I linked to example.com Like ``replace_with()``, ``unwrap()`` returns the tag that was replaced. (In earlier versions of Beautiful Soup, ``unwrap()`` was called ``replace_with_children()``, and that name will still work.) Output ====== Pretty-printing --------------- The ``prettify()`` method will turn a Beautiful Soup parse tree into a nicely formatted bytestring, with each HTML/XML tag on its own line:: markup = 'I linked to example.com' soup = BeautifulSoup(markup) soup.prettify() # '\n \n \n \n \n...' print(soup.prettify()) # # # # # # I linked to # # example.com # # # # You can call ``prettify()`` on the top-level ``BeautifulSoup`` object, or on any of its ``Tag`` objects:: print(soup.a.prettify()) # # I linked to # # example.com # # Non-pretty printing ------------------- If you just want a string, with no fancy formatting, you can call ``unicode()`` or ``str()`` on a ``BeautifulSoup`` object, or a ``Tag`` within it:: str(soup) # 'I linked to example.com' unicode(soup.a) # u'I linked to example.com' The ``str()`` function returns a string encoded in UTF-8. See `Encodings`_ for other options. You can also call ``encode()`` to get a bytestring, and ``decode()`` to get Unicode. .. _output_formatters: Output formatters ----------------- If you give Beautiful Soup a document that contains HTML entities like "&ldquo;", they'll be converted to Unicode characters:: soup = BeautifulSoup("&ldquo;Dammit!&rdquo; he said.") unicode(soup) # u'\u201cDammit!\u201d he said.' If you then convert the document to a string, the Unicode characters will be encoded as UTF-8. You won't get the HTML entities back:: str(soup) # '\xe2\x80\x9cDammit!\xe2\x80\x9d he said.' 
By default, the only characters that are escaped upon output are bare ampersands and angle brackets. These get turned into "&amp;", "&lt;", and "&gt;", so that Beautiful Soup doesn't inadvertently generate invalid HTML or XML:: soup = BeautifulSoup("

The law firm of Dewey, Cheatem, & Howe

") soup.p #

The law firm of Dewey, Cheatem, &amp; Howe

soup = BeautifulSoup('A link') soup.a # A link You can change this behavior by providing a value for the ``formatter`` argument to ``prettify()``, ``encode()``, or ``decode()``. Beautiful Soup recognizes four possible values for ``formatter``. The default is ``formatter="minimal"``. Strings will only be processed enough to ensure that Beautiful Soup generates valid HTML/XML:: french = "

Il a dit &lt;&lt;Sacr&eacute; bleu!&gt;&gt;

" soup = BeautifulSoup(french) print(soup.prettify(formatter="minimal")) # # #

# Il a dit &lt;&lt;Sacré bleu!&gt;&gt; #

# # If you pass in ``formatter="html"``, Beautiful Soup will convert Unicode characters to HTML entities whenever possible:: print(soup.prettify(formatter="html")) # # #

# Il a dit &lt;&lt;Sacr&eacute; bleu!&gt;&gt; #

# # If you pass in ``formatter=None``, Beautiful Soup will not modify strings at all on output. This is the fastest option, but it may lead to Beautiful Soup generating invalid HTML/XML, as in these examples:: print(soup.prettify(formatter=None)) # # #

# Il a dit <<Sacré bleu!>> #

# # link_soup = BeautifulSoup('A link') print(link_soup.a.encode(formatter=None)) # A link Finally, if you pass in a function for ``formatter``, Beautiful Soup will call that function once for every string and attribute value in the document. You can do whatever you want in this function. Here's a formatter that converts strings to uppercase and does absolutely nothing else:: def uppercase(str): return str.upper() print(soup.prettify(formatter=uppercase)) # # #

# IL A DIT <<SACRÉ BLEU!>> #

# # print(link_soup.a.prettify(formatter=uppercase)) # # A LINK # If you're writing your own function, you should know about the ``EntitySubstitution`` class in the ``bs4.dammit`` module. This class implements Beautiful Soup's standard formatters as class methods: the "html" formatter is ``EntitySubstitution.substitute_html``, and the "minimal" formatter is ``EntitySubstitution.substitute_xml``. You can use these functions to simulate ``formatter="html"`` or ``formatter="minimal"``, but then do something extra. Here's an example that replaces Unicode characters with HTML entities whenever possible, but `also` converts all strings to uppercase:: from bs4.dammit import EntitySubstitution def uppercase_and_substitute_html_entities(str): return EntitySubstitution.substitute_html(str.upper()) print(soup.prettify(formatter=uppercase_and_substitute_html_entities)) # # #

# IL A DIT &lt;&lt;SACR&Eacute; BLEU!&gt;&gt; #

# # One last caveat: if you create a ``CData`` object, the text inside that object is always presented `exactly as it appears, with no formatting`. Beautiful Soup will call the formatter method, just in case you've written a custom method that counts all the strings in the document or something, but it will ignore the return value. from bs4.element import CData soup = BeautifulSoup("") soup.a.string = CData("one < three") print(soup.a.prettify(formatter="xml")) # # one < three # ``get_text()`` -------------- If you only want the text part of a document or tag, you can use the ``get_text()`` method. It returns all the text in a document or beneath a tag, as a single Unicode string:: markup = '\nI linked to example.com\n' soup = BeautifulSoup(markup) soup.get_text() u'\nI linked to example.com\n' soup.i.get_text() u'example.com' You can specify a string to be used to join the bits of text together:: # soup.get_text("|") u'\nI linked to |example.com|\n' You can tell Beautiful Soup to strip whitespace from the beginning and end of each bit of text:: # soup.get_text("|", strip=True) u'I linked to|example.com' But at that point you might want to use the :ref:`.stripped_strings ` generator instead, and process the text yourself:: [text for text in soup.stripped_strings] # [u'I linked to', u'example.com'] Specifying the parser to use ============================ If you just need to parse some HTML, you can dump the markup into the ``BeautifulSoup`` constructor, and it'll probably be fine. Beautiful Soup will pick a parser for you and parse the data. But there are a few additional arguments you can pass in to the constructor to change which parser is used. The first argument to the ``BeautifulSoup`` constructor is a string or an open filehandle--the markup you want parsed. The second argument is `how` you'd like the markup parsed. If you don't specify anything, you'll get the best HTML parser that's installed. 
Beautiful Soup ranks lxml's parser as being the best, then html5lib's, then Python's built-in parser. You can override this by specifying one of the following: * What type of markup you want to parse. Currently supported are "html", "xml", and "html5". * The name of the parser library you want to use. Currently supported options are "lxml", "html5lib", and "html.parser" (Python's built-in HTML parser). The section `Installing a parser`_ contrasts the supported parsers. If you don't have an appropriate parser installed, Beautiful Soup will ignore your request and pick a different parser. Right now, the only supported XML parser is lxml. If you don't have lxml installed, asking for an XML parser won't give you one, and asking for "lxml" won't work either. Differences between parsers --------------------------- Beautiful Soup presents the same interface to a number of different parsers, but each parser is different. Different parsers will create different parse trees from the same document. The biggest differences are between the HTML parsers and the XML parsers. Here's a short document, parsed as HTML:: BeautifulSoup("") # Since an empty tag is not valid HTML, the parser turns it into a tag pair. Here's the same document parsed as XML (running this requires that you have lxml installed). Note that the empty tag is left alone, and that the document is given an XML declaration instead of being put into an tag.:: BeautifulSoup("", "xml") # # There are also differences between HTML parsers. If you give Beautiful Soup a perfectly-formed HTML document, these differences won't matter. One parser will be faster than another, but they'll all give you a data structure that looks exactly like the original HTML document. But if the document is not perfectly-formed, different parsers will give different results. Here's a short, invalid document parsed using lxml's HTML parser. 
Note that the dangling tag is simply ignored:: BeautifulSoup("", "lxml") # Here's the same document parsed using html5lib:: BeautifulSoup("", "html5lib") #

Instead of ignoring the dangling tag, html5lib pairs it with an opening

tag. This parser also adds an empty

tag to the document. Here's the same document parsed with Python's built-in HTML parser:: BeautifulSoup("
", "html.parser") # Like html5lib, this parser ignores the closing tag. Unlike html5lib, this parser makes no attempt to create a well-formed HTML document by adding a tag. Unlike lxml, it doesn't even bother to add an tag. Since the document "" is invalid, none of these techniques is the "correct" way to handle it. The html5lib parser uses techniques that are part of the HTML5 standard, so it has the best claim on being the "correct" way, but all three techniques are legitimate. Differences between parsers can affect your script. If you're planning on distributing your script to other people, or running it on multiple machines, you should specify a parser in the ``BeautifulSoup`` constructor. That will reduce the chances that your users parse a document differently from the way you parse it. Encodings ========= Any HTML or XML document is written in a specific encoding like ASCII or UTF-8. But when you load that document into Beautiful Soup, you'll discover it's been converted to Unicode:: markup = "

Sacr\xc3\xa9 bleu!

" soup = BeautifulSoup(markup) soup.h1 #

Sacré bleu!

soup.h1.string # u'Sacr\xe9 bleu!' It's not magic. (That sure would be nice.) Beautiful Soup uses a sub-library called `Unicode, Dammit`_ to detect a document's encoding and convert it to Unicode. The autodetected encoding is available as the ``.original_encoding`` attribute of the ``BeautifulSoup`` object:: soup.original_encoding 'utf-8' Unicode, Dammit guesses correctly most of the time, but sometimes it makes mistakes. Sometimes it guesses correctly, but only after a byte-by-byte search of the document that takes a very long time. If you happen to know a document's encoding ahead of time, you can avoid mistakes and delays by passing it to the ``BeautifulSoup`` constructor as ``from_encoding``. Here's a document written in ISO-8859-8. The document is so short that Unicode, Dammit can't get a good lock on it, and misidentifies it as ISO-8859-7:: markup = b"

\xed\xe5\xec\xf9

" soup = BeautifulSoup(markup) soup.h1

νεμω

soup.original_encoding 'ISO-8859-7' We can fix this by passing in the correct ``from_encoding``:: soup = BeautifulSoup(markup, from_encoding="iso-8859-8") soup.h1

םולש

soup.original_encoding 'iso8859-8' In rare cases (usually when a UTF-8 document contains text written in a completely different encoding), the only way to get Unicode may be to replace some characters with the special Unicode character "REPLACEMENT CHARACTER" (U+FFFD, �). If Unicode, Dammit needs to do this, it will set the ``.contains_replacement_characters`` attribute to ``True`` on the ``UnicodeDammit`` or ``BeautifulSoup`` object. This lets you know that the Unicode representation is not an exact representation of the original--some data was lost. If a document contains �, but ``.contains_replacement_characters`` is ``False``, you'll know that the � was there originally (as it is in this paragraph) and doesn't stand in for missing data. Output encoding --------------- When you write out a document from Beautiful Soup, you get a UTF-8 document, even if the document wasn't in UTF-8 to begin with. Here's a document written in the Latin-1 encoding:: markup = b'''

Sacr\xe9 bleu!

''' soup = BeautifulSoup(markup) print(soup.prettify()) # # # # # #

# Sacré bleu! #

# # Note that the tag has been rewritten to reflect the fact that the document is now in UTF-8. If you don't want UTF-8, you can pass an encoding into ``prettify()``:: print(soup.prettify("latin-1")) # # # # ... You can also call encode() on the ``BeautifulSoup`` object, or any element in the soup, just as if it were a Python string:: soup.p.encode("latin-1") # '

Sacr\xe9 bleu!

' soup.p.encode("utf-8") # '

Sacr\xc3\xa9 bleu!

' Any characters that can't be represented in your chosen encoding will be converted into numeric XML entity references. Here's a document that includes the Unicode character SNOWMAN:: markup = u"\N{SNOWMAN}" snowman_soup = BeautifulSoup(markup) tag = snowman_soup.b The SNOWMAN character can be part of a UTF-8 document (it looks like ☃), but there's no representation for that character in ISO-Latin-1 or ASCII, so it's converted into "&#9731;" for those encodings:: print(tag.encode("utf-8")) # ☃ print(tag.encode("latin-1")) # &#9731; print(tag.encode("ascii")) # &#9731; Unicode, Dammit --------------- You can use Unicode, Dammit without using Beautiful Soup. It's useful whenever you have data in an unknown encoding and you just want it to become Unicode:: from bs4 import UnicodeDammit dammit = UnicodeDammit("Sacr\xc3\xa9 bleu!") print(dammit.unicode_markup) # Sacré bleu! dammit.original_encoding # 'utf-8' The more data you give Unicode, Dammit, the more accurately it will guess. If you have your own suspicions as to what the encoding might be, you can pass them in as a list:: dammit = UnicodeDammit("Sacr\xe9 bleu!", ["latin-1", "iso-8859-1"]) print(dammit.unicode_markup) # Sacré bleu! dammit.original_encoding # 'latin-1' Unicode, Dammit has two special features that Beautiful Soup doesn't use. Smart quotes ^^^^^^^^^^^^ You can use Unicode, Dammit to convert Microsoft smart quotes to HTML or XML entities:: markup = b"

I just \x93love\x94 Microsoft Word\x92s smart quotes

" UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="html").unicode_markup # u'

I just “love” Microsoft Word’s smart quotes

' UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="xml").unicode_markup # u'

I just “love” Microsoft Word’s smart quotes

' You can also convert Microsoft smart quotes to ASCII quotes:: UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="ascii").unicode_markup # u'

I just "love" Microsoft Word\'s smart quotes

' Hopefully you'll find this feature useful, but Beautiful Soup doesn't use it. Beautiful Soup prefers the default behavior, which is to convert Microsoft smart quotes to Unicode characters along with everything else:: UnicodeDammit(markup, ["windows-1252"]).unicode_markup # u'

I just \u201clove\u201d Microsoft Word\u2019s smart quotes

' Inconsistent encodings ^^^^^^^^^^^^^^^^^^^^^^ Sometimes a document is mostly in UTF-8, but contains Windows-1252 characters such as (again) Microsoft smart quotes. This can happen when a website includes data from multiple sources. You can use ``UnicodeDammit.detwingle()`` to turn such a document into pure UTF-8. Here's a simple example:: snowmen = (u"\N{SNOWMAN}" * 3) quote = (u"\N{LEFT DOUBLE QUOTATION MARK}I like snowmen!\N{RIGHT DOUBLE QUOTATION MARK}") doc = snowmen.encode("utf8") + quote.encode("windows_1252") This document is a mess. The snowmen are in UTF-8 and the quotes are in Windows-1252. You can display the snowmen or the quotes, but not both:: print(doc) # ☃☃☃�I like snowmen!� print(doc.decode("windows-1252")) # â˜ƒâ˜ƒâ˜ƒ“I like snowmen!” Decoding the document as UTF-8 raises a ``UnicodeDecodeError``, and decoding it as Windows-1252 gives you gibberish. Fortunately, ``UnicodeDammit.detwingle()`` will convert the string to pure UTF-8, allowing you to decode it to Unicode and display the snowmen and quote marks simultaneously:: new_doc = UnicodeDammit.detwingle(doc) print(new_doc.decode("utf8")) # ☃☃☃“I like snowmen!” ``UnicodeDammit.detwingle()`` only knows how to handle Windows-1252 embedded in UTF-8 (or vice versa, I suppose), but this is the most common case. Note that you must know to call ``UnicodeDammit.detwingle()`` on your data before passing it into ``BeautifulSoup`` or the ``UnicodeDammit`` constructor. Beautiful Soup assumes that a document has a single encoding, whatever it might be. If you pass it a document that contains both UTF-8 and Windows-1252, it's likely to think the whole document is Windows-1252, and the document will come out looking like ` â˜ƒâ˜ƒâ˜ƒ“I like snowmen!”`. ``UnicodeDammit.detwingle()`` is new in Beautiful Soup 4.1.0. Parsing only part of a document =============================== Let's say you want to use Beautiful Soup to look at a document's tags. 
It's a waste of time and memory to parse the entire document and then go over it again looking for tags. It would be much faster to ignore everything that wasn't an tag in the first place. The ``SoupStrainer`` class allows you to choose which parts of an incoming document are parsed. You just create a ``SoupStrainer`` and pass it in to the ``BeautifulSoup`` constructor as the ``parse_only`` argument. (Note that *this feature won't work if you're using the html5lib parser*. If you use html5lib, the whole document will be parsed, no matter what. This is because html5lib constantly rearranges the parse tree as it works, and if some part of the document didn't actually make it into the parse tree, it'll crash. To avoid confusion, in the examples below I'll be forcing Beautiful Soup to use Python's built-in parser.) ``SoupStrainer`` ---------------- The ``SoupStrainer`` class takes the same arguments as a typical method from `Searching the tree`_: :ref:`name `, :ref:`attrs `, :ref:`text `, and :ref:`**kwargs `. Here are three ``SoupStrainer`` objects:: from bs4 import SoupStrainer only_a_tags = SoupStrainer("a") only_tags_with_id_link2 = SoupStrainer(id="link2") def is_short_string(string): return len(string) < 10 only_short_strings = SoupStrainer(text=is_short_string) I'm going to bring back the "three sisters" document one more time, and we'll see what the document looks like when it's parsed with these three ``SoupStrainer`` objects:: html_doc = """

The Dormouse's story

Once upon a time there were three little sisters; and their names were Elsie, Lacie and Tillie; and they lived at the bottom of a well.

...

""" print(BeautifulSoup(html_doc, "html.parser", parse_only=only_a_tags).prettify()) # # Elsie # # # Lacie # # # Tillie # print(BeautifulSoup(html_doc, "html.parser", parse_only=only_tags_with_id_link2).prettify()) # # Lacie # print(BeautifulSoup(html_doc, "html.parser", parse_only=only_short_strings).prettify()) # Elsie # , # Lacie # and # Tillie # ... # You can also pass a ``SoupStrainer`` into any of the methods covered in `Searching the tree`_. This probably isn't terribly useful, but I thought I'd mention it:: soup = BeautifulSoup(html_doc) soup.find_all(only_short_strings) # [u'\n\n', u'\n\n', u'Elsie', u',\n', u'Lacie', u' and\n', u'Tillie', # u'\n\n', u'...', u'\n'] Troubleshooting =============== Version mismatch problems ------------------------- * ``SyntaxError: Invalid syntax`` (on the line ``ROOT_TAG_NAME = u'[document]'``): Caused by running the Python 2 version of Beautiful Soup under Python 3, without converting the code. * ``ImportError: No module named HTMLParser`` - Caused by running the Python 2 version of Beautiful Soup under Python 3. * ``ImportError: No module named html.parser`` - Caused by running the Python 3 version of Beautiful Soup under Python 2. * ``ImportError: No module named BeautifulSoup`` - Caused by running Beautiful Soup 3 code on a system that doesn't have BS3 installed. Or, by writing Beautiful Soup 4 code without knowing that the package name has changed to ``bs4``. * ``ImportError: No module named bs4`` - Caused by running Beautiful Soup 4 code on a system that doesn't have BS4 installed. Parsing XML ----------- By default, Beautiful Soup parses documents as HTML. To parse a document as XML, pass in "xml" as the second argument to the ``BeautifulSoup`` constructor:: soup = BeautifulSoup(markup, "xml") You'll need to :ref:`have lxml installed `. Other parser problems --------------------- * If your script works on one computer but not another, it's probably because the two computers have different parser libraries available. 
For example, you may have developed the script on a computer that has lxml installed, and then tried to run it on a computer that only has html5lib installed. See `Differences between parsers`_ for why this matters, and fix the problem by mentioning a specific parser library in the ``BeautifulSoup`` constructor. * ``HTMLParser.HTMLParseError: malformed start tag`` or ``HTMLParser.HTMLParseError: bad end tag`` - Caused by giving Python's built-in HTML parser a document it can't handle. Any other ``HTMLParseError`` is probably the same problem. Solution: :ref:`Install lxml or html5lib. <parser-installation>` * If you can't find a tag that you know is in the document (that is, ``find_all()`` returned ``[]`` or ``find()`` returned ``None``), you're probably using Python's built-in HTML parser, which sometimes skips tags it doesn't understand. Solution: :ref:`Install lxml or html5lib. <parser-installation>` Miscellaneous ------------- * ``KeyError: [attr]`` - Caused by accessing ``tag['attr']`` when the tag in question doesn't define the ``attr`` attribute. The most common errors are ``KeyError: 'href'`` and ``KeyError: 'class'``. Use ``tag.get('attr')`` if you're not sure ``attr`` is defined, just as you would with a Python dictionary. * ``UnicodeEncodeError: 'charmap' codec can't encode character u'\xfoo' in position bar`` (or just about any other ``UnicodeEncodeError``) - This is not a problem with Beautiful Soup: you're trying to print a Unicode character that your console doesn't know how to display. See `this page on the Python wiki <http://wiki.python.org/moin/PrintFails>`_ for help. One easy solution is to write the text to a file and then look at the file. Improving Performance --------------------- Beautiful Soup will never be as fast as the parsers it sits on top of. If response time is critical, if you're paying for computer time by the hour, or if there's any other reason why computer time is more valuable than programmer time, you should forget about Beautiful Soup and work directly atop `lxml <http://lxml.de/>`_. 
That said, there are things you can do to speed up Beautiful Soup. If you're not using lxml as the underlying parser, my advice is to :ref:`start <parser-installation>`. Beautiful Soup parses documents significantly faster using lxml than using html.parser or html5lib. Sometimes `Unicode, Dammit`_ can only detect the encoding of a file by doing a byte-by-byte examination of the file. This slows Beautiful Soup to a crawl. My tests indicate that this only happened on 2.x versions of Python, and that it happened most often with documents using Russian or Chinese encodings. If this is happening to you, you can fix it by using Python 3 for your script. Or, if you happen to know a document's encoding, you can pass it into the ``BeautifulSoup`` constructor as ``from_encoding``. `Parsing only part of a document`_ won't save you much time parsing the document, but it can save a lot of memory, and it'll make `searching` the document much faster. Beautiful Soup 3 ================ Beautiful Soup 3 is the previous release series, and is no longer being actively developed. It's currently packaged with all major Linux distributions: :kbd:`$ apt-get install python-beautifulsoup` It's also published through PyPI as ``BeautifulSoup``: :kbd:`$ easy_install BeautifulSoup` :kbd:`$ pip install BeautifulSoup` You can also `download a tarball of Beautiful Soup 3.2.0 <http://www.crummy.com/software/BeautifulSoup/bs3/download/3.x/BeautifulSoup-3.2.0.tar.gz>`_. If you ran ``easy_install beautifulsoup`` or ``easy_install BeautifulSoup``, but your code doesn't work, you installed Beautiful Soup 3 by mistake. You need to run ``easy_install beautifulsoup4``. `The documentation for Beautiful Soup 3 is archived online <http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_. If your first language is Chinese, it might be easier for you to read `the Chinese translation of the Beautiful Soup 3 documentation <http://www.crummy.com/software/BeautifulSoup/bs3/documentation.zh.html>`_, then read this document to find out about the changes made in Beautiful Soup 4. Porting code to BS4 ------------------- Most code written against Beautiful Soup 3 will work against Beautiful Soup 4 with one simple change. 
All you should have to do is change the package name from ``BeautifulSoup`` to ``bs4``. So this:: from BeautifulSoup import BeautifulSoup becomes this:: from bs4 import BeautifulSoup * If you get the ``ImportError`` "No module named BeautifulSoup", your problem is that you're trying to run Beautiful Soup 3 code, but you only have Beautiful Soup 4 installed. * If you get the ``ImportError`` "No module named bs4", your problem is that you're trying to run Beautiful Soup 4 code, but you only have Beautiful Soup 3 installed. Although BS4 is mostly backwards-compatible with BS3, most of its methods have been deprecated and given new names for `PEP 8 compliance <http://www.python.org/dev/peps/pep-0008/>`_. There are numerous other renames and changes, and a few of them break backwards compatibility. Here's what you'll need to know to convert your BS3 code and habits to BS4: You need a parser ^^^^^^^^^^^^^^^^^ Beautiful Soup 3 used Python's ``SGMLParser``, a module that was deprecated and removed in Python 3.0. Beautiful Soup 4 uses ``html.parser`` by default, but you can plug in lxml or html5lib and use that instead. See `Installing a parser`_ for a comparison. Since ``html.parser`` is not the same parser as ``SGMLParser``, it will treat invalid markup differently. Usually the "difference" is that ``html.parser`` crashes. In that case, you'll need to install another parser. But sometimes ``html.parser`` just creates a different parse tree than ``SGMLParser`` would. If this happens, you may need to update your BS3 scraping code to deal with the new tree. 
Method names ^^^^^^^^^^^^ * ``renderContents`` -> ``encode_contents`` * ``replaceWith`` -> ``replace_with`` * ``replaceWithChildren`` -> ``unwrap`` * ``findAll`` -> ``find_all`` * ``findAllNext`` -> ``find_all_next`` * ``findAllPrevious`` -> ``find_all_previous`` * ``findNext`` -> ``find_next`` * ``findNextSibling`` -> ``find_next_sibling`` * ``findNextSiblings`` -> ``find_next_siblings`` * ``findParent`` -> ``find_parent`` * ``findParents`` -> ``find_parents`` * ``findPrevious`` -> ``find_previous`` * ``findPreviousSibling`` -> ``find_previous_sibling`` * ``findPreviousSiblings`` -> ``find_previous_siblings`` * ``nextSibling`` -> ``next_sibling`` * ``previousSibling`` -> ``previous_sibling`` Some arguments to the Beautiful Soup constructor were renamed for the same reasons: * ``BeautifulSoup(parseOnlyThese=...)`` -> ``BeautifulSoup(parse_only=...)`` * ``BeautifulSoup(fromEncoding=...)`` -> ``BeautifulSoup(from_encoding=...)`` I renamed one method for compatibility with Python 3: * ``Tag.has_key()`` -> ``Tag.has_attr()`` I renamed one attribute to use more accurate terminology: * ``Tag.isSelfClosing`` -> ``Tag.is_empty_element`` I renamed three attributes to avoid using words that have special meaning to Python. Unlike the others, these changes are *not backwards compatible.* If you used these attributes in BS3, your code will break on BS4 until you change them. 
* ``UnicodeDammit.unicode`` -> ``UnicodeDammit.unicode_markup`` * ``Tag.next`` -> ``Tag.next_element`` * ``Tag.previous`` -> ``Tag.previous_element`` Generators ^^^^^^^^^^ I gave the generators PEP 8-compliant names, and transformed them into properties: * ``childGenerator()`` -> ``children`` * ``nextGenerator()`` -> ``next_elements`` * ``nextSiblingGenerator()`` -> ``next_siblings`` * ``previousGenerator()`` -> ``previous_elements`` * ``previousSiblingGenerator()`` -> ``previous_siblings`` * ``recursiveChildGenerator()`` -> ``descendants`` * ``parentGenerator()`` -> ``parents`` So instead of this:: for parent in tag.parentGenerator(): ... You can write this:: for parent in tag.parents: ... (But the old code will still work.) Some of the generators used to yield ``None`` after they were done, and then stop. That was a bug. Now the generators just stop. There are two new generators, :ref:`.strings and .stripped_strings <string-generators>`. ``.strings`` yields NavigableString objects, and ``.stripped_strings`` yields Python strings that have had whitespace stripped. XML ^^^ There is no longer a ``BeautifulStoneSoup`` class for parsing XML. To parse XML you pass in "xml" as the second argument to the ``BeautifulSoup`` constructor. For the same reason, the ``BeautifulSoup`` constructor no longer recognizes the ``isHTML`` argument. Beautiful Soup's handling of empty-element XML tags has been improved. Previously when you parsed XML you had to explicitly say which tags were considered empty-element tags. The ``selfClosingTags`` argument to the constructor is no longer recognized. Instead, Beautiful Soup considers any empty tag to be an empty-element tag. If you add a child to an empty-element tag, it stops being an empty-element tag. Entities ^^^^^^^^ An incoming HTML or XML entity is always converted into the corresponding Unicode character. Beautiful Soup 3 had a number of overlapping ways of dealing with entities, which have been removed. 
The ``BeautifulSoup`` constructor no longer recognizes the ``smartQuotesTo`` or ``convertEntities`` arguments. (`Unicode, Dammit`_ still has ``smart_quotes_to``, but its default is now to turn smart quotes into Unicode.) If you want to turn those Unicode characters back into HTML entities on output, rather than turning them into UTF-8 characters, you need to use an :ref:`output formatter <output_formatters>`. Miscellaneous ^^^^^^^^^^^^^ :ref:`Tag.string <.string>` now operates recursively. If tag A contains a single tag B and nothing else, then A.string is the same as B.string. (Previously, it was None.) `Multi-valued attributes`_ like ``class`` have lists of strings as their values, not strings. This may affect the way you search by CSS class. If you pass one of the ``find*`` methods both :ref:`text <text>` `and` a tag-specific argument like :ref:`name <name>`, Beautiful Soup will search for tags that match your tag-specific criteria and whose :ref:`Tag.string <.string>` matches your value for :ref:`text <text>`. It will `not` find the strings themselves. Previously, Beautiful Soup ignored the tag-specific arguments and looked for strings. The ``BeautifulSoup`` constructor no longer recognizes the ``markupMassage`` argument. It's now the parser's responsibility to handle markup correctly. The rarely-used alternate parser classes like ``ICantBelieveItsBeautifulSoup`` and ``BeautifulSOAP`` have been removed. It's now the parser's decision how to handle ambiguous markup.

beautifulsoup4-4.1.0/doc/source/conf.py

# -*- coding: utf-8 -*-
#
# Beautiful Soup documentation build configuration file, created by
# sphinx-quickstart on Thu Jan 26 11:22:55 2012.
#
# This file is execfile()d with the current directory set to its containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.

import os
import sys

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#sys.path.insert(0, os.path.abspath('.'))

# -- General configuration -----------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be extensions
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
extensions = []

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# The suffix of source filenames.
source_suffix = '.rst'

# The encoding of source files.
#source_encoding = 'utf-8-sig'

# The master toctree document.
master_doc = 'index'

# General information about the project.
project = u'Beautiful Soup'
copyright = u'2012, Leonard Richardson'

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = '4'
# The full version, including alpha/beta/rc tags.
# Keep this in sync with the package version declared in setup.py.
release = '4.1.0'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#language = None

# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = []

# The reST default role (used for this markup: `text`) to use for all documents.
#default_role = None

# If true, '()' will be appended to :func: etc. cross-reference text.
#add_function_parentheses = True

# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#add_module_names = True

# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#show_authors = False

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'

# A list of ignored prefixes for module index sorting.
#modindex_common_prefix = []


# -- Options for HTML output ---------------------------------------------------

# The theme to use for HTML and HTML Help pages.  See the documentation for
# a list of builtin themes.
html_theme = 'default'

# Theme options are theme-specific and customize the look and feel of a theme
# further.  For a list of options available for each theme, see the
# documentation.
#html_theme_options = {}

# Add any paths that contain custom themes here, relative to this directory.
#html_theme_path = []

# The name for this set of Sphinx documents.  If None, it defaults to
# "<project> v<release> documentation".
#html_title = None

# A shorter title for the navigation bar.  Default is the same as html_title.
#html_short_title = None

# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
#html_logo = None

# The name of an image file (within the static path) to use as favicon of the
# docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
#html_favicon = None

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']

# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
#html_last_updated_fmt = '%b %d, %Y'

# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
#html_use_smartypants = True

# Custom sidebar templates, maps document names to template names.
#html_sidebars = {}

# Additional templates that should be rendered to pages, maps page names to
# template names.
#html_additional_pages = {}

# If false, no module index is generated.
#html_domain_indices = True

# If false, no index is generated.
#html_use_index = True

# If true, the index is split into individual pages for each letter.
#html_split_index = False

# If true, links to the reST sources are added to the pages.
#html_show_sourcelink = True

# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
#html_show_sphinx = True

# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
#html_show_copyright = True

# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it.  The value of this option must be the
# base URL from which the finished HTML is served.
#html_use_opensearch = ''

# This is the file name suffix for HTML files (e.g. ".xhtml").
#html_file_suffix = None

# Output file base name for HTML help builder.
htmlhelp_basename = 'BeautifulSoupdoc'


# -- Options for LaTeX output --------------------------------------------------

# The paper size ('letter' or 'a4').
#latex_paper_size = 'letter'

# The font size ('10pt', '11pt' or '12pt').
#latex_font_size = '10pt'

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title, author, documentclass [howto/manual]).
latex_documents = [
  ('index', 'BeautifulSoup.tex', u'Beautiful Soup Documentation',
   u'Leonard Richardson', 'manual'),
]

# The name of an image file (relative to this directory) to place at the top of
# the title page.
#latex_logo = None

# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
#latex_use_parts = False

# If true, show page references after internal links.
#latex_show_pagerefs = False

# If true, show URL addresses after external links.
#latex_show_urls = False

# Additional stuff for the LaTeX preamble.
#latex_preamble = ''

# Documents to append as an appendix to all manuals.
#latex_appendices = []

# If false, no module index is generated.
#latex_domain_indices = True


# -- Options for manual page output --------------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
    ('index', 'beautifulsoup', u'Beautiful Soup Documentation',
     [u'Leonard Richardson'], 1)
]


# -- Options for Epub output ---------------------------------------------------

# Bibliographic Dublin Core info.
epub_title = u'Beautiful Soup'
epub_author = u'Leonard Richardson'
epub_publisher = u'Leonard Richardson'
epub_copyright = u'2012, Leonard Richardson'

# The language of the text. It defaults to the language option
# or en if the language is not set.
#epub_language = ''

# The scheme of the identifier. Typical schemes are ISBN or URL.
#epub_scheme = ''

# The unique identifier of the text. This can be a ISBN number
# or the project homepage.
#epub_identifier = ''

# A unique identification for the text.
#epub_uid = ''

# HTML files that should be inserted before the pages created by sphinx.
# The format is a list of tuples containing the path and title.
#epub_pre_files = []

# HTML files that should be inserted after the pages created by sphinx.
# The format is a list of tuples containing the path and title.
#epub_post_files = []

# A list of files that should not be packed into the epub file.
#epub_exclude_files = []

# The depth of the table of contents in toc.ncx.
#epub_tocdepth = 3

# Allow duplicate toc entries.
#epub_tocdup = True

beautifulsoup4-4.1.0/setup.py

"""Distutils build script for the beautifulsoup4 4.1.0 distribution."""
from distutils.core import setup

# Use the 2to3-aware build_py command when distutils offers it; otherwise
# (2.x) fall back to the ordinary build_py command.
try:
    from distutils.command.build_py import build_py_2to3 as build_py
except ImportError:
    from distutils.command.build_py import build_py

# Trove classifiers, in the order they are published.
TROVE_CLASSIFIERS = [
    "Development Status :: 4 - Beta",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python",
    'Programming Language :: Python :: 3',
    "Topic :: Text Processing :: Markup :: HTML",
    "Topic :: Text Processing :: Markup :: XML",
    "Topic :: Text Processing :: Markup :: SGML",
    "Topic :: Software Development :: Libraries :: Python Modules",
]

# Adjacent literals are concatenated into the single-line description.
LONG_DESCRIPTION = (
    "Beautiful Soup sits atop an HTML or XML parser, providing Pythonic "
    "idioms for iterating, searching, and modifying the parse tree."
)

PACKAGE_METADATA = {
    'name': "beautifulsoup4",
    'version': "4.1.0",
    'author': "Leonard Richardson",
    # NOTE(review): this value looks like an e-mail-obfuscation placeholder
    # rather than a real address -- confirm and restore before publishing.
    'author_email': '[email protected]',
    'url': "http://www.crummy.com/software/BeautifulSoup/bs4/",
    'download_url': "http://www.crummy.com/software/BeautifulSoup/bs4/download/",
    'long_description': LONG_DESCRIPTION,
    'license': "MIT",
    'packages': ['bs4', 'bs4.builder', 'bs4.tests'],
    'classifiers': TROVE_CLASSIFIERS,
}

# Register whichever build_py command was importable above.
setup(cmdclass={'build_py': build_py}, **PACKAGE_METADATA)

beautifulsoup4-4.1.0/NEWS.txt

= 4.1.0 (20120529) = * Added experimental support for fixing Windows-1252 characters embedded in UTF-8 documents. (UnicodeDammit.detwingle()) * Fixed the handling of &quot; with the built-in parser. [bug=993871] * Comments, processing instructions, document type declarations, and markup declarations are now treated as preformatted strings, the way CData blocks are. [bug=1001025] * Fixed a bug with the lxml treebuilder that prevented the user from adding attributes to a tag that didn't originally have attributes. [bug=1002378] Thanks to Oliver Beattie for the patch. * Fixed some edge-case bugs having to do with inserting an element into a tag it's already inside, and replacing one of a tag's children with another. [bug=997529] * Added the ability to search for attribute values specified in UTF-8. [bug=1003974] This caused a major refactoring of the search code. All the tests pass, but it's possible that some searches will behave differently. = 4.0.5 (20120427) = * Added a new method, wrap(), which wraps an element in a tag. * Renamed replace_with_children() to unwrap(), which is easier to understand and also the jQuery name of the function. * Made encoding substitution in <meta> tags completely transparent (no more %SOUP-ENCODING%). * Fixed a bug in decoding data that contained a byte-order mark, such as data encoded in UTF-16LE. [bug=988980] * Fixed a bug that made the HTMLParser treebuilder generate XML definitions ending with two question marks instead of one. [bug=984258] * Upon document generation, CData objects are no longer run through the formatter. [bug=988905] * The test suite now passes when lxml is not installed, whether or not html5lib is installed. [bug=987004] * Print a warning on HTMLParseErrors to let people know they should install a better parser library. = 4.0.4 (20120416) = * Fixed a bug that sometimes created disconnected trees. * Fixed a bug with the string setter that moved a string around the tree instead of copying it. 
[bug=983050] * Attribute values are now run through the provided output formatter. Previously they were always run through the 'minimal' formatter. In the future I may make it possible to specify different formatters for attribute values and strings, but for now, consistent behavior is better than inconsistent behavior. [bug=980237] * Added the missing renderContents method from Beautiful Soup 3. Also added an encode_contents() method to go along with decode_contents(). * Give a more useful error when the user tries to run the Python 2 version of BS under Python 3. * UnicodeDammit can now convert Microsoft smart quotes to ASCII with UnicodeDammit(markup, smart_quotes_to="ascii"). = 4.0.3 (20120403) = * Fixed a typo that caused some versions of Python 3 to convert the Beautiful Soup codebase incorrectly. * Got rid of the 4.0.2 workaround for HTML documents--it was unnecessary and the workaround was triggering a (possibly different, but related) bug in lxml. [bug=972466] = 4.0.2 (20120326) = * Worked around a possible bug in lxml that prevents non-tiny XML documents from being parsed. [bug=963880, bug=963936] * Fixed a bug where specifying `text` while also searching for a tag only worked if `text` wanted an exact string match. [bug=955942] = 4.0.1 (20120314) = * This is the first official release of Beautiful Soup 4. There is no 4.0.0 release, to eliminate any possibility that packaging software might treat "4.0.0" as being an earlier version than "4.0.0b10". * Brought BS up to date with the latest release of soupselect, adding CSS selector support for direct descendant matches and multiple CSS class matches. = 4.0.0b10 (20120302) = * Added support for simple CSS selectors, taken from the soupselect project. * Fixed a crash when using html5lib. [bug=943246] * In HTML5-style <meta charset="foo"> tags, the value of the "charset" attribute is now replaced with the appropriate encoding on output. 
[bug=942714] * Fixed a bug that caused calling a tag to sometimes call find_all() with the wrong arguments. [bug=944426] * For backwards compatibility, brought back the BeautifulStoneSoup class as a deprecated wrapper around BeautifulSoup. = 4.0.0b9 (20120228) = * Fixed the string representation of DOCTYPEs that have both a public ID and a system ID. * Fixed the generated XML declaration. * Renamed Tag.nsprefix to Tag.prefix, for consistency with NamespacedAttribute. * Fixed a test failure that occurred on Python 3.x when chardet was installed. * Made prettify() return Unicode by default, so it will look nice on Python 3 when passed into print(). = 4.0.0b8 (20120224) = * All tree builders now preserve namespace information in the documents they parse. If you use the html5lib parser or lxml's XML parser, you can access the namespace URL for a tag as tag.namespace. However, there is no special support for namespace-oriented searching or tree manipulation. When you search the tree, you need to use namespace prefixes exactly as they're used in the original document. * The string representation of a DOCTYPE always ends in a newline. * Issue a warning if the user tries to use a SoupStrainer in conjunction with the html5lib tree builder, which doesn't support them. = 4.0.0b7 (20120223) = * Upon decoding to string, any characters that can't be represented in your chosen encoding will be converted into numeric XML entity references. * Issue a warning if characters were replaced with REPLACEMENT CHARACTER during Unicode conversion. * Restored compatibility with Python 2.6. * The install process no longer installs docs or auxiliary text files. * It's now possible to deepcopy a BeautifulSoup object created with Python's built-in HTML parser. * About 100 unit tests that "test" the behavior of various parsers on invalid markup have been removed.
Legitimate changes to those parsers caused these tests to fail, indicating that perhaps Beautiful Soup should not test the behavior of foreign libraries. The problematic unit tests have been reformulated as informational comparisons generated by the script scripts/demonstrate_parser_differences.py. This makes Beautiful Soup compatible with html5lib version 0.95 and future versions of HTMLParser. = 4.0.0b6 (20120216) = * Multi-valued attributes like "class" always have a list of values, even if there's only one value in the list. * Added a number of multi-valued attributes defined in HTML5. * Stopped generating a space before the slash that closes an empty-element tag. This may come back if I add a special XHTML mode (http://www.w3.org/TR/xhtml1/#C_2), but right now it's pretty useless. * Passing text along with tag-specific arguments to a find* method: find("a", text="Click here") will find tags that contain the given text as their .string. Previously, the tag-specific arguments were ignored and only strings were searched. * Fixed a bug that caused the html5lib tree builder to build a partially disconnected tree. Generally cleaned up the html5lib tree builder. * If you restrict a multi-valued attribute like "class" to a string that contains spaces, Beautiful Soup will only consider it a match if the values correspond to that specific string. = 4.0.0b5 (20120209) = * Rationalized Beautiful Soup's treatment of CSS class. A tag belonging to multiple CSS classes is treated as having a list of values for the 'class' attribute. Searching for a CSS class will match *any* of the CSS classes. This actually affects all attributes that the HTML standard defines as taking multiple values (class, rel, rev, archive, accept-charset, and headers), but 'class' is by far the most common. [bug=41034] * If you pass anything other than a dictionary as the second argument to one of the find* methods, it'll assume you want to use that object to search against a tag's CSS classes. 
Previously this only worked if you passed in a string. * Fixed a bug that caused a crash when you passed a dictionary as an attribute value (possibly because you mistyped "attrs"). [bug=842419] * Unicode, Dammit now detects the encoding in HTML 5-style <meta> tags like <meta charset="utf-8" />. [bug=837268] * If Unicode, Dammit can't figure out a consistent encoding for a page, it will try each of its guesses again, with errors="replace" instead of errors="strict". This may mean that some data gets replaced with REPLACEMENT CHARACTER, but at least most of it will get turned into Unicode. [bug=754903] * Patched over a bug in html5lib (?) that was crashing Beautiful Soup on certain kinds of markup. [bug=838800] * Fixed a bug that wrecked the tree if you replaced an element with an empty string. [bug=728697] * Improved Unicode, Dammit's behavior when you give it Unicode to begin with. = 4.0.0b4 (20120208) = * Added BeautifulSoup.new_string() to go along with BeautifulSoup.new_tag() * BeautifulSoup.new_tag() will follow the rules of whatever tree-builder was used to create the original BeautifulSoup object. A new <p> tag will look like "<p />" if the soup object was created to parse XML, but it will look like "<p></p>" if the soup object was created to parse HTML. * We pass in strict=False to html.parser on Python 3, greatly improving html.parser's ability to handle bad HTML. * We also monkeypatch a serious bug in html.parser that made strict=False disastrous on Python 3.2.2. * Replaced the "substitute_html_entities" argument with the more general "formatter" argument. * Bare ampersands and angle brackets are always converted to XML entities unless the user prevents it. * Added PageElement.insert_before() and PageElement.insert_after(), which let you put an element into the parse tree with respect to some other element. * Raise an exception when the user tries to do something nonsensical like insert a tag into itself. 
= 4.0.0b3 (20120203) = Beautiful Soup 4 is a nearly-complete rewrite that removes Beautiful Soup's custom HTML parser in favor of a system that lets you write a little glue code and plug in any HTML or XML parser you want. Beautiful Soup 4.0 comes with glue code for four parsers: * Python's standard HTMLParser (html.parser in Python 3) * lxml's HTML and XML parsers * html5lib's HTML parser HTMLParser is the default, but I recommend you install lxml if you can. For complete documentation, see the Sphinx documentation in bs4/doc/source/. What follows is a summary of the changes from Beautiful Soup 3. === The module name has changed === Previously you imported the BeautifulSoup class from a module also called BeautifulSoup. To save keystrokes and make it clear which version of the API is in use, the module is now called 'bs4': >>> from bs4 import BeautifulSoup === It works with Python 3 === Beautiful Soup 3.1.0 worked with Python 3, but the parser it used was so bad that it barely worked at all. Beautiful Soup 4 works with Python 3, and since its parser is pluggable, you don't sacrifice quality. Special thanks to Thomas Kluyver and Ezio Melotti for getting Python 3 support to the finish line. Ezio Melotti is also to thank for greatly improving the HTML parser that comes with Python 3.2. === CDATA sections are normal text, if they're understood at all. === Currently, the lxml and html5lib HTML parsers ignore CDATA sections in markup: <p><![CDATA[foo]]></p> => <p></p> A future version of html5lib will turn CDATA sections into text nodes, but only within tags like <svg> and <math>: <svg><![CDATA[foo]]></svg> => <svg>foo</svg> The default XML parser (which uses lxml behind the scenes) turns CDATA sections into ordinary text elements: <p><![CDATA[foo]]></p> => <p>foo</p> In theory it's possible to preserve the CDATA sections when using the XML parser, but I don't see how to get it to work in practice.
=== Miscellaneous other stuff === If the BeautifulSoup instance has .is_xml set to True, an appropriate XML declaration will be emitted when the tree is transformed into a string: <?xml version="1.0" encoding="utf-8"?> <markup> ... </markup> The ['lxml', 'xml'] tree builder sets .is_xml to True; the other tree builders set it to False. If you want to parse XHTML with an HTML parser, you can set it manually. = 3.2.0 = The 3.1 series wasn't very useful, so I renamed the 3.0 series to 3.2 to make it obvious which one you should use. = 3.1.0 = A hybrid version that supports 2.4 and can be automatically converted to run under Python 3.0. There are three backwards-incompatible changes you should be aware of, but no new features or deliberate behavior changes. 1. str() may no longer do what you want. This is because the meaning of str() inverts between Python 2 and 3; in Python 2 it gives you a byte string, in Python 3 it gives you a Unicode string. The effect of this is that you can't pass an encoding to .__str__ anymore. Use encode() to get a string and decode() to get Unicode, and you'll be ready (well, readier) for Python 3. 2. Beautiful Soup is now based on HTMLParser rather than SGMLParser, which is gone in Python 3. There's some bad HTML that SGMLParser handled but HTMLParser doesn't, usually to do with attribute values that aren't closed or have brackets inside them: <a href="foo</a>, </a><a href="bar">baz</a> <a b="<a>">', '<a b="&lt;a&gt;"></a><a>"></a> A later version of Beautiful Soup will allow you to plug in different parsers to make tradeoffs between speed and the ability to handle bad HTML. 3. In Python 3 (but not Python 2), HTMLParser converts entities within attributes to the corresponding Unicode characters. In Python 2 it's possible to parse this string and leave the &eacute; intact. <a href="http://crummy.com?sacr&eacute;&bleu"> In Python 3, the &eacute; is always converted to \xe9 during parsing.
= 3.0.7a = Added an import that makes BS work in Python 2.3. = 3.0.7 = Fixed a UnicodeDecodeError when unpickling documents that contain non-ASCII characters. Fixed a TypeError that occurred in some circumstances when a tag contained no text. Jump through hoops to avoid the use of chardet, which can be extremely slow in some circumstances. UTF-8 documents should never trigger the use of chardet. Whitespace is preserved inside <pre> and <textarea> tags that contain nothing but whitespace. Beautiful Soup can now parse a doctype that's scoped to an XML namespace. = 3.0.6 = Got rid of a very old debug line that prevented chardet from working. Added a Tag.decompose() method that completely disconnects a tree or a subset of a tree, breaking it up into bite-sized pieces that are easy for the garbage collector to collect. Tag.extract() now returns the tag that was extracted. Tag.findNext() now does something with the keyword arguments you pass it instead of dropping them on the floor. Fixed a Unicode conversion bug. Fixed a bug that garbled some <meta> tags when rewriting them. = 3.0.5 = Soup objects can now be pickled, and copied with copy.deepcopy. Tag.append now works properly on existing BS objects. (It wasn't originally intended for outside use, but it can be now.) (Giles Radford) Passing in a nonexistent encoding will no longer crash the parser on Python 2.4 (John Nagle). Fixed an underlying bug in SGMLParser that thinks ASCII has 255 characters instead of 127 (John Nagle). Entities are converted more consistently to Unicode characters. Entity references in attribute values are now converted to Unicode characters when appropriate. Numeric entities are always converted, because SGMLParser always converts them outside of attribute values. ALL_ENTITIES happens to just be the XHTML entities, so I renamed it to XHTML_ENTITIES. The regular expression for bare ampersands was too loose. In some cases ampersands were not being escaped. (Sam Ruby?)
Non-breaking spaces and other special Unicode space characters are no longer folded to ASCII spaces. (Robert Leftwich) Information inside a TEXTAREA tag is now parsed literally, not as HTML tags. TEXTAREA now works exactly the same way as SCRIPT. (Zephyr Fang) = 3.0.4 = Fixed a bug that crashed Unicode conversion in some cases. Fixed a bug that prevented UnicodeDammit from being used as a general-purpose data scrubber. Fixed some unit test failures when running against Python 2.5. When considering whether to convert smart quotes, UnicodeDammit now looks at the original encoding in a case-insensitive way. = 3.0.3 (20060606) = Beautiful Soup is now usable as a way to clean up invalid XML/HTML (be sure to pass in an appropriate value for convertEntities, or XML/HTML entities might stick around that aren't valid in HTML/XML). The result may not validate, but it should be good enough to not choke a real-world XML parser. Specifically, the output of a properly constructed soup object should always be valid as part of an XML document, but parts may be missing if they were missing in the original. As always, if the input is valid XML, the output will also be valid. = 3.0.2 (20060602) = Previously, Beautiful Soup correctly handled attribute values that contained embedded quotes (sometimes by escaping), but not other kinds of XML character. Now, it correctly handles or escapes all special XML characters in attribute values. I aliased methods to the 2.x names (fetch, find, findText, etc.) for backwards compatibility purposes. Those names are deprecated and if I ever do a 4.0 I will remove them. I will, I tell you! Fixed a bug where the findAll method wasn't passing along any keyword arguments. When run from the command line, Beautiful Soup now acts as an HTML pretty-printer, not an XML pretty-printer. = 3.0.1 (20060530) = Reintroduced the "fetch by CSS class" shortcut. I thought keyword arguments would replace it, but they don't. 
You can't call soup('a', class='foo') because class is a Python keyword. If Beautiful Soup encounters a meta tag that declares the encoding, but a SoupStrainer tells it not to parse that tag, Beautiful Soup will no longer try to rewrite the meta tag to mention the new encoding. Basically, this makes SoupStrainers work in real-world applications instead of crashing the parser. = 3.0.0 "Who would not give all else for two p" (20060528) = This release is not backward-compatible with previous releases. If you've got code written with a previous version of the library, go ahead and keep using it, unless one of the features mentioned here really makes your life easier. Since the library is self-contained, you can include an old copy of the library in your old applications, and use the new version for everything else. The documentation has been rewritten and greatly expanded with many more examples. Beautiful Soup autodetects the encoding of a document (or uses the one you specify), and converts it from its native encoding to Unicode. Internally, it only deals with Unicode strings. When you print out the document, it converts to UTF-8 (or another encoding you specify). [Doc reference] It's now easy to make large-scale changes to the parse tree without screwing up the navigation members. The methods are extract, replaceWith, and insert. [Doc reference. See also Improving Memory Usage with extract] Passing True in as an attribute value gives you tags that have any value for that attribute. You don't have to create a regular expression. Passing None for an attribute value gives you tags that don't have that attribute at all. Tag objects now know whether or not they're self-closing. This avoids the problem where Beautiful Soup thought that tags like <BR /> were self-closing even in XML documents. You can customize the self-closing tags for a parser object by passing them in as a list of selfClosingTags: you don't have to subclass anymore. 
There's a new built-in parser, MinimalSoup, which has most of BeautifulSoup's HTML-specific rules, but no tag nesting rules. [Doc reference] You can use a SoupStrainer to tell Beautiful Soup to parse only part of a document. This saves time and memory, often making Beautiful Soup about as fast as a custom-built SGMLParser subclass. [Doc reference, SoupStrainer reference] You can (usually) use keyword arguments instead of passing a dictionary of attributes to a search method. That is, you can replace soup(args={"id" : "5"}) with soup(id="5"). You can still use args if (for instance) you need to find an attribute whose name clashes with the name of an argument to findAll. [Doc reference: **kwargs attrs] The method names have changed to the better method names used in Rubyful Soup. Instead of find methods and fetch methods, there are only find methods. Instead of a scheme where you can't remember which method finds one element and which one finds them all, we have find and findAll. In general, if the method name mentions All or a plural noun (e.g. findNextSiblings), then it finds many elements. Otherwise, it only finds one element. [Doc reference] Some of the argument names have been renamed for clarity. For instance avoidParserProblems is now parserMassage. Beautiful Soup no longer implements a feed method. You need to pass a string or a filehandle into the soup constructor, not with feed after the soup has been created. There is still a feed method, but it's the feed method implemented by SGMLParser and calling it will bypass Beautiful Soup and cause problems. The NavigableText class has been renamed to NavigableString. There is no NavigableUnicodeString anymore, because every string inside a Beautiful Soup parse tree is a Unicode string. findText and fetchText are gone. Just pass a text argument into find or findAll. Null was more trouble than it was worth, so I got rid of it. Anything that used to return Null now returns None.
Special XML constructs like comments and CDATA now have their own NavigableString subclasses, instead of being treated as oddly-formed data. If you parse a document that contains CDATA and write it back out, the CDATA will still be there. When you're parsing a document, you can get Beautiful Soup to convert XML or HTML entities into the corresponding Unicode characters. [Doc reference] = 2.1.1 (20050918) = Fixed a serious performance bug in BeautifulStoneSoup which was causing parsing to be incredibly slow. Corrected several entities that were previously being incorrectly translated from Microsoft smart-quote-like characters. Fixed a bug that was breaking text fetch. Fixed a bug that crashed the parser when text chunks that look like HTML tag names showed up within a SCRIPT tag. THEAD, TBODY, and TFOOT tags are now nestable within TABLE tags. Nested tables should parse more sensibly now. BASE is now considered a self-closing tag. = 2.1.0 "Game, or any other dish?" (20050504) = Added a wide variety of new search methods which, given a starting point inside the tree, follow a particular navigation member (like nextSibling) over and over again, looking for Tag and NavigableText objects that match certain criteria. The new methods are findNext, fetchNext, findPrevious, fetchPrevious, findNextSibling, fetchNextSiblings, findPreviousSibling, fetchPreviousSiblings, findParent, and fetchParents. All of these use the same basic code used by first and fetch, so you can pass your weird ways of matching things into these methods. The fetch method and its derivatives now accept a limit argument. You can now pass keyword arguments when calling a Tag object as though it were a method. Fixed a bug that caused all hand-created tags to share a single set of attributes. = 2.0.3 (20050501) = Fixed Python 2.2 support for iterators. Fixed a bug that gave the wrong representation to tags within quote tags like <script>. 
Took some code from Mark Pilgrim that treats CDATA declarations as data instead of ignoring them. Beautiful Soup's setup.py will now do an install even if the unit tests fail. It won't build a source distribution if the unit tests fail, so I can't release a new version unless they pass. = 2.0.2 (20050416) = Added the unit tests in a separate module, and packaged it with distutils. Fixed a bug that sometimes caused renderContents() to return a Unicode string even if there was no Unicode in the original string. Added the done() method, which closes all of the parser's open tags. It gets called automatically when you pass in some text to the constructor of a parser class; otherwise you must call it yourself. Reinstated some backwards compatibility with 1.x versions: referencing the string member of a NavigableText object returns the NavigableText object instead of throwing an error. = 2.0.1 (20050412) = Fixed a bug that caused bad results when you tried to reference a tag name shorter than 3 characters as a member of a Tag, eg. tag.table.td. Made sure all Tags have the 'hidden' attribute so that an attempt to access tag.hidden doesn't spawn an attempt to find a tag named 'hidden'. Fixed a bug in the comparison operator. = 2.0.0 "Who cares for fish?" (20050410) Beautiful Soup version 1 was very useful but also pretty stupid. I originally wrote it without noticing any of the problems inherent in trying to build a parse tree out of ambiguous HTML tags. This version solves all of those problems to my satisfaction. It also adds many new clever things to make up for the removal of the stupid things. == Parsing == The parser logic has been greatly improved, and the BeautifulSoup class should much more reliably yield a parse tree that looks like what the page author intended. For a particular class of odd edge cases that now causes problems, there is a new class, ICantBelieveItsBeautifulSoup. 
By default, Beautiful Soup now performs some cleanup operations on text before parsing it. This is to avoid common problems with bad definitions and self-closing tags that crash SGMLParser. You can provide your own set of cleanup operations, or turn it off altogether. The cleanup operations include fixing self-closing tags that don't close, and replacing Microsoft smart quotes and similar characters with their HTML entity equivalents. You can now get a pretty-print version of parsed HTML to get a visual picture of how Beautiful Soup parses it, with the Tag.prettify() method. == Strings and Unicode == There are separate NavigableText subclasses for ASCII and Unicode strings. These classes directly subclass the corresponding base data types. This means you can treat NavigableText objects as strings instead of having to call methods on them to get the strings. str() on a Tag always returns a string, and unicode() always returns Unicode. Previously it was inconsistent. == Tree traversal == In a first() or fetch() call, the tag name or the desired value of an attribute can now be any of the following: * A string (matches that specific tag or that specific attribute value) * A list of strings (matches any tag or attribute value in the list) * A compiled regular expression object (matches any tag or attribute value that matches the regular expression) * A callable object that takes the Tag object or attribute value as a string. It returns None/false/empty string if the given string doesn't match, and any other value if it does. This is much easier to use than SQL-style wildcards (see, regular expressions are good for something). Because of this, I took out SQL-style wildcards. I'll put them back if someone complains, but their removal simplifies the code a lot. You can use fetch() and first() to search for text in the parse tree, not just tags. There are new alias methods fetchText() and firstText() designed for this purpose. 
As with searching for tags, you can pass in a string, a regular expression object, or a method to match your text. If you pass in something besides a map to the attrs argument of fetch() or first(), Beautiful Soup will assume you want to match that thing against the "class" attribute. When you're scraping well-structured HTML, this makes your code a lot cleaner. 1.x and 2.x both let you call a Tag object as a shorthand for fetch(). For instance, foo("bar") is a shorthand for foo.fetch("bar"). In 2.x, you can also access a specially-named member of a Tag object as a shorthand for first(). For instance, foo.barTag is a shorthand for foo.first("bar"). By chaining these shortcuts you traverse a tree in very little code: for header in soup.bodyTag.pTag.tableTag('th'): If an element relationship (like parent or next) doesn't apply to a tag, it'll now show up Null instead of None. first() will also return Null if you ask it for a nonexistent tag. Null is an object that's just like None, except you can do whatever you want to it and it'll give you Null instead of throwing an error. This lets you do tree traversals like soup.htmlTag.headTag.titleTag without having to worry if the intermediate stages are actually there. Previously, if there was no 'head' tag in the document, headTag in that instance would have been None, and accessing its 'titleTag' member would have thrown an AttributeError. Now, you can get what you want when it exists, and get Null when it doesn't, without having to do a lot of conditionals checking to see if every stage is None. There are two new relations between page elements: previousSibling and nextSibling. They reference the previous and next element at the same level of the parse tree. For instance, if you have HTML like this: <p><ul><li>Foo<br /><li>Bar</ul> The first 'li' tag has a previousSibling of Null and its nextSibling is the second 'li' tag. The second 'li' tag has a nextSibling of Null and its previousSibling is the first 'li' tag. 
The previousSibling of the 'ul' tag is the first 'p' tag. The nextSibling of 'Foo' is the 'br' tag. I took out the ability to use fetch() to find tags that have a specific list of contents. See, I can't even explain it well. It was really difficult to use, I never used it, and I don't think anyone else ever used it. To the extent anyone did, they can probably use fetchText() instead. If it turns out someone needs it I'll think of another solution. == Tree manipulation == You can add new attributes to a tag, and delete attributes from a tag. In 1.x you could only change a tag's existing attributes. == Porting Considerations == There are three changes in 2.0 that break old code: In the post-1.2 release you could pass in a function into fetch(). The function took a string, the tag name. In 2.0, the function takes the actual Tag object. It's no longer possible to pass in SQL-style wildcards to fetch(). Use a regular expression instead. The different parsing algorithm means the parse tree may not be shaped like you expect. This will only actually affect you if your code uses one of the affected parts. I haven't run into this problem yet while porting my code. = Between 1.2 and 2.0 = This is the release to get if you want Python 1.5 compatibility. The desired value of an attribute can now be any of the following: * A string * A string with SQL-style wildcards * A compiled RE object * A callable that returns None/false/empty string if the given value doesn't match, and any other value otherwise. This is much easier to use than SQL-style wildcards (see, regular expressions are good for something). Because of this, I no longer recommend you use SQL-style wildcards. They may go away in a future release to clean up the code. Made Beautiful Soup handle processing instructions as text instead of ignoring them. Applied patch from Richie Hindle (richie at entrian dot com) that makes tag.string a shorthand for tag.contents[0].string when the tag has only one string-owning child.
Added still more nestable tags. The nestable tags thing won't work in a lot of cases and needs to be rethought. Fixed an edge case where searching for "%foo" would match any string shorter than "foo". = 1.2 "Who for such dainties would not stoop?" (20040708) = Applied patch from Ben Last (ben at benlast dot com) that made Tag.renderContents() correctly handle Unicode. Made BeautifulStoneSoup even dumber by making it not implicitly close a tag when another tag of the same type is encountered; only when an actual closing tag is encountered. This change courtesy of Fuzzy (mike at pcblokes dot com). BeautifulSoup still works as before. = 1.1 "Swimming in a hot tureen" = Added more 'nestable' tags. Changed popping semantics so that when a nestable tag is encountered, tags are popped up to the previously encountered nestable tag (of whatever kind). I will revert this if enough people complain, but it should make more people's lives easier than harder. This enhancement was suggested by Anthony Baxter (anthony at interlink dot com dot au). = 1.0 "So rich and green" (20040420) = Initial release.

beautifulsoup4-4.1.0/AUTHORS.txt

Behold, mortal, the origins of Beautiful Soup... ================================================ Leonard Richardson is the primary programmer. Aaron DeVore is awesome. Mark Pilgrim provided the encoding detection code that forms the base of UnicodeDammit. Thomas Kluyver and Ezio Melotti finished the work of getting Beautiful Soup 4 working under Python 3. Simon Willison wrote soupselect, which was used to make Beautiful Soup support CSS selectors. Sam Ruby helped with a lot of edge cases. Jonathan Ellis was awarded the prestigious Beau Potage D'Or for his work in solving the nestable tags conundrum. An incomplete list of people who have contributed patches to Beautiful Soup: Istvan Albert, Andrew Lin, Anthony Baxter, Andrew Boyko, Tony Chang, Zephyr Fang, Fuzzy, Roman Gaufman, Yoni Gilad, Richie Hindle, Peteris Krumins, Kent Johnson, Ben Last, Robert Leftwich, Staffan Malmgren, Ksenia Marasanova, JP Moins, Adam Monsen, John Nagle, "Jon", Ed Oskiewicz, Greg Phillips, Giles Radford, Arthur Rudolph, Marko Samastur, Jouni Seppänen, Alexander Schmolck, Andy Theyers, Glyn Webster, Paul Wright, Danny Yoo An incomplete list of people who made suggestions or found bugs or found ways to break Beautiful Soup: Hanno Böck, Matteo Bertini, Chris Curvey, Simon Cusack, Bruce Eckel, Matt Ernst, Michael Foord, Tom Harris, Bill de hOra, Donald Howes, Matt Patterson, Scott Roberts, Steve Strassmann, Mike Williams, warchild at redho dot com, Sami Kuisma, Carlos Rocha, Bob Hutchison, Joren Mc, Michal Migurski, John Kleven, Tim Heaney, Tripp Lilley, Ed Summers, Dennis Sutch, Chris Smith, Aaron Sweep^W Swartz, Stuart Turner, Greg Edwards, Kevin J Kalupson, Nikos Kouremenos, Artur de Sousa Rocha, Yichun Wei, Per Vognsen

beautifulsoup4-4.1.0/PKG-INFO

Metadata-Version: 1.1 Name: beautifulsoup4 Version: 4.1.0 Summary: UNKNOWN Home-page: http://www.crummy.com/software/BeautifulSoup/bs4/ Author: Leonard Richardson Author-email: [email protected] License: MIT Download-URL: http://www.crummy.com/software/BeautifulSoup/bs4/download/ Description: Beautiful Soup sits atop an HTML or XML parser, providing Pythonic idioms for iterating, searching, and modifying the parse tree. Platform: UNKNOWN Classifier: Development Status :: 4 - Beta Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: MIT License Classifier: Programming Language :: Python Classifier: Programming Language :: Python :: 3 Classifier: Topic :: Text Processing :: Markup :: HTML Classifier: Topic :: Text Processing :: Markup :: XML Classifier: Topic :: Text Processing :: Markup :: SGML Classifier: Topic :: Software Development :: Libraries :: Python Modules

beautifulsoup4-4.1.0/COPYING.txt

Beautiful Soup is made available under the MIT license: Copyright (c) 2004-2012 Leonard Richardson Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE, DAMMIT. Beautiful Soup incorporates code from the html5lib library, which is also made available under the MIT license.

beautifulsoup4-4.1.0/TODO.txt

Additions --------- More of the jQuery API: nextUntil? Optimizations ------------- The html5lib tree builder doesn't use the standard tree-building API, which worries me and has resulted in a number of bugs. markup_attr_map can be optimized since it's always a map now. Upon encountering UTF-16LE data or some other uncommon serialization of Unicode, UnicodeDammit will convert the data to Unicode, then encode it as UTF-8. This is wasteful because it will just get decoded back to Unicode. CDATA ----- The elementtree XMLParser has a strip_cdata argument that, when set to False, should allow Beautiful Soup to preserve CDATA sections instead of treating them as text. Except it doesn't. (This argument is also present for HTMLParser, and also does nothing there.) Currently, html5lib converts CDATA sections into comments. An as-yet-unreleased version of html5lib changes the parser's handling of CDATA sections to allow CDATA sections in tags like <svg> and <math>. The HTML5TreeBuilder will need to be updated to create CData objects instead of Comment objects in this situation.

2Improvement.csv

GeneticAlgorithm.py

"""Genetic-algorithm search for trading rules over scraped stock prices.

A rule (``Chromosome``) is a pair of value windows plus a buy/short flag.
``TrainingData`` scrapes price rows, scores every rule against them, and
breeds a new population by uniform crossover and mutation.
"""
import urllib.request as urllib2
import csv
import itertools
import numpy
import random
import operator

try:
    from bs4 import BeautifulSoup as bs
except ImportError:
    # bs4 is only needed by TrainingData.generateData(); everything else in
    # this module works on data that is already loaded.
    bs = None

random.seed(a=None)

PopulationSize = 200        # chromosomes created by populationInit()
DataSize = 0                # set by generateData() to the number of samples
NumberOfGenerations = 4     # not referenced anywhere in this file
MutationRate = 5            # passed to uniformCross() by the script entry
MutationChange = 2          # not referenced anywhere in this file
Stock_name = 'tsla'         # ticker inserted into the history URL
NumReturn = 5               # chromosomes printed per category
NumberOfDay = 365           # maximum number of scraped rows kept
File_path = '2Improvement.csv'  # only used by the disabled CSV export


class Chromosome():
    """One candidate trading rule.

    ``prev_min``/``prev_max`` bound the ``dayChange`` sample and
    ``min``/``max`` bound the ``nextDayChange`` sample (both bounds are
    exclusive).  ``buy`` is 1 to add the sample's profit to the score and 0
    to subtract it.  ``score`` is written by
    ``TrainingData.fitnessFunction``.
    """

    def __init__(self, min=None, max=None, prev_min=None, prev_max=None,
                 buy=None, score=None):
        self.min = min
        self.max = max
        self.prev_min = prev_min
        self.prev_max = prev_max
        self.buy = buy
        self.score = score

    def mutate(self):
        """Randomly replace at most one gene, then re-order the bounds.

        A selector in 0..5 is drawn: 0 re-rolls ``buy``, 1-4 overwrite one
        bound with a normal(0, 0.15) sample, and 5 changes nothing.
        """
        mu, sigma = 0, 0.15
        draw = numpy.random.normal(mu, sigma, 1)[0]
        toChange = random.randint(0, 5)
        if toChange == 0:
            self.buy = random.randint(0, 999) % 2
        elif toChange == 1:
            self.min = draw
        elif toChange == 2:
            self.max = draw
        elif toChange == 3:
            self.prev_min = draw
        elif toChange == 4:
            self.prev_max = draw
        # Keep each window well-formed (lower bound <= upper bound).
        if self.min > self.max:
            self.min, self.max = self.max, self.min
        if self.prev_min > self.prev_max:
            self.prev_min, self.prev_max = self.prev_max, self.prev_min


class TrainingData(object):
    """Price samples plus the population of rules evolved against them."""

    def __init__(self, stockName='', popSize=None, mRate=None, mChange=None):
        self.stockName = stockName
        self.popSize = popSize
        self.mRate = mRate
        self.mChange = mChange
        # These used to be class-level lists, which made every instance
        # share (and keep appending to) the same objects.
        self.population = []
        self.nextGeneration = []
        self.dayChange = []
        self.nextDayChange = []
        self.profit = []

    def generateData(self):
        """Scrape the price-history table and derive the training samples.

        For each row index ``x`` three values are appended and also written
        as one space-separated line to the file ``stock_data``:
        ``(close[x] - open[x+1]) / 100``,
        ``(close[x+1] - open[x+2]) / 100`` and ``open[x] - open[x+1]``.
        The module-level ``DataSize`` is updated to the sample count.

        Raises:
            ImportError: if the ``bs4`` package is not installed.
        """
        global DataSize
        if bs is None:
            raise ImportError("generateData() requires the 'bs4' package")

        url = "https://finance.yahoo.com/quote/" + Stock_name + "/history/"
        with urllib2.urlopen(url) as response:
            page = response.read()
        rows = bs(page, "lxml").findAll('table')[0].tbody.findAll('tr')

        data = []
        for each_row in rows:
            cells = each_row.findAll('td')
            # NOTE(review): the original test `divs[1].span != 'Dividend'`
            # compared a Tag with a str and was therefore always true, so
            # nothing was ever filtered.  Rows without the six price cells
            # (presumably dividend/split notices) are skipped instead;
            # confirm against the live page markup.
            if len(cells) < 6:
                continue
            open_span, close_span = cells[1].span, cells[5].span
            if open_span is None or close_span is None:
                continue
            if open_span.text == 'Dividend':
                continue
            data.append({'open': open_span.text,
                         'Adj close': float(close_span.text.replace(',', ''))})

        # The original evaluated this slice but discarded the result.
        data = data[:NumberOfDay]
        print(data)

        # Strip thousands separators from the open price as well; the
        # original only did so for the close and crashed on "1,234.56".
        opens = [float(day['open'].replace(',', '')) for day in data]
        closes = [float(day['Adj close']) for day in data]

        with open('stock_data', 'w') as out:
            for x in range(len(data) - 2):
                day_change = (closes[x] - opens[x + 1]) / 100
                next_day_change = (closes[x + 1] - opens[x + 2]) / 100
                gain = opens[x] - opens[x + 1]
                out.write(str(day_change) + ' ' + str(next_day_change)
                          + ' ' + str(gain) + '\n')
                self.dayChange.append(day_change)
                self.nextDayChange.append(next_day_change)
                self.profit.append(gain)
        DataSize = len(self.dayChange)

    def populationInit(self):
        """Append ``PopulationSize`` random chromosomes with score 0."""
        mu, sigma = 0, 0.15
        draws = iter(numpy.random.normal(mu, sigma, 4 * PopulationSize))
        for _ in range(PopulationSize):
            temp = Chromosome(next(draws), next(draws), next(draws),
                              next(draws), random.randint(0, 999) % 2, 0)
            if temp.min > temp.max:
                temp.min, temp.max = temp.max, temp.min
            if temp.prev_min > temp.prev_max:
                temp.prev_min, temp.prev_max = temp.prev_max, temp.prev_min
            self.population.append(temp)

    def fitnessFunction(self):
        """Recompute ``score`` for every chromosome in the population.

        A sample matches when ``dayChange`` lies strictly inside
        (prev_min, prev_max) and ``nextDayChange`` strictly inside
        (min, max).  Matching samples add their profit when ``buy == 1``
        and subtract it when ``buy == 0``.  A chromosome with no scoring
        match gets -5000.
        """
        for chrom in self.population:
            # Start from zero so repeated evaluations do not accumulate on
            # top of an earlier score (and so a None score is tolerated).
            chrom.score = 0
            matched = False
            # Iterate the samples actually held by this instance instead of
            # the global DataSize, which is only set by generateData().
            samples = zip(self.dayChange, self.nextDayChange, self.profit)
            for day, next_day, gain in samples:
                if not (chrom.prev_min < day < chrom.prev_max):
                    continue
                if not (chrom.min < next_day < chrom.max):
                    continue
                if chrom.buy == 1:
                    matched = True
                    chrom.score += gain
                elif chrom.buy == 0:
                    matched = True
                    chrom.score -= gain
            if not matched:
                chrom.score = -5000

    def weighted_random_choice(self):
        """Score the population and copy survivors into ``nextGeneration``.

        A threshold is drawn uniformly between 0 and the summed score, and
        every chromosome at which the running score total exceeds that
        threshold is appended.

        NOTE(review): this is not a conventional roulette wheel (there is
        no break after the first hit and totals are usually negative
        because of the -5000 penalty).  Behaviour is kept as-is; confirm
        the intended selection rule.
        """
        self.fitnessFunction()
        total = sum(chrom.score for chrom in self.population)
        pick = random.uniform(0, total)
        current = 0
        for chrom in self.population:
            current += chrom.score
            if current > pick:
                self.nextGeneration.append(chrom)

    def exists(self):
        """Drop, in place, every chromosome whose score is ``None``."""
        self.population[:] = [chrom for chrom in self.population
                              if chrom.score is not None]

    def uniformCross(self, z):
        """Refill the population with children of ``nextGeneration``.

        ``PopulationSize - len(nextGeneration)`` children are bred; each
        gene is copied from one of two randomly chosen parents.  A child is
        mutated when ``random.randint(0, 999) % 100 <= z``.  Children fill
        the front of ``population``, the parents the remaining slots, and
        the population is then re-scored and sorted by ascending score.

        Args:
            z: mutation threshold compared against a value in 0..99.
        """
        parents = self.nextGeneration
        children = []
        for _ in range(PopulationSize - len(parents)):
            chromosome1 = parents[random.randint(0, 999999) % len(parents)]
            chromosome2 = parents[random.randint(0, 999999) % len(parents)]
            # score=0 rather than the original's implicit None: with None
            # every child was removed by exists() right after being bred.
            child = Chromosome(0, 0, 0, 0, 0, 0)
            for gene in ('min', 'max', 'prev_min', 'prev_max', 'buy'):
                donor = chromosome1 if random.randint(0, 999) % 2 else chromosome2
                setattr(child, gene, getattr(donor, gene))
            if child.max < child.min:
                child.max, child.min = child.min, child.max
            if child.prev_max < child.prev_min:
                child.prev_max, child.prev_min = child.prev_min, child.prev_max
            children.append(child)

        for i, child in enumerate(children):
            if random.randint(0, 999) % 100 <= z:
                child.mutate()
            self.population[i] = child
        for i in range(len(children), len(self.population)):
            self.population[i] = parents[i - len(children)]

        self.exists()
        self.fitnessFunction()
        self.population.sort(key=operator.attrgetter('score'))

    def printChromosomes(self):
        """Print the top ``NumReturn`` buy rules and short rules.

        "Top" means the last entries of ``population``, so the population
        is expected to be sorted by ascending score (``uniformCross`` does
        this).  When a category holds fewer than ``NumReturn`` rules only
        the existing ones are printed; the original indexed past the start
        of the list and raised IndexError or repeated entries.
        """
        buyRec = [chrom for chrom in self.population if chrom.buy == 1]
        shortRec = [chrom for chrom in self.population if chrom.buy == 0]
        row = ("min: %f | max: %f | previous min: %f | previous max: %f "
               "| score: %f")

        print("The Best %d Chromosomes When Buying" % (NumReturn))
        outputBuy = []
        fieldnames = ["Score"]
        for chrom in buyRec[::-1][:NumReturn]:
            print(row % (chrom.min, chrom.max, chrom.prev_min,
                         chrom.prev_max, chrom.score))
            outputBuy.append(chrom.score)

        print("The Best %d Chromosomes When Shorting" % (NumReturn))
        for chrom in shortRec[::-1][:NumReturn]:
            print(row % (chrom.min, chrom.max, chrom.prev_min,
                         chrom.prev_max, chrom.score))

        # Debug output kept from the original (the list is printed twice).
        print(outputBuy)
        print(outputBuy)
        # TODO: the CSV export of the scores was never finished; the loop
        # below only prints a placeholder per chromosome.
        for i in range(len(self.population)):
            print("------")
            print(fieldnames, i)


def csv_dict_writer(path, fieldnames, data):
    """Write ``data`` (an iterable of dicts) to ``path`` as CSV.

    Args:
        path: destination file path; an existing file is overwritten.
        fieldnames: column names, written as the header row.
        data: dict rows keyed by ``fieldnames``.
    """
    # Text mode with newline='' is what the csv module requires on
    # Python 3; the original "wb" mode made every write raise TypeError.
    with open(path, "w", newline="") as out_file:
        writer = csv.DictWriter(out_file, delimiter=',', fieldnames=fieldnames)
        writer.writeheader()
        for row in data:
            writer.writerow(row)


if __name__ == '__main__':
    x = TrainingData()
    x.generateData()
    x.populationInit()
    x.weighted_random_choice()
    x.uniformCross(MutationRate)
    x.printChromosomes()

stock_data

-0.04010000000000019 0.015500000000000114 -0.2300000000000182 0.015500000000000114 -0.4622000000000003 -8.089999999999975 -0.4622000000000003 -0.2815999999999997 -39.420000000000016 -0.2815999999999997 -0.07069999999999993 -3.339999999999975 -0.07069999999999993 -0.12349999999999967 -3.910000000000025 -0.12349999999999967 -0.011499999999999773 -2.6399999999999864 -0.011499999999999773 0.01659999999999968 3.6399999999999864 0.01659999999999968 -0.059499999999999886 -4.6200000000000045 -0.059499999999999886 -0.09370000000000005 -1.6299999999999955 -0.09370000000000005 -0.08029999999999973 -6.430000000000007 -0.08029999999999973 -0.11159999999999969 -6.449999999999989 -0.11159999999999969 -0.019800000000000182 -7.3799999999999955 -0.019800000000000182 0.1322999999999996 8.149999999999977 0.1322999999999996 0.21409999999999968 0.0 0.21409999999999968 -0.05689999999999998 4.509999999999991 -0.05689999999999998 0.03660000000000025 -8.759999999999991 0.03660000000000025 -0.014499999999999886 7.420000000000016 -0.014499999999999886 -0.04189999999999998 -8.310000000000002 -0.04189999999999998 0.06370000000000005 1.3600000000000136 0.06370000000000005 -0.09889999999999986 5.409999999999968 -0.09889999999999986 -0.21639999999999987 -18.029999999999973 -0.21639999999999987 -0.1319 -7.560000000000002 -0.1319 -0.02509999999999991 -3.680000000000007 -0.02509999999999991 0.09590000000000032 2.75 0.09590000000000032 0.16520000000000037 5.670000000000016 0.16520000000000037 0.10140000000000043 12.300000000000011 0.10140000000000043 -0.01170000000000016 -0.5 -0.01170000000000016 0.022699999999999818 1.5599999999999454 0.022699999999999818 0.1099000000000001 1.9700000000000273 0.1099000000000001 0.1323000000000002 8.0 0.1323000000000002 0.0729000000000002 3.660000000000025 0.0729000000000002 0.0753000000000003 5.819999999999993 0.0753000000000003 -0.041999999999999885 -1.1100000000000136 -0.041999999999999885 -0.32889999999999986 -3.8000000000000114 -0.32889999999999986 
-0.23759999999999992 -23.379999999999995 -0.23759999999999992 0.19790000000000021 4.319999999999993 0.19790000000000021 -0.04 13.78000000000003 -0.04 -0.1531 -12.760000000000048 -0.1531 -0.0725 -10.46999999999997 -0.0725 0.017400000000000092 -2.5600000000000023 0.017400000000000092 0.09170000000000016 3.490000000000009 0.09170000000000016 0.059699999999999705 2.3700000000000045 0.059699999999999705 0.08029999999999973 5.289999999999964 0.08029999999999973 -0.05419999999999959 -1.6499999999999773 -0.05419999999999959 -0.16939999999999997 -6.769999999999982 -0.16939999999999997 -0.14110000000000014 -6.310000000000002 -0.14110000000000014 0.03390000000000043 -5.420000000000016 0.03390000000000043 0.06560000000000002 10.600000000000023 0.06560000000000002 0.04349999999999966 4.399999999999977 0.04349999999999966 0.04099999999999966 -0.6700000000000159 0.04099999999999966 0.09620000000000005 5.199999999999989 0.09620000000000005 0.014300000000000068 2.930000000000007 0.014300000000000068 0.009800000000000182 -1.089999999999975 0.009800000000000182 0.0575 3.3899999999999864 0.0575 -0.0036000000000001365 3.0400000000000205 -0.0036000000000001365 0.17689999999999997 -2.9600000000000364 0.17689999999999997 0.19790000000000021 19.160000000000025 0.19790000000000021 0.03709999999999979 -0.6200000000000045 0.03709999999999979 -0.06379999999999995 3.75 -0.06379999999999995 0.0525 -8.129999999999995 0.0525 0.04349999999999966 9.0 0.04349999999999966 -0.003999999999999773 -4.180000000000007 -0.003999999999999773 -0.006399999999999863 4.430000000000007 -0.006399999999999863 -0.12189999999999998 -4.25 -0.12189999999999998 -0.1221999999999997 -7.829999999999984 -0.1221999999999997 -0.04389999999999986 -5.680000000000007 -0.04389999999999986 -0.010299999999999728 -0.07999999999998408 -0.010299999999999728 -0.11279999999999973 -3.1000000000000227 -0.11279999999999973 -0.13799999999999954 -7.569999999999993 -0.13799999999999954 -0.03170000000000016 -4.639999999999986 
-0.03170000000000016 0.024399999999999977 2.859999999999957 0.024399999999999977 -0.030400000000000205 1.0300000000000296 -0.030400000000000205 0.08579999999999983 0.07999999999998408 0.08579999999999983 0.2639999999999998 10.480000000000018 0.2639999999999998 0.14310000000000003 15.819999999999993 0.14310000000000003 0.03129999999999995 0.029999999999972715 0.03129999999999995 0.11139999999999986 2.6000000000000227 0.11139999999999986 0.11259999999999991 11.899999999999977 0.11259999999999991 -0.028000000000000115 -1.8999999999999773 -0.028000000000000115 -0.002400000000000091 -4.5 -0.002400000000000091 -0.020300000000000297 1.0600000000000023 -0.020300000000000297 -0.08449999999999988 -3.1200000000000045 -0.08449999999999988 -0.08819999999999993 -8.740000000000009 -0.08819999999999993 0.043000000000000115 0.9399999999999977 0.043000000000000115 0.030199999999999817 3.1100000000000136 0.030199999999999817 -0.012199999999999704 -0.5400000000000205 -0.012199999999999704 0.017400000000000092 -2.9799999999999613 0.017400000000000092 0.04019999999999982 5.909999999999968 0.04019999999999982 -0.16930000000000006 -2.930000000000007 -0.16930000000000006 0.010600000000000023 -11.879999999999995 0.010600000000000023 0.0649000000000001 11.680000000000007 0.0649000000000001 -0.03699999999999989 7.980000000000018 -0.03699999999999989 0.08569999999999993 -8.990000000000009 0.08569999999999993 0.12899999999999978 14.870000000000005 0.12899999999999978 0.004900000000000091 -2.3700000000000045 0.004900000000000091 -0.02509999999999991 0.0 -0.02509999999999991 0.03370000000000004 -3.0 0.03370000000000004 -0.009499999999999887 4.480000000000018

TSLA.csv

Date,Open,High,Low,Close,Adj Close,Volume 2018-01-08,316.000000,337.019989,315.500000,336.410004,336.410004,9859400 2018-01-09,335.160004,338.799988,327.399994,333.690002,333.690002,7146600 2018-01-10,332.200012,337.000000,330.000000,334.799988,334.799988,4309900 2018-01-11,335.239990,344.809998,333.260010,337.950012,337.950012,6645500 2018-01-12,338.630005,340.410004,333.670013,336.220001,336.220001,4825100 2018-01-16,337.540009,345.000000,334.799988,340.059998,340.059998,6474300 2018-01-17,340.470001,349.000000,339.750000,347.160004,347.160004,7103500 2018-01-18,345.670013,352.299988,343.739990,344.570007,344.570007,5685800 2018-01-19,345.000000,350.589996,342.600006,350.019989,350.019989,4888300 2018-01-22,349.399994,357.829987,349.200012,351.559998,351.559998,6210400 2018-01-23,360.000000,360.500000,351.000000,352.790009,352.790009,5465400 2018-01-24,354.579987,354.750000,343.519989,345.890015,345.890015,5287500 2018-01-25,348.269989,349.200012,336.399994,337.640015,337.640015,6740300 2018-01-26,341.500000,344.000000,335.709991,342.850006,342.850006,4539400 2018-01-29,339.850006,350.850006,338.279999,349.529999,349.529999,4747100 2018-01-30,345.140015,348.269989,342.170013,345.820007,345.820007,4717700 2018-01-31,347.510010,356.190002,345.190002,354.309998,354.309998,6214100 2018-02-01,351.000000,359.660004,348.630005,349.250000,349.250000,4197700 2018-02-02,348.440002,351.950012,340.510010,343.750000,343.750000,3704800 2018-02-05,337.970001,344.470001,333.000000,333.130005,333.130005,4429100 2018-02-06,325.209991,336.220001,323.500000,327.570007,327.570007,3719663

untitled0.py

from bs4 import BeautifulSoup as bs
import urllib.request as urllib2


def get_historical_data(name, number_of_days):
    """Scrape the price-history table for ticker ``name``.

    Args:
        name: ticker symbol inserted into the history URL.
        number_of_days: maximum number of rows to return.

    Returns:
        A list of ``{'Date': str, 'Adj close': float}`` dicts in page
        order, at most ``number_of_days`` long.
    """
    url = "https://finance.yahoo.com/quote/" + name + "/history/"
    with urllib2.urlopen(url) as response:
        page = response.read()
    rows = bs(page, "lxml").findAll('table')[0].tbody.findAll('tr')

    data = []
    for each_row in rows:
        cells = each_row.findAll('td')
        # NOTE(review): the original test `divs[1].span != 'Dividend'`
        # compared a Tag with a str and was always true, so nothing was
        # filtered.  Rows without the full set of price cells (presumably
        # dividend/split notices) are skipped instead; confirm against the
        # live page markup.
        if len(cells) < 6:
            continue
        date_span, close_span = cells[0].span, cells[5].span
        if date_span is None or close_span is None:
            continue
        # Column order is Date, Open, High, Low, Close, Adj Close, Volume,
        # so the adjusted close is cell 5; the original read cell 1 (Open)
        # while labelling it 'Adj close'.
        data.append({'Date': date_span.text,
                     'Adj close': float(close_span.text.replace(',', ''))})
    return data[:number_of_days]


if __name__ == '__main__':
    # Demo call; guarded so importing the module does not hit the network.
    print(get_historical_data('amzn', 15))