diff options
| -rw-r--r-- | module/BeautifulSoup.py | 2000 | ||||
| -rw-r--r-- | module/CaptchaManager.py | 112 | ||||
| -rw-r--r-- | module/gui/CaptchaDock.py | 82 | ||||
| -rw-r--r-- | module/gui/MainWindow.py | 5 | ||||
| -rw-r--r-- | module/gui/connector.py | 40 | ||||
| -rw-r--r-- | module/plugins/decrypter/SerienjunkiesOrg.py | 86 | ||||
| -rwxr-xr-x | pyLoadCore.py | 47 | ||||
| -rwxr-xr-x | pyLoadGui.py | 14 | 
8 files changed, 2375 insertions, 11 deletions
| diff --git a/module/BeautifulSoup.py b/module/BeautifulSoup.py new file mode 100644 index 000000000..34204e740 --- /dev/null +++ b/module/BeautifulSoup.py @@ -0,0 +1,2000 @@ +"""Beautiful Soup +Elixir and Tonic +"The Screen-Scraper's Friend" +http://www.crummy.com/software/BeautifulSoup/ + +Beautiful Soup parses a (possibly invalid) XML or HTML document into a +tree representation. It provides methods and Pythonic idioms that make +it easy to navigate, search, and modify the tree. + +A well-formed XML/HTML document yields a well-formed data +structure. An ill-formed XML/HTML document yields a correspondingly +ill-formed data structure. If your document is only locally +well-formed, you can use this library to find and process the +well-formed part of it. + +Beautiful Soup works with Python 2.2 and up. It has no external +dependencies, but you'll have more success at converting data to UTF-8 +if you also install these three packages: + +* chardet, for auto-detecting character encodings +  http://chardet.feedparser.org/ +* cjkcodecs and iconv_codec, which add more encodings to the ones supported +  by stock Python. +  http://cjkpython.i18n.org/ + +Beautiful Soup defines classes for two main parsing strategies: + + * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific +   language that kind of looks like XML. + + * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid +   or invalid. This class has web browser-like heuristics for +   obtaining a sensible parse tree in the face of common HTML errors. + +Beautiful Soup also defines a class (UnicodeDammit) for autodetecting +the encoding of an HTML or XML document, and converting it to +Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser. 
+ +For more than you ever wanted to know about Beautiful Soup, see the +documentation: +http://www.crummy.com/software/BeautifulSoup/documentation.html + +Here, have some legalese: + +Copyright (c) 2004-2009, Leonard Richardson + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +  * Redistributions of source code must retain the above copyright +    notice, this list of conditions and the following disclaimer. + +  * Redistributions in binary form must reproduce the above +    copyright notice, this list of conditions and the following +    disclaimer in the documentation and/or other materials provided +    with the distribution. + +  * Neither the name of the the Beautiful Soup Consortium and All +    Night Kosher Bakery nor the names of its contributors may be +    used to endorse or promote products derived from this software +    without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT. 
+ +""" +from __future__ import generators + +__author__ = "Leonard Richardson (leonardr@segfault.org)" +__version__ = "3.1.0.1" +__copyright__ = "Copyright (c) 2004-2009 Leonard Richardson" +__license__ = "New-style BSD" + +import codecs +import markupbase +import types +import re +from HTMLParser import HTMLParser, HTMLParseError +try: +    from htmlentitydefs import name2codepoint +except ImportError: +    name2codepoint = {} +try: +    set +except NameError: +    from sets import Set as set + +#These hacks make Beautiful Soup able to parse XML with namespaces +markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match + +DEFAULT_OUTPUT_ENCODING = "utf-8" + +# First, the classes that represent markup elements. + +def sob(unicode, encoding): +    """Returns either the given Unicode string or its encoding.""" +    if encoding is None: +        return unicode +    else: +        return unicode.encode(encoding) + +class PageElement: +    """Contains the navigational information for some part of the page +    (either a tag or a piece of text)""" + +    def setup(self, parent=None, previous=None): +        """Sets up the initial relations between this element and +        other elements.""" +        self.parent = parent +        self.previous = previous +        self.next = None +        self.previousSibling = None +        self.nextSibling = None +        if self.parent and self.parent.contents: +            self.previousSibling = self.parent.contents[-1] +            self.previousSibling.nextSibling = self + +    def replaceWith(self, replaceWith): +        oldParent = self.parent +        myIndex = self.parent.contents.index(self) +        if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent: +            # We're replacing this element with one of its siblings. +            index = self.parent.contents.index(replaceWith) +            if index and index < myIndex: +                # Furthermore, it comes before this element. 
That +                # means that when we extract it, the index of this +                # element will change. +                myIndex = myIndex - 1 +        self.extract() +        oldParent.insert(myIndex, replaceWith) + +    def extract(self): +        """Destructively rips this element out of the tree.""" +        if self.parent: +            try: +                self.parent.contents.remove(self) +            except ValueError: +                pass + +        #Find the two elements that would be next to each other if +        #this element (and any children) hadn't been parsed. Connect +        #the two. +        lastChild = self._lastRecursiveChild() +        nextElement = lastChild.next + +        if self.previous: +            self.previous.next = nextElement +        if nextElement: +            nextElement.previous = self.previous +        self.previous = None +        lastChild.next = None + +        self.parent = None +        if self.previousSibling: +            self.previousSibling.nextSibling = self.nextSibling +        if self.nextSibling: +            self.nextSibling.previousSibling = self.previousSibling +        self.previousSibling = self.nextSibling = None +        return self + +    def _lastRecursiveChild(self): +        "Finds the last element beneath this object to be parsed." +        lastChild = self +        while hasattr(lastChild, 'contents') and lastChild.contents: +            lastChild = lastChild.contents[-1] +        return lastChild + +    def insert(self, position, newChild): +        if (isinstance(newChild, basestring) +            or isinstance(newChild, unicode)) \ +            and not isinstance(newChild, NavigableString): +            newChild = NavigableString(newChild) + +        position =  min(position, len(self.contents)) +        if hasattr(newChild, 'parent') and newChild.parent != None: +            # We're 'inserting' an element that's already one +            # of this object's children. 
+            if newChild.parent == self: +                index = self.find(newChild) +                if index and index < position: +                    # Furthermore we're moving it further down the +                    # list of this object's children. That means that +                    # when we extract this element, our target index +                    # will jump down one. +                    position = position - 1 +            newChild.extract() + +        newChild.parent = self +        previousChild = None +        if position == 0: +            newChild.previousSibling = None +            newChild.previous = self +        else: +            previousChild = self.contents[position-1] +            newChild.previousSibling = previousChild +            newChild.previousSibling.nextSibling = newChild +            newChild.previous = previousChild._lastRecursiveChild() +        if newChild.previous: +            newChild.previous.next = newChild + +        newChildsLastElement = newChild._lastRecursiveChild() + +        if position >= len(self.contents): +            newChild.nextSibling = None + +            parent = self +            parentsNextSibling = None +            while not parentsNextSibling: +                parentsNextSibling = parent.nextSibling +                parent = parent.parent +                if not parent: # This is the last element in the document. 
+                    break +            if parentsNextSibling: +                newChildsLastElement.next = parentsNextSibling +            else: +                newChildsLastElement.next = None +        else: +            nextChild = self.contents[position] +            newChild.nextSibling = nextChild +            if newChild.nextSibling: +                newChild.nextSibling.previousSibling = newChild +            newChildsLastElement.next = nextChild + +        if newChildsLastElement.next: +            newChildsLastElement.next.previous = newChildsLastElement +        self.contents.insert(position, newChild) + +    def append(self, tag): +        """Appends the given tag to the contents of this tag.""" +        self.insert(len(self.contents), tag) + +    def findNext(self, name=None, attrs={}, text=None, **kwargs): +        """Returns the first item that matches the given criteria and +        appears after this Tag in the document.""" +        return self._findOne(self.findAllNext, name, attrs, text, **kwargs) + +    def findAllNext(self, name=None, attrs={}, text=None, limit=None, +                    **kwargs): +        """Returns all items that match the given criteria and appear +        after this Tag in the document.""" +        return self._findAll(name, attrs, text, limit, self.nextGenerator, +                             **kwargs) + +    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs): +        """Returns the closest sibling to this Tag that matches the +        given criteria and appears after this Tag in the document.""" +        return self._findOne(self.findNextSiblings, name, attrs, text, +                             **kwargs) + +    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, +                         **kwargs): +        """Returns the siblings of this Tag that match the given +        criteria and appear after this Tag in the document.""" +        return self._findAll(name, attrs, text, limit, + 
                            self.nextSiblingGenerator, **kwargs) +    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x + +    def findPrevious(self, name=None, attrs={}, text=None, **kwargs): +        """Returns the first item that matches the given criteria and +        appears before this Tag in the document.""" +        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs) + +    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None, +                        **kwargs): +        """Returns all items that match the given criteria and appear +        before this Tag in the document.""" +        return self._findAll(name, attrs, text, limit, self.previousGenerator, +                           **kwargs) +    fetchPrevious = findAllPrevious # Compatibility with pre-3.x + +    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs): +        """Returns the closest sibling to this Tag that matches the +        given criteria and appears before this Tag in the document.""" +        return self._findOne(self.findPreviousSiblings, name, attrs, text, +                             **kwargs) + +    def findPreviousSiblings(self, name=None, attrs={}, text=None, +                             limit=None, **kwargs): +        """Returns the siblings of this Tag that match the given +        criteria and appear before this Tag in the document.""" +        return self._findAll(name, attrs, text, limit, +                             self.previousSiblingGenerator, **kwargs) +    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x + +    def findParent(self, name=None, attrs={}, **kwargs): +        """Returns the closest parent of this Tag that matches the given +        criteria.""" +        # NOTE: We can't use _findOne because findParents takes a different +        # set of arguments. 
+        r = None +        l = self.findParents(name, attrs, 1) +        if l: +            r = l[0] +        return r + +    def findParents(self, name=None, attrs={}, limit=None, **kwargs): +        """Returns the parents of this Tag that match the given +        criteria.""" + +        return self._findAll(name, attrs, None, limit, self.parentGenerator, +                             **kwargs) +    fetchParents = findParents # Compatibility with pre-3.x + +    #These methods do the real heavy lifting. + +    def _findOne(self, method, name, attrs, text, **kwargs): +        r = None +        l = method(name, attrs, text, 1, **kwargs) +        if l: +            r = l[0] +        return r + +    def _findAll(self, name, attrs, text, limit, generator, **kwargs): +        "Iterates over a generator looking for things that match." + +        if isinstance(name, SoupStrainer): +            strainer = name +        else: +            # Build a SoupStrainer +            strainer = SoupStrainer(name, attrs, text, **kwargs) +        results = ResultSet(strainer) +        g = generator() +        while True: +            try: +                i = g.next() +            except StopIteration: +                break +            if i: +                found = strainer.search(i) +                if found: +                    results.append(found) +                    if limit and len(results) >= limit: +                        break +        return results + +    #These Generators can be used to navigate starting from both +    #NavigableStrings and Tags. 
+    def nextGenerator(self): +        i = self +        while i: +            i = i.next +            yield i + +    def nextSiblingGenerator(self): +        i = self +        while i: +            i = i.nextSibling +            yield i + +    def previousGenerator(self): +        i = self +        while i: +            i = i.previous +            yield i + +    def previousSiblingGenerator(self): +        i = self +        while i: +            i = i.previousSibling +            yield i + +    def parentGenerator(self): +        i = self +        while i: +            i = i.parent +            yield i + +    # Utility methods +    def substituteEncoding(self, str, encoding=None): +        encoding = encoding or "utf-8" +        return str.replace("%SOUP-ENCODING%", encoding) + +    def toEncoding(self, s, encoding=None): +        """Encodes an object to a string in some encoding, or to Unicode. +        .""" +        if isinstance(s, unicode): +            if encoding: +                s = s.encode(encoding) +        elif isinstance(s, str): +            if encoding: +                s = s.encode(encoding) +            else: +                s = unicode(s) +        else: +            if encoding: +                s  = self.toEncoding(str(s), encoding) +            else: +                s = unicode(s) +        return s + +class NavigableString(unicode, PageElement): + +    def __new__(cls, value): +        """Create a new NavigableString. + +        When unpickling a NavigableString, this method is called with +        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be +        passed in to the superclass's __new__ or the superclass won't know +        how to handle non-ASCII characters. 
+        """ +        if isinstance(value, unicode): +            return unicode.__new__(cls, value) +        return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) + +    def __getnewargs__(self): +        return (unicode(self),) + +    def __getattr__(self, attr): +        """text.string gives you text. This is for backwards +        compatibility for Navigable*String, but for CData* it lets you +        get the string without the CData wrapper.""" +        if attr == 'string': +            return self +        else: +            raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) + +    def encode(self, encoding=DEFAULT_OUTPUT_ENCODING): +        return self.decode().encode(encoding) + +    def decodeGivenEventualEncoding(self, eventualEncoding): +        return self + +class CData(NavigableString): + +    def decodeGivenEventualEncoding(self, eventualEncoding): +        return u'<![CDATA[' + self + u']]>' + +class ProcessingInstruction(NavigableString): + +    def decodeGivenEventualEncoding(self, eventualEncoding): +        output = self +        if u'%SOUP-ENCODING%' in output: +            output = self.substituteEncoding(output, eventualEncoding) +        return u'<?' + output + u'?>' + +class Comment(NavigableString): +    def decodeGivenEventualEncoding(self, eventualEncoding): +        return u'<!--' + self + u'-->' + +class Declaration(NavigableString): +    def decodeGivenEventualEncoding(self, eventualEncoding): +        return u'<!' + self + u'>' + +class Tag(PageElement): + +    """Represents a found HTML tag with its attributes and contents.""" + +    def _invert(h): +        "Cheap function to invert a hash." 
+        i = {} +        for k,v in h.items(): +            i[v] = k +        return i + +    XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'", +                                      "quot" : '"', +                                      "amp" : "&", +                                      "lt" : "<", +                                      "gt" : ">" } + +    XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS) + +    def _convertEntities(self, match): +        """Used in a call to re.sub to replace HTML, XML, and numeric +        entities with the appropriate Unicode characters. If HTML +        entities are being converted, any unrecognized entities are +        escaped.""" +        x = match.group(1) +        if self.convertHTMLEntities and x in name2codepoint: +            return unichr(name2codepoint[x]) +        elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: +            if self.convertXMLEntities: +                return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] +            else: +                return u'&%s;' % x +        elif len(x) > 0 and x[0] == '#': +            # Handle numeric entities +            if len(x) > 1 and x[1] == 'x': +                return unichr(int(x[2:], 16)) +            else: +                return unichr(int(x[1:])) + +        elif self.escapeUnrecognizedEntities: +            return u'&%s;' % x +        else: +            return u'&%s;' % x + +    def __init__(self, parser, name, attrs=None, parent=None, +                 previous=None): +        "Basic constructor." 
+ +        # We don't actually store the parser object: that lets extracted +        # chunks be garbage-collected +        self.parserClass = parser.__class__ +        self.isSelfClosing = parser.isSelfClosingTag(name) +        self.name = name +        if attrs == None: +            attrs = [] +        self.attrs = attrs +        self.contents = [] +        self.setup(parent, previous) +        self.hidden = False +        self.containsSubstitutions = False +        self.convertHTMLEntities = parser.convertHTMLEntities +        self.convertXMLEntities = parser.convertXMLEntities +        self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities + +        def convert(kval): +            "Converts HTML, XML and numeric entities in the attribute value." +            k, val = kval +            if val is None: +                return kval +            return (k, re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);", +                              self._convertEntities, val)) +        self.attrs = map(convert, self.attrs) + +    def get(self, key, default=None): +        """Returns the value of the 'key' attribute for the tag, or +        the value given for 'default' if it doesn't have that +        attribute.""" +        return self._getAttrMap().get(key, default) + +    def has_key(self, key): +        return self._getAttrMap().has_key(key) + +    def __getitem__(self, key): +        """tag[key] returns the value of the 'key' attribute for the tag, +        and throws an exception if it's not there.""" +        return self._getAttrMap()[key] + +    def __iter__(self): +        "Iterating over a tag iterates over its contents." +        return iter(self.contents) + +    def __len__(self): +        "The length of a tag is the length of its list of contents." +        return len(self.contents) + +    def __contains__(self, x): +        return x in self.contents + +    def __nonzero__(self): +        "A tag is non-None even if it has no contents." 
+        return True + +    def __setitem__(self, key, value): +        """Setting tag[key] sets the value of the 'key' attribute for the +        tag.""" +        self._getAttrMap() +        self.attrMap[key] = value +        found = False +        for i in range(0, len(self.attrs)): +            if self.attrs[i][0] == key: +                self.attrs[i] = (key, value) +                found = True +        if not found: +            self.attrs.append((key, value)) +        self._getAttrMap()[key] = value + +    def __delitem__(self, key): +        "Deleting tag[key] deletes all 'key' attributes for the tag." +        for item in self.attrs: +            if item[0] == key: +                self.attrs.remove(item) +                #We don't break because bad HTML can define the same +                #attribute multiple times. +            self._getAttrMap() +            if self.attrMap.has_key(key): +                del self.attrMap[key] + +    def __call__(self, *args, **kwargs): +        """Calling a tag like a function is the same as calling its +        findAll() method. Eg. tag('a') returns a list of all the A tags +        found within this tag.""" +        return apply(self.findAll, args, kwargs) + +    def __getattr__(self, tag): +        #print "Getattr %s.%s" % (self.__class__, tag) +        if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3: +            return self.find(tag[:-3]) +        elif tag.find('__') != 0: +            return self.find(tag) +        raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag) + +    def __eq__(self, other): +        """Returns true iff this tag has the same name, the same attributes, +        and the same contents (recursively) as the given tag. + +        NOTE: right now this will return false if two tags have the +        same attributes in a different order. 
Should this be fixed?""" +        if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other): +            return False +        for i in range(0, len(self.contents)): +            if self.contents[i] != other.contents[i]: +                return False +        return True + +    def __ne__(self, other): +        """Returns true iff this tag is not identical to the other tag, +        as defined in __eq__.""" +        return not self == other + +    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): +        """Renders this tag as a string.""" +        return self.decode(eventualEncoding=encoding) + +    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" +                                           + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" +                                           + ")") + +    def _sub_entity(self, x): +        """Used with a regular expression to substitute the +        appropriate XML entity for an XML special character.""" +        return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";" + +    def __unicode__(self): +        return self.decode() + +    def __str__(self): +        return self.encode() + +    def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, +               prettyPrint=False, indentLevel=0): +        return self.decode(prettyPrint, indentLevel, encoding).encode(encoding) + +    def decode(self, prettyPrint=False, indentLevel=0, +               eventualEncoding=DEFAULT_OUTPUT_ENCODING): +        """Returns a string or Unicode representation of this tag and +        its contents. 
To get Unicode, pass None for encoding.""" + +        attrs = [] +        if self.attrs: +            for key, val in self.attrs: +                fmt = '%s="%s"' +                if isString(val): +                    if (self.containsSubstitutions +                        and eventualEncoding is not None +                        and '%SOUP-ENCODING%' in val): +                        val = self.substituteEncoding(val, eventualEncoding) + +                    # The attribute value either: +                    # +                    # * Contains no embedded double quotes or single quotes. +                    #   No problem: we enclose it in double quotes. +                    # * Contains embedded single quotes. No problem: +                    #   double quotes work here too. +                    # * Contains embedded double quotes. No problem: +                    #   we enclose it in single quotes. +                    # * Embeds both single _and_ double quotes. This +                    #   can't happen naturally, but it can happen if +                    #   you modify an attribute value after parsing +                    #   the document. Now we have a bit of a +                    #   problem. We solve it by enclosing the +                    #   attribute in single quotes, and escaping any +                    #   embedded single quotes to XML entities. +                    if '"' in val: +                        fmt = "%s='%s'" +                        if "'" in val: +                            # TODO: replace with apos when +                            # appropriate. +                            val = val.replace("'", "&squot;") + +                    # Now we're okay w/r/t quotes. But the attribute +                    # value might also contain angle brackets, or +                    # ampersands that aren't part of entities. We need +                    # to escape those to XML entities too. 
+                    val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val) +                if val is None: +                    # Handle boolean attributes. +                    decoded = key +                else: +                    decoded = fmt % (key, val) +                attrs.append(decoded) +        close = '' +        closeTag = '' +        if self.isSelfClosing: +            close = ' /' +        else: +            closeTag = '</%s>' % self.name + +        indentTag, indentContents = 0, 0 +        if prettyPrint: +            indentTag = indentLevel +            space = (' ' * (indentTag-1)) +            indentContents = indentTag + 1 +        contents = self.decodeContents(prettyPrint, indentContents, +                                       eventualEncoding) +        if self.hidden: +            s = contents +        else: +            s = [] +            attributeString = '' +            if attrs: +                attributeString = ' ' + ' '.join(attrs) +            if prettyPrint: +                s.append(space) +            s.append('<%s%s%s>' % (self.name, attributeString, close)) +            if prettyPrint: +                s.append("\n") +            s.append(contents) +            if prettyPrint and contents and contents[-1] != "\n": +                s.append("\n") +            if prettyPrint and closeTag: +                s.append(space) +            s.append(closeTag) +            if prettyPrint and closeTag and self.nextSibling: +                s.append("\n") +            s = ''.join(s) +        return s + +    def decompose(self): +        """Recursively destroys the contents of this tree.""" +        contents = [i for i in self.contents] +        for i in contents: +            if isinstance(i, Tag): +                i.decompose() +            else: +                i.extract() +        self.extract() + +    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING): +        return self.encode(encoding, True) + +    def 
encodeContents(self, encoding=DEFAULT_OUTPUT_ENCODING, +                       prettyPrint=False, indentLevel=0): +        return self.decodeContents(prettyPrint, indentLevel).encode(encoding) + +    def decodeContents(self, prettyPrint=False, indentLevel=0, +                       eventualEncoding=DEFAULT_OUTPUT_ENCODING): +        """Renders the contents of this tag as a string in the given +        encoding. If encoding is None, returns a Unicode string..""" +        s=[] +        for c in self: +            text = None +            if isinstance(c, NavigableString): +                text = c.decodeGivenEventualEncoding(eventualEncoding) +            elif isinstance(c, Tag): +                s.append(c.decode(prettyPrint, indentLevel, eventualEncoding)) +            if text and prettyPrint: +                text = text.strip() +            if text: +                if prettyPrint: +                    s.append(" " * (indentLevel-1)) +                s.append(text) +                if prettyPrint: +                    s.append("\n") +        return ''.join(s) + +    #Soup methods + +    def find(self, name=None, attrs={}, recursive=True, text=None, +             **kwargs): +        """Return only the first child of this Tag matching the given +        criteria.""" +        r = None +        l = self.findAll(name, attrs, recursive, text, 1, **kwargs) +        if l: +            r = l[0] +        return r +    findChild = find + +    def findAll(self, name=None, attrs={}, recursive=True, text=None, +                limit=None, **kwargs): +        """Extracts a list of Tag objects that match the given +        criteria.  You can specify the name of the Tag and any +        attributes you want the Tag to have. 
#Generator methods
def recursiveChildGenerator(self):
    """Yield every node below this tag, in document order.

    Iteration starts at the first child and follows the `next` links
    until it reaches the node that comes after this tag's last
    recursive child, so nothing outside the subtree is produced. A
    tag with no contents yields nothing."""
    if not len(self.contents):
        # A plain return ends a generator on every Python version;
        # raising StopIteration by hand turns into a RuntimeError
        # under PEP 479.
        return
    stopNode = self._lastRecursiveChild().next
    current = self.contents[0]
    while current is not stopNode:
        yield current
        current = current.next

def childGenerator(self):
    """Yield the direct children of this tag by following the
    `nextSibling` links from the first child. A tag with no contents
    yields nothing."""
    if not len(self.contents):
        return
    current = self.contents[0]
    while current:
        yield current
        current = current.nextSibling
def search(self, markup):
    """Return the part of `markup` that satisfies this strainer, or
    None if nothing matches.

    `markup` may be a list of elements (the first matching
    NavigableString in it is returned), a Tag (matched through
    searchTag, and only when this strainer has no text criterion), or
    a NavigableString / plain string (matched against self.text).
    Anything else raises an Exception."""
    #print 'looking for %s in %s' % (self, markup)
    found = None
    # If given a list of items, scan it for a text element that
    # matches.
    if isList(markup) and not isinstance(markup, Tag):
        for element in markup:
            if isinstance(element, NavigableString) \
                   and self.search(element):
                found = element
                break
    # If it's a Tag, make sure its name or attributes match.
    # Don't bother with Tags if we're searching for text.
    elif isinstance(markup, Tag):
        if not self.text:
            found = self.searchTag(markup)
    # If it's text, make sure the text matches.
    elif isinstance(markup, NavigableString) or \
             isString(markup):
        if self._matches(markup, self.text):
            found = markup
    else:
        # Call-form raise: same exception and message as before, but
        # also accepted by interpreters that dropped "raise E, msg".
        raise Exception("I don't know how to match against a %s"
                        % markup.__class__)
    return found
class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""
    def __init__(self, source):
        # Initialise this list itself. The previous call,
        # list.__init__([]), set up a temporary list and never
        # touched self.
        list.__init__(self)
        # Whatever produced the results (normally a SoupStrainer).
        self.source = source
def handle_charref(self, ref):
    """Handle character references as data.

    `ref` is the text between '&#' and ';': a decimal number ("65")
    or a hexadecimal one with a leading x ("x41"). When the soup
    converts entities, the reference is replaced by the character it
    names; otherwise, or when the number is not a usable code point,
    the reference is passed through unchanged as '&#...;'."""
    if self.soup.convertEntities:
        try:
            toChar = unichr
        except NameError:
            # Interpreters without unichr() have a Unicode-aware chr().
            toChar = chr
        try:
            if ref[:1] in ('x', 'X'):
                # Hexadecimal form; int(ref) alone rejects it.
                data = toChar(int(ref[1:], 16))
            else:
                data = toChar(int(ref))
        except (ValueError, OverflowError):
            # Malformed or out-of-range number: keep the markup as
            # written instead of aborting the whole parse.
            data = '&#%s;' % ref
    else:
        data = '&#%s;' % ref
    self.handle_data(data)
+                data = "&%s" % ref +        if not data: +            # This case is different from the one above, because we +            # haven't already gone through a supposedly comprehensive +            # mapping of entities to Unicode characters. We might not +            # have gone through any mapping at all. So the chances are +            # very high that this is a real entity, and not a +            # misplaced ampersand. +            data = "&%s;" % ref +        self.handle_data(data) + +    def handle_decl(self, data): +        "Handle DOCTYPEs and the like as Declaration objects." +        self._toStringSubclass(data, Declaration) + +    def parse_declaration(self, i): +        """Treat a bogus SGML declaration as raw data. Treat a CDATA +        declaration as a CData object.""" +        j = None +        if self.rawdata[i:i+9] == '<![CDATA[': +             k = self.rawdata.find(']]>', i) +             if k == -1: +                 k = len(self.rawdata) +             data = self.rawdata[i+9:k] +             j = k+3 +             self._toStringSubclass(data, CData) +        else: +            try: +                j = HTMLParser.parse_declaration(self, i) +            except HTMLParseError: +                toHandle = self.rawdata[i:] +                self.handle_data(toHandle) +                j = i + len(toHandle) +        return j + + +class BeautifulStoneSoup(Tag): + +    """This class contains the basic parser and search code. It defines +    a parser that knows nothing about tag behavior except for the +    following: + +      You can't close a tag without closing all the tags it encloses. +      That is, "<foo><bar></foo>" actually means +      "<foo><bar></bar></foo>". + +    [Another possible explanation is "<foo><bar /></foo>", but since +    this class defines no SELF_CLOSING_TAGS, it will never use that +    explanation.] 
def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
             markupMassage=True, smartQuotesTo=XML_ENTITIES,
             convertEntities=None, selfClosingTags=None, isHTML=False,
             builder=HTMLParserBuilder):
    """The Soup object is initialized as the 'root tag', and the
    provided markup (which can be a string or a file-like object)
    is fed into the underlying parser.

    HTMLParser will process most bad HTML, and the BeautifulSoup
    class has some tricks for dealing with some HTML that kills
    HTMLParser, but Beautiful Soup can nonetheless choke or lose data
    if your data uses self-closing tags or declarations
    incorrectly.

    By default, Beautiful Soup uses regexes to sanitize input,
    avoiding the vast majority of these problems. If the problems
    don't apply to you, pass in False for markupMassage, and
    you'll get better performance.

    The default parser massage techniques fix the two most common
    instances of invalid HTML that choke HTMLParser:

     <br/> (No space between name of closing tag and tag close)
     <! --Comment--> (Extraneous whitespace in declaration)

    You can pass in a custom list of (RE object, replace method)
    tuples to get Beautiful Soup to scrub your input the way you
    want.

    convertEntities selects which entity references are turned into
    Unicode: HTML_ENTITIES, XHTML_ENTITIES, XML_ENTITIES, or a false
    value for none. Any true value also switches smartQuotesTo off;
    a true value that is none of the three constants converts
    nothing."""

    self.parseOnlyThese = parseOnlyThese
    self.fromEncoding = fromEncoding
    self.smartQuotesTo = smartQuotesTo
    self.convertEntities = convertEntities
    # Set the rules for how we'll deal with the entities we
    # encounter. Start from "convert nothing" so the three flags
    # always exist, even when convertEntities is a value we don't
    # recognize.
    self.convertXMLEntities = False
    self.convertHTMLEntities = False
    self.escapeUnrecognizedEntities = False
    if self.convertEntities:
        # It doesn't make sense to convert encoded characters to
        # entities even while you're converting entities to Unicode.
        # Just convert it all to Unicode.
        self.smartQuotesTo = None
        if convertEntities == self.HTML_ENTITIES:
            self.convertHTMLEntities = True
            self.escapeUnrecognizedEntities = True
        elif convertEntities == self.XHTML_ENTITIES:
            self.convertXMLEntities = True
            self.convertHTMLEntities = True
        elif convertEntities == self.XML_ENTITIES:
            self.convertXMLEntities = True

    self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
    self.builder = builder(self)
    self.reset()

    if hasattr(markup, 'read'):        # It's a file-type object.
        markup = markup.read()
    self.markup = markup
    self.markupMassage = markupMassage
    try:
        self._feed(isHTML=isHTML)
    except StopParsing:
        # Raised on purpose once a nested _feed() has already
        # re-parsed the document; nothing is left to do here.
        pass
    self.markup = None                 # The markup can now be GCed.
    self.builder = None                # So can the builder.
+        markup = self.markup +        if isinstance(markup, unicode): +            if not hasattr(self, 'originalEncoding'): +                self.originalEncoding = None +        else: +            dammit = UnicodeDammit\ +                     (markup, [self.fromEncoding, inDocumentEncoding], +                      smartQuotesTo=self.smartQuotesTo, isHTML=isHTML) +            markup = dammit.unicode +            self.originalEncoding = dammit.originalEncoding +            self.declaredHTMLEncoding = dammit.declaredHTMLEncoding +        if markup: +            if self.markupMassage: +                if not isList(self.markupMassage): +                    self.markupMassage = self.MARKUP_MASSAGE +                for fix, m in self.markupMassage: +                    markup = fix.sub(m, markup) +                # TODO: We get rid of markupMassage so that the +                # soup object can be deepcopied later on. Some +                # Python installations can't copy regexes. If anyone +                # was relying on the existence of markupMassage, this +                # might cause problems. +                del(self.markupMassage) +        self.builder.reset() + +        self.builder.feed(markup) +        # Close out any unfinished strings and close all the open tags. 
def isSelfClosingTag(self, name):
    """Returns true iff the given string is the name of a
    self-closing tag according to this parser, i.e. it is a key of
    either the class-wide SELF_CLOSING_TAGS map or this instance's
    instanceSelfClosingTags map."""
    # "in" replaces dict.has_key(), which newer interpreters no
    # longer provide; the result is the same boolean.
    return name in self.SELF_CLOSING_TAGS \
           or name in self.instanceSelfClosingTags
self.parseOnlyThese and len(self.tagStack) <= 1 and \ +                   (not self.parseOnlyThese.text or \ +                    not self.parseOnlyThese.search(currentData)): +                return +            o = containerClass(currentData) +            o.setup(self.currentTag, self.previous) +            if self.previous: +                self.previous.next = o +            self.previous = o +            self.currentTag.contents.append(o) + + +    def _popToTag(self, name, inclusivePop=True): +        """Pops the tag stack up to and including the most recent +        instance of the given tag. If inclusivePop is false, pops the tag +        stack up to but *not* including the most recent instqance of +        the given tag.""" +        #print "Popping to %s" % name +        if name == self.ROOT_TAG_NAME: +            return + +        numPops = 0 +        mostRecentTag = None +        for i in range(len(self.tagStack)-1, 0, -1): +            if name == self.tagStack[i].name: +                numPops = len(self.tagStack)-i +                break +        if not inclusivePop: +            numPops = numPops - 1 + +        for i in range(0, numPops): +            mostRecentTag = self.popTag() +        return mostRecentTag + +    def _smartPop(self, name): + +        """We need to pop up to the previous tag of this type, unless +        one of this tag's nesting reset triggers comes between this +        tag and the previous tag of this type, OR unless this tag is a +        generic nesting trigger and another generic nesting trigger +        comes between this tag and the previous tag of this type. + +        Examples: +         <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'. +         <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'. +         <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'. + +         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'. 
def unknown_starttag(self, name, attrs, selfClosing=0):
    """Open a new tag called `name` with the (key, value) pairs in
    `attrs` and make it the current tag.

    While a quote tag is open (quoteStack is not empty) the start tag
    is not real markup: it is turned back into text, handed to
    handle_data, and None is returned. None is also returned when
    parseOnlyThese rejects a top-level tag. Otherwise the new Tag is
    returned."""
    #print "Start tag %s: %s" % (name, attrs)
    if self.quoteStack:
        #This is not a real tag.
        #print "<%s> is not real!" % name
        # A list comprehension unpacks each pair; the old
        # "lambda(x, y):" tuple-parameter form is not valid syntax on
        # newer interpreters.
        attrs = ''.join([' %s="%s"' % (key, value)
                         for key, value in attrs])
        self.handle_data('<%s%s>' % (name, attrs))
        return
    self.endData()

    if not self.isSelfClosingTag(name) and not selfClosing:
        self._smartPop(name)

    if self.parseOnlyThese and len(self.tagStack) <= 1 \
           and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
        return

    tag = Tag(self, name, attrs, self.currentTag, self.previous)
    if self.previous:
        self.previous.next = tag
    self.previous = tag
    self.pushTag(tag)
    if selfClosing or self.isSelfClosingTag(name):
        self.popTag()
    if name in self.QUOTE_TAGS:
        #print "Beginning quote (%s)" % name
        self.quoteStack.append(name)
        self.literal = 1
    return tag
If you want to parse the text as tags, you can +      always fetch it and parse it explicitly. + +    * Tag nesting rules: + +      Most tags can't be nested at all. For instance, the occurrence of +      a <p> tag should implicitly close the previous <p> tag. + +       <p>Para1<p>Para2 +        should be transformed into: +       <p>Para1</p><p>Para2 + +      Some tags can be nested arbitrarily. For instance, the occurrence +      of a <blockquote> tag should _not_ implicitly close the previous +      <blockquote> tag. + +       Alice said: <blockquote>Bob said: <blockquote>Blah +        should NOT be transformed into: +       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah + +      Some tags can be nested, but the nesting is reset by the +      interposition of other tags. For instance, a <tr> tag should +      implicitly close the previous <tr> tag within the same <table>, +      but not close a <tr> tag in another table. + +       <table><tr>Blah<tr>Blah +        should be transformed into: +       <table><tr>Blah</tr><tr>Blah +        but, +       <tr>Blah<table><tr>Blah +        should NOT be transformed into +       <tr>Blah<table></tr><tr>Blah + +    Differing assumptions about tag nesting rules are a major source +    of problems with the BeautifulSoup class. 
def __init__(self, *args, **kwargs):
    """Set up an HTML soup: smartQuotesTo defaults to HTML_ENTITIES
    when the caller did not choose a value, isHTML is always True,
    and every other argument goes to BeautifulStoneSoup.__init__
    unchanged."""
    # smartQuotesTo is the fifth parameter of
    # BeautifulStoneSoup.__init__ after self. If the caller already
    # supplied it positionally, adding the keyword as well would fail
    # with "got multiple values for keyword argument".
    if 'smartQuotesTo' not in kwargs and len(args) < 5:
        kwargs['smartQuotesTo'] = self.HTML_ENTITIES
    kwargs['isHTML'] = True
    BeautifulStoneSoup.__init__(self, *args, **kwargs)
def extractCharsetFromMeta(self, attrs):
    """Beautiful Soup can detect a charset included in a META tag,
    try to convert the document to that charset, and re-parse the
    document from the beginning.

    `attrs` is the META tag's list of (name, value) pairs; its
    'content' entry is rewritten in place when the charset is
    replaced by the %SOUP-ENCODING% placeholder. Raises StopParsing
    after a re-parse so the outer parse is abandoned."""
    equiv = None
    content = None
    contentPos = None
    for pos, (attrName, attrValue) in enumerate(attrs):
        attrName = attrName.lower()
        if attrName == 'http-equiv':
            equiv = attrValue
        elif attrName == 'content':
            content = attrValue
            contentPos = pos

    substituted = False
    charsetMatch = None
    if equiv and content: # It's an interesting meta tag.
        charsetMatch = self.CHARSET_RE.search(content)
    if charsetMatch:
        if (self.declaredHTMLEncoding is not None or
            self.originalEncoding == self.fromEncoding):
            # An HTML encoding was sniffed while converting
            # the document to Unicode, or an HTML encoding was
            # sniffed during a previous pass through the
            # document, or an encoding was specified
            # explicitly and it worked. Rewrite the meta tag.
            rewritten = self.CHARSET_RE.sub(
                lambda m: m.group(1) + "%SOUP-ENCODING%", content)
            attrs[contentPos] = (attrs[contentPos][0], rewritten)
            substituted = True
        else:
            # This is our first pass through the document.
            # Go through it again with the encoding information.
            declared = charsetMatch.group(3)
            if declared and declared != self.originalEncoding:
                self.declaredHTMLEncoding = declared
                self._feed(self.declaredHTMLEncoding)
                raise StopParsing

    tag = self.unknown_starttag("meta", attrs)
    if tag and substituted:
        tag.containsSubstitutions = True
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    # buildTagMap's first argument is the default value for every
    # key, so the tag name has to come second. buildTagMap('noscript')
    # used 'noscript' as the default and produced an empty map.
    RESET_NESTING_TAGS = buildTagMap(None, 'noscript')
    NESTABLE_TAGS = {}

class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        """Copy a lone string child into the parent's attributes (unless
        the parent already has an attribute of that name), then pop
        the tag as BeautifulStoneSoup does and return its result."""
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            parent._getAttrMap()
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                tag.name not in parent.attrMap):
                parent[tag.name] = tag.contents[0]
        # Pass the base class's return value on: _popToTag hands the
        # result of popTag() back to its own caller.
        return BeautifulStoneSoup.popTag(self)
For such-minded folk, the Beautiful Soup Consortium And +#All-Night Kosher Bakery recommends renaming this file to +#"RobustParser.py" (or, in cases of extreme enterprisiness, +#"RobustParserBeanInterface.class") and using the following +#enterprise-friendly class aliases: +class RobustXMLParser(BeautifulStoneSoup): +    pass +class RobustHTMLParser(BeautifulSoup): +    pass +class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup): +    pass +class RobustInsanelyWackAssHTMLParser(MinimalSoup): +    pass +class SimplifyingSOAPParser(BeautifulSOAP): +    pass + +###################################################### +# +# Bonus library: Unicode, Dammit +# +# This class forces XML data into a standard format (usually to UTF-8 +# or Unicode).  It is heavily based on code from Mark Pilgrim's +# Universal Feed Parser. It does not rewrite the XML or HTML to +# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi +# (XML) and BeautifulSoup.start_meta (HTML). + +# Autodetects character encodings. +# Download from http://chardet.feedparser.org/ +try: +    import chardet +#    import chardet.constants +#    chardet.constants._debug = 1 +except ImportError: +    chardet = None + +# cjkcodecs and iconv_codec make Python know about more character encodings. +# Both are available from http://cjkpython.i18n.org/ +# They're built in if you use Python 2.4. +try: +    import cjkcodecs.aliases +except ImportError: +    pass +try: +    import iconv_codec +except ImportError: +    pass + +class UnicodeDammit: +    """A class for detecting the encoding of a *ML document and +    converting it to a Unicode string. If the source encoding is +    windows-1252, can replace MS smart quotes with their HTML or XML +    equivalents.""" + +    # This dictionary maps commonly seen values for "charset" in HTML +    # meta tags to the corresponding Python codec names. 
It only covers +    # values that aren't in Python's aliases and can't be determined +    # by the heuristics in find_codec. +    CHARSET_ALIASES = { "macintosh" : "mac-roman", +                        "x-sjis" : "shift-jis" } + +    def __init__(self, markup, overrideEncodings=[], +                 smartQuotesTo='xml', isHTML=False): +        self.declaredHTMLEncoding = None +        self.markup, documentEncoding, sniffedEncoding = \ +                     self._detectEncoding(markup, isHTML) +        self.smartQuotesTo = smartQuotesTo +        self.triedEncodings = [] +        if markup == '' or isinstance(markup, unicode): +            self.originalEncoding = None +            self.unicode = unicode(markup) +            return + +        u = None +        for proposedEncoding in overrideEncodings: +            u = self._convertFrom(proposedEncoding) +            if u: break +        if not u: +            for proposedEncoding in (documentEncoding, sniffedEncoding): +                u = self._convertFrom(proposedEncoding) +                if u: break + +        # If no luck and we have auto-detection library, try that: +        if not u and chardet and not isinstance(self.markup, unicode): +            u = self._convertFrom(chardet.detect(self.markup)['encoding']) + +        # As a last resort, try utf-8 and windows-1252: +        if not u: +            for proposed_encoding in ("utf-8", "windows-1252"): +                u = self._convertFrom(proposed_encoding) +                if u: break + +        self.unicode = u +        if not u: self.originalEncoding = None + +    def _subMSChar(self, match): +        """Changes a MS smart quote character to an XML or HTML +        entity.""" +        orig = match.group(1) +        sub = self.MS_CHARS.get(orig) +        if type(sub) == types.TupleType: +            if self.smartQuotesTo == 'xml': +                sub = '&#x'.encode() + sub[1].encode() + ';'.encode() +            else: +                sub = '&'.encode() + 
sub[0].encode() + ';'.encode() +        else: +            sub = sub.encode() +        return sub + +    def _convertFrom(self, proposed): +        proposed = self.find_codec(proposed) +        if not proposed or proposed in self.triedEncodings: +            return None +        self.triedEncodings.append(proposed) +        markup = self.markup + +        # Convert smart quotes to HTML if coming from an encoding +        # that might have them. +        if self.smartQuotesTo and proposed.lower() in("windows-1252", +                                                      "iso-8859-1", +                                                      "iso-8859-2"): +            smart_quotes_re = "([\x80-\x9f])" +            smart_quotes_compiled = re.compile(smart_quotes_re) +            markup = smart_quotes_compiled.sub(self._subMSChar, markup) + +        try: +            # print "Trying to convert document to %s" % proposed +            u = self._toUnicode(markup, proposed) +            self.markup = u +            self.originalEncoding = proposed +        except Exception, e: +            # print "That didn't work!" +            # print e +            return None +        #print "Correct encoding: %s" % proposed +        return self.markup + +    def _toUnicode(self, data, encoding): +        '''Given a string and its encoding, decodes the string into Unicode. 
+        %encoding is a string recognized by encodings.aliases''' + +        # strip Byte Order Mark (if present) +        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \ +               and (data[2:4] != '\x00\x00'): +            encoding = 'utf-16be' +            data = data[2:] +        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \ +                 and (data[2:4] != '\x00\x00'): +            encoding = 'utf-16le' +            data = data[2:] +        elif data[:3] == '\xef\xbb\xbf': +            encoding = 'utf-8' +            data = data[3:] +        elif data[:4] == '\x00\x00\xfe\xff': +            encoding = 'utf-32be' +            data = data[4:] +        elif data[:4] == '\xff\xfe\x00\x00': +            encoding = 'utf-32le' +            data = data[4:] +        newdata = unicode(data, encoding) +        return newdata + +    def _detectEncoding(self, xml_data, isHTML=False): +        """Given a document, tries to detect its XML encoding.""" +        xml_encoding = sniffed_xml_encoding = None +        try: +            if xml_data[:4] == '\x4c\x6f\xa7\x94': +                # EBCDIC +                xml_data = self._ebcdic_to_ascii(xml_data) +            elif xml_data[:4] == '\x00\x3c\x00\x3f': +                # UTF-16BE +                sniffed_xml_encoding = 'utf-16be' +                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') +            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \ +                     and (xml_data[2:4] != '\x00\x00'): +                # UTF-16BE with BOM +                sniffed_xml_encoding = 'utf-16be' +                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') +            elif xml_data[:4] == '\x3c\x00\x3f\x00': +                # UTF-16LE +                sniffed_xml_encoding = 'utf-16le' +                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') +            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \ +                     
(xml_data[2:4] != '\x00\x00'): +                # UTF-16LE with BOM +                sniffed_xml_encoding = 'utf-16le' +                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') +            elif xml_data[:4] == '\x00\x00\x00\x3c': +                # UTF-32BE +                sniffed_xml_encoding = 'utf-32be' +                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') +            elif xml_data[:4] == '\x3c\x00\x00\x00': +                # UTF-32LE +                sniffed_xml_encoding = 'utf-32le' +                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') +            elif xml_data[:4] == '\x00\x00\xfe\xff': +                # UTF-32BE with BOM +                sniffed_xml_encoding = 'utf-32be' +                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') +            elif xml_data[:4] == '\xff\xfe\x00\x00': +                # UTF-32LE with BOM +                sniffed_xml_encoding = 'utf-32le' +                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') +            elif xml_data[:3] == '\xef\xbb\xbf': +                # UTF-8 with BOM +                sniffed_xml_encoding = 'utf-8' +                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') +            else: +                sniffed_xml_encoding = 'ascii' +                pass +        except: +            xml_encoding_match = None +        xml_encoding_re = '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode() +        xml_encoding_match = re.compile(xml_encoding_re).match(xml_data) +        if not xml_encoding_match and isHTML: +            meta_re = '<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode() +            regexp = re.compile(meta_re, re.I) +            xml_encoding_match = regexp.search(xml_data) +        if xml_encoding_match is not None: +            xml_encoding = xml_encoding_match.groups()[0].decode( +                'ascii').lower() +            if isHTML: +                self.declaredHTMLEncoding = 
xml_encoding +            if sniffed_xml_encoding and \ +               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', +                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4', +                                 'utf-16', 'utf-32', 'utf_16', 'utf_32', +                                 'utf16', 'u16')): +                xml_encoding = sniffed_xml_encoding +        return xml_data, xml_encoding, sniffed_xml_encoding + + +    def find_codec(self, charset): +        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \ +               or (charset and self._codec(charset.replace("-", ""))) \ +               or (charset and self._codec(charset.replace("-", "_"))) \ +               or charset + +    def _codec(self, charset): +        if not charset: return charset +        codec = None +        try: +            codecs.lookup(charset) +            codec = charset +        except (LookupError, ValueError): +            pass +        return codec + +    EBCDIC_TO_ASCII_MAP = None +    def _ebcdic_to_ascii(self, s): +        c = self.__class__ +        if not c.EBCDIC_TO_ASCII_MAP: +            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15, +                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31, +                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7, +                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26, +                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33, +                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94, +                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63, +                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34, +                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200, +                    201,202,106,107,108,109,110,111,112,113,114,203,204,205, +                    206,207,208,209,126,115,116,117,118,119,120,121,122,210, +                    
211,212,213,214,215,216,217,218,219,220,221,222,223,224, +                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72, +                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81, +                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89, +                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57, +                    250,251,252,253,254,255) +            import string +            c.EBCDIC_TO_ASCII_MAP = string.maketrans( \ +            ''.join(map(chr, range(256))), ''.join(map(chr, emap))) +        return s.translate(c.EBCDIC_TO_ASCII_MAP) + +    MS_CHARS = { '\x80' : ('euro', '20AC'), +                 '\x81' : ' ', +                 '\x82' : ('sbquo', '201A'), +                 '\x83' : ('fnof', '192'), +                 '\x84' : ('bdquo', '201E'), +                 '\x85' : ('hellip', '2026'), +                 '\x86' : ('dagger', '2020'), +                 '\x87' : ('Dagger', '2021'), +                 '\x88' : ('circ', '2C6'), +                 '\x89' : ('permil', '2030'), +                 '\x8A' : ('Scaron', '160'), +                 '\x8B' : ('lsaquo', '2039'), +                 '\x8C' : ('OElig', '152'), +                 '\x8D' : '?', +                 '\x8E' : ('#x17D', '17D'), +                 '\x8F' : '?', +                 '\x90' : '?', +                 '\x91' : ('lsquo', '2018'), +                 '\x92' : ('rsquo', '2019'), +                 '\x93' : ('ldquo', '201C'), +                 '\x94' : ('rdquo', '201D'), +                 '\x95' : ('bull', '2022'), +                 '\x96' : ('ndash', '2013'), +                 '\x97' : ('mdash', '2014'), +                 '\x98' : ('tilde', '2DC'), +                 '\x99' : ('trade', '2122'), +                 '\x9a' : ('scaron', '161'), +                 '\x9b' : ('rsaquo', '203A'), +                 '\x9c' : ('oelig', '153'), +                 '\x9d' : '?', +                 '\x9e' : ('#x17E', '17E'), +                 '\x9f' : 
('Yuml', ''),} + +####################################################################### + + +#By default, act as an HTML pretty-printer. +if __name__ == '__main__': +    import sys +    soup = BeautifulSoup(sys.stdin) +    print soup.prettify() diff --git a/module/CaptchaManager.py b/module/CaptchaManager.py new file mode 100644 index 000000000..9fbff92a1 --- /dev/null +++ b/module/CaptchaManager.py @@ -0,0 +1,112 @@ +# -*- coding: utf-8 -*- + +""" +    This program is free software; you can redistribute it and/or modify +    it under the terms of the GNU General Public License as published by +    the Free Software Foundation; either version 3 of the License, +    or (at your option) any later version. + +    This program is distributed in the hope that it will be useful, +    but WITHOUT ANY WARRANTY; without even the implied warranty of +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +    See the GNU General Public License for more details. + +    You should have received a copy of the GNU General Public License +    along with this program; if not, see <http://www.gnu.org/licenses/>. 
+     +    @author: mkaay +""" + +from uuid import uuid4 as uuid +from threading import Lock + +class CaptchaManager(): +    def __init__(self, core): +        self.lock = Lock() +        self.core = core +        self.tasks = [] +     +    def newTask(self, plugin): +        task = CaptchaTask(plugin, self) +        self.lock.acquire() +        self.tasks.append(task) +        self.lock.release() +        return task +     +    def removeTask(self, task): +        self.lock.acquire() +        self.tasks.remove(task) +        self.lock.release() +     +    def getTask(self): +        self.lock.acquire() +        for task in self.tasks: +            if task.getStatus() == "waiting": +                self.lock.release() +                return task +        self.lock.release() +        return None +     +    def getTaskFromID(self, tid): +        self.lock.acquire() +        for task in self.tasks: +            if task.getID() == tid: +                self.lock.release() +                return task +        self.lock.release() +        return None + +class CaptchaTask(): +    def __init__(self, plugin, manager): +        self.lock = Lock() +        self.plugin = plugin +        self.manager = manager +        self.captchaImg = None +        self.captchaType = None +        self.result = None +        self.status = "preparing" +        self.id = uuid().hex +     +    def setCaptcha(self, img, imgType): +        self.lock.acquire() +        self.captchaImg = img +        self.captchaType = imgType +        self.lock.release() +     +    def getCaptcha(self): +        return self.captchaImg, self.captchaType +     +    def setResult(self, result): +        self.lock.acquire() +        self.result = result +        self.lock.release() +     +    def getResult(self): +        return self.result +     +    def getID(self): +        return self.id +     +    def getStatus(self): +        return self.status +     +    def setDone(self): +        self.lock.acquire() +        
self.status = "done" +        self.lock.release() +     +    def setWaiting(self): +        self.lock.acquire() +        self.status = "waiting" +        self.lock.release() +     +    def setWatingForUser(self): +        self.lock.acquire() +        self.status = "user" +        self.lock.release() +     +    def removeTask(self): +        self.manager.removeTask(self) +     +    def __str__(self): +        return "<CaptchaTask '%s'>" % (self.getID(),) diff --git a/module/gui/CaptchaDock.py b/module/gui/CaptchaDock.py new file mode 100644 index 000000000..193bac6fa --- /dev/null +++ b/module/gui/CaptchaDock.py @@ -0,0 +1,82 @@ +# -*- coding: utf-8 -*- + +""" +    This program is free software; you can redistribute it and/or modify +    it under the terms of the GNU General Public License as published by +    the Free Software Foundation; either version 3 of the License, +    or (at your option) any later version. + +    This program is distributed in the hope that it will be useful, +    but WITHOUT ANY WARRANTY; without even the implied warranty of +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +    See the GNU General Public License for more details. + +    You should have received a copy of the GNU General Public License +    along with this program; if not, see <http://www.gnu.org/licenses/>. 
+     +    @author: mkaay +""" + +from PyQt4.QtCore import * +from PyQt4.QtGui import * + +class CaptchaDock(QDockWidget): +    def __init__(self): +        QDockWidget.__init__(self, "Captcha") +        self.setObjectName("Captcha Dock") +        self.widget = CaptchaDockWidget(self) +        self.setWidget(self.widget) +        self.setAllowedAreas(Qt.BottomDockWidgetArea) +        self.setFeatures(QDockWidget.NoDockWidgetFeatures) +        self.hide() +        self.processing = False +        self.currentID = None +     +    def isFree(self): +        return not self.processing +     +    def setTask(self, tid, img, imgType): +        self.processing = True +        data = QByteArray(img) +        self.currentID = tid +        self.widget.emit(SIGNAL("setImage"), data) +        self.show() + +class CaptchaDockWidget(QWidget): +    def __init__(self, dock): +        QWidget.__init__(self) +        self.dock = dock +        self.setLayout(QHBoxLayout()) +        layout = self.layout() +         +        imgLabel = QLabel() +        captchaInput = QLineEdit() +        okayButton = QPushButton("OK") +        cancelButton = QPushButton("Cancel") +         +        layout.addWidget(imgLabel) +        layout.addWidget(captchaInput) +        layout.addWidget(okayButton) +        layout.addWidget(cancelButton) +        layout.addStretch() +         +        self.input = captchaInput +         +        self.connect(okayButton, SIGNAL("clicked()"), self.slotSubmit) +        self.connect(captchaInput, SIGNAL("returnPressed()"), self.slotSubmit) +        self.connect(self, SIGNAL("setImage"), self.setImg) +        self.connect(self, SIGNAL("setPixmap(const QPixmap &)"), imgLabel, SLOT("setPixmap(const QPixmap &)")) +     +    def setImg(self, data): +        pixmap = QPixmap() +        pixmap.loadFromData(data) +        self.emit(SIGNAL("setPixmap(const QPixmap &)"), pixmap) +     +    def slotSubmit(self): +        text = self.input.text() +        tid = self.dock.currentID 
+        self.dock.currentID = None +        self.dock.emit(SIGNAL("done"), tid, str(text)) +        self.dock.hide() +        self.dock.processing = False + diff --git a/module/gui/MainWindow.py b/module/gui/MainWindow.py index 6daa0aa55..94ff63dda 100644 --- a/module/gui/MainWindow.py +++ b/module/gui/MainWindow.py @@ -21,6 +21,7 @@ from PyQt4.QtGui import *  from module.gui.PackageDock import *  from module.gui.LinkDock import * +from module.gui.CaptchaDock import CaptchaDock  class MainWindow(QMainWindow):      def __init__(self): @@ -34,7 +35,7 @@ class MainWindow(QMainWindow):          self.resize(850,500)          #layout version -        self.version = 1 +        self.version = 2          #init docks          self.newPackDock = NewPackageDock() @@ -43,6 +44,8 @@ class MainWindow(QMainWindow):          self.addDockWidget(Qt.RightDockWidgetArea, self.newLinkDock)          self.connect(self.newLinkDock, SIGNAL("done"), self.slotAddLinks)          self.connect(self.newPackDock, SIGNAL("done"), self.slotAddPackage) +        self.captchaDock = CaptchaDock() +        self.addDockWidget(Qt.BottomDockWidgetArea, self.captchaDock)          #central widget, layout          self.masterlayout = QVBoxLayout() diff --git a/module/gui/connector.py b/module/gui/connector.py index 2915cf568..432999161 100644 --- a/module/gui/connector.py +++ b/module/gui/connector.py @@ -374,7 +374,7 @@ class connector(QThread):          finally:              self.mutex.unlock() -    def slotPullOutPackage(self, pid): +    def pullOutPackage(self, pid):          """              pull out package          """ @@ -382,7 +382,43 @@ class connector(QThread):          try:              return self.proxy.pull_out_package(pid)          except Exception, e: -            self.emit(SIGNAL("proxy_error"), "slotPullOutPackage", e) +            self.emit(SIGNAL("proxy_error"), "pullOutPackage", e) +        finally: +            self.mutex.unlock() +     +    def captchaWaiting(self): +        """ +       
     is the a captcha waiting? +        """ +        self.mutex.lock() +        try: +            return self.proxy.is_captcha_waiting() +        except Exception, e: +            self.emit(SIGNAL("proxy_error"), "captchaWaiting", e) +        finally: +            self.mutex.unlock() +     +    def getCaptcha(self): +        """ +            get captcha +        """ +        self.mutex.lock() +        try: +            return self.proxy.get_captcha_task() +        except Exception, e: +            self.emit(SIGNAL("proxy_error"), "getCaptcha", e) +        finally: +            self.mutex.unlock() +     +    def setCaptchaResult(self, cid, result): +        """ +            get captcha +        """ +        self.mutex.lock() +        try: +            return self.proxy.set_captcha_result(cid, result) +        except Exception, e: +            self.emit(SIGNAL("proxy_error"), "setCaptchaResult", e)          finally:              self.mutex.unlock() diff --git a/module/plugins/decrypter/SerienjunkiesOrg.py b/module/plugins/decrypter/SerienjunkiesOrg.py new file mode 100644 index 000000000..407d14c53 --- /dev/null +++ b/module/plugins/decrypter/SerienjunkiesOrg.py @@ -0,0 +1,86 @@ +# -*- coding: utf-8 -*- + +import re +from time import sleep + +from module.Plugin import Plugin +from module.BeautifulSoup import BeautifulSoup + +class SerienjunkiesOrg(Plugin): +    def __init__(self, parent): +        Plugin.__init__(self, parent) +        props = {} +        props['name'] = "SerienjunkiesOrg" +        props['type'] = "container" +        props['pattern'] = r"http://.*?serienjunkies.org/.*?" 
+        props['version'] = "0.1" +        props['description'] = """serienjunkies.org Container Plugin""" +        props['author_name'] = ("mkaay") +        props['author_mail'] = ("mkaay@mkaay.de") +        self.props = props +        self.parent = parent +        self.html = None +        self.multi_dl = False +     +    def getSJSrc(self, url): +        src = self.req.load(str(url)) +        if not src.find("Enter Serienjunkies") == -1: +            src = self.req.load(str(url)) +        return src +     +    def file_exists(self): +        return True +     +    def handleEpisode(self, url): +        if not self.parent.core.isGUIConnected(): +            return False +        for i in range(3): +            src = self.getSJSrc(url) +            if not src.find("Du hast das Download-Limit überschritten! Bitte versuche es später nocheinmal.") == -1: +                self.logger.info("Downloadlimit reached") +                return False +            else: +                soup = BeautifulSoup(src) +                form = soup.find("form") +                captchaTag = soup.find(attrs={"src":re.compile("^/secure/")}) +                captchaUrl = "http://download.serienjunkies.org"+captchaTag["src"] +                captchaData = self.req.load(str(captchaUrl)) +                captchaManager = self.parent.core.captchaManager +                task = captchaManager.newTask(self) +                task.setCaptcha(captchaData, "png") +                task.setWaiting() +                while not task.getStatus() == "done": +                    if not self.parent.core.isGUIConnected(): +                        return False +                    sleep(1) +                result = task.getResult() +                url = "http://download.serienjunkies.org"+form["action"] +                sinp = form.find(attrs={"name":"s"}) +                 +                sj = self.req.load(str(url), post={'s': sinp["value"], 'c': result, 'action': "Download"}) +                 +        
        soup = BeautifulSoup(sj) +                rawLinks = soup.findAll(attrs={"action": re.compile("^http://download.serienjunkies.org/")}) +                 +                if not len(rawLinks) > 0: +                    continue +                 +                links = [] +                for link in rawLinks: +                    frameUrl = link["action"].replace("/go-", "/frame/go-") +                    links.append(self.handleFrame(frameUrl)) +                return links +     +    def handleFrame(self, url): +        self.req.load(str(url), cookies=False, just_header=True) +        return self.req.lastEffectiveURL +     +    def proceed(self, url, location): +        links = False +        episodePattern = re.compile("^http://download.serienjunkies.org/f-.*?.html$") +        framePattern = re.compile("^http://download.serienjunkies.org/frame/go-.*?/$") +        if framePattern.match(url): +            links = [self.handleFrame(url)] +        elif episodePattern.match(url): +            links = self.handleEpisode(url) +        self.links = links diff --git a/pyLoadCore.py b/pyLoadCore.py index 6b38d425c..4ca81efed 100755 --- a/pyLoadCore.py +++ b/pyLoadCore.py @@ -50,12 +50,16 @@ from time import sleep  from shutil import copyfile  from tempfile import NamedTemporaryFile
 -from module.file_list import File_List
  from module.network.Request import Request
  import module.remote.SecureXMLRPCServer as Server
 +from module.XMLConfigParser import XMLConfigParser +
 +from module.file_list import File_List
  from module.thread_list import Thread_List
  from module.web.ServerThread import WebServer
 -from module.XMLConfigParser import XMLConfigParser
 +from module.CaptchaManager import CaptchaManager + +from xmlrpclib import Binary
  class Core(object):
      """ pyLoad Core """
 @@ -121,10 +125,13 @@ class Core(object):          self.init_scripts()
          path.append(self.plugin_folder)
          self.create_plugin_index()
 -
 +         +        self.lastGuiConnected = 0 +        
          self.server_methods = ServerMethods(self)
          self.file_list = File_List(self)
 -        self.thread_list = Thread_List(self)
 +        self.thread_list = Thread_List(self) +        self.captchaManager = CaptchaManager(self)
          self.last_update_check = 0          self.update_check_interval = 1800
 @@ -135,7 +142,7 @@ class Core(object):          self.logger.info(_("Downloadtime: %s") % self.server_methods.is_time_download()) # debug only
 -
 +        
          #read url list @mkaay: pid, lid?
          # pid = package id
          # lid = link/file id
 @@ -273,7 +280,10 @@ class Core(object):                      print _("could not create %s: %s") % (description, tmp_name)
                  if essential:
                      exit()
 -
 +     +    def isGUIConnected(self): +        return self.lastGuiConnected+10 > time.time() +    
      def restart(self):
          self.shutdown()          execv(executable, [executable, "pyLoadCore.py"])
 @@ -556,7 +566,30 @@ class ServerMethods():          self.core.file_list.packager.setPackageData(pid, package_name=name)      def pull_out_package(self, pid): -        self.core.file_list.packager.pullOutPackage(pid)
 +        self.core.file_list.packager.pullOutPackage(pid) +     +    def is_captcha_waiting(self): +        self.core.lastGuiConnected = time.time() +        task = self.core.captchaManager.getTask() +        return not task == None +     +    def get_captcha_task(self): +        task = self.core.captchaManager.getTask() +        if task: +            task.setWatingForUser() +            c = task.getCaptcha() +            return str(task.getID()), Binary(c[0]), str(c[1]) +        else: +            return None, None, None +     +    def set_captcha_result(self, tid, result): +        task = self.core.captchaManager.getTaskFromID(tid) +        if task: +            task.setResult(result) +            task.setDone() +            return True +        else: +            return False      #def move_urls_up(self, ids):
      #    for id in ids:
 diff --git a/pyLoadGui.py b/pyLoadGui.py index dab3ef18b..8fff8e8c9 100755 --- a/pyLoadGui.py +++ b/pyLoadGui.py @@ -62,6 +62,7 @@ class main(QObject):          self.checkClipboard = False          default = self.refreshConnections()          self.connData = None +        self.captchaProcessing = False          if not first:              self.connWindow.show()          else: @@ -131,6 +132,7 @@ class main(QObject):          self.connect(self.mainWindow, SIGNAL("setClipboardStatus"), self.slotSetClipboardStatus)          self.connect(self.mainWindow, SIGNAL("changePackageName"), self.slotChangePackageName)          self.connect(self.mainWindow, SIGNAL("pullOutPackage"), self.slotPullOutPackage) +        self.connect(self.mainWindow.captchaDock, SIGNAL("done"), self.slotCaptchaDone)      def slotShowConnector(self):          """ @@ -503,6 +505,7 @@ class main(QObject):          geo = str(nodes["geometry"].text())          self.mainWindow.restoreWindow(state, geo) +        self.mainWindow.captchaDock.hide()      def slotPushPackageToQueue(self, id):          """ @@ -566,7 +569,15 @@ class main(QObject):              pull package out of the queue          """          if isPack: -            self.connector.slotPullOutPackage(pid) +            self.connector.pullOutPackage(pid) +     +    def checkCaptcha(self): +        if self.connector.captchaWaiting() and self.mainWindow.captchaDock.isFree(): +            cid, img, imgType = self.connector.getCaptcha() +            self.mainWindow.captchaDock.setTask(cid, str(img), imgType) +     +    def slotCaptchaDone(self, cid, result): +        print self.connector.setCaptchaResult(str(cid), str(result))      class Loop(QThread):          """ @@ -590,6 +601,7 @@ class main(QObject):              self.parent.refreshServerStatus()              self.parent.refreshLog()              self.parent.updateAvailable() +            self.parent.checkCaptcha()          def stop(self):              self.running = False | 
