diff options
Diffstat (limited to 'module/plugins/ocr')
| -rw-r--r-- | module/plugins/ocr/GigasizeCom.py | 20 | ||||
| -rw-r--r-- | module/plugins/ocr/LinksaveIn.py | 149 | ||||
| -rw-r--r-- | module/plugins/ocr/NetloadIn.py | 26 | ||||
| -rw-r--r-- | module/plugins/ocr/ShareonlineBiz.py | 53 | ||||
| -rw-r--r-- | module/plugins/ocr/__init__.py | 0 | 
5 files changed, 248 insertions, 0 deletions
# ---------------------------------------------------------------------------
# diff --git a/module/plugins/ocr/GigasizeCom.py b/module/plugins/ocr/GigasizeCom.py
# new file mode 100644, index 000000000..8f9d78710
# ---------------------------------------------------------------------------
# -*- coding: utf-8 -*-

from module.plugins.OCR import OCR


class GigasizeCom(OCR):
    """Captcha recognition for gigasize.com, built on the shared OCR base."""

    def __init__(self):
        OCR.__init__(self)

    def get_captcha(self, image):
        """Recognize the captcha stored in the file ``image``.

        Loads the image, applies the base class ``threshold`` filter with a
        value of 2.8 and then calls the base class ``run_tesser``.

        NOTE(review): the meaning of the four ``run_tesser`` flags and of the
        threshold value is defined by the ``OCR`` base class, which is not
        part of this diff.

        Returns ``self.result_captcha`` (presumably filled in by
        ``run_tesser``).
        """
        self.load_image(image)
        self.threshold(2.8)
        self.run_tesser(True, False, False, True)
        return self.result_captcha


if __name__ == '__main__':
    # Manual smoke test: fetch a live captcha and print the recognized text.
    ocr = GigasizeCom()
    import urllib
    urllib.urlretrieve('http://www.gigasize.com/randomImage.php', "gigasize_tmp.jpg")

    print(ocr.get_captcha('gigasize_tmp.jpg'))


# ---------------------------------------------------------------------------
# diff --git a/module/plugins/ocr/LinksaveIn.py b/module/plugins/ocr/LinksaveIn.py
# new file mode 100644, index 000000000..0ddd50a50
# ---------------------------------------------------------------------------
# -*- coding: utf-8 -*-

from module.plugins.OCR import OCR
from PIL import Image
from os import sep
from os.path import dirname
from os.path import abspath
from glob import glob


def _palette_lut(img):
    """Return the colour table of ``img`` as a list of 256 RGB tuples.

    A 256x1 copy of the image is filled with the pixel values 0..255 and
    converted to RGB, so entry ``i`` of the result is the RGB colour that
    pixel value ``i`` is rendered as.

    NOTE(review): assumes ``img`` is a palette-based image (e.g. a GIF or the
    result of ``convert("P")``), as in every call below.
    """
    lut = img.resize((256, 1))
    lut.putdata(range(256))
    return list(lut.convert("RGB").getdata())


class LinksaveIn(OCR):
    """Captcha recognition for linksave.in.

    Intermediate images and the known backgrounds (``bg/*.gif``) live in the
    ``LinksaveIn`` directory next to this module (``self.data_dir``).
    """
    __name__ = "LinksaveIn"

    def __init__(self):
        OCR.__init__(self)
        # Trailing separator included, so file names can simply be appended.
        self.data_dir = dirname(abspath(__file__)) + sep + "LinksaveIn" + sep

    def load_image(self, image):
        """Merge all frames of the captcha file ``image`` into one RGB image.

        Every frame is visited in turn; each pixel whose colour is not pure
        black is copied into the result, so later frames overwrite earlier
        ones only where they are not black.  The colour table is built once,
        before the first ``seek``, and used for all frames.

        Side effects: writes ``unblacked.png`` into ``self.data_dir`` and
        sets ``self.image``, ``self.pixels`` and ``self.result_captcha``.
        """
        im = Image.open(image)
        lut = _palette_lut(im)

        new = Image.new("RGB", im.size)
        npix = new.load()
        frame_nr = 0
        while True:
            try:
                im.seek(frame_nr)
            except EOFError:
                # No more frames in the file.
                break
            frame = im.copy()
            pix = frame.load()
            for x in xrange(frame.size[0]):
                for y in xrange(frame.size[1]):
                    colour = lut[pix[x, y]]
                    if colour != (0, 0, 0):
                        npix[x, y] = colour
            frame_nr += 1
        new.save(self.data_dir+"unblacked.png")
        self.image = new.copy()
        self.pixels = self.image.load()
        self.result_captcha = ''

    def get_bg(self):
        """Return the path of the background image that best fits the captcha.

        Each ``bg/*.gif`` below ``self.data_dir`` is compared pixel by pixel
        with the current image; the file with the most identically coloured
        pixels wins.  Returns ``""`` when there is no background file or no
        file shares a single pixel with the captcha.
        """
        # The captcha's colour table and pixel access do not depend on the
        # background being tested, so build them once up front.
        img = self.image.convert("P")
        lut = _palette_lut(img)
        pix = img.load()

        stat = {}
        for bgpath in glob(self.data_dir+"bg/*.gif"):
            stat[bgpath] = 0
            bg = Image.open(bgpath)
            bglut = _palette_lut(bg)
            bgpix = bg.load()
            for x in xrange(bg.size[0]):
                for y in xrange(bg.size[1]):
                    if bglut[bgpix[x, y]] == lut[pix[x, y]]:
                        stat[bgpath] += 1

        max_p = 0
        bg = ""
        for bgpath, value in stat.items():
            if max_p < value:
                bg = bgpath
                max_p = value
        return bg

    def substract_bg(self, bgpath):
        """Blank out the background ``bgpath`` in the current image.

        Every pixel of ``self.image`` whose colour equals the colour of the
        background image at the same position is set to white.  The image is
        modified in place.
        """
        bg = Image.open(bgpath)
        img = self.image.convert("P")

        bglut = _palette_lut(bg)
        lut = _palette_lut(img)

        bgpix = bg.load()
        pix = img.load()
        orgpix = self.image.load()
        for x in xrange(bg.size[0]):
            for y in xrange(bg.size[1]):
                if lut[pix[x, y]] == bglut[bgpix[x, y]]:
                    orgpix[x, y] = (255, 255, 255)

    def eval_black_white(self):
        """Replace ``self.image`` by a 140x75 black/white version of itself.

        A pixel becomes black when one colour channel stands out: red, green
        or blue exceeds both other channels by more than ``thresh``, or green
        is lower than both red and blue.  All other pixels become white.
        ``self.pixels`` is refreshed to point at the new image.
        """
        new = Image.new("RGB", (140, 75))
        pix = new.load()
        orgpix = self.image.load()
        thresh = 4
        for x in xrange(new.size[0]):
            for y in xrange(new.size[1]):
                r, g, b = orgpix[x, y]
                coloured = (r > max(b, g)+thresh
                            or g < min(r, b)
                            or g > max(r, b)+thresh
                            or b > max(r, g)+thresh)
                pix[x, y] = (0, 0, 0) if coloured else (255, 255, 255)
        self.image = new
        self.pixels = self.image.load()

    def get_captcha(self, image):
        """Recognize the captcha stored in the file ``image``.

        Pipeline: merge the frames, remove the best matching background,
        reduce to black/white, then use the base class helpers
        (``to_greyscale``, ``clean``, ``split_captcha_letters``,
        ``run_tesser``) to clean the image, split it into letters and
        recognize each letter on its own.

        The intermediate images ``cleaned_pass1.png``, ``cleaned_pass2.png``
        and ``letter<n>.png`` are written into ``self.data_dir``.

        Returns the concatenated recognition results of all letters.
        """
        self.load_image(image)
        bg = self.get_bg()
        self.substract_bg(bg)
        self.eval_black_white()
        self.to_greyscale()
        self.image.save(self.data_dir+"cleaned_pass1.png")
        self.clean(4)
        self.clean(4)
        self.image.save(self.data_dir+"cleaned_pass2.png")
        letters = self.split_captcha_letters()
        final = ""
        for n, letter in enumerate(letters):
            self.image = letter
            # Use self.data_dir: the module-level name ``ocr`` only exists
            # when this file is executed as a script.
            self.image.save(self.data_dir+"letter%d.png" % n)
            self.run_tesser(True, True, False, False)
            final += self.result_captcha

        return final


if __name__ == '__main__':
    # Manual smoke test: fetch a captcha and print the recognized text.
    import urllib
    ocr = LinksaveIn()
    testurl = "http://linksave.in/captcha/cap.php?hsh=2229185&code=ZzHdhl3UffV3lXTH5U4b7nShXj%2Bwma1vyoNBcbc6lcc%3D"
    urllib.urlretrieve(testurl, ocr.data_dir+"captcha.gif")

    print(ocr.get_captcha(ocr.data_dir+'captcha.gif'))


# ---------------------------------------------------------------------------
# diff --git a/module/plugins/ocr/NetloadIn.py b/module/plugins/ocr/NetloadIn.py
# new file mode 100644, index 000000000..9fc2f0725
# ---------------------------------------------------------------------------
# -*- coding: utf-8 -*-

from module.plugins.OCR import OCR


class NetloadIn(OCR):
    """Captcha recognition for netload.in, built on the shared OCR base."""
    __name__ = "NetloadIn"

    def __init__(self):
        OCR.__init__(self)

    def get_captcha(self, image):
        """Recognize the captcha stored in the file ``image``.

        The image is converted to greyscale and cleaned twice with the base
        class helpers before ``run_tesser`` is called.  Spaces are removed
        from the result and it is truncated to its first four characters.

        Returns the trimmed result, which is also kept in
        ``self.result_captcha``.
        """
        self.load_image(image)
        self.to_greyscale()
        self.clean(3)
        self.clean(3)
        self.run_tesser(True, True, False, False)

        self.result_captcha = self.result_captcha.replace(" ", "")[:4] # cut to 4 numbers

        return self.result_captcha


if __name__ == '__main__':
    # Manual smoke test: fetch a live captcha and print the recognized text.
    import urllib
    ocr = NetloadIn()
    urllib.urlretrieve("http://netload.in/share/includes/captcha.php", "captcha.png")

    print(ocr.get_captcha('captcha.png'))


# ---------------------------------------------------------------------------
# diff --git a/module/plugins/ocr/ShareonlineBiz.py b/module/plugins/ocr/ShareonlineBiz.py
# new file mode 100644, index 000000000..db72449d1
# ---------------------------------------------------------------------------
# -*- coding: utf-8 -*-

#
#Copyright (C) 2009 kingzero, RaNaN
#
#This program is free software; you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation; either version 3 of the License,
#or (at your option) any later version.
#
#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#See the GNU General Public License for more details.
#
#You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.
#
###
from module.plugins.OCR import OCR


class ShareonlineBiz(OCR):
    """Captcha recognition for share-online.biz, built on the OCR base."""
    __name__ = "ShareonlineBiz"

    def __init__(self):
        OCR.__init__(self)

    def get_captcha(self, image):
        """Recognize the captcha stored in the file ``image``.

        The image is converted to greyscale, resized to 160x50 and passed
        through the base class ``threshold`` filter (value 1.85).  It is then
        split into letters, and ``run_tesser`` is called once per letter.

        Returns the concatenated recognition results of all letters.
        """
        self.load_image(image)
        self.to_greyscale()
        self.image = self.image.resize((160, 50))
        # resize() returned a new image, so the pixel access must be renewed.
        self.pixels = self.image.load()
        self.threshold(1.85)
        #self.eval_black_white(240)
        #self.derotate_by_average()

        letters = self.split_captcha_letters()

        final = ""
        for letter in letters:
            self.image = letter
            self.run_tesser(True, True, False, False)
            final += self.result_captcha

        return final

        #tesseract at 60%


if __name__ == '__main__':
    # Manual smoke test: fetch a live captcha and print the recognized text.
    import urllib
    ocr = ShareonlineBiz()
    urllib.urlretrieve("http://www.share-online.biz/captcha.php", "captcha.jpeg")
    print(ocr.get_captcha('captcha.jpeg'))


# ---------------------------------------------------------------------------
# diff --git a/module/plugins/ocr/__init__.py b/module/plugins/ocr/__init__.py
# new file mode 100644, index 000000000..e69de29bb  (empty file)
# ---------------------------------------------------------------------------
