From 7dc0b955b6e3d4448c173447c700717e8cdbbe95 Mon Sep 17 00:00:00 2001 From: Walter Purcaro Date: Sun, 14 Jun 2015 19:43:37 +0200 Subject: Move OCR to internal plugin folder --- module/plugins/internal/OCR.py | 319 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 319 insertions(+) create mode 100644 module/plugins/internal/OCR.py (limited to 'module/plugins/internal/OCR.py') diff --git a/module/plugins/internal/OCR.py b/module/plugins/internal/OCR.py new file mode 100644 index 000000000..1874ba07d --- /dev/null +++ b/module/plugins/internal/OCR.py @@ -0,0 +1,319 @@ +# -*- coding: utf-8 -*- + +from __future__ import with_statement + +try: + from PIL import Image, GifImagePlugin, JpegImagePlugin, PngImagePlugin, TiffImagePlugin + +except ImportError: + import Image, GifImagePlugin, JpegImagePlugin, PngImagePlugin, TiffImagePlugin + +import logging +import os +import subprocess +#import tempfile + +from module.utils import save_join + + +class OCR(object): + __name__ = "OCR" + __type__ = "ocr" + __version__ = "0.11" + + __description__ = """OCR base plugin""" + __license__ = "GPLv3" + __authors__ = [("pyLoad Team", "admin@pyload.org")] + + + def __init__(self): + self.logger = logging.getLogger("log") + + + def load_image(self, image): + self.image = Image.open(image) + self.pixels = self.image.load() + self.result_captcha = '' + + + def unload(self): + """delete all tmp images""" + pass + + + def threshold(self, value): + self.image = self.image.point(lambda a: a * value + 10) + + + def run(self, command): + """Run a command""" + + popen = subprocess.Popen(command, bufsize = -1, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + popen.wait() + output = popen.stdout.read() +" | "+ popen.stderr.read() + popen.stdout.close() + popen.stderr.close() + self.logger.debug("Tesseract ReturnCode %s Output: %s" % (popen.returncode, output)) + + + def run_tesser(self, subset=False, digits=True, lowercase=True, uppercase=True): + #tmpTif = tempfile.NamedTemporaryFile(suffix=".tif") + try: + tmpTif = open(save_join("tmp", "tmpTif_%s.tif" % self.__name__), "wb") + tmpTif.close() + + #tmpTxt = tempfile.NamedTemporaryFile(suffix=".txt") + tmpTxt = open(save_join("tmp", "tmpTxt_%s.txt" % self.__name__), "wb") + tmpTxt.close() + + except IOError, e: + self.logError(e) + return + + self.logger.debug("save tiff") + self.image.save(tmpTif.name, 'TIFF') + + if os.name == "nt": + tessparams = [os.path.join(pypath, "tesseract", "tesseract.exe")] + else: + tessparams = ["tesseract"] + + tessparams.extend( [os.path.abspath(tmpTif.name), os.path.abspath(tmpTxt.name).replace(".txt", "")] ) + + if subset and (digits or lowercase or uppercase): + #tmpSub = tempfile.NamedTemporaryFile(suffix=".subset") + with open(save_join("tmp", "tmpSub_%s.subset" % self.__name__), "wb") as tmpSub: + tmpSub.write("tessedit_char_whitelist ") + + if digits: + tmpSub.write("0123456789") + if lowercase: + tmpSub.write("abcdefghijklmnopqrstuvwxyz") + if uppercase: + tmpSub.write("ABCDEFGHIJKLMNOPQRSTUVWXYZ") + + tmpSub.write("\n") + tessparams.append("nobatch") + tessparams.append(os.path.abspath(tmpSub.name)) + + self.logger.debug("run tesseract") + self.run(tessparams) + self.logger.debug("read txt") + + try: + with open(tmpTxt.name, 'r') as f: + self.result_captcha = f.read().replace("\n", "") + except Exception: + self.result_captcha = "" + + self.logger.debug(self.result_captcha) + try: + os.remove(tmpTif.name) + os.remove(tmpTxt.name) + if subset and (digits or lowercase or uppercase): + os.remove(tmpSub.name) + except Exception: + pass + + + def get_captcha(self, name): + raise NotImplementedError + + + def to_greyscale(self): + if self.image.mode != 'L': + self.image = self.image.convert('L') + + self.pixels = self.image.load() + + + def eval_black_white(self, limit): + self.pixels = self.image.load() + w, h = self.image.size + for x in xrange(w): + for y in xrange(h): + if self.pixels[x, y] > limit: + self.pixels[x, y] = 255 + else: + self.pixels[x, y] = 0 + + + def clean(self, allowed): + pixels = self.pixels + + w, h = self.image.size + + for x in xrange(w): + for y in xrange(h): + if pixels[x, y] == 255: + continue + # No point in processing white pixels since we only want to remove black pixel + count = 0 + + try: + if pixels[x-1, y-1] != 255: + count += 1 + if pixels[x-1, y] != 255: + count += 1 + if pixels[x-1, y + 1] != 255: + count += 1 + if pixels[x, y + 1] != 255: + count += 1 + if pixels[x + 1, y + 1] != 255: + count += 1 + if pixels[x + 1, y] != 255: + count += 1 + if pixels[x + 1, y-1] != 255: + count += 1 + if pixels[x, y-1] != 255: + count += 1 + except Exception: + pass + + # not enough neighbors are dark pixels so mark this pixel + # to be changed to white + if count < allowed: + pixels[x, y] = 1 + + # second pass: this time set all 1's to 255 (white) + for x in xrange(w): + for y in xrange(h): + if pixels[x, y] == 1: + pixels[x, y] = 255 + + self.pixels = pixels + + + def derotate_by_average(self): + """rotate by checking each angle and guess most suitable""" + + w, h = self.image.size + pixels = self.pixels + + for x in xrange(w): + for y in xrange(h): + if pixels[x, y] == 0: + pixels[x, y] = 155 + + highest = {} + counts = {} + + for angle in xrange(-45, 45): + + tmpimage = self.image.rotate(angle) + + pixels = tmpimage.load() + + w, h = self.image.size + + for x in xrange(w): + for y in xrange(h): + if pixels[x, y] == 0: + pixels[x, y] = 255 + + + count = {} + + for x in xrange(w): + count[x] = 0 + for y in xrange(h): + if pixels[x, y] == 155: + count[x] += 1 + + sum = 0 + cnt = 0 + + for x in count.values(): + if x != 0: + sum += x + cnt += 1 + + avg = sum / cnt + counts[angle] = cnt + highest[angle] = 0 + for x in count.values(): + if x > highest[angle]: + highest[angle] = x + + highest[angle] = highest[angle] - avg + + hkey = 0 + hvalue = 0 + + for key, value in highest.iteritems(): + if value > hvalue: + hkey = key + hvalue = value + + self.image = self.image.rotate(hkey) + pixels = self.image.load() + + for x in xrange(w): + for y in xrange(h): + if pixels[x, y] == 0: + pixels[x, y] = 255 + + if pixels[x, y] == 155: + pixels[x, y] = 0 + + self.pixels = pixels + + + def split_captcha_letters(self): + captcha = self.image + started = False + letters = [] + width, height = captcha.size + bottomY, topY = 0, height + pixels = captcha.load() + + for x in xrange(width): + black_pixel_in_col = False + for y in xrange(height): + if pixels[x, y] != 255: + if not started: + started = True + firstX = x + lastX = x + + if y > bottomY: + bottomY = y + if y < topY: + topY = y + if x > lastX: + lastX = x + + black_pixel_in_col = True + + if black_pixel_in_col is False and started is True: + rect = (firstX, topY, lastX, bottomY) + new_captcha = captcha.crop(rect) + + w, h = new_captcha.size + if w > 5 and h > 5: + letters.append(new_captcha) + + started = False + bottomY, topY = 0, height + + return letters + + + def correct(self, values, var=None): + if var: + result = var + else: + result = self.result_captcha + + for key, item in values.iteritems(): + + if key.__class__ == str: + result = result.replace(key, item) + else: + for expr in key: + result = result.replace(expr, item) + + if var: + return result + else: + self.result_captcha = result -- cgit v1.2.3 From c9144f451b74e4d3cc67935b9e73c662ac870c6e Mon Sep 17 00:00:00 2001 From: Walter Purcaro Date: Mon, 15 Jun 2015 07:18:39 +0200 Subject: Hook plugin code cosmetics (2) --- module/plugins/internal/OCR.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'module/plugins/internal/OCR.py') diff --git a/module/plugins/internal/OCR.py b/module/plugins/internal/OCR.py index 1874ba07d..2349d32af 100644 --- a/module/plugins/internal/OCR.py +++ b/module/plugins/internal/OCR.py @@ -36,7 +36,7 @@ class OCR(object): self.result_captcha = '' - def unload(self): + def deactivate(self): """delete all tmp images""" pass -- cgit v1.2.3 From 5a139055ae658d3a05cbb658cbd66aeae0d01db5 Mon Sep 17 00:00:00 2001 From: Walter Purcaro Date: Mon, 15 Jun 2015 21:06:10 +0200 Subject: Spare code cosmetics --- module/plugins/internal/OCR.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'module/plugins/internal/OCR.py') diff --git a/module/plugins/internal/OCR.py b/module/plugins/internal/OCR.py index 2349d32af..1782e17f0 100644 --- a/module/plugins/internal/OCR.py +++ b/module/plugins/internal/OCR.py @@ -13,7 +13,7 @@ import os import subprocess #import tempfile -from module.utils import save_join +from module.utils import save_join as fs_join class OCR(object): @@ -59,11 +59,11 @@ class OCR(object): def run_tesser(self, subset=False, digits=True, lowercase=True, uppercase=True): #tmpTif = tempfile.NamedTemporaryFile(suffix=".tif") try: - tmpTif = open(save_join("tmp", "tmpTif_%s.tif" % self.__name__), "wb") + tmpTif = open(fs_join("tmp", "tmpTif_%s.tif" % self.__name__), "wb") tmpTif.close() #tmpTxt = tempfile.NamedTemporaryFile(suffix=".txt") - tmpTxt = open(save_join("tmp", "tmpTxt_%s.txt" % self.__name__), "wb") + tmpTxt = open(fs_join("tmp", "tmpTxt_%s.txt" % self.__name__), "wb") tmpTxt.close() except IOError, e: @@ -82,7 +82,7 @@ class OCR(object): if subset and (digits or lowercase or uppercase): #tmpSub = tempfile.NamedTemporaryFile(suffix=".subset") - with open(save_join("tmp", "tmpSub_%s.subset" % self.__name__), "wb") as tmpSub: + with open(fs_join("tmp", "tmpSub_%s.subset" % self.__name__), "wb") as tmpSub: tmpSub.write("tessedit_char_whitelist ") if digits: -- cgit v1.2.3 From d99d6eddb6af637580bb6fc72013f913077525d6 Mon Sep 17 00:00:00 2001 From: Walter Purcaro Date: Wed, 17 Jun 2015 11:23:08 +0200 Subject: Spare fixes --- module/plugins/internal/OCR.py | 43 ++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) (limited to 'module/plugins/internal/OCR.py') diff --git a/module/plugins/internal/OCR.py b/module/plugins/internal/OCR.py index 1782e17f0..2d41ab39e 100644 --- a/module/plugins/internal/OCR.py +++ b/module/plugins/internal/OCR.py @@ -11,12 +11,13 @@ except ImportError: import logging import os import subprocess -#import tempfile +# import tempfile +from module.plugins.internal.Plugin import Plugin from module.utils import save_join as fs_join -class OCR(object): +class OCR(Plugin): __name__ = "OCR" __type__ = "ocr" __version__ = "0.11" @@ -37,7 +38,7 @@ class OCR(object): def deactivate(self): - """delete all tmp images""" + """Delete all tmp images""" pass @@ -48,21 +49,21 @@ class OCR(object): def run(self, command): """Run a command""" - popen = subprocess.Popen(command, bufsize = -1, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + popen = subprocess.Popen(command, bufsize=-1, stdout=subprocess.PIPE, stderr=subprocess.PIPE) popen.wait() - output = popen.stdout.read() +" | "+ popen.stderr.read() + output = popen.stdout.read() + " | " + popen.stderr.read() popen.stdout.close() popen.stderr.close() self.logger.debug("Tesseract ReturnCode %s Output: %s" % (popen.returncode, output)) - def run_tesser(self, subset=False, digits=True, lowercase=True, uppercase=True): - #tmpTif = tempfile.NamedTemporaryFile(suffix=".tif") + def run_tesser(self, subset=False, digits=True, lowercase=True, uppercase=True, pagesegmode=None): + # tmpTif = tempfile.NamedTemporaryFile(suffix=".tif") try: tmpTif = open(fs_join("tmp", "tmpTif_%s.tif" % self.__name__), "wb") tmpTif.close() - #tmpTxt = tempfile.NamedTemporaryFile(suffix=".txt") + # tmpTxt = tempfile.NamedTemporaryFile(suffix=".txt") tmpTxt = open(fs_join("tmp", "tmpTxt_%s.txt" % self.__name__), "wb") tmpTxt.close() @@ -78,10 +79,13 @@ class OCR(object): else: tessparams = ["tesseract"] - tessparams.extend( [os.path.abspath(tmpTif.name), os.path.abspath(tmpTxt.name).replace(".txt", "")] ) + tessparams.extend([os.path.abspath(tmpTif.name), os.path.abspath(tmpTxt.name).replace(".txt", "")]) + + if pagesegmode: + tessparams.extend(["-psm", str(pagesegmode)]) if subset and (digits or lowercase or uppercase): - #tmpSub = tempfile.NamedTemporaryFile(suffix=".subset") + # tmpSub = tempfile.NamedTemporaryFile(suffix=".subset") with open(fs_join("tmp", "tmpSub_%s.subset" % self.__name__), "wb") as tmpSub: tmpSub.write("tessedit_char_whitelist ") @@ -151,11 +155,11 @@ class OCR(object): count = 0 try: - if pixels[x-1, y-1] != 255: + if pixels[x - 1, y - 1] != 255: count += 1 - if pixels[x-1, y] != 255: + if pixels[x - 1, y] != 255: count += 1 - if pixels[x-1, y + 1] != 255: + if pixels[x - 1, y + 1] != 255: count += 1 if pixels[x, y + 1] != 255: count += 1 @@ -163,19 +167,19 @@ class OCR(object): count += 1 if pixels[x + 1, y] != 255: count += 1 - if pixels[x + 1, y-1] != 255: + if pixels[x + 1, y - 1] != 255: count += 1 - if pixels[x, y-1] != 255: + if pixels[x, y - 1] != 255: count += 1 except Exception: pass - # not enough neighbors are dark pixels so mark this pixel - # to be changed to white + # not enough neighbors are dark pixels so mark this pixel + # to be changed to white if count < allowed: pixels[x, y] = 1 - # second pass: this time set all 1's to 255 (white) + # second pass: this time set all 1's to 255 (white) for x in xrange(w): for y in xrange(h): if pixels[x, y] == 1: @@ -185,7 +189,7 @@ class OCR(object): def derotate_by_average(self): - """rotate by checking each angle and guess most suitable""" + """Rotate by checking each angle and guess most suitable""" w, h = self.image.size pixels = self.pixels @@ -211,7 +215,6 @@ class OCR(object): if pixels[x, y] == 0: pixels[x, y] = 255 - count = {} for x in xrange(w): -- cgit v1.2.3 From 164512b6a74c94a731fcee7435dce1ccfa2f71e7 Mon Sep 17 00:00:00 2001 From: Walter Purcaro Date: Wed, 17 Jun 2015 18:29:50 +0200 Subject: Spare code cosmetics --- module/plugins/internal/OCR.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'module/plugins/internal/OCR.py') diff --git a/module/plugins/internal/OCR.py b/module/plugins/internal/OCR.py index 2d41ab39e..5fe6f2532 100644 --- a/module/plugins/internal/OCR.py +++ b/module/plugins/internal/OCR.py @@ -38,7 +38,9 @@ class OCR(Plugin): def deactivate(self): - """Delete all tmp images""" + """ + Delete all tmp images + """ pass @@ -47,8 +49,9 @@ class OCR(Plugin): def run(self, command): - """Run a command""" - + """ + Run a command + """ popen = subprocess.Popen(command, bufsize=-1, stdout=subprocess.PIPE, stderr=subprocess.PIPE) popen.wait() output = popen.stdout.read() + " | " + popen.stderr.read() @@ -189,8 +192,9 @@ class OCR(Plugin): def derotate_by_average(self): - """Rotate by checking each angle and guess most suitable""" - + """ + Rotate by checking each angle and guess most suitable + """ w, h = self.image.size pixels = self.pixels -- cgit v1.2.3 From 20b6a2ec022202b0efb6cb69415239fb8f4d1445 Mon Sep 17 00:00:00 2001 From: Walter Purcaro Date: Wed, 17 Jun 2015 18:59:20 +0200 Subject: Spare code cosmetics (2) --- module/plugins/internal/OCR.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'module/plugins/internal/OCR.py') diff --git a/module/plugins/internal/OCR.py b/module/plugins/internal/OCR.py index 5fe6f2532..0191a4938 100644 --- a/module/plugins/internal/OCR.py +++ b/module/plugins/internal/OCR.py @@ -61,12 +61,12 @@ class OCR(Plugin): def run_tesser(self, subset=False, digits=True, lowercase=True, uppercase=True, pagesegmode=None): - # tmpTif = tempfile.NamedTemporaryFile(suffix=".tif") + #: tmpTif = tempfile.NamedTemporaryFile(suffix=".tif") try: tmpTif = open(fs_join("tmp", "tmpTif_%s.tif" % self.__name__), "wb") tmpTif.close() - # tmpTxt = tempfile.NamedTemporaryFile(suffix=".txt") + #: tmpTxt = tempfile.NamedTemporaryFile(suffix=".txt") tmpTxt = open(fs_join("tmp", "tmpTxt_%s.txt" % self.__name__), "wb") tmpTxt.close() @@ -88,7 +88,7 @@ class OCR(Plugin): tessparams.extend(["-psm", str(pagesegmode)]) if subset and (digits or lowercase or uppercase): - # tmpSub = tempfile.NamedTemporaryFile(suffix=".subset") + #: tmpSub = tempfile.NamedTemporaryFile(suffix=".subset") with open(fs_join("tmp", "tmpSub_%s.subset" % self.__name__), "wb") as tmpSub: tmpSub.write("tessedit_char_whitelist ") @@ -154,7 +154,7 @@ class OCR(Plugin): for y in xrange(h): if pixels[x, y] == 255: continue - # No point in processing white pixels since we only want to remove black pixel + #: No point in processing white pixels since we only want to remove black pixel count = 0 try: @@ -177,12 +177,12 @@ class OCR(Plugin): except Exception: pass - # not enough neighbors are dark pixels so mark this pixel - # to be changed to white + #: not enough neighbors are dark pixels so mark this pixel + #: to be changed to white if count < allowed: pixels[x, y] = 1 - # second pass: this time set all 1's to 255 (white) + #: second pass: this time set all 1's to 255 (white) for x in xrange(w): for y in xrange(h): if pixels[x, y] == 1: -- cgit v1.2.3 From b1759bc440cd6013837697eb8de540914f693ffd Mon Sep 17 00:00:00 2001 From: Walter Purcaro Date: Tue, 7 Jul 2015 01:23:55 +0200 Subject: No camelCase style anymore --- module/plugins/internal/OCR.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'module/plugins/internal/OCR.py') diff --git a/module/plugins/internal/OCR.py b/module/plugins/internal/OCR.py index 0191a4938..880f8b570 100644 --- a/module/plugins/internal/OCR.py +++ b/module/plugins/internal/OCR.py @@ -20,7 +20,7 @@ from module.utils import save_join as fs_join class OCR(Plugin): __name__ = "OCR" __type__ = "ocr" - __version__ = "0.11" + __version__ = "0.12" __description__ = """OCR base plugin""" __license__ = "GPLv3" @@ -71,7 +71,7 @@ class OCR(Plugin): tmpTxt.close() except IOError, e: - self.logError(e) + self.log_error(e) return self.logger.debug("save tiff") -- cgit v1.2.3 From dad722ac7255640e7e0541c4094a4d2e4de79cd3 Mon Sep 17 00:00:00 2001 From: Walter Purcaro Date: Sun, 19 Jul 2015 00:05:58 +0200 Subject: Code cosmetics (2) --- module/plugins/internal/OCR.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'module/plugins/internal/OCR.py') diff --git a/module/plugins/internal/OCR.py b/module/plugins/internal/OCR.py index 880f8b570..9ddca0315 100644 --- a/module/plugins/internal/OCR.py +++ b/module/plugins/internal/OCR.py @@ -61,12 +61,12 @@ class OCR(Plugin): def run_tesser(self, subset=False, digits=True, lowercase=True, uppercase=True, pagesegmode=None): - #: tmpTif = tempfile.NamedTemporaryFile(suffix=".tif") + # tmpTif = tempfile.NamedTemporaryFile(suffix=".tif") try: tmpTif = open(fs_join("tmp", "tmpTif_%s.tif" % self.__name__), "wb") tmpTif.close() - #: tmpTxt = tempfile.NamedTemporaryFile(suffix=".txt") + # tmpTxt = tempfile.NamedTemporaryFile(suffix=".txt") tmpTxt = open(fs_join("tmp", "tmpTxt_%s.txt" % self.__name__), "wb") tmpTxt.close() @@ -88,7 +88,7 @@ class OCR(Plugin): tessparams.extend(["-psm", str(pagesegmode)]) if subset and (digits or lowercase or uppercase): - #: tmpSub = tempfile.NamedTemporaryFile(suffix=".subset") + # tmpSub = tempfile.NamedTemporaryFile(suffix=".subset") with open(fs_join("tmp", "tmpSub_%s.subset" % self.__name__), "wb") as tmpSub: tmpSub.write("tessedit_char_whitelist ") @@ -177,12 +177,12 @@ class OCR(Plugin): except Exception: pass - #: not enough neighbors are dark pixels so mark this pixel - #: to be changed to white + #: Not enough neighbors are dark pixels so mark this pixel + #: To be changed to white if count < allowed: pixels[x, y] = 1 - #: second pass: this time set all 1's to 255 (white) + #: Second pass: this time set all 1's to 255 (white) for x in xrange(w): for y in xrange(h): if pixels[x, y] == 1: -- cgit v1.2.3 From 502517f37c7540b0bddb092e69386d9d6f08800c Mon Sep 17 00:00:00 2001 From: Walter Purcaro Date: Sun, 19 Jul 2015 09:42:34 +0200 Subject: Fix addons --- module/plugins/internal/OCR.py | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'module/plugins/internal/OCR.py') diff --git a/module/plugins/internal/OCR.py b/module/plugins/internal/OCR.py index 9ddca0315..387db1d13 100644 --- a/module/plugins/internal/OCR.py +++ b/module/plugins/internal/OCR.py @@ -29,6 +29,14 @@ class OCR(Plugin): def __init__(self): self.logger = logging.getLogger("log") + self.init() + + + def init(self): + """ + Initialize additional data structures + """ + pass def load_image(self, image): -- cgit v1.2.3 From ff9383bfe06d14d23bc0ed6af79aa8967965d078 Mon Sep 17 00:00:00 2001 From: Walter Purcaro Date: Sun, 19 Jul 2015 10:59:52 +0200 Subject: Code cosmetics (3) --- module/plugins/internal/OCR.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'module/plugins/internal/OCR.py') diff --git a/module/plugins/internal/OCR.py b/module/plugins/internal/OCR.py index 387db1d13..a00f53947 100644 --- a/module/plugins/internal/OCR.py +++ b/module/plugins/internal/OCR.py @@ -42,7 +42,7 @@ class OCR(Plugin): def load_image(self, image): self.image = Image.open(image) self.pixels = self.image.load() - self.result_captcha = '' + self.result_captcha = "" def deactivate(self): -- cgit v1.2.3 From 56389e28ba5d2f5658278bc7f486d73be747f135 Mon Sep 17 00:00:00 2001 From: Walter Purcaro Date: Sun, 19 Jul 2015 11:44:49 +0200 Subject: Rename self.core to self.pyload (plugins only) --- module/plugins/internal/OCR.py | 3 +++ 1 file changed, 3 insertions(+) (limited to 'module/plugins/internal/OCR.py') diff --git a/module/plugins/internal/OCR.py b/module/plugins/internal/OCR.py index a00f53947..c5cf5c2e9 100644 --- a/module/plugins/internal/OCR.py +++ b/module/plugins/internal/OCR.py @@ -28,6 +28,9 @@ class OCR(Plugin): def __init__(self): + self.pyload = pyfile.m.core + self.info = {} #: Provide information in dict here + self.logger = logging.getLogger("log") self.init() -- cgit v1.2.3 From a5b840079dfa281127c0fc0f5a13708b5ecb5031 Mon Sep 17 00:00:00 2001 From: Walter Purcaro Date: Sun, 19 Jul 2015 12:13:02 +0200 Subject: [OCR] Fix __init__ --- module/plugins/internal/OCR.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'module/plugins/internal/OCR.py') diff --git a/module/plugins/internal/OCR.py b/module/plugins/internal/OCR.py index c5cf5c2e9..566801e9a 100644 --- a/module/plugins/internal/OCR.py +++ b/module/plugins/internal/OCR.py @@ -27,11 +27,9 @@ class OCR(Plugin): __authors__ = [("pyLoad Team", "admin@pyload.org")] - def __init__(self): + def __init__(self, pyfile): self.pyload = pyfile.m.core self.info = {} #: Provide information in dict here - - self.logger = logging.getLogger("log") self.init() @@ -68,7 +66,7 @@ class OCR(Plugin): output = popen.stdout.read() + " | " + popen.stderr.read() popen.stdout.close() popen.stderr.close() - self.logger.debug("Tesseract ReturnCode %s Output: %s" % (popen.returncode, output)) + self.pyload.log_debug("Tesseract ReturnCode " + popen.returncode, "Output: " + output) def run_tesser(self, subset=False, digits=True, lowercase=True, uppercase=True, pagesegmode=None): @@ -85,7 +83,7 @@ class OCR(Plugin): self.log_error(e) return - self.logger.debug("save tiff") + self.pyload.log_debug("Saving tiff...") self.image.save(tmpTif.name, 'TIFF') if os.name == "nt": @@ -114,9 +112,9 @@ class OCR(Plugin): tessparams.append("nobatch") tessparams.append(os.path.abspath(tmpSub.name)) - self.logger.debug("run tesseract") + self.pyload.log_debug("Running tesseract...") self.run(tessparams) - self.logger.debug("read txt") + self.pyload.log_debug("Reading txt...") try: with open(tmpTxt.name, 'r') as f: @@ -124,7 +122,7 @@ class OCR(Plugin): except Exception: self.result_captcha = "" - self.logger.debug(self.result_captcha) + self.pyload.log_info(_("OCR result: ") + self.result_captcha) try: os.remove(tmpTif.name) os.remove(tmpTxt.name) -- cgit v1.2.3 From d38e830b7c0b3c6561a0072c74bbccb5fcdf4a61 Mon Sep 17 00:00:00 2001 From: Walter Purcaro Date: Sun, 19 Jul 2015 14:43:42 +0200 Subject: New __status__ magic key --- module/plugins/internal/OCR.py | 1 + 1 file changed, 1 insertion(+) (limited to 'module/plugins/internal/OCR.py') diff --git a/module/plugins/internal/OCR.py b/module/plugins/internal/OCR.py index 566801e9a..ee5571f77 100644 --- a/module/plugins/internal/OCR.py +++ b/module/plugins/internal/OCR.py @@ -21,6 +21,7 @@ class OCR(Plugin): __name__ = "OCR" __type__ = "ocr" __version__ = "0.12" + __status__ = "stable" __description__ = """OCR base plugin""" __license__ = "GPLv3" -- cgit v1.2.3 From 6af9b38a8d5d49355b85aef6ddd003605d6bba05 Mon Sep 17 00:00:00 2001 From: Walter Purcaro Date: Thu, 23 Jul 2015 23:44:45 +0200 Subject: Improve Captcha --- module/plugins/internal/OCR.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'module/plugins/internal/OCR.py') diff --git a/module/plugins/internal/OCR.py b/module/plugins/internal/OCR.py index ee5571f77..5c22866c8 100644 --- a/module/plugins/internal/OCR.py +++ b/module/plugins/internal/OCR.py @@ -20,7 +20,7 @@ from module.utils import save_join as fs_join class OCR(Plugin): __name__ = "OCR" __type__ = "ocr" - __version__ = "0.12" + __version__ = "0.13" __status__ = "stable" __description__ = """OCR base plugin""" @@ -133,7 +133,7 @@ class OCR(Plugin): pass - def get_captcha(self, name): + def recognize(self, name): raise NotImplementedError -- cgit v1.2.3 From 12cbb11279ebfa2f2945c5adb093b97924a8cfc3 Mon Sep 17 00:00:00 2001 From: Walter Purcaro Date: Fri, 24 Jul 2015 00:33:19 +0200 Subject: Small __init__ fixes --- module/plugins/internal/OCR.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'module/plugins/internal/OCR.py') diff --git a/module/plugins/internal/OCR.py b/module/plugins/internal/OCR.py index 5c22866c8..15d3fc305 100644 --- a/module/plugins/internal/OCR.py +++ b/module/plugins/internal/OCR.py @@ -28,8 +28,8 @@ class OCR(Plugin): __authors__ = [("pyLoad Team", "admin@pyload.org")] - def __init__(self, pyfile): - self.pyload = pyfile.m.core + def __init__(self, plugin): + self.pyload = plugin.pyload self.info = {} #: Provide information in dict here self.init() -- cgit v1.2.3 From f7df6ef48a7c0a8ab6351e046cd12160257c4ef5 Mon Sep 17 00:00:00 2001 From: Walter Purcaro Date: Fri, 24 Jul 2015 02:15:31 +0200 Subject: Hotfixes --- module/plugins/internal/OCR.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'module/plugins/internal/OCR.py') diff --git a/module/plugins/internal/OCR.py b/module/plugins/internal/OCR.py index 15d3fc305..13089c9c0 100644 --- a/module/plugins/internal/OCR.py +++ b/module/plugins/internal/OCR.py @@ -12,6 +12,7 @@ import logging import os import subprocess # import tempfile +import traceback from module.plugins.internal.Plugin import Plugin from module.utils import save_join as fs_join @@ -129,8 +130,10 @@ class OCR(Plugin): os.remove(tmpTxt.name) if subset and (digits or lowercase or uppercase): os.remove(tmpSub.name) - except Exception: - pass + except OSError, e: + self.log_warning(e) + if self.pyload.debug: + traceback.print_exc() def recognize(self, name): -- cgit v1.2.3 From 94d017cd2a5c1f194960827a8c7e46afc3682008 Mon Sep 17 00:00:00 2001 From: Walter Purcaro Date: Fri, 24 Jul 2015 06:55:49 +0200 Subject: Hotfixes (2) --- module/plugins/internal/OCR.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'module/plugins/internal/OCR.py') diff --git a/module/plugins/internal/OCR.py b/module/plugins/internal/OCR.py index 13089c9c0..064bf1d7d 100644 --- a/module/plugins/internal/OCR.py +++ b/module/plugins/internal/OCR.py @@ -22,7 +22,7 @@ class OCR(Plugin): __name__ = "OCR" __type__ = "ocr" __version__ = "0.13" - __status__ = "stable" + __status__ = "testing" __description__ = """OCR base plugin""" __license__ = "GPLv3" @@ -88,7 +88,7 @@ class OCR(Plugin): self.pyload.log_debug("Saving tiff...") self.image.save(tmpTif.name, 'TIFF') - if os.name == "nt": + if os.name is "nt": tessparams = [os.path.join(pypath, "tesseract", "tesseract.exe")] else: tessparams = ["tesseract"] @@ -165,7 +165,7 @@ class OCR(Plugin): for x in xrange(w): for y in xrange(h): - if pixels[x, y] == 255: + if pixels[x, y] is 255: continue #: No point in processing white pixels since we only want to remove black pixel count = 0 @@ -198,7 +198,7 @@ class OCR(Plugin): #: Second pass: this time set all 1's to 255 (white) for x in xrange(w): for y in xrange(h): - if pixels[x, y] == 1: + if pixels[x, y] is 1: pixels[x, y] = 255 self.pixels = pixels @@ -213,7 +213,7 @@ class OCR(Plugin): for x in xrange(w): for y in xrange(h): - if pixels[x, y] == 0: + if pixels[x, y] is 0: pixels[x, y] = 155 highest = {} @@ -229,7 +229,7 @@ class OCR(Plugin): for x in xrange(w): for y in xrange(h): - if pixels[x, y] == 0: + if pixels[x, y] is 0: pixels[x, y] = 255 count = {} @@ -237,7 +237,7 @@ class OCR(Plugin): for x in xrange(w): count[x] = 0 for y in xrange(h): - if pixels[x, y] == 155: + if pixels[x, y] is 155: count[x] += 1 sum = 0 @@ -270,10 +270,10 @@ class OCR(Plugin): for x in xrange(w): for y in xrange(h): - if pixels[x, y] == 0: + if pixels[x, y] is 0: pixels[x, y] = 255 - if pixels[x, y] == 155: + if pixels[x, y] is 155: pixels[x, y] = 0 self.pixels = pixels @@ -327,7 +327,7 @@ class OCR(Plugin): for key, item in values.iteritems(): - if key.__class__ == str: + if key.__class__ is str: result = result.replace(key, item) else: for expr in key: -- cgit v1.2.3 From 761ca5c66e07559925ebbdbc6531f9ca658b12ce Mon Sep 17 00:00:00 2001 From: Walter Purcaro Date: Fri, 24 Jul 2015 16:11:58 +0200 Subject: Code cosmetics --- module/plugins/internal/OCR.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'module/plugins/internal/OCR.py') diff --git a/module/plugins/internal/OCR.py b/module/plugins/internal/OCR.py index 064bf1d7d..36d259e0c 100644 --- a/module/plugins/internal/OCR.py +++ b/module/plugins/internal/OCR.py @@ -88,7 +88,7 @@ class OCR(Plugin): self.pyload.log_debug("Saving tiff...") self.image.save(tmpTif.name, 'TIFF') - if os.name is "nt": + if os.name == "nt": tessparams = [os.path.join(pypath, "tesseract", "tesseract.exe")] else: tessparams = ["tesseract"] @@ -165,7 +165,7 @@ class OCR(Plugin): for x in xrange(w): for y in xrange(h): - if pixels[x, y] is 255: + if pixels[x, y] == 255: continue #: No point in processing white pixels since we only want to remove black pixel count = 0 @@ -198,7 +198,7 @@ class OCR(Plugin): #: Second pass: this time set all 1's to 255 (white) for x in xrange(w): for y in xrange(h): - if pixels[x, y] is 1: + if pixels[x, y] == 1: pixels[x, y] = 255 self.pixels = pixels @@ -213,7 +213,7 @@ class OCR(Plugin): for x in xrange(w): for y in xrange(h): - if pixels[x, y] is 0: + if pixels[x, y] == 0: pixels[x, y] = 155 highest = {} @@ -229,7 +229,7 @@ class OCR(Plugin): for x in xrange(w): for y in xrange(h): - if pixels[x, y] is 0: + if pixels[x, y] == 0: pixels[x, y] = 255 count = {} @@ -237,7 +237,7 @@ class OCR(Plugin): for x in xrange(w): count[x] = 0 for y in xrange(h): - if pixels[x, y] is 155: + if pixels[x, y] == 155: count[x] += 1 sum = 0 @@ -270,10 +270,10 @@ class OCR(Plugin): for x in xrange(w): for y in xrange(h): - if pixels[x, y] is 0: + if pixels[x, y] == 0: pixels[x, y] = 255 - if pixels[x, y] is 155: + if pixels[x, y] == 155: pixels[x, y] = 0 self.pixels = pixels -- cgit v1.2.3 From 952001324e1faf584b1adcb01c4a0406a3722932 Mon Sep 17 00:00:00 2001 From: Walter Purcaro Date: Sat, 25 Jul 2015 09:42:49 +0200 Subject: =?UTF-8?q?Don't=20user=20dictionary=E2=80=99s=20iterator=20method?= =?UTF-8?q?s?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- module/plugins/internal/OCR.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'module/plugins/internal/OCR.py') diff --git a/module/plugins/internal/OCR.py b/module/plugins/internal/OCR.py index 36d259e0c..9896837b5 100644 --- a/module/plugins/internal/OCR.py +++ b/module/plugins/internal/OCR.py @@ -260,7 +260,7 @@ class OCR(Plugin): hkey = 0 hvalue = 0 - for key, value in highest.iteritems(): + for key, value in highest.items(): if value > hvalue: hkey = key hvalue = value @@ -325,7 +325,7 @@ class OCR(Plugin): else: result = self.result_captcha - for key, item in values.iteritems(): + for key, item in values.items(): if key.__class__ is str: result = result.replace(key, item) -- cgit v1.2.3 From f83389333ec10376452aa5f6d5ccd3963c6bafa1 Mon Sep 17 00:00:00 2001 From: Walter Purcaro Date: Mon, 27 Jul 2015 10:28:30 +0200 Subject: Update internal plugins --- module/plugins/internal/OCR.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'module/plugins/internal/OCR.py') diff --git a/module/plugins/internal/OCR.py b/module/plugins/internal/OCR.py index 9896837b5..37a33206f 100644 --- a/module/plugins/internal/OCR.py +++ b/module/plugins/internal/OCR.py @@ -21,7 +21,7 @@ from module.utils import save_join as fs_join class OCR(Plugin): __name__ = "OCR" __type__ = "ocr" - __version__ = "0.13" + __version__ = "0.14" __status__ = "testing" __description__ = """OCR base plugin""" @@ -31,7 +31,9 @@ class OCR(Plugin): def __init__(self, plugin): self.pyload = plugin.pyload + self.plugin = plugin self.info = {} #: Provide information in dict here + self.init() @@ -42,6 +44,10 @@ class OCR(Plugin): pass + def _log(self, level, args): + return self.plugin._log(level, (self.__name__,) + args) + + def load_image(self, image): self.image = Image.open(image) self.pixels = self.image.load() -- cgit v1.2.3 From ecf37227de05c73b7ffe2da89a5eda1259a72543 Mon Sep 17 00:00:00 2001 From: Walter Purcaro Date: Tue, 28 Jul 2015 01:09:59 +0200 Subject: Improve _log method --- module/plugins/internal/OCR.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'module/plugins/internal/OCR.py') diff --git a/module/plugins/internal/OCR.py b/module/plugins/internal/OCR.py index 37a33206f..998b3f040 100644 --- a/module/plugins/internal/OCR.py +++ b/module/plugins/internal/OCR.py @@ -21,7 +21,7 @@ from module.utils import save_join as fs_join class OCR(Plugin): __name__ = "OCR" __type__ = "ocr" - __version__ = "0.14" + __version__ = "0.15" __status__ = "testing" __description__ = """OCR base plugin""" @@ -44,8 +44,8 @@ class OCR(Plugin): pass - def _log(self, level, args): - return self.plugin._log(level, (self.__name__,) + args) + def _log(self, level, plugintype, pluginname, messages): + return self.plugin._log(level, plugintype, pluginname, (self.__name__,) + messages) def load_image(self, image): -- cgit v1.2.3 From e42f5783442fcbaa53e6c0faf943dd33c397e0b3 Mon Sep 17 00:00:00 2001 From: Walter Purcaro Date: Wed, 29 Jul 2015 08:40:17 +0200 Subject: Fix _log method --- module/plugins/internal/OCR.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'module/plugins/internal/OCR.py') diff --git a/module/plugins/internal/OCR.py b/module/plugins/internal/OCR.py index 998b3f040..8eb240051 100644 --- a/module/plugins/internal/OCR.py +++ b/module/plugins/internal/OCR.py @@ -21,7 +21,7 @@ from module.utils import save_join as fs_join class OCR(Plugin): __name__ = "OCR" __type__ = "ocr" - __version__ = "0.15" + __version__ = "0.16" __status__ = "testing" __description__ = """OCR base plugin""" @@ -45,7 +45,10 @@ class OCR(Plugin): def _log(self, level, plugintype, pluginname, messages): - return self.plugin._log(level, plugintype, pluginname, (self.__name__,) + messages) + return self.plugin._log(level, + plugintype, + "%s: %s" % (self.plugin.__name__, self.__name__), + messages) def load_image(self, image): -- cgit v1.2.3 From 91e0803c1f47444072ca7381d789a8e98160ae78 Mon Sep 17 00:00:00 2001 From: Walter Purcaro Date: Wed, 29 Jul 2015 21:11:29 +0200 Subject: Still improving _log methods --- module/plugins/internal/OCR.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'module/plugins/internal/OCR.py') diff --git a/module/plugins/internal/OCR.py b/module/plugins/internal/OCR.py index 8eb240051..805755f39 100644 --- a/module/plugins/internal/OCR.py +++ b/module/plugins/internal/OCR.py @@ -21,7 +21,7 @@ from module.utils import save_join as fs_join class OCR(Plugin): __name__ = "OCR" __type__ = "ocr" - __version__ = "0.16" + __version__ = "0.17" __status__ = "testing" __description__ = """OCR base plugin""" @@ -47,8 +47,8 @@ class OCR(Plugin): def _log(self, level, plugintype, pluginname, messages): return self.plugin._log(level, plugintype, - "%s: %s" % (self.plugin.__name__, self.__name__), - messages) + self.plugin.__name__, + (self.__name__,) + messages) def load_image(self, image): -- cgit v1.2.3 From 84f2193d76f7c6f47c834dc8902d8ead8e45a11a Mon Sep 17 00:00:00 2001 From: Walter Purcaro Date: Sun, 2 Aug 2015 07:44:07 +0200 Subject: Fix https://github.com/pyload/pyload/issues/1640 (2) --- module/plugins/internal/OCR.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'module/plugins/internal/OCR.py') diff --git a/module/plugins/internal/OCR.py b/module/plugins/internal/OCR.py index 805755f39..a9982b076 100644 --- a/module/plugins/internal/OCR.py +++ b/module/plugins/internal/OCR.py @@ -21,7 +21,7 @@ from module.utils import save_join as fs_join class OCR(Plugin): __name__ = "OCR" __type__ = "ocr" - __version__ = "0.17" + __version__ = "0.18" __status__ = "testing" __description__ = """OCR base plugin""" @@ -30,11 +30,7 @@ class OCR(Plugin): def __init__(self, plugin): - self.pyload = plugin.pyload - self.plugin = plugin - self.info = {} #: Provide information in dict here - - self.init() + super(OCR, self).__init__(plugin.pyload) def init(self): -- cgit v1.2.3 From fd2b928c08f0e1d1c54096696ef75bf899557db8 Mon Sep 17 00:00:00 2001 From: Walter Purcaro Date: Sun, 2 Aug 2015 08:56:00 +0200 Subject: Fix https://github.com/pyload/pyload/issues/1649 --- module/plugins/internal/OCR.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'module/plugins/internal/OCR.py') diff --git a/module/plugins/internal/OCR.py b/module/plugins/internal/OCR.py index a9982b076..b24b3058b 100644 --- a/module/plugins/internal/OCR.py +++ b/module/plugins/internal/OCR.py @@ -21,7 +21,7 @@ from module.utils import save_join as fs_join class OCR(Plugin): __name__ = "OCR" __type__ = "ocr" - __version__ = "0.18" + __version__ = "0.19" __status__ = "testing" __description__ = """OCR base plugin""" @@ -30,7 +30,9 @@ class OCR(Plugin): def __init__(self, plugin): - super(OCR, self).__init__(plugin.pyload) + self._init(plugin.pyload) + self.plugin = plugin + self.init() def init(self): -- cgit v1.2.3