[ocrfeeder] Clean recognized text before returning it



commit 57be6850d909e0cfcc8a8d19e8eea1dae772c9fd
Author: Joaquim Rocha <jrocha igalia com>
Date:   Thu Jul 8 17:22:07 2010 +0200

    Clean recognized text before returning it
    
    layoutAnalysis.LayoutAnalysis: Remove line breaks in an attempt to
    make the text closer to the original in the image, since OCR
    engines output the text line by line, with a newline character
    after every line.

 feeder/layoutAnalysis.py |   10 +++++++++-
 1 files changed, 9 insertions(+), 1 deletions(-)
---
diff --git a/feeder/layoutAnalysis.py b/feeder/layoutAnalysis.py
index 63b827e..6b8d2ce 100644
--- a/feeder/layoutAnalysis.py
+++ b/feeder/layoutAnalysis.py
@@ -23,6 +23,7 @@ from util import graphics
 from util.constants import OCRFEEDER_DEBUG, DTP
 from studio.dataHolder import DataBox
 from imageManipulation import ImageProcessor
+import re
 
 NONE = 0
 TOP = -1
@@ -456,4 +457,11 @@ class LayoutAnalysis(object):
 
     def readImage(self, image):
         self.ocr_engine.setImage(image)
-        return self.ocr_engine.read()
+        text = self.ocr_engine.read()
+        text = self.__cleanText(text)
+        return text
+
+    def __cleanText(self, text):
+        clean_text = re.sub(r'(?<!-)-\n(?!\n)', r'', text)
+        clean_text = re.sub(r'(?<!\n)\n', r' ', clean_text)
+        return clean_text



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]