[ocrfeeder] Fix unicode issues when exporting files



commit b3e7ed735f241acd2b7d7aa97be7b26bffdbc9ed
Author: Joaquim Rocha <me joaquimrocha com>
Date:   Sat Jun 4 23:12:39 2016 +0200

    Fix unicode issues when exporting files
    
    These changes make sure that unicode is used in DataBox, and that the
    document generators use it correctly when exporting the data to files.

 src/ocrfeeder/feeder/documentGeneration.py |   10 +++++-----
 src/ocrfeeder/studio/dataHolder.py         |    8 ++++----
 src/ocrfeeder/util/lib.py                  |    6 ++++++
 3 files changed, 15 insertions(+), 9 deletions(-)
---
diff --git a/src/ocrfeeder/feeder/documentGeneration.py b/src/ocrfeeder/feeder/documentGeneration.py
index 51037af..152e9c3 100644
--- a/src/ocrfeeder/feeder/documentGeneration.py
+++ b/src/ocrfeeder/feeder/documentGeneration.py
@@ -213,12 +213,12 @@ class HtmlGenerator(DocumentGenerator):
             os.mkdir(images_folder)
         if pages:
             file = open(os.path.join(self.name, 'index.html'), 'w')
-            file.write(pages[0])
+            file.write(pages[0].encode('utf-8'))
             file.close()
             if len(pages) > 1:
                 for i in xrange(1, len(pages)):
                     file = open(os.path.join(self.name, 'page%s.html' % (i + 1)), 'w')
-                    file.write(pages[i])
+                    file.write(pages[i].encode('utf-8'))
                     file.close()
         if self.styles:
             file = open(os.path.join(self.name, 'style.css'), 'w')
@@ -248,7 +248,7 @@ class OdtGenerator(DocumentGenerator):
         self.document.automaticstyles.addElement(frame_style_rotated)
 
     def addText(self, data_box):
-        text = data_box.getText().decode('utf-8')
+        text = data_box.getText()
         frame_style = Style(name='FrameStyle', family = 'graphic')
         debug('Angle: %s' % data_box.text_data.angle)
         angle = data_box.text_data.angle
@@ -355,10 +355,10 @@ class OdtGenerator(DocumentGenerator):
 class PlaintextGenerator(DocumentGenerator):
     def __init__(self, name):
         self.name = name
-        self.text = ''
+        self.text = u''
 
     def addText(self, newText):
-        self.text += unicode(newText, 'utf-8')
+        self.text += newText
 
     def addPage(self, page):
         self.addText(page.getTextFromBoxes())
diff --git a/src/ocrfeeder/studio/dataHolder.py b/src/ocrfeeder/studio/dataHolder.py
index 2821960..204f1a3 100644
--- a/src/ocrfeeder/studio/dataHolder.py
+++ b/src/ocrfeeder/studio/dataHolder.py
@@ -72,7 +72,7 @@ class DataBox(GObject.GObject):
                      (GObject.TYPE_INT,))
         }
 
-    def __init__(self, x = 0, y = 0, width = 0, height = 0, image = None, type = TEXT_TYPE, text = ''):
+    def __init__(self, x = 0, y = 0, width = 0, height = 0, image = None, type = TEXT_TYPE, text = u''):
         super(DataBox, self).__init__()
         self.x = int(x)
         self.y = int(y)
@@ -81,7 +81,7 @@ class DataBox(GObject.GObject):
         self.image = image
         self.setType(type)
         self.text_data = TextData()
-        self.text = text
+        self.text = self.setText(text)
 
     def configTextData(self, face = 'Sans', size = 12, justification = ALIGN_LEFT, line_space = 1, letter_space = 1):
         self.text_data = TextData(face, size, justification, line_space, letter_space)
@@ -132,7 +132,7 @@ class DataBox(GObject.GObject):
         self.text_data.weight = font_weight
 
     def setText(self, text):
-        self.text = text
+        self.text = lib.ensureUnicode(text)
 
     def getText(self):
         return self.text
@@ -233,7 +233,7 @@ class PageData:
         return {'PageData': dictionary}
 
     def getTextFromBoxes(self, data_boxes=None):
-        text = ''
+        text = u''
         if data_boxes is None:
             data_boxes = self.data_boxes
         number_of_boxes = len(data_boxes)
diff --git a/src/ocrfeeder/util/lib.py b/src/ocrfeeder/util/lib.py
index 73b03b9..db60c75 100644
--- a/src/ocrfeeder/util/lib.py
+++ b/src/ocrfeeder/util/lib.py
@@ -191,3 +191,9 @@ def makeRadioButton(label, from_widget=None):
     button.set_use_underline(True)
 
     return button
+
+def ensureUnicode(text):
+    if isinstance(text, unicode):
+        return text
+
+    return unicode(text, 'utf-8')


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]