[ocrfeeder] Handle special chars in filenames when importing PDF files



commit 5286120c8bc8b7ba74e0f9b19b5262b509f38cee
Author: scx <scx mail gmail com>
Date:   Sun Mar 8 21:36:57 2020 +0100

    Handle special chars in filenames when importing PDF files
    
    Some special characters (e.g. quotes) in the filename cause gs to fail.
    What's worse, gs interprets the escape character as a real character.
    This means that gs cannot handle every valid Unix filename on its own.
    We need to create a temp symlink as a workaround for gs limitations.
    
    Fixes GNOME/ocrfeeder#20

 src/ocrfeeder/util/lib.py | 67 ++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 57 insertions(+), 10 deletions(-)
---
diff --git a/src/ocrfeeder/util/lib.py b/src/ocrfeeder/util/lib.py
index bf3c0aa..71a2965 100644
--- a/src/ocrfeeder/util/lib.py
+++ b/src/ocrfeeder/util/lib.py
@@ -28,6 +28,7 @@ from .constants import *
 import sane
 import tempfile
 import locale
+import re
 import xml.etree.ElementTree as etree
 from .log import debug
 
@@ -43,20 +44,66 @@ def getIconOrLabel(icon_name, label_text, icon_size = Gtk.IconSize.SMALL_TOOLBAR
         label = None
     return icon, label
 
+def getSafeGhostscriptPath(file_path):
+    return re.sub(r'[^\w !#$%&()*+,./:;<=>?@\[\\\]^_`{|}~-]', '_', file_path)
+
+def getSafeGhostscriptInputFilename(file_name):
+    return re.sub(r'[/]', '_', getSafeGhostscriptPath(file_name))
+
+def getSafeGhostscriptOutputBasename(file_name):
+    return re.sub(r'[%]', '_', getSafeGhostscriptInputFilename(file_name))
+
 def convertPdfToImages(pdf_file, temp_dir = '/tmp'):
-    dir_name = tempfile.mkdtemp(dir = temp_dir)
+    if not os.path.isfile(pdf_file):
+        debug('Unable to convert PDF: File does not exist: %s', pdf_file)
+        return None
+    try:
+        dir_name = tempfile.mkdtemp(dir = temp_dir)
+    except:
+        debug('Unable to convert PDF: Cannot create temp dir in: %s', temp_dir)
+        return None
+
     debug('Converting PDF: %s to image', pdf_file)
+
+    file_name = os.path.basename(pdf_file)
+    base_name = os.path.splitext(file_name)[0]
+    pdf_path = pdf_file
+    file_name_safe = getSafeGhostscriptInputFilename(file_name)
+    base_name_safe = getSafeGhostscriptOutputBasename(base_name)
+    pdf_file_safe = getSafeGhostscriptPath(pdf_file)
+
+    if pdf_file != pdf_file_safe:
+        try:
+            pdf_path_safe = os.path.join(dir_name, file_name_safe)
+            os.symlink(pdf_file, pdf_path_safe)
+        except:
+            debug('Unable to convert PDF: Cannot create temp symlink in: %s', dir_name)
+            return None
+
+        runGhostscript(dir_name, base_name_safe, pdf_path_safe)
+        try:
+            os.unlink(pdf_path_safe)
+        except:
+            debug('PDF conversion warning: Cannot remove temp symlink: %s', pdf_path_safe)
+    else:
+        runGhostscript(dir_name, base_name_safe, pdf_path)
+
+    return dir_name
+
+def runGhostscript(dir_name, base_name, pdf_path):
+    format='jpeg'
     resolution = 300
-    file_name = os.path.splitext(os.path.basename(pdf_file))[0]
-    command = 'gs -SDEVICE=jpeg -r%(resolution)sx%(resolution)s -sPAPERSIZE=letter ' \
-              '-sOutputFile="%(temp_name)s/%(file_name)s_%%04d.jpg" ' \
-              '-dNOPAUSE -dBATCH -- "%(pdf_file)s"' % \
-              {'temp_name': dir_name,
-               'file_name': file_name,
-               'pdf_file': pdf_file,
-               'resolution': resolution}
+    size = 'letter'
+    command = 'gs -SDEVICE=%(format)s -r%(resolution)sx%(resolution)s -sPAPERSIZE=%(size)s ' \
+              '-sOutputFile=\'%(temp_name)s/%(file_name)s_%%04d.jpg\' ' \
+              '-dNOPAUSE -dBATCH -- \'%(pdf_file)s\'' % \
+              {'format': format,
+               'temp_name': dir_name,
+               'file_name': base_name,
+               'pdf_file': pdf_path,
+               'resolution': resolution,
+               'size': size}
     process = subprocess.run(command, shell=True)
-    return dir_name
 
 def getImagesFromFolder(folder):
     if not os.path.isdir(folder):


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]