[uchardet] script: move the Wikipedia title syntax cleaning to BuildLangModel.py.



commit 198190461e35a7a03ea3364cded69dccf67f8250
Author: Jehan <jehan girinstud io>
Date:   Sun Feb 21 16:20:22 2016 +0100

    script: move the Wikipedia title syntax cleaning to BuildLangModel.py.

 script/BuildLangModel.py |    3 +++
 script/langs/ar.py       |    8 --------
 script/langs/da.py       |    9 ---------
 script/langs/de.py       |    9 ---------
 script/langs/el.py       |    8 --------
 script/langs/eo.py       |    9 ---------
 script/langs/es.py       |    8 --------
 script/langs/fr.py       |   11 ++++++-----
 script/langs/hu.py       |    8 --------
 script/langs/th.py       |    7 -------
 script/langs/tr.py       |    9 ---------
 script/langs/vi.py       |    8 --------
 12 files changed, 9 insertions(+), 88 deletions(-)
---
diff --git a/script/BuildLangModel.py b/script/BuildLangModel.py
index 8ed52cf..a412f13 100755
--- a/script/BuildLangModel.py
+++ b/script/BuildLangModel.py
@@ -172,6 +172,9 @@ def process_text(text, lang):
 
     if lang.clean_wikipedia_content is not None:
         content = lang.clean_wikipedia_content(text)
+    # Clean out the Wikipedia syntax for titles.
+    content = re.sub(r'(=+) *([^=]+) *\1',
+                     r'\2', content)
     # Clean multiple spaces. Newlines and such are normalized to spaces,
     # since they have basically a similar role in the purpose of uchardet.
     content = re.sub(r'\s+', ' ', content)
diff --git a/script/langs/ar.py b/script/langs/ar.py
index 05952b8..2506e7b 100644
--- a/script/langs/ar.py
+++ b/script/langs/ar.py
@@ -57,11 +57,3 @@ charsets = ['ISO-8859-6', 'WINDOWS-1256']
 start_pages = ['الصفحة_الرئيسية']
 wikipedia_code = code
 case_mapping = False
-
-# A function to clean content returned by the `wikipedia` python lib,
-# in case some unwanted data has been overlooked.
-def clean_wikipedia_content(content):
-    cleaned = re.sub(r'(=+) *([^=]+) *\1',
-                     r'\2',
-                     content)
-    return cleaned
diff --git a/script/langs/da.py b/script/langs/da.py
index df94208..18d2379 100644
--- a/script/langs/da.py
+++ b/script/langs/da.py
@@ -67,12 +67,3 @@ wikipedia_code = code
 # This uses Python algorithm to determine upper/lower-case of a given
 # character.
 case_mapping = True
-
-# A function to clean content returned by the `wikipedia` python lib,
-# in case some unwanted data has been overlooked.
-def clean_wikipedia_content(content):
-    # We get modify link in the text: "=== Articles connexesModifier ==="
-    cleaned = re.sub(r'(=+) *([^=]+) *\1',
-                     r'\2',
-                     content)
-    return cleaned
diff --git a/script/langs/de.py b/script/langs/de.py
index 554f142..e004901 100644
--- a/script/langs/de.py
+++ b/script/langs/de.py
@@ -67,12 +67,3 @@ wikipedia_code = code
 # This uses Python algorithm to determine upper/lower-case of a given
 # character.
 case_mapping = True
-
-# A function to clean content returned by the `wikipedia` python lib,
-# in case some unwanted data has been overlooked.
-def clean_wikipedia_content(content):
-    # Get rid of title syntax: "=== Articles connexes ==="
-    cleaned = re.sub(r'(=+) *([^=]+) *\1',
-                     r'\2',
-                     content)
-    return cleaned
diff --git a/script/langs/el.py b/script/langs/el.py
index efd9a3e..2726229 100644
--- a/script/langs/el.py
+++ b/script/langs/el.py
@@ -53,11 +53,3 @@ alphabet = 'αβγδεζηθικλμνξοπρσςτυφχψω'
 start_pages = ['Πύλη:Κύρια']
 wikipedia_code = code
 case_mapping = True
-
-# A function to clean content returned by the `wikipedia` python lib,
-# in case some unwanted data has been overlooked.
-def clean_wikipedia_content(content):
-    cleaned = re.sub(r'(=+) *([^=]+) *\1',
-                     r'\2',
-                     content)
-    return cleaned
diff --git a/script/langs/eo.py b/script/langs/eo.py
index c593921..e9430cc 100644
--- a/script/langs/eo.py
+++ b/script/langs/eo.py
@@ -65,12 +65,3 @@ wikipedia_code = code
 # This uses Python algorithm to determine upper/lower-case of a given
 # character.
 case_mapping = True
-
-# A function to clean content returned by the `wikipedia` python lib,
-# in case some unwanted data has been overlooked.
-def clean_wikipedia_content(content):
-    # Get rid of title syntax: "=== Articles connexes ==="
-    cleaned = re.sub(r'(=+) *([^=]+) *\1',
-                     r'\2',
-                     content)
-    return cleaned
diff --git a/script/langs/es.py b/script/langs/es.py
index f48acc5..5219296 100644
--- a/script/langs/es.py
+++ b/script/langs/es.py
@@ -67,11 +67,3 @@ wikipedia_code = code
 # This uses Python algorithm to determine upper/lower-case of a given
 # character.
 case_mapping = True
-
-# A function to clean content returned by the `wikipedia` python lib,
-# in case some unwanted data has been overlooked.
-def clean_wikipedia_content(content):
-    cleaned = re.sub(r'(=+) *([^=]+) *\1',
-                     r'\2',
-                     content)
-    return cleaned
diff --git a/script/langs/fr.py b/script/langs/fr.py
index 9312b7b..fff730b 100644
--- a/script/langs/fr.py
+++ b/script/langs/fr.py
@@ -70,9 +70,10 @@ case_mapping = True
 
 # A function to clean content returned by the `wikipedia` python lib,
 # in case some unwanted data has been overlooked.
+# Note that BuildLangModel.py already cleans away the '=' of Wikipedia's
+# title syntax, as well as repeated spaces. But sometimes, Wikipedia in
+# some languages may return weird syntax or UI text which should be
+# discarded. If you encounter one of these cases, use this function.
 def clean_wikipedia_content(content):
-    # We get modify link in the text: "=== Articles connexesModifier ==="
-    cleaned = re.sub(r'(=+) *([^=]+) *\1',
-                     r'\2',
-                     content)
-    return cleaned
+    # Do your garbage text cleaning here.
+    return content
diff --git a/script/langs/hu.py b/script/langs/hu.py
index 8ff01cb..e6ee345 100644
--- a/script/langs/hu.py
+++ b/script/langs/hu.py
@@ -64,11 +64,3 @@ wikipedia_code = code
 # This uses Python algorithm to determine upper/lower-case of a given
 # character.
 case_mapping = True
-
-# A function to clean content returned by the `wikipedia` python lib,
-# in case some unwanted data has been overlooked.
-def clean_wikipedia_content(content):
-    cleaned = re.sub(r'(=+) *([^=]+) *\1',
-                     r'\2',
-                     content)
-    return cleaned
diff --git a/script/langs/th.py b/script/langs/th.py
index 3ddeee1..eb3fdaa 100644
--- a/script/langs/th.py
+++ b/script/langs/th.py
@@ -53,10 +53,3 @@ charsets = ['ISO-8859-11', 'TIS-620']
 start_pages = ['หน้าหลัก']
 wikipedia_code = code
 case_mapping = False
-
-def clean_wikipedia_content(content):
-    # Get rid of title syntax: "=== Articles connexes ==="
-    cleaned = re.sub(r'(=+) *([^=]+) *\1',
-                     r'\2',
-                     content)
-    return cleaned
diff --git a/script/langs/tr.py b/script/langs/tr.py
index 521c7da..d8b5ac1 100644
--- a/script/langs/tr.py
+++ b/script/langs/tr.py
@@ -65,12 +65,3 @@ wikipedia_code = code
 # This is wrong when it comes to Turkish.
 custom_case_mapping = { 'İ': 'i', 'I': 'ı' }
 case_mapping = True
-
-# A function to clean content returned by the `wikipedia` python lib,
-# in case some unwanted data has been overlooked.
-def clean_wikipedia_content(content):
-    # Get rid of title syntax: "=== Articles connexes ==="
-    cleaned = re.sub(r'(=+) *([^=]+) *\1',
-                     r'\2',
-                     content)
-    return cleaned
diff --git a/script/langs/vi.py b/script/langs/vi.py
index 3a38cc4..f44aeb6 100644
--- a/script/langs/vi.py
+++ b/script/langs/vi.py
@@ -62,11 +62,3 @@ alphabet = 'aăâbcdđeêghiklmnoôơpqrstuưvxy'
 start_pages = ['Chữ_Quốc_ngữ']
 wikipedia_code = code
 case_mapping = True
-
-# A function to clean content returned by the `wikipedia` python lib,
-# in case some unwanted data has been overlooked.
-def clean_wikipedia_content(content):
-    cleaned = re.sub(r'(=+) *([^=]+) *\1',
-                     r'\2',
-                     content)
-    return cleaned


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]