[uchardet] script: move the Wikipedia title syntax cleaning to BuildLangModel.py.
- From: Jehan Pagès <jehanp src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [uchardet] script: move the Wikipedia title syntax cleaning to BuildLangModel.py.
- Date: Sun, 21 Feb 2016 15:21:12 +0000 (UTC)
commit 198190461e35a7a03ea3364cded69dccf67f8250
Author: Jehan <jehan girinstud io>
Date: Sun Feb 21 16:20:22 2016 +0100
script: move the Wikipedia title syntax cleaning to BuildLangModel.py.
script/BuildLangModel.py | 3 +++
script/langs/ar.py | 8 --------
script/langs/da.py | 9 ---------
script/langs/de.py | 9 ---------
script/langs/el.py | 8 --------
script/langs/eo.py | 9 ---------
script/langs/es.py | 8 --------
script/langs/fr.py | 11 ++++++-----
script/langs/hu.py | 8 --------
script/langs/th.py | 7 -------
script/langs/tr.py | 9 ---------
script/langs/vi.py | 8 --------
12 files changed, 9 insertions(+), 88 deletions(-)
---
diff --git a/script/BuildLangModel.py b/script/BuildLangModel.py
index 8ed52cf..a412f13 100755
--- a/script/BuildLangModel.py
+++ b/script/BuildLangModel.py
@@ -172,6 +172,9 @@ def process_text(text, lang):
if lang.clean_wikipedia_content is not None:
content = lang.clean_wikipedia_content(text)
+ # Clean out the Wikipedia syntax for titles.
+ content = re.sub(r'(=+) *([^=]+) *\1',
+ r'\2', content)
# Clean multiple spaces. Newlines and such are normalized to spaces,
# since they have basically a similar role in the purpose of uchardet.
content = re.sub(r'\s+', ' ', content)
diff --git a/script/langs/ar.py b/script/langs/ar.py
index 05952b8..2506e7b 100644
--- a/script/langs/ar.py
+++ b/script/langs/ar.py
@@ -57,11 +57,3 @@ charsets = ['ISO-8859-6', 'WINDOWS-1256']
start_pages = ['الصفحة_الرئيسية']
wikipedia_code = code
case_mapping = False
-
-# A function to clean content returned by the `wikipedia` python lib,
-# in case some unwanted data has been overlooked.
-def clean_wikipedia_content(content):
- cleaned = re.sub(r'(=+) *([^=]+) *\1',
- r'\2',
- content)
- return cleaned
diff --git a/script/langs/da.py b/script/langs/da.py
index df94208..18d2379 100644
--- a/script/langs/da.py
+++ b/script/langs/da.py
@@ -67,12 +67,3 @@ wikipedia_code = code
# This uses Python algorithm to determine upper/lower-case of a given
# character.
case_mapping = True
-
-# A function to clean content returned by the `wikipedia` python lib,
-# in case some unwanted data has been overlooked.
-def clean_wikipedia_content(content):
- # We get modify link in the text: "=== Articles connexesModifier ==="
- cleaned = re.sub(r'(=+) *([^=]+) *\1',
- r'\2',
- content)
- return cleaned
diff --git a/script/langs/de.py b/script/langs/de.py
index 554f142..e004901 100644
--- a/script/langs/de.py
+++ b/script/langs/de.py
@@ -67,12 +67,3 @@ wikipedia_code = code
# This uses Python algorithm to determine upper/lower-case of a given
# character.
case_mapping = True
-
-# A function to clean content returned by the `wikipedia` python lib,
-# in case some unwanted data has been overlooked.
-def clean_wikipedia_content(content):
- # Get rid of title syntax: "=== Articles connexes ==="
- cleaned = re.sub(r'(=+) *([^=]+) *\1',
- r'\2',
- content)
- return cleaned
diff --git a/script/langs/el.py b/script/langs/el.py
index efd9a3e..2726229 100644
--- a/script/langs/el.py
+++ b/script/langs/el.py
@@ -53,11 +53,3 @@ alphabet = 'αβγδεζηθικλμνξοπρσςτυφχψω'
start_pages = ['Πύλη:Κύρια']
wikipedia_code = code
case_mapping = True
-
-# A function to clean content returned by the `wikipedia` python lib,
-# in case some unwanted data has been overlooked.
-def clean_wikipedia_content(content):
- cleaned = re.sub(r'(=+) *([^=]+) *\1',
- r'\2',
- content)
- return cleaned
diff --git a/script/langs/eo.py b/script/langs/eo.py
index c593921..e9430cc 100644
--- a/script/langs/eo.py
+++ b/script/langs/eo.py
@@ -65,12 +65,3 @@ wikipedia_code = code
# This uses Python algorithm to determine upper/lower-case of a given
# character.
case_mapping = True
-
-# A function to clean content returned by the `wikipedia` python lib,
-# in case some unwanted data has been overlooked.
-def clean_wikipedia_content(content):
- # Get rid of title syntax: "=== Articles connexes ==="
- cleaned = re.sub(r'(=+) *([^=]+) *\1',
- r'\2',
- content)
- return cleaned
diff --git a/script/langs/es.py b/script/langs/es.py
index f48acc5..5219296 100644
--- a/script/langs/es.py
+++ b/script/langs/es.py
@@ -67,11 +67,3 @@ wikipedia_code = code
# This uses Python algorithm to determine upper/lower-case of a given
# character.
case_mapping = True
-
-# A function to clean content returned by the `wikipedia` python lib,
-# in case some unwanted data has been overlooked.
-def clean_wikipedia_content(content):
- cleaned = re.sub(r'(=+) *([^=]+) *\1',
- r'\2',
- content)
- return cleaned
diff --git a/script/langs/fr.py b/script/langs/fr.py
index 9312b7b..fff730b 100644
--- a/script/langs/fr.py
+++ b/script/langs/fr.py
@@ -70,9 +70,10 @@ case_mapping = True
# A function to clean content returned by the `wikipedia` python lib,
# in case some unwanted data has been overlooked.
+# Note that we are already cleaning away the '=' from the title syntax
+# of Wikipedia, as well as double spaces. But sometimes, Wikipedia in
+# some language may return weird syntax or UI text which should be
+# discarded. If you encounter one of these cases, use this function.
def clean_wikipedia_content(content):
- # We get modify link in the text: "=== Articles connexesModifier ==="
- cleaned = re.sub(r'(=+) *([^=]+) *\1',
- r'\2',
- content)
- return cleaned
+ # Do your garbage text cleaning here.
+ return content
diff --git a/script/langs/hu.py b/script/langs/hu.py
index 8ff01cb..e6ee345 100644
--- a/script/langs/hu.py
+++ b/script/langs/hu.py
@@ -64,11 +64,3 @@ wikipedia_code = code
# This uses Python algorithm to determine upper/lower-case of a given
# character.
case_mapping = True
-
-# A function to clean content returned by the `wikipedia` python lib,
-# in case some unwanted data has been overlooked.
-def clean_wikipedia_content(content):
- cleaned = re.sub(r'(=+) *([^=]+) *\1',
- r'\2',
- content)
- return cleaned
diff --git a/script/langs/th.py b/script/langs/th.py
index 3ddeee1..eb3fdaa 100644
--- a/script/langs/th.py
+++ b/script/langs/th.py
@@ -53,10 +53,3 @@ charsets = ['ISO-8859-11', 'TIS-620']
start_pages = ['หน้าหลัก']
wikipedia_code = code
case_mapping = False
-
-def clean_wikipedia_content(content):
- # Get rid of title syntax: "=== Articles connexes ==="
- cleaned = re.sub(r'(=+) *([^=]+) *\1',
- r'\2',
- content)
- return cleaned
diff --git a/script/langs/tr.py b/script/langs/tr.py
index 521c7da..d8b5ac1 100644
--- a/script/langs/tr.py
+++ b/script/langs/tr.py
@@ -65,12 +65,3 @@ wikipedia_code = code
# This is wrong when it comes to Turkish.
custom_case_mapping = { 'İ': 'i', 'I': 'ı' }
case_mapping = True
-
-# A function to clean content returned by the `wikipedia` python lib,
-# in case some unwanted data has been overlooked.
-def clean_wikipedia_content(content):
- # Get rid of title syntax: "=== Articles connexes ==="
- cleaned = re.sub(r'(=+) *([^=]+) *\1',
- r'\2',
- content)
- return cleaned
diff --git a/script/langs/vi.py b/script/langs/vi.py
index 3a38cc4..f44aeb6 100644
--- a/script/langs/vi.py
+++ b/script/langs/vi.py
@@ -62,11 +62,3 @@ alphabet = 'aăâbcdđeêghiklmnoôơpqrstuưvxy'
start_pages = ['Chữ_Quốc_ngữ']
wikipedia_code = code
case_mapping = True
-
-# A function to clean content returned by the `wikipedia` python lib,
-# in case some unwanted data has been overlooked.
-def clean_wikipedia_content(content):
- cleaned = re.sub(r'(=+) *([^=]+) *\1',
- r'\2',
- content)
- return cleaned
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]