[glib: 1/2] tests: Port gen-casefold-txt.pl and gen-casemap-txt.pl to Python 3. See #1332
- From: Philip Withnall <pwithnall src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [glib: 1/2] tests: Port gen-casefold-txt.pl and gen-casemap-txt.pl to Python 3. See #1332
- Date: Tue, 12 Jun 2018 20:32:45 +0000 (UTC)
commit a580185cdca0de76cf71bd6b3c01230d281f5a05
Author: Christoph Reiter <creiter src gnome org>
Date: Tue May 29 11:31:49 2018 +0200
tests: Port gen-casefold-txt.pl and gen-casemap-txt.pl to Python 3. See #1332
I've tried to keep the code structure roughly the same.
tests/Makefile.am | 4 +-
tests/casefold.txt | 2 +-
tests/casemap.txt | 2 +-
tests/gen-casefold-txt.pl | 82 ---------------
tests/gen-casefold-txt.py | 78 ++++++++++++++
tests/gen-casemap-txt.pl | 256 ----------------------------------------------
tests/gen-casemap-txt.py | 200 ++++++++++++++++++++++++++++++++++++
tests/unicode-caseconv.c | 2 +-
8 files changed, 283 insertions(+), 343 deletions(-)
---
diff --git a/tests/Makefile.am b/tests/Makefile.am
index de3ddb49e..fba18655d 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -146,8 +146,8 @@ endif
EXTRA_DIST += \
$(test_scripts) \
- gen-casefold-txt.pl \
- gen-casemap-txt.pl \
+ gen-casefold-txt.py \
+ gen-casemap-txt.py \
iochannel-test-infile \
timeloop-basic.c \
assert-msg-test.gdb
diff --git a/tests/casefold.txt b/tests/casefold.txt
index f7b47abd2..6043c1201 100644
--- a/tests/casefold.txt
+++ b/tests/casefold.txt
@@ -1,5 +1,5 @@
# Test cases generated from Unicode 10.0.0 data
-# by gen-casefold-test.pl. Do not edit.
+# by gen-casefold-txt.py. Do not edit.
#
# Some special hand crafted tests
#
diff --git a/tests/casemap.txt b/tests/casemap.txt
index 5e983f70f..6533e8dd9 100644
--- a/tests/casemap.txt
+++ b/tests/casemap.txt
@@ -1,5 +1,5 @@
# Test cases generated from Unicode 10.0.0 data
-# by gen-case-tests.pl. Do not edit.
+# by gen-casemap-txt.py. Do not edit.
#
# Some special hand crafted tests
#
diff --git a/tests/gen-casefold-txt.py b/tests/gen-casefold-txt.py
new file mode 100755
index 000000000..3c55828d3
--- /dev/null
+++ b/tests/gen-casefold-txt.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+# Copyright (C) 1998, 1999 Tom Tromey
+# Copyright (C) 2001 Red Hat Software
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+"""
+gen-casefold-txt.py - Generate test cases for casefolding from Unicode data.
+See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html
+Usage:
+ I consider the output of this program to be unrestricted.
+ Use it as you will.
+"""
+
+import sys
+import argparse
+
+
+def main(argv):
+ parser = argparse.ArgumentParser(
+ description="Generate test cases for casefolding from Unicode data")
+ parser.add_argument("UNICODE-VERSION")
+ parser.add_argument("CaseFolding.txt")
+ args = parser.parse_args(argv[1:])
+ version = getattr(args, "UNICODE-VERSION")
+ filename = getattr(args, "CaseFolding.txt")
+
+ print("""\
+# Test cases generated from Unicode {} data
+# by gen-casefold-txt.py. Do not edit.
+#
+# Some special hand crafted tests
+#
+AaBbCc@@\taabbcc@@
+#
+# Now the automatic tests
+#""".format(version))
+
+ # Names of fields in the CaseFolding table
+ CODE, STATUS, MAPPING = range(3)
+
+ with open(filename, encoding="utf-8") as fileobj:
+ for line in fileobj:
+ # strip comments and skip empty lines
+ line = line.split("#", 1)[0].strip()
+ if not line:
+ continue
+
+ fields = [f.strip() for f in line.split(";", 3)[:3]]
+ if len(fields) != 3:
+ raise SystemExit(
+ "Entry for %s has wrong number of fields (%d)" % (
+ fields[CODE], len(fields)))
+
+ status = fields[STATUS]
+ # skip simple and Turkic mappings
+ if status in "ST":
+ continue
+
+ code = chr(int(fields[CODE], 16))
+ values = "".join(
+ [chr(int(v, 16)) for v in fields[MAPPING].split()])
+ print("{}\t{}".format(code, values))
+
+
+if __name__ == "__main__":
+ sys.exit(main(sys.argv))
diff --git a/tests/gen-casemap-txt.py b/tests/gen-casemap-txt.py
new file mode 100755
index 000000000..98f6bc969
--- /dev/null
+++ b/tests/gen-casemap-txt.py
@@ -0,0 +1,200 @@
+#!/usr/bin/env python3
+# Copyright (C) 1998, 1999 Tom Tromey
+# Copyright (C) 2001 Red Hat Software
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+"""
+gen-casemap-txt.py - Generate test cases for case mapping from Unicode data.
+See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html
+Usage:
+ I consider the output of this program to be unrestricted.
+ Use it as you will.
+"""
+
+import sys
+import argparse
+
+
+def main(argv):
+ parser = argparse.ArgumentParser(
+ description="Generate test cases for case mapping from Unicode data")
+ parser.add_argument("UNICODE-VERSION")
+ parser.add_argument("UnicodeData.txt")
+ parser.add_argument("SpecialCasing.txt")
+ args = parser.parse_args(argv[1:])
+ version = getattr(args, "UNICODE-VERSION")
+ filename_udata = getattr(args, "UnicodeData.txt")
+ filename_casing = getattr(args, "SpecialCasing.txt")
+
+ # Names of fields in Unicode data table.
+ CODE, NAME, CATEGORY, COMBINING_CLASSES, BIDI_CATEGORY, DECOMPOSITION, \
+ DECIMAL_VALUE, DIGIT_VALUE, NUMERIC_VALUE, MIRRORED, OLD_NAME, \
+ COMMENT, UPPER, LOWER, TITLE = range(15)
+
+ # Names of fields in the SpecialCasing table
+ CASE_CODE, CASE_LOWER, CASE_TITLE, CASE_UPPER, CASE_CONDITION = range(5)
+
+ upper = {}
+ title = {}
+ lower = {}
+
+ def make_hex(codes):
+ """Converts a string of white space separated code points encoded as
+ hex values to a Unicode string. Any extra white space is ignored.
+ """
+ return "".join([chr(int(c, 16)) for c in codes.split()])
+
+ def process_one(code, fields):
+ type_ = fields[CATEGORY]
+ if type_ == "Ll":
+ upper[code] = make_hex(fields[UPPER])
+ lower[code] = chr(code)
+ title[code] = make_hex(fields[TITLE])
+ elif type_ == "Lu":
+ lower[code] = make_hex(fields[LOWER])
+ upper[code] = chr(code)
+ title[code] = make_hex(fields[TITLE])
+ elif type_ == "Lt":
+ upper[code] = make_hex(fields[UPPER])
+ lower[code] = make_hex(fields[LOWER])
+ title[code] = make_hex(fields[LOWER])
+
+ with open(filename_udata, encoding="utf-8") as fileobj:
+ last_code = -1
+ for line in fileobj:
+ line = line.strip()
+ fields = [f.strip() for f in line.split(";")]
+ if len(fields) != 15:
+ raise SystemExit(
+ "Entry for %s has wrong number of fields (%d)" % (
+ fields[CODE], len(fields)))
+
+ code = int(fields[CODE], 16)
+
+ if code > last_code + 1:
+ # Found a gap
+ if fields[NAME].endswith("Last>"):
+ # Fill the gap with the last character read,
+ # since this was a range specified in the char database
+ gfields = fields
+ else:
+ # The gap represents undefined characters. Only the type
+ # matters.
+ gfields = ['', '', 'Cn', '0', '', '', '', '', '', '', '',
+ '', '', '', '']
+
+ last_code += 1
+ while last_code < code:
+ gfields[CODE] = "%04x" % last_code
+ process_one(last_code, gfields)
+ last_code += 1
+
+ process_one(code, fields)
+ last_code = code
+
+ with open(filename_casing, encoding="utf-8") as fileobj:
+ last_code = -1
+ for line in fileobj:
+ # strip comments and skip empty lines
+ line = line.split("#", 1)[0].strip()
+ if not line:
+ continue
+
+ # all lines end with ";" so just remove it
+ line = line.rstrip(";").rstrip()
+ fields = [f.strip() for f in line.split(";")]
+ if len(fields) not in (4, 5):
+ raise SystemExit(
+ "Entry for %s has wrong number of fields (%d)" % (
+ fields[CASE_CODE], len(fields)))
+
+ if len(fields) == 5:
+ # Ignore conditional special cases - we'll handle them manually
+ continue
+
+ code = int(fields[CASE_CODE], 16)
+
+ upper[code] = make_hex(fields[CASE_UPPER])
+ lower[code] = make_hex(fields[CASE_LOWER])
+ title[code] = make_hex(fields[CASE_TITLE])
+
+ print_tests(version, upper, title, lower)
+
+
+def print_tests(version, upper, title, lower):
+ print("""\
+# Test cases generated from Unicode {} data
+# by gen-casemap-txt.py. Do not edit.
+#
+# Some special hand crafted tests
+#
+tr_TR\ti\ti\t\u0130\t\u0130\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
+tr_TR\tI\t\u0131\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
+tr_TR\tI\u0307\ti\tI\u0307\tI\u0307\t# I => LATIN SMALL LETTER DOTLESS I
+tr_TR.UTF-8\ti\ti\t\u0130\t\u0130\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
+tr_TR.UTF-8\tI\t\u0131\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
+tr_TR.UTF-8\tI\u0307\ti\tI\u0307\tI\u0307\t# I => LATIN SMALL LETTER DOTLESS I
+# Test reordering of YPOGEGRAMMENI across other accents
+\t\u03b1\u0345\u0314\t\u03b1\u0345\u0314\t\u0391\u0345\u0314\t\u0391\u0314\u0399\t
+\t\u03b1\u0314\u0345\t\u03b1\u0314\u0345\t\u0391\u0314\u0345\t\u0391\u0314\u0399\t
+# Handling of final and nonfinal sigma
+\tΜΆΙΟΣ μάιος Μάιος ΜΆΙΟΣ
+\tΜΆΙΟΣ μάιος Μάιος ΜΆΙΟΣ
+\tΣΙΓΜΑ σιγμα Σιγμα ΣΙΓΜΑ
+# Lithuanian rule of i followed by letter with dot. Not at all sure
+# about the titlecase part here
+lt_LT\ti\u0117\ti\u0117\tIe\tIE\t
+lt_LT\tie\u0307\tie\u0307\tIe\tIE\t
+lt_LT\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE
+lt_LT\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE
+lt_LT\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE
+lt_LT\tI\u0301\ti\u0307\u0301\tI\u0301\tI\u0301\t # LATIN CAPITAL LETTER I (with acute accent)
+lt_LT\tI\u0300\ti\u0307\u0300\tI\u0300\tI\u0300\t # LATIN CAPITAL LETTER I (with grave accent)
+lt_LT\tI\u0303\ti\u0307\u0303\tI\u0303\tI\u0303\t # LATIN CAPITAL LETTER I (with tilde above)
+lt_LT\tI\u0328\u0301\ti\u0307\u0328\u0301\tI\u0328\u0301\tI\u0328\u0301\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
+lt_LT\tJ\u0301\tj\u0307\u0301\tJ\u0301\tJ\u0301\t # LATIN CAPITAL LETTER J (with acute accent)
+lt_LT\t\u012e\u0301\t\u012f\u0307\u0301\t\u012e\u0301\t\u012e\u0301\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
+lt_LT.UTF-8\ti\u0117\ti\u0117\tIe\tIE\t
+lt_LT.UTF-8\tie\u0307\tie\u0307\tIe\tIE\t
+lt_LT.UTF-8\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE
+lt_LT.UTF-8\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE
+lt_LT.UTF-8\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE
+lt_LT.UTF-8\tI\u0301\ti\u0307\u0301\tI\u0301\tI\u0301\t # LATIN CAPITAL LETTER I (with acute accent)
+lt_LT.UTF-8\tI\u0300\ti\u0307\u0300\tI\u0300\tI\u0300\t # LATIN CAPITAL LETTER I (with grave accent)
+lt_LT.UTF-8\tI\u0303\ti\u0307\u0303\tI\u0303\tI\u0303\t # LATIN CAPITAL LETTER I (with tilde above)
+lt_LT.UTF-8\tI\u0328\u0301\ti\u0307\u0328\u0301\tI\u0328\u0301\tI\u0328\u0301\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
+lt_LT.UTF-8\tJ\u0301\tj\u0307\u0301\tJ\u0301\tJ\u0301\t # LATIN CAPITAL LETTER J (with acute accent)
+lt_LT.UTF-8\t\u012e\u0301\t\u012f\u0307\u0301\t\u012e\u0301\t\u012e\u0301\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
+# Special case not at initial position
+\ta\ufb04\ta\ufb04\tAffl\tAFFL\t# FB04
+#
+# Now the automatic tests
+#""".format(version))
+
+ for i in range(0x10ffff):
+ if i == 0x3A3:
+ # Greek sigma needs special tests
+ continue
+
+ up = upper.get(i, "")
+ lo = lower.get(i, "")
+ ti = title.get(i, "")
+
+ if any([up, lo, ti]):
+ print("\t%s\t%s\t%s\t%s\t# %4X" % (chr(i), lo, ti, up, i))
+
+
+if __name__ == "__main__":
+ sys.exit(main(sys.argv))
diff --git a/tests/unicode-caseconv.c b/tests/unicode-caseconv.c
index affb55888..c124633d1 100644
--- a/tests/unicode-caseconv.c
+++ b/tests/unicode-caseconv.c
@@ -57,7 +57,7 @@ int main (int argc, char **argv)
test = strings[1];
- /* gen-casemap-txt.pl uses an empty string when a single character
+ /* gen-casemap-txt.py uses an empty string when a single character
* doesn't have an equivalent in a particular case; since that behavior
* is nonsense for multicharacter strings, it would make more sense
* to put the expected result .. the original character unchanged. But
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]