Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 50 additions & 14 deletions makeunicodedata.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,19 +99,21 @@
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000

# these ranges need to match unicodedata.c:is_unified_ideograph
# CJK Unified Ideograph ranges.
# makeunicodecjk() generates unicodedata_cjk.h from these, which is
# included by unicodedata.c (is_unified_ideograph function).
cjk_ranges = [
('3400', '4DBF'), # CJK Ideograph Extension A
('4E00', '9FFF'), # CJK Ideograph
('20000', '2A6DF'), # CJK Ideograph Extension B
('2A700', '2B73F'), # CJK Ideograph Extension C
('2B740', '2B81D'), # CJK Ideograph Extension D
('2B820', '2CEAD'), # CJK Ideograph Extension E
('2CEB0', '2EBE0'), # CJK Ideograph Extension F
('2EBF0', '2EE5D'), # CJK Ideograph Extension I
('30000', '3134A'), # CJK Ideograph Extension G
('31350', '323AF'), # CJK Ideograph Extension H
('323B0', '33479'), # CJK Ideograph Extension J
('3400', '4DBF', 'CJK Ideograph Extension A'),
('4E00', '9FFF', 'CJK Ideograph'),
('20000', '2A6DF', 'CJK Ideograph Extension B'),
('2A700', '2B73F', 'CJK Ideograph Extension C'),
('2B740', '2B81D', 'CJK Ideograph Extension D'),
('2B820', '2CEAD', 'CJK Ideograph Extension E'),
('2CEB0', '2EBE0', 'CJK Ideograph Extension F'),
('2EBF0', '2EE5D', 'CJK Ideograph Extension I'),
('30000', '3134A', 'CJK Ideograph Extension G'),
('31350', '323AF', 'CJK Ideograph Extension H'),
('323B0', '33479', 'CJK Ideograph Extension J'),
]


Expand All @@ -129,11 +131,43 @@ def maketables(trace=0):
print(len(list(filter(None, old_unicode.table))), "characters")
merge_old_version(version, unicode, old_unicode)

makeunicodecjk(trace)
makeunicodename(unicode, trace)
makeunicodedata(unicode, trace)
makeunicodetype(unicode, trace)


# --------------------------------------------------------------------
# CJK Unified Ideograph ranges (is_unified_ideograph function)

def makeunicodecjk(trace):
    """Write the C header that defines is_unified_ideograph().

    One inclusive range comparison is emitted per entry of the
    module-level cjk_ranges list.  The comparisons are chained with
    '||', and the line for the final entry terminates the C return
    statement with ';'.  The *trace* argument is accepted but not used
    by this function.
    """

    FILE = "unicodedata2/unicodedata_cjk.h"

    print("--- Preparing", FILE, "...")

    # Index of the final range; every earlier line continues the C
    # expression, this one closes it.
    last = len(cjk_ranges) - 1

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)
        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint("static int")
        fprint("is_unified_ideograph(Py_UCS4 code)")
        fprint("{")
        fprint(" return")
        for index, (first, final, label) in enumerate(cjk_ranges):
            if index < last:
                template = " (0x%X <= code && code <= 0x%X) || /* %s */"
            else:
                template = " (0x%X <= code && code <= 0x%X); /* %s */"
            # Bounds are stored as hex strings; parse them so that %X
            # re-emits them as C hex literals.
            fprint(template % (int(first, 16), int(final, 16), label))
        fprint("}")

    print(len(cjk_ranges), "CJK ranges")


# --------------------------------------------------------------------
# unicode character properties

Expand Down Expand Up @@ -1056,8 +1090,10 @@ def __init__(self, version, cjk_check=True):
field = None
elif field:
table[i] = from_row(('%X' % i,) + field[1:])
if cjk_check and cjk_ranges != cjk_ranges_found:
raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
if cjk_check:
expected = [(s, e) for s, e, _ in cjk_ranges]
if expected != cjk_ranges_found:
raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)

# public attributes
self.filename = UNICODE_DATA % ''
Expand Down
27 changes: 26 additions & 1 deletion tests/test_unicodedata2.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):

# Update this if the database changes. Make sure to do a full rebuild
# (e.g. 'make distclean && make') to get the correct checksum.
expectedchecksum = '2cf81cbeaa7cbc8f1ace57dd6c56d1f30f1a2de1'
expectedchecksum = '65670ae03a324c5f9e826a4de3e25bae4d73c9b7'

def test_function_checksum(self):
import unicodedata2
Expand Down Expand Up @@ -186,6 +186,31 @@ def test_issue29456(self):
self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b)


def test_cjk_unified_ideograph_names(self):
    # Probe the first and last code point of every CJK Unified
    # Ideograph range: name() must return the algorithmic
    # "CJK UNIFIED IDEOGRAPH-<hex>" name and lookup() must map that
    # name back to the same character.  The table below has to stay
    # in sync with cjk_ranges in makeunicodedata.py.
    blocks = (
        (0x3400, 0x4DBF),    # CJK Ideograph Extension A
        (0x4E00, 0x9FFF),    # CJK Ideograph
        (0x20000, 0x2A6DF),  # CJK Ideograph Extension B
        (0x2A700, 0x2B73F),  # CJK Ideograph Extension C
        (0x2B740, 0x2B81D),  # CJK Ideograph Extension D
        (0x2B820, 0x2CEAD),  # CJK Ideograph Extension E
        (0x2CEB0, 0x2EBE0),  # CJK Ideograph Extension F
        (0x2EBF0, 0x2EE5D),  # CJK Ideograph Extension I
        (0x30000, 0x3134A),  # CJK Ideograph Extension G
        (0x31350, 0x323AF),  # CJK Ideograph Extension H
        (0x323B0, 0x33479),  # CJK Ideograph Extension J
    )
    # Flatten to start, end, start, end, ... so both boundaries of
    # each range are checked in order.
    boundaries = [codepoint for low, high in blocks
                  for codepoint in (low, high)]
    for codepoint in boundaries:
        wanted = "CJK UNIFIED IDEOGRAPH-%X" % codepoint
        ideograph = chr(codepoint)
        self.assertEqual(self.db.name(ideograph), wanted)
        self.assertEqual(self.db.lookup(wanted), ideograph)

def test_east_asian_width(self):
eaw = self.db.east_asian_width
self.assertRaises(TypeError, eaw, b'a')
Expand Down
18 changes: 2 additions & 16 deletions unicodedata2/unicodedata.c
Original file line number Diff line number Diff line change
Expand Up @@ -929,22 +929,8 @@ static const char * const hangul_syllables[][3] = {
{ 0, 0, "H" }
};

/* These ranges need to match makeunicodedata.py:cjk_ranges. */
static int
is_unified_ideograph(Py_UCS4 code)
{
return
(0x3400 <= code && code <= 0x4DBF) || /* CJK Ideograph Extension A */
(0x4E00 <= code && code <= 0x9FFF) || /* CJK Ideograph */
(0x20000 <= code && code <= 0x2A6DF) || /* CJK Ideograph Extension B */
(0x2A700 <= code && code <= 0x2B739) || /* CJK Ideograph Extension C */
(0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
(0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
(0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */
(0x2EBF0 <= code && code <= 0x2EE5D) || /* CJK Ideograph Extension I */
(0x30000 <= code && code <= 0x3134A) || /* CJK Ideograph Extension G */
(0x31350 <= code && code <= 0x323AF); /* CJK Ideograph Extension H */
}
/* CJK Unified Ideograph ranges, generated by makeunicodedata.py */
#include "unicodedata_cjk.h"

/* macros used to determine if the given code point is in the PUA range that
* we are using to store aliases and named sequences */
Expand Down
18 changes: 18 additions & 0 deletions unicodedata2/unicodedata_cjk.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
/* this file was generated by makeunicodedata.py 3.3 */

/* Return nonzero when `code` lies inside one of the CJK Unified Ideograph
 * ranges listed below (both bounds inclusive), and zero otherwise.
 *
 * This function is emitted by makeunicodedata.py from its cjk_ranges
 * table; to change a range, edit that table and regenerate this header
 * instead of editing the comparisons here. */
static int
is_unified_ideograph(Py_UCS4 code)
{
return
(0x3400 <= code && code <= 0x4DBF) || /* CJK Ideograph Extension A */
(0x4E00 <= code && code <= 0x9FFF) || /* CJK Ideograph */
(0x20000 <= code && code <= 0x2A6DF) || /* CJK Ideograph Extension B */
(0x2A700 <= code && code <= 0x2B73F) || /* CJK Ideograph Extension C */
(0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
(0x2B820 <= code && code <= 0x2CEAD) || /* CJK Ideograph Extension E */
(0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */
(0x2EBF0 <= code && code <= 0x2EE5D) || /* CJK Ideograph Extension I */
(0x30000 <= code && code <= 0x3134A) || /* CJK Ideograph Extension G */
(0x31350 <= code && code <= 0x323AF) || /* CJK Ideograph Extension H */
(0x323B0 <= code && code <= 0x33479); /* CJK Ideograph Extension J */
}
Loading