Skip to main content

Modules/unicodedata.c

Source:

cpython 3.14 @ ab2d84fe1023/Modules/unicodedata.c

unicodedata provides access to the Unicode Character Database (UCD). The data tables are generated from UCD files and compiled into the interpreter.

Map

| Lines | Symbol | Role |
|---|---|---|
| 1-80 | Data tables | _PyUnicode_Database, decomp table, combining class |
| 81-200 | unicodedata_normalize_impl | NFC/NFD/NFKC/NFKD normalization |
| 201-350 | unicodedata_name_impl | Return Unicode name for a code point |
| 351-500 | unicodedata_lookup_impl | Look up a code point by name |
| 501-650 | Property accessors | category, bidirectional, combining, east_asian_width |
| 651-800 | unicodedata_is* | is_normalized, is_decimal, is_digit, is_numeric |
| 801-1200 | UCD class | Version-specific UCD object for 3.2/4.0/5.0 etc. |

Reading

Normalization

// CPython: Modules/unicodedata.c:120 unicodedata_normalize_impl
/* Normalize `input` to the form named by `form`.
 *
 * `form` is compared case-sensitively against "NFC", "NFKC", "NFD" and
 * "NFKD".  The composing forms are handled by nfc_nfkc() and the
 * decomposing forms by nfd_nfkd(); the last argument is 1 for the "K"
 * variants and 0 otherwise.
 *
 * Returns the result of the selected helper, or NULL with ValueError set
 * when `form` is not one of the four names.  An empty `input` is returned
 * unchanged as a new reference, before `form` is even examined. */
static PyObject *
unicodedata_normalize_impl(PyObject *self, const char *form, PyObject *input)
{
    PyObject *normalized;

    if (PyUnicode_GET_LENGTH(input) == 0) {
        /* An empty string is already normalized in every form. */
        return Py_NewRef(input);
    }

    if (strcmp(form, "NFC") == 0) {
        normalized = nfc_nfkc(self, input, 0);
    }
    else if (strcmp(form, "NFKC") == 0) {
        normalized = nfc_nfkc(self, input, 1);
    }
    else if (strcmp(form, "NFD") == 0) {
        normalized = nfd_nfkd(self, input, 0);
    }
    else if (strcmp(form, "NFKD") == 0) {
        normalized = nfd_nfkd(self, input, 1);
    }
    else {
        PyErr_SetString(PyExc_ValueError, "invalid normalization form");
        normalized = NULL;
    }
    return normalized;
}

NFD decomposes characters (e.g. é → e + combining acute accent). NFC re-composes canonical equivalents. NFKC/NFKD additionally apply compatibility decompositions.

nfd_nfkd core

// CPython: Modules/unicodedata.c:200 nfd_nfkd
/* Decompose `input` into NFD or NFKD form.
 *
 * `k` selects the kind of decomposition: unicodedata_normalize_impl passes
 * 0 for "NFD" and 1 for "NFKD".  The body is elided in this excerpt; the
 * comments inside only outline its two phases. */
static PyObject *
nfd_nfkd(PyObject *self, PyObject *input, int k)
{
/* Step 1: Replace each code point by its canonical decomposition, or by
   its compatibility decomposition (presumably when k is non-zero). */
/* Step 2: Sort runs of combining characters by their
   Canonical_Combining_Class (CCC). */
/* The CCC determines visual stacking order; characters with the same
   class keep their original relative order. */
...
}

unicodedata_name_impl

// CPython: Modules/unicodedata.c:310 unicodedata_name_impl
/* Return the Unicode name of code point `chr` as a new string object.
 *
 * If _PyUnicode_GetName() reports no name for `chr`, `default_value` is
 * returned as a new reference when the caller supplied one; otherwise
 * ValueError("no such name") is set and NULL is returned. */
static PyObject *
unicodedata_name_impl(PyObject *self, int chr, PyObject *default_value)
{
    /* Room for the longest name plus the terminating NUL. */
    char buffer[NAME_MAXLEN + 1];

    if (_PyUnicode_GetName(chr, buffer, sizeof(buffer))) {
        return PyUnicode_FromString(buffer);
    }

    /* No name found: use the caller's default if there is one. */
    if (default_value == NULL) {
        PyErr_SetString(PyExc_ValueError, "no such name");
        return NULL;
    }
    return Py_NewRef(default_value);
}

unicodedata.name('\N{SNOWMAN}') returns 'SNOWMAN'. The name table is a compressed trie compiled from UnicodeData.txt.

unicodedata_lookup_impl

// CPython: Modules/unicodedata.c:390 unicodedata_lookup_impl
/* Look up a character by its Unicode name.
 *
 * `name` / `name_length` are handed to _PyUnicode_LookupName().  On
 * success the matching code point is returned as a one-character string;
 * otherwise KeyError is set, with the requested name in the message, and
 * NULL is returned. */
static PyObject *
unicodedata_lookup_impl(PyObject *self, const char *name, Py_ssize_t name_length)
{
    Py_UCS4 codepoint;

    if (_PyUnicode_LookupName(name, name_length, &codepoint)) {
        return PyUnicode_FromOrdinal(codepoint);
    }

    PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
    return NULL;
}

A '\N{SNOWMAN}' escape in a string literal is resolved through this same name lookup at compile time.

Category

// CPython: Modules/unicodedata.c:530 unicodedata_category_impl
/* Return the two-letter Unicode category abbreviation for `chr`.
 *
 * The result is one of:
 *   Lu, Ll, Lt, Lm, Lo,
 *   Mn, Mc, Me,
 *   Nd, Nl, No,
 *   Pc, Pd, Ps, Pe, Pi, Pf, Po,
 *   Sm, Sc, Sk, So,
 *   Zs, Zl, Zp,
 *   Cc, Cf, Cs, Co, Cn
 *
 * The value from _PyUnicode_ToCategory() is used directly as an index
 * into the _Py_CategoryNames table. */
static PyObject *
unicodedata_category_impl(PyObject *self, int chr)
{
    const int slot = (int)_PyUnicode_ToCategory(chr);
    const char *abbrev = _Py_CategoryNames[slot];

    return PyUnicode_FromString(abbrev);
}

gopy notes

unicodedata is in module/unicodedata/module.go. The UCD tables are compiled from Unicode 15 data files in module/unicodedata/tables.go (auto-generated). normalize uses Go's golang.org/x/text/unicode/norm package for NFC/NFD/NFKC/NFKD. name/lookup use a Go port of the CPython trie.