Modules/unicodedata.c

Source:

cpython 3.14 @ ab2d84fe1023/Modules/unicodedata.c

unicodedata provides access to the Unicode Character Database (UCD). The data tables are generated from UCD files and compiled into the interpreter.

Map

Lines	Symbol	Role
1-80	Data tables	`_PyUnicode_Database`, decomp table, combining class
81-200	`unicodedata_normalize_impl`	NFC/NFD/NFKC/NFKD normalization
201-350	`unicodedata_name_impl`	Return Unicode name for a code point
351-500	`unicodedata_lookup_impl`	Look up a code point by name
501-650	Property accessors	`category`, `bidirectional`, `combining`, `east_asian_width`
651-800	`unicodedata_is*`	`is_normalized`, `is_decimal`, `is_digit`, `is_numeric`
801-1200	`UCD` class	Version-specific UCD object for 3.2/4.0/5.0 etc.

Reading

Normalization

// CPython: Modules/unicodedata.c:120 unicodedata_normalize_impl
static PyObject *
unicodedata_normalize_impl(PyObject *self, const char *form, PyObject *input)
{
    /* Recognized form names.  `compose` routes the request to nfc_nfkc
       (otherwise nfd_nfkd); `k` is the flag forwarded to that worker:
       0 for NFC/NFD, 1 for NFKC/NFKD. */
    static const struct {
        const char *name;
        int compose;
        int k;
    } forms[] = {
        {"NFC",  1, 0},
        {"NFKC", 1, 1},
        {"NFD",  0, 0},
        {"NFKD", 0, 1},
    };

    /* An empty string is already normalized: return the same object with a
       new reference.  This shortcut is taken before the form name is
       validated. */
    if (PyUnicode_GET_LENGTH(input) == 0) {
        return Py_NewRef(input);
    }

    for (size_t i = 0; i < sizeof(forms) / sizeof(forms[0]); i++) {
        if (strcmp(form, forms[i].name) != 0) {
            continue;
        }
        if (forms[i].compose) {
            return nfc_nfkc(self, input, forms[i].k);
        }
        return nfd_nfkd(self, input, forms[i].k);
    }

    /* No table entry matched the requested form. */
    PyErr_SetString(PyExc_ValueError, "invalid normalization form");
    return NULL;
}

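NFD applies canonical decomposition (e.g. é → e + U+0301 COMBINING ACUTE ACCENT). NFC applies the same canonical decomposition and then re-composes the result into canonical composites. NFKD and NFKC do the same, respectively, but additionally apply compatibility decompositions.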

`nfd_nfkd` core

// CPython: Modules/unicodedata.c:200 nfd_nfkd
/* Shared worker for the two decomposing normalization forms.
 *
 * k selects the decomposition flavour: unicodedata_normalize_impl passes 0
 * for "NFD" (canonical only) and 1 for "NFKD" (canonical + compatibility).
 *
 * The body is elided in this excerpt; only its two phases are outlined below.
 * NOTE(review): presumably returns a new reference to the normalized string,
 * or NULL with an exception set -- confirm against the full source.
 */
static PyObject *
nfd_nfkd(PyObject *self, PyObject *input, int k)
{
    /* Step 1: Canonical/compatibility decompose each code point */
    /* Step 2: Sort combining characters by Canonical_Combining_Class */
    /* The CCC determines visual stacking order; same-class chars stay ordered
       (i.e. characters sharing a class keep their relative order) */
    ...
}

`unicodedata_name_impl`

// CPython: Modules/unicodedata.c:310 unicodedata_name_impl
static PyObject *
unicodedata_name_impl(PyObject *self, int chr, PyObject *default_value)
{
    /* Scratch space for the name plus its terminating NUL. */
    char buffer[NAME_MAXLEN + 1];

    /* Success path: the name was written into buffer. */
    if (_PyUnicode_GetName(chr, buffer, sizeof(buffer))) {
        return PyUnicode_FromString(buffer);
    }

    /* No name found and the caller supplied no fallback: raise ValueError. */
    if (default_value == NULL) {
        PyErr_SetString(PyExc_ValueError, "no such name");
        return NULL;
    }

    /* No name found: return the caller's fallback as a new reference. */
    return Py_NewRef(default_value);
}

unicodedata.name('\N{SNOWMAN}') returns 'SNOWMAN'. The name table is a compressed trie compiled from UnicodeData.txt.

`unicodedata_lookup_impl`

// CPython: Modules/unicodedata.c:390 unicodedata_lookup_impl
static PyObject *
unicodedata_lookup_impl(PyObject *self, const char *name, Py_ssize_t name_length)
{
    Py_UCS4 code_point;

    /* Success path: the name resolved, so return it as a one-character
       string. */
    if (_PyUnicode_LookupName(name, name_length, &code_point)) {
        return PyUnicode_FromOrdinal(code_point);
    }

    /* Unknown name: raise KeyError quoting the name that was requested. */
    PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
    return NULL;
}

'\N{SNOWMAN}' in string literals uses this lookup at compile time.

gopy notes

In gopy, `unicodedata` lives in `module/unicodedata/module.go`. The UCD tables are compiled from the Unicode 15 data files into `module/unicodedata/tables.go` (auto-generated). `normalize` uses Go's `golang.org/x/text/unicode/norm` package for NFC/NFD/NFKC/NFKD. `name` and `lookup` use a Go port of the CPython trie.

Map

Reading

Normalization

nfd_nfkd core

unicodedata_name_impl

unicodedata_lookup_impl

Category

gopy notes

Map