Skip to main content

Objects/unicodeobject.c (part 3)

Source:

cpython 3.14 @ ab2d84fe1023/Objects/unicodeobject.c

This annotation covers string manipulation methods. See parts 1-2 for encoding/decoding, internal representation (compact ASCII / latin1 / UCS2 / UCS4), and find/index.

Map

| Lines | Symbol | Role |
|---|---|---|
| 1-300 | unicode_split, unicode_rsplit | Split on whitespace or substring |
| 301-500 | unicode_splitlines | Split on line boundaries (\n, \r\n, \r, etc.) |
| 501-700 | unicode_join | sep.join(iterable) — collect and concatenate |
| 701-900 | unicode_strip, unicode_lstrip, unicode_rstrip | Strip whitespace or characters |
| 901-1100 | unicode_center, unicode_ljust, unicode_rjust | Pad to width with fill character |
| 1101-1200 | unicode_zfill | Pad with leading zeros, preserve sign |
| 1201-1400 | unicode_translate | str.translate(table) — character-by-character substitution |
| 1401-1600 | unicode_maketrans | Build translation table for translate() |

Reading

split

// CPython: Objects/unicodeobject.c:88 unicode_split
/* Condensed excerpt: chooses between the two split strategies.
 * `substring` and `maxsplit` are not declared in the lines shown;
 * presumably they are parsed from args/kwds in an omitted part --
 * confirm against the upstream source. */
static PyObject *
unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
{
if (substring == Py_None) {
/* No separator supplied: split on any whitespace, collapsing runs */
return split_whitespace(self, maxsplit);
}
/* Explicit separator: split on it; adjacent occurrences are not collapsed */
return split(self, substring, maxsplit);
}

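' a b '.split() → ['a', 'b'] (runs of whitespace are collapsed and leading/trailing whitespace is dropped). 'a,,b'.split(',') → ['a', '', 'b'] (adjacent separators are not collapsed, so the empty field between them is kept).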

join

// CPython: Objects/unicodeobject.c:550 unicode_join
/* Condensed excerpt of sep.join(iterable): materialise the iterable as a
 * fast sequence, short-circuit the 0- and 1-item cases, then measure and
 * fill the result.
 * `item`, `i`, `res`, `seplen` and `maxchar` are declared/computed in parts
 * that are not shown here. */
static PyObject *
unicode_join(PyObject *self, PyObject *iterable)
{
/* NOTE(review): in the lines shown, seq is used without a NULL check and
 * the early returns below do not release it. Presumably the error check
 * and cleanup were dropped when the excerpt was condensed -- confirm
 * against the upstream source before relying on this listing. */
PyObject *seq = PySequence_Fast(iterable, "can only join an iterable");
Py_ssize_t seqlen = PySequence_Fast_GET_SIZE(seq);
/* No items: the result is an empty string */
if (seqlen == 0) return PyUnicode_New(0, 0);
/* Exactly one item: the separator is never needed */
if (seqlen == 1) {
item = PySequence_Fast_GET_ITEM(seq, 0);
return PyUnicode_FromObject(item);
}
/* Two passes: measure total length, then fill */
...
/* Pass 1: sum the item lengths plus one separator between each
 * adjacent pair (hence no separator after the last item) */
Py_ssize_t sz = 0;
for (i = 0; i < seqlen; i++) {
item = PySequence_Fast_GET_ITEM(seq, i);
sz += PyUnicode_GET_LENGTH(item);
if (i < seqlen - 1) sz += seplen;
}
/* Allocate the result once, sized by sz; maxchar comes from elided code */
res = PyUnicode_New(sz, maxchar);
/* Pass 2: fill res with items interleaved with sep */
...
return res;
}

Two-pass approach: first scan to compute total length and maximum character (determines internal kind), then fill.

strip

// CPython: Objects/unicodeobject.c:740 unicode_strip
/* Condensed excerpt of str.strip([chars]). Both branches delegate to
 * _PyUnicode_XStrip with BOTHSTRIP; they differ only in the last argument.
 * `chars` is not declared in the lines shown; presumably it is parsed from
 * args in an omitted part -- confirm against the upstream source. */
static PyObject *
unicode_strip(PyObject *self, PyObject *args)
{
if (chars == Py_None || chars == NULL) {
/* No chars given (absent or None): strip Unicode whitespace */
return _PyUnicode_XStrip(self, BOTHSTRIP, NULL);
}
/* Explicit chars object: forwarded to the helper as sepobj */
return _PyUnicode_XStrip(self, BOTHSTRIP, chars);
}

/* Condensed excerpt of the shared strip helper. striptype selects which
 * ends are scanned: the left scan runs unless it is RIGHTSTRIP, the right
 * scan runs unless it is LEFTSTRIP.
 * `len` is not declared in the lines shown.
 * NOTE(review): the visible loops only test Py_UNICODE_ISSPACE and never
 * consult sepobj, and buf is not released in the lines shown. Presumably
 * the explicit-character path, building the result and the cleanup are in
 * the elided part -- confirm against the upstream source. */
static PyObject *
_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
{
Py_UCS4 *buf = PyUnicode_AsUCS4Copy(self);
/* [lpos, rpos) is the half-open window of characters still kept */
Py_ssize_t lpos = 0, rpos = len;
/* Scan from left: advance lpos past leading whitespace */
if (striptype != RIGHTSTRIP) {
while (lpos < rpos && Py_UNICODE_ISSPACE(buf[lpos])) lpos++;
}
/* Scan from right: pull rpos back past trailing whitespace,
 * never crossing lpos */
if (striptype != LEFTSTRIP) {
while (rpos > lpos && Py_UNICODE_ISSPACE(buf[rpos-1])) rpos--;
}
...
}

translate

// CPython: Objects/unicodeobject.c:1260 unicode_translate
/* Condensed excerpt of str.translate(table). The implementation is elided
 * entirely; the comment in the body summarises how each table lookup
 * result is interpreted. */
static PyObject *
unicode_translate(PyObject *self, PyObject *table)
{
/* For each code point, look up table[cp]:
* None → delete the character
* int → replace with that code point
* str → replace with that string
* KeyError → keep original character
*/
...
}

str.translate is used for fast character-by-character rewriting. str.maketrans builds the dict that translate() consults; its keys are code points (the ord() value of each character).

zfill

// CPython: Objects/unicodeobject.c:1160 unicode_zfill
/* Condensed excerpt of str.zfill(width): left-pad with '0' up to width,
 * keeping a leading sign in front of the zeros.
 * `width`, `len`, `sign`, `maxchar`, `kind` and `data` are not declared in
 * the lines shown; presumably `kind`/`data` describe the new string u and
 * `sign` is the first character of self -- confirm against the upstream
 * source. */
static PyObject *
unicode_zfill(PyObject *self, PyObject *args)
{
/* Pad with '0', but preserve leading '+' or '-' sign */
/* fill = number of zeros that must be inserted */
Py_ssize_t fill = width - len;
/* Already at least width characters: hand back self via Py_NewRef */
if (fill <= 0) return Py_NewRef(self);
/* NOTE(review): u is used below without a NULL check in the lines shown */
PyObject *u = PyUnicode_New(width, maxchar);
if (sign == '+' || sign == '-') {
/* Insert sign first, then zeros */
PyUnicode_WRITE(kind, data, 0, sign);
/* data + kind: presumably skips the one slot holding the sign -- confirm */
PyUnicode_FILL(kind, data + kind, '0', fill);
/* Copy the rest of self (everything after its sign) behind the zeros */
PyUnicode_CopyCharacters(u, 1 + fill, self, 1, len - 1);
} else {
/* No sign: zeros first, then all of self */
PyUnicode_FILL(kind, data, '0', fill);
PyUnicode_CopyCharacters(u, fill, self, 0, len);
}
return u;
}

'-5'.zfill(4) → '-005' (the sign stays in front and the zeros are inserted after it).

gopy notes

All string methods are in objects/str.go. split uses Go's strings.SplitN. join uses strings.Builder with a two-pass length calculation. strip uses strings.TrimFunc with Unicode whitespace check. translate iterates runes with a map lookup. zfill inserts zeros after the optional sign character.