Objects/unicodeobject.c (part 3)
Source:
cpython 3.14 @ ab2d84fe1023/Objects/unicodeobject.c
This annotation covers string manipulation methods. See parts 1-2 for encoding/decoding, internal representation (compact ASCII / latin1 / UCS2 / UCS4), and find/index.
Map
| Lines | Symbol | Role |
|---|---|---|
| 1-300 | unicode_split, unicode_rsplit | Split on whitespace or substring |
| 301-500 | unicode_splitlines | Split on line boundaries (\n, \r\n, \r, etc.) |
| 501-700 | unicode_join | sep.join(iterable) — collect and concatenate |
| 701-900 | unicode_strip, unicode_lstrip, unicode_rstrip | Strip whitespace or characters |
| 901-1100 | unicode_center, unicode_ljust, unicode_rjust | Pad to width with fill character |
| 1101-1200 | unicode_zfill | Pad with leading zeros, preserve sign |
| 1201-1400 | unicode_translate | str.translate(table) — character-by-character substitution |
| 1401-1600 | unicode_maketrans | Build translation table for translate() |
Reading
split
// CPython: Objects/unicodeobject.c:88 unicode_split
/* Entry point for str.split: dispatches to whitespace splitting when the
 * separator is None, otherwise to splitting on the given separator.
 *
 * NOTE(review): abridged excerpt. `substring` and `maxsplit` are not
 * declared in the lines shown; presumably they are parsed from args/kwds
 * in elided code -- confirm against upstream.
 */
static PyObject *
unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
{
if (substring == Py_None) {
/* Split on any whitespace, collapsing runs */
return split_whitespace(self, maxsplit);
}
/* Split on specific separator (no collapsing) */
return split(self, substring, maxsplit);
}
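' a b '.split() → ['a', 'b'] (runs of whitespace are collapsed and leading/trailing whitespace is dropped). 'a,,b'.split(',') → ['a', '', 'b'] (the separator itself is removed; the empty string between two adjacent separators is kept).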
join
// CPython: Objects/unicodeobject.c:550 unicode_join
/* sep.join(iterable): materialise the iterable as a fast sequence, handle
 * the 0- and 1-element cases directly, otherwise measure the result and
 * then fill it.
 *
 * NOTE(review): abridged excerpt. `item`, `i`, `seplen`, `maxchar` and `res`
 * are not declared in the lines shown.
 */
static PyObject *
unicode_join(PyObject *self, PyObject *iterable)
{
/* NOTE(review): `seq` is used on the next line without a NULL check in this
 * excerpt -- confirm how upstream handles a failing PySequence_Fast. */
PyObject *seq = PySequence_Fast(iterable, "can only join an iterable");
Py_ssize_t seqlen = PySequence_Fast_GET_SIZE(seq);
/* NOTE(review): neither early return below releases `seq` in the lines
 * shown -- confirm whether upstream drops the reference here. */
if (seqlen == 0) return PyUnicode_New(0, 0);
if (seqlen == 1) {
item = PySequence_Fast_GET_ITEM(seq, 0);
return PyUnicode_FromObject(item);
}
/* Two passes: measure total length, then fill */
...
/* Pass 1: sum the item lengths, adding one separator length between each
 * pair of neighbours (none after the last item).
 * NOTE(review): no overflow guard on `sz` and no type check on `item` are
 * visible here; `maxchar` is not computed in the lines shown either --
 * presumably all of that lives in the elided code. */
Py_ssize_t sz = 0;
for (i = 0; i < seqlen; i++) {
item = PySequence_Fast_GET_ITEM(seq, i);
sz += PyUnicode_GET_LENGTH(item);
if (i < seqlen - 1) sz += seplen;
}
res = PyUnicode_New(sz, maxchar);
/* Fill res with items interleaved with sep */
...
return res;
}
Two-pass approach: first scan to compute total length and maximum character (determines internal kind), then fill.
strip
// CPython: Objects/unicodeobject.c:740 unicode_strip
/* Entry point for str.strip: strips from both ends (BOTHSTRIP), passing
 * NULL as the character set when no usable `chars` argument was given.
 *
 * NOTE(review): abridged excerpt. `chars` is not declared in the lines
 * shown; presumably it is parsed from `args` in elided code.
 */
static PyObject *
unicode_strip(PyObject *self, PyObject *args)
{
if (chars == Py_None || chars == NULL) {
/* Strip Unicode whitespace */
return _PyUnicode_XStrip(self, BOTHSTRIP, NULL);
}
return _PyUnicode_XStrip(self, BOTHSTRIP, chars);
}
/* Shared strip helper: advances `lpos` from the left and/or retreats `rpos`
 * from the right, depending on `striptype`, while the boundary character is
 * whitespace.
 *
 * NOTE(review): abridged excerpt.
 *   - `sepobj` is never referenced in the lines shown, so only the
 *     whitespace case is illustrated; how a non-NULL character set is
 *     honoured is not visible here.
 *   - `len` is not declared in the lines shown.
 *   - `buf` is neither checked for NULL nor released in the lines shown.
 */
static PyObject *
_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
{
Py_UCS4 *buf = PyUnicode_AsUCS4Copy(self);
Py_ssize_t lpos = 0, rpos = len;
/* Scan from left */
/* Skipped only for RIGHTSTRIP, i.e. runs for LEFTSTRIP and BOTHSTRIP. */
if (striptype != RIGHTSTRIP) {
while (lpos < rpos && Py_UNICODE_ISSPACE(buf[lpos])) lpos++;
}
/* Scan from right */
/* Skipped only for LEFTSTRIP; `rpos` is one past the last kept character,
 * hence the `rpos-1` index. */
if (striptype != LEFTSTRIP) {
while (rpos > lpos && Py_UNICODE_ISSPACE(buf[rpos-1])) rpos--;
}
/* Presumably the elided code returns the slice [lpos, rpos) -- confirm. */
...
}
translate
// CPython: Objects/unicodeobject.c:1260 unicode_translate
/* str.translate(table): rewrite `self` one code point at a time using the
 * mapping in `table`. The body is elided in this excerpt; the comment below
 * summarises the lookup contract.
 */
static PyObject *
unicode_translate(PyObject *self, PyObject *table)
{
/* For each code point, look up table[cp]:
* None → delete the character
* int → replace with that code point
* str → replace with that string (may be more than one character)
* LookupError (e.g. KeyError from a dict, IndexError from a sequence)
*   → keep original character
*/
...
}
str.translate rewrites a string one code point at a time using a lookup table. str.maketrans builds that table: a dict whose keys are code points (the ord() of each source character).
zfill
// CPython: Objects/unicodeobject.c:1160 unicode_zfill
/* str.zfill(width): left-pad with '0' up to `width`, keeping a leading
 * '+' or '-' in front of the zeros.
 *
 * NOTE(review): abridged excerpt. `width`, `len`, `sign`, `maxchar`, `kind`
 * and `data` are not declared in the lines shown; presumably `sign` is the
 * first character of `self` and `kind`/`data` describe the new string `u`
 * -- confirm against upstream.
 */
static PyObject *
unicode_zfill(PyObject *self, PyObject *args)
{
/* Pad with '0', but preserve leading '+' or '-' sign */
Py_ssize_t fill = width - len;
/* Already wide enough: hand back `self` with a new reference. */
if (fill <= 0) return Py_NewRef(self);
/* NOTE(review): `u` is not checked for NULL before the writes below in the
 * lines shown. */
PyObject *u = PyUnicode_New(width, maxchar);
if (sign == '+' || sign == '-') {
/* Insert sign first, then zeros */
PyUnicode_WRITE(kind, data, 0, sign);
/* NOTE(review): `data + kind` presumably skips one character's worth of
 * storage (the sign just written) -- confirm the unit of `kind`. */
PyUnicode_FILL(kind, data + kind, '0', fill);
/* Copy everything after the sign to just past the zeros. */
PyUnicode_CopyCharacters(u, 1 + fill, self, 1, len - 1);
} else {
/* No sign: zeros first, then the whole original string. */
PyUnicode_FILL(kind, data, '0', fill);
PyUnicode_CopyCharacters(u, fill, self, 0, len);
}
return u;
}
'-5'.zfill(4) → '-005' (sign preserved before zeros).
gopy notes
All string methods are in objects/str.go. split uses Go's strings.SplitN. join uses strings.Builder with a two-pass length calculation. strip uses strings.TrimFunc with Unicode whitespace check. translate iterates runes with a map lookup. zfill inserts zeros after the optional sign character.