Skip to main content

Objects/unicodeobject.c (part 6)

Source:

cpython 3.14 @ ab2d84fe1023/Objects/unicodeobject.c

This annotation covers the string splitting and stripping methods. See previous objects_unicodeobject* annotations for construction, encoding, and str.format.

Map

| Lines | Symbol | Role |
|---|---|---|
| 1-100 | unicode_split | str.split(sep, maxsplit) |
| 101-200 | unicode_rsplit | str.rsplit(sep, maxsplit) — split from right |
| 201-350 | unicode_splitlines | Split on line endings (\n, \r, \r\n, \v, \f, etc.) |
| 351-500 | unicode_partition | str.partition(sep) → (before, sep, after) |
| 501-650 | unicode_rpartition | str.rpartition(sep) — last occurrence |
| 651-900 | unicode_strip | str.strip(chars) — strip leading and trailing |
| 901-1200 | _Py_unicode_Whitespace | Unicode whitespace character set for whitespace-split |

Reading

str.split

// CPython: Objects/unicodeobject.c:10280 unicode_split
/* str.split entry point (abridged excerpt: args/kwds are not parsed here,
 * so the separator and max count keep their defaults).
 *
 * Dispatches to split() when an explicit separator is present, otherwise
 * to split_whitespace(). Returns whatever the chosen helper returns.
 */
static PyObject *
unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
{
    PyObject *substring = Py_None;
    Py_ssize_t maxcount = -1;

    if (substring != Py_None) {
        /* Explicit separator supplied. */
        return split(self, substring, maxcount);
    }

    /* No separator: whitespace split. */
    return split_whitespace(self, maxcount);
}

'a b c'.split() collapses multiple whitespace characters. 'a::b::c'.split('::', 1) uses the explicit separator with a max count.

str.splitlines

// CPython: Objects/unicodeobject.c:10380 unicode_splitlines
/* str.splitlines: thin wrapper that forwards self and keepends unchanged
 * to PyUnicode_Splitlines() and returns its result.
 */
static PyObject *
unicode_splitlines(PyObject *self, int keepends)
{
/* Line endings: \n, \r, \r\n, \v, \f, \x1c, \x1d, \x1e, \x85,
 * \u2028 (LINE SEPARATOR), \u2029 (PARAGRAPH SEPARATOR)
 */
return PyUnicode_Splitlines(self, keepends);
}

keepends=True includes the line ending in each element. str.splitlines() splits on all Unicode line boundaries (not just \n).

str.partition

// CPython: Objects/unicodeobject.c:10480 unicode_partition
/* str.partition(sep): split self around the first occurrence of sep_obj.
 *
 * Returns a new 3-tuple (before, sep, after), or (self, '', '') when the
 * separator does not occur. Returns NULL with an exception set if the
 * search or either substring extraction fails.
 */
static PyObject *
unicode_partition(PyObject *self, PyObject *sep_obj)
{
    /* PyUnicode_Find yields -1 for "not found" and -2 for an error;
     * the two must not be conflated. */
    Py_ssize_t pos = PyUnicode_Find(self, sep_obj, 0, PY_SSIZE_T_MAX, 1);
    if (pos == -2) {
        return NULL;
    }
    if (pos < 0) {
        /* sep not found: return (self, '', '') */
        return PyTuple_Pack(3, self, unicode_empty, unicode_empty);
    }

    Py_ssize_t sep_len = PyUnicode_GET_LENGTH(sep_obj);

    PyObject *before = PyUnicode_Substring(self, 0, pos);
    if (before == NULL) {
        return NULL;
    }
    PyObject *after = PyUnicode_Substring(self, pos + sep_len, PY_SSIZE_T_MAX);
    if (after == NULL) {
        Py_DECREF(before);
        return NULL;
    }

    /* PyTuple_Pack takes its own references to each item, so the new
     * references returned by PyUnicode_Substring must be released here
     * whether or not the tuple was created. */
    PyObject *result = PyTuple_Pack(3, before, sep_obj, after);
    Py_DECREF(before);
    Py_DECREF(after);
    return result;
}

str.strip

// CPython: Objects/unicodeobject.c:10680 unicode_strip
/* str.strip entry point (abridged excerpt: args is not parsed here, so
 * chars keeps its default of Py_None).
 *
 * Calls _PyUnicode_XStrip() with mode 3, passing Py_None when no character
 * set was given and the character set otherwise, and returns its result.
 */
static PyObject *
unicode_strip(PyObject *self, PyObject *args)
{
    PyObject *chars = Py_None;

    /* Py_None selects whitespace stripping; anything else is the set of
     * characters to strip. */
    PyObject *strip_arg = (chars == Py_None) ? Py_None : chars;

    return _PyUnicode_XStrip(self, 3, strip_arg);
}

_PyUnicode_XStrip with mode 1 = lstrip, 2 = rstrip, 3 = both. For character sets, it builds a fast lookup set from the chars argument.

gopy notes

str.split is objects.UnicodeSplit in objects/unicode_split.go. str.splitlines uses the same Unicode line boundary table as CPython. str.partition is objects.UnicodePartition. str.strip uses objects.UnicodeStrip with a UnicodeSet for fast character membership.