Skip to main content

Lib/tokenize.py (part 2)

Source:

cpython 3.14 @ ab2d84fe1023/Lib/tokenize.py

This annotation covers encoding detection. See lib_tokenize_detail for tokenize, generate_tokens, token types, and the FSM.

Map

Lines | Symbol | Role
1-60 | detect_encoding | Detect file encoding from BOM or # -*- coding: -*- comment
61-140 | open | Open a Python source file with the correct encoding
141-250 | _get_normal_name | Normalize encoding name (e.g. UTF-8 → utf-8)
251-400 | _detect_encoding | Read first two lines; look for BOM or encoding cookie
401-550 | cookie_re | Regex for PEP 263 encoding declaration
551-700 | TokenError / StopTokenizing | Exception types used by the tokenizer

Reading

detect_encoding

# CPython: Lib/tokenize.py:294 detect_encoding
def detect_encoding(readline):
    """Return (encoding, lines_consumed) for a Python source file.

    Detection order follows PEP 263: a UTF-8 BOM first, then a
    ``# -*- coding: ... -*-`` cookie in the first or second line.

    readline: callable returning the next source line as bytes;
        EOF is either b'' or a raised StopIteration.

    Returns (encoding_name, lines_read) where lines_read holds the
    0, 1 or 2 lines actually consumed, so the caller can replay them.

    Raises SyntaxError if a BOM is present but the coding cookie names
    an encoding other than UTF-8 -- the two declarations must agree.
    """
    bom_found = False
    default = 'utf-8'

    def read_or_stop():
        # Treat an exhausted readline iterator the same as b'' at EOF,
        # so generators work as line sources too.
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        # latin-1 maps every byte value, so this decode cannot fail.
        line_string = line.decode('latin-1')
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        if bom_found:
            if encoding != 'utf-8':
                # BOM and cookie must agree (both UTF-8) -- PEP 263.
                raise SyntaxError(
                    'encoding problem: utf-8 BOM with %r cookie' % encoding)
            # The BOM upgrades a plain utf-8 cookie to utf-8-sig.
            encoding = 'utf-8-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]          # strip the 3-byte BOM
        default = 'utf-8-sig'
    if not first:
        # Empty file (or BOM-only file): nothing consumed to replay.
        return default, []
    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return default, [first]
    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
# CPython: Lib/tokenize.py:62 cookie_re
#
# PEP 263 encoding declaration.  Matched (with .match, i.e. anchored at
# the start) against each of a file's first two lines; the line must be
# a comment, and group(1) captures the declared encoding name.
# Accepted forms include:
#   # -*- coding: utf-8 -*-
#   # vim: set fileencoding=utf-8 :
#   # coding: latin-1
cookie_re = re.compile(
    r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)',
    re.ASCII | re.MULTILINE,
)

BOM detection

# CPython: Lib/tokenize.py:310 BOM handling
#
# UTF-8 byte-order mark.  Its presence at the very start of a file
# declares UTF-8 by itself; no coding cookie is required.  If a cookie
# is also present, the two must agree (i.e. the cookie must say UTF-8).
BOM_UTF8 = bytes((0xEF, 0xBB, 0xBF))

tokenize.open

# CPython: Lib/tokenize.py:400 open
def open(filename):
    """Open *filename* for reading text, using the encoding that
    detect_encoding() finds in its first two lines.
    """
    raw = builtins.open(filename, 'rb')
    try:
        encoding, _ = detect_encoding(raw.readline)
        # detect_encoding consumed up to two lines; rewind so the text
        # layer decodes the file from the very start.
        raw.seek(0)
        wrapper = io.TextIOWrapper(raw, encoding, line_buffering=True)
        wrapper.mode = 'r'
        return wrapper
    except:
        # Bare except on purpose: the raw handle must be closed on ANY
        # failure (including KeyboardInterrupt) before re-raising.
        raw.close()
        raise

gopy notes

tokenize.open uses builtins.open (gopy's vm.BuiltinOpen), io.TextIOWrapper (gopy objects/textiowrapper.go), and detect_encoding which is pure Python string manipulation. The BOM bytes constant is a Python bytes literal.