Python internally represents each Unicode string using one of three character widths, chosen per string, so that every character of a given string occupies the same fixed number of bytes: 1, 2 or 4.
/* Simplified object header: a reference count followed by a pointer to the
   object's type. */
typedef struct {
    Py_ssize_t ob_refcnt;
    PyTypeObject *ob_type;
} PyObject;
/* --- Unicode Type ------------------------------------------------------- */
typedef struct {
/* There are 4 forms of Unicode strings:
- compact ascii:
* structure = PyASCIIObject
* test: PyUnicode_IS_COMPACT_ASCII(op)
* kind = PyUnicode_1BYTE_KIND
* compact = 1
* ascii = 1
* ready = 1
* (length is the length of the utf8 and wstr strings)
* (data starts just after the structure)
* (since ASCII is decoded from UTF-8, the utf8 string are the data)
- compact:
* structure = PyCompactUnicodeObject
* test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
* kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
PyUnicode_4BYTE_KIND
* compact = 1
* ready = 1
* ascii = 0
* utf8 is not shared with data
* utf8_length = 0 if utf8 is NULL
* wstr is shared with data and wstr_length=length
if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
* wstr_length = 0 if wstr is NULL
* (data starts just after the structure)
- legacy string, not ready:
* structure = PyUnicodeObject
* test: kind == PyUnicode_WCHAR_KIND
* length = 0 (use wstr_length)
* hash = -1
* kind = PyUnicode_WCHAR_KIND
* compact = 0
* ascii = 0
* ready = 0
* interned = SSTATE_NOT_INTERNED
* wstr is not NULL
* data.any is NULL
* utf8 is NULL
* utf8_length = 0
- legacy string, ready:
* structure = PyUnicodeObject structure
* test: !PyUnicode_IS_COMPACT(op) && kind != PyUnicode_WCHAR_KIND
* kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
PyUnicode_4BYTE_KIND
* compact = 0
* ready = 1
* data.any is not NULL
* utf8 is shared and utf8_length = length with data.any if ascii = 1
* utf8_length = 0 if utf8 is NULL
* wstr is shared with data.any and wstr_length = length
if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4
* wstr_length = 0 if wstr is NULL
Compact strings use only one memory block (structure + characters),
whereas legacy strings use one block for the structure and one block
for characters.
Legacy strings are created by PyUnicode_FromUnicode() and
PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
when PyUnicode_READY() is called.
See also _PyUnicode_CheckConsistency().
PyObject_HEAD
Py_ssize_t length; /* Number of code points in the string */
Py_hash_t hash; /* Hash value; -1 if not set */
struct {
SSTATE_NOT_INTERNED (0)
SSTATE_INTERNED_MORTAL (1)
SSTATE_INTERNED_IMMORTAL (2)
If interned != SSTATE_NOT_INTERNED, the two references from the
dictionary to this object are *not* counted in ob_refcnt.
unsigned int interned:2;
/* Character size:
- PyUnicode_WCHAR_KIND (0):
* character type = wchar_t (16 or 32 bits, depending on the
platform)
- PyUnicode_1BYTE_KIND (1):
* character type = Py_UCS1 (8 bits, unsigned)
* all characters are in the range U+0000-U+00FF (latin1)
* if ascii is set, all characters are in the range U+0000-U+007F
(ASCII), otherwise at least one character is in the range
U+0080-U+00FF
- PyUnicode_2BYTE_KIND (2):
* character type = Py_UCS2 (16 bits, unsigned)
* all characters are in the range U+0000-U+FFFF (BMP)
* at least one character is in the range U+0100-U+FFFF
- PyUnicode_4BYTE_KIND (4):
* character type = Py_UCS4 (32 bits, unsigned)
* all characters are in the range U+0000-U+10FFFF
* at least one character is in the range U+10000-U+10FFFF
unsigned int kind:3;
/* Compact is with respect to the allocation scheme. Compact unicode
objects only require one memory block while non-compact objects use
one block for the PyUnicodeObject struct and another for its data
buffer. */
unsigned int compact:1;
/* The string only contains characters in the range U+0000-U+007F (ASCII)
and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
set, use the PyASCIIObject structure. */
unsigned int ascii:1;
/* The ready flag indicates whether the object layout is initialized
completely. This means that this is either a compact object, or
the data pointer is filled out. The bit is redundant, and helps
to minimize the test in PyUnicode_IS_READY(). */
unsigned int ready:1;
/* Padding to ensure that PyUnicode_DATA() is always aligned to
4 bytes (see issue #19537 on m68k). */
unsigned int :24;
} state;
wchar_t *wstr; /* wchar_t representation (null-terminated) */
} PyASCIIObject;
/*
 * PyCompactUnicodeObject: layout used for non-ASCII strings allocated
 * through PyUnicode_New. state.compact is set, and the character data
 * immediately follows this structure.
 */
typedef struct {
    PyASCIIObject _base;

    /* Number of bytes in utf8, excluding the terminating \0. */
    Py_ssize_t utf8_length;

    /* UTF-8 representation (null-terminated). */
    char *utf8;

    /* Number of code points in wstr; possible surrogates count as two
     * code points. */
    Py_ssize_t wstr_length;
} PyCompactUnicodeObject;
/*
 * PyUnicodeObject: layout used for strings allocated through
 * PyUnicode_FromUnicode(NULL, len). The actual string data is initially
 * held in the wstr block and is copied into the data block by
 * _PyUnicode_Ready.
 */
typedef struct {
    PyCompactUnicodeObject _base;

    /* Canonical, smallest-form Unicode buffer; the members are differently
     * typed pointer views of the same buffer. */
    union {
        void *any;
        Py_UCS1 *latin1;
        Py_UCS2 *ucs2;
        Py_UCS4 *ucs4;
    } data;
} PyUnicodeObject;
As is well known, each character in a Unicode string is represented by a Unicode code point. In `PyUnicodeObject`, these code points themselves are what is stored in `data` (as fixed-width 1-, 2- or 4-byte integers), so `PyUnicodeObject` does not use the UTF-8 encoding in `data`.
PyObject *PyUnicode_FromStringAndSize(const char *str, Py_ssize_t size)
PyObject *PyUnicode_DecodeUTF8Stateful(const char *str, Py_ssize_t size, const char *errors, Py_ssize_t *consumed)
static PyObject *unicode_decode_utf8(const char *s, Py_ssize_t size, _Py_error_handler error_handler, const char *errors, Py_ssize_t *consumed)
PyObject *PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
static Py_ssize_t ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
// ucs2lib.h
#define STRINGLIB(F) ucs2lib_##F
STRINGLIB(utf8_decode)(const char **inptr, const char *end,
STRINGLIB_CHAR *dest,
Py_ssize_t *outpos)
Let's examine the internal data struct of a string object in modern Python 3.
import ctypes
# It's recommended to go to see [python 3.10 unicodeobject.h](https://github.com/python/cpython/blob/3.10/Include/cpython/unicodeobject.h#L85-L244)
class PyASCIIObject(ctypes.Structure):
    """ctypes mirror of the C ``PyASCIIObject`` struct (CPython 3.10 layout).

    The field order follows the C definition: the object header
    (``ob_refcnt``, ``ob_type``), then ``length`` and ``hash``, then the
    ``state`` bit fields, then ``wstr``.
    """

    # internal fields of the string object
    _fields_ = [
        # Py_ssize_t in C; c_long would be too narrow where long is 32 bits
        # but pointers are 64 bits.
        ("ob_refcnt", ctypes.c_ssize_t),
        ("ob_type", ctypes.c_void_p),
        ("length", ctypes.c_ssize_t),
        ("hash", ctypes.c_ssize_t),
        # `state` bit fields: 2 + 3 + 1 + 1 + 1 + 24 padding bits = 32 bits.
        ("interned", ctypes.c_uint, 2),
        ("kind", ctypes.c_uint, 3),
        ("compact", ctypes.c_uint, 1),
        ("ascii", ctypes.c_uint, 1),
        ("ready", ctypes.c_uint, 1),
        ("_padding", ctypes.c_uint, 24),
        ("wstr", ctypes.POINTER(ctypes.c_wchar)),
    ]

    def __repr__(self) -> str:
        """Return the reference count, length and state flags as one line."""
        return f"ob_refcnt[{self.ob_refcnt}], length[{self.length}], interned[{self.interned}], kind[{self.kind}], compact[{self.compact}], ascii[{self.ascii}], ready[{self.ready}]"
class PyCompactUnicodeObject(PyASCIIObject):
    """ctypes mirror of the C ``PyCompactUnicodeObject`` struct.

    Extends the ``PyASCIIObject`` layout with the cached UTF-8 buffer
    fields and ``wstr_length``.
    """

    # internal fields of the string object
    _fields_ = [
        ("utf8_length", ctypes.c_ssize_t),
        ("utf8", ctypes.POINTER(ctypes.c_char)),
        ("wstr_length", ctypes.c_ssize_t),
    ]

    def __repr__(self) -> str:
        """Return the base-class summary followed by the utf8/wstr fields."""
        return super().__repr__() + f" utf8_length[{self.utf8_length}], utf8[{self.utf8}], wstr_length[{self.wstr_length}]"
class PyUnicodeObject(PyCompactUnicodeObject):
    """ctypes mirror of the C ``PyUnicodeObject`` struct.

    Adds the ``data`` union after the ``PyCompactUnicodeObject`` fields.
    """

    class _Data(ctypes.Union):
        # Differently typed pointer views of the same character buffer.
        _fields_ = [
            ("any", ctypes.c_void_p),
            ("latin1", ctypes.POINTER(ctypes.c_uint8)),
            ("ucs2", ctypes.POINTER(ctypes.c_uint16)),
            ("ucs4", ctypes.POINTER(ctypes.c_uint32)),
        ]

    _fields_ = [
        ("data", _Data),
    ]
Type: compact ascii. Key fields: kind[1], compact[1], ascii[1], ready[1]
>>> string_obj = "Hello, ctypes!"
>>> addr = id(string_obj)
>>> ascii_obj = PyASCIIObject.from_address(addr)
>>> print(ascii_obj)
ob_refcnt[1], length[14], interned[0], kind[1], compact[1], ascii[1], ready[1]
>>> # compact ascii: data starts just after the structure
>>> data_addr = addr + ctypes.sizeof(PyASCIIObject)
>>> data = ctypes.cast(data_addr, ctypes.c_char_p)
>>> print(f"data: {data.value}")
data: b'Hello, ctypes!'
Type: compact UCS-2. Key fields: kind[2], compact[1], ascii[0], ready[1]
>>> string_obj = "你好!"
>>> addr = id(string_obj)
>>> ascii_obj = PyASCIIObject.from_address(addr)
>>> print(ascii_obj)
ob_refcnt[1], length[3], interned[0], kind[2], compact[1], ascii[0], ready[1]
>>> compact_obj = PyCompactUnicodeObject.from_address(addr)
>>> print(compact_obj)
ob_refcnt[1], length[3], interned[0], kind[2], compact[1], ascii[0], ready[1] utf8_length[0], utf8[<ctypes.LP_c_char object at 0x7f0c29297ac0>], wstr_length[0]
>>> # compact: data starts just after the structure
>>> data_addr = addr + ctypes.sizeof(PyCompactUnicodeObject)
>>> data = ctypes.cast(data_addr, ctypes.POINTER(ctypes.c_uint16))
>>> print(f"data: {data[0]}, {data[0]:#06x}, {chr(data[0])}")
data: 20320, 0x4f60, 你
>>> print(f"data: {data[1]}, {data[1]:#06x}, {chr(data[1])}")
data: 22909, 0x597d, 好
>>> print(f"data: {data[2]}, {data[2]:#06x}, {chr(data[2])}")
data: 33, 0x0021, !
>>> print(f"data: {data[3]}, {data[3]:#06x}, {chr(data[3])}")
data: 0, 0x0000,
Type: compact UCS-4. Key fields: kind[4], compact[1], ascii[0], ready[1]
>>> string_obj = "你好🤨"
>>> addr = id(string_obj)
>>> ascii_obj = PyASCIIObject.from_address(addr)
>>> print(ascii_obj)
ob_refcnt[1], length[3], interned[0], kind[4], compact[1], ascii[0], ready[1]
>>> compact_obj = PyCompactUnicodeObject.from_address(addr)
>>> print(compact_obj)
ob_refcnt[1], length[3], interned[0], kind[4], compact[1], ascii[0], ready[1] utf8_length[0], utf8[<ctypes.LP_c_char object at 0x7f0c292b1ac0>], wstr_length[3]
>>> # compact: data starts just after the structure
>>> data_addr = addr + ctypes.sizeof(PyCompactUnicodeObject)
>>> data = ctypes.cast(data_addr, ctypes.POINTER(ctypes.c_uint32))
>>> print(f"data: {data[0]}, {data[0]:#010x}, {chr(data[0])}")
data: 20320, 0x00004f60, 你
>>> print(f"data: {data[1]}, {data[1]:#010x}, {chr(data[1])}")
data: 22909, 0x0000597d, 好
>>> print(f"data: {data[2]}, {data[2]:#010x}, {chr(data[2])}")
data: 129320, 0x0001f928, 🤨
>>> print(f"data: {data[3]}, {data[3]:#010x}, {chr(data[3])}")
data: 0, 0x00000000,
Type: legacy string. Key fields: kind[2], compact[0], ascii[0]
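I couldn't produce one from Python code in Python 3.10: legacy strings are created only through C API calls such as PyUnicode_FromUnicode() and PyUnicode_FromStringAndSize(NULL, size). Trying Python 2.7 is unlikely to help, since this string layout belongs to Python 3.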
All of the code above is available at: object layout
How Python saves memory when storing strings | Artem Golubin
Python behind the scenes #9: how Python strings work
https://nedbatchelder.com/text/unipain.html
https://www.joelonsoftware.com/2003/10/08/the-absolute-minimum-every-software-developer-absolutely-positively-must-know-about-unicode-and-character-sets-no-excuses/