
Commit 15ecef1

Merge branch 'main' into add-pypickle-getxidata
2 parents 647c538 + bdd23c0 commit 15ecef1

23 files changed: +771 -162 lines

Include/internal/pycore_crossinterp.h

Lines changed: 7 additions & 0 deletions
@@ -178,6 +178,13 @@ PyAPI_FUNC(int) _PyPickle_GetXIData(
     PyObject *,
     _PyXIData_t *);
 
+// _PyObject_GetXIData() for marshal
+PyAPI_FUNC(PyObject *) _PyMarshal_ReadObjectFromXIData(_PyXIData_t *);
+PyAPI_FUNC(int) _PyMarshal_GetXIData(
+    PyThreadState *,
+    PyObject *,
+    _PyXIData_t *);
+
 
 /* using cross-interpreter data */

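The two prototypes added here mirror the `_PyPickle_GetXIData()` pair above them: `_PyMarshal_GetXIData()` fills in a `_PyXIData_t` from an object, and `_PyMarshal_ReadObjectFromXIData()` rebuilds an object from that data. A minimal sketch of the round trip, based only on these prototypes (the helper name, the stack-allocated `_PyXIData_t`, and the omitted release step are illustrative assumptions, not the documented usage):

```c
/* Sketch only: assumes Py_BUILD_CORE and the internal header patched above. */
#include "Python.h"
#include "pycore_crossinterp.h"

static PyObject *
marshal_roundtrip(PyThreadState *tstate, PyObject *obj)  /* hypothetical helper */
{
    _PyXIData_t data = {0};              /* assumed caller-provided storage */
    if (_PyMarshal_GetXIData(tstate, obj, &data) < 0) {
        return NULL;                     /* exception already set */
    }
    /* In real use the data would be handed to another interpreter, and the
       cross-interpreter data would need to be released afterwards. */
    return _PyMarshal_ReadObjectFromXIData(&data);
}
```
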
Include/internal/pycore_interp_structs.h

Lines changed: 3 additions & 0 deletions
@@ -667,8 +667,11 @@ struct _Py_interp_cached_objects {
 
     /* object.__reduce__ */
     PyObject *objreduce;
+#ifndef Py_GIL_DISABLED
+    /* resolve_slotdups() */
     PyObject *type_slots_pname;
     pytype_slotdef *type_slots_ptrs[MAX_EQUIV];
+#endif
 
     /* TypeVar and related types */
     PyTypeObject *generic_type;

Include/internal/pycore_object.h

Lines changed: 1 addition & 1 deletion
@@ -313,7 +313,7 @@ extern int _PyDict_CheckConsistency(PyObject *mp, int check_content);
 // Fast inlined version of PyType_HasFeature()
 static inline int
 _PyType_HasFeature(PyTypeObject *type, unsigned long feature) {
-    return ((FT_ATOMIC_LOAD_ULONG_RELAXED(type->tp_flags) & feature) != 0);
+    return ((type->tp_flags) & feature) != 0;
 }
 
 extern void _PyType_InitCache(PyInterpreterState *interp);

Include/internal/pycore_typeobject.h

Lines changed: 0 additions & 1 deletion
@@ -134,7 +134,6 @@ extern int _PyType_AddMethod(PyTypeObject *, PyMethodDef *);
 extern void _PyType_SetFlagsRecursive(PyTypeObject *self, unsigned long mask,
                                       unsigned long flags);
 
-extern unsigned int _PyType_GetVersionForCurrentState(PyTypeObject *tp);
 PyAPI_FUNC(void) _PyType_SetVersion(PyTypeObject *tp, unsigned int version);
 PyTypeObject *_PyType_LookupByVersion(unsigned int version);

Include/object.h

Lines changed: 7 additions & 5 deletions
@@ -620,6 +620,12 @@ given type object has a specified feature.
 #define Py_TPFLAGS_HAVE_FINALIZE (1UL << 0)
 #define Py_TPFLAGS_HAVE_VERSION_TAG (1UL << 18)
 
+// Flag values for ob_flags (16 bits available, if SIZEOF_VOID_P > 4).
+#define _Py_IMMORTAL_FLAGS (1 << 0)
+#define _Py_STATICALLY_ALLOCATED_FLAG (1 << 2)
+#if defined(Py_GIL_DISABLED) && defined(Py_DEBUG)
+#define _Py_TYPE_REVEALED_FLAG (1 << 3)
+#endif
 
 #define Py_CONSTANT_NONE 0
 #define Py_CONSTANT_FALSE 1
@@ -776,11 +782,7 @@ PyType_HasFeature(PyTypeObject *type, unsigned long feature)
     // PyTypeObject is opaque in the limited C API
     flags = PyType_GetFlags(type);
 #else
-# ifdef Py_GIL_DISABLED
-    flags = _Py_atomic_load_ulong_relaxed(&type->tp_flags);
-# else
-    flags = type->tp_flags;
-# endif
+    flags = type->tp_flags;
 #endif
     return ((flags & feature) != 0);
 }
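
Both hunks above drop the atomic reads of `tp_flags` (`FT_ATOMIC_LOAD_ULONG_RELAXED` internally, `_Py_atomic_load_ulong_relaxed` in the public header), so the flag checks are now plain loads. A small usage sketch of the public inline; the wrapper function below is hypothetical and only illustrates how a `tp_flags` bit such as `Py_TPFLAGS_HAVE_VERSION_TAG` is queried:

```c
#include <Python.h>

/* Hypothetical helper: consults a tp_flags bit through the
   PyType_HasFeature() inline shown in the hunk above. */
static int
has_version_tag(PyObject *obj)
{
    return PyType_HasFeature(Py_TYPE(obj), Py_TPFLAGS_HAVE_VERSION_TAG);
}
```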

Include/refcount.h

Lines changed: 0 additions & 3 deletions
@@ -19,9 +19,6 @@ immortal. The latter should be the only instances that require
 cleanup during runtime finalization.
 */
 
-#define _Py_STATICALLY_ALLOCATED_FLAG 4
-#define _Py_IMMORTAL_FLAGS 1
-
 #if SIZEOF_VOID_P > 4
 /*
 In 64+ bit systems, any object whose 32 bit reference count is >= 2**31

InternalDocs/garbage_collector.md

Lines changed: 181 additions & 9 deletions
@@ -477,17 +477,24 @@ specifically in a generation by calling `gc.collect(generation=NUM)`.
 ```
 
 
-Optimization: visiting reachable objects
-========================================
+Optimization: excluding reachable objects
+=========================================
 
-An object cannot be garbage if it can be reached.
+An object cannot be garbage if it can be reached. To avoid having to identify
+reference cycles across the whole heap, we can reduce the amount of work done
+considerably by first identifying objects reachable from objects known to be
+alive. These objects are excluded from the normal cyclic detection process.
 
-To avoid having to identify reference cycles across the whole heap, we can
-reduce the amount of work done considerably by first moving most reachable objects
-to the `visited` space. Empirically, most reachable objects can be reached from a
-small set of global objects and local variables.
-This step does much less work per object, so reduces the time spent
-performing garbage collection by at least half.
+The default and free-threaded builds both implement this optimization, but in
+slightly different ways.
+
+Finding reachable objects for the default build GC
+--------------------------------------------------
+
+This works by first moving most reachable objects to the `visited` space.
+Empirically, most reachable objects can be reached from a small set of global
+objects and local variables. This step does much less work per object, so
+reduces the time spent performing garbage collection by at least half.
 
 > [!NOTE]
 > Objects that are not determined to be reachable by this pass are not necessarily
@@ -515,6 +522,171 @@ increment. All objects directly referred to from those stack frames are
 added to the working set.
 Then the above algorithm is repeated, starting from step 2.
 
+
+Finding reachable objects for the free-threaded GC
+--------------------------------------------------
+
+Within the `gc_free_threading.c` implementation, this is known as the "mark
+alive" pass or phase. It is similar in concept to what is done for the default
+build GC. Rather than moving objects between double-linked lists, the
+free-threaded GC uses a flag in `ob_gc_bits` to track if an object is
+found to be definitely alive (not garbage).
+
+To find objects reachable from known alive objects, known as the "roots", the
+`gc_mark_alive_from_roots()` function is used. Root objects include
+`interp->sysdict` (the `sys` module dictionary), `interp->builtins`, and
+`interp->types`. Also included are all objects referred to by active Python
+frames. These objects and the transitive closure of objects reachable from
+them have `_PyGC_BITS_ALIVE` set. Any object with that bit set is excluded
+from the rest of the cyclic garbage detection process, since we know it cannot
+be unreachable.
+
+> [!NOTE]
+> If the `gc.freeze()` function has been used, this phase of the collector is
+> skipped. That is done for two reasons. First, it is unlikely to be a
+> performance win if most of the objects have been marked as frozen. As such,
+> they would be excluded from the cyclic garbage detection, without this extra
+> work anyhow. Second, one of the purposes of using `gc.freeze()` is to avoid
+> dirtying the memory pages holding frozen objects. If this phase were executed,
+> the toggling of the `ob_gc_bits` flags would dirty pages and defeat that.
+
+Software prefetch hinting
+-------------------------
+
+To speed up the "mark alive" phase of the free-threaded GC, an additional
+optimization, known as software prefetching, is used. The GC will execute
+explicit prefetch CPU instructions in order to reduce the latency due to
+loading data from main memory. This added complexity can pay off since main
+memory is so much slower compared to accessing data in the CPU cache. This
+is enabled only if the number of long-lived objects exceeds a threshold. If
+the set of objects being garbage collected is small, the accessed memory is
+likely to fit entirely in the CPU cache and software prefetch is not helpful.
+
+The details of this optimization are intricate, with the source code being the
+best reference. However, the rest of this section gives a high level
+description of how it works and explains some design decisions.
+
+Software prefetching is only used during the "mark alive" phase of the GC.
+Specifically, when we are performing the transitive closure of the "alive"
+status of objects (i.e. objects reachable from known alive objects, known as
+roots). For each object we find, we need to traverse all objects directly
+reachable from that object. If that set of referred objects is in scattered
+locations of memory, the hardware prefetch is unlikely to predict the next
+accessed memory location.
+
+Making software prefetch work well hinges on a key principle: allow enough time
+between issuing the prefetch instruction for a memory location and actually
+accessing that location's data. We can call that time difference the prefetch
+window. If the window is too large, we fill up the CPU caches with data that's
+not needed yet. Worse, the data might be discarded from the cache before we
+actually use it. If the window is too small then the memory system does not
+have enough time to finish loading the memory and the CPU will have to wait.
+The window is indirectly tuned by the prefetch buffer parameters. The buffer
+will be explained next.
+
+The prefetch buffer is a FIFO queue of fixed size. When we enqueue an object
+reference into the buffer, we also issue a software prefetch instruction for
+that memory location. When we dequeue an object reference from the buffer, we
+assume or hope that enough time has elapsed so that the memory has been loaded
+into the cache. This is the mechanism that provides the window.
+
+When performing the transitive closure of "alive" status, the set of objects
+yet to visit is stored in one of two places. First, they can be stored in the
+prefetch buffer. Second, there is a LIFO stack, of unlimited size. When object
+references are found using `tp_traverse`, they are enqueued in the buffer if
+it is not full, otherwise they are pushed to the stack.
+
+We must take special care not to access the memory referred to by an object
+pointer until we take that reference out of the prefetch buffer. That means we
+cannot check if an object is a "container" (if the `PyObject_IS_GC()` test is
+true) or if the object already has the "alive" flag set. Both of those tests
+would require that the object memory is accessed. There are some additional
+elaborations that try to keep the buffer filled to the optimal size but to keep
+things simple we will omit those details here. Not discussed in detail are
+"spans", which help reduce the amount of stack used and make it easier to
+control the size of the buffer.
+
+As mentioned, the prefetch window is the time delay between the issue of the
+prefetch instruction, on buffer enqueue, and the memory access, after buffer
+dequeue. It is tuned by adjusting some buffer parameters. If processing time
+were equal for every object then the buffer length would be proportional to the
+window. Since processing each object can actually take a variable amount of
+time, the relation between the buffer length and the prefetch window is only
+approximate. However, this proportional relationship is assumed to hold true
+and we avoid the overhead of actually measuring object processing times.
+
+The relevant parameters are the maximum buffer size and the low and high
+thresholds for filling. The buffer parameters are set as follows: maximum
+length is 256, low threshold is 8, high threshold is 16. These parameters are
+used as follows. If the buffer has reached the maximum size, new object
+pointers found while following references are pushed to the stack, rather than
+put in the buffer. When dequeuing objects from the buffer, we will "prime" the
+buffer if the current length drops below the low threshold. Priming means
+popping objects from the stack and enqueuing them into the buffer. While
+priming, we will fill it only until the high threshold is reached.
+
+To measure the effectiveness of the buffer, some benchmark programs were run
+with a trace log of memory location prefetch and access instructions. The
+prefetch window for each object processed was computed from the trace log.
+Each enqueue and dequeue operation was treated as taking one unit of time.
+Time to actually process the object was assumed to be zero. A histogram of the
+windows is shown below. These traces suggest the buffer length is mostly being
+kept between the low and high thresholds, which is what we want. Variations of
+the buffer parameters were benchmarked and the best performing parameters were
+chosen. Obviously it is unlikely these parameters will be optimal for all
+hardware and programs.
+
+```
+Prefetch window stats
+mean    52.1
+median  14.0
+max     256
+25.60    |65,304  | ******************************
+51.20    |5,590   | **
+76.80    |3,562   | *
+102.40   |2,683   | *
+128.00   |2,278   | *
+153.60   |2,285   | *
+179.20   |2,377   | *
+204.80   |2,238   | *
+230.40   |2,753   | *
+256.00   |5,930   | **
+-------- |------- | -------
+N=       |95,000
+```
+
+Using software prefetch instructions is only a win if the set of objects being
+examined by the GC does not fit into CPU caches. Otherwise, using the buffer
+and prefetch instructions is just overhead. Using the long-lived object count
+seems a good estimate of whether things will fit in the cache. On 64-bit
+platforms, the minimum object size is 32 bytes. A 4MB L2 cache would hold
+about 130,000 objects.
+
+The current threshold for enabling prefetch is that the number of long-lived
+objects must exceed 200,000. Based on benchmarking, this seems in the range
+where prefetch becomes a net gain. Obviously it depends on hardware details
+and also the "shape" of the object graph. For example, your object graph may
+be constructed by linearly allocating objects in memory. Then, traversing the
+object graph might naturally result in mostly ordered memory access. In that
+case, the hardware prefetch is likely to do a nearly perfect job without any
+software prefetch hints.
+
+This optimization, as of March 2025, was tuned on the following hardware
+platforms:
+
+- Apple M3 Pro, 32 GB RAM, 192+128 KB L1, 16 MB L2, compiled with Clang 19
+- AMD Ryzen 5 7600X, 64 GB RAM, 384 KB L1, 6 MB L2, 32 MB L3, compiled with GCC 12.2.0
+
+Benchmarking the effectiveness of this optimization is particularly difficult.
+It depends both on hardware details, like CPU cache sizes and memory latencies,
+and on the specifics of the program's memory access patterns: where objects are
+located in memory and in what order they are accessed during the mark alive
+phase. When the program's memory access patterns are favourable (a working set
+of data larger than the CPU cache, objects allocated in such a way that access
+order is not linear), the speedup from using software prefetching is in the
+range of 20% to 40% for the entire full GC collection.
+
+
 Optimization: reusing fields to save memory
 ===========================================
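
The "mark alive" pass documented above can be summarized with a deliberately simplified sketch. This is not the code in `gc_free_threading.c`: the helpers `gc_is_alive()` and `gc_set_alive()` are hypothetical stand-ins for the `_PyGC_BITS_ALIVE` bookkeeping, and the real implementation drives the traversal with an explicit stack and the prefetch buffer rather than recursion:

```c
/* Simplified sketch of marking objects reachable from the GC roots. */
static int gc_is_alive(PyObject *op);    /* hypothetical: tests _PyGC_BITS_ALIVE */
static void gc_set_alive(PyObject *op);  /* hypothetical: sets _PyGC_BITS_ALIVE  */

static int
mark_alive_visit(PyObject *op, void *arg)
{
    if (op == NULL || !PyObject_IS_GC(op) || gc_is_alive(op)) {
        return 0;                        /* not GC-tracked, or already marked */
    }
    gc_set_alive(op);                    /* op is reachable from a root */
    /* Visit everything op refers to; the real GC pushes these references onto
       a stack (or the prefetch buffer) instead of recursing. */
    Py_TYPE(op)->tp_traverse(op, mark_alive_visit, arg);
    return 0;
}
```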

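The prefetch-buffer mechanics (prefetch on enqueue, priming from the stack when the FIFO runs low) can be sketched in the same spirit. The structure and helper names below are invented for illustration, the parameters are the 256/8/16 values quoted in the documentation, and `__builtin_prefetch()` is a GCC/Clang intrinsic standing in for whatever prefetch hint the real collector emits:

```c
#include <Python.h>

#define BUFFER_SIZE 256   /* maximum FIFO length          */
#define BUFFER_LO     8   /* prime the buffer below this  */
#define BUFFER_HI    16   /* ...and refill it up to this  */

typedef struct object_stack object_stack;   /* assumed LIFO of PyObject* */
static int object_stack_is_empty(const object_stack *);
static PyObject *object_stack_pop(object_stack *);

typedef struct {
    PyObject *objs[BUFFER_SIZE];
    Py_ssize_t head, len;
} prefetch_buffer;

static void
buffer_enqueue(prefetch_buffer *buf, PyObject *op)
{
    /* Caller ensures the FIFO is not full (otherwise the reference goes to
       the stack).  Issue the prefetch as the pointer enters the FIFO so its
       memory is hopefully in cache by the time it is dequeued. */
    __builtin_prefetch(op);
    buf->objs[(buf->head + buf->len) % BUFFER_SIZE] = op;
    buf->len++;
}

static PyObject *
buffer_dequeue(prefetch_buffer *buf, object_stack *stack)
{
    if (buf->len < BUFFER_LO) {
        /* "Prime" the FIFO from the LIFO stack, up to the high threshold. */
        while (buf->len < BUFFER_HI && !object_stack_is_empty(stack)) {
            buffer_enqueue(buf, object_stack_pop(stack));
        }
    }
    if (buf->len == 0) {
        return NULL;
    }
    PyObject *op = buf->objs[buf->head];
    buf->head = (buf->head + 1) % BUFFER_SIZE;
    buf->len--;
    return op;   /* caller may now dereference op; it was prefetched on enqueue */
}
```
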
Lib/pydoc.py

Lines changed: 33 additions & 17 deletions
@@ -68,6 +68,7 @@ class or function within a module or module in a package. If the
 import re
 import sys
 import sysconfig
+import textwrap
 import time
 import tokenize
 import urllib.parse
@@ -1809,6 +1810,37 @@ def writedocs(dir, pkgpath='', done=None):
         writedoc(modname)
     return
 
+
+def _introdoc():
+    import textwrap
+    ver = '%d.%d' % sys.version_info[:2]
+    if os.environ.get('PYTHON_BASIC_REPL'):
+        pyrepl_keys = ''
+    else:
+        # Additional help for keyboard shortcuts if enhanced REPL is used.
+        pyrepl_keys = '''
+    You can use the following keyboard shortcuts at the main interpreter prompt.
+    F1: enter interactive help, F2: enter history browsing mode, F3: enter paste
+    mode (press again to exit).
+    '''
+    return textwrap.dedent(f'''\
+    Welcome to Python {ver}'s help utility! If this is your first time using
+    Python, you should definitely check out the tutorial at
+    https://docs.python.org/{ver}/tutorial/.
+
+    Enter the name of any module, keyword, or topic to get help on writing
+    Python programs and using Python modules. To get a list of available
+    modules, keywords, symbols, or topics, enter "modules", "keywords",
+    "symbols", or "topics".
+    {pyrepl_keys}
+    Each module also comes with a one-line summary of what it does; to list
+    the modules whose name or summary contain a given string such as "spam",
+    enter "modules spam".
+
+    To quit this help utility and return to the interpreter,
+    enter "q", "quit" or "exit".
+    ''')
+
 class Helper:
 
     # These dictionaries map a topic name to either an alias, or a tuple
@@ -2075,23 +2107,7 @@ def help(self, request, is_cli=False):
         self.output.write('\n')
 
     def intro(self):
-        self.output.write('''\
-Welcome to Python {0}'s help utility! If this is your first time using
-Python, you should definitely check out the tutorial at
-https://docs.python.org/{0}/tutorial/.
-
-Enter the name of any module, keyword, or topic to get help on writing
-Python programs and using Python modules. To get a list of available
-modules, keywords, symbols, or topics, enter "modules", "keywords",
-"symbols", or "topics".
-
-Each module also comes with a one-line summary of what it does; to list
-the modules whose name or summary contain a given string such as "spam",
-enter "modules spam".
-
-To quit this help utility and return to the interpreter,
-enter "q", "quit" or "exit".
-'''.format('%d.%d' % sys.version_info[:2]))
+        self.output.write(_introdoc())
 
     def list(self, items, columns=4, width=80):
         items = list(sorted(items))

Lib/test/_crossinterp_definitions.py

Lines changed: 3 additions & 1 deletion
@@ -163,8 +163,10 @@ async def asyncgen_spam(*args):
     gen_spam_1,
     gen_spam_2,
     async_spam,
-    coro_spam,  # actually FunctionType?
     asyncgen_spam,
+]
+FUNCTION_LIKE_APPLIED = [
+    coro_spam,  # actually FunctionType?
     asynccoro_spam,  # actually FunctionType?
 ]
