Skip to content

gh-94841: Ensure arena_map_get() is inlined in PyObject_Free() #94842

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jul 14, 2022
Merged

gh-94841: Ensure arena_map_get() is inlined in PyObject_Free() #94842

merged 2 commits into from
Jul 14, 2022

Conversation

neonene
Copy link
Contributor

@neonene neonene commented Jul 14, 2022

NOTE: This patch cannot be beckported to 3.10, in which Py_ALWAYS_INLINE macro is not introduced.

Copy link
Member

@gvanrossum gvanrossum left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM. Did you observe an improvement in performance?

@neonene
Copy link
Contributor Author

neonene commented Jul 14, 2022

3.11b3+ PGO (2022-07-09 7b5737a)

pyperf timeit -s
"from collections import deque; r = range(N, N + 10_000_000)" "deque(r, 0)"
N PR with v143 as-is with v143 as-is with v142
2**29 98.8 ms +- 2 ms 118 ms +- 3 ms (119%) 102 ms +- 3 ms (103%)
2**30 135 ms +- 1 ms 148 ms +- 3 ms (110%) 142 ms +- 2 ms (105%)
2**31 655 ms +- 6 ms 695 ms +- 4 ms (106%) 654 ms +- 9 ms (100%)
Faster PyObject_Free() with v143
sub rsp,28                              | obmalloc.c:739
mov rax,qword ptr ds:[<&_PyObject_Free> | obmalloc.c:741
lea rdx,qword ptr ds:[<_PyObject_Free>] |
cmp rax,rdx                             |
jne python311.7FFCD2385CB8              |
test rcx,rcx                            |
je python311.7FFCD2385C67               |
mov rax,rcx                             |
lea r11,qword ptr ds:[7FFCD2330000]     |
shr rax,31                              |
mov r8,rcx                              |
and r8,FFFFFFFFFFFFC000                 |
mov rdx,qword ptr ds:[r11+rax*8+51DB40] |
test rdx,rdx                            |
je python311.7FFCD2385C93               |
mov rax,rcx                             |
shr rax,22                              |
and eax,7FFF                            |
mov r9,qword ptr ds:[rdx+rax*8]         |
test r9,r9                              |
je python311.7FFCD2385C93               |
mov rax,rcx                             |
mov edx,ecx                             |
shr rax,14                              |
and edx,FFFFF                           |
and eax,3FFF                            |
cmp edx,dword ptr ds:[r9+rax*8+4]       |
jl python311.7FFCD2385C4B               |
mov r10d,dword ptr ds:[r9+rax*8]        |
cmp edx,r10d                            |
jl python311.7FFCD2385C93               |
test r10d,r10d                          |
je python311.7FFCD2385C93               |
mov rdx,qword ptr ds:[r8+8]             |
mov qword ptr ds:[rcx],rdx              |
mov eax,dword ptr ds:[r8]               |
dec eax                                 |
mov qword ptr ds:[r8+8],rcx             |
mov dword ptr ds:[r8],eax               |
test rdx,rdx                            |
je python311.7FFCD2385C6C               |
test eax,eax                            |
je python311.7FFCD2385CC6               |
add rsp,28                              | obmalloc.c:742
ret                                     |
Slower PyObject_Free() with v143
push rbx                                | obmalloc.c:739
sub rsp,20                              |
mov rax,qword ptr ds:[<&_PyObject_Free> | obmalloc.c:741
mov rbx,rcx                             |
lea rcx,qword ptr ds:[<_PyObject_Free>] |
cmp rax,rcx                             |
jne python311.7FFCD238003E              |
test rbx,rbx                            |
je python311.7FFCD237FFDB               |
mov qword ptr ss:[rsp+30],rdi           |
xor edx,edx                             |
mov rdi,rbx                             |
mov rcx,rbx                             |
and rdi,FFFFFFFFFFFFC000                |
call <python311.arena_map_get>          |
test rax,rax                            |
je python311.7FFCD2380010               |
mov rcx,rbx                             |
mov edx,ebx                             |
shr rcx,14                              |
and edx,FFFFF                           |
and ecx,3FFF                            |
cmp edx,dword ptr ds:[rax+rcx*8+4]      |
jl python311.7FFCD237FFBC               |
mov r8d,dword ptr ds:[rax+rcx*8]        |
cmp edx,r8d                             |
jl python311.7FFCD2380010               |
test r8d,r8d                            |
je python311.7FFCD2380010               |
mov rcx,qword ptr ds:[rdi+8]            |
mov qword ptr ds:[rbx],rcx              |
mov eax,dword ptr ds:[rdi]              |
dec eax                                 |
mov qword ptr ds:[rdi+8],rbx            |
mov dword ptr ds:[rdi],eax              |
test rcx,rcx                            |
je python311.7FFCD237FFE1               |
test eax,eax                            |
je python311.7FFCD2380050               |
mov rdi,qword ptr ss:[rsp+30]           |
add rsp,20                              | obmalloc.c:742
pop rbx                                 |
ret                                     |
  • arena_map_get()
mov qword ptr ss:[rsp+8],rbx            | obmalloc.c:1450
mov qword ptr ss:[rsp+10],rsi           |
push rdi                                |
sub rsp,20                              |
mov rax,rcx                             | obmalloc.c:1454
mov rbx,rcx                             |
shr rax,31                              |
lea rcx,qword ptr ds:[<arena_map_root>] | obmalloc.c:1455
mov esi,edx                             |
lea rdi,qword ptr ds:[rcx+rax*8]        |
mov rax,qword ptr ds:[rcx+rax*8]        |
test rax,rax                            |
je python311.7FFCD23AD264               | (PyMem_RawCalloc)
shr rbx,22                              | obmalloc.c:1467
and ebx,7FFF                            |
cmp qword ptr ds:[rax+rbx*8],0          |
je python311.7FFCD23AD287               | (PyMem_RawCalloc)
mov rax,qword ptr ds:[rax+rbx*8]        | obmalloc.c:1478
mov rbx,qword ptr ss:[rsp+30]           | obmalloc.c:1482
mov rsi,qword ptr ss:[rsp+38]           |
add rsp,20                              |
pop rdi                                 |
ret                                     |

@nascheme
Copy link
Member

To backport to 3.10, we can just use __forceinline or include the Py_ALWAYS_INLINE definition in obmalloc.c:

#if defined(Py_DEBUG)
   // If Python is built in debug mode, usually compiler optimizations are
   // disabled. In this case, Py_ALWAYS_INLINE can increase a lot the stack
   // memory usage. For example, forcing inlining using gcc -O0 increases the
   // stack usage from 6 KB to 15 KB per Python function call.
#  define Py_ALWAYS_INLINE
#elif defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
#  define Py_ALWAYS_INLINE __attribute__((always_inline))
#elif defined(_MSC_VER)
#  define Py_ALWAYS_INLINE __forceinline
#else
#  define Py_ALWAYS_INLINE
#endif

Another, more dramatic, option would be to disable the obmalloc radix tree on WIndows. E.g. -DWITH_PYMALLOC_RADIX_TREE=0. When disabled, you get smaller obmalloc arenas and pools (which is a bit slower) but you get a faster address_in_range() test. On AMD64 Linux, it seems about a wash for performance. I should have done some benchmarking on AMD64 Windows.

Copy link
Member

@gvanrossum gvanrossum left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will merge

@gvanrossum gvanrossum merged commit 9b3f779 into python:main Jul 14, 2022
@miss-islington
Copy link
Contributor

Thanks @neonene for the PR, and @gvanrossum for merging it 🌮🎉.. I'm working now to backport this PR to: 3.11.
🐍🍒⛏🤖

miss-islington pushed a commit to miss-islington/cpython that referenced this pull request Jul 14, 2022
@bedevere-bot
Copy link

GH-94862 is a backport of this pull request to the 3.11 branch.

@bedevere-bot bedevere-bot removed the needs backport to 3.11 only security fixes label Jul 14, 2022
@gvanrossum
Copy link
Member

gvanrossum commented Jul 14, 2022

I don’t care much about the 3.10 backport, but feel free to do more research! It looks like the compiler matters more than the processor here.

miss-islington added a commit that referenced this pull request Jul 14, 2022
@neonene neonene deleted the arena branch July 14, 2022 19:35
@gvanrossum
Copy link
Member

@nascheme Do you want to look more into your fixes, either the 3.10-specific way to force inlining or disabling the radix tree on Windows?

nascheme pushed a commit to nascheme/cpython that referenced this pull request Jul 14, 2022
…Free() (pythonGH-94842)

Co-authored-by: Neil Schemenauer <[email protected]>
(cherry picked from commit d0850aa170580b494beda030a94fb04254e4acbc)

Co-authored-by: neonene <[email protected]>
@nascheme
Copy link
Member

I made an attempt at backporting (GH-94868), not sure I did it right though. Using __forceinline seems the simplest thing to do.

nascheme added a commit that referenced this pull request Jul 15, 2022
…GH-94842)

Need to define ALWAYS_INLINE macro for 3.10.

Co-authored-by: neonene <[email protected]>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

5 participants