Conversation

abadams (Member) commented Jan 27, 2026

Using the strategy in CodeGen_LLVM to do big vector interleaves (repeated 2-way interleaves), LLVM generates pretty poor code on x86. This is because x86 has no two-way vector interleave instruction until avx-512, and that instruction requires a runtime shuffle table, which uses up a register. The instructions x86 does have that take immediates are weird and hard to think about. Sticking to instructions that take immediates matters because interleaves often happen in high-register-pressure contexts (e.g. block transposes). This PR redoes vector interleaving for power-of-two blocks on x86 to use only unpckl and shufi/vperm2/vinsert instructions. The algorithm is somewhat complex and requires reasoning about permutations of the bits of the indices of each element; hopefully it is understandable given the jumbo comment. I first got it working in Python, and Claude correctly translated that to C++ for me, after which I made extensive rewrites.
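
To make the index-bit framing concrete, here is a minimal scalar model (mine, not code from this PR or the codegen itself): each 2-way interleave of the two halves of a flat buffer rotates the bits of every element's destination index by one position, so log2(N) such rounds compose into the full N-way interleave.

```cpp
// Scalar sketch: repeated 2-way interleaves == an N-way interleave,
// because each round is a one-bit rotation of the element indices.
#include <cassert>
#include <cstdio>
#include <vector>

// One 2-way interleave of the two halves of a flat buffer:
// {a0..a(m-1), b0..b(m-1)} -> {a0, b0, a1, b1, ...}.
// Destination index 2*i + j comes from source index j*m + i, i.e. the
// source index is the destination index rotated right by one bit.
std::vector<int> interleave2(const std::vector<int> &v) {
    size_t m = v.size() / 2;
    std::vector<int> out(v.size());
    for (size_t i = 0; i < m; i++) {
        out[2 * i] = v[i];
        out[2 * i + 1] = v[m + i];
    }
    return out;
}

int main() {
    const int n = 4, m = 8;  // interleave n vectors of m elements each
    // Flat input: element j of vector k lives at index k * m + j.
    std::vector<int> v(n * m);
    for (int i = 0; i < n * m; i++) v[i] = i;

    // log2(n) rounds of 2-way interleaving.
    for (int done = 1; done < n; done *= 2) {
        v = interleave2(v);
    }

    // Check against the direct n-way interleave: output position j*n + k
    // should hold element j of vector k. In binary, the source index (k:j)
    // and the destination index (j:k) are rotations of one another.
    for (int k = 0; k < n; k++) {
        for (int j = 0; j < m; j++) {
            assert(v[j * n + k] == k * m + j);
        }
    }
    printf("ok\n");
    return 0;
}
```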

On my machine, this makes block transposes significantly faster and smaller in code size, and avoids some of the pathological cases on main. E.g. a 16x16 transpose of uint16s on avx2 on main is 621 instructions total, taking 419 cycles. I'd paste it, but it's just a huge mess of various instructions. In this PR it's 134 instructions and 64 cycles:

	vmovdqu	-7680(%r9), %ymm0
	vmovdqu	-7168(%r9), %ymm1
	vmovdqu	-6656(%r9), %ymm12
	vmovdqu	-6144(%r9), %ymm14
	vmovdqu	-5632(%r9), %ymm6
	vmovdqu	-5120(%r9), %ymm8
	vmovdqu	-4608(%r9), %ymm15
	vmovdqu	-4096(%r9), %ymm2
	vmovdqu	-3584(%r9), %ymm5
	vmovdqu	-3072(%r9), %ymm9
	vmovups	-2560(%r9), %ymm3
	vmovups	%ymm3, (%rsp)                   
	vmovups	-2048(%r9), %ymm3
	vmovups	%ymm3, 32(%rsp)                 
	vmovdqu	-1536(%r9), %ymm4
	vmovdqu	-1024(%r9), %ymm11
	vmovdqu	-512(%r9), %ymm7
	vpunpcklwd	%ymm6, %ymm0, %ymm3     
	vpunpckhwd	%ymm6, %ymm0, %ymm10    
	vpunpcklwd	%ymm8, %ymm1, %ymm6     
	vpunpckhwd	%ymm8, %ymm1, %ymm13    
	vpunpcklwd	%ymm15, %ymm12, %ymm8   
	vpunpckhwd	%ymm15, %ymm12, %ymm0   
	vpunpcklwd	%ymm2, %ymm14, %ymm12   
	vpunpckhwd	%ymm2, %ymm14, %ymm15   
	vpunpcklwd	%ymm4, %ymm5, %ymm1     
	vpunpckhwd	%ymm4, %ymm5, %ymm2     
	vmovdqu	%ymm2, 96(%rsp)                 
	vpunpcklwd	%ymm11, %ymm9, %ymm2    
	vpunpckhwd	%ymm11, %ymm9, %ymm4    
	vmovdqu	%ymm4, 64(%rsp)                 
	vmovdqu	(%rsp), %ymm4                   
	vpunpcklwd	%ymm7, %ymm4, %ymm11    
	vpunpckhwd	%ymm7, %ymm4, %ymm9     
	vmovdqu	(%r9), %ymm5
	vmovdqu	32(%rsp), %ymm7                 
	vpunpcklwd	%ymm5, %ymm7, %ymm4     
	vpunpckhwd	%ymm5, %ymm7, %ymm7     
	vpunpcklwd	%ymm8, %ymm3, %ymm5     
	vpunpckhwd	%ymm8, %ymm3, %ymm8     
	vpunpcklwd	%ymm12, %ymm6, %ymm3    
	vpunpckhwd	%ymm12, %ymm6, %ymm14   
	vpunpcklwd	%ymm0, %ymm10, %ymm12   
	vpunpckhwd	%ymm0, %ymm10, %ymm10   
	vpunpcklwd	%ymm15, %ymm13, %ymm0   
	vpunpckhwd	%ymm15, %ymm13, %ymm15  
	vpunpcklwd	%ymm11, %ymm1, %ymm13   
	vpunpckhwd	%ymm11, %ymm1, %ymm11   
	vpunpcklwd	%ymm4, %ymm2, %ymm1     
	vpunpckhwd	%ymm4, %ymm2, %ymm4     
	vmovups	96(%rsp), %ymm2                 
	vpunpcklwd	%ymm9, %ymm2, %ymm6     
	vpunpckhwd	%ymm9, %ymm2, %ymm2     
	vmovdqu	%ymm2, 96(%rsp)                 
	vmovups	64(%rsp), %ymm9                 
	vpunpcklwd	%ymm7, %ymm9, %ymm2     
	vpunpckhwd	%ymm7, %ymm9, %ymm9     
	vpunpcklwd	%ymm3, %ymm5, %ymm7     
	vpunpckhwd	%ymm3, %ymm5, %ymm5     
	vpunpcklwd	%ymm14, %ymm8, %ymm3    
	vmovdqu	%ymm3, 160(%rsp)                
	vpunpckhwd	%ymm14, %ymm8, %ymm3    
	vmovdqu	%ymm3, 192(%rsp)                
	vpunpcklwd	%ymm0, %ymm12, %ymm8    
	vpunpckhwd	%ymm0, %ymm12, %ymm0    
	vmovdqu	%ymm0, 32(%rsp)                 
	vpunpcklwd	%ymm15, %ymm10, %ymm0   
	vmovdqu	%ymm0, (%rsp)                   
	vpunpckhwd	%ymm15, %ymm10, %ymm0   
	vmovdqu	%ymm0, 64(%rsp)                 
	vpunpcklwd	%ymm1, %ymm13, %ymm15   
	vpunpckhwd	%ymm1, %ymm13, %ymm12   
	vpunpcklwd	%ymm4, %ymm11, %ymm14   
	vpunpckhwd	%ymm4, %ymm11, %ymm11   
	vpunpcklwd	%ymm2, %ymm6, %ymm13    
	vpunpckhwd	%ymm2, %ymm6, %ymm10    
	vmovups	%ymm7, 128(%rsp)                
	vinsertf128	$1, %xmm15, %ymm7, %ymm0
	vmovups	%ymm0, (%r14,%r10)
	vmovdqa	%ymm5, %ymm7
	vinsertf128	$1, %xmm12, %ymm7, %ymm0
	leaq	(%r14,%r10), %r11
	vmovups	%ymm0, (%r11,%r15,2)
	vmovups	160(%rsp), %ymm5                
	vinsertf128	$1, %xmm14, %ymm5, %ymm0
	vmovups	%ymm0, (%r11,%r15,4)
	vmovups	192(%rsp), %ymm6                
	vinsertf128	$1, %xmm11, %ymm6, %ymm0
	leaq	(%r11,%rsi), %r12
	addq	%rsi, %r12
	vmovups	%ymm0, (%r12,%r15,2)
	vmovups	96(%rsp), %ymm0                 
	vpunpcklwd	%ymm9, %ymm0, %ymm1     
	vpunpckhwd	%ymm9, %ymm0, %ymm0     
	vinsertf128	$1, %xmm13, %ymm8, %ymm2
	vmovups	%ymm2, (%r11,%r15,8)
	vmovups	32(%rsp), %ymm4                 
	vinsertf128	$1, %xmm10, %ymm4, %ymm2
	addq	%r8, %r12
	vmovups	%ymm2, (%r12,%r15,2)
	vmovups	(%rsp), %ymm9                   
	vinsertf128	$1, %xmm1, %ymm9, %ymm2
	leaq	(%r12,%r15,2), %r11
	vmovups	%ymm2, (%rsi,%r11)
	vmovups	64(%rsp), %ymm3                 
	vinsertf128	$1, %xmm0, %ymm3, %ymm2
	addq	%rsi, %r11
	vmovups	%ymm2, (%rsi,%r11)
	vperm2f128	$19, 128(%rsp), %ymm15, %ymm2
	addq	%rsi, %r11
	vmovups	%ymm2, (%rsi,%r11)
	vperm2f128	$49, %ymm12, %ymm7, %ymm2 
	addq	%rsi, %r11
	vmovups	%ymm2, (%rsi,%r11)
	vperm2f128	$49, %ymm14, %ymm5, %ymm2 
	addq	%rsi, %r11
	vmovups	%ymm2, (%rsi,%r11)
	vperm2f128	$49, %ymm11, %ymm6, %ymm2 
	addq	%rsi, %r11
	vmovups	%ymm2, (%rsi,%r11)
	vperm2f128	$49, %ymm13, %ymm8, %ymm2 
	addq	%rsi, %r11
	vmovups	%ymm2, (%rsi,%r11)
	vperm2f128	$49, %ymm10, %ymm4, %ymm2 
	addq	%rsi, %r11
	vmovups	%ymm2, (%rsi,%r11)
	vperm2f128	$49, %ymm1, %ymm9, %ymm1 
	addq	%rsi, %r11
	vmovups	%ymm1, (%rsi,%r11)
	vperm2f128	$49, %ymm0, %ymm3, %ymm0 
	addq	%rsi, %r11
	vmovups	%ymm0, (%rsi,%r11)
	addq	$32, %r10
	addq	$8192, %r9                      
	cmpq	$512, %r10                      

This changes what block sizes are best used for transposing. Here are the best block sizes for each type before and after this change:

AVX512:

| State | Bytes per element | Best width | Best height | Bandwidth (GB/s) |
|---|---|---|---|---|
| Before | 1 | 16 | 8 | 10.154 |
| After | 1 | 32 | 32 | 19.0136 |
| Before | 2 | 8 | 16 | 20.2936 |
| After | 2 | 16 | 32 | 26.3541 |
| Before | 4 | 8 | 8 | 24.3352 |
| After | 4 | 16 | 16 | 30.8833 |
| Before | 8 | 8 | 4 | 22.6106 |
| After | 8 | 8 | 8 | 22.5469 |

AVX2:

| State | Bytes per element | Best width | Best height | Bandwidth (GB/s) |
|---|---|---|---|---|
| Before | 1 | 16 | 8 | 11.5999 |
| After | 1 | 16 | 32 | 17.0604 |
| Before | 2 | 4 | 8 | 15.5821 |
| After | 2 | 16 | 16 | 23.5512 |
| Before | 4 | 4 | 8 | 21.977 |
| After | 4 | 8 | 8 | 27.7187 |
| Before | 8 | 4 | 8 | 19.0545 |
| After | 8 | 8 | 4 | 23.823 |

A good rule of thumb seems to be that you now want to use 512-byte blocks on avx2, and 1024-byte blocks on avx512.
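
To put that rule of thumb in scheduling terms, here is a hedged sketch (not code from this PR) of a Halide uint16 transpose tiled at 16x16, i.e. 512-byte blocks, the AVX2 sweet spot from the table above. The function and file names are made up for illustration.

```cpp
#include "Halide.h"
using namespace Halide;

int main() {
    // A uint16 transpose scheduled with 16x16 tiles:
    // 16 * 16 * 2 bytes = 512 bytes per block.
    ImageParam input(UInt(16), 2, "input");
    Func transposed("transposed");
    Var x("x"), y("y"), xi("xi"), yi("yi");

    transposed(x, y) = input(y, x);

    transposed.tile(x, y, xi, yi, 16, 16)
              .vectorize(xi)
              .unroll(yi);

    // Emit assembly to inspect the generated interleaves.
    transposed.compile_to_assembly("transpose_u16.s", {input}, "transpose_u16",
                                   Target("x86-64-linux-sse41-avx-avx2"));
    return 0;
}
```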

Note: an earlier comment reported a time that seemed to have regressed. It was not 8.2 ms on main; more like 11 ms.
abadams (Member, Author) commented Jan 27, 2026

Also notable: LLVM is happy to undo all this shuffle factorization work, fusing the shuffles back together into a big mess. So a new mechanism in this PR is optimization_fence, which abuses LLVM's arithmetic fence intrinsic to prevent fusion of shuffle instructions (it's intended to prevent reassociation of floating-point ops).
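
For illustration, here is a hypothetical sketch of what such a fence helper could look like at the IRBuilder level. This is not the PR's implementation; in particular, the bitcast-to-float detail is my assumption, since llvm.arith.fence is only defined for floating-point types.

```cpp
// Hypothetical sketch of an optimization_fence-style helper (not the PR's code).
// Wrap a freshly built shuffle result in llvm.arith.fence so that later
// shuffle-combining passes treat the value as opaque and can't fuse it back.
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"

llvm::Value *optimization_fence(llvm::IRBuilder<> &builder, llvm::Value *v) {
    llvm::Type *orig_ty = v->getType();
    auto *vec_ty = llvm::cast<llvm::FixedVectorType>(orig_ty);

    // The intrinsic only accepts floating-point (vector) types, so build a
    // float vector of the same total bit width (assumes a multiple of 32 bits).
    unsigned total_bits = vec_ty->getNumElements() * vec_ty->getScalarSizeInBits();
    llvm::Type *float_ty =
        llvm::FixedVectorType::get(builder.getFloatTy(), total_bits / 32);

    llvm::Value *as_float = builder.CreateBitCast(v, float_ty);
    llvm::Value *fenced = builder.CreateIntrinsic(
        llvm::Intrinsic::arithmetic_fence, {float_ty}, {as_float});
    return builder.CreateBitCast(fenced, orig_ty);
}
```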

@alexreinking self-requested a review on January 27, 2026 at 20:16

alexreinking (Member) left a comment

A couple of comment nits, but otherwise everything read very clearly. Did you write most of these comments yourself, or did Claude?

abadams (Member, Author) commented Jan 27, 2026

I wrote them all myself. And re-reading it all myself, I found a bunch of stuff I didn't like and made more changes. Maybe it existed in the original Python too, but at this point I've rewritten enough of the code that I don't think this counts as co-authored by Claude anymore.
