Conversation

abadams (Member) commented Jan 27, 2026

Using the strategy in CodeGen_LLVM to do big vector interleaves (repeated 2-way interleaves), LLVM generates pretty poor code on x86. This is because x86 has no two-way vector interleave instruction until avx-512, and that instruction requires a runtime shuffle table, which uses up a register. The instructions x86 does have that take immediates are weird and hard to think about. Sticking to instructions that take immediates matters because interleaves often happen in high-register-pressure contexts (e.g. block transposes). This PR redoes vector interleaving for power-of-two blocks on x86 to use only unpckl and shufi/vperm2/vinsert instructions. The algorithm is somewhat complex and requires reasoning about permutations of the bits of the indices of each element; hopefully it is understandable given the jumbo comment. I first got it working in Python, and Claude correctly translated that to C++ for me, after which I made extensive rewrites.
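
To make the index-bit framing concrete, here is a minimal scalar model (mine, not code from this PR or the codegen itself): each 2-way interleave of the two halves of a flat buffer rotates the bits of every element's destination index by one position, so log2(N) such rounds compose into the full N-way interleave.

```cpp
// Scalar sketch: repeated 2-way interleaves == an N-way interleave,
// because each round is a one-bit rotation of the element indices.
#include <cassert>
#include <cstdio>
#include <vector>

// One 2-way interleave of the two halves of a flat buffer:
// {a0..a(m-1), b0..b(m-1)} -> {a0, b0, a1, b1, ...}.
// Destination index 2*i + j comes from source index j*m + i, i.e. the
// source index is the destination index rotated right by one bit.
std::vector<int> interleave2(const std::vector<int> &v) {
    size_t m = v.size() / 2;
    std::vector<int> out(v.size());
    for (size_t i = 0; i < m; i++) {
        out[2 * i] = v[i];
        out[2 * i + 1] = v[m + i];
    }
    return out;
}

int main() {
    const int n = 4, m = 8;  // interleave n vectors of m elements each
    // Flat input: element j of vector k lives at index k * m + j.
    std::vector<int> v(n * m);
    for (int i = 0; i < n * m; i++) v[i] = i;

    // log2(n) rounds of 2-way interleaving.
    for (int done = 1; done < n; done *= 2) {
        v = interleave2(v);
    }

    // Check against the direct n-way interleave: output position j*n + k
    // should hold element j of vector k. In binary, the source index (k:j)
    // and the destination index (j:k) are rotations of one another.
    for (int k = 0; k < n; k++) {
        for (int j = 0; j < m; j++) {
            assert(v[j * n + k] == k * m + j);
        }
    }
    printf("ok\n");
    return 0;
}
```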

On my machine, this makes block transposes significantly faster and smaller in code size, and avoids some of the pathological cases on main. E.g. a 16x16 transpose of uint16s on avx2 on main is 621 instructions total, taking 419 cycles. I'd paste it, but it's just a huge mess of various instructions. In this PR it's 134 instructions and 64 cycles:

	vmovdqu	-7680(%r9), %ymm0
	vmovdqu	-7168(%r9), %ymm1
	vmovdqu	-6656(%r9), %ymm12
	vmovdqu	-6144(%r9), %ymm14
	vmovdqu	-5632(%r9), %ymm6
	vmovdqu	-5120(%r9), %ymm8
	vmovdqu	-4608(%r9), %ymm15
	vmovdqu	-4096(%r9), %ymm2
	vmovdqu	-3584(%r9), %ymm5
	vmovdqu	-3072(%r9), %ymm9
	vmovups	-2560(%r9), %ymm3
	vmovups	%ymm3, (%rsp)                   
	vmovups	-2048(%r9), %ymm3
	vmovups	%ymm3, 32(%rsp)                 
	vmovdqu	-1536(%r9), %ymm4
	vmovdqu	-1024(%r9), %ymm11
	vmovdqu	-512(%r9), %ymm7
	vpunpcklwd	%ymm6, %ymm0, %ymm3     
	vpunpckhwd	%ymm6, %ymm0, %ymm10    
	vpunpcklwd	%ymm8, %ymm1, %ymm6     
	vpunpckhwd	%ymm8, %ymm1, %ymm13    
	vpunpcklwd	%ymm15, %ymm12, %ymm8   
	vpunpckhwd	%ymm15, %ymm12, %ymm0   
	vpunpcklwd	%ymm2, %ymm14, %ymm12   
	vpunpckhwd	%ymm2, %ymm14, %ymm15   
	vpunpcklwd	%ymm4, %ymm5, %ymm1     
	vpunpckhwd	%ymm4, %ymm5, %ymm2     
	vmovdqu	%ymm2, 96(%rsp)                 
	vpunpcklwd	%ymm11, %ymm9, %ymm2    
	vpunpckhwd	%ymm11, %ymm9, %ymm4    
	vmovdqu	%ymm4, 64(%rsp)                 
	vmovdqu	(%rsp), %ymm4                   
	vpunpcklwd	%ymm7, %ymm4, %ymm11    
	vpunpckhwd	%ymm7, %ymm4, %ymm9     
	vmovdqu	(%r9), %ymm5
	vmovdqu	32(%rsp), %ymm7                 
	vpunpcklwd	%ymm5, %ymm7, %ymm4     
	vpunpckhwd	%ymm5, %ymm7, %ymm7     
	vpunpcklwd	%ymm8, %ymm3, %ymm5     
	vpunpckhwd	%ymm8, %ymm3, %ymm8     
	vpunpcklwd	%ymm12, %ymm6, %ymm3    
	vpunpckhwd	%ymm12, %ymm6, %ymm14   
	vpunpcklwd	%ymm0, %ymm10, %ymm12   
	vpunpckhwd	%ymm0, %ymm10, %ymm10   
	vpunpcklwd	%ymm15, %ymm13, %ymm0   
	vpunpckhwd	%ymm15, %ymm13, %ymm15  
	vpunpcklwd	%ymm11, %ymm1, %ymm13   
	vpunpckhwd	%ymm11, %ymm1, %ymm11   
	vpunpcklwd	%ymm4, %ymm2, %ymm1     
	vpunpckhwd	%ymm4, %ymm2, %ymm4     
	vmovups	96(%rsp), %ymm2                 
	vpunpcklwd	%ymm9, %ymm2, %ymm6     
	vpunpckhwd	%ymm9, %ymm2, %ymm2     
	vmovdqu	%ymm2, 96(%rsp)                 
	vmovups	64(%rsp), %ymm9                 
	vpunpcklwd	%ymm7, %ymm9, %ymm2     
	vpunpckhwd	%ymm7, %ymm9, %ymm9     
	vpunpcklwd	%ymm3, %ymm5, %ymm7     
	vpunpckhwd	%ymm3, %ymm5, %ymm5     
	vpunpcklwd	%ymm14, %ymm8, %ymm3    
	vmovdqu	%ymm3, 160(%rsp)                
	vpunpckhwd	%ymm14, %ymm8, %ymm3    
	vmovdqu	%ymm3, 192(%rsp)                
	vpunpcklwd	%ymm0, %ymm12, %ymm8    
	vpunpckhwd	%ymm0, %ymm12, %ymm0    
	vmovdqu	%ymm0, 32(%rsp)                 
	vpunpcklwd	%ymm15, %ymm10, %ymm0   
	vmovdqu	%ymm0, (%rsp)                   
	vpunpckhwd	%ymm15, %ymm10, %ymm0   
	vmovdqu	%ymm0, 64(%rsp)                 
	vpunpcklwd	%ymm1, %ymm13, %ymm15   
	vpunpckhwd	%ymm1, %ymm13, %ymm12   
	vpunpcklwd	%ymm4, %ymm11, %ymm14   
	vpunpckhwd	%ymm4, %ymm11, %ymm11   
	vpunpcklwd	%ymm2, %ymm6, %ymm13    
	vpunpckhwd	%ymm2, %ymm6, %ymm10    
	vmovups	%ymm7, 128(%rsp)                
	vinsertf128	$1, %xmm15, %ymm7, %ymm0
	vmovups	%ymm0, (%r14,%r10)
	vmovdqa	%ymm5, %ymm7
	vinsertf128	$1, %xmm12, %ymm7, %ymm0
	leaq	(%r14,%r10), %r11
	vmovups	%ymm0, (%r11,%r15,2)
	vmovups	160(%rsp), %ymm5                
	vinsertf128	$1, %xmm14, %ymm5, %ymm0
	vmovups	%ymm0, (%r11,%r15,4)
	vmovups	192(%rsp), %ymm6                
	vinsertf128	$1, %xmm11, %ymm6, %ymm0
	leaq	(%r11,%rsi), %r12
	addq	%rsi, %r12
	vmovups	%ymm0, (%r12,%r15,2)
	vmovups	96(%rsp), %ymm0                 
	vpunpcklwd	%ymm9, %ymm0, %ymm1     
	vpunpckhwd	%ymm9, %ymm0, %ymm0     
	vinsertf128	$1, %xmm13, %ymm8, %ymm2
	vmovups	%ymm2, (%r11,%r15,8)
	vmovups	32(%rsp), %ymm4                 
	vinsertf128	$1, %xmm10, %ymm4, %ymm2
	addq	%r8, %r12
	vmovups	%ymm2, (%r12,%r15,2)
	vmovups	(%rsp), %ymm9                   
	vinsertf128	$1, %xmm1, %ymm9, %ymm2
	leaq	(%r12,%r15,2), %r11
	vmovups	%ymm2, (%rsi,%r11)
	vmovups	64(%rsp), %ymm3                 
	vinsertf128	$1, %xmm0, %ymm3, %ymm2
	addq	%rsi, %r11
	vmovups	%ymm2, (%rsi,%r11)
	vperm2f128	$19, 128(%rsp), %ymm15, %ymm2
	addq	%rsi, %r11
	vmovups	%ymm2, (%rsi,%r11)
	vperm2f128	$49, %ymm12, %ymm7, %ymm2 
	addq	%rsi, %r11
	vmovups	%ymm2, (%rsi,%r11)
	vperm2f128	$49, %ymm14, %ymm5, %ymm2 
	addq	%rsi, %r11
	vmovups	%ymm2, (%rsi,%r11)
	vperm2f128	$49, %ymm11, %ymm6, %ymm2 
	addq	%rsi, %r11
	vmovups	%ymm2, (%rsi,%r11)
	vperm2f128	$49, %ymm13, %ymm8, %ymm2 
	addq	%rsi, %r11
	vmovups	%ymm2, (%rsi,%r11)
	vperm2f128	$49, %ymm10, %ymm4, %ymm2 
	addq	%rsi, %r11
	vmovups	%ymm2, (%rsi,%r11)
	vperm2f128	$49, %ymm1, %ymm9, %ymm1 
	addq	%rsi, %r11
	vmovups	%ymm1, (%rsi,%r11)
	vperm2f128	$49, %ymm0, %ymm3, %ymm0 
	addq	%rsi, %r11
	vmovups	%ymm0, (%rsi,%r11)
	addq	$32, %r10
	addq	$8192, %r9                      
	cmpq	$512, %r10                      

This changes what block sizes are best used for transposing. Here are the best block sizes for each type before and after this change:

AVX512:

| State | Bytes per element | Best width | Best height | Bandwidth (GB/s) |
|---|---|---|---|---|
| Before | 1 | 16 | 8 | 10.154 |
| After | 1 | 32 | 32 | 19.0136 |
| Before | 2 | 8 | 16 | 20.2936 |
| After | 2 | 16 | 32 | 26.3541 |
| Before | 4 | 8 | 8 | 24.3352 |
| After | 4 | 16 | 16 | 30.8833 |
| Before | 8 | 8 | 4 | 22.6106 |
| After | 8 | 8 | 8 | 22.5469 |

AVX2:

| State | Bytes per element | Best width | Best height | Bandwidth (GB/s) |
|---|---|---|---|---|
| Before | 1 | 16 | 8 | 11.5999 |
| After | 1 | 16 | 32 | 17.0604 |
| Before | 2 | 4 | 8 | 15.5821 |
| After | 2 | 16 | 16 | 23.5512 |
| Before | 4 | 4 | 8 | 21.977 |
| After | 4 | 8 | 8 | 27.7187 |
| Before | 8 | 4 | 8 | 19.0545 |
| After | 8 | 8 | 4 | 23.823 |

A good rule of thumb seems to be that you now want to use 512-byte blocks on avx2, and 1024-byte blocks on avx512.
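
To put that rule of thumb in scheduling terms, here is a hedged sketch (not code from this PR) of a Halide uint16 transpose tiled at 16x16, i.e. 512-byte blocks, the AVX2 sweet spot from the table above. The function and file names are made up for illustration.

```cpp
#include "Halide.h"
using namespace Halide;

int main() {
    // A uint16 transpose scheduled with 16x16 tiles:
    // 16 * 16 * 2 bytes = 512 bytes per block.
    ImageParam input(UInt(16), 2, "input");
    Func transposed("transposed");
    Var x("x"), y("y"), xi("xi"), yi("yi");

    transposed(x, y) = input(y, x);

    transposed.tile(x, y, xi, yi, 16, 16)
              .vectorize(xi)
              .unroll(yi);

    // Emit assembly to inspect the generated interleaves.
    transposed.compile_to_assembly("transpose_u16.s", {input}, "transpose_u16",
                                   Target("x86-64-linux-sse41-avx-avx2"));
    return 0;
}
```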

Note: an earlier comment reported a time that seemed to have regressed. It was not 8.2 ms on main; more like 11 ms.
abadams (Member, Author) commented Jan 27, 2026

Also notable: LLVM is happy to undo all this shuffle factorization work, fusing the shuffles back together into a big mess. So a new mechanism in this PR is optimization_fence, which abuses LLVM's arithmetic fence intrinsic to prevent fusion of shuffle instructions (it's intended to prevent reassociation of floating-point ops).
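
For illustration, here is a hypothetical sketch of what such a fence helper could look like at the IRBuilder level. This is not the PR's implementation; in particular, the bitcast-to-float detail is my assumption, since llvm.arith.fence is only defined for floating-point types.

```cpp
// Hypothetical sketch of an optimization_fence-style helper (not the PR's code).
// Wrap a freshly built shuffle result in llvm.arith.fence so that later
// shuffle-combining passes treat the value as opaque and can't fuse it back.
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"

llvm::Value *optimization_fence(llvm::IRBuilder<> &builder, llvm::Value *v) {
    llvm::Type *orig_ty = v->getType();
    auto *vec_ty = llvm::cast<llvm::FixedVectorType>(orig_ty);

    // The intrinsic only accepts floating-point (vector) types, so build a
    // float vector of the same total bit width (assumes a multiple of 32 bits).
    unsigned total_bits = vec_ty->getNumElements() * vec_ty->getScalarSizeInBits();
    llvm::Type *float_ty =
        llvm::FixedVectorType::get(builder.getFloatTy(), total_bits / 32);

    llvm::Value *as_float = builder.CreateBitCast(v, float_ty);
    llvm::Value *fenced = builder.CreateIntrinsic(
        llvm::Intrinsic::arithmetic_fence, {float_ty}, {as_float});
    return builder.CreateBitCast(fenced, orig_ty);
}
```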

@alexreinking self-requested a review on January 27, 2026 at 20:16

alexreinking (Member) left a comment

A couple of comment nits, but otherwise everything read very clearly. Did you write most of these comments yourself, or did Claude?

abadams (Member, Author) commented Jan 27, 2026

I wrote them all myself. And re-reading it all myself, I found a bunch of stuff I didn't like and made more changes. Maybe it existed in the original Python too, but at this point I've rewritten enough of the code that I don't think this counts as co-authored by Claude anymore.
