07 October, 2017

AsmJit - Register Allocator Progress


This post is dedicated to a new register allocation (RA) in AsmJit library. AsmJit provides register allocation since the beginning as it allows to write JIT code fast and especially to connect various asm components together without a significant effort. However, register allocation is a very complicated and challenging subject and the previous AsmJit version simply used a basic linear-scan with some ad-hoc tweaks.

New RA summary:

  • Control Flow Graph (CFG), CFG builder, and TiedRegs.
  • Liveness analysis at block level (IN, OUT, GEN, KILL, per-block statistics).
  • Liveness analysis at instruction level (live-ranges and per-instruction statistics).
  • Global allocator (bin packing).
  • Local allocation (instruction-by-instruction).
  • Rewriter.

Another thing that really improved is logging. Now each RA part can log a lot of details about the work it does, which is really useful for tuning and bug fixing. All components log to AsmJit's Logger interface that can be turned on/off at runtime. This means that code that is shipped and has initially logging turned off can still provide option to enable it at runtime (based on command line switch, for example).

Sample code

All parts of this post will use a sample 'AlphaBlend' function used by AsmJit tests:

void generateAlphaBlend(asmjit::X86Compiler& cc) {
  using namespace asmjit;

  X86Gp dst = cc.newIntPtr("dst");
  X86Gp src = cc.newIntPtr("src");

  X86Gp i = cc.newIntPtr("i");
  X86Gp j = cc.newIntPtr("j");
  X86Gp t = cc.newIntPtr("t");

  X86Xmm vzero = cc.newXmm("vzero");
  X86Xmm v0080 = cc.newXmm("v0080");
  X86Xmm v0101 = cc.newXmm("v0101");

  Label L_SmallLoop = cc.newLabel();
  Label L_SmallEnd  = cc.newLabel();
  Label L_LargeLoop = cc.newLabel();
  Label L_LargeEnd  = cc.newLabel();
  Label L_DataPool  = cc.newLabel();

  cc.addFunc(FuncSignature3<void, void*, const void*, size_t>(cc.getCodeInfo().getCdeclCallConv()));

  cc.setArg(0, dst);
  cc.setArg(1, src);
  cc.setArg(2, i);

  // How many pixels have to be processed to make the loop aligned.
  cc.lea(t, x86::ptr(L_DataPool));
  cc.xorps(vzero, vzero);
  cc.movaps(v0080, x86::ptr(t, 0));
  cc.movaps(v0101, x86::ptr(t, 16));

  cc.xor_(j, j);
  cc.sub(j, dst);
  cc.and_(j, 15);
  cc.shr(j, 2);

  cc.cmp(j, i);
  cc.cmovg(j, i); // j = min(i, j).
  cc.sub(i, j);   // i -= j.

  // Small loop.
    X86Xmm x0 = cc.newXmm("x0");
    X86Xmm y0 = cc.newXmm("y0");
    X86Xmm a0 = cc.newXmm("a0");

    cc.movd(y0, x86::ptr(src));
    cc.movd(x0, x86::ptr(dst));

    cc.pcmpeqb(a0, a0);
    cc.pxor(a0, y0);
    cc.psrlw(a0, 8);
    cc.punpcklbw(x0, vzero);

    cc.pshuflw(a0, a0, x86::shufImm(1, 1, 1, 1));
    cc.punpcklbw(y0, vzero);

    cc.pmullw(x0, a0);
    cc.paddsw(x0, v0080);
    cc.pmulhuw(x0, v0101);

    cc.paddw(x0, y0);
    cc.packuswb(x0, x0);

    cc.movd(x86::ptr(dst), x0);

    cc.add(dst, 4);
    cc.add(src, 4);


  // Second section, prepare for an aligned loop.

  cc.test(i, i);
  cc.mov(j, i);

  cc.and_(j, 3);
  cc.shr(i, 2);

  // Aligned loop.
    X86Xmm x0 = cc.newXmm("x0");
    X86Xmm x1 = cc.newXmm("x1");
    X86Xmm y0 = cc.newXmm("y0");
    X86Xmm a0 = cc.newXmm("a0");
    X86Xmm a1 = cc.newXmm("a1");

    cc.movups(y0, x86::ptr(src));
    cc.movaps(x0, x86::ptr(dst));

    cc.pcmpeqb(a0, a0);
    cc.xorps(a0, y0);
    cc.movaps(x1, x0);

    cc.psrlw(a0, 8);
    cc.punpcklbw(x0, vzero);

    cc.movaps(a1, a0);
    cc.punpcklwd(a0, a0);

    cc.punpckhbw(x1, vzero);
    cc.punpckhwd(a1, a1);

    cc.pshufd(a0, a0, x86::shufImm(3, 3, 1, 1));
    cc.pshufd(a1, a1, x86::shufImm(3, 3, 1, 1));

    cc.pmullw(x0, a0);
    cc.pmullw(x1, a1);

    cc.paddsw(x0, v0080);
    cc.paddsw(x1, v0080);

    cc.pmulhuw(x0, v0101);
    cc.pmulhuw(x1, v0101);

    cc.add(src, 16);
    cc.packuswb(x0, x1);

    cc.paddw(x0, y0);
    cc.movaps(x86::ptr(dst), x0);

    cc.add(dst, 16);


  cc.test(j, j);

  // Data.
  cc.align(kAlignData, 16);

Control Flow Graph (CFG)

CFG is a directed graph consisting of basic blocks (BB). Each BB has a list of instructions, predecessors, and successors, in addition to other metadata like statistics. The previous version of AsmJit didn't use CFG, it instead used a list of jumps and their targets that was generated during the analysis phase (called fetch in the previous AsmJit).

CFG makes code much more clear and it's something you will see in all Compiler related literature and implementations (all register allocators I checked out recently used CFG).

The CFG of the sample function:

  [Func] L5: void(u64@rdi dst, u64@rsi src, u64@rdx i)
    lea t, [L4]
    xorps vzero, vzero
    movaps v0080, [t]
    movaps v0101, [t+16]
    xor j, j
    sub j, dst
    and j, 15
    shr j, 2
    jz L1
    cmp j, i
    cmovg j, i
    sub i, j
    movd y0, [src]
    movd x0, [dst]
    pcmpeqb a0, a0
    pxor a0, y0
    psrlw a0, 8
    punpcklbw x0, vzero
    pshuflw a0, a0, 85{1|1|1|1}
    punpcklbw y0, vzero
    pmullw x0, a0
    paddsw x0, v0080
    pmulhuw x0, v0101
    paddw x0, y0
    packuswb x0, x0
    movd [dst], x0
    add dst, 4
    add src, 4
    dec j
    jnz L0
    test i, i
    mov j, i
    jz L6
    and j, 3
    shr i, 2
    jz L3
    movups y0, [src]
    movaps x0, [dst]
    pcmpeqb a0, a0
    xorps a0, y0
    movaps x1, x0
    psrlw a0, 8
    punpcklbw x0, vzero
    movaps a1, a0
    punpcklwd a0, a0
    punpckhbw x1, vzero
    punpckhwd a1, a1
    pshufd a0, a0, 245{1|1|3|3}
    pshufd a1, a1, 245{1|1|3|3}
    pmullw x0, a0
    pmullw x1, a1
    paddsw x0, v0080
    paddsw x1, v0080
    pmulhuw x0, v0101
    pmulhuw x1, v0101
    add src, 16
    packuswb x0, x1
    paddw x0, y0
    movaps [dst], x0
    add dst, 16
    dec i
    jnz L2
    test j, j
    jnz L0
  IDom of #1 -> #0
  IDom of #2 -> #1
  IDom of #3 -> #0
  IDom of #4 -> #3
  IDom of #5 -> #4
  IDom of #6 -> #4
  IDom of #7 -> #3
  IDom of #2 -> #0
  Done (3 iterations)

You can see that the logging improved so much. The logger is able to show function signature, each basic block {#XXX}, and also indents the code in a way to make it readable. Part of CFG is also building CFG views (currently only post-order-view) and dominance.

AsmJit does one more step at CFG construction that is not shown above, which is building TiedRegs from operands and filling their Read/Write information. AsmJit uses a data structure called RAInst, which contains an array of TiedRegs and some other data. TiedRegs solve the problem where one virtual register is used more than once in instruction's operands. A single TiedReg can link to 1 or more operand of an instruction, and instruction never uses 2 TiedRegs which would refer to the same virtual register.

Each TiedReg has the following information used during allocation and rewriting:

  • READ flag (R) - Read access.
  • WRITE flag (W) - Write access.
  • RW flag (X) - Combination of R and W flags.
  • USE slot - Register has a USE slot (either read or RW slot).
  • OUT slot - Register has an OUT slot (Write-only slot).

You may wonder why there are access flags as well as slots. Well, slots are used by the allocator in a very specific way. Since TiedRegs can contain data merged from multiple operands it's important to keep what each TiedReg represents. Imagine the following cases:

add x, y     ; x={R|W|Use}     y={R|Use}
lea x, [x+y] ; x={R|W|Use|Out} y={R|Use} 

Note the 'out' in the second case that tells us that 'x' is in both USE and OUT slots. In other words, TiedReg('x') contains a combined information from two operands [R|Use] and [W|Out], which still preserves the fact that we have USE and OUT slots and that the register allocator can reallocate 'x' into a different register by assigning to its OUT slot. And it would work even if we only use 'x' in LEA:

lea x, [x+x] ; x={R|W|Use|Out}

As you can see we still know that there are USE and OUT slots, we just lost the second USE slot, which doesn't really matter.

Liveness Analysis

Liveness analysis tells us which virtual register is alive and where. It happens in two phases:

  • 1. Block level - build LIVE_IN, LIVE_OUT, GEN, and KILL bit-vectors for each virtual register in each block.
  • 2. Instruction level - build live-ranges based on block-level liveness analysis and instructions of each block.

I really tried to optimize the liveness analysis as it has to collect a lot of information and iterate across the whole function. Here is an output from 'BuildLiveness' function:

  LiveIn/Out Done (16 visits)
    IN   [dst, i, src]
    OUT  [vzero, v0080, v0101, j, dst, i, src]
    GEN  [t, j, dst]
    KILL [t, vzero, v0080, v0101, j]
    IN   [vzero, v0080, v0101, j, dst, i, src]
    OUT  [vzero, v0080, v0101, j, dst, i, src]
    GEN  [j, i]
    IN   [vzero, v0080, v0101, j, dst, i, src]
    OUT  [vzero, v0080, v0101, j, dst, i, src]
    GEN  [vzero, v0080, v0101, j, dst, y0, src, x0, a0]
    KILL [y0, x0, a0]
    IN   [vzero, v0080, v0101, dst, i, src]
    OUT  [vzero, v0080, v0101, j, dst, i, src]
    GEN  [i]
    KILL [j]
    IN   [vzero, v0080, v0101, j, dst, i, src]
    OUT  [vzero, v0080, v0101, j, dst, i, src]
    GEN  [j, i]
    IN   [vzero, v0080, v0101, j, dst, i, src]
    OUT  [vzero, v0080, v0101, j, dst, i, src]
    GEN  [vzero, v0080, v0101, dst, i, src, y0, x0, a0, x1, a1]
    KILL [y0, x0, a0, x1, a1]
    IN   [vzero, v0080, v0101, j, dst, i, src]
    OUT  [vzero, v0080, v0101, j, dst, i, src]
    GEN  [j]
  t     {id:0260 width: 6    freq: 0.5000}: [3:9]
  vzero {id:0261 width: 125  freq: 0.0400}: [5:130]
  v0080 {id:0262 width: 123  freq: 0.0325}: [7:130]
  v0101 {id:0263 width: 121  freq: 0.0331}: [9:130]
  j     {id:0259 width: 116  freq: 0.0948}: [11:62], [65:130]
  dst   {id:0256 width: 128  freq: 0.0547}: [2:130]
  i     {id:0258 width: 128  freq: 0.0547}: [2:130]
  y0    {id:0265 width: 22   freq: 0.1818}: [27:49]
  src   {id:0257 width: 128  freq: 0.0312}: [2:130]
  x0    {id:0264 width: 24   freq: 0.3333}: [29:53]
  a0    {id:0266 width: 12   freq: 0.4167}: [31:43]
  y0    {id:0269 width: 42   freq: 0.0714}: [75:117]
  x0    {id:0267 width: 42   freq: 0.2143}: [77:119]
  a0    {id:0270 width: 22   freq: 0.3182}: [79:101]
  x1    {id:0268 width: 32   freq: 0.1875}: [83:115]
  a1    {id:0271 width: 14   freq: 0.2857}: [89:103]

The output shows liveness analysis of each block and live-ranges of each virtual register. To build live-ranges each instruction must have a position in code (which increments by 2 to describe read/write operations) and based on that information it's very simple to calculate additional statistics like number of occurrences (reads/writes), total width, and frequency. Frequency, in particular, is a very interesting metric that is used later in Bin Packing.

Global allocation - Bin Packing

Bin Packing is simply trying to pack live ranges of more virtual registers into one physical register. AsmJit first sorts virtual registers by weight (frequency) and then tries to bin pack from the highest to the lowest. Here is an output from 'BinPack' function:

[RAPass::BinPack] Available=15 (0x0000FFEF) Count=5
  00: [3:9@260], [11:62@259], [65:130@259]
  01: [2:130@256]
  02: [2:130@258]
  03: [2:130@257]
[RAPass::BinPack] Available=16 (0x0000FFFF) Count=11
  00: [31:43@266], [79:101@270]
  01: [29:53@264], [89:103@271]
  02: [27:49@265], [77:119@267]
  03: [83:115@268]
  04: [75:117@269]
  05: [5:130@261]
  06: [9:130@263]
  07: [7:130@262]

As we can see the solution to the sample function is quite simple. In more complex cases when Bin Packing fails all registers that were not packed are marked as 'StackPreferred', which means that the local allocators would prefer these on the stack. After the bin packing is done it's time for the local allocator.

Local allocation - Instruction by Instruction

Local allocator is very similar to the previous one. It goes from instruction to instruction and allocates registers based on information produced by all previous phases. It handles additional cases like loads/spills and also cases where a fixed register is required.

Local allocator works at block basis. To make sure that the allocation is correct it records the exact state of all register upon all block entries and emits a state-switch if necessary when switching between blocks where the state differs. This design is still very similar to the previous register allocator except one thing - in some cases the new allocator may change the state of unallocated blocks a bit. For example it can set a dirty flag of some register.


Rewriter is executed after the register allocation is complete. It traverses over each instruction and rewrites virtual registers into physical registers based on instruction's TiedRegs (Each TiedReg stores physical register of USE and OUT slots).

And finally, here is a the rewritten code of our function:

    push rbx                        ; 53
    mov rcx, rdi                    ; 488BCF
    mov rbx, rsi                    ; 488BDE
    lea rax, [L4]                   ; 488D05........| lea t, [L4]                 | t{W|Out}
    xorps xmm5, xmm5                ; 0F57ED        | xorps vzero, vzero          | vzero{W|Out|Last}
    movaps xmm7, [rax]              ; 0F2838        | movaps v0080, [t]           | t{R|Use} v0080{W|Out|Last}
    movaps xmm6, [rax+16]           ; 0F287010      | movaps v0101, [t+16]        | t{R|Use|Last|Kill} v0101{W|Out|Last}
    xor rax, rax                    ; 4833C0        | xor j, j                    | j{W|Out}
    sub rax, rcx                    ; 482BC1        | sub j, dst                  | j{X|Use} dst{R|Use|Last}
    and rax, 15                     ; 83E00F        | and j, 15                   | j{X|Use}
    shr rax, 2                      ; 48C1E802      | shr j, 2                    | j{X|Use|Last}
    jz L1                           ; 0F84........  | jz L1
    cmp rax, rdx                    ; 483BC2        | cmp j, i                    | j{R|Use} i{R|Use}
    cmovg rax, rdx                  ; 480F4FC2      | cmovg j, i                  | j{X|Use} i{R|Use}
    sub rdx, rax                    ; 482BD0        | sub i, j                    | i{X|Use|Last} j{R|Use|Last}
  L0:                               ;               | L0:
    movd xmm2, [rbx]                ; 660F6E13      | movd y0, [src]              | src{R|Use} y0{W|Out}
    movd xmm1, [rcx]                ; 660F6E09      | movd x0, [dst]              | dst{R|Use} x0{W|Out}
    pcmpeqb xmm0, xmm0              ; 660F74C0      | pcmpeqb a0, a0              | a0{W|Out}
    pxor xmm0, xmm2                 ; 660FEFC2      | pxor a0, y0                 | a0{X|Use} y0{R|Use}
    psrlw xmm0, 8                   ; 660F71D008    | psrlw a0, 8                 | a0{X|Use}
    punpcklbw xmm1, xmm5            ; 660F60CD      | punpcklbw x0, vzero         | x0{X|Use} vzero{R|Use}
    pshuflw xmm0, xmm0, 85{1|1|1|1} ; F20F70C055    | pshuflw a0, a0, 85{1|1|1|1} | a0{X|Use|Out}
    punpcklbw xmm2, xmm5            ; 660F60D5      | punpcklbw y0, vzero         | y0{X|Use} vzero{R|Use|Last}
    pmullw xmm1, xmm0               ; 660FD5C8      | pmullw x0, a0               | x0{X|Use} a0{R|Use|Last|Kill}
    paddsw xmm1, xmm7               ; 660FEDCF      | paddsw x0, v0080            | x0{X|Use} v0080{R|Use|Last}
    pmulhuw xmm1, xmm6              ; 660FE4CE      | pmulhuw x0, v0101           | x0{X|Use} v0101{R|Use|Last}
    paddw xmm1, xmm2                ; 660FFDCA      | paddw x0, y0                | x0{X|Use} y0{R|Use|Last|Kill}
    packuswb xmm1, xmm1             ; 660F67C9      | packuswb x0, x0             | x0{X|Use}
    movd [rcx], xmm1                ; 660F7E09      | movd [dst], x0              | dst{R|Use} x0{R|Use|Last|Kill}
    add rcx, 4                      ; 4883C104      | add dst, 4                  | dst{X|Use|Last}
    add rbx, 4                      ; 4883C304      | add src, 4                  | src{X|Use|Last}
    dec rax                         ; 48FFC8        | dec j                       | j{X|Use|Last}
    short jnz L0                    ; 75B9          | jnz L0
  L1:                               ;               | L1:
    test rdx, rdx                   ; 4885D2        | test i, i                   | i{R|Use}
    mov rax, rdx                    ; 488BC2        | mov j, i                    | j{W|Out|Last} i{R|Use|Last}
    jz L6                           ; 0F84........  | jz L6
    and rax, 3                      ; 83E003        | and j, 3                    | j{X|Use|Last}
    shr rdx, 2                      ; 48C1EA02      | shr i, 2                    | i{X|Use|Last}
    jz L3                           ; 0F84........  | jz L3
  L2:                               ;               | L2:
    movups xmm4, [rbx]              ; 0F1023        | movups y0, [src]            | src{R|Use} y0{W|Out}
    movaps xmm2, [rcx]              ; 0F2811        | movaps x0, [dst]            | dst{R|Use} x0{W|Out}
    pcmpeqb xmm0, xmm0              ; 660F74C0      | pcmpeqb a0, a0              | a0{W|Out}
    xorps xmm0, xmm4                ; 0F57C4        | xorps a0, y0                | a0{X|Use} y0{R|Use}
    movaps xmm3, xmm2               ; 0F28DA        | movaps x1, x0               | x1{W|Out} x0{R|Use}
    psrlw xmm0, 8                   ; 660F71D008    | psrlw a0, 8                 | a0{X|Use}
    punpcklbw xmm2, xmm5            ; 660F60D5      | punpcklbw x0, vzero         | x0{X|Use} vzero{R|Use}
    movaps xmm1, xmm0               ; 0F28C8        | movaps a1, a0               | a1{W|Out} a0{R|Use}
    punpcklwd xmm0, xmm0            ; 660F61C0      | punpcklwd a0, a0            | a0{X|Use}
    punpckhbw xmm3, xmm5            ; 660F68DD      | punpckhbw x1, vzero         | x1{X|Use} vzero{R|Use|Last}
    punpckhwd xmm1, xmm1            ; 660F69C9      | punpckhwd a1, a1            | a1{X|Use}
    pshufd xmm0, xmm0, 245{1|1|3|3} ; 660F70C0F5    | pshufd a0, a0, 245{1|1|3|3} | a0{X|Use|Out}
    pshufd xmm1, xmm1, 245{1|1|3|3} ; 660F70C9F5    | pshufd a1, a1, 245{1|1|3|3} | a1{X|Use|Out}
    pmullw xmm2, xmm0               ; 660FD5D0      | pmullw x0, a0               | x0{X|Use} a0{R|Use|Last|Kill}
    pmullw xmm3, xmm1               ; 660FD5D9      | pmullw x1, a1               | x1{X|Use} a1{R|Use|Last|Kill}
    paddsw xmm2, xmm7               ; 660FEDD7      | paddsw x0, v0080            | x0{X|Use} v0080{R|Use}
    paddsw xmm3, xmm7               ; 660FEDDF      | paddsw x1, v0080            | x1{X|Use} v0080{R|Use|Last}
    pmulhuw xmm2, xmm6              ; 660FE4D6      | pmulhuw x0, v0101           | x0{X|Use} v0101{R|Use}
    pmulhuw xmm3, xmm6              ; 660FE4DE      | pmulhuw x1, v0101           | x1{X|Use} v0101{R|Use|Last}
    add rbx, 16                     ; 4883C310      | add src, 16                 | src{X|Use|Last}
    packuswb xmm2, xmm3             ; 660F67D3      | packuswb x0, x1             | x0{X|Use} x1{R|Use|Last|Kill}
    paddw xmm2, xmm4                ; 660FFDD4      | paddw x0, y0                | x0{X|Use} y0{R|Use|Last|Kill}
    movaps [rcx], xmm2              ; 0F2911        | movaps [dst], x0            | dst{R|Use} x0{R|Use|Last|Kill}
    add rcx, 16                     ; 4883C110      | add dst, 16                 | dst{X|Use|Last}
    dec rdx                         ; 48FFCA        | dec i                       | i{X|Use|Last}
    short jnz L2                    ; 759E          | jnz L2
  L3:                               ;               | L3:
    test rax, rax                   ; 4885C0        | test j, j                   | j{R|Use|Last}
    jnz L0                          ; 0F8535FFFFFF  | jnz L0
  L6:                               ;               | L6:
    pop rbx                         ; 5B
    ret                             ; C3
    .align 16
    .data 80008000800080008000800080008000
    .data 01010101010101010101010101010101


I hope you enjoyed the summary of my part-time work on AsmJit of the past 9 months. The changes currently reside in next-wip branch which I would like to merge after I fix all remaining blockers. Today is a first day I managed to get Blend2D running with new AsmJit so I guess it will not take long time after now.