6
\$\begingroup\$

For a little project of mine I have written two versions for shifting a 128-bit unsigned integer consisting of four 32-bit unsigned integers in x86 assembly. I cannot really decide which is better in performance, style, etc..

UPDATE: I have written another two different functions. The first one doesn't need any conditional jumps, so there is no problem with branch mispredictions. However, it needs 40 bytes (on a 32-bit platform) to store the jump-tables that I have created in static memory. The second one uses conditional jumps, but in a better way than before, I think. Both functions doesn't really care about shift-values >=128/=0.

UPDATE 2: Because I was unsatisfied with the size of the jump-table (especially on a 64-bit platform) I rewrote the first function as a compromise between conditional jumps and jump-table size.

       .data
   JTABLE:
       .long L0,L1,L2,L3
       .text
       .global _shl_128
       .intel_syntax
   _shl_128:
       push ebx
       push esi
       mov edx, [esp+12]            //pointer to array of integers
       mov ecx, [esp+16]            //value of bits to shift
       mov esi, ecx
       shr esi, 5    
       mov esi, [JTABLE+esi*4]
       mov eax, [edx]
       mov ebx, [edx+4]
       and ecx, 31
       jmp esi
   L0:
       mov esi, [edx+8]
       shld [edx+12], esi, cl
       shld esi, ebx, cl
       shld ebx, eax, cl
       shl eax, cl
       mov [edx], eax
       mov [edx+4], ebx
       mov [edx+8], esi
       jmp L4
   L1:
       mov esi, [edx+8]
       je L5
       shld esi, ebx, cl
       shld ebx, eax, cl
       shl eax
   L5:
       mov [edx+4], eax
       mov [edx+8], ebx
       mov [edx+12], esi
       jmp L7    
   L2:
       je L6
       shld ebx, eax, cl
       shl eax
   L6:
       mov [edx+8], eax
       mov [edx+12], ebx
       jmp L8    
   L3:
       shl eax, cl
       mov [edx+12], eax 
       mov dword ptr [edx+8], 0
   L8:    
       mov dword ptr [edx+4], 0
   L7:    
       mov dword ptr [edx], 0    
   L4:
       pop esi
       pop ebx
       ret
  1. Function:

        .text
        .global _shl_128
        .intel_syntax
    _shl_128:
        push ebx
        push esi
        mov edx, [esp+12]            //pointer to array of integers
        mov ecx, [esp+16]            //value of bits to shift
        mov esi, ecx
        and ecx, 31
        cmp esi, 96
        mov eax, [edx]
        jae L1
        cmp esi, 64
        mov ebx, [edx+4]
        jae L2
        cmp esi, 32
        mov esi, [edx+8]
        jae L3
        shld [edx+12], esi, cl
        shld esi, ebx, cl
        shld ebx, eax, cl
        shl eax, cl
        mov [edx], eax
        mov [edx+4], ebx
        mov [edx+8], esi
        jmp L4
    L3:
        je L5
        shld esi, ebx, cl
        shld ebx, eax, cl
        shl eax
    L5:
        mov [edx+4], eax
        mov [edx+8], ebx
        mov [edx+12], esi
        jmp L6
    L2:
        je L7
        shld ebx, eax, cl
        shl eax
    L7:
        mov [edx+8], eax
        mov [edx+12], ebx
        jmp L8
    L1:
        je L9
        shl eax, cl
    L9:
        mov [edx+8], eax 
        mov dword ptr [edx+8], 0
    L8:    
        mov dword ptr [edx+4], 0
    L6:    
        mov dword ptr [edx], 0    
    L4:
        pop esi
        pop ebx
        ret
    
\$\endgroup\$
8
  • \$\begingroup\$ It looks like you aren't actually shifting 4 32bit integers, but are actually shifting one 128bit integer. \$\endgroup\$ Commented Aug 9, 2017 at 1:32
  • \$\begingroup\$ Yeah, that is exactly what I want to do. Maybe I expressed myself not quite correct but I wanted to say that I want to shift a 128 bit integer consisting of four 32-bit unsigned integers. \$\endgroup\$
    – idlmn89
    Commented Aug 9, 2017 at 2:00
  • \$\begingroup\$ Does your function actually work the way you expect? If you shift 0x2 left by 31 bits, do you expect 0or 0x100000000? Right now I see your code giving you 0. \$\endgroup\$
    – JS1
    Commented Aug 16, 2017 at 5:33
  • \$\begingroup\$ @JS1: You were right. The destination of the jmp-instruction after the code-block for shift-values lower 32 bit was wrong. \$\endgroup\$
    – idlmn89
    Commented Aug 16, 2017 at 6:08
  • \$\begingroup\$ @idlmn89 I don't think that's the only problem. If you want to "carry" bits from one 32-bit word to another, you will need to use a right shift somewhere, and I don't see that in your code anywhere. \$\endgroup\$
    – JS1
    Commented Aug 16, 2017 at 6:43

2 Answers 2

3
\$\begingroup\$

Well, I don't have a 32 bit build environment available just now. While I wrote this for 64bit, I'm only processing 32bits at a time. I haven't timed it, so I can't say if it is faster than yours, but it has no jumps. At the very least, it might give you some ideas.

If the comments look like copy/paste from some C code, there's a reason for that...

; void Shl(int s, DWORD *m)
; Shift the 4 DWORDs at m left by s bits

Shl proc

; s - ecx
; m - rdx

; rax, rcx, rdx, r8-r11 are scratch under Windows 64bit calling convention

    mov r8d, [rdx + 12]     ; t1 = m3; MSB
    mov r9d, [rdx + 8]      ; t2 = m2;
    mov r10d, [rdx + 4]     ; t3 = m1;
    mov r11d, [rdx]         ; t4 = m0; LSB
    xor eax, eax            ; t0 = 0;

    cmp ecx, 32
    cmovge r8d, r9d       ; t1 = t2;
    cmovge r9d, r10d      ; t2 = t3;
    cmovge r10d, r11d     ; t3 = t4;
    cmovge r11d, eax      ; t4 = t0;

    cmp ecx, 64
    cmovge r8d, r9d       ; t1 = t2;
    cmovge r9d, r10d      ; t2 = t3;
    cmovge r10d, r11d     ; t3 = t4;

    cmp ecx, 96
    cmovge r8d, r9d       ; t1 = t2;
    cmovge r9d, r10d      ; t2 = t3;

    cmp ecx, 128
    cmovge r8d, r9d       ; t1 = t2;

    and ecx, 31

    shld r8d, r9d, cl       ; t1 = (t1 << s) | (t2 >> (32 - s));
    shld r9d, r10d, cl      ; t2 = (t2 << s) | (t3 >> (32 - s));
    shld r10d, r11d, cl     ; t3 = (t3 << s) | (t4 >> (32 - s));
    shl r11d, cl            ; t4 = t4 << s;     

    mov [rdx + 12], r8d  ; m[3] = t1;
    mov [rdx + 8], r9d   ; m[2] = t2;
    mov [rdx + 4], r10d  ; m[1] = t3;
    mov [rdx], r11d      ; m[0] = t4;

    ret

Shl endp

Yes, I'm using 7 registers, but if you push everything and use EBP, I think you'll be able to squeeze this in. Another reason I prefer 64bit.

\$\endgroup\$
0
\$\begingroup\$

While the conditional moves are kinda cool in concept, they aren't great for performance. A solution with better performance (at least for the test suite I'm using):

; void Shl(int s, DWORD *m)
; Shift the 4 DWORDs at m left by s bits

Shl proc

; s - ecx
; m - rdx

; rax, rcx, rdx, r8-r11 are scratch under Windows 64bit calling convention

    mov eax, ecx    ; Make a copy of ecx
    and ecx, 31     ; Mask out the lower bits

    xor r8d, r8d    ; zero out the 4 DWORDs
    xor r9d, r9d
    xor r10d, r10d
    xor r11d, r11d

    shr eax, 5      ; Divide by 32
    xor eax, 3      ; eax = 3 - eax

    cmp eax, 3      ; Handle >= 128
    jg start

    mov r8d, [rdx + rax * 4]    ; Read the DWORD to be used as MSB
    dec rax

    jl start                    ; More to read?
    mov r9d, [rdx + rax * 4]    ; Read the next DWORD
    dec rax

    jl start                    ; More to read?
    mov r10d, [rdx + rax * 4]   ; Read the next DWORD
    dec rax

    jl start                    ; More to read?
    mov r11d, [rdx + rax * 4]   ; Read the next DWORD

start:

    shld r8d, r9d, cl       ; t1 = (t1 << s) | (t2 >> (32 - s));
    mov [rdx + 12], r8d     ; m[3] = t1;

    shld r9d, r10d, cl      ; t2 = (t2 << s) | (t3 >> (32 - s));
    mov [rdx + 8], r9d      ; m[2] = t2;

    shld r10d, r11d, cl     ; t3 = (t3 << s) | (t4 >> (32 - s));
    mov [rdx + 4], r10d     ; m[1] = t3;

    shl r11d, cl            ; t4 = t4 << s;
    mov [rdx], r11d         ; m[0] = t4;

    ret

Shl endp

Still uses 7 registers, although that can be shrunk a bit if needed.

While interleaving the shld and mov means the s >= 128 case is suboptimal, it helps for the other (much more common) cases.

I should probably mention that my C code produces (slightly) better performance. But that's due to VS2017 using some BMI2 instructions. That seemed like cheating.

cmov: 1586
this: 1037
BMI2:  917
\$\endgroup\$

Not the answer you're looking for? Browse other questions tagged or ask your own question.