Shifting an 128-bit integer consisting of four 32-bit integers

Question

For a little project of mine I have written two versions for shifting a 128-bit unsigned integer consisting of four 32-bit unsigned integers in x86 assembly. I cannot really decide which is better in performance, style, etc..

UPDATE: I have written another two different functions. The first one doesn't need any conditional jumps, so there is no problem with branch mispredictions. However, it needs 40 bytes (on a 32-bit platform) to store the jump-tables that I have created in static memory. The second one uses conditional jumps, but in a better way than before, I think. Both functions doesn't really care about shift-values >=128/=0.

UPDATE 2: Because I was unsatisfied with the size of the jump-table (especially on a 64-bit platform) I rewrote the first function as a compromise between conditional jumps and jump-table size.

       .data
   JTABLE:
       .long L0,L1,L2,L3
       .text
       .global _shl_128
       .intel_syntax
   _shl_128:
       push ebx
       push esi
       mov edx, [esp+12]            //pointer to array of integers
       mov ecx, [esp+16]            //value of bits to shift
       mov esi, ecx
       shr esi, 5    
       mov esi, [JTABLE+esi*4]
       mov eax, [edx]
       mov ebx, [edx+4]
       and ecx, 31
       jmp esi
   L0:
       mov esi, [edx+8]
       shld [edx+12], esi, cl
       shld esi, ebx, cl
       shld ebx, eax, cl
       shl eax, cl
       mov [edx], eax
       mov [edx+4], ebx
       mov [edx+8], esi
       jmp L4
   L1:
       mov esi, [edx+8]
       je L5
       shld esi, ebx, cl
       shld ebx, eax, cl
       shl eax
   L5:
       mov [edx+4], eax
       mov [edx+8], ebx
       mov [edx+12], esi
       jmp L7    
   L2:
       je L6
       shld ebx, eax, cl
       shl eax
   L6:
       mov [edx+8], eax
       mov [edx+12], ebx
       jmp L8    
   L3:
       shl eax, cl
       mov [edx+12], eax 
       mov dword ptr [edx+8], 0
   L8:    
       mov dword ptr [edx+4], 0
   L7:    
       mov dword ptr [edx], 0    
   L4:
       pop esi
       pop ebx
       ret

Function:

    .text
    .global _shl_128
    .intel_syntax
_shl_128:
    push ebx
    push esi
    mov edx, [esp+12]            //pointer to array of integers
    mov ecx, [esp+16]            //value of bits to shift
    mov esi, ecx
    and ecx, 31
    cmp esi, 96
    mov eax, [edx]
    jae L1
    cmp esi, 64
    mov ebx, [edx+4]
    jae L2
    cmp esi, 32
    mov esi, [edx+8]
    jae L3
    shld [edx+12], esi, cl
    shld esi, ebx, cl
    shld ebx, eax, cl
    shl eax, cl
    mov [edx], eax
    mov [edx+4], ebx
    mov [edx+8], esi
    jmp L4
L3:
    je L5
    shld esi, ebx, cl
    shld ebx, eax, cl
    shl eax
L5:
    mov [edx+4], eax
    mov [edx+8], ebx
    mov [edx+12], esi
    jmp L6
L2:
    je L7
    shld ebx, eax, cl
    shl eax
L7:
    mov [edx+8], eax
    mov [edx+12], ebx
    jmp L8
L1:
    je L9
    shl eax, cl
L9:
    mov [edx+8], eax 
    mov dword ptr [edx+8], 0
L8:    
    mov dword ptr [edx+4], 0
L6:    
    mov dword ptr [edx], 0    
L4:
    pop esi
    pop ebx
    ret

It looks like you aren't actually shifting 4 32bit integers, but are actually shifting one 128bit integer. — David Wohlferd, Commented Aug 9, 2017 at 1:32
Yeah, that is exactly what I want to do. Maybe I expressed myself not quite correct but I wanted to say that I want to shift a 128 bit integer consisting of four 32-bit unsigned integers. — idlmn89, Commented Aug 9, 2017 at 2:00
Does your function actually work the way you expect? If you shift 0x2 left by 31 bits, do you expect 0or 0x100000000? Right now I see your code giving you 0. — JS1, Commented Aug 16, 2017 at 5:33
@JS1: You were right. The destination of the jmp-instruction after the code-block for shift-values lower 32 bit was wrong. — idlmn89, Commented Aug 16, 2017 at 6:08
@idlmn89 I don't think that's the only problem. If you want to "carry" bits from one 32-bit word to another, you will need to use a right shift somewhere, and I don't see that in your code anywhere. — JS1, Commented Aug 16, 2017 at 6:43

David Wohlferd · Accepted Answer · 2017-08-16 07:17:54Z

Well, I don't have a 32 bit build environment available just now. While I wrote this for 64bit, I'm only processing 32bits at a time. I haven't timed it, so I can't say if it is faster than yours, but it has no jumps. At the very least, it might give you some ideas.

If the comments look like copy/paste from some C code, there's a reason for that...

; void Shl(int s, DWORD *m)
; Shift the 4 DWORDs at m left by s bits

Shl proc

; s - ecx
; m - rdx

; rax, rcx, rdx, r8-r11 are scratch under Windows 64bit calling convention

    mov r8d, [rdx + 12]     ; t1 = m3; MSB
    mov r9d, [rdx + 8]      ; t2 = m2;
    mov r10d, [rdx + 4]     ; t3 = m1;
    mov r11d, [rdx]         ; t4 = m0; LSB
    xor eax, eax            ; t0 = 0;

    cmp ecx, 32
    cmovge r8d, r9d       ; t1 = t2;
    cmovge r9d, r10d      ; t2 = t3;
    cmovge r10d, r11d     ; t3 = t4;
    cmovge r11d, eax      ; t4 = t0;

    cmp ecx, 64
    cmovge r8d, r9d       ; t1 = t2;
    cmovge r9d, r10d      ; t2 = t3;
    cmovge r10d, r11d     ; t3 = t4;

    cmp ecx, 96
    cmovge r8d, r9d       ; t1 = t2;
    cmovge r9d, r10d      ; t2 = t3;

    cmp ecx, 128
    cmovge r8d, r9d       ; t1 = t2;

    and ecx, 31

    shld r8d, r9d, cl       ; t1 = (t1 << s) | (t2 >> (32 - s));
    shld r9d, r10d, cl      ; t2 = (t2 << s) | (t3 >> (32 - s));
    shld r10d, r11d, cl     ; t3 = (t3 << s) | (t4 >> (32 - s));
    shl r11d, cl            ; t4 = t4 << s;     

    mov [rdx + 12], r8d  ; m[3] = t1;
    mov [rdx + 8], r9d   ; m[2] = t2;
    mov [rdx + 4], r10d  ; m[1] = t3;
    mov [rdx], r11d      ; m[0] = t4;

    ret

Shl endp

Yes, I'm using 7 registers, but if you push everything and use EBP, I think you'll be able to squeeze this in. Another reason I prefer 64bit.

David Wohlferd · Accepted Answer · 2017-08-17 09:37:42Z

While the conditional moves are kinda cool in concept, they aren't great for performance. A solution with better performance (at least for the test suite I'm using):

; void Shl(int s, DWORD *m)
; Shift the 4 DWORDs at m left by s bits

Shl proc

; s - ecx
; m - rdx

; rax, rcx, rdx, r8-r11 are scratch under Windows 64bit calling convention

    mov eax, ecx    ; Make a copy of ecx
    and ecx, 31     ; Mask out the lower bits

    xor r8d, r8d    ; zero out the 4 DWORDs
    xor r9d, r9d
    xor r10d, r10d
    xor r11d, r11d

    shr eax, 5      ; Divide by 32
    xor eax, 3      ; eax = 3 - eax

    cmp eax, 3      ; Handle >= 128
    jg start

    mov r8d, [rdx + rax * 4]    ; Read the DWORD to be used as MSB
    dec rax

    jl start                    ; More to read?
    mov r9d, [rdx + rax * 4]    ; Read the next DWORD
    dec rax

    jl start                    ; More to read?
    mov r10d, [rdx + rax * 4]   ; Read the next DWORD
    dec rax

    jl start                    ; More to read?
    mov r11d, [rdx + rax * 4]   ; Read the next DWORD

start:

    shld r8d, r9d, cl       ; t1 = (t1 << s) | (t2 >> (32 - s));
    mov [rdx + 12], r8d     ; m[3] = t1;

    shld r9d, r10d, cl      ; t2 = (t2 << s) | (t3 >> (32 - s));
    mov [rdx + 8], r9d      ; m[2] = t2;

    shld r10d, r11d, cl     ; t3 = (t3 << s) | (t4 >> (32 - s));
    mov [rdx + 4], r10d     ; m[1] = t3;

    shl r11d, cl            ; t4 = t4 << s;
    mov [rdx], r11d         ; m[0] = t4;

    ret

Shl endp

Still uses 7 registers, although that can be shrunk a bit if needed.

While interleaving the shld and mov means the s >= 128 case is suboptimal, it helps for the other (much more common) cases.

I should probably mention that my C code produces (slightly) better performance. But that's due to VS2017 using some BMI2 instructions. That seemed like cheating.

cmov: 1586
this: 1037
BMI2:  917

Stack Exchange Network

Shifting an 128-bit integer consisting of four 32-bit integers

2 Answers 2

Not the answer you're looking for? Browse other questions tagged
performance
comparative-review
integer
bitwise
assembly
or ask your own question.

Hot Network Questions

Shifting an 128-bit integer consisting of four 32-bit integers

2 Answers 2

Not the answer you're looking for? Browse other questions tagged performancecomparative-reviewintegerbitwiseassembly or ask your own question.

Related

Hot Network Questions

Not the answer you're looking for? Browse other questions tagged
performance
comparative-review
integer
bitwise
assembly
or ask your own question.