For a little project of mine I have written two versions of a routine that left-shifts a 128-bit unsigned integer, stored as four 32-bit unsigned integers, in x86 assembly. I cannot really decide which one is better in terms of performance, style, etc.
UPDATE: I have written two more functions. The first one doesn't need any conditional jumps, so there is no problem with branch mispredictions. However, it needs 40 bytes (on a 32-bit platform) to store the jump tables that I have created in static memory. The second one uses conditional jumps, but in a better way than before, I think. Neither function treats shift values of 0 or shift values >= 128 specially.
UPDATE 2: Because I was unsatisfied with the size of the jump-table (especially on a 64-bit platform) I rewrote the first function as a compromise between conditional jumps and jump-table size.
// ---------------------------------------------------------------------
// _shl_128 -- shift a 128-bit unsigned integer left in place
//             (jump-table variant).
//
// C view (presumably): void shl_128(uint32_t *words, uint32_t count);
// ABI:   32-bit, arguments on the stack, caller cleans up.
// In:    [esp+4] = pointer to four 32-bit words, least significant first
//        [esp+8] = number of bits to shift left
// Out:   the four words are updated in memory.
//        A count of 128 or more clears all four words.
// Clobb: eax, ecx, edx, flags.  ebx and esi are saved and restored.
//
// The count is split into a whole-word part (count / 32), which selects
// a handler through JTABLE, and a residual part (count % 32), which is
// applied with shld/shl.
// ---------------------------------------------------------------------
.intel_syntax noprefix

.data
JTABLE:
        .long   Ljt_words0, Ljt_words1, Ljt_words2, Ljt_words3

.text
.global _shl_128
_shl_128:
        push    ebx
        push    esi
        mov     edx, [esp+12]           // pointer to array of integers
        mov     ecx, [esp+16]           // value of bits to shift
        cmp     ecx, 128
        jae     Ljt_clear_all           // count / 32 would index past JTABLE
        mov     esi, ecx
        shr     esi, 5                  // esi = count / 32 (0..3)
        mov     esi, [JTABLE+esi*4]     // esi = handler address
        mov     eax, [edx]              // eax = word 0 (lowest)
        mov     ebx, [edx+4]            // ebx = word 1
        and     ecx, 31                 // cl = count % 32, ZF = (cl == 0)
        jmp     esi                     // mov/jmp keep ZF for the handlers

// count in 0..31: every word keeps its slot, bits move upwards.
Ljt_words0:
        mov     esi, [edx+8]            // esi = word 2
        shld    [edx+12], esi, cl
        shld    esi, ebx, cl
        shld    ebx, eax, cl
        shl     eax, cl
        mov     [edx], eax
        mov     [edx+4], ebx
        mov     [edx+8], esi
        jmp     Ljt_done

// count in 32..63: words move up by one slot, word 0 becomes zero.
Ljt_words1:
        mov     esi, [edx+8]            // esi = word 2 (mov keeps ZF)
        je      Ljt_store1              // residual count is 0: plain move
        shld    esi, ebx, cl
        shld    ebx, eax, cl
        shl     eax, cl                 // was "shl eax", which shifts by 1
Ljt_store1:
        mov     [edx+4], eax
        mov     [edx+8], ebx
        mov     [edx+12], esi
        jmp     Ljt_zero0

// count in 64..95: words move up by two slots, words 0 and 1 become zero.
Ljt_words2:
        je      Ljt_store2              // residual count is 0: plain move
        shld    ebx, eax, cl
        shl     eax, cl                 // was "shl eax", which shifts by 1
Ljt_store2:
        mov     [edx+8], eax
        mov     [edx+12], ebx
        jmp     Ljt_zero1

// count in 96..127: only word 0 survives, in the top slot.
Ljt_words3:
        shl     eax, cl
Ljt_store3:
        mov     [edx+12], eax
        mov     dword ptr [edx+8], 0
Ljt_zero1:
        mov     dword ptr [edx+4], 0
Ljt_zero0:
        mov     dword ptr [edx], 0
Ljt_done:
        pop     esi
        pop     ebx
        ret

// count >= 128: every bit is shifted out, so store zero everywhere.
Ljt_clear_all:
        xor     eax, eax
        jmp     Ljt_store3
Function:
// ---------------------------------------------------------------------
// _shl_128 -- shift a 128-bit unsigned integer left in place
//             (compare-and-branch variant, no jump table).
//
// C view (presumably): void shl_128(uint32_t *words, uint32_t count);
// ABI:   32-bit, arguments on the stack, caller cleans up.
// In:    [esp+4] = pointer to four 32-bit words, least significant first
//        [esp+8] = number of bits to shift left
// Out:   the four words are updated in memory.
//        A count of 128 or more clears all four words.
// Clobb: eax, ecx, edx, flags.  ebx and esi are saved and restored.
//
// Each range test is a cmp followed by a mov (which leaves the flags
// alone) and a jae.  The handler reached by the jae reuses the same
// flags: je there means the count is an exact multiple of 32, so the
// residual shift can be skipped.
// ---------------------------------------------------------------------
.intel_syntax noprefix

.text
.global _shl_128
_shl_128:
        push    ebx
        push    esi
        mov     edx, [esp+12]           // pointer to array of integers
        mov     ecx, [esp+16]           // value of bits to shift
        mov     esi, ecx                // esi = full count for the range tests
        and     ecx, 31                 // cl = count % 32
        cmp     esi, 128
        jae     Lcb_clear_all           // every bit is shifted out
        cmp     esi, 96
        mov     eax, [edx]              // eax = word 0 (lowest)
        jae     Lcb_words3
        cmp     esi, 64
        mov     ebx, [edx+4]            // ebx = word 1
        jae     Lcb_words2
        cmp     esi, 32
        mov     esi, [edx+8]            // esi = word 2 (count no longer needed)
        jae     Lcb_words1

        // count in 0..31: every word keeps its slot, bits move upwards.
        shld    [edx+12], esi, cl
        shld    esi, ebx, cl
        shld    ebx, eax, cl
        shl     eax, cl
        mov     [edx], eax
        mov     [edx+4], ebx
        mov     [edx+8], esi
        jmp     Lcb_done

// count in 32..63: words move up by one slot, word 0 becomes zero.
Lcb_words1:
        je      Lcb_store1              // count == 32: plain move
        shld    esi, ebx, cl
        shld    ebx, eax, cl
        shl     eax, cl                 // was "shl eax", which shifts by 1
Lcb_store1:
        mov     [edx+4], eax
        mov     [edx+8], ebx
        mov     [edx+12], esi
        jmp     Lcb_zero0

// count in 64..95: words move up by two slots, words 0 and 1 become zero.
Lcb_words2:
        je      Lcb_store2              // count == 64: plain move
        shld    ebx, eax, cl
        shl     eax, cl                 // was "shl eax", which shifts by 1
Lcb_store2:
        mov     [edx+8], eax
        mov     [edx+12], ebx
        jmp     Lcb_zero1

// count in 96..127: only word 0 survives, in the top slot.
Lcb_words3:
        je      Lcb_store3              // count == 96: plain move
        shl     eax, cl
Lcb_store3:
        mov     [edx+12], eax           // was [edx+8], which the next store zeroed
        mov     dword ptr [edx+8], 0
Lcb_zero1:
        mov     dword ptr [edx+4], 0
Lcb_zero0:
        mov     dword ptr [edx], 0
Lcb_done:
        pop     esi
        pop     ebx
        ret

// count >= 128: store zero in every word.
Lcb_clear_all:
        xor     eax, eax
        jmp     Lcb_store3
If you shift `0x2` left by 31 bits, do you expect `0` or `0x100000000`? Right now I see your code giving you `0`.