I noticed that 8-byte std::array comparisons seem to produce assembly different from bit_cast-ing.
GCC seems to do what I expect for a char array, but clang generates an extra mov instruction (spilling the by-value array<> arg from an 8-byte register to the red zone, but still comparing the register arg with the memory pointed to by the other arg).
In the std::byte case we get 8 separate single-byte cmp instructions vs. a single efficient qword compare for array<char>.
I'm curious whether there is a reason for this difference.
#include <array>
#include <bit>
#include <cstdint>
// Compares two 8-byte std::byte arrays with std::array's operator==.
// Produces completely different asm than the other 2 functions: in the
// listings below, both clang and gcc compare the bytes one at a time.
// NOTE(review): the listings label compare1 as taking both arrays by value,
// unlike this signature -- confirm which version produced them.
bool compare1(const std::array<std::byte, 8> &p, std::array<std::byte, 8> r)
{
return p == r;
}
// Same comparison as compare1, but with char elements instead of std::byte.
// Seems to be similar to bit_casting, but clang generates 1 more instruction:
// in the listings below gcc emits a single qword cmp, while clang also
// stores rsi to [rsp - 8] before the cmp.
bool compare2(const std::array<char, 8> &p, std::array<char, 8> r)
{
return p == r;
}
// Compares the two arrays by converting each one to a uint64_t with
// std::bit_cast and comparing the resulting integers. In the listings below
// both compilers emit a single qword cmp for this version.
// Same assembly if you use char instead of byte.
bool compare3(const std::array<std::byte, 8> &p, std::array<std::byte, 8> r)
{
return std::bit_cast<uint64_t>(p) == std::bit_cast<uint64_t>(r);
}
clang asm:
compare1(std::array<std::byte, 8ul>, std::array<std::byte, 8ul>): # @compare1(std::array<std::byte, 8ul>, std::array<std::byte, 8ul>)
cmp dil, sil
sete al
jne .LBB0_8
mov eax, edi
shr eax, 8
mov ecx, esi
shr ecx, 8
cmp al, cl
sete al
jne .LBB0_8
mov eax, edi
shr eax, 16
mov ecx, esi
shr ecx, 16
cmp al, cl
sete al
jne .LBB0_8
mov eax, edi
shr eax, 24
mov ecx, esi
shr ecx, 24
cmp al, cl
sete al
jne .LBB0_8
mov rax, rdi
shr rax, 32
mov rcx, rsi
shr rcx, 32
cmp al, cl
sete al
jne .LBB0_8
mov rax, rdi
shr rax, 40
mov rcx, rsi
shr rcx, 40
cmp al, cl
sete al
jne .LBB0_8
mov rax, rdi
shr rax, 48
mov rcx, rsi
shr rcx, 48
cmp al, cl
sete al
jne .LBB0_8
xor rdi, rsi
shr rdi, 56
sete al
.LBB0_8:
ret
compare2(std::array<char, 8ul> const&, std::array<char, 8ul>): # @compare2(std::array<char, 8ul> const&, std::array<char, 8ul>)
mov qword ptr [rsp - 8], rsi
cmp qword ptr [rdi], rsi
sete al
ret
compare3(std::array<std::byte, 8ul> const&, std::array<std::byte, 8ul>): # @compare3(std::array<std::byte, 8ul> const&, std::array<std::byte, 8ul>)
cmp qword ptr [rdi], rsi
sete al
ret
gcc asm:
compare1(std::array<std::byte, 8ul>, std::array<std::byte, 8ul>):
mov rdx, rdi
mov rax, rsi
cmp sil, dil
jne .L9
movzx ecx, ah
cmp dh, cl
jne .L9
mov rsi, rdi
mov rcx, rax
shr rsi, 16
shr rcx, 16
cmp sil, cl
jne .L9
mov rsi, rdi
mov rcx, rax
shr rsi, 24
shr rcx, 24
cmp sil, cl
jne .L9
mov rsi, rdi
mov rcx, rax
shr rsi, 32
shr rcx, 32
cmp sil, cl
jne .L9
mov rsi, rdi
mov rcx, rax
shr rsi, 40
shr rcx, 40
cmp sil, cl
jne .L9
mov rsi, rdi
mov rcx, rax
shr rsi, 48
shr rcx, 48
cmp sil, cl
jne .L9
shr rdx, 56
shr rax, 56
cmp dl, al
sete al
ret
.L9:
xor eax, eax
ret
compare2(std::array<char, 8ul> const&, std::array<char, 8ul>):
cmp QWORD PTR [rdi], rsi
sete al
ret
compare3(std::array<std::byte, 8ul> const&, std::array<std::byte, 8ul>):
cmp QWORD PTR [rdi], rsi
sete al
ret
[...] compare1 with /O2. IMO this is just an unknown/missed specific optimization for std::byte. You may report a bug to the Clang and GCC bug trackers.
[...] rsi to a temporary, thinking it was going to get used.
[...] cmp is vastly more efficient than 8 separate cmp instructions on the bytes separately, on all x86-64 CPUs (agner.org/optimize), or any microarchitecture for other mainstream ISAs. Especially when that requires even more work to isolate each byte (e.g. shifting register args). Makes me wonder whether std::array<char> has a specialization for == or something in libstdc++, or if there's really something special about the internal type GCC and clang use for std::byte which defeats the optimization pass that normally coalesces into one compare.