Add shift by multiple constant#1220
Conversation
AntoinePrv
left a comment
There was a problem hiding this comment.
Also we need to add all the proper forward to dynamic for all other architectures to avoid ending up with a common implementation (unless that's all there is).
There was a problem hiding this comment.
I named this util as common seems to be more used for implementing the common architecture.
include/xsimd/arch/utils/shifts.hpp
Outdated
|
|
||
| template <class T, class T2, class A, class R, T... Vs> | ||
| XSIMD_INLINE batch<T, A> bitwise_lshift_as_twice_larger( | ||
| batch<T, A> const& self, batch_constant<T, A, Vs...>, R req) noexcept |
There was a problem hiding this comment.
Probably should remove R req here
|
@AntoinePrv unless there's is a huge need on your side, I won't schedule that one for the release. Let's just add masked load / store support for other architecture so that we can release, then begin a new commit wave :-) |
|
@serge-sans-paille I had hoped I could do it quickly before the weekend but it seems it will take longer. |
de97676 to
1cd12f9
Compare
This comment was marked as resolved.
This comment was marked as resolved.
c85add7 to
fb76fd7
Compare
caaab2a to
adead38
Compare
| namespace detail | ||
| { | ||
| // Detection for kernel overloads accepting ``batch_constant`` in ``bitwise_lshift`` | ||
| // directly (or in a parent register function). | ||
| // The ``batch_constant`` overload is a rare but useful optimization. | ||
| // Running the detection here is less error prone than to add a fallback to all | ||
| // architectures. | ||
|
|
||
| template <class A, class B, class C, class = void> | ||
| struct has_bitwise_lshift_batch_const : std::false_type | ||
| { | ||
| }; | ||
|
|
||
| template <class A, class B, class C> | ||
| struct has_bitwise_lshift_batch_const<A, B, C, | ||
| void_t<decltype(kernel::bitwise_lshift<A>(std::declval<B>(), std::declval<C>(), A {}))>> | ||
| : std::true_type | ||
| { | ||
| }; |
There was a problem hiding this comment.
We run the detection of an optimization in the xsimd_api to avoid adding many error-prone overloads.
|
I've made a fresh take, and added the fallback to runtime API in This seems much improved over the runtime version. E.g. with GCC 15, SSE2, and uin8_t Compile time (new) .cfi_startproc
movdqa (%rdi), %xmm1
movdqa .LC0(%rip), %xmm0
pcmpeqd %xmm2, %xmm2
movdqa %xmm2, %xmm3
psllw $8, %xmm2
pmullw %xmm1, %xmm0
psrlw $8, %xmm3
pand %xmm2, %xmm1
pmullw .LC3(%rip), %xmm1
pand %xmm3, %xmm0
por %xmm1, %xmm0
ret
.cfi_endproc
.LFE6862:
.size _Z1fRKN5xsimd5batchIhNS_4sse2EEE, .-_Z1fRKN5xsimd5batchIhNS_4sse2EEE
.section .rodata.cst16,"aM",@progbits,16
.align 16
.LC0:
.value 2
.value 8
.value 16
.value 4
.value 2
.value 2
.value 2
.value 2
.align 16
.LC3:
.value 4
.value 16
.value 8
.value 2
.value 2
.value 2
.value 2
.value 2Run time (before) .cfi_startproc
pushq %r15
.cfi_def_cfa_offset 16
.cfi_offset 15, -16
pushq %r14
.cfi_def_cfa_offset 24
.cfi_offset 14, -24
pushq %r13
.cfi_def_cfa_offset 32
.cfi_offset 13, -32
pushq %r12
.cfi_def_cfa_offset 40
.cfi_offset 12, -40
pushq %rbp
.cfi_def_cfa_offset 48
.cfi_offset 6, -48
pushq %rbx
.cfi_def_cfa_offset 56
.cfi_offset 3, -56
movzbl 15(%rdi), %eax
movzbl 13(%rdi), %edx
movzbl 12(%rdi), %ecx
movzbl 8(%rdi), %r15d
movzbl 7(%rdi), %r12d
movzbl 11(%rdi), %esi
movzbl 10(%rdi), %r8d
movb %al, -12(%rsp)
movzbl 9(%rdi), %r13d
movzbl 14(%rdi), %eax
movb %dl, -10(%rsp)
xorl %edx, %edx
movzbl 6(%rdi), %r9d
movzbl 3(%rdi), %ebx
movb %cl, -9(%rsp)
xorl %ecx, %ecx
movzbl 5(%rdi), %r10d
movzbl 4(%rdi), %r11d
addl %r13d, %r13d
addl %r8d, %r8d
movzbl 2(%rdi), %ebp
movzbl 1(%rdi), %r14d
movb %al, -11(%rsp)
sall $4, %ebx
movzbl (%rdi), %edi
movzbl %bl, %ebx
sall $4, %r11d
movzbl %r8b, %r8d
sall $2, %r14d
sall $3, %ebp
movzbl %r11b, %r11d
salq $24, %rbx
movb %dil, %dl
movzbl %bpl, %ebp
salq $32, %r11
salq $16, %rbp
addb %dl, %dl
sall $3, %r10d
movq %rdx, -40(%rsp)
movq %r14, %rdx
movzbl %r10b, %r10d
sall $2, %r9d
movq %rcx, -32(%rsp)
salq $40, %r10
movzbl %r9b, %r9d
salq $16, %r8
movq -40(%rsp), %rcx
salq $48, %r9
movb %dl, %ch
movq -32(%rsp), %rdx
movq %rcx, -40(%rsp)
movq -40(%rsp), %rdi
movq %rdx, -32(%rsp)
movq -32(%rsp), %rdx
andq $-16711681, %rdi
orq %rbp, %rdi
movq %rdx, -32(%rsp)
movq -32(%rsp), %rdx
movabsq $-4278190081, %rbp
movq %rdi, -40(%rsp)
movq -40(%rsp), %rdi
movq %rdx, -32(%rsp)
movq -32(%rsp), %rdx
andq %rbp, %rdi
orq %rbx, %rdi
movq %rdx, -32(%rsp)
movq -32(%rsp), %rdx
movabsq $-1095216660481, %rbx
movq %rdi, -40(%rsp)
movq -40(%rsp), %rdi
movq %rdx, -32(%rsp)
movq -32(%rsp), %rdx
andq %rbx, %rdi
orq %r11, %rdi
movq %rdx, -32(%rsp)
movq -32(%rsp), %rdx
movabsq $-280375465082881, %r11
movq %rdi, -40(%rsp)
movq -40(%rsp), %rdi
movq %rdx, -32(%rsp)
movq -32(%rsp), %rcx
andq %r11, %rdi
orq %r10, %rdi
movb %r15b, %cl
movabsq $-71776119061217281, %r10
movq %rdi, -40(%rsp)
movq -40(%rsp), %rdi
addb %cl, %cl
andq %r10, %rdi
orq %r9, %rdi
leal (%r12,%r12), %r9d
movq %rdi, -40(%rsp)
movq -40(%rsp), %r12
salq $56, %r9
movabsq $72057594037927935, %rdi
andq %rdi, %r12
orq %r9, %r12
movq %r12, -40(%rsp)
movq -40(%rsp), %rdx
movq %rdx, -40(%rsp)
movq %r13, %rdx
movq %rcx, -32(%rsp)
movq -32(%rsp), %rcx
movzbl -11(%rsp), %eax
movb %dl, %ch
movzbl -10(%rsp), %edx
movq %rcx, -32(%rsp)
movq -32(%rsp), %r9
movzbl -9(%rsp), %ecx
andq $-16711681, %r9
orq %r9, %r8
addl %esi, %esi
addl %ecx, %ecx
addl %edx, %edx
movq %r8, -32(%rsp)
movq -32(%rsp), %r8
movzbl %sil, %esi
addl %eax, %eax
salq $24, %rsi
movzbl %cl, %ecx
movzbl %dl, %edx
movzbl %al, %eax
salq $32, %rcx
andq %rbp, %r8
salq $40, %rdx
orq %r8, %rsi
salq $48, %rax
movq %rsi, -32(%rsp)
movq -32(%rsp), %rsi
andq %rbx, %rsi
orq %rsi, %rcx
movq %rcx, -32(%rsp)
movq -32(%rsp), %rcx
andq %r11, %rcx
orq %rcx, %rdx
movq %rdx, -32(%rsp)
movq -32(%rsp), %rdx
andq %r10, %rdx
orq %rdx, %rax
movq %rax, -32(%rsp)
movzbl -12(%rsp), %eax
leal (%rax,%rax), %edx
movq -32(%rsp), %rax
salq $56, %rdx
andq %rdi, %rax
orq %rdx, %rax
movq %rax, -32(%rsp)
movdqa -40(%rsp), %xmm0
popq %rbx
.cfi_def_cfa_offset 48
popq %rbp
.cfi_def_cfa_offset 40
popq %r12
.cfi_def_cfa_offset 32
popq %r13
.cfi_def_cfa_offset 24
popq %r14
.cfi_def_cfa_offset 16
popq %r15
.cfi_def_cfa_offset 8
ret
.cfi_endproc |
bdfd07f to
ceb6683
Compare
2dcfeee to
cde846f
Compare
|
@serge-sans-paille this one is ready too |
|
I'm a bit puzzled by the gain here: you're actually not adding anything except the possibility to pass a batch of constant, which is then dispatched to either the original code that takes a constant, or the original code that takes a batch, right? |
|
@serge-sans-paille There are three things in this PR:
They all play out nicely together, for instance with SSE2 and uint8_t data in #1220 (comment)
Result after the compiler does it jobs: with 2 multiply, 2 AND, 1 OR (and a few other things) you get a lshift for
From the code perspective of xsimd, these looks like very niche case. But in practice, this unlocks Another nice cheat happening here is that to generate the "multiply" quantities as a fallback for lshift, we use... lshift, but at compile time 😅 |
|
I'm all in for the second and third effect, but still puzzled by the first case :-) |
| { | ||
| detail::static_check_supported_config<T, A>(); | ||
| return kernel::bitwise_lshift<shift, A>(x, A {}); | ||
| using has_batch_const_impl = detail::has_bitwise_lshift_batch_const<A, decltype(x), decltype(shift)>; |
There was a problem hiding this comment.
it seems to me that the dispatch here should be 'if all Values... are the same, dispatch to the overload that takes a single parameter, otherwise dispatch to the generic overload. Wouldn't that simplify the whole implementation?
@serge-sans-paille @JohanMabille this ideas works, but I cannot figure out how to refactor
bitwise_lshift_as_twice_largerinto a separate header.The issue:
xsimd_sse2.hppneedsutils/shits.hppforbitwise_lshift_as_twice_largerbitwise_lshift_as_twice_largerneeds definitions fromxsimd_sse2.hpp.avx2Forward declaring all the needed functions overloads from
sse2andavx2and the arch types inutils/shits.hpp.Perhaps it should use the functions from
xsimd_apiinstead? (which would also be better if there is a better implementation further down in the inheritance tree).Close #1218