"Gerd Isenberg" <spamtrap@[EMAIL PROTECTED]
> wrote in message
news:61cb8b24-6249-49eb-88f6-dfb02a592f68@[EMAIL PROTECTED]
> Another SSE2 alternative for a 64-bit popcount, sign extending 64 bits
> to 64 bytes to add them.
> May be combined with add/major as well...
> http://chessprogramming.wikispaces.com/Population+Count
This didn't seem too promising, but I tried it anyhow.
interface
! C-interoperable interface to the assembly routine popcnt5.
!   x : C pointer to the data, passed by value (the address itself
!       arrives in the first argument register).
!   n : C int passed by reference (no VALUE attribute); the assembly
!       dereferences it and uses it as a byte count.
! Result: 64-bit integer population count.
! The C_* kind/type names are brought in from the host scope via IMPORT.
function popcnt5(x,n) bind(C)
import C_INT64_T
import C_PTR
import C_INT
implicit none
integer(C_INT64_T) popcnt5
type(C_PTR), value :: x
integer(C_INT) n
end function popcnt5
end interface
#-----------------------------------------------------------------------
# _popcnt5 -- population count by expanding every bit to a byte lane.
# Syntax: GAS, AT&T operand order.
# ABI:    register usage matches Microsoft x64; leaf function.
# In:     %rcx = address of the data; must be 16-byte aligned (movaps)
#         %rdx = address of a 32-bit byte count n (passed by reference)
#                n must be a multiple of 16; n <= 0 returns 0
# Out:    %rax = number of set bits in the n bytes at (%rcx)
# Clobb:  %rcx, %rdx, %xmm0-%xmm5, flags
#         %xmm6 is saved/restored at 8(%rsp)..23(%rsp), i.e. in the
#         home (shadow) area the caller reserves above the return address
# Needs:  SSE3 (movddup)
#-----------------------------------------------------------------------
        .p2align 4
_popcnt5:
        movaps  %xmm6, 8(%rsp)          # rsp+8 is 16-aligned on entry
        leaq    masks(%rip), %rax       # rax = base of the bit-mask table
        movl    (%rdx), %edx            # rdx = n, zero-extended
        xorps   %xmm3, %xmm3            # xmm3 = 0 (compare / psadbw operand)
        xorps   %xmm6, %xmm6            # xmm6 = running totals, one per qword lane
        testl   %edx, %edx
        jle     .Lpopcnt5_done          # nothing to count: avoid a runaway loop
        .p2align 4
qword_loop:
        movaps  (%rcx), %xmm5           # xmm5 = next 16 data bytes

        # ---- low qword: replicate it into both halves ----------------
        movddup %xmm5, %xmm2
        movaps  %xmm2, %xmm4
        movaps  %xmm2, %xmm0
        movaps  %xmm2, %xmm1
        # andnps computes (~data) & mask: a byte becomes 0 exactly when
        # the masked bit is SET in the data.
        andnps  48(%rax), %xmm2         # bits 6 (low half) / 7 (high half)
        andnps  (%rax), %xmm4           # bits 0 / 1
        andnps  32(%rax), %xmm1         # bits 4 / 5
        andnps  16(%rax), %xmm0         # bits 2 / 3
        # compare with 0: byte = 0xFF (-1) where the bit was set
        pcmpeqb %xmm3, %xmm4
        pcmpeqb %xmm3, %xmm0
        pcmpeqb %xmm3, %xmm1
        paddb   %xmm0, %xmm4
        pcmpeqb %xmm3, %xmm2
        paddb   %xmm1, %xmm2
        paddb   %xmm4, %xmm2            # each byte = -(set bits found), 0..-4

        # ---- high qword: same treatment ------------------------------
        movhlps %xmm5, %xmm5            # both halves of xmm5 = high qword
        movaps  %xmm5, %xmm4
        movaps  %xmm5, %xmm0
        movaps  %xmm5, %xmm1
        andnps  48(%rax), %xmm5
        andnps  (%rax), %xmm4
        andnps  32(%rax), %xmm1
        andnps  16(%rax), %xmm0
        pcmpeqb %xmm3, %xmm4
        pcmpeqb %xmm3, %xmm0
        pcmpeqb %xmm3, %xmm1
        paddb   %xmm0, %xmm4
        pcmpeqb %xmm3, %xmm5
        paddb   %xmm1, %xmm5
        paddb   %xmm4, %xmm5

        paddb   %xmm5, %xmm2            # each byte = -(count), 0..-8
        # psadbw sums the bytes as unsigned values (256-k for k != 0), so
        # only the low byte of each sum is meaningful: it is -(total) mod
        # 256.  The shift pair sign-extends that byte to a 32-bit value
        # (total per 8-byte half is at most 64, so the sign is reliable).
        psadbw  %xmm3, %xmm2
        pslld   $24, %xmm2
        psrad   $24, %xmm2
        psubd   %xmm2, %xmm6            # subtract the negative count = add

        addq    $16, %rcx
        subq    $16, %rdx               # rdx = bytes remaining
        jnz     qword_loop

.Lpopcnt5_done:
        movq    %xmm6, %rax             # total of the low lane
        movhlps %xmm6, %xmm6
        movq    %xmm6, %rdx             # total of the high lane
        addq    %rdx, %rax
        movaps  8(%rsp), %xmm6          # restore callee-saved xmm6
        ret

# Bit-selection masks, 8 bytes per bit position.  They are used as
# memory operands of andnps, which requires a 16-byte aligned address,
# so the table must be aligned explicitly.
        .p2align 4
masks:
        .long   0x01010101,0x01010101,0x02020202,0x02020202
        .long   0x04040404,0x04040404,0x08080808,0x08080808
        .long   0x10101010,0x10101010,0x20202020,0x20202020
        .long   0x40404040,0x40404040,0x80808080,0x80808080
Also I tweaked the 8-bit LUT:
#-----------------------------------------------------------------------
# _popcnt2 -- population count via a byte-indexed lookup table.
# Syntax: GAS, AT&T operand order.
# ABI:    register usage matches Microsoft x64; leaf function.
# In:     %rcx = address of the data
#         %rdx = address of a 32-bit byte count n (passed by reference)
#                n must be a multiple of 16; n <= 0 returns 0
# Out:    %rax = sum of LUT[b] over the n data bytes b
# Clobb:  %rcx, %rdx, %r8, flags
#         %rbx, %rsi, %rdi, %rbp are saved/restored at 8..39(%rsp), the
#         home (shadow) area the caller reserves above the return address
# Uses:   LUT (defined elsewhere), indexed by a byte value 0..255;
#         presumably LUT[b] = number of set bits in b -- confirm at the
#         definition.
#-----------------------------------------------------------------------
        .p2align 4
_popcnt2:
        movq    %rbx, 8(%rsp)
        movq    %rsi, 16(%rsp)
        movq    %rdi, 24(%rsp)
        movq    %rbp, 32(%rsp)
        movl    (%rdx), %ebp            # rbp = n, zero-extended
        xorl    %r8d, %r8d              # r8 = running total
        leaq    LUT(%rip), %rdi         # rdi = table base
        testl   %ebp, %ebp
        jle     .Lpopcnt2_done          # nothing to count: avoid a runaway loop
        .p2align 4
byte_loop:
        # Two independent 8-bit partial sums are kept per 16-byte chunk:
        #   dl = table values for even-numbered bytes (taken from bl)
        #   al = table values for odd-numbered bytes  (taken from bh)
        # Each collects 8 table values before being folded into r8.

        # ---- first qword ---------------------------------------------
#       movaps  (%rcx), %xmm0           # 1 oword load
#       movq    %xmm0, %rbx             # 1 oword load
        movq    (%rcx), %rbx            # 2 qword loads
        movzbl  %bl, %esi
        xorl    %edx, %edx              # clear all of rdx; dl starts a new sum
        movb    (%rsi,%rdi,1), %dl
        movzbl  %bh, %esi
        movb    (%rsi,%rdi,1), %al      # al starts a new sum
        shrq    $16, %rbx
        movzbl  %bl, %esi
        addb    (%rsi,%rdi,1), %dl
        movzbl  %bh, %esi
        addb    (%rsi,%rdi,1), %al
        shrq    $16, %rbx
        movzbl  %bl, %esi
        addb    (%rsi,%rdi,1), %dl
        movzbl  %bh, %esi
        addb    (%rsi,%rdi,1), %al
        shrq    $16, %rbx
        movzbl  %bl, %esi
        addb    (%rsi,%rdi,1), %dl
        movzbl  %bh, %esi
        addb    (%rsi,%rdi,1), %al

        # ---- second qword --------------------------------------------
#       movhlps %xmm0, %xmm0            # 1 oword load
#       movq    %xmm0, %rbx             # 1 oword load
        movq    8(%rcx), %rbx           # 2 qword loads
        movzbl  %bl, %esi
        addb    (%rsi,%rdi,1), %dl
        movzbl  %bh, %esi
        addb    (%rsi,%rdi,1), %al
        shrq    $16, %rbx
        movzbl  %bl, %esi
        addb    (%rsi,%rdi,1), %dl
        movzbl  %bh, %esi
        addb    (%rsi,%rdi,1), %al
        shrq    $16, %rbx
        movzbl  %bl, %esi
        addb    (%rsi,%rdi,1), %dl
        movzbl  %bh, %esi
        addb    (%rsi,%rdi,1), %al
        shrq    $16, %rbx
        movzbl  %bl, %esi
        addb    (%rsi,%rdi,1), %dl
        movzbl  %bh, %esi
        addb    (%rsi,%rdi,1), %al

        addb    %al, %dl                # combine the two partial sums (8-bit add)
        addq    %rdx, %r8               # upper bits of rdx are zero
        addq    $16, %rcx
        subq    $16, %rbp               # rbp = bytes remaining
        jnz     byte_loop

.Lpopcnt2_done:
        movq    %r8, %rax
        movq    8(%rsp), %rbx
        movq    16(%rsp), %rsi
        movq    24(%rsp), %rdi
        movq    32(%rsp), %rbp
        ret
And the 4-bit LUT:
#-----------------------------------------------------------------------
# _popcnt4 -- population count via a 16-entry table applied with pshufb.
# Syntax: GAS, AT&T operand order.
# ABI:    register usage matches Microsoft x64; leaf function.
# In:     %rcx = address of the data; must be 16-byte aligned (movaps)
#         %rdx = address of a 32-bit byte count n (passed by reference)
#                n must be a multiple of 16; n <= 0 returns 0
# Out:    %rax = sum of the table values looked up for every nibble
# Clobb:  %rcx, %rdx, %xmm0-%xmm5, flags
#         %xmm6 is saved/restored at 8(%rsp)..23(%rsp), i.e. in the
#         home (shadow) area the caller reserves above the return address
# Needs:  SSSE3 (pshufb)
# Uses:   nonsense, LUT (both defined elsewhere; both are loaded with
#         movaps, so both must be 16-byte aligned).
#         nonsense is the nibble-select mask -- presumably 0x0F in every
#         byte; confirm at its definition.
#         The first 16 bytes of LUT are the pshufb table -- presumably
#         LUT[i] = number of set bits in i; confirm at its definition.
#-----------------------------------------------------------------------
        .p2align 4
_popcnt4:
        movaps  %xmm6, 8(%rsp)          # rsp+8 is 16-aligned on entry
        movl    (%rdx), %edx            # rdx = n, zero-extended
        movaps  nonsense(%rip), %xmm2   # xmm2 = nibble mask (loop-invariant)
        xorps   %xmm3, %xmm3            # xmm3 = 0 (psadbw operand)
        xorps   %xmm6, %xmm6            # xmm6 = running totals, one per qword lane
        testl   %edx, %edx
        jle     .Lpopcnt4_done          # nothing to count: avoid a runaway loop
        .p2align 4
oword_loop:
        movaps  (%rcx), %xmm0           # xmm0 = next 16 data bytes
        movaps  %xmm2, %xmm1
        andnps  %xmm0, %xmm1            # xmm1 = data & ~mask
        movaps  LUT(%rip), %xmm5        # reloaded each pass: pshufb overwrites it
        andps   %xmm2, %xmm0            # xmm0 = data & mask
        movaps  %xmm5, %xmm4
        pshufb  %xmm0, %xmm5            # table lookup for the masked part
        psrld   $4, %xmm1               # move the other part down by 4 bits
        pshufb  %xmm1, %xmm4            # table lookup for it
        paddd   %xmm4, %xmm5            # combine the two lookups
        psadbw  %xmm3, %xmm5            # horizontal byte sum per 8-byte half
        paddd   %xmm5, %xmm6
        addq    $16, %rcx
        subq    $16, %rdx               # rdx = bytes remaining
        jnz     oword_loop

.Lpopcnt4_done:
        movq    %xmm6, %rax             # total of the low lane
        movhlps %xmm6, %xmm6
        movq    %xmm6, %rdx             # total of the high lane
        addq    %rdx, %rax
        movaps  8(%rsp), %xmm6          # restore callee-saved xmm6
        ret
(Fit the above into code from
http://groups.google.com/group/comp.lang.asm.x86/msg/0cd4b133fd17c86a
)
With result:
popcnt1 np = 23000 clocks = 7150
popcnt2 np = 23000 clocks = 43240
popcnt3 np = 23000 clocks = 15120
popcnt4 np = 23000 clocks = 15660
popcnt5 np = 23000 clocks = 37870
popcnt1 np = 23000 clocks = 6950
popcnt2 np = 23000 clocks = 43150
popcnt3 np = 23000 clocks = 15140
popcnt4 np = 23000 clocks = 15600
popcnt5 np = 23000 clocks = 38040
popcnt1 np = 23000 clocks = 6930
popcnt2 np = 23000 clocks = 43120
popcnt3 np = 23000 clocks = 14980
popcnt4 np = 23000 clocks = 15570
popcnt5 np = 23000 clocks = 37540
popcnt1 np = 23000 clocks = 6960
popcnt2 np = 23000 clocks = 43100
popcnt3 np = 23000 clocks = 15040
popcnt4 np = 23000 clocks = 15560
popcnt5 np = 23000 clocks = 37490
So [my adaptation of] your code is getting 32768/37500 = 0.87 bytes
per clock. Faster than my improved 8-bit LUT at 0.76 bytes per clock
but still running with the Slowskys. OTOH, I have gotten the 4-bit
LUT up to nearly the speed of the SWAR method (as the above-referenced
web page styles it) so it may be a candidate for the wrap-up stage of
the compression-accelerated code.
--
! Signature one-liner: TRANSFER reinterprets the storage of the three
! double-precision constants as an array of single characters, which is
! then printed with list-directed output.
write(*,*) transfer((/17.392111325966148d0,6.5794487871554595D-85, &
6.0134700243160014d-154/),(/'x'/)); end


|