"Wolfgang Kern" <spamtrap@[EMAIL PROTECTED]
> wrote in message
news:ftvcf7$umq$1@[EMAIL PROTECTED]
> Even this algo suffer on many dependency stalls, it reads four bytes
> and processes 32 bits per iteration, so it reduces the costly memory
> accesses which you encounter with a byte by byte indexed table lookup.
> Granted, a 256 byte table could be easy held in cache, so it would be
> worth to benchmark and compare the performance of the two methods.
Well, I tried rewriting the 8-bit LUT in assembly to minimize
memory access, also I tried the 4-bit LUT. In the following,
popcnt1 is the topical code of this thread, popcnt2 is the
8-bit LUT, popcnt3 is popcnt1 without compression as in the AMD
manual, and popcnt4 is the 4-bit LUT:
C:\gfortran\clf\popcnt>type big_popcnt4.s
.text
...globl _tm1
.def _tm1; .scl 2; .type 32; .endef
_tm1:
rdtsc
shrq $32, %rdx
orq %rdx, %rax
ret
...globl _popcnt1
.def _popcnt1; .scl 2; .type 32; .endef
...align 16
_popcnt1:
subq $56, %rsp
movaps %xmm6, (%rsp)
movaps %xmm7, 16(%rsp)
movaps %xmm8, 32(%rsp)
movaps %xmm14, 64(%rsp)
movaps %xmm15, 80(%rsp)
movq $0x3333333333333333, %rax
movq %rax, %xmm0
movddup %xmm0, %xmm15
xorps %xmm14, %xmm14
pcmpeqd %xmm1, %xmm1
pcmpeqd %xmm3, %xmm3
pcmpeqd %xmm5, %xmm5
xorps %xmm7, %xmm7
movl $32768, %edx
...align 16
loop1: movaps (%rcx), %xmm6
xorps %xmm6, %xmm1
andps %xmm1, %xmm6
movaps 16(%rcx), %xmm0
xorps %xmm0, %xmm1
andps %xmm1, %xmm0
orps %xmm0, %xmm6
movaps 32(%rcx), %xmm4
xorps %xmm4, %xmm1
andps %xmm1, %xmm4
movaps 48(%rcx), %xmm0
xorps %xmm0, %xmm1
andps %xmm1, %xmm0
orps %xmm0, %xmm4
movaps 64(%rcx), %xmm2
xorps %xmm2, %xmm1
andps %xmm1, %xmm2
movaps 80(%rcx), %xmm0
xorps %xmm0, %xmm1
andps %xmm1, %xmm0
orps %xmm0, %xmm2
movaps 96(%rcx), %xmm0
xorps %xmm0, %xmm1
andps %xmm1, %xmm0
movaps 112(%rcx), %xmm8
xorps %xmm8, %xmm1
andps %xmm1, %xmm8
orps %xmm8, %xmm0
xorps %xmm6, %xmm3
andps %xmm3, %xmm6
xorps %xmm4, %xmm3
andps %xmm3, %xmm4
orps %xmm4, %xmm6
xorps %xmm2, %xmm3
andps %xmm3, %xmm2
xorps %xmm0, %xmm3
andps %xmm3, %xmm0
orps %xmm0, %xmm2
xorps %xmm6, %xmm5
andps %xmm5, %xmm6
xorps %xmm2, %xmm5
andps %xmm5, %xmm2
orps %xmm2, %xmm6
movaps stuff, %xmm0
andps %xmm6, %xmm0
psrld $1, %xmm0
psubd %xmm0, %xmm6
movaps %xmm15, %xmm0
andnps %xmm6, %xmm0
psrld $2, %xmm0
andps %xmm15, %xmm6
paddd %xmm0, %xmm6
movaps %xmm6, %xmm0
psrld $4, %xmm6
paddd %xmm0, %xmm6
andps nonsense, %xmm6
psadbw %xmm14, %xmm6
paddd %xmm6, %xmm7
addq $128, %rcx
subq $128, %rdx
jnz loop1
movaps %xmm5, %xmm6
call wrap
paddq %xmm7, %xmm7
psubq %xmm6, %xmm7
movaps %xmm3, %xmm6
call wrap
paddq %xmm7, %xmm7
psubq %xmm6, %xmm7
movaps %xmm1, %xmm6
call wrap
paddq %xmm7, %xmm7
psubq %xmm6, %xmm7
movhlps %xmm7, %xmm1
paddq %xmm7, %xmm1
movq %xmm1, %rax
addq $896, %rax
movaps (%rsp), %xmm6
movaps 16(%rsp), %xmm7
movaps 32(%rsp), %xmm8
movaps 64(%rsp), %xmm14
movaps 80(%rsp), %xmm15
addq $56, %rsp
ret
...align 16
wrap: movaps stuff, %xmm0
andps %xmm6, %xmm0
psrld $1, %xmm0
psubd %xmm0, %xmm6
movaps %xmm15, %xmm0
andnps %xmm6, %xmm0
psrld $2, %xmm0
andps %xmm15, %xmm6
paddd %xmm0, %xmm6
movaps %xmm6, %xmm0
psrld $4, %xmm6
paddd %xmm0, %xmm6
andps nonsense, %xmm6
psadbw %xmm14, %xmm6
ret
...globl _popcnt2
.def _popcnt2; .scl 2; .type 32; .endef
...align 16
_popcnt2:
movq %rbx, 8(%rsp)
movq %rsi, 16(%rsp)
movq %rdi, 24(%rsp)
movq %rbp, 32(%rsp)
movl (%rdx), %ebp
xorl %eax, %eax
leaq LUT(%rip), %rdi
byte_loop:
movaps (%rcx), %xmm0
movq %xmm0, %rbx
movzx %bl, %rsi
xorl %edx, %edx
addb (%rsi,%rdi,1), %dl
movzx %bh, %esi
addb (%rsi,%rdi,1), %dl
shrq $16, %rbx
movzx %bl, %esi
addb (%rsi,%rdi,1), %dl
movzx %bh, %esi
addb (%rsi,%rdi,1), %dl
shrq $16, %rbx
movzx %bl, %esi
addb (%rsi,%rdi,1), %dl
movzx %bh, %esi
addb (%rsi,%rdi,1), %dl
shrq $16, %rbx
movzx %bl, %esi
addb (%rsi,%rdi,1), %dl
movzx %bh, %esi
addb (%rsi,%rdi,1), %dl
movhlps %xmm0, %xmm0
movq %xmm0, %rbx
movzx %bl, %esi
addb (%rsi,%rdi,1), %dl
movzx %bh, %esi
addb (%rsi,%rdi,1), %dl
shrq $16, %rbx
movzx %bl, %esi
addb (%rsi,%rdi,1), %dl
movzx %bh, %esi
addb (%rsi,%rdi,1), %dl
shrq $16, %rbx
movzx %bl, %esi
addb (%rsi,%rdi,1), %dl
movzx %bh, %esi
addb (%rsi,%rdi,1), %dl
shrq $16, %rbx
movzx %bl, %esi
addb (%rsi,%rdi,1), %dl
movzx %bh, %esi
addb (%rsi,%rdi,1), %dl
movzx %dl, %edx
addq %rdx, %rax
addq $16, %rcx
subq $16, %rbp
jnz byte_loop
movq 8(%rsp), %rbx
movq 16(%rsp), %rsi
movq 24(%rsp), %rdi
movq 32(%rsp), %rbp
ret
...globl _popcnt3
.def _popcnt3; .scl 2; .type 32; .endef
...align 16
_popcnt3:
movq $0x3333333333333333, %rax
movq %rax, %xmm0
movddup %xmm0, %xmm3
xorps %xmm2, %xmm2
xorps %xmm4, %xmm4
movl $32768, %edx
addq %rdx, %rcx
negq %rdx
...align 16
loop2: movaps (%rcx,%rdx), %xmm1
movapd stuff, %xmm0
andps %xmm1, %xmm0
psrld $1, %xmm0
psubd %xmm0, %xmm1
movapd %xmm3, %xmm0
andnps %xmm1, %xmm0
psrld $2, %xmm0
andpd %xmm3, %xmm1
paddd %xmm0, %xmm1
movaps %xmm1, %xmm0
psrld $4, %xmm1
paddd %xmm0, %xmm1
andpd nonsense, %xmm1
psadbw %xmm2, %xmm1
paddd %xmm1, %xmm4
addq $16, %rdx
jnz loop2
movhlps %xmm4, %xmm1
paddq %xmm4, %xmm1
movq %xmm1, %rax
ret
...globl _popcnt4
.def _popcnt4; .scl 2; .type 32; .endef
...align 16
_popcnt4:
movaps %xmm6, 8(%rsp)
movl (%rdx), %edx
movaps nonsense(%rip), %xmm2
movaps LUT(%rip), %xmm3
xorps %xmm4, %xmm4
xorps %xmm6, %xmm6
oword_loop:
movaps (%rcx), %xmm0
movaps %xmm0, %xmm1
andps %xmm2, %xmm0
movaps %xmm3, %xmm5
pshufb %xmm0, %xmm5
psadbw %xmm4, %xmm5
paddd %xmm5, %xmm6
psrld $4, %xmm1
andps %xmm2, %xmm1
movaps %xmm3, %xmm5
pshufb %xmm1, %xmm5
psadbw %xmm4, %xmm5
paddd %xmm5, %xmm6
addq $16, %rcx
subq $16, %rdx
jnz oword_loop
movq %xmm6, %rax
movhlps %xmm6, %xmm6
movq %xmm6, %rdx
addq %rdx, %rax
movq 8(%rsp), %xmm6
ret
.data
...align 16
stuff:
.long 0xaaaaaaaa
.long 0xaaaaaaaa
.long 0xaaaaaaaa
.long 0xaaaaaaaa
nonsense:
.long 0x0f0f0f0f
.long 0x0f0f0f0f
.long 0x0f0f0f0f
.long 0x0f0f0f0f
LUT:
.byte 0,1,1,2,1,2,2,3
.byte 1,2,2,3,2,3,3,4
.byte 1,2,2,3,2,3,3,4
.byte 2,3,3,4,3,4,4,5
.byte 1,2,2,3,2,3,3,4
.byte 2,3,3,4,3,4,4,5
.byte 2,3,3,4,3,4,4,5
.byte 3,4,4,5,4,5,5,6
.byte 1,2,2,3,2,3,3,4
.byte 2,3,3,4,3,4,4,5
.byte 2,3,3,4,3,4,4,5
.byte 3,4,4,5,4,5,5,6
.byte 2,3,3,4,3,4,4,5
.byte 3,4,4,5,4,5,5,6
.byte 3,4,4,5,4,5,5,6
.byte 4,5,5,6,5,6,6,7
.byte 1,2,2,3,2,3,3,4
.byte 2,3,3,4,3,4,4,5
.byte 2,3,3,4,3,4,4,5
.byte 3,4,4,5,4,5,5,6
.byte 2,3,3,4,3,4,4,5
.byte 3,4,4,5,4,5,5,6
.byte 3,4,4,5,4,5,5,6
.byte 4,5,5,6,5,6,6,7
.byte 2,3,3,4,3,4,4,5
.byte 3,4,4,5,4,5,5,6
.byte 3,4,4,5,4,5,5,6
.byte 4,5,5,6,5,6,6,7
.byte 3,4,4,5,4,5,5,6
.byte 4,5,5,6,5,6,6,7
.byte 4,5,5,6,5,6,6,7
.byte 5,6,6,7,6,7,7,8
C:\gfortran\clf\popcnt>type big_test4.f90
program big_test4
use ISO_C_BINDING
implicit none
interface
function tm1() bind(C)
im****t C_INT64_T
implicit none
integer(C_INT64_T) tm1
end function tm1
end interface
interface
function popcnt1(x) bind(C)
im****t C_INT64_T
im****t C_PTR
implicit none
integer(C_INT64_T) popcnt1
type(C_PTR), value :: x
end function popcnt1
end interface
interface
function popcnt2(x,n) bind(C)
im****t C_INT64_T
im****t C_PTR
im****t C_INT
implicit none
integer(C_INT64_T) popcnt2
type(C_PTR), value :: x
integer(C_INT) n
end function popcnt2
end interface
interface
function popcnt3(x) bind(C)
im****t C_INT64_T
im****t C_PTR
implicit none
integer(C_INT64_T) popcnt3
type(C_PTR), value :: x
end function popcnt3
end interface
interface
function popcnt4(x,n) bind(C)
im****t C_INT64_T
im****t C_PTR
im****t C_INT
implicit none
integer(C_INT64_T) popcnt4
type(C_PTR), value :: x
integer(C_INT) n
end function popcnt4
end interface
interface
subroutine sieve(t, n) bind(C)
im****t C_PTR
im****t C_INT
implicit none
type(C_PTR), value :: t
integer(C_INT) n
end subroutine sieve
end interface
integer(C_INT8_T), allocatable, target :: x(:)
integer(C_INT), parameter :: Nbytes = 32768 ! popcnt1 hardwired to this
integer, parameter :: align = 16 ! Will use xmm registers
type(C_PTR) x_ptr
integer(C_INTPTR_T) x_start
integer(C_INTPTR_T) t_start
type(C_PTR) t_ptr
integer i
integer(C_INT64_T) t0, t1, np
allocate(x(Nbytes+align-1))
x_ptr = C_LOC(x(1))
x_start = transfer(x_ptr, x_start)
t_start = iand(x_start+align-1, int(not(align-1),C_INTPTR_T))
t_ptr = transfer(t_start, t_ptr)
call sieve(t_ptr, Nbytes)
do i = 1, 4
t0 = tm1()
np = popcnt1(t_ptr)
t1 = tm1()
write(*,'(2(a,i0))') 'popcnt1 np = ', np, ' clocks = ', t1-t0
t0 = tm1()
np = popcnt2(t_ptr, Nbytes)
t1 = tm1()
write(*,'(2(a,i0))') 'popcnt2 np = ', np, ' clocks = ', t1-t0
t0 = tm1()
np = popcnt3(t_ptr)
t1 = tm1()
write(*,'(2(a,i0))') 'popcnt3 np = ', np, ' clocks = ', t1-t0
t0 = tm1()
np = popcnt4(t_ptr, Nbytes)
t1 = tm1()
write(*,'(2(a,i0))') 'popcnt4 np = ', np, ' clocks = ', t1-t0
end do
end program big_test4
subroutine sieve(t, n) bind(C)
use ISO_C_BINDING
implicit none
integer(C_INT) n
integer(C_INT8_T) t(0:n-1)
integer i, lim, j
lim = sqrt(8*n+0.5_C_DOUBLE)
t = -1
t(0) = ibclr(t(0), 0)
do i = 2, lim
if(btest(t((i-1)/8),modulo(i-1,8))) then
do j = i**2, 8*n, i
t((j-1)/8) = ibclr(t((j-1)/8),modulo(j-1,8))
end do
end if
end do
end subroutine sieve
C:\gfortran\clf\popcnt>c:\gfortran\win64\bin\x86_64-pc-mingw32-gfortran
-O2
big_
test4.f90 big_popcnt4.s -obig_test4
C:\gfortran\clf\popcnt>big_test4 > big_test4.txt
C:\gfortran\clf\popcnt>type big_test4.txt
popcnt1 np = 23000 clocks = 7370
popcnt2 np = 23000 clocks = 47560
popcnt3 np = 23000 clocks = 15190
popcnt4 np = 23000 clocks = 17540
popcnt1 np = 23000 clocks = 7010
popcnt2 np = 23000 clocks = 47480
popcnt3 np = 23000 clocks = 15130
popcnt4 np = 23000 clocks = 17530
popcnt1 np = 23000 clocks = 6920
popcnt2 np = 23000 clocks = 47540
popcnt3 np = 23000 clocks = 15030
popcnt4 np = 23000 clocks = 17460
popcnt1 np = 23000 clocks = 6930
popcnt2 np = 23000 clocks = 47520
popcnt3 np = 23000 clocks = 15040
popcnt4 np = 23000 clocks = 17520
So we got down to about 1 byte per 1.5 clocks for the 8-bit LUT.
Quite disappointing. The 4-bit LUT is close to the AMD manual
method, though. For a processor with fast PSHUFB and no
hardware POPCNT this may be a good thing. PSHUFB unfortunately is
slow on my Core 2 Duo E6700.
--
write(*,*) transfer((/17.392111325966148d0,6.5794487871554595D-85, &
6.0134700243160014d-154/),(/'x'/)); end


|