Talk About Network

Google


Register and Login
Nick
Password
Register create new account Sign up is FREE and you can post replies, new topics, bookmark posts and more!
Recover lost password


Programming > Assembly x86 > Re: Population ...
Latest [ Topics | Posts ] Archive Post A New Topic Post a Reply
<< Topic < Post Post 15 of 33 Topic 4614 of 4728
Post > Topic >>

Re: Population count in SSE2, again

by "James Van Buskirk" <spamtrap@[EMAIL PROTECTED] > Apr 14, 2008 at 01:53 PM

"Wolfgang Kern" <spamtrap@[EMAIL PROTECTED]
> wrote in message 
news:ftvcf7$umq$1@[EMAIL PROTECTED]
> Even this algo suffer on many dependency stalls, it reads four bytes
> and processes 32 bits per iteration, so it reduces the costly memory
> accesses which you encounter with a byte by byte indexed table lookup.
> Granted, a 256 byte table could be easy held in cache, so it would be
> worth to benchmark and compare the performance of the two methods.

Well, I tried rewriting the 8-bit LUT in assembly to minimize
memory access, also I tried the 4-bit LUT.  In the following,
popcnt1 is the topical code of this thread, popcnt2 is the
8-bit LUT, popcnt3 is popcnt1 without compression as in the AMD
manual, and popcnt4 is the 4-bit LUT:

C:\gfortran\clf\popcnt>type big_popcnt4.s
        .text
...globl _tm1
        .def    _tm1;   .scl    2;      .type   32;     .endef
_tm1:
        rdtsc
        shrq    $32, %rdx
        orq     %rdx, %rax
        ret
...globl _popcnt1
        .def    _popcnt1;       .scl    2;      .type   32;     .endef
...align 16
_popcnt1:
        subq    $56, %rsp
        movaps  %xmm6, (%rsp)
        movaps  %xmm7, 16(%rsp)
        movaps  %xmm8, 32(%rsp)
        movaps  %xmm14, 64(%rsp)
        movaps  %xmm15, 80(%rsp)
        movq    $0x3333333333333333, %rax
        movq    %rax, %xmm0
        movddup %xmm0, %xmm15
        xorps   %xmm14, %xmm14

        pcmpeqd %xmm1, %xmm1
        pcmpeqd %xmm3, %xmm3
        pcmpeqd %xmm5, %xmm5
        xorps   %xmm7, %xmm7

        movl    $32768, %edx

...align 16
loop1:  movaps  (%rcx), %xmm6
        xorps   %xmm6, %xmm1
        andps   %xmm1, %xmm6
        movaps  16(%rcx), %xmm0
        xorps   %xmm0, %xmm1
        andps   %xmm1, %xmm0
        orps    %xmm0, %xmm6
        movaps  32(%rcx), %xmm4
        xorps   %xmm4, %xmm1
        andps   %xmm1, %xmm4
        movaps  48(%rcx), %xmm0
        xorps   %xmm0, %xmm1
        andps   %xmm1, %xmm0
        orps    %xmm0, %xmm4
        movaps  64(%rcx), %xmm2
        xorps   %xmm2, %xmm1
        andps   %xmm1, %xmm2
        movaps  80(%rcx), %xmm0
        xorps   %xmm0, %xmm1
        andps   %xmm1, %xmm0
        orps    %xmm0, %xmm2
        movaps  96(%rcx), %xmm0
        xorps   %xmm0, %xmm1
        andps   %xmm1, %xmm0
        movaps  112(%rcx), %xmm8
        xorps   %xmm8, %xmm1
        andps   %xmm1, %xmm8
        orps    %xmm8, %xmm0

        xorps   %xmm6, %xmm3
        andps   %xmm3, %xmm6
        xorps   %xmm4, %xmm3
        andps   %xmm3, %xmm4
        orps    %xmm4, %xmm6
        xorps   %xmm2, %xmm3
        andps   %xmm3, %xmm2
        xorps   %xmm0, %xmm3
        andps   %xmm3, %xmm0
        orps    %xmm0, %xmm2
        xorps   %xmm6, %xmm5
        andps   %xmm5, %xmm6
        xorps   %xmm2, %xmm5
        andps   %xmm5, %xmm2
        orps    %xmm2, %xmm6

        movaps  stuff, %xmm0
        andps   %xmm6, %xmm0
        psrld   $1, %xmm0
        psubd   %xmm0, %xmm6
        movaps  %xmm15, %xmm0
        andnps  %xmm6, %xmm0
        psrld   $2, %xmm0
        andps   %xmm15, %xmm6
        paddd   %xmm0, %xmm6
        movaps  %xmm6, %xmm0
        psrld   $4, %xmm6
        paddd   %xmm0, %xmm6
        andps   nonsense, %xmm6
        psadbw  %xmm14, %xmm6
        paddd   %xmm6, %xmm7

        addq    $128, %rcx
        subq    $128, %rdx
        jnz     loop1

        movaps  %xmm5, %xmm6
        call    wrap
        paddq   %xmm7, %xmm7
        psubq   %xmm6, %xmm7
        movaps  %xmm3, %xmm6
        call    wrap
        paddq   %xmm7, %xmm7
        psubq   %xmm6, %xmm7
        movaps  %xmm1, %xmm6
        call    wrap
        paddq   %xmm7, %xmm7
        psubq   %xmm6, %xmm7
        movhlps %xmm7, %xmm1
        paddq   %xmm7, %xmm1
        movq    %xmm1, %rax
        addq    $896, %rax

        movaps  (%rsp), %xmm6
        movaps  16(%rsp), %xmm7
        movaps  32(%rsp), %xmm8
        movaps  64(%rsp), %xmm14
        movaps  80(%rsp), %xmm15
        addq    $56, %rsp
        ret

...align 16
wrap:   movaps  stuff, %xmm0
        andps   %xmm6, %xmm0
        psrld   $1, %xmm0
        psubd   %xmm0, %xmm6
        movaps  %xmm15, %xmm0
        andnps  %xmm6, %xmm0
        psrld   $2, %xmm0
        andps   %xmm15, %xmm6
        paddd   %xmm0, %xmm6
        movaps  %xmm6, %xmm0
        psrld   $4, %xmm6
        paddd   %xmm0, %xmm6
        andps   nonsense, %xmm6
        psadbw  %xmm14, %xmm6
        ret

...globl _popcnt2
        .def    _popcnt2;       .scl    2;      .type   32;     .endef
...align 16
_popcnt2:
        movq    %rbx, 8(%rsp)
        movq    %rsi, 16(%rsp)
        movq    %rdi, 24(%rsp)
        movq    %rbp, 32(%rsp)
        movl    (%rdx), %ebp
        xorl    %eax, %eax
        leaq    LUT(%rip), %rdi
byte_loop:
        movaps  (%rcx), %xmm0
        movq    %xmm0, %rbx
        movzx   %bl, %rsi
        xorl    %edx, %edx
        addb    (%rsi,%rdi,1), %dl
        movzx   %bh, %esi
        addb    (%rsi,%rdi,1), %dl
        shrq    $16, %rbx
        movzx   %bl, %esi
        addb    (%rsi,%rdi,1), %dl
        movzx   %bh, %esi
        addb    (%rsi,%rdi,1), %dl
        shrq    $16, %rbx
        movzx   %bl, %esi
        addb    (%rsi,%rdi,1), %dl
        movzx   %bh, %esi
        addb    (%rsi,%rdi,1), %dl
        shrq    $16, %rbx
        movzx   %bl, %esi
        addb    (%rsi,%rdi,1), %dl
        movzx   %bh, %esi
        addb    (%rsi,%rdi,1), %dl
        movhlps %xmm0, %xmm0
        movq    %xmm0, %rbx
        movzx   %bl, %esi
        addb    (%rsi,%rdi,1), %dl
        movzx   %bh, %esi
        addb    (%rsi,%rdi,1), %dl
        shrq    $16, %rbx
        movzx   %bl, %esi
        addb    (%rsi,%rdi,1), %dl
        movzx   %bh, %esi
        addb    (%rsi,%rdi,1), %dl
        shrq    $16, %rbx
        movzx   %bl, %esi
        addb    (%rsi,%rdi,1), %dl
        movzx   %bh, %esi
        addb    (%rsi,%rdi,1), %dl
        shrq    $16, %rbx
        movzx   %bl, %esi
        addb    (%rsi,%rdi,1), %dl
        movzx   %bh, %esi
        addb    (%rsi,%rdi,1), %dl
        movzx   %dl, %edx
        addq    %rdx, %rax
        addq    $16, %rcx
        subq    $16, %rbp
        jnz     byte_loop
        movq    8(%rsp), %rbx
        movq    16(%rsp), %rsi
        movq    24(%rsp), %rdi
        movq    32(%rsp), %rbp
        ret

...globl _popcnt3
        .def    _popcnt3;       .scl    2;      .type   32;     .endef
...align 16
_popcnt3:
        movq    $0x3333333333333333, %rax
        movq    %rax, %xmm0
        movddup %xmm0, %xmm3
        xorps   %xmm2, %xmm2

        xorps   %xmm4, %xmm4

        movl    $32768, %edx
        addq    %rdx, %rcx
        negq    %rdx

...align 16
loop2:  movaps  (%rcx,%rdx), %xmm1
        movapd  stuff, %xmm0
        andps   %xmm1, %xmm0
        psrld   $1, %xmm0
        psubd   %xmm0, %xmm1
        movapd  %xmm3, %xmm0
        andnps  %xmm1, %xmm0
        psrld   $2, %xmm0
        andpd   %xmm3, %xmm1
        paddd   %xmm0, %xmm1
        movaps  %xmm1, %xmm0
        psrld   $4, %xmm1
        paddd   %xmm0, %xmm1
        andpd   nonsense, %xmm1
        psadbw  %xmm2, %xmm1
        paddd   %xmm1, %xmm4

        addq    $16, %rdx
        jnz     loop2
        movhlps %xmm4, %xmm1
        paddq   %xmm4, %xmm1
        movq    %xmm1, %rax

        ret

...globl _popcnt4
        .def    _popcnt4;       .scl    2;      .type   32;     .endef
...align 16
_popcnt4:
        movaps  %xmm6, 8(%rsp)
        movl    (%rdx), %edx
        movaps  nonsense(%rip), %xmm2
        movaps  LUT(%rip), %xmm3
        xorps   %xmm4, %xmm4
        xorps   %xmm6, %xmm6
oword_loop:
        movaps  (%rcx), %xmm0
        movaps  %xmm0, %xmm1
        andps   %xmm2, %xmm0
        movaps  %xmm3, %xmm5
        pshufb  %xmm0, %xmm5
        psadbw  %xmm4, %xmm5
        paddd   %xmm5, %xmm6

        psrld   $4, %xmm1
        andps   %xmm2, %xmm1

        movaps  %xmm3, %xmm5
        pshufb  %xmm1, %xmm5
        psadbw  %xmm4, %xmm5
        paddd   %xmm5, %xmm6

        addq    $16, %rcx
        subq    $16, %rdx
        jnz     oword_loop
        movq    %xmm6, %rax
        movhlps %xmm6, %xmm6
        movq    %xmm6, %rdx
        addq    %rdx, %rax
        movq    8(%rsp), %xmm6
        ret

        .data
...align 16
        stuff:
        .long   0xaaaaaaaa
        .long   0xaaaaaaaa
        .long   0xaaaaaaaa
        .long   0xaaaaaaaa
        nonsense:
        .long   0x0f0f0f0f
        .long   0x0f0f0f0f
        .long   0x0f0f0f0f
        .long   0x0f0f0f0f
        LUT:
        .byte   0,1,1,2,1,2,2,3
        .byte   1,2,2,3,2,3,3,4
        .byte   1,2,2,3,2,3,3,4
        .byte   2,3,3,4,3,4,4,5
        .byte   1,2,2,3,2,3,3,4
        .byte   2,3,3,4,3,4,4,5
        .byte   2,3,3,4,3,4,4,5
        .byte   3,4,4,5,4,5,5,6
        .byte   1,2,2,3,2,3,3,4
        .byte   2,3,3,4,3,4,4,5
        .byte   2,3,3,4,3,4,4,5
        .byte   3,4,4,5,4,5,5,6
        .byte   2,3,3,4,3,4,4,5
        .byte   3,4,4,5,4,5,5,6
        .byte   3,4,4,5,4,5,5,6
        .byte   4,5,5,6,5,6,6,7
        .byte   1,2,2,3,2,3,3,4
        .byte   2,3,3,4,3,4,4,5
        .byte   2,3,3,4,3,4,4,5
        .byte   3,4,4,5,4,5,5,6
        .byte   2,3,3,4,3,4,4,5
        .byte   3,4,4,5,4,5,5,6
        .byte   3,4,4,5,4,5,5,6
        .byte   4,5,5,6,5,6,6,7
        .byte   2,3,3,4,3,4,4,5
        .byte   3,4,4,5,4,5,5,6
        .byte   3,4,4,5,4,5,5,6
        .byte   4,5,5,6,5,6,6,7
        .byte   3,4,4,5,4,5,5,6
        .byte   4,5,5,6,5,6,6,7
        .byte   4,5,5,6,5,6,6,7
        .byte   5,6,6,7,6,7,7,8

C:\gfortran\clf\popcnt>type big_test4.f90
program big_test4
   use ISO_C_BINDING
   implicit none
   interface
      function tm1() bind(C)
         im****t C_INT64_T
         implicit none

         integer(C_INT64_T) tm1
      end function tm1
   end interface

   interface
      function popcnt1(x) bind(C)
         im****t C_INT64_T
         im****t C_PTR
         implicit none

         integer(C_INT64_T) popcnt1
         type(C_PTR), value :: x
      end function popcnt1
   end interface

   interface
      function popcnt2(x,n) bind(C)
         im****t C_INT64_T
         im****t C_PTR
         im****t C_INT
         implicit none

         integer(C_INT64_T) popcnt2
         type(C_PTR), value :: x
         integer(C_INT) n
      end function popcnt2
   end interface

   interface
      function popcnt3(x) bind(C)
         im****t C_INT64_T
         im****t C_PTR
         implicit none

         integer(C_INT64_T) popcnt3
         type(C_PTR), value :: x
      end function popcnt3
   end interface

   interface
      function popcnt4(x,n) bind(C)
         im****t C_INT64_T
         im****t C_PTR
         im****t C_INT
         implicit none

         integer(C_INT64_T) popcnt4
         type(C_PTR), value :: x
         integer(C_INT) n
      end function popcnt4
   end interface

   interface
      subroutine sieve(t, n) bind(C)
         im****t C_PTR
         im****t C_INT
         implicit none

         type(C_PTR), value :: t
         integer(C_INT) n
      end subroutine sieve
   end interface

   integer(C_INT8_T), allocatable, target :: x(:)
   integer(C_INT), parameter :: Nbytes = 32768 ! popcnt1 hardwired to this
   integer, parameter :: align = 16 ! Will use xmm registers
   type(C_PTR) x_ptr
   integer(C_INTPTR_T) x_start
   integer(C_INTPTR_T) t_start
   type(C_PTR) t_ptr
   integer i
   integer(C_INT64_T) t0, t1, np

   allocate(x(Nbytes+align-1))
   x_ptr = C_LOC(x(1))
   x_start = transfer(x_ptr, x_start)
   t_start = iand(x_start+align-1, int(not(align-1),C_INTPTR_T))
   t_ptr = transfer(t_start, t_ptr)
   call sieve(t_ptr, Nbytes)
   do i = 1, 4
      t0 = tm1()
      np = popcnt1(t_ptr)
      t1 = tm1()
      write(*,'(2(a,i0))') 'popcnt1 np = ', np, ' clocks = ', t1-t0
      t0 = tm1()
      np = popcnt2(t_ptr, Nbytes)
      t1 = tm1()
      write(*,'(2(a,i0))') 'popcnt2 np = ', np, ' clocks = ', t1-t0
      t0 = tm1()
      np = popcnt3(t_ptr)
      t1 = tm1()
      write(*,'(2(a,i0))') 'popcnt3 np = ', np, ' clocks = ', t1-t0
      t0 = tm1()
      np = popcnt4(t_ptr, Nbytes)
      t1 = tm1()
      write(*,'(2(a,i0))') 'popcnt4 np = ', np, ' clocks = ', t1-t0
   end do

end program big_test4

subroutine sieve(t, n) bind(C)
   use ISO_C_BINDING
   implicit none
   integer(C_INT) n
   integer(C_INT8_T) t(0:n-1)
   integer i, lim, j

   lim = sqrt(8*n+0.5_C_DOUBLE)
   t = -1
   t(0) = ibclr(t(0), 0)
   do i = 2, lim
      if(btest(t((i-1)/8),modulo(i-1,8))) then
         do j = i**2, 8*n, i
            t((j-1)/8) = ibclr(t((j-1)/8),modulo(j-1,8))
         end do
      end if
   end do
end subroutine sieve

C:\gfortran\clf\popcnt>c:\gfortran\win64\bin\x86_64-pc-mingw32-gfortran
-O2 
big_
test4.f90 big_popcnt4.s -obig_test4

C:\gfortran\clf\popcnt>big_test4 > big_test4.txt

C:\gfortran\clf\popcnt>type big_test4.txt
popcnt1 np = 23000 clocks = 7370
popcnt2 np = 23000 clocks = 47560
popcnt3 np = 23000 clocks = 15190
popcnt4 np = 23000 clocks = 17540
popcnt1 np = 23000 clocks = 7010
popcnt2 np = 23000 clocks = 47480
popcnt3 np = 23000 clocks = 15130
popcnt4 np = 23000 clocks = 17530
popcnt1 np = 23000 clocks = 6920
popcnt2 np = 23000 clocks = 47540
popcnt3 np = 23000 clocks = 15030
popcnt4 np = 23000 clocks = 17460
popcnt1 np = 23000 clocks = 6930
popcnt2 np = 23000 clocks = 47520
popcnt3 np = 23000 clocks = 15040
popcnt4 np = 23000 clocks = 17520

So we got down to about 1 byte per 1.5 clocks for the 8-bit LUT.
Quite disappointing.  The 4-bit LUT is close to the AMD manual
method, though.  For a processor with fast PSHUFB and no
hardware POPCNT this may be a good thing.  PSHUFB unfortunately is
slow on my Core 2 Duo E6700.

-- 
write(*,*) transfer((/17.392111325966148d0,6.5794487871554595D-85, &
6.0134700243160014d-154/),(/'x'/)); end
 




 33 Posts in Topic:
Population count in SSE2, again
"James Van Buskirk&q  2008-04-12 03:14:45 
Re: Population count in SSE2, again
Terence <spamtrap@[EM  2008-04-12 16:55:40 
Re: Population count in SSE2, again
Terje Mathisen <spamt  2008-04-13 15:08:36 
Re: Population count in SSE2, again
"James Van Buskirk&q  2008-04-13 09:20:49 
Re: Population count in SSE2, again
"Maarten Kronenburg&  2008-04-13 21:48:40 
Re: Population count in SSE2, again
Jake Waskett <spamtra  2008-04-13 21:43:32 
Re: Population count in SSE2, again
"Maarten Kronenburg&  2008-04-14 01:55:14 
Re: Population count in SSE2, again
Jake Waskett <spamtra  2008-04-14 11:19:35 
Re: Population count in SSE2, again
"James Van Buskirk&q  2008-04-14 02:38:07 
Re: Population count in SSE2, again
"Maarten Kronenburg&  2008-04-14 20:53:32 
Re: Population count in SSE2, again
"Maarten Kronenburg&  2008-04-15 17:13:38 
Re: Population count in SSE2, again
"Maarten Kronenburg&  2008-04-15 21:58:21 
Re: Population count in SSE2, again
Terence <spamtrap@[EM  2008-04-13 17:14:55 
Re: Population count in SSE2, again
"Wolfgang Kern"  2008-04-14 12:42:35 
Re: Population count in SSE2, again
"James Van Buskirk&q  2008-04-14 13:53:21 
Re: Population count in SSE2, again
"Wolfgang Kern"  2008-04-16 15:34:09 
Re: Population count in SSE2, again
"James Van Buskirk&q  2008-04-16 10:05:48 
Re: Population count in SSE2, again
Robert Redelmeier <red  2008-04-14 14:21:05 
Re: Population count in SSE2, again
"James Van Buskirk&q  2008-04-14 02:58:34 
Re: Population count in SSE2, again
"Maarten Kronenburg&  2008-04-14 18:09:21 
Re: Population count in SSE2, again
Terje Mathisen <spamt  2008-04-15 07:28:26 
Re: Population count in SSE2, again
Terence <spamtrap@[EM  2008-04-14 05:00:42 
Re: Population count in SSE2, again
Terence <spamtrap@[EM  2008-04-14 15:09:35 
Re: Population count in SSE2, again
Terence <spamtrap@[EM  2008-04-15 02:29:34 
Re: Population count in SSE2, again
Gerd Isenberg <spamtr  2008-04-15 02:56:39 
Re: Population count in SSE2, again
"James Van Buskirk&q  2008-04-16 00:33:16 
Re: Population count in SSE2, again
"Maarten Kronenburg&  2008-04-16 14:42:37 
Re: Population count in SSE2, again
"Maarten Kronenburg&  2008-04-16 19:38:21 
Re: Population count in SSE2, again
"James Van Buskirk&q  2008-04-16 12:41:36 
Re: Population count in SSE2, again
"Maarten Kronenburg&  2008-04-16 21:39:47 
Re: Population count in SSE2, again
"Maarten Kronenburg&  2008-04-17 16:43:31 
Re: Population count in SSE2, again
Gerd Isenberg <spamtr  2008-04-16 09:58:11 
Re: Population count in SSE2, again
"James Van Buskirk&q  2008-04-16 12:59:38 

Post A Reply:
  Go here to Signup

AddThis Feed Button


About - Advertising - Contact - Frequently Asked Questions - Privacy Policy - Terms of Use - Signup

Contact
tan12V112 Thu Jul 24 0:29:37 CDT 2008.