Talk About Network



Register and Login
Nick
Password
Register create new account Sign up is FREE and you can post replies, new topics, bookmark posts and more!
Recover lost password


Programming > Assembly x86 > Throughput of P...
Latest [ Topics | Posts ] Archive Post A New Topic Post a Reply
<< Topic < Post Post 1 of 4 Topic 4615 of 4646
Post > Topic >>

Throughput of PXOR

by "James Van Buskirk" <spamtrap@[EMAIL PROTECTED] > Apr 12, 2008 at 08:53 PM

While I was timing the popcnt code posted earlier I noticed that my
CPU (Core 2 Duo E6700) was showing results consistent with issuing
2 SSE2 logical operations per clock cycle.  This seemed odd to me
because Intel's manuals and Agner Fog both say I should be getting
a throughput of 3 logical operations (PXOR, XORPS, XORPD) per cycle.

Accordingly I made up a less complicated test:

C:\gfortran\james\throughput>type test.s
        .section .rdata,"dr"
# fmt: NUL-terminated format string with two %d conversions
# (iteration count, elapsed time stamp counter ticks).
# Placed on a 16-byte boundary in the read-only data section.
        .p2align 4
fmt:
        .asciz  "Time for %d iterations = %d\n"
        .text
#-----------------------------------------------------------------------
# _MAIN__ -- program entry point: time _test1 and print the result.
#
# Repeats RUNS times:
#   1. read the time stamp counter via _tm1,
#   2. call _test1 with rcx = ITERATIONS,
#   3. read the time stamp counter again,
#   4. format the iteration count and the counter difference into
#      `output` with _wsprintfA (format string `fmt`),
#   5. write the formatted text to the standard output handle with
#      _WriteFile (byte count written is stored in `bw`).
#
# ABI:   Microsoft x64 (args in rcx, rdx, r8, r9; 5th arg at 32(%rsp)).
# Out:   eax = 0
# Saves: rsi (callee-saved; used as the remaining-runs counter).
# Stack: assuming the ABI-mandated entry state (rsp = 8 mod 16),
#        FRAME = 40 keeps rsp 16-byte aligned at every call:
#          0..31  shadow space for the functions called from here
#          32..39 fifth argument slot (lpOverlapped for _WriteFile)
#          40..47 return address
#          48..71 home space reserved by our caller, used for locals
#-----------------------------------------------------------------------
        .equ    RUNS, 5                 # number of timed repetitions
        .equ    ITERATIONS, 100         # loop count handed to _test1
        .equ    STD_OUTPUT_HANDLE, -11  # GetStdHandle selector for stdout
        .equ    FRAME, 40               # bytes subtracted from rsp
        .equ    ARG5, 32                # 5th outgoing argument
        .equ    T_START, 48             # counter value before _test1
        .equ    N_CHARS, 56             # length returned by _wsprintfA
        .equ    SAVED_RSI, 64           # caller's rsi

        .globl  _MAIN__
        .def    _MAIN__;        .scl    2;      .type   32;     .endef
_MAIN__:
# Set up stack, save regs, initialize run counter
        subq    $FRAME, %rsp
        movq    %rsi, SAVED_RSI(%rsp)
        movl    $RUNS, %esi             # rsi = runs remaining

main_loop:
# Time stamp before the test
        call    _tm1
        movq    %rax, T_START(%rsp)

# Run the kernel under test
        movq    $ITERATIONS, %rcx
        call    _test1

# Time stamp after the test (rax = end value)
        call    _tm1

# wsprintfA(output, fmt, ITERATIONS, end - start)
        leaq    output(%rip), %rcx
        leaq    fmt(%rip), %rdx
        movq    $ITERATIONS, %r8
        movq    %rax, %r9
        subq    T_START(%rsp), %r9      # r9 = elapsed counter ticks
        call    _wsprintfA
        movq    %rax, N_CHARS(%rsp)     # keep the returned length

# WriteFile(GetStdHandle(STD_OUTPUT_HANDLE), output, length, &bw, NULL)
        movl    $STD_OUTPUT_HANDLE, %ecx
        call    _GetStdHandle
        movq    %rax, %rcx
        leaq    output(%rip), %rdx
        movq    N_CHARS(%rsp), %r8
        leaq    bw(%rip), %r9
        xor     %eax, %eax
        movq    %rax, ARG5(%rsp)        # 5th argument = 0
        call    _WriteFile

        sub     $1, %rsi
        jnz     main_loop

# Clean up and return to OS with eax = 0
        movq    SAVED_RSI(%rsp), %rsi
        xor     %eax, %eax
        addq    $FRAME, %rsp
        ret
#-----------------------------------------------------------------------
# _tm1 -- read the CPU time stamp counter.
#
# In:    nothing
# Out:   rax = 64-bit time stamp counter value
# Clobb: rdx, flags
#
# RDTSC delivers the low 32 bits in eax and the high 32 bits in edx,
# clearing the upper halves of rax and rdx.  The high half therefore
# has to be shifted LEFT by 32 before it is merged into rax; shifting
# right would discard it and leave only the low 32 bits in the result.
# NOTE: RDTSC is not a serializing instruction, so neighbouring
# instructions may still be in flight when the counter is sampled.
#-----------------------------------------------------------------------
        .globl  _tm1
        .def    _tm1;   .scl    2;      .type   32;     .endef
_tm1:
        rdtsc                           # edx:eax = time stamp counter
        shlq    $32, %rdx               # move high half into bits 63..32
        orq     %rdx, %rax              # rax = (high << 32) | low
        ret

#-----------------------------------------------------------------------
# _test1 -- XORPS throughput kernel.
#
# In:    rcx = number of loop iterations; must be >= 1, because the
#              count is decremented before it is tested
# Out:   rcx = 0
# Clobb: xmm0-xmm5, flags
#
# Each iteration executes 30 instructions arranged as 10 groups of
# three: 28 XORPS plus the SUB and JNZ that close the loop.  The XORPS
# form three independent dependency chains (xmm0/xmm1, xmm2/xmm3,
# xmm4/xmm5); within a chain every instruction consumes the result of
# the previous one.  In groups 9 and 10 the slot of the third chain is
# taken by SUB and JNZ.  XORPS leaves the flags alone, so JNZ still
# sees the result of SUB.  The register contents are never initialised
# here: only the timing of the loop is of interest.
# The instruction order is the benchmark itself -- do not reorder.
#-----------------------------------------------------------------------
        .globl  _test1
        .def    _test1; .scl    2;      .type   32;     .endef
        .align 16
_test1:
        xorps   %xmm0, %xmm1
        xorps   %xmm2, %xmm3
        xorps   %xmm4, %xmm5 #1
        xorps   %xmm1, %xmm0
        xorps   %xmm3, %xmm2
        xorps   %xmm5, %xmm4 #2
        xorps   %xmm0, %xmm1
        xorps   %xmm2, %xmm3
        xorps   %xmm4, %xmm5 #3
        xorps   %xmm1, %xmm0
        xorps   %xmm3, %xmm2
        xorps   %xmm5, %xmm4 #4
        xorps   %xmm0, %xmm1
        xorps   %xmm2, %xmm3
        xorps   %xmm4, %xmm5 #5
        xorps   %xmm1, %xmm0
        xorps   %xmm3, %xmm2
        xorps   %xmm5, %xmm4 #6
        xorps   %xmm0, %xmm1
        xorps   %xmm2, %xmm3
        xorps   %xmm4, %xmm5 #7
        xorps   %xmm1, %xmm0
        xorps   %xmm3, %xmm2
        xorps   %xmm5, %xmm4 #8
        xorps   %xmm0, %xmm1
        xorps   %xmm2, %xmm3
        sub     $1, %rcx     #9
        xorps   %xmm1, %xmm0
        xorps   %xmm3, %xmm2
        jnz     _test1       #10
        ret

#-----------------------------------------------------------------------
# Writable data.
#   output: 60-byte zero-initialised buffer (15 dwords) that receives
#           the formatted timing line; 16-byte aligned.
#   bw:     dword immediately after the buffer; its address is passed
#           to _WriteFile to receive the number of bytes written.
#-----------------------------------------------------------------------
        .data
        .align 16
output:
        .fill   15, 4, 0
bw:
        .long 0

C:\gfortran\james\throughput>as test.s -otest.o

C:\gfortran\james\throughput>ld test.o -lkernel32 -luser32 -otest.exe

C:\gfortran\james\throughput>test
Time for 100 iterations = 1460
Time for 100 iterations = 1440
Time for 100 iterations = 1460
Time for 100 iterations = 1460
Time for 100 iterations = 1460

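Sure enough, it's taking a little over 1400 clock cycles to execute
100*(3*10-2) = 2800 XORPS operations, i.e. about 2 per clock cycle.
If I were getting a throughput of 3 instructions per clock cycle, the
100*30 = 3000 instructions per call (the XORPS plus the SUB and JNZ)
would have taken 1000 cycles (plus timing overhead.)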

So what gives here?  Are all manuals wrong, or is my program wrong,
or am I interpreting its results incorrectly?

-- 
write(*,*) transfer((/17.392111325966148d0,6.5794487871554595D-85, &
6.0134700243160014d-154/),(/'x'/)); end




 4 Posts in Topic:
Throughput of PXOR
"James Van Buskirk"  2008-04-12 20:53:39 
Re: Throughput of PXOR
"Maarten Kronenburg"  2008-04-13 23:42:51 
Re: Throughput of PXOR
"James Van Buskirk"  2008-04-14 02:00:55 
Re: Throughput of PXOR
"Maarten Kronenburg"  2008-04-14 14:00:24 

Post A Reply:
  Go here to Signup

AddThis Feed Button


About - Advertising - Contact - Frequently Asked Questions - Privacy Policy - Terms of Use - Signup

Contact
tan12V112 Wed May 14 13:09:50 CDT 2008.