While I was timing the popcnt code posted earlier I noticed that my
CPU (Core 2 Duo E6700) was showing results consistent with issuing
2 SSE2 logical operations per clock cycle. This seemed odd to me
because Intel's manuals and Agner Fog both say I should be getting
a throughput of 3 logical operations (PXOR, XORPS, XORPD) per cycle.
Accordingly I made up a less complicated test:
C:\gfortran\james\throughput>type test.s
# Read-only data: NUL-terminated format string handed to wsprintfA.
# The two %d fields receive the iteration count and the elapsed
# time stamp counter difference, in that order.
        .section .rdata,"dr"
        .p2align 4
fmt:
        .asciz  "Time for %d iterations = %d\n"
# ----------------------------------------------------------------------
# _MAIN__ -- program entry point (AT&T syntax, Microsoft x64 ABI).
#
# Repeats the following measurement REPEATS times:
#   - read the time stamp counter with _tm1
#   - call _test1 with %rcx = ITERATIONS
#   - read the time stamp counter again
#   - format the difference with wsprintfA and write it with WriteFile
#     to the handle returned by GetStdHandle(STD_OUTPUT_HANDLE)
#
# Out:      %eax = 0
# Preserves %rsi (used as the repeat counter because it is callee-saved
#           and therefore survives the calls made inside the loop).
#
# Stack layout after the prologue, as offsets from %rsp:
#    0..31   home space for the functions called from here
#   32       fifth argument of WriteFile (stored as 0)
#   40       return address
#   48       time stamp read before the test
#   56       value returned by wsprintfA (passed on as the byte count)
#   64       saved %rsi
# Offsets 48..64 lie above the return address, i.e. in the 32-byte home
# space that the Microsoft x64 convention makes the caller reserve, so
# they may be used here without enlarging the frame.  The 40-byte frame
# plus the 8-byte return address keeps %rsp 16-byte aligned at each call.
#
# NOTE(review): the underscore-prefixed import names (_wsprintfA, ...)
# are what the toolchain in the accompanying transcript accepted;
# confirm the expected symbol decoration before using another toolchain.
# ----------------------------------------------------------------------
        .equ    REPEATS, 5              # number of timing runs
        .equ    ITERATIONS, 100         # passes through the _test1 loop per run
        .equ    STD_OUTPUT_HANDLE, -11  # argument for GetStdHandle

        .text
        .globl  _MAIN__
        .def    _MAIN__; .scl 2; .type 32; .endef
_MAIN__:
        # Prologue: allocate the frame, save %rsi, initialize the repeat counter
        subq    $40, %rsp
        movq    %rsi, 64(%rsp)
        movl    $REPEATS, %esi

main_loop:
        # Time stamp before the test
        call    _tm1
        movq    %rax, 48(%rsp)

        # Code under test
        movq    $ITERATIONS, %rcx
        call    _test1

        # Time stamp after the test
        call    _tm1

        # wsprintfA(output, fmt, ITERATIONS, end - start)
        leaq    output(%rip), %rcx
        leaq    fmt(%rip), %rdx
        movq    $ITERATIONS, %r8
        movq    %rax, %r9
        subq    48(%rsp), %r9           # r9 = elapsed time stamp counts
        call    _wsprintfA
        movq    %rax, 56(%rsp)          # keep the return value for WriteFile

        # GetStdHandle(STD_OUTPUT_HANDLE)
        movl    $STD_OUTPUT_HANDLE, %ecx
        call    _GetStdHandle

        # WriteFile(handle, output, count, &bw, 0)
        movq    %rax, %rcx
        leaq    output(%rip), %rdx
        movq    56(%rsp), %r8
        leaq    bw(%rip), %r9
        xor     %eax, %eax
        movq    %rax, 32(%rsp)          # fifth argument goes on the stack
        call    _WriteFile

        sub     $1, %rsi
        jnz     main_loop

        # Epilogue: restore %rsi, return 0
        movq    64(%rsp), %rsi
        xor     %eax, %eax
        addq    $40, %rsp
        ret
# ----------------------------------------------------------------------
# _tm1 -- read the processor time stamp counter.
#
# Out:      %rax = 64-bit time stamp counter
# Clobbers: %rdx, flags
#
# RDTSC returns the counter split across two registers: the high 32 bits
# in %edx and the low 32 bits in %eax.  The high half therefore has to be
# shifted LEFT by 32 before it is merged into %rax; a right shift would
# discard it and leave only the low 32 bits of the counter in the result.
# ----------------------------------------------------------------------
        .globl  _tm1
        .def    _tm1; .scl 2; .type 32; .endef
_tm1:
        rdtsc
        shlq    $32, %rdx               # move the high half into bits 63..32
        orq     %rdx, %rax              # rax = (high << 32) | low
        ret
# ----------------------------------------------------------------------
# _test1 -- XORPS throughput test loop.
#
# In:       %rcx = number of passes through the loop.  The counter is
#                  decremented before it is tested, so it must be nonzero
#                  on entry (a zero would wrap around instead of exiting).
# Clobbers: %rcx, %xmm0-%xmm5, flags
#
# Each pass executes ten groups of three instructions: 28 XORPS plus the
# SUB and JNZ that control the loop.  The XORPS instructions form three
# separate register pairs (xmm0/xmm1, xmm2/xmm3, xmm4/xmm5); within a
# pair every instruction reads the register written by the previous one.
# The instruction order is the subject of the measurement, so it is
# spelled out in full rather than generated.
# ----------------------------------------------------------------------
        .globl  _test1
        .def    _test1; .scl 2; .type 32; .endef
        .align  16
_test1:
        xorps   %xmm0, %xmm1
        xorps   %xmm2, %xmm3
        xorps   %xmm4, %xmm5            # group 1
        xorps   %xmm1, %xmm0
        xorps   %xmm3, %xmm2
        xorps   %xmm5, %xmm4            # group 2
        xorps   %xmm0, %xmm1
        xorps   %xmm2, %xmm3
        xorps   %xmm4, %xmm5            # group 3
        xorps   %xmm1, %xmm0
        xorps   %xmm3, %xmm2
        xorps   %xmm5, %xmm4            # group 4
        xorps   %xmm0, %xmm1
        xorps   %xmm2, %xmm3
        xorps   %xmm4, %xmm5            # group 5
        xorps   %xmm1, %xmm0
        xorps   %xmm3, %xmm2
        xorps   %xmm5, %xmm4            # group 6
        xorps   %xmm0, %xmm1
        xorps   %xmm2, %xmm3
        xorps   %xmm4, %xmm5            # group 7
        xorps   %xmm1, %xmm0
        xorps   %xmm3, %xmm2
        xorps   %xmm5, %xmm4            # group 8
        xorps   %xmm0, %xmm1
        xorps   %xmm2, %xmm3
        sub     $1, %rcx                # group 9: loop counter takes the third slot
        xorps   %xmm1, %xmm0
        xorps   %xmm3, %xmm2
        jnz     _test1                  # group 10: XORPS leaves the flags from SUB intact
        ret
# ----------------------------------------------------------------------
# Writable data.
#   output  60-byte zero-initialized buffer (the size of 15 longs) that
#           wsprintfA formats into and WriteFile writes from
#   bw      32-bit cell whose address is passed to WriteFile as its
#           fourth argument
# ----------------------------------------------------------------------
        .data
        .align  16
output:
        .space  60, 0
bw:
        .long   0
C:\gfortran\james\throughput>as test.s -otest.o
C:\gfortran\james\throughput>ld test.o -lkernel32 -luser32 -otest.exe
C:\gfortran\james\throughput>test
Time for 100 iterations = 1460
Time for 100 iterations = 1440
Time for 100 iterations = 1460
Time for 100 iterations = 1460
Time for 100 iterations = 1460
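Sure enough, it's taking a little over 1400 clock cycles to execute
100*(3*10-2) = 2800 XORPS operations. If I were getting throughput
of 3 instructions per clock cycle, the 100*3*10 = 3000 instructions
in the loop (the XORPS operations plus the SUB and JNZ in each pass)
would have taken 1000 cycles (plus timing overhead).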
So what gives here? Are all manuals wrong, or is my program wrong,
or am I interpreting its results incorrectly?
--
write(*,*) transfer((/17.392111325966148d0,6.5794487871554595D-85, &
6.0134700243160014d-154/),(/'x'/)); end


|