Ok here is the full proggie, any suggestions are highly welcome. I
wanted to optimize this as much as I could by reading do***entation
and tutorials. So this code is running 100000 primes for ~2.5 secs. I
know this is very slow, since I've read people having ~1000,000 primes
in 0.5secs, in handwritten ASM. Although the person used SSE2
instructions which, sadly, aren't sup****ted by my old Athlon XP 2600+
Barton.
Any help is greatly appreciated either on optimization or the
precision-rounding problem.
Thanks!
PS! The "test" tags are for profiling.
..data
to:
.long 100000
..text
..globl _start
_start:
pushl $2 /* we push 2 to the stack */
movl $1, %ecx /* add 1 to total primes */
movl $1, %ebx /* nrs. to check on this round */
movl $3, %edx /* nr. in hand */
top:
pushl %edx /* push current num into stack */
incl %ecx /* increment count of primes*/
bad:
cmp to, %edx /*compare if number in hand is not over the
limit */
jge done /* ... if it is -> done */
mov %esp,%ebp /* set index_pointer to end of stack */
mov %ecx,%ebx /* set "to check" to beginning */
addl $2,%edx /* increment numbers by 2 */
test1:
cmp $0,%ebx /* compare 0 and "left to check" */
je top /* if 0 go to top, we found a prime */
test2:
/* 3DNOW! instructions */
prefetchnta (%ebp)
MOVD (%ebp), %MM0 /* stack_num */
addl $4,%ebp /* move pointer */
PI2FD %MM0, %MM0 /* stack to float */
MOVQ %MM0, %MM2 /* copy of stack */
PFRCP %MM0, %MM5 /*1/stack imprecise */
PFRCPIT1 %MM5, %MM0 /* 1/stack more precise*/
PFRCPIT2 %MM5, %MM0 /* 1/stack very precise */
MOVD %edx, %MM1 /* prime_cand */
PI2FD %MM1, %MM1 /* prime to float */
MOVQ %MM1, %MM3 /* copy of prime */
PFMUL %MM0, %MM1 /* 1/stack * prime */
PF2ID %MM1, %MM1 /* answer to int ...*/
PI2FD %MM1, %MM1 /* ... and back to float */
PFMUL %MM1, %MM2
PFCMPEQ %MM2, %MM3
MOVD %MM3, %eax
FEMMS
test3:
decl %ebx /* decrease "to check" */
cmp $0,%eax /* if comparison result is equal to 1,
then not prime -> bad */
je test1
jmp bad
done:
movl %ecx, %eax /* DEBUG */
movl $0, %ebx
movl $1, %eax
int $0x80


|