github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k7/mmx/copyi.asm (about) 1 dnl AMD K7 mpn_copyi -- copy limb vector, incrementing. 2 3 dnl Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc. 4 5 dnl This file is part of the GNU MP Library. 6 dnl 7 dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 dnl it under the terms of either: 9 dnl 10 dnl * the GNU Lesser General Public License as published by the Free 11 dnl Software Foundation; either version 3 of the License, or (at your 12 dnl option) any later version. 13 dnl 14 dnl or 15 dnl 16 dnl * the GNU General Public License as published by the Free Software 17 dnl Foundation; either version 2 of the License, or (at your option) any 18 dnl later version. 19 dnl 20 dnl or both in parallel, as here. 21 dnl 22 dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 dnl for more details. 26 dnl 27 dnl You should have received copies of the GNU General Public License and the 28 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 dnl see https://www.gnu.org/licenses/. 30 31 include(`../config.m4') 32 33 34 C alignment dst/src, A=0mod8 N=4mod8 35 C A/A A/N N/A N/N 36 C K7 0.75 1.0 1.0 0.75 37 38 39 C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size); 40 C 41 C Copy src,size to dst,size. 42 C 43 C This code at 0.75 or 1.0 c/l is always faster than a plain rep movsl at 44 C 1.33 c/l. 45 C 46 C The K7 can do a 64-bit load and 64-bit store in one cycle (optimization 47 C guide 22007 appendix B), so 0.5 c/l should be possible, however nothing 48 C under 0.7 c/l is known. Apparently only two 32-bit stores can be done in 49 C one cycle, so perhaps some scheduling is needed to ensure it's a 50 C load+store in each cycle, not store+store. 
C
C If both source and destination are unaligned then one limb is processed at
C the start to make them aligned and so get 0.75 c/l, whereas if they'd been
C used unaligned it would be 1.5 c/l.
C
C Register use throughout: eax = src pointer, edx = dst pointer, ecx = limb
C count or limb index, ebx = scratch.  The incoming ebx is stashed in the
C PARAM_SIZE stack slot (SAVE_EBX) once size has been loaded into ecx, and
C is reloaded from there on both exit paths.  Sizes below UNROLL_THRESHOLD
C are copied one limb at a time with integer moves; larger sizes use mm0
C and mm1 and execute emms before returning.

defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

dnl  parameter space reused
define(SAVE_EBX,`PARAM_SIZE')

dnl  minimum 5 since the unrolled code can't handle less than 5
deflit(UNROLL_THRESHOLD, 5)

	TEXT
	ALIGN(32)
PROLOGUE(mpn_copyi)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	%ebx, SAVE_EBX		C size is in ecx now, so its slot can hold ebx

	movl	PARAM_SRC, %eax
	movl	PARAM_DST, %edx

	cmpl	$UNROLL_THRESHOLD, %ecx
	jae	L(unroll)		C unsigned test: size >= 5 takes the MMX path

	orl	%ecx, %ecx		C sets ZF when size is zero
	jz	L(simple_done)		C nothing to copy

L(simple):
	C eax	src, incrementing
	C ebx	scratch
	C ecx	counter
	C edx	dst, incrementing
	C
	C this loop is 2 cycles/limb
	C
	C One limb per iteration, reached only for sizes 1 to 4.  The pointers
	C are advanced with leal rather than addl so that the flags set by
	C decl are still intact when jnz tests them.

	movl	(%eax), %ebx
	movl	%ebx, (%edx)
	decl	%ecx
	leal	4(%eax), %eax
	leal	4(%edx), %edx
	jnz	L(simple)

L(simple_done):
	C no MMX register has been touched on this path, so no emms here
	movl	SAVE_EBX, %ebx
	ret


L(unroll):
	C ecx = size, at least UNROLL_THRESHOLD.  Both pointers are rebased to
	C 12 bytes before the end of their vectors and ecx becomes the
	C negative limb index -(size-3), which counts up towards zero.  ebx
	C collects src AND dst (taken before either pointer is rebased) for
	C the alignment test below.

	movl	%eax, %ebx
	leal	-12(%eax,%ecx,4), %eax	C src end - 12
	subl	$3, %ecx		C size-3

	andl	%edx, %ebx		C bit 2 survives only if set in both src and dst
	leal	(%edx,%ecx,4), %edx	C dst end - 12
	negl	%ecx			C -(size-3)

	testl	$4, %ebx   C testl to pad code closer to 16 bytes for L(top)
	jz	L(aligned)

	C both src and dst unaligned, process one limb to align them
	movl	(%eax,%ecx,4), %ebx
	movl	%ebx, (%edx,%ecx,4)
	incl	%ecx
L(aligned):


	ALIGN(16)
L(top):
	C eax	src end - 12
	C ebx	(not used inside the loop)
	C ecx	counter, negative, limbs
	C edx	dst end - 12
	C
	C Four limbs (two 64-bit moves) per iteration.  ecx is bumped between
	C the loads and the stores, hence the -16 bias on the store
	C displacements.  The ja tests the flags left by the addl; the movq
	C stores in between do not alter them.  The loop repeats while the
	C index is still negative after the add.

	movq	(%eax,%ecx,4), %mm0
	movq	8(%eax,%ecx,4), %mm1
	addl	$4, %ecx
	movq	%mm0, -16(%edx,%ecx,4)
	movq	%mm1, -16+8(%edx,%ecx,4)
	ja	L(top)		C jump no carry and not zero


	C now %ecx is 0 to 3 representing respectively 3 to 0 limbs remaining
	C
	C Bit 1 of ecx clear (ecx = 0 or 1): at least two limbs remain, copy a
	C pair with a single movq at index ecx.

	testb	$2, %cl
	jnz	L(finish_not_two)

	movq	(%eax,%ecx,4), %mm0
	movq	%mm0, (%edx,%ecx,4)
L(finish_not_two):

	C Bit 0 of ecx clear (ecx = 0 or 2): one last limb remains, and it is
	C the final limb of the vector, 8 bytes past the rebased pointers.

	testb	$1, %cl
	jnz	L(done)

	movl	8(%eax), %ebx
	movl	%ebx, 8(%edx)

L(done):
	movl	SAVE_EBX, %ebx
	emms			C empty the MMX state after using mm0/mm1
	ret

EPILOGUE()