dnl  github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium/copyi.asm

dnl  Intel Pentium mpn_copyi -- copy limb vector, incrementing.

dnl  Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C P5: 1.25 cycles/limb


C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C Destination prefetching is done to avoid repeated write-throughs on lines
C not already in L1.
C
C At least one of the src or dst pointer needs to be incremented rather than
C using indexing, so that there's somewhere to put the loop control without
C an AGI.  Incrementing one and not two lets us keep loop overhead to 2
C cycles.  Making it the src pointer incremented avoids an AGI on the %ecx
C subtracts in the finishup code.
C
C The block of finishup code is almost as big as the main loop itself, which
C is unfortunate, but it's faster that way than with say rep movsl, by about
C 10 cycles for instance on P55.
C
C There's nothing to be gained from MMX on P55, since it can do only one
C movq load (or store) per cycle, so the throughput would be the same as the
C code here (and even then only if src and dst have the same alignment mod
C 8).
C
C Indexing scheme used by the code below (limbs are 4 bytes, scale factor 4):
C
C   edx  is set once to dst + 4*size, i.e. &dst[size], one limb past the end
C        of the destination, and is never changed afterwards.
C   ecx  is a biased negative count.  With R limbs still to copy it holds
C        7-R around the main loop, 3-R around the 4-limb finishup step and
C        1-R around the 2-limb finishup step.  Each store displacement is
C        chosen so that disp(%edx,%ecx,4) lands on the next unwritten limb.
C   esi  is the only pointer that is actually incremented; it always points
C        at the next source limb to read.
C
C NOTE(review): the instruction order looks hand-scheduled for the P5
C (pairing, AGI avoidance as described above).  Keep the order unless the
C result is re-measured.

defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

	TEXT
	ALIGN(8)
PROLOGUE(mpn_copyi)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	PARAM_DST, %edx

	C ebx and esi are used as scratch/pointer below and restored at L(done).
	pushl	%ebx	FRAME_pushl()
	pushl	%esi	FRAME_pushl()

	leal	(%edx,%ecx,4), %edx	C &dst[size], one past the last limb
	xorl	$-1, %ecx		C -size-1

	C PARAM_SRC is fetched after the two pushes; presumably FRAME_pushl()
	C keeps the defframe offsets in step with %esp -- confirm in the m4
	C definitions.
	movl	PARAM_SRC, %esi
	addl	$8, %ecx		C -size+7, sets flags for the jns

	jns	L(end)			C size <= 7: finishup code only

	movl	-28(%edx,%ecx,4), %eax	C fetch destination cache line, dst[0]
	nop

L(top):
	C eax	scratch
	C ebx	scratch
	C ecx	counter, limbs, negative (7 - limbs remaining)
	C edx	&dst[size]
	C esi	src, incrementing
	C edi
	C ebp

	C The load below touches the last limb of the 8-limb chunk about to
	C be written; the loaded value is discarded (eax is overwritten next).
	movl	(%edx,%ecx,4), %eax	C fetch destination cache line
	addl	$8, %ecx		C flags from here are tested by the js below

	movl	(%esi), %eax		C read words pairwise
	movl	4(%esi), %ebx
	movl	%eax, -60(%edx,%ecx,4)	C store words pairwise
	movl	%ebx, -56(%edx,%ecx,4)

	movl	8(%esi), %eax
	movl	12(%esi), %ebx
	movl	%eax, -52(%edx,%ecx,4)
	movl	%ebx, -48(%edx,%ecx,4)

	movl	16(%esi), %eax
	movl	20(%esi), %ebx
	movl	%eax, -44(%edx,%ecx,4)
	movl	%ebx, -40(%edx,%ecx,4)

	movl	24(%esi), %eax
	movl	28(%esi), %ebx
	movl	%eax, -36(%edx,%ecx,4)
	movl	%ebx, -32(%edx,%ecx,4)

	leal	32(%esi), %esi		C src += 8 limbs; leal leaves the flags alone
	js	L(top)			C ecx still negative: 8 or more limbs remain


L(end):
	C ecx	0 to 7, representing respectively 7 to 0 limbs remaining
	C esi	src, next limb to read
	C edx	&dst[size]; next store goes to edx - 4*(limbs remaining)

	subl	$4, %ecx		C 3 - remaining
	jns	L(no4)			C fewer than 4 limbs remain

	movl	(%esi), %eax
	movl	4(%esi), %ebx
	movl	%eax, -12(%edx,%ecx,4)
	movl	%ebx, -8(%edx,%ecx,4)

	movl	8(%esi), %eax
	movl	12(%esi), %ebx
	movl	%eax, -4(%edx,%ecx,4)
	movl	%ebx, (%edx,%ecx,4)

	addl	$16, %esi		C src += 4 limbs
	addl	$4, %ecx		C 4 limbs done: ecx is 3 - remaining again
L(no4):

	subl	$2, %ecx		C 1 - remaining
	jns	L(no2)			C fewer than 2 limbs remain

	movl	(%esi), %eax
	movl	4(%esi), %ebx
	movl	%eax, -4(%edx,%ecx,4)
	movl	%ebx, (%edx,%ecx,4)

	addl	$8, %esi		C src += 2 limbs
	addl	$2, %ecx		C 2 limbs done: ecx is 1 - remaining again
L(no2):

	C Flags here come from the subl $2 (branch taken) or the addl $2
	C (fall through); either way ecx = 1 - remaining, so zero means exactly
	C one limb is left.
	jnz	L(done)

	movl	(%esi), %eax
	movl	%eax, -4(%edx,%ecx,4)	C risk of cache bank clash here

L(done):
	popl	%esi
	popl	%ebx

	ret

EPILOGUE()