dnl  github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k7/mmx/com.asm

dnl  AMD Athlon mpn_com -- mpn bitwise one's complement.

dnl  Copyright 2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C K7: 1.0 cycles/limb


C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C The loop form below is necessary for the claimed speed.  It needs to be
C aligned to a 16 byte boundary and only 16 bytes long.  Maybe that's so it
C fits in a BTB entry.  The adjustments to %eax and %edx avoid offsets on
C the movq's and achieve the necessary size.
C
C If both src and dst are 4mod8, the loop runs at 1.5 c/l.  So long as one
C of the two is 0mod8, it runs at 1.0 c/l.  On that basis dst is checked
C (offset by the size, as per the loop addressing) and one high limb
C processed separately to get alignment.
C
C The padding for the nails case is unattractive, but shouldn't cost any
C cycles.  Explicit .byte's guarantee the desired instructions, at a point
C where we're probably stalled waiting for loads anyway.
C
C Enhancements:
C
C The combination load/pxor/store might be able to be unrolled to approach
C 0.5 c/l if desired.
C
C Summary of the routine below:
C
C   void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C   Stores the bitwise complement of each 32-bit limb of src into dst,
C   masked to GMP_NUMB_MASK when GMP_NAIL_BITS is non-zero.  Limbs are
C   processed from the high end downwards.
C
C   Inputs:    PARAM_DST, PARAM_SRC, PARAM_SIZE (declared with defframe
C              just below; presumably stack arguments - confirm against
C              the defframe definition).
C   Clobbers:  eax, ecx, edx, mm0, mm7, flags.  emms is executed before
C              returning.
C   Note:      the src high limb is loaded before any test of size, so
C              size is assumed to be at least 1.
C
C Do not insert, remove or re-encode instructions ahead of L(top) without
C re-checking the loop offset noted at the ALIGN(16) there.

defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

	TEXT
	ALIGN(16)

PROLOGUE(mpn_com)
deflit(`FRAME',0)

	movl	PARAM_DST, %edx
	movl	PARAM_SIZE, %ecx
	pcmpeqd	%mm7, %mm7		C mm7 = all ones, the xor mask

	C Test bit 2 of dst+4*size.  ZF set means that address is 0mod8.
	C Nothing between here and the jz below modifies the flags (movl,
	C movd, psrld and lea leave them alone), so the jz tests this andl.
	leal	(%edx,%ecx,4), %eax
	andl	$4, %eax
ifelse(GMP_NAIL_BITS,0,,
`	psrld	$GMP_NAIL_BITS, %mm7')		C GMP_NUMB_MASK

	movl	PARAM_SRC, %eax
	movd	-4(%eax,%ecx,4), %mm0		C src high limb

ifelse(GMP_NAIL_BITS,0,,
`	C padding for alignment below
	.byte	0x8d, 0xb6, 0x00, 0x00, 0x00, 0x00	C lea 0(%esi),%esi
	.byte	0x8d, 0xbf, 0x00, 0x00, 0x00, 0x00	C lea 0(%edi),%edi
')

	jz	L(aligned)

	C dst+4*size is 4mod8: complement the high limb on its own so the
	C remaining limbs end on an 8 byte boundary in dst.
	pxor	%mm7, %mm0
	movd	%mm0, -4(%edx,%ecx,4)		C dst high limb
	decl	%ecx
	jz	L(done)				C that was the only limb
L(aligned):

	C Bias both pointers by one limb and the counter by one, so the
	C loop can address limb pairs with no displacement on the movq's.
	addl	$4, %eax
	addl	$4, %edx
	decl	%ecx
	jz	L(one)				C exactly one limb left

	C offset 0x30 for no nails, or 0x40 for nails
	ALIGN(16)
L(top):
	C eax	src + 4
	C ebx
	C ecx	counter, limbs remaining minus 1
	C edx	dst + 4
	C mm7	xor mask

	C Two limbs per pass.  The flags tested by jg and jnz come from
	C this subl, since movq and pxor do not modify them.
	subl	$2, %ecx
	movq	(%eax,%ecx,4), %mm0
	pxor	%mm7, %mm0
	movq	%mm0, (%edx,%ecx,4)
	jg	L(top)

	C Here ecx is 0 when the low limb is still to be done, or -1 when
	C every limb has been processed.
	jnz	L(done)		C if size even

L(one):
	movd	-4(%eax), %mm0		C src low limb
	pxor	%mm7, %mm0
	movd	%mm0, -4(%edx)		C dst low limb

L(done):
	emms				C leave the MMX state clean for x87 code

	ret

EPILOGUE()