dnl  Source: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium/hamdist.asm

dnl  Intel P5 mpn_hamdist -- mpn hamming distance.

dnl  Copyright 2001, 2002, 2014, 2015 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C P5: 14.0 cycles/limb


C unsigned long mpn_hamdist (mp_srcptr src1, mp_srcptr src2, mp_size_t size);
C
C It might be possible to shave 1 cycle from the loop, and hence 2
C cycles/limb.  The xorb is taking 2 cycles, but a separate load and xor
C would be 1, if the right schedule could be found (not found so far).
C Wanting to avoid potential cache bank clashes makes it tricky.

C The slightly strange quoting here helps the renaming done by tune/many.pl.
deflit(TABLE_NAME,
m4_assert_defined(`GSYM_PREFIX')
GSYM_PREFIX`'mpn_popcount``'_table')

C FIXME: referencing popcount.asm's table is incorrect as it hurts incremental
C linking.

C Stack arguments, as offsets from the stack pointer at function entry.
defframe(PARAM_SIZE,12)
defframe(PARAM_SRC2, 8)
defframe(PARAM_SRC1, 4)

	TEXT
	ALIGN(8)

C mpn_hamdist (src1, src2, size)
C
C Returns in eax the sum, over all 4*size byte positions of the two operands,
C of TABLE[src1 byte XOR src2 byte].  TABLE is the byte-indexed table named by
C TABLE_NAME; it is presumably the per-byte bit count table from popcount.asm
C (verify there), which makes the result the hamming distance.
C
C Each loop pass handles two bytes, so the counter is set to 2*size.  The
C counter is only tested after being decremented, hence size must be at
C least 1: a zero size would wrap the counter instead of ending the loop.
C
C esi, edi, ebx (and ebp when PIC) are saved on entry and restored on exit.
C eax, ecx and edx are overwritten.

PROLOGUE(mpn_hamdist)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	pushl	%esi		FRAME_pushl()

	shll	%ecx		C size in byte pairs
	pushl	%edi		FRAME_pushl()

ifdef(`PIC',`
	pushl	%ebx		FRAME_pushl()
	pushl	%ebp		FRAME_pushl()
ifdef(`DARWIN',`
	movl	PARAM_SRC1, %esi
	movl	PARAM_SRC2, %edi
	LEA(	TABLE_NAME, %ebp)
	xorl	%ebx, %ebx	C byte
	xorl	%edx, %edx	C byte
	xorl	%eax, %eax	C total
',`
	C call/pop puts the address of L(here) in ebp, from which the GOT
	C address and then the table address are derived
	call	L(here)		FRAME_pushl()
L(here):
	movl	PARAM_SRC1, %esi
	popl	%ebp		FRAME_popl()

	movl	PARAM_SRC2, %edi
	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp

	xorl	%ebx, %ebx	C byte
	xorl	%edx, %edx	C byte

	movl	TABLE_NAME@GOT(%ebp), %ebp
	xorl	%eax, %eax	C total
')
define(TABLE,`(%ebp,$1)')
',`
dnl non-PIC
	movl	PARAM_SRC1, %esi
	movl	PARAM_SRC2, %edi

	xorl	%eax, %eax	C total
	pushl	%ebx		FRAME_pushl()

	xorl	%edx, %edx	C byte
	xorl	%ebx, %ebx	C byte

define(TABLE,`TABLE_NAME($1)')
')


	C The nop after the xorb seems necessary.  Although a movb might be
	C expected to go down the V pipe in the second cycle of the xorb, it
	C doesn't and costs an extra 2 cycles.

	C The two table values looked up in one pass are added to the total
	C at the start of the next pass (or after the loop for the last pass).
	C ebx and edx start at zero, so the first pass adds nothing, and only
	C their low bytes are written afterwards, keeping the upper bits zero.

L(top):
	C eax	total
	C ebx	byte
	C ecx	counter, 2*size to 1
	C edx	byte
	C esi	src1
	C edi	src2
	C ebp	[PIC] table

	addl	%ebx, %eax		C add table value from previous pass
	movb	-1(%esi,%ecx,2), %bl	C src1 byte at offset 2*ecx-1

	addl	%edx, %eax		C add table value from previous pass
	movb	-1(%edi,%ecx,2), %dl	C src2 byte at offset 2*ecx-1

	xorb	%dl, %bl		C bl = differing bits, upper byte of pair
	movb	-2(%esi,%ecx,2), %dl	C src1 byte at offset 2*ecx-2

	xorb	-2(%edi,%ecx,2), %dl	C dl = differing bits, lower byte of pair
	nop

	movb	TABLE(%ebx), %bl	C table lookup indexed by the xor byte
	decl	%ecx

	movb	TABLE(%edx), %dl	C movb leaves the decl flags for jnz
	jnz	L(top)


ifdef(`PIC',`
	popl	%ebp
')
	addl	%ebx, %eax		C add the final pair of table values
	popl	%ebx

	addl	%edx, %eax
	popl	%edi

	popl	%esi

	ret

EPILOGUE()
ASM_END()