dnl  Source: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/ia64/hamdist.asm

dnl  IA-64 mpn_hamdist -- mpn hamming distance.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2003-2005 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')

C           cycles/limb
C Itanium:       2
C Itanium 2:     1

C mpn_hamdist (up, vp, n) -- hamming distance of two limb vectors.
C
C Leaves in r8 the sum, over the n limb pairs, of popcnt(up[i] xor vp[i]),
C i.e. the number of bit positions in which the two operands differ.
C
C   up (r32)  address of the first operand, read as n consecutive 8-byte limbs
C   vp (r33)  address of the second operand, read the same way
C   n  (r34)  number of limb pairs.  Must be at least 1: with n = 0 the code
C             falls into the .Lb00 path and reads four limb pairs.
C
C ar.lc is copied to r2 on entry and written back at the common exit below
C .Lcj4.  The three early returns (n = 1, 2, 3) never write ar.lc, so they
C return without restoring it.
C
C Layout of the routine:
C
C   entry     Load the first limb pair into r10/r11, compute r14 = n mod 4,
C             p15 = (n > 4) and n = n - 5, then branch on r14.
C   .Lb00/.Lb01/.Lb10/.Lb11
C             Feed-in code for n mod 4 = 0, 1, 2, 3.  When p15 is clear the
C             operand is short (n = 4, 1, 2, 3) and is finished directly.
C             Otherwise ar.lc is set to (n - 5) >> 2, the pipeline is primed
C             in .grtN, and control joins either the loop (n > 8) or the
C             wind-down code (n = 5..8).
C   .Loop     Four stages per iteration, one limb pair loaded per stage.  A
C             limb pair loaded in one stage is xored three stages later, that
C             xor is popcounted three stages after that, and the count is
C             added to s three stages after that.  .LL00, .LL11 and .LL10 are
C             mid-loop entry points used by the feed-in code.
C   .Lend     Wind-down: drains the values still in flight.  .Lcj8 ... .Lcj4
C             are entry points for operands too short to enter the loop.
C
C The statement order, the ;; stops and the explicit bundle templates are
C part of the hand scheduling and must be kept when editing.
C
C NOTE(review): the trailing M01 / M I / I0 / B notes on each line look like
C the execution unit class the instruction is meant to issue on -- confirm
C against the Itanium 2 documentation before relying on them.

C INPUT PARAMETERS
define(`up', `r32')
define(`vp', `r33')
define(`n',  `r34')

C u0-u3, v0-v3: limbs loaded from up and vp
C x0-x3:        u xor v
C c0-c3:        popcnt of x
C s:            running total, also the return value (r8)
define(`u0',`r16') define(`u1',`r17') define(`u2',`r18') define(`u3',`r19')
define(`v0',`r20') define(`v1',`r21') define(`v2',`r22') define(`v3',`r23')
define(`x0',`r24') define(`x1',`r25') define(`x2',`r26') define(`x3',`r27')
define(`c0',`r28') define(`c1',`r29') define(`c2',`r30') define(`c3',`r31')
define(`s',`r8')


ASM_START()
PROLOGUE(mpn_hamdist)
	.prologue
ifdef(`HAVE_ABI_32',
`	addp4		up = 0, up		C 32-bit ABI: form 64-bit address	M I
	addp4		vp = 0, vp		C 32-bit ABI: form 64-bit address	M I
	zxt4		n = n			C 32-bit ABI: zero-extend count		I
	;;
')

{.mmi;	ld8		r10 = [up], 8		C load first ulimb	M01
	ld8		r11 = [vp], 8		C load first vlimb	M01
	mov.i		r2 = ar.lc		C save ar.lc		I0
}{.mmi;	and		r14 = 3, n		C r14 = n mod 4		M I
	cmp.lt		p15, p0 = 4, n		C p15 = (n > 4), i.e. not a small count	M I
	add		n = -5, n		C bias for the loop count computed by shr.u	M I
	;;
}{.mmi;	cmp.eq		p6, p0 = 1, r14		C M I
	cmp.eq		p7, p0 = 2, r14		C M I
	cmp.eq		p8, p0 = 3, r14		C M I
}{.bbb
   (p6)	br.dptk		.Lb01			C B
   (p7)	br.dptk		.Lb10			C B
   (p8)	br.dptk		.Lb11			C B
}


C n mod 4 = 0.  ar.lc is written here before the p15 test, so the n = 4 path
C also leaves through .Lcj4, where ar.lc is restored.
.Lb00:	ld8		u1 = [up], 8		C M01
	ld8		v1 = [vp], 8		C M01
	shr.u		n = n, 2		C I0
	xor		x0 = r10, r11		C M I
	;;
	ld8		u2 = [up], 8		C M01
	ld8		v2 = [vp], 8		C M01
	mov.i		ar.lc = n		C I0
	xor		x1 = u1, v1		C M I
	;;
	ld8		u3 = [up], 8		C M01
	ld8		v3 = [vp], 8		C M01
	xor		x2 = u2, v2		C M I
	mov		s = 0			C M I
  (p15)	br.cond.dptk	.grt4			C B
	;;
	popcnt		c0 = x0			C I0
	xor		x3 = u3, v3		C M I
	;;
	popcnt		c1 = x1			C I0
	;;
	popcnt		c2 = x2			C I0
	br		.Lcj4			C n = 4		B

.grt4:	ld8		u0 = [up], 8		C M01
	ld8		v0 = [vp], 8		C M01
	xor		x1 = u1, v1		C M I
	;;
	ld8		u1 = [up], 8		C M01
	ld8		v1 = [vp], 8		C M01
	xor		x2 = u2, v2		C M I
	;;
	ld8		u2 = [up], 8		C M01
	ld8		v2 = [vp], 8		C M01
	popcnt		c0 = x0			C I0
	xor		x3 = u3, v3		C M I
	;;
	ld8		u3 = [up], 8		C M01
	ld8		v3 = [vp], 8		C M01
	popcnt		c1 = x1			C I0
	xor		x0 = u0, v0		C M I
	br.cloop.dpnt	.grt8			C B

	popcnt		c2 = x2			C I0
	xor		x1 = u1, v1		C M I
	br		.Lcj8			C n = 8		B

.grt8:	ld8		u0 = [up], 8		C M01
	ld8		v0 = [vp], 8		C M01
	popcnt		c2 = x2			C I0
	xor		x1 = u1, v1		C M I
	br		.LL00			C B


C n mod 4 = 1.  For n = 1 the result is the popcnt of the single xor and
C ar.lc has not been touched.
.Lb01:	xor		x3 = r10, r11		C M I
	shr.u		n = n, 2		C I0
  (p15)	br.cond.dptk	.grt1			C B
	;;
	popcnt		r8 = x3			C I0
	br.ret.sptk.many b0			C n = 1		B

.grt1:	ld8		u0 = [up], 8		C M01
	ld8		v0 = [vp], 8		C M01
	mov.i		ar.lc = n		C I0
	;;
	ld8		u1 = [up], 8		C M01
	ld8		v1 = [vp], 8		C M01
	mov		s = 0			C M I
	;;
	ld8		u2 = [up], 8		C M01
	ld8		v2 = [vp], 8		C M01
	;;
	ld8		u3 = [up], 8		C M01
	ld8		v3 = [vp], 8		C M01
	xor		x0 = u0, v0		C M I
	br.cloop.dpnt	.grt5			C B

	xor		x1 = u1, v1		C M I
	;;
	popcnt		c3 = x3			C I0
	xor		x2 = u2, v2		C M I
	;;
	popcnt		c0 = x0			C I0
	xor		x3 = u3, v3		C M I
	;;
	popcnt		c1 = x1			C I0
	br		.Lcj5			C n = 5		B

.grt5:	ld8		u0 = [up], 8		C M01
	ld8		v0 = [vp], 8		C M01
	xor		x1 = u1, v1		C M I
	;;
	ld8		u1 = [up], 8		C M01
	ld8		v1 = [vp], 8		C M01
	popcnt		c3 = x3			C I0
	xor		x2 = u2, v2		C M I
	;;
	ld8		u2 = [up], 8		C M01
	ld8		v2 = [vp], 8		C M01
	popcnt		c0 = x0			C I0
	xor		x3 = u3, v3		C M I
	;;
	ld8		u3 = [up], 8		C M01
	ld8		v3 = [vp], 8		C M01
	popcnt		c1 = x1			C I0
	xor		x0 = u0, v0		C M I
	br.cloop.dpnt	.Loop			C B
	br		.Lend			C B


C n mod 4 = 2.  For n = 2 the two counts are summed and returned; ar.lc has
C not been touched.
.Lb10:	ld8		u3 = [up], 8		C M01
	ld8		v3 = [vp], 8		C M01
	xor		x2 = r10, r11		C M I
  (p15)	br.cond.dptk	.grt2			C B
	;;
	xor		x3 = u3, v3		C M I
	;;
	popcnt		c2 = x2			C I0
	;;
	popcnt		c3 = x3			C I0
	;;
	add		s = c2, c3		C M I
	br.ret.sptk.many b0			C n = 2		B

.grt2:	ld8		u0 = [up], 8		C M01
	ld8		v0 = [vp], 8		C M01
	shr.u		n = n, 2		C I0
	;;
	ld8		u1 = [up], 8		C M01
	ld8		v1 = [vp], 8		C M01
	mov.i		ar.lc = n		C I0
	mov		s = 0			C M I
	;;
	ld8		u2 = [up], 8		C M01
	ld8		v2 = [vp], 8		C M01
	xor		x3 = u3, v3		C M I
	;;
	ld8		u3 = [up], 8		C M01
	ld8		v3 = [vp], 8		C M01
	xor		x0 = u0, v0		C M I
	br.cloop.dptk	.grt6			C B

	popcnt		c2 = x2			C I0
	xor		x1 = u1, v1		C M I
	;;
	popcnt		c3 = x3			C I0
	xor		x2 = u2, v2		C M I
	;;
	popcnt		c0 = x0			C I0
	xor		x3 = u3, v3		C M I
	br		.Lcj6			C n = 6		B

.grt6:	ld8		u0 = [up], 8		C M01
	ld8		v0 = [vp], 8		C M01
	popcnt		c2 = x2			C I0
	xor		x1 = u1, v1		C M I
	;;
	ld8		u1 = [up], 8		C M01
	ld8		v1 = [vp], 8		C M01
	popcnt		c3 = x3			C I0
	xor		x2 = u2, v2		C M I
	;;
	ld8		u2 = [up], 8		C M01
	ld8		v2 = [vp], 8		C M01
	popcnt		c0 = x0			C I0
	xor		x3 = u3, v3		C M I
	br		.LL10			C B


C n mod 4 = 3.  For n = 3 the three counts are summed and returned; ar.lc has
C not been touched.
.Lb11:	ld8		u2 = [up], 8		C M01
	ld8		v2 = [vp], 8		C M01
	shr.u		n = n, 2		C I0
	xor		x1 = r10, r11		C M I
	;;
	ld8		u3 = [up], 8		C M01
	ld8		v3 = [vp], 8		C M01
	xor		x2 = u2, v2		C M I
  (p15)	br.cond.dptk	.grt3			C B
	;;
	xor		x3 = u3, v3		C M I
	;;
	popcnt		c1 = x1			C I0
	;;
	popcnt		c2 = x2			C I0
	;;
	popcnt		c3 = x3			C I0
	;;
	add		s = c1, c2		C M I
	;;
	add		s = s, c3		C M I
	br.ret.sptk.many b0			C n = 3		B

.grt3:	ld8		u0 = [up], 8		C M01
	ld8		v0 = [vp], 8		C M01
	mov.i		ar.lc = n		C I0
	;;
	ld8		u1 = [up], 8		C M01
	ld8		v1 = [vp], 8		C M01
	mov		s = 0			C M I
	;;
	ld8		u2 = [up], 8		C M01
	ld8		v2 = [vp], 8		C M01
	xor		x3 = u3, v3		C M I
	;;
	ld8		u3 = [up], 8		C M01
	ld8		v3 = [vp], 8		C M01
	popcnt		c1 = x1			C I0
	xor		x0 = u0, v0		C M I
	br.cloop.dptk	.grt7			C B
	popcnt		c2 = x2			C I0
	xor		x1 = u1, v1		C M I
	;;
	popcnt		c3 = x3			C I0
	xor		x2 = u2, v2		C M I
	br		.Lcj7			C n = 7		B

.grt7:	ld8		u0 = [up], 8		C M01
	ld8		v0 = [vp], 8		C M01
	popcnt		c2 = x2			C I0
	xor		x1 = u1, v1		C M I
	;;
	ld8		u1 = [up], 8		C M01
	ld8		v1 = [vp], 8		C M01
	popcnt		c3 = x3			C I0
	xor		x2 = u2, v2		C M I
	br		.LL11			C B


C Main loop: four limb pairs per iteration, counted by ar.lc via br.cloop.
C Each stage loads one pair, popcounts one earlier xor, adds one earlier
C count to s, and xors one earlier pair.
	ALIGN(32)
.Loop:	ld8		u0 = [up], 8		C M01
	ld8		v0 = [vp], 8		C M01
	popcnt		c2 = x2			C I0
	add		s = s, c3		C M I
	xor		x1 = u1, v1		C M I
	nop.b		1			C -
	;;
.LL00:	ld8		u1 = [up], 8		C M01
	ld8		v1 = [vp], 8		C M01
	popcnt		c3 = x3			C I0
	add		s = s, c0		C M I
	xor		x2 = u2, v2		C M I
	nop.b		1			C -
	;;
.LL11:	ld8		u2 = [up], 8		C M01
	ld8		v2 = [vp], 8		C M01
	popcnt		c0 = x0			C I0
	add		s = s, c1		C M I
	xor		x3 = u3, v3		C M I
	nop.b		1			C -
	;;
.LL10:	ld8		u3 = [up], 8		C M01
	ld8		v3 = [vp], 8		C M01
	popcnt		c1 = x1			C I0
	add		s = s, c2		C M I
	xor		x0 = u0, v0		C M I
	br.cloop.dptk	.Loop			C B
	;;

C Wind-down: no more loads.  Finish the xors, popcounts and additions for the
C limb pairs still in flight, then restore ar.lc and return s.
.Lend:	popcnt		c2 = x2			C I0
	add		s = s, c3		C M I
	xor		x1 = u1, v1		C M I
	;;
.Lcj8:	popcnt		c3 = x3			C I0
	add		s = s, c0		C M I
	xor		x2 = u2, v2		C M I
	;;
.Lcj7:	popcnt		c0 = x0			C I0
	add		s = s, c1		C M I
	xor		x3 = u3, v3		C M I
	;;
.Lcj6:	popcnt		c1 = x1			C I0
	add		s = s, c2		C M I
	;;
.Lcj5:	popcnt		c2 = x2			C I0
	add		s = s, c3		C M I
	;;
.Lcj4:	popcnt		c3 = x3			C I0
	add		s = s, c0		C M I
	;;
	add		s = s, c1		C M I
	;;
	add		s = s, c2		C M I
	;;
	add		s = s, c3		C M I
	mov.i		ar.lc = r2		C restore ar.lc saved on entry	I0
	br.ret.sptk.many b0			C B
EPILOGUE()
ASM_END()