// Listing header: github.com/egonelbre/exp@v0.0.0-20240430123955-ed1d3aa93911/vector/compare/axpy.go
//
// This file contains many spellings of the same strided update
//
//	ys[k*incy] += alpha * xs[k*incx]   for k = 0 .. n-1
//
// Every Axpy* function takes the same arguments:
//
//	alpha       scale factor applied to each element read from xs
//	xs, incx    source slice and the step, in elements, between reads
//	ys, incy    destination slice and the step, in elements, between updates
//	n           number of element pairs to process
//
// Naming scheme, as used by the functions below:
//
//	Basic        ordinary slice indexing (xs[i])
//	Unsafe       element access through the at helper (raw pointer arithmetic)
//	Inline       the element index is recomputed as i*inc on every access
//	             instead of being accumulated in a running offset
//	Pointer      raw pointers are advanced; the loop ends when the x pointer
//	             reaches a precomputed end pointer
//	PointerLoop  raw pointers are advanced; the loop is driven by a counter
//	Interleave   the unrolled body is grouped into load / multiply / add /
//	             store phases instead of one read-modify-write per element
//	X            the loop counts n down to zero instead of counting i up to n
//	R4, R8       the main loop is unrolled 4 or 8 times, followed by a
//	             one-element-at-a-time tail loop
//
// Except for the Basic variants and the up-front check in AxpyUnsafeR8 and
// AxpyUnsafeXR8, nothing here verifies that the accessed positions lie inside
// xs and ys; callers must guarantee that themselves.

package compare

import (
	"unsafe"
)

// at returns a pointer to the element at position index of xs. The address is
// computed from the slice's data pointer plus index times the element size;
// no bounds check is performed.
func at[T any](xs []T, index uintptr) *T {
	return (*T)(unsafe.Add(unsafe.Pointer(unsafe.SliceData(xs)), index*unsafe.Sizeof(xs[0])))
}

// AxpyBasic is the reference form: plain slice indexing with running offsets
// xi and yi that advance by incx and incy on each of the n iterations.
//
//go:noinline
func AxpyBasic(alpha float32, xs []float32, incx uintptr, ys []float32, incy uintptr, n uintptr) {
	xi, yi := uintptr(0), uintptr(0)
	for i := uintptr(0); i < n; i++ {
		ys[yi] += alpha * xs[xi]

		xi += incx
		yi += incy
	}
}

// AxpyUnsafe has the same loop as AxpyBasic but reaches the elements through
// at, so no slice bounds checks are involved.
//
//go:noinline
func AxpyUnsafe(alpha float32, xs []float32, incx uintptr, ys []float32, incy uintptr, n uintptr) {
	xi, yi := uintptr(0), uintptr(0)
	for i := uintptr(0); i < n; i++ {
		*at(ys, yi) += alpha * *at(xs, xi)
		xi += incx
		yi += incy
	}
}

// AxpyUnsafeX is AxpyUnsafe with the loop driven by decrementing n to zero
// rather than by a separate counter.
//
//go:noinline
func AxpyUnsafeX(alpha float32, xs []float32, incx uintptr, ys []float32, incy uintptr, n uintptr) {
	xi, yi := uintptr(0), uintptr(0)
	for ; n > 0; n-- {
		*at(ys, yi) += alpha * *at(xs, xi)
		xi += incx
		yi += incy
	}
}

// AxpyUnsafeInline keeps no running offsets: each access multiplies the loop
// counter by the increment to obtain the element position.
//
//go:noinline
func AxpyUnsafeInline(alpha float32, xs []float32, incx uintptr, ys []float32, incy uintptr, n uintptr) {
	for i := uintptr(0); i < n; i++ {
		*at(ys, i*incy) += alpha * *at(xs, i*incx)
	}
}

// AxpyPointer walks xs and ys with raw pointers and stops once the x pointer
// is no longer below the end pointer xn. The literal 4 is the size in bytes of
// a float32.
//
// Because termination depends only on the x pointer, incx == 0 makes xn equal
// to xp and the loop body never runs, whereas the counter-driven variants
// still perform n updates in that case.
//
// NOTE(review): xn lies n*incx elements past the start of xs, which can be at
// or beyond the end of the backing array (for example incx == 1 and
// n == len(xs)). The unsafe.Pointer rules say a pointer must not be advanced
// past the end of its allocation — confirm this is acceptable here.
//
//go:noinline
func AxpyPointer(alpha float32, xs []float32, incx uintptr, ys []float32, incy uintptr, n uintptr) {
	xp := unsafe.Pointer(unsafe.SliceData(xs))
	yp := unsafe.Pointer(unsafe.SliceData(ys))
	xn := unsafe.Add(xp, 4*n*incx)
	for uintptr(xp) < uintptr(xn) {
		*(*float32)(yp) += alpha * *(*float32)(xp)
		xp, yp = unsafe.Add(xp, 4*incx), unsafe.Add(yp, 4*incy)
	}
}

// AxpyPointerLoop advances raw pointers like AxpyPointer but runs exactly n
// iterations under a counter instead of comparing against an end pointer.
//
//go:noinline
func AxpyPointerLoop(alpha float32, xs []float32, incx uintptr, ys []float32, incy uintptr, n uintptr) {
	xp := unsafe.Pointer(unsafe.SliceData(xs))
	yp := unsafe.Pointer(unsafe.SliceData(ys))
	for i := uintptr(0); i < n; i++ {
		*(*float32)(yp) += alpha * *(*float32)(xp)
		xp, yp = unsafe.Add(xp, 4*incx), unsafe.Add(yp, 4*incy)
	}
}

// AxpyPointerLoopX is AxpyPointerLoop with n itself counted down to zero.
//
//go:noinline
func AxpyPointerLoopX(alpha float32, xs []float32, incx uintptr, ys []float32, incy uintptr, n uintptr) {
	xp := unsafe.Pointer(unsafe.SliceData(xs))
	yp := unsafe.Pointer(unsafe.SliceData(ys))
	for ; n > 0; n-- {
		*(*float32)(yp) += alpha * *(*float32)(xp)
		xp, yp = unsafe.Add(xp, 4*incx), unsafe.Add(yp, 4*incy)
	}
}

// mask4 clears the two lowest bits: n & mask4 is n rounded down to a multiple of 4.
const mask4 = ^uintptr(3)

// mask8 clears the three lowest bits: n & mask8 is n rounded down to a multiple of 8.
const mask8 = ^uintptr(7)

// AxpyBasicR4 is AxpyBasic unrolled by four. The first loop handles n4
// elements (n rounded down to a multiple of 4) in groups of four; the second
// loop finishes the remaining 0-3 elements one at a time.
//
//go:noinline
func AxpyBasicR4(alpha float32, xs []float32, incx uintptr, ys []float32, incy uintptr, n uintptr) {
	xi, yi := uintptr(0), uintptr(0)
	i := uintptr(0)
	n4 := n & mask4
	for ; i < n4; i += 4 {
		ys[yi+0*incy] += alpha * xs[xi+0*incx]
		ys[yi+1*incy] += alpha * xs[xi+1*incx]
		ys[yi+2*incy] += alpha * xs[xi+2*incx]
		ys[yi+3*incy] += alpha * xs[xi+3*incx]

		xi += incx * 4
		yi += incy * 4
	}
	for ; i < n; i++ {
		ys[yi] += alpha * xs[xi]
		xi += incx
		yi += incy
	}
}

// AxpyUnsafeR4 is AxpyBasicR4 with every element access going through at.
//
//go:noinline
func AxpyUnsafeR4(alpha float32, xs []float32, incx uintptr, ys []float32, incy uintptr, n uintptr) {
	xi, yi := uintptr(0), uintptr(0)
	i := uintptr(0)
	n4 := n & mask4
	for ; i < n4; i += 4 {
		*at(ys, yi+0*incy) += alpha * *at(xs, xi+0*incx)
		*at(ys, yi+1*incy) += alpha * *at(xs, xi+1*incx)
		*at(ys, yi+2*incy) += alpha * *at(xs, xi+2*incx)
		*at(ys, yi+3*incy) += alpha * *at(xs, xi+3*incx)
		xi += incx * 4
		yi += incy * 4
	}
	for ; i < n; i++ {
		*at(ys, yi+0) += alpha * *at(xs, xi+0)
		xi += incx
		yi += incy
	}
}

// AxpyBasicXR4 is the count-down form of AxpyBasicR4: the unrolled loop runs
// while at least four elements remain and subtracts 4 from n each time; the
// tail loop then counts the remaining n down to zero.
//
//go:noinline
func AxpyBasicXR4(alpha float32, xs []float32, incx uintptr, ys []float32, incy uintptr, n uintptr) {
	xi, yi := uintptr(0), uintptr(0)
	for ; n >= 4; n -= 4 {
		ys[yi+0*incy] += alpha * xs[xi+0*incx]
		ys[yi+1*incy] += alpha * xs[xi+1*incx]
		ys[yi+2*incy] += alpha * xs[xi+2*incx]
		ys[yi+3*incy] += alpha * xs[xi+3*incx]

		xi += incx * 4
		yi += incy * 4
	}
	for ; n > 0; n-- {
		ys[yi] += alpha * xs[xi]
		xi += incx
		yi += incy
	}
}

// AxpyUnsafeXR4 is AxpyBasicXR4 with every element access going through at.
//
//go:noinline
func AxpyUnsafeXR4(alpha float32, xs []float32, incx uintptr, ys []float32, incy uintptr, n uintptr) {
	xi, yi := uintptr(0), uintptr(0)
	for ; n >= 4; n -= 4 {
		*at(ys, yi+0*incy) += alpha * *at(xs, xi+0*incx)
		*at(ys, yi+1*incy) += alpha * *at(xs, xi+1*incx)
		*at(ys, yi+2*incy) += alpha * *at(xs, xi+2*incx)
		*at(ys, yi+3*incy) += alpha * *at(xs, xi+3*incx)
		xi += incx * 4
		yi += incy * 4
	}
	for ; n > 0; n-- {
		*at(ys, yi+0) += alpha * *at(xs, xi+0)
		xi += incx
		yi += incy
	}
}

// AxpyUnsafeR8 is the eight-way unrolled counterpart of AxpyUnsafeR4, with one
// addition: before the loops it indexes the last position each slice will be
// asked for, so an out-of-range request panics up front instead of being
// reached through unchecked pointer arithmetic.
//
//go:noinline
func AxpyUnsafeR8(alpha float32, xs []float32, incx uintptr, ys []float32, incy uintptr, n uintptr) {
	// n is unsigned, so n-1 below would wrap around for n == 0.
	if n == 0 {
		return
	}
	// Bounds-checked probe of the highest positions accessed below.
	_, _ = xs[(n-1)*incx], ys[(n-1)*incy]
	xi, yi := uintptr(0), uintptr(0)
	n8 := n & mask8
	i := uintptr(0)
	for ; i < n8; i += 8 {
		*at(ys, yi+0*incy) += alpha * *at(xs, xi+0*incx)
		*at(ys, yi+1*incy) += alpha * *at(xs, xi+1*incx)
		*at(ys, yi+2*incy) += alpha * *at(xs, xi+2*incx)
		*at(ys, yi+3*incy) += alpha * *at(xs, xi+3*incx)
		*at(ys, yi+4*incy) += alpha * *at(xs, xi+4*incx)
		*at(ys, yi+5*incy) += alpha * *at(xs, xi+5*incx)
		*at(ys, yi+6*incy) += alpha * *at(xs, xi+6*incx)
		*at(ys, yi+7*incy) += alpha * *at(xs, xi+7*incx)
		xi += incx * 8
		yi += incy * 8
	}
	for ; i < n; i++ {
		*at(ys, yi+0) += alpha * *at(xs, xi+0)
		xi += incx
		yi += incy
	}
}

// AxpyUnsafeXR8 is the count-down form of AxpyUnsafeR8, including the same
// early return for n == 0 and the same up-front bounds-checked probe.
//
//go:noinline
func AxpyUnsafeXR8(alpha float32, xs []float32, incx uintptr, ys []float32, incy uintptr, n uintptr) {
	// n is unsigned, so n-1 below would wrap around for n == 0.
	if n == 0 {
		return
	}
	// Bounds-checked probe of the highest positions accessed below.
	_, _ = xs[(n-1)*incx], ys[(n-1)*incy]
	xi, yi := uintptr(0), uintptr(0)
	for ; n >= 8; n -= 8 {
		*at(ys, yi+0*incy) += alpha * *at(xs, xi+0*incx)
		*at(ys, yi+1*incy) += alpha * *at(xs, xi+1*incx)
		*at(ys, yi+2*incy) += alpha * *at(xs, xi+2*incx)
		*at(ys, yi+3*incy) += alpha * *at(xs, xi+3*incx)
		*at(ys, yi+4*incy) += alpha * *at(xs, xi+4*incx)
		*at(ys, yi+5*incy) += alpha * *at(xs, xi+5*incx)
		*at(ys, yi+6*incy) += alpha * *at(xs, xi+6*incx)
		*at(ys, yi+7*incy) += alpha * *at(xs, xi+7*incx)
		xi += incx * 8
		yi += incy * 8
	}
	for ; n > 0; n-- {
		*at(ys, yi+0) += alpha * *at(xs, xi+0)
		xi += incx
		yi += incy
	}
}

// AxpyUnsafeInlineR4 is AxpyUnsafeInline unrolled by four: positions are
// computed as (i+k)*inc on each access, with a one-at-a-time tail loop.
//
//go:noinline
func AxpyUnsafeInlineR4(alpha float32, xs []float32, incx uintptr, ys []float32, incy uintptr, n uintptr) {
	i := uintptr(0)
	n4 := n & mask4
	for ; i < n4; i += 4 {
		*at(ys, (i+0)*incy) += alpha * *at(xs, (i+0)*incx)
		*at(ys, (i+1)*incy) += alpha * *at(xs, (i+1)*incx)
		*at(ys, (i+2)*incy) += alpha * *at(xs, (i+2)*incx)
		*at(ys, (i+3)*incy) += alpha * *at(xs, (i+3)*incx)
	}
	for ; i < n; i++ {
		*at(ys, (i+0)*incy) += alpha * *at(xs, (i+0)*incx)
	}
}

// AxpyUnsafeInlineXR4 is the count-down form of AxpyUnsafeInlineR4. n decides
// when to stop; i is advanced separately because the positions are still
// derived from it.
//
//go:noinline
func AxpyUnsafeInlineXR4(alpha float32, xs []float32, incx uintptr, ys []float32, incy uintptr, n uintptr) {
	i := uintptr(0)
	for ; n >= 4; n -= 4 {
		*at(ys, (i+0)*incy) += alpha * *at(xs, (i+0)*incx)
		*at(ys, (i+1)*incy) += alpha * *at(xs, (i+1)*incx)
		*at(ys, (i+2)*incy) += alpha * *at(xs, (i+2)*incx)
		*at(ys, (i+3)*incy) += alpha * *at(xs, (i+3)*incx)
		i += 4
	}
	for ; n > 0; n-- {
		*at(ys, (i+0)*incy) += alpha * *at(xs, (i+0)*incx)
		i++
	}
}

// AxpyUnsafeInlineR8 is AxpyUnsafeInlineR4 with an unroll factor of eight.
//
//go:noinline
func AxpyUnsafeInlineR8(alpha float32, xs []float32, incx uintptr, ys []float32, incy uintptr, n uintptr) {
	i := uintptr(0)
	n8 := n & mask8
	for ; i < n8; i += 8 {
		*at(ys, (i+0)*incy) += alpha * *at(xs, (i+0)*incx)
		*at(ys, (i+1)*incy) += alpha * *at(xs, (i+1)*incx)
		*at(ys, (i+2)*incy) += alpha * *at(xs, (i+2)*incx)
		*at(ys, (i+3)*incy) += alpha * *at(xs, (i+3)*incx)
		*at(ys, (i+4)*incy) += alpha * *at(xs, (i+4)*incx)
		*at(ys, (i+5)*incy) += alpha * *at(xs, (i+5)*incx)
		*at(ys, (i+6)*incy) += alpha * *at(xs, (i+6)*incx)
		*at(ys, (i+7)*incy) += alpha * *at(xs, (i+7)*incx)
	}
	for ; i < n; i++ {
		*at(ys, (i+0)*incy) += alpha * *at(xs, (i+0)*incx)
	}
}

// AxpyUnsafeInlineXR8 is the count-down form of AxpyUnsafeInlineR8.
//
//go:noinline
func AxpyUnsafeInlineXR8(alpha float32, xs []float32, incx uintptr, ys []float32, incy uintptr, n uintptr) {
	i := uintptr(0)
	for ; n >= 8; n -= 8 {
		*at(ys, (i+0)*incy) += alpha * *at(xs, (i+0)*incx)
		*at(ys, (i+1)*incy) += alpha * *at(xs, (i+1)*incx)
		*at(ys, (i+2)*incy) += alpha * *at(xs, (i+2)*incx)
		*at(ys, (i+3)*incy) += alpha * *at(xs, (i+3)*incx)
		*at(ys, (i+4)*incy) += alpha * *at(xs, (i+4)*incx)
		*at(ys, (i+5)*incy) += alpha * *at(xs, (i+5)*incx)
		*at(ys, (i+6)*incy) += alpha * *at(xs, (i+6)*incx)
		*at(ys, (i+7)*incy) += alpha * *at(xs, (i+7)*incx)
		i += 8
	}
	for ; n > 0; n-- {
		*at(ys, (i+0)*incy) += alpha * *at(xs, (i+0)*incx)
		i++
	}
}

// AxpyPointerR4 is AxpyPointer unrolled by four. Two end pointers are
// precomputed: xn4 marks where the unrolled loop stops (n rounded down to a
// multiple of 4 elements) and xn marks the end of all n elements.
//
// As in AxpyPointer, both loops are governed by the x pointer alone, so
// incx == 0 makes both end pointers equal to xp and no update is performed.
//
// NOTE(review): xn and xn4 can lie at or beyond the end of xs's backing array;
// see the unsafe.Pointer rules on advancing past an allocation — confirm.
//
//go:noinline
func AxpyPointerR4(alpha float32, xs []float32, incx uintptr, ys []float32, incy uintptr, n uintptr) {
	// Size of one element in bytes.
	const Size = unsafe.Sizeof(xs[0])

	xp := unsafe.Pointer(unsafe.SliceData(xs))
	yp := unsafe.Pointer(unsafe.SliceData(ys))

	xn4 := unsafe.Add(xp, (n&mask4)*incx*Size)
	xn := unsafe.Add(xp, n*incx*Size)
	for uintptr(xp) < uintptr(xn4) {
		*(*float32)(unsafe.Add(yp, 0*Size*incy)) += alpha * *(*float32)(unsafe.Add(xp, 0*Size*incx))
		*(*float32)(unsafe.Add(yp, 1*Size*incy)) += alpha * *(*float32)(unsafe.Add(xp, 1*Size*incx))
		*(*float32)(unsafe.Add(yp, 2*Size*incy)) += alpha * *(*float32)(unsafe.Add(xp, 2*Size*incx))
		*(*float32)(unsafe.Add(yp, 3*Size*incy)) += alpha * *(*float32)(unsafe.Add(xp, 3*Size*incx))
		xp, yp = unsafe.Add(xp, 4*incx*Size), unsafe.Add(yp, 4*incy*Size)
	}
	for uintptr(xp) < uintptr(xn) {
		*(*float32)(yp) += alpha * *(*float32)(xp)
		xp, yp = unsafe.Add(xp, incx*Size), unsafe.Add(yp, incy*Size)
	}
}

// AxpyPointerLoopR4 is AxpyPointerLoop unrolled by four: pointers are advanced
// as in AxpyPointerR4, but both loops are driven by the counter i.
//
//go:noinline
func AxpyPointerLoopR4(alpha float32, xs []float32, incx uintptr, ys []float32, incy uintptr, n uintptr) {
	// Size of one element in bytes.
	const Size = unsafe.Sizeof(xs[0])

	xp := unsafe.Pointer(unsafe.SliceData(xs))
	yp := unsafe.Pointer(unsafe.SliceData(ys))

	i := uintptr(0)
	n4 := n & mask4
	for ; i < n4; i += 4 {
		*(*float32)(unsafe.Add(yp, 0*Size*incy)) += alpha * *(*float32)(unsafe.Add(xp, 0*Size*incx))
		*(*float32)(unsafe.Add(yp, 1*Size*incy)) += alpha * *(*float32)(unsafe.Add(xp, 1*Size*incx))
		*(*float32)(unsafe.Add(yp, 2*Size*incy)) += alpha * *(*float32)(unsafe.Add(xp, 2*Size*incx))
		*(*float32)(unsafe.Add(yp, 3*Size*incy)) += alpha * *(*float32)(unsafe.Add(xp, 3*Size*incx))
		xp, yp = unsafe.Add(xp, 4*incx*Size), unsafe.Add(yp, 4*incy*Size)
	}
	for ; i < n; i++ {
		*(*float32)(yp) += alpha * *(*float32)(xp)
		xp, yp = unsafe.Add(xp, incx*Size), unsafe.Add(yp, incy*Size)
	}
}

// AxpyPointerLoopXR4 is the count-down form of AxpyPointerLoopR4.
//
//go:noinline
func AxpyPointerLoopXR4(alpha float32, xs []float32, incx uintptr, ys []float32, incy uintptr, n uintptr) {
	// Size of one element in bytes.
	const Size = unsafe.Sizeof(xs[0])

	xp := unsafe.Pointer(unsafe.SliceData(xs))
	yp := unsafe.Pointer(unsafe.SliceData(ys))

	for ; n >= 4; n -= 4 {
		*(*float32)(unsafe.Add(yp, 0*Size*incy)) += alpha * *(*float32)(unsafe.Add(xp, 0*Size*incx))
		*(*float32)(unsafe.Add(yp, 1*Size*incy)) += alpha * *(*float32)(unsafe.Add(xp, 1*Size*incx))
		*(*float32)(unsafe.Add(yp, 2*Size*incy)) += alpha * *(*float32)(unsafe.Add(xp, 2*Size*incx))
		*(*float32)(unsafe.Add(yp, 3*Size*incy)) += alpha * *(*float32)(unsafe.Add(xp, 3*Size*incx))
		xp, yp = unsafe.Add(xp, 4*incx*Size), unsafe.Add(yp, 4*incy*Size)
	}
	for ; n > 0; n-- {
		*(*float32)(yp) += alpha * *(*float32)(xp)
		xp, yp = unsafe.Add(xp, incx*Size), unsafe.Add(yp, incy*Size)
	}
}

// AxpyPointerLoopInterleaveR4 processes the same elements as
// AxpyPointerLoopR4, but the unrolled body is arranged in phases: all four x
// values are loaded, then all four products are formed, then the four sums
// with the current y values are computed, and only then are the four results
// stored.
//
// Within one group all y values are read before any is written. With
// incy == 0 the four positions coincide, so each group adds only its last
// product to that element instead of accumulating all four as the
// read-modify-write variants do.
//
//go:noinline
func AxpyPointerLoopInterleaveR4(alpha float32, xs []float32, incx uintptr, ys []float32, incy uintptr, n uintptr) {
	// Size of one element in bytes.
	const Size = unsafe.Sizeof(xs[0])

	xp := unsafe.Pointer(unsafe.SliceData(xs))
	yp := unsafe.Pointer(unsafe.SliceData(ys))

	i := uintptr(0)
	n4 := n & mask4
	for ; i < n4; i += 4 {
		// Load.
		x0 := *(*float32)(unsafe.Add(xp, 0*Size*incx))
		x1 := *(*float32)(unsafe.Add(xp, 1*Size*incx))
		x2 := *(*float32)(unsafe.Add(xp, 2*Size*incx))
		x3 := *(*float32)(unsafe.Add(xp, 3*Size*incx))

		// Scale.
		m0 := alpha * x0
		m1 := alpha * x1
		m2 := alpha * x2
		m3 := alpha * x3

		// Accumulate.
		t0 := *(*float32)(unsafe.Add(yp, 0*Size*incy)) + m0
		t1 := *(*float32)(unsafe.Add(yp, 1*Size*incy)) + m1
		t2 := *(*float32)(unsafe.Add(yp, 2*Size*incy)) + m2
		t3 := *(*float32)(unsafe.Add(yp, 3*Size*incy)) + m3

		// Store.
		*(*float32)(unsafe.Add(yp, 0*Size*incy)) = t0
		*(*float32)(unsafe.Add(yp, 1*Size*incy)) = t1
		*(*float32)(unsafe.Add(yp, 2*Size*incy)) = t2
		*(*float32)(unsafe.Add(yp, 3*Size*incy)) = t3

		xp, yp = unsafe.Add(xp, 4*incx*Size), unsafe.Add(yp, 4*incy*Size)
	}
	for ; i < n; i++ {
		*(*float32)(yp) += alpha * *(*float32)(xp)
		xp, yp = unsafe.Add(xp, incx*Size), unsafe.Add(yp, incy*Size)
	}
}

// AxpyPointerLoopInterleaveXR4 is the count-down form of
// AxpyPointerLoopInterleaveR4, with the same load / scale / accumulate / store
// phasing in the unrolled body.
//
//go:noinline
func AxpyPointerLoopInterleaveXR4(alpha float32, xs []float32, incx uintptr, ys []float32, incy uintptr, n uintptr) {
	// Size of one element in bytes.
	const Size = unsafe.Sizeof(xs[0])

	xp := unsafe.Pointer(unsafe.SliceData(xs))
	yp := unsafe.Pointer(unsafe.SliceData(ys))

	for ; n >= 4; n -= 4 {
		// Load.
		x0 := *(*float32)(unsafe.Add(xp, 0*Size*incx))
		x1 := *(*float32)(unsafe.Add(xp, 1*Size*incx))
		x2 := *(*float32)(unsafe.Add(xp, 2*Size*incx))
		x3 := *(*float32)(unsafe.Add(xp, 3*Size*incx))

		// Scale.
		m0 := alpha * x0
		m1 := alpha * x1
		m2 := alpha * x2
		m3 := alpha * x3

		// Accumulate.
		t0 := *(*float32)(unsafe.Add(yp, 0*Size*incy)) + m0
		t1 := *(*float32)(unsafe.Add(yp, 1*Size*incy)) + m1
		t2 := *(*float32)(unsafe.Add(yp, 2*Size*incy)) + m2
		t3 := *(*float32)(unsafe.Add(yp, 3*Size*incy)) + m3

		// Store.
		*(*float32)(unsafe.Add(yp, 0*Size*incy)) = t0
		*(*float32)(unsafe.Add(yp, 1*Size*incy)) = t1
		*(*float32)(unsafe.Add(yp, 2*Size*incy)) = t2
		*(*float32)(unsafe.Add(yp, 3*Size*incy)) = t3

		xp, yp = unsafe.Add(xp, 4*incx*Size), unsafe.Add(yp, 4*incy*Size)
	}
	for ; n > 0; n-- {
		*(*float32)(yp) += alpha * *(*float32)(xp)
		xp, yp = unsafe.Add(xp, incx*Size), unsafe.Add(yp, incy*Size)
	}
}

// AxpyPointerR4Alt follows the same end-pointer scheme as AxpyPointerR4 but
// works with typed *float32 pointers, using the offset and less helpers in
// place of explicit unsafe.Add calls and uintptr comparisons.
//
// As with the other end-pointer variants, incx == 0 makes xn and xn4 equal to
// xp, so neither loop runs.
//
// NOTE(review): xn and xn4 can lie at or beyond the end of xs's backing array;
// see the unsafe.Pointer rules on advancing past an allocation — confirm.
//
//go:noinline
func AxpyPointerR4Alt(alpha float32, xs []float32, incx uintptr, ys []float32, incy uintptr, n uintptr) {
	// NOTE(review): Size is not referenced anywhere in this function; offset
	// derives the element size itself.
	const Size = unsafe.Sizeof(xs[0])

	xp, yp := unsafe.SliceData(xs), unsafe.SliceData(ys)
	xn := offset(xp, n*incx)
	xn4 := offset(xp, (n&mask4)*incx)

	for less(xp, xn4) {
		*offset(yp, 0*incy) += alpha * *offset(xp, 0*incx)
		*offset(yp, 1*incy) += alpha * *offset(xp, 1*incx)
		*offset(yp, 2*incy) += alpha * *offset(xp, 2*incx)
		*offset(yp, 3*incy) += alpha * *offset(xp, 3*incx)
		xp, yp = offset(xp, 4*incx), offset(yp, 4*incy)
	}
	for less(xp, xn) {
		*yp += alpha * *xp
		xp, yp = offset(xp, incx), offset(yp, incy)
	}
}

// offset returns x advanced by count elements, i.e. by count times the size of
// T in bytes. No check is made that the result stays inside any allocation.
func offset[T any](x *T, count uintptr) *T {
	return (*T)(unsafe.Add(unsafe.Pointer(x), count*unsafe.Sizeof(*x)))
}

// less reports whether the address held in x is numerically lower than the
// address held in y.
func less[T any](x, y *T) bool {
	return uintptr(unsafe.Pointer(x)) < uintptr(unsafe.Pointer(y))
}