//go:build go1.18 && !noasm && !appengine
// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT

// Overview
//
// Every routine in this file has the same frame, $0-32: no locals and four
// 8-byte stack arguments, no results.
//
//   src+0(FP)     -> DI   base address of the source elements
//   dest+8(FP)    -> SI   base address of the destination elements
//   len+16(FP)    -> DX   element count; only edx (low 32 bits) is examined,
//                         as a signed value, and a count <= 0 returns at once
//   factor+24(FP) -> CX   the constant multiplier / divisor
//
// The two type names in each symbol are the source and destination element
// widths, in that order (visible in the 4*/8* scaled loads and stores).
// Instructions are emitted as raw encodings; the trailing comment on each
// line is the Intel-syntax disassembly of those bytes.

// _multiply_constant_int32_int32_avx2: dest[i] = src[i] * factor for
// i in [0, len), with 32-bit loads, 32-bit imul/vpmulld and 32-bit stores
// (only ecx of the factor takes part).
//
// When len > 31 and the src and dest ranges do not overlap, the first
// (len & -32) elements are processed by AVX2 in chunks of 32; whatever is
// left, or the whole input otherwise, goes through the scalar loops.
TEXT ·_multiply_constant_int32_int32_avx2(SB), $0-32

    MOVQ src+0(FP), DI
    MOVQ dest+8(FP), SI
    MOVQ len+16(FP), DX
    MOVQ factor+24(FP), CX

    WORD $0xd285 // test edx, edx
    JLE LBB0_16
    WORD $0x8941; BYTE $0xd1 // mov r9d, edx
    WORD $0xfa83; BYTE $0x1f // cmp edx, 31
    JBE LBB0_2
    // Overlap test: vectorise only if src_end <= dest or dest_end <= src.
    LONG $0x8f048d4a // lea rax, [rdi + 4*r9]
    WORD $0x3948; BYTE $0xf0 // cmp rax, rsi
    JBE LBB0_9
    LONG $0x8e048d4a // lea rax, [rsi + 4*r9]
    WORD $0x3948; BYTE $0xf8 // cmp rax, rdi
    JBE LBB0_9

// Scalar path from index 0 (short input or overlapping buffers).
LBB0_2:
    WORD $0x3145; BYTE $0xdb // xor r11d, r11d

// Scalar remainder. r11 = next index, r9 = count.
// r8 = count - r11 - 1, rax = count & 3 (number of elements to peel).
LBB0_3:
    WORD $0x894d; BYTE $0xd8 // mov r8, r11
    WORD $0xf749; BYTE $0xd0 // not r8
    WORD $0x014d; BYTE $0xc8 // add r8, r9
    WORD $0x894c; BYTE $0xc8 // mov rax, r9
    LONG $0x03e08348 // and rax, 3
    JE LBB0_5

// Peel single elements until the pending count is a multiple of four.
LBB0_4:
    LONG $0x9f148b42 // mov edx, dword [rdi + 4*r11]
    WORD $0xaf0f; BYTE $0xd1 // imul edx, ecx
    LONG $0x9e148942 // mov dword [rsi + 4*r11], edx
    LONG $0x01c38349 // add r11, 1
    LONG $0xffc08348 // add rax, -1
    JNE LBB0_4

// Fewer than four elements were pending on entry to LBB0_3: all done.
LBB0_5:
    LONG $0x03f88349 // cmp r8, 3
    JB LBB0_16

// Four elements per pass until r11 reaches the count.
LBB0_6:
    LONG $0x9f048b42 // mov eax, dword [rdi + 4*r11]
    WORD $0xaf0f; BYTE $0xc1 // imul eax, ecx
    LONG $0x9e048942 // mov dword [rsi + 4*r11], eax
    LONG $0x9f448b42; BYTE $0x04 // mov eax, dword [rdi + 4*r11 + 4]
    WORD $0xaf0f; BYTE $0xc1 // imul eax, ecx
    LONG $0x9e448942; BYTE $0x04 // mov dword [rsi + 4*r11 + 4], eax
    LONG $0x9f448b42; BYTE $0x08 // mov eax, dword [rdi + 4*r11 + 8]
    WORD $0xaf0f; BYTE $0xc1 // imul eax, ecx
    LONG $0x9e448942; BYTE $0x08 // mov dword [rsi + 4*r11 + 8], eax
    LONG $0x9f448b42; BYTE $0x0c // mov eax, dword [rdi + 4*r11 + 12]
    WORD $0xaf0f; BYTE $0xc1 // imul eax, ecx
    LONG $0x9e448942; BYTE $0x0c // mov dword [rsi + 4*r11 + 12], eax
    LONG $0x04c38349 // add r11, 4
    WORD $0x394d; BYTE $0xd9 // cmp r9, r11
    JNE LBB0_6
    JMP LBB0_16

// Vector setup. r11 = count & -32 (elements handled by AVX2),
// ymm0 = ecx broadcast to 8 dword lanes, r8 = number of 32-element chunks.
LBB0_9:
    WORD $0x8945; BYTE $0xcb // mov r11d, r9d
    LONG $0xe0e38341 // and r11d, -32
    LONG $0xc16ef9c5 // vmovd xmm0, ecx
    LONG $0x587de2c4; BYTE $0xc0 // vpbroadcastd ymm0, xmm0
    LONG $0xe0438d49 // lea rax, [r11 - 32]
    WORD $0x8949; BYTE $0xc0 // mov r8, rax
    LONG $0x05e8c149 // shr r8, 5
    LONG $0x01c08349 // add r8, 1
    WORD $0x8548; BYTE $0xc0 // test rax, rax
    JE LBB0_10
    // r10 = -(r8 & -2): counts chunk pairs up towards zero.
    WORD $0x894d; BYTE $0xc2 // mov r10, r8
    LONG $0xfee28349 // and r10, -2
    WORD $0xf749; BYTE $0xda // neg r10
    WORD $0xc031 // xor eax, eax

// Two 32-element chunks (64 elements) per pass; rax is the element index.
LBB0_12:
    LONG $0x407de2c4; WORD $0x870c // vpmulld ymm1, ymm0, yword [rdi + 4*rax]
    LONG $0x407de2c4; WORD $0x8754; BYTE $0x20 // vpmulld ymm2, ymm0, yword [rdi + 4*rax + 32]
    LONG $0x407de2c4; WORD $0x875c; BYTE $0x40 // vpmulld ymm3, ymm0, yword [rdi + 4*rax + 64]
    LONG $0x407de2c4; WORD $0x8764; BYTE $0x60 // vpmulld ymm4, ymm0, yword [rdi + 4*rax + 96]
    LONG $0x0c7ffec5; BYTE $0x86 // vmovdqu yword [rsi + 4*rax], ymm1
    LONG $0x547ffec5; WORD $0x2086 // vmovdqu yword [rsi + 4*rax + 32], ymm2
    LONG $0x5c7ffec5; WORD $0x4086 // vmovdqu yword [rsi + 4*rax + 64], ymm3
    LONG $0x647ffec5; WORD $0x6086 // vmovdqu yword [rsi + 4*rax + 96], ymm4
    QUAD $0x0080878c407de2c4; WORD $0x0000 // vpmulld ymm1, ymm0, yword [rdi + 4*rax + 128]
    QUAD $0x00a08794407de2c4; WORD $0x0000 // vpmulld ymm2, ymm0, yword [rdi + 4*rax + 160]
    QUAD $0x00c0879c407de2c4; WORD $0x0000 // vpmulld ymm3, ymm0, yword [rdi + 4*rax + 192]
    QUAD $0x00e087a4407de2c4; WORD $0x0000 // vpmulld ymm4, ymm0, yword [rdi + 4*rax + 224]
    QUAD $0x000080868c7ffec5; BYTE $0x00 // vmovdqu yword [rsi + 4*rax + 128], ymm1
    QUAD $0x0000a086947ffec5; BYTE $0x00 // vmovdqu yword [rsi + 4*rax + 160], ymm2
    QUAD $0x0000c0869c7ffec5; BYTE $0x00 // vmovdqu yword [rsi + 4*rax + 192], ymm3
    QUAD $0x0000e086a47ffec5; BYTE $0x00 // vmovdqu yword [rsi + 4*rax + 224], ymm4
    LONG $0x40c08348 // add rax, 64
    LONG $0x02c28349 // add r10, 2
    JNE LBB0_12
    // An odd chunk count leaves one more 32-element chunk.
    LONG $0x01c0f641 // test r8b, 1
    JE LBB0_15

// Final single 32-element chunk; ymm0 is reused as the last destination.
LBB0_14:
    LONG $0x407de2c4; WORD $0x870c // vpmulld ymm1, ymm0, yword [rdi + 4*rax]
    LONG $0x407de2c4; WORD $0x8754; BYTE $0x20 // vpmulld ymm2, ymm0, yword [rdi + 4*rax + 32]
    LONG $0x407de2c4; WORD $0x875c; BYTE $0x40 // vpmulld ymm3, ymm0, yword [rdi + 4*rax + 64]
    LONG $0x407de2c4; WORD $0x8744; BYTE $0x60 // vpmulld ymm0, ymm0, yword [rdi + 4*rax + 96]
    LONG $0x0c7ffec5; BYTE $0x86 // vmovdqu yword [rsi + 4*rax], ymm1
    LONG $0x547ffec5; WORD $0x2086 // vmovdqu yword [rsi + 4*rax + 32], ymm2
    LONG $0x5c7ffec5; WORD $0x4086 // vmovdqu yword [rsi + 4*rax + 64], ymm3
    LONG $0x447ffec5; WORD $0x6086 // vmovdqu yword [rsi + 4*rax + 96], ymm0

// Elements past the vectorised prefix go to the scalar remainder.
LBB0_15:
    WORD $0x394d; BYTE $0xcb // cmp r11, r9
    JNE LBB0_3

// Common exit.
LBB0_16:
    VZEROUPPER
    RET

// Exactly one 32-element chunk: start at index 0 and skip the paired loop.
LBB0_10:
    WORD $0xc031 // xor eax, eax
    LONG $0x01c0f641 // test r8b, 1
    JNE LBB0_14
    JMP LBB0_15

// _divide_constant_int32_int32_avx2: dest[i] = src[i] / factor for
// i in [0, len). Each 32-bit source element is sign-extended to 64 bits,
// divided by the 64-bit factor in rcx, and eax (the low 32 bits of the
// quotient) is stored.
//
// Only scalar div/idiv instructions appear in this routine. Elements are
// handled two per pass (LBB1_10 .. LBB1_16); LBB1_3 handles the last element
// of an odd count. Each division picks its form at run time: if neither
// operand has a bit set above bit 31 the 32-bit unsigned `div ecx` is used,
// otherwise the 64-bit signed `cqo; idiv rcx`.
// NOTE(review): no zero check on factor is visible here -- presumably the
// caller guarantees a non-zero divisor; confirm.
TEXT ·_divide_constant_int32_int32_avx2(SB), $0-32

    MOVQ src+0(FP), DI
    MOVQ dest+8(FP), SI
    MOVQ len+16(FP), DX
    MOVQ factor+24(FP), CX

    WORD $0xd285 // test edx, edx
    JLE LBB1_8
    WORD $0x8941; BYTE $0xd1 // mov r9d, edx
    WORD $0xfa83; BYTE $0x01 // cmp edx, 1
    JNE LBB1_9
    WORD $0x3145; BYTE $0xc0 // xor r8d, r8d

// Odd tail: if the count is odd, divide the single element at index r8.
LBB1_3:
    LONG $0x01c1f641 // test r9b, 1
    JE LBB1_8
    LONG $0x8704634a // movsxd rax, dword [rdi + 4*r8]
    WORD $0x8948; BYTE $0xc2 // mov rdx, rax
    WORD $0x0948; BYTE $0xca // or rdx, rcx
    LONG $0x20eac148 // shr rdx, 32
    JE LBB1_5
    WORD $0x9948 // cqo
    WORD $0xf748; BYTE $0xf9 // idiv rcx
    JMP LBB1_7

// Pair loop setup: r10 = count & -2 (elements handled in pairs), r8 = 0.
LBB1_9:
    WORD $0x8945; BYTE $0xca // mov r10d, r9d
    LONG $0xfee28341 // and r10d, -2
    WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
    JMP LBB1_10

// Second element of the pair, 64-bit signed division.
LBB1_15:
    WORD $0x9948 // cqo
    WORD $0xf748; BYTE $0xf9 // idiv rcx

// Store the second quotient and advance by two; leave the loop at r10.
LBB1_16:
    LONG $0x86448942; BYTE $0x04 // mov dword [rsi + 4*r8 + 4], eax
    LONG $0x02c08349 // add r8, 2
    WORD $0x394d; BYTE $0xc2 // cmp r10, r8
    JE LBB1_3

// First element of the pair: pick 32-bit or 64-bit division.
LBB1_10:
    LONG $0x8704634a // movsxd rax, dword [rdi + 4*r8]
    WORD $0x8948; BYTE $0xc2 // mov rdx, rax
    WORD $0x0948; BYTE $0xca // or rdx, rcx
    LONG $0x20eac148 // shr rdx, 32
    JE LBB1_11
    WORD $0x9948 // cqo
    WORD $0xf748; BYTE $0xf9 // idiv rcx
    JMP LBB1_13

// First element, 32-bit unsigned division.
LBB1_11:
    WORD $0xd231 // xor edx, edx
    WORD $0xf1f7 // div ecx

// Store the first quotient, then load and classify the second element.
LBB1_13:
    LONG $0x86048942 // mov dword [rsi + 4*r8], eax
    LONG $0x8744634a; BYTE $0x04 // movsxd rax, dword [rdi + 4*r8 + 4]
    WORD $0x8948; BYTE $0xc2 // mov rdx, rax
    WORD $0x0948; BYTE $0xca // or rdx, rcx
    LONG $0x20eac148 // shr rdx, 32
    JNE LBB1_15
    WORD $0xd231 // xor edx, edx
    WORD $0xf1f7 // div ecx
    JMP LBB1_16

// Odd tail, 32-bit unsigned division.
LBB1_5:
    WORD $0xd231 // xor edx, edx
    WORD $0xf1f7 // div ecx

// Odd tail, store the quotient.
LBB1_7:
    LONG $0x86048942 // mov dword [rsi + 4*r8], eax

LBB1_8:
    RET

// _multiply_constant_int32_int64_avx2: dest[i] = sign_extend(src[i]) * factor
// for i in [0, len). Source elements are 32 bits, destination elements are
// 64 bits, and the full rcx is the multiplier.
//
// With more than 15 elements the first (len & -16) are processed 16 per pass
// in the AVX2 loop at LBB2_4; the rest (or everything when len <= 15) uses the
// scalar loop at LBB2_6. Each 64-bit lane product is assembled from 32-bit
// vpmuludq partial products:
//   lo(f)*lo(x) + ((hi(f)*lo(x) + lo(f)*hi(x)) << 32)
TEXT ·_multiply_constant_int32_int64_avx2(SB), $0-32

    MOVQ src+0(FP), DI
    MOVQ dest+8(FP), SI
    MOVQ len+16(FP), DX
    MOVQ factor+24(FP), CX

    WORD $0xd285 // test edx, edx
    JLE LBB2_7
    WORD $0x8941; BYTE $0xd0 // mov r8d, edx
    WORD $0xfa83; BYTE $0x0f // cmp edx, 15
    JA LBB2_3
    // Short input: scalar loop from index 0.
    WORD $0xd231 // xor edx, edx
    JMP LBB2_6

// Vector setup. rdx = count & -16, ymm0 = factor in 4 qword lanes,
// ymm1 = ymm0 >> 32 (high half of the factor in each lane), rax = index 0.
LBB2_3:
    WORD $0x8944; BYTE $0xc2 // mov edx, r8d
    WORD $0xe283; BYTE $0xf0 // and edx, -16
    LONG $0x6ef9e1c4; BYTE $0xc1 // vmovq xmm0, rcx
    LONG $0x597de2c4; BYTE $0xc0 // vpbroadcastq ymm0, xmm0
    WORD $0xc031 // xor eax, eax
    LONG $0xd073f5c5; BYTE $0x20 // vpsrlq ymm1, ymm0, 32

// 16 elements per pass: four sign-extending loads, four lane products,
// four 32-byte stores.
LBB2_4:
    LONG $0x257de2c4; WORD $0x8714 // vpmovsxdq ymm2, oword [rdi + 4*rax]
    LONG $0x257de2c4; WORD $0x875c; BYTE $0x10 // vpmovsxdq ymm3, oword [rdi + 4*rax + 16]
    LONG $0x257de2c4; WORD $0x8764; BYTE $0x20 // vpmovsxdq ymm4, oword [rdi + 4*rax + 32]
    LONG $0x257de2c4; WORD $0x876c; BYTE $0x30 // vpmovsxdq ymm5, oword [rdi + 4*rax + 48]
    LONG $0xf2f4f5c5 // vpmuludq ymm6, ymm1, ymm2
    LONG $0xd273c5c5; BYTE $0x20 // vpsrlq ymm7, ymm2, 32
    LONG $0xfff4fdc5 // vpmuludq ymm7, ymm0, ymm7
    LONG $0xf6d4c5c5 // vpaddq ymm6, ymm7, ymm6
    LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
    LONG $0xd2f4fdc5 // vpmuludq ymm2, ymm0, ymm2
    LONG $0xd6d4edc5 // vpaddq ymm2, ymm2, ymm6
    LONG $0xf3f4f5c5 // vpmuludq ymm6, ymm1, ymm3
    LONG $0xd373c5c5; BYTE $0x20 // vpsrlq ymm7, ymm3, 32
    LONG $0xfff4fdc5 // vpmuludq ymm7, ymm0, ymm7
    LONG $0xf6d4c5c5 // vpaddq ymm6, ymm7, ymm6
    LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
    LONG $0xdbf4fdc5 // vpmuludq ymm3, ymm0, ymm3
    LONG $0xded4e5c5 // vpaddq ymm3, ymm3, ymm6
    LONG $0xf4f4f5c5 // vpmuludq ymm6, ymm1, ymm4
    LONG $0xd473c5c5; BYTE $0x20 // vpsrlq ymm7, ymm4, 32
    LONG $0xfff4fdc5 // vpmuludq ymm7, ymm0, ymm7
    LONG $0xf6d4c5c5 // vpaddq ymm6, ymm7, ymm6
    LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
    LONG $0xe4f4fdc5 // vpmuludq ymm4, ymm0, ymm4
    LONG $0xe6d4ddc5 // vpaddq ymm4, ymm4, ymm6
    LONG $0xf5f4f5c5 // vpmuludq ymm6, ymm1, ymm5
    LONG $0xd573c5c5; BYTE $0x20 // vpsrlq ymm7, ymm5, 32
    LONG $0xfff4fdc5 // vpmuludq ymm7, ymm0, ymm7
    LONG $0xf6d4c5c5 // vpaddq ymm6, ymm7, ymm6
    LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
    LONG $0xedf4fdc5 // vpmuludq ymm5, ymm0, ymm5
    LONG $0xeed4d5c5 // vpaddq ymm5, ymm5, ymm6
    LONG $0x147ffec5; BYTE $0xc6 // vmovdqu yword [rsi + 8*rax], ymm2
    LONG $0x5c7ffec5; WORD $0x20c6 // vmovdqu yword [rsi + 8*rax + 32], ymm3
    LONG $0x647ffec5; WORD $0x40c6 // vmovdqu yword [rsi + 8*rax + 64], ymm4
    LONG $0x6c7ffec5; WORD $0x60c6 // vmovdqu yword [rsi + 8*rax + 96], ymm5
    LONG $0x10c08348 // add rax, 16
    WORD $0x3948; BYTE $0xc2 // cmp rdx, rax
    JNE LBB2_4
    // Done if the vectorised prefix covered the whole input.
    WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
    JE LBB2_7

// Scalar loop: one element per pass from index rdx up to the count in r8.
LBB2_6:
    LONG $0x97046348 // movsxd rax, dword [rdi + 4*rdx]
    LONG $0xc1af0f48 // imul rax, rcx
    LONG $0xd6048948 // mov qword [rsi + 8*rdx], rax
    LONG $0x01c28348 // add rdx, 1
    WORD $0x3949; BYTE $0xd0 // cmp r8, rdx
    JNE LBB2_6

LBB2_7:
    VZEROUPPER
    RET

// _divide_constant_int32_int64_avx2: dest[i] = src[i] / factor for
// i in [0, len). Each 32-bit source element is sign-extended to 64 bits,
// divided by the 64-bit factor in rcx, and the quotient is stored as a
// 64-bit element.
//
// Only scalar div/idiv instructions appear in this routine. Elements are
// handled two per pass (LBB3_10 .. LBB3_16); LBB3_3 handles the last element
// of an odd count. If neither operand has a bit set above bit 31 the 32-bit
// unsigned `div ecx` is used (its eax result is stored through rax),
// otherwise the 64-bit signed `cqo; idiv rcx`.
// NOTE(review): no zero check on factor is visible here -- presumably the
// caller guarantees a non-zero divisor; confirm.
TEXT ·_divide_constant_int32_int64_avx2(SB), $0-32

    MOVQ src+0(FP), DI
    MOVQ dest+8(FP), SI
    MOVQ len+16(FP), DX
    MOVQ factor+24(FP), CX

    WORD $0xd285 // test edx, edx
    JLE LBB3_8
    WORD $0x8941; BYTE $0xd1 // mov r9d, edx
    WORD $0xfa83; BYTE $0x01 // cmp edx, 1
    JNE LBB3_9
    WORD $0x3145; BYTE $0xc0 // xor r8d, r8d

// Odd tail: if the count is odd, divide the single element at index r8.
LBB3_3:
    LONG $0x01c1f641 // test r9b, 1
    JE LBB3_8
    LONG $0x8704634a // movsxd rax, dword [rdi + 4*r8]
    WORD $0x8948; BYTE $0xc2 // mov rdx, rax
    WORD $0x0948; BYTE $0xca // or rdx, rcx
    LONG $0x20eac148 // shr rdx, 32
    JE LBB3_5
    WORD $0x9948 // cqo
    WORD $0xf748; BYTE $0xf9 // idiv rcx
    JMP LBB3_7

// Pair loop setup: r10 = count & -2 (elements handled in pairs), r8 = 0.
LBB3_9:
    WORD $0x8945; BYTE $0xca // mov r10d, r9d
    LONG $0xfee28341 // and r10d, -2
    WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
    JMP LBB3_10

// Second element of the pair, 64-bit signed division.
LBB3_15:
    WORD $0x9948 // cqo
    WORD $0xf748; BYTE $0xf9 // idiv rcx

// Store the second quotient and advance by two; leave the loop at r10.
LBB3_16:
    LONG $0xc644894a; BYTE $0x08 // mov qword [rsi + 8*r8 + 8], rax
    LONG $0x02c08349 // add r8, 2
    WORD $0x394d; BYTE $0xc2 // cmp r10, r8
    JE LBB3_3

// First element of the pair: pick 32-bit or 64-bit division.
LBB3_10:
    LONG $0x8704634a // movsxd rax, dword [rdi + 4*r8]
    WORD $0x8948; BYTE $0xc2 // mov rdx, rax
    WORD $0x0948; BYTE $0xca // or rdx, rcx
    LONG $0x20eac148 // shr rdx, 32
    JE LBB3_11
    WORD $0x9948 // cqo
    WORD $0xf748; BYTE $0xf9 // idiv rcx
    JMP LBB3_13

// First element, 32-bit unsigned division.
LBB3_11:
    WORD $0xd231 // xor edx, edx
    WORD $0xf1f7 // div ecx

// Store the first quotient, then load and classify the second element.
LBB3_13:
    LONG $0xc604894a // mov qword [rsi + 8*r8], rax
    LONG $0x8744634a; BYTE $0x04 // movsxd rax, dword [rdi + 4*r8 + 4]
    WORD $0x8948; BYTE $0xc2 // mov rdx, rax
    WORD $0x0948; BYTE $0xca // or rdx, rcx
    LONG $0x20eac148 // shr rdx, 32
    JNE LBB3_15
    WORD $0xd231 // xor edx, edx
    WORD $0xf1f7 // div ecx
    JMP LBB3_16

// Odd tail, 32-bit unsigned division.
LBB3_5:
    WORD $0xd231 // xor edx, edx
    WORD $0xf1f7 // div ecx

// Odd tail, store the quotient.
LBB3_7:
    LONG $0xc604894a // mov qword [rsi + 8*r8], rax

LBB3_8:
    RET

// _multiply_constant_int64_int32_avx2: dest[i] = low32(src[i]) * low32(factor)
// for i in [0, len). Source elements are 64 bits wide (8*index addressing)
// but only their low dword is read; destination elements are 32 bits.
//
// With more than 15 elements the first (len & -16) are processed 16 per pass
// in the loop at LBB4_4; the rest (or everything when len <= 15) uses the
// scalar loop at LBB4_6. In the vector loop `vshufps ..., 136` gathers the
// low dwords of four consecutive 64-bit elements (two 16-byte halves) into
// one xmm register, and the same shuffle on xmm0/xmm1 produces the factor's
// low dword in all four lanes for vpmulld.
TEXT ·_multiply_constant_int64_int32_avx2(SB), $0-32

    MOVQ src+0(FP), DI
    MOVQ dest+8(FP), SI
    MOVQ len+16(FP), DX
    MOVQ factor+24(FP), CX

    WORD $0xd285 // test edx, edx
    JLE LBB4_7
    WORD $0x8941; BYTE $0xd0 // mov r8d, edx
    WORD $0xfa83; BYTE $0x0f // cmp edx, 15
    JA LBB4_3
    // Short input: scalar loop from index 0.
    WORD $0xd231 // xor edx, edx
    JMP LBB4_6

// Vector setup. rdx = count & -16, ymm0 = factor in 4 qword lanes,
// xmm1 = upper 128 bits of ymm0, rax = index 0.
LBB4_3:
    WORD $0x8944; BYTE $0xc2 // mov edx, r8d
    WORD $0xe283; BYTE $0xf0 // and edx, -16
    LONG $0x6ef9e1c4; BYTE $0xc1 // vmovq xmm0, rcx
    LONG $0x597de2c4; BYTE $0xc0 // vpbroadcastq ymm0, xmm0
    WORD $0xc031 // xor eax, eax
    LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1

// 16 elements per pass: 128 source bytes in, 64 destination bytes out.
LBB4_4:
    LONG $0x1410f8c5; BYTE $0xc7 // vmovups xmm2, oword [rdi + 8*rax]
    LONG $0x5c10f8c5; WORD $0x20c7 // vmovups xmm3, oword [rdi + 8*rax + 32]
    LONG $0x6410f8c5; WORD $0x40c7 // vmovups xmm4, oword [rdi + 8*rax + 64]
    LONG $0x6c10f8c5; WORD $0x60c7 // vmovups xmm5, oword [rdi + 8*rax + 96]
    LONG $0x54c6e8c5; WORD $0x10c7; BYTE $0x88 // vshufps xmm2, xmm2, oword [rdi + 8*rax + 16], 136
    LONG $0xf1c6f8c5; BYTE $0x88 // vshufps xmm6, xmm0, xmm1, 136
    LONG $0x4069e2c4; BYTE $0xd6 // vpmulld xmm2, xmm2, xmm6
    LONG $0x5cc6e0c5; WORD $0x30c7; BYTE $0x88 // vshufps xmm3, xmm3, oword [rdi + 8*rax + 48], 136
    LONG $0xf1c6f8c5; BYTE $0x88 // vshufps xmm6, xmm0, xmm1, 136
    LONG $0x4061e2c4; BYTE $0xde // vpmulld xmm3, xmm3, xmm6
    LONG $0x64c6d8c5; WORD $0x50c7; BYTE $0x88 // vshufps xmm4, xmm4, oword [rdi + 8*rax + 80], 136
    LONG $0xf1c6f8c5; BYTE $0x88 // vshufps xmm6, xmm0, xmm1, 136
    LONG $0x4059e2c4; BYTE $0xe6 // vpmulld xmm4, xmm4, xmm6
    LONG $0x6cc6d0c5; WORD $0x70c7; BYTE $0x88 // vshufps xmm5, xmm5, oword [rdi + 8*rax + 112], 136
    LONG $0xf1c6f8c5; BYTE $0x88 // vshufps xmm6, xmm0, xmm1, 136
    LONG $0x4051e2c4; BYTE $0xee // vpmulld xmm5, xmm5, xmm6
    LONG $0x147ffac5; BYTE $0x86 // vmovdqu oword [rsi + 4*rax], xmm2
    LONG $0x5c7ffac5; WORD $0x1086 // vmovdqu oword [rsi + 4*rax + 16], xmm3
    LONG $0x647ffac5; WORD $0x2086 // vmovdqu oword [rsi + 4*rax + 32], xmm4
    LONG $0x6c7ffac5; WORD $0x3086 // vmovdqu oword [rsi + 4*rax + 48], xmm5
    LONG $0x10c08348 // add rax, 16
    WORD $0x3948; BYTE $0xc2 // cmp rdx, rax
    JNE LBB4_4
    // Done if the vectorised prefix covered the whole input.
    WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
    JE LBB4_7

// Scalar loop: one element per pass from index rdx up to the count in r8.
LBB4_6:
    WORD $0x048b; BYTE $0xd7 // mov eax, dword [rdi + 8*rdx]
    WORD $0xaf0f; BYTE $0xc1 // imul eax, ecx
    WORD $0x0489; BYTE $0x96 // mov dword [rsi + 4*rdx], eax
    LONG $0x01c28348 // add rdx, 1
    WORD $0x3949; BYTE $0xd0 // cmp r8, rdx
    JNE LBB4_6

LBB4_7:
    VZEROUPPER
    RET

// _divide_constant_int64_int32_avx2: dest[i] = src[i] / factor for
// i in [0, len). Source elements are 64 bits; eax (the low 32 bits of the
// quotient) is stored into 32-bit destination elements.
//
// Only scalar div/idiv instructions appear in this routine. Elements are
// handled two per pass (LBB5_10 .. LBB5_16); LBB5_3 handles the last element
// of an odd count. If neither operand has a bit set above bit 31 the 32-bit
// unsigned `div ecx` is used, otherwise the 64-bit signed `cqo; idiv rcx`.
// NOTE(review): no zero check on factor is visible here -- presumably the
// caller guarantees a non-zero divisor; confirm.
TEXT ·_divide_constant_int64_int32_avx2(SB), $0-32

    MOVQ src+0(FP), DI
    MOVQ dest+8(FP), SI
    MOVQ len+16(FP), DX
    MOVQ factor+24(FP), CX

    WORD $0xd285 // test edx, edx
    JLE LBB5_8
    WORD $0x8941; BYTE $0xd1 // mov r9d, edx
    WORD $0xfa83; BYTE $0x01 // cmp edx, 1
    JNE LBB5_9
    WORD $0x3145; BYTE $0xc0 // xor r8d, r8d

// Odd tail: if the count is odd, divide the single element at index r8.
LBB5_3:
    LONG $0x01c1f641 // test r9b, 1
    JE LBB5_8
    LONG $0xc7048b4a // mov rax, qword [rdi + 8*r8]
    WORD $0x8948; BYTE $0xc2 // mov rdx, rax
    WORD $0x0948; BYTE $0xca // or rdx, rcx
    LONG $0x20eac148 // shr rdx, 32
    JE LBB5_5
    WORD $0x9948 // cqo
    WORD $0xf748; BYTE $0xf9 // idiv rcx
    JMP LBB5_7

// Pair loop setup: r10 = count & -2 (elements handled in pairs), r8 = 0.
LBB5_9:
    WORD $0x8945; BYTE $0xca // mov r10d, r9d
    LONG $0xfee28341 // and r10d, -2
    WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
    JMP LBB5_10

// Second element of the pair, 64-bit signed division.
LBB5_15:
    WORD $0x9948 // cqo
    WORD $0xf748; BYTE $0xf9 // idiv rcx

// Store the second quotient and advance by two; leave the loop at r10.
LBB5_16:
    LONG $0x86448942; BYTE $0x04 // mov dword [rsi + 4*r8 + 4], eax
    LONG $0x02c08349 // add r8, 2
    WORD $0x394d; BYTE $0xc2 // cmp r10, r8
    JE LBB5_3

// First element of the pair: pick 32-bit or 64-bit division.
LBB5_10:
    LONG $0xc7048b4a // mov rax, qword [rdi + 8*r8]
    WORD $0x8948; BYTE $0xc2 // mov rdx, rax
    WORD $0x0948; BYTE $0xca // or rdx, rcx
    LONG $0x20eac148 // shr rdx, 32
    JE LBB5_11
    WORD $0x9948 // cqo
    WORD $0xf748; BYTE $0xf9 // idiv rcx
    JMP LBB5_13

// First element, 32-bit unsigned division.
LBB5_11:
    WORD $0xd231 // xor edx, edx
    WORD $0xf1f7 // div ecx

// Store the first quotient, then load and classify the second element.
LBB5_13:
    LONG $0x86048942 // mov dword [rsi + 4*r8], eax
    LONG $0xc7448b4a; BYTE $0x08 // mov rax, qword [rdi + 8*r8 + 8]
    WORD $0x8948; BYTE $0xc2 // mov rdx, rax
    WORD $0x0948; BYTE $0xca // or rdx, rcx
    LONG $0x20eac148 // shr rdx, 32
    JNE LBB5_15
    WORD $0xd231 // xor edx, edx
    WORD $0xf1f7 // div ecx
    JMP LBB5_16

// Odd tail, 32-bit unsigned division.
LBB5_5:
    WORD $0xd231 // xor edx, edx
    WORD $0xf1f7 // div ecx

// Odd tail, store the quotient.
LBB5_7:
    LONG $0x86048942 // mov dword [rsi + 4*r8], eax

LBB5_8:
    RET

// _multiply_constant_int64_int64_avx2: dest[i] = src[i] * factor for
// i in [0, len), with 64-bit source and destination elements and the full
// rcx as multiplier.
//
// When len > 15 and the src and dest ranges do not overlap, the first
// (len & -16) elements are processed by AVX2 in chunks of 16; whatever is
// left, or the whole input otherwise, goes through the scalar imul loops.
// Each 64-bit lane product is assembled from 32-bit vpmuludq partial
// products:
//   lo(x)*lo(f) + ((lo(x)*hi(f) + hi(x)*lo(f)) << 32)
TEXT ·_multiply_constant_int64_int64_avx2(SB), $0-32

    MOVQ src+0(FP), DI
    MOVQ dest+8(FP), SI
    MOVQ len+16(FP), DX
    MOVQ factor+24(FP), CX

    WORD $0xd285 // test edx, edx
    JLE LBB6_16
    WORD $0x8941; BYTE $0xd0 // mov r8d, edx
    WORD $0xfa83; BYTE $0x0f // cmp edx, 15
    JBE LBB6_2
    // Overlap test: vectorise only if src_end <= dest or dest_end <= src.
    LONG $0xc7048d4a // lea rax, [rdi + 8*r8]
    WORD $0x3948; BYTE $0xf0 // cmp rax, rsi
    JBE LBB6_9
    LONG $0xc6048d4a // lea rax, [rsi + 8*r8]
    WORD $0x3948; BYTE $0xf8 // cmp rax, rdi
    JBE LBB6_9

// Scalar path from index 0 (short input or overlapping buffers).
LBB6_2:
    WORD $0x3145; BYTE $0xdb // xor r11d, r11d

// Scalar remainder. r11 = next index, r8 = count.
// r9 = count - r11 - 1, rax = count & 3 (number of elements to peel).
LBB6_3:
    WORD $0x894d; BYTE $0xd9 // mov r9, r11
    WORD $0xf749; BYTE $0xd1 // not r9
    WORD $0x014d; BYTE $0xc1 // add r9, r8
    WORD $0x894c; BYTE $0xc0 // mov rax, r8
    LONG $0x03e08348 // and rax, 3
    JE LBB6_5

// Peel single elements until the pending count is a multiple of four.
LBB6_4:
    LONG $0xdf148b4a // mov rdx, qword [rdi + 8*r11]
    LONG $0xd1af0f48 // imul rdx, rcx
    LONG $0xde14894a // mov qword [rsi + 8*r11], rdx
    LONG $0x01c38349 // add r11, 1
    LONG $0xffc08348 // add rax, -1
    JNE LBB6_4

// Fewer than four elements were pending on entry to LBB6_3: all done.
LBB6_5:
    LONG $0x03f98349 // cmp r9, 3
    JB LBB6_16

// Four elements per pass until r11 reaches the count.
LBB6_6:
    LONG $0xdf048b4a // mov rax, qword [rdi + 8*r11]
    LONG $0xc1af0f48 // imul rax, rcx
    LONG $0xde04894a // mov qword [rsi + 8*r11], rax
    LONG $0xdf448b4a; BYTE $0x08 // mov rax, qword [rdi + 8*r11 + 8]
    LONG $0xc1af0f48 // imul rax, rcx
    LONG $0xde44894a; BYTE $0x08 // mov qword [rsi + 8*r11 + 8], rax
    LONG $0xdf448b4a; BYTE $0x10 // mov rax, qword [rdi + 8*r11 + 16]
    LONG $0xc1af0f48 // imul rax, rcx
    LONG $0xde44894a; BYTE $0x10 // mov qword [rsi + 8*r11 + 16], rax
    LONG $0xdf448b4a; BYTE $0x18 // mov rax, qword [rdi + 8*r11 + 24]
    LONG $0xc1af0f48 // imul rax, rcx
    LONG $0xde44894a; BYTE $0x18 // mov qword [rsi + 8*r11 + 24], rax
    LONG $0x04c38349 // add r11, 4
    WORD $0x394d; BYTE $0xd8 // cmp r8, r11
    JNE LBB6_6
    JMP LBB6_16

// Vector setup. r11 = count & -16 (elements handled by AVX2),
// ymm0 = factor in 4 qword lanes, ymm1 = ymm0 >> 32 (high half per lane),
// r9 = number of 16-element chunks.
LBB6_9:
    WORD $0x8945; BYTE $0xc3 // mov r11d, r8d
    LONG $0xf0e38341 // and r11d, -16
    LONG $0x6ef9e1c4; BYTE $0xc1 // vmovq xmm0, rcx
    LONG $0x597de2c4; BYTE $0xc0 // vpbroadcastq ymm0, xmm0
    LONG $0xf0438d49 // lea rax, [r11 - 16]
    WORD $0x8949; BYTE $0xc1 // mov r9, rax
    LONG $0x04e9c149 // shr r9, 4
    LONG $0x01c18349 // add r9, 1
    LONG $0xd073f5c5; BYTE $0x20 // vpsrlq ymm1, ymm0, 32
    WORD $0x8548; BYTE $0xc0 // test rax, rax
    JE LBB6_10
    // r10 = -(r9 & -2): counts chunk pairs up towards zero.
    WORD $0x894d; BYTE $0xca // mov r10, r9
    LONG $0xfee28349 // and r10, -2
    WORD $0xf749; BYTE $0xda // neg r10
    WORD $0xc031 // xor eax, eax

// Two 16-element chunks (32 elements) per pass; rax is the element index.
LBB6_12:
    LONG $0x146ffec5; BYTE $0xc7 // vmovdqu ymm2, yword [rdi + 8*rax]
    LONG $0x5c6ffec5; WORD $0x20c7 // vmovdqu ymm3, yword [rdi + 8*rax + 32]
    LONG $0x646ffec5; WORD $0x40c7 // vmovdqu ymm4, yword [rdi + 8*rax + 64]
    LONG $0x6c6ffec5; WORD $0x60c7 // vmovdqu ymm5, yword [rdi + 8*rax + 96]
    LONG $0xf1f4edc5 // vpmuludq ymm6, ymm2, ymm1
    LONG $0xd273c5c5; BYTE $0x20 // vpsrlq ymm7, ymm2, 32
    LONG $0xf8f4c5c5 // vpmuludq ymm7, ymm7, ymm0
    LONG $0xf7d4cdc5 // vpaddq ymm6, ymm6, ymm7
    LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
    LONG $0xd0f4edc5 // vpmuludq ymm2, ymm2, ymm0
    LONG $0xd6d4edc5 // vpaddq ymm2, ymm2, ymm6
    LONG $0xf1f4e5c5 // vpmuludq ymm6, ymm3, ymm1
    LONG $0xd373c5c5; BYTE $0x20 // vpsrlq ymm7, ymm3, 32
    LONG $0xf8f4c5c5 // vpmuludq ymm7, ymm7, ymm0
    LONG $0xf7d4cdc5 // vpaddq ymm6, ymm6, ymm7
    LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
    LONG $0xd8f4e5c5 // vpmuludq ymm3, ymm3, ymm0
    LONG $0xded4e5c5 // vpaddq ymm3, ymm3, ymm6
    LONG $0xf1f4ddc5 // vpmuludq ymm6, ymm4, ymm1
    LONG $0xd473c5c5; BYTE $0x20 // vpsrlq ymm7, ymm4, 32
    LONG $0xf8f4c5c5 // vpmuludq ymm7, ymm7, ymm0
    LONG $0xf7d4cdc5 // vpaddq ymm6, ymm6, ymm7
    LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
    LONG $0xe0f4ddc5 // vpmuludq ymm4, ymm4, ymm0
    LONG $0xe6d4ddc5 // vpaddq ymm4, ymm4, ymm6
    LONG $0xf1f4d5c5 // vpmuludq ymm6, ymm5, ymm1
    LONG $0xd573c5c5; BYTE $0x20 // vpsrlq ymm7, ymm5, 32
    LONG $0xf8f4c5c5 // vpmuludq ymm7, ymm7, ymm0
    LONG $0xf7d4cdc5 // vpaddq ymm6, ymm6, ymm7
    LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
    LONG $0xe8f4d5c5 // vpmuludq ymm5, ymm5, ymm0
    LONG $0xeed4d5c5 // vpaddq ymm5, ymm5, ymm6
    LONG $0x147ffec5; BYTE $0xc6 // vmovdqu yword [rsi + 8*rax], ymm2
    LONG $0x5c7ffec5; WORD $0x20c6 // vmovdqu yword [rsi + 8*rax + 32], ymm3
    LONG $0x647ffec5; WORD $0x40c6 // vmovdqu yword [rsi + 8*rax + 64], ymm4
    LONG $0x6c7ffec5; WORD $0x60c6 // vmovdqu yword [rsi + 8*rax + 96], ymm5
    QUAD $0x000080c7946ffec5; BYTE $0x00 // vmovdqu ymm2, yword [rdi + 8*rax + 128]
    QUAD $0x0000a0c79c6ffec5; BYTE $0x00 // vmovdqu ymm3, yword [rdi + 8*rax + 160]
    QUAD $0x0000c0c7a46ffec5; BYTE $0x00 // vmovdqu ymm4, yword [rdi + 8*rax + 192]
    QUAD $0x0000e0c7ac6ffec5; BYTE $0x00 // vmovdqu ymm5, yword [rdi + 8*rax + 224]
    LONG $0xf1f4edc5 // vpmuludq ymm6, ymm2, ymm1
    LONG $0xd273c5c5; BYTE $0x20 // vpsrlq ymm7, ymm2, 32
    LONG $0xf8f4c5c5 // vpmuludq ymm7, ymm7, ymm0
    LONG $0xf7d4cdc5 // vpaddq ymm6, ymm6, ymm7
    LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
    LONG $0xd0f4edc5 // vpmuludq ymm2, ymm2, ymm0
    LONG $0xd6d4edc5 // vpaddq ymm2, ymm2, ymm6
    LONG $0xf1f4e5c5 // vpmuludq ymm6, ymm3, ymm1
    LONG $0xd373c5c5; BYTE $0x20 // vpsrlq ymm7, ymm3, 32
    LONG $0xf8f4c5c5 // vpmuludq ymm7, ymm7, ymm0
    LONG $0xf7d4cdc5 // vpaddq ymm6, ymm6, ymm7
    LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
    LONG $0xd8f4e5c5 // vpmuludq ymm3, ymm3, ymm0
    LONG $0xded4e5c5 // vpaddq ymm3, ymm3, ymm6
    LONG $0xf1f4ddc5 // vpmuludq ymm6, ymm4, ymm1
    LONG $0xd473c5c5; BYTE $0x20 // vpsrlq ymm7, ymm4, 32
    LONG $0xf8f4c5c5 // vpmuludq ymm7, ymm7, ymm0
    LONG $0xf7d4cdc5 // vpaddq ymm6, ymm6, ymm7
    LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
    LONG $0xe0f4ddc5 // vpmuludq ymm4, ymm4, ymm0
    LONG $0xe6d4ddc5 // vpaddq ymm4, ymm4, ymm6
    LONG $0xf1f4d5c5 // vpmuludq ymm6, ymm5, ymm1
    LONG $0xd573c5c5; BYTE $0x20 // vpsrlq ymm7, ymm5, 32
    LONG $0xf8f4c5c5 // vpmuludq ymm7, ymm7, ymm0
    LONG $0xf7d4cdc5 // vpaddq ymm6, ymm6, ymm7
    LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
    LONG $0xe8f4d5c5 // vpmuludq ymm5, ymm5, ymm0
    LONG $0xeed4d5c5 // vpaddq ymm5, ymm5, ymm6
    QUAD $0x000080c6947ffec5; BYTE $0x00 // vmovdqu yword [rsi + 8*rax + 128], ymm2
    QUAD $0x0000a0c69c7ffec5; BYTE $0x00 // vmovdqu yword [rsi + 8*rax + 160], ymm3
    QUAD $0x0000c0c6a47ffec5; BYTE $0x00 // vmovdqu yword [rsi + 8*rax + 192], ymm4
    QUAD $0x0000e0c6ac7ffec5; BYTE $0x00 // vmovdqu yword [rsi + 8*rax + 224], ymm5
    LONG $0x20c08348 // add rax, 32
    LONG $0x02c28349 // add r10, 2
    JNE LBB6_12
    // An odd chunk count leaves one more 16-element chunk.
    LONG $0x01c1f641 // test r9b, 1
    JE LBB6_15

// Final single 16-element chunk; ymm0/ymm1 are reused as scratch for the
// last lane product because the factor is no longer needed afterwards.
LBB6_14:
    LONG $0x146ffec5; BYTE $0xc7 // vmovdqu ymm2, yword [rdi + 8*rax]
    LONG $0x5c6ffec5; WORD $0x20c7 // vmovdqu ymm3, yword [rdi + 8*rax + 32]
    LONG $0x646ffec5; WORD $0x40c7 // vmovdqu ymm4, yword [rdi + 8*rax + 64]
    LONG $0x6c6ffec5; WORD $0x60c7 // vmovdqu ymm5, yword [rdi + 8*rax + 96]
    LONG $0xf1f4edc5 // vpmuludq ymm6, ymm2, ymm1
    LONG $0xd273c5c5; BYTE $0x20 // vpsrlq ymm7, ymm2, 32
    LONG $0xf8f4c5c5 // vpmuludq ymm7, ymm7, ymm0
    LONG $0xf7d4cdc5 // vpaddq ymm6, ymm6, ymm7
    LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
    LONG $0xd0f4edc5 // vpmuludq ymm2, ymm2, ymm0
    LONG $0xd6d4edc5 // vpaddq ymm2, ymm2, ymm6
    LONG $0xf1f4e5c5 // vpmuludq ymm6, ymm3, ymm1
    LONG $0xd373c5c5; BYTE $0x20 // vpsrlq ymm7, ymm3, 32
    LONG $0xf8f4c5c5 // vpmuludq ymm7, ymm7, ymm0
    LONG $0xf7d4cdc5 // vpaddq ymm6, ymm6, ymm7
    LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
    LONG $0xd8f4e5c5 // vpmuludq ymm3, ymm3, ymm0
    LONG $0xded4e5c5 // vpaddq ymm3, ymm3, ymm6
    LONG $0xf1f4ddc5 // vpmuludq ymm6, ymm4, ymm1
    LONG $0xd473c5c5; BYTE $0x20 // vpsrlq ymm7, ymm4, 32
    LONG $0xf8f4c5c5 // vpmuludq ymm7, ymm7, ymm0
    LONG $0xf7d4cdc5 // vpaddq ymm6, ymm6, ymm7
    LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
    LONG $0xe0f4ddc5 // vpmuludq ymm4, ymm4, ymm0
    LONG $0xe6d4ddc5 // vpaddq ymm4, ymm4, ymm6
    LONG $0xc9f4d5c5 // vpmuludq ymm1, ymm5, ymm1
    LONG $0xd573cdc5; BYTE $0x20 // vpsrlq ymm6, ymm5, 32
    LONG $0xf0f4cdc5 // vpmuludq ymm6, ymm6, ymm0
    LONG $0xced4f5c5 // vpaddq ymm1, ymm1, ymm6
    LONG $0xf173f5c5; BYTE $0x20 // vpsllq ymm1, ymm1, 32
    LONG $0xc0f4d5c5 // vpmuludq ymm0, ymm5, ymm0
    LONG $0xc1d4fdc5 // vpaddq ymm0, ymm0, ymm1
    LONG $0x147ffec5; BYTE $0xc6 // vmovdqu yword [rsi + 8*rax], ymm2
    LONG $0x5c7ffec5; WORD $0x20c6 // vmovdqu yword [rsi + 8*rax + 32], ymm3
    LONG $0x647ffec5; WORD $0x40c6 // vmovdqu yword [rsi + 8*rax + 64], ymm4
    LONG $0x447ffec5; WORD $0x60c6 // vmovdqu yword [rsi + 8*rax + 96], ymm0

// Elements past the vectorised prefix go to the scalar remainder.
LBB6_15:
    WORD $0x394d; BYTE $0xc3 // cmp r11, r8
    JNE LBB6_3

// Common exit.
LBB6_16:
    VZEROUPPER
    RET

// Exactly one 16-element chunk: start at index 0 and skip the paired loop.
LBB6_10:
    WORD $0xc031 // xor eax, eax
    LONG $0x01c1f641 // test r9b, 1
    JNE LBB6_14
    JMP LBB6_15

// _divide_constant_int64_int64_avx2: dest[i] = src[i] / factor for
// i in [0, len), with 64-bit source and destination elements and the full
// rcx as divisor.
//
// Only scalar div/idiv instructions appear in this routine. Elements are
// handled two per pass (LBB7_10 .. LBB7_16); LBB7_3 handles the last element
// of an odd count. If neither operand has a bit set above bit 31 the 32-bit
// unsigned `div ecx` is used (its eax result is stored through rax),
// otherwise the 64-bit signed `cqo; idiv rcx`.
// NOTE(review): no zero check on factor is visible here -- presumably the
// caller guarantees a non-zero divisor; confirm.
TEXT ·_divide_constant_int64_int64_avx2(SB), $0-32

    MOVQ src+0(FP), DI
    MOVQ dest+8(FP), SI
    MOVQ len+16(FP), DX
    MOVQ factor+24(FP), CX

    WORD $0xd285 // test edx, edx
    JLE LBB7_8
    WORD $0x8941; BYTE $0xd1 // mov r9d, edx
    WORD $0xfa83; BYTE $0x01 // cmp edx, 1
    JNE LBB7_9
    WORD $0x3145; BYTE $0xc0 // xor r8d, r8d

// Odd tail: if the count is odd, divide the single element at index r8.
LBB7_3:
    LONG $0x01c1f641 // test r9b, 1
    JE LBB7_8
    LONG $0xc7048b4a // mov rax, qword [rdi + 8*r8]
    WORD $0x8948; BYTE $0xc2 // mov rdx, rax
    WORD $0x0948; BYTE $0xca // or rdx, rcx
    LONG $0x20eac148 // shr rdx, 32
    JE LBB7_5
    WORD $0x9948 // cqo
    WORD $0xf748; BYTE $0xf9 // idiv rcx
    JMP LBB7_7

// Pair loop setup: r10 = count & -2 (elements handled in pairs), r8 = 0.
LBB7_9:
    WORD $0x8945; BYTE $0xca // mov r10d, r9d
    LONG $0xfee28341 // and r10d, -2
    WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
    JMP LBB7_10

// Second element of the pair, 64-bit signed division.
LBB7_15:
    WORD $0x9948 // cqo
    WORD $0xf748; BYTE $0xf9 // idiv rcx

// Store the second quotient and advance by two; leave the loop at r10.
LBB7_16:
    LONG $0xc644894a; BYTE $0x08 // mov qword [rsi + 8*r8 + 8], rax
    LONG $0x02c08349 // add r8, 2
    WORD $0x394d; BYTE $0xc2 // cmp r10, r8
    JE LBB7_3

// First element of the pair: pick 32-bit or 64-bit division.
LBB7_10:
    LONG $0xc7048b4a // mov rax, qword [rdi + 8*r8]
    WORD $0x8948; BYTE $0xc2 // mov rdx, rax
    WORD $0x0948; BYTE $0xca // or rdx, rcx
    LONG $0x20eac148 // shr rdx, 32
    JE LBB7_11
    WORD $0x9948 // cqo
    WORD $0xf748; BYTE $0xf9 // idiv rcx
    JMP LBB7_13

// First element, 32-bit unsigned division.
LBB7_11:
    WORD $0xd231 // xor edx, edx
    WORD $0xf1f7 // div ecx

// Store the first quotient, then load and classify the second element.
LBB7_13:
    LONG $0xc604894a // mov qword [rsi + 8*r8], rax
    LONG $0xc7448b4a; BYTE $0x08 // mov rax, qword [rdi + 8*r8 + 8]
    WORD $0x8948; BYTE $0xc2 // mov rdx, rax
    WORD $0x0948; BYTE $0xca // or rdx, rcx
    LONG $0x20eac148 // shr rdx, 32
    JNE LBB7_15
    WORD $0xd231 // xor edx, edx
    WORD $0xf1f7 // div ecx
    JMP LBB7_16

// Odd tail, 32-bit unsigned division.
LBB7_5:
    WORD $0xd231 // xor edx, edx
    WORD $0xf1f7 // div ecx

// Odd tail, store the quotient.
LBB7_7:
    LONG $0xc604894a // mov qword [rsi + 8*r8], rax

LBB7_8:
    RET