# Source path: gitee.com/quant1x/gox@v1.7.6/num/asm/_avx2/max.s
#
# x86-64 listing in AT&T syntax (op src, dst) using AVX/AVX2 instructions.
# The .LBB*/.LCPI* label scheme and the "# @name" markers suggest this is
# compiler output kept for reference -- TODO confirm the generating toolchain.
#
# Register usage visible in the code matches the System V AMD64 argument
# order: integer/pointer arguments arrive in %rdi, %rsi, %rdx; a scalar
# floating-point argument and the floating-point result use %xmm0.
# Only volatile registers are touched and no stack is used.

# ----------------------------------------------------------------------
# Max_F64(double *x, unsigned long n) -> double
#   In:   %rdi = x, %rsi = n
#   Out:  %xmm0 = largest element seen, seeded with -DBL_MAX
#   n == 0 returns the seed (-DBL_MAX) unchanged.
#   NOTE(review): because the seed is -DBL_MAX rather than -inf, an input
#   made up only of -inf yields -DBL_MAX.
# ----------------------------------------------------------------------
.LCPI0_0:
        .quad   0xffefffffffffffff              # -1.7976931348623157E+308 (lowest finite double)
Max_F64(double*, unsigned long):                # @Max_F64(double*, unsigned long)
        testq   %rsi, %rsi
        je      .LBB0_1                         # n == 0: return the seed
        cmpq    $16, %rsi
        jae     .LBB0_4                         # n >= 16: take the vector path
        vmovsd  .LCPI0_0(%rip), %xmm0           # short input: acc = seed
        xorl    %eax, %eax                      # scalar index starts at 0
        jmp     .LBB0_11

.LBB0_1:
        vmovsd  .LCPI0_0(%rip), %xmm0           # empty input: result = seed
        retq

.LBB0_4:
        movq    %rsi, %rax
        andq    $-16, %rax                      # rax = n rounded down to a multiple of 16
        leaq    -16(%rax), %rcx
        movq    %rcx, %r8
        shrq    $4, %r8
        addq    $1, %r8                         # r8 = number of 16-element groups
        testq   %rcx, %rcx
        je      .LBB0_5                         # exactly one group: skip the paired loop
        movq    %r8, %rcx
        andq    $-2, %rcx                       # rcx = group count rounded down to even
        vbroadcastsd    .LCPI0_0(%rip), %ymm0   # four accumulators, every lane = seed
        xorl    %edx, %edx                      # rdx = element index
        vmovapd %ymm0, %ymm1
        vmovapd %ymm0, %ymm2
        vmovapd %ymm0, %ymm3

.LBB0_7:                                        # two groups (32 doubles) per iteration
        vmaxpd  (%rdi,%rdx,8), %ymm0, %ymm0
        vmaxpd  32(%rdi,%rdx,8), %ymm1, %ymm1
        vmaxpd  64(%rdi,%rdx,8), %ymm2, %ymm2
        vmaxpd  96(%rdi,%rdx,8), %ymm3, %ymm3
        vmaxpd  128(%rdi,%rdx,8), %ymm0, %ymm0
        vmaxpd  160(%rdi,%rdx,8), %ymm1, %ymm1
        vmaxpd  192(%rdi,%rdx,8), %ymm2, %ymm2
        vmaxpd  224(%rdi,%rdx,8), %ymm3, %ymm3
        addq    $32, %rdx
        addq    $-2, %rcx
        jne     .LBB0_7
        testb   $1, %r8b
        je      .LBB0_10                        # even group count: nothing left over

.LBB0_9:                                        # one leftover group (16 doubles)
        vmaxpd  (%rdi,%rdx,8), %ymm0, %ymm0
        vmaxpd  32(%rdi,%rdx,8), %ymm1, %ymm1
        vmaxpd  64(%rdi,%rdx,8), %ymm2, %ymm2
        vmaxpd  96(%rdi,%rdx,8), %ymm3, %ymm3

.LBB0_10:                                       # fold 4 accumulators down to one scalar
        vmaxpd  %ymm3, %ymm0, %ymm0
        vmaxpd  %ymm2, %ymm1, %ymm1
        vmaxpd  %ymm0, %ymm1, %ymm0
        vextractf128    $1, %ymm0, %xmm1        # upper 128 bits
        vmaxpd  %xmm1, %xmm0, %xmm0
        vpermilpd       $1, %xmm0, %xmm1        # xmm1 = xmm0[1,0]
        vmaxsd  %xmm1, %xmm0, %xmm0
        cmpq    %rsi, %rax
        je      .LBB0_12                        # n was a multiple of 16: no tail

.LBB0_11:                                       # scalar tail, index in rax
        vmaxsd  (%rdi,%rax,8), %xmm0, %xmm0
        addq    $1, %rax
        cmpq    %rax, %rsi
        jne     .LBB0_11

.LBB0_12:
        vzeroupper
        retq

.LBB0_5:                                        # single-group setup, then rejoin above
        vbroadcastsd    .LCPI0_0(%rip), %ymm0
        xorl    %edx, %edx
        vmovapd %ymm0, %ymm1
        vmovapd %ymm0, %ymm2
        vmovapd %ymm0, %ymm3
        testb   $1, %r8b
        jne     .LBB0_9
        jmp     .LBB0_10

# ----------------------------------------------------------------------
# Max_F32(float *x, unsigned long n) -> float
#   In:   %rdi = x, %rsi = n
#   Out:  %xmm0 = largest element seen, seeded with -FLT_MAX
#   Same shape as Max_F64, with 32-element groups (4 x 8 floats).
#   NOTE(review): seed is -FLT_MAX, so an all -inf input yields -FLT_MAX.
# ----------------------------------------------------------------------
.LCPI1_0:
        .long   0xff7fffff                      # -3.40282347E+38 (lowest finite float)
Max_F32(float*, unsigned long):                 # @Max_F32(float*, unsigned long)
        testq   %rsi, %rsi
        je      .LBB1_1                         # n == 0: return the seed
        cmpq    $32, %rsi
        jae     .LBB1_4                         # n >= 32: take the vector path
        vmovss  .LCPI1_0(%rip), %xmm0           # short input: acc = seed
        xorl    %eax, %eax
        jmp     .LBB1_11

.LBB1_1:
        vmovss  .LCPI1_0(%rip), %xmm0           # empty input: result = seed
        retq

.LBB1_4:
        movq    %rsi, %rax
        andq    $-32, %rax                      # rax = n rounded down to a multiple of 32
        leaq    -32(%rax), %rcx
        movq    %rcx, %r8
        shrq    $5, %r8
        addq    $1, %r8                         # r8 = number of 32-element groups
        testq   %rcx, %rcx
        je      .LBB1_5                         # exactly one group
        movq    %r8, %rcx
        andq    $-2, %rcx                       # rcx = group count rounded down to even
        vbroadcastss    .LCPI1_0(%rip), %ymm0   # four accumulators, every lane = seed
        xorl    %edx, %edx                      # rdx = element index
        vmovaps %ymm0, %ymm1
        vmovaps %ymm0, %ymm2
        vmovaps %ymm0, %ymm3

.LBB1_7:                                        # two groups (64 floats) per iteration
        vmaxps  (%rdi,%rdx,4), %ymm0, %ymm0
        vmaxps  32(%rdi,%rdx,4), %ymm1, %ymm1
        vmaxps  64(%rdi,%rdx,4), %ymm2, %ymm2
        vmaxps  96(%rdi,%rdx,4), %ymm3, %ymm3
        vmaxps  128(%rdi,%rdx,4), %ymm0, %ymm0
        vmaxps  160(%rdi,%rdx,4), %ymm1, %ymm1
        vmaxps  192(%rdi,%rdx,4), %ymm2, %ymm2
        vmaxps  224(%rdi,%rdx,4), %ymm3, %ymm3
        addq    $64, %rdx
        addq    $-2, %rcx
        jne     .LBB1_7
        testb   $1, %r8b
        je      .LBB1_10                        # even group count: nothing left over

.LBB1_9:                                        # one leftover group (32 floats)
        vmaxps  (%rdi,%rdx,4), %ymm0, %ymm0
        vmaxps  32(%rdi,%rdx,4), %ymm1, %ymm1
        vmaxps  64(%rdi,%rdx,4), %ymm2, %ymm2
        vmaxps  96(%rdi,%rdx,4), %ymm3, %ymm3

.LBB1_10:                                       # fold 4 accumulators down to one scalar
        vmaxps  %ymm3, %ymm0, %ymm0
        vmaxps  %ymm2, %ymm1, %ymm1
        vmaxps  %ymm0, %ymm1, %ymm0
        vextractf128    $1, %ymm0, %xmm1        # upper 128 bits
        vmaxps  %xmm1, %xmm0, %xmm0
        vpermilpd       $1, %xmm0, %xmm1        # swap the two 64-bit halves
        vmaxps  %xmm1, %xmm0, %xmm0
        vmovshdup       %xmm0, %xmm1            # xmm1 = xmm0[1,1,3,3]
        vmaxss  %xmm1, %xmm0, %xmm0
        cmpq    %rsi, %rax
        je      .LBB1_12                        # n was a multiple of 32: no tail

.LBB1_11:                                       # scalar tail, index in rax
        vmaxss  (%rdi,%rax,4), %xmm0, %xmm0
        addq    $1, %rax
        cmpq    %rax, %rsi
        jne     .LBB1_11

.LBB1_12:
        vzeroupper
        retq

.LBB1_5:                                        # single-group setup, then rejoin above
        vbroadcastss    .LCPI1_0(%rip), %ymm0
        xorl    %edx, %edx
        vmovaps %ymm0, %ymm1
        vmovaps %ymm0, %ymm2
        vmovaps %ymm0, %ymm3
        testb   $1, %r8b
        jne     .LBB1_9
        jmp     .LBB1_10

# ----------------------------------------------------------------------
# Maximum_F64(double *x, double *y, unsigned long n)
#   In:   %rdi = x (read and written), %rsi = y (read only), %rdx = n
#   For each i < n, x[i] is overwritten with y[i] only when x[i] < y[i].
#   Both the vector compare (vcmpltpd) and the scalar test (vucomisd + jbe)
#   leave x[i] untouched when the comparison is unordered.
# ----------------------------------------------------------------------
Maximum_F64(double*, double*, unsigned long):   # @Maximum_F64(double*, double*, unsigned long)
        testq   %rdx, %rdx
        je      .LBB2_9                         # n == 0: nothing to do
        cmpq    $16, %rdx
        jae     .LBB2_3                         # n >= 16: take the vector path
        xorl    %eax, %eax                      # short input: scalar loop from index 0
        jmp     .LBB2_6

.LBB2_3:
        movq    %rdx, %rax
        andq    $-16, %rax                      # rax = n rounded down to a multiple of 16
        leaq    96(%rdi), %r8                   # r8 = x + 96 bytes; x is addressed at -96..0(%r8)
        xorl    %ecx, %ecx                      # rcx = element index

.LBB2_4:                                        # 16 doubles per iteration
        vmovupd (%rsi,%rcx,8), %ymm0            # y[i .. i+15]
        vmovupd 32(%rsi,%rcx,8), %ymm1
        vmovupd 64(%rsi,%rcx,8), %ymm2
        vmovupd 96(%rsi,%rcx,8), %ymm3
        vmovupd -96(%r8,%rcx,8), %ymm4          # x[i .. i+15]
        vmovupd -64(%r8,%rcx,8), %ymm5
        vmovupd -32(%r8,%rcx,8), %ymm6
        vmovupd (%r8,%rcx,8), %ymm7
        vcmpltpd        %ymm0, %ymm4, %ymm4     # mask = x < y
        vmaskmovpd      %ymm0, %ymm4, -96(%r8,%rcx,8)   # store y into masked lanes only
        vcmpltpd        %ymm1, %ymm5, %ymm0
        vmaskmovpd      %ymm1, %ymm0, -64(%r8,%rcx,8)
        vcmpltpd        %ymm2, %ymm6, %ymm0
        vmaskmovpd      %ymm2, %ymm0, -32(%r8,%rcx,8)
        vcmpltpd        %ymm3, %ymm7, %ymm0
        vmaskmovpd      %ymm3, %ymm0, (%r8,%rcx,8)
        addq    $16, %rcx
        cmpq    %rcx, %rax
        jne     .LBB2_4
        cmpq    %rdx, %rax
        jne     .LBB2_6                         # leftover elements: finish with the scalar loop

.LBB2_9:
        vzeroupper
        retq

.LBB2_8:                                        # scalar loop latch
        addq    $1, %rax
        cmpq    %rax, %rdx
        je      .LBB2_9

.LBB2_6:                                        # scalar loop body, index in rax
        vmovsd  (%rsi,%rax,8), %xmm0            # xmm0 = y[i]
        vucomisd        (%rdi,%rax,8), %xmm0
        jbe     .LBB2_8                         # y[i] <= x[i] or unordered: keep x[i]
        vmovsd  %xmm0, (%rdi,%rax,8)            # x[i] = y[i]
        jmp     .LBB2_8

# ----------------------------------------------------------------------
# Maximum_F32(float *x, float *y, unsigned long n)
#   In:   %rdi = x (read and written), %rsi = y (read only), %rdx = n
#   Single-precision counterpart of Maximum_F64; 32 floats per vector
#   iteration.
# ----------------------------------------------------------------------
Maximum_F32(float*, float*, unsigned long):     # @Maximum_F32(float*, float*, unsigned long)
        testq   %rdx, %rdx
        je      .LBB3_9                         # n == 0: nothing to do
        cmpq    $32, %rdx
        jae     .LBB3_3                         # n >= 32: take the vector path
        xorl    %eax, %eax                      # short input: scalar loop from index 0
        jmp     .LBB3_6

.LBB3_3:
        movq    %rdx, %rax
        andq    $-32, %rax                      # rax = n rounded down to a multiple of 32
        leaq    96(%rdi), %r8                   # r8 = x + 96 bytes
        xorl    %ecx, %ecx                      # rcx = element index

.LBB3_4:                                        # 32 floats per iteration
        vmovups (%rsi,%rcx,4), %ymm0            # y[i .. i+31]
        vmovups 32(%rsi,%rcx,4), %ymm1
        vmovups 64(%rsi,%rcx,4), %ymm2
        vmovups 96(%rsi,%rcx,4), %ymm3
        vmovups -96(%r8,%rcx,4), %ymm4          # x[i .. i+31]
        vmovups -64(%r8,%rcx,4), %ymm5
        vmovups -32(%r8,%rcx,4), %ymm6
        vmovups (%r8,%rcx,4), %ymm7
        vcmpltps        %ymm0, %ymm4, %ymm4     # mask = x < y
        vmaskmovps      %ymm0, %ymm4, -96(%r8,%rcx,4)   # store y into masked lanes only
        vcmpltps        %ymm1, %ymm5, %ymm0
        vmaskmovps      %ymm1, %ymm0, -64(%r8,%rcx,4)
        vcmpltps        %ymm2, %ymm6, %ymm0
        vmaskmovps      %ymm2, %ymm0, -32(%r8,%rcx,4)
        vcmpltps        %ymm3, %ymm7, %ymm0
        vmaskmovps      %ymm3, %ymm0, (%r8,%rcx,4)
        addq    $32, %rcx
        cmpq    %rcx, %rax
        jne     .LBB3_4
        cmpq    %rdx, %rax
        jne     .LBB3_6                         # leftover elements: finish with the scalar loop

.LBB3_9:
        vzeroupper
        retq

.LBB3_8:                                        # scalar loop latch
        addq    $1, %rax
        cmpq    %rax, %rdx
        je      .LBB3_9

.LBB3_6:                                        # scalar loop body, index in rax
        vmovss  (%rsi,%rax,4), %xmm0            # xmm0 = y[i]
        vucomiss        (%rdi,%rax,4), %xmm0
        jbe     .LBB3_8                         # y[i] <= x[i] or unordered: keep x[i]
        vmovss  %xmm0, (%rdi,%rax,4)            # x[i] = y[i]
        jmp     .LBB3_8

# ----------------------------------------------------------------------
# MaximumNumber_F64(double *x, double a, unsigned long n)
#   In:   %rdi = x (read and written), %xmm0 = a, %rsi = n
#   For each i < n, x[i] is overwritten with a only when x[i] < a.
#   Unordered comparisons leave x[i] untouched.
# ----------------------------------------------------------------------
MaximumNumber_F64(double*, double, unsigned long): # @MaximumNumber_F64(double*, double, unsigned long)
        testq   %rsi, %rsi
        je      .LBB4_9                         # n == 0: nothing to do
        cmpq    $16, %rsi
        jae     .LBB4_3                         # n >= 16: take the vector path
        xorl    %eax, %eax                      # short input: scalar loop from index 0
        jmp     .LBB4_6

.LBB4_3:
        movq    %rsi, %rax
        andq    $-16, %rax                      # rax = n rounded down to a multiple of 16
        vbroadcastsd    %xmm0, %ymm1            # ymm1 = a in every lane
        leaq    96(%rdi), %rcx                  # rcx = x + 96 bytes
        xorl    %edx, %edx                      # rdx = element index

.LBB4_4:                                        # 16 doubles per iteration
        vmovupd -96(%rcx,%rdx,8), %ymm2
        vmovupd -64(%rcx,%rdx,8), %ymm3
        vmovupd -32(%rcx,%rdx,8), %ymm4
        vmovupd (%rcx,%rdx,8), %ymm5
        vcmpltpd        %ymm1, %ymm2, %ymm2     # mask = x < a
        vmaskmovpd      %ymm1, %ymm2, -96(%rcx,%rdx,8)  # store a into masked lanes only
        vcmpltpd        %ymm1, %ymm3, %ymm2
        vmaskmovpd      %ymm1, %ymm2, -64(%rcx,%rdx,8)
        vcmpltpd        %ymm1, %ymm4, %ymm2
        vmaskmovpd      %ymm1, %ymm2, -32(%rcx,%rdx,8)
        vcmpltpd        %ymm1, %ymm5, %ymm2
        vmaskmovpd      %ymm1, %ymm2, (%rcx,%rdx,8)
        addq    $16, %rdx
        cmpq    %rdx, %rax
        jne     .LBB4_4
        cmpq    %rsi, %rax
        jne     .LBB4_6                         # leftover elements: finish with the scalar loop

.LBB4_9:
        vzeroupper
        retq

.LBB4_8:                                        # scalar loop latch
        addq    $1, %rax
        cmpq    %rax, %rsi
        je      .LBB4_9

.LBB4_6:                                        # scalar loop body, index in rax
        vucomisd        (%rdi,%rax,8), %xmm0
        jbe     .LBB4_8                         # a <= x[i] or unordered: keep x[i]
        vmovsd  %xmm0, (%rdi,%rax,8)            # x[i] = a
        jmp     .LBB4_8

# ----------------------------------------------------------------------
# MaximumNumber_F32(float *x, float a, unsigned long n)
#   In:   %rdi = x (read and written), %xmm0 = a, %rsi = n
#   Single-precision counterpart of MaximumNumber_F64; 32 floats per
#   vector iteration.
# ----------------------------------------------------------------------
MaximumNumber_F32(float*, float, unsigned long): # @MaximumNumber_F32(float*, float, unsigned long)
        testq   %rsi, %rsi
        je      .LBB5_9                         # n == 0: nothing to do
        cmpq    $32, %rsi
        jae     .LBB5_3                         # n >= 32: take the vector path
        xorl    %eax, %eax                      # short input: scalar loop from index 0
        jmp     .LBB5_6

.LBB5_3:
        movq    %rsi, %rax
        andq    $-32, %rax                      # rax = n rounded down to a multiple of 32
        vbroadcastss    %xmm0, %ymm1            # ymm1 = a in every lane
        leaq    96(%rdi), %rcx                  # rcx = x + 96 bytes
        xorl    %edx, %edx                      # rdx = element index

.LBB5_4:                                        # 32 floats per iteration
        vmovups -96(%rcx,%rdx,4), %ymm2
        vmovups -64(%rcx,%rdx,4), %ymm3
        vmovups -32(%rcx,%rdx,4), %ymm4
        vmovups (%rcx,%rdx,4), %ymm5
        vcmpltps        %ymm1, %ymm2, %ymm2     # mask = x < a
        vmaskmovps      %ymm1, %ymm2, -96(%rcx,%rdx,4)  # store a into masked lanes only
        vcmpltps        %ymm1, %ymm3, %ymm2
        vmaskmovps      %ymm1, %ymm2, -64(%rcx,%rdx,4)
        vcmpltps        %ymm1, %ymm4, %ymm2
        vmaskmovps      %ymm1, %ymm2, -32(%rcx,%rdx,4)
        vcmpltps        %ymm1, %ymm5, %ymm2
        vmaskmovps      %ymm1, %ymm2, (%rcx,%rdx,4)
        addq    $32, %rdx
        cmpq    %rdx, %rax
        jne     .LBB5_4
        cmpq    %rsi, %rax
        jne     .LBB5_6                         # leftover elements: finish with the scalar loop

.LBB5_9:
        vzeroupper
        retq

.LBB5_8:                                        # scalar loop latch
        addq    $1, %rax
        cmpq    %rax, %rsi
        je      .LBB5_9

.LBB5_6:                                        # scalar loop body, index in rax
        vucomiss        (%rdi,%rax,4), %xmm0
        jbe     .LBB5_8                         # a <= x[i] or unordered: keep x[i]
        vmovss  %xmm0, (%rdi,%rax,4)            # x[i] = a
        jmp     .LBB5_8