# ===========================================================================
# NOTE(review): this is a numbered compiler-output listing (every instruction
# is prefixed with its listing line number, and one rendered line may span
# several physical lines below) -- it is a reference dump of the AVX2
# kernels, not assemblable GAS source as-is.  Code is left byte-identical;
# only this header comment block is added.
#
# Contents: six System V AMD64 functions (pointer/size args in rdi/rsi/rdx,
# scalar float arg and float results in xmm0), AT&T syntax, AVX2:
#
#   Min_F64_D(double* p, size_t n) -> minimum of p[0..n)
#     n == 0 returns the sentinel DBL_MAX (.LCPI0_0 = 0x7fefffffffffffff).
#     n >= 16 takes the vector path: 4 ymm accumulators seeded with the
#     sentinel, a 2x-unrolled inner loop consuming 32 doubles per iteration
#     (8 x vminpd), an odd-chunk fixup (.LBB0_9), then a horizontal
#     vminpd/vextractf128/vminsd reduction; remaining elements are folded
#     in by the scalar vminsd tail loop (.LBB0_11).
#
#   Min_F32_F(float* p, size_t n) -> minimum of p[0..n)
#     Same structure with sentinel FLT_MAX (.LCPI1_0 = 0x7f7fffff) and
#     32-float vector blocking; horizontal reduction adds a vmovshdup step
#     for the final lane pair.
#
#   Minimum_F64_V(double* d, double* s, size_t n)  [in-place, writes d]
#     d[i] = min(d[i], s[i]): vcmpltpd builds a lane mask where s[i] < d[i]
#     and vmaskmovpd stores s[i] only at those lanes; the scalar tail uses
#     vucomisd + conditional vmovsd store.  NOTE(review): with the CMPLT
#     predicate a NaN in either operand compares false, so NaN-involving
#     lanes leave d[i] unchanged -- confirm this matches the intended
#     Go-level semantics before relying on NaN propagation.
#
#   Minimum_F32_V(float* d, float* s, size_t n) -- float analogue of the
#     above (vcmpltps / vmaskmovps, 32-element vector blocking).
#
#   MinimumNumber_F64_V(double* p, double x, size_t n)  [in-place]
#     p[i] = min(p[i], x): x is broadcast to ymm1 and masked-stored over
#     the lanes where x < p[i]; scalar tail mirrors this with vucomisd.
#
#   MinimumNumber_F32_V(float* p, float x, size_t n) -- float analogue.
#
# All vector paths execute vzeroupper before ret (avoids the SSE/AVX
# transition penalty expected by the SysV ABI when returning to SSE code).
# ===========================================================================
gitee.com/quant1x/gox@v1.7.6/num/asm/_avx2/min.s (about) 1 .LCPI0_0: 2 .quad 0x7fefffffffffffff # double 1.7976931348623157E+308 3 Min_F64_D(double*, unsigned long): # @Min_F64_D(double*, unsigned long) 4 testq %rsi, %rsi 5 je .LBB0_1 6 cmpq $16, %rsi 7 jae .LBB0_4 8 vmovsd .LCPI0_0(%rip), %xmm0 # xmm0 = mem[0],zero 9 xorl %eax, %eax 10 jmp .LBB0_11 11 .LBB0_1: 12 vmovsd .LCPI0_0(%rip), %xmm0 # xmm0 = mem[0],zero 13 retq 14 .LBB0_4: 15 movq %rsi, %rax 16 andq $-16, %rax 17 leaq -16(%rax), %rcx 18 movq %rcx, %r8 19 shrq $4, %r8 20 addq $1, %r8 21 testq %rcx, %rcx 22 je .LBB0_5 23 movq %r8, %rcx 24 andq $-2, %rcx 25 vbroadcastsd .LCPI0_0(%rip), %ymm0 # ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308] 26 xorl %edx, %edx 27 vmovapd %ymm0, %ymm1 28 vmovapd %ymm0, %ymm2 29 vmovapd %ymm0, %ymm3 30 .LBB0_7: # =>This Inner Loop Header: Depth=1 31 vminpd (%rdi,%rdx,8), %ymm0, %ymm0 32 vminpd 32(%rdi,%rdx,8), %ymm1, %ymm1 33 vminpd 64(%rdi,%rdx,8), %ymm2, %ymm2 34 vminpd 96(%rdi,%rdx,8), %ymm3, %ymm3 35 vminpd 128(%rdi,%rdx,8), %ymm0, %ymm0 36 vminpd 160(%rdi,%rdx,8), %ymm1, %ymm1 37 vminpd 192(%rdi,%rdx,8), %ymm2, %ymm2 38 vminpd 224(%rdi,%rdx,8), %ymm3, %ymm3 39 addq $32, %rdx 40 addq $-2, %rcx 41 jne .LBB0_7 42 testb $1, %r8b 43 je .LBB0_10 44 .LBB0_9: 45 vminpd (%rdi,%rdx,8), %ymm0, %ymm0 46 vminpd 32(%rdi,%rdx,8), %ymm1, %ymm1 47 vminpd 64(%rdi,%rdx,8), %ymm2, %ymm2 48 vminpd 96(%rdi,%rdx,8), %ymm3, %ymm3 49 .LBB0_10: 50 vminpd %ymm3, %ymm0, %ymm0 51 vminpd %ymm2, %ymm1, %ymm1 52 vminpd %ymm0, %ymm1, %ymm0 53 vextractf128 $1, %ymm0, %xmm1 54 vminpd %xmm1, %xmm0, %xmm0 55 vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0] 56 vminsd %xmm1, %xmm0, %xmm0 57 cmpq %rsi, %rax 58 je .LBB0_12 59 .LBB0_11: # =>This Inner Loop Header: Depth=1 60 vminsd (%rdi,%rax,8), %xmm0, %xmm0 61 addq $1, %rax 62 cmpq %rax, %rsi 63 jne .LBB0_11 64 .LBB0_12: 65 vzeroupper 66 retq 67 .LBB0_5: 68 vbroadcastsd .LCPI0_0(%rip), %ymm0 # ymm0 = 
[1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308] 69 xorl %edx, %edx 70 vmovapd %ymm0, %ymm1 71 vmovapd %ymm0, %ymm2 72 vmovapd %ymm0, %ymm3 73 testb $1, %r8b 74 jne .LBB0_9 75 jmp .LBB0_10 76 .LCPI1_0: 77 .long 0x7f7fffff # float 3.40282347E+38 78 Min_F32_F(float*, unsigned long): # @Min_F32_F(float*, unsigned long) 79 testq %rsi, %rsi 80 je .LBB1_1 81 cmpq $32, %rsi 82 jae .LBB1_4 83 vmovss .LCPI1_0(%rip), %xmm0 # xmm0 = mem[0],zero,zero,zero 84 xorl %eax, %eax 85 jmp .LBB1_11 86 .LBB1_1: 87 vmovss .LCPI1_0(%rip), %xmm0 # xmm0 = mem[0],zero,zero,zero 88 retq 89 .LBB1_4: 90 movq %rsi, %rax 91 andq $-32, %rax 92 leaq -32(%rax), %rcx 93 movq %rcx, %r8 94 shrq $5, %r8 95 addq $1, %r8 96 testq %rcx, %rcx 97 je .LBB1_5 98 movq %r8, %rcx 99 andq $-2, %rcx 100 vbroadcastss .LCPI1_0(%rip), %ymm0 # ymm0 = [3.40282347E+38,3.40282347E+38,3.40282347E+38,3.40282347E+38,3.40282347E+38,3.40282347E+38,3.40282347E+38,3.40282347E+38] 101 xorl %edx, %edx 102 vmovaps %ymm0, %ymm1 103 vmovaps %ymm0, %ymm2 104 vmovaps %ymm0, %ymm3 105 .LBB1_7: # =>This Inner Loop Header: Depth=1 106 vminps (%rdi,%rdx,4), %ymm0, %ymm0 107 vminps 32(%rdi,%rdx,4), %ymm1, %ymm1 108 vminps 64(%rdi,%rdx,4), %ymm2, %ymm2 109 vminps 96(%rdi,%rdx,4), %ymm3, %ymm3 110 vminps 128(%rdi,%rdx,4), %ymm0, %ymm0 111 vminps 160(%rdi,%rdx,4), %ymm1, %ymm1 112 vminps 192(%rdi,%rdx,4), %ymm2, %ymm2 113 vminps 224(%rdi,%rdx,4), %ymm3, %ymm3 114 addq $64, %rdx 115 addq $-2, %rcx 116 jne .LBB1_7 117 testb $1, %r8b 118 je .LBB1_10 119 .LBB1_9: 120 vminps (%rdi,%rdx,4), %ymm0, %ymm0 121 vminps 32(%rdi,%rdx,4), %ymm1, %ymm1 122 vminps 64(%rdi,%rdx,4), %ymm2, %ymm2 123 vminps 96(%rdi,%rdx,4), %ymm3, %ymm3 124 .LBB1_10: 125 vminps %ymm3, %ymm0, %ymm0 126 vminps %ymm2, %ymm1, %ymm1 127 vminps %ymm0, %ymm1, %ymm0 128 vextractf128 $1, %ymm0, %xmm1 129 vminps %xmm1, %xmm0, %xmm0 130 vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0] 131 vminps %xmm1, %xmm0, %xmm0 132 vmovshdup %xmm0, %xmm1 # 
xmm1 = xmm0[1,1,3,3] 133 vminss %xmm1, %xmm0, %xmm0 134 cmpq %rsi, %rax 135 je .LBB1_12 136 .LBB1_11: # =>This Inner Loop Header: Depth=1 137 vminss (%rdi,%rax,4), %xmm0, %xmm0 138 addq $1, %rax 139 cmpq %rax, %rsi 140 jne .LBB1_11 141 .LBB1_12: 142 vzeroupper 143 retq 144 .LBB1_5: 145 vbroadcastss .LCPI1_0(%rip), %ymm0 # ymm0 = [3.40282347E+38,3.40282347E+38,3.40282347E+38,3.40282347E+38,3.40282347E+38,3.40282347E+38,3.40282347E+38,3.40282347E+38] 146 xorl %edx, %edx 147 vmovaps %ymm0, %ymm1 148 vmovaps %ymm0, %ymm2 149 vmovaps %ymm0, %ymm3 150 testb $1, %r8b 151 jne .LBB1_9 152 jmp .LBB1_10 153 Minimum_F64_V(double*, double*, unsigned long): # @Minimum_F64_V(double*, double*, unsigned long) 154 testq %rdx, %rdx 155 je .LBB2_9 156 cmpq $16, %rdx 157 jae .LBB2_3 158 xorl %eax, %eax 159 jmp .LBB2_6 160 .LBB2_3: 161 movq %rdx, %rax 162 andq $-16, %rax 163 leaq 96(%rdi), %r8 164 xorl %ecx, %ecx 165 .LBB2_4: # =>This Inner Loop Header: Depth=1 166 vmovupd (%rsi,%rcx,8), %ymm0 167 vmovupd 32(%rsi,%rcx,8), %ymm1 168 vmovupd 64(%rsi,%rcx,8), %ymm2 169 vmovupd 96(%rsi,%rcx,8), %ymm3 170 vcmpltpd -96(%r8,%rcx,8), %ymm0, %ymm4 171 vcmpltpd -64(%r8,%rcx,8), %ymm1, %ymm5 172 vcmpltpd -32(%r8,%rcx,8), %ymm2, %ymm6 173 vcmpltpd (%r8,%rcx,8), %ymm3, %ymm7 174 vmaskmovpd %ymm0, %ymm4, -96(%r8,%rcx,8) 175 vmaskmovpd %ymm1, %ymm5, -64(%r8,%rcx,8) 176 vmaskmovpd %ymm2, %ymm6, -32(%r8,%rcx,8) 177 vmaskmovpd %ymm3, %ymm7, (%r8,%rcx,8) 178 addq $16, %rcx 179 cmpq %rcx, %rax 180 jne .LBB2_4 181 cmpq %rdx, %rax 182 jne .LBB2_6 183 .LBB2_9: 184 vzeroupper 185 retq 186 .LBB2_8: # in Loop: Header=BB2_6 Depth=1 187 addq $1, %rax 188 cmpq %rax, %rdx 189 je .LBB2_9 190 .LBB2_6: # =>This Inner Loop Header: Depth=1 191 vmovsd (%rsi,%rax,8), %xmm0 # xmm0 = mem[0],zero 192 vucomisd (%rdi,%rax,8), %xmm0 193 jae .LBB2_8 194 vmovsd %xmm0, (%rdi,%rax,8) 195 jmp .LBB2_8 196 Minimum_F32_V(float*, float*, unsigned long): # @Minimum_F32_V(float*, float*, unsigned long) 197 testq %rdx, %rdx 198 je .LBB3_9 
199 cmpq $32, %rdx 200 jae .LBB3_3 201 xorl %eax, %eax 202 jmp .LBB3_6 203 .LBB3_3: 204 movq %rdx, %rax 205 andq $-32, %rax 206 leaq 96(%rdi), %r8 207 xorl %ecx, %ecx 208 .LBB3_4: # =>This Inner Loop Header: Depth=1 209 vmovups (%rsi,%rcx,4), %ymm0 210 vmovups 32(%rsi,%rcx,4), %ymm1 211 vmovups 64(%rsi,%rcx,4), %ymm2 212 vmovups 96(%rsi,%rcx,4), %ymm3 213 vcmpltps -96(%r8,%rcx,4), %ymm0, %ymm4 214 vcmpltps -64(%r8,%rcx,4), %ymm1, %ymm5 215 vcmpltps -32(%r8,%rcx,4), %ymm2, %ymm6 216 vcmpltps (%r8,%rcx,4), %ymm3, %ymm7 217 vmaskmovps %ymm0, %ymm4, -96(%r8,%rcx,4) 218 vmaskmovps %ymm1, %ymm5, -64(%r8,%rcx,4) 219 vmaskmovps %ymm2, %ymm6, -32(%r8,%rcx,4) 220 vmaskmovps %ymm3, %ymm7, (%r8,%rcx,4) 221 addq $32, %rcx 222 cmpq %rcx, %rax 223 jne .LBB3_4 224 cmpq %rdx, %rax 225 jne .LBB3_6 226 .LBB3_9: 227 vzeroupper 228 retq 229 .LBB3_8: # in Loop: Header=BB3_6 Depth=1 230 addq $1, %rax 231 cmpq %rax, %rdx 232 je .LBB3_9 233 .LBB3_6: # =>This Inner Loop Header: Depth=1 234 vmovss (%rsi,%rax,4), %xmm0 # xmm0 = mem[0],zero,zero,zero 235 vucomiss (%rdi,%rax,4), %xmm0 236 jae .LBB3_8 237 vmovss %xmm0, (%rdi,%rax,4) 238 jmp .LBB3_8 239 MinimumNumber_F64_V(double*, double, unsigned long): # @MinimumNumber_F64_V(double*, double, unsigned long) 240 testq %rsi, %rsi 241 je .LBB4_9 242 cmpq $16, %rsi 243 jae .LBB4_3 244 xorl %eax, %eax 245 jmp .LBB4_6 246 .LBB4_3: 247 movq %rsi, %rax 248 andq $-16, %rax 249 vbroadcastsd %xmm0, %ymm1 250 leaq 96(%rdi), %rcx 251 xorl %edx, %edx 252 .LBB4_4: # =>This Inner Loop Header: Depth=1 253 vcmpltpd -96(%rcx,%rdx,8), %ymm1, %ymm2 254 vcmpltpd -64(%rcx,%rdx,8), %ymm1, %ymm3 255 vcmpltpd -32(%rcx,%rdx,8), %ymm1, %ymm4 256 vcmpltpd (%rcx,%rdx,8), %ymm1, %ymm5 257 vmaskmovpd %ymm1, %ymm2, -96(%rcx,%rdx,8) 258 vmaskmovpd %ymm1, %ymm3, -64(%rcx,%rdx,8) 259 vmaskmovpd %ymm1, %ymm4, -32(%rcx,%rdx,8) 260 vmaskmovpd %ymm1, %ymm5, (%rcx,%rdx,8) 261 addq $16, %rdx 262 cmpq %rdx, %rax 263 jne .LBB4_4 264 cmpq %rsi, %rax 265 jne .LBB4_6 266 .LBB4_9: 267 
vzeroupper 268 retq 269 .LBB4_8: # in Loop: Header=BB4_6 Depth=1 270 addq $1, %rax 271 cmpq %rax, %rsi 272 je .LBB4_9 273 .LBB4_6: # =>This Inner Loop Header: Depth=1 274 vucomisd (%rdi,%rax,8), %xmm0 275 jae .LBB4_8 276 vmovsd %xmm0, (%rdi,%rax,8) 277 jmp .LBB4_8 278 MinimumNumber_F32_V(float*, float, unsigned long): # @MinimumNumber_F32_V(float*, float, unsigned long) 279 testq %rsi, %rsi 280 je .LBB5_9 281 cmpq $32, %rsi 282 jae .LBB5_3 283 xorl %eax, %eax 284 jmp .LBB5_6 285 .LBB5_1: 286 movq %rsi, %rax 287 andq $-32, %rax 288 vbroadcastss %xmm0, %ymm1 289 leaq 96(%rdi), %rcx 290 xorl %edx, %edx 291 .LBB5_4: # =>This Inner Loop Header: Depth=1 292 vcmpltps -96(%rcx,%rdx,4), %ymm1, %ymm2 293 vcmpltps -64(%rcx,%rdx,4), %ymm1, %ymm3 294 vcmpltps -32(%rcx,%rdx,4), %ymm1, %ymm4 295 vcmpltps (%rcx,%rdx,4), %ymm1, %ymm5 296 vmaskmovps %ymm1, %ymm2, -96(%rcx,%rdx,4) 297 vmaskmovps %ymm1, %ymm3, -64(%rcx,%rdx,4) 298 vmaskmovps %ymm1, %ymm4, -32(%rcx,%rdx,4) 299 vmaskmovps %ymm1, %ymm5, (%rcx,%rdx,4) 300 addq $32, %rdx 301 cmpq %rdx, %rax 302 jne .LBB5_4 303 cmpq %rsi, %rax 304 jne .LBB5_6 305 .LBB5_9: 306 vzeroupper 307 retq 308 .LBB5_8: # in Loop: Header=BB5_6 Depth=1 309 addq $1, %rax 310 cmpq %rax, %rsi 311 je .LBB5_9 312 .LBB5_6: # =>This Inner Loop Header: Depth=1 313 vucomiss (%rdi,%rax,4), %xmm0 314 jae .LBB5_8 315 vmovss %xmm0, (%rdi,%rax,4) 316 jmp .LBB5_8