// min_max_neon.s -- AArch64 (GNU assembler syntax) min/max scans over arrays.
//
// Listing origin: github.com/apache/arrow/go/v14@v14.0.1/internal/utils/_lib/min_max_neon.s
// Compiler output for "min_max.c" (see the .file and .ident directives).
//
// All four routines use the same register interface:
//   x0 = address of the first element
//   w1 = element count, tested as a signed 32-bit value
//   x2 = address that receives the minimum
//   x3 = address that receives the maximum
// Presumably the C prototype is
//   void f(T *values, int len, T *minout, T *maxout)
// -- confirm against min_max.c.
//
// When the count is below 1 the outputs are the scan identities:
// minimum = largest value of the type, maximum = smallest value of the type.
//
// Shape shared by all four routines:
//   count < 1   store the identities and return
//   count 1..3  scalar loop only
//   count >= 4  SIMD loop over (count & ~3) elements, four per iteration,
//               lane reduction, then the scalar loop for what is left
//
// Registers written: x8-x12, v0-v5 (32-bit variants) or v0-v7 (64-bit
// variants), and the condition flags. 16 bytes of stack hold the frame record.

        .text
        .file   "min_max.c"

//-----------------------------------------------------------------------
// int32_max_min_neon: signed 32-bit elements.
//   w10 = running minimum, seeded with INT32_MAX
//   w11 = running maximum, seeded with INT32_MIN
//-----------------------------------------------------------------------
        .globl  int32_max_min_neon
        .p2align 2
        .type   int32_max_min_neon,@function
int32_max_min_neon:
        stp     x29, x30, [sp, #-16]!           // push frame record
        cmp     w1, #1
        mov     x29, sp
        b.lt    .Li32_empty                     // signed count < 1
        cmp     w1, #3
        mov     w8, w1                          // x8 = count, upper half cleared
        b.hi    .Li32_simd_init                 // count > 3
        // One to three elements: scalar loop from index 0.
        mov     x9, xzr                         // x9 = elements already consumed
        mov     w11, #-2147483648               // max seed
        mov     w10, #2147483647                // min seed
        b       .Li32_tail_init

.Li32_empty:
        mov     w10, #2147483647
        mov     w11, #-2147483648
        str     w11, [x3]                       // maximum output
        str     w10, [x2]                       // minimum output
        ldp     x29, x30, [sp], #16             // pop frame record
        ret

.Li32_simd_init:
        and     x9, x8, #0xfffffffc             // x9 = count rounded down to a multiple of 4
        add     x10, x0, #8                     // cursor runs 8 bytes ahead of the data read
        movi    v2.2s, #128, lsl #24            // max accumulator A: 0x80000000 per lane
        mvni    v0.2s, #128, lsl #24            // min accumulator A: 0x7fffffff per lane
        mvni    v1.2s, #128, lsl #24            // min accumulator B
        mov     x11, x9                         // x11 = elements left for the SIMD loop
        movi    v3.2s, #128, lsl #24            // max accumulator B

.Li32_simd_loop:
        ldp     d4, d5, [x10, #-8]              // four elements, two per D register
        subs    x11, x11, #4                    // flags consumed by b.ne below
        add     x10, x10, #16
        smin    v0.2s, v0.2s, v4.2s
        smin    v1.2s, v1.2s, v5.2s
        smax    v2.2s, v2.2s, v4.2s
        smax    v3.2s, v3.2s, v5.2s
        b.ne    .Li32_simd_loop

        // Fold accumulator B into A, then lane 1 into lane 0.
        smax    v2.2s, v2.2s, v3.2s
        smin    v0.2s, v0.2s, v1.2s
        dup     v1.2s, v2.s[1]
        dup     v3.2s, v0.s[1]
        smax    v1.2s, v2.2s, v1.2s
        smin    v0.2s, v0.2s, v3.2s
        cmp     x9, x8                          // was the count a multiple of 4?
        fmov    w11, s1                         // running max (fmov leaves flags alone)
        fmov    w10, s0                         // running min
        b.eq    .Li32_store

.Li32_tail_init:
        add     x12, x0, x9, lsl #2             // x12 = address of first unconsumed element
        sub     x8, x8, x9                      // x8 = elements still to visit

.Li32_tail_loop:
        ldr     w9, [x12], #4                   // load element, post-increment cursor
        cmp     w10, w9
        csel    w10, w10, w9, lt                // keep min unless element is smaller or equal
        cmp     w11, w9
        csel    w11, w11, w9, gt                // keep max unless element is larger or equal
        subs    x8, x8, #1
        b.ne    .Li32_tail_loop

.Li32_store:
        str     w11, [x3]                       // maximum output
        str     w10, [x2]                       // minimum output
        ldp     x29, x30, [sp], #16
        ret
.Lfunc_end0:
        .size   int32_max_min_neon, .Lfunc_end0-int32_max_min_neon

//-----------------------------------------------------------------------
// uint32_max_min_neon: unsigned 32-bit elements.
//   w11 = running minimum, seeded with 0xffffffff
//   w10 = running maximum, seeded with 0
// Note the min/max register roles are swapped relative to the int32 routine.
//-----------------------------------------------------------------------
        .globl  uint32_max_min_neon
        .p2align 2
        .type   uint32_max_min_neon,@function
uint32_max_min_neon:
        stp     x29, x30, [sp, #-16]!           // push frame record
        cmp     w1, #1
        mov     x29, sp
        b.lt    .Lu32_empty                     // signed count < 1
        cmp     w1, #3
        mov     w8, w1                          // x8 = count, upper half cleared
        b.hi    .Lu32_simd_init                 // count > 3
        // One to three elements: scalar loop from index 0.
        mov     x9, xzr                         // x9 = elements already consumed
        mov     w10, wzr                        // max seed
        mov     w11, #-1                        // min seed (all ones)
        b       .Lu32_tail_init

.Lu32_empty:
        mov     w10, wzr
        mov     w11, #-1
        str     w10, [x3]                       // maximum output
        str     w11, [x2]                       // minimum output
        ldp     x29, x30, [sp], #16             // pop frame record
        ret

.Lu32_simd_init:
        and     x9, x8, #0xfffffffc             // x9 = count rounded down to a multiple of 4
        movi    v1.2d, #0000000000000000        // max accumulator A
        movi    v0.2d, #0xffffffffffffffff      // min accumulator A
        add     x10, x0, #8                     // cursor runs 8 bytes ahead of the data read
        movi    v2.2d, #0xffffffffffffffff      // min accumulator B
        mov     x11, x9                         // x11 = elements left for the SIMD loop
        movi    v3.2d, #0000000000000000        // max accumulator B

.Lu32_simd_loop:
        ldp     d4, d5, [x10, #-8]              // four elements, two per D register
        subs    x11, x11, #4                    // flags consumed by b.ne below
        add     x10, x10, #16
        umin    v0.2s, v0.2s, v4.2s
        umin    v2.2s, v2.2s, v5.2s
        umax    v1.2s, v1.2s, v4.2s
        umax    v3.2s, v3.2s, v5.2s
        b.ne    .Lu32_simd_loop

        // Fold accumulator B into A, then lane 1 into lane 0.
        umax    v1.2s, v1.2s, v3.2s
        umin    v0.2s, v0.2s, v2.2s
        dup     v2.2s, v1.s[1]
        dup     v3.2s, v0.s[1]
        umax    v1.2s, v1.2s, v2.2s
        umin    v0.2s, v0.2s, v3.2s
        cmp     x9, x8                          // was the count a multiple of 4?
        fmov    w10, s1                         // running max (fmov leaves flags alone)
        fmov    w11, s0                         // running min
        b.eq    .Lu32_store

.Lu32_tail_init:
        add     x12, x0, x9, lsl #2             // x12 = address of first unconsumed element
        sub     x8, x8, x9                      // x8 = elements still to visit

.Lu32_tail_loop:
        ldr     w9, [x12], #4                   // load element, post-increment cursor
        cmp     w11, w9
        csel    w11, w11, w9, lo                // unsigned: keep min while it is lower
        cmp     w10, w9
        csel    w10, w10, w9, hi                // unsigned: keep max while it is higher
        subs    x8, x8, #1
        b.ne    .Lu32_tail_loop

.Lu32_store:
        str     w10, [x3]                       // maximum output
        str     w11, [x2]                       // minimum output
        ldp     x29, x30, [sp], #16
        ret
.Lfunc_end1:
        .size   uint32_max_min_neon, .Lfunc_end1-uint32_max_min_neon

//-----------------------------------------------------------------------
// int64_max_min_neon: signed 64-bit elements.
//   x10 = running minimum, seeded with INT64_MAX
//   x11 = running maximum, seeded with INT64_MIN
// The SIMD loop builds min/max from a compare (cmgt) that yields a lane
// mask followed by a bitwise select (bsl) driven by that mask.
//-----------------------------------------------------------------------
        .globl  int64_max_min_neon
        .p2align 2
        .type   int64_max_min_neon,@function
int64_max_min_neon:
        stp     x29, x30, [sp, #-16]!           // push frame record
        cmp     w1, #1
        mov     x29, sp
        b.lt    .Li64_empty                     // signed count < 1
        mov     w8, w1                          // x8 = count, upper half cleared
        mov     x11, #-9223372036854775808      // max seed
        cmp     w1, #3
        mov     x10, #9223372036854775807       // min seed
        b.hi    .Li64_simd_init                 // count > 3
        // One to three elements: scalar loop from index 0.
        mov     x9, xzr                         // x9 = elements already consumed
        b       .Li64_tail_init

.Li64_empty:
        mov     x10, #9223372036854775807
        mov     x11, #-9223372036854775808
        str     x11, [x3]                       // maximum output
        str     x10, [x2]                       // minimum output
        ldp     x29, x30, [sp], #16             // pop frame record
        ret

.Li64_simd_init:
        and     x9, x8, #0xfffffffc             // x9 = count rounded down to a multiple of 4
        dup     v1.2d, x11                      // max accumulator A
        dup     v0.2d, x10                      // min accumulator A
        add     x10, x0, #16                    // x10 reused as cursor, 16 bytes ahead of the read
        mov     x11, x9                         // x11 reused as SIMD elements remaining
        mov     v2.16b, v0.16b                  // min accumulator B
        mov     v3.16b, v1.16b                  // max accumulator B

.Li64_simd_loop:
        ldp     q4, q5, [x10, #-16]             // four elements, two per Q register
        mov     v6.16b, v3.16b                  // v6 = previous max B
        mov     v7.16b, v1.16b                  // v7 = previous max A
        mov     v3.16b, v2.16b                  // v3 = previous min B
        mov     v1.16b, v0.16b                  // v1 = previous min A
        cmgt    v0.2d, v4.2d, v0.2d             // mask: element > min A
        cmgt    v2.2d, v5.2d, v2.2d             // mask: element > min B
        bsl     v0.16b, v1.16b, v4.16b          // min A = mask ? previous min : element
        cmgt    v1.2d, v7.2d, v4.2d             // mask: max A > element
        bsl     v2.16b, v3.16b, v5.16b          // min B
        cmgt    v3.2d, v6.2d, v5.2d             // mask: max B > element
        subs    x11, x11, #4                    // flags consumed by b.ne below
        bsl     v1.16b, v7.16b, v4.16b          // max A = mask ? previous max : element
        bsl     v3.16b, v6.16b, v5.16b          // max B
        add     x10, x10, #32
        b.ne    .Li64_simd_loop

        // Fold accumulator B into A, then lane 1 into lane 0.
        cmgt    v4.2d, v1.2d, v3.2d
        cmgt    v5.2d, v2.2d, v0.2d
        bsl     v4.16b, v1.16b, v3.16b          // v4 = lane-wise max of both max accumulators
        bsl     v5.16b, v0.16b, v2.16b          // v5 = lane-wise min of both min accumulators
        dup     v0.2d, v4.d[1]
        dup     v1.2d, v5.d[1]
        cmgt    v2.2d, v4.2d, v0.2d
        cmgt    v3.2d, v1.2d, v5.2d
        bsl     v2.16b, v4.16b, v0.16b          // lane 0 = overall max so far
        bsl     v3.16b, v5.16b, v1.16b          // lane 0 = overall min so far
        cmp     x9, x8                          // was the count a multiple of 4?
        fmov    x11, d2                         // running max (fmov leaves flags alone)
        fmov    x10, d3                         // running min
        b.eq    .Li64_store

.Li64_tail_init:
        add     x12, x0, x9, lsl #3             // x12 = address of first unconsumed element
        sub     x8, x8, x9                      // x8 = elements still to visit

.Li64_tail_loop:
        ldr     x9, [x12], #8                   // load element, post-increment cursor
        cmp     x10, x9
        csel    x10, x10, x9, lt                // keep min unless element is smaller or equal
        cmp     x11, x9
        csel    x11, x11, x9, gt                // keep max unless element is larger or equal
        subs    x8, x8, #1
        b.ne    .Li64_tail_loop

.Li64_store:
        str     x11, [x3]                       // maximum output
        str     x10, [x2]                       // minimum output
        ldp     x29, x30, [sp], #16
        ret
.Lfunc_end2:
        .size   int64_max_min_neon, .Lfunc_end2-int64_max_min_neon

//-----------------------------------------------------------------------
// uint64_max_min_neon: unsigned 64-bit elements.
//   x11 = running minimum, seeded with all ones
//   x10 = running maximum, seeded with 0
// Same compare-and-select scheme as the int64 routine, using the unsigned
// compare cmhi. Min/max register roles are swapped relative to int64.
//-----------------------------------------------------------------------
        .globl  uint64_max_min_neon
        .p2align 2
        .type   uint64_max_min_neon,@function
uint64_max_min_neon:
        stp     x29, x30, [sp, #-16]!           // push frame record
        cmp     w1, #1
        mov     x29, sp
        b.lt    .Lu64_empty                     // signed count < 1
        cmp     w1, #3
        mov     w8, w1                          // x8 = count, upper half cleared
        b.hi    .Lu64_simd_init                 // count > 3
        // One to three elements: scalar loop from index 0.
        mov     x9, xzr                         // x9 = elements already consumed
        mov     x10, xzr                        // max seed
        mov     x11, #-1                        // min seed (all ones)
        b       .Lu64_tail_init

.Lu64_empty:
        mov     x10, xzr
        mov     x11, #-1
        str     x10, [x3]                       // maximum output
        str     x11, [x2]                       // minimum output
        ldp     x29, x30, [sp], #16             // pop frame record
        ret

.Lu64_simd_init:
        and     x9, x8, #0xfffffffc             // x9 = count rounded down to a multiple of 4
        add     x10, x0, #16                    // x10 reused as cursor, 16 bytes ahead of the read
        movi    v1.2d, #0000000000000000        // max accumulator A
        movi    v0.2d, #0xffffffffffffffff      // min accumulator A
        movi    v2.2d, #0xffffffffffffffff      // min accumulator B
        mov     x11, x9                         // x11 reused as SIMD elements remaining
        movi    v3.2d, #0000000000000000        // max accumulator B

.Lu64_simd_loop:
        ldp     q4, q5, [x10, #-16]             // four elements, two per Q register
        mov     v6.16b, v3.16b                  // v6 = previous max B
        mov     v7.16b, v1.16b                  // v7 = previous max A
        mov     v3.16b, v2.16b                  // v3 = previous min B
        mov     v1.16b, v0.16b                  // v1 = previous min A
        cmhi    v0.2d, v4.2d, v0.2d             // mask: element > min A (unsigned)
        cmhi    v2.2d, v5.2d, v2.2d             // mask: element > min B (unsigned)
        bsl     v0.16b, v1.16b, v4.16b          // min A = mask ? previous min : element
        cmhi    v1.2d, v7.2d, v4.2d             // mask: max A > element (unsigned)
        bsl     v2.16b, v3.16b, v5.16b          // min B
        cmhi    v3.2d, v6.2d, v5.2d             // mask: max B > element (unsigned)
        subs    x11, x11, #4                    // flags consumed by b.ne below
        bsl     v1.16b, v7.16b, v4.16b          // max A = mask ? previous max : element
        bsl     v3.16b, v6.16b, v5.16b          // max B
        add     x10, x10, #32
        b.ne    .Lu64_simd_loop

        // Fold accumulator B into A, then lane 1 into lane 0.
        cmhi    v4.2d, v1.2d, v3.2d
        cmhi    v5.2d, v2.2d, v0.2d
        bsl     v4.16b, v1.16b, v3.16b          // v4 = lane-wise max of both max accumulators
        bsl     v5.16b, v0.16b, v2.16b          // v5 = lane-wise min of both min accumulators
        dup     v0.2d, v4.d[1]
        dup     v1.2d, v5.d[1]
        cmhi    v2.2d, v4.2d, v0.2d
        cmhi    v3.2d, v1.2d, v5.2d
        bsl     v2.16b, v4.16b, v0.16b          // lane 0 = overall max so far
        bsl     v3.16b, v5.16b, v1.16b          // lane 0 = overall min so far
        cmp     x9, x8                          // was the count a multiple of 4?
        fmov    x10, d2                         // running max (fmov leaves flags alone)
        fmov    x11, d3                         // running min
        b.eq    .Lu64_store

.Lu64_tail_init:
        add     x12, x0, x9, lsl #3             // x12 = address of first unconsumed element
        sub     x8, x8, x9                      // x8 = elements still to visit

.Lu64_tail_loop:
        ldr     x9, [x12], #8                   // load element, post-increment cursor
        cmp     x11, x9
        csel    x11, x11, x9, lo                // unsigned: keep min while it is lower
        cmp     x10, x9
        csel    x10, x10, x9, hi                // unsigned: keep max while it is higher
        subs    x8, x8, #1
        b.ne    .Lu64_tail_loop

.Lu64_store:
        str     x10, [x3]                       // maximum output
        str     x11, [x2]                       // minimum output
        ldp     x29, x30, [sp], #16
        ret
.Lfunc_end3:
        .size   uint64_max_min_neon, .Lfunc_end3-uint64_max_min_neon

        .ident  "clang version 9.0.1-12 "
        .section        ".note.GNU-stack","",@progbits
        .addrsig