gitee.com/quant1x/num@v0.3.2/asm/c2goasm/subroutine_test.go (about) 1 /* 2 * Minio Cloud Storage, (C) 2017 Minio, Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package main 18 19 import ( 20 "strings" 21 "testing" 22 ) 23 24 func testName(t *testing.T, fullname, expected string) { 25 name := extractName(fullname) 26 if name != expected { 27 t.Errorf("TestNames(): \nexpected %s\ngot %s", expected, name) 28 } 29 } 30 31 func TestNames(t *testing.T) { 32 33 // C++ name mangling 34 testName(t, "_ZN4Simd4Avx213Yuv444pToBgraEPKhmS2_mS2_mmmPhmh", "SimdAvx2Yuv444pToBgra") 35 testName(t, "_ZN4Simd4Avx213Yuv420pToBgraEPKhmS2_mS2_mmmPhmh", "SimdAvx2Yuv420pToBgra") 36 testName(t, "_ZN4Simd4Avx213Yuv422pToBgraEPKhmS2_mS2_mmmPhmh", "SimdAvx2Yuv422pToBgra") 37 testName(t, "_ZN4Simd4Avx213ReduceGray2x2EPKhmmmPhmmm", "SimdAvx2ReduceGray2x2") 38 testName(t, "_ZN4Simd4Avx216AbsDifferenceSumEPKhmS2_mmmPy", "SimdAvx2AbsDifferenceSum") 39 40 // Plain C style names 41 testName(t, "XORShift128Plus", "XORShift128Plus") 42 } 43 44 func subroutineEqual(a, b []Subroutine) bool { 45 46 if a == nil && b == nil { 47 return true 48 } 49 50 if a == nil || b == nil { 51 return false 52 } 53 54 if len(a) != len(b) { 55 return false 56 } 57 58 for i := range a { 59 if !(a[i].name == b[i].name && equalString(a[i].body, b[i].body)) { 60 return false 61 } 62 } 63 64 return true 65 } 66 67 func testSubroutine(t *testing.T, fullsrc []string, expected []Subroutine) { 68 subroutines := segmentSource(fullsrc) 69 if !subroutineEqual(subroutines, expected) { 70 t.Errorf("testSubroutine(): \nexpected %#v\ngot %#v", expected, subroutines) 71 } 72 } 73 74 func TestSubroutine(t *testing.T) { 75 76 disabledForTesting = true 77 78 src1 := strings.Split(` .section __TEXT,__text,regular,pure_instructions 79 .macosx_version_min 10, 11 80 .intel_syntax noprefix 81 .section __TEXT,__const 82 .align 5 83 LCPI0_0: 84 .byte 255 ## 0xff 85 .byte 0 ## 0x0 86 LCPI0_1: 87 .short 9617 ## 0x2591 88 .short 0 ## 0x0 89 LCPI0_2: 90 .short 1868 ## 0x74c 91 .short 4899 ## 0x1323 92 .section __TEXT,__literal4,4byte_literals 93 .align 2 94 LCPI0_3: 95 .long 8192 ## 0x2000 96 .section __TEXT,__text,regular,pure_instructions 97 .globl __ZN4Simd4Avx210BgraToGrayEPKhmmmPhm 98 .align 4, 0x90 99 __ZN4Simd4Avx210BgraToGrayEPKhmmmPhm: ## @_ZN4Simd4Avx210BgraToGrayEPKhmmmPhm 100 ## BB#0: 101 push rbp 102 mov rbp, rsp 103 mov rax, rdi 104 and rax, -32 105 cmp rax, rdi 106 jne LBB0_9 107 ## BB#1: 108 mov r10, r9 109 jne LBB0_9 110 ## BB#2: 111 mov rax, r8 112 jne LBB0_9 113 ## BB#3: 114 test rdx, rdx 115 je LBB0_15 116 ## BB#4: ## %.preheader.lr.ph.i.1 117 mov r11, rsi 118 .align 4, 0x90 119 LBB0_5: ## %.preheader.i.5 120 je LBB0_6 121 .align 4, 0x90 122 LBB0_16: ## %.lr.ph.i.12 123 ## Parent Loop BB0_5 Depth=1 124 ## => This Inner Loop Header: Depth=2 125 vmovdqu ymm4, ymmword ptr [rdi + 4*rax] 126 cmp rax, r11 127 jb LBB0_16 128 LBB0_6: ## %._crit_edge.i.6 129 ## in Loop: Header=BB0_5 Depth=1 130 cmp r11, rsi 131 je LBB0_8 132 ## BB#7: ## in Loop: Header=BB0_5 Depth=1 133 vmovdqu ymm4, ymmword ptr [rdi + 4*rsi - 128] 134 vmovdqu ymmword ptr [r8 + rsi - 32], ymm4 135 LBB0_8: ## in Loop: Header=BB0_5 Depth=1 136 add rdi, rcx 137 jne LBB0_5 138 jmp LBB0_15 139 LBB0_9: 140 test rdx, rdx 141 je LBB0_15 142 ## BB#10: ## %.preheader.lr.ph.i 143 mov r11, rsi 144 vpbroadcastd ymm3, dword ptr [rip + LCPI0_3] 145 .align 4, 0x90 146 LBB0_11: ## %.preheader.i 147 ## =>This Loop Header: Depth=1 148 ## Child Loop BB0_17 Depth 2 149 mov eax, 0 150 test r11, r11 151 je LBB0_12 152 .align 4, 0x90 153 LBB0_17: ## %.lr.ph.i 154 ## Parent Loop BB0_11 Depth=1 155 ## => This Inner Loop Header: Depth=2 156 vmovdqu ymm4, ymmword ptr [rdi + 4*rax] 157 vmovdqu ymm5, ymmword ptr [rdi + 4*rax + 32] 158 jb LBB0_17 159 LBB0_12: ## %._crit_edge.i 160 ## in Loop: Header=BB0_11 Depth=1 161 cmp r11, rsi 162 je LBB0_14 163 ## BB#13: ## in Loop: Header=BB0_11 Depth=1 164 vmovdqu ymm4, ymmword ptr [rdi + 4*rsi - 128] 165 vmovdqu ymmword ptr [r8 + rsi - 32], ymm4 166 LBB0_14: ## in Loop: Header=BB0_11 Depth=1 167 add rdi, rcx 168 add r8, r9 169 inc r10 170 cmp r10, rdx 171 jne LBB0_11 172 LBB0_15: ## %_ZN4Simd4Avx210BgraToGrayILb1EEEvPKhmmmPhm.exit 173 pop rbp 174 vzeroupper 175 ret 176 177 .subsections_via_symbols 178 `, "\n") 179 180 subroutine1 := []Subroutine{} 181 subroutine1 = append(subroutine1, Subroutine{name: "SimdAvx2BgraToGray", body: src1[25:98]}) 182 183 testSubroutine(t, src1, subroutine1) 184 185 src2 := strings.Split(` .section __TEXT,__text,regular,pure_instructions 186 .macosx_version_min 10, 11 187 .intel_syntax noprefix 188 .section __TEXT,__const 189 .align 5 190 LCPI0_0: 191 .short 16 ## 0x10 192 .short 13074 ## 0x3312 193 .short 0 ## 0x0 194 .section __TEXT,__text,regular,pure_instructions 195 .globl __ZN4Simd4Avx213Yuv444pToBgraEPKhmS2_mS2_mmmPhmh 196 .align 4, 0x90 197 __ZN4Simd4Avx213Yuv444pToBgraEPKhmS2_mS2_mmmPhmh: ## @_ZN4Simd4Avx213Yuv444pToBgraEPKhmS2_mS2_mmmPhmh 198 ## BB#0: 199 push rbp 200 mov rbp, rsp 201 push r15 202 push r14 203 push r13 204 push r12 205 push rbx 206 and rsp, -32 207 sub rsp, 192 208 mov qword ptr [rsp + 56], r9 ## 8-byte Spill 209 mov r9b, byte ptr [rbp + 48] 210 mov r15, qword ptr [rbp + 40] 211 mov r13, qword ptr [rbp + 32] 212 mov r10, qword ptr [rbp + 16] 213 mov rbx, rsi 214 and rbx, -32 215 cmp rbx, rsi 216 jne LBB0_14 217 ### BB#1: 218 mov rbx, rdi 219 cmp rbx, r13 220 jne LBB0_14 221 ## BB#8: 222 movzx eax, r9b 223 cmp qword ptr [rbp + 24], 0 224 je LBB0_20 225 ## BB#9: ## %.preheader.lr.ph.i.1 226 vinserti128 ymm14, ymm0, xmm0, 1 227 vmovdqu ymmword ptr [r13 + r9 + 96], ymm0 228 LBB0_13: ## in Loop: Header=BB0_10 Depth=1 229 add rdi, rsi 230 jb LBB0_22 231 LBB0_17: ## %._crit_edge.i 232 cmp rbx, qword ptr [rbp + 16] 233 cmp r11, qword ptr [rbp + 24] 234 jne LBB0_16 235 LBB0_20: ## %_ZN4Simd4Avx213Yuv444pToBgraILb1EEEvPKhmS3_mS3_mmmPhmh.exit 236 lea rsp, [rbp - 40] 237 pop rbx 238 pop r12 239 pop r13 240 pop r14 241 pop r15 242 pop rbp 243 vzeroupper 244 ret 245 246 .section __TEXT,__const 247 .align 5 248 LCPI1_0: 249 .byte 0 ## 0x0 250 .space 1 251 .space 1 252 .space 1 253 LCPI1_13: 254 .space 32 255 .section __TEXT,__text,regular,pure_instructions 256 .globl __ZN4Simd4Avx213Yuv420pToBgraEPKhmS2_mS2_mmmPhmh 257 .align 4, 0x90 258 __ZN4Simd4Avx213Yuv420pToBgraEPKhmS2_mS2_mmmPhmh: ## @_ZN4Simd4Avx213Yuv420pToBgraEPKhmS2_mS2_mmmPhmh 259 ## BB#0: 260 push rbp 261 mov rbp, rsp 262 push r15 263 push r14 264 push r13 265 push r12 266 push rbx 267 and rsp, -32 268 sub rsp, 864 269 mov qword ptr [rsp + 144], r9 ## 8-byte Spill 270 mov qword ptr [rsp + 152], rcx ## 8-byte Spill 271 xor r12d, r12d 272 .align 4, 0x90 273 LBB1_12: ## %.lr.ph.i.18 274 ## Parent Loop BB1_10 Depth=1 275 cmp r15, r11 276 jb LBB1_12 277 LBB1_13: ## %._crit_edge.i.8 278 ## in Loop: Header=BB1_10 Depth=1 279 vmovdqa ymm7, ymm10 280 vmovdqu ymmword ptr [rax + rsi + 224], ymm0 281 LBB1_15: ## in Loop: Header=BB1_10 Depth=1 282 add rdi, qword ptr [rsp + 192] ## 8-byte Folded Reload 283 vmovdqa ymm7, ymmword ptr [rip + LCPI1_7] ## ymm7 = <u,u,u,u,1,1,1,1,u,u,u,u,1,1,1,1> 284 .align 4, 0x90 285 LBB1_18: ## %.preheader.i 286 ## =>This Loop Header: Depth=1 287 cmp rsi, rbx 288 jb LBB1_23 289 LBB1_19: ## %._crit_edge.i 290 ## in Loop: Header=BB1_18 Depth=1 291 vmovdqu ymmword ptr [rax + rsi + 224], ymm0 292 LBB1_21: ## in Loop: Header=BB1_18 Depth=1 293 add rdi, qword ptr [rsp + 96] ## 8-byte Folded Reload 294 jb LBB1_18 295 LBB1_22: ## %_ZN4Simd4Avx213Yuv420pToBgraILb1EEEvPKhmS3_mS3_mmmPhmh.exit 296 lea rsp, [rbp - 40] 297 pop rbx 298 pop r12 299 pop r13 300 pop r14 301 pop r15 302 pop rbp 303 vzeroupper 304 ret 305 306 .section __TEXT,__const 307 .align 5 308 LCPI2_0: 309 .byte 0 ## 0x0 310 .byte 2 ## 0x2 311 .byte 15 ## 0xf 312 LCPI2_12: 313 .space 1 314 .space 1 315 .section __TEXT,__text,regular,pure_instructions 316 .globl __ZN4Simd4Avx213Yuv422pToBgraEPKhmS2_mS2_mmmPhmh 317 .align 4, 0x90 318 __ZN4Simd4Avx213Yuv422pToBgraEPKhmS2_mS2_mmmPhmh: ## @_ZN4Simd4Avx213Yuv422pToBgraEPKhmS2_mS2_mmmPhmh 319 ## BB#0: 320 push rbp 321 mov rbp, rsp 322 push r15 323 push r14 324 push r13 325 push r12 326 push rbx 327 and rsp, -32 328 sub rsp, 416 329 mov qword ptr [rsp + 184], rcx ## 8-byte Spill 330 mov qword ptr [rsp + 176], rsi ## 8-byte Spill 331 mov cl, byte ptr [rbp + 48] 332 mov r12, qword ptr [rbp + 40] 333 mov rax, qword ptr [rbp + 32] 334 mov r10, qword ptr [rbp + 16] 335 jne LBB2_14 336 ## BB#1: 337 mov rsi, rdi 338 jne LBB2_14 339 ## BB#8: 340 movzx ecx, cl 341 cmp qword ptr [rbp + 24], 0 342 mov rcx, r9 343 je LBB2_20 344 ## BB#9: ## %.preheader.lr.ph.i.1 345 vinserti128 ymm12, ymm0, xmm0, 1 346 .align 4, 0x90 347 LBB2_10: ## %.preheader.i.7 348 .align 4, 0x90 349 LBB2_21: ## %.lr.ph.i.16 350 jb LBB2_21 351 LBB2_11: ## %._crit_edge.i.8 352 je LBB2_13 353 ## BB#12: ## in Loop: Header=BB2_10 Depth=1 354 vmovdqa ymm15, ymm9 355 vmovdqu ymmword ptr [rax + r15 + 224], ymm0 356 LBB2_13: ## in Loop: Header=BB2_10 Depth=1 357 add rdi, qword ptr [rsp + 176] ## 8-byte Folded Reload 358 jmp LBB2_20 359 LBB2_14: 360 mov qword ptr [rsp + 168], r9 ## 8-byte Spill 361 je LBB2_20 362 ## BB#15: ## %.preheader.lr.ph.i 363 vinserti128 ymm0, ymm0, xmm0, 1 364 .align 4, 0x90 365 LBB2_16: ## %.preheader.i 366 ## =>This Loop Header: Depth=1 367 je LBB2_17 368 .align 4, 0x90 369 LBB2_22: ## %.lr.ph.i 370 cmp r15, rbx 371 jb LBB2_22 372 LBB2_17: ## %._crit_edge.i 373 ## in Loop: Header=BB2_16 Depth=1 374 cmp rbx, qword ptr [rbp + 16] 375 je LBB2_19 376 ## BB#18: ## in Loop: Header=BB2_16 Depth=1 377 vpermq ymm1, ymmword ptr [rdx + rsi], 216 ## ymm1 = mem[0,2,1,3] 378 vmovdqu ymmword ptr [rax + r13 + 224], ymm0 379 LBB2_19: ## in Loop: Header=BB2_16 Depth=1 380 add rdi, qword ptr [rsp + 176] ## 8-byte Folded Reload 381 jne LBB2_16 382 LBB2_20: ## %_ZN4Simd4Avx213Yuv422pToBgraILb1EEEvPKhmS3_mS3_mmmPhmh.exit 383 lea rsp, [rbp - 40] 384 pop rbx 385 pop r12 386 pop r13 387 pop r14 388 pop r15 389 pop rbp 390 vzeroupper 391 ret 392 393 .subsections_via_symbols`, "\n") 394 395 subroutine2 := []Subroutine{} 396 subroutine2 = append(subroutine2, Subroutine{name: "SimdAvx2Yuv444pToBgra", body: src2[23:60]}) 397 subroutine2 = append(subroutine2, Subroutine{name: "SimdAvx2Yuv420pToBgra", body: src2[84:120]}) 398 subroutine2 = append(subroutine2, Subroutine{name: "SimdAvx2Yuv422pToBgra", body: src2[144:207]}) 399 400 testSubroutine(t, src2, subroutine2) 401 402 src3 := strings.Split(` .globl __ZN4Simd4Avx214MultiplyAndAddEPfS1_S1_S1_ 403 .align 4, 0x90 404 __ZN4Simd4Avx214MultiplyAndAddEPfS1_S1_S1_: ## @_ZN4Simd4Avx214MultiplyAndAddEPfS1_S1_S1_ 405 ## BB#0: 406 push rbp 407 mov rbp, rsp 408 vmovups ymm0, ymmword ptr [rdi] 409 vmovups ymm1, ymmword ptr [rsi] 410 vfmadd213ps ymm1, ymm0, ymmword ptr [rdx] 411 vmovups ymmword ptr [rcx], ymm1 412 pop rbp 413 vzeroupper 414 ret 415 416 .subsections_via_symbols`, "\n") 417 418 subroutine3 := []Subroutine{} 419 subroutine3 = append(subroutine3, Subroutine{name: "SimdAvx2MultiplyAndAdd", body: src3[6:13]}) 420 421 testSubroutine(t, src3, subroutine3) 422 423 src4 := strings.Split(` .section __TEXT,__text,regular,pure_instructions 424 .macosx_version_min 10, 11 425 .intel_syntax noprefix 426 .globl __Z22MultiplyAndAddConstantPfS_S_ 427 .align 4, 0x90 428 __Z22MultiplyAndAddConstantPfS_S_: ## @_Z22MultiplyAndAddConstantPfS_S_ 429 ## BB#0: 430 push rbp 431 mov rbp, rsp 432 vmovups ymm0, ymmword ptr [rdi] 433 vmovups ymm1, ymmword ptr [rsi] 434 vfmadd213ps ymm1, ymm0, ymmword ptr [rip + __ZL1a] 435 vmovups ymmword ptr [rdx], ymm1 436 pop rbp 437 vzeroupper 438 ret 439 440 .section __DATA,__data 441 .align 5 ## @_ZL1a 442 __ZL1a: 443 .long 1065353216 ## float 1.000000e+00 444 .long 1073741824 ## float 2.000000e+00 445 .long 1077936128 ## float 3.000000e+00 446 .long 1082130432 ## float 4.000000e+00 447 .long 1084227584 ## float 5.000000e+00 448 .long 1086324736 ## float 6.000000e+00 449 .long 1088421888 ## float 7.000000e+00 450 .long 1090519040 ## float 8.000000e+00 451 `, "\n") 452 453 subroutine4 := []Subroutine{} 454 subroutine4 = append(subroutine4, Subroutine{name: "MultiplyAndAddConstant", body: src4[9:16]}) 455 456 testSubroutine(t, src4, subroutine4) 457 458 subroutine5 := []Subroutine{} 459 subroutine5 = append(subroutine5, Subroutine{name: "SimdSse2BgraToYuv420p", body: srcOsx[43:53]}) 460 subroutine5 = append(subroutine5, Subroutine{name: "SimdSse2BgraToYuv422p", body: srcOsx[94:103]}) 461 subroutine5 = append(subroutine5, Subroutine{name: "SimdSse2BgraToYuv444p", body: srcOsx[142:151]}) 462 463 testSubroutine(t, srcOsx, subroutine5) 464 465 subroutine6 := []Subroutine{} 466 subroutine6 = append(subroutine6, Subroutine{name: "SimdSse2BgraToYuv420p", body: srcClang[41:51]}) 467 subroutine6 = append(subroutine6, Subroutine{name: "SimdSse2BgraToYuv422p", body: srcClang[94:103]}) 468 subroutine6 = append(subroutine6, Subroutine{name: "SimdSse2BgraToYuv444p", body: srcClang[144:153]}) 469 470 testSubroutine(t, srcClang, subroutine6) 471 472 subroutine7 := []Subroutine{} 473 subroutine7 = append(subroutine7, Subroutine{name: "SimdSse2Bgr48pToBgra32", body: srcRetInMiddle[36:291]}) 474 475 testSubroutine(t, srcRetInMiddle, subroutine7) 476 477 disabledForTesting = false 478 subroutine8 := []Subroutine{} 479 subroutine8 = append(subroutine8, Subroutine{name: "sample_sum_sse4_2", body: srcLabelHasSpecialComment[11:113]}) 480 481 testSubroutine(t, srcLabelHasSpecialComment, subroutine8) 482 } 483 484 var srcClang = strings.Split(` .text 485 .intel_syntax noprefix 486 .section .rodata.cst16,"aM",@progbits,16 487 .align 16 488 .LCPI0_0: 489 .byte 255 # 0xff 490 .LCPI0_1: 491 .byte 255 # 0xff 492 .byte 0 # 0x0 493 .LCPI0_2: 494 .quad 281474976776192 # 0x1000000010000 495 .LCPI0_3: 496 .short 1606 # 0x646 497 .short 4211 # 0x1073 498 .LCPI0_4: 499 .short 8258 # 0x2042 500 .LCPI0_5: 501 .short 16 # 0x10 502 .LCPI0_6: 503 .short 2 # 0x2 504 .LCPI0_7: 505 .short 7193 # 0x1c19 506 .LCPI0_8: 507 .short 60768 # 0xed60 508 .LCPI0_9: 509 .short 128 # 0x80 510 .LCPI0_10: 511 .short 64373 # 0xfb75 512 .LCPI0_11: 513 .short 59507 # 0xe873 514 .LCPI0_12: 515 .zero 16 516 .text 517 .globl _ZN4Simd4Sse213BgraToYuv420pEPKhmmmPhmS3_mS3_m 518 .align 16, 0x90 519 .type _ZN4Simd4Sse213BgraToYuv420pEPKhmmmPhmS3_mS3_m,@function 520 _ZN4Simd4Sse213BgraToYuv420pEPKhmmmPhmS3_mS3_m: # @_ZN4Simd4Sse213BgraToYuv420pEPKhmmmPhmS3_mS3_m 521 # BB#0: 522 push rbp 523 push r15 524 push r14 525 526 .LBB0_24: # %_ZN4Simd4Sse213BgraToYuv420pILb1EEEvPKhmmmPhmS4_mS4_m.exit 527 add rsp, 136 528 pop rbx 529 pop r12 530 pop r13 531 pop r14 532 pop r15 533 pop rbp 534 ret 535 .Lfunc_end0: 536 .size _ZN4Simd4Sse213BgraToYuv420pEPKhmmmPhmS3_mS3_m, .Lfunc_end0-_ZN4Simd4Sse213BgraToYuv420pEPKhmmmPhmS3_mS3_m 537 538 .section .rodata.cst16,"aM",@progbits,16 539 .align 16 540 .LCPI1_0: 541 .byte 255 # 0xff 542 .LCPI1_1: 543 .byte 0 # 0x0 544 .LCPI1_2: 545 .quad 281474976776192 # 0x1000000010000 546 .LCPI1_3: 547 .short 4211 # 0x1073 548 .LCPI1_4: 549 .short 8192 # 0x2000 550 .LCPI1_5: 551 .short 16 # 0x10 552 .LCPI1_6: 553 .short 1 # 0x1 554 .LCPI1_7: 555 .short 7193 # 0x1c19 556 .LCPI1_8: 557 .short 60768 # 0xed60 558 .LCPI1_9: 559 .short 128 # 0x80 560 .LCPI1_10: 561 .short 64373 # 0xfb75 562 .LCPI1_11: 563 .short 59507 # 0xe873 564 .LCPI1_12: 565 .zero 16 566 .text 567 .globl _ZN4Simd4Sse213BgraToYuv422pEPKhmmmPhmS3_mS3_m 568 .align 16, 0x90 569 .type _ZN4Simd4Sse213BgraToYuv422pEPKhmmmPhmS3_mS3_m,@function 570 _ZN4Simd4Sse213BgraToYuv422pEPKhmmmPhmS3_mS3_m: # @_ZN4Simd4Sse213BgraToYuv422pEPKhmmmPhmS3_mS3_m 571 # BB#0: 572 push rbp 573 push r15 574 push r14 575 push r13 576 push r12 577 push rbx 578 579 .LBB1_20: # %_ZN4Simd4Sse213BgraToYuv422pILb1EEEvPKhmmmPhmS4_mS4_m.exit 580 pop rbx 581 pop r12 582 pop r13 583 pop r14 584 pop r15 585 pop rbp 586 ret 587 .Lfunc_end1: 588 .size _ZN4Simd4Sse213BgraToYuv422pEPKhmmmPhmS3_mS3_m, .Lfunc_end1-_ZN4Simd4Sse213BgraToYuv422pEPKhmmmPhmS3_mS3_m 589 590 .section .rodata.cst16,"aM",@progbits,16 591 .align 16 592 .LCPI2_0: 593 .byte 0 # 0x0 594 .LCPI2_1: 595 .byte 255 # 0xff 596 .LCPI2_2: 597 .quad 281474976776192 # 0x1000000010000 598 .LCPI2_3: 599 .short 1606 # 0x646 600 .LCPI2_4: 601 .short 8258 # 0x2042 602 .LCPI2_5: 603 .short 16 # 0x10 604 .LCPI2_6: 605 .short 7193 # 0x1c19 606 .LCPI2_7: 607 .short 60768 # 0xed60 608 .LCPI2_8: 609 .short 128 # 0x80 610 .LCPI2_9: 611 .short 64373 # 0xfb75 612 .LCPI2_10: 613 .short 59507 # 0xe873 614 .LCPI2_11: 615 .zero 16 616 .text 617 .globl _ZN4Simd4Sse213BgraToYuv444pEPKhmmmPhmS3_mS3_m 618 .align 16, 0x90 619 .type _ZN4Simd4Sse213BgraToYuv444pEPKhmmmPhmS3_mS3_m,@function 620 _ZN4Simd4Sse213BgraToYuv444pEPKhmmmPhmS3_mS3_m: # @_ZN4Simd4Sse213BgraToYuv444pEPKhmmmPhmS3_mS3_m 621 # BB#0: 622 push rbp 623 push r15 624 push r14 625 push r13 626 push r12 627 push rbx 628 629 .LBB2_20: # %_ZN4Simd4Sse213BgraToYuv444pILb1EEEvPKhmmmPhmS4_mS4_m.exit 630 pop rbx 631 pop r12 632 pop r13 633 pop r14 634 pop r15 635 pop rbp 636 ret 637 .Lfunc_end2: 638 .size _ZN4Simd4Sse213BgraToYuv444pEPKhmmmPhmS3_mS3_m, .Lfunc_end2-_ZN4Simd4Sse213BgraToYuv444pEPKhmmmPhmS3_mS3_m 639 640 641 .ident "clang version 3.8.0-2ubuntu4 (tags/RELEASE_380/final)" 642 .section ".note.GNU-stack","",@progbits`, "\n") 643 644 var srcOsx = strings.Split(` .section __TEXT,__text,regular,pure_instructions 645 .macosx_version_min 10, 11 646 .intel_syntax noprefix 647 .section __TEXT,__literal16,16byte_literals 648 .align 4 649 LCPI0_0: 650 .byte 255 ## 0xff 651 LCPI0_1: 652 .byte 1 ## 0x1 653 LCPI0_2: 654 .quad 281474976776192 ## 0x1000000010000 655 LCPI0_3: 656 .short 1606 ## 0x646 657 LCPI0_4: 658 .short 8258 ## 0x2042 659 LCPI0_5: 660 .short 16 ## 0x10 661 LCPI0_6: 662 .short 2 ## 0x2 663 LCPI0_7: 664 .short 7193 ## 0x1c19 665 LCPI0_8: 666 .short 60768 ## 0xed60 667 LCPI0_9: 668 .short 128 ## 0x80 669 LCPI0_10: 670 .short 64373 ## 0xfb75 671 LCPI0_11: 672 .short 59507 ## 0xe873 673 LCPI0_12: 674 .space 16 675 .section __TEXT,__text,regular,pure_instructions 676 .globl __ZN4Simd4Sse213BgraToYuv420pEPKhmmmPhmS3_mS3_m 677 .align 4, 0x90 678 __ZN4Simd4Sse213BgraToYuv420pEPKhmmmPhmS3_mS3_m: ## @_ZN4Simd4Sse213BgraToYuv420pEPKhmmmPhmS3_mS3_m 679 ## BB#0: 680 push rbp 681 mov rbp, rsp 682 push r15 683 push r14 684 push r13 685 push r12 686 push rbx 687 688 LBB0_24: ## %_ZN4Simd4Sse213BgraToYuv420pILb1EEEvPKhmmmPhmS4_mS4_m.exit 689 add rsp, 88 690 pop rbx 691 pop r12 692 pop r13 693 pop r14 694 pop r15 695 pop rbp 696 ret 697 698 .section __TEXT,__literal16,16byte_literals 699 .align 4 700 LCPI1_0: 701 .byte 255 ## 0xff 702 LCPI1_1: 703 .byte 1 ## 0x1 704 LCPI1_2: 705 .quad 281474976776192 ## 0x1000000010000 706 LCPI1_3: 707 .short 1606 ## 0x646 708 LCPI1_4: 709 .short 8258 ## 0x2042 710 LCPI1_5: 711 .short 16 ## 0x10 712 LCPI1_6: 713 .short 1 ## 0x1 714 LCPI1_7: 715 .short 7193 ## 0x1c19 716 LCPI1_8: 717 .short 60768 ## 0xed60 718 LCPI1_9: 719 .short 128 ## 0x80 720 LCPI1_10: 721 .short 64373 ## 0xfb75 722 LCPI1_11: 723 .short 59507 ## 0xe873 724 LCPI1_12: 725 .space 16 726 .section __TEXT,__text,regular,pure_instructions 727 .globl __ZN4Simd4Sse213BgraToYuv422pEPKhmmmPhmS3_mS3_m 728 .align 4, 0x90 729 __ZN4Simd4Sse213BgraToYuv422pEPKhmmmPhmS3_mS3_m: ## @_ZN4Simd4Sse213BgraToYuv422pEPKhmmmPhmS3_mS3_m 730 ## BB#0: 731 push rbp 732 mov rbp, rsp 733 push r15 734 push r14 735 push r13 736 push r12 737 push rbx 738 739 LBB1_20: ## %_ZN4Simd4Sse213BgraToYuv422pILb1EEEvPKhmmmPhmS4_mS4_m.exit 740 pop rbx 741 pop r12 742 pop r13 743 pop r14 744 pop r15 745 pop rbp 746 ret 747 748 .section __TEXT,__literal16,16byte_literals 749 .align 4 750 LCPI2_0: 751 .byte 255 ## 0xff 752 LCPI2_1: 753 .byte 1 ## 0x1 754 LCPI2_2: 755 .quad 281474976776192 ## 0x1000000010000 756 LCPI2_3: 757 .short 1606 ## 0x646 758 LCPI2_4: 759 .short 8258 ## 0x2042 760 LCPI2_5: 761 .short 16 ## 0x10 762 LCPI2_6: 763 .short 7193 ## 0x1c19 764 LCPI2_7: 765 .short 60768 ## 0xed60 766 LCPI2_8: 767 .short 128 ## 0x80 768 LCPI2_9: 769 .short 64373 ## 0xfb75 770 LCPI2_10: 771 .short 59507 ## 0xe873 772 LCPI2_11: 773 .space 16 774 .section __TEXT,__text,regular,pure_instructions 775 .globl __ZN4Simd4Sse213BgraToYuv444pEPKhmmmPhmS3_mS3_m 776 .align 4, 0x90 777 __ZN4Simd4Sse213BgraToYuv444pEPKhmmmPhmS3_mS3_m: ## @_ZN4Simd4Sse213BgraToYuv444pEPKhmmmPhmS3_mS3_m 778 ## BB#0: 779 push rbp 780 mov rbp, rsp 781 push r15 782 push r14 783 push r13 784 push r12 785 push rbx 786 787 LBB2_20: ## %_ZN4Simd4Sse213BgraToYuv444pILb1EEEvPKhmmmPhmS4_mS4_m.exit 788 pop rbx 789 pop r12 790 pop r13 791 pop r14 792 pop r15 793 pop rbp 794 ret 795 796 797 .subsections_via_symbols`, "\n") 798 799 var srcRetInMiddle = strings.Split(`.text 800 .intel_syntax noprefix 801 .file "/home/harsha/repos/Simd/src/Simd/SimdSse2BgrToBgra.cpp" 802 .section .rodata.cst16,"aM",@progbits,16 803 .align 16 804 .LCPI0_0: 805 .byte 255 # 0xff 806 .byte 0 # 0x0 807 .byte 255 # 0xff 808 .byte 0 # 0x0 809 .byte 255 # 0xff 810 .byte 0 # 0x0 811 .byte 255 # 0xff 812 .byte 0 # 0x0 813 .byte 255 # 0xff 814 .byte 0 # 0x0 815 .byte 255 # 0xff 816 .byte 0 # 0x0 817 .byte 255 # 0xff 818 .byte 0 # 0x0 819 .byte 255 # 0xff 820 .byte 0 # 0x0 821 .text 822 .globl _ZN4Simd4Sse214Bgr48pToBgra32EPKhmmmS2_mS2_mPhmh 823 .align 16, 0x90 824 .type _ZN4Simd4Sse214Bgr48pToBgra32EPKhmmmS2_mS2_mPhmh,@function 825 _ZN4Simd4Sse214Bgr48pToBgra32EPKhmmmS2_mS2_mPhmh: # @_ZN4Simd4Sse214Bgr48pToBgra32EPKhmmmS2_mS2_mPhmh 826 # BB#0: 827 push rbp 828 mov rbp, rsp 829 push r15 830 push r14 831 push r13 832 push r12 833 push rbx 834 and rsp, -8 835 mov r14b, byte ptr [rbp + 48] 836 mov r10, qword ptr [rbp + 40] 837 mov rax, qword ptr [rbp + 32] 838 mov r11, qword ptr [rbp + 24] 839 mov r12, qword ptr [rbp + 16] 840 mov rbx, rsi 841 and rbx, -16 842 cmp rbx, rsi 843 jne .LBB0_22 844 # BB#1: 845 mov rbx, rdi 846 and rbx, -16 847 cmp rbx, rdi 848 jne .LBB0_22 849 # BB#2: 850 mov rbx, r9 851 and rbx, -16 852 cmp rbx, r9 853 jne .LBB0_22 854 # BB#3: 855 mov rbx, r8 856 and rbx, -16 857 cmp rbx, r8 858 jne .LBB0_22 859 # BB#4: 860 mov rbx, r11 861 and rbx, -16 862 cmp rbx, r11 863 jne .LBB0_22 864 # BB#5: 865 mov rbx, r12 866 and rbx, -16 867 cmp rbx, r12 868 jne .LBB0_22 869 # BB#6: 870 mov rbx, r10 871 and rbx, -16 872 cmp rbx, r10 873 jne .LBB0_22 874 # BB#7: 875 mov rbx, rax 876 and rbx, -16 877 cmp rbx, rax 878 jne .LBB0_22 879 # BB#8: 880 test rcx, rcx 881 je .LBB0_36 882 # BB#9: # %.preheader.lr.ph.i1 883 movzx ebx, r14b 884 shl ebx, 8 885 pxor xmm0, xmm0 886 pinsrw xmm0, ebx, 0 887 pinsrw xmm0, ebx, 1 888 pinsrw xmm0, ebx, 2 889 pinsrw xmm0, ebx, 3 890 pinsrw xmm0, ebx, 4 891 pinsrw xmm0, ebx, 5 892 pinsrw xmm0, ebx, 6 893 pinsrw xmm0, ebx, 7 894 mov r15, rdx 895 and r15, -8 896 je .LBB0_19 897 # BB#10: # %.lr.ph.us.i16.preheader 898 xor r14d, r14d 899 cmp r15, rdx 900 jne .LBB0_15 901 # BB#11: 902 movdqa xmm1, xmmword ptr [rip + .LCPI0_0] # xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 903 .align 16, 0x90 904 .LBB0_12: # %.lr.ph.us.i16.us 905 # =>This Loop Header: Depth=1 906 # Child Loop BB0_13 Depth 2 907 xor ebx, ebx 908 .align 16, 0x90 909 .LBB0_13: # Parent Loop BB0_12 Depth=1 910 # => This Inner Loop Header: Depth=2 911 movdqa xmm2, xmmword ptr [rdi + 2*rbx] 912 pand xmm2, xmm1 913 movdqa xmm3, xmmword ptr [r8 + 2*rbx] 914 pand xmm3, xmm1 915 movdqa xmm4, xmmword ptr [r12 + 2*rbx] 916 pand xmm4, xmm1 917 pslldq xmm3, 1 # xmm3 = zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] 918 por xmm3, xmm2 919 por xmm4, xmm0 920 movdqa xmm2, xmm3 921 punpcklwd xmm2, xmm4 # xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] 922 movdqa xmmword ptr [rax + 4*rbx], xmm2 923 punpckhwd xmm3, xmm4 # xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 924 movdqa xmmword ptr [rax + 4*rbx + 16], xmm3 925 add rbx, 8 926 cmp rbx, rdx 927 jb .LBB0_13 928 # BB#14: # %._crit_edge.us.i17.us 929 # in Loop: Header=BB0_12 Depth=1 930 add rdi, rsi 931 add r8, r9 932 add r12, r11 933 add rax, r10 934 inc r14 935 cmp r14, rcx 936 jne .LBB0_12 937 jmp .LBB0_36 938 .LBB0_22: 939 test rcx, rcx 940 je .LBB0_36 941 # BB#23: # %.preheader.lr.ph.i 942 movzx ebx, r14b 943 shl ebx, 8 944 pxor xmm0, xmm0 945 pinsrw xmm0, ebx, 0 946 pinsrw xmm0, ebx, 1 947 pinsrw xmm0, ebx, 2 948 pinsrw xmm0, ebx, 3 949 pinsrw xmm0, ebx, 4 950 pinsrw xmm0, ebx, 5 951 pinsrw xmm0, ebx, 6 952 pinsrw xmm0, ebx, 7 953 mov r13, rdx 954 and r13, -8 955 je .LBB0_29 956 # BB#24: # %.lr.ph.us.i.preheader 957 xor r14d, r14d 958 cmp r13, rdx 959 jne .LBB0_32 960 # BB#25: 961 movdqa xmm1, xmmword ptr [rip + .LCPI0_0] # xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 962 .align 16, 0x90 963 .LBB0_26: # %.lr.ph.us.i.us 964 # =>This Loop Header: Depth=1 965 # Child Loop BB0_27 Depth 2 966 xor ebx, ebx 967 .align 16, 0x90 968 .LBB0_27: # Parent Loop BB0_26 Depth=1 969 # => This Inner Loop Header: Depth=2 970 movdqu xmm2, xmmword ptr [rdi + 2*rbx] 971 pand xmm2, xmm1 972 movdqu xmm3, xmmword ptr [r8 + 2*rbx] 973 pand xmm3, xmm1 974 movdqu xmm4, xmmword ptr [r12 + 2*rbx] 975 pand xmm4, xmm1 976 pslldq xmm3, 1 # xmm3 = zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] 977 por xmm3, xmm2 978 por xmm4, xmm0 979 movdqa xmm2, xmm3 980 punpcklwd xmm2, xmm4 # xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] 981 movdqu xmmword ptr [rax + 4*rbx], xmm2 982 punpckhwd xmm3, xmm4 # xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 983 movdqu xmmword ptr [rax + 4*rbx + 16], xmm3 984 add rbx, 8 985 cmp rbx, rdx 986 jb .LBB0_27 987 # BB#28: # %._crit_edge.us.i.us 988 # in Loop: Header=BB0_26 Depth=1 989 add rdi, rsi 990 add r8, r9 991 add r12, r11 992 add rax, r10 993 inc r14 994 cmp r14, rcx 995 jne .LBB0_26 996 jmp .LBB0_36 997 .LBB0_29: # %.preheader.i.preheader 998 cmp r13, rdx 999 je .LBB0_36 1000 # BB#30: # %.preheader.i.preheader65 1001 lea r14, [rdi + 2*rdx - 16] 1002 lea rbx, [r8 + 2*rdx - 16] 1003 lea rdi, [r12 + 2*rdx - 16] 1004 lea rax, [rax + 4*rdx - 16] 1005 movdqa xmm1, xmmword ptr [rip + .LCPI0_0] # xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 1006 .align 16, 0x90 1007 .LBB0_31: # %.preheader.i 1008 # =>This Inner Loop Header: Depth=1 1009 movdqu xmm2, xmmword ptr [r14] 1010 pand xmm2, xmm1 1011 movdqu xmm3, xmmword ptr [rbx] 1012 pand xmm3, xmm1 1013 movdqu xmm4, xmmword ptr [rdi] 1014 pand xmm4, xmm1 1015 pslldq xmm3, 1 # xmm3 = zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] 1016 por xmm3, xmm2 1017 por xmm4, xmm0 1018 movdqa xmm2, xmm3 1019 punpcklwd xmm2, xmm4 # xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] 1020 movdqu xmmword ptr [rax - 16], xmm2 1021 punpckhwd xmm3, xmm4 # xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 1022 movdqu xmmword ptr [rax], xmm3 1023 add r14, rsi 1024 add rbx, r9 1025 add rdi, r11 1026 add rax, r10 1027 dec rcx 1028 jne .LBB0_31 1029 jmp .LBB0_36 1030 .LBB0_32: 1031 lea r15, [4*rdx - 32] 1032 movdqa xmm1, xmmword ptr [rip + .LCPI0_0] # xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 1033 .align 16, 0x90 1034 .LBB0_33: # %.lr.ph.us.i 1035 # =>This Loop Header: Depth=1 1036 # Child Loop BB0_34 Depth 2 1037 xor ebx, ebx 1038 .align 16, 0x90 1039 .LBB0_34: # Parent Loop BB0_33 Depth=1 1040 # => This Inner Loop Header: Depth=2 1041 movdqu xmm2, xmmword ptr [rdi + 2*rbx] 1042 pand xmm2, xmm1 1043 movdqu xmm3, xmmword ptr [r8 + 2*rbx] 1044 pand xmm3, xmm1 1045 movdqu xmm4, xmmword ptr [r12 + 2*rbx] 1046 pand xmm4, xmm1 1047 pslldq xmm3, 1 # xmm3 = zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] 1048 por xmm3, xmm2 1049 por xmm4, xmm0 1050 movdqa xmm2, xmm3 1051 punpcklwd xmm2, xmm4 # xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] 1052 movdqu xmmword ptr [rax + 4*rbx], xmm2 1053 punpckhwd xmm3, xmm4 # xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 1054 movdqu xmmword ptr [rax + 4*rbx + 16], xmm3 1055 add rbx, 8 1056 cmp rbx, r13 1057 jb .LBB0_34 1058 # BB#35: # %._crit_edge.us.i 1059 # in Loop: Header=BB0_33 Depth=1 1060 movdqu xmm2, xmmword ptr [rdi + 2*rdx - 16] 1061 pand xmm2, xmm1 1062 movdqu xmm3, xmmword ptr [r8 + 2*rdx - 16] 1063 pand xmm3, xmm1 1064 movdqu xmm4, xmmword ptr [r12 + 2*rdx - 16] 1065 pand xmm4, xmm1 1066 pslldq xmm3, 1 # xmm3 = zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] 1067 por xmm3, xmm2 1068 por xmm4, xmm0 1069 movdqa xmm2, xmm3 1070 punpcklwd xmm2, xmm4 # xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] 1071 movdqu xmmword ptr [rax + r15], xmm2 1072 punpckhwd xmm3, xmm4 # xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 1073 movdqu xmmword ptr [rax + r15 + 16], xmm3 1074 add rdi, rsi 1075 add r8, r9 1076 add r12, r11 1077 add rax, r10 1078 inc r14 1079 cmp r14, rcx 1080 jne .LBB0_33 1081 .LBB0_36: # %_ZN4Simd4Sse214Bgr48pToBgra32ILb1EEEvPKhmmmS3_mS3_mPhmh.exit 1082 lea rsp, [rbp - 40] 1083 pop rbx 1084 pop r12 1085 pop r13 1086 pop r14 1087 pop r15 1088 pop rbp 1089 ret 1090 .LBB0_19: # %.preheader.lr.ph.split.i 1091 cmp r15, rdx 1092 je .LBB0_36 1093 # BB#20: # %.preheader.i26.preheader 1094 lea r14, [rdi + 2*rdx - 16] 1095 lea rbx, [r8 + 2*rdx - 16] 1096 lea rdi, [r12 + 2*rdx - 16] 1097 lea rax, [rax + 4*rdx - 16] 1098 movdqa xmm1, xmmword ptr [rip + .LCPI0_0] # xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 1099 .align 16, 0x90 1100 .LBB0_21: # %.preheader.i26 1101 # =>This Inner Loop Header: Depth=1 1102 movdqu xmm2, xmmword ptr [r14] 1103 pand xmm2, xmm1 1104 movdqu xmm3, xmmword ptr [rbx] 1105 pand xmm3, xmm1 1106 movdqu xmm4, xmmword ptr [rdi] 1107 pand xmm4, xmm1 1108 pslldq xmm3, 1 # xmm3 = zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] 1109 por xmm3, xmm2 1110 por xmm4, xmm0 1111 movdqa xmm2, xmm3 1112 punpcklwd xmm2, xmm4 # xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] 1113 movdqu xmmword ptr [rax - 16], xmm2 1114 punpckhwd xmm3, xmm4 # xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 1115 movdqu xmmword ptr [rax], xmm3 1116 add r14, rsi 1117 add rbx, r9 1118 add rdi, r11 1119 add rax, r10 1120 dec rcx 1121 jne .LBB0_21 1122 jmp .LBB0_36 1123 .LBB0_15: 1124 lea r13, [4*rdx - 32] 1125 movdqa xmm1, xmmword ptr [rip + .LCPI0_0] # xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 1126 .align 16, 0x90 1127 .LBB0_16: # %.lr.ph.us.i16 1128 # =>This Loop Header: Depth=1 1129 # Child Loop BB0_17 Depth 2 1130 xor ebx, ebx 1131 .align 16, 0x90 1132 .LBB0_17: # Parent Loop BB0_16 Depth=1 1133 # => This Inner Loop Header: Depth=2 1134 movdqa xmm2, xmmword ptr [rdi + 2*rbx] 1135 pand xmm2, xmm1 1136 movdqa xmm3, xmmword ptr [r8 + 2*rbx] 1137 pand xmm3, xmm1 1138 movdqa xmm4, xmmword ptr [r12 + 2*rbx] 1139 pand xmm4, xmm1 1140 pslldq xmm3, 1 # xmm3 = zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] 1141 por xmm3, xmm2 1142 por xmm4, xmm0 1143 movdqa xmm2, xmm3 1144 punpcklwd xmm2, xmm4 # xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] 1145 movdqa xmmword ptr [rax + 4*rbx], xmm2 1146 punpckhwd xmm3, xmm4 # xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 1147 movdqa xmmword ptr [rax + 4*rbx + 16], xmm3 1148 add rbx, 8 1149 cmp rbx, r15 1150 jb .LBB0_17 1151 # BB#18: # %._crit_edge.us.i17 1152 # in Loop: Header=BB0_16 Depth=1 1153 movdqu xmm2, xmmword ptr [rdi + 2*rdx - 16] 1154 pand xmm2, xmm1 1155 movdqu xmm3, xmmword ptr [r8 + 2*rdx - 16] 1156 pand xmm3, xmm1 1157 movdqu xmm4, xmmword ptr [r12 + 2*rdx - 16] 1158 pand xmm4, xmm1 1159 pslldq xmm3, 1 # xmm3 = zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] 1160 por xmm3, xmm2 1161 por xmm4, xmm0 1162 movdqa xmm2, xmm3 1163 punpcklwd xmm2, xmm4 # xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] 1164 movdqu xmmword ptr [rax + r13], xmm2 1165 punpckhwd xmm3, xmm4 # xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 1166 movdqu xmmword ptr [rax + r13 + 16], xmm3 1167 add rdi, rsi 1168 add r8, r9 1169 add r12, r11 1170 add rax, r10 1171 inc r14 1172 cmp r14, rcx 1173 jne .LBB0_16 1174 jmp .LBB0_36 1175 .Lfunc_end0: 1176 .size _ZN4Simd4Sse214Bgr48pToBgra32EPKhmmmS2_mS2_mPhmh, .Lfunc_end0-_ZN4Simd4Sse214Bgr48pToBgra32EPKhmmmS2_mS2_mPhmh 1177 1178 1179 .ident "clang version 3.8.0-2ubuntu4 (tags/RELEASE_380/final)" 1180 .section ".note.GNU-stack","",@progbits`, "\n") 1181 1182 var srcLabelHasSpecialComment = strings.Split(` .text 1183 .intel_syntax noprefix 1184 .file "sample.c" 1185 .globl sample_sum_sse4_2 # -- Begin function sample_sum_sse4_2 1186 .p2align 4, 0x90 1187 .type sample_sum_sse4_2,@function 1188 sample_sum_sse4_2: # @sample_sum_sse4_2 1189 # %bb.0: 1190 push rbp 1191 mov rbp, rsp 1192 and rsp, -8 1193 test rsi, rsi 1194 jle .LBB0_1 1195 # %bb.2: 1196 lea rcx, [rdi + 8*rsi] 1197 lea rax, [rdi + 8] 1198 cmp rcx, rax 1199 cmova rax, rcx 1200 mov r9, rdi 1201 not r9 1202 add r9, rax 1203 shr r9, 3 1204 add r9, 1 1205 cmp r9, 4 1206 jae .LBB0_4 1207 # %bb.3: 1208 xor eax, eax 1209 jmp .LBB0_13 1210 .LBB0_1: 1211 xor eax, eax 1212 .LBB0_14: 1213 mov rsp, rbp 1214 pop rbp 1215 ret 1216 .LBB0_4: 1217 mov r8, r9 1218 and r8, -4 1219 lea rsi, [r8 - 4] 1220 mov rdx, rsi 1221 shr rdx, 2 1222 add rdx, 1 1223 mov eax, edx 1224 and eax, 3 1225 cmp rsi, 12 1226 jae .LBB0_6 1227 # %bb.5: 1228 pxor xmm0, xmm0 1229 xor esi, esi 1230 pxor xmm1, xmm1 1231 test rax, rax 1232 jne .LBB0_9 1233 jmp .LBB0_11 1234 .LBB0_6: 1235 mov esi, 1 1236 sub rsi, rdx 1237 lea rdx, [rax + rsi] 1238 add rdx, -1 1239 pxor xmm0, xmm0 1240 xor esi, esi 1241 pxor xmm1, xmm1 1242 .p2align 4, 0x90 1243 .LBB0_7: # =>This Inner Loop Header: Depth=1 1244 movdqu xmm2, xmmword ptr [rdi + 8*rsi] 1245 paddq xmm2, xmm0 1246 movdqu xmm0, xmmword ptr [rdi + 8*rsi + 16] 1247 paddq xmm0, xmm1 1248 movdqu xmm1, xmmword ptr [rdi + 8*rsi + 32] 1249 movdqu xmm3, xmmword ptr [rdi + 8*rsi + 48] 1250 movdqu xmm4, xmmword ptr [rdi + 8*rsi + 64] 1251 paddq xmm4, xmm1 1252 paddq xmm4, xmm2 1253 movdqu xmm2, xmmword ptr [rdi + 8*rsi + 80] 1254 paddq xmm2, xmm3 1255 paddq xmm2, xmm0 1256 movdqu xmm0, xmmword ptr [rdi + 8*rsi + 96] 1257 paddq xmm0, xmm4 1258 movdqu xmm1, xmmword ptr [rdi + 8*rsi + 112] 1259 paddq xmm1, xmm2 1260 add rsi, 16 1261 add rdx, 4 1262 jne .LBB0_7 1263 # %bb.8: 1264 test rax, rax 1265 je .LBB0_11 1266 .LBB0_9: 1267 lea rdx, [rdi + 8*rsi] 1268 add rdx, 16 1269 neg rax 1270 .p2align 4, 0x90 1271 .LBB0_10: # =>This Inner Loop Header: Depth=1 1272 movdqu xmm2, xmmword ptr [rdx - 16] 1273 paddq xmm0, xmm2 1274 movdqu xmm2, xmmword ptr [rdx] 1275 paddq xmm1, xmm2 1276 add rdx, 32 1277 add rax, 1 1278 jne .LBB0_10 1279 .LBB0_11: 1280 paddq xmm0, xmm1 1281 pshufd xmm1, xmm0, 78 # xmm1 = xmm0[2,3,0,1] 1282 paddq xmm1, xmm0 1283 movq rax, xmm1 1284 cmp r9, r8 1285 je .LBB0_14 1286 # %bb.12: 1287 lea rdi, [rdi + 8*r8] 1288 .p2align 4, 0x90 1289 .LBB0_13: # =>This Inner Loop Header: Depth=1 1290 add rax, qword ptr [rdi] 1291 add rdi, 8 1292 cmp rdi, rcx 1293 jb .LBB0_13 1294 jmp .LBB0_14 1295 .Lfunc_end0: 1296 .size sample_sum_sse4_2, .Lfunc_end0-sample_sum_sse4_2 1297 # -- End function 1298 .section .rodata.cst16,"aM",@progbits,16 1299 .p2align 4 # -- Begin function sample_max_sse4_2 1300 `, "\n")