gitee.com/quant1x/num@v0.3.2/asm/c2goasm/subroutine_test.go (about)

     1  /*
     2   * Minio Cloud Storage, (C) 2017 Minio, Inc.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package main
    18  
    19  import (
    20  	"strings"
    21  	"testing"
    22  )
    23  
    24  func testName(t *testing.T, fullname, expected string) {
    25  	name := extractName(fullname)
    26  	if name != expected {
    27  		t.Errorf("TestNames(): \nexpected %s\ngot      %s", expected, name)
    28  	}
    29  }
    30  
    31  func TestNames(t *testing.T) {
    32  
    33  	// C++ name mangling
    34  	testName(t, "_ZN4Simd4Avx213Yuv444pToBgraEPKhmS2_mS2_mmmPhmh", "SimdAvx2Yuv444pToBgra")
    35  	testName(t, "_ZN4Simd4Avx213Yuv420pToBgraEPKhmS2_mS2_mmmPhmh", "SimdAvx2Yuv420pToBgra")
    36  	testName(t, "_ZN4Simd4Avx213Yuv422pToBgraEPKhmS2_mS2_mmmPhmh", "SimdAvx2Yuv422pToBgra")
    37  	testName(t, "_ZN4Simd4Avx213ReduceGray2x2EPKhmmmPhmmm", "SimdAvx2ReduceGray2x2")
    38  	testName(t, "_ZN4Simd4Avx216AbsDifferenceSumEPKhmS2_mmmPy", "SimdAvx2AbsDifferenceSum")
    39  
    40  	// Plain C style names
    41  	testName(t, "XORShift128Plus", "XORShift128Plus")
    42  }
    43  
    44  func subroutineEqual(a, b []Subroutine) bool {
    45  
    46  	if a == nil && b == nil {
    47  		return true
    48  	}
    49  
    50  	if a == nil || b == nil {
    51  		return false
    52  	}
    53  
    54  	if len(a) != len(b) {
    55  		return false
    56  	}
    57  
    58  	for i := range a {
    59  		if !(a[i].name == b[i].name && equalString(a[i].body, b[i].body)) {
    60  			return false
    61  		}
    62  	}
    63  
    64  	return true
    65  }
    66  
    67  func testSubroutine(t *testing.T, fullsrc []string, expected []Subroutine) {
    68  	subroutines := segmentSource(fullsrc)
    69  	if !subroutineEqual(subroutines, expected) {
    70  		t.Errorf("testSubroutine(): \nexpected %#v\ngot      %#v", expected, subroutines)
    71  	}
    72  }
    73  
    // TestSubroutine runs segmentSource (through testSubroutine) over several
    // assembly listings and compares the result with hand-written expectations.
    // Every expected Subroutine body is a sub-slice of the same split source,
    // addressed by hard-coded line indices, so adding or removing a line in a
    // fixture shifts the indices that follow it.
    74  func TestSubroutine(t *testing.T) {
    75  
        	// disabledForTesting is a package-level flag declared outside the visible
        	// part of this file. It is switched on for every case except the last one.
        	// NOTE(review): its exact effect on segmentSource is not visible here — confirm.
    76  	disabledForTesting = true
    77  
    78  	src1 := strings.Split(`	.section	__TEXT,__text,regular,pure_instructions
    79  	.macosx_version_min 10, 11
    80  	.intel_syntax noprefix
    81  	.section	__TEXT,__const
    82  	.align	5
    83  LCPI0_0:
    84  	.byte	255                     ## 0xff
    85  	.byte	0                       ## 0x0
    86  LCPI0_1:
    87  	.short	9617                    ## 0x2591
    88  	.short	0                       ## 0x0
    89  LCPI0_2:
    90  	.short	1868                    ## 0x74c
    91  	.short	4899                    ## 0x1323
    92  	.section	__TEXT,__literal4,4byte_literals
    93  	.align	2
    94  LCPI0_3:
    95  	.long	8192                    ## 0x2000
    96  	.section	__TEXT,__text,regular,pure_instructions
    97  	.globl	__ZN4Simd4Avx210BgraToGrayEPKhmmmPhm
    98  	.align	4, 0x90
    99  __ZN4Simd4Avx210BgraToGrayEPKhmmmPhm:   ## @_ZN4Simd4Avx210BgraToGrayEPKhmmmPhm
   100  ## BB#0:
   101  	push    rbp
   102  	mov     rbp, rsp
   103  	mov     rax, rdi
   104  	and     rax, -32
   105  	cmp     rax, rdi
   106  	jne     LBB0_9
   107  ## BB#1:
   108  	mov	r10, r9
   109  	jne	LBB0_9
   110  ## BB#2:
   111  	mov	rax, r8
   112  	jne	LBB0_9
   113  ## BB#3:
   114  	test	rdx, rdx
   115  	je	LBB0_15
   116  ## BB#4:                                ## %.preheader.lr.ph.i.1
   117  	mov	r11, rsi
   118  	.align	4, 0x90
   119  LBB0_5:                                 ## %.preheader.i.5
   120  	je	LBB0_6
   121  	.align	4, 0x90
   122  LBB0_16:                                ## %.lr.ph.i.12
   123                                          ##   Parent Loop BB0_5 Depth=1
   124                                          ## =>  This Inner Loop Header: Depth=2
   125  	vmovdqu	ymm4, ymmword ptr [rdi + 4*rax]
   126  	cmp	rax, r11
   127  	jb	LBB0_16
   128  LBB0_6:                                 ## %._crit_edge.i.6
   129                                          ##   in Loop: Header=BB0_5 Depth=1
   130  	cmp	r11, rsi
   131  	je	LBB0_8
   132  ## BB#7:                                ##   in Loop: Header=BB0_5 Depth=1
   133  	vmovdqu	ymm4, ymmword ptr [rdi + 4*rsi - 128]
   134  	vmovdqu	ymmword ptr [r8 + rsi - 32], ymm4
   135  LBB0_8:                                 ##   in Loop: Header=BB0_5 Depth=1
   136  	add	rdi, rcx
   137  	jne	LBB0_5
   138  	jmp	LBB0_15
   139  LBB0_9:
   140  	test	rdx, rdx
   141  	je	LBB0_15
   142  ## BB#10:                               ## %.preheader.lr.ph.i
   143  	mov	r11, rsi
   144  	vpbroadcastd	ymm3, dword ptr [rip + LCPI0_3]
   145  	.align	4, 0x90
   146  LBB0_11:                                ## %.preheader.i
   147                                          ## =>This Loop Header: Depth=1
   148                                          ##     Child Loop BB0_17 Depth 2
   149  	mov	eax, 0
   150  	test	r11, r11
   151  	je	LBB0_12
   152  	.align	4, 0x90
   153  LBB0_17:                                ## %.lr.ph.i
   154                                          ##   Parent Loop BB0_11 Depth=1
   155                                          ## =>  This Inner Loop Header: Depth=2
   156  	vmovdqu	ymm4, ymmword ptr [rdi + 4*rax]
   157  	vmovdqu	ymm5, ymmword ptr [rdi + 4*rax + 32]
   158  	jb	LBB0_17
   159  LBB0_12:                                ## %._crit_edge.i
   160                                          ##   in Loop: Header=BB0_11 Depth=1
   161  	cmp	r11, rsi
   162  	je	LBB0_14
   163  ## BB#13:                               ##   in Loop: Header=BB0_11 Depth=1
   164  	vmovdqu	ymm4, ymmword ptr [rdi + 4*rsi - 128]
   165  	vmovdqu	ymmword ptr [r8 + rsi - 32], ymm4
   166  LBB0_14:                                ##   in Loop: Header=BB0_11 Depth=1
   167  	add	rdi, rcx
   168  	add	r8, r9
   169  	inc	r10
   170  	cmp	r10, rdx
   171  	jne	LBB0_11
   172  LBB0_15:                                ## %_ZN4Simd4Avx210BgraToGrayILb1EEEvPKhmmmPhm.exit
   173  	pop	rbp
   174  	vzeroupper
   175  	ret
   176  
   177  .subsections_via_symbols
   178  `, "\n")
   179  
        	// Expected: a single subroutine whose body starts at index 25 (the first
        	// instruction after `mov rbp, rsp`) and ends at index 97 (the `ret` line).
   180  	subroutine1 := []Subroutine{}
   181  	subroutine1 = append(subroutine1, Subroutine{name: "SimdAvx2BgraToGray", body: src1[25:98]})
   182  
   183  	testSubroutine(t, src1, subroutine1)
   184  
   185  	src2 := strings.Split(`	.section	__TEXT,__text,regular,pure_instructions
   186  	.macosx_version_min 10, 11
   187  	.intel_syntax noprefix
   188  	.section	__TEXT,__const
   189  	.align	5
   190  LCPI0_0:
   191  	.short	16                      ## 0x10
   192  	.short	13074                   ## 0x3312
   193  	.short	0                       ## 0x0
   194  	.section	__TEXT,__text,regular,pure_instructions
   195  	.globl	__ZN4Simd4Avx213Yuv444pToBgraEPKhmS2_mS2_mmmPhmh
   196  	.align	4, 0x90
   197  __ZN4Simd4Avx213Yuv444pToBgraEPKhmS2_mS2_mmmPhmh: ## @_ZN4Simd4Avx213Yuv444pToBgraEPKhmS2_mS2_mmmPhmh
   198  ## BB#0:
   199  	push    rbp
   200  	mov     rbp, rsp
   201  	push    r15
   202  	push    r14
   203  	push    r13
   204  	push    r12
   205  	push    rbx
   206  	and     rsp, -32
   207  	sub     rsp, 192
   208  	mov     qword ptr [rsp + 56], r9 ## 8-byte Spill
   209  	mov     r9b, byte ptr [rbp + 48]
   210  	mov     r15, qword ptr [rbp + 40]
   211  	mov     r13, qword ptr [rbp + 32]
   212  	mov     r10, qword ptr [rbp + 16]
   213  	mov     rbx, rsi
   214  	and     rbx, -32
   215  	cmp     rbx, rsi
   216  	jne     LBB0_14
   217  ### BB#1:
   218  	mov	rbx, rdi
   219  	cmp	rbx, r13
   220  	jne	LBB0_14
   221  ## BB#8:
   222  	movzx	eax, r9b
   223  	cmp	qword ptr [rbp + 24], 0
   224  	je	LBB0_20
   225  ## BB#9:                                ## %.preheader.lr.ph.i.1
   226  	vinserti128	ymm14, ymm0, xmm0, 1
   227  	vmovdqu	ymmword ptr [r13 + r9 + 96], ymm0
   228  LBB0_13:                                ##   in Loop: Header=BB0_10 Depth=1
   229  	add	rdi, rsi
   230  	jb	LBB0_22
   231  LBB0_17:                                ## %._crit_edge.i
   232  	cmp	rbx, qword ptr [rbp + 16]
   233  	cmp	r11, qword ptr [rbp + 24]
   234  	jne	LBB0_16
   235  LBB0_20:                                ## %_ZN4Simd4Avx213Yuv444pToBgraILb1EEEvPKhmS3_mS3_mmmPhmh.exit
   236  	lea	rsp, [rbp - 40]
   237  	pop	rbx
   238  	pop     r12
   239  	pop     r13
   240  	pop     r14
   241  	pop     r15
   242  	pop	rbp
   243  	vzeroupper
   244  	ret
   245  
   246  	.section	__TEXT,__const
   247  	.align	5
   248  LCPI1_0:
   249  	.byte	0                       ## 0x0
   250  	.space	1
   251  	.space	1
   252  	.space	1
   253  LCPI1_13:
   254  	.space	32
   255  	.section	__TEXT,__text,regular,pure_instructions
   256  	.globl	__ZN4Simd4Avx213Yuv420pToBgraEPKhmS2_mS2_mmmPhmh
   257  	.align	4, 0x90
   258  __ZN4Simd4Avx213Yuv420pToBgraEPKhmS2_mS2_mmmPhmh: ## @_ZN4Simd4Avx213Yuv420pToBgraEPKhmS2_mS2_mmmPhmh
   259  ## BB#0:
   260  	push	rbp
   261  	mov	rbp, rsp
   262  	push	r15
   263  	push    r14
   264  	push    r13
   265  	push    r12
   266  	push    rbx
   267  	and     rsp, -32
   268  	sub     rsp, 864
   269  	mov     qword ptr [rsp + 144], r9 ## 8-byte Spill
   270  	mov     qword ptr [rsp + 152], rcx ## 8-byte Spill
   271  	xor	r12d, r12d
   272  	.align	4, 0x90
   273  LBB1_12:                                ## %.lr.ph.i.18
   274                                          ##   Parent Loop BB1_10 Depth=1
   275  	cmp	r15, r11
   276  	jb	LBB1_12
   277  LBB1_13:                                ## %._crit_edge.i.8
   278                                          ##   in Loop: Header=BB1_10 Depth=1
   279  	vmovdqa	ymm7, ymm10
   280  	vmovdqu	ymmword ptr [rax + rsi + 224], ymm0
   281  LBB1_15:                                ##   in Loop: Header=BB1_10 Depth=1
   282  	add	rdi, qword ptr [rsp + 192] ## 8-byte Folded Reload
   283  	vmovdqa	ymm7, ymmword ptr [rip + LCPI1_7] ## ymm7 = <u,u,u,u,1,1,1,1,u,u,u,u,1,1,1,1>
   284  	.align	4, 0x90
   285  LBB1_18:                                ## %.preheader.i
   286                                          ## =>This Loop Header: Depth=1
   287  	cmp	rsi, rbx
   288  	jb	LBB1_23
   289  LBB1_19:                                ## %._crit_edge.i
   290                                          ##   in Loop: Header=BB1_18 Depth=1
   291  	vmovdqu	ymmword ptr [rax + rsi + 224], ymm0
   292  LBB1_21:                                ##   in Loop: Header=BB1_18 Depth=1
   293  	add	rdi, qword ptr [rsp + 96] ## 8-byte Folded Reload
   294  	jb	LBB1_18
   295  LBB1_22:                                ## %_ZN4Simd4Avx213Yuv420pToBgraILb1EEEvPKhmS3_mS3_mmmPhmh.exit
   296  	lea	rsp, [rbp - 40]
   297  	pop	rbx
   298  	pop     r12
   299  	pop     r13
   300  	pop     r14
   301  	pop     r15
   302  	pop	rbp
   303  	vzeroupper
   304  	ret
   305  
   306  	.section	__TEXT,__const
   307  	.align	5
   308  LCPI2_0:
   309  	.byte	0                       ## 0x0
   310  	.byte	2                       ## 0x2
   311  	.byte	15                      ## 0xf
   312  LCPI2_12:
   313  	.space	1
   314  	.space	1
   315  	.section	__TEXT,__text,regular,pure_instructions
   316  	.globl	__ZN4Simd4Avx213Yuv422pToBgraEPKhmS2_mS2_mmmPhmh
   317  	.align	4, 0x90
   318  __ZN4Simd4Avx213Yuv422pToBgraEPKhmS2_mS2_mmmPhmh: ## @_ZN4Simd4Avx213Yuv422pToBgraEPKhmS2_mS2_mmmPhmh
   319  ## BB#0:
   320  	push	rbp
   321  	mov     rbp, rsp
   322  	push    r15
   323  	push    r14
   324  	push    r13
   325  	push    r12
   326  	push    rbx
   327  	and     rsp, -32
   328  	sub     rsp, 416
   329  	mov     qword ptr [rsp + 184], rcx ## 8-byte Spill
   330  	mov     qword ptr [rsp + 176], rsi ## 8-byte Spill
   331  	mov     cl, byte ptr [rbp + 48]
   332  	mov     r12, qword ptr [rbp + 40]
   333  	mov     rax, qword ptr [rbp + 32]
   334  	mov     r10, qword ptr [rbp + 16]
   335  	jne	LBB2_14
   336  ## BB#1:
   337  	mov	rsi, rdi
   338  	jne	LBB2_14
   339  ## BB#8:
   340  	movzx	ecx, cl
   341  	cmp	qword ptr [rbp + 24], 0
   342  	mov	rcx, r9
   343  	je	LBB2_20
   344  ## BB#9:                                ## %.preheader.lr.ph.i.1
   345  	vinserti128	ymm12, ymm0, xmm0, 1
   346  	.align	4, 0x90
   347  LBB2_10:                                ## %.preheader.i.7
   348  	.align	4, 0x90
   349  LBB2_21:                                ## %.lr.ph.i.16
   350  	jb	LBB2_21
   351  LBB2_11:                                ## %._crit_edge.i.8
   352  	je	LBB2_13
   353  ## BB#12:                               ##   in Loop: Header=BB2_10 Depth=1
   354  	vmovdqa	ymm15, ymm9
   355  	vmovdqu	ymmword ptr [rax + r15 + 224], ymm0
   356  LBB2_13:                                ##   in Loop: Header=BB2_10 Depth=1
   357  	add	rdi, qword ptr [rsp + 176] ## 8-byte Folded Reload
   358  	jmp	LBB2_20
   359  LBB2_14:
   360  	mov	qword ptr [rsp + 168], r9 ## 8-byte Spill
   361  	je	LBB2_20
   362  ## BB#15:                               ## %.preheader.lr.ph.i
   363  	vinserti128	ymm0, ymm0, xmm0, 1
   364  	.align	4, 0x90
   365  LBB2_16:                                ## %.preheader.i
   366                                          ## =>This Loop Header: Depth=1
   367  	je	LBB2_17
   368  	.align	4, 0x90
   369  LBB2_22:                                ## %.lr.ph.i
   370  	cmp	r15, rbx
   371  	jb	LBB2_22
   372  LBB2_17:                                ## %._crit_edge.i
   373                                          ##   in Loop: Header=BB2_16 Depth=1
   374  	cmp	rbx, qword ptr [rbp + 16]
   375  	je	LBB2_19
   376  ## BB#18:                               ##   in Loop: Header=BB2_16 Depth=1
   377  	vpermq	ymm1, ymmword ptr [rdx + rsi], 216 ## ymm1 = mem[0,2,1,3]
   378  	vmovdqu	ymmword ptr [rax + r13 + 224], ymm0
   379  LBB2_19:                                ##   in Loop: Header=BB2_16 Depth=1
   380  	add	rdi, qword ptr [rsp + 176] ## 8-byte Folded Reload
   381  	jne	LBB2_16
   382  LBB2_20:                                ## %_ZN4Simd4Avx213Yuv422pToBgraILb1EEEvPKhmS3_mS3_mmmPhmh.exit
   383  	lea	rsp, [rbp - 40]
   384  	pop	rbx
   385  	pop     r12
   386  	pop     r13
   387  	pop     r14
   388  	pop     r15
   389  	pop	rbp
   390  	vzeroupper
   391  	ret
   392  
   393  .subsections_via_symbols`, "\n")
   394  
        	// Expected: the three functions contained in src2, in source order.
   395  	subroutine2 := []Subroutine{}
   396  	subroutine2 = append(subroutine2, Subroutine{name: "SimdAvx2Yuv444pToBgra", body: src2[23:60]})
   397  	subroutine2 = append(subroutine2, Subroutine{name: "SimdAvx2Yuv420pToBgra", body: src2[84:120]})
   398  	subroutine2 = append(subroutine2, Subroutine{name: "SimdAvx2Yuv422pToBgra", body: src2[144:207]})
   399  
   400  	testSubroutine(t, src2, subroutine2)
   401  
   402  	src3 := strings.Split(`        .globl  __ZN4Simd4Avx214MultiplyAndAddEPfS1_S1_S1_
   403          .align  4, 0x90
   404  __ZN4Simd4Avx214MultiplyAndAddEPfS1_S1_S1_: ## @_ZN4Simd4Avx214MultiplyAndAddEPfS1_S1_S1_
   405  ## BB#0:
   406          push    rbp
   407          mov     rbp, rsp
   408          vmovups ymm0, ymmword ptr [rdi]
   409          vmovups ymm1, ymmword ptr [rsi]
   410          vfmadd213ps     ymm1, ymm0, ymmword ptr [rdx]
   411          vmovups ymmword ptr [rcx], ymm1
   412          pop     rbp
   413          vzeroupper
   414          ret
   415  
   416  .subsections_via_symbols`, "\n")
   417  
        	// src3 starts directly at the .globl directive and is indented with
        	// spaces instead of tabs.
   418  	subroutine3 := []Subroutine{}
   419  	subroutine3 = append(subroutine3, Subroutine{name: "SimdAvx2MultiplyAndAdd", body: src3[6:13]})
   420  
   421  	testSubroutine(t, src3, subroutine3)
   422  
   423  	src4 := strings.Split(`        .section        __TEXT,__text,regular,pure_instructions
   424          .macosx_version_min 10, 11
   425          .intel_syntax noprefix
   426          .globl  __Z22MultiplyAndAddConstantPfS_S_
   427          .align  4, 0x90
   428  __Z22MultiplyAndAddConstantPfS_S_:      ## @_Z22MultiplyAndAddConstantPfS_S_
   429  ## BB#0:
   430          push    rbp
   431          mov     rbp, rsp
   432          vmovups ymm0, ymmword ptr [rdi]
   433          vmovups ymm1, ymmword ptr [rsi]
   434          vfmadd213ps     ymm1, ymm0, ymmword ptr [rip + __ZL1a]
   435          vmovups ymmword ptr [rdx], ymm1
   436          pop     rbp
   437          vzeroupper
   438          ret
   439  
   440          .section        __DATA,__data
   441          .align  5                       ## @_ZL1a
   442  __ZL1a:
   443          .long   1065353216              ## float 1.000000e+00
   444          .long   1073741824              ## float 2.000000e+00
   445          .long   1077936128              ## float 3.000000e+00
   446          .long   1082130432              ## float 4.000000e+00
   447          .long   1084227584              ## float 5.000000e+00
   448          .long   1086324736              ## float 6.000000e+00
   449          .long   1088421888              ## float 7.000000e+00
   450          .long   1090519040              ## float 8.000000e+00
   451  `, "\n")
   452  
        	// In src4 the function is followed by a __DATA section; the expected body
        	// ends at the `ret` line (index 15), before that section.
   453  	subroutine4 := []Subroutine{}
   454  	subroutine4 = append(subroutine4, Subroutine{name: "MultiplyAndAddConstant", body: src4[9:16]})
   455  
   456  	testSubroutine(t, src4, subroutine4)
   457  
        	// The remaining cases use package-level fixtures (srcOsx, srcClang,
        	// srcRetInMiddle, srcLabelHasSpecialComment) instead of local literals.
   458  	subroutine5 := []Subroutine{}
   459  	subroutine5 = append(subroutine5, Subroutine{name: "SimdSse2BgraToYuv420p", body: srcOsx[43:53]})
   460  	subroutine5 = append(subroutine5, Subroutine{name: "SimdSse2BgraToYuv422p", body: srcOsx[94:103]})
   461  	subroutine5 = append(subroutine5, Subroutine{name: "SimdSse2BgraToYuv444p", body: srcOsx[142:151]})
   462  
   463  	testSubroutine(t, srcOsx, subroutine5)
   464  
   465  	subroutine6 := []Subroutine{}
   466  	subroutine6 = append(subroutine6, Subroutine{name: "SimdSse2BgraToYuv420p", body: srcClang[41:51]})
   467  	subroutine6 = append(subroutine6, Subroutine{name: "SimdSse2BgraToYuv422p", body: srcClang[94:103]})
   468  	subroutine6 = append(subroutine6, Subroutine{name: "SimdSse2BgraToYuv444p", body: srcClang[144:153]})
   469  
   470  	testSubroutine(t, srcClang, subroutine6)
   471  
   472  	subroutine7 := []Subroutine{}
   473  	subroutine7 = append(subroutine7, Subroutine{name: "SimdSse2Bgr48pToBgra32", body: srcRetInMiddle[36:291]})
   474  
   475  	testSubroutine(t, srcRetInMiddle, subroutine7)
   476  
        	// The final case runs with disabledForTesting switched off; the flag is
        	// left at false when the test returns.
   477  	disabledForTesting = false
   478  	subroutine8 := []Subroutine{}
   479  	subroutine8 = append(subroutine8, Subroutine{name: "sample_sum_sse4_2", body: srcLabelHasSpecialComment[11:113]})
   480  
   481  	testSubroutine(t, srcLabelHasSpecialComment, subroutine8)
   482  }
   483  
   // srcClang is an assembly listing fixture, split into lines. According to its
   // .ident line it was produced by clang 3.8.0 on Ubuntu; it uses `#` comments,
   // `.L`-prefixed local labels and .type/.size directives. It contains three
   // BgraToYuv functions. TestSubroutine addresses the expected bodies by line
   // index into this slice, so the line layout must not change.
   484  var srcClang = strings.Split(`	.text
   485  	.intel_syntax noprefix
   486  	.section	.rodata.cst16,"aM",@progbits,16
   487  	.align	16
   488  .LCPI0_0:
   489  	.byte	255                     # 0xff
   490  .LCPI0_1:
   491  	.byte	255                     # 0xff
   492  	.byte	0                       # 0x0
   493  .LCPI0_2:
   494  	.quad	281474976776192         # 0x1000000010000
   495  .LCPI0_3:
   496  	.short	1606                    # 0x646
   497  	.short	4211                    # 0x1073
   498  .LCPI0_4:
   499  	.short	8258                    # 0x2042
   500  .LCPI0_5:
   501  	.short	16                      # 0x10
   502  .LCPI0_6:
   503  	.short	2                       # 0x2
   504  .LCPI0_7:
   505  	.short	7193                    # 0x1c19
   506  .LCPI0_8:
   507  	.short	60768                   # 0xed60
   508  .LCPI0_9:
   509  	.short	128                     # 0x80
   510  .LCPI0_10:
   511  	.short	64373                   # 0xfb75
   512  .LCPI0_11:
   513  	.short	59507                   # 0xe873
   514  .LCPI0_12:
   515  	.zero	16
   516  	.text
   517  	.globl	_ZN4Simd4Sse213BgraToYuv420pEPKhmmmPhmS3_mS3_m
   518  	.align	16, 0x90
   519  	.type	_ZN4Simd4Sse213BgraToYuv420pEPKhmmmPhmS3_mS3_m,@function
   520  _ZN4Simd4Sse213BgraToYuv420pEPKhmmmPhmS3_mS3_m: # @_ZN4Simd4Sse213BgraToYuv420pEPKhmmmPhmS3_mS3_m
   521  # BB#0:
   522  	push	rbp
   523  	push	r15
   524  	push	r14
   525  
   526  .LBB0_24:                               # %_ZN4Simd4Sse213BgraToYuv420pILb1EEEvPKhmmmPhmS4_mS4_m.exit
   527  	add	rsp, 136
   528  	pop	rbx
   529  	pop	r12
   530  	pop	r13
   531  	pop	r14
   532  	pop	r15
   533  	pop	rbp
   534  	ret
   535  .Lfunc_end0:
   536  	.size	_ZN4Simd4Sse213BgraToYuv420pEPKhmmmPhmS3_mS3_m, .Lfunc_end0-_ZN4Simd4Sse213BgraToYuv420pEPKhmmmPhmS3_mS3_m
   537  
   538  	.section	.rodata.cst16,"aM",@progbits,16
   539  	.align	16
   540  .LCPI1_0:
   541  	.byte	255                     # 0xff
   542  .LCPI1_1:
   543  	.byte	0                       # 0x0
   544  .LCPI1_2:
   545  	.quad	281474976776192         # 0x1000000010000
   546  .LCPI1_3:
   547  	.short	4211                    # 0x1073
   548  .LCPI1_4:
   549  	.short	8192                    # 0x2000
   550  .LCPI1_5:
   551  	.short	16                      # 0x10
   552  .LCPI1_6:
   553  	.short	1                       # 0x1
   554  .LCPI1_7:
   555  	.short	7193                    # 0x1c19
   556  .LCPI1_8:
   557  	.short	60768                   # 0xed60
   558  .LCPI1_9:
   559  	.short	128                     # 0x80
   560  .LCPI1_10:
   561  	.short	64373                   # 0xfb75
   562  .LCPI1_11:
   563  	.short	59507                   # 0xe873
   564  .LCPI1_12:
   565  	.zero	16
   566  	.text
   567  	.globl	_ZN4Simd4Sse213BgraToYuv422pEPKhmmmPhmS3_mS3_m
   568  	.align	16, 0x90
   569  	.type	_ZN4Simd4Sse213BgraToYuv422pEPKhmmmPhmS3_mS3_m,@function
   570  _ZN4Simd4Sse213BgraToYuv422pEPKhmmmPhmS3_mS3_m: # @_ZN4Simd4Sse213BgraToYuv422pEPKhmmmPhmS3_mS3_m
   571  # BB#0:
   572  	push	rbp
   573  	push	r15
   574  	push	r14
   575  	push	r13
   576  	push	r12
   577  	push	rbx
   578  
   579  .LBB1_20:                               # %_ZN4Simd4Sse213BgraToYuv422pILb1EEEvPKhmmmPhmS4_mS4_m.exit
   580  	pop	rbx
   581  	pop	r12
   582  	pop	r13
   583  	pop	r14
   584  	pop	r15
   585  	pop	rbp
   586  	ret
   587  .Lfunc_end1:
   588  	.size	_ZN4Simd4Sse213BgraToYuv422pEPKhmmmPhmS3_mS3_m, .Lfunc_end1-_ZN4Simd4Sse213BgraToYuv422pEPKhmmmPhmS3_mS3_m
   589  
   590  	.section	.rodata.cst16,"aM",@progbits,16
   591  	.align	16
   592  .LCPI2_0:
   593  	.byte	0                       # 0x0
   594  .LCPI2_1:
   595  	.byte	255                     # 0xff
   596  .LCPI2_2:
   597  	.quad	281474976776192         # 0x1000000010000
   598  .LCPI2_3:
   599  	.short	1606                    # 0x646
   600  .LCPI2_4:
   601  	.short	8258                    # 0x2042
   602  .LCPI2_5:
   603  	.short	16                      # 0x10
   604  .LCPI2_6:
   605  	.short	7193                    # 0x1c19
   606  .LCPI2_7:
   607  	.short	60768                   # 0xed60
   608  .LCPI2_8:
   609  	.short	128                     # 0x80
   610  .LCPI2_9:
   611  	.short	64373                   # 0xfb75
   612  .LCPI2_10:
   613  	.short	59507                   # 0xe873
   614  .LCPI2_11:
   615  	.zero	16
   616  	.text
   617  	.globl	_ZN4Simd4Sse213BgraToYuv444pEPKhmmmPhmS3_mS3_m
   618  	.align	16, 0x90
   619  	.type	_ZN4Simd4Sse213BgraToYuv444pEPKhmmmPhmS3_mS3_m,@function
   620  _ZN4Simd4Sse213BgraToYuv444pEPKhmmmPhmS3_mS3_m: # @_ZN4Simd4Sse213BgraToYuv444pEPKhmmmPhmS3_mS3_m
   621  # BB#0:
   622  	push	rbp
   623  	push	r15
   624  	push	r14
   625  	push	r13
   626  	push	r12
   627  	push	rbx
   628  
   629  .LBB2_20:                               # %_ZN4Simd4Sse213BgraToYuv444pILb1EEEvPKhmmmPhmS4_mS4_m.exit
   630  	pop	rbx
   631  	pop	r12
   632  	pop	r13
   633  	pop	r14
   634  	pop	r15
   635  	pop	rbp
   636  	ret
   637  .Lfunc_end2:
   638  	.size	_ZN4Simd4Sse213BgraToYuv444pEPKhmmmPhmS3_mS3_m, .Lfunc_end2-_ZN4Simd4Sse213BgraToYuv444pEPKhmmmPhmS3_mS3_m
   639  
   640  
   641  	.ident	"clang version 3.8.0-2ubuntu4 (tags/RELEASE_380/final)"
   642  	.section	".note.GNU-stack","",@progbits`, "\n")
   643  
   // srcOsx is an assembly listing fixture in macOS style, split into lines: it
   // carries a .macosx_version_min directive, __TEXT sections, `##` comments and
   // symbols with an extra leading underscore. It contains three BgraToYuv
   // functions. TestSubroutine addresses the expected bodies by line index into
   // this slice, so the line layout must not change.
   644  var srcOsx = strings.Split(`	.section	__TEXT,__text,regular,pure_instructions
   645  	.macosx_version_min 10, 11
   646  	.intel_syntax noprefix
   647  	.section	__TEXT,__literal16,16byte_literals
   648  	.align	4
   649  LCPI0_0:
   650  	.byte	255                     ## 0xff
   651  LCPI0_1:
   652  	.byte	1                       ## 0x1
   653  LCPI0_2:
   654  	.quad	281474976776192         ## 0x1000000010000
   655  LCPI0_3:
   656  	.short	1606                    ## 0x646
   657  LCPI0_4:
   658  	.short	8258                    ## 0x2042
   659  LCPI0_5:
   660  	.short	16                      ## 0x10
   661  LCPI0_6:
   662  	.short	2                       ## 0x2
   663  LCPI0_7:
   664  	.short	7193                    ## 0x1c19
   665  LCPI0_8:
   666  	.short	60768                   ## 0xed60
   667  LCPI0_9:
   668  	.short	128                     ## 0x80
   669  LCPI0_10:
   670  	.short	64373                   ## 0xfb75
   671  LCPI0_11:
   672  	.short	59507                   ## 0xe873
   673  LCPI0_12:
   674  	.space	16
   675  	.section	__TEXT,__text,regular,pure_instructions
   676  	.globl	__ZN4Simd4Sse213BgraToYuv420pEPKhmmmPhmS3_mS3_m
   677  	.align	4, 0x90
   678  __ZN4Simd4Sse213BgraToYuv420pEPKhmmmPhmS3_mS3_m: ## @_ZN4Simd4Sse213BgraToYuv420pEPKhmmmPhmS3_mS3_m
   679  ## BB#0:
   680  	push	rbp
   681  	mov	rbp, rsp
   682  	push	r15
   683  	push	r14
   684  	push	r13
   685  	push	r12
   686  	push	rbx
   687  
   688  LBB0_24:                                ## %_ZN4Simd4Sse213BgraToYuv420pILb1EEEvPKhmmmPhmS4_mS4_m.exit
   689  	add	rsp, 88
   690  	pop	rbx
   691  	pop	r12
   692  	pop	r13
   693  	pop	r14
   694  	pop	r15
   695  	pop	rbp
   696  	ret
   697  
   698  	.section	__TEXT,__literal16,16byte_literals
   699  	.align	4
   700  LCPI1_0:
   701  	.byte	255                     ## 0xff
   702  LCPI1_1:
   703  	.byte	1                       ## 0x1
   704  LCPI1_2:
   705  	.quad	281474976776192         ## 0x1000000010000
   706  LCPI1_3:
   707  	.short	1606                    ## 0x646
   708  LCPI1_4:
   709  	.short	8258                    ## 0x2042
   710  LCPI1_5:
   711  	.short	16                      ## 0x10
   712  LCPI1_6:
   713  	.short	1                       ## 0x1
   714  LCPI1_7:
   715  	.short	7193                    ## 0x1c19
   716  LCPI1_8:
   717  	.short	60768                   ## 0xed60
   718  LCPI1_9:
   719  	.short	128                     ## 0x80
   720  LCPI1_10:
   721  	.short	64373                   ## 0xfb75
   722  LCPI1_11:
   723  	.short	59507                   ## 0xe873
   724  LCPI1_12:
   725  	.space	16
   726  	.section	__TEXT,__text,regular,pure_instructions
   727  	.globl	__ZN4Simd4Sse213BgraToYuv422pEPKhmmmPhmS3_mS3_m
   728  	.align	4, 0x90
   729  __ZN4Simd4Sse213BgraToYuv422pEPKhmmmPhmS3_mS3_m: ## @_ZN4Simd4Sse213BgraToYuv422pEPKhmmmPhmS3_mS3_m
   730  ## BB#0:
   731  	push	rbp
   732  	mov	rbp, rsp
   733  	push	r15
   734  	push	r14
   735  	push	r13
   736  	push	r12
   737  	push	rbx
   738  
   739  LBB1_20:                                ## %_ZN4Simd4Sse213BgraToYuv422pILb1EEEvPKhmmmPhmS4_mS4_m.exit
   740  	pop	rbx
   741  	pop	r12
   742  	pop	r13
   743  	pop	r14
   744  	pop	r15
   745  	pop	rbp
   746  	ret
   747  
   748  	.section	__TEXT,__literal16,16byte_literals
   749  	.align	4
   750  LCPI2_0:
   751  	.byte	255                     ## 0xff
   752  LCPI2_1:
   753  	.byte	1                       ## 0x1
   754  LCPI2_2:
   755  	.quad	281474976776192         ## 0x1000000010000
   756  LCPI2_3:
   757  	.short	1606                    ## 0x646
   758  LCPI2_4:
   759  	.short	8258                    ## 0x2042
   760  LCPI2_5:
   761  	.short	16                      ## 0x10
   762  LCPI2_6:
   763  	.short	7193                    ## 0x1c19
   764  LCPI2_7:
   765  	.short	60768                   ## 0xed60
   766  LCPI2_8:
   767  	.short	128                     ## 0x80
   768  LCPI2_9:
   769  	.short	64373                   ## 0xfb75
   770  LCPI2_10:
   771  	.short	59507                   ## 0xe873
   772  LCPI2_11:
   773  	.space	16
   774  	.section	__TEXT,__text,regular,pure_instructions
   775  	.globl	__ZN4Simd4Sse213BgraToYuv444pEPKhmmmPhmS3_mS3_m
   776  	.align	4, 0x90
   777  __ZN4Simd4Sse213BgraToYuv444pEPKhmmmPhmS3_mS3_m: ## @_ZN4Simd4Sse213BgraToYuv444pEPKhmmmPhmS3_mS3_m
   778  ## BB#0:
   779  	push	rbp
   780  	mov	rbp, rsp
   781  	push	r15
   782  	push	r14
   783  	push	r13
   784  	push	r12
   785  	push	rbx
   786  
   787  LBB2_20:                                ## %_ZN4Simd4Sse213BgraToYuv444pILb1EEEvPKhmmmPhmS4_mS4_m.exit
   788  	pop	rbx
   789  	pop	r12
   790  	pop	r13
   791  	pop	r14
   792  	pop	r15
   793  	pop	rbp
   794  	ret
   795  
   796  
   797  .subsections_via_symbols`, "\n")
   798  
   799  var srcRetInMiddle = strings.Split(`.text
   800  .intel_syntax noprefix
   801  .file	"/home/harsha/repos/Simd/src/Simd/SimdSse2BgrToBgra.cpp"
   802  .section	.rodata.cst16,"aM",@progbits,16
   803  .align	16
   804  .LCPI0_0:
   805  .byte	255                     # 0xff
   806  .byte	0                       # 0x0
   807  .byte	255                     # 0xff
   808  .byte	0                       # 0x0
   809  .byte	255                     # 0xff
   810  .byte	0                       # 0x0
   811  .byte	255                     # 0xff
   812  .byte	0                       # 0x0
   813  .byte	255                     # 0xff
   814  .byte	0                       # 0x0
   815  .byte	255                     # 0xff
   816  .byte	0                       # 0x0
   817  .byte	255                     # 0xff
   818  .byte	0                       # 0x0
   819  .byte	255                     # 0xff
   820  .byte	0                       # 0x0
   821  .text
   822  .globl	_ZN4Simd4Sse214Bgr48pToBgra32EPKhmmmS2_mS2_mPhmh
   823  .align	16, 0x90
   824  .type	_ZN4Simd4Sse214Bgr48pToBgra32EPKhmmmS2_mS2_mPhmh,@function
   825  _ZN4Simd4Sse214Bgr48pToBgra32EPKhmmmS2_mS2_mPhmh: # @_ZN4Simd4Sse214Bgr48pToBgra32EPKhmmmS2_mS2_mPhmh
   826  # BB#0:
   827  push	rbp
   828  mov	rbp, rsp
   829  push	r15
   830  push	r14
   831  push	r13
   832  push	r12
   833  push	rbx
   834  and	rsp, -8
   835  mov	r14b, byte ptr [rbp + 48]
   836  mov	r10, qword ptr [rbp + 40]
   837  mov	rax, qword ptr [rbp + 32]
   838  mov	r11, qword ptr [rbp + 24]
   839  mov	r12, qword ptr [rbp + 16]
   840  mov	rbx, rsi
   841  and	rbx, -16
   842  cmp	rbx, rsi
   843  jne	.LBB0_22
   844  # BB#1:
   845  mov	rbx, rdi
   846  and	rbx, -16
   847  cmp	rbx, rdi
   848  jne	.LBB0_22
   849  # BB#2:
   850  mov	rbx, r9
   851  and	rbx, -16
   852  cmp	rbx, r9
   853  jne	.LBB0_22
   854  # BB#3:
   855  mov	rbx, r8
   856  and	rbx, -16
   857  cmp	rbx, r8
   858  jne	.LBB0_22
   859  # BB#4:
   860  mov	rbx, r11
   861  and	rbx, -16
   862  cmp	rbx, r11
   863  jne	.LBB0_22
   864  # BB#5:
   865  mov	rbx, r12
   866  and	rbx, -16
   867  cmp	rbx, r12
   868  jne	.LBB0_22
   869  # BB#6:
   870  mov	rbx, r10
   871  and	rbx, -16
   872  cmp	rbx, r10
   873  jne	.LBB0_22
   874  # BB#7:
   875  mov	rbx, rax
   876  and	rbx, -16
   877  cmp	rbx, rax
   878  jne	.LBB0_22
   879  # BB#8:
   880  test	rcx, rcx
   881  je	.LBB0_36
   882  # BB#9:                                 # %.preheader.lr.ph.i1
   883  movzx	ebx, r14b
   884  shl	ebx, 8
   885  pxor	xmm0, xmm0
   886  pinsrw	xmm0, ebx, 0
   887  pinsrw	xmm0, ebx, 1
   888  pinsrw	xmm0, ebx, 2
   889  pinsrw	xmm0, ebx, 3
   890  pinsrw	xmm0, ebx, 4
   891  pinsrw	xmm0, ebx, 5
   892  pinsrw	xmm0, ebx, 6
   893  pinsrw	xmm0, ebx, 7
   894  mov	r15, rdx
   895  and	r15, -8
   896  je	.LBB0_19
   897  # BB#10:                                # %.lr.ph.us.i16.preheader
   898  xor	r14d, r14d
   899  cmp	r15, rdx
   900  jne	.LBB0_15
   901  # BB#11:
   902  movdqa	xmm1, xmmword ptr [rip + .LCPI0_0] # xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
   903  .align	16, 0x90
   904  .LBB0_12:                               # %.lr.ph.us.i16.us
   905  # =>This Loop Header: Depth=1
   906  #     Child Loop BB0_13 Depth 2
   907  xor	ebx, ebx
   908  .align	16, 0x90
   909  .LBB0_13:                               #   Parent Loop BB0_12 Depth=1
   910  # =>  This Inner Loop Header: Depth=2
   911  movdqa	xmm2, xmmword ptr [rdi + 2*rbx]
   912  pand	xmm2, xmm1
   913  movdqa	xmm3, xmmword ptr [r8 + 2*rbx]
   914  pand	xmm3, xmm1
   915  movdqa	xmm4, xmmword ptr [r12 + 2*rbx]
   916  pand	xmm4, xmm1
   917  pslldq	xmm3, 1                 # xmm3 = zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
   918  por	xmm3, xmm2
   919  por	xmm4, xmm0
   920  movdqa	xmm2, xmm3
   921  punpcklwd	xmm2, xmm4      # xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
   922  movdqa	xmmword ptr [rax + 4*rbx], xmm2
   923  punpckhwd	xmm3, xmm4      # xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
   924  movdqa	xmmword ptr [rax + 4*rbx + 16], xmm3
   925  add	rbx, 8
   926  cmp	rbx, rdx
   927  jb	.LBB0_13
   928  # BB#14:                                # %._crit_edge.us.i17.us
   929  #   in Loop: Header=BB0_12 Depth=1
   930  add	rdi, rsi
   931  add	r8, r9
   932  add	r12, r11
   933  add	rax, r10
   934  inc	r14
   935  cmp	r14, rcx
   936  jne	.LBB0_12
   937  jmp	.LBB0_36
   938  .LBB0_22:
   939  test	rcx, rcx
   940  je	.LBB0_36
   941  # BB#23:                                # %.preheader.lr.ph.i
   942  movzx	ebx, r14b
   943  shl	ebx, 8
   944  pxor	xmm0, xmm0
   945  pinsrw	xmm0, ebx, 0
   946  pinsrw	xmm0, ebx, 1
   947  pinsrw	xmm0, ebx, 2
   948  pinsrw	xmm0, ebx, 3
   949  pinsrw	xmm0, ebx, 4
   950  pinsrw	xmm0, ebx, 5
   951  pinsrw	xmm0, ebx, 6
   952  pinsrw	xmm0, ebx, 7
   953  mov	r13, rdx
   954  and	r13, -8
   955  je	.LBB0_29
   956  # BB#24:                                # %.lr.ph.us.i.preheader
   957  xor	r14d, r14d
   958  cmp	r13, rdx
   959  jne	.LBB0_32
   960  # BB#25:
   961  movdqa	xmm1, xmmword ptr [rip + .LCPI0_0] # xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
   962  .align	16, 0x90
   963  .LBB0_26:                               # %.lr.ph.us.i.us
   964  # =>This Loop Header: Depth=1
   965  #     Child Loop BB0_27 Depth 2
   966  xor	ebx, ebx
   967  .align	16, 0x90
   968  .LBB0_27:                               #   Parent Loop BB0_26 Depth=1
   969  # =>  This Inner Loop Header: Depth=2
   970  movdqu	xmm2, xmmword ptr [rdi + 2*rbx]
   971  pand	xmm2, xmm1
   972  movdqu	xmm3, xmmword ptr [r8 + 2*rbx]
   973  pand	xmm3, xmm1
   974  movdqu	xmm4, xmmword ptr [r12 + 2*rbx]
   975  pand	xmm4, xmm1
   976  pslldq	xmm3, 1                 # xmm3 = zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
   977  por	xmm3, xmm2
   978  por	xmm4, xmm0
   979  movdqa	xmm2, xmm3
   980  punpcklwd	xmm2, xmm4      # xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
   981  movdqu	xmmword ptr [rax + 4*rbx], xmm2
   982  punpckhwd	xmm3, xmm4      # xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
   983  movdqu	xmmword ptr [rax + 4*rbx + 16], xmm3
   984  add	rbx, 8
   985  cmp	rbx, rdx
   986  jb	.LBB0_27
   987  # BB#28:                                # %._crit_edge.us.i.us
   988  #   in Loop: Header=BB0_26 Depth=1
   989  add	rdi, rsi
   990  add	r8, r9
   991  add	r12, r11
   992  add	rax, r10
   993  inc	r14
   994  cmp	r14, rcx
   995  jne	.LBB0_26
   996  jmp	.LBB0_36
   997  .LBB0_29:                               # %.preheader.i.preheader
   998  cmp	r13, rdx
   999  je	.LBB0_36
  1000  # BB#30:                                # %.preheader.i.preheader65
  1001  lea	r14, [rdi + 2*rdx - 16]
  1002  lea	rbx, [r8 + 2*rdx - 16]
  1003  lea	rdi, [r12 + 2*rdx - 16]
  1004  lea	rax, [rax + 4*rdx - 16]
  1005  movdqa	xmm1, xmmword ptr [rip + .LCPI0_0] # xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
  1006  .align	16, 0x90
  1007  .LBB0_31:                               # %.preheader.i
  1008  # =>This Inner Loop Header: Depth=1
  1009  movdqu	xmm2, xmmword ptr [r14]
  1010  pand	xmm2, xmm1
  1011  movdqu	xmm3, xmmword ptr [rbx]
  1012  pand	xmm3, xmm1
  1013  movdqu	xmm4, xmmword ptr [rdi]
  1014  pand	xmm4, xmm1
  1015  pslldq	xmm3, 1                 # xmm3 = zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
  1016  por	xmm3, xmm2
  1017  por	xmm4, xmm0
  1018  movdqa	xmm2, xmm3
  1019  punpcklwd	xmm2, xmm4      # xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
  1020  movdqu	xmmword ptr [rax - 16], xmm2
  1021  punpckhwd	xmm3, xmm4      # xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
  1022  movdqu	xmmword ptr [rax], xmm3
  1023  add	r14, rsi
  1024  add	rbx, r9
  1025  add	rdi, r11
  1026  add	rax, r10
  1027  dec	rcx
  1028  jne	.LBB0_31
  1029  jmp	.LBB0_36
  1030  .LBB0_32:
  1031  lea	r15, [4*rdx - 32]
  1032  movdqa	xmm1, xmmword ptr [rip + .LCPI0_0] # xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
  1033  .align	16, 0x90
  1034  .LBB0_33:                               # %.lr.ph.us.i
  1035  # =>This Loop Header: Depth=1
  1036  #     Child Loop BB0_34 Depth 2
  1037  xor	ebx, ebx
  1038  .align	16, 0x90
  1039  .LBB0_34:                               #   Parent Loop BB0_33 Depth=1
  1040  # =>  This Inner Loop Header: Depth=2
  1041  movdqu	xmm2, xmmword ptr [rdi + 2*rbx]
  1042  pand	xmm2, xmm1
  1043  movdqu	xmm3, xmmword ptr [r8 + 2*rbx]
  1044  pand	xmm3, xmm1
  1045  movdqu	xmm4, xmmword ptr [r12 + 2*rbx]
  1046  pand	xmm4, xmm1
  1047  pslldq	xmm3, 1                 # xmm3 = zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
  1048  por	xmm3, xmm2
  1049  por	xmm4, xmm0
  1050  movdqa	xmm2, xmm3
  1051  punpcklwd	xmm2, xmm4      # xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
  1052  movdqu	xmmword ptr [rax + 4*rbx], xmm2
  1053  punpckhwd	xmm3, xmm4      # xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
  1054  movdqu	xmmword ptr [rax + 4*rbx + 16], xmm3
  1055  add	rbx, 8
  1056  cmp	rbx, r13
  1057  jb	.LBB0_34
  1058  # BB#35:                                # %._crit_edge.us.i
  1059  #   in Loop: Header=BB0_33 Depth=1
  1060  movdqu	xmm2, xmmword ptr [rdi + 2*rdx - 16]
  1061  pand	xmm2, xmm1
  1062  movdqu	xmm3, xmmword ptr [r8 + 2*rdx - 16]
  1063  pand	xmm3, xmm1
  1064  movdqu	xmm4, xmmword ptr [r12 + 2*rdx - 16]
  1065  pand	xmm4, xmm1
  1066  pslldq	xmm3, 1                 # xmm3 = zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
  1067  por	xmm3, xmm2
  1068  por	xmm4, xmm0
  1069  movdqa	xmm2, xmm3
  1070  punpcklwd	xmm2, xmm4      # xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
  1071  movdqu	xmmword ptr [rax + r15], xmm2
  1072  punpckhwd	xmm3, xmm4      # xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
  1073  movdqu	xmmword ptr [rax + r15 + 16], xmm3
  1074  add	rdi, rsi
  1075  add	r8, r9
  1076  add	r12, r11
  1077  add	rax, r10
  1078  inc	r14
  1079  cmp	r14, rcx
  1080  jne	.LBB0_33
  1081  .LBB0_36:                               # %_ZN4Simd4Sse214Bgr48pToBgra32ILb1EEEvPKhmmmS3_mS3_mPhmh.exit
  1082  lea	rsp, [rbp - 40]
  1083  pop	rbx
  1084  pop	r12
  1085  pop	r13
  1086  pop	r14
  1087  pop	r15
  1088  pop	rbp
  1089  ret
  1090  .LBB0_19:                               # %.preheader.lr.ph.split.i
  1091  cmp	r15, rdx
  1092  je	.LBB0_36
  1093  # BB#20:                                # %.preheader.i26.preheader
  1094  lea	r14, [rdi + 2*rdx - 16]
  1095  lea	rbx, [r8 + 2*rdx - 16]
  1096  lea	rdi, [r12 + 2*rdx - 16]
  1097  lea	rax, [rax + 4*rdx - 16]
  1098  movdqa	xmm1, xmmword ptr [rip + .LCPI0_0] # xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
  1099  .align	16, 0x90
  1100  .LBB0_21:                               # %.preheader.i26
  1101  # =>This Inner Loop Header: Depth=1
  1102  movdqu	xmm2, xmmword ptr [r14]
  1103  pand	xmm2, xmm1
  1104  movdqu	xmm3, xmmword ptr [rbx]
  1105  pand	xmm3, xmm1
  1106  movdqu	xmm4, xmmword ptr [rdi]
  1107  pand	xmm4, xmm1
  1108  pslldq	xmm3, 1                 # xmm3 = zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
  1109  por	xmm3, xmm2
  1110  por	xmm4, xmm0
  1111  movdqa	xmm2, xmm3
  1112  punpcklwd	xmm2, xmm4      # xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
  1113  movdqu	xmmword ptr [rax - 16], xmm2
  1114  punpckhwd	xmm3, xmm4      # xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
  1115  movdqu	xmmword ptr [rax], xmm3
  1116  add	r14, rsi
  1117  add	rbx, r9
  1118  add	rdi, r11
  1119  add	rax, r10
  1120  dec	rcx
  1121  jne	.LBB0_21
  1122  jmp	.LBB0_36
  1123  .LBB0_15:
  1124  lea	r13, [4*rdx - 32]
  1125  movdqa	xmm1, xmmword ptr [rip + .LCPI0_0] # xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
  1126  .align	16, 0x90
  1127  .LBB0_16:                               # %.lr.ph.us.i16
  1128  # =>This Loop Header: Depth=1
  1129  #     Child Loop BB0_17 Depth 2
  1130  xor	ebx, ebx
  1131  .align	16, 0x90
  1132  .LBB0_17:                               #   Parent Loop BB0_16 Depth=1
  1133  # =>  This Inner Loop Header: Depth=2
  1134  movdqa	xmm2, xmmword ptr [rdi + 2*rbx]
  1135  pand	xmm2, xmm1
  1136  movdqa	xmm3, xmmword ptr [r8 + 2*rbx]
  1137  pand	xmm3, xmm1
  1138  movdqa	xmm4, xmmword ptr [r12 + 2*rbx]
  1139  pand	xmm4, xmm1
  1140  pslldq	xmm3, 1                 # xmm3 = zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
  1141  por	xmm3, xmm2
  1142  por	xmm4, xmm0
  1143  movdqa	xmm2, xmm3
  1144  punpcklwd	xmm2, xmm4      # xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
  1145  movdqa	xmmword ptr [rax + 4*rbx], xmm2
  1146  punpckhwd	xmm3, xmm4      # xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
  1147  movdqa	xmmword ptr [rax + 4*rbx + 16], xmm3
  1148  add	rbx, 8
  1149  cmp	rbx, r15
  1150  jb	.LBB0_17
  1151  # BB#18:                                # %._crit_edge.us.i17
  1152  #   in Loop: Header=BB0_16 Depth=1
  1153  movdqu	xmm2, xmmword ptr [rdi + 2*rdx - 16]
  1154  pand	xmm2, xmm1
  1155  movdqu	xmm3, xmmword ptr [r8 + 2*rdx - 16]
  1156  pand	xmm3, xmm1
  1157  movdqu	xmm4, xmmword ptr [r12 + 2*rdx - 16]
  1158  pand	xmm4, xmm1
  1159  pslldq	xmm3, 1                 # xmm3 = zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
  1160  por	xmm3, xmm2
  1161  por	xmm4, xmm0
  1162  movdqa	xmm2, xmm3
  1163  punpcklwd	xmm2, xmm4      # xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
  1164  movdqu	xmmword ptr [rax + r13], xmm2
  1165  punpckhwd	xmm3, xmm4      # xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
  1166  movdqu	xmmword ptr [rax + r13 + 16], xmm3
  1167  add	rdi, rsi
  1168  add	r8, r9
  1169  add	r12, r11
  1170  add	rax, r10
  1171  inc	r14
  1172  cmp	r14, rcx
  1173  jne	.LBB0_16
  1174  jmp	.LBB0_36
  1175  .Lfunc_end0:
  1176  .size	_ZN4Simd4Sse214Bgr48pToBgra32EPKhmmmS2_mS2_mPhmh, .Lfunc_end0-_ZN4Simd4Sse214Bgr48pToBgra32EPKhmmmS2_mS2_mPhmh
  1177  
  1178  
  1179  .ident	"clang version 3.8.0-2ubuntu4 (tags/RELEASE_380/final)"
  1180  .section	".note.GNU-stack","",@progbits`, "\n")
  1181  
  // srcLabelHasSpecialComment is a test fixture: an Intel-syntax
  // (".intel_syntax noprefix") assembly listing for the function
  // sample_sum_sse4_2 from "sample.c", split on "\n" into one slice element
  // per line.
  //
  // Several labels and directives in the listing carry trailing '#' comments,
  // e.g. "# =>This Inner Loop Header: Depth=1" after a label and
  // "# -- Begin function ..." / "# -- End function" markers. The listing ends
  // with the .section/.p2align lines that announce the next function
  // (sample_max_sse4_2), whose body is not included.
  //
  // The raw string ends with a newline, so the last element of the slice is
  // the empty string. Everything between the backticks is data, including
  // tabs and column alignment; do not reformat it.
  //
  // NOTE(review): presumably consumed by the subroutine-segmentation tests;
  // the consumer is outside this excerpt -- confirm.
  1182  var srcLabelHasSpecialComment = strings.Split(`	.text
  1183  	.intel_syntax noprefix
  1184  	.file	"sample.c"
  1185  	.globl	sample_sum_sse4_2       # -- Begin function sample_sum_sse4_2
  1186  	.p2align	4, 0x90
  1187  	.type	sample_sum_sse4_2,@function
  1188  sample_sum_sse4_2:                      # @sample_sum_sse4_2
  1189  # %bb.0:
  1190  	push	rbp
  1191  	mov	rbp, rsp
  1192  	and	rsp, -8
  1193  	test	rsi, rsi
  1194  	jle	.LBB0_1
  1195  # %bb.2:
  1196  	lea	rcx, [rdi + 8*rsi]
  1197  	lea	rax, [rdi + 8]
  1198  	cmp	rcx, rax
  1199  	cmova	rax, rcx
  1200  	mov	r9, rdi
  1201  	not	r9
  1202  	add	r9, rax
  1203  	shr	r9, 3
  1204  	add	r9, 1
  1205  	cmp	r9, 4
  1206  	jae	.LBB0_4
  1207  # %bb.3:
  1208  	xor	eax, eax
  1209  	jmp	.LBB0_13
  1210  .LBB0_1:
  1211  	xor	eax, eax
  1212  .LBB0_14:
  1213  	mov	rsp, rbp
  1214  	pop	rbp
  1215  	ret
  1216  .LBB0_4:
  1217  	mov	r8, r9
  1218  	and	r8, -4
  1219  	lea	rsi, [r8 - 4]
  1220  	mov	rdx, rsi
  1221  	shr	rdx, 2
  1222  	add	rdx, 1
  1223  	mov	eax, edx
  1224  	and	eax, 3
  1225  	cmp	rsi, 12
  1226  	jae	.LBB0_6
  1227  # %bb.5:
  1228  	pxor	xmm0, xmm0
  1229  	xor	esi, esi
  1230  	pxor	xmm1, xmm1
  1231  	test	rax, rax
  1232  	jne	.LBB0_9
  1233  	jmp	.LBB0_11
  1234  .LBB0_6:
  1235  	mov	esi, 1
  1236  	sub	rsi, rdx
  1237  	lea	rdx, [rax + rsi]
  1238  	add	rdx, -1
  1239  	pxor	xmm0, xmm0
  1240  	xor	esi, esi
  1241  	pxor	xmm1, xmm1
  1242  	.p2align	4, 0x90
  1243  .LBB0_7:                                # =>This Inner Loop Header: Depth=1
  1244  	movdqu	xmm2, xmmword ptr [rdi + 8*rsi]
  1245  	paddq	xmm2, xmm0
  1246  	movdqu	xmm0, xmmword ptr [rdi + 8*rsi + 16]
  1247  	paddq	xmm0, xmm1
  1248  	movdqu	xmm1, xmmword ptr [rdi + 8*rsi + 32]
  1249  	movdqu	xmm3, xmmword ptr [rdi + 8*rsi + 48]
  1250  	movdqu	xmm4, xmmword ptr [rdi + 8*rsi + 64]
  1251  	paddq	xmm4, xmm1
  1252  	paddq	xmm4, xmm2
  1253  	movdqu	xmm2, xmmword ptr [rdi + 8*rsi + 80]
  1254  	paddq	xmm2, xmm3
  1255  	paddq	xmm2, xmm0
  1256  	movdqu	xmm0, xmmword ptr [rdi + 8*rsi + 96]
  1257  	paddq	xmm0, xmm4
  1258  	movdqu	xmm1, xmmword ptr [rdi + 8*rsi + 112]
  1259  	paddq	xmm1, xmm2
  1260  	add	rsi, 16
  1261  	add	rdx, 4
  1262  	jne	.LBB0_7
  1263  # %bb.8:
  1264  	test	rax, rax
  1265  	je	.LBB0_11
  1266  .LBB0_9:
  1267  	lea	rdx, [rdi + 8*rsi]
  1268  	add	rdx, 16
  1269  	neg	rax
  1270  	.p2align	4, 0x90
  1271  .LBB0_10:                               # =>This Inner Loop Header: Depth=1
  1272  	movdqu	xmm2, xmmword ptr [rdx - 16]
  1273  	paddq	xmm0, xmm2
  1274  	movdqu	xmm2, xmmword ptr [rdx]
  1275  	paddq	xmm1, xmm2
  1276  	add	rdx, 32
  1277  	add	rax, 1
  1278  	jne	.LBB0_10
  1279  .LBB0_11:
  1280  	paddq	xmm0, xmm1
  1281  	pshufd	xmm1, xmm0, 78          # xmm1 = xmm0[2,3,0,1]
  1282  	paddq	xmm1, xmm0
  1283  	movq	rax, xmm1
  1284  	cmp	r9, r8
  1285  	je	.LBB0_14
  1286  # %bb.12:
  1287  	lea	rdi, [rdi + 8*r8]
  1288  	.p2align	4, 0x90
  1289  .LBB0_13:                               # =>This Inner Loop Header: Depth=1
  1290  	add	rax, qword ptr [rdi]
  1291  	add	rdi, 8
  1292  	cmp	rdi, rcx
  1293  	jb	.LBB0_13
  1294  	jmp	.LBB0_14
  1295  .Lfunc_end0:
  1296  	.size	sample_sum_sse4_2, .Lfunc_end0-sample_sum_sse4_2
  1297                                          # -- End function
  1298  	.section	.rodata.cst16,"aM",@progbits,16
  1299  	.p2align	4               # -- Begin function sample_max_sse4_2
  1300  `, "\n")