github.com/emmansun/gmsm@v0.29.1/internal/subtle/xor_amd64.s (about)

     1  // Copyright 2018 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  //
     5  //go:build !purego
     6  
     7  #include "textflag.h"
     8  
// func xorBytes(dst, a, b *byte, n int)
//
// xorBytes stores dst[i] = a[i] XOR b[i] for i in [0, n).
//
// Strategy: any "tail" that makes n a non-multiple of the vector width is
// XORed BACKWARDS from the end (first single bytes, then one qword),
// shrinking n (DX) until it is a multiple of 16 (SSE2 path) or of 16/32
// (AVX2 path); the remaining bulk is then XORed FORWARDS in full 16-byte
// (SSE2) or 32-byte (AVX2) chunks.
//
// Register roles:
//   BX = dst pointer
//   SI = a pointer
//   CX = b pointer
//   DX = remaining length; decremented as tail bytes are folded in
//   AX = forward position inside the bulk loops / scratch for loaded bytes
//   DI = scratch for loaded bytes/qwords
//
// NOTE(review): assumes n > 0 — with n == 0 the fall-through into the
// `aligned`/`loop16b` path would still read and write one 16-byte chunk out
// of bounds before the loop exit check. Confirm all Go callers guard n == 0
// (upstream Go's XORBytes does).
TEXT ·xorBytes(SB), NOSPLIT, $0
	MOVQ  dst+0(FP), BX
	MOVQ  a+8(FP), SI
	MOVQ  b+16(FP), CX
	MOVQ  n+24(FP), DX
	CMPQ  DX, $32         // AVX2 path is only entered for n >= 32.
	JL non_avx2
	CMPB ·useAVX2(SB), $1 // Go-side CPU-feature flag; presumably set at init — defined elsewhere.
	JE   avx2

non_avx2:
	TESTQ $15, DX            // n % 16 != 0 → strip the tail first.
	JNZ   not_aligned

aligned:                   // invariant: DX is a multiple of 16 (0 only if n == 0; see NOTE above)
	MOVQ $0, AX // position in slices

loop16b:
	MOVOU (SI)(AX*1), X0   // bulk: XOR 16 bytes forwards per iteration.
	MOVOU (CX)(AX*1), X1
	PXOR  X1, X0
	MOVOU X0, (BX)(AX*1)
	ADDQ  $16, AX
	CMPQ  DX, AX
	JNE   loop16b
	RET

loop_1b:
	SUBQ  $1, DX           // XOR 1 byte backwards from the end.
	MOVB  (SI)(DX*1), DI
	MOVB  (CX)(DX*1), AX
	XORB  AX, DI
	MOVB  DI, (BX)(DX*1)
	TESTQ $7, DX           // repeat until DX is a multiple of 8.
	JNZ   loop_1b
	CMPQ  DX, $0           // whole input consumed by the byte loop → done.
	JE    ret
	TESTQ $15, DX          // DX already a multiple of 16 → bulk loop.
	JZ    aligned

not_aligned:               // DX is a nonzero multiple of 8 here when falling through from above
	TESTQ $7, DX           // DX % 8 != 0 → strip single bytes first.
	JNE   loop_1b
	SUBQ  $8, DX           // XOR 8 bytes backwards from the end.
	MOVQ  (SI)(DX*1), DI
	MOVQ  (CX)(DX*1), AX
	XORQ  AX, DI
	MOVQ  DI, (BX)(DX*1)
	CMPQ  DX, $16          // DX is now a multiple of 16; >= 16 → bulk loop.
	JGE   aligned

ret:
	RET

avx2:                      // reached only with DX >= 32
	TESTQ $31, DX          // n % 32 != 0 → strip the tail first.
	JNZ   avx2_not_aligned

avx2_aligned:              // invariant: DX is a multiple of 16 and >= 32
	TESTQ $16, DX          // odd multiple of 16 → peel one 16-byte chunk.
	JE loop32b_start
	SUBQ  $16, DX          // XOR 16 bytes backwards from the end.
	VMOVDQU (SI)(DX*1), X0
	VPXOR  (CX)(DX*1), X0, X0
	VMOVDQU X0, (BX)(DX*1)

loop32b_start:             // invariant: DX is a multiple of 32 and >= 32
	MOVQ $0, AX            // position in slices

loop32b:
	VMOVDQU (SI)(AX*1), Y0   // bulk: XOR 32 bytes forwards per iteration.
	VPXOR (CX)(AX*1), Y0, Y0
	VMOVDQU Y0, (BX)(AX*1)
	ADDQ  $32, AX
	CMPQ  DX, AX
	JNE   loop32b

avx2_ret:
	VZEROUPPER             // clear upper YMM state before returning to SSE/Go code.
	RET

avx2_loop_1b:
	SUBQ  $1, DX           // XOR 1 byte backwards from the end.
	MOVB  (SI)(DX*1), DI
	MOVB  (CX)(DX*1), AX
	XORB  AX, DI
	MOVB  DI, (BX)(DX*1)
	TESTQ $7, DX           // repeat until DX is a multiple of 8.
	JNZ   avx2_loop_1b
	TESTQ $15, DX          // DX already a multiple of 16 → vector tail/bulk.
	JZ    avx2_aligned

avx2_not_aligned:
	TESTQ $7, DX           // DX % 8 != 0 → strip single bytes first.
	JNE   avx2_loop_1b
	TESTQ $8, DX           // no dangling 8-byte chunk → vector tail/bulk.
	JE   avx2_aligned
	SUBQ  $8, DX           // XOR 8 bytes backwards from the end.
	MOVQ  (SI)(DX*1), DI
	MOVQ  (CX)(DX*1), AX
	XORQ  AX, DI
	MOVQ  DI, (BX)(DX*1)
	JMP  avx2_aligned