github.com/zebozhuang/go@v0.0.0-20200207033046-f8a98f6f5c5d/src/crypto/rc4/rc4_amd64.s

github.com/zebozhuang/go@v0.0.0-20200207033046-f8a98f6f5c5d/src/crypto/rc4/rc4_amd64.s (about)

     1  // Original source:
     2  //	http://www.zorinaq.com/papers/rc4-amd64.html
     3  //	http://www.zorinaq.com/papers/rc4-amd64.tar.bz2
     4  
     5  #include "textflag.h"
     6  
     7  // Local modifications:
     8  //
     9  // Transliterated from GNU to 6a assembly syntax by the Go authors.
    10  // The comments and spacing are from the original.
    11  //
    12  // The new EXTEND macros avoid a bad stall on some systems after 8-bit math.
    13  //
    14  // The original code accumulated 64 bits of key stream in an integer
    15  // register and then XOR'ed the key stream into the data 8 bytes at a time.
    16  // Modified to accumulate 128 bits of key stream into an XMM register
    17  // and then XOR the key stream into the data 16 bytes at a time.
    18  // Approximately doubles throughput.
    19  
    20  // EXTEND(r) zero-extends the low byte of r into the 32-bit register r.
    21  // NOTE: Changing EXTEND to a no-op makes the code run 1.2x faster on Core i5 but makes the code run 2.0x slower on Xeon.
    22  #define EXTEND(r) MOVBLZX r, r
    23  
    24  /*
    25  ** RC4 implementation optimized for AMD64.
    26  **
    27  ** Author: Marc Bevand <bevand_m (at) epita.fr>
    28  ** Licence: I hereby disclaim the copyright on this code and place it
    29  ** in the public domain.
    30  **
    31  ** The code has been designed to be easily integrated into openssl:
    32  ** the exported RC4() function can replace the actual implementations
    33  ** openssl already contains. Please note that when linking with openssl,
    34  ** it requires that sizeof(RC4_INT) == 8. So openssl must be compiled
    35  ** with -DRC4_INT='unsigned long'.
    36  **
    37  ** The throughput achieved by this code is about 320 MBytes/sec, on
    38  ** a 1.8 GHz AMD Opteron (rev C0) processor.
    39  */
    40  
    41  // xorKeyStream XORs n bytes of RC4 key stream into src and stores the result in dst.
    42  //
    43  // Arguments (offsets from FP):
    44  //	dst+0     output pointer
    45  //	src+8     input pointer
    46  //	n+16      number of bytes to process
    47  //	state+24  key state: uint32 words, only the bottom byte of each is used
    48  //	i+32      pointer to the x index byte (read on entry, written back on exit)
    49  //	j+40      pointer to the y index byte (read on entry, written back on exit)
    50  //
    51  // NOTE(review): presumably declared in Go as
    52  //	func xorKeyStream(dst, src *byte, n int, state *[256]uint32, i, j *uint8)
    53  // -- confirm against the Go stub.
    54  //
    55  // Register roles:
    56  //	SI = in, DI = out, R9 = limit (in+len; in+len-16 while in the 16-byte loop)
    57  //	R10 = state base, CX = x, DX = y, R12 = &state[x] for the current block
    58  //	AX, BX, R8 = scratch; X0, X1 = key stream accumulators; X2 = data block
    59  //
    60  // The state pointer is kept in R10 rather than BP. With a $0 frame the
    61  // assembler does not save and restore BP, so overwriting it would leave a
    62  // bogus frame pointer for frame-pointer-based unwinding (e.g. profilers).
    63  TEXT ·xorKeyStream(SB),NOSPLIT,$0
    64  	MOVQ	n+16(FP),	BX		// rbx = ARG(len)
    65  	MOVQ	src+8(FP),	SI		// in = ARG(in)
    66  	MOVQ	dst+0(FP),	DI		// out = ARG(out)
    67  	MOVQ	state+24(FP),	R10		// d = ARG(data); R10, not BP (see above)
    68  	MOVQ	i+32(FP),	AX
    69  	MOVBQZX	0(AX),		CX		// x = *xp
    70  	MOVQ	j+40(FP),	AX
    71  	MOVBQZX	0(AX),		DX		// y = *yp
    72  
    73  	LEAQ	(SI)(BX*1),	R9		// limit = in+len
    74  
    75  	// Phase 1: one byte at a time until the next x is a multiple of 16.
    76  l1:	CMPQ	SI,		R9		// cmp in with in+len
    77  	JGE	finished			// jump if (in >= in+len)
    78  
    79  	INCB	CX
    80  	EXTEND(CX)
    81  	TESTL	$15,		CX
    82  	JZ	wordloop			// x%16 == 0: try 16-byte rounds (x not consumed yet)
    83  
    84  	MOVBLZX	(R10)(CX*4),	AX		// tx = d[x]
    85  
    86  	ADDB	AX,		DX		// y += tx
    87  	EXTEND(DX)
    88  	MOVBLZX	(R10)(DX*4),	BX		// ty = d[y]
    89  	MOVB	BX,		(R10)(CX*4)	// d[x] = ty
    90  	ADDB	AX,		BX		// val = ty+tx
    91  	EXTEND(BX)
    92  	MOVB	AX,		(R10)(DX*4)	// d[y] = tx
    93  	MOVBLZX	(R10)(BX*4),	R8		// val = d[val]
    94  	XORB	(SI),		R8		// xor 1 byte
    95  	MOVB	R8,		(DI)
    96  	INCQ	SI				// in++
    97  	INCQ	DI				// out++
    98  	JMP l1
    99  
   100  wordloop:
   101  	SUBQ	$16,		R9		// limit = in+len-16
   102  	CMPQ	SI,		R9
   103  	JGT	end				// fewer than 16 bytes left
   104  
   105  start:
   106  	ADDQ	$16,		SI		// increment in
   107  	ADDQ	$16,		DI		// increment out
   108  
   109  	// Each KEYROUND generates one byte of key and
   110  	// inserts it into an XMM register at the given 16-bit index.
   111  	// The key state array is uint32 words only using the bottom
   112  	// byte of each word, so the 16-bit OR only copies 8 useful bits.
   113  	// We accumulate alternating bytes into X0 and X1, and then at
   114  	// the end we OR X1<<8 into X0 to produce the actual key.
   115  	//
   116  	// At the beginning of the loop, CX%16 == 0, so the 16 loads
   117  	// at state[CX], state[CX+1], ..., state[CX+15] can precompute
   118  	// (state+CX) as R12 and then become R12[0], R12[1], ... R12[15],
   119  	// without fear of the byte computation CX+15 wrapping around.
   120  	//
   121  	// The first round needs R12[0], the second needs R12[1], and so on.
   122  	// We can avoid memory stalls by starting the load for round n+1
   123  	// before the end of round n, using the LOAD macro.
   124  	LEAQ	(R10)(CX*4),	R12
   125  
   126  	// KEYROUND: r1 holds tx = state[x+off] and DX (y) already includes it.
   127  	// Swaps state[x+off] with state[y], starts the next round's load into r2
   128  	// (via load), and inserts state[(tx+ty)&0xff] into word `index` of xmm.
   129  #define KEYROUND(xmm, load, off, r1, r2, index) \
   130  	MOVBLZX	(R10)(DX*4),	R8; \
   131  	MOVB	r1,		(R10)(DX*4); \
   132  	load((off+1), r2); \
   133  	MOVB	R8,		(off*4)(R12); \
   134  	ADDB	r1,		R8; \
   135  	EXTEND(R8); \
   136  	PINSRW	$index, (R10)(R8*4), xmm
   137  
   138  	// LOAD: reg = state[x+off]; y += reg.
   139  #define LOAD(off, reg) \
   140  	MOVBLZX	(off*4)(R12),	reg; \
   141  	ADDB	reg,		DX; \
   142  	EXTEND(DX)
   143  
   144  	// SKIP: used for the last round, which has no following load.
   145  #define SKIP(off, reg)
   146  
   147  	LOAD(0, AX)
   148  	KEYROUND(X0, LOAD, 0, AX, BX, 0)
   149  	KEYROUND(X1, LOAD, 1, BX, AX, 0)
   150  	KEYROUND(X0, LOAD, 2, AX, BX, 1)
   151  	KEYROUND(X1, LOAD, 3, BX, AX, 1)
   152  	KEYROUND(X0, LOAD, 4, AX, BX, 2)
   153  	KEYROUND(X1, LOAD, 5, BX, AX, 2)
   154  	KEYROUND(X0, LOAD, 6, AX, BX, 3)
   155  	KEYROUND(X1, LOAD, 7, BX, AX, 3)
   156  	KEYROUND(X0, LOAD, 8, AX, BX, 4)
   157  	KEYROUND(X1, LOAD, 9, BX, AX, 4)
   158  	KEYROUND(X0, LOAD, 10, AX, BX, 5)
   159  	KEYROUND(X1, LOAD, 11, BX, AX, 5)
   160  	KEYROUND(X0, LOAD, 12, AX, BX, 6)
   161  	KEYROUND(X1, LOAD, 13, BX, AX, 6)
   162  	KEYROUND(X0, LOAD, 14, AX, BX, 7)
   163  	KEYROUND(X1, SKIP, 15, BX, AX, 7)
   164  
   165  	ADDB	$16,		CX		// x now names the next, not yet consumed, index
   166  
   167  	PSLLQ	$8,		X1		// move X1's key bytes into the odd byte positions
   168  	PXOR	X1,		X0		// X0 = 16 bytes of key stream
   169  	MOVOU	-16(SI),	X2		// SI/DI were advanced at the top of the loop
   170  	PXOR	X0,		X2
   171  	MOVOU	X2,		-16(DI)
   172  
   173  	CMPQ	SI,		R9		// cmp in with in+len-16
   174  	JLE	start				// jump if (in <= in+len-16)
   175  
   176  end:
   177  	DECB	CX				// undo the pre-increment so CX is the last consumed x
   178  	ADDQ	$16,		R9		// tmp = in+len
   179  
   180  	// handle the last bytes, one by one
   181  l2:	CMPQ	SI,		R9		// cmp in with in+len
   182  	JGE	finished			// jump if (in >= in+len)
   183  
   184  	INCB	CX
   185  	EXTEND(CX)
   186  	MOVBLZX	(R10)(CX*4),	AX		// tx = d[x]
   187  
   188  	ADDB	AX,		DX		// y += tx
   189  	EXTEND(DX)
   190  	MOVBLZX	(R10)(DX*4),	BX		// ty = d[y]
   191  	MOVB	BX,		(R10)(CX*4)	// d[x] = ty
   192  	ADDB	AX,		BX		// val = ty+tx
   193  	EXTEND(BX)
   194  	MOVB	AX,		(R10)(DX*4)	// d[y] = tx
   195  	MOVBLZX	(R10)(BX*4),	R8		// val = d[val]
   196  	XORB	(SI),		R8		// xor 1 byte
   197  	MOVB	R8,		(DI)
   198  	INCQ	SI				// in++
   199  	INCQ	DI				// out++
   200  	JMP l2
   201  
   202  finished:
   203  	MOVQ	j+40(FP),	BX
   204  	MOVB	DX, 0(BX)			// *yp = y
   205  	MOVQ	i+32(FP),	AX
   206  	MOVB	CX, 0(AX)			// *xp = x
   207  	RET