github.com/xushiwei/go@v0.0.0-20130601165731-2b9d83f45bc9/src/pkg/crypto/rc4/rc4_amd64.s (about)

     1  // Original source:
     2  //	http://www.zorinaq.com/papers/rc4-amd64.html
     3  //	http://www.zorinaq.com/papers/rc4-amd64.tar.bz2
     4  
     5  // Local modifications:
     6  //
     7  // Transliterated from GNU to 6a assembly syntax by the Go authors.
     8  // The comments and spacing are from the original.
     9  //
    10  // The new EXTEND macros avoid a bad stall on some systems after 8-bit math.
    11  //
    12  // The original code accumulated 64 bits of key stream in an integer
    13  // register and then XOR'ed the key stream into the data 8 bytes at a time.
    14  // Modified to accumulate 128 bits of key stream into an XMM register
    15  // and then XOR the key stream into the data 16 bytes at a time.
    16  // Approximately doubles throughput.
    17  
    18  // NOTE: Changing EXTEND to a no-op makes the code run 1.2x faster on Core i5
    19  // but makes the code run 2.0x slower on Xeon.
        //
        // EXTEND(r) replaces r with the zero-extension of its own low byte
        // (MOVBLZX r, r). In this file it follows the 8-bit INCB/ADDB
        // operations on x, y and tx+ty, before the result is used to index
        // the state array.
    20  #define EXTEND(r) MOVBLZX r, r
    21  
    22  /*
    23  ** RC4 implementation optimized for AMD64.
    24  **
    25  ** Author: Marc Bevand <bevand_m (at) epita.fr>
    26  ** Licence: I hereby disclaim the copyright on this code and place it
    27  ** in the public domain.
    28  **
    29  ** The code has been designed to be easily integrated into openssl:
    30  ** the exported RC4() function can replace the actual implementations
    31  ** openssl already contains. Please note that when linking with openssl,
    32  ** it requires that sizeof(RC4_INT) == 8. So openssl must be compiled
    33  ** with -DRC4_INT='unsigned long'.
    34  **
    35  ** The throughput achieved by this code is about 320 MBytes/sec, on
    36  ** a 1.8 GHz AMD Opteron (rev C0) processor.
    37  */
    38  
    39  TEXT ·xorKeyStream(SB),7,$0
        // xorKeyStream generates RC4 key stream from the state array and the
        // x/y indices, XORs it with the input bytes, and stores the result in
        // the output buffer. The updated x and y are written back through
        // their pointers before returning.
        //
        // Arguments, as read from the frame below:
        //	dst+0(FP)	out: address the XORed bytes are written to
        //	src+8(FP)	in: address the input bytes are read from
        //	n+16(FP)	len: number of bytes to process
        //	state+24(FP)	d: base of the key state array; entries are 4 bytes
        //			apart and only the low byte of each entry is written
        //	i+32(FP)	xp: pointer to the byte holding x
        //	j+40(FP)	yp: pointer to the byte holding y
        //
        // Register roles:
        //	SI = in, DI = out, BP = d, CX = x, DX = y
        //	R9 = in+len (in+len-16 while the 16-byte loop runs)
        //	AX, BX = tx/ty scratch, R8 = key stream byte being computed
        //	R12 = &d[x] inside the 16-byte loop
        //	X0, X1 = key stream accumulators, X2 = data being XORed
        //
        // Control flow:
        //	l1	one byte per iteration until the incremented x is a
        //		multiple of 16
        //	start	16 bytes per iteration while at least 16 bytes remain
        //	l2	one byte per iteration for whatever is left
        //
        // NOTE(review): the pointer comparisons use the signed conditions
        // JGE/JGT/JLE; this presumably relies on in and in+len never
        // straddling the sign bit -- confirm for the target platforms.
    40  	MOVQ	n+16(FP),	BX		// rbx = ARG(len)
    41  	MOVQ	src+8(FP),	SI		// in = ARG(in)
    42  	MOVQ	dst+0(FP),	DI		// out = ARG(out)
    43  	MOVQ	state+24(FP),	BP		// d = ARG(data)
    44  	MOVQ	i+32(FP),	AX
    45  	MOVBQZX	0(AX),		CX		// x = *xp
    46  	MOVQ	j+40(FP),	AX
    47  	MOVBQZX	0(AX),		DX		// y = *yp
    48  
    49  	LEAQ	(SI)(BX*1),	R9		// limit = in+len
    50  
    51  l1:	CMPQ	SI,		R9		// cmp in with in+len
    52  	JGE	finished			// jump if (in >= in+len)
    53  
    54  	INCB	CX				// x++ (8-bit, wraps at 256)
    55  	EXTEND(CX)
    56  	TESTL	$15,		CX		// is the new x a multiple of 16?
    57  	JZ	wordloop			// yes: try the 16-byte loop; d[x] is not consumed yet
    58  
    59  	MOVBLZX	(BP)(CX*4),	AX		// tx = d[x]
    60  
    61  	ADDB	AX,		DX		// y += tx
    62  	EXTEND(DX)
    63  	MOVBLZX	(BP)(DX*4),	BX		// ty = d[y]
    64  	MOVB	BX,		(BP)(CX*4)	// d[x] = ty
    65  	ADDB	AX,		BX		// val = ty+tx
    66  	EXTEND(BX)
    67  	MOVB	AX,		(BP)(DX*4)	// d[y] = tx
    68  	MOVBLZX	(BP)(BX*4),	R8		// val = d[val]
    69  	XORB	(SI),		R8		// xor 1 byte
    70  	MOVB	R8,		(DI)
    71  	INCQ	SI				// in++
    72  	INCQ	DI				// out++
    73  	JMP l1
    74  
    75  wordloop:
    76  	SUBQ	$16,		R9		// limit = in+len-16
    77  	CMPQ	SI,		R9
    78  	JGT	end				// jump if (in > in+len-16): fewer than 16 bytes left
    79  
    80  start:
    81  	ADDQ	$16,		SI		// increment in (the data is read at -16(SI) below)
    82  	ADDQ	$16,		DI		// increment out (the result is stored at -16(DI) below)
    83  
    84  	// Each KEYROUND generates one byte of key and
    85  	// inserts it into an XMM register at the given 16-bit index.
    86  	// The key state array is uint32 words only using the bottom
    87  	// byte of each word, so the 16-bit OR only copies 8 useful bits.
    88  	// We accumulate alternating bytes into X0 and X1, and then at
    89  	// the end we OR X1<<8 into X0 to produce the actual key.
    90  	//
    91  	// At the beginning of the loop, CX%16 == 0, so the 16 loads
    92  	// at state[CX], state[CX+1], ..., state[CX+15] can precompute
    93  	// (state+CX) as R12 and then become R12[0], R12[1], ... R12[15],
    94  	// without fear of the byte computation CX+15 wrapping around.
    95  	//
    96  	// The first round needs R12[0], the second needs R12[1], and so on.
    97  	// We can avoid memory stalls by starting the load for round n+1
    98  	// before the end of round n, using the LOAD macro.
    99  	LEAQ	(BP)(CX*4),	R12		// R12 = &d[x]
   100  
        // KEYROUND(xmm, load, off, r1, r2, index) performs round `off` of the
        // 16-round block. On entry r1 holds tx = d[x+off] and DX already
        // holds the updated y (both set up by the preceding LOAD).
        //   1. R8 = ty = d[y]
        //   2. d[y] = tx
        //   3. load(off+1, r2): start the next round (LOAD) or do nothing
        //      (SKIP). It is issued after step 2, so it observes that store,
        //      and it changes DX, which this round no longer uses.
        //   4. d[x+off] = ty
        //   5. R8 = (ty + tx) mod 256
        //   6. insert the 16-bit word at d[R8] into lane `index` of xmm
   101  #define KEYROUND(xmm, load, off, r1, r2, index) \
   102  	MOVBLZX	(BP)(DX*4),	R8; \
   103  	MOVB	r1,		(BP)(DX*4); \
   104  	load((off+1), r2); \
   105  	MOVB	R8,		(off*4)(R12); \
   106  	ADDB	r1,		R8; \
   107  	EXTEND(R8); \
   108  	PINSRW	$index, (BP)(R8*4), xmm
   109  
        // LOAD(off, reg) fetches tx for round `off` into reg (reg = d[x+off])
        // and advances y by it: y = (y + tx) mod 256.
   110  #define LOAD(off, reg) \
   111  	MOVBLZX	(off*4)(R12),	reg; \
   112  	ADDB	reg,		DX; \
   113  	EXTEND(DX)
   114  
        // SKIP(off, reg) expands to nothing; it is passed as `load` for the
        // last round, which has no following round to prefetch for.
   115  #define SKIP(off, reg)
   116  
        // Sixteen rounds, one key byte each. AX and BX alternate as the tx
        // register so the next round's load can overlap the current round.
        // Even rounds fill lanes 0..7 of X0, odd rounds fill lanes 0..7 of X1.
   117  	LOAD(0, AX)
   118  	KEYROUND(X0, LOAD, 0, AX, BX, 0)
   119  	KEYROUND(X1, LOAD, 1, BX, AX, 0)
   120  	KEYROUND(X0, LOAD, 2, AX, BX, 1)
   121  	KEYROUND(X1, LOAD, 3, BX, AX, 1)
   122  	KEYROUND(X0, LOAD, 4, AX, BX, 2)
   123  	KEYROUND(X1, LOAD, 5, BX, AX, 2)
   124  	KEYROUND(X0, LOAD, 6, AX, BX, 3)
   125  	KEYROUND(X1, LOAD, 7, BX, AX, 3)
   126  	KEYROUND(X0, LOAD, 8, AX, BX, 4)
   127  	KEYROUND(X1, LOAD, 9, BX, AX, 4)
   128  	KEYROUND(X0, LOAD, 10, AX, BX, 5)
   129  	KEYROUND(X1, LOAD, 11, BX, AX, 5)
   130  	KEYROUND(X0, LOAD, 12, AX, BX, 6)
   131  	KEYROUND(X1, LOAD, 13, BX, AX, 6)
   132  	KEYROUND(X0, LOAD, 14, AX, BX, 7)
   133  	KEYROUND(X1, SKIP, 15, BX, AX, 7)
   134  	
   135  	ADDB	$16,		CX		// x += 16 (8-bit add, wraps at 256)
   136  
   137  	PSLLQ	$8,		X1		// move each odd-round key byte up one byte position
   138  	PXOR	X1,		X0		// merge: X0 now holds the 16 key bytes in order
   139  	MOVOU	-16(SI),	X2		// load the 16 input bytes (SI was advanced at start)
   140  	PXOR	X0,		X2		// data ^= key stream
   141  	MOVOU	X2,		-16(DI)		// store the 16 output bytes
   142  
   143  	CMPQ	SI,		R9		// cmp in with in+len-16
   144  	JLE	start				// jump if (in <= in+len-16)
   145  
   146  end:
   147  	DECB	CX				// x--: l2 increments x again before using it
   148  	ADDQ	$16,		R9		// tmp = in+len
   149  
   150  	// handle the last bytes, one by one
   151  l2:	CMPQ	SI,		R9		// cmp in with in+len
   152  	JGE	finished			// jump if (in >= in+len)
   153  
   154  	INCB	CX				// x++ (8-bit, wraps at 256)
   155  	EXTEND(CX)
   156  	MOVBLZX	(BP)(CX*4),	AX		// tx = d[x]
   157  
   158  	ADDB	AX,		DX		// y += tx
   159  	EXTEND(DX)
   160  	MOVBLZX	(BP)(DX*4),	BX		// ty = d[y]
   161  	MOVB	BX,		(BP)(CX*4)	// d[x] = ty
   162  	ADDB	AX,		BX		// val = ty+tx
   163  	EXTEND(BX)
   164  	MOVB	AX,		(BP)(DX*4)	// d[y] = tx
   165  	MOVBLZX	(BP)(BX*4),	R8		// val = d[val]
   166  	XORB	(SI),		R8		// xor 1 byte
   167  	MOVB	R8,		(DI)
   168  	INCQ	SI				// in++
   169  	INCQ	DI				// out++
   170  	JMP l2
   171  
   172  finished:
   173  	MOVQ	j+40(FP),	BX
   174  	MOVB	DX, 0(BX)			// *yp = y
   175  	MOVQ	i+32(FP),	AX
   176  	MOVB	CX, 0(AX)			// *xp = x
   177  	RET