github.com/zebozhuang/go@v0.0.0-20200207033046-f8a98f6f5c5d/src/crypto/rc4/rc4_amd64p32.s

github.com/zebozhuang/go@v0.0.0-20200207033046-f8a98f6f5c5d/src/crypto/rc4/rc4_amd64p32.s (about)

     1  // Original source:
     2  //	http://www.zorinaq.com/papers/rc4-amd64.html
     3  //	http://www.zorinaq.com/papers/rc4-amd64.tar.bz2
     4  
     5  #include "textflag.h"
     6  
     7  // Local modifications:
     8  //
     9  // Transliterated from GNU to 6a assembly syntax by the Go authors.
    10  // The comments and spacing are from the original.
    11  //
    12  // The new EXTEND macros avoid a bad stall on some systems after 8-bit math.
    13  //
    14  // The original code accumulated 64 bits of key stream in an integer
    15  // register and then XOR'ed the key stream into the data 8 bytes at a time.
    16  // Modified to accumulate 128 bits of key stream into an XMM register
    17  // and then XOR the key stream into the data 16 bytes at a time.
    18  // Approximately doubles throughput.
    19  //
    20  // Converted to amd64p32.
    21  //
    22  // To make safe for Native Client, avoid use of BP, R15,
    23  // and two-register addressing modes.
    24  
    25  // NOTE: Changing EXTEND to a no-op makes the code run 1.2x faster on Core i5
    26  // but makes the code run 2.0x slower on Xeon.
        //
        // EXTEND(r) expands to a byte-to-long zero-extending move of r onto itself.
        // It is applied after each 8-bit INCB/ADDB on an index register so that the
        // register can then be used as a scaled index with its upper bits cleared
        // (see the stall note in the "Local modifications" comment above).
    27  #define EXTEND(r) MOVBLZX r, r
    28  
    29  /*
    30  ** RC4 implementation optimized for AMD64.
    31  **
    32  ** Author: Marc Bevand <bevand_m (at) epita.fr>
    33  ** Licence: I hereby disclaim the copyright on this code and place it
    34  ** in the public domain.
    35  **
    36  ** The code has been designed to be easily integrated into openssl:
    37  ** the exported RC4() function can replace the actual implementations
    38  ** openssl already contains. Please note that when linking with openssl,
    39  ** it requires that sizeof(RC4_INT) == 8. So openssl must be compiled
    40  ** with -DRC4_INT='unsigned long'.
    41  **
    42  ** The throughput achieved by this code is about 320 MBytes/sec, on
    43  ** a 1.8 GHz AMD Opteron (rev C0) processor.
    44  */
    45  
    46  TEXT ·xorKeyStream(SB),NOSPLIT,$0
        	// xorKeyStream generates n bytes of RC4 key stream and XORs them with
        	// the bytes at src, storing the result at dst. The indices x and y are
        	// read through the i and j pointers on entry and written back on exit.
        	//
        	// Arguments (4-byte frame slots, amd64p32 layout):
        	//	dst+0(FP)	output pointer
        	//	src+4(FP)	input pointer
        	//	n+8(FP)		number of bytes to process
        	//	state+12(FP)	base of the key state array; entries are 4 bytes
        	//			apart and only the low byte of each is used
        	//	i+16(FP)	pointer to the byte holding x
        	//	j+20(FP)	pointer to the byte holding y
        	//
        	// Register roles:
        	//	SI, DI		input / output cursors
        	//	R9		limit = src+n (temporarily limit-16 in the block loop)
        	//	R10		state base
        	//	CX, DX		x and y, kept zero-extended via EXTEND
        	//	AX, BX		tx / ty scratch (BX holds n only until R9 is computed)
        	//	R8		key byte / scratch
        	//	R11-R14		address temporaries (&state[y], &state[x], &state[val])
        	//	X0, X1, X2	key stream accumulators and data block
        	//
        	// Structure: l1 handles single bytes until the next x is a multiple
        	// of 16, start handles 16-byte blocks, l2 handles the remaining bytes.
    47  	MOVL	n+8(FP),	BX		// BX = n (len)
    48  	MOVL	src+4(FP),	SI		// SI = src (in)
    49  	MOVL	dst+0(FP),	DI		// DI = dst (out)
    50  	MOVL	state+12(FP),	R10		// R10 = state base (d)
    51  	MOVL	i+16(FP),	AX
    52  	MOVBQZX	0(AX),		CX		// x = *i
    53  	MOVL	j+20(FP),	AX
    54  	MOVBQZX	0(AX),		DX		// y = *j
    55  
    56  	LEAQ	(SI)(BX*1),	R9		// limit = in+len
    57  
        	// Byte-at-a-time loop. Runs until the input is exhausted or the
        	// incremented x has its low four bits clear, in which case control
        	// moves to wordloop with x already incremented but not yet consumed.
    58  l1:	CMPQ	SI,		R9		// cmp in with in+len
    59  	JGE	finished			// jump if (in >= in+len)
    60  
    61  	INCB	CX				// x++ (wraps at 256)
    62  	EXTEND(CX)
    63  	TESTL	$15,		CX		// x%16 == 0 ?
    64  	JZ	wordloop			// yes: switch to the 16-byte block path
    65  	LEAL	(R10)(CX*4), R12		// R12 = &d[x]
    66  
    67  	MOVBLZX	(R12),	AX			// tx = d[x]
    68  
    69  	ADDB	AX,		DX		// y += tx
    70  	EXTEND(DX)
    71  	LEAL (R10)(DX*4), R11			// R11 = &d[y]
    72  	MOVBLZX	(R11),	BX		// ty = d[y]
    73  	MOVB	BX,		(R12)	// d[x] = ty
    74  	ADDB	AX,		BX		// val = ty+tx
    75  	EXTEND(BX)
    76  	LEAL (R10)(BX*4), R13			// R13 = &d[val]
    77  	MOVB	AX,		(R11)	// d[y] = tx
    78  	MOVBLZX	(R13),	R8		// val = d[val]
    79  	XORB	(SI),		R8		// xor 1 byte
    80  	MOVB	R8,		(DI)		// *out = key byte ^ *in
    81  	INCQ	SI				// in++
    82  	INCQ	DI				// out++
    83  	JMP l1
    84  
        	// Block path entry. R9 is lowered by 16 so that "in <= R9" means at
        	// least 16 input bytes remain; it is restored at end.
    85  wordloop:
    86  	SUBQ	$16,		R9		// R9 = in+len-16
    87  	CMPQ	SI,		R9
    88  	JGT	end				// fewer than 16 bytes left: skip the block loop
    89  
    90  start:
        	// The cursors are advanced first; the block itself is accessed
        	// below at -16(SI) and -16(DI).
    91  	ADDQ	$16,		SI		// increment in
    92  	ADDQ	$16,		DI		// increment out
    93  
    94  	// Each KEYROUND generates one byte of key and
    95  	// inserts it into an XMM register at the given 16-bit index.
    96  	// The key state array is uint32 words only using the bottom
    97  	// byte of each word, so the 16-bit OR only copies 8 useful bits.
    98  	// We accumulate alternating bytes into X0 and X1, and then at
    99  	// the end we OR X1<<8 into X0 to produce the actual key.
   100  	//
   101  	// At the beginning of the loop, CX%16 == 0, so the 16 loads
   102  	// at state[CX], state[CX+1], ..., state[CX+15] can precompute
   103  	// (state+CX) as R12 and then become R12[0], R12[1], ... R12[15],
   104  	// without fear of the byte computation CX+15 wrapping around.
   105  	//
   106  	// The first round needs R12[0], the second needs R12[1], and so on.
   107  	// We can avoid memory stalls by starting the load for round n+1
   108  	// before the end of round n, using the LOAD macro.
   109  	LEAQ	(R10)(CX*4),	R12
   110  
        	// KEYROUND(xmm, load, off, r1, r2, index) performs the round for
        	// state[x+off]. On entry r1 holds tx = state[x+off] and DX already
        	// holds the updated y (both produced by the preceding LOAD). It:
        	//   - reads ty = state[y] into R8 and stores tx to state[y];
        	//   - invokes load(off+1, r2) to fetch the next round's tx into r2
        	//     and advance y (SKIP in the final round);
        	//   - stores ty to state[x+off] through R12;
        	//   - computes R8 = (ty+tx) mod 256 and inserts the 16 bits at
        	//     state[R8] into word `index` of xmm.
        	// Uses R8, R11 and R14 as scratch. The load is issued only after the
        	// store to state[y], so it observes that store when y == x+off+1.
   111  #define KEYROUND(xmm, load, off, r1, r2, index) \
   112  	LEAL (R10)(DX*4), R11; \
   113  	MOVBLZX	(R11),	R8; \
   114  	MOVB	r1,		(R11); \
   115  	load((off+1), r2); \
   116  	MOVB	R8,		(off*4)(R12); \
   117  	ADDB	r1,		R8; \
   118  	EXTEND(R8); \
   119  	LEAL (R10)(R8*4), R14; \
   120  	PINSRW	$index, (R14), xmm
   121  
        	// LOAD(off, reg): reg = low byte of state[x+off], zero-extended;
        	// then y = (y + reg) mod 256.
   122  #define LOAD(off, reg) \
   123  	MOVBLZX	(off*4)(R12),	reg; \
   124  	ADDB	reg,		DX; \
   125  	EXTEND(DX)
   126  
        	// SKIP expands to nothing; it is passed as the load argument of the
        	// last KEYROUND, which has no following round within this block.
   127  #define SKIP(off, reg)
   128  
        	// Prime the pipeline with tx for round 0, then run 16 rounds.
        	// AX and BX alternate as tx; even rounds fill X0, odd rounds fill X1.
   129  	LOAD(0, AX)
   130  	KEYROUND(X0, LOAD, 0, AX, BX, 0)
   131  	KEYROUND(X1, LOAD, 1, BX, AX, 0)
   132  	KEYROUND(X0, LOAD, 2, AX, BX, 1)
   133  	KEYROUND(X1, LOAD, 3, BX, AX, 1)
   134  	KEYROUND(X0, LOAD, 4, AX, BX, 2)
   135  	KEYROUND(X1, LOAD, 5, BX, AX, 2)
   136  	KEYROUND(X0, LOAD, 6, AX, BX, 3)
   137  	KEYROUND(X1, LOAD, 7, BX, AX, 3)
   138  	KEYROUND(X0, LOAD, 8, AX, BX, 4)
   139  	KEYROUND(X1, LOAD, 9, BX, AX, 4)
   140  	KEYROUND(X0, LOAD, 10, AX, BX, 5)
   141  	KEYROUND(X1, LOAD, 11, BX, AX, 5)
   142  	KEYROUND(X0, LOAD, 12, AX, BX, 6)
   143  	KEYROUND(X1, LOAD, 13, BX, AX, 6)
   144  	KEYROUND(X0, LOAD, 14, AX, BX, 7)
   145  	KEYROUND(X1, SKIP, 15, BX, AX, 7)
   146  	
   147  	ADDB	$16,		CX		// x += 16; low four bits stay clear
   148  
   149  	PSLLQ	$8,		X1		// move the odd-round key bytes up one byte lane
   150  	PXOR	X1,		X0		// merge; with zero upper bytes in each inserted word the lanes are disjoint, so XOR acts as the OR described above
   151  	MOVOU	-16(SI),	X2		// load the 16 input bytes of this block
   152  	PXOR	X0,		X2		// data ^= key stream
   153  	MOVOU	X2,		-16(DI)		// store the 16 output bytes
   154  
   155  	CMPQ	SI,		R9		// cmp in with in+len-16
   156  	JLE	start				// jump if (in <= in+len-16)
   157  
        	// Leaving the block path: CX is one ahead of the last x consumed
        	// (l2 increments before use), so step it back, and restore the limit.
   158  end:
   159  	DECB	CX
   160  	ADDQ	$16,		R9		// tmp = in+len
   161  
   162  	// handle the last bytes, one by one
   163  l2:	CMPQ	SI,		R9		// cmp in with in+len
   164  	JGE	finished			// jump if (in >= in+len)
   165  
   166  	INCB	CX				// x++ (wraps at 256)
   167  	EXTEND(CX)
   168  	LEAL (R10)(CX*4), R12			// R12 = &d[x]
   169  	MOVBLZX	(R12),	AX			// tx = d[x]
   170  
   171  	ADDB	AX,		DX		// y += tx
   172  	EXTEND(DX)
   173  	LEAL (R10)(DX*4), R11			// R11 = &d[y]
   174  	MOVBLZX	(R11),	BX		// ty = d[y]
   175  	MOVB	BX,		(R12)	// d[x] = ty
   176  	ADDB	AX,		BX		// val = ty+tx
   177  	EXTEND(BX)
   178  	LEAL (R10)(BX*4), R13			// R13 = &d[val]
   179  	MOVB	AX,		(R11)	// d[y] = tx
   180  	MOVBLZX	(R13),	R8		// val = d[val]
   181  	XORB	(SI),		R8		// xor 1 byte
   182  	MOVB	R8,		(DI)		// *out = key byte ^ *in
   183  	INCQ	SI				// in++
   184  	INCQ	DI				// out++
   185  	JMP l2
   186  
        	// Write the updated indices back through the caller's pointers.
   187  finished:
   188  	MOVL	j+20(FP),	BX
   189  	MOVB	DX, 0(BX)			// *j = y
   190  	MOVL	i+16(FP),	AX
   191  	MOVB	CX, 0(AX)			// *i = x
   192  	RET