// Original source:
//	http://www.zorinaq.com/papers/rc4-amd64.html
//	http://www.zorinaq.com/papers/rc4-amd64.tar.bz2

#include "textflag.h"

// Local modifications:
//
// Transliterated from GNU to 6a assembly syntax by the Go authors.
// The comments and spacing are from the original.
//
// The new EXTEND macros avoid a bad stall on some systems after 8-bit math.
//
// The original code accumulated 64 bits of key stream in an integer
// register and then XOR'ed the key stream into the data 8 bytes at a time.
// Modified to accumulate 128 bits of key stream into an XMM register
// and then XOR the key stream into the data 16 bytes at a time.
// Approximately doubles throughput.
//
// The state table pointer is kept in R10 instead of BP. BP is the frame
// pointer on amd64, and because this function declares a $0 frame the
// assembler does not save and restore it, so using BP as a scratch
// register would leave the caller with a clobbered frame pointer.

// NOTE: Changing EXTEND to a no-op makes the code run 1.2x faster on Core i5
// but makes the code run 2.0x slower on Xeon.
#define EXTEND(r) MOVBLZX r, r

/*
** RC4 implementation optimized for AMD64.
**
** Author: Marc Bevand <bevand_m (at) epita.fr>
** Licence: I hereby disclaim the copyright on this code and place it
** in the public domain.
**
** The code has been designed to be easily integrated into openssl:
** the exported RC4() function can replace the actual implementations
** openssl already contains.  Please note that when linking with openssl,
** it requires that sizeof(RC4_INT) == 8.  So openssl must be compiled
** with -DRC4_INT='unsigned long'.
**
** The throughput achieved by this code is about 320 MBytes/sec, on
** a 1.8 GHz AMD Opteron (rev C0) processor.
*/

// xorKeyStream XORs n bytes at src with RC4 key stream and writes the
// result to dst, advancing the cipher state as it goes.
//
// Arguments (all 8-byte slots on the argument frame):
//	dst+0(FP)	pointer to the output bytes
//	src+8(FP)	pointer to the input bytes
//	n+16(FP)	number of bytes to process
//	state+24(FP)	pointer to the permutation table; entries are 4 bytes
//			apart and only the low byte of each entry is used
//	i+32(FP)	pointer to the byte holding index x; read on entry,
//			written back on exit
//	j+40(FP)	pointer to the byte holding index y; read on entry,
//			written back on exit
//
// Register use:
//	SI, DI		input and output cursors
//	R9		in+len (temporarily in+len-16 inside the 16-byte loop)
//	R10		base of the state table
//	CX, DX		x and y, kept zero-extended from their low byte
//	AX, BX		tx / ty scratch values
//	R8		index of, then value of, the current key byte
//	R12		&state[x] for the current 16-byte block
//	X0, X1, X2	even key bytes, odd key bytes, data block
//
// The work is split into three phases: single bytes until x is a multiple
// of 16 (l1), whole 16-byte blocks (start), and the remaining tail bytes (l2).
TEXT ·xorKeyStream(SB),NOSPLIT,$0
	MOVQ	n+16(FP),	BX		// rbx = ARG(len)
	MOVQ	src+8(FP),	SI		// in = ARG(in)
	MOVQ	dst+0(FP),	DI		// out = ARG(out)
	MOVQ	state+24(FP),	R10		// d = ARG(data)
	MOVQ	i+32(FP),	AX
	MOVBQZX	0(AX),		CX		// x = *xp
	MOVQ	j+40(FP),	AX
	MOVBQZX	0(AX),		DX		// y = *yp

	LEAQ	(SI)(BX*1),	R9		// limit = in+len

	// Phase 1: one byte per iteration until the incremented x is a
	// multiple of 16, at which point the block loop can take over.
l1:	CMPQ	SI,		R9		// cmp in with in+len
	JGE	finished			// jump if (in >= in+len)

	INCB	CX
	EXTEND(CX)
	TESTL	$15,		CX
	JZ	wordloop			// x%16 == 0; x is already advanced, no byte consumed yet

	MOVBLZX	(R10)(CX*4),	AX		// tx = d[x]

	ADDB	AX,		DX		// y += tx
	EXTEND(DX)
	MOVBLZX	(R10)(DX*4),	BX		// ty = d[y]
	MOVB	BX,		(R10)(CX*4)	// d[x] = ty
	ADDB	AX,		BX		// val = ty+tx
	EXTEND(BX)
	MOVB	AX,		(R10)(DX*4)	// d[y] = tx
	MOVBLZX	(R10)(BX*4),	R8		// val = d[val]
	XORB	(SI),		R8		// xor 1 byte
	MOVB	R8,		(DI)
	INCQ	SI				// in++
	INCQ	DI				// out++
	JMP	l1

	// Phase 2: 16 bytes per iteration. R9 is lowered to in+len-16 so the
	// loop test is a single compare; it is restored at "end".
wordloop:
	SUBQ	$16,		R9
	CMPQ	SI,		R9
	JGT	end				// fewer than 16 bytes remain

start:
	ADDQ	$16,		SI		// increment in
	ADDQ	$16,		DI		// increment out

	// Each KEYROUND generates one byte of key and
	// inserts it into an XMM register at the given 16-bit index.
	// The key state array is uint32 words only using the bottom
	// byte of each word, so the 16-bit OR only copies 8 useful bits.
	// We accumulate alternating bytes into X0 and X1, and then at
	// the end we combine X1<<8 into X0 to produce the actual key.
	//
	// At the beginning of the loop, CX%16 == 0, so the 16 loads
	// at state[CX], state[CX+1], ..., state[CX+15] can precompute
	// (state+CX) as R12 and then become R12[0], R12[1], ... R12[15],
	// without fear of the byte computation CX+15 wrapping around.
	//
	// The first round needs R12[0], the second needs R12[1], and so on.
	// We can avoid memory stalls by starting the load for round n+1
	// before the end of round n, using the LOAD macro.
	LEAQ	(R10)(CX*4),	R12

	// KEYROUND(xmm, load, off, r1, r2, index):
	// on entry r1 holds tx = R12[off] and DX holds the updated y.
	// It reads ty = d[y], stores d[y] = tx, runs "load" to fetch the next
	// round's tx into r2 (and advance y), stores R12[off] = ty, and
	// inserts d[(tx+ty)&255] into 16-bit lane "index" of xmm.
	// The next-round load is issued after the d[y] store, so it observes
	// that store if y happens to be the next x slot.
#define KEYROUND(xmm, load, off, r1, r2, index) \
	MOVBLZX	(R10)(DX*4),	R8; \
	MOVB	r1,		(R10)(DX*4); \
	load((off+1), r2); \
	MOVB	R8,		(off*4)(R12); \
	ADDB	r1,		R8; \
	EXTEND(R8); \
	PINSRW	$index,		(R10)(R8*4), xmm

	// LOAD(off, reg): reg = R12[off] (the next tx), then y += reg.
#define LOAD(off, reg) \
	MOVBLZX	(off*4)(R12),	reg; \
	ADDB	reg,		DX; \
	EXTEND(DX)

	// SKIP is the no-op "load" used by the last round of a block, which
	// has no following round to prefetch for.
#define SKIP(off, reg)

	LOAD(0, AX)
	KEYROUND(X0, LOAD, 0, AX, BX, 0)
	KEYROUND(X1, LOAD, 1, BX, AX, 0)
	KEYROUND(X0, LOAD, 2, AX, BX, 1)
	KEYROUND(X1, LOAD, 3, BX, AX, 1)
	KEYROUND(X0, LOAD, 4, AX, BX, 2)
	KEYROUND(X1, LOAD, 5, BX, AX, 2)
	KEYROUND(X0, LOAD, 6, AX, BX, 3)
	KEYROUND(X1, LOAD, 7, BX, AX, 3)
	KEYROUND(X0, LOAD, 8, AX, BX, 4)
	KEYROUND(X1, LOAD, 9, BX, AX, 4)
	KEYROUND(X0, LOAD, 10, AX, BX, 5)
	KEYROUND(X1, LOAD, 11, BX, AX, 5)
	KEYROUND(X0, LOAD, 12, AX, BX, 6)
	KEYROUND(X1, LOAD, 13, BX, AX, 6)
	KEYROUND(X0, LOAD, 14, AX, BX, 7)
	KEYROUND(X1, SKIP, 15, BX, AX, 7)

	ADDB	$16,		CX		// x += 16 (byte add; wraps at 256)

	// X0 holds key bytes 0,2,...,14 in the low byte of each 16-bit lane,
	// X1 holds key bytes 1,3,...,15 likewise. Shifting X1 left by 8 moves
	// its bytes into the odd positions; the byte lanes are then disjoint,
	// so PXOR merges them exactly as an OR would.
	PSLLQ	$8,		X1
	PXOR	X1,		X0
	MOVOU	-16(SI),	X2		// SI/DI were already advanced past this block
	PXOR	X0,		X2
	MOVOU	X2,		-16(DI)

	CMPQ	SI,		R9		// cmp in with in+len-16
	JLE	start				// jump if (in <= in+len-16)

end:
	DECB	CX				// undo the pre-increment; l2 increments x again per byte
	ADDQ	$16,		R9		// tmp = in+len

	// handle the last bytes, one by one
l2:	CMPQ	SI,		R9		// cmp in with in+len
	JGE	finished			// jump if (in >= in+len)

	INCB	CX
	EXTEND(CX)
	MOVBLZX	(R10)(CX*4),	AX		// tx = d[x]

	ADDB	AX,		DX		// y += tx
	EXTEND(DX)
	MOVBLZX	(R10)(DX*4),	BX		// ty = d[y]
	MOVB	BX,		(R10)(CX*4)	// d[x] = ty
	ADDB	AX,		BX		// val = ty+tx
	EXTEND(BX)
	MOVB	AX,		(R10)(DX*4)	// d[y] = tx
	MOVBLZX	(R10)(BX*4),	R8		// val = d[val]
	XORB	(SI),		R8		// xor 1 byte
	MOVB	R8,		(DI)
	INCQ	SI				// in++
	INCQ	DI				// out++
	JMP	l2

	// Write the updated indices back through the i and j pointers.
finished:
	MOVQ	j+40(FP),	BX
	MOVB	DX,		0(BX)		// *yp = y
	MOVQ	i+32(FP),	AX
	MOVB	CX,		0(AX)		// *xp = x
	RET