// Path: github.com/xushiwei/go@v0.0.0-20130601165731-2b9d83f45bc9/src/pkg/crypto/rc4/rc4_amd64.s

// Original source:
//	http://www.zorinaq.com/papers/rc4-amd64.html
//	http://www.zorinaq.com/papers/rc4-amd64.tar.bz2

// Local modifications:
//
// Transliterated from GNU to 6a assembly syntax by the Go authors.
// The comments and spacing are from the original.
//
// The new EXTEND macros avoid a bad stall on some systems after 8-bit math.
//
// The original code accumulated 64 bits of key stream in an integer
// register and then XOR'ed the key stream into the data 8 bytes at a time.
// Modified to accumulate 128 bits of key stream into an XMM register
// and then XOR the key stream into the data 16 bytes at a time.
// Approximately doubles throughput.

// EXTEND(r) zero-extends the low byte of r into r itself. It is applied
// after every 8-bit INCB/ADDB whose result is subsequently used as a
// scaled index, so the index is always a clean value in 0..255.
//
// NOTE: Changing EXTEND to a no-op makes the code run 1.2x faster on Core i5
// but makes the code run 2.0x slower on Xeon.
#define EXTEND(r) MOVBLZX r, r

/*
** RC4 implementation optimized for AMD64.
**
** Author: Marc Bevand <bevand_m (at) epita.fr>
** Licence: I hereby disclaim the copyright on this code and place it
** in the public domain.
**
** The code has been designed to be easily integrated into openssl:
** the exported RC4() function can replace the actual implementations
** openssl already contains. Please note that when linking with openssl,
** it requires that sizeof(RC4_INT) == 8. So openssl must be compiled
** with -DRC4_INT='unsigned long'.
**
** The throughput achieved by this code is about 320 MBytes/sec, on
** a 1.8 GHz AMD Opteron (rev C0) processor.
*/

// xorKeyStream XORs n bytes read from src with RC4 key stream, writes the
// result to dst, and updates the cipher state in place.
//
// Arguments, as read from the frame below:
//	dst+0(FP)	output pointer
//	src+8(FP)	input pointer
//	n+16(FP)	number of bytes to process
//	state+24(FP)	base of the state table; entries are addressed with a
//			*4 scale and only their low byte is read or written here
//	i+32(FP)	pointer to the one-byte x index
//	j+40(FP)	pointer to the one-byte y index
// The final x and y are stored back through i and j before returning.
// NOTE(review): the Go-side prototype is not visible in this file; the
// layout above is inferred from the offsets only — confirm against the
// Go declaration.
// NOTE(review): TEXT flag 7 is presumably NOPROF|DUPOK|NOSPLIT in this
// toolchain version — confirm against the assembler's flag definitions.
//
// Register use:
//	SI = in, DI = out, R9 = end of input (in+len)
//	BP = state base (d), CX = x, DX = y
//	AX, BX, R8 = per-round temporaries (tx, ty, val); BX first carries len
//	R12 = &d[x] inside the 16-byte loop
//	X0, X1 = key stream accumulators, X2 = data block
//
// Structure:
//	l1		one byte per iteration until the incremented x is a
//			multiple of 16 (or the input is exhausted)
//	wordloop/start	16 bytes per iteration while at least 16 remain
//	end/l2		remaining bytes, one per iteration
//	finished	write x and y back
TEXT ·xorKeyStream(SB),7,$0
	MOVQ	n+16(FP), BX		// rbx = ARG(len)
	MOVQ	src+8(FP), SI		// in = ARG(in)
	MOVQ	dst+0(FP), DI		// out = ARG(out)
	MOVQ	state+24(FP), BP	// d = ARG(data)
	MOVQ	i+32(FP), AX
	MOVBQZX	0(AX), CX		// x = *xp
	MOVQ	j+40(FP), AX
	MOVBQZX	0(AX), DX		// y = *yp

	LEAQ	(SI)(BX*1), R9		// limit = in+len

	// Byte-at-a-time prologue. x is pre-incremented; as soon as the new x
	// is a multiple of 16 control moves to wordloop with that round still
	// pending (CX already holds the index it will use).
	// NOTE(review): JGE/JGT/JLE in this routine are signed conditions
	// applied to pointer values — presumably fine for the address ranges
	// in use; confirm.
l1:	CMPQ	SI, R9			// cmp in with in+len
	JGE	finished		// jump if (in >= in+len)

	INCB	CX
	EXTEND(CX)
	TESTL	$15, CX
	JZ	wordloop

	MOVBLZX	(BP)(CX*4), AX		// tx = d[x]

	ADDB	AX, DX			// y += tx
	EXTEND(DX)
	MOVBLZX	(BP)(DX*4), BX		// ty = d[y]
	MOVB	BX, (BP)(CX*4)		// d[x] = ty
	ADDB	AX, BX			// val = ty+tx
	EXTEND(BX)
	MOVB	AX, (BP)(DX*4)		// d[y] = tx
	MOVBLZX	(BP)(BX*4), R8		// val = d[val]
	XORB	(SI), R8		// xor 1 byte
	MOVB	R8, (DI)
	INCQ	SI			// in++
	INCQ	DI			// out++
	JMP l1

	// R9 is temporarily lowered to in+len-16 so that "in <= R9" means
	// "at least 16 bytes remain". It is restored at end.
wordloop:
	SUBQ	$16, R9
	CMPQ	SI, R9
	JGT	end

start:
	// in/out are advanced up front; the data block for this iteration is
	// therefore addressed as -16(SI) / -16(DI) below.
	ADDQ	$16, SI			// increment in
	ADDQ	$16, DI			// increment out

	// Each KEYROUND generates one byte of key and
	// inserts it into an XMM register at the given 16-bit index.
	// The key state array is uint32 words only using the bottom
	// byte of each word, so the 16-bit OR only copies 8 useful bits.
	// We accumulate alternating bytes into X0 and X1, and then at
	// the end we OR X1<<8 into X0 to produce the actual key.
	//
	// At the beginning of the loop, CX%16 == 0, so the 16 loads
	// at state[CX], state[CX+1], ..., state[CX+15] can precompute
	// (state+CX) as R12 and then become R12[0], R12[1], ... R12[15],
	// without fear of the byte computation CX+15 wrapping around.
	//
	// The first round needs R12[0], the second needs R12[1], and so on.
	// We can avoid memory stalls by starting the load for round n+1
	// before the end of round n, using the LOAD macro.
	LEAQ	(BP)(CX*4), R12

// KEYROUND(xmm, load, off, r1, r2, index) performs round number off.
// On entry r1 holds tx = R12[off] and DX (y) already includes tx.
// Steps, in order: R8 = ty = d[y]; d[y] = tx; load(off+1, r2) fetches the
// next round's tx into r2 and folds it into y; R12[off] = ty;
// R8 = (tx+ty) & 0xff; the 16-bit word at d[R8] is inserted into lane
// index of xmm.
// The next-round load is issued after the d[y] store, so it observes that
// store when y happens to equal x+off+1.
#define KEYROUND(xmm, load, off, r1, r2, index) \
	MOVBLZX	(BP)(DX*4), R8; \
	MOVB	r1, (BP)(DX*4); \
	load((off+1), r2); \
	MOVB	R8, (off*4)(R12); \
	ADDB	r1, R8; \
	EXTEND(R8); \
	PINSRW	$index, (BP)(R8*4), xmm

// LOAD(off, reg): reg = low byte of R12[off], then y = (y + reg) & 0xff.
#define LOAD(off, reg) \
	MOVBLZX	(off*4)(R12), reg; \
	ADDB	reg, DX; \
	EXTEND(DX)

// SKIP expands to nothing. It replaces LOAD in the last round so that
// R12[16] is not read and y is not advanced past the 16th byte.
#define SKIP(off, reg)

	// AX and BX alternate as tx between rounds; even rounds fill X0 and
	// odd rounds fill X1, two rounds per 16-bit lane index.
	LOAD(0, AX)
	KEYROUND(X0, LOAD, 0, AX, BX, 0)
	KEYROUND(X1, LOAD, 1, BX, AX, 0)
	KEYROUND(X0, LOAD, 2, AX, BX, 1)
	KEYROUND(X1, LOAD, 3, BX, AX, 1)
	KEYROUND(X0, LOAD, 4, AX, BX, 2)
	KEYROUND(X1, LOAD, 5, BX, AX, 2)
	KEYROUND(X0, LOAD, 6, AX, BX, 3)
	KEYROUND(X1, LOAD, 7, BX, AX, 3)
	KEYROUND(X0, LOAD, 8, AX, BX, 4)
	KEYROUND(X1, LOAD, 9, BX, AX, 4)
	KEYROUND(X0, LOAD, 10, AX, BX, 5)
	KEYROUND(X1, LOAD, 11, BX, AX, 5)
	KEYROUND(X0, LOAD, 12, AX, BX, 6)
	KEYROUND(X1, LOAD, 13, BX, AX, 6)
	KEYROUND(X0, LOAD, 14, AX, BX, 7)
	KEYROUND(X1, SKIP, 15, BX, AX, 7)

	// x += 16 using byte arithmetic, so it wraps modulo 256; the upper
	// bits of CX are untouched and stay zero from the earlier EXTEND.
	ADDB	$16, CX

	// Merge the odd bytes (X1, moved up by one byte) with the even bytes
	// (X0). PXOR serves as the "OR" described above, assuming the upper
	// byte of each inserted word is zero as that comment states.
	PSLLQ	$8, X1
	PXOR	X1, X0
	MOVOU	-16(SI), X2		// 16 input bytes (unaligned load)
	PXOR	X0, X2			// data ^= key stream
	MOVOU	X2, -16(DI)		// 16 output bytes (unaligned store)

	CMPQ	SI, R9			// cmp in with in+len-16
	JLE	start			// jump if (in <= in+len-16)

end:
	// Reached either straight from wordloop (x was incremented by l1 but
	// not yet used) or after the 16-byte loop (x is the next unused
	// index). In both cases l2 pre-increments x, so step back by one.
	DECB	CX
	ADDQ	$16, R9			// tmp = in+len

	// handle the last bytes, one by one
l2:	CMPQ	SI, R9			// cmp in with in+len
	JGE	finished		// jump if (in >= in+len)

	INCB	CX
	EXTEND(CX)
	MOVBLZX	(BP)(CX*4), AX		// tx = d[x]

	ADDB	AX, DX			// y += tx
	EXTEND(DX)
	MOVBLZX	(BP)(DX*4), BX		// ty = d[y]
	MOVB	BX, (BP)(CX*4)		// d[x] = ty
	ADDB	AX, BX			// val = ty+tx
	EXTEND(BX)
	MOVB	AX, (BP)(DX*4)		// d[y] = tx
	MOVBLZX	(BP)(BX*4), R8		// val = d[val]
	XORB	(SI), R8		// xor 1 byte
	MOVB	R8, (DI)
	INCQ	SI			// in++
	INCQ	DI			// out++
	JMP l2

finished:
	// Persist the indices: low byte of y through j, low byte of x through i.
	MOVQ	j+40(FP), BX
	MOVB	DX, 0(BX)
	MOVQ	i+32(FP), AX
	MOVB	CX, 0(AX)
	RET