// Original source:
//	http://www.zorinaq.com/papers/rc4-amd64.html
//	http://www.zorinaq.com/papers/rc4-amd64.tar.bz2

#include "textflag.h"

// Local modifications:
//
// Transliterated from GNU to 6a assembly syntax by the Go authors.
// The comments and spacing are from the original.
//
// The new EXTEND macros avoid a bad stall on some systems after 8-bit math.
//
// The original code accumulated 64 bits of key stream in an integer
// register and then XOR'ed the key stream into the data 8 bytes at a time.
// Modified to accumulate 128 bits of key stream into an XMM register
// and then XOR the key stream into the data 16 bytes at a time.
// Approximately doubles throughput.
//
// Converted to amd64p32.
//
// To make safe for Native Client, avoid use of BP, R15,
// and two-register addressing modes.

// NOTE: Changing EXTEND to a no-op makes the code run 1.2x faster on Core i5
// but makes the code run 2.0x slower on Xeon.
// EXTEND(r) re-zero-extends r after 8-bit arithmetic so that subsequent
// 32-bit uses of r do not pay a partial-register merge penalty.
#define EXTEND(r) MOVBLZX r, r

/*
** RC4 implementation optimized for AMD64.
**
** Author: Marc Bevand <bevand_m (at) epita.fr>
** Licence: I hereby disclaim the copyright on this code and place it
** in the public domain.
**
** The code has been designed to be easily integrated into openssl:
** the exported RC4() function can replace the actual implementations
** openssl already contains. Please note that when linking with openssl,
** it requires that sizeof(RC4_INT) == 8. So openssl must be compiled
** with -DRC4_INT='unsigned long'.
**
** The throughput achieved by this code is about 320 MBytes/sec, on
** a 1.8 GHz AMD Opteron (rev C0) processor.
*/

// Argument layout on amd64p32 (4-byte pointers), as read via FP below:
//	dst+0(FP)  src+4(FP)  n+8(FP)  state+12(FP)  i+16(FP)  j+20(FP)
// NOTE(review): the Go prototype is presumably
//	func xorKeyStream(dst, src *byte, n int, state *[256]uint32, i, j *uint8)
// — confirm against the declaration in the companion .go file.
//
// Register roles for the whole function:
//	SI = in (src cursor), DI = out (dst cursor), R9 = limit,
//	R10 = d (state base; one uint32 word per state entry, low byte used),
//	CX = x index, DX = y index (the classic RC4 i/j).
TEXT ·xorKeyStream(SB),NOSPLIT,$0
	MOVL	n+8(FP),	BX		// rbx = ARG(len)
	MOVL	src+4(FP),	SI		// in = ARG(in)
	MOVL	dst+0(FP),	DI		// out = ARG(out)
	MOVL	state+12(FP),	R10		// d = ARG(data)
	MOVL	i+16(FP),	AX
	MOVBQZX	0(AX),	CX			// x = *xp
	MOVL	j+20(FP),	AX
	MOVBQZX	0(AX),	DX			// y = *yp

	LEAQ	(SI)(BX*1),	R9		// limit = in+len

	// Process one byte at a time until x (CX) reaches a multiple of 16,
	// so the 16-bytes-per-iteration wordloop can address state[x..x+15]
	// from a single base without the 8-bit index wrapping mid-block.
l1:	CMPQ	SI,	R9			// cmp in with in+len
	JGE	finished			// jump if (in >= in+len)

	INCB	CX
	EXTEND(CX)
	TESTL	$15,	CX
	JZ	wordloop
	LEAL	(R10)(CX*4),	R12

	MOVBLZX	(R12),	AX			// tx = d[x]

	ADDB	AX,	DX			// y += tx
	EXTEND(DX)
	LEAL	(R10)(DX*4),	R11
	MOVBLZX	(R11),	BX			// ty = d[y]
	MOVB	BX,	(R12)			// d[x] = ty
	ADDB	AX,	BX			// val = ty+tx
	EXTEND(BX)
	LEAL	(R10)(BX*4),	R13
	MOVB	AX,	(R11)			// d[y] = tx
	MOVBLZX	(R13),	R8			// val = d[val]
	XORB	(SI),	R8			// xor 1 byte
	MOVB	R8,	(DI)
	INCQ	SI				// in++
	INCQ	DI				// out++
	JMP	l1

wordloop:
	SUBQ	$16,	R9			// limit -= 16: bound below which a full 16-byte block fits
	CMPQ	SI,	R9
	JGT	end

start:
	ADDQ	$16,	SI			// increment in
	ADDQ	$16,	DI			// increment out

	// Each KEYROUND generates one byte of key and
	// inserts it into an XMM register at the given 16-bit index.
	// The key state array is uint32 words only using the bottom
	// byte of each word, so the 16-bit OR only copies 8 useful bits.
	// We accumulate alternating bytes into X0 and X1, and then at
	// the end we OR X1<<8 into X0 to produce the actual key.
	//
	// At the beginning of the loop, CX%16 == 0, so the 16 loads
	// at state[CX], state[CX+1], ..., state[CX+15] can precompute
	// (state+CX) as R12 and then become R12[0], R12[1], ... R12[15],
	// without fear of the byte computation CX+15 wrapping around.
	//
	// The first round needs R12[0], the second needs R12[1], and so on.
	// We can avoid memory stalls by starting the load for round n+1
	// before the end of round n, using the LOAD macro.
	LEAQ	(R10)(CX*4),	R12

// One RC4 round: r1 already holds tx = d[x+off]; swap d[x+off] and d[y]
// (R8 = ty), then PINSRW the keystream byte d[tx+ty] into lane $index of
// xmm. "load" is LOAD (prefetch next round's tx into r2 and step y) or
// SKIP for the final round. Clobbers R8, R11, R14.
#define KEYROUND(xmm, load, off, r1, r2, index) \
	LEAL	(R10)(DX*4),	R11; \
	MOVBLZX	(R11),	R8; \
	MOVB	r1,	(R11); \
	load((off+1), r2); \
	MOVB	R8,	(off*4)(R12); \
	ADDB	r1,	R8; \
	EXTEND(R8); \
	LEAL	(R10)(R8*4),	R14; \
	PINSRW	$index, (R14), xmm

// LOAD: reg = tx = d[x+off]; y += tx (8-bit add, then re-zero-extend).
#define LOAD(off, reg) \
	MOVBLZX	(off*4)(R12),	reg; \
	ADDB	reg,	DX; \
	EXTEND(DX)

// SKIP: no-op stand-in for LOAD in the last round (nothing to prefetch).
#define SKIP(off, reg)

	// AX and BX alternate as "tx of this round" / "prefetched tx of the
	// next round". Even-position key bytes accumulate in X0, odd in X1.
	LOAD(0, AX)
	KEYROUND(X0, LOAD, 0, AX, BX, 0)
	KEYROUND(X1, LOAD, 1, BX, AX, 0)
	KEYROUND(X0, LOAD, 2, AX, BX, 1)
	KEYROUND(X1, LOAD, 3, BX, AX, 1)
	KEYROUND(X0, LOAD, 4, AX, BX, 2)
	KEYROUND(X1, LOAD, 5, BX, AX, 2)
	KEYROUND(X0, LOAD, 6, AX, BX, 3)
	KEYROUND(X1, LOAD, 7, BX, AX, 3)
	KEYROUND(X0, LOAD, 8, AX, BX, 4)
	KEYROUND(X1, LOAD, 9, BX, AX, 4)
	KEYROUND(X0, LOAD, 10, AX, BX, 5)
	KEYROUND(X1, LOAD, 11, BX, AX, 5)
	KEYROUND(X0, LOAD, 12, AX, BX, 6)
	KEYROUND(X1, LOAD, 13, BX, AX, 6)
	KEYROUND(X0, LOAD, 14, AX, BX, 7)
	KEYROUND(X1, SKIP, 15, BX, AX, 7)

	ADDB	$16,	CX

	// Merge the two half-keystreams. PINSRW loaded 16 bits per lane —
	// the key byte plus the (zero) second byte of a uint32 state word —
	// so every lane's high byte is zero and shifting X1 left 8 bits
	// then XORing into X0 interleaves the odd bytes without bit loss.
	PSLLQ	$8,	X1
	PXOR	X1,	X0
	MOVOU	-16(SI),	X2
	PXOR	X0,	X2
	MOVOU	X2,	-16(DI)

	CMPQ	SI,	R9			// cmp in with in+len-16
	JLE	start				// jump if (in <= in+len-16)

end:
	DECB	CX				// l2 pre-increments x; back up one so no index is skipped
	ADDQ	$16,	R9			// tmp = in+len

	// handle the last bytes, one by one
l2:	CMPQ	SI,	R9			// cmp in with in+len
	JGE	finished			// jump if (in >= in+len)

	INCB	CX
	EXTEND(CX)
	LEAL	(R10)(CX*4),	R12
	MOVBLZX	(R12),	AX			// tx = d[x]

	ADDB	AX,	DX			// y += tx
	EXTEND(DX)
	LEAL	(R10)(DX*4),	R11
	MOVBLZX	(R11),	BX			// ty = d[y]
	MOVB	BX,	(R12)			// d[x] = ty
	ADDB	AX,	BX			// val = ty+tx
	EXTEND(BX)
	LEAL	(R10)(BX*4),	R13
	MOVB	AX,	(R11)			// d[y] = tx
	MOVBLZX	(R13),	R8			// val = d[val]
	XORB	(SI),	R8			// xor 1 byte
	MOVB	R8,	(DI)
	INCQ	SI				// in++
	INCQ	DI				// out++
	JMP	l2

finished:
	// Write the final x and y back through the i and j pointers so the
	// caller's RC4 state resumes where this call left off.
	MOVL	j+20(FP),	BX
	MOVB	DX,	0(BX)
	MOVL	i+16(FP),	AX
	MOVB	CX,	0(AX)
	RET