github.com/mad-day/Yawning-crypto@v0.0.0-20190711051033-5a5f8cca32ec/aez/round_bitsliced64.go (about)

     1  // round_bitsliced64.go - 64bit constant time AES round function.
     2  //
     3  // To the extent possible under law, Yawning Angel has waived all copyright
     4  // and related or neighboring rights to aez, using the Creative
     5  // Commons "CC0" public domain dedication. See LICENSE or
     6  // <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
     7  
     8  package aez
     9  
    10  import "github.com/mad-day/Yawning-crypto/bsaes/ct64"
    11  
// roundB64 is the 64-bit bitsliced AES round implementation.  It stores the
// round keys in the form produced by ct64.RkeyOrtho.
type roundB64 struct {
	// skey holds four key slots starting at word offsets 0 (I), 8 (J),
	// 16 (L) and 24 (the zero key).  newRoundB64 only fills the first
	// three; the last slot is left at its zero value.
	skey [32]uint64 // I, J, L, 0
}
    15  
    16  func newRoundB64(extractedKey *[extractedKeySize]byte) aesImpl {
    17  	r := new(roundB64)
    18  	for i := 0; i < 3; i++ {
    19  		ct64.RkeyOrtho(r.skey[i*8:], extractedKey[i*16:])
    20  	}
    21  
    22  	return r
    23  }
    24  
    25  func (r *roundB64) Reset() {
    26  	memwipeU64(r.skey[:])
    27  }
    28  
    29  func (r *roundB64) AES4(j, i, l *[blockSize]byte, src []byte, dst *[blockSize]byte) {
    30  	var q [8]uint64
    31  	xorBytes4x16(j[:], i[:], l[:], src, dst[:])
    32  
    33  	ct64.Load4xU32(&q, dst[:])
    34  	r.round(&q, r.skey[8:])  // J
    35  	r.round(&q, r.skey[0:])  // I
    36  	r.round(&q, r.skey[16:]) // L
    37  	r.round(&q, r.skey[24:]) // zero
    38  	ct64.Store4xU32(dst[:], &q)
    39  
    40  	memwipeU64(q[:])
    41  }
    42  
    43  func (r *roundB64) aes4x4(
    44  	j0, i0, l0 *[blockSize]byte, src0 []byte, dst0 *[blockSize]byte,
    45  	j1, i1, l1 *[blockSize]byte, src1 []byte, dst1 *[blockSize]byte,
    46  	j2, i2, l2 *[blockSize]byte, src2 []byte, dst2 *[blockSize]byte,
    47  	j3, i3, l3 *[blockSize]byte, src3 []byte, dst3 *[blockSize]byte) {
    48  	var q [8]uint64
    49  	xorBytes4x16(j0[:], i0[:], l0[:], src0, dst0[:])
    50  	xorBytes4x16(j1[:], i1[:], l1[:], src1, dst1[:])
    51  	xorBytes4x16(j2[:], i2[:], l2[:], src2, dst2[:])
    52  	xorBytes4x16(j3[:], i3[:], l3[:], src3, dst3[:])
    53  
    54  	ct64.Load16xU32(&q, dst0[:], dst1[:], dst2[:], dst3[:])
    55  	r.round(&q, r.skey[8:])  // J
    56  	r.round(&q, r.skey[0:])  // I
    57  	r.round(&q, r.skey[16:]) // L
    58  	r.round(&q, r.skey[24:]) // zero
    59  	ct64.Store16xU32(dst0[:], dst1[:], dst2[:], dst3[:], &q)
    60  
    61  	memwipeU64(q[:])
    62  }
    63  
    64  func (r *roundB64) AES10(l *[blockSize]byte, src []byte, dst *[blockSize]byte) {
    65  	var q [8]uint64
    66  	xorBytes1x16(src, l[:], dst[:])
    67  
    68  	ct64.Load4xU32(&q, dst[:])
    69  	for i := 0; i < 3; i++ {
    70  		r.round(&q, r.skey[0:])  // I
    71  		r.round(&q, r.skey[8:])  // J
    72  		r.round(&q, r.skey[16:]) // L
    73  	}
    74  	r.round(&q, r.skey[0:]) // I
    75  	ct64.Store4xU32(dst[:], &q)
    76  
    77  	memwipeU64(q[:])
    78  }
    79  
// round applies one AES round to the bitsliced state q in place: Sbox,
// ShiftRows, MixColumns and finally AddRoundKey with the round key k, in
// that order.  It does not read any field of r.
func (r *roundB64) round(q *[8]uint64, k []uint64) {
	ct64.Sbox(q)
	ct64.ShiftRows(q)
	ct64.MixColumns(q)
	ct64.AddRoundKey(q, k)
}
    86  
// aezCorePass1 walks sz bytes of in as consecutive pairs of blocks, writes
// the transformed pairs to out and accumulates into X.
//
// For each pair (a, b) = (in[0:], in[blockSize:]) with block counter i:
//
//	out_a = a ^ E(1,i)(b)     (AES4 with e.J[0], the running I, e.L[i%8])
//	out_b = b ^ E(0,0)(out_a) (AES4 with zero, e.I[0], e.L[0])
//	X    ^= out_b
//
// NOTE(review): the formulas assume xorBytes1x16(a, b, dst) stores a XOR b
// into dst; confirm against its definition.  Presumably callers pass sz as
// a multiple of 2*blockSize - confirm.
func (r *roundB64) aezCorePass1(e *eState, in, out []byte, X *[blockSize]byte, sz int) {
	var tmp0, tmp1, tmp2, tmp3, I [blockSize]byte

	// Work on a private copy of e.I[1] so that doubling it below does not
	// modify e.
	copy(I[:], e.I[1][:])
	i := 1

	// Process 8 * 16 bytes at a time in a loop.
	for mult := false; sz >= 8*blockSize; mult = !mult {
		// Four pairs at once: the AES4 inputs are the odd-numbered blocks
		// of in, and the results are folded into the even-numbered blocks.
		r.aes4x4(&e.J[0], &I, &e.L[(i+0)%8], in[blockSize:], &tmp0,
			&e.J[0], &I, &e.L[(i+1)%8], in[blockSize*3:], &tmp1,
			&e.J[0], &I, &e.L[(i+2)%8], in[blockSize*5:], &tmp2,
			&e.J[0], &I, &e.L[(i+3)%8], in[blockSize*7:], &tmp3) // E(1,i) ... E(1,i+3)
		xorBytes1x16(in[:], tmp0[:], out[:])
		xorBytes1x16(in[blockSize*2:], tmp1[:], out[blockSize*2:])
		xorBytes1x16(in[blockSize*4:], tmp2[:], out[blockSize*4:])
		xorBytes1x16(in[blockSize*6:], tmp3[:], out[blockSize*6:])

		// The freshly written even-numbered output blocks feed E(0,0); the
		// results are folded into the odd-numbered blocks.
		r.aes4x4(&zero, &e.I[0], &e.L[0], out[:], &tmp0,
			&zero, &e.I[0], &e.L[0], out[blockSize*2:], &tmp1,
			&zero, &e.I[0], &e.L[0], out[blockSize*4:], &tmp2,
			&zero, &e.I[0], &e.L[0], out[blockSize*6:], &tmp3) // E(0,0) x4
		xorBytes1x16(in[blockSize:], tmp0[:], out[blockSize:])
		xorBytes1x16(in[blockSize*3:], tmp1[:], out[blockSize*3:])
		xorBytes1x16(in[blockSize*5:], tmp2[:], out[blockSize*5:])
		xorBytes1x16(in[blockSize*7:], tmp3[:], out[blockSize*7:])

		// Accumulate the odd-numbered output blocks into X.
		xorBytes1x16(out[blockSize:], X[:], X[:])
		xorBytes1x16(out[blockSize*3:], X[:], X[:])
		xorBytes1x16(out[blockSize*5:], X[:], X[:])
		xorBytes1x16(out[blockSize*7:], X[:], X[:])

		sz -= 8 * blockSize
		// The literal 128 equals 8*blockSize only while blockSize is 16.
		in, out = in[128:], out[128:]
		// Each pass consumes four values of i, so doubling on every second
		// pass doubles I once per eight values of i.
		if mult { // Multiply every other pass.
			doubleBlock(&I)
		}
		i += 4
	}

	// XXX/performance: 4 * 16 bytes at a time.

	// Handle whatever is left one pair at a time, with the same per-pair
	// steps as above.
	for sz > 0 {
		r.AES4(&e.J[0], &I, &e.L[i%8], in[blockSize:], &tmp0) // E(1,i)
		xorBytes1x16(in[:], tmp0[:], out[:])
		r.AES4(&zero, &e.I[0], &e.L[0], out[:], &tmp0) // E(0,0)
		xorBytes1x16(in[blockSize:], tmp0[:], out[blockSize:])
		xorBytes1x16(out[blockSize:], X[:], X[:])

		sz -= 2 * blockSize
		in, out = in[32:], out[32:]
		// Double I once the pair whose counter is a multiple of 8 is done.
		if i%8 == 0 {
			doubleBlock(&I)
		}
		i++
	}

	// Wipe the scratch blocks and the working copy of I.
	memwipe(tmp0[:])
	memwipe(tmp1[:])
	memwipe(tmp2[:])
	memwipe(tmp3[:])
	memwipe(I[:])
}
   149  
// aezCorePass2 walks sz bytes of out in place as consecutive pairs of
// blocks, using S as the AES4 input for E(2,i) and accumulating into Y.
//
// For each pair (a, b) = (out[0:], out[blockSize:]) with block counter i:
//
//	t  = E(2,i)(S)   (AES4 with e.J[1], the running I, e.L[i%8])
//	a ^= t; b ^= t
//	Y ^= a
//	a ^= E(0,0)(b)   (AES4 with zero, e.I[0], e.L[0])
//	b ^= E(1,i)(a)   (AES4 with e.J[0], the running I, e.L[i%8])
//	swapBlocks on the pair
//
// NOTE(review): the formulas assume xorBytes1x16(a, b, dst) stores a XOR b
// into dst, and swapBlocks presumably exchanges the two blocks of the pair
// using its first argument as scratch; confirm against their definitions.
// Presumably callers pass sz as a multiple of 2*blockSize - confirm.
func (r *roundB64) aezCorePass2(e *eState, out []byte, Y, S *[blockSize]byte, sz int) {
	var tmp0, tmp1, tmp2, tmp3, I [blockSize]byte

	// Work on a private copy of e.I[1] so that doubling it below does not
	// modify e.
	copy(I[:], e.I[1][:])
	i := 1

	// Process 8 * 16 bytes at a time in a loop.
	for mult := false; sz >= 8*blockSize; mult = !mult {
		// All four lanes take S as input; only the L index differs.
		r.aes4x4(&e.J[1], &I, &e.L[(i+0)%8], S[:], &tmp0,
			&e.J[1], &I, &e.L[(i+1)%8], S[:], &tmp1,
			&e.J[1], &I, &e.L[(i+2)%8], S[:], &tmp2,
			&e.J[1], &I, &e.L[(i+3)%8], S[:], &tmp3) // E(2,i) .. E(2,i+3)
		// Fold each lane's result into both blocks of its pair.
		xorBytes1x16(out, tmp0[:], out[:])
		xorBytes1x16(out[blockSize*2:], tmp1[:], out[blockSize*2:])
		xorBytes1x16(out[blockSize*4:], tmp2[:], out[blockSize*4:])
		xorBytes1x16(out[blockSize*6:], tmp3[:], out[blockSize*6:])
		xorBytes1x16(out[blockSize:], tmp0[:], out[blockSize:])
		xorBytes1x16(out[blockSize*3:], tmp1[:], out[blockSize*3:])
		xorBytes1x16(out[blockSize*5:], tmp2[:], out[blockSize*5:])
		xorBytes1x16(out[blockSize*7:], tmp3[:], out[blockSize*7:])
		// Accumulate the updated even-numbered blocks into Y.
		xorBytes1x16(out, Y[:], Y[:])
		xorBytes1x16(out[blockSize*2:], Y[:], Y[:])
		xorBytes1x16(out[blockSize*4:], Y[:], Y[:])
		xorBytes1x16(out[blockSize*6:], Y[:], Y[:])

		// Odd-numbered blocks feed E(0,0); results go into the
		// even-numbered blocks.
		r.aes4x4(&zero, &e.I[0], &e.L[0], out[blockSize:], &tmp0,
			&zero, &e.I[0], &e.L[0], out[blockSize*3:], &tmp1,
			&zero, &e.I[0], &e.L[0], out[blockSize*5:], &tmp2,
			&zero, &e.I[0], &e.L[0], out[blockSize*7:], &tmp3) // E(0,0)x4
		xorBytes1x16(out, tmp0[:], out[:])
		xorBytes1x16(out[blockSize*2:], tmp1[:], out[blockSize*2:])
		xorBytes1x16(out[blockSize*4:], tmp2[:], out[blockSize*4:])
		xorBytes1x16(out[blockSize*6:], tmp3[:], out[blockSize*6:])

		// Even-numbered blocks feed E(1,i..i+3); results go into the
		// odd-numbered blocks.
		r.aes4x4(&e.J[0], &I, &e.L[(i+0)%8], out[:], &tmp0,
			&e.J[0], &I, &e.L[(i+1)%8], out[blockSize*2:], &tmp1,
			&e.J[0], &I, &e.L[(i+2)%8], out[blockSize*4:], &tmp2,
			&e.J[0], &I, &e.L[(i+3)%8], out[blockSize*6:], &tmp3) // E(1,i) ...  E(1,i+3)
		xorBytes1x16(out[blockSize:], tmp0[:], out[blockSize:])
		xorBytes1x16(out[blockSize*3:], tmp1[:], out[blockSize*3:])
		xorBytes1x16(out[blockSize*5:], tmp2[:], out[blockSize*5:])
		xorBytes1x16(out[blockSize*7:], tmp3[:], out[blockSize*7:])

		// tmp0 is passed to all four swaps; its AES4 result has already
		// been consumed above.
		swapBlocks(&tmp0, out)
		swapBlocks(&tmp0, out[blockSize*2:])
		swapBlocks(&tmp0, out[blockSize*4:])
		swapBlocks(&tmp0, out[blockSize*6:])

		sz -= 8 * blockSize
		// The literal 128 equals 8*blockSize only while blockSize is 16.
		out = out[128:]
		// Each pass consumes four values of i, so doubling on every second
		// pass doubles I once per eight values of i.
		if mult { // Multiply every other pass.
			doubleBlock(&I)
		}
		i += 4
	}

	// XXX/performance: 4 * 16 bytes at a time.

	// Handle whatever is left one pair at a time, with the same per-pair
	// steps as above.
	for sz > 0 {
		r.AES4(&e.J[1], &I, &e.L[i%8], S[:], &tmp0) // E(2,i)
		xorBytes1x16(out, tmp0[:], out[:])
		xorBytes1x16(out[blockSize:], tmp0[:], out[blockSize:])
		xorBytes1x16(out, Y[:], Y[:])

		r.AES4(&zero, &e.I[0], &e.L[0], out[blockSize:], &tmp0) // E(0,0)
		xorBytes1x16(out, tmp0[:], out[:])

		r.AES4(&e.J[0], &I, &e.L[i%8], out[:], &tmp0) // E(1,i)
		xorBytes1x16(out[blockSize:], tmp0[:], out[blockSize:])

		swapBlocks(&tmp0, out)

		sz -= 2 * blockSize
		out = out[32:]
		// Double I once the pair whose counter is a multiple of 8 is done.
		if i%8 == 0 {
			doubleBlock(&I)
		}
		i++
	}

	// Wipe the scratch blocks and the working copy of I.
	memwipe(tmp0[:])
	memwipe(tmp1[:])
	memwipe(tmp2[:])
	memwipe(tmp3[:])
	memwipe(I[:])
}
   236  
   237  func memwipeU64(s []uint64) {
   238  	for i := range s {
   239  		s[i] = 0
   240  	}
   241  }