github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/tcpip/checksum/checksum_amd64.s (about)

     1  // Copyright 2023 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  //go:build amd64
    16  // +build amd64
    17  
    18  #include "textflag.h"
    19  
    20  // calculateChecksum computes the checksum of a slice, taking into account a
    21  // previously computed initial value and whether the first byte is a lower or
    22  // upper byte.
    23  //
    24  // It utilizes byte order independence and parallel summation as described in
    25  // RFC 1071 1.2.
    26  //
    27  // The best way to understand this function is to understand
    28  // checksum_noasm_unsafe.go first, which implements largely the same logic.
    29  // Using assembly speeds things up via ADC (add with carry).
    30  //
    31  // An empty buffer returns initial and odd unchanged. Without that check, an
    32  // empty buffer with odd set would read buf[0] out of bounds and wrap buf_len
    33  // around to a huge unsigned value.
    34  //
    35  // Frame layout, as accessed below. The matching Go declaration is not in this
    36  // file; it is presumably
    37  //   func calculateChecksum(buf []byte, odd bool, initial uint16) (uint16, bool)
    38  // (TODO: confirm against the Go stub).
    39  //
    40  //   buf_base+0(FP)  address of the first byte to sum
    41  //   buf_len+8(FP)   number of bytes to sum
    42  //   odd+24(FP)      nonzero if a previous odd number of bytes was summed, so
    43  //                   buf[0] is the low byte of a 16-bit word
    44  //   initial+26(FP)  previously computed checksum, host byte order
    45  //   ret+32(FP)      resulting checksum, host byte order
    46  //   ret1+34(FP)     1 if the bytes left after handling odd were odd in number
    47  //
    48  // Register use:
    49  //
    50  //   AX           accumulator
    51  //   BX           bytes left to sum
    52  //   CX           address of the next unsummed byte
    53  //   R8           odd flag: the argument on entry, the result from newlyodd on
    54  //   R9, R10, DX  scratch
    55  TEXT ·calculateChecksum(SB),NOSPLIT|NOFRAME,$0-35
    56    // Store arguments in registers. The narrow ones are zero-extended so the
    57    // upper register bits never hold stale data.
    58    MOVWQZX initial+26(FP), AX
    59    MOVQ buf_len+8(FP), BX
    60    MOVQ buf_base+0(FP), CX
    61    MOVBQZX odd+24(FP), R8
    62  
    63    // Nothing to sum: return the inputs untouched. This must come before the
    64    // odd handling, which dereferences buf[0] and decrements buf_len.
    65    //
    66    // if buf_len == 0 {
    67    //   return initial, odd
    68    // }
    69    TESTQ BX, BX
    70    JZ done
    71  
    72    // Account for a previous odd number of bytes.
    73    //
    74    // if odd {
    75    //   initial += buf[0]
    76    //   buf = buf[1:]
    77    // }
    78    TESTQ R8, R8
    79    JZ newlyodd
    80    MOVBQZX (CX), R9
    81    ADDW R9, AX
    82    ADCW $0, AX // Wrap the carry out of bit 15 back into the sum.
    83    INCQ CX
    84    DECQ BX
    85  
    86    // See whether we're checksumming an odd number of bytes. If so, the final
    87    // byte is a big endian most significant byte, and so needs to be shifted.
    88    //
    89    // odd = buf_len%2 != 0
    90    // if odd {
    91    //   buf_len--
    92    //   initial += buf[buf_len]<<8
    93    // }
    94  newlyodd:
    95    XORQ R8, R8
    96    TESTQ $1, BX
    97    JZ swaporder
    98    MOVQ $1, R8
    99    DECQ BX
   100    MOVBQZX (CX)(BX*1), R10
   101    SHLQ $8, R10
   102    ADDW R10, AX
   103    ADCW $0, AX
   104  
   105  swaporder:
   106    // Load initial in network byte order. BSWAPQ moves the byte-swapped low
   107    // word into bits 48-63; the shift brings it back down and clears the rest.
   108    BSWAPQ AX
   109    SHRQ $48, AX
   110  
   111    // Accumulate 8 bytes at a time.
   112    //
   113    // while buf_len >= 8 {
   114    //   acc, carry = acc + *(uint64 *)(buf) + carry
   115    //   buf_len -= 8
   116    //   buf = buf[8:]
   117    // }
   118    // acc += carry
   119    JMP addcond
   120  addloop:
   121    ADDQ (CX), AX
   122    ADCQ $0, AX
   123    SUBQ $8, BX
   124    ADDQ $8, CX
   125  addcond:
   126    CMPQ BX, $8
   127    JAE addloop
   128  
   129    // TODO(krakauer): We can do 4 byte accumulation too.
   130  
   131    // Accumulate the rest 2 bytes at a time. buf_len is even here, so this
   132    // consumes the buffer exactly.
   133    //
   134    // while buf_len > 0 {
   135    //   acc, carry = acc + *(uint16 *)(buf)
   136    //   buf_len -= 2
   137    //   buf = buf[2:]
   138    // }
   139    JMP slowaddcond
   140  slowaddloop:
   141    MOVWQZX (CX), DX
   142    ADDQ DX, AX
   143    ADCQ $0, AX
   144    SUBQ $2, BX
   145    ADDQ $2, CX
   146  slowaddcond:
   147    CMPQ BX, $2
   148    JAE slowaddloop
   149  
   150    // Fold into 16 bits.
   151    //
   152    // for acc > math.MaxUint16 {
   153    //   acc = (acc & 0xffff) + acc>>16
   154    // }
   155    JMP foldcond
   156  foldloop:
   157    MOVQ AX, DX
   158    ANDQ $0xffff, DX
   159    SHRQ $16, AX
   160    ADDQ DX, AX
   161    // We don't need ADC because folding will take care of it
   162  foldcond:
   163    CMPQ AX, $0xffff
   164    JA foldloop
   165  
   166    // Return the checksum in host byte order.
   167    BSWAPQ AX
   168    SHRQ $48, AX
   169  
   170    // The empty-buffer path joins here with AX = initial and R8 = odd.
   171  done:
   172    MOVW AX, ret+32(FP)
   173    MOVB R8, ret1+34(FP)
   174    RET
   137    MOVB R8, ret1+34(FP)
   138    RET