github.com/coyove/sdss@v0.0.0-20231129015646-c2ec58cca6a2/contrib/roaring/setutil_arm64.s

github.com/coyove/sdss@v0.0.0-20231129015646-c2ec58cca6a2/contrib/roaring/setutil_arm64.s (about)

     1  // +build arm64,!gccgo,!appengine
     2  
     3  #include "textflag.h"
     4  
     5  
     6  // This implements union2by2 using golang's version of arm64 assembly
     7  // The algorithm is very similar to the generic one,
     8  // but makes better use of arm64 features so is notably faster.
     9  // The basic algorithm structure is as follows:
    10  // 1. If either set is empty, copy the other set into the buffer and return the length
    11  // 2. Otherwise, load the first element of each set into a variable (s1 and s2).
    12  // 3. a. Compare the values of s1 and s2.
    13   // b. add the smaller one to the buffer.
    14   // c. perform a bounds check before incrementing.
    15   // If one set is finished, copy the rest of the other set over.
    16   // d. update s1 and/or s2 to the next value, continue loop.
    17   //
    18   // Beyond the basic algorithm, this code makes use of several arm64 features:
    19   // Condition Codes:
    20   // arm64's CMP operation sets 4 bits that can be used for branching,
    21   // rather than just true or false.
    22   // As a consequence, a single comparison gives enough information to distinguish the three cases
    23   //
    24   // Post-increment pointers after load/store:
    25   // Instructions like `MOVHU.P 2(R0), R6`
    26   // increment the register by a specified amount, in this example 2.
    27   // Because uint16's are exactly 2 bytes and the length of the slices
    28   // is part of the slice header,
    29   // there is no need to separately track the index into the slice.
    30   // Instead, the code can calculate the end address of each slice and compare the read pointer against that,
    31   // using the post-increment reads to move the pointers along.
    32   //
    33   // TODO: CALL out to memmove once the list is exhausted.
    34   // Right now it moves the necessary shorts so that the remaining count
    35   // is a multiple of 4 and then copies 64 bits at a time.
    36  
    37  TEXT ·union2by2(SB), NOSPLIT, $0-80 // merges set1 and set2 into buffer (inputs presumably sorted ascending - confirm with caller); stores the number of uint16s written at size+72(FP)
    38  	// R0 = read cursor into set1, R1 = read cursor into set2, R2 = write cursor into buffer
    39  	MOVD set1+0(FP), R0
    40  	MOVD set2+24(FP), R1
    41  	MOVD buffer+48(FP), R2
    42  
    43  	// R3 and R4 become the end pointers (one past the last element) of set1 and set2.
    44  	// R3 = R0 + 2*set1_len and R4 = R1 + 2*set2_len, since each element is a 2-byte uint16
    45  	MOVD set1_len+8(FP), R3
    46  	MOVD set2_len+32(FP), R4
    47  
    48  	ADD R3<<1, R0, R3 // R3 = R0 + (R3 << 1)
    49  	ADD R4<<1, R1, R4 // R4 = R1 + (R4 << 1)
    50  
    51  
    52  	// Rather than counting the number of elements added separately,
    53  	// save the starting address of buffer in R5; the count is derived from R2 - R5 at return.
    54  	MOVD buffer+48(FP), R5
    55  
    56  	// set1 is empty (R0 == R3), just flush set2
    57  	CMP R0, R3
    58  	BEQ flush_right
    59  
    60  	// set2 is empty (R1 == R4), just flush set1
    61  	CMP R1, R4
    62  	BEQ flush_left
    63  
    64  	// R6 and R7 hold the current elements s1 (from set1) and s2 (from set2)
    65  	MOVD ZR, R6 // clear the working registers before the halfword loads below
    66  	MOVD ZR, R7
    67  
    68  	MOVHU.P 2(R0), R6 // s1 = *R0, then R0 += 2
    69  	MOVHU.P 2(R1), R7 // s2 = *R1, then R1 += 2
    70  loop:
    71  
    72  	CMP R6, R7 // a single compare; its flags drive both branches below
    73  	BEQ pop_both // R6 == R7
    74  	BLS pop_right // unsigned R7 <= R6; equality was handled above, so R6 > R7
    75  // pop_left (fall-through case): R6 < R7
    76  	MOVHU.P R6, 2(R2) // emit s1
    77  	CMP R0, R3
    78  	BEQ pop_then_flush_right // set1 exhausted; s2 is loaded but not yet written
    79  	MOVHU.P 2(R0), R6 // next s1
    80  	JMP loop
    81  pop_both:
    82  	MOVHU.P R6, 2(R2) // emit the shared value once (R7 would do as well, since they are equal)
    83  	CMP R0, R3
    84  	BEQ flush_right // set1 exhausted: copy whatever remains of set2
    85  	CMP R1, R4
    86  	BEQ flush_left // set2 exhausted: copy whatever remains of set1
    87  	MOVHU.P 2(R0), R6 // advance both sets
    88  	MOVHU.P 2(R1), R7
    89  	JMP loop
    90  pop_right:
    91  	MOVHU.P R7, 2(R2) // emit s2
    92  	CMP R1, R4
    93  	BEQ pop_then_flush_left // set2 exhausted; s1 is loaded but not yet written
    94  	MOVHU.P 2(R1), R7 // next s2
    95  	JMP loop
    96  
    97  pop_then_flush_right:
    98  	MOVHU.P R7, 2(R2) // write the pending s2 before copying the rest of set2
    99  flush_right:
   100  	MOVD R1, R0 // make set2's remaining range [R1, R4) the range that flush_left copies
   101  	MOVD R4, R3
   102  	JMP flush_left
   103  pop_then_flush_left:
   104  	MOVHU.P R6, 2(R2) // write the pending s1 before copying the rest of set1
   105  flush_left:
   106  	CMP R0, R3 // copy the range [R0, R3) to the buffer; nothing to do if it is empty
   107  	BEQ return
   108  	// figure out how many leading bytes to copy so that the rest is a multiple of 8. Must be a multiple of two
   109  	SUB R0, R3, R4 // R4 = R3 - R0 = bytes remaining
   110  	ANDS $6, R4 // R4 = R4 & 6, i.e. 0, 2, 4 or 6; sets the flags
   111  	BEQ long_flush //handles the 0 mod 8 case
   112  	SUBS $4, R4, R4 // since possible values are 2, 4, 6, this splits evenly into -2, 0, 2
   113  	BLT pop_single  // exactly the 2 case
   114  	MOVW.P 4(R0), R6 // the 4 and 6 cases: copy 4 bytes
   115  	MOVW.P R6, 4(R2)
   116  	BEQ long_flush // flags are still those of the SUBS above: remainder was 4, so what is left is a multiple of 8; otherwise (6) copy 2 more
   117  pop_single:
   118  	MOVHU.P 2(R0), R6 // copy a single uint16
   119  	MOVHU.P R6, 2(R2)
   120  long_flush:
   121  	// at this point we know R3 - R0 is a multiple of 8.
   122  	CMP R0, R3
   123  	BEQ return
   124  	MOVD.P 8(R0), R6 // copy 8 bytes per iteration
   125  	MOVD.P R6, 8(R2)
   126  	JMP long_flush
   127  return:
   128  	// number of shorts written is (R2 - R5) >> 1
   129  	SUB R5, R2 // R2 = R2 - R5 = bytes written to buffer
   130  	LSR $1, R2, R2 // bytes -> number of uint16s
   131  	MOVD R2, size+72(FP)
   132  	RET