github.com/coyove/sdss@v0.0.0-20231129015646-c2ec58cca6a2/contrib/roaring/setutil_arm64.s (about) 1 // +build arm64,!gccgo,!appengine 2 3 #include "textflag.h" 4 5 6 // This implements union2by2 using golang's version of arm64 assembly 7 // The algorithm is very similar to the generic one, 8 // but makes better use of arm64 features so is notably faster. 9 // The basic algorithm structure is as follows: 10 // 1. If either set is empty, copy the other set into the buffer and return the length 11 // 2. Otherwise, load the first element of each set into a variable (s1 and s2). 12 // 3. a. Compare the values of s1 and s2. 13 // b. add the smaller one to the buffer. 14 // c. perform a bounds check before incrementing. 15 // If one set is finished, copy the rest of the other set over. 16 // d. update s1 and or s2 to the next value, continue loop. 17 // 18 // Past the fact of the algorithm, this code makes use of several arm64 features 19 // Condition Codes: 20 // arm64's CMP operation sets 4 bits that can be used for branching, 21 // rather than just true or false. 22 // As a consequence, a single comparison gives enough information to distinguish the three cases 23 // 24 // Post-increment pointers after load/store: 25 // Instructions like `MOVHU.P 2(R0), R6` 26 // increment the register by a specified amount, in this example 2. 27 // Because uint16's are exactly 2 bytes and the length of the slices 28 // is part of the slice header, 29 // there is no need to separately track the index into the slice. 30 // Instead, the code can calculate the final read value and compare against that, 31 // using the post-increment reads to move the pointers along. 32 // 33 // TODO: CALL out to memmove once the list is exhausted. 34 // Right now it moves the necessary shorts so that the remaining count 35 // is a multiple of 4 and then copies 64 bits at a time. 36 37 TEXT ·union2by2(SB), NOSPLIT, $0-80 38 // R0, R1, and R2 for the pointers to the three slices 39 MOVD set1+0(FP), R0 40 MOVD set2+24(FP), R1 41 MOVD buffer+48(FP), R2 42 43 //R3 and R4 will be the values at which we will have finished reading set1 and set2. 44 // R3 should be R0 + 2 * set1_len+8(FP) 45 MOVD set1_len+8(FP), R3 46 MOVD set2_len+32(FP), R4 47 48 ADD R3<<1, R0, R3 49 ADD R4<<1, R1, R4 50 51 52 //Rather than counting the number of elements added separately 53 //Save the starting register of buffer. 54 MOVD buffer+48(FP), R5 55 56 // set1 is empty, just flush set2 57 CMP R0, R3 58 BEQ flush_right 59 60 // set2 is empty, just flush set1 61 CMP R1, R4 62 BEQ flush_left 63 64 // R6, R7 are the working space for s1 and s2 65 MOVD ZR, R6 66 MOVD ZR, R7 67 68 MOVHU.P 2(R0), R6 69 MOVHU.P 2(R1), R7 70 loop: 71 72 CMP R6, R7 73 BEQ pop_both // R6 == R7 74 BLS pop_right // R6 > R7 75 //pop_left: // R6 < R7 76 MOVHU.P R6, 2(R2) 77 CMP R0, R3 78 BEQ pop_then_flush_right 79 MOVHU.P 2(R0), R6 80 JMP loop 81 pop_both: 82 MOVHU.P R6, 2(R2) //could also use R7, since they are equal 83 CMP R0, R3 84 BEQ flush_right 85 CMP R1, R4 86 BEQ flush_left 87 MOVHU.P 2(R0), R6 88 MOVHU.P 2(R1), R7 89 JMP loop 90 pop_right: 91 MOVHU.P R7, 2(R2) 92 CMP R1, R4 93 BEQ pop_then_flush_left 94 MOVHU.P 2(R1), R7 95 JMP loop 96 97 pop_then_flush_right: 98 MOVHU.P R7, 2(R2) 99 flush_right: 100 MOVD R1, R0 101 MOVD R4, R3 102 JMP flush_left 103 pop_then_flush_left: 104 MOVHU.P R6, 2(R2) 105 flush_left: 106 CMP R0, R3 107 BEQ return 108 //figure out how many bytes to slough off. Must be a multiple of two 109 SUB R0, R3, R4 110 ANDS $6, R4 111 BEQ long_flush //handles the 0 mod 8 case 112 SUBS $4, R4, R4 // since possible values are 2, 4, 6, this splits evenly 113 BLT pop_single // exactly the 2 case 114 MOVW.P 4(R0), R6 115 MOVW.P R6, 4(R2) 116 BEQ long_flush // we're now aligned by 64 bits, as R4==4, otherwise 2 more 117 pop_single: 118 MOVHU.P 2(R0), R6 119 MOVHU.P R6, 2(R2) 120 long_flush: 121 // at this point we know R3 - R0 is a multiple of 8. 122 CMP R0, R3 123 BEQ return 124 MOVD.P 8(R0), R6 125 MOVD.P R6, 8(R2) 126 JMP long_flush 127 return: 128 // number of shorts written is (R5 - R2) >> 1 129 SUB R5, R2 130 LSR $1, R2, R2 131 MOVD R2, size+72(FP) 132 RET