github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/simd/invmask_amd64.go.tpl (about) 1 // Copyright 2018 GRAIL, Inc. All rights reserved. 2 // Use of this source code is governed by the Apache-2.0 3 // license that can be found in the LICENSE file. 4 5 // +build amd64,!appengine 6 7 package PACKAGE 8 9 import ( 10 "reflect" 11 "unsafe" 12 ) 13 14 // ZZUnsafeInplace sets main[pos] := arg[pos] OPCHAR main[pos] for every position 15 // in main[]. 16 // 17 // WARNING: This is a function designed to be used in inner loops, which makes 18 // assumptions about length and capacity which aren't checked at runtime. Use 19 // the safe version of this function when that's a problem. 20 // Assumptions #2-3 are always satisfied when the last 21 // potentially-size-increasing operation on arg[] is {Re}makeUnsafe(), 22 // ResizeUnsafe(), or XcapUnsafe(), and the same is true for main[]. 23 // 24 // 1. len(arg) and len(main) must be equal. 25 // 26 // 2. Capacities are at least RoundUpPow2(len(main) + 1, bytesPerVec). 27 // 28 // 3. The caller does not care if a few bytes past the end of main[] are 29 // changed. 30 func ZZUnsafeInplace(main, arg []byte) { 31 mainLen := len(main) 32 argHeader := (*reflect.SliceHeader)(unsafe.Pointer(&arg)) 33 mainHeader := (*reflect.SliceHeader)(unsafe.Pointer(&main)) 34 argWordsIter := unsafe.Pointer(argHeader.Data) 35 mainWordsIter := unsafe.Pointer(mainHeader.Data) 36 if mainLen > 2*BytesPerWord { 37 nWordMinus2 := (mainLen - BytesPerWord - 1) >> Log2BytesPerWord 38 for widx := 0; widx < nWordMinus2; widx++ { 39 mainWord := *((*uintptr)(mainWordsIter)) 40 argWord := *((*uintptr)(argWordsIter)) 41 *((*uintptr)(mainWordsIter)) = mainWord OPCHAR argWord 42 mainWordsIter = unsafe.Pointer(uintptr(mainWordsIter) + BytesPerWord) 43 argWordsIter = unsafe.Pointer(uintptr(argWordsIter) + BytesPerWord) 44 } 45 } else if mainLen <= BytesPerWord { 46 mainWord := *((*uintptr)(mainWordsIter)) 47 argWord := *((*uintptr)(argWordsIter)) 48 *((*uintptr)(mainWordsIter)) = mainWord OPCHAR argWord 49 return 50 } 51 // The last two read-and-writes to main[] usually overlap. To avoid a 52 // store-to-load forwarding slowdown, we read both words before writing 53 // either. 54 // shuffleLookupOddInplaceSSSE3Asm() uses the same strategy. 55 mainWord1 := *((*uintptr)(mainWordsIter)) 56 argWord1 := *((*uintptr)(argWordsIter)) 57 finalOffset := uintptr(mainLen - BytesPerWord) 58 mainFinalWordPtr := unsafe.Pointer(mainHeader.Data + finalOffset) 59 argFinalWordPtr := unsafe.Pointer(argHeader.Data + finalOffset) 60 mainWord2 := *((*uintptr)(mainFinalWordPtr)) 61 argWord2 := *((*uintptr)(argFinalWordPtr)) 62 *((*uintptr)(mainWordsIter)) = mainWord1 OPCHAR argWord1 63 *((*uintptr)(mainFinalWordPtr)) = mainWord2 OPCHAR argWord2 64 } 65 66 // ZZInplace sets main[pos] := arg[pos] OPCHAR main[pos] for every position in 67 // main[]. It panics if slice lengths don't match. 68 func ZZInplace(main, arg []byte) { 69 // This takes ~6-8% longer than ZZUnsafeInplace on the short-array benchmark 70 // on my Mac. 71 mainLen := len(main) 72 if len(arg) != mainLen { 73 panic("ZZInplace() requires len(arg) == len(main).") 74 } 75 if mainLen < BytesPerWord { 76 // It's probably possible to do better here (e.g. when mainLen is in 4..7, 77 // operate on uint32s), but I won't worry about it unless/until that's 78 // actually a common case. 79 for pos, argByte := range arg { 80 main[pos] = main[pos] OPCHAR argByte 81 } 82 return 83 } 84 argHeader := (*reflect.SliceHeader)(unsafe.Pointer(&arg)) 85 mainHeader := (*reflect.SliceHeader)(unsafe.Pointer(&main)) 86 argWordsIter := unsafe.Pointer(argHeader.Data) 87 mainWordsIter := unsafe.Pointer(mainHeader.Data) 88 if mainLen > 2*BytesPerWord { 89 nWordMinus2 := (mainLen - BytesPerWord - 1) >> Log2BytesPerWord 90 for widx := 0; widx < nWordMinus2; widx++ { 91 mainWord := *((*uintptr)(mainWordsIter)) 92 argWord := *((*uintptr)(argWordsIter)) 93 *((*uintptr)(mainWordsIter)) = mainWord OPCHAR argWord 94 mainWordsIter = unsafe.Pointer(uintptr(mainWordsIter) + BytesPerWord) 95 argWordsIter = unsafe.Pointer(uintptr(argWordsIter) + BytesPerWord) 96 } 97 } 98 mainWord1 := *((*uintptr)(mainWordsIter)) 99 argWord1 := *((*uintptr)(argWordsIter)) 100 finalOffset := uintptr(mainLen - BytesPerWord) 101 mainFinalWordPtr := unsafe.Pointer(mainHeader.Data + finalOffset) 102 argFinalWordPtr := unsafe.Pointer(argHeader.Data + finalOffset) 103 mainWord2 := *((*uintptr)(mainFinalWordPtr)) 104 argWord2 := *((*uintptr)(argFinalWordPtr)) 105 *((*uintptr)(mainWordsIter)) = mainWord1 OPCHAR argWord1 106 *((*uintptr)(mainFinalWordPtr)) = mainWord2 OPCHAR argWord2 107 } 108 109 // ZZUnsafe sets dst[pos] := src1[pos] OPCHAR src2[pos] for every position in dst. 110 // 111 // WARNING: This is a function designed to be used in inner loops, which makes 112 // assumptions about length and capacity which aren't checked at runtime. Use 113 // the safe version of this function when that's a problem. 114 // Assumptions #2-3 are always satisfied when the last 115 // potentially-size-increasing operation on src1[] is {Re}makeUnsafe(), 116 // ResizeUnsafe(), or XcapUnsafe(), and the same is true for src2[] and dst[]. 117 // 118 // 1. len(src1), len(src2), and len(dst) must be equal. 119 // 120 // 2. Capacities are at least RoundUpPow2(len(dst) + 1, bytesPerVec). 121 // 122 // 3. The caller does not care if a few bytes past the end of dst[] are 123 // changed. 124 func ZZUnsafe(dst, src1, src2 []byte) { 125 src1Header := (*reflect.SliceHeader)(unsafe.Pointer(&src1)) 126 src2Header := (*reflect.SliceHeader)(unsafe.Pointer(&src2)) 127 dstHeader := (*reflect.SliceHeader)(unsafe.Pointer(&dst)) 128 nWord := DivUpPow2(len(dst), BytesPerWord, Log2BytesPerWord) 129 130 src1Iter := unsafe.Pointer(src1Header.Data) 131 src2Iter := unsafe.Pointer(src2Header.Data) 132 dstIter := unsafe.Pointer(dstHeader.Data) 133 for widx := 0; widx < nWord; widx++ { 134 src1Word := *((*uintptr)(src1Iter)) 135 src2Word := *((*uintptr)(src2Iter)) 136 *((*uintptr)(dstIter)) = src1Word OPCHAR src2Word 137 src1Iter = unsafe.Pointer(uintptr(src1Iter) + BytesPerWord) 138 src2Iter = unsafe.Pointer(uintptr(src2Iter) + BytesPerWord) 139 dstIter = unsafe.Pointer(uintptr(dstIter) + BytesPerWord) 140 } 141 } 142 143 // ZZ sets dst[pos] := src1[pos] OPCHAR src2[pos] for every position in dst. It 144 // panics if slice lengths don't match. 145 func ZZ(dst, src1, src2 []byte) { 146 dstLen := len(dst) 147 if (len(src1) != dstLen) || (len(src2) != dstLen) { 148 panic("ZZ() requires len(src1) == len(src2) == len(dst).") 149 } 150 if dstLen < BytesPerWord { 151 for pos, src1Byte := range src1 { 152 dst[pos] = src1Byte OPCHAR src2[pos] 153 } 154 return 155 } 156 src1Header := (*reflect.SliceHeader)(unsafe.Pointer(&src1)) 157 src2Header := (*reflect.SliceHeader)(unsafe.Pointer(&src2)) 158 dstHeader := (*reflect.SliceHeader)(unsafe.Pointer(&dst)) 159 nWordMinus1 := (dstLen - 1) >> Log2BytesPerWord 160 161 src1Iter := unsafe.Pointer(src1Header.Data) 162 src2Iter := unsafe.Pointer(src2Header.Data) 163 dstIter := unsafe.Pointer(dstHeader.Data) 164 for widx := 0; widx < nWordMinus1; widx++ { 165 src1Word := *((*uintptr)(src1Iter)) 166 src2Word := *((*uintptr)(src2Iter)) 167 *((*uintptr)(dstIter)) = src1Word OPCHAR src2Word 168 src1Iter = unsafe.Pointer(uintptr(src1Iter) + BytesPerWord) 169 src2Iter = unsafe.Pointer(uintptr(src2Iter) + BytesPerWord) 170 dstIter = unsafe.Pointer(uintptr(dstIter) + BytesPerWord) 171 } 172 // No store-forwarding problem here. 173 finalOffset := uintptr(dstLen - BytesPerWord) 174 src1Iter = unsafe.Pointer(src1Header.Data + finalOffset) 175 src2Iter = unsafe.Pointer(src2Header.Data + finalOffset) 176 dstIter = unsafe.Pointer(dstHeader.Data + finalOffset) 177 src1Word := *((*uintptr)(src1Iter)) 178 src2Word := *((*uintptr)(src2Iter)) 179 *((*uintptr)(dstIter)) = src1Word OPCHAR src2Word 180 }