github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/simd/bitwise_amd64.go.tpl (about) 1 // Copyright 2021 GRAIL, Inc. All rights reserved. 2 // Use of this source code is governed by the Apache-2.0 3 // license that can be found in the LICENSE file. 4 5 // +build amd64,!appengine 6 7 package PACKAGE 8 9 import ( 10 "reflect" 11 "unsafe" 12 ) 13 14 // ZZUnsafeInplace sets main[pos] := main[pos] OPCHAR arg[pos] for every position 15 // in main[]. 16 // 17 // WARNING: This is a function designed to be used in inner loops, which makes 18 // assumptions about length and capacity which aren't checked at runtime. Use 19 // the safe version of this function when that's a problem. 20 // Assumptions #2-3 are always satisfied when the last 21 // potentially-size-increasing operation on arg[] is {Re}makeUnsafe(), 22 // ResizeUnsafe(), or XcapUnsafe(), and the same is true for main[]. 23 // 24 // 1. len(arg) and len(main) must be equal. 25 // 26 // 2. Capacities are at least RoundUpPow2(len(main) + 1, bytesPerVec). 27 // 28 // 3. The caller does not care if a few bytes past the end of main[] are 29 // changed. 30 func ZZUnsafeInplace(main, arg []byte) { 31 mainLen := len(main) 32 argData := unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&arg)).Data) 33 mainData := unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&main)).Data) 34 argWordsIter := argData 35 mainWordsIter := mainData 36 if mainLen > 2*BytesPerWord { 37 nWordMinus2 := (mainLen - BytesPerWord - 1) >> Log2BytesPerWord 38 for widx := 0; widx < nWordMinus2; widx++ { 39 mainWord := *((*uintptr)(mainWordsIter)) 40 argWord := *((*uintptr)(argWordsIter)) 41 *((*uintptr)(mainWordsIter)) = mainWord OPCHAR argWord 42 mainWordsIter = unsafe.Add(mainWordsIter, BytesPerWord) 43 argWordsIter = unsafe.Add(argWordsIter, BytesPerWord) 44 } 45 } else if mainLen <= BytesPerWord { 46 mainWord := *((*uintptr)(mainWordsIter)) 47 argWord := *((*uintptr)(argWordsIter)) 48 *((*uintptr)(mainWordsIter)) = mainWord OPCHAR argWord 49 return 50 } 51 // The last two read-and-writes to main[] usually overlap. To avoid a 52 // store-to-load forwarding slowdown, we read both words before writing 53 // either. 54 // shuffleLookupOddInplaceSSSE3Asm() uses the same strategy. 55 mainWord1 := *((*uintptr)(mainWordsIter)) 56 argWord1 := *((*uintptr)(argWordsIter)) 57 finalOffset := uintptr(mainLen - BytesPerWord) 58 mainFinalWordPtr := unsafe.Add(mainData, finalOffset) 59 argFinalWordPtr := unsafe.Add(argData, finalOffset) 60 mainWord2 := *((*uintptr)(mainFinalWordPtr)) 61 argWord2 := *((*uintptr)(argFinalWordPtr)) 62 *((*uintptr)(mainWordsIter)) = mainWord1 OPCHAR argWord1 63 *((*uintptr)(mainFinalWordPtr)) = mainWord2 OPCHAR argWord2 64 } 65 66 // ZZInplace sets main[pos] := arg[pos] OPCHAR main[pos] for every position in 67 // main[]. It panics if slice lengths don't match. 68 func ZZInplace(main, arg []byte) { 69 // This takes ~6-8% longer than ZZUnsafeInplace on the short-array benchmark 70 // on my Mac. 71 mainLen := len(main) 72 if len(arg) != mainLen { 73 panic("ZZInplace() requires len(arg) == len(main).") 74 } 75 if mainLen < BytesPerWord { 76 // It's probably possible to do better here (e.g. when mainLen is in 4..7, 77 // operate on uint32s), but I won't worry about it unless/until that's 78 // actually a common case. 79 for pos, argByte := range arg { 80 main[pos] = main[pos] OPCHAR argByte 81 } 82 return 83 } 84 argData := unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&arg)).Data) 85 mainData := unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&main)).Data) 86 argWordsIter := argData 87 mainWordsIter := mainData 88 if mainLen > 2*BytesPerWord { 89 nWordMinus2 := (mainLen - BytesPerWord - 1) >> Log2BytesPerWord 90 for widx := 0; widx < nWordMinus2; widx++ { 91 mainWord := *((*uintptr)(mainWordsIter)) 92 argWord := *((*uintptr)(argWordsIter)) 93 *((*uintptr)(mainWordsIter)) = mainWord OPCHAR argWord 94 mainWordsIter = unsafe.Add(mainWordsIter, BytesPerWord) 95 argWordsIter = unsafe.Add(argWordsIter, BytesPerWord) 96 } 97 } 98 mainWord1 := *((*uintptr)(mainWordsIter)) 99 argWord1 := *((*uintptr)(argWordsIter)) 100 finalOffset := uintptr(mainLen - BytesPerWord) 101 mainFinalWordPtr := unsafe.Add(mainData, finalOffset) 102 argFinalWordPtr := unsafe.Add(argData, finalOffset) 103 mainWord2 := *((*uintptr)(mainFinalWordPtr)) 104 argWord2 := *((*uintptr)(argFinalWordPtr)) 105 *((*uintptr)(mainWordsIter)) = mainWord1 OPCHAR argWord1 106 *((*uintptr)(mainFinalWordPtr)) = mainWord2 OPCHAR argWord2 107 } 108 109 // ZZUnsafe sets dst[pos] := src1[pos] OPCHAR src2[pos] for every position in dst. 110 // 111 // WARNING: This is a function designed to be used in inner loops, which makes 112 // assumptions about length and capacity which aren't checked at runtime. Use 113 // the safe version of this function when that's a problem. 114 // Assumptions #2-3 are always satisfied when the last 115 // potentially-size-increasing operation on src1[] is {Re}makeUnsafe(), 116 // ResizeUnsafe(), or XcapUnsafe(), and the same is true for src2[] and dst[]. 117 // 118 // 1. len(src1), len(src2), and len(dst) must be equal. 119 // 120 // 2. Capacities are at least RoundUpPow2(len(dst) + 1, bytesPerVec). 121 // 122 // 3. The caller does not care if a few bytes past the end of dst[] are 123 // changed. 124 func ZZUnsafe(dst, src1, src2 []byte) { 125 src1Header := (*reflect.SliceHeader)(unsafe.Pointer(&src1)) 126 src2Header := (*reflect.SliceHeader)(unsafe.Pointer(&src2)) 127 dstHeader := (*reflect.SliceHeader)(unsafe.Pointer(&dst)) 128 nWord := DivUpPow2(len(dst), BytesPerWord, Log2BytesPerWord) 129 130 src1Iter := unsafe.Pointer(src1Header.Data) 131 src2Iter := unsafe.Pointer(src2Header.Data) 132 dstIter := unsafe.Pointer(dstHeader.Data) 133 for widx := 0; widx < nWord; widx++ { 134 src1Word := *((*uintptr)(src1Iter)) 135 src2Word := *((*uintptr)(src2Iter)) 136 *((*uintptr)(dstIter)) = src1Word OPCHAR src2Word 137 src1Iter = unsafe.Add(src1Iter, BytesPerWord) 138 src2Iter = unsafe.Add(src2Iter, BytesPerWord) 139 dstIter = unsafe.Add(dstIter, BytesPerWord) 140 } 141 } 142 143 // ZZ sets dst[pos] := src1[pos] OPCHAR src2[pos] for every position in dst. It 144 // panics if slice lengths don't match. 145 func ZZ(dst, src1, src2 []byte) { 146 dstLen := len(dst) 147 if (len(src1) != dstLen) || (len(src2) != dstLen) { 148 panic("ZZ() requires len(src1) == len(src2) == len(dst).") 149 } 150 if dstLen < BytesPerWord { 151 for pos, src1Byte := range src1 { 152 dst[pos] = src1Byte OPCHAR src2[pos] 153 } 154 return 155 } 156 src1Data := unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&src1)).Data) 157 src2Data := unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&src2)).Data) 158 dstData := unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&dst)).Data) 159 nWordMinus1 := (dstLen - 1) >> Log2BytesPerWord 160 161 src1Iter := src1Data 162 src2Iter := src2Data 163 dstIter := dstData 164 for widx := 0; widx < nWordMinus1; widx++ { 165 src1Word := *((*uintptr)(src1Iter)) 166 src2Word := *((*uintptr)(src2Iter)) 167 *((*uintptr)(dstIter)) = src1Word OPCHAR src2Word 168 src1Iter = unsafe.Add(src1Iter, BytesPerWord) 169 src2Iter = unsafe.Add(src2Iter, BytesPerWord) 170 dstIter = unsafe.Add(dstIter, BytesPerWord) 171 } 172 // No store-forwarding problem here. 173 finalOffset := uintptr(dstLen - BytesPerWord) 174 src1Iter = unsafe.Add(src1Data, finalOffset) 175 src2Iter = unsafe.Add(src2Data, finalOffset) 176 dstIter = unsafe.Add(dstData, finalOffset) 177 src1Word := *((*uintptr)(src1Iter)) 178 src2Word := *((*uintptr)(src2Iter)) 179 *((*uintptr)(dstIter)) = src1Word OPCHAR src2Word 180 } 181 182 // ZZConst8UnsafeInplace sets main[pos] := main[pos] OPCHAR val for every position 183 // in main[]. 184 // 185 // WARNING: This is a function designed to be used in inner loops, which makes 186 // assumptions about length and capacity which aren't checked at runtime. Use 187 // the safe version of this function when that's a problem. 188 // These assumptions are always satisfied when the last 189 // potentially-size-increasing operation on main[] is {Re}makeUnsafe(), 190 // ResizeUnsafe(), or XcapUnsafe(). 191 // 192 // 1. cap(main) is at least RoundUpPow2(len(main) + 1, bytesPerVec). 193 // 194 // 2. The caller does not care if a few bytes past the end of main[] are 195 // changed. 196 func ZZConst8UnsafeInplace(main []byte, val byte) { 197 mainLen := len(main) 198 argWord := 0x101010101010101 * uintptr(val) 199 mainData := unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&main)).Data) 200 mainWordsIter := mainData 201 if mainLen > 2*BytesPerWord { 202 nWordMinus2 := (mainLen - BytesPerWord - 1) >> Log2BytesPerWord 203 for widx := 0; widx < nWordMinus2; widx++ { 204 mainWord := *((*uintptr)(mainWordsIter)) 205 *((*uintptr)(mainWordsIter)) = mainWord OPCHAR argWord 206 mainWordsIter = unsafe.Add(mainWordsIter, BytesPerWord) 207 } 208 } else if mainLen <= BytesPerWord { 209 mainWord := *((*uintptr)(mainWordsIter)) 210 *((*uintptr)(mainWordsIter)) = mainWord OPCHAR argWord 211 return 212 } 213 mainWord1 := *((*uintptr)(mainWordsIter)) 214 finalOffset := uintptr(mainLen - BytesPerWord) 215 mainFinalWordPtr := unsafe.Add(mainData, finalOffset) 216 mainWord2 := *((*uintptr)(mainFinalWordPtr)) 217 *((*uintptr)(mainWordsIter)) = mainWord1 OPCHAR argWord 218 *((*uintptr)(mainFinalWordPtr)) = mainWord2 OPCHAR argWord 219 } 220 221 // ZZConst8Inplace sets main[pos] := main[pos] OPCHAR val for every position in 222 // main[]. 223 func ZZConst8Inplace(main []byte, val byte) { 224 mainLen := len(main) 225 if mainLen < BytesPerWord { 226 for pos, mainByte := range main { 227 main[pos] = mainByte OPCHAR val 228 } 229 return 230 } 231 argWord := 0x101010101010101 * uintptr(val) 232 mainData := unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&main)).Data) 233 mainWordsIter := mainData 234 if mainLen > 2*BytesPerWord { 235 nWordMinus2 := (mainLen - BytesPerWord - 1) >> Log2BytesPerWord 236 for widx := 0; widx < nWordMinus2; widx++ { 237 mainWord := *((*uintptr)(mainWordsIter)) 238 *((*uintptr)(mainWordsIter)) = mainWord OPCHAR argWord 239 mainWordsIter = unsafe.Add(mainWordsIter, BytesPerWord) 240 } 241 } 242 mainWord1 := *((*uintptr)(mainWordsIter)) 243 finalOffset := uintptr(mainLen - BytesPerWord) 244 mainFinalWordPtr := unsafe.Add(mainData, finalOffset) 245 mainWord2 := *((*uintptr)(mainFinalWordPtr)) 246 *((*uintptr)(mainWordsIter)) = mainWord1 OPCHAR argWord 247 *((*uintptr)(mainFinalWordPtr)) = mainWord2 OPCHAR argWord 248 } 249 250 // ZZConst8Unsafe sets dst[pos] := src[pos] OPCHAR val for every position in dst. 251 // 252 // WARNING: This is a function designed to be used in inner loops, which makes 253 // assumptions about length and capacity which aren't checked at runtime. Use 254 // the safe version of this function when that's a problem. 255 // Assumptions #2-3 are always satisfied when the last 256 // potentially-size-increasing operation on src[] is {Re}makeUnsafe(), 257 // ResizeUnsafe(), or XcapUnsafe(), and the same is true for dst[]. 258 // 259 // 1. len(src) and len(dst) must be equal. 260 // 261 // 2. Capacities are at least RoundUpPow2(len(dst) + 1, bytesPerVec). 262 // 263 // 3. The caller does not care if a few bytes past the end of dst[] are 264 // changed. 265 func ZZConst8Unsafe(dst, src []byte, val byte) { 266 srcHeader := (*reflect.SliceHeader)(unsafe.Pointer(&src)) 267 dstHeader := (*reflect.SliceHeader)(unsafe.Pointer(&dst)) 268 nWord := DivUpPow2(len(dst), BytesPerWord, Log2BytesPerWord) 269 argWord := 0x101010101010101 * uintptr(val) 270 271 srcIter := unsafe.Pointer(srcHeader.Data) 272 dstIter := unsafe.Pointer(dstHeader.Data) 273 for widx := 0; widx < nWord; widx++ { 274 srcWord := *((*uintptr)(srcIter)) 275 *((*uintptr)(dstIter)) = srcWord OPCHAR argWord 276 srcIter = unsafe.Add(srcIter, BytesPerWord) 277 dstIter = unsafe.Add(dstIter, BytesPerWord) 278 } 279 } 280 281 // ZZConst8 sets dst[pos] := src[pos] OPCHAR val for every position in dst. It 282 // panics if slice lengths don't match. 283 func ZZConst8(dst, src []byte, val byte) { 284 dstLen := len(dst) 285 if len(src) != dstLen { 286 panic("ZZConst8() requires len(src) == len(dst).") 287 } 288 if dstLen < BytesPerWord { 289 for pos, srcByte := range src { 290 dst[pos] = srcByte OPCHAR val 291 } 292 return 293 } 294 srcData := unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&src)).Data) 295 dstData := unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&dst)).Data) 296 nWordMinus1 := (dstLen - 1) >> Log2BytesPerWord 297 argWord := 0x101010101010101 * uintptr(val) 298 299 srcIter := unsafe.Pointer(srcData) 300 dstIter := unsafe.Pointer(dstData) 301 for widx := 0; widx < nWordMinus1; widx++ { 302 srcWord := *((*uintptr)(srcIter)) 303 *((*uintptr)(dstIter)) = srcWord OPCHAR argWord 304 srcIter = unsafe.Add(srcIter, BytesPerWord) 305 dstIter = unsafe.Add(dstIter, BytesPerWord) 306 } 307 finalOffset := uintptr(dstLen - BytesPerWord) 308 srcIter = unsafe.Add(srcData, finalOffset) 309 dstIter = unsafe.Add(dstData, finalOffset) 310 srcWord := *((*uintptr)(srcIter)) 311 *((*uintptr)(dstIter)) = srcWord OPCHAR argWord 312 }