github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/simd/multibyte_amd64.go (about) 1 // Copyright 2021 GRAIL, Inc. All rights reserved. 2 // Use of this source code is governed by the Apache-2.0 3 // license that can be found in the LICENSE file. 4 5 //go:build amd64 && !appengine 6 // +build amd64,!appengine 7 8 package simd 9 10 import ( 11 "reflect" 12 "unsafe" 13 ) 14 15 // This file contains functions which operate on slices of 2- or 4-byte 16 // elements (typically small structs or integers) in ways that differ from the 17 // corresponding operations on single-byte elements. 18 // In this context, there is little point in making the interface based on 19 // []byte, since the caller will need to unsafely cast to it. Instead, most 20 // functions take unsafe.Pointer(s) and a count, and have names ending in 21 // 'Raw'; the caller should write safe wrappers around them when appropriate. 22 // We provide sample wrappers for the int16 and uint16 cases. (Originally did 23 // this for int32/uint32, but turns out the compiler has hardcoded 24 // optimizations for those cases which are currently missing for {u}int16.) 25 26 // *** the following functions are defined in multibyte_amd64.s 27 28 //go:noescape 29 func index16SSE2Asm(main unsafe.Pointer, val, nElem int) int 30 31 //go:noescape 32 func reverse16InplaceSSSE3Asm(main unsafe.Pointer, nElem int) 33 34 //go:noescape 35 func reverse16SSSE3Asm(dst, src unsafe.Pointer, nElem int) 36 37 // *** end assembly function signature(s) 38 39 // Memset16Raw assumes dst points to an array of nElem 2-byte elements, and 40 // valPtr points to a single 2-byte element. It fills dst with copies of 41 // *valPtr. 42 func Memset16Raw(dst, valPtr unsafe.Pointer, nElem int) { 43 // Strictly speaking, it may be slightly more efficient to pass val directly 44 // on the stack as e.g. a uint16, but this interface lets us avoid worrying 45 // about little-endian vs. big-endian and leads to cleaner struct-filling 46 // code. 47 val := *((*uint16)(valPtr)) 48 if nElem < BytesPerWord/2 { 49 for idx := 0; idx != nElem; idx++ { 50 *((*uint16)(dst)) = val 51 dst = unsafe.Add(dst, 2) 52 } 53 return 54 } 55 valWord := uintptr(0x1000100010001) * uintptr(val) 56 nWordMinus1 := (nElem - 1) >> (Log2BytesPerWord - 1) 57 dstWordsIter := dst 58 for widx := 0; widx != nWordMinus1; widx++ { 59 *((*uintptr)(dstWordsIter)) = valWord 60 dstWordsIter = unsafe.Add(dstWordsIter, BytesPerWord) 61 } 62 dstWordsIter = unsafe.Add(dst, nElem*2-BytesPerWord) 63 *((*uintptr)(dstWordsIter)) = valWord 64 } 65 66 // Memset32Raw assumes dst points to an array of nElem 4-byte elements, and 67 // valPtr points to a single 4-byte element. It fills dst with copies of 68 // *valPtr. 69 func Memset32Raw(dst, valPtr unsafe.Pointer, nElem int) { 70 val := *((*uint32)(valPtr)) 71 if nElem < BytesPerWord/4 { 72 if nElem != 0 { 73 *((*uint32)(dst)) = val 74 } 75 return 76 } 77 valWord := uintptr(0x100000001) * uintptr(val) 78 nWordMinus1 := (nElem - 1) >> (Log2BytesPerWord - 2) 79 dstWordsIter := dst 80 for widx := 0; widx != nWordMinus1; widx++ { 81 *((*uintptr)(dstWordsIter)) = valWord 82 dstWordsIter = unsafe.Add(dstWordsIter, BytesPerWord) 83 } 84 dstWordsIter = unsafe.Add(dst, nElem*4-BytesPerWord) 85 *((*uintptr)(dstWordsIter)) = valWord 86 } 87 88 // RepeatI16 fills dst[] with the given int16. 89 func RepeatI16(dst []int16, val int16) { 90 dstHeader := (*reflect.SliceHeader)(unsafe.Pointer(&dst)) 91 Memset16Raw(unsafe.Pointer(dstHeader.Data), unsafe.Pointer(&val), dstHeader.Len) 92 } 93 94 // RepeatU16 fills dst[] with the given uint16. 95 func RepeatU16(dst []uint16, val uint16) { 96 dstHeader := (*reflect.SliceHeader)(unsafe.Pointer(&dst)) 97 Memset16Raw(unsafe.Pointer(dstHeader.Data), unsafe.Pointer(&val), dstHeader.Len) 98 } 99 100 // IndexU16 returns the index of the first instance of val in main, or -1 if 101 // val is not present in main. 102 func IndexU16(main []uint16, val uint16) int { 103 if len(main) < 8 { 104 for i, v := range main { 105 if v == val { 106 return i 107 } 108 } 109 return -1 110 } 111 mainHeader := (*reflect.SliceHeader)(unsafe.Pointer(&main)) 112 return index16SSE2Asm(unsafe.Pointer(mainHeader.Data), int(val), mainHeader.Len) 113 } 114 115 // (Add a function which has the original little-endian byte-slice semantics if 116 // we ever need it.) 117 118 // Reverse16InplaceRaw assumes main points to an array of ct 2-byte elements, 119 // and reverses it in-place. 120 func Reverse16InplaceRaw(main unsafe.Pointer, nElem int) { 121 if nElem <= 8 { 122 nElemDiv2 := nElem >> 1 123 fwdIter := main 124 revIter := unsafe.Add(main, (nElem-1)*2) 125 for idx := 0; idx != nElemDiv2; idx++ { 126 origLeftVal := *((*uint16)(fwdIter)) 127 *((*uint16)(fwdIter)) = *((*uint16)(revIter)) 128 *((*uint16)(revIter)) = origLeftVal 129 fwdIter = unsafe.Add(fwdIter, 2) 130 revIter = unsafe.Add(revIter, -2) 131 } 132 return 133 } 134 reverse16InplaceSSSE3Asm(main, nElem) 135 } 136 137 // Reverse16Raw assumes dst and src both point to arrays of ct 2-byte elements, 138 // and sets dst[pos] := src[ct - 1 - pos] for each position. 139 func Reverse16Raw(dst, src unsafe.Pointer, nElem int) { 140 if nElem < 8 { 141 srcIter := unsafe.Add(src, (nElem-1)*2) 142 dstIter := dst 143 for idx := 0; idx != nElem; idx++ { 144 *((*uint16)(dstIter)) = *((*uint16)(srcIter)) 145 srcIter = unsafe.Add(srcIter, -2) 146 dstIter = unsafe.Add(dstIter, 2) 147 } 148 return 149 } 150 reverse16SSSE3Asm(dst, src, nElem) 151 } 152 153 // ReverseI16Inplace reverses a []int16 in-place. 154 func ReverseI16Inplace(main []int16) { 155 mainHeader := (*reflect.SliceHeader)(unsafe.Pointer(&main)) 156 Reverse16InplaceRaw(unsafe.Pointer(mainHeader.Data), mainHeader.Len) 157 } 158 159 // ReverseU16Inplace reverses a []uint16 in-place. 160 func ReverseU16Inplace(main []uint16) { 161 mainHeader := (*reflect.SliceHeader)(unsafe.Pointer(&main)) 162 Reverse16InplaceRaw(unsafe.Pointer(mainHeader.Data), mainHeader.Len) 163 } 164 165 // ReverseI16 sets dst[len(src) - 1 - pos] := src[pos] for each position in 166 // src. It panics if len(src) != len(dst). 167 func ReverseI16(dst, src []int16) { 168 srcHeader := (*reflect.SliceHeader)(unsafe.Pointer(&src)) 169 dstHeader := (*reflect.SliceHeader)(unsafe.Pointer(&dst)) 170 nElem := srcHeader.Len 171 if nElem != dstHeader.Len { 172 panic("ReverseI16() requires len(src) == len(dst).") 173 } 174 Reverse16Raw(unsafe.Pointer(dstHeader.Data), unsafe.Pointer(srcHeader.Data), nElem) 175 } 176 177 // ReverseU16 sets dst[len(src) - 1 - pos] := src[pos] for each position in 178 // src. It panics if len(src) != len(dst). 179 func ReverseU16(dst, src []uint16) { 180 srcHeader := (*reflect.SliceHeader)(unsafe.Pointer(&src)) 181 dstHeader := (*reflect.SliceHeader)(unsafe.Pointer(&dst)) 182 nElem := srcHeader.Len 183 if nElem != dstHeader.Len { 184 panic("ReverseU16() requires len(src) == len(dst).") 185 } 186 Reverse16Raw(unsafe.Pointer(dstHeader.Data), unsafe.Pointer(srcHeader.Data), nElem) 187 } 188 189 // Benchmark results suggest that Reverse32Raw is unimportant.