github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/simd/or_amd64.go

github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/simd/or_amd64.go (about)

     1  // Code generated by "../gtl/generate.py --prefix=Or -DOPCHAR=| --package=simd --output=or_amd64.go bitwise_amd64.go.tpl". DO NOT EDIT.
     2  
     3  // Copyright 2021 GRAIL, Inc.  All rights reserved.
     4  // Use of this source code is governed by the Apache-2.0
     5  // license that can be found in the LICENSE file.
     6  
     7  //go:build amd64 && !appengine
     8  // +build amd64,!appengine
     9  
    10  package simd
    11  
    12  import (
    13  	"reflect"
    14  	"unsafe"
    15  )
    16  
    17  // OrUnsafeInplace sets main[pos] := main[pos] | arg[pos] for every position
    18  // in main[].
    19  //
    20  // WARNING: This is a function designed to be used in inner loops, which makes
    21  // assumptions about length and capacity which aren't checked at runtime.  Use
    22  // the safe version of this function when that's a problem.
    23  // Assumptions #2-3 are always satisfied when the last
    24  // potentially-size-increasing operation on arg[] is {Re}makeUnsafe(),
    25  // ResizeUnsafe(), or XcapUnsafe(), and the same is true for main[].
    26  //
    27  // 1. len(arg) and len(main) must be equal.
    28  //
    29  // 2. Capacities are at least RoundUpPow2(len(main) + 1, bytesPerVec).
    30  //
    31  // 3. The caller does not care if a few bytes past the end of main[] are
    32  // changed.
    33  func OrUnsafeInplace(main, arg []byte) {
    34  	mainLen := len(main)
    35  	argData := unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&arg)).Data)
    36  	mainData := unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&main)).Data)
    37  	argWordsIter := argData
    38  	mainWordsIter := mainData
    39  	if mainLen > 2*BytesPerWord {
    40  		nWordMinus2 := (mainLen - BytesPerWord - 1) >> Log2BytesPerWord
    41  		for widx := 0; widx < nWordMinus2; widx++ {
    42  			mainWord := *((*uintptr)(mainWordsIter))
    43  			argWord := *((*uintptr)(argWordsIter))
    44  			*((*uintptr)(mainWordsIter)) = mainWord | argWord
    45  			mainWordsIter = unsafe.Add(mainWordsIter, BytesPerWord)
    46  			argWordsIter = unsafe.Add(argWordsIter, BytesPerWord)
    47  		}
    48  	} else if mainLen <= BytesPerWord {
    49  		mainWord := *((*uintptr)(mainWordsIter))
    50  		argWord := *((*uintptr)(argWordsIter))
    51  		*((*uintptr)(mainWordsIter)) = mainWord | argWord
    52  		return
    53  	}
    54  	// The last two read-and-writes to main[] usually overlap.  To avoid a
    55  	// store-to-load forwarding slowdown, we read both words before writing
    56  	// either.
    57  	// shuffleLookupOddInplaceSSSE3Asm() uses the same strategy.
    58  	mainWord1 := *((*uintptr)(mainWordsIter))
    59  	argWord1 := *((*uintptr)(argWordsIter))
    60  	finalOffset := uintptr(mainLen - BytesPerWord)
    61  	mainFinalWordPtr := unsafe.Add(mainData, finalOffset)
    62  	argFinalWordPtr := unsafe.Add(argData, finalOffset)
    63  	mainWord2 := *((*uintptr)(mainFinalWordPtr))
    64  	argWord2 := *((*uintptr)(argFinalWordPtr))
    65  	*((*uintptr)(mainWordsIter)) = mainWord1 | argWord1
    66  	*((*uintptr)(mainFinalWordPtr)) = mainWord2 | argWord2
    67  }
    68  
    69  // OrInplace sets main[pos] := arg[pos] | main[pos] for every position in
    70  // main[].  It panics if slice lengths don't match.
    71  func OrInplace(main, arg []byte) {
    72  	// This takes ~6-8% longer than OrUnsafeInplace on the short-array benchmark
    73  	// on my Mac.
    74  	mainLen := len(main)
    75  	if len(arg) != mainLen {
    76  		panic("OrInplace() requires len(arg) == len(main).")
    77  	}
    78  	if mainLen < BytesPerWord {
    79  		// It's probably possible to do better here (e.g. when mainLen is in 4..7,
    80  		// operate on uint32s), but I won't worry about it unless/until that's
    81  		// actually a common case.
    82  		for pos, argByte := range arg {
    83  			main[pos] = main[pos] | argByte
    84  		}
    85  		return
    86  	}
    87  	argData := unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&arg)).Data)
    88  	mainData := unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&main)).Data)
    89  	argWordsIter := argData
    90  	mainWordsIter := mainData
    91  	if mainLen > 2*BytesPerWord {
    92  		nWordMinus2 := (mainLen - BytesPerWord - 1) >> Log2BytesPerWord
    93  		for widx := 0; widx < nWordMinus2; widx++ {
    94  			mainWord := *((*uintptr)(mainWordsIter))
    95  			argWord := *((*uintptr)(argWordsIter))
    96  			*((*uintptr)(mainWordsIter)) = mainWord | argWord
    97  			mainWordsIter = unsafe.Add(mainWordsIter, BytesPerWord)
    98  			argWordsIter = unsafe.Add(argWordsIter, BytesPerWord)
    99  		}
   100  	}
   101  	mainWord1 := *((*uintptr)(mainWordsIter))
   102  	argWord1 := *((*uintptr)(argWordsIter))
   103  	finalOffset := uintptr(mainLen - BytesPerWord)
   104  	mainFinalWordPtr := unsafe.Add(mainData, finalOffset)
   105  	argFinalWordPtr := unsafe.Add(argData, finalOffset)
   106  	mainWord2 := *((*uintptr)(mainFinalWordPtr))
   107  	argWord2 := *((*uintptr)(argFinalWordPtr))
   108  	*((*uintptr)(mainWordsIter)) = mainWord1 | argWord1
   109  	*((*uintptr)(mainFinalWordPtr)) = mainWord2 | argWord2
   110  }
   111  
   112  // OrUnsafe sets dst[pos] := src1[pos] | src2[pos] for every position in dst.
   113  //
   114  // WARNING: This is a function designed to be used in inner loops, which makes
   115  // assumptions about length and capacity which aren't checked at runtime.  Use
   116  // the safe version of this function when that's a problem.
   117  // Assumptions #2-3 are always satisfied when the last
   118  // potentially-size-increasing operation on src1[] is {Re}makeUnsafe(),
   119  // ResizeUnsafe(), or XcapUnsafe(), and the same is true for src2[] and dst[].
   120  //
   121  // 1. len(src1), len(src2), and len(dst) must be equal.
   122  //
   123  // 2. Capacities are at least RoundUpPow2(len(dst) + 1, bytesPerVec).
   124  //
   125  // 3. The caller does not care if a few bytes past the end of dst[] are
   126  // changed.
   127  func OrUnsafe(dst, src1, src2 []byte) {
   128  	src1Header := (*reflect.SliceHeader)(unsafe.Pointer(&src1))
   129  	src2Header := (*reflect.SliceHeader)(unsafe.Pointer(&src2))
   130  	dstHeader := (*reflect.SliceHeader)(unsafe.Pointer(&dst))
   131  	nWord := DivUpPow2(len(dst), BytesPerWord, Log2BytesPerWord)
   132  
   133  	src1Iter := unsafe.Pointer(src1Header.Data)
   134  	src2Iter := unsafe.Pointer(src2Header.Data)
   135  	dstIter := unsafe.Pointer(dstHeader.Data)
   136  	for widx := 0; widx < nWord; widx++ {
   137  		src1Word := *((*uintptr)(src1Iter))
   138  		src2Word := *((*uintptr)(src2Iter))
   139  		*((*uintptr)(dstIter)) = src1Word | src2Word
   140  		src1Iter = unsafe.Add(src1Iter, BytesPerWord)
   141  		src2Iter = unsafe.Add(src2Iter, BytesPerWord)
   142  		dstIter = unsafe.Add(dstIter, BytesPerWord)
   143  	}
   144  }
   145  
   146  // Or sets dst[pos] := src1[pos] | src2[pos] for every position in dst.  It
   147  // panics if slice lengths don't match.
   148  func Or(dst, src1, src2 []byte) {
   149  	dstLen := len(dst)
   150  	if (len(src1) != dstLen) || (len(src2) != dstLen) {
   151  		panic("Or() requires len(src1) == len(src2) == len(dst).")
   152  	}
   153  	if dstLen < BytesPerWord {
   154  		for pos, src1Byte := range src1 {
   155  			dst[pos] = src1Byte | src2[pos]
   156  		}
   157  		return
   158  	}
   159  	src1Data := unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&src1)).Data)
   160  	src2Data := unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&src2)).Data)
   161  	dstData := unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&dst)).Data)
   162  	nWordMinus1 := (dstLen - 1) >> Log2BytesPerWord
   163  
   164  	src1Iter := src1Data
   165  	src2Iter := src2Data
   166  	dstIter := dstData
   167  	for widx := 0; widx < nWordMinus1; widx++ {
   168  		src1Word := *((*uintptr)(src1Iter))
   169  		src2Word := *((*uintptr)(src2Iter))
   170  		*((*uintptr)(dstIter)) = src1Word | src2Word
   171  		src1Iter = unsafe.Add(src1Iter, BytesPerWord)
   172  		src2Iter = unsafe.Add(src2Iter, BytesPerWord)
   173  		dstIter = unsafe.Add(dstIter, BytesPerWord)
   174  	}
   175  	// No store-forwarding problem here.
   176  	finalOffset := uintptr(dstLen - BytesPerWord)
   177  	src1Iter = unsafe.Add(src1Data, finalOffset)
   178  	src2Iter = unsafe.Add(src2Data, finalOffset)
   179  	dstIter = unsafe.Add(dstData, finalOffset)
   180  	src1Word := *((*uintptr)(src1Iter))
   181  	src2Word := *((*uintptr)(src2Iter))
   182  	*((*uintptr)(dstIter)) = src1Word | src2Word
   183  }
   184  
   185  // OrConst8UnsafeInplace sets main[pos] := main[pos] | val for every position
   186  // in main[].
   187  //
   188  // WARNING: This is a function designed to be used in inner loops, which makes
   189  // assumptions about length and capacity which aren't checked at runtime.  Use
   190  // the safe version of this function when that's a problem.
   191  // These assumptions are always satisfied when the last
   192  // potentially-size-increasing operation on main[] is {Re}makeUnsafe(),
   193  // ResizeUnsafe(), or XcapUnsafe().
   194  //
   195  // 1. cap(main) is at least RoundUpPow2(len(main) + 1, bytesPerVec).
   196  //
   197  // 2. The caller does not care if a few bytes past the end of main[] are
   198  // changed.
   199  func OrConst8UnsafeInplace(main []byte, val byte) {
   200  	mainLen := len(main)
   201  	argWord := 0x101010101010101 * uintptr(val)
   202  	mainData := unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&main)).Data)
   203  	mainWordsIter := mainData
   204  	if mainLen > 2*BytesPerWord {
   205  		nWordMinus2 := (mainLen - BytesPerWord - 1) >> Log2BytesPerWord
   206  		for widx := 0; widx < nWordMinus2; widx++ {
   207  			mainWord := *((*uintptr)(mainWordsIter))
   208  			*((*uintptr)(mainWordsIter)) = mainWord | argWord
   209  			mainWordsIter = unsafe.Add(mainWordsIter, BytesPerWord)
   210  		}
   211  	} else if mainLen <= BytesPerWord {
   212  		mainWord := *((*uintptr)(mainWordsIter))
   213  		*((*uintptr)(mainWordsIter)) = mainWord | argWord
   214  		return
   215  	}
   216  	mainWord1 := *((*uintptr)(mainWordsIter))
   217  	finalOffset := uintptr(mainLen - BytesPerWord)
   218  	mainFinalWordPtr := unsafe.Add(mainData, finalOffset)
   219  	mainWord2 := *((*uintptr)(mainFinalWordPtr))
   220  	*((*uintptr)(mainWordsIter)) = mainWord1 | argWord
   221  	*((*uintptr)(mainFinalWordPtr)) = mainWord2 | argWord
   222  }
   223  
   224  // OrConst8Inplace sets main[pos] := main[pos] | val for every position in
   225  // main[].
   226  func OrConst8Inplace(main []byte, val byte) {
   227  	mainLen := len(main)
   228  	if mainLen < BytesPerWord {
   229  		for pos, mainByte := range main {
   230  			main[pos] = mainByte | val
   231  		}
   232  		return
   233  	}
   234  	argWord := 0x101010101010101 * uintptr(val)
   235  	mainData := unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&main)).Data)
   236  	mainWordsIter := mainData
   237  	if mainLen > 2*BytesPerWord {
   238  		nWordMinus2 := (mainLen - BytesPerWord - 1) >> Log2BytesPerWord
   239  		for widx := 0; widx < nWordMinus2; widx++ {
   240  			mainWord := *((*uintptr)(mainWordsIter))
   241  			*((*uintptr)(mainWordsIter)) = mainWord | argWord
   242  			mainWordsIter = unsafe.Add(mainWordsIter, BytesPerWord)
   243  		}
   244  	}
   245  	mainWord1 := *((*uintptr)(mainWordsIter))
   246  	finalOffset := uintptr(mainLen - BytesPerWord)
   247  	mainFinalWordPtr := unsafe.Add(mainData, finalOffset)
   248  	mainWord2 := *((*uintptr)(mainFinalWordPtr))
   249  	*((*uintptr)(mainWordsIter)) = mainWord1 | argWord
   250  	*((*uintptr)(mainFinalWordPtr)) = mainWord2 | argWord
   251  }
   252  
   253  // OrConst8Unsafe sets dst[pos] := src[pos] | val for every position in dst.
   254  //
   255  // WARNING: This is a function designed to be used in inner loops, which makes
   256  // assumptions about length and capacity which aren't checked at runtime.  Use
   257  // the safe version of this function when that's a problem.
   258  // Assumptions #2-3 are always satisfied when the last
   259  // potentially-size-increasing operation on src[] is {Re}makeUnsafe(),
   260  // ResizeUnsafe(), or XcapUnsafe(), and the same is true for dst[].
   261  //
   262  // 1. len(src) and len(dst) must be equal.
   263  //
   264  // 2. Capacities are at least RoundUpPow2(len(dst) + 1, bytesPerVec).
   265  //
   266  // 3. The caller does not care if a few bytes past the end of dst[] are
   267  // changed.
   268  func OrConst8Unsafe(dst, src []byte, val byte) {
   269  	srcHeader := (*reflect.SliceHeader)(unsafe.Pointer(&src))
   270  	dstHeader := (*reflect.SliceHeader)(unsafe.Pointer(&dst))
   271  	nWord := DivUpPow2(len(dst), BytesPerWord, Log2BytesPerWord)
   272  	argWord := 0x101010101010101 * uintptr(val)
   273  
   274  	srcIter := unsafe.Pointer(srcHeader.Data)
   275  	dstIter := unsafe.Pointer(dstHeader.Data)
   276  	for widx := 0; widx < nWord; widx++ {
   277  		srcWord := *((*uintptr)(srcIter))
   278  		*((*uintptr)(dstIter)) = srcWord | argWord
   279  		srcIter = unsafe.Add(srcIter, BytesPerWord)
   280  		dstIter = unsafe.Add(dstIter, BytesPerWord)
   281  	}
   282  }
   283  
   284  // OrConst8 sets dst[pos] := src[pos] | val for every position in dst.  It
   285  // panics if slice lengths don't match.
   286  func OrConst8(dst, src []byte, val byte) {
   287  	dstLen := len(dst)
   288  	if len(src) != dstLen {
   289  		panic("OrConst8() requires len(src) == len(dst).")
   290  	}
   291  	if dstLen < BytesPerWord {
   292  		for pos, srcByte := range src {
   293  			dst[pos] = srcByte | val
   294  		}
   295  		return
   296  	}
   297  	srcData := unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&src)).Data)
   298  	dstData := unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&dst)).Data)
   299  	nWordMinus1 := (dstLen - 1) >> Log2BytesPerWord
   300  	argWord := 0x101010101010101 * uintptr(val)
   301  
   302  	srcIter := unsafe.Pointer(srcData)
   303  	dstIter := unsafe.Pointer(dstData)
   304  	for widx := 0; widx < nWordMinus1; widx++ {
   305  		srcWord := *((*uintptr)(srcIter))
   306  		*((*uintptr)(dstIter)) = srcWord | argWord
   307  		srcIter = unsafe.Add(srcIter, BytesPerWord)
   308  		dstIter = unsafe.Add(dstIter, BytesPerWord)
   309  	}
   310  	finalOffset := uintptr(dstLen - BytesPerWord)
   311  	srcIter = unsafe.Add(srcData, finalOffset)
   312  	dstIter = unsafe.Add(dstData, finalOffset)
   313  	srcWord := *((*uintptr)(srcIter))
   314  	*((*uintptr)(dstIter)) = srcWord | argWord
   315  }