github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/simd/invmask_amd64.go.tpl

github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/simd/invmask_amd64.go.tpl (about)

     1  // Copyright 2018 GRAIL, Inc.  All rights reserved.
     2  // Use of this source code is governed by the Apache-2.0
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build amd64,!appengine
     6  
     7  package PACKAGE
     8  
     9  import (
    10  	"reflect"
    11  	"unsafe"
    12  )
    13  
    14  // ZZUnsafeInplace sets main[pos] := arg[pos] OPCHAR main[pos] for every position
    15  // in main[].
    16  //
    17  // WARNING: This is a function designed to be used in inner loops, which makes
    18  // assumptions about length and capacity which aren't checked at runtime.  Use
    19  // the safe version of this function when that's a problem.
    20  // Assumptions #2-3 are always satisfied when the last
    21  // potentially-size-increasing operation on arg[] is {Re}makeUnsafe(),
    22  // ResizeUnsafe(), or XcapUnsafe(), and the same is true for main[].
    23  //
    24  // 1. len(arg) and len(main) must be equal.
    25  //
    26  // 2. Capacities are at least RoundUpPow2(len(main) + 1, bytesPerVec).
    27  //
    28  // 3. The caller does not care if a few bytes past the end of main[] are
    29  // changed.
    30  func ZZUnsafeInplace(main, arg []byte) {
    31  	mainLen := len(main)
    32  	argHeader := (*reflect.SliceHeader)(unsafe.Pointer(&arg))
    33  	mainHeader := (*reflect.SliceHeader)(unsafe.Pointer(&main))
    34  	argWordsIter := unsafe.Pointer(argHeader.Data)
    35  	mainWordsIter := unsafe.Pointer(mainHeader.Data)
    36  	if mainLen > 2*BytesPerWord {
    37  		nWordMinus2 := (mainLen - BytesPerWord - 1) >> Log2BytesPerWord
    38  		for widx := 0; widx < nWordMinus2; widx++ {
    39  			mainWord := *((*uintptr)(mainWordsIter))
    40  			argWord := *((*uintptr)(argWordsIter))
    41  			*((*uintptr)(mainWordsIter)) = mainWord OPCHAR argWord
    42  			mainWordsIter = unsafe.Pointer(uintptr(mainWordsIter) + BytesPerWord)
    43  			argWordsIter = unsafe.Pointer(uintptr(argWordsIter) + BytesPerWord)
    44  		}
    45  	} else if mainLen <= BytesPerWord {
    46  		mainWord := *((*uintptr)(mainWordsIter))
    47  		argWord := *((*uintptr)(argWordsIter))
    48  		*((*uintptr)(mainWordsIter)) = mainWord OPCHAR argWord
    49  		return
    50  	}
    51  	// The last two read-and-writes to main[] usually overlap.  To avoid a
    52  	// store-to-load forwarding slowdown, we read both words before writing
    53  	// either.
    54  	// shuffleLookupOddInplaceSSSE3Asm() uses the same strategy.
    55  	mainWord1 := *((*uintptr)(mainWordsIter))
    56  	argWord1 := *((*uintptr)(argWordsIter))
    57  	finalOffset := uintptr(mainLen - BytesPerWord)
    58  	mainFinalWordPtr := unsafe.Pointer(mainHeader.Data + finalOffset)
    59  	argFinalWordPtr := unsafe.Pointer(argHeader.Data + finalOffset)
    60  	mainWord2 := *((*uintptr)(mainFinalWordPtr))
    61  	argWord2 := *((*uintptr)(argFinalWordPtr))
    62  	*((*uintptr)(mainWordsIter)) = mainWord1 OPCHAR argWord1
    63  	*((*uintptr)(mainFinalWordPtr)) = mainWord2 OPCHAR argWord2
    64  }
    65  
    66  // ZZInplace sets main[pos] := arg[pos] OPCHAR main[pos] for every position in
    67  // main[].  It panics if slice lengths don't match.
    68  func ZZInplace(main, arg []byte) {
    69  	// This takes ~6-8% longer than ZZUnsafeInplace on the short-array benchmark
    70  	// on my Mac.
    71  	mainLen := len(main)
    72  	if len(arg) != mainLen {
    73  		panic("ZZInplace() requires len(arg) == len(main).")
    74  	}
    75  	if mainLen < BytesPerWord {
    76  		// It's probably possible to do better here (e.g. when mainLen is in 4..7,
    77  		// operate on uint32s), but I won't worry about it unless/until that's
    78  		// actually a common case.
    79  		for pos, argByte := range arg {
    80  			main[pos] = main[pos] OPCHAR argByte
    81  		}
    82  		return
    83  	}
    84  	argHeader := (*reflect.SliceHeader)(unsafe.Pointer(&arg))
    85  	mainHeader := (*reflect.SliceHeader)(unsafe.Pointer(&main))
    86  	argWordsIter := unsafe.Pointer(argHeader.Data)
    87  	mainWordsIter := unsafe.Pointer(mainHeader.Data)
    88  	if mainLen > 2*BytesPerWord {
    89  		nWordMinus2 := (mainLen - BytesPerWord - 1) >> Log2BytesPerWord
    90  		for widx := 0; widx < nWordMinus2; widx++ {
    91  			mainWord := *((*uintptr)(mainWordsIter))
    92  			argWord := *((*uintptr)(argWordsIter))
    93  			*((*uintptr)(mainWordsIter)) = mainWord OPCHAR argWord
    94  			mainWordsIter = unsafe.Pointer(uintptr(mainWordsIter) + BytesPerWord)
    95  			argWordsIter = unsafe.Pointer(uintptr(argWordsIter) + BytesPerWord)
    96  		}
    97  	}
    98  	mainWord1 := *((*uintptr)(mainWordsIter))
    99  	argWord1 := *((*uintptr)(argWordsIter))
   100  	finalOffset := uintptr(mainLen - BytesPerWord)
   101  	mainFinalWordPtr := unsafe.Pointer(mainHeader.Data + finalOffset)
   102  	argFinalWordPtr := unsafe.Pointer(argHeader.Data + finalOffset)
   103  	mainWord2 := *((*uintptr)(mainFinalWordPtr))
   104  	argWord2 := *((*uintptr)(argFinalWordPtr))
   105  	*((*uintptr)(mainWordsIter)) = mainWord1 OPCHAR argWord1
   106  	*((*uintptr)(mainFinalWordPtr)) = mainWord2 OPCHAR argWord2
   107  }
   108  
   109  // ZZUnsafe sets dst[pos] := src1[pos] OPCHAR src2[pos] for every position in dst.
   110  //
   111  // WARNING: This is a function designed to be used in inner loops, which makes
   112  // assumptions about length and capacity which aren't checked at runtime.  Use
   113  // the safe version of this function when that's a problem.
   114  // Assumptions #2-3 are always satisfied when the last
   115  // potentially-size-increasing operation on src1[] is {Re}makeUnsafe(),
   116  // ResizeUnsafe(), or XcapUnsafe(), and the same is true for src2[] and dst[].
   117  //
   118  // 1. len(src1), len(src2), and len(dst) must be equal.
   119  //
   120  // 2. Capacities are at least RoundUpPow2(len(dst) + 1, bytesPerVec).
   121  //
   122  // 3. The caller does not care if a few bytes past the end of dst[] are
   123  // changed.
   124  func ZZUnsafe(dst, src1, src2 []byte) {
   125  	src1Header := (*reflect.SliceHeader)(unsafe.Pointer(&src1))
   126  	src2Header := (*reflect.SliceHeader)(unsafe.Pointer(&src2))
   127  	dstHeader := (*reflect.SliceHeader)(unsafe.Pointer(&dst))
   128  	nWord := DivUpPow2(len(dst), BytesPerWord, Log2BytesPerWord)
   129  
   130  	src1Iter := unsafe.Pointer(src1Header.Data)
   131  	src2Iter := unsafe.Pointer(src2Header.Data)
   132  	dstIter := unsafe.Pointer(dstHeader.Data)
   133  	for widx := 0; widx < nWord; widx++ {
   134  		src1Word := *((*uintptr)(src1Iter))
   135  		src2Word := *((*uintptr)(src2Iter))
   136  		*((*uintptr)(dstIter)) = src1Word OPCHAR src2Word
   137  		src1Iter = unsafe.Pointer(uintptr(src1Iter) + BytesPerWord)
   138  		src2Iter = unsafe.Pointer(uintptr(src2Iter) + BytesPerWord)
   139  		dstIter = unsafe.Pointer(uintptr(dstIter) + BytesPerWord)
   140  	}
   141  }
   142  
   143  // ZZ sets dst[pos] := src1[pos] OPCHAR src2[pos] for every position in dst.  It
   144  // panics if slice lengths don't match.
   145  func ZZ(dst, src1, src2 []byte) {
   146  	dstLen := len(dst)
   147  	if (len(src1) != dstLen) || (len(src2) != dstLen) {
   148  		panic("ZZ() requires len(src1) == len(src2) == len(dst).")
   149  	}
   150  	if dstLen < BytesPerWord {
   151  		for pos, src1Byte := range src1 {
   152  			dst[pos] = src1Byte OPCHAR src2[pos]
   153  		}
   154  		return
   155  	}
   156  	src1Header := (*reflect.SliceHeader)(unsafe.Pointer(&src1))
   157  	src2Header := (*reflect.SliceHeader)(unsafe.Pointer(&src2))
   158  	dstHeader := (*reflect.SliceHeader)(unsafe.Pointer(&dst))
   159  	nWordMinus1 := (dstLen - 1) >> Log2BytesPerWord
   160  
   161  	src1Iter := unsafe.Pointer(src1Header.Data)
   162  	src2Iter := unsafe.Pointer(src2Header.Data)
   163  	dstIter := unsafe.Pointer(dstHeader.Data)
   164  	for widx := 0; widx < nWordMinus1; widx++ {
   165  		src1Word := *((*uintptr)(src1Iter))
   166  		src2Word := *((*uintptr)(src2Iter))
   167  		*((*uintptr)(dstIter)) = src1Word OPCHAR src2Word
   168  		src1Iter = unsafe.Pointer(uintptr(src1Iter) + BytesPerWord)
   169  		src2Iter = unsafe.Pointer(uintptr(src2Iter) + BytesPerWord)
   170  		dstIter = unsafe.Pointer(uintptr(dstIter) + BytesPerWord)
   171  	}
   172  	// No store-forwarding problem here.
   173  	finalOffset := uintptr(dstLen - BytesPerWord)
   174  	src1Iter = unsafe.Pointer(src1Header.Data + finalOffset)
   175  	src2Iter = unsafe.Pointer(src2Header.Data + finalOffset)
   176  	dstIter = unsafe.Pointer(dstHeader.Data + finalOffset)
   177  	src1Word := *((*uintptr)(src1Iter))
   178  	src2Word := *((*uintptr)(src2Iter))
   179  	*((*uintptr)(dstIter)) = src1Word OPCHAR src2Word
   180  }