github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/simd/multibyte_test.go (about)

     1  // Copyright 2018 GRAIL, Inc.  All rights reserved.
     2  // Use of this source code is governed by the Apache-2.0
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build !appengine
     6  
     7  package simd_test
     8  
     9  import (
    10  	"math/rand"
    11  	"reflect"
    12  	"testing"
    13  	"unsafe"
    14  
    15  	"github.com/Schaudge/grailbase/simd"
    16  	"github.com/grailbio/testutil/expect"
    17  )
    18  
    19  // The compiler clearly recognizes this; performance is almost
    20  // indistinguishable from handcoded assembly.
    21  func memset32Builtin(dst []uint32, val uint32) {
    22  	for idx := range dst {
    23  		dst[idx] = val
    24  	}
    25  }
    26  
    27  func TestMemset32(t *testing.T) {
    28  	maxSize := 500
    29  	nIter := 200
    30  	rand.Seed(1)
    31  	main1Arr := make([]uint32, maxSize)
    32  	main2Arr := make([]uint32, maxSize)
    33  	for iter := 0; iter < nIter; iter++ {
    34  		sliceStart := rand.Intn(maxSize)
    35  		sliceEnd := sliceStart + rand.Intn(maxSize-sliceStart)
    36  		u32Val := rand.Uint32()
    37  		main1Slice := main1Arr[sliceStart:sliceEnd]
    38  		main2Slice := main2Arr[sliceStart:sliceEnd]
    39  		sentinel := rand.Uint32()
    40  		main2Arr[sliceEnd] = sentinel
    41  		memset32Builtin(main1Slice, u32Val)
    42  		main2SliceHeader := (*reflect.SliceHeader)(unsafe.Pointer(&main2Slice))
    43  		simd.Memset32Raw(unsafe.Pointer(main2SliceHeader.Data), unsafe.Pointer(&u32Val), main2SliceHeader.Len)
    44  		if !reflect.DeepEqual(main1Slice, main2Slice) {
    45  			t.Fatal("Mismatched Memset32Raw result.")
    46  		}
    47  		if main2Arr[sliceEnd] != sentinel {
    48  			t.Fatal("Memset32Raw clobbered an extra byte.")
    49  		}
    50  	}
    51  }
    52  
    53  func memset16Standard(dst []uint16, val uint16) {
    54  	// This tends to be better than the range-for loop, though it's less
    55  	// clear-cut than the memset case.
    56  	nDst := len(dst)
    57  	if nDst != 0 {
    58  		dst[0] = val
    59  		for i := 1; i < nDst; {
    60  			i += copy(dst[i:], dst[:i])
    61  		}
    62  	}
    63  }
    64  
    65  func TestMemset16(t *testing.T) {
    66  	maxSize := 500
    67  	nIter := 200
    68  	rand.Seed(1)
    69  	main1Arr := make([]uint16, maxSize)
    70  	main2Arr := make([]uint16, maxSize)
    71  	for iter := 0; iter < nIter; iter++ {
    72  		sliceStart := rand.Intn(maxSize)
    73  		sliceEnd := sliceStart + rand.Intn(maxSize-sliceStart)
    74  		u16Val := uint16(rand.Uint32())
    75  		main1Slice := main1Arr[sliceStart:sliceEnd]
    76  		main2Slice := main2Arr[sliceStart:sliceEnd]
    77  		sentinel := uint16(rand.Uint32())
    78  		main2Arr[sliceEnd] = sentinel
    79  		memset16Standard(main1Slice, u16Val)
    80  		simd.RepeatU16(main2Slice, u16Val)
    81  		if !reflect.DeepEqual(main1Slice, main2Slice) {
    82  			t.Fatal("Mismatched RepeatU16 result.")
    83  		}
    84  		if main2Arr[sliceEnd] != sentinel {
    85  			t.Fatal("RepeatU16 clobbered an extra byte.")
    86  		}
    87  	}
    88  }
    89  
    90  /*
    91  Benchmark results:
    92    MacBook Pro (15-inch, 2016)
    93    2.7 GHz Intel Core i7, 16 GB 2133 MHz LPDDR3
    94  
    95  Benchmark_Memset16/SIMDShort1Cpu-8                    10         140130606 ns/op
    96  Benchmark_Memset16/SIMDShortHalfCpu-8                 50          37087600 ns/op
    97  Benchmark_Memset16/SIMDShortAllCpu-8                  50          35361817 ns/op
    98  Benchmark_Memset16/SIMDLong1Cpu-8                      1        1157494604 ns/op
    99  Benchmark_Memset16/SIMDLongHalfCpu-8                   2         921843584 ns/op
   100  Benchmark_Memset16/SIMDLongAllCpu-8                    2         960652822 ns/op
   101  Benchmark_Memset16/StandardShort1Cpu-8                 5         343877390 ns/op
   102  Benchmark_Memset16/StandardShortHalfCpu-8             20          88295789 ns/op
   103  Benchmark_Memset16/StandardShortAllCpu-8              20          86026817 ns/op
   104  Benchmark_Memset16/StandardLong1Cpu-8                  1        1038072481 ns/op
   105  Benchmark_Memset16/StandardLongHalfCpu-8               2         979292703 ns/op
   106  Benchmark_Memset16/StandardLongAllCpu-8                1        1052316741 ns/op
   107  */
   108  
   109  type u16Args struct {
   110  	main []uint16
   111  }
   112  
   113  func memset16SimdSubtask(args interface{}, nIter int) int {
   114  	a := args.(u16Args)
   115  	for iter := 0; iter < nIter; iter++ {
   116  		simd.RepeatU16(a.main, 0x201)
   117  	}
   118  	return int(a.main[0])
   119  }
   120  
   121  func memset16StandardSubtask(args interface{}, nIter int) int {
   122  	a := args.(u16Args)
   123  	for iter := 0; iter < nIter; iter++ {
   124  		memset16Standard(a.main, 0x201)
   125  	}
   126  	return int(a.main[0])
   127  }
   128  
   129  func Benchmark_Memset16(b *testing.B) {
   130  	funcs := []taggedMultiBenchVarargsFunc{
   131  		{
   132  			f:   memset16SimdSubtask,
   133  			tag: "SIMD",
   134  		},
   135  		{
   136  			f:   memset16StandardSubtask,
   137  			tag: "Standard",
   138  		},
   139  	}
   140  	for _, f := range funcs {
   141  		multiBenchmarkVarargs(f.f, f.tag+"Short", 9999999, func() interface{} {
   142  			return u16Args{
   143  				main: make([]uint16, 75, 75+31),
   144  			}
   145  		}, b)
   146  		multiBenchmarkVarargs(f.f, f.tag+"Long", 50, func() interface{} {
   147  			return u16Args{
   148  				main: make([]uint16, 249250622/2, 249250622/2+31),
   149  			}
   150  		}, b)
   151  	}
   152  }
   153  
   154  func indexU16Standard(main []uint16, val uint16) int {
   155  	for i, v := range main {
   156  		if v == val {
   157  			return i
   158  		}
   159  	}
   160  	return -1
   161  }
   162  
   163  func TestIndexU16(t *testing.T) {
   164  	// Generate nOuterIter random length-arrLen []uint16s, and perform nInnerIter
   165  	// random searches on each slice.
   166  	arrLen := 50000
   167  	nOuterIter := 5
   168  	nInnerIter := 100
   169  	valLimit := 65536 // maximum uint16 is 65535
   170  	rand.Seed(1)
   171  	mainArr := make([]uint16, arrLen)
   172  	for outerIdx := 0; outerIdx < nOuterIter; outerIdx++ {
   173  		for i := range mainArr {
   174  			mainArr[i] = uint16(rand.Intn(valLimit))
   175  		}
   176  		for innerIdx := 0; innerIdx < nInnerIter; innerIdx++ {
   177  			needle := uint16(rand.Intn(valLimit))
   178  			expected := indexU16Standard(mainArr, needle)
   179  			actual := simd.IndexU16(mainArr, needle)
   180  			expect.EQ(t, expected, actual)
   181  		}
   182  	}
   183  }
   184  
   185  const indexU16TestLimit = 100
   186  
   187  func indexU16SimdSubtask(args interface{}, nIter int) int {
   188  	a := args.(u16Args)
   189  	sum := 0
   190  	needle := uint16(0)
   191  	for iter := 0; iter < nIter; iter++ {
   192  		sum += simd.IndexU16(a.main, needle)
   193  		needle++
   194  		if needle == indexU16TestLimit {
   195  			needle = 0
   196  		}
   197  	}
   198  	return sum
   199  }
   200  
   201  func indexU16StandardSubtask(args interface{}, nIter int) int {
   202  	a := args.(u16Args)
   203  	sum := 0
   204  	needle := uint16(0)
   205  	for iter := 0; iter < nIter; iter++ {
   206  		sum += indexU16Standard(a.main, needle)
   207  		needle++
   208  		if needle == indexU16TestLimit {
   209  			needle = 0
   210  		}
   211  	}
   212  	return sum
   213  }
   214  
   215  // Single-threaded performance is ~4x as good in my testing.
   216  func Benchmark_IndexU16(b *testing.B) {
   217  	funcs := []taggedMultiBenchVarargsFunc{
   218  		{
   219  			f:   indexU16SimdSubtask,
   220  			tag: "SIMD",
   221  		},
   222  		{
   223  			f:   indexU16StandardSubtask,
   224  			tag: "Standard",
   225  		},
   226  	}
   227  	for _, f := range funcs {
   228  		multiBenchmarkVarargs(f.f, f.tag+"Long", 50, func() interface{} {
   229  			return u16Args{
   230  				main: make([]uint16, 4000000, 4000000+31),
   231  			}
   232  		}, b)
   233  	}
   234  }
   235  
   236  func reverseU16Slow(main []uint16) {
   237  	nU16 := len(main)
   238  	nU16Div2 := nU16 >> 1
   239  	for idx, invIdx := 0, nU16-1; idx != nU16Div2; idx, invIdx = idx+1, invIdx-1 {
   240  		main[idx], main[invIdx] = main[invIdx], main[idx]
   241  	}
   242  }
   243  
   244  func TestReverse16(t *testing.T) {
   245  	maxSize := 500
   246  	nIter := 200
   247  	rand.Seed(1)
   248  	main1Arr := make([]uint16, maxSize)
   249  	main2Arr := make([]uint16, maxSize)
   250  	main3Arr := make([]uint16, maxSize)
   251  	src2Arr := make([]uint16, maxSize)
   252  	for iter := 0; iter < nIter; iter++ {
   253  		sliceStart := rand.Intn(maxSize)
   254  		sliceEnd := sliceStart + rand.Intn(maxSize-sliceStart)
   255  		main1Slice := main1Arr[sliceStart:sliceEnd]
   256  		main2Slice := main2Arr[sliceStart:sliceEnd]
   257  		main3Slice := main3Arr[sliceStart:sliceEnd]
   258  		src2Slice := src2Arr[sliceStart:sliceEnd]
   259  		for ii := range main1Slice {
   260  			main1Slice[ii] = uint16(rand.Uint32())
   261  		}
   262  		copy(main2Slice, main1Slice)
   263  		copy(src2Slice, main1Slice)
   264  		sentinel := uint16(rand.Uint32())
   265  		main2Arr[sliceEnd] = sentinel
   266  		main3Arr[sliceEnd] = sentinel
   267  		simd.ReverseU16(main3Slice, main1Slice)
   268  		reverseU16Slow(main1Slice)
   269  		simd.ReverseU16Inplace(main2Slice)
   270  		if !reflect.DeepEqual(main1Slice, main2Slice) {
   271  			t.Fatal("Mismatched ReverseU16Inplace result.")
   272  		}
   273  		if main2Arr[sliceEnd] != sentinel {
   274  			t.Fatal("ReverseU16Inplace clobbered an extra byte.")
   275  		}
   276  		if !reflect.DeepEqual(main1Slice, main3Slice) {
   277  			t.Fatal("Mismatched ReverseU16 result.")
   278  		}
   279  		if main3Arr[sliceEnd] != sentinel {
   280  			t.Fatal("ReverseU16 clobbered an extra byte.")
   281  		}
   282  		simd.ReverseU16Inplace(main2Slice)
   283  		if !reflect.DeepEqual(src2Slice, main2Slice) {
   284  			t.Fatal("ReverseU16Inplace didn't invert itself.")
   285  		}
   286  	}
   287  }
   288  
   289  /*
   290  Benchmark results:
   291    MacBook Pro (15-inch, 2016)
   292    2.7 GHz Intel Core i7, 16 GB 2133 MHz LPDDR3
   293  
   294  Benchmark_ReverseU16Inplace/SIMDShort1Cpu-8                   20         102899505 ns/op
   295  Benchmark_ReverseU16Inplace/SIMDShortHalfCpu-8                50          32918441 ns/op
   296  Benchmark_ReverseU16Inplace/SIMDShortAllCpu-8                 30          38848510 ns/op
   297  Benchmark_ReverseU16Inplace/SIMDLong1Cpu-8                     1        1116384992 ns/op
   298  Benchmark_ReverseU16Inplace/SIMDLongHalfCpu-8                  2         880730467 ns/op
   299  Benchmark_ReverseU16Inplace/SIMDLongAllCpu-8                   2         943204867 ns/op
   300  Benchmark_ReverseU16Inplace/SlowShort1Cpu-8                    3         443056373 ns/op
   301  Benchmark_ReverseU16Inplace/SlowShortHalfCpu-8                10         117142962 ns/op
   302  Benchmark_ReverseU16Inplace/SlowShortAllCpu-8                 10         159087579 ns/op
   303  Benchmark_ReverseU16Inplace/SlowLong1Cpu-8                     1        3158497662 ns/op
   304  Benchmark_ReverseU16Inplace/SlowLongHalfCpu-8                  2         967619258 ns/op
   305  Benchmark_ReverseU16Inplace/SlowLongAllCpu-8                   2         978231337 ns/op
   306  */
   307  
   308  func reverseU16InplaceSimdSubtask(args interface{}, nIter int) int {
   309  	a := args.(u16Args)
   310  	for iter := 0; iter < nIter; iter++ {
   311  		simd.ReverseU16Inplace(a.main)
   312  	}
   313  	return int(a.main[0])
   314  }
   315  
   316  func reverseU16InplaceSlowSubtask(args interface{}, nIter int) int {
   317  	a := args.(u16Args)
   318  	for iter := 0; iter < nIter; iter++ {
   319  		reverseU16Slow(a.main)
   320  	}
   321  	return int(a.main[0])
   322  }
   323  
   324  func Benchmark_ReverseU16Inplace(b *testing.B) {
   325  	funcs := []taggedMultiBenchVarargsFunc{
   326  		{
   327  			f:   reverseU16InplaceSimdSubtask,
   328  			tag: "SIMD",
   329  		},
   330  		{
   331  			f:   reverseU16InplaceSlowSubtask,
   332  			tag: "Slow",
   333  		},
   334  	}
   335  	for _, f := range funcs {
   336  		multiBenchmarkVarargs(f.f, f.tag+"Short", 9999999, func() interface{} {
   337  			return u16Args{
   338  				main: make([]uint16, 75, 75+31),
   339  			}
   340  		}, b)
   341  		multiBenchmarkVarargs(f.f, f.tag+"Long", 50, func() interface{} {
   342  			return u16Args{
   343  				main: make([]uint16, 249250622/2, 249250622/2+31),
   344  			}
   345  		}, b)
   346  	}
   347  }