github.com/grailbio/base@v0.0.11/simd/multi_benchmark_test.go

github.com/grailbio/base@v0.0.11/simd/multi_benchmark_test.go (about)

     1  // Copyright 2019 GRAIL, Inc.  All rights reserved.
     2  // Use of this source code is governed by the Apache-2.0
     3  // license that can be found in the LICENSE file.
     4  
     5  package simd_test
     6  
     7  import (
     8  	"runtime"
     9  	"testing"
    10  
    11  	"github.com/grailbio/base/simd"
    12  	"github.com/grailbio/base/traverse"
    13  )
    14  
    15  // Utility functions to assist with benchmarking of embarrassingly parallel
    16  // jobs. It probably makes sense to move this code to a more central location
    17  // at some point.
    18  
// multiBenchFunc is the signature of a benchmark body driven by
// multiBenchmark.  It receives a destination buffer, a source buffer, and an
// iteration count, and is expected to execute the benchmarking target nIter
// times.  multiBenchmark discards the returned int.
type multiBenchFunc func(dst, src []byte, nIter int) int
    20  
// taggedMultiBenchFunc pairs a multiBenchFunc with a string tag.
// NOTE(review): no use of this type is visible in this file; presumably tag
// labels the implementation in benchmark names -- confirm against callers.
type taggedMultiBenchFunc struct {
	f   multiBenchFunc
	tag string
}
    25  
// bytesInitFunc fills a benchmark buffer in place.  multiBenchmark calls it
// on a newly allocated buffer before the timer is reset.
type bytesInitFunc func(src []byte)
    27  
// multiBenchmarkOpts carries the optional buffer initializers accepted by
// multiBenchmark.  Either field may be nil:
//   - a nil dstInit leaves every destination buffer exactly as
//     simd.MakeUnsafe returned it;
//   - a nil srcInit makes multiBenchmark set source byte j to byte(j*3).
type multiBenchmarkOpts struct {
	// dstInit, if non-nil, is applied to the first destination allocation;
	// the remaining destination buffers are then copied from the first one.
	dstInit bytesInitFunc
	// srcInit, if non-nil, is applied to the first source allocation; the
	// remaining source buffers are always copied from the first one.
	srcInit bytesInitFunc
}
    32  
    33  func multiBenchmark(bf multiBenchFunc, benchmarkSubtype string, nDstByte, nSrcByte, nJob int, b *testing.B, opts ...multiBenchmarkOpts) {
    34  	// 'bf' is expected to execute the benchmarking target nIter times.
    35  	//
    36  	// Given that, for each of the 3 nCpu settings below, multiBenchmark launches
    37  	// 'parallelism' goroutines, where each goroutine has nIter set to roughly
    38  	// (nJob / nCpu), so that the total number of benchmark-target-function
    39  	// invocations across all threads is nJob.  It is designed to measure how
    40  	// effective traverse.Each-style parallelization is at reducing wall-clock
    41  	// runtime.
    42  	totalCpu := runtime.NumCPU()
    43  	cases := []struct {
    44  		nCpu    int
    45  		descrip string
    46  	}{
    47  		{
    48  			nCpu:    1,
    49  			descrip: "1Cpu",
    50  		},
    51  		// 'Half' is often the saturation point, due to hyperthreading.
    52  		{
    53  			nCpu:    (totalCpu + 1) / 2,
    54  			descrip: "HalfCpu",
    55  		},
    56  		{
    57  			nCpu:    totalCpu,
    58  			descrip: "AllCpu",
    59  		},
    60  	}
    61  	var dstInit bytesInitFunc
    62  	var srcInit bytesInitFunc
    63  	if len(opts) >= 1 {
    64  		dstInit = opts[0].dstInit
    65  		srcInit = opts[0].srcInit
    66  	}
    67  	for _, c := range cases {
    68  		success := b.Run(benchmarkSubtype+c.descrip, func(b *testing.B) {
    69  			dsts := make([][]byte, c.nCpu)
    70  			srcs := make([][]byte, c.nCpu)
    71  			for i := 0; i < c.nCpu; i++ {
    72  				// Add 63 to prevent false sharing.
    73  				newArrDst := simd.MakeUnsafe(nDstByte + 63)
    74  				newArrSrc := simd.MakeUnsafe(nSrcByte + 63)
    75  				if i == 0 {
    76  					if dstInit != nil {
    77  						dstInit(newArrDst)
    78  					}
    79  					if srcInit != nil {
    80  						srcInit(newArrSrc)
    81  					} else {
    82  						for j := 0; j < nSrcByte; j++ {
    83  							newArrSrc[j] = byte(j * 3)
    84  						}
    85  					}
    86  				} else {
    87  					if dstInit != nil {
    88  						copy(newArrDst[:nDstByte], dsts[0])
    89  					}
    90  					copy(newArrSrc[:nSrcByte], srcs[0])
    91  				}
    92  				dsts[i] = newArrDst[:nDstByte]
    93  				srcs[i] = newArrSrc[:nSrcByte]
    94  			}
    95  			b.ResetTimer()
    96  			for i := 0; i < b.N; i++ {
    97  				// May want to replace this with something based on testing.B's
    98  				// RunParallel method.  (Haven't done so yet since I don't see a clean
    99  				// way to make that play well with per-core preallocated buffers.)
   100  				_ = traverse.Each(c.nCpu, func(threadIdx int) error {
   101  					nIter := (((threadIdx + 1) * nJob) / c.nCpu) - ((threadIdx * nJob) / c.nCpu)
   102  					_ = bf(dsts[threadIdx], srcs[threadIdx], nIter)
   103  					return nil
   104  				})
   105  			}
   106  		})
   107  		if !success {
   108  			panic("benchmark failed")
   109  		}
   110  	}
   111  }
   112  
// bytesInit0 is a bytesInitFunc that leaves src untouched.  Because it is
// non-nil, passing it as dstInit still makes multiBenchmark copy the first
// destination buffer into the others.
func bytesInit0(src []byte) {
	// do nothing
}
   116  
   117  func bytesInitMax15(src []byte) {
   118  	for i := 0; i < len(src); i++ {
   119  		src[i] = byte(i*3) & 15
   120  	}
   121  }
   122  
// multiBenchVarargsFunc is the signature of a benchmark body driven by
// multiBenchmarkVarargs.  args is the value produced by the varargsFactory
// for the calling worker, and nIter is that worker's share of the total job
// count.  multiBenchmarkVarargs discards the returned int.
type multiBenchVarargsFunc func(args interface{}, nIter int) int
   124  
// taggedMultiBenchVarargsFunc pairs a multiBenchVarargsFunc with a string
// tag.
// NOTE(review): no use of this type is visible in this file; presumably tag
// labels the implementation in benchmark names -- confirm against callers.
type taggedMultiBenchVarargsFunc struct {
	f   multiBenchVarargsFunc
	tag string
}
   129  
// varargsFactory produces the argument value handed to a
// multiBenchVarargsFunc.  multiBenchmarkVarargs calls it once per worker,
// before the timer is reset.
type varargsFactory func() interface{}
   131  
   132  func multiBenchmarkVarargs(bvf multiBenchVarargsFunc, benchmarkSubtype string, nJob int, argsFactory varargsFactory, b *testing.B) {
   133  	totalCpu := runtime.NumCPU()
   134  	cases := []struct {
   135  		nCpu    int
   136  		descrip string
   137  	}{
   138  		{
   139  			nCpu:    1,
   140  			descrip: "1Cpu",
   141  		},
   142  		{
   143  			nCpu:    (totalCpu + 1) / 2,
   144  			descrip: "HalfCpu",
   145  		},
   146  		{
   147  			nCpu:    totalCpu,
   148  			descrip: "AllCpu",
   149  		},
   150  	}
   151  	for _, c := range cases {
   152  		success := b.Run(benchmarkSubtype+c.descrip, func(b *testing.B) {
   153  			var argSlice []interface{}
   154  			for i := 0; i < c.nCpu; i++ {
   155  				// Can take an "args interface{}" parameter and make deep copies
   156  				// instead.
   157  				argSlice = append(argSlice, argsFactory())
   158  			}
   159  			b.ResetTimer()
   160  			for i := 0; i < b.N; i++ {
   161  				_ = traverse.Each(c.nCpu, func(threadIdx int) error {
   162  					nIter := (((threadIdx + 1) * nJob) / c.nCpu) - ((threadIdx * nJob) / c.nCpu)
   163  					_ = bvf(argSlice[threadIdx], nIter)
   164  					return nil
   165  				})
   166  			}
   167  		})
   168  		if !success {
   169  			panic("benchmark failed")
   170  		}
   171  	}
   172  }