gorgonia.org/gorgonia@v0.9.17/math_fast.go

// +build fastmath

package gorgonia

import (
	"math"
	"unsafe"
)

// SetOptimizationLevel sets the global optimization level to i. Level 0 keeps
// the exact math package implementations; higher levels switch to the
// progressively cheaper (and less precise) approximations used below.
func SetOptimizationLevel(i int) { optimizationLevel = i }

// The cast* helpers reinterpret the raw bit pattern of a float as an unsigned
// integer (and back) without a numeric conversion - the unsafe.Pointer
// equivalent of math.Float32bits, math.Float64bits and their inverses.
func castFU32(x float32) uint32 { return *(*uint32)(unsafe.Pointer(&x)) }
func castFU64(x float64) uint64 { return *(*uint64)(unsafe.Pointer(&x)) }
func castUF64(x uint64) float64 { return *(*float64)(unsafe.Pointer(&x)) }
func castUF32(x uint32) float32 { return *(*float32)(unsafe.Pointer(&x)) }

/*
INVERSE/RECIPROCAL HACKS

Resources:
	http://stackoverflow.com/questions/31555260/fast-vectorized-rsqrt-and-reciprocal-with-sse-avx-depending-on-precision
	https://github.com/stgatilov/recip_rsqrt_benchmark

Also many thanks to the countless crazy people out there who have done their homework and written papers about it
so that I may benefit from them.
*/

// magic numbers are acquired from here:
// 		0x7FDE623822FC16E6 - https://www.pvk.ca/Blog/LowLevel/software-reciprocal.html
// 		0x7FDE6238DA3C2118 - http://www.hackersdelight.org/hdcodetxt/recip.c.txt
// Paul Khuong's magic number seems to be the best performing for my use case, with 3 Newton iterations.
//
// On the number of refinement steps required, 4 refinement steps will yield the same
// results as the naive function for most values. However, the gain in accuracy is offset
// by the loss in speed:
//		BenchmarkInv64-8    	300000000	         5.99 ns/op
//		BenchmarkApp4Inv64-8	300000000	         5.09 ns/op
//		BenchmarkApp3Inv64-8	500000000	         3.70 ns/op
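//
// How the refinement step works: given an estimate f ≈ 1/x, one Newton-Raphson
// iteration for g(f) = 1/f - x is
//
//	f' = f*(2 - x*f) = 2*f - f*f*x
//
// and if f = (1-e)/x (relative error e), then f' = (1-e*e)/x, so the error is
// squared on every pass; the magic-number seed only has to be good enough for
// the handful of iterations that follow.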
func _inversef64(x float64) float64 {
	u := uint64(0x7FDE623822FC16E6) - castFU64(x)
	// u := uint64(0x7FDE6238DA3C2118) - castFU64(x)

	f := castUF64(u)

	// 3 newton raphson refinement steps:
	for i := 0; i < 3; i++ {
		f = 2.0*f - f*f*x
	}
	return f
}

// magic numbers acquired from here:
//		http://bits.stephan-brumme.com/inverse.html
// On the number of refinement steps:
// 		BenchmarkInv32-8    	500000000	         3.85 ns/op
//		BenchmarkApp3Inv32-8	500000000	         3.69 ns/op
// 		BenchmarkApp2Inv32-8	1000000000	         2.47 ns/op
//
// I have also found that 2 refinement steps are more than sufficient to get decent results. No funny gradient explosions for sure
// TODO: use RCPSS when available
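// (RCPSS is the SSE scalar single-precision reciprocal-approximation instruction;
// it returns an estimate good to roughly 12 bits, which the Newton-Raphson steps
// below would then refine.)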
func _inversef32(x float32) float32 {
	u := uint32(0x7F000000) - castFU32(x)
	f := castUF32(u)

	// 2 newton raphson refinement steps:
	for i := 0; i < 2; i++ {
		f = 2.0*f - f*f*x
	}
	// return the refined estimate (returning castUF32(u) would discard the refinement)
	return f
}

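// Illustrative only - a minimal sketch (not part of the original file; the
// name and sample points are arbitrary) of how the fast reciprocal can be
// checked against the exact value by measuring the worst relative error |x*f - 1|.
func _inverseRelErrSketch() float64 {
	worst := 0.0
	for _, x := range []float64{1e-3, 0.5, 1, 3, 1e6} {
		if relErr := math.Abs(_inversef64(x)*x - 1); relErr > worst {
			worst = relErr
		}
	}
	return worst
}
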
/*
TANH HACKS
Resources:
	"Speed Improvement of the Back-Propagation on Current Generation Workstations" D. Anguita, G. Parodi and R. Zunino. Proceedings of the World Congress on Neural Networking, 1993.
	Fuzzpillz's Tanh http://www.musicdsp.org/showone.php?id=178
	varosound's lambert expansion Tanh -  https://varietyofsound.wordpress.com/2011/02/14/efficient-tanh-computation-using-lamberts-continued-fraction/

Here are the relative benchmarks:

float32
	BenchmarkTanh32UW-8       	200000000	         7.34 ns/op
	BenchmarkTanh32-8         	100000000	        10.6 ns/op
	BenchmarkFuzzpillsTanh32-8	200000000	         9.24 ns/op
	BenchmarkAnguitaTanh32-8  	500000000	         3.52 ns/op
	BenchmarkVarosoundTanh32-8	200000000	         8.96 ns/op

float64
	BenchmarkTanh64UW-8       	200000000	         7.25 ns/op
	BenchmarkTanh64-8         	200000000	         9.64 ns/op
	BenchmarkFuzzpillsTanh64-8	200000000	         5.98 ns/op
	BenchmarkAnguitaTanh64-8  	500000000	         3.26 ns/op
	BenchmarkVarosoundTanh64-8	300000000	         6.03 ns/op

There appears to be a problem when using float32 - the conversion to and from float64 takes extra time.
Tanh32UW and Tanh64UW are direct calls to math.Tanh(), without a wrapping function call. The float32 numbers there are inaccurate, because if you
wrap the result in float32(), the benchmark won't run.

On precision, I found Anguita's the least precise, but it surprisingly works well for the very limited scope of things I am doing.
VarietyOfSound's approximation algorithm is also very close to the actual math.Tanh() implementation.
*/
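//
// For reference, the two approximations used below are:
//   - Anguita et al.: a piecewise parabola that saturates at ±0.96016 once
//     |x| >= 1.92033, so it never quite reaches ±1 but is very cheap to evaluate.
//   - Variety of Sound: a rational polynomial obtained by truncating Lambert's
//     continued fraction for tanh,
//     x*(135135 + 17325*x^2 + 378*x^4 + x^6) / (135135 + 62370*x^2 + 3150*x^4 + 28*x^6),
//     which stays close to math.Tanh for moderate |x|.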

func _tanhf32(x float32) float32 {
	switch optimizationLevel {
	case 0, 1:
		return float32(math.Tanh(float64(x)))
	case 2:
		// Use Anguita
		switch {
		case x > 1.92033:
			return 0.96016
		case x >= 0: // include x == 0 so the switch is exhaustive
			return 0.96016 - 0.26037*(x-1.92033)*(x-1.92033)
		case x <= -1.92033:
			return -0.96016
		case x < 0:
			return 0.26037*(x+1.92033)*(x+1.92033) - 0.96016
		}
	}
	panic("unreachable")
}

func _tanhf64(x float64) float64 {
	switch optimizationLevel {
	case 0:
		return math.Tanh(x)
	case 1:
		// use Variety of Sound's
		x2 := x * x
		a := x * (135135.0 + x2*(17325.0+x2*(378.0+x2)))
		b := 135135.0 + x2*(62370.0+x2*(3150.0+x2*28.0))
		return a / b
	case 2:
		// Use Anguita
		switch {
		case x > 1.92033:
			return 0.96016
		case x >= 0: // include x == 0 so the switch is exhaustive
			return 0.96016 - 0.26037*(x-1.92033)*(x-1.92033)
		case x <= -1.92033:
			return -0.96016
		case x < 0:
			return 0.26037*(x+1.92033)*(x+1.92033) - 0.96016
		}
	}
	panic("unreachable")
}

// _sigmoidf64 is an unimplemented placeholder: it currently returns 0
// regardless of the input.
func _sigmoidf64(x float64) float64 {
	return 0
}

// _sigmoidf32 is an unimplemented placeholder: it currently returns 0
// regardless of the input.
func _sigmoidf32(x float32) float32 {
	return 0
}
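
// A minimal sketch of the exact logistic sigmoid 1/(1+e^-x) that the
// placeholders above would approximate; the helper below is illustrative only
// and its name is hypothetical, not part of the original file.
func _sigmoidf64Ref(x float64) float64 {
	return 1 / (1 + math.Exp(-x))
}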