// +build fastmath

package gorgonia

import (
	"math"
	"unsafe"
)

// SetOptimizationLevel sets the package-level optimizationLevel to i.
// The level selects which approximation the fastmath functions in this
// file use (0 = most accurate, higher = faster/less precise).
func SetOptimizationLevel(i int) { optimizationLevel = i }

// The cast* helpers reinterpret the raw bits of a float as an unsigned
// integer of the same width (and vice versa) without any value conversion.
// They are equivalent to math.Float32bits / math.Float64bits and the
// corresponding *frombits functions.
func castFU32(x float32) uint32 { return *(*uint32)(unsafe.Pointer(&x)) }
func castFU64(x float64) uint64 { return *(*uint64)(unsafe.Pointer(&x)) }
func castUF64(x uint64) float64 { return *(*float64)(unsafe.Pointer(&x)) }
func castUF32(x uint32) float32 { return *(*float32)(unsafe.Pointer(&x)) }

/*
	INVERSE/RECIPROCAL HACKS

	Resources:
	http://stackoverflow.com/questions/31555260/fast-vectorized-rsqrt-and-reciprocal-with-sse-avx-depending-on-precision
	https://github.com/stgatilov/recip_rsqrt_benchmark

	Also many thanks to the countless crazy people out there who have done their homework, written papers about it
	so that I may benefit from it.
*/

// magic numbers are acquired from here:
//	0x7FDE623822FC16E6 - https://www.pvk.ca/Blog/LowLevel/software-reciprocal.html
//	0x7FDE6238DA3C2118 - http://www.hackersdelight.org/hdcodetxt/recip.c.txt
// Paul Khuong's magic number seems to be the best performing for my use case, with 3 Newton iterations.
//
// On the number of refinement steps required, 4 refinement steps will yield the same
// results as the naive function for most values.
However, the gains in accuracy is offset 36 // by the loss in speed gains: 37 // BenchmarkInv64-8 300000000 5.99 ns/op 38 // BenchmarkApp4Inv64-8 300000000 5.09 ns/op 39 // BenchmarkApp3Inv64-8 500000000 3.70 ns/op 40 func _inversef64(x float64) float64 { 41 u := uint64(0x7FDE623822FC16E6) - castFU64(x) 42 // u := uint64(0x7FDE6238DA3C2118) - castFU64(x) 43 44 f := castUF64(u) 45 46 // 3 newton raphson refinement steps: 47 for i := 0; i < 3; i++ { 48 f = 2.0*f - f*f*x 49 } 50 return f 51 } 52 53 // magic numbers acquired from here: 54 // http://bits.stephan-brumme.com/inverse.html 55 // On the number of refinement steps: 56 // BenchmarkInv32-8 500000000 3.85 ns/op 57 // BenchmarkApp3Inv32-8 500000000 3.69 ns/op 58 // BenchmarkApp2Inv32-8 1000000000 2.47 ns/op 59 // 60 // I have also found that 2 refinement steps are more than sufficient to get decent results. No funny gradient explosions for sure 61 // TODO: use RCPSS when available 62 func _inversef32(x float32) float32 { 63 u := uint32(0x7F000000) - castFU32(x) 64 f := castUF32(u) 65 66 // 2 newton raphson refinement steps: 67 for i := 0; i < 2; i++ { 68 f = 2.0*f - f*f*x 69 } 70 return castUF32(u) 71 } 72 73 /* 74 TANH HACKS 75 Resources: 76 "Speed Improvement of the Back-Propagation on Current Generation Workstations" D. Anguita, G. Parodi and R. Zunino. Proceedings of the World Congress on Neural Networking, 1993. 
	Fuzzpillz's Tanh http://www.musicdsp.org/showone.php?id=178
	varosound's Lambert expansion Tanh - https://varietyofsound.wordpress.com/2011/02/14/efficient-tanh-computation-using-lamberts-continued-fraction/

	Here are the relative benchmarks:

	float32
	BenchmarkTanh32UW-8        	200000000	         7.34 ns/op
	BenchmarkTanh32-8          	100000000	        10.6 ns/op
	BenchmarkFuzzpillsTanh32-8 	200000000	         9.24 ns/op
	BenchmarkAnguitaTanh32-8   	500000000	         3.52 ns/op
	BenchmarkVarosoundTanh32-8 	200000000	         8.96 ns/op

	float64
	BenchmarkTanh64UW-8        	200000000	         7.25 ns/op
	BenchmarkTanh64-8          	200000000	         9.64 ns/op
	BenchmarkFuzzpillsTanh64-8 	200000000	         5.98 ns/op
	BenchmarkAnguitaTanh64-8   	500000000	         3.26 ns/op
	BenchmarkVarosoundTanh64-8 	300000000	         6.03 ns/op

	There appears to be a problem when using float32 - the conversion takes extra time.
	Tanh32UW and Tanh64UW are direct calls to math.Tanh(), without a wrapping function call. The results of the float32 version are inaccurate, because if you
	wrap the results in float32(), the benchmark won't run.

	On precision, I found Anguita's to be the least precise, but it surprisingly works well for the very limited scope of things I am doing.
	VarietyOfSound's approximation algorithm is also very close to the actual math.Tanh() implementation.
102 */ 103 104 func _tanhf32(x float32) float32 { 105 switch optimizationLevel { 106 case 0, 1: 107 return float32(math.Tanh(float64(x))) 108 case 2: 109 // Use Anguita 110 switch { 111 case x > 1.92033: 112 return 0.96016 113 case x > 0: 114 return 0.96016 - 0.26037*(x-1.92033)*(x-1.92033) 115 case x <= -1.92033: 116 return -0.96016 117 case x < 0: 118 return 0.26037*(x+1.92033)*(x+1.92033) - 0.96016 119 } 120 } 121 panic("unreachable") 122 } 123 124 func _tanhf64(x float64) float64 { 125 switch optimizationLevel { 126 case 0: 127 return math.Tanh(x) 128 case 1: 129 // use Variety of Sound's 130 x2 := x * x 131 a := x * (135135.0 + x2*(17325.0+x2*(378.0+x2))) 132 b := 135135.0 + x2*(62370.0+x2*(3150.0+x2*28.0)) 133 return a / b 134 case 2: 135 // Use Anguita 136 switch { 137 case x > 1.92033: 138 return 0.96016 139 case x > 0: 140 return 0.96016 - 0.26037*(x-1.92033)*(x-1.92033) 141 case x <= -1.92033: 142 return -0.96016 143 case x < 0: 144 return 0.26037*(x+1.92033)*(x+1.92033) - 0.96016 145 } 146 } 147 panic("unreachable") 148 } 149 150 func _sigmoidf64(x float64) float64 { 151 return 0 152 } 153 154 func _sigmoidf32(x float32) float32 { 155 return 0 156 }