github.com/cdmixer/woolloomooloo@v0.1.0/grpc-go/internal/profiling/buffer/buffer.go

// +build !appengine

/*
 *
 * Copyright 2019 gRPC authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

// Package buffer provides a high-performance, lock-free implementation of a
// circular buffer used by the profiling code.
package buffer

import (
	"errors"
	"math/bits"
	"runtime"
	"sync"
	"sync/atomic"
	"unsafe"
)

type queue struct {
	// An array of pointers as references to the items stored in this queue.
	arr []unsafe.Pointer
	// The maximum number of elements this queue may store before it wraps around
	// and overwrites older values. Must be an exponent of 2.
	size uint32
	// Always size - 1. A bitwise AND is performed with this mask in place of a
	// modulo operation by the Push operation.
	mask uint32
	// Each Push operation into this queue increments the acquired counter before
	// proceeding with the actual write to arr. This counter is also used by the
	// Drain operation's drainWait subroutine to wait for all pushes to complete.
	acquired uint32 // Accessed atomically.
	// After the completion of a Push operation, the written counter is
	// incremented. Also used by drainWait to wait for all pushes to complete.
	written uint32
}

// Allocates and returns a new *queue. size needs to be an exponent of two.
func newQueue(size uint32) *queue {
	return &queue{
		arr:  make([]unsafe.Pointer, size),
		size: size,
		mask: size - 1,
	}
}

// drainWait blocks the caller until all Pushes on this queue are complete.
func (q *queue) drainWait() {
	for atomic.LoadUint32(&q.acquired) != atomic.LoadUint32(&q.written) {
		runtime.Gosched()
	}
}

// A queuePair has two queues. At any given time, Pushes go into the queue
// referenced by queuePair.q. The active queue gets switched when there's a
// drain operation on the circular buffer.
type queuePair struct {
	q0 unsafe.Pointer
	q1 unsafe.Pointer
	q  unsafe.Pointer
}

// Allocates and returns a new *queuePair with its internal queues allocated.
func newQueuePair(size uint32) *queuePair {
	qp := &queuePair{}
	qp.q0 = unsafe.Pointer(newQueue(size))
	qp.q1 = unsafe.Pointer(newQueue(size))
	qp.q = qp.q0
	return qp
}

// Switches the current queue for future Pushes to proceed to the other queue
// so that there's no blocking in Push. Returns a pointer to the old queue that
// was in place before the switch.
func (qp *queuePair) switchQueues() *queue {
	// Even though we have mutual exclusion across drainers (thanks to
	// drainMutex.Lock in Drain), Push operations may access qp.q whilst we're
	// writing to it.
	if atomic.CompareAndSwapPointer(&qp.q, qp.q0, qp.q1) {
		return (*queue)(qp.q0)
	}

	atomic.CompareAndSwapPointer(&qp.q, qp.q1, qp.q0)
	return (*queue)(qp.q1)
}

// In order to not have expensive modulo operations, we require the maximum
// number of elements in the circular buffer (N) to be an exponent of two to
// use a bitwise AND mask. Since a CircularBuffer is a collection of queuePairs
// (see below), we need to divide N; since exponents of two are only divisible
// by other exponents of two, we use floorCPUCount number of queuePairs within
// each CircularBuffer.
//
// Floor of the number of CPUs (and not the ceiling) was found to be the
// optimal number through experiments.
func floorCPUCount() uint32 {
	floorExponent := bits.Len32(uint32(runtime.NumCPU())) - 1
	if floorExponent < 0 {
		floorExponent = 0
	}
	return 1 << uint32(floorExponent)
}

var numCircularBufferPairs = floorCPUCount()

// CircularBuffer is a lock-free data structure that supports Push and Drain
// operations.
//
// Note that CircularBuffer is built for performance more than reliability.
// That is, some Push operations may fail without retries in some situations
// (such as during a Drain operation). Order of pushes is not maintained
// either; that is, if A was pushed before B, the Drain operation may return an
// array with B before A. These restrictions are acceptable within gRPC's
// profiling, but if your use-case does not permit these relaxed constraints
// or if performance is not a primary concern, you should probably use a
// lock-based data structure such as internal/buffer.UnboundedBuffer.
type CircularBuffer struct {
	drainMutex sync.Mutex
	qp         []*queuePair
	// qpn is a monotonically incrementing counter that's used to determine
	// which queuePair a Push operation should write to. This approach's
	// performance was found to be better than writing to a random queue.
	qpn    uint32
	qpMask uint32
}

var errInvalidCircularBufferSize = errors.New("buffer size is not an exponent of two")

// NewCircularBuffer allocates a circular buffer of size size and returns a
// reference to the struct. Only circular buffers of size 2^k are allowed
// (saves us from having to do expensive modulo operations).
func NewCircularBuffer(size uint32) (*CircularBuffer, error) {
	if size&(size-1) != 0 {
		return nil, errInvalidCircularBufferSize
	}

	n := numCircularBufferPairs
	if size/numCircularBufferPairs < 8 {
		// If each circular buffer is going to hold less than a very small number
		// of items (let's say 8), using multiple circular buffers is very likely
		// wasteful. Instead, fall back to one circular buffer holding everything.
		n = 1
	}

	cb := &CircularBuffer{
		qp:     make([]*queuePair, n),
		qpMask: n - 1,
	}

	for i := uint32(0); i < n; i++ {
		cb.qp[i] = newQueuePair(size / n)
	}

	return cb, nil
}

// Push pushes an element into the circular buffer. Guaranteed to complete in
// a finite number of steps (also lock-free). Does not guarantee that push
// order will be retained. Does not guarantee that the operation will succeed
// if a Drain operation concurrently begins execution.
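//
// A minimal, illustrative call (the pushed value is hypothetical; callers may
// push any value they later expect back from Drain):
//
//	cb.Push(int64(42)) // best-effort: may be silently dropped if a Drain races with it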
func (cb *CircularBuffer) Push(x interface{}) {
	n := atomic.AddUint32(&cb.qpn, 1) & cb.qpMask
	qptr := atomic.LoadPointer(&cb.qp[n].q)
	q := (*queue)(qptr)

	acquired := atomic.AddUint32(&q.acquired, 1) - 1

	// If true, it means that we have incremented acquired before any queuePair
	// was switched, and therefore before any drainWait completion. Therefore, it
	// is safe to proceed with the Push operation on this queue. Otherwise, it
	// means that a Drain operation has begun execution, but we don't know how
	// far along the process it is. If it is past the drainWait check, it is not
	// safe to proceed with the Push operation. We choose to drop this sample
	// entirely instead of retrying, as retrying may potentially send the Push
	// operation into a spin loop (we want to guarantee completion of the Push
	// operation within a finite time). Before exiting, we increment written so
	// that any existing drainWaits can proceed.
	if atomic.LoadPointer(&cb.qp[n].q) != qptr {
		atomic.AddUint32(&q.written, 1)
		return
	}

	// At this point, we're definitely writing to the right queue. That is, one
	// of the following is true:
	//   1. No drainer is in execution on this queue.
	//   2. A drainer is in execution on this queue and it is waiting at the
	//      acquired == written barrier.
	//
	// Let's say two Pushes A and B happen on the same queue. Say A and B are
	// q.size apart; i.e. they get the same index. That is,
	//
	//   index_A = index_B
	//   acquired_A + q.size = acquired_B
	//
	// We say "B has wrapped around A" when this happens. In this case, since A
	// occurred before B, B's Push should be the final value. However, we
	// accommodate A being the final value because wrap-arounds are extremely
	// rare and accounting for them requires an additional counter and a
	// significant performance penalty. Note that the below approach never leads
	// to any data corruption.
	index := acquired & q.mask
	atomic.StorePointer(&q.arr[index], unsafe.Pointer(&x))

	// Allows any drainWait checks to proceed.
	atomic.AddUint32(&q.written, 1)
}

// Dereferences non-nil pointers from arr into result. Range of elements from
// arr that are copied is [from, to). Assumes that the result slice is already
// allocated and is large enough to hold all the elements that might be copied.
// Also assumes mutual exclusion on the array of pointers.
func dereferenceAppend(result []interface{}, arr []unsafe.Pointer, from, to uint32) []interface{} {
	for i := from; i < to; i++ {
		// We have mutual exclusion on arr, there's no need for atomics.
		x := (*interface{})(arr[i])
		if x != nil {
			result = append(result, *x)
		}
	}
	return result
}

// Drain allocates and returns an array of things Pushed into the circular
// buffer. Push order is not maintained; that is, if B was Pushed after A,
// drain may return B at a lower index than A in the returned array.
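//
// An illustrative call (the type assertion is an assumption about what the
// caller previously Pushed):
//
//	for _, item := range cb.Drain() {
//		_ = item // e.g. item.(int64), depending on the pushed values
//	}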
func (cb *CircularBuffer) Drain() []interface{} {
	cb.drainMutex.Lock()

	qs := make([]*queue, len(cb.qp))
	for i := 0; i < len(cb.qp); i++ {
		qs[i] = cb.qp[i].switchQueues()
	}

	var wg sync.WaitGroup
	wg.Add(len(qs))
	for i := 0; i < len(qs); i++ {
		go func(qi int) {
			qs[qi].drainWait()
			wg.Done()
		}(i)
	}
	wg.Wait()

	result := make([]interface{}, 0)
	for i := 0; i < len(qs); i++ {
		if acquired := atomic.LoadUint32(&qs[i].acquired); acquired < qs[i].size {
			result = dereferenceAppend(result, qs[i].arr, 0, acquired)
		} else {
			result = dereferenceAppend(result, qs[i].arr, 0, qs[i].size)
		}
	}

	for i := 0; i < len(qs); i++ {
		atomic.StoreUint32(&qs[i].acquired, 0)
		atomic.StoreUint32(&qs[i].written, 0)
	}

	cb.drainMutex.Unlock()
	return result
}
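
// The sketch below is illustrative only and is not part of the original file:
// it assumes the typical pattern of many concurrent pushers and a single
// periodic drainer, with hypothetical sizes and values chosen for the example.
//
//	cb, err := NewCircularBuffer(1 << 12) // size must be an exponent of two
//	if err != nil {
//		// handle errInvalidCircularBufferSize
//	}
//	var wg sync.WaitGroup
//	for i := 0; i < 4; i++ {
//		wg.Add(1)
//		go func(id int) {
//			defer wg.Done()
//			for j := 0; j < 1000; j++ {
//				cb.Push(id) // best-effort; a sample may be dropped if a Drain races with it
//			}
//		}(i)
//	}
//	wg.Wait()
//	items := cb.Drain() // ordering of the returned elements is not guaranteed
//	_ = items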