gitee.com/ks-custle/core-gm@v0.0.0-20230922171213-b83bdd97b62c/grpc/internal/profiling/buffer/buffer.go

/*
 *
 * Copyright 2019 gRPC authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

// Package buffer provides a high-performance, lock-free implementation of a
// circular buffer used by the profiling code.
package buffer

import (
	"errors"
	"math/bits"
	"runtime"
	"sync"
	"sync/atomic"
	"unsafe"
)

type queue struct {
	// An array of pointers as references to the items stored in this queue.
	arr []unsafe.Pointer
	// The maximum number of elements this queue may store before it wraps
	// around and overwrites older values. Must be a power of two.
	size uint32
	// Always size - 1. A bitwise AND is performed with this mask in place of a
	// modulo operation by the Push operation.
	mask uint32
	// Each Push operation into this queue increments the acquired counter
	// before proceeding with the actual write to arr. This counter is also
	// used by the Drain operation's drainWait subroutine to wait for all
	// pushes to complete.
	acquired uint32 // Accessed atomically.
	// After the completion of a Push operation, the written counter is
	// incremented. Also used by drainWait to wait for all pushes to complete.
	written uint32
}

// newQueue allocates and returns a new *queue. size must be a power of two.
func newQueue(size uint32) *queue {
	return &queue{
		arr:  make([]unsafe.Pointer, size),
		size: size,
		mask: size - 1,
	}
}

// drainWait blocks the caller until all Pushes on this queue are complete.
func (q *queue) drainWait() {
	for atomic.LoadUint32(&q.acquired) != atomic.LoadUint32(&q.written) {
		runtime.Gosched()
	}
}

// A queuePair has two queues. At any given time, Pushes go into the queue
// referenced by queuePair.q. The active queue gets switched when there's a
// drain operation on the circular buffer.
type queuePair struct {
	q0 unsafe.Pointer
	q1 unsafe.Pointer
	q  unsafe.Pointer
}

// newQueuePair allocates and returns a new *queuePair with its internal
// queues allocated.
func newQueuePair(size uint32) *queuePair {
	qp := &queuePair{}
	qp.q0 = unsafe.Pointer(newQueue(size))
	qp.q1 = unsafe.Pointer(newQueue(size))
	qp.q = qp.q0
	return qp
}

// switchQueues switches the current queue for future Pushes to the other
// queue so that there's no blocking in Push. It returns a pointer to the old
// queue that was in place before the switch.
func (qp *queuePair) switchQueues() *queue {
	// Even though we have mutual exclusion across drainers (thanks to
	// drainMutex.Lock in Drain), Push operations may access qp.q whilst we're
	// writing to it.
	if atomic.CompareAndSwapPointer(&qp.q, qp.q0, qp.q1) {
		return (*queue)(qp.q0)
	}

	atomic.CompareAndSwapPointer(&qp.q, qp.q1, qp.q0)
	return (*queue)(qp.q1)
}
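// NOTE: The function below is an illustrative sketch added for exposition and
// is not part of the original gRPC source; its name and constants are
// hypothetical. It demonstrates why a bitwise AND with mask (always size-1)
// can stand in for a modulo operation: when size is a power of two, the low
// bits selected by the mask are exactly the remainder.
func exampleMaskEqualsModulo() bool {
	const size uint32 = 8       // a power of two, as required by newQueue
	const mask uint32 = size - 1 // 0b111
	for acquired := uint32(0); acquired < 4*size; acquired++ {
		// acquired & mask wraps around the queue exactly like acquired % size.
		if acquired&mask != acquired%size {
			return false
		}
	}
	return true
}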
// To avoid expensive modulo operations, we require the maximum number of
// elements in the circular buffer (N) to be a power of two so that a bitwise
// AND mask can be used. Since a CircularBuffer is a collection of queuePairs
// (see below), we need to divide N; since powers of two are only divisible by
// other powers of two, we use floorCPUCount number of queuePairs within each
// CircularBuffer.
//
// The floor of the number of CPUs (and not the ceiling) was found to be the
// optimal number through experiments.
func floorCPUCount() uint32 {
	floorExponent := bits.Len32(uint32(runtime.NumCPU())) - 1
	if floorExponent < 0 {
		floorExponent = 0
	}
	return 1 << uint32(floorExponent)
}

var numCircularBufferPairs = floorCPUCount()

// CircularBuffer is a lock-free data structure that supports Push and Drain
// operations.
//
// Note that CircularBuffer is built for performance more than reliability.
// That is, some Push operations may fail without retries in some situations
// (such as during a Drain operation). The order of pushes is not maintained
// either; that is, if A was pushed before B, the Drain operation may return
// an array with B before A. These restrictions are acceptable within gRPC's
// profiling, but if your use case does not permit these relaxed constraints,
// or if performance is not a primary concern, you should probably use a
// lock-based data structure such as internal/buffer.UnboundedBuffer.
type CircularBuffer struct {
	drainMutex sync.Mutex
	qp         []*queuePair
	// qpn is a monotonically increasing counter that's used to determine
	// which queuePair a Push operation should write to. This approach's
	// performance was found to be better than writing to a random queue.
	qpn    uint32
	qpMask uint32
}

var errInvalidCircularBufferSize = errors.New("buffer size is not a power of two")

// NewCircularBuffer allocates a circular buffer of size size and returns a
// reference to the struct. Only circular buffers of size 2^k are allowed
// (this saves us from having to do expensive modulo operations).
func NewCircularBuffer(size uint32) (*CircularBuffer, error) {
	// size must be a non-zero power of two.
	if size == 0 || size&(size-1) != 0 {
		return nil, errInvalidCircularBufferSize
	}

	n := numCircularBufferPairs
	if size/numCircularBufferPairs < 8 {
		// If each circular buffer is going to hold less than a very small
		// number of items (let's say 8), using multiple circular buffers is
		// very likely wasteful. Instead, fall back to one circular buffer
		// holding everything.
		n = 1
	}

	cb := &CircularBuffer{
		qp:     make([]*queuePair, n),
		qpMask: n - 1,
	}

	for i := uint32(0); i < n; i++ {
		cb.qp[i] = newQueuePair(size / n)
	}

	return cb, nil
}
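// NOTE: Illustrative sketch, not part of the original file; the function and
// variable names are hypothetical. It shows the intended way to construct a
// CircularBuffer: power-of-two sizes succeed, anything else yields
// errInvalidCircularBufferSize.
func exampleNewCircularBuffer() {
	// A power-of-two size succeeds.
	cb, err := NewCircularBuffer(1 << 12) // 4096 slots spread across queuePairs
	if err != nil {
		panic(err) // unreachable: 4096 is a power of two
	}
	_ = cb

	// A non-power-of-two size is rejected.
	if _, err := NewCircularBuffer(1000); err == nil {
		panic("expected errInvalidCircularBufferSize for size 1000")
	}
}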
// Push pushes an element into the circular buffer. It is guaranteed to
// complete in a finite number of steps (it is lock-free). It does not
// guarantee that push order will be retained, nor that the operation will
// succeed if a Drain operation concurrently begins execution.
func (cb *CircularBuffer) Push(x interface{}) {
	n := atomic.AddUint32(&cb.qpn, 1) & cb.qpMask
	qptr := atomic.LoadPointer(&cb.qp[n].q)
	q := (*queue)(qptr)

	acquired := atomic.AddUint32(&q.acquired, 1) - 1

	// If true, it means that we have incremented acquired before any
	// queuePair was switched, and therefore before any drainWait completion.
	// Therefore, it is safe to proceed with the Push operation on this queue.
	// Otherwise, it means that a Drain operation has begun execution, but we
	// don't know how far along the process it is. If it is past the drainWait
	// check, it is not safe to proceed with the Push operation. We choose to
	// drop this sample entirely instead of retrying, as retrying may
	// potentially send the Push operation into a spin loop (we want to
	// guarantee completion of the Push operation within a finite time).
	// Before exiting, we increment written so that any existing drainWaits
	// can proceed.
	if atomic.LoadPointer(&cb.qp[n].q) != qptr {
		atomic.AddUint32(&q.written, 1)
		return
	}

	// At this point, we're definitely writing to the right queue. That is,
	// one of the following is true:
	//   1. No drainer is in execution on this queue.
	//   2. A drainer is in execution on this queue and it is waiting at the
	//      acquired == written barrier.
	//
	// Let's say two Pushes A and B happen on the same queue. Say A and B are
	// q.size apart; i.e. they get the same index. That is,
	//
	//   index_A = index_B
	//   acquired_A + q.size = acquired_B
	//
	// We say "B has wrapped around A" when this happens. In this case, since
	// A occurred before B, B's Push should be the final value. However, we
	// accommodate A being the final value because wrap-arounds are extremely
	// rare and accounting for them requires an additional counter and a
	// significant performance penalty. Note that the below approach never
	// leads to any data corruption.
	index := acquired & q.mask
	atomic.StorePointer(&q.arr[index], unsafe.Pointer(&x))

	// Allows any drainWait checks to proceed.
	atomic.AddUint32(&q.written, 1)
}

// dereferenceAppend dereferences non-nil pointers from arr into result. The
// range of elements copied from arr is [from, to). It assumes that the result
// slice is already allocated and is large enough to hold all the elements
// that might be copied, and it also assumes mutual exclusion on the array of
// pointers.
func dereferenceAppend(result []interface{}, arr []unsafe.Pointer, from, to uint32) []interface{} {
	for i := from; i < to; i++ {
		// We have mutual exclusion on arr, there's no need for atomics.
		x := (*interface{})(arr[i])
		if x != nil {
			result = append(result, *x)
		}
	}
	return result
}
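// NOTE: Illustrative sketch, not part of the original file; all values are
// hypothetical. It walks through the wrap-around scenario described in Push's
// comments: two Pushes whose acquired counters are exactly q.size apart land
// on the same slot, so the earlier value may be overwritten (or, rarely,
// survive) without corrupting the buffer.
func exampleWrapAround() {
	const size uint32 = 8
	const mask uint32 = size - 1

	acquiredA := uint32(3)        // Push A acquires slot counter 3
	acquiredB := acquiredA + size // Push B wraps around A: counter 11

	indexA := acquiredA & mask // 3
	indexB := acquiredB & mask // also 3: A and B share a slot

	if indexA != indexB {
		panic("wrap-around example is inconsistent")
	}
}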
// Drain allocates and returns an array of things Pushed into the circular
// buffer. Push order is not maintained; that is, if B was Pushed after A,
// Drain may return B at a lower index than A in the returned array.
func (cb *CircularBuffer) Drain() []interface{} {
	cb.drainMutex.Lock()

	qs := make([]*queue, len(cb.qp))
	for i := 0; i < len(cb.qp); i++ {
		qs[i] = cb.qp[i].switchQueues()
	}

	var wg sync.WaitGroup
	wg.Add(len(qs))
	for i := 0; i < len(qs); i++ {
		go func(qi int) {
			qs[qi].drainWait()
			wg.Done()
		}(i)
	}
	wg.Wait()

	result := make([]interface{}, 0)
	for i := 0; i < len(qs); i++ {
		if acquired := atomic.LoadUint32(&qs[i].acquired); acquired < qs[i].size {
			result = dereferenceAppend(result, qs[i].arr, 0, acquired)
		} else {
			result = dereferenceAppend(result, qs[i].arr, 0, qs[i].size)
		}
	}

	for i := 0; i < len(qs); i++ {
		atomic.StoreUint32(&qs[i].acquired, 0)
		atomic.StoreUint32(&qs[i].written, 0)
	}

	cb.drainMutex.Unlock()
	return result
}
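// NOTE: Illustrative sketch, not part of the original file; the function
// name, sample type, and counts are hypothetical. It shows one way a
// profiler-like caller might combine Push and Drain: several goroutines push
// samples concurrently while a single collector drains afterwards.
func exampleConcurrentPushAndDrain() []interface{} {
	cb, err := NewCircularBuffer(1 << 10)
	if err != nil {
		panic(err) // unreachable: 1024 is a power of two
	}

	var wg sync.WaitGroup
	for g := 0; g < 4; g++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			for i := 0; i < 100; i++ {
				// Pushes may be dropped if a Drain is in progress; that is
				// acceptable for best-effort profiling data.
				cb.Push(struct{ goroutine, seq int }{id, i})
			}
		}(g)
	}
	wg.Wait()

	// Drain returns the retained samples in no particular order and resets
	// the buffer for reuse.
	return cb.Drain()
}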