github.com/cloudwego/kitex@v0.9.0/pkg/profiler/profiler.go (about) 1 /* 2 * Copyright 2022 CloudWeGo Authors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package profiler 18 19 import ( 20 "bytes" 21 "context" 22 "errors" 23 "fmt" 24 "runtime/pprof" 25 "sort" 26 "strings" 27 "sync" 28 "time" 29 30 "github.com/google/pprof/profile" 31 32 "github.com/cloudwego/kitex/pkg/klog" 33 ) 34 35 type profilerContextKey struct{} 36 37 type Profiler interface { 38 Run(ctx context.Context) (err error) 39 Stop() 40 Pause() 41 Resume() 42 Prepare(ctx context.Context) context.Context 43 Tag(ctx context.Context, tags ...string) context.Context 44 Untag(ctx context.Context) 45 Lookup(ctx context.Context, key string) (string, bool) 46 } 47 48 type Processor func(profiles []*TagsProfile) error 49 50 func LogProcessor(profiles []*TagsProfile) error { 51 if len(profiles) == 0 { 52 return nil 53 } 54 klog.Infof("KITEX: profiler collect %d records", len(profiles)) 55 for _, p := range profiles { 56 if p.Key != "" { 57 klog.Infof("KITEX: profiler - %s %.2f%%", p.Key, p.Percent*100) 58 } else { 59 klog.Infof("KITEX: profiler - type=default %.2f%%", p.Percent*100) 60 } 61 } 62 klog.Info("---------------------------------") 63 return nil 64 } 65 66 func NewProfiler(processor Processor, interval, window time.Duration) *profiler { 67 if processor == nil { 68 processor = LogProcessor 69 } 70 return &profiler{ 71 stateCond: sync.NewCond(&sync.Mutex{}), 72 processor: processor, 73 interval: interval, 74 window: window, 75 } 76 } 77 78 var _ Profiler = (*profiler)(nil) 79 80 const ( 81 // state changes: 82 // running => pausing => paused => resuming => running 83 // => stopped 84 stateRunning = 0 85 statePausing = 1 86 statePaused = 2 87 stateResuming = 3 88 stateStopped = 4 89 ) 90 91 type profiler struct { 92 data bytes.Buffer // protobuf 93 state int 94 stateCond *sync.Cond 95 // settings 96 processor Processor 97 interval time.Duration // sleep time between every profiling window 98 window time.Duration // profiling in the window, go pprof collect stack profile every 10ms 99 } 100 101 // Tag current goroutine with tagged ctx 102 // it's used for reuse goroutine scenario 103 func Tag(ctx context.Context) { 104 if pc, ok := ctx.Value(profilerContextKey{}).(*profilerContext); ok { 105 pc.profiler.Tag(ctx) 106 } 107 } 108 109 // Untag current goroutine with tagged ctx 110 // it's used for reuse goroutine scenario 111 func Untag(ctx context.Context) { 112 if pc, ok := ctx.Value(profilerContextKey{}).(*profilerContext); ok { 113 pc.profiler.Untag(ctx) 114 } 115 } 116 117 type profilerContext struct { 118 profiler Profiler 119 untagCtx context.Context 120 tags []string 121 } 122 123 func newProfilerContext(profiler Profiler) *profilerContext { 124 return &profilerContext{ 125 profiler: profiler, 126 tags: make([]string, 0, 12), 127 } 128 } 129 130 // Prepare the profiler context 131 func (p *profiler) Prepare(ctx context.Context) context.Context { 132 if c := ctx.Value(profilerContextKey{}); c != nil { 133 return ctx 134 } 135 return context.WithValue(ctx, profilerContextKey{}, newProfilerContext(p)) 136 } 137 138 // State return current profiler state 139 func (p *profiler) State() (state int) { 140 p.stateCond.L.Lock() 141 state = p.state 142 p.stateCond.L.Unlock() 143 return state 144 } 145 146 // Stop the profiler 147 func (p *profiler) Stop() { 148 if p.State() == stateStopped { 149 return 150 } 151 // stateRunning => stateStopped 152 p.stateChange(stateRunning, stateStopped) 153 } 154 155 // Pause the profiler. 156 // The profiler has been paused when Pause() return 157 func (p *profiler) Pause() { 158 if p.State() == statePaused { 159 return 160 } 161 // stateRunning => statePausing 162 p.stateChange(stateRunning, statePausing) 163 // => statePaused 164 p.stateWait(statePaused) 165 } 166 167 // Resume the profiler. 168 // The profiler has been resumed when Resume() return 169 func (p *profiler) Resume() { 170 if p.State() == stateRunning { 171 return 172 } 173 // statePaused => stateResuming 174 p.stateChange(statePaused, stateResuming) 175 // => stateRunning 176 p.stateWait(stateRunning) 177 } 178 179 // Run start analyse the pprof data with interval and window settings 180 func (p *profiler) Run(ctx context.Context) (err error) { 181 var profiles []*TagsProfile 182 timer := time.NewTimer(0) 183 for { 184 // check state 185 state := p.State() 186 switch state { 187 case stateRunning: // do nothing 188 case statePausing: // pause the loop 189 p.stateChange(statePausing, statePaused) // wake up Pause() 190 klog.Info("KITEX: profiler paused") 191 p.stateChange(stateResuming, stateRunning) // wake up Resume() 192 klog.Info("KITEX: profiler resumed") 193 continue 194 case statePaused, stateResuming: // actually, no such case 195 continue 196 case stateStopped: // end the loop 197 klog.Info("KITEX: profiler stopped") 198 return nil 199 } 200 201 // wait for an interval time to reduce the cost of profiling 202 if p.interval > 0 { 203 timer.Reset(p.interval) 204 <-timer.C 205 } 206 207 // start profiler 208 if err = p.startProfile(); err != nil { 209 klog.Errorf("KITEX: profiler start profile failed: %v", err) 210 // the reason of startProfile failed maybe because of there is a running pprof program. 211 // so we need to wait a fixed time for it. 212 time.Sleep(time.Second * 30) 213 continue 214 } 215 216 // wait for a window time to collect pprof data 217 if p.window > 0 { 218 timer.Reset(p.window) 219 <-timer.C 220 } 221 222 // stop profiler 223 p.stopProfile() 224 225 // analyse and process pprof data 226 profiles, err = p.analyse() 227 if err != nil { 228 klog.Errorf("KITEX: profiler analyse failed: %v", err) 229 continue 230 } 231 err = p.processor(profiles) 232 if err != nil { 233 klog.Errorf("KITEX: profiler controller process failed: %v", err) 234 continue 235 } 236 } 237 } 238 239 // Tag current goroutine with tags 240 // If ctx already tagged, append the existed tags 241 func (p *profiler) Tag(ctx context.Context, tags ...string) context.Context { 242 pctx, ok := ctx.Value(profilerContextKey{}).(*profilerContext) 243 if !ok { 244 pctx = newProfilerContext(p) 245 ctx = context.WithValue(ctx, profilerContextKey{}, pctx) 246 } 247 if pctx.untagCtx == nil { 248 pctx.untagCtx = ctx 249 } 250 pctx.tags = append(pctx.tags, tags...) 251 // do not return pprof ctx 252 pprof.SetGoroutineLabels(pprof.WithLabels(context.Background(), pprof.Labels(pctx.tags...))) 253 return ctx 254 } 255 256 // Untag current goroutine 257 // Only untag if ctx already tagged, will not clear the goroutine labels if not tagged by profiler 258 func (p *profiler) Untag(ctx context.Context) { 259 if pc, ok := ctx.Value(profilerContextKey{}).(*profilerContext); ok && pc.untagCtx != nil { 260 // if ctx have untagCtx, that means the current goroutine created by a tagged goroutine 261 // we need to untag the goroutine when finished 262 // else, do nothing 263 pprof.SetGoroutineLabels(pc.untagCtx) 264 } 265 } 266 267 func (p *profiler) Lookup(ctx context.Context, key string) (string, bool) { 268 return pprof.Label(ctx, key) 269 } 270 271 func (p *profiler) stateChange(from, to int) { 272 p.stateCond.L.Lock() 273 for p.state != from { // wait state to from first 274 p.stateCond.Wait() 275 } 276 p.state = to 277 p.stateCond.L.Unlock() 278 p.stateCond.Broadcast() 279 } 280 281 func (p *profiler) stateWait(to int) { 282 p.stateCond.L.Lock() 283 for p.state != to { 284 p.stateCond.Wait() 285 } 286 p.stateCond.L.Unlock() 287 } 288 289 func (p *profiler) startProfile() error { 290 p.data.Reset() 291 return pprof.StartCPUProfile(&p.data) 292 } 293 294 func (p *profiler) stopProfile() { 295 pprof.StopCPUProfile() 296 } 297 298 func (p *profiler) analyse() ([]*TagsProfile, error) { 299 // parse protobuf data 300 pf, err := profile.ParseData(p.data.Bytes()) 301 if err != nil { 302 return nil, err 303 } 304 305 // filter cpu value index 306 sampleIdx := -1 307 for idx, st := range pf.SampleType { 308 if st.Type == "cpu" { 309 sampleIdx = idx 310 break 311 } 312 } 313 if sampleIdx < 0 { 314 return nil, errors.New("profiler: sample type not found") 315 } 316 317 // calculate every sample expense 318 counter := map[string]*TagsProfile{} // map[tagsKey]funcProfile 319 var total int64 320 for _, sm := range pf.Sample { 321 value := sm.Value[sampleIdx] 322 tags := labelToTags(sm.Label) 323 tagsKey := tagsToKey(tags) 324 tp, ok := counter[tagsKey] 325 if !ok { 326 tp = &TagsProfile{} 327 counter[tagsKey] = tp 328 tp.Key = tagsKey 329 tp.Tags = tags 330 } 331 tp.Value += value 332 total += value 333 } 334 335 // compensate value with duration date 336 durationRate := float64((p.interval + p.window) / p.window) 337 profiles := make([]*TagsProfile, 0, len(counter)) // flat to array 338 for _, l := range counter { 339 l.Percent = float64(l.Value) / float64(total) 340 l.Value = int64(durationRate * float64(l.Value)) 341 profiles = append(profiles, l) 342 } 343 return profiles, nil 344 } 345 346 // TagsProfile is the stats result group by tag key 347 type TagsProfile struct { 348 Key string // eg: a=1,b=2 349 Tags []string // eg: ["a", "1", "b", "2"] 350 Value int64 // pprof cpu times 351 Percent float64 // <= 1.0 352 } 353 354 func labelToTags(label map[string][]string) []string { 355 tags := make([]string, 0, len(label)*2) 356 for k, v := range label { 357 tags = append(tags, k, strings.Join(v, ",")) 358 } 359 return tags 360 } 361 362 func tagsToKey(tags []string) string { 363 if len(tags)%2 != 0 { 364 return "" 365 } 366 tagsPair := make([]string, 0, len(tags)/2) 367 for i := 0; i < len(tags); i += 2 { 368 tagsPair = append(tagsPair, fmt.Sprintf("%s=%s", tags[i], tags[i+1])) 369 } 370 // sort tags to make it a unique key 371 sort.Strings(tagsPair) 372 return strings.Join(tagsPair, "|") 373 }