github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/objstorage/objstorageprovider/objiotracing/obj_io_tracing_on.go

// Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

//go:build pebble_obj_io_tracing
// +build pebble_obj_io_tracing

package objiotracing

import (
	"bufio"
	"context"
	"fmt"
	"math/rand"
	"sync"
	"sync/atomic"
	"time"
	"unsafe"

	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/objstorage"
	"github.com/cockroachdb/pebble/vfs"
)

// Enabled is used to short circuit tracing-related code in regular builds.
const Enabled = true

// Tracer manages the writing of object IO traces to files.
//
// The tracer runs a background worker goroutine which receives trace event
// buffers over a channel and dumps them to IOTRACES- files. Wrapper
// implementations for Readable, ReadHandle, Writable are producers of traces;
// they maintain internal buffers of events which get flushed to the buffered
// channel when they get full. This allows for minimal synchronization per IO
// (as for most of these structures, an instance only allows a single IO at a
// time).
type Tracer struct {
	fs    vfs.FS
	fsDir string

	handleID atomic.Uint64

	workerStopCh chan struct{}
	workerDataCh chan eventBuf
	workerWait   sync.WaitGroup
}

// Open creates a Tracer which generates trace files in the given directory.
// Each trace file contains a series of Events (as they are in memory).
func Open(fs vfs.FS, fsDir string) *Tracer {
	t := &Tracer{
		fs:           fs,
		fsDir:        fsDir,
		workerStopCh: make(chan struct{}),
		workerDataCh: make(chan eventBuf, channelBufSize),
	}

	t.handleID.Store(uint64(rand.NewSource(time.Now().UnixNano()).Int63()))

	t.workerWait.Add(1)
	go t.workerLoop()
	return t
}

// Close the tracer, flushing any remaining events.
func (t *Tracer) Close() {
	if t.workerStopCh == nil {
		return
	}
	// Tell the worker to stop and wait for it to finish up.
	close(t.workerStopCh)
	t.workerWait.Wait()
	t.workerStopCh = nil
}
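
// exampleTraceWrites is an illustrative sketch added for documentation; it is
// not part of the upstream file. It shows the intended lifecycle under the
// assumption that every wrapped object is finished (or closed) before the
// Tracer itself is closed, so that all buffered events reach the worker.
func exampleTraceWrites(
	ctx context.Context, fs vfs.FS, dir string, w objstorage.Writable, fileNum base.FileNum,
) error {
	t := Open(fs, dir)
	// Close stops the worker after it has drained and written any remaining
	// event buffers.
	defer t.Close()

	tw := t.WrapWritable(ctx, w, fileNum)
	if err := tw.Write([]byte("example payload")); err != nil {
		return err
	}
	// Finish flushes the wrapper's event buffer to the worker before
	// finishing the underlying Writable.
	return tw.Finish()
}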

// WrapWritable wraps an objstorage.Writable with one that generates tracing
// events.
func (t *Tracer) WrapWritable(
	ctx context.Context, w objstorage.Writable, fileNum base.FileNum,
) objstorage.Writable {
	return &writable{
		w:       w,
		fileNum: fileNum,
		g:       makeEventGenerator(ctx, t),
	}
}

type writable struct {
	w         objstorage.Writable
	fileNum   base.FileNum
	curOffset int64
	g         eventGenerator
}

var _ objstorage.Writable = (*writable)(nil)

// Write is part of the objstorage.Writable interface.
func (w *writable) Write(p []byte) error {
	w.g.add(context.Background(), Event{
		Op:      WriteOp,
		FileNum: w.fileNum,
		Offset:  w.curOffset,
		Size:    int64(len(p)),
	})
	// If w.w.Write(p) returns an error, a new writable will be used, so even
	// though all of p may not have been written to the underlying "file", it
	// is okay to add len(p) to curOffset.
	w.curOffset += int64(len(p))
	return w.w.Write(p)
}

// Finish is part of the objstorage.Writable interface.
func (w *writable) Finish() error {
	w.g.flush()
	return w.w.Finish()
}

// Abort is part of the objstorage.Writable interface.
func (w *writable) Abort() {
	w.g.flush()
	w.w.Abort()
}

// WrapReadable wraps an objstorage.Readable with one that generates tracing
// events.
func (t *Tracer) WrapReadable(
	ctx context.Context, r objstorage.Readable, fileNum base.FileNum,
) objstorage.Readable {
	res := &readable{
		r:       r,
		fileNum: fileNum,
	}
	res.mu.g = makeEventGenerator(ctx, t)
	return res
}

type readable struct {
	r       objstorage.Readable
	fileNum base.FileNum
	mu      struct {
		sync.Mutex
		g eventGenerator
	}
}

var _ objstorage.Readable = (*readable)(nil)

// ReadAt is part of the objstorage.Readable interface.
func (r *readable) ReadAt(ctx context.Context, v []byte, off int64) (n int, err error) {
	r.mu.Lock()
	r.mu.g.add(ctx, Event{
		Op:      ReadOp,
		FileNum: r.fileNum,
		Offset:  off,
		Size:    int64(len(v)),
	})
	r.mu.Unlock()
	return r.r.ReadAt(ctx, v, off)
}

// Close is part of the objstorage.Readable interface.
func (r *readable) Close() error {
	r.mu.g.flush()
	return r.r.Close()
}

// Size is part of the objstorage.Readable interface.
func (r *readable) Size() int64 {
	return r.r.Size()
}

// NewReadHandle is part of the objstorage.Readable interface.
func (r *readable) NewReadHandle(ctx context.Context) objstorage.ReadHandle {
	// It's safe to get the tracer from the generator without the mutex since
	// it never changes.
	t := r.mu.g.t
	return &readHandle{
		rh:       r.r.NewReadHandle(ctx),
		fileNum:  r.fileNum,
		handleID: t.handleID.Add(1),
		g:        makeEventGenerator(ctx, t),
	}
}

type readHandle struct {
	rh       objstorage.ReadHandle
	fileNum  base.FileNum
	handleID uint64
	g        eventGenerator
}

var _ objstorage.ReadHandle = (*readHandle)(nil)

// ReadAt is part of the objstorage.ReadHandle interface.
func (rh *readHandle) ReadAt(ctx context.Context, p []byte, off int64) (n int, err error) {
	rh.g.add(ctx, Event{
		Op:       ReadOp,
		FileNum:  rh.fileNum,
		HandleID: rh.handleID,
		Offset:   off,
		Size:     int64(len(p)),
	})
	return rh.rh.ReadAt(ctx, p, off)
}

// Close is part of the objstorage.ReadHandle interface.
func (rh *readHandle) Close() error {
	rh.g.flush()
	return rh.rh.Close()
}

// SetupForCompaction is part of the objstorage.ReadHandle interface.
func (rh *readHandle) SetupForCompaction() {
	rh.g.add(context.Background(), Event{
		Op:       SetupForCompactionOp,
		FileNum:  rh.fileNum,
		HandleID: rh.handleID,
	})
	rh.rh.SetupForCompaction()
}

// RecordCacheHit is part of the objstorage.ReadHandle interface.
func (rh *readHandle) RecordCacheHit(ctx context.Context, offset, size int64) {
	rh.g.add(ctx, Event{
		Op:       RecordCacheHitOp,
		FileNum:  rh.fileNum,
		HandleID: rh.handleID,
		Offset:   offset,
		Size:     size,
	})
	rh.rh.RecordCacheHit(ctx, offset, size)
}
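
// exampleTraceHandleReads is an illustrative sketch added for documentation;
// it is not part of the upstream file. Assuming r is a Readable previously
// wrapped via Tracer.WrapReadable, reads issued through a handle obtained
// from it are tagged with a HandleID, which lets a trace consumer group the
// reads belonging to one sequence. The buffer size and offsets are arbitrary.
func exampleTraceHandleReads(ctx context.Context, r objstorage.Readable) error {
	rh := r.NewReadHandle(ctx)
	defer rh.Close()

	buf := make([]byte, 4096)
	// Each ReadAt on the traced handle records one ReadOp event before
	// delegating to the underlying handle.
	if _, err := rh.ReadAt(ctx, buf, 0); err != nil {
		return err
	}
	_, err := rh.ReadAt(ctx, buf, int64(len(buf)))
	return err
}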

type ctxInfo struct {
	reason       Reason
	blockType    BlockType
	levelPlusOne uint8
}

func mergeCtxInfo(base, other ctxInfo) ctxInfo {
	res := other
	if res.reason == 0 {
		res.reason = base.reason
	}
	if res.blockType == 0 {
		res.blockType = base.blockType
	}
	if res.levelPlusOne == 0 {
		res.levelPlusOne = base.levelPlusOne
	}
	return res
}

type ctxInfoKey struct{}

func withInfo(ctx context.Context, info ctxInfo) context.Context {
	return context.WithValue(ctx, ctxInfoKey{}, info)
}

func infoFromCtx(ctx context.Context) ctxInfo {
	res := ctx.Value(ctxInfoKey{})
	if res == nil {
		return ctxInfo{}
	}
	return res.(ctxInfo)
}

// WithReason creates a context that has an associated Reason (which ends up in
// traces created under that context).
func WithReason(ctx context.Context, reason Reason) context.Context {
	info := infoFromCtx(ctx)
	info.reason = reason
	return withInfo(ctx, info)
}

// WithBlockType creates a context that has an associated BlockType (which ends
// up in traces created under that context).
func WithBlockType(ctx context.Context, blockType BlockType) context.Context {
	info := infoFromCtx(ctx)
	info.blockType = blockType
	return withInfo(ctx, info)
}

// WithLevel creates a context that has an associated level (which ends up in
// traces created under that context).
func WithLevel(ctx context.Context, level int) context.Context {
	info := infoFromCtx(ctx)
	info.levelPlusOne = uint8(level) + 1
	return withInfo(ctx, info)
}
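
// exampleAnnotatedRead is an illustrative sketch added for documentation; it
// is not part of the upstream file. It shows how the With* helpers compose:
// values attached to the context closest to the read override whatever was
// captured from the context passed at wrap time (see mergeCtxInfo). The
// concrete Reason and BlockType values are taken as parameters here because
// their constants are defined elsewhere in the package.
func exampleAnnotatedRead(
	ctx context.Context, r objstorage.Readable, reason Reason, bt BlockType, level int, buf []byte, off int64,
) (int, error) {
	ctx = WithReason(ctx, reason)
	ctx = WithBlockType(ctx, bt)
	ctx = WithLevel(ctx, level)
	// The annotations end up on the Event emitted by the traced ReadAt.
	return r.ReadAt(ctx, buf, off)
}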

const (
	eventSize            = int(unsafe.Sizeof(Event{}))
	targetEntriesPerFile = 256 * 1024 * 1024 / eventSize // 256MB files
	eventsPerBuf         = 16
	channelBufSize       = 512 * 1024 / eventsPerBuf // 512K events.
	bytesPerFileSync     = 128 * 1024
)

type eventBuf struct {
	events [eventsPerBuf]Event
	num    int
}

type eventGenerator struct {
	t           *Tracer
	baseCtxInfo ctxInfo
	buf         eventBuf
}

func makeEventGenerator(ctx context.Context, t *Tracer) eventGenerator {
	return eventGenerator{
		t:           t,
		baseCtxInfo: infoFromCtx(ctx),
	}
}

func (g *eventGenerator) flush() {
	if g.buf.num > 0 {
		g.t.workerDataCh <- g.buf
		g.buf.num = 0
	}
}

func (g *eventGenerator) add(ctx context.Context, e Event) {
	e.StartUnixNano = time.Now().UnixNano()
	info := infoFromCtx(ctx)
	info = mergeCtxInfo(g.baseCtxInfo, info)
	e.Reason = info.reason
	e.BlockType = info.blockType
	e.LevelPlusOne = info.levelPlusOne
	if g.buf.num == eventsPerBuf {
		g.flush()
	}
	g.buf.events[g.buf.num] = e
	g.buf.num++
}

type workerState struct {
	curFile          vfs.File
	curBW            *bufio.Writer
	numEntriesInFile int
}

func (t *Tracer) workerLoop() {
	defer t.workerWait.Done()
	stopCh := t.workerStopCh
	dataCh := t.workerDataCh
	var state workerState
	t.workerNewFile(&state)
	for {
		select {
		case <-stopCh:
			close(dataCh)
			// Flush any remaining traces.
			for data := range dataCh {
				t.workerWriteTraces(&state, data)
			}
			t.workerCloseFile(&state)
			return

		case data := <-dataCh:
			t.workerWriteTraces(&state, data)
		}
	}
}

func (t *Tracer) workerWriteTraces(state *workerState, data eventBuf) {
	if state.numEntriesInFile >= targetEntriesPerFile {
		t.workerCloseFile(state)
		t.workerNewFile(state)
	}
	state.numEntriesInFile += data.num
	p := unsafe.Pointer(&data.events[0])
	b := unsafe.Slice((*byte)(p), eventSize*data.num)
	if _, err := state.curBW.Write(b); err != nil {
		panic(err)
	}
}

func (t *Tracer) workerNewFile(state *workerState) {
	filename := fmt.Sprintf("IOTRACES-%s", time.Now().UTC().Format(time.RFC3339Nano))

	file, err := t.fs.Create(t.fs.PathJoin(t.fsDir, filename))
	if err != nil {
		panic(err)
	}
	file = vfs.NewSyncingFile(file, vfs.SyncingFileOptions{
		BytesPerSync: bytesPerFileSync,
	})
	state.curFile = file
	state.curBW = bufio.NewWriter(file)
	state.numEntriesInFile = 0
}

func (t *Tracer) workerCloseFile(state *workerState) {
	if state.curFile != nil {
		if err := state.curBW.Flush(); err != nil {
			panic(err)
		}
		if err := state.curFile.Sync(); err != nil {
			panic(err)
		}
		if err := state.curFile.Close(); err != nil {
			panic(err)
		}
		state.curFile = nil
		state.curBW = nil
	}
}
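
// decodeTraceEvents is an illustrative sketch added for documentation; it is
// not part of the upstream file. The worker writes each buffer as the raw
// in-memory representation of its Events, so a consumer on a machine with the
// same architecture (struct layout, endianness) could reinterpret the bytes
// read back from an IOTRACES- file with the inverse unsafe.Slice conversion.
func decodeTraceEvents(b []byte) []Event {
	n := len(b) / eventSize
	if n == 0 {
		return nil
	}
	p := unsafe.Pointer(&b[0])
	// Reinterpret the byte slice as a slice of n Events, mirroring the
	// forward conversion in workerWriteTraces.
	return unsafe.Slice((*Event)(p), n)
}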