github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/objstorage/objstorageprovider/objiotracing/obj_io_tracing_on.go (about)

     1  // Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  //go:build pebble_obj_io_tracing
     6  // +build pebble_obj_io_tracing
     7  
     8  package objiotracing
     9  
    10  import (
    11  	"bufio"
    12  	"context"
    13  	"fmt"
    14  	"math/rand"
    15  	"sync"
    16  	"sync/atomic"
    17  	"time"
    18  	"unsafe"
    19  
    20  	"github.com/cockroachdb/pebble/internal/base"
    21  	"github.com/cockroachdb/pebble/objstorage"
    22  	"github.com/cockroachdb/pebble/vfs"
    23  )
    24  
    25  // Enabled is used to short circuit tracing-related code in regular builds.
    26  const Enabled = true
    27  
    28  // Tracer manages the writing of object IO traces to files.
    29  //
    30  // The tracer runs a background worker goroutine which receives trace event
    31  // buffers over a channel and dumps them to IOTRACES- files. Wrapper
    32  // implementations for Readable, ReadHandle, Writable are producers of traces;
    33  // they maintain internal buffers of events which get flushed to the buffered
    34  // channel when they get full. This allows for minimal synchronization per IO
    35  // (as for most of these structures, an instance only allows a single IO at a
    36  // time).
    37  type Tracer struct {
    38  	fs    vfs.FS
    39  	fsDir string
    40  
    41  	handleID atomic.Uint64
    42  
    43  	workerStopCh chan struct{}
    44  	workerDataCh chan eventBuf
    45  	workerWait   sync.WaitGroup
    46  }
    47  
    48  // Open creates a Tracer which generates trace files in the given directory.
    49  // Each trace file contains a series of Events (as they are in memory).
    50  func Open(fs vfs.FS, fsDir string) *Tracer {
    51  	t := &Tracer{
    52  		fs:           fs,
    53  		fsDir:        fsDir,
    54  		workerStopCh: make(chan struct{}),
    55  		workerDataCh: make(chan eventBuf, channelBufSize),
    56  	}
    57  
    58  	t.handleID.Store(uint64(rand.NewSource(time.Now().UnixNano()).Int63()))
    59  
    60  	t.workerWait.Add(1)
    61  	go t.workerLoop()
    62  	return t
    63  }
    64  
    65  // Close the tracer, flushing any remaining events.
    66  func (t *Tracer) Close() {
    67  	if t.workerStopCh == nil {
    68  		return
    69  	}
    70  	// Tell the worker to stop and wait for it to finish up.
    71  	close(t.workerStopCh)
    72  	t.workerWait.Wait()
    73  	t.workerStopCh = nil
    74  }
    75  
    76  // WrapWritable wraps an objstorage.Writable with one that generates tracing
    77  // events.
    78  func (t *Tracer) WrapWritable(
    79  	ctx context.Context, w objstorage.Writable, fileNum base.FileNum,
    80  ) objstorage.Writable {
    81  	return &writable{
    82  		w:       w,
    83  		fileNum: fileNum,
    84  		g:       makeEventGenerator(ctx, t),
    85  	}
    86  }
    87  
    88  type writable struct {
    89  	w         objstorage.Writable
    90  	fileNum   base.FileNum
    91  	curOffset int64
    92  	g         eventGenerator
    93  }
    94  
    95  var _ objstorage.Writable = (*writable)(nil)
    96  
    97  // Write is part of the objstorage.Writable interface.
    98  func (w *writable) Write(p []byte) error {
    99  	w.g.add(context.Background(), Event{
   100  		Op:      WriteOp,
   101  		FileNum: w.fileNum,
   102  		Offset:  w.curOffset,
   103  		Size:    int64(len(p)),
   104  	})
   105  	// If w.w.Write(p) returns an error, a new writable
   106  	// will be used, so even tho all of p may not have
   107  	// been written to the underlying "file", it is okay
   108  	// to add len(p) to curOffset.
   109  	w.curOffset += int64(len(p))
   110  	return w.w.Write(p)
   111  }
   112  
   113  // Finish is part of the objstorage.Writable interface.
   114  func (w *writable) Finish() error {
   115  	w.g.flush()
   116  	return w.w.Finish()
   117  }
   118  
   119  // Abort is part of the objstorage.Writable interface.
   120  func (w *writable) Abort() {
   121  	w.g.flush()
   122  	w.w.Abort()
   123  }
   124  
   125  // WrapReadable wraps an objstorage.Readable with one that generates tracing
   126  // events.
   127  func (t *Tracer) WrapReadable(
   128  	ctx context.Context, r objstorage.Readable, fileNum base.FileNum,
   129  ) objstorage.Readable {
   130  	res := &readable{
   131  		r:       r,
   132  		fileNum: fileNum,
   133  	}
   134  	res.mu.g = makeEventGenerator(ctx, t)
   135  	return res
   136  }
   137  
   138  type readable struct {
   139  	r       objstorage.Readable
   140  	fileNum base.FileNum
   141  	mu      struct {
   142  		sync.Mutex
   143  		g eventGenerator
   144  	}
   145  }
   146  
   147  var _ objstorage.Readable = (*readable)(nil)
   148  
   149  // ReadAt is part of the objstorage.Readable interface.
   150  func (r *readable) ReadAt(ctx context.Context, v []byte, off int64) (n int, err error) {
   151  	r.mu.Lock()
   152  	r.mu.g.add(ctx, Event{
   153  		Op:      ReadOp,
   154  		FileNum: r.fileNum,
   155  		Offset:  off,
   156  		Size:    int64(len(v)),
   157  	})
   158  	r.mu.Unlock()
   159  	return r.r.ReadAt(ctx, v, off)
   160  }
   161  
   162  // Close is part of the objstorage.Readable interface.
   163  func (r *readable) Close() error {
   164  	r.mu.g.flush()
   165  	return r.r.Close()
   166  }
   167  
   168  // Size is part of the objstorage.Readable interface.
   169  func (r *readable) Size() int64 {
   170  	return r.r.Size()
   171  }
   172  
   173  // NewReadHandle is part of the objstorage.Readable interface.
   174  func (r *readable) NewReadHandle(ctx context.Context) objstorage.ReadHandle {
   175  	// It's safe to get the tracer from the generator without the mutex since it never changes.
   176  	t := r.mu.g.t
   177  	return &readHandle{
   178  		rh:       r.r.NewReadHandle(ctx),
   179  		fileNum:  r.fileNum,
   180  		handleID: t.handleID.Add(1),
   181  		g:        makeEventGenerator(ctx, t),
   182  	}
   183  }
   184  
   185  type readHandle struct {
   186  	rh       objstorage.ReadHandle
   187  	fileNum  base.FileNum
   188  	handleID uint64
   189  	g        eventGenerator
   190  }
   191  
   192  var _ objstorage.ReadHandle = (*readHandle)(nil)
   193  
   194  // ReadAt is part of the objstorage.ReadHandle interface.
   195  func (rh *readHandle) ReadAt(ctx context.Context, p []byte, off int64) (n int, err error) {
   196  	rh.g.add(ctx, Event{
   197  		Op:       ReadOp,
   198  		FileNum:  rh.fileNum,
   199  		HandleID: rh.handleID,
   200  		Offset:   off,
   201  		Size:     int64(len(p)),
   202  	})
   203  	return rh.rh.ReadAt(ctx, p, off)
   204  }
   205  
   206  // Close is part of the objstorage.ReadHandle interface.
   207  func (rh *readHandle) Close() error {
   208  	rh.g.flush()
   209  	return rh.rh.Close()
   210  }
   211  
   212  // SetupForCompaction is part of the objstorage.ReadHandle interface.
   213  func (rh *readHandle) SetupForCompaction() {
   214  	rh.g.add(context.Background(), Event{
   215  		Op:       SetupForCompactionOp,
   216  		FileNum:  rh.fileNum,
   217  		HandleID: rh.handleID,
   218  	})
   219  	rh.rh.SetupForCompaction()
   220  }
   221  
   222  // RecordCacheHit is part of the objstorage.ReadHandle interface.
   223  func (rh *readHandle) RecordCacheHit(ctx context.Context, offset, size int64) {
   224  	rh.g.add(ctx, Event{
   225  		Op:       RecordCacheHitOp,
   226  		FileNum:  rh.fileNum,
   227  		HandleID: rh.handleID,
   228  		Offset:   offset,
   229  		Size:     size,
   230  	})
   231  	rh.rh.RecordCacheHit(ctx, offset, size)
   232  }
   233  
   234  type ctxInfo struct {
   235  	reason       Reason
   236  	blockType    BlockType
   237  	levelPlusOne uint8
   238  }
   239  
   240  func mergeCtxInfo(base, other ctxInfo) ctxInfo {
   241  	res := other
   242  	if res.reason == 0 {
   243  		res.reason = base.reason
   244  	}
   245  	if res.blockType == 0 {
   246  		res.blockType = base.blockType
   247  	}
   248  	if res.levelPlusOne == 0 {
   249  		res.levelPlusOne = base.levelPlusOne
   250  	}
   251  	return res
   252  }
   253  
   254  type ctxInfoKey struct{}
   255  
   256  func withInfo(ctx context.Context, info ctxInfo) context.Context {
   257  	return context.WithValue(ctx, ctxInfoKey{}, info)
   258  }
   259  
   260  func infoFromCtx(ctx context.Context) ctxInfo {
   261  	res := ctx.Value(ctxInfoKey{})
   262  	if res == nil {
   263  		return ctxInfo{}
   264  	}
   265  	return res.(ctxInfo)
   266  }
   267  
   268  // WithReason creates a context that has an associated Reason (which ends up in
   269  // traces created under that context).
   270  func WithReason(ctx context.Context, reason Reason) context.Context {
   271  	info := infoFromCtx(ctx)
   272  	info.reason = reason
   273  	return withInfo(ctx, info)
   274  }
   275  
   276  // WithBlockType creates a context that has an associated BlockType (which ends up in
   277  // traces created under that context).
   278  func WithBlockType(ctx context.Context, blockType BlockType) context.Context {
   279  	info := infoFromCtx(ctx)
   280  	info.blockType = blockType
   281  	return withInfo(ctx, info)
   282  }
   283  
   284  // WithLevel creates a context that has an associated level (which ends up in
   285  // traces created under that context).
   286  func WithLevel(ctx context.Context, level int) context.Context {
   287  	info := infoFromCtx(ctx)
   288  	info.levelPlusOne = uint8(level) + 1
   289  	return withInfo(ctx, info)
   290  }
   291  
   292  const (
   293  	eventSize            = int(unsafe.Sizeof(Event{}))
   294  	targetEntriesPerFile = 256 * 1024 * 1024 / eventSize // 256MB files
   295  	eventsPerBuf         = 16
   296  	channelBufSize       = 512 * 1024 / eventsPerBuf // 512K events.
   297  	bytesPerFileSync     = 128 * 1024
   298  )
   299  
   300  type eventBuf struct {
   301  	events [eventsPerBuf]Event
   302  	num    int
   303  }
   304  
   305  type eventGenerator struct {
   306  	t           *Tracer
   307  	baseCtxInfo ctxInfo
   308  	buf         eventBuf
   309  }
   310  
   311  func makeEventGenerator(ctx context.Context, t *Tracer) eventGenerator {
   312  	return eventGenerator{
   313  		t:           t,
   314  		baseCtxInfo: infoFromCtx(ctx),
   315  	}
   316  }
   317  
   318  func (g *eventGenerator) flush() {
   319  	if g.buf.num > 0 {
   320  		g.t.workerDataCh <- g.buf
   321  		g.buf.num = 0
   322  	}
   323  }
   324  
   325  func (g *eventGenerator) add(ctx context.Context, e Event) {
   326  	e.StartUnixNano = time.Now().UnixNano()
   327  	info := infoFromCtx(ctx)
   328  	info = mergeCtxInfo(g.baseCtxInfo, info)
   329  	e.Reason = info.reason
   330  	e.BlockType = info.blockType
   331  	e.LevelPlusOne = info.levelPlusOne
   332  	if g.buf.num == eventsPerBuf {
   333  		g.flush()
   334  	}
   335  	g.buf.events[g.buf.num] = e
   336  	g.buf.num++
   337  }
   338  
   339  type workerState struct {
   340  	curFile          vfs.File
   341  	curBW            *bufio.Writer
   342  	numEntriesInFile int
   343  }
   344  
   345  func (t *Tracer) workerLoop() {
   346  	defer t.workerWait.Done()
   347  	stopCh := t.workerStopCh
   348  	dataCh := t.workerDataCh
   349  	var state workerState
   350  	t.workerNewFile(&state)
   351  	for {
   352  		select {
   353  		case <-stopCh:
   354  			close(dataCh)
   355  			// Flush any remaining traces.
   356  			for data := range dataCh {
   357  				t.workerWriteTraces(&state, data)
   358  			}
   359  			t.workerCloseFile(&state)
   360  			return
   361  
   362  		case data := <-dataCh:
   363  			t.workerWriteTraces(&state, data)
   364  		}
   365  	}
   366  }
   367  
   368  func (t *Tracer) workerWriteTraces(state *workerState, data eventBuf) {
   369  	if state.numEntriesInFile >= targetEntriesPerFile {
   370  		t.workerCloseFile(state)
   371  		t.workerNewFile(state)
   372  	}
   373  	state.numEntriesInFile += data.num
   374  	p := unsafe.Pointer(&data.events[0])
   375  	b := unsafe.Slice((*byte)(p), eventSize*data.num)
   376  	if _, err := state.curBW.Write(b); err != nil {
   377  		panic(err)
   378  	}
   379  }
   380  
   381  func (t *Tracer) workerNewFile(state *workerState) {
   382  	filename := fmt.Sprintf("IOTRACES-%s", time.Now().UTC().Format(time.RFC3339Nano))
   383  
   384  	file, err := t.fs.Create(t.fs.PathJoin(t.fsDir, filename))
   385  	if err != nil {
   386  		panic(err)
   387  	}
   388  	file = vfs.NewSyncingFile(file, vfs.SyncingFileOptions{
   389  		BytesPerSync: bytesPerFileSync,
   390  	})
   391  	state.curFile = file
   392  	state.curBW = bufio.NewWriter(file)
   393  	state.numEntriesInFile = 0
   394  }
   395  
   396  func (t *Tracer) workerCloseFile(state *workerState) {
   397  	if state.curFile != nil {
   398  		if err := state.curBW.Flush(); err != nil {
   399  			panic(err)
   400  		}
   401  		if err := state.curFile.Sync(); err != nil {
   402  			panic(err)
   403  		}
   404  		if err := state.curFile.Close(); err != nil {
   405  			panic(err)
   406  		}
   407  		state.curFile = nil
   408  		state.curBW = nil
   409  	}
   410  }