
     1  // Copyright 2022 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     5  //go:build goexperiment.pagetrace
     7  // Page tracer.
     8  //
     9  // This file contains an implementation of page trace instrumentation for tracking
    10  // the way the Go runtime manages pages of memory. The trace may be enabled at program
    11  // startup with the GODEBUG option pagetrace.
    12  //
    13  // Each page trace event is either 8 or 16 bytes wide. The first
    14  // 8 bytes follow this format for non-sync events:
    15  //
    16  //     [16 timestamp delta][35 base address][10 npages][1 isLarge][2 pageTraceEventType]
    17  //
    18  // If the "large" bit is set then the event is 16 bytes wide with the second 8 byte word
    19  // containing the full npages value (the npages bitfield is 0).
    20  //
    21  // The base address's bottom pageShift bits are always zero hence why we can pack other
    22  // data in there. We ignore the top 16 bits, assuming a 48 bit address space for the
    23  // heap.
    24  //
    25  // The timestamp delta is computed from the difference between the current nanotime
    26  // timestamp and the last sync event's timestamp. The bottom pageTraceTimeLostBits of
    27  // this delta is removed and only the next pageTraceTimeDeltaBits are kept.
    28  //
    29  // A sync event is emitted at the beginning of each trace buffer and whenever the
    30  // timestamp delta would not fit in an event.
    31  //
    32  // Sync events have the following structure:
    33  //
    34  //    [61 timestamp or P ID][1 isPID][2 pageTraceSyncEvent]
    35  //
// In essence, the "large" bit is repurposed to indicate whether it's a timestamp or a P ID
    37  // (these are typically uint32). Note that we only have 61 bits for the 64-bit timestamp,
    38  // but like for the delta we drop the bottom pageTraceTimeLostBits here as well.
    40  package runtime
    42  import (
    43  	"runtime/internal/sys"
    44  	"unsafe"
    45  )
    47  // pageTraceAlloc records a page trace allocation event.
    48  // pp may be nil. Call only if debug.pagetracefd != 0.
    49  //
    50  // Must run on the system stack as a crude way to prevent preemption.
    51  //
    52  //go:systemstack
    53  func pageTraceAlloc(pp *p, now int64, base, npages uintptr) {
    54  	if pageTrace.enabled {
    55  		if now == 0 {
    56  			now = nanotime()
    57  		}
    58  		pageTraceEmit(pp, now, base, npages, pageTraceAllocEvent)
    59  	}
    60  }
    62  // pageTraceFree records a page trace free event.
    63  // pp may be nil. Call only if debug.pagetracefd != 0.
    64  //
    65  // Must run on the system stack as a crude way to prevent preemption.
    66  //
    67  //go:systemstack
    68  func pageTraceFree(pp *p, now int64, base, npages uintptr) {
    69  	if pageTrace.enabled {
    70  		if now == 0 {
    71  			now = nanotime()
    72  		}
    73  		pageTraceEmit(pp, now, base, npages, pageTraceFreeEvent)
    74  	}
    75  }
    77  // pageTraceScav records a page trace scavenge event.
    78  // pp may be nil. Call only if debug.pagetracefd != 0.
    79  //
    80  // Must run on the system stack as a crude way to prevent preemption.
    81  //
    82  //go:systemstack
    83  func pageTraceScav(pp *p, now int64, base, npages uintptr) {
    84  	if pageTrace.enabled {
    85  		if now == 0 {
    86  			now = nanotime()
    87  		}
    88  		pageTraceEmit(pp, now, base, npages, pageTraceScavEvent)
    89  	}
    90  }
    92  // pageTraceEventType is a page trace event type.
    93  type pageTraceEventType uint8
    95  const (
    96  	pageTraceSyncEvent  pageTraceEventType = iota // Timestamp emission.
    97  	pageTraceAllocEvent                           // Allocation of pages.
    98  	pageTraceFreeEvent                            // Freeing pages.
    99  	pageTraceScavEvent                            // Scavenging pages.
   100  )
   102  // pageTraceEmit emits a page trace event.
   103  //
   104  // Must run on the system stack as a crude way to prevent preemption.
   105  //
   106  //go:systemstack
   107  func pageTraceEmit(pp *p, now int64, base, npages uintptr, typ pageTraceEventType) {
   108  	// Get a buffer.
   109  	var tbp *pageTraceBuf
   110  	pid := int32(-1)
   111  	if pp == nil {
   112  		// We have no P, so take the global buffer.
   113  		lock(&pageTrace.lock)
   114  		tbp = &pageTrace.buf
   115  	} else {
   116  		tbp = &pp.pageTraceBuf
   117  		pid =
   118  	}
   120  	// Initialize the buffer if necessary.
   121  	tb := *tbp
   122  	if tb.buf == nil {
   123  		tb.buf = (*pageTraceEvents)(sysAlloc(pageTraceBufSize, &memstats.other_sys))
   124  		tb = tb.writePid(pid)
   125  	}
   127  	// Handle timestamp and emit a sync event if necessary.
   128  	if now < tb.timeBase {
   129  		now = tb.timeBase
   130  	}
   131  	if now-tb.timeBase >= pageTraceTimeMaxDelta {
   132  		tb.timeBase = now
   133  		tb = tb.writeSync(pid)
   134  	}
   136  	// Emit the event.
   137  	tb = tb.writeEvent(pid, now, base, npages, typ)
   139  	// Write back the buffer.
   140  	*tbp = tb
   141  	if pp == nil {
   142  		unlock(&pageTrace.lock)
   143  	}
   144  }
   146  const (
   147  	pageTraceBufSize = 32 << 10
   149  	// These constants describe the per-event timestamp delta encoding.
   150  	pageTraceTimeLostBits  = 7  // How many bits of precision we lose in the delta.
   151  	pageTraceTimeDeltaBits = 16 // Size of the delta in bits.
   152  	pageTraceTimeMaxDelta  = 1 << (pageTraceTimeLostBits + pageTraceTimeDeltaBits)
   153  )
// pageTraceEvents is the low-level buffer containing the trace data.
//
// It holds pageTraceBufSize bytes worth of 8-byte event words. Instances are
// obtained with sysAlloc (see pageTraceEmit), not from the Go heap.
type pageTraceEvents struct {
	_      sys.NotInHeap
	events [pageTraceBufSize / 8]uint64
}
// pageTraceBuf is a wrapper around pageTraceEvents that knows how to write events
// to the buffer. It tracks state necessary to do so.
//
// Its methods take the receiver by value and return the updated state, which
// the caller is expected to store back.
type pageTraceBuf struct {
	buf      *pageTraceEvents // Backing storage; nil until pageTraceEmit allocates it.
	len      int              // How many 8-byte words have been written so far (a large event takes two).
	timeBase int64            // The current timestamp base from which deltas are produced.
	finished bool             // Whether this trace buf should no longer flush anything out.
}
   170  // writePid writes a P ID event indicating which P we're running on.
   171  //
   172  // Assumes there's always space in the buffer since this is only called at the
   173  // beginning of a new buffer.
   174  //
   175  // Must run on the system stack as a crude way to prevent preemption.
   176  //
   177  //go:systemstack
   178  func (tb pageTraceBuf) writePid(pid int32) pageTraceBuf {
   179  	e := uint64(int64(pid))<<3 | 0b100 | uint64(pageTraceSyncEvent)
   180[tb.len] = e
   181  	tb.len++
   182  	return tb
   183  }
   185  // writeSync writes a sync event, which is just a timestamp. Handles flushing.
   186  //
   187  // Must run on the system stack as a crude way to prevent preemption.
   188  //
   189  //go:systemstack
   190  func (tb pageTraceBuf) writeSync(pid int32) pageTraceBuf {
   191  	if tb.len+1 > len( {
   192  		// N.B. flush will writeSync again.
   193  		return tb.flush(pid, tb.timeBase)
   194  	}
   195  	e := ((uint64(tb.timeBase) >> pageTraceTimeLostBits) << 3) | uint64(pageTraceSyncEvent)
   196[tb.len] = e
   197  	tb.len++
   198  	return tb
   199  }
   201  // writeEvent handles writing all non-sync and non-pid events. Handles flushing if necessary.
   202  //
   203  // pid indicates the P we're currently running on. Necessary in case we need to flush.
   204  // now is the current nanotime timestamp.
   205  // base is the base address of whatever group of pages this event is happening to.
   206  // npages is the length of the group of pages this event is happening to.
   207  // typ is the event that's happening to these pages.
   208  //
   209  // Must run on the system stack as a crude way to prevent preemption.
   210  //
   211  //go:systemstack
   212  func (tb pageTraceBuf) writeEvent(pid int32, now int64, base, npages uintptr, typ pageTraceEventType) pageTraceBuf {
   213  	large := 0
   214  	np := npages
   215  	if npages >= 1024 {
   216  		large = 1
   217  		np = 0
   218  	}
   219  	if tb.len+1+large > len( {
   220  		tb = tb.flush(pid, now)
   221  	}
   222  	if base%pageSize != 0 {
   223  		throw("base address not page aligned")
   224  	}
   225  	e := uint64(base)
   226  	// The pageShift low-order bits are zero.
   227  	e |= uint64(typ)        // 2 bits
   228  	e |= uint64(large) << 2 // 1 bit
   229  	e |= uint64(np) << 3    // 10 bits
   230  	// Write the timestamp delta in the upper pageTraceTimeDeltaBits.
   231  	e |= uint64((now-tb.timeBase)>>pageTraceTimeLostBits) << (64 - pageTraceTimeDeltaBits)
   232[tb.len] = e
   233  	if large != 0 {
   234  		// npages doesn't fit in 10 bits, so write an additional word with that data.
   235[tb.len+1] = uint64(npages)
   236  	}
   237  	tb.len += 1 + large
   238  	return tb
   239  }
   241  // flush writes out the contents of the buffer to pageTrace.fd and resets the buffer.
   242  // It then writes out a P ID event and the first sync event for the new buffer.
   243  //
   244  // Must run on the system stack as a crude way to prevent preemption.
   245  //
   246  //go:systemstack
   247  func (tb pageTraceBuf) flush(pid int32, now int64) pageTraceBuf {
   248  	if !tb.finished {
   249  		lock(&pageTrace.fdLock)
   250  		writeFull(uintptr(pageTrace.fd), (*byte)(unsafe.Pointer(&[0])), tb.len*8)
   251  		unlock(&pageTrace.fdLock)
   252  	}
   253  	tb.len = 0
   254  	tb.timeBase = now
   255  	return tb.writePid(pid).writeSync(pid)
   256  }
// pageTrace holds the global page tracing state.
var pageTrace struct {
	// enabled indicates whether tracing is enabled. If true, fd >= 0.
	//
	// Safe to read without synchronization because it's only set once
	// at program initialization. finishPageTrace also clears it, on a
	// best-effort basis.
	enabled bool

	// buf is the page trace buffer used if there is no P.
	//
	// lock protects buf.
	lock mutex
	buf  pageTraceBuf

	// fdLock protects writing to fd.
	//
	// fd is the file to write the page trace to. It is -1 when no trace
	// file is open.
	fdLock mutex
	fd     int32
}
   278  // initPageTrace initializes the page tracing infrastructure from GODEBUG.
   279  //
   280  // env must be the value of the GODEBUG environment variable.
   281  func initPageTrace(env string) {
   282  	var value string
   283  	for env != "" {
   284  		elt, rest := env, ""
   285  		for i := 0; i < len(env); i++ {
   286  			if env[i] == ',' {
   287  				elt, rest = env[:i], env[i+1:]
   288  				break
   289  			}
   290  		}
   291  		env = rest
   292  		if hasPrefix(elt, "pagetrace=") {
   293  			value = elt[len("pagetrace="):]
   294  			break
   295  		}
   296  	}
   297  	pageTrace.fd = -1
   298  	if canCreateFile && value != "" {
   299  		var tmp [4096]byte
   300  		if len(value) != 0 && len(value) < 4096 {
   301  			copy(tmp[:], value)
   302  			pageTrace.fd = create(&tmp[0], 0o664)
   303  		}
   304  	}
   305  	pageTrace.enabled = pageTrace.fd >= 0
   306  }
   308  // finishPageTrace flushes all P's trace buffers and disables page tracing.
   309  func finishPageTrace() {
   310  	if !pageTrace.enabled {
   311  		return
   312  	}
   313  	// Grab worldsema as we're about to execute a ragged barrier.
   314  	semacquire(&worldsema)
   315  	systemstack(func() {
   316  		// Disable tracing. This isn't strictly necessary and it's best-effort.
   317  		pageTrace.enabled = false
   319  		// Execute a ragged barrier, flushing each trace buffer.
   320  		forEachP(func(pp *p) {
   321  			if pp.pageTraceBuf.buf != nil {
   322  				pp.pageTraceBuf = pp.pageTraceBuf.flush(, nanotime())
   323  			}
   324  			pp.pageTraceBuf.finished = true
   325  		})
   327  		// Write the global have-no-P buffer.
   328  		lock(&pageTrace.lock)
   329  		if pageTrace.buf.buf != nil {
   330  			pageTrace.buf = pageTrace.buf.flush(-1, nanotime())
   331  		}
   332  		pageTrace.buf.finished = true
   333  		unlock(&pageTrace.lock)
   335  		// Safely close the file as nothing else should be allowed to write to the fd.
   336  		lock(&pageTrace.fdLock)
   337  		closefd(pageTrace.fd)
   338  		pageTrace.fd = -1
   339  		unlock(&pageTrace.fdLock)
   340  	})
   341  	semrelease(&worldsema)
   342  }
   344  // writeFull ensures that a complete write of bn bytes from b is made to fd.
   345  func writeFull(fd uintptr, b *byte, bn int) {
   346  	for bn > 0 {
   347  		n := write(fd, unsafe.Pointer(b), int32(bn))
   348  		if n == -_EINTR || n == -_EAGAIN {
   349  			continue
   350  		}
   351  		if n < 0 {
   352  			print("errno=", -n, "\n")
   353  			throw("writeBytes: bad write")
   354  		}
   355  		bn -= int(n)
   356  		b = addb(b, uintptr(n))
   357  	}
   358  }