lab.nexedi.com/kirr/go123@v0.0.0-20240207185015-8299741fa871/tracing/tracetest/tracetest.go

lab.nexedi.com/kirr/go123@v0.0.0-20240207185015-8299741fa871/tracing/tracetest/tracetest.go (about)

     1  // Copyright (C) 2017-2021  Nexedi SA and Contributors.
     2  //                          Kirill Smelkov <kirr@nexedi.com>
     3  //
     4  // This program is free software: you can Use, Study, Modify and Redistribute
     5  // it under the terms of the GNU General Public License version 3, or (at your
     6  // option) any later version, as published by the Free Software Foundation.
     7  //
     8  // You can also Link and Combine this program with other software covered by
     9  // the terms of any of the Free Software licenses or any of the Open Source
    10  // Initiative approved licenses and Convey the resulting work. Corresponding
    11  // source of such a combination shall include the source code for all other
    12  // software used.
    13  //
    14  // This program is distributed WITHOUT ANY WARRANTY; without even the implied
    15  // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
    16  //
    17  // See COPYING file for full licensing terms.
    18  // See https://www.nexedi.com/licensing for rationale and options.
    19  
    20  // Package tracetest provides infrastructure for testing concurrent systems
    21  // based on synchronous event tracing.
    22  //
    23  // A serial system can be verified by checking that its execution produces
    24  // expected serial stream of events. But concurrent systems cannot be verified
    25  // by exactly this way because events are only partly-ordered with respect to
    26  // each other by causality or so called happens-before relation.
    27  //
    28  // However in a concurrent system one can decompose all events into serial
    29  // streams in which events should be strictly ordered by causality with respect
    30  // to each other. This decomposition in turn allows to verify that in every
    31  // stream events happenned as expected.
    32  //
    33  // Verification of events for all streams can be done by one *sequential*
    34  // process:
    35  //
    36  //   - if events A and B in different streams are unrelated to each other by
    37  //     causality, the sequence of checks models a particular possible flow of
    38  //     time. Notably since events are delivered synchronously and sender is
    39  //     blocked until receiver/checker explicitly confirms event has been
    40  //     processed, by checking either A then B, or B then A allows to check
    41  //     for a particular race-condition.
    42  //
    43  //   - if events A and B in different streams are related to each other by
    44  //     causality (i.e. there is some happens-before relation for them) the
    45  //     sequence of checking should represent that ordering relation.
    46  //
    47  // Basic package usage is as follows:
    48  //
    49  //	func TestSomething(t *testing.T) {
    50  //	    tracetest.Verify(t, func(t *tracetest.T) {
    51  //	        // setup tracing so that events of test system are collected and
    52  //	        // synchronously delivered to t.RxEvent. This can be done with e.g.
    53  //	        // package lab.nexedi.com/kirr/go123/tracing or by other similar means.
    54  //	        ...
    55  //
    56  //	        // tell t to which stream an event should go.
    57  //	        t.SetEventRouter(...)
    58  //
    59  //	        // run the system and verify it produces expected events
    60  //
    61  //	        // <code to start the system>
    62  //	        t.Expect("<stream₁>", eventOk₁)
    63  //	        t.Expect("<stream₂>", eventOk₂)
    64  //	        ...
    65  //
    66  //	        // <code to further control/affect the system>
    67  //	        t.Expect("<stream₃>", eventOk₃)
    68  //	        t.Expect("<stream₄>", eventOk₄)
    69  //	        ...
    70  //	    })
    71  //	}
    72  //
    73  // See example_test.go for more details.
    74  package tracetest
    75  
    76  // Note on detection of races
    77  //
    78  // Verify injects delays to empirically detect race conditions and if a test
    79  // incorrectly decomposed its system into serial streams: consider unrelated to
    80  // each other events A and B are incorrectly routed to the same channel. It
    81  // could be so happening that the order of checks on the test side is almost
    82  // always correct and so the error is not visible. However
    83  //
    84  //	if we add delays to delivery of either A or B
    85  //	and test both combinations
    86  //
    87  // we will for sure detect the error as, if A and B are indeed
    88  // unrelated, one of the delay combination will result in events
    89  // delivered to test in different to what it expects order.
    90  //
    91  // the time for delay could be taken as follows:
    92  //
    93  //	- run the test without delay; collect δt between events on particular stream
    94  //	- take delay = max(δt)·10
    95  //
    96  // to make sure there is indeed no different orderings possible on the
    97  // stream, rerun the test N(event-on-stream) times, and during i'th run
    98  // delay i'th event.
    99  //
   100  // See also on this topic:
   101  // http://www.1024cores.net/home/relacy-race-detector
   102  // http://www.1024cores.net/home/relacy-race-detector/rrd-introduction
   103  
   104  import (
   105  	"fmt"
   106  	"path/filepath"
   107  	"reflect"
   108  	"runtime"
   109  	"runtime/debug"
   110  	"sort"
   111  	"strings"
   112  	"sync"
   113  	"testing"
   114  	"time"
   115  
   116  	"github.com/kylelemons/godebug/pretty"
   117  
   118  	"lab.nexedi.com/kirr/go123/xruntime"
   119  )
   120  
   121  
   122  // _testing_TB is alias for testing.TB that is non-public when embedded into a struct.
   123  type _testing_TB = testing.TB
   124  
   125  // T is similar to testing.T and represents tracetest test environment.
   126  //
   127  // It is passed by Verify and Run to tested function.
   128  //
   129  // Besides testing.TB it provides
   130  //
   131  //	.RxEvent	-- to where events should be synchronously delivered by the test
   132  //	.SetEventRouter	-- to tell T to which stream an event should go
   133  //	.Expect		-- to assert expectation of an event on a stream
   134  type T struct {
   135  	_testing_TB
   136  
   137  	mu             sync.Mutex
   138  	streamTab      map[/*stream*/string]*_chan // where events on stream are delivered; set to nil on test shutdown
   139  	routeEvent     func(event interface{}) (stream string)
   140  	tracev         []eventTrace // record of events as they happen
   141  	delayInjectTab map[/*stream*/string]*delayInjectState
   142  
   143  	nakq []nak    // naks queued to be sent after Fatal
   144  	logq []string // queued log messages prepared in fatalfInNonMain
   145  }
   146  
   147  // eventTrace keeps information about one event T received via RxEvent.
   148  type eventTrace struct {
   149  	t      time.Time   // time of receive; monotonic
   150  	stream string
   151  	event  interface{}
   152  }
   153  
   154  // delayInjectState is used by delay-injector to find out for which event on a
   155  // stream a delay should be injected.
   156  type delayInjectState struct {
   157  	seqno   int           // current sequence number of event on this stream.
   158  	delayAt int           // event with `seqno == delayAt` will be delayed
   159  	delayT  time.Duration // by delayT time.
   160  }
   161  
   162  
   163  // Run runs f under tracetest environment.
   164  //
   165  // It is similar to Verify but f is ran only once.
   166  // Run does not check for race conditions.
   167  func Run(t testing.TB, f func(t *T)) {
   168  	run(t, f, nil)
   169  }
   170  
   171  // run serves Run and Verify: it creates T that wraps t, and runs f under T.
   172  func run(t testing.TB, f func(t *T), delayInjectTab map[string]*delayInjectState) *T {
   173  	tT := &T{
   174  		_testing_TB:    t,
   175  		streamTab:      make(map[string]*_chan),
   176  		delayInjectTab: delayInjectTab,
   177  	}
   178  
   179  	// verify in the end that no events are left unchecked / unconsumed,
   180  	// e.g. sent to RxEvent, but not received. Nak them if they are and fail.
   181  	//
   182  	// NOTE this complements T.Fatal and friends, because a test might
   183  	// think it completes successfully, but leaves unconsumed events behind it.
   184  	defer func() {
   185  		nnak := tT.closeStreamTab()
   186  		if nnak != 0 {
   187  			tT.Fail()
   188  		}
   189  		// log messages queued by fatalfInNonMain
   190  		for _, msg := range tT.logq {
   191  			// TODO try to log without hereby file:line, because msg
   192  			// already has file:line corresponding to logged event source location.
   193  			tT.Log(msg)
   194  		}
   195  	}()
   196  
   197  	f(tT)
   198  	return tT
   199  }
   200  
   201  // Verify verifies a test system.
   202  //
   203  // It runs f under T environment, catching race conditions, deadlocks and
   204  // unexpected events. f is rerun several times and should not alter its
   205  // behaviour from run to run.
   206  func Verify(t *testing.T, f func(t *T)) {
   207  	// run f once. This produces initial trace of events.
   208  	tT0 := run(t, f, nil)
   209  
   210  	// now, if f succeeds, verify f with injected delays.
   211  	if tT0.Failed() {
   212  		return
   213  	}
   214  
   215  	trace0 := tT0.tracev
   216  	if len(trace0) < 2 {
   217  		return
   218  	}
   219  	streams0 := streamsOfTrace(trace0)
   220  
   221  	// sort trace0 by time just in case - events might come from multiple
   222  	// CPUs simultaneously, and so for close events they might be added to
   223  	// tracev not in time order.
   224  	sort.Slice(trace0, func(i, j int) bool {
   225  		return trace0[i].t.Before(trace0[j].t)
   226  	})
   227  
   228  	// find out max(δt) in between events
   229  	var δtMax time.Duration
   230  	for i := 1; i < len(trace0); i++ {
   231  		δt := trace0[i].t.Sub(trace0[i-1].t)
   232  		if δt > δtMax {
   233  			δtMax = δt
   234  		}
   235  	}
   236  
   237  	// retest f with 10·δtMax delay injected at i'th event
   238  	delayT    := 10*δtMax            // TODO make sure it < deadTime
   239  	delayTmin := 10*time.Millisecond // make sure delayT ≥ 10ms
   240  	if delayT < delayTmin {
   241  		delayT = delayTmin
   242  	}
   243  	for i := 0; i < len(trace0); i++ {
   244  		// stream and on-stream sequence number for i'th global event
   245  		stream := trace0[i].stream
   246  		istream := -1
   247  		for j := 0; j <= i; j++ {
   248  			if trace0[j].stream == stream {
   249  				istream++
   250  			}
   251  		}
   252  
   253  		t.Run(fmt.Sprintf("delay@%d(=%s:%d)", i, stream, istream), func(t *testing.T) {
   254  			tT := run(t, f, map[string]*delayInjectState{
   255  				stream: &delayInjectState{
   256  					delayAt: istream,
   257  					delayT:  delayT,
   258  				},
   259  			})
   260  
   261  			// verify that streams are the same from run to run
   262  			if tT.Failed() {
   263  				return
   264  			}
   265  			streams := streamsOfTrace(tT.tracev)
   266  			if !reflect.DeepEqual(streams, streams0) {
   267  				tT.Fatalf("streams are not the same as in the first run:\n"+
   268  					  "first: %s\nnow:   %s\ndiff:\n%s\n\n",
   269  					  streams0, streams, pretty.Compare(streams0, streams))
   270  			}
   271  		})
   272  	}
   273  }
   274  
   275  
   276  // T overrides FailNow/Fatal/Fatalf to also cancel all in-progress sends.
   277  func (t *T) FailNow() {
   278  	t.Helper()
   279  	_ = t.closeStreamTab()
   280  	t._testing_TB.FailNow()
   281  }
   282  
   283  func (t *T) Fatal(argv ...interface{}) {
   284  	t.Helper()
   285  	t.Log(argv...)
   286  	t.FailNow()
   287  }
   288  
   289  func (t *T) Fatalf(format string, argv ...interface{}) {
   290  	t.Helper()
   291  	t.Logf(format, argv...)
   292  	t.FailNow()
   293  }
   294  
   295  // closeStreamTab prints details about pending events on streamTab, naks them
   296  // and closes all channels. It returns the number of naked messages.
   297  func (t *T) closeStreamTab() (nnak int) {
   298  	t.Helper()
   299  
   300  	// mark streamTab no longer operational
   301  	t.mu.Lock()
   302  	defer t.mu.Unlock()
   303  	streamTab := t.streamTab
   304  	t.streamTab = nil
   305  
   306  	if streamTab == nil {
   307  		return // already closed
   308  	}
   309  
   310  	// print details about pending events and all streams
   311  	type sendInfo struct{ch *_chan; msg *_Msg}
   312  	var sendv  []sendInfo // sends are pending here
   313  	var quietv []*_chan   // this channels are quiet
   314  
   315  	// order channels by name
   316  	var streams []string
   317  	for __ := range streamTab {
   318  		streams = append(streams, __)
   319  	}
   320  	sort.Slice(streams, func(i, j int) bool {
   321  		return strings.Compare(streams[i], streams[j]) < 0
   322  	})
   323  
   324  	for _, stream := range streams {
   325  		ch := streamTab[stream]
   326  		quiet := true
   327  
   328  		// check whether someone is sending on channels without blocking.
   329  	loop:	// loop because there might be several concurrent pending sends to particular channel.
   330  		for {
   331  			select {
   332  			case msg := <-ch.msgq:
   333  				sendv = append(sendv, sendInfo{ch, msg})
   334  				quiet = false
   335  			default:
   336  				break loop
   337  			}
   338  		}
   339  		// include ch.unsentv into pending as well (we want to show
   340  		// such events as pending even if corresponding send deadlocked).
   341  		for _, msg := range ch.unsentv {
   342  			sendv = append(sendv, sendInfo{ch, msg})
   343  			quiet = false
   344  		}
   345  
   346  		if quiet {
   347  			quietv = append(quietv, ch)
   348  		}
   349  	}
   350  
   351  	pending := fmt.Sprintf("test shutdown: #streams: %d,  #(pending events): %d\n", len(streams), len(sendv))
   352  	for _, __ := range sendv {
   353  		pending += fmt.Sprintf("%s\t<- %T %v\n", __.ch.name, __.msg.Event, __.msg.Event)
   354  	}
   355  	for _, ch := range quietv {
   356  		pending += fmt.Sprintf("# %s\n", ch.name)
   357  	}
   358  
   359  	// log the details and nak senders that we received from.
   360  	// nak them only after details printout, so that our text comes first,
   361  	// and their "panics" don't get intermixed with it.
   362  	t.Log(pending)
   363  	for _, __ := range t.nakq {
   364  		__.msg.nak(__.why)
   365  		nnak++
   366  	}
   367  	t.nakq = nil
   368  	for _, __ := range sendv {
   369  		__.msg.nak("canceled (test failed)")
   370  		nnak++
   371  	}
   372  	// in any case close channel where future Sends may arrive so that they will "panic" too.
   373  	for _, ch := range streamTab {
   374  		ch.Close()
   375  	}
   376  
   377  	return nnak
   378  }
   379  
   380  // streamsOfTrace returns sorted list of all streams present in a trace.
   381  func streamsOfTrace(tracev []eventTrace) []string {
   382  	streams := make(map[string]struct{})
   383  	for _, t := range tracev {
   384  		streams[t.stream] = struct{}{}
   385  	}
   386  	streamv := []string{}
   387  	for stream := range streams {
   388  		streamv = append(streamv, stream)
   389  	}
   390  	sort.Strings(streamv)
   391  	return streamv
   392  }
   393  
   394  
   395  // ---- events delivery + Expect ----
   396  
   397  
   398  // SetEventRouter tells t to which stream an event should go.
   399  //
   400  // It should be called not more than once.
   401  // Before SetEventRouter is called, all events go to "default" stream.
   402  func (t *T) SetEventRouter(routeEvent func(event interface{}) (stream string)) {
   403  	t.mu.Lock()
   404  	defer t.mu.Unlock()
   405  
   406  	if t.routeEvent != nil {
   407  		panic("double call to SetEventRouter")
   408  	}
   409  	t.routeEvent = routeEvent
   410  }
   411  
   412  
   413  // chanForStream returns channel corresponding to stream.
   414  // must be called under mu.
   415  func (t *T) chanForStream(stream string) *_chan {
   416  	if t.streamTab == nil {
   417  		return nil // t is no longer operational after e.g. deadlock
   418  	}
   419  
   420  	ch, ok := t.streamTab[stream]
   421  	if !ok {
   422  		ch = t.newChan(stream)
   423  		t.streamTab[stream] = ch
   424  	}
   425  	return ch
   426  }
   427  
   428  // RxEvent should be synchronously called from test system when an event occurs.
   429  //
   430  // The sequential process of the test system where event originated should be
   431  // paused until RxEvent returns. This requirement can be usually met via
   432  // inserting t.RxEvent() call into the code that produces the event.
   433  func (t *T) RxEvent(event interface{}) {
   434  	t0 := time.Now()
   435  	stream := ""
   436  	t.mu.Lock()
   437  	if t.routeEvent != nil {
   438  		stream = t.routeEvent(event)
   439  	}
   440  	if stream == "" {
   441  		stream = "default"
   442  	}
   443  	t.tracev = append(t.tracev, eventTrace{t0, stream, event})
   444  	ch := t.chanForStream(stream)
   445  
   446  	var delay time.Duration
   447  	d, ok := t.delayInjectTab[stream]
   448  	if ok {
   449  		if d.seqno == d.delayAt {
   450  			delay = d.delayT
   451  		}
   452  		d.seqno++
   453  	}
   454  
   455  	t.mu.Unlock()
   456  
   457  	if ch == nil {
   458  		t.fatalfInNonMain("%s: (pre)send: canceled (test failed)", stream)
   459  	}
   460  
   461  	if delay != 0 {
   462  		time.Sleep(delay)
   463  	}
   464  
   465  	ch.Send(event)
   466  }
   467  
   468  // xget1 gets 1 event in place and checks it has expected type
   469  //
   470  // if checks do not pass - fatal testing error is raised
   471  func (t *T) xget1(stream string, eventp interface{}) *_Msg {
   472  	t.Helper()
   473  
   474  	t.mu.Lock()
   475  	ch := t.chanForStream(stream)
   476  	t.mu.Unlock()
   477  
   478  	if ch == nil {
   479  		t.Fatalf("%s: recv: canceled (test failed)", stream)
   480  	}
   481  
   482  	return ch.RecvInto(eventp)
   483  }
   484  
   485  // Expect receives next event on stream and verifies it to be equal to eventOK.
   486  //
   487  // If check is successful ACK is sent back to event producer.
   488  // If check does not pass - fatal testing error is raised.
   489  func (t *T) Expect(stream string, eventOK interface{}) {
   490  	t.Helper()
   491  	msg := t.expect1(stream, eventOK)
   492  	msg.Ack()
   493  }
   494  
   495  // TODO ExpectNoACK? (then it would be possible to receive events from 2
   496  // streams; have those 2 processes paused and inspect their state. After
   497  // inspection unpause both)
   498  
   499  // TODO Recv? (to receive an event for which we don't know type or value yet)
   500  
   501  // TODO Select? (e.g. Select("a", "b") to fetch from either "a" or "b")
   502  
   503  // expect1 receives next event on stream and verifies it to be equal to eventOK (both type and value).
   504  //
   505  // if checks do not pass - fatal testing error is raised.
   506  func (t *T) expect1(stream string, eventExpect interface{}) *_Msg {
   507  	t.Helper()
   508  
   509  	reventExpect := reflect.ValueOf(eventExpect)
   510  
   511  	reventp := reflect.New(reventExpect.Type())
   512  	msg := t.xget1(stream, reventp.Interface())
   513  	revent := reventp.Elem()
   514  
   515  	if !reflect.DeepEqual(revent.Interface(), reventExpect.Interface()) {
   516  		t.queuenak(msg, "unexpected event data")
   517  		t.Fatalf("%s: expect: %s:\nwant: %v\nhave: %v\ndiff:\n%s\n\n",
   518  			stream,
   519  			reventExpect.Type(), reventExpect, revent,
   520  			pretty.Compare(reventExpect.Interface(), revent.Interface()))
   521  	}
   522  
   523  	return msg
   524  }
   525  
   526  // fatalfInNonMain should be called for fatal cases in non-main goroutines instead of panic.
   527  //
   528  // we don't panic because it will stop the process and prevent the main
   529  // goroutine to print detailed reason for e.g. deadlock or other error.
   530  func (t *T) fatalfInNonMain(format string, argv ...interface{}) {
   531  	t.Helper()
   532  
   533  	if !strings.HasSuffix(format, "\n") {
   534  		format += "\n"
   535  	}
   536  	msg := fmt.Sprintf(format, argv...)
   537  	msg += fmt.Sprintf("%s\n", debug.Stack())
   538  
   539  	// manually include file:line so that message is logged with correct
   540  	// location when emitted via logq.
   541  	// XXX t.Helper() not taken into account
   542  	f := xruntime.Traceback(2)[0] // XXX we need only first caller, not full traceback
   543  	msg = fmt.Sprintf("%s:%d: %s", filepath.Base(f.File), f.Line, msg)
   544  
   545  	// serialize fatal log+traceback printout, so that such printouts from
   546  	// multiple goroutines do not get intermixed.
   547  	t.mu.Lock()
   548  	defer t.mu.Unlock()
   549  
   550  	if t.streamTab == nil {
   551  		// t is over -> log directly.
   552  		// make sure to prefix log message the same way as would be
   553  		// done when messages are logged via .logq .
   554  		t.logFromTracetest_go(msg)
   555  	} else {
   556  		// remember msg to be logged when t is done so that non-main
   557  		// log output always come after main printout. The messages
   558  		// won't be intermixed because t.Log is serialized internally.
   559  		t.logq = append(t.logq, msg)
   560  	}
   561  
   562  	t.Fail()
   563  	runtime.Goexit()
   564  }
   565  
   566  // logFromTracetest_go calls t.Log without wrapping it with t.Helper().
   567  //
   568  // as the result the message is prefixed with tracetest.go:<LINE>, not the
   569  // location of fatalfInNonMain caller.
   570  func (t *T) logFromTracetest_go(msg string) {
   571  	t.Log(msg)
   572  }