lab.nexedi.com/kirr/go123@v0.0.0-20240207185015-8299741fa871/tracing/tracetest/tracetest.go (about) 1 // Copyright (C) 2017-2021 Nexedi SA and Contributors. 2 // Kirill Smelkov <kirr@nexedi.com> 3 // 4 // This program is free software: you can Use, Study, Modify and Redistribute 5 // it under the terms of the GNU General Public License version 3, or (at your 6 // option) any later version, as published by the Free Software Foundation. 7 // 8 // You can also Link and Combine this program with other software covered by 9 // the terms of any of the Free Software licenses or any of the Open Source 10 // Initiative approved licenses and Convey the resulting work. Corresponding 11 // source of such a combination shall include the source code for all other 12 // software used. 13 // 14 // This program is distributed WITHOUT ANY WARRANTY; without even the implied 15 // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 16 // 17 // See COPYING file for full licensing terms. 18 // See https://www.nexedi.com/licensing for rationale and options. 19 20 // Package tracetest provides infrastructure for testing concurrent systems 21 // based on synchronous event tracing. 22 // 23 // A serial system can be verified by checking that its execution produces 24 // expected serial stream of events. But concurrent systems cannot be verified 25 // by exactly this way because events are only partly-ordered with respect to 26 // each other by causality or so called happens-before relation. 27 // 28 // However in a concurrent system one can decompose all events into serial 29 // streams in which events should be strictly ordered by causality with respect 30 // to each other. This decomposition in turn allows to verify that in every 31 // stream events happenned as expected. 32 // 33 // Verification of events for all streams can be done by one *sequential* 34 // process: 35 // 36 // - if events A and B in different streams are unrelated to each other by 37 // causality, the sequence of checks models a particular possible flow of 38 // time. Notably since events are delivered synchronously and sender is 39 // blocked until receiver/checker explicitly confirms event has been 40 // processed, by checking either A then B, or B then A allows to check 41 // for a particular race-condition. 42 // 43 // - if events A and B in different streams are related to each other by 44 // causality (i.e. there is some happens-before relation for them) the 45 // sequence of checking should represent that ordering relation. 46 // 47 // Basic package usage is as follows: 48 // 49 // func TestSomething(t *testing.T) { 50 // tracetest.Verify(t, func(t *tracetest.T) { 51 // // setup tracing so that events of test system are collected and 52 // // synchronously delivered to t.RxEvent. This can be done with e.g. 53 // // package lab.nexedi.com/kirr/go123/tracing or by other similar means. 54 // ... 55 // 56 // // tell t to which stream an event should go. 57 // t.SetEventRouter(...) 58 // 59 // // run the system and verify it produces expected events 60 // 61 // // <code to start the system> 62 // t.Expect("<stream₁>", eventOk₁) 63 // t.Expect("<stream₂>", eventOk₂) 64 // ... 65 // 66 // // <code to further control/affect the system> 67 // t.Expect("<stream₃>", eventOk₃) 68 // t.Expect("<stream₄>", eventOk₄) 69 // ... 70 // }) 71 // } 72 // 73 // See example_test.go for more details. 74 package tracetest 75 76 // Note on detection of races 77 // 78 // Verify injects delays to empirically detect race conditions and if a test 79 // incorrectly decomposed its system into serial streams: consider unrelated to 80 // each other events A and B are incorrectly routed to the same channel. It 81 // could be so happening that the order of checks on the test side is almost 82 // always correct and so the error is not visible. However 83 // 84 // if we add delays to delivery of either A or B 85 // and test both combinations 86 // 87 // we will for sure detect the error as, if A and B are indeed 88 // unrelated, one of the delay combination will result in events 89 // delivered to test in different to what it expects order. 90 // 91 // the time for delay could be taken as follows: 92 // 93 // - run the test without delay; collect δt between events on particular stream 94 // - take delay = max(δt)·10 95 // 96 // to make sure there is indeed no different orderings possible on the 97 // stream, rerun the test N(event-on-stream) times, and during i'th run 98 // delay i'th event. 99 // 100 // See also on this topic: 101 // http://www.1024cores.net/home/relacy-race-detector 102 // http://www.1024cores.net/home/relacy-race-detector/rrd-introduction 103 104 import ( 105 "fmt" 106 "path/filepath" 107 "reflect" 108 "runtime" 109 "runtime/debug" 110 "sort" 111 "strings" 112 "sync" 113 "testing" 114 "time" 115 116 "github.com/kylelemons/godebug/pretty" 117 118 "lab.nexedi.com/kirr/go123/xruntime" 119 ) 120 121 122 // _testing_TB is alias for testing.TB that is non-public when embedded into a struct. 123 type _testing_TB = testing.TB 124 125 // T is similar to testing.T and represents tracetest test environment. 126 // 127 // It is passed by Verify and Run to tested function. 128 // 129 // Besides testing.TB it provides 130 // 131 // .RxEvent -- to where events should be synchronously delivered by the test 132 // .SetEventRouter -- to tell T to which stream an event should go 133 // .Expect -- to assert expectation of an event on a stream 134 type T struct { 135 _testing_TB 136 137 mu sync.Mutex 138 streamTab map[/*stream*/string]*_chan // where events on stream are delivered; set to nil on test shutdown 139 routeEvent func(event interface{}) (stream string) 140 tracev []eventTrace // record of events as they happen 141 delayInjectTab map[/*stream*/string]*delayInjectState 142 143 nakq []nak // naks queued to be sent after Fatal 144 logq []string // queued log messages prepared in fatalfInNonMain 145 } 146 147 // eventTrace keeps information about one event T received via RxEvent. 148 type eventTrace struct { 149 t time.Time // time of receive; monotonic 150 stream string 151 event interface{} 152 } 153 154 // delayInjectState is used by delay-injector to find out for which event on a 155 // stream a delay should be injected. 156 type delayInjectState struct { 157 seqno int // current sequence number of event on this stream. 158 delayAt int // event with `seqno == delayAt` will be delayed 159 delayT time.Duration // by delayT time. 160 } 161 162 163 // Run runs f under tracetest environment. 164 // 165 // It is similar to Verify but f is ran only once. 166 // Run does not check for race conditions. 167 func Run(t testing.TB, f func(t *T)) { 168 run(t, f, nil) 169 } 170 171 // run serves Run and Verify: it creates T that wraps t, and runs f under T. 172 func run(t testing.TB, f func(t *T), delayInjectTab map[string]*delayInjectState) *T { 173 tT := &T{ 174 _testing_TB: t, 175 streamTab: make(map[string]*_chan), 176 delayInjectTab: delayInjectTab, 177 } 178 179 // verify in the end that no events are left unchecked / unconsumed, 180 // e.g. sent to RxEvent, but not received. Nak them if they are and fail. 181 // 182 // NOTE this complements T.Fatal and friends, because a test might 183 // think it completes successfully, but leaves unconsumed events behind it. 184 defer func() { 185 nnak := tT.closeStreamTab() 186 if nnak != 0 { 187 tT.Fail() 188 } 189 // log messages queued by fatalfInNonMain 190 for _, msg := range tT.logq { 191 // TODO try to log without hereby file:line, because msg 192 // already has file:line corresponding to logged event source location. 193 tT.Log(msg) 194 } 195 }() 196 197 f(tT) 198 return tT 199 } 200 201 // Verify verifies a test system. 202 // 203 // It runs f under T environment, catching race conditions, deadlocks and 204 // unexpected events. f is rerun several times and should not alter its 205 // behaviour from run to run. 206 func Verify(t *testing.T, f func(t *T)) { 207 // run f once. This produces initial trace of events. 208 tT0 := run(t, f, nil) 209 210 // now, if f succeeds, verify f with injected delays. 211 if tT0.Failed() { 212 return 213 } 214 215 trace0 := tT0.tracev 216 if len(trace0) < 2 { 217 return 218 } 219 streams0 := streamsOfTrace(trace0) 220 221 // sort trace0 by time just in case - events might come from multiple 222 // CPUs simultaneously, and so for close events they might be added to 223 // tracev not in time order. 224 sort.Slice(trace0, func(i, j int) bool { 225 return trace0[i].t.Before(trace0[j].t) 226 }) 227 228 // find out max(δt) in between events 229 var δtMax time.Duration 230 for i := 1; i < len(trace0); i++ { 231 δt := trace0[i].t.Sub(trace0[i-1].t) 232 if δt > δtMax { 233 δtMax = δt 234 } 235 } 236 237 // retest f with 10·δtMax delay injected at i'th event 238 delayT := 10*δtMax // TODO make sure it < deadTime 239 delayTmin := 10*time.Millisecond // make sure delayT ≥ 10ms 240 if delayT < delayTmin { 241 delayT = delayTmin 242 } 243 for i := 0; i < len(trace0); i++ { 244 // stream and on-stream sequence number for i'th global event 245 stream := trace0[i].stream 246 istream := -1 247 for j := 0; j <= i; j++ { 248 if trace0[j].stream == stream { 249 istream++ 250 } 251 } 252 253 t.Run(fmt.Sprintf("delay@%d(=%s:%d)", i, stream, istream), func(t *testing.T) { 254 tT := run(t, f, map[string]*delayInjectState{ 255 stream: &delayInjectState{ 256 delayAt: istream, 257 delayT: delayT, 258 }, 259 }) 260 261 // verify that streams are the same from run to run 262 if tT.Failed() { 263 return 264 } 265 streams := streamsOfTrace(tT.tracev) 266 if !reflect.DeepEqual(streams, streams0) { 267 tT.Fatalf("streams are not the same as in the first run:\n"+ 268 "first: %s\nnow: %s\ndiff:\n%s\n\n", 269 streams0, streams, pretty.Compare(streams0, streams)) 270 } 271 }) 272 } 273 } 274 275 276 // T overrides FailNow/Fatal/Fatalf to also cancel all in-progress sends. 277 func (t *T) FailNow() { 278 t.Helper() 279 _ = t.closeStreamTab() 280 t._testing_TB.FailNow() 281 } 282 283 func (t *T) Fatal(argv ...interface{}) { 284 t.Helper() 285 t.Log(argv...) 286 t.FailNow() 287 } 288 289 func (t *T) Fatalf(format string, argv ...interface{}) { 290 t.Helper() 291 t.Logf(format, argv...) 292 t.FailNow() 293 } 294 295 // closeStreamTab prints details about pending events on streamTab, naks them 296 // and closes all channels. It returns the number of naked messages. 297 func (t *T) closeStreamTab() (nnak int) { 298 t.Helper() 299 300 // mark streamTab no longer operational 301 t.mu.Lock() 302 defer t.mu.Unlock() 303 streamTab := t.streamTab 304 t.streamTab = nil 305 306 if streamTab == nil { 307 return // already closed 308 } 309 310 // print details about pending events and all streams 311 type sendInfo struct{ch *_chan; msg *_Msg} 312 var sendv []sendInfo // sends are pending here 313 var quietv []*_chan // this channels are quiet 314 315 // order channels by name 316 var streams []string 317 for __ := range streamTab { 318 streams = append(streams, __) 319 } 320 sort.Slice(streams, func(i, j int) bool { 321 return strings.Compare(streams[i], streams[j]) < 0 322 }) 323 324 for _, stream := range streams { 325 ch := streamTab[stream] 326 quiet := true 327 328 // check whether someone is sending on channels without blocking. 329 loop: // loop because there might be several concurrent pending sends to particular channel. 330 for { 331 select { 332 case msg := <-ch.msgq: 333 sendv = append(sendv, sendInfo{ch, msg}) 334 quiet = false 335 default: 336 break loop 337 } 338 } 339 // include ch.unsentv into pending as well (we want to show 340 // such events as pending even if corresponding send deadlocked). 341 for _, msg := range ch.unsentv { 342 sendv = append(sendv, sendInfo{ch, msg}) 343 quiet = false 344 } 345 346 if quiet { 347 quietv = append(quietv, ch) 348 } 349 } 350 351 pending := fmt.Sprintf("test shutdown: #streams: %d, #(pending events): %d\n", len(streams), len(sendv)) 352 for _, __ := range sendv { 353 pending += fmt.Sprintf("%s\t<- %T %v\n", __.ch.name, __.msg.Event, __.msg.Event) 354 } 355 for _, ch := range quietv { 356 pending += fmt.Sprintf("# %s\n", ch.name) 357 } 358 359 // log the details and nak senders that we received from. 360 // nak them only after details printout, so that our text comes first, 361 // and their "panics" don't get intermixed with it. 362 t.Log(pending) 363 for _, __ := range t.nakq { 364 __.msg.nak(__.why) 365 nnak++ 366 } 367 t.nakq = nil 368 for _, __ := range sendv { 369 __.msg.nak("canceled (test failed)") 370 nnak++ 371 } 372 // in any case close channel where future Sends may arrive so that they will "panic" too. 373 for _, ch := range streamTab { 374 ch.Close() 375 } 376 377 return nnak 378 } 379 380 // streamsOfTrace returns sorted list of all streams present in a trace. 381 func streamsOfTrace(tracev []eventTrace) []string { 382 streams := make(map[string]struct{}) 383 for _, t := range tracev { 384 streams[t.stream] = struct{}{} 385 } 386 streamv := []string{} 387 for stream := range streams { 388 streamv = append(streamv, stream) 389 } 390 sort.Strings(streamv) 391 return streamv 392 } 393 394 395 // ---- events delivery + Expect ---- 396 397 398 // SetEventRouter tells t to which stream an event should go. 399 // 400 // It should be called not more than once. 401 // Before SetEventRouter is called, all events go to "default" stream. 402 func (t *T) SetEventRouter(routeEvent func(event interface{}) (stream string)) { 403 t.mu.Lock() 404 defer t.mu.Unlock() 405 406 if t.routeEvent != nil { 407 panic("double call to SetEventRouter") 408 } 409 t.routeEvent = routeEvent 410 } 411 412 413 // chanForStream returns channel corresponding to stream. 414 // must be called under mu. 415 func (t *T) chanForStream(stream string) *_chan { 416 if t.streamTab == nil { 417 return nil // t is no longer operational after e.g. deadlock 418 } 419 420 ch, ok := t.streamTab[stream] 421 if !ok { 422 ch = t.newChan(stream) 423 t.streamTab[stream] = ch 424 } 425 return ch 426 } 427 428 // RxEvent should be synchronously called from test system when an event occurs. 429 // 430 // The sequential process of the test system where event originated should be 431 // paused until RxEvent returns. This requirement can be usually met via 432 // inserting t.RxEvent() call into the code that produces the event. 433 func (t *T) RxEvent(event interface{}) { 434 t0 := time.Now() 435 stream := "" 436 t.mu.Lock() 437 if t.routeEvent != nil { 438 stream = t.routeEvent(event) 439 } 440 if stream == "" { 441 stream = "default" 442 } 443 t.tracev = append(t.tracev, eventTrace{t0, stream, event}) 444 ch := t.chanForStream(stream) 445 446 var delay time.Duration 447 d, ok := t.delayInjectTab[stream] 448 if ok { 449 if d.seqno == d.delayAt { 450 delay = d.delayT 451 } 452 d.seqno++ 453 } 454 455 t.mu.Unlock() 456 457 if ch == nil { 458 t.fatalfInNonMain("%s: (pre)send: canceled (test failed)", stream) 459 } 460 461 if delay != 0 { 462 time.Sleep(delay) 463 } 464 465 ch.Send(event) 466 } 467 468 // xget1 gets 1 event in place and checks it has expected type 469 // 470 // if checks do not pass - fatal testing error is raised 471 func (t *T) xget1(stream string, eventp interface{}) *_Msg { 472 t.Helper() 473 474 t.mu.Lock() 475 ch := t.chanForStream(stream) 476 t.mu.Unlock() 477 478 if ch == nil { 479 t.Fatalf("%s: recv: canceled (test failed)", stream) 480 } 481 482 return ch.RecvInto(eventp) 483 } 484 485 // Expect receives next event on stream and verifies it to be equal to eventOK. 486 // 487 // If check is successful ACK is sent back to event producer. 488 // If check does not pass - fatal testing error is raised. 489 func (t *T) Expect(stream string, eventOK interface{}) { 490 t.Helper() 491 msg := t.expect1(stream, eventOK) 492 msg.Ack() 493 } 494 495 // TODO ExpectNoACK? (then it would be possible to receive events from 2 496 // streams; have those 2 processes paused and inspect their state. After 497 // inspection unpause both) 498 499 // TODO Recv? (to receive an event for which we don't know type or value yet) 500 501 // TODO Select? (e.g. Select("a", "b") to fetch from either "a" or "b") 502 503 // expect1 receives next event on stream and verifies it to be equal to eventOK (both type and value). 504 // 505 // if checks do not pass - fatal testing error is raised. 506 func (t *T) expect1(stream string, eventExpect interface{}) *_Msg { 507 t.Helper() 508 509 reventExpect := reflect.ValueOf(eventExpect) 510 511 reventp := reflect.New(reventExpect.Type()) 512 msg := t.xget1(stream, reventp.Interface()) 513 revent := reventp.Elem() 514 515 if !reflect.DeepEqual(revent.Interface(), reventExpect.Interface()) { 516 t.queuenak(msg, "unexpected event data") 517 t.Fatalf("%s: expect: %s:\nwant: %v\nhave: %v\ndiff:\n%s\n\n", 518 stream, 519 reventExpect.Type(), reventExpect, revent, 520 pretty.Compare(reventExpect.Interface(), revent.Interface())) 521 } 522 523 return msg 524 } 525 526 // fatalfInNonMain should be called for fatal cases in non-main goroutines instead of panic. 527 // 528 // we don't panic because it will stop the process and prevent the main 529 // goroutine to print detailed reason for e.g. deadlock or other error. 530 func (t *T) fatalfInNonMain(format string, argv ...interface{}) { 531 t.Helper() 532 533 if !strings.HasSuffix(format, "\n") { 534 format += "\n" 535 } 536 msg := fmt.Sprintf(format, argv...) 537 msg += fmt.Sprintf("%s\n", debug.Stack()) 538 539 // manually include file:line so that message is logged with correct 540 // location when emitted via logq. 541 // XXX t.Helper() not taken into account 542 f := xruntime.Traceback(2)[0] // XXX we need only first caller, not full traceback 543 msg = fmt.Sprintf("%s:%d: %s", filepath.Base(f.File), f.Line, msg) 544 545 // serialize fatal log+traceback printout, so that such printouts from 546 // multiple goroutines do not get intermixed. 547 t.mu.Lock() 548 defer t.mu.Unlock() 549 550 if t.streamTab == nil { 551 // t is over -> log directly. 552 // make sure to prefix log message the same way as would be 553 // done when messages are logged via .logq . 554 t.logFromTracetest_go(msg) 555 } else { 556 // remember msg to be logged when t is done so that non-main 557 // log output always come after main printout. The messages 558 // won't be intermixed because t.Log is serialized internally. 559 t.logq = append(t.logq, msg) 560 } 561 562 t.Fail() 563 runtime.Goexit() 564 } 565 566 // logFromTracetest_go calls t.Log without wrapping it with t.Helper(). 567 // 568 // as the result the message is prefixed with tracetest.go:<LINE>, not the 569 // location of fatalfInNonMain caller. 570 func (t *T) logFromTracetest_go(msg string) { 571 t.Log(msg) 572 }