github.com/DARA-Project/GoDist-Scheduler@v0.0.0-20201030134746-668de4acea0d/scheduler.go (about) 1 package main 2 3 import ( 4 "bytes" 5 "dara" 6 "encoding/json" 7 "flag" 8 "github.com/DARA-Project/GoDist-Scheduler/common" 9 "github.com/DARA-Project/GoDist-Scheduler/explorer" 10 "github.com/DARA-Project/GoDist-Scheduler/propchecker" 11 "log" 12 "net/rpc" 13 "os" 14 "runtime" 15 "strconv" 16 "sync/atomic" 17 "time" 18 "unsafe" 19 "fmt" 20 ) 21 22 type ProcStatus int 23 24 const ( 25 ALIVE ProcStatus = iota 26 DEAD 27 ) 28 29 //Command line arguments 30 var ( 31 procs = flag.Int("procs", 1, "The number of processes to model check") 32 record = flag.Bool("record", false, "Record an execution") 33 replay = flag.Bool("replay", false, "Replay an execution") 34 explore = flag.Bool("explore", false, "Explore a recorded execution") 35 sched_file = flag.String("schedule", "", "The schedule file where the schedule will be stored/loaded from") 36 strategy = flag.String("strategy", "random", "The strategy to be used for exploration. One of [random|unique|frequency|nodefrequency]") 37 maxdepth = flag.Int("maxdepth", 100, "The maximum depth for an exploration path") 38 maxruns = flag.Int("maxruns", 2, "The maximum number of exploration runs to be performed") 39 ) 40 41 const ( 42 //The length of the schedule. When recording the system will 43 //execute up to dara.SCHEDLEN. The same is true on replay 44 RECORDLEN = 10000 45 EXPLORELEN = 100 46 EXPLORATION_LOG_FILE = "visitedLog.txt" 47 // TODO: Make this an option 48 OVERLORD_RPC_SERVER = "0.0.0.0:45000" 49 ) 50 51 var ( 52 MAX_EXPLORATION_DEPTH = 100 53 MAX_EXPLORE_RUNS = 1 54 ) 55 56 //Global variables specific to shared memory 57 var ( 58 //pointer to shared memory, unsafe so that it can be casted 59 p unsafe.Pointer 60 //err is an integer, because it is set from errors in the runtime 61 //where there is no error type set 62 err int 63 //Procchan is the communication channel laid over shared memory 64 procchan *[dara.CHANNELS]dara.DaraProc 65 ) 66 67 // Global variables for managing the state of Dara processes 68 var ( 69 //Used to determine which process (runtime) will be run on the 70 //next scheduling decision when in record mode 71 LastProc int 72 LogLevel int 73 //Variable to hold propertychecker 74 checker *propchecker.Checker 75 // Variable to hold the logger 76 l *log.Logger 77 // Variable where the schedule is stored 78 // This might be the input scheduler for replay/explore or the output schedule for record 79 schedule dara.Schedule 80 // Monotonous event counter 81 eventCounter int 82 83 // Map that maps the Dara Process ID to its status 84 procStatus map[int]ProcStatus 85 // Map that maps the Dara Process ID to whether we own the procchan lock for that process 86 lockStatus map[int]bool 87 // Map that maps Dara Process ID to linux PID 88 linuxpids map[int]uint64 89 // Proc Coverage 90 allCoverage map[int]*explorer.CovStats 91 // Virtual Clocks 92 clocks map[int]*explorer.VirtualClock 93 ) 94 95 // decodeInputSchedule decodes the input schedule for replay/exploration use 96 func decodeInputSchedule() *dara.Schedule { 97 var schedule dara.Schedule 98 f, err := os.Open(*sched_file) 99 if err != nil { 100 l.Fatal(err) 101 } 102 dec := json.NewDecoder(f) 103 err = dec.Decode(&schedule) 104 if err != nil { 105 l.Fatal(err) 106 } 107 return &schedule 108 } 109 110 // checkargs checks and parses the command line arguments 111 func checkargs() { 112 flag.Parse() 113 //Set exactly one arg for record or replay 114 if *record && *replay { 115 l.Fatal("enable either replay or record, not both") 116 } 117 if !(*record || *replay || *explore) { 118 l.Fatal("enable replay or record or explore") 119 } 120 if *record { 121 l.Print("Recording") 122 } else if *explore { 123 l.Print("Exploring") 124 if *strategy != "random" && *strategy != "unique" && *strategy != "frequency" && *strategy != "nodefrequency" { 125 l.Fatal("Invalid exploration strategy " + *strategy + " specified") 126 } 127 } else { 128 l.Print("Replaying") 129 } 130 level := os.Getenv("DARA_LOG_LEVEL") 131 i, err := strconv.Atoi(level) 132 if err != nil { 133 l.Fatal("Invalid log level specified") 134 } 135 MAX_EXPLORATION_DEPTH = *maxdepth 136 MAX_EXPLORE_RUNS = *maxruns 137 LogLevel = int(i) 138 return 139 } 140 141 func forward() { 142 time.Sleep(1 * time.Millisecond) 143 } 144 145 // dprint is a log level print function for the Global Scheduler 146 func dprint(level int, pfunc func()) { 147 if LogLevel == dara.OFF { 148 return 149 } 150 151 if level >= LogLevel { 152 pfunc() 153 } 154 } 155 156 // roundRobin selects the next DaraProcess to be run. 157 // Currently, it follows a round robin strategy (well no shit) 158 func roundRobin() int { 159 flag := false 160 for !flag { 161 LastProc = (LastProc + 1) % (*procs + 1) 162 // Dara doesn't have ProcID 0 163 if LastProc == 0 { 164 LastProc++ 165 } 166 dprint(dara.DEBUG, func() {l.Println("Checking status of", LastProc, "which is", procStatus[LastProc])}) 167 if procStatus[LastProc] == ALIVE { 168 dprint(dara.DEBUG, func() {l.Println("Next process should be", LastProc)}) 169 flag = true 170 } 171 } 172 return LastProc 173 } 174 175 // getSchedulingEvents returns all the scheduling events in a schedule 176 func getSchedulingEvents(sched *dara.Schedule) map[int][]dara.Event { 177 proc_event_map := make(map[int][]dara.Event) 178 for i := 0; i < len(sched.LogEvents); i++ { 179 eventTypeString := common.EventTypeString(sched.LogEvents[i].Type) 180 if eventTypeString == "SCHEDULE" { 181 // Don't want to run timer events 182 if string(bytes.Trim(sched.LogEvents[i].G.FuncInfo[:64], "\x00")[:]) != "runtime.timerproc" { 183 proc_event_map[sched.LogEvents[i].P] = append(proc_event_map[sched.LogEvents[i].P], sched.LogEvents[i]) 184 } 185 } 186 } 187 return proc_event_map 188 } 189 190 // ConsumeCoverage consumes the coverage information for this run 191 func ConsumeCoverage(ProcID int) explorer.CovStats { 192 coverage := make(explorer.CovStats) 193 for i := 0; i < procchan[ProcID].CoverageIndex; i++ { 194 blockID := string(bytes.Trim(procchan[ProcID].Coverage[i].BlockID[:dara.BLOCKIDLEN], "\x00")[:]) 195 count := procchan[ProcID].Coverage[i].Count 196 coverage[blockID] = count 197 } 198 // Reset the CoverageIndex 199 procchan[ProcID].CoverageIndex = 0 200 return coverage 201 } 202 203 // ConsumeAndPrint calls ConsumeLog 204 // TODO: Maybe we can delete this function and call ConsumeLog directly 205 func ConsumeAndPrint(ProcID int, context *map[string]interface{}) []dara.Event { 206 cl := ConsumeLog(ProcID, context) 207 dprint(dara.DEBUG, func() {l.Println("Consumed", len(cl), "events")}) 208 return cl 209 } 210 211 // ConsumeLog consumes all the events from the shared memory and produces 212 // a list of events for further use 213 func ConsumeLog(ProcID int, context *map[string]interface{}) []dara.Event { 214 dprint(dara.DEBUG, func() { l.Println("Current log event index is", procchan[ProcID].LogIndex, "for Process", ProcID) }) 215 logsize := procchan[ProcID].LogIndex 216 dprint(dara.DEBUG, func() { l.Println("Current log event index is", logsize, "for Process", ProcID) }) 217 log := make([]dara.Event, logsize) 218 for i := 0; i < logsize; i++ { 219 ee := &(procchan[ProcID].Log[i]) 220 e := &(log[i]) 221 (*e).Type = (*ee).Type 222 (*e).P = (*ee).P 223 (*e).G = (*ee).G 224 (*e).Epoch = (*ee).Epoch 225 (*e).LE.LogID = string(bytes.Trim((*ee).ELE.LogID[:dara.VARBUFLEN], "\x00")[:]) 226 (*e).LE.Vars = make([]dara.NameValuePair, (*ee).ELE.Length) 227 for j := 0; j < len((*e).LE.Vars); j++ { 228 (*e).LE.Vars[j].VarName = string(bytes.Trim((*ee).ELE.Vars[j].VarName[:dara.VARBUFLEN], "\x00")[:]) 229 if e.Type == dara.LOG_EVENT { 230 // This is a log event so we need to update the context with new variable:value pairs 231 (*e).LE.Vars[j].Value = runtime.DecodeValue((*ee).ELE.Vars[j].Type, (*ee).ELE.Vars[j].Value) 232 (*e).LE.Vars[j].Type = string(bytes.Trim((*ee).ELE.Vars[j].Type[:dara.VARBUFLEN], "\x00")[:]) 233 // Build Up context for property checking 234 (*context)[e.LE.Vars[j].VarName] = e.LE.Vars[j].Value 235 } else if e.Type == dara.DELETEVAR_EVENT { 236 // Remove Variable from the context as the application asked for something to be deleted from 237 // the context 238 delete(*context, e.LE.Vars[j].VarName) 239 } 240 } 241 (*e).SyscallInfo = (*ee).SyscallInfo 242 dprint(dara.DEBUG, func() { l.Println(common.ConciseEventString(e)) }) 243 if e.Type == dara.THREAD_EVENT { 244 dprint(dara.DEBUG, func() { l.Println("Thread creation noted with ID", e.G.Gid, "at function", string(e.G.FuncInfo[:64])) }) 245 } 246 if e.Type == dara.SCHED_EVENT { 247 dprint(dara.DEBUG, func() { l.Println("Scheduling event for Process", ProcID, "and goroutine", common.RoutineInfoString(&e.G))}) 248 } 249 if e.Type == dara.TIMER_EVENT { 250 // A timer was installed! Put this on the vclock queue. 251 if v, ok := clocks[ProcID]; ok { 252 eventCounter += 1 253 v.AddTimerEvent(eventCounter, explorer.Timer, uint64(e.SyscallInfo.Args[1].Integer64), uint64(e.SyscallInfo.Args[2].Integer64), int64(e.G.Gid), e.SyscallInfo.Args[0].Integer64) 254 } else { 255 dprint(dara.WARN, func() {l.Println("Virtual Clock not found for Process: ", ProcID)} ) 256 } 257 } 258 if e.SyscallInfo.SyscallNum == dara.DSYS_SLEEP { 259 // A goroutine was put to sleep. It will wake back up ....when the explorer decides 260 if v, ok := clocks[ProcID]; ok { 261 eventCounter += 1 262 v.AddTimerEvent(eventCounter, explorer.Sleep, uint64(e.SyscallInfo.Args[0].Integer64), 0, e.SyscallInfo.Args[1].Integer64, 0) 263 } else { 264 dprint(dara.WARN, func() {l.Println("Virtual Clock not found for Process: ", ProcID)} ) 265 } 266 } 267 } 268 procchan[ProcID].LogIndex = 0 269 procchan[ProcID].Epoch++ 270 return log 271 } 272 273 // CheckAllProcsDead checks if all local runtime/processes have died/completed or not. 274 func CheckAllProcsDead() bool { 275 for _, status := range procStatus { 276 if status != DEAD { 277 return false 278 } 279 } 280 return true 281 } 282 283 // CheckAllGoRoutinesDead returns true if all the goroutines in local runtime have died or not. 284 func CheckAllGoRoutinesDead(ProcID int) bool { 285 allDead := true 286 dprint(dara.DEBUG, func() { l.Printf("Checking if all goroutines are dead yet for Process %d\n", ProcID) }) 287 //l.Printf("Num routines are %d", len(procchan[ProcID].Routines)) 288 for _, routine := range procchan[ProcID].Routines { 289 // Only check for dead goroutines for real go routines 290 if routine.Gid != 0 { 291 status := dara.GetDaraProcStatus(routine.Status) 292 dprint(dara.DEBUG, func() { l.Printf("Status of GoRoutine %d is %s\n", routine.Gid, dara.GStatusStrings[status]) }) 293 allDead = allDead && (status == dara.Dead) 294 if status != dara.Dead { 295 //dprint(dara.INFO, func() { l.Printf("GoRoutine %d Proc %d is not dead yet\n", routine.Gid, ProcID)}) 296 return false 297 } 298 } 299 } 300 return allDead 301 } 302 303 // GetAllRunnableRoutines returns all the goroutines that were runnable when 304 // we switched back to the global scheduler 305 func GetAllRunnableRoutines(event dara.Event) []int { 306 runnable := make([]int, 0) 307 for j, info := range procchan[event.P].Routines { 308 if dara.GetDaraProcStatus(info.Status) != dara.Idle { 309 dprint(dara.DEBUG, func() { l.Printf("Proc[%d]Routine[%d].Info = %s", event.P, j, common.RoutineInfoString(&info)) }) 310 runnable = append(runnable, j) 311 } 312 } 313 return runnable 314 } 315 316 // IsEventReplayable checks if an event is replayable if the goid to be 317 // scheduled for that event is one of the runnable routines. 318 func IsEventReplayable(runnable []int, Gid int) bool { 319 for _, id := range runnable { 320 if Gid == id { 321 return true 322 } 323 } 324 return false 325 } 326 327 // CompareEvents compares if two events are the same or not. 328 // This is used in checking if replay is correct or not 329 // Note: Currently two events are the same if they are of the same type 330 func CompareEvents(e1 dara.Event, e2 dara.Event) bool { 331 // TODO Make a more detailed comparison 332 if e1.Type != e2.Type { 333 return false 334 } 335 return true 336 } 337 338 // CheckEndOrCrash checks if the event is a crash or an ending event 339 func CheckEndOrCrash(e dara.Event) bool { 340 if e.Type == dara.END_EVENT || e.Type == dara.CRASH_EVENT { 341 return true 342 } 343 return false 344 } 345 346 // initExplorer initializes the explorer 347 func initExplorer(seed *dara.Schedule) *explorer.Explorer { 348 var explorationStrat explorer.Strategy 349 if *strategy == "random" { 350 explorationStrat = explorer.RANDOM 351 } else if *strategy == "unique" { 352 explorationStrat = explorer.COVERAGE_UNIQUE 353 } else if *strategy == "frequency" { 354 explorationStrat = explorer.COVERAGE_FREQUENCY 355 } else if *strategy == "nodefrequency" { 356 explorationStrat = explorer.COVERAGE_NODE_FREQUENCY 357 } 358 e, err := explorer.MountExplorer(EXPLORATION_LOG_FILE, explorationStrat, seed, *procs, LogLevel) 359 if err != nil { 360 l.Fatal(err) 361 } 362 e.SetMaxDepth(MAX_EXPLORATION_DEPTH) 363 return e 364 } 365 366 // getDaraProcs returns the list of all goroutines for a given process 367 func getDaraProcs(procID int) *[]explorer.ProcThread { 368 threads := []explorer.ProcThread{} 369 for _, routine := range procchan[procID].Routines { 370 if dara.GetDaraProcStatus(routine.Status) != dara.Idle { 371 t := explorer.ProcThread{ProcID: procID, Thread: routine} 372 threads = append(threads, t) 373 } 374 } 375 return &threads 376 } 377 378 func getAllDaraProcs() *[]explorer.ProcThread { 379 threads := []explorer.ProcThread{} 380 for i := 1; i <= *procs; i++ { 381 for _, routine := range procchan[i].Routines { 382 if dara.GetDaraProcStatus(routine.Status) != dara.Idle { 383 t := explorer.ProcThread{ProcID: i, Thread: routine} 384 threads = append(threads, t) 385 } 386 } 387 } 388 return &threads 389 } 390 391 // preloadReplaySchedule populates the shared memory with a pre-chosen schedule 392 func preloadReplaySchedule(proc_schedules map[int][]dara.Event) { 393 for procID, events := range proc_schedules { 394 for i, e := range events { 395 procchan[procID].Log[i].Type = e.Type 396 procchan[procID].Log[i].P = e.P 397 procchan[procID].Log[i].G = e.G 398 } 399 procchan[procID].LogIndex = len(events) 400 } 401 } 402 403 // checkProperties checks the user-defined properties under a given context 404 // that is collected using DaraLog calls in the local runtimes 405 func checkProperties(context map[string]interface{}) (bool, *[]dara.FailedPropertyEvent, error) { 406 if checker != nil { 407 return checker.Check(context) 408 } 409 return true, &[]dara.FailedPropertyEvent{}, nil 410 } 411 412 // replaySchedule replays a previously recorded/explored execution 413 func replaySchedule() { 414 schedule := decodeInputSchedule() 415 context := make(map[string]interface{}) 416 dprint(dara.DEBUG, func() { l.Println("Num Events : ", len(schedule.LogEvents)) }) 417 dprint(dara.DEBUG, func() { l.Println("Replaying the following schedule : ", common.ConciseScheduleString(&schedule.LogEvents)) }) 418 var i int 419 fast_replay := os.Getenv("FAST_REPLAY") 420 if fast_replay == "true" { 421 proc_schedule_map := getSchedulingEvents(schedule) 422 preloadReplaySchedule(proc_schedule_map) 423 for procID := range procchan { 424 if procchan[procID].Run == -1 && procchan[procID].LogIndex == 0 { 425 procchan[procID].Run = -100 426 } else { 427 dprint(dara.INFO, func() { l.Println("Enabling runtime for proc", procID) }) 428 procchan[procID].Run = -5 429 } 430 } 431 all_procs_done := false 432 for !all_procs_done { 433 all_procs_done = true 434 for procID := range procchan { 435 if atomic.CompareAndSwapInt32((*int32)(unsafe.Pointer(&(procchan[procID].Lock))), dara.UNLOCKED, dara.LOCKED) { 436 if procchan[procID].Run != -100 { 437 // We haven't reached the end of replay for this runtime. Give up control to this runtime. 438 all_procs_done = false 439 //dprint(dara.INFO, func() {l.Println("Not over for Proc", procID, "with value", procchan[procID].Run)}) 440 } 441 atomic.StoreInt32((*int32)(unsafe.Pointer(&(procchan[procID].Lock))), dara.UNLOCKED) 442 } else { 443 all_procs_done = false 444 } 445 } 446 } 447 dprint(dara.INFO, func() { l.Println("Fast Replay done") }) 448 } else { 449 for i < len(schedule.LogEvents) { 450 dprint(dara.DEBUG, func(){l.Println("Replaying event index", i)}) 451 if CheckAllProcsDead() { 452 dprint(dara.INFO, func() {l.Println("All processes dead but", len(schedule.LogEvents) - i, "events need to be replayed")}) 453 break 454 } 455 if lockStatus[schedule.LogEvents[i].P] || atomic.CompareAndSwapInt32((*int32)(unsafe.Pointer(&(procchan[schedule.LogEvents[i].P].Lock))), dara.UNLOCKED, dara.LOCKED) { 456 if procchan[schedule.LogEvents[i].P].Run == -1 { //check predicates on goroutines + schedule 457 458 //move forward 459 forward() 460 461 dprint(dara.DEBUG, func() { l.Println("Scheduling Event") }) 462 runnable := GetAllRunnableRoutines(schedule.LogEvents[i]) 463 //TODO clean up init 464 runningindex := 0 465 if len(runnable) == 0 { 466 //This should only occur when the processes 467 //have not started yet. There should be a 468 //smarter starting condition here rather than 469 //just borking on the init of the schedule 470 runningindex = -2 471 } else { 472 //replay schedule 473 check_replayable: 474 can_run := IsEventReplayable(runnable, schedule.LogEvents[i].G.Gid) 475 if !can_run { 476 if string(bytes.Trim(schedule.LogEvents[i].G.FuncInfo[:64], "\x00")[:]) != "runtime.timerproc" { 477 dprint(dara.INFO, func() { l.Println("G info was ", string(schedule.LogEvents[i].G.FuncInfo[:64])) }) 478 dprint(dara.FATAL, func() { l.Fatal("Can't run event ", i, ": ", common.ConciseEventString(&schedule.LogEvents[i])) }) 479 } else { 480 //Trying to schedule timer thread. Let's just increment the event counter 481 dprint(dara.INFO, func() { l.Println("G info was ", string(schedule.LogEvents[i].G.FuncInfo[:64])) }) 482 i++ 483 //Check if the new event is replayable 484 goto check_replayable 485 } 486 } 487 runningindex = schedule.LogEvents[i].G.Gid 488 } 489 _ = runningindex 490 //Assign which goroutine to run 491 procchan[schedule.LogEvents[i].P].Run = -5 492 procchan[schedule.LogEvents[i].P].RunningRoutine = schedule.LogEvents[i].G 493 dprint(dara.INFO, func() { 494 l.Printf("Running (%d/%d) %d %s", i+1, len(schedule.LogEvents), schedule.LogEvents[i].P, common.RoutineInfoString(&schedule.LogEvents[i].G)) 495 }) 496 497 atomic.StoreInt32((*int32)(unsafe.Pointer(&(procchan[schedule.LogEvents[i].P].Lock))), dara.UNLOCKED) 498 for { 499 if atomic.CompareAndSwapInt32((*int32)(unsafe.Pointer(&(procchan[schedule.LogEvents[i].P].Lock))), dara.UNLOCKED, dara.LOCKED) { 500 currentDaraProc := schedule.LogEvents[i].P 501 if procchan[schedule.LogEvents[i].P].Run == -1 { 502 atomic.StoreInt32((*int32)(unsafe.Pointer(&(procchan[schedule.LogEvents[i].P].Lock))), dara.UNLOCKED) 503 events := ConsumeAndPrint(currentDaraProc, &context) 504 coverage := ConsumeCoverage(currentDaraProc) 505 dprint(dara.DEBUG, func() { l.Println("Replay Consumed ", len(events), "events") }) 506 dprint(dara.DEBUG, func() { l.Println("Replay Consumed ", len(coverage), "blocks") }) 507 for _, e := range events { 508 dprint(dara.DEBUG, func(){ l.Println(common.ConciseEventString(&e))}) 509 same_event := CompareEvents(e, schedule.LogEvents[i]) 510 if !same_event { 511 dprint(dara.WARN, func() { 512 l.Println("Replayed 2 different events", common.ConciseEventString(&e), common.ConciseEventString(&schedule.LogEvents[i])) 513 }) 514 } else { 515 dprint(dara.DEBUG, func() { l.Println("Replayed ", common.ConciseEventString(&schedule.LogEvents[i])) }) 516 } 517 i++ 518 } 519 dprint(dara.DEBUG, func() { l.Println("Replay : At event", i) }) 520 if i >= len(schedule.LogEvents) { 521 dprint(dara.DEBUG, func() { l.Printf("Informing the local scheduler end of replay") }) 522 procchan[currentDaraProc].Run = -4 523 break 524 } 525 break 526 } else if procchan[schedule.LogEvents[i].P].Run == -100 { 527 // This means that the local runtime finished and that we can move on 528 currentProc := schedule.LogEvents[i].P 529 events := ConsumeAndPrint(currentDaraProc, &context) 530 coverage := ConsumeCoverage(currentDaraProc) 531 dprint(dara.DEBUG, func() { l.Println("Replay Consumed ", len(coverage), "blocks") }) 532 dprint(dara.DEBUG, func() { l.Println("Replay Consumed", len(events), "events") }) 533 for _, e := range events { 534 dprint(dara.DEBUG, func(){ l.Println(common.ConciseEventString(&e))}) 535 same_event := CompareEvents(e, schedule.LogEvents[i]) 536 if !same_event { 537 dprint(dara.WARN, func() { 538 l.Println("Replayed 2 different events", common.ConciseEventString(&e), common.ConciseEventString(&schedule.LogEvents[i])) 539 }) 540 } 541 i++ 542 } 543 procStatus[currentProc] = DEAD 544 break 545 } 546 atomic.StoreInt32((*int32)(unsafe.Pointer(&(procchan[currentDaraProc].Lock))), dara.UNLOCKED) 547 } 548 } 549 time.Sleep(time.Second) 550 } 551 } 552 } 553 } 554 dprint(dara.INFO, func() { l.Println("Replay is over") }) 555 } 556 557 // extractDataFromProc consumes events and coverage information and then check properties 558 // Context, schedule, and the index are modified by this function 559 func extractDataFromProc(ProcID int, index *int, schedule *dara.Schedule, context *map[string]interface{}) (*[]dara.Event, *explorer.CovStats, bool) { 560 events := ConsumeAndPrint(ProcID, context) 561 coverage := ConsumeCoverage(ProcID) 562 dprint(dara.DEBUG, func() { l.Println("Consumed ", len(coverage), "blocks") }) 563 result, propevents, err := checkProperties(*context) 564 if err != nil { 565 dprint(dara.INFO, func() { l.Println(err) }) 566 } 567 if !result { 568 dprint(dara.INFO, func() { l.Println("Property check failed with", len(*propevents), "failures") }) 569 } 570 schedule.PropEvents = append(schedule.PropEvents, dara.CreatePropCheckEvent(*propevents, *index + len(events) - 1)) 571 schedule.LogEvents = append(schedule.LogEvents, events...) 572 coverageEvent := dara.CoverageEvent{CoverageInfo:coverage, EventIndex: *index + len(events) - 1} 573 schedule.CovEvents = append(schedule.CovEvents, coverageEvent) 574 hasBug := !result 575 for _, e := range events { 576 if e.Type == dara.CRASH_EVENT { 577 hasBug = true 578 break 579 } 580 } 581 *index += len(events) 582 return &events, &coverage, hasBug 583 } 584 585 // recordSchedule records a new execution 586 func recordSchedule() { 587 LastProc = 0 588 var i int 589 // Context for property checking 590 context := make(map[string]interface{}) 591 for i < RECORDLEN { 592 if CheckAllProcsDead() { 593 // If all processes have completed then break the loop and move out. 594 break 595 } 596 ProcID := roundRobin() 597 dprint(dara.WARN, func() {l.Println("Chosen process for running", ProcID)} ) 598 //dprint(dara.INFO, func() { l.Println("Chosen process for running is", ProcID) }) 599 if procchan[ProcID].Run == -1 { 600 // Initialize the local scheduler to record events 601 procchan[ProcID].Run = -3 602 dprint(dara.INFO, func() { l.Println("Setting up recording on Process", ProcID) }) 603 dprint(dara.INFO, func() { l.Println("Linux PID for this process is", procchan[ProcID].PID)}) 604 atomic.StoreInt32((*int32)(unsafe.Pointer(&(procchan[ProcID].Lock))), dara.UNLOCKED) 605 lockStatus[ProcID] = false 606 forward() // Give the local scheduler a chance to grab the lock 607 } 608 // Loop until we get some recorded events 609 for { 610 if lockStatus[ProcID] || atomic.CompareAndSwapInt32((*int32)(unsafe.Pointer(&(procchan[ProcID].Lock))), dara.UNLOCKED, dara.LOCKED) { 611 if linuxpids[ProcID] == 0 && procchan[ProcID].PID != 0{ 612 linuxpids[ProcID] = procchan[ProcID].PID 613 dprint(dara.INFO, func() { l.Println("Linux PID for this process is", procchan[ProcID].PID)}) 614 } 615 // We own the lock now if we didn't before. 616 lockStatus[ProcID] = true 617 //dprint(dara.DEBUG, func() { l.Println("Obtained Lock with run value", procchan[ProcID].Run, "on process", ProcID) }) 618 if procchan[ProcID].Run == -3 { 619 // The local scheduler hasn't run anything yet. 620 atomic.StoreInt32((*int32)(unsafe.Pointer(&(procchan[ProcID].Lock))), dara.UNLOCKED) 621 // Release the lock 622 lockStatus[ProcID] = false 623 continue 624 } 625 if procchan[ProcID].Run == -100 { 626 // Process has finished 627 // Ignore the return values since we really don't need them in record. 628 // Return value of extractDataFromProc are only useful during exploration 629 _, _, _ = extractDataFromProc(ProcID, &i, &schedule, &context) 630 } 631 if procchan[ProcID].Run == -6 { 632 //Process is blocked on network, hopefully it is unblocked after a full cycle of process scheduling! 633 // If this is the first time we chose this Process to run after getting blocked then we need 634 // to consume the events and the coverage 635 if procchan[ProcID].LogIndex > 0 { 636 _, _, _ = extractDataFromProc(ProcID, &i, &schedule, &context) 637 } 638 // Continue to hold the lock until the process has become unblocked 639 break 640 } 641 if procchan[ProcID].Run == -7 { 642 // Process has woken up from the Network poll and wants the lock back. 643 dprint(dara.DEBUG, func() { l.Println("Process", ProcID, " has become unblocked from network wait") }) 644 atomic.StoreInt32((*int32)(unsafe.Pointer(&(procchan[ProcID].Lock))), dara.UNLOCKED) 645 procchan[ProcID].Run = -3 646 lockStatus[ProcID] = false 647 forward() //Give the process a chance to grab the lock 648 continue 649 } 650 if procchan[ProcID].Run >= 0 { 651 // Process scheduled a goroutine which means we have events to add to the schedule. 652 dprint(dara.DEBUG, func() { l.Printf("Procchan Run status inside is %d for Process %d\n", procchan[ProcID].Run, ProcID) }) 653 dprint(dara.DEBUG, func() { l.Printf("Recording Event on Process/Node %d\n", ProcID) }) 654 _, _, _ = extractDataFromProc(ProcID, &i, &schedule, &context) 655 if i >= RECORDLEN-*procs { 656 //mark the processes for death 657 dprint(dara.DEBUG, func() { l.Printf("Ending Execution!") }) 658 procchan[ProcID].Run = -4 //mark process for death 659 } else { 660 //Reset the local scheduler to continue with its wizardry. 661 procchan[ProcID].Run = -3 662 } 663 // We got some events so now we should run some other process. 664 break 665 } 666 if CheckAllGoRoutinesDead(ProcID) { 667 procStatus[ProcID] = DEAD 668 dprint(dara.INFO, func() { l.Println("Process ", ProcID, " has finished")}) 669 break 670 } else { 671 dprint(dara.INFO, func() { l.Println("Run value for process ", ProcID, " is ", procchan[ProcID].Run)}) 672 } 673 } 674 } 675 } 676 // Create the schedule file and encode/store the captured schedule 677 f, err := os.Create(*sched_file) 678 if err != nil { 679 l.Fatal(err) 680 } 681 defer f.Close() 682 enc := json.NewEncoder(f) 683 enc.SetIndent("", "\t") 684 err = enc.Encode(schedule) 685 if err != nil { 686 l.Fatal(err) 687 } 688 } 689 690 // dara_rpc_client launches the rpc client for communicating with the overlord 691 func dara_rpc_client(command chan string, response chan bool) { 692 client, err := rpc.Dial("tcp", OVERLORD_RPC_SERVER) 693 if err != nil { 694 l.Fatal(err) 695 } 696 697 var isFinished bool 698 699 for ;!isFinished;{ 700 select { 701 case op := <-command: 702 switch op { 703 case "finish": 704 dprint(dara.INFO, func(){l.Println("Contacting overlord to finish execution")}) 705 var reply bool 706 err = client.Call("DaraRpcServer.FinishExecution", 0, &reply) 707 if err != nil { 708 l.Println(err) 709 response <- false 710 } 711 isFinished = true 712 response <- true 713 case "kill": 714 var reply bool 715 err = client.Call("DaraRpcServer.KillExecution", 0, &reply) 716 if err != nil { 717 l.Println(err) 718 response <- false 719 } 720 response <- true 721 case "restart": 722 var reply bool 723 err = client.Call("DaraRpcServer.RestartExecution", 0, &reply) 724 if err != nil { 725 l.Println(err) 726 response <- false 727 } 728 response <- true 729 } 730 } 731 } 732 } 733 734 // explore explores the schedule + coverage space 735 func exploreSchedule() { 736 schedule := decodeInputSchedule() 737 command_chan := make(chan string) 738 response_chan := make(chan bool) 739 go dara_rpc_client(command_chan, response_chan) 740 explore_unit := initExplorer(schedule) 741 explore_unit.SetMaxDepth(MAX_EXPLORATION_DEPTH) 742 dprint(dara.INFO, func() { l.Println("Dara the Explorer begins")}) 743 current_run := 0 744 restart := false 745 start := true 746 var all_times []float64 747 var run_times []float64 748 var firstBugRun int 749 var numbuggyPaths int 750 var bugruns []int 751 for current_run < MAX_EXPLORE_RUNS { 752 l.Println("Exploring path #", current_run + 1) 753 isblocked := make(map[int]bool) 754 if restart { 755 // Reset shared memory and all the status 756 for i := 0; i <= *procs; i++ { 757 dprint(dara.DEBUG, func() {l.Println("Re-Initializing", i) }) 758 procchan[i].Lock = dara.LOCKED 759 procchan[i].SyscallLock = dara.UNLOCKED 760 procchan[i].Run = -1 761 procchan[i].Syscall = -1 762 procchan[i].CoverageIndex = 0 763 procchan[i].RunningRoutine = dara.RoutineInfo{} 764 for j := 0; j < len(procchan[i].Routines); j++ { 765 procchan[i].Routines[j] = dara.RoutineInfo{} 766 } 767 } 768 for i := 1; i <= *procs; i++ { 769 procStatus[i] = ALIVE 770 lockStatus[i] = true // By default, global scheduler owns the locks on every local scheduler 771 linuxpids[i] = uint64(0) 772 covStats := make(explorer.CovStats) 773 allCoverage[i] = &covStats 774 clocks[i] = explorer.NewClock() 775 isblocked[i] = false 776 } 777 command_chan <- "restart" 778 response := <- response_chan 779 if response { 780 dprint(dara.INFO, func() {l.Println("Overlord restarted execution successfully")}) 781 } else { 782 dprint(dara.INFO, func() {l.Println("Overlord didn't restart execution successfully")}) 783 } 784 } 785 if restart || start { 786 for i := 1; i <= *procs; i++ { 787 isblocked[i] = false 788 } 789 // Give time for the program to start/restart 790 time.Sleep(5 * time.Second) 791 start = false 792 //time.Sleep(5 * time.Millisecond) 793 } 794 runStart := time.Now() 795 // Initialize variables for this run 796 var event_index int 797 var current_schedule dara.Schedule // current schedule 798 current_context := make(map[string]interface{}) // current context for property checking 799 800 // Reset the linux pids for each process! 801 for i := 1; i <= *procs; i++ { 802 linuxpids[i] = 0 803 } 804 805 // The choice of goroutine to be run is broken down into 2 parts 806 // Part 1: Selecting the Process 807 // Part 2: Selecting the goroutine on the process 808 actionComplete := true 809 actionInstalled := false 810 actionSelected := false 811 var ProcID int 812 var events *[]dara.Event 813 var coverage *explorer.CovStats 814 var next_routine *explorer.ProcThread 815 var next_timer *explorer.ProcTimer 816 var action explorer.Action 817 var numActions int 818 var bugFound bool 819 dprint(dara.INFO, func() {l.Println("Before inner loop")}) 820 for { 821 // Have the explorer choose the process 822 allBlocked := true 823 var hasBug bool 824 for _, v := range isblocked { 825 allBlocked = allBlocked && v 826 } 827 if actionComplete && !allBlocked{ 828 // Get New action now! 829 numActions += 1 830 threads := getAllDaraProcs() 831 start := time.Now() 832 action = explore_unit.GetNextAction(threads, events, coverage, &clocks, isblocked) 833 end := time.Now().Sub(start) 834 all_times = append(all_times, end.Seconds()) 835 // If action is restart then kill current execution and restart 836 if action.Type == explorer.RESTART { 837 command_chan <- "kill" 838 response := <-response_chan 839 if response { 840 dprint(dara.INFO, func() {l.Println("Overlord finished execution successfully")}) 841 } else { 842 dprint(dara.INFO, func() {l.Println("Overlord didn't finish execution successfully")}) 843 } 844 restart = true 845 // Update the run counter and break away to the next iteration 846 current_run += 1 847 break 848 } else if action.Type == explorer.SCHEDULE { 849 if ptr, ok := action.Arg.(*explorer.ProcThread); ok { 850 next_routine = ptr 851 ProcID = next_routine.ProcID 852 dprint(dara.INFO, func() {l.Println("Goroutine chosen to be scheduled is", next_routine.Thread.Gid)}) 853 } else { 854 dprint(dara.FATAL, func() {l.Println("Failed to parse explorer action")} ) 855 } 856 } else if action.Type == explorer.TIMER { 857 if ptr, ok := action.Arg.(*explorer.ProcTimer); ok { 858 next_timer = ptr 859 } else { 860 dprint(dara.FATAL, func() {l.Println("Failed to parse explorer action")} ) 861 } 862 } 863 864 actionSelected = true 865 actionComplete = false 866 } else if allBlocked { 867 actionSelected = false 868 } 869 // Loop until chosen action is completed 870 if lockStatus[ProcID] || atomic.CompareAndSwapInt32((*int32)(unsafe.Pointer(&procchan[ProcID].Lock)), dara.UNLOCKED, dara.LOCKED) { 871 lockStatus[ProcID] = true 872 if linuxpids[ProcID] == 0 && procchan[ProcID].PID !=0 { 873 linuxpids[ProcID] = procchan[ProcID].PID 874 } 875 if procchan[ProcID].Run == -100 { 876 // The process has ended 877 // Extract data and update the variables from this run! 878 // We don't need the events and coverage anymore for this event 879 // They are already in the schedule 880 if actionInstalled { 881 events, coverage, hasBug = extractDataFromProc(ProcID, &event_index, ¤t_schedule, ¤t_context) 882 explorer.CoverageUnion(allCoverage[ProcID], coverage) 883 explore_unit.UpdateActionCoverage(events, coverage) 884 dprint(dara.INFO, func() {l.Println("Process has finished")}) 885 // Check for crash events and property failures. Save the current schedule if there are any such failures 886 bugFound = bugFound || hasBug 887 // Mark the process as DEAD 888 procStatus[ProcID] = DEAD 889 actionComplete = true 890 actionInstalled = false 891 } 892 // We shouldn't be in this state if action was installed 893 } else if procchan[ProcID].Run == -5 { 894 // The process is executing some previously chosen instruction 895 // It is unlikely we ever get the lock back in this state 896 // Regardless, we should unlock and try again. 897 atomic.StoreInt32((*int32)(unsafe.Pointer(&(procchan[ProcID].Lock))), dara.UNLOCKED) 898 lockStatus[ProcID] = false 899 continue 900 } else if procchan[ProcID].Run == -1 { 901 // Local Runtime is either waiting for instructions about the next choice to be made 902 // or it has completed our action 903 if actionInstalled { 904 // Action has now been completed. Woohoo! 905 actionComplete = true 906 actionInstalled = false 907 events, coverage, hasBug = extractDataFromProc(ProcID, &event_index, ¤t_schedule, ¤t_context) 908 // Update the full coverage 909 explorer.CoverageUnion(allCoverage[ProcID], coverage) 910 explore_unit.UpdateActionCoverage(events, coverage) 911 // Check for crash events and property failures. Save the current schedule if there are any such failures 912 bugFound = bugFound || hasBug 913 // Keep holding the lock until we need to schedule a new action for this process. Don't release the lock. 914 } else { 915 // Need to install the action 916 actionInstalled = true 917 dprint(dara.INFO, func() {l.Println("Installing a new action")}) 918 procchan[ProcID].Run = -5 919 if action.Type == explorer.SCHEDULE { 920 // Either the action is to schedule a goroutine 921 procchan[ProcID].RunningRoutine = next_routine.Thread 922 } else if action.Type == explorer.TIMER { 923 // Or the action is to fire off a timer 924 // Need to add the timer firing event to the schedule manually. 925 procchan[ProcID].RunningRoutine.Gid = int(next_timer.ID) // timerID 926 procchan[ProcID].RunningRoutine.RoutineCount = -5 // Special Code 927 } 928 929 // Release the lock so that the installed action can be completed. 930 lockStatus[ProcID] = false 931 atomic.StoreInt32((*int32)(unsafe.Pointer(&(procchan[ProcID].Lock))), dara.UNLOCKED) 932 } 933 } else if procchan[ProcID].Run == -6 { 934 // This process is now blocked polling on network. Need to switch different processes! 935 isblocked[ProcID] = true 936 dprint(dara.DEBUG, func() {l.Println("Process", ProcID, "blocked polling on network")}) 937 if actionInstalled { 938 dprint(dara.INFO, func() { l.Println("action was installed")}) 939 // Action has now been completed. Woohoo! 940 actionComplete = true 941 actionInstalled = false 942 events, coverage, hasBug = extractDataFromProc(ProcID, &event_index, ¤t_schedule, ¤t_context) 943 // Update the full coverage 944 explorer.CoverageUnion(allCoverage[ProcID], coverage) 945 explore_unit.UpdateActionCoverage(events, coverage) 946 // Check for crash events and property failures. Save the current schedule if there are any such failures 947 bugFound = bugFound || hasBug 948 } 949 } else if procchan[ProcID].Run == -7 { 950 // This process is now available to be run woohoo! 951 isblocked[ProcID] = false 952 dprint(dara.INFO, func() {l.Println("Process", ProcID, "unblocked from network poll")}) 953 atomic.StoreInt32((*int32)(unsafe.Pointer(&(procchan[ProcID].Lock))), dara.UNLOCKED) 954 lockStatus[ProcID] = false 955 if actionSelected { 956 // Possibly schedule that action? 957 } 958 } 959 } 960 if CheckAllProcsDead() { 961 // All the processes are dead so we can move on to the next run/finish exploration 962 current_run += 1 963 explore_unit.RestartExploration() 964 restart = true 965 break 966 } 967 } 968 runend := time.Now().Sub(runStart) 969 run_times = append(run_times, runend.Seconds()) 970 l.Println("Total Number of actions:", numActions) 971 if bugFound { 972 numbuggyPaths++ 973 if firstBugRun == 0 { 974 firstBugRun = current_run 975 } 976 bugruns = append(bugruns, current_run) 977 } 978 } 979 // Notify the overlord that we are done with exploration 980 l.Println("Total number of buggy paths", numbuggyPaths) 981 l.Println("First bug run:", firstBugRun) 982 command_chan <- "finish" 983 response := <-response_chan 984 if response { 985 dprint(dara.INFO, func() {l.Println("Overlord finished execution successfully")}) 986 } else { 987 dprint(dara.INFO, func() {l.Println("Overlord didn't finish execution successfully")}) 988 } 989 dprint(dara.INFO, func() { l.Println("Exploration ended") }) 990 err := explore_unit.SaveVisitedSchedules() 991 if err != nil { 992 dprint(dara.FATAL, func() { l.Fatal(err) }) 993 } 994 file, err := os.Create("action_times-" + *strategy + ".csv") 995 if err != nil { 996 dprint(dara.FATAL, func() {l.Fatal(err)}) 997 } 998 defer file.Close() 999 for _, t := range all_times { 1000 file.WriteString(fmt.Sprintf("%f\n", t)) 1001 } 1002 rfile, err := os.Create("run_times-" + *strategy + ".csv") 1003 if err != nil { 1004 dprint(dara.FATAL, func() {l.Fatal(err)}) 1005 } 1006 defer rfile.Close() 1007 for _, t := range run_times { 1008 rfile.WriteString(fmt.Sprintf("%f\n", t)) 1009 } 1010 err = explore_unit.SaveStateSpaceResults("states-" + *strategy + ".csv") 1011 if err != nil { 1012 dprint(dara.FATAL, func() {l.Fatal(err)}) 1013 } 1014 bugrunfile, err := os.Create("bugruns-" + *strategy + ".csv") 1015 if err != nil { 1016 dprint(dara.FATAL, func(){l.Fatal(err)}) 1017 } 1018 defer bugrunfile.Close() 1019 for _, v := range bugruns { 1020 bugrunfile.WriteString(fmt.Sprintf("%d\n", v)) 1021 } 1022 } 1023 1024 // initGlobalScheduler initializes the global scheduler 1025 // This includes setting up the shared memory, initialising the global variables 1026 func initGlobalScheduler() { 1027 //Set up logger 1028 l = log.New(os.Stdout, "[Scheduler]", log.Lshortfile) 1029 checkargs() 1030 var err2 error 1031 prop_file := os.Getenv("PROP_FILE") 1032 if prop_file != "" { 1033 dprint(dara.INFO, func() { l.Println("Using property file", prop_file) }) 1034 checker, err2 = propchecker.NewChecker(prop_file) 1035 if err2 != nil { 1036 l.Fatal(err2) 1037 } 1038 } 1039 p, err = runtime.Mmap(nil, 1040 dara.CHANNELS*dara.DARAPROCSIZE, 1041 dara.PROT_READ|dara.PROT_WRITE, dara.MAP_SHARED, dara.DARAFD, 0) 1042 1043 if err != 0 { 1044 log.Println(err) 1045 l.Fatal(err) 1046 } 1047 1048 //Map control struct into shared memory 1049 procchan = (*[dara.CHANNELS]dara.DaraProc)(p) 1050 //rand.Seed(int64(time.Now().Nanosecond())) 1051 //var count int 1052 dprint(dara.INFO, func() { l.Println("Size of DaraProc struct is", unsafe.Sizeof(procchan[0]))}) 1053 for i := 0; i <= *procs; i++ { 1054 dprint(dara.DEBUG, func() {l.Println("Initializing", i) }) 1055 procchan[i].Lock = dara.LOCKED 1056 procchan[i].SyscallLock = dara.UNLOCKED 1057 procchan[i].Run = -1 1058 procchan[i].Syscall = -1 1059 procchan[i].CoverageIndex = 0 1060 } 1061 procStatus = make(map[int]ProcStatus) 1062 lockStatus = make(map[int]bool) 1063 linuxpids = make(map[int]uint64) 1064 allCoverage = make(map[int]*explorer.CovStats) 1065 clocks = make(map[int]*explorer.VirtualClock) 1066 for i := 1; i <= *procs; i++ { 1067 procStatus[i] = ALIVE 1068 lockStatus[i] = true // By default, global scheduler owns the locks on every local scheduler 1069 linuxpids[i] = uint64(0) 1070 covStats := make(explorer.CovStats) 1071 allCoverage[i] = &covStats 1072 clocks[i] = explorer.NewClock() 1073 } 1074 dprint(dara.INFO, func() { l.Println("Starting the Scheduler") }) 1075 1076 } 1077 1078 func main() { 1079 initGlobalScheduler() 1080 if *replay { 1081 dprint(dara.INFO, func() { l.Println("Started replaying") }) 1082 replaySchedule() 1083 dprint(dara.INFO, func() { l.Println("Finished replaying") }) 1084 } else if *record { 1085 dprint(dara.INFO, func() { l.Println("Started recording") }) 1086 recordSchedule() 1087 dprint(dara.INFO, func() { l.Println("Finished recording") }) 1088 } else if *explore { 1089 dprint(dara.INFO, func() { l.Println("Started exploring")}) 1090 exploreSchedule() 1091 dprint(dara.INFO, func() { l.Println("Finished exploring") }) 1092 } 1093 dprint(dara.INFO, func() { l.Println("Exiting scheduler") }) 1094 }