github.com/google/syzkaller@v0.0.0-20251211124644-a066d2bc4b02/syz-cluster/controller/processor.go (about) 1 // Copyright 2024 syzkaller project authors. All rights reserved. 2 // Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. 3 4 package main 5 6 import ( 7 "bytes" 8 "context" 9 "fmt" 10 "log" 11 "sync" 12 "time" 13 14 "github.com/google/syzkaller/syz-cluster/pkg/api" 15 "github.com/google/syzkaller/syz-cluster/pkg/app" 16 "github.com/google/syzkaller/syz-cluster/pkg/blob" 17 "github.com/google/syzkaller/syz-cluster/pkg/db" 18 "github.com/google/syzkaller/syz-cluster/pkg/workflow" 19 "golang.org/x/sync/errgroup" 20 ) 21 22 type SeriesProcessor struct { 23 blobStorage blob.Storage 24 seriesRepo *db.SeriesRepository 25 sessionRepo *db.SessionRepository 26 sessionTestRepo *db.SessionTestRepository 27 workflows workflow.Service 28 dbPollInterval time.Duration 29 parallelWorkflows int 30 } 31 32 func NewSeriesProcessor(env *app.AppEnvironment, cfg *app.AppConfig) *SeriesProcessor { 33 workflows, err := workflow.NewArgoService() 34 if err != nil { 35 app.Fatalf("failed to initialize workflows: %v", err) 36 } 37 return &SeriesProcessor{ 38 blobStorage: env.BlobStorage, 39 seriesRepo: db.NewSeriesRepository(env.Spanner), 40 sessionRepo: db.NewSessionRepository(env.Spanner), 41 sessionTestRepo: db.NewSessionTestRepository(env.Spanner), 42 dbPollInterval: time.Minute, 43 workflows: workflows, 44 parallelWorkflows: cfg.ParallelWorkflows, 45 } 46 } 47 48 func (sp *SeriesProcessor) Loop(ctx context.Context) error { 49 var wg sync.WaitGroup 50 defer wg.Wait() 51 52 ch := make(chan *db.Session, 1) 53 wg.Add(1) 54 go func() { 55 defer wg.Done() 56 sp.seriesRunner(ctx, ch) 57 }() 58 // First pick up the previously running sessions. 59 activeSessions, err := sp.sessionRepo.ListRunning(ctx) 60 if err != nil { 61 return err 62 } 63 log.Printf("queried %d unfinished sessions", len(activeSessions)) 64 for _, session := range activeSessions { 65 ch <- session 66 } 67 // Then, monitor the DB for the new series. 68 wg.Add(1) 69 go func() { 70 defer wg.Done() 71 sp.streamSeries(ctx, ch) 72 close(ch) 73 }() 74 return nil 75 } 76 77 func (sp *SeriesProcessor) streamSeries(ctx context.Context, ch chan<- *db.Session) { 78 var next *db.NextSession 79 for { 80 select { 81 case <-ctx.Done(): 82 return 83 case <-time.After(sp.dbPollInterval): 84 } 85 if len(ch) > 0 { 86 // There are still series to be picked, no need to query the DB. 87 continue 88 } 89 var err error 90 var list []*db.Session 91 list, next, err = sp.sessionRepo.ListWaiting(ctx, next, cap(ch)) 92 if err != nil { 93 app.Errorf("failed to query series: %v", err) 94 continue 95 } 96 for _, session := range list { 97 ch <- session 98 } 99 } 100 } 101 102 func (sp *SeriesProcessor) seriesRunner(ctx context.Context, ch <-chan *db.Session) { 103 var eg errgroup.Group 104 defer eg.Wait() 105 106 eg.SetLimit(sp.parallelWorkflows) 107 for { 108 var session *db.Session 109 select { 110 case session = <-ch: 111 if session == nil { 112 return 113 } 114 case <-ctx.Done(): 115 return 116 } 117 log.Printf("scheduled session %q for series %q", session.ID, session.SeriesID) 118 eg.Go(func() error { 119 log.Printf("started processing session %q", session.ID) 120 sp.handleSession(ctx, session) 121 log.Printf("finished processing session %q", session.ID) 122 return nil 123 }) 124 } 125 } 126 127 func (sp *SeriesProcessor) handleSession(ctx context.Context, session *db.Session) { 128 // TODO: set some sane deadline or just track indefinitely? 129 pollPeriod := sp.workflows.PollPeriod() 130 for { 131 select { 132 case <-time.After(pollPeriod): 133 case <-ctx.Done(): 134 return 135 } 136 status, workflowLog, err := sp.workflows.Status(session.ID) 137 if err != nil { 138 app.Errorf("failed to query workflow %q status: %v", session.ID, err) 139 continue 140 } 141 if len(workflowLog) > 0 { 142 err := sp.updateSessionLog(ctx, session, workflowLog) 143 if err != nil { 144 app.Errorf("failed to update session log: %v", err) 145 } 146 } 147 switch status { 148 case workflow.StatusNotFound: 149 log.Printf("scheduling a workflow for %q", session.ID) 150 err := sp.sessionRepo.Start(ctx, session.ID) 151 if err == db.ErrSessionAlreadyStarted { 152 // It may happen if the service was restarted right between the moment we updated the DB 153 // and actually started the workflow. 154 log.Printf("session %q was already marked as started, but there's no actual workflow", session.ID) 155 } else if err != nil { 156 app.Errorf("failed to mark session started: %v", err) 157 break 158 } 159 err = sp.workflows.Start(session.ID) 160 if err != nil { 161 app.Errorf("failed to start a workflow: %v", err) 162 } 163 case workflow.StatusFinished, workflow.StatusFailed: 164 log.Printf("workflow for %q completed (status=%q), mark the session finished", session.ID, status) 165 err := sp.stopRunningTests(ctx, session.ID) 166 if err != nil { 167 app.Errorf("failed to check running tests for %s: %v", session.ID, err) 168 } 169 // TODO: StatusFailed needs a different handling. 170 err = sp.sessionRepo.Update(ctx, session.ID, func(session *db.Session) error { 171 session.SetFinishedAt(time.Now()) 172 return nil 173 }) 174 if err == nil { 175 // Nothing to do here anymore. 176 return 177 } 178 // Let's hope the error was transient. 179 app.Errorf("failed to update session %q: %v", session.ID, err) 180 case workflow.StatusRunning: 181 // Let's keep on tracking. 182 continue 183 default: 184 panic("unexpected workflow status: " + status) 185 } 186 } 187 } 188 189 // The session steps are expected to report that they are finished themselves. 190 // If the workflow was aborted for some external reason (or the session step crashed/timed out), 191 // the step may remain forever in the "Running" state. 192 // Go through such steps and mark them as finished (with an error). 193 func (sp *SeriesProcessor) stopRunningTests(ctx context.Context, sessionID string) error { 194 tests, err := sp.sessionTestRepo.BySessionRaw(ctx, sessionID) 195 if err != nil { 196 return fmt.Errorf("failed to query session steps: %w", err) 197 } 198 for _, test := range tests { 199 if test.Result != api.TestRunning { 200 continue 201 } 202 log.Printf("session %q is finished, but the test %q is running: marking it stopped", 203 sessionID, test.TestName) 204 err = sp.sessionTestRepo.InsertOrUpdate(ctx, test, func(entity *db.SessionTest) { 205 if entity.Result == api.TestRunning { 206 entity.Result = api.TestError 207 } 208 }) 209 if err != nil { 210 return fmt.Errorf("failed to update the step %q: %w", test.TestName, err) 211 } 212 } 213 return nil 214 } 215 216 func (sp *SeriesProcessor) updateSessionLog(ctx context.Context, session *db.Session, log []byte) error { 217 logURI, err := sp.blobStorage.Write(bytes.NewReader(log), "Session", session.ID, "log") 218 if err != nil { 219 return fmt.Errorf("failed to save the log: %w", err) 220 } 221 return sp.sessionRepo.Update(ctx, session.ID, func(session *db.Session) error { 222 session.LogURI = logURI 223 return nil 224 }) 225 }