go.mondoo.com/cnquery@v0.0.0-20231005093811-59568235f6ea/mql/internal/execution_manager.go (about) 1 // Copyright (c) Mondoo, Inc. 2 // SPDX-License-Identifier: BUSL-1.1 3 4 package internal 5 6 import ( 7 "errors" 8 "sync" 9 "time" 10 11 "github.com/rs/zerolog/log" 12 "go.mondoo.com/cnquery/llx" 13 ) 14 15 type executionManager struct { 16 schema llx.Schema 17 runtime llx.Runtime 18 // runQueue is the channel the execution manager will read 19 // items that need to be run from 20 runQueue chan runQueueItem 21 // resultChan is the channel the execution manager will write 22 // results to 23 resultChan chan *llx.RawResult 24 // errChan is used to signal an unrecoverable error. The execution 25 // manager writes to this channel 26 errChan chan error 27 // timeout is the amount of time the executor will wait for a query 28 // to return all the results after 29 timeout time.Duration 30 // stopChan is a channel that is closed when a stop is requested 31 stopChan chan struct{} 32 wg sync.WaitGroup 33 } 34 35 type runQueueItem struct { 36 codeBundle *llx.CodeBundle 37 props map[string]*llx.Result 38 } 39 40 func newExecutionManager(schema llx.Schema, runtime llx.Runtime, runQueue chan runQueueItem, 41 resultChan chan *llx.RawResult, timeout time.Duration, 42 ) *executionManager { 43 return &executionManager{ 44 runQueue: runQueue, 45 schema: schema, 46 runtime: runtime, 47 resultChan: resultChan, 48 errChan: make(chan error, 1), 49 stopChan: make(chan struct{}), 50 timeout: timeout, 51 } 52 } 53 54 func (em *executionManager) Start() { 55 em.wg.Add(1) 56 go func() { 57 defer em.wg.Done() 58 for { 59 // Prioritize stopChan 60 select { 61 case <-em.stopChan: 62 return 63 default: 64 } 65 66 select { 67 case item, ok := <-em.runQueue: 68 if !ok { 69 return 70 } 71 props := make(map[string]*llx.Primitive) 72 errMsg := "" 73 for k, r := range item.props { 74 if r.Error != "" { 75 // This case is tricky to handle. If we cannot run the query at 76 // all, its unclear what to report for the datapoint. If we 77 // report them in, then another query cant report them, at least 78 // with the way things are right now. If we don't report them, 79 // things will wait around for datapoint results that will never 80 // arrive. 81 errMsg = "property " + k + " errored: " + r.Error 82 break 83 } 84 props[k] = r.Data 85 } 86 87 if err := em.executeCodeBundle(item.codeBundle, props, errMsg); err != nil { 88 // an error is returned if we cannot execute a query. This happens 89 // if the lumi runtime doesn't report back expected data, there is 90 // a problem with the lumi runtime, or the query is somehow invalid. 91 // We need to give up here because the underlying runtime is in a bad 92 // state and/or we will not be able to report certain datapoints and 93 // we cannot be confident about which ones 94 select { 95 case em.errChan <- err: 96 default: 97 } 98 return 99 } 100 case <-em.stopChan: 101 return 102 } 103 } 104 }() 105 } 106 107 func (em *executionManager) Err() chan error { 108 return em.errChan 109 } 110 111 func (em *executionManager) Stop() { 112 close(em.stopChan) 113 em.wg.Wait() 114 } 115 116 func (em *executionManager) executeCodeBundle(codeBundle *llx.CodeBundle, props map[string]*llx.Primitive, errMsg string) error { 117 wg := NewWaitGroup() 118 119 sendResult := func(rr *llx.RawResult) { 120 log.Trace().Str("codeID", rr.CodeID).Msg("received result from executor") 121 wg.Done(rr.CodeID) 122 select { 123 case em.resultChan <- rr: 124 case <-em.stopChan: 125 } 126 } 127 128 checksums := map[string]struct{}{} 129 // Find the list of things we must wait for before execution of this codebundle is considered done 130 for _, checksum := range CodepointChecksums(codeBundle) { 131 if _, ok := checksums[checksum]; !ok { 132 checksums[checksum] = struct{}{} 133 // We must use a synchronization primitive because the llx.Run callback 134 // is not guaranteed to happen in a single thread 135 wg.Add(checksum) 136 if errMsg != "" { 137 // TODO: this is not entirely correct when looking at things as a whole. 138 // Its possible that another query executing will produce a non error. 139 // However, datapoint nodes take the first data that was reported. This 140 // issue exists in general for any query that errors 141 sendResult(&llx.RawResult{ 142 CodeID: checksum, 143 Data: &llx.RawData{ 144 Error: errors.New(errMsg), 145 }, 146 }) 147 } 148 } 149 } 150 151 if errMsg != "" { 152 return nil 153 } 154 155 var executor iExecutor 156 var err error 157 var codeID string 158 159 codeID = codeBundle.CodeV2.GetId() 160 log.Debug().Str("qrid", codeID).Msg("starting query execution") 161 defer func() { 162 log.Debug().Str("qrid", codeID).Msg("finished query execution") 163 }() 164 165 // TODO(jaym): sendResult may not be correct. We may need to fill in the 166 // checksum 167 x, err := llx.NewExecutorV2(codeBundle.CodeV2, em.runtime, props, sendResult) 168 if err == nil { 169 x.Run() 170 } 171 executor = x 172 173 if err != nil { 174 return err 175 } 176 177 execDoneChan := make(chan struct{}) 178 go func() { 179 wg.Wait() 180 close(execDoneChan) 181 }() 182 183 var errOut error 184 185 timer := time.NewTimer(em.timeout) 186 defer timer.Stop() 187 select { 188 case <-timer.C: 189 log.Error().Dur("timeout", em.timeout).Str("qrid", codeID).Msg("execution timed out") 190 errOut = errQueryTimeout 191 case <-execDoneChan: 192 } 193 194 unreported := wg.Decommission() 195 if len(unreported) > 0 { 196 log.Warn().Strs("missing", unreported).Str("qrid", codeID).Msg("unreported datapoints") 197 } 198 199 if err := executor.Unregister(); err != nil { 200 return err 201 } 202 203 return errOut 204 } 205 206 var errQueryTimeout = errors.New("query execution timed out") 207 208 type iExecutor interface { 209 Unregister() error 210 }