github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/colflow/vectorized_flow_shutdown_test.go

// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package colflow_test

import (
	"context"
	"fmt"
	"math"
	"strconv"
	"sync"
	"testing"
	"time"

	"github.com/cockroachdb/cockroach/pkg/col/coldata"
	"github.com/cockroachdb/cockroach/pkg/col/coldatatestutils"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/sql/colexec"
	"github.com/cockroachdb/cockroach/pkg/sql/colexecbase"
	"github.com/cockroachdb/cockroach/pkg/sql/colflow/colrpc"
	"github.com/cockroachdb/cockroach/pkg/sql/colmem"
	"github.com/cockroachdb/cockroach/pkg/sql/execinfra"
	"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
	"github.com/cockroachdb/cockroach/pkg/sql/types"
	"github.com/cockroachdb/cockroach/pkg/testutils"
	"github.com/cockroachdb/cockroach/pkg/testutils/colcontainerutils"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
	"github.com/cockroachdb/cockroach/pkg/util/mon"
	"github.com/cockroachdb/cockroach/pkg/util/randutil"
	"github.com/cockroachdb/cockroach/pkg/util/stop"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
	"github.com/cockroachdb/errors"
	"github.com/stretchr/testify/require"
)

// shutdownScenario describes how the materializer is shut down in the test.
type shutdownScenario struct {
	string
}

var (
	consumerDone      = shutdownScenario{"ConsumerDone"}
	consumerClosed    = shutdownScenario{"ConsumerClosed"}
	shutdownScenarios = []shutdownScenario{consumerDone, consumerClosed}
)

// callbackCloser implements colexec.IdempotentCloser by calling the provided
// callback.
type callbackCloser struct {
	closeCb func() error
}

func (c callbackCloser) IdempotentClose(_ context.Context) error {
	return c.closeCb()
}

// TestVectorizedFlowShutdown tests that closing the materializer correctly
// closes all the infrastructure corresponding to the flow ending in that
// materializer. Namely:
// - on a remote node, it creates an exec.HashRouter with 3 outputs (with an
// Outbox corresponding to each output) as well as 3 standalone Outboxes;
// - on a local node, it creates 6 exec.Inboxes that feed into an unordered
// synchronizer which then outputs all the data into a materializer.
// The resulting scheme looks as follows:
//
//            Remote Node             |                Local Node
//                                    |
//             -> output -> Outbox -> | -> Inbox -> |
//            |                       |
// Hash Router -> output -> Outbox -> | -> Inbox -> |
//            |                       |
//             -> output -> Outbox -> | -> Inbox -> |
//                                    |              -> Synchronizer -> materializer
//                          Outbox -> | -> Inbox -> |
//                                    |
//                          Outbox -> | -> Inbox -> |
//                                    |
//                          Outbox -> | -> Inbox -> |
//
// Also, with 50% probability, another remote node with the chain of an Outbox
// and Inbox is placed between the synchronizer and materializer.
// The resulting scheme then looks as follows:
//
//            Remote Node             |   Another Remote Node   |          Local Node
//                                    |                         |
//             -> output -> Outbox -> | -> Inbox ->             |
//            |                       |            |            |
// Hash Router -> output -> Outbox -> | -> Inbox ->             |
//            |                       |            |            |
//             -> output -> Outbox -> | -> Inbox ->             |
//                                    |            | -> Synchronizer -> Outbox -> | -> Inbox -> materializer
//                          Outbox -> | -> Inbox ->             |
//                                    |            |            |
//                          Outbox -> | -> Inbox ->             |
//                                    |            |            |
//                          Outbox -> | -> Inbox ->             |
//
// Remote nodes are simulated by having separate contexts and separate outbox
// registries.
//
// Additionally, all Outboxes have a single metadata source. In the ConsumerDone
// shutdown scenario, we check that the metadata has been successfully
// propagated from all of the metadata sources.
func TestVectorizedFlowShutdown(t *testing.T) {
	defer leaktest.AfterTest(t)()

	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())
	_, mockServer, addr, err := execinfrapb.StartMockDistSQLServer(
		hlc.NewClock(hlc.UnixNano, time.Nanosecond), stopper, execinfra.StaticNodeID,
	)
	require.NoError(t, err)
	dialer := &execinfrapb.MockDialer{Addr: addr}
	defer dialer.Close()

	queueCfg, cleanup := colcontainerutils.NewTestingDiskQueueCfg(t, true /* inMem */)
	defer cleanup()

	for run := 0; run < 10; run++ {
		for _, shutdownOperation := range shutdownScenarios {
			t.Run(fmt.Sprintf("shutdownScenario=%s", shutdownOperation.string), func(t *testing.T) {
				ctxLocal := context.Background()
				ctxRemote, cancelRemote := context.WithCancel(context.Background())
				// The linter says there is a possibility of a "context leak" because
				// the cancelRemote variable may not be used, so we defer the call to
				// it. This does not change anything about the test since we're
				// blocking on the wait group and we will call cancelRemote() below,
				// so this defer is actually a noop.
				defer cancelRemote()
				st := cluster.MakeTestingClusterSettings()
				evalCtx := tree.MakeTestingEvalContext(st)
				defer evalCtx.Stop(ctxLocal)
				flowCtx := &execinfra.FlowCtx{
					EvalCtx: &evalCtx,
					Cfg:     &execinfra.ServerConfig{Settings: st},
				}
				rng, _ := randutil.NewPseudoRand()
				var (
					err             error
					wg              sync.WaitGroup
					typs            = []*types.T{types.Int}
					hashRouterInput = coldatatestutils.NewRandomDataOp(
						testAllocator,
						rng,
						coldatatestutils.RandomDataOpArgs{
							DeterministicTyps: typs,
							// Set a high number of batches to ensure that the HashRouter
							// is very far from being finished when the flow is shut down.
							NumBatches: math.MaxInt64,
							Selection:  true,
						},
					)
					numHashRouterOutputs        = 3
					numInboxes                  = numHashRouterOutputs + 3
					inboxes                     = make([]*colrpc.Inbox, 0, numInboxes+1)
					handleStreamErrCh           = make([]chan error, numInboxes+1)
					synchronizerInputs          = make([]colexecbase.Operator, 0, numInboxes)
					materializerMetadataSources = make([]execinfrapb.MetadataSource, 0, numInboxes+1)
					streamID                    = 0
					addAnotherRemote            = rng.Float64() < 0.5
				)

				// Create an allocator for each output.
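				// Each allocator is backed by its own memory account, and each hash
				// router output also gets its own disk account to use if it needs to
				// spill to disk; all accounts are closed via the deferred Close calls
				// when the subtest finishes.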
				allocators := make([]*colmem.Allocator, numHashRouterOutputs)
				diskAccounts := make([]*mon.BoundAccount, numHashRouterOutputs)
				for i := range allocators {
					acc := testMemMonitor.MakeBoundAccount()
					defer acc.Close(ctxRemote)
					allocators[i] = colmem.NewAllocator(ctxRemote, &acc, testColumnFactory)
					diskAcc := testDiskMonitor.MakeBoundAccount()
					diskAccounts[i] = &diskAcc
					defer diskAcc.Close(ctxRemote)
				}
				hashRouter, hashRouterOutputs := colexec.NewHashRouter(
					allocators, hashRouterInput, typs, []uint32{0}, 64<<20, /* 64 MiB */
					queueCfg, &colexecbase.TestingSemaphore{}, diskAccounts, nil, /* toClose */
				)
				for i := 0; i < numInboxes; i++ {
					inboxMemAccount := testMemMonitor.MakeBoundAccount()
					defer inboxMemAccount.Close(ctxLocal)
					inbox, err := colrpc.NewInbox(
						colmem.NewAllocator(ctxLocal, &inboxMemAccount, testColumnFactory), typs, execinfrapb.StreamID(streamID),
					)
					require.NoError(t, err)
					inboxes = append(inboxes, inbox)
					materializerMetadataSources = append(materializerMetadataSources, inbox)
					synchronizerInputs = append(synchronizerInputs, colexecbase.Operator(inbox))
				}
				synchronizer := colexec.NewParallelUnorderedSynchronizer(synchronizerInputs, typs, &wg)
				flowID := execinfrapb.FlowID{UUID: uuid.MakeV4()}

				// idToClosed keeps track of whether Close was called for a given id.
				idToClosed := struct {
					syncutil.Mutex
					mapping map[int]bool
				}{}
				idToClosed.mapping = make(map[int]bool)
				runOutboxInbox := func(
					ctx context.Context,
					cancelFn context.CancelFunc,
					outboxMemAcc *mon.BoundAccount,
					outboxInput colexecbase.Operator,
					inbox *colrpc.Inbox,
					id int,
					outboxMetadataSources []execinfrapb.MetadataSource,
				) {
					idToClosed.Lock()
					idToClosed.mapping[id] = false
					idToClosed.Unlock()
					outbox, err := colrpc.NewOutbox(colmem.NewAllocator(ctx, outboxMemAcc, testColumnFactory), outboxInput, typs,
						append(outboxMetadataSources, execinfrapb.CallbackMetadataSource{
							DrainMetaCb: func(ctx context.Context) []execinfrapb.ProducerMetadata {
								return []execinfrapb.ProducerMetadata{{Err: errors.Errorf("%d", id)}}
							},
						},
						), []colexec.IdempotentCloser{callbackCloser{closeCb: func() error {
							idToClosed.Lock()
							idToClosed.mapping[id] = true
							idToClosed.Unlock()
							return nil
						}}})

					require.NoError(t, err)
					wg.Add(1)
					go func(id int) {
						outbox.Run(ctx, dialer, execinfra.StaticNodeID, flowID, execinfrapb.StreamID(id), cancelFn)
						wg.Done()
					}(id)

					require.NoError(t, err)
					serverStreamNotification := <-mockServer.InboundStreams
					serverStream := serverStreamNotification.Stream
					handleStreamErrCh[id] = make(chan error, 1)
					doneFn := func() { close(serverStreamNotification.Donec) }
					wg.Add(1)
					go func(id int, stream execinfrapb.DistSQL_FlowStreamServer, doneFn func()) {
						handleStreamErrCh[id] <- inbox.RunWithStream(stream.Context(), stream)
						doneFn()
						wg.Done()
					}(id, serverStream, doneFn)
				}

				wg.Add(1)
				go func() {
					hashRouter.Run(ctxRemote)
					wg.Done()
				}()
				for i := 0; i < numInboxes; i++ {
					var outboxMetadataSources []execinfrapb.MetadataSource
					outboxMemAccount := testMemMonitor.MakeBoundAccount()
					defer outboxMemAccount.Close(ctxRemote)
					if i < numHashRouterOutputs {
						if i == 0 {
							// Only one outbox should drain the hash router.
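							// The hash router itself is an execinfrapb.MetadataSource, so
							// adding it to a single outbox's metadata sources ensures its
							// metadata is drained exactly once.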
							outboxMetadataSources = append(outboxMetadataSources, hashRouter)
						}
						runOutboxInbox(ctxRemote, cancelRemote, &outboxMemAccount, hashRouterOutputs[i], inboxes[i], streamID, outboxMetadataSources)
					} else {
						sourceMemAccount := testMemMonitor.MakeBoundAccount()
						defer sourceMemAccount.Close(ctxRemote)
						remoteAllocator := colmem.NewAllocator(ctxRemote, &sourceMemAccount, testColumnFactory)
						batch := remoteAllocator.NewMemBatch(typs)
						batch.SetLength(coldata.BatchSize())
						runOutboxInbox(ctxRemote, cancelRemote, &outboxMemAccount, colexecbase.NewRepeatableBatchSource(remoteAllocator, batch, typs), inboxes[i], streamID, outboxMetadataSources)
					}
					streamID++
				}

				var materializerInput colexecbase.Operator
				ctxAnotherRemote, cancelAnotherRemote := context.WithCancel(context.Background())
				if addAnotherRemote {
					// Add another "remote" node to the flow.
					inboxMemAccount := testMemMonitor.MakeBoundAccount()
					defer inboxMemAccount.Close(ctxAnotherRemote)
					inbox, err := colrpc.NewInbox(
						colmem.NewAllocator(ctxAnotherRemote, &inboxMemAccount, testColumnFactory),
						typs, execinfrapb.StreamID(streamID),
					)
					require.NoError(t, err)
					inboxes = append(inboxes, inbox)
					outboxMemAccount := testMemMonitor.MakeBoundAccount()
					defer outboxMemAccount.Close(ctxAnotherRemote)
					runOutboxInbox(ctxAnotherRemote, cancelAnotherRemote, &outboxMemAccount, synchronizer, inbox, streamID, materializerMetadataSources)
					streamID++
					// There is now only a single Inbox on the "local" node which is the
					// only metadata source.
					materializerMetadataSources = []execinfrapb.MetadataSource{inbox}
					materializerInput = inbox
				} else {
					materializerInput = synchronizer
				}

				ctxLocal, cancelLocal := context.WithCancel(ctxLocal)
				materializerCalledClose := false
				materializer, err := colexec.NewMaterializer(
					flowCtx,
					1, /* processorID */
					materializerInput,
					typs,
					nil, /* output */
					materializerMetadataSources,
					[]colexec.IdempotentCloser{callbackCloser{closeCb: func() error {
						materializerCalledClose = true
						return nil
					}}}, /* toClose */
					nil, /* outputStatsToTrace */
					func() context.CancelFunc { return cancelLocal },
				)
				require.NoError(t, err)
				materializer.Start(ctxLocal)

				for i := 0; i < 10; i++ {
					row, meta := materializer.Next()
					require.NotNil(t, row)
					require.Nil(t, meta)
				}
				switch shutdownOperation {
				case consumerDone:
					materializer.ConsumerDone()
					receivedMetaFromID := make([]bool, streamID)
					metaCount := 0
					for {
						row, meta := materializer.Next()
						require.Nil(t, row)
						if meta == nil {
							break
						}
						metaCount++
						require.NotNil(t, meta.Err)
						id, err := strconv.Atoi(meta.Err.Error())
						require.NoError(t, err)
						require.False(t, receivedMetaFromID[id])
						receivedMetaFromID[id] = true
					}
					require.Equal(t, streamID, metaCount, fmt.Sprintf("received metadata from Outbox %+v", receivedMetaFromID))
				case consumerClosed:
					materializer.ConsumerClosed()
				}

				// When Outboxes are set up through vectorizedFlowCreator, the latter
				// keeps track of how many outboxes are on the node. When the last one
				// exits (and if there is no materializer on that node),
				// vectorizedFlowCreator will cancel the flow context of the node. To
				// simulate this, we manually cancel the contexts of both remote nodes.
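				// Note that cancelAnotherRemote is safe to call unconditionally: when
				// addAnotherRemote is false, nothing runs in ctxAnotherRemote, so the
				// cancellation is a noop.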
				cancelRemote()
				cancelAnotherRemote()

				for i := range inboxes {
					err = <-handleStreamErrCh[i]
					// We should get either no error or a context cancellation error.
					if err != nil {
						require.True(t, testutils.IsError(err, "context canceled"), err)
					}
				}
				wg.Wait()
				// Ensure all the outboxes called Close.
				for id, closed := range idToClosed.mapping {
					require.True(t, closed, "outbox with ID %d did not call Close on closers", id)
				}
				require.True(t, materializerCalledClose)
			})
		}
	}
}