go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/server/dsmapper/dsmapperlite/dsmapperlite.go (about)

     1  // Copyright 2023 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package dsmapperlite implements an in-process datastore mapper.
    16  //
    17  // Unlike its bigger sibling dsmapper, it doesn't distribute mapping
    18  // operations across machines, but in exchange has a very simple API. There's
    19  // no need to install it as a server module or setup task queue tasks etc. Just
    20  // use it as a library.
    21  //
    22  // Useful for quickly visiting up to 100K entities.
    23  package dsmapperlite
    24  
    25  import (
    26  	"context"
    27  
    28  	"golang.org/x/sync/errgroup"
    29  
    30  	"go.chromium.org/luci/common/errors"
    31  	"go.chromium.org/luci/common/logging"
    32  	"go.chromium.org/luci/gae/service/datastore"
    33  
    34  	"go.chromium.org/luci/server/dsmapper/internal/splitter"
    35  )
    36  
    37  // Map passes all entities matching the query to the callback, in parallel,
    38  // in some random order.
    39  //
    40  // Runs up to `shards` number of parallel goroutines, where each one executes
    41  // a datastore query and passes the resulting entities to the callback (along
    42  // with the shard index). Each query fetches entities in `batchSize` pages
    43  // before handling them. The overall memory consumption is thus
    44  // `O(shards * batchSize * averageEntitySize)`.
    45  //
    46  // Within a shard, the callback is called sequentially, but different shards
    47  // are processed in parallel. If the callback needs to parallelize entity
    48  // processing more, it should manage its own goroutine pool and pass entities
    49  // to it.
    50  //
    51  // If the callback returns an error, Map aborts the entire operation ASAP (but
    52  // it may take some time to wind down). When this happens, the context passed
    53  // to the callback is canceled.
    54  func Map[E any](ctx context.Context, q *datastore.Query, shards, batchSize int, cb func(ctx context.Context, shard int, entity E) error) error {
    55  	logging.Infof(ctx, "Calculating ranges...")
    56  	ranges, err := splitter.SplitIntoRanges(ctx, q, splitter.Params{
    57  		Shards:  shards,
    58  		Samples: 500,
    59  	})
    60  	if err != nil {
    61  		return errors.Annotate(err, "failed to do the initial __scatter__ query").Err()
    62  	}
    63  	logging.Infof(ctx, "Querying %d ranges in parallel...", len(ranges))
    64  	eg, ctx := errgroup.WithContext(ctx)
    65  	for idx, r := range ranges {
    66  		idx := idx
    67  		rangedQ := r.Apply(q)
    68  		eg.Go(func() error {
    69  			return datastore.RunBatch(ctx, int32(batchSize), rangedQ, func(e E) error {
    70  				return cb(ctx, idx, e)
    71  			})
    72  		})
    73  	}
    74  	return eg.Wait()
    75  }