go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/swarming/server/model/filters.go (about)

     1  // Copyright 2024 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package model
    16  
    17  import (
    18  	"fmt"
    19  	"sort"
    20  	"strings"
    21  
    22  	"go.chromium.org/luci/common/data/stringset"
    23  	"go.chromium.org/luci/common/errors"
    24  	"go.chromium.org/luci/gae/service/datastore"
    25  
    26  	apipb "go.chromium.org/luci/swarming/proto/api_v2"
    27  )
    28  
    29  // SplitMode is a parameter for SplitForQuery and Apply methods.
    30  type SplitMode int
    31  
    32  const (
    33  	// SplitOptimally indicates to make as few split as possible.
    34  	//
    35  	// Some queries may end up using "OR" filters, but no more than one such
    36  	// filter per query. Such queries are still accepted by the datastore.
    37  	SplitOptimally SplitMode = 0
    38  
    39  	// SplitCompletely indicates to split a filter into elementary filters.
    40  	//
    41  	// Elementary filters do not have "OR" in them. This is used in testing to
    42  	// cover code paths that merge results of multiple queries. This is needed
    43  	// because the local testing environment current (as of Jan 2024) doesn't
    44  	// actually support OR queries at all.
    45  	SplitCompletely SplitMode = 1
    46  )
    47  
    48  // Filter represents a filter over the space of ["key:value"] tags.
    49  //
    50  // Conceptually it is a list of AND'ed together checks on values of tags. Each
    51  // such check compares each value of some particular tag to a set of allowed
    52  // values (often just one). The same tag key is allowed to show up more than
    53  // once. In that case there will be more than one filter on values of this tag
    54  // (see the example below).
    55  //
    56  // In API this filter is encoded by a list of `key:val1|val2|val3` pairs, where
    57  // keys are allowed to be repeated.
    58  //
    59  // For example, this filter:
    60  //
    61  //	["os:Linux", "os:Ubuntu", "zone:us-central|us-east"]
    62  //
    63  // Will match entities with following tags:
    64  //
    65  //	["os:Linux", "os:Ubuntu", "os:Ubuntu-20", "zone:us-central"]
    66  //	["os:Linux", "os:Ubuntu", "os:Ubuntu-22", "zone:us-easy"]
    67  //
    68  // But it will not match these entities:
    69  //
    70  //	["os:Linux", "os:Debian", "zone:us-central"]
    71  //	["os:Linux", "os:Ubuntu", "os:Ubuntu-22", "zone:us-west"]
    72  type Filter struct {
    73  	filters []perKeyFilter // sorted by key
    74  }
    75  
    76  // perKeyFilter is a filter that checks the value of a single tag key.
    77  type perKeyFilter struct {
    78  	key    string   // the tag key to check
    79  	values []string // allowed values (no dups, sorted)
    80  }
    81  
    82  // NewFilter parses a list of `("key", "val1|val2|val2")` pairs.
    83  //
    84  // Empty filter is possible (if `tags` are empty).
    85  func NewFilter(tags []*apipb.StringPair) (Filter, error) {
    86  	filter := Filter{
    87  		filters: make([]perKeyFilter, 0, len(tags)),
    88  	}
    89  
    90  	for _, tag := range tags {
    91  		if strings.TrimSpace(tag.Key) != tag.Key || tag.Key == "" {
    92  			return filter, errors.Reason("bad key %q", tag.Key).Err()
    93  		}
    94  
    95  		vals := strings.Split(tag.Value, "|")
    96  		deduped := stringset.New(len(vals))
    97  		for _, val := range vals {
    98  			if strings.TrimSpace(val) != val || val == "" {
    99  				return filter, errors.Reason("bad value for key %q: %q", tag.Key, tag.Value).Err()
   100  			}
   101  			deduped.Add(val)
   102  		}
   103  
   104  		filter.filters = append(filter.filters, perKeyFilter{
   105  			key:    tag.Key,
   106  			values: deduped.ToSortedSlice(),
   107  		})
   108  	}
   109  
   110  	sort.SliceStable(filter.filters, func(i, j int) bool {
   111  		return filter.filters[i].key < filter.filters[j].key
   112  	})
   113  
   114  	return filter, nil
   115  }
   116  
   117  // Pools is a list of all pools mentioned in the filter (if any).
   118  func (f Filter) Pools() []string {
   119  	pools := stringset.New(1) // there's usually only 1 pool
   120  	for _, f := range f.filters {
   121  		if f.key == "pool" {
   122  			pools.AddAll(f.values)
   123  		}
   124  	}
   125  	return pools.ToSortedSlice()
   126  }
   127  
   128  // IsEmpty is true if this filter doesn't filter anything.
   129  func (f Filter) IsEmpty() bool {
   130  	return len(f.filters) == 0
   131  }
   132  
   133  // SplitForQuery splits this filter into several simpler filters that can be
   134  // used in datastore queries, with their results merged.
   135  //
   136  // The unsplit filter is generally too complex for the datastore query planner
   137  // to handle using existing indexes (e.g. an index on `dimensions_flat` and
   138  // a composite index on `(dimensions_flat, composite)` pair when used for
   139  // BotInfo queries).
   140  //
   141  // Unfortunately due to datastore limits we can't just add all necessary
   142  // composite indexes (like `(dimensions_flat, dimensions_flat, composite)` one).
   143  // Since `dimensions_flat` is a repeated property, this results in too many
   144  // indexed permutations of values, blowing up this index. Possible workarounds
   145  // require changing the layout of BotInfo entities in datastore, but that would
   146  // require imposing limits on public Swarming API (basically, we'll need to
   147  // predefine what dimension keys are worth indexing and what are not; currently
   148  // all are indexed).
   149  //
   150  // Instead we split the query into N subqueries, run them in parallel and merge
   151  // results locally. This is relatively expensive and scales poorly, but we need
   152  // to do that only for complex queries that use multiple OR property filters.
   153  // They are relatively rare.
   154  //
   155  // If the original filter is empty, returns one empty filter as the output.
   156  func (f Filter) SplitForQuery(mode SplitMode) []Filter {
   157  	// Count how many OR-ed property filters we have, find the smallest one. We'll
   158  	// use it as a "pivot" for splitting the original filter into smaller filters.
   159  	// That way we'll have the smallest number of splits.
   160  	multiValCount := 0
   161  	pivotIdx := 0
   162  	for idx, filter := range f.filters {
   163  		if vals := len(filter.values); vals > 1 {
   164  			multiValCount += 1
   165  			if multiValCount == 1 || vals < len(f.filters[pivotIdx].values) {
   166  				pivotIdx = idx
   167  			}
   168  		}
   169  	}
   170  
   171  	var maxMultiVal int
   172  	switch mode {
   173  	case SplitOptimally:
   174  		maxMultiVal = 1 // support at most one OR property filter
   175  	case SplitCompletely:
   176  		maxMultiVal = 0 // support no OR property filters at all
   177  	default:
   178  		panic(fmt.Sprintf("unknown split mode %d", mode))
   179  	}
   180  	if multiValCount <= maxMultiVal {
   181  		return []Filter{f}
   182  	}
   183  
   184  	// Split into simpler filters around the pivot eliminating this particular OR.
   185  	// Keep simplifying the result recursively until we get a list of filters
   186  	// where each one can be handled by the datastore natively.
   187  	pivotVals := f.filters[pivotIdx].values
   188  	simplified := make([]Filter, 0, len(pivotVals))
   189  	for _, pivotVal := range pivotVals {
   190  		subfilter := Filter{
   191  			filters: make([]perKeyFilter, 0, len(f.filters)),
   192  		}
   193  		for idx, filter := range f.filters {
   194  			if idx == pivotIdx {
   195  				// Pivot! Pivot!
   196  				subfilter.filters = append(subfilter.filters, perKeyFilter{
   197  					key:    filter.key,
   198  					values: []string{pivotVal},
   199  				})
   200  			} else {
   201  				subfilter.filters = append(subfilter.filters, filter)
   202  			}
   203  		}
   204  		simplified = append(simplified, subfilter.SplitForQuery(mode)...)
   205  	}
   206  
   207  	return simplified
   208  }
   209  
   210  // Apply applies this filter to a query, returning (potentially) multiple
   211  // queries.
   212  //
   213  // Results of these queries must be merged locally (e.g. via datastore.RunMulti)
   214  // to get the final filtered result.
   215  //
   216  // `field` is the datastore entity field to apply the filter on. It should be
   217  // a multi-valued field with values of form "key:value".
   218  //
   219  // If the filter is empty, returns a list with the original query as is.
   220  func (f Filter) Apply(q *datastore.Query, field string, mode SplitMode) []*datastore.Query {
   221  	split := f.SplitForQuery(mode)
   222  	out := make([]*datastore.Query, 0, len(split))
   223  	for _, simpleFilter := range split {
   224  		simpleQ := q
   225  		for _, f := range simpleFilter.filters {
   226  			if len(f.values) == 1 {
   227  				simpleQ = simpleQ.Eq(field, fmt.Sprintf("%s:%s", f.key, f.values[0]))
   228  			} else {
   229  				pairs := make([]any, len(f.values))
   230  				for i, v := range f.values {
   231  					pairs[i] = fmt.Sprintf("%s:%s", f.key, v)
   232  				}
   233  				simpleQ = simpleQ.In(field, pairs...)
   234  			}
   235  		}
   236  		out = append(out, simpleQ)
   237  	}
   238  	return out
   239  }