github.com/m3db/m3@v1.5.0/src/query/functions/aggregation/count_values.go (about)

     1  // Copyright (c) 2018 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package aggregation
    22  
    23  import (
    24  	"fmt"
    25  	"math"
    26  
    27  	"github.com/m3db/m3/src/query/block"
    28  	"github.com/m3db/m3/src/query/executor/transform"
    29  	"github.com/m3db/m3/src/query/functions/utils"
    30  	"github.com/m3db/m3/src/query/models"
    31  	"github.com/m3db/m3/src/query/parser"
    32  	"github.com/m3db/m3/src/query/util"
    33  )
    34  
const (
	// CountValuesType is the operation type name for count_values, which counts,
	// per step, how many non-NaN elements share the same value.
	CountValuesType = "count_values"
)
    39  
    40  // NewCountValuesOp creates a new count values operation.
    41  func NewCountValuesOp(
    42  	opType string,
    43  	params NodeParams,
    44  ) (parser.Params, error) {
    45  	if opType != CountValuesType {
    46  		return baseOp{}, fmt.Errorf("operator not supported: %s", opType)
    47  	}
    48  
    49  	return newCountValuesOp(params, opType), nil
    50  }
    51  
// countValuesOp stores required properties for count values ops.
type countValuesOp struct {
	// params holds the node parameters; ProcessBlock reads the label name
	// (StringParameter) and the grouping settings (MatchingTags, Without) from it.
	params NodeParams
	// opType is the operation name reported by OpType; it is also used as the
	// name of every output series.
	opType string
}
    57  
    58  func (o countValuesOp) OpType() string {
    59  	return o.opType
    60  }
    61  
    62  func (o countValuesOp) String() string {
    63  	return fmt.Sprintf("type: %s", o.OpType())
    64  }
    65  
    66  func (o countValuesOp) Node(
    67  	controller *transform.Controller,
    68  	_ transform.Options,
    69  ) transform.OpNode {
    70  	return &countValuesNode{
    71  		op:         o,
    72  		controller: controller,
    73  	}
    74  }
    75  
    76  func newCountValuesOp(params NodeParams, opType string) countValuesOp {
    77  	return countValuesOp{
    78  		params: params,
    79  		opType: opType,
    80  	}
    81  }
    82  
// countValuesNode is the transform node created by countValuesOp.Node. It
// keeps the op it was created from (op) and the controller used to obtain the
// builder for the output block (controller).
type countValuesNode struct {
	op         countValuesOp
	controller *transform.Controller
}
    87  
    88  func (n *countValuesNode) Params() parser.Params {
    89  	return n.op
    90  }
    91  
// bucketColumn holds, for a single step, the number of times each distinct
// value was seen within one grouping. The row for a given value is looked up
// through bucketBlock.indexMapping, and a column grows as new unique values
// are seen.
type bucketColumn []float64
    95  
// bucketBlock is an abstraction for a set of series grouped by tags; count_values
// works on these groupings rather than the entire set of series.
type bucketBlock struct {
	// columnLength is the length of the most recently processed column. It can
	// grow as further columns are processed, and is used to size each new column
	// before that column's counts are filled in.
	columnLength int
	// columns holds one bucketColumn per step; each records the number of times
	// every distinct value has been seen at that step.
	columns []bucketColumn
	// indexMapping maps each unique value seen so far to its row index within
	// the columns.
	indexMapping map[float64]int
}
   107  
   108  // Processes all series in this block bucket at the current column.
   109  func processBlockBucketAtColumn(
   110  	currentBucketBlock *bucketBlock,
   111  	values []float64,
   112  	bucket []int,
   113  	columnIndex int,
   114  ) {
   115  	// Generate appropriate number of rows full of -1s that will later map to NaNs
   116  	// unless updated with valid values
   117  	currentColumnLength := currentBucketBlock.columnLength
   118  	currentBucketBlock.columns[columnIndex] = make(bucketColumn, currentColumnLength)
   119  	for i := 0; i < currentColumnLength; i++ {
   120  		util.Memset(currentBucketBlock.columns[columnIndex], math.NaN())
   121  	}
   122  
   123  	countedValues := countValuesFn(values, bucket)
   124  	for distinctValue, count := range countedValues {
   125  		currentBucketColumn := currentBucketBlock.columns[columnIndex]
   126  		if rowIndex, seen := currentBucketBlock.indexMapping[distinctValue]; seen {
   127  			// This value has already been seen at rowIndex in a previous column
   128  			// so add the current value to the appropriate row index.
   129  			currentBucketColumn[rowIndex] = count
   130  		} else {
   131  			// The column index needs to be created here already
   132  			// Add the count to the end of the bucket column
   133  			currentBucketBlock.columns[columnIndex] = append(currentBucketColumn, count)
   134  
   135  			// Add the distinctValue to the indexMapping
   136  			currentBucketBlock.indexMapping[distinctValue] = len(currentBucketColumn)
   137  		}
   138  	}
   139  
   140  	currentBucketBlock.columnLength = len(currentBucketBlock.columns[columnIndex])
   141  }
   142  
   143  // Process the block
   144  func (n *countValuesNode) Process(
   145  	queryCtx *models.QueryContext,
   146  	ID parser.NodeID,
   147  	b block.Block,
   148  ) error {
   149  	return transform.ProcessSimpleBlock(n, n.controller, queryCtx, ID, b)
   150  }
   151  
   152  func (n *countValuesNode) ProcessBlock(
   153  	queryCtx *models.QueryContext,
   154  	ID parser.NodeID,
   155  	b block.Block,
   156  ) (block.Block, error) {
   157  	meta := b.Meta()
   158  	stepIter, err := b.StepIter()
   159  	if err != nil {
   160  		return nil, err
   161  	}
   162  
   163  	params := n.op.params
   164  	labelName := params.StringParameter
   165  	if !models.IsValid(labelName) {
   166  		return nil, fmt.Errorf("invalid label name %q", labelName)
   167  	}
   168  
   169  	seriesMetas := utils.FlattenMetadata(meta, stepIter.SeriesMeta())
   170  	buckets, metas := utils.GroupSeries(
   171  		params.MatchingTags,
   172  		params.Without,
   173  		[]byte(n.op.opType),
   174  		seriesMetas,
   175  	)
   176  
   177  	stepCount := stepIter.StepCount()
   178  	intermediateBlock := make([]bucketBlock, len(buckets))
   179  	for i := range intermediateBlock {
   180  		intermediateBlock[i].columns = make([]bucketColumn, stepCount)
   181  		intermediateBlock[i].indexMapping = make(map[float64]int, len(buckets[i]))
   182  	}
   183  
   184  	for columnIndex := 0; stepIter.Next(); columnIndex++ {
   185  		step := stepIter.Current()
   186  		values := step.Values()
   187  		for bucketIndex, bucket := range buckets {
   188  			processBlockBucketAtColumn(
   189  				&intermediateBlock[bucketIndex],
   190  				values,
   191  				bucket,
   192  				columnIndex,
   193  			)
   194  		}
   195  	}
   196  
   197  	if err = stepIter.Err(); err != nil {
   198  		return nil, err
   199  	}
   200  
   201  	numSeries := 0
   202  	for _, bucketBlock := range intermediateBlock {
   203  		numSeries += bucketBlock.columnLength
   204  	}
   205  
   206  	// Rebuild block metas in the expected order
   207  	blockMetas := make([]block.SeriesMeta, numSeries)
   208  	previousBucketBlockIndex := 0
   209  	for bucketIndex, bucketBlock := range intermediateBlock {
   210  		for k, v := range bucketBlock.indexMapping {
   211  			// Add the metas of this bucketBlock right after the previous block
   212  			blockMetas[v+previousBucketBlockIndex] = block.SeriesMeta{
   213  				Name: []byte(n.op.opType),
   214  				Tags: metas[bucketIndex].Tags.Clone().AddTag(models.Tag{
   215  					Name:  []byte(labelName),
   216  					Value: utils.FormatFloatToBytes(k),
   217  				}),
   218  			}
   219  		}
   220  
   221  		// NB: All metadatas for the intermediate block for this bucket have
   222  		// been added to the combined block metas. The metadatas for the next
   223  		// intermediate block should be added after these to maintain order
   224  		previousBucketBlockIndex += bucketBlock.columnLength
   225  	}
   226  
   227  	// Dedupe common metadatas
   228  	metaTags, flattenedMeta := utils.DedupeMetadata(blockMetas, meta.Tags.Opts)
   229  	meta.Tags = metaTags
   230  
   231  	builder, err := n.controller.BlockBuilder(queryCtx, meta, flattenedMeta)
   232  	if err != nil {
   233  		return nil, err
   234  	}
   235  
   236  	if err := builder.AddCols(stepCount); err != nil {
   237  		return nil, err
   238  	}
   239  
   240  	for columnIndex := 0; columnIndex < stepCount; columnIndex++ {
   241  		for _, bucketBlock := range intermediateBlock {
   242  			valsToAdd := padValuesWithNaNs(
   243  				bucketBlock.columns[columnIndex],
   244  				len(bucketBlock.indexMapping),
   245  			)
   246  			if err := builder.AppendValues(columnIndex, valsToAdd); err != nil {
   247  				return nil, err
   248  			}
   249  		}
   250  	}
   251  
   252  	return builder.Build(), nil
   253  }
   254  
   255  // pads vals with enough NaNs to match size
   256  func padValuesWithNaNs(vals bucketColumn, size int) bucketColumn {
   257  	numToPad := size - len(vals)
   258  	for i := 0; i < numToPad; i++ {
   259  		vals = append(vals, math.NaN())
   260  	}
   261  
   262  	return vals
   263  }
   264  
   265  // count values takes a value array and a bucket list, returns a map of
   266  // distinct values to number of times the value was seen in this bucket.
   267  // The distinct number returned here becomes the datapoint's value
   268  func countValuesFn(values []float64, bucket []int) map[float64]float64 {
   269  	countedValues := make(map[float64]float64, len(bucket))
   270  	for _, idx := range bucket {
   271  		val := values[idx]
   272  		if !math.IsNaN(val) {
   273  			countedValues[val]++
   274  		}
   275  	}
   276  
   277  	return countedValues
   278  }