github.com/rohankumardubey/aresdb@v0.0.2-0.20190517170215-e54e3ca06b9c/query/aql_context.go

     1  //  Copyright (c) 2017-2018 Uber Technologies, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package query
    16  
    17  // #include "time_series_aggregate.h"
    18  import "C"
    19  
    20  import (
    21  	"bytes"
    22  	"github.com/uber/aresdb/memstore"
    23  	memCom "github.com/uber/aresdb/memstore/common"
    24  	queryCom "github.com/uber/aresdb/query/common"
    25  	"github.com/uber/aresdb/query/expr"
    26  	"strings"
    27  	"time"
    28  	"unsafe"
    29  )
    30  
    31  type boundaryType int
    32  
    33  const (
    34  	noBoundary boundaryType = iota
    35  	inclusiveBoundary
    36  	exclusiveBoundary
    37  )
    38  
    39  // columnUsage is a bitmap that tracks how a column is used and whether the
    40  // column should be pushed to device memory for different types of batches.
    41  type columnUsage int
    42  
    43  const (
    44  	columnUsedByAllBatches columnUsage = 1 << iota
    45  	columnUsedByLiveBatches
    46  	columnUsedByFirstArchiveBatch
    47  	columnUsedByLastArchiveBatch
    48  	columnUsedByPrefilter
    49  	columnUsedHighSentinel
    50  )
    51  
    52  var columnUsageNames = map[columnUsage]string{
    53  	columnUsedByAllBatches:        "allBatches",
    54  	columnUsedByLiveBatches:       "liveBatches",
    55  	columnUsedByFirstArchiveBatch: "firstArchiveBatch",
    56  	columnUsedByLastArchiveBatch:  "lastArchiveBatch",
    57  	columnUsedByPrefilter:         "prefilter",
    58  }
    59  
    60  func (u columnUsage) MarshalJSON() ([]byte, error) {
    61  	var usageStrings []string
    62  	for mask := columnUsedByAllBatches; mask < columnUsedHighSentinel; mask <<= 1 {
    63  		usage := u & mask
    64  		if usage != 0 {
    65  			usageStrings = append(usageStrings, columnUsageNames[usage])
    66  		}
    67  	}
    68  	buffer := bytes.NewBufferString(`"`)
    69  	buffer.WriteString(strings.Join(usageStrings, "+"))
    70  	buffer.WriteString(`"`)
    71  	return buffer.Bytes(), nil
    72  }
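
// Illustrative sketch (not part of the original file): usage flags combine with
// bitwise OR, and MarshalJSON joins the matching names with "+". A column used by
// live batches and by the prefilter marshals to the JSON string "liveBatches+prefilter".
func exampleColumnUsageJSON() string {
	usage := columnUsedByLiveBatches | columnUsedByPrefilter
	b, _ := usage.MarshalJSON()
	return string(b) // `"liveBatches+prefilter"`
}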
    73  
    74  // TableScanner defines how data for a table should be fed to device memory for
    75  // processing (a scanner in traditional terminology).
    76  type TableScanner struct {
    77  	// Snapshot of the table schema for convenience.
    78  	Schema *memstore.TableSchema `json:"-"`
    79  	// IDs of all table shards to be scanned on this instance.
    80  	Shards []int `json:"shards"`
    81  	// IDs of columns to be used in this query, in the following order:
    82  	//   1. Columns not from ArchivingSortColumns.
    83  	//   2. Columns from ArchivingSortColumns in reverse order.
    84  	Columns []int `json:"columns"`
    85  	// Reverse mapping from column ID to column scan order index.
    86  	ColumnsByIDs map[int]int `json:"-"`
    87  
    88  	// Map from column ID to its usage by the query.
    89  	ColumnUsages map[int]columnUsage `json:"columnUsage"`
    90  
    91  	// Fact table specifics:
    92  
    93  	// Values of equality prefilters in order. Each uint32 stores a value of any
    94  	// data type other than UUID (which is not supported) in its 4 bytes.
    95  	EqualityPrefilterValues []uint32 `json:"equalityPrefilterValues,omitempty"`
    96  	// Boundary types and values of the final range prefilter.
    97  	RangePrefilterBoundaries [2]boundaryType `json:"rangePrefilterBoundaries"`
    98  	RangePrefilterValues     [2]uint32       `json:"rangePrefilterValues"`
    99  	// Range of archive batches to process: [start, end).
   100  	// Depending on the archiving progress of each shard, live batches may be
   101  	// skipped for processing if the archiving cutoff is after the time of
   102  	// ArchiveBatchIDEnd.
   103  	ArchiveBatchIDStart int `json:"archiveBatchIDStart"`
   104  	ArchiveBatchIDEnd   int `json:"archiveBatchIDEnd"`
   105  }
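
// Illustrative sketch (not part of the original file): ColumnsByIDs is the reverse
// mapping of Columns, from column ID back to its scan-order index.
func buildColumnsByIDs(scanner *TableScanner) {
	scanner.ColumnsByIDs = make(map[int]int, len(scanner.Columns))
	for scanOrderIndex, columnID := range scanner.Columns {
		scanner.ColumnsByIDs[columnID] = scanOrderIndex
	}
}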
   106  
   107  // foreignTable stores the data of a foreign table used in a join.
   108  type foreignTable struct {
   109  	// batches[batchIndex][columnIndex]
   110  	// batchIndex = batchID - BaseBatchID
   111  	// columnIndex corresponds to columnIndex in TableScanner columns order
   112  	batches               [][]deviceVectorPartySlice
   113  	numRecordsInLastBatch int
   114  	// stores the remote join column in the main table
   115  	remoteJoinColumn *expr.VarRef
   116  	// primary key data at host.
   117  	hostPrimaryKeyData  memstore.PrimaryKeyData
   118  	devicePrimaryKeyPtr devicePointer
   119  }
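
// Illustrative sketch (not part of the original file): addressing a column slice in a
// foreign table batch. Per the batches comment above, the batch index is batchID minus
// the base batch ID (passed as a parameter here purely for illustration), and
// columnIndex follows the TableScanner column order.
func (ft *foreignTable) columnSlice(batchID, baseBatchID, columnIndex int) deviceVectorPartySlice {
	return ft.batches[batchID-baseBatchID][columnIndex]
}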
   120  
   121  // deviceVectorPartySlice stores pointers to data for a column in device memory.
   122  type deviceVectorPartySlice struct {
   123  	values devicePointer
   124  	nulls  devicePointer
   125  	// The length of the count vector is length+1, similar to memstore.VectorParty.
   126  	counts devicePointer
   127  	// Used only by device columns. We allocate device memory once for the counts, nulls
   128  	// and values vectors, and on free we only free the base pointer. The memory layout
   129  	// is counts, nulls, values; for the counts vector we do not copy the 64-byte padding.
   130  	basePtr   devicePointer
   131  	length    int
   132  	valueType memCom.DataType
   133  	// pointer to default value from schema
   134  	defaultValue    memCom.DataValue
   135  	valueStartIndex int
   136  	nullStartIndex  int
   137  	countStartIndex int
   138  }
   139  
   140  // oopkBatchContext stores context for the current batch being processed by
   141  // one-operator-per-kernel execution. For simplicity, OOPK only supports data
   142  // widths up to 32 bits.
   143  type oopkBatchContext struct {
   144  	// For convenience purpose.
   145  	device int
   146  
   147  	// Input data according to TableScanner.Columns order.
   148  	columns []deviceVectorPartySlice
   149  
   150  	// pointer to Columns[firstColumn]'s count vector
   151  	baseCountD devicePointer
   152  	// startRow when firstColumn has no count vector
   153  	startRow int
   154  	// Index vector for permuting elements of raw column values. Filters are applied to
   155  	// the index vector instead of the value vectors.
   156  	indexVectorD devicePointer
   157  	// Space for storing filter values. True value means we will keep the row.
   158  	// We will reuse this space for all filter processing.
   159  	predicateVectorD devicePointer
   160  	// geo predicate vector
   161  	geoPredicateVectorD devicePointer
   162  	// foreignTableRecordIDsD holds record IDs for each foreign table;
   163  	// to address the recordIDVector for the foreign table with tableID x, use foreignTableRecordIDsD[x-1].
   164  	foreignTableRecordIDsD []devicePointer
   165  
   166  	// timezoneLookupD points to an array of timezone offsets in seconds indexed by timezone string enum
   167  	timezoneLookupD     devicePointer
   168  	timezoneLookupDSize int
   169  
   170  	// Remaining number of inputs in indexVectorD after filtering.
   171  	// Notice that this size is not necessarily the number of database rows
   172  	// when columns[0] is compressed.
   173  	size int
   174  
   175  	// Scratch vectors for evaluating the current AST expr in device memory.
   176  	// [0] stores the values and [1] stores the validities (NULLs).
   177  	// The data width of each value is always 4 bytes.
   178  	// The data width of each validity (NULL) is always 1 byte.
   179  	// Values and validities of each stack frame are allocated together,
   180  	// with the validity array following the value array.
   181  	// The length of each vector is size (same as indexVectorD).
   182  	exprStackD [][2]devicePointer
   183  
   184  	// Input and output storage in device memory before and after sort-reduce-by-key.
   185  	// The capacity of the dimension and measure vector should be at least
   186  	// resultSize+size.
   187  	// The first resultSize records store results from processing prior batches,
   188  	// followed by size records from the current batch.
   189  	// Sort and reduce by key will operate on all resultSize+size records.
   190  	//
   191  	// Because reduce_by_key outputs to separate buffers, we need to alternate two
   192  	// sets of dimension and measure buffers for input and output.
   193  	// We store the input buffer in [0], and the output buffer in [1] for the
   194  	// following dimensionVectorH and measureVectorH.
   195  
   196  	// One giant dimension vector
   197  	// that contains each dimension's columnar vector,
   198  	// ordered as follows:
   199  	// 4-byte dimensions -> 2-byte dimensions -> 1-byte dimensions (including the validity vectors).
   200  	dimensionVectorD [2]devicePointer
   201  	// The hash vector stores the 64-bit hash value computed from each
   202  	// dimension row (all dimension values combined into one byte array);
   203  	// it is generated during sort and used in both sort and reduce.
   204  	hashVectorD [2]devicePointer
   205  	// dimIndexVectorD is different from the index vector:
   206  	// it indexes into the dimension vector.
   207  	// Its length is the resultSize from previous batches + the size of the current batch.
   208  	dimIndexVectorD [2]devicePointer
   209  
   210  	// Each element stores a 4-byte measure value,
   211  	// except for SUM, which uses 8 bytes.
   212  	measureVectorD [2]devicePointer
   213  
   214  	// Size of the results from prior batches.
   215  	resultSize int
   216  
   217  	// Capacity of the result dimension and measure vector, should be at least
   218  	// resultSize+size.
   219  	resultCapacity int
   220  
   221  	// Query execution stats for current batch.
   222  	stats oopkBatchStats
   223  }
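
// Illustrative sketch (not part of the original file): per the buffer comments above,
// [0] holds the input and [1] the output of sort-reduce-by-key, so after a reduce pass
// the two sets of buffers could be swapped along these lines before the next batch
// appends its records after the first resultSize entries.
func (bc *oopkBatchContext) swapResultBuffers() {
	bc.dimensionVectorD[0], bc.dimensionVectorD[1] = bc.dimensionVectorD[1], bc.dimensionVectorD[0]
	bc.hashVectorD[0], bc.hashVectorD[1] = bc.hashVectorD[1], bc.hashVectorD[0]
	bc.dimIndexVectorD[0], bc.dimIndexVectorD[1] = bc.dimIndexVectorD[1], bc.dimIndexVectorD[0]
	bc.measureVectorD[0], bc.measureVectorD[1] = bc.measureVectorD[1], bc.measureVectorD[0]
}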
   224  
   225  // OOPKContext defines additional query context for one-operator-per-kernel
   226  // execution.
   227  type OOPKContext struct {
   228  	// Compiled and annotated filters.
   229  	// The filters are converted to an equivalent CNF so that AND no longer exists
   230  	// in any underlying expr.Expr.
   231  
   232  	// Filters that apply to all archive and live batches.
   233  	// MainTableCommonFilters are filters that involve only main table columns.
   234  	MainTableCommonFilters []expr.Expr `json:"mainTableCommonFilters,omitempty"`
   235  	// ForeignTableCommonFilters are filters that involve foreign table columns.
   236  	ForeignTableCommonFilters []expr.Expr `json:"foreignTableCommonFilters,omitempty"`
   237  	// Lower bound [0] and upper bound [1] time filter. nil if not applicable.
   238  	// [0] should be applied to the first archive batch and all live batches.
   239  	// [1] should be applied to the last archive batch and all live batches.
   240  	TimeFilters [2]expr.Expr `json:"timeFilters"`
   241  	// Prefilters that only apply to live batches.
   242  	// Archiving cutoff filtering is processed directly by the query engine and not
   243  	// included here (different shards may have different cutoffs).
   244  	Prefilters []expr.Expr `json:"prefilters,omitempty"`
   245  
   246  	// Compiled and annotated ASTs for dimensions and measure.
   247  	Dimensions []expr.Expr `json:"dimensions"`
   248  	// Index of each single dimension vector within the global dimension vector,
   249  	// following the sorted order based on byte width.
   250  	DimensionVectorIndex []int `json:"dimensionVectorIndex"`
   251  	// Number of dimensions per dim width
   252  	NumDimsPerDimWidth queryCom.DimCountsPerDimWidth `json:"numDims"`
   253  	// DimRowBytes is the total number of bytes of all dimension values
   254  	// plus validity bytes, kept for memory allocation convenience.
   255  	DimRowBytes int `json:"dimRowBytes"`
   256  
   257  	// For one-operator-per-kernel we only support one measure per query.
   258  	Measure       expr.Expr                `json:"measure"`
   259  	MeasureBytes  int                      `json:"measureBytes"`
   260  	AggregateType C.enum_AggregateFunction `json:"aggregate"`
   261  
   262  	// Storage for current batch.
   263  	currentBatch oopkBatchContext
   264  
   265  	// foreignTables holds the batches for each foreign table;
   266  	// to address the foreign table with tableID x, use foreignTables[x-1],
   267  	// as sketched after this struct. A nil foreignTable means not an actual foreign table join.
   268  	foreignTables []*foreignTable
   269  
   270  	// nil means no geo intersection
   271  	geoIntersection *geoIntersection
   272  
   273  	// Result storage in host memory. The format is the same as the dimension and
   274  	// measure vector in oopkBatchContext.
   275  	dimensionVectorH unsafe.Pointer
   276  	measureVectorH   unsafe.Pointer
   277  	// hllVectorD stores hll dense or sparse vector in device memory.
   278  	hllVectorD devicePointer
   279  	// size of hll vector
   280  	hllVectorSize int64
   281  	// hllDimRegIDCountD stores regID count for each dim in device memory.
   282  	hllDimRegIDCountD devicePointer
   283  	ResultSize        int `json:"resultSize"`
   284  
   285  	// For reporting purposes only.
   286  	DeviceMemoryRequirement int           `json:"deviceMem"`
   287  	DurationWaitedForDevice time.Duration `json:"durationWaitedForDevice"`
   288  
   289  	// Stores the overall query stats for live batches and archive batches.
   290  	LiveBatchStats    oopkQueryStats `json:"liveStats"`
   291  	ArchiveBatchStats oopkQueryStats `json:"archiveStats"`
   292  
   293  	// Indicates the query can return early without processing all batches;
   294  	// this is usually for a non-aggregation query with a limit condition.
   295  	done bool
   296  }
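
// Illustrative sketch (not part of the original file): per the foreignTables comment,
// the main table has tableID 0, so a joined table with tableID x lives at
// foreignTables[x-1]; a nil entry means the join is not an actual foreign table join.
func (ctx *OOPKContext) foreignTableByID(tableID int) *foreignTable {
	if tableID <= 0 || tableID > len(ctx.foreignTables) {
		return nil
	}
	return ctx.foreignTables[tableID-1]
}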
   297  
   298  // timezoneTableContext stores context for timezone column queries
   299  type timezoneTableContext struct {
   300  	tableAlias  string
   301  	tableColumn string
   302  }
   303  
   304  // geoIntersection is the struct storing geo intersection related fields.
   305  type geoIntersection struct {
   306  	// Following fields are generated by compiler.
   307  	// Geo tableID (scanner id)
   308  	shapeTableID int
   309  	// ID of the shape column.
   310  	shapeColumnID int
   311  	// Table ID of geo point.
   312  	pointTableID int
   313  	// ID of the point column in main table.
   314  	pointColumnID int
   315  	// List of shape UUIDs.
   316  	shapeUUIDs []string
   317  	// whether to check that the point is in the shape (in) or not (out)
   318  	inOrOut bool
   319  	// dimIndex is the geo dimension index;
   320  	// if < 0, there is no geo dimension
   321  	// and this query only has a geo filter.
   322  	dimIndex int
   323  
   324  	// Following fields are generated by processor
   325  	shapeLatLongs devicePointer
   326  	shapeIndexs   devicePointer
   327  	// maps from shape index to shape UUID
   328  	validShapeUUIDs []string
   329  	numShapes       int
   330  	totalNumPoints  int
   331  }
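
// Illustrative sketch (not part of the original file): per the dimIndex comment, a
// negative index means the geo shape only participates as a filter and contributes
// no dimension to the query result.
func (g *geoIntersection) hasGeoDimension() bool {
	return g.dimIndex >= 0
}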
   332  
   333  // AQLQueryContext stores all contextual data for handling an AQL query.
   334  type AQLQueryContext struct {
   335  	// The query input.
   336  	Query *AQLQuery `json:"query"`
   337  
   338  	// Context for one-operator-per-kernel execution.
   339  	OOPK OOPKContext `json:"oopk"`
   340  
   341  	//// Compiled time series aggregate query structure.
   342  	//// TODO: TSAggregate is only used for VM based query engine.
   343  	//TSAggregate C.TimeSeriesAggregate `json:"-"`
   344  
   345  	// Scanners for all tables. [0] for the main table; [1:] for tables in joins.
   346  	TableScanners []*TableScanner `json:"scanners"`
   347  	// Map from table alias to ID (index to TableScanners).
   348  	TableIDByAlias map[string]int `json:"tableIDs"`
   349  	// Map from table name to schema for convenience. In case of self join,
   350  	// only one entry is referenced here by the name of the table.
   351  	TableSchemaByName map[string]*memstore.TableSchema `json:"-"`
   352  	// Index to filters in Query.Filters that are identified as prefilters.
   353  	Prefilters []int `json:"prefilters,omitempty"`
   354  
   355  	Error error `json:"error,omitempty"`
   356  
   357  	Device int `json:"device"`
   358  
   359  	Debug bool `json:"debug,omitempty"`
   360  
   361  	Profiling string `json:"profiling,omitempty"`
   362  
   363  	// We alternate between two CUDA streams across batches for pipelining.
   364  	// [0] stores the current stream, and [1] stores the other stream.
   365  	cudaStreams [2]unsafe.Pointer
   366  
   367  	Results queryCom.AQLQueryResult `json:"-"`
   368  
   369  	// whether to serialize the query result as HLLData. If ReturnHLLData is true, we will not release dimension
   370  	// vector and measure vector until serialization is done.
   371  	ReturnHLLData  bool   `json:"ReturnHLLData"`
   372  	HLLQueryResult []byte `json:"-"`
   373  
   374  	// for time filter
   375  	fixedTimezone *time.Location
   376  	fromTime      *alignedTime
   377  	toTime        *alignedTime
   378  	dstswitch     int64
   379  
   380  	// timezone column and time filter related
   381  	timezoneTable timezoneTableContext
   382  
   383  	// Flag to indicate whether this query is a non-aggregation query
   384  	isNonAggregationQuery bool
   385  }
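
// Illustrative sketch (not part of the original file): per the cudaStreams comment,
// the two CUDA streams alternate between batches for pipelining; [0] is always the
// stream for the current batch, so rotating them is a simple swap.
func (qc *AQLQueryContext) swapCudaStreams() {
	qc.cudaStreams[0], qc.cudaStreams[1] = qc.cudaStreams[1], qc.cudaStreams[0]
}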
   386  
   387  // IsHLL returns whether the aggregation function is HLL.
   388  func (ctx *OOPKContext) IsHLL() bool {
   389  	return ctx.AggregateType == C.AGGR_HLL
   390  }