github.com/rohankumardubey/aresdb@v0.0.2-0.20190517170215-e54e3ca06b9c/query/aql_context.go

// Copyright (c) 2017-2018 Uber Technologies, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package query

// #include "time_series_aggregate.h"
import "C"

import (
	"bytes"
	"github.com/uber/aresdb/memstore"
	memCom "github.com/uber/aresdb/memstore/common"
	queryCom "github.com/uber/aresdb/query/common"
	"github.com/uber/aresdb/query/expr"
	"strings"
	"time"
	"unsafe"
)

type boundaryType int

const (
	noBoundary boundaryType = iota
	inclusiveBoundary
	exclusiveBoundary
)

// columnUsage is a bitmap that tracks how a column is used and whether the
// column should be pushed to device memory for different types of batches.
type columnUsage int

const (
	columnUsedByAllBatches columnUsage = 1 << iota
	columnUsedByLiveBatches
	columnUsedByFirstArchiveBatch
	columnUsedByLastArchiveBatch
	columnUsedByPrefilter
	columnUsedHighSentinel
)

var columnUsageNames = map[columnUsage]string{
	columnUsedByAllBatches:        "allBatches",
	columnUsedByLiveBatches:       "liveBatches",
	columnUsedByFirstArchiveBatch: "firstArchiveBatch",
	columnUsedByLastArchiveBatch:  "lastArchiveBatch",
	columnUsedByPrefilter:         "prefilter",
}

func (u columnUsage) MarshalJSON() ([]byte, error) {
	var usageStrings []string
	for mask := columnUsedByAllBatches; mask < columnUsedHighSentinel; mask <<= 1 {
		usage := u & mask
		if usage != 0 {
			usageStrings = append(usageStrings, columnUsageNames[usage])
		}
	}
	buffer := bytes.NewBufferString(`"`)
	buffer.WriteString(strings.Join(usageStrings, "+"))
	buffer.WriteString(`"`)
	return buffer.Bytes(), nil
}
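// exampleColumnUsageJSON is an illustrative sketch and not part of the original
// file (the helper name is hypothetical): it shows how a combined usage bitmap
// marshals into a "+"-joined string of usage names.
func exampleColumnUsageJSON() (string, error) {
	// Combine two usage bits; MarshalJSON walks the bits from lowest to
	// highest and joins the names of the bits that are set.
	usage := columnUsedByLiveBatches | columnUsedByPrefilter
	b, err := usage.MarshalJSON()
	// b is `"liveBatches+prefilter"` (including the surrounding quotes).
	return string(b), err
}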
// TableScanner defines how data for a table should be fed to device memory for
// processing (a scanner in traditional terminology).
type TableScanner struct {
	// Snapshot of the table schema for convenience.
	Schema *memstore.TableSchema `json:"-"`
	// IDs of all table shards to be scanned on this instance.
	Shards []int `json:"shards"`
	// IDs of columns to be used in this query, in the following order:
	// 1. Columns not from ArchivingSortColumns.
	// 2. Columns from ArchivingSortColumns in reverse order.
	Columns []int `json:"columns"`
	// Reverse mapping from column ID to column scan order index.
	ColumnsByIDs map[int]int `json:"-"`

	// Map from column ID to its usage by the query.
	ColumnUsages map[int]columnUsage `json:"columnUsage"`

	// Fact table specifics:

	// Values of equality prefilters in order. Each uint32 (4 bytes) stores a
	// value of any data type other than UUID (not supported).
	EqualityPrefilterValues []uint32 `json:"equalityPrefilterValues,omitempty"`
	// Boundary types and values of the final range prefilter.
	RangePrefilterBoundaries [2]boundaryType `json:"rangePrefilterBoundaries"`
	RangePrefilterValues     [2]uint32       `json:"rangePrefilterValues"`
	// Range of archive batches to process: [start, end).
	// Depending on the archiving progress of each shard, live batches may be
	// skipped for processing if the archiving cutoff is after the time of
	// ArchiveBatchIDEnd.
	ArchiveBatchIDStart int `json:"archiveBatchIDStart"`
	ArchiveBatchIDEnd   int `json:"archiveBatchIDEnd"`
}

// foreignTable stores data for a foreign table.
type foreignTable struct {
	// batches[batchIndex][columnIndex]
	// batchIndex = batchID - BaseBatchID
	// columnIndex corresponds to the column index in TableScanner.Columns order.
	batches               [][]deviceVectorPartySlice
	numRecordsInLastBatch int
	// The remote join column in the main table.
	remoteJoinColumn *expr.VarRef
	// Primary key data at host.
	hostPrimaryKeyData  memstore.PrimaryKeyData
	devicePrimaryKeyPtr devicePointer
}

// deviceVectorPartySlice stores pointers to data for a column in device memory.
type deviceVectorPartySlice struct {
	values devicePointer
	nulls  devicePointer
	// The length of the count vector is Length+1, similar to memstore.VectorParty.
	counts devicePointer
	// Used only by device columns. We allocate device memory once for the counts,
	// nulls and values vectors, and on free we free only the base pointer. The
	// memory layout is counts, nulls, values; for the counts vector we do not copy
	// the 64 bytes of padding.
	basePtr   devicePointer
	length    int
	valueType memCom.DataType
	// Pointer to the default value from the schema.
	defaultValue    memCom.DataValue
	valueStartIndex int
	nullStartIndex  int
	countStartIndex int
}
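// An illustrative reading of the layout above, not part of the original file:
// the three vectors of a device column share one allocation rooted at basePtr,
//
//	basePtr -> [ counts (64-byte padding not copied) | nulls | values ]
//
// and valueStartIndex, nullStartIndex and countStartIndex locate where this
// column slice starts inside each vector, so freeing basePtr alone releases
// the whole slice.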
// oopkBatchContext stores context for the current batch being processed by
// one-operator-per-kernel execution. For simplicity OOPK only supports data
// widths up to 32 bits.
type oopkBatchContext struct {
	// For convenience.
	device int

	// Input data according to TableScanner.Columns order.
	columns []deviceVectorPartySlice

	// Pointer to Columns[firstColumn]'s count vector.
	baseCountD devicePointer
	// startRow is used when firstColumn has no count vector.
	startRow int
	// Index for permuting elements in raw column values. Filters are applied to
	// the index vector instead of to the value vectors.
	indexVectorD devicePointer
	// Space for storing filter values. A true value means we keep the row.
	// We reuse this space for all filter processing.
	predicateVectorD devicePointer
	// Geo predicate vector.
	geoPredicateVectorD devicePointer
	// foreignTableRecordIDsD holds the recordID vectors for each foreign table.
	// To address the recordID vector for the foreign table with tableID x, use
	// foreignTableRecordIDsD[x-1].
	foreignTableRecordIDsD []devicePointer

	// timezoneLookupD points to an array of timezone offsets in seconds indexed
	// by the timezone string enum.
	timezoneLookupD     devicePointer
	timezoneLookupDSize int

	// Remaining number of inputs in indexVectorD after filtering.
	// Note that this size is not necessarily the number of database rows
	// when columns[0] is compressed.
	size int

	// Scratch vectors for evaluating the current AST expr in device memory.
	// [0] stores the values and [1] stores the validities (NULLs).
	// The data width of each value is always 4 bytes.
	// The data width of each validity (NULL) is always 1 byte.
	// Values and validities of each stack frame are allocated together,
	// with the validity array following the value array.
	// The length of each vector is size (same as indexVectorD).
	exprStackD [][2]devicePointer

	// Input and output storage in device memory before and after sort-reduce-by-key.
	// The capacity of the dimension and measure vectors should be at least
	// resultSize+size.
	// The first resultSize records store results from processing prior batches,
	// followed by size records from the current batch.
	// Sort and reduce by key operate on all resultSize+size records.
	//
	// Because reduce_by_key outputs to separate buffers, we need to alternate two
	// sets of dimension and measure buffers for input and output.
	// We store the input buffer in [0] and the output buffer in [1] for the
	// following dimensionVectorD and measureVectorD.

	// One giant dimension vector that contains each dimension columnar vector,
	// ordered as follows:
	// 4-byte dimensions -> 2-byte dimensions -> 1-byte dimensions (including the validity vectors).
	dimensionVectorD [2]devicePointer
	// The hash vector stores the 64-bit hash value of each dimension row
	// (combining all dimension values into one byte array); it is generated in
	// sort, and used in sort and reduce.
	hashVectorD [2]devicePointer
	// dimIndexVectorD is different from the index vector: it is the index of the
	// dimension vector. Its length is the resultSize from previous batches plus
	// the size of the current batch.
	dimIndexVectorD [2]devicePointer

	// Each element stores a 4-byte measure value, except for SUM which uses 8 bytes.
	measureVectorD [2]devicePointer

	// Size of the results from prior batches.
	resultSize int

	// Capacity of the result dimension and measure vectors; should be at least
	// resultSize+size.
	resultCapacity int

	// Query execution stats for the current batch.
	stats oopkBatchStats
}
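// swapResultBuffers is an illustrative sketch and not part of the original file
// (the helper name and the exact rotation policy are assumptions): because [0]
// holds the input buffer and [1] the output buffer of reduce_by_key, the two
// sets can simply be swapped after a batch so that the reduced results become
// the input of the next batch.
func swapResultBuffers(bc *oopkBatchContext) {
	bc.dimensionVectorD[0], bc.dimensionVectorD[1] = bc.dimensionVectorD[1], bc.dimensionVectorD[0]
	bc.measureVectorD[0], bc.measureVectorD[1] = bc.measureVectorD[1], bc.measureVectorD[0]
}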
// OOPKContext defines additional query context for one-operator-per-kernel
// execution.
type OOPKContext struct {
	// Compiled and annotated filters.
	// The filters are converted to a CNF equivalent so that AND no longer exists
	// in any underlying expr.Expr.

	// Filters that apply to all archive and live batches.
	// MainTableCommonFilters are common filters that involve only main table columns.
	MainTableCommonFilters []expr.Expr `json:"mainTableCommonFilters,omitempty"`
	// ForeignTableCommonFilters are common filters that involve foreign table columns.
	ForeignTableCommonFilters []expr.Expr `json:"foreignTableCommonFilters,omitempty"`
	// Lower bound [0] and upper bound [1] time filter. nil if not applicable.
	// [0] should be applied to the first archive batch and all live batches.
	// [1] should be applied to the last archive batch and all live batches.
	TimeFilters [2]expr.Expr `json:"timeFilters"`
	// Prefilters that only apply to live batches.
	// Archiving cutoff filtering is processed directly by the query engine and not
	// included here (different shards may have different cutoffs).
	Prefilters []expr.Expr `json:"prefilters,omitempty"`

	// Compiled and annotated ASTs for dimensions and measure.
	Dimensions []expr.Expr `json:"dimensions"`
	// Index of each dimension vector in the global dimension vector,
	// following the order sorted by dimension byte width.
	DimensionVectorIndex []int `json:"dimensionVectorIndex"`
	// Number of dimensions per dimension width.
	NumDimsPerDimWidth queryCom.DimCountsPerDimWidth `json:"numDims"`
	// DimRowBytes is the total number of bytes of all dimension values
	// plus validity bytes, for memory allocation convenience.
	DimRowBytes int `json:"dimRowBytes"`

	// For one-operator-per-kernel we only support one measure per query.
	Measure       expr.Expr                 `json:"measure"`
	MeasureBytes  int                       `json:"measureBytes"`
	AggregateType C.enum_AggregateFunction  `json:"aggregate"`

	// Storage for the current batch.
	currentBatch oopkBatchContext

	// foreignTables holds the batches for each foreign table.
	// To address the foreign table with tableID x, use foreignTables[x-1].
	// A nil foreignTable means it is not an actual foreign table join.
	foreignTables []*foreignTable

	// nil means no geo intersection.
	geoIntersection *geoIntersection

	// Result storage in host memory. The format is the same as the dimension and
	// measure vectors in oopkBatchContext.
	dimensionVectorH unsafe.Pointer
	measureVectorH   unsafe.Pointer
	// hllVectorD stores the HLL dense or sparse vector in device memory.
	hllVectorD devicePointer
	// Size of the HLL vector.
	hllVectorSize int64
	// hllDimRegIDCountD stores the regID count for each dim in device memory.
	hllDimRegIDCountD devicePointer
	ResultSize        int `json:"resultSize"`

	// For reporting purposes only.
	DeviceMemoryRequirement int           `json:"deviceMem"`
	DurationWaitedForDevice time.Duration `json:"durationWaitedForDevice"`

	// Stores the overall query stats for live batches and archive batches.
	LiveBatchStats    oopkQueryStats `json:"liveStats"`
	ArchiveBatchStats oopkQueryStats `json:"archiveStats"`

	// Indicates that the query can return early without processing all batches;
	// this is usually for a non-aggregation query with a limit condition.
	done bool
}

// timezoneTableContext stores context for timezone column queries.
type timezoneTableContext struct {
	tableAlias  string
	tableColumn string
}

// geoIntersection stores geo intersection related fields.
type geoIntersection struct {
	// The following fields are generated by the compiler.
	// Geo table ID (scanner ID).
	shapeTableID int
	// ID of the shape column.
	shapeColumnID int
	// Table ID of the geo point.
	pointTableID int
	// ID of the point column in the main table.
	pointColumnID int
	// List of shape UUIDs.
	shapeUUIDs []string
	// Whether to check that the point is inside the shape or outside it.
	inOrOut bool
	// dimIndex is the geo dimension index.
	// If it is < 0, there is no dimension for geo and this query only has a geo filter.
	dimIndex int

	// The following fields are generated by the processor.
	shapeLatLongs devicePointer
	shapeIndexs   devicePointer
	// Map from shape index to index of shapeUUID.
	validShapeUUIDs []string
	numShapes       int
	totalNumPoints  int
}
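// An illustrative example, not part of the original file: assuming one validity
// byte per dimension, a query with three dimensions of 4, 2 and 1 bytes lays
// out each row of the global dimension vector in the width-sorted order
// described for OOPKContext above (4-byte values, then 2-byte, then 1-byte,
// then the validity bytes), so that
//
//	DimRowBytes = 4 + 2 + 1 + 3 = 10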
// AQLQueryContext stores all contextual data for handling an AQL query.
type AQLQueryContext struct {
	// The query input.
	Query *AQLQuery `json:"query"`

	// Context for one-operator-per-kernel execution.
	OOPK OOPKContext `json:"oopk"`

	//// Compiled time series aggregate query structure.
	//// TODO: TSAggregate is only used for the VM based query engine.
	//TSAggregate C.TimeSeriesAggregate `json:"-"`

	// Scanners for all tables. [0] is for the main table; [1:] are for tables in joins.
	TableScanners []*TableScanner `json:"scanners"`
	// Map from table alias to ID (index into TableScanners).
	TableIDByAlias map[string]int `json:"tableIDs"`
	// Map from table name to schema for convenience. In case of a self join,
	// only one entry is referenced here by the name of the table.
	TableSchemaByName map[string]*memstore.TableSchema `json:"-"`
	// Indexes of filters in Query.Filters that are identified as prefilters.
	Prefilters []int `json:"prefilters,omitempty"`

	Error error `json:"error,omitempty"`

	Device int `json:"device"`

	Debug bool `json:"debug,omitempty"`

	Profiling string `json:"profiling,omitempty"`

	// We alternate between two CUDA streams across batches for pipelining.
	// [0] stores the current stream, and [1] stores the other stream.
	cudaStreams [2]unsafe.Pointer

	Results queryCom.AQLQueryResult `json:"-"`

	// Whether to serialize the query result as HLLData. If ReturnHLLData is true,
	// we will not release the dimension vector and measure vector until
	// serialization is done.
	ReturnHLLData  bool   `json:"ReturnHLLData"`
	HLLQueryResult []byte `json:"-"`

	// For the time filter.
	fixedTimezone *time.Location
	fromTime      *alignedTime
	toTime        *alignedTime
	dstswitch     int64

	// Timezone column and time filter related.
	timezoneTable timezoneTableContext

	// Flag to indicate whether this query is a non-aggregation query.
	isNonAggregationQuery bool
}

// IsHLL returns whether the aggregation function is HLL.
func (ctx *OOPKContext) IsHLL() bool {
	return ctx.AggregateType == C.AGGR_HLL
}
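// scannerForAlias is an illustrative sketch and not part of the original file
// (the helper name is hypothetical): it shows how TableIDByAlias indexes into
// TableScanners, where [0] is the main table and [1:] are the joined tables.
func scannerForAlias(ctx *AQLQueryContext, alias string) *TableScanner {
	id, ok := ctx.TableIDByAlias[alias]
	if !ok {
		return nil
	}
	return ctx.TableScanners[id]
}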