// Copyright (c) 2017-2018 Uber Technologies, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package common

import (
	"math"
	"strings"

	metaCom "github.com/uber/aresdb/metastore/common"
	"github.com/uber/aresdb/utils"
	"unsafe"
)

// ColumnUpdateMode represents how to update data from UpsertBatch
type ColumnUpdateMode int

// UpsertBatchVersion represents the version of upsert batch
type UpsertBatchVersion uint32

const (
	// UpdateOverwriteNotNull (default) will overwrite existing value if new value is NOT null, otherwise just skip
	UpdateOverwriteNotNull ColumnUpdateMode = iota
	// UpdateForceOverwrite will simply overwrite existing value even when new data is null
	UpdateForceOverwrite
	// UpdateWithAddition will add the existing value with new value if new value is not null, existing null value will be treated as 0 in calculation
	UpdateWithAddition
	// UpdateWithMin will save the minimum of existing and new value if new value is not null, existing null value will be treated as MAX_INT in calculation
	UpdateWithMin
	// UpdateWithMax will save the maximum of existing and new value if new value is not null, existing null value will be treated as MIN_INT in calculation
	UpdateWithMax
	// MaxColumnUpdateMode is the current upper limit for column update modes
	MaxColumnUpdateMode
)

const (
	// V1 is the magic number marking version 1 of the serialized upsert batch format.
	V1 UpsertBatchVersion = 0xFEED0001
)

// columnBuilder accumulates the values of one column while an upsert batch is
// being built, and knows how to size and serialize itself into the batch buffer.
type columnBuilder struct {
	// columnID identifies the schema column this builder writes to.
	columnID int
	// dataType drives value conversion and the serialized value width.
	dataType DataType
	// values holds one entry per row; a nil entry means null.
	values []interface{}
	// enumDict maps an enum case string to its integer enum ID.
	enumDict map[string]int
	// enumDictLengthInBytes is final length in bytes for enum dict vector
	// first byte represent validity
	// 1+len(enum)+len(delimiter)+len(enum)+...
	enumDictLengthInBytes int
	// numValidValues counts non-nil entries in values; it determines GetMode.
	numValidValues int
	// updateMode tells the consumer how to merge this column into existing data.
	updateMode ColumnUpdateMode
}

// SetValue write a value into the column at given row.
// Enum-typed string values are first translated to their enum IDs, then every
// non-nil value is converted to the column's data type. numValidValues is kept
// in sync with null <-> non-null transitions of the slot.
func (c *columnBuilder) SetValue(row int, value interface{}) error {
	oldValueNull := c.values[row] == nil

	if value == nil {
		c.values[row] = nil
	} else {
		if IsEnumType(c.dataType) {
			if strVal, ok := value.(string); ok {
				// Translate the enum case string into its integer ID,
				// appending a new case to the dictionary if unseen.
				value = c.GetOrAppendEnumCase(strVal)
			}
		}
		var err error
		c.values[row], err = ConvertValueForType(c.dataType, value)
		if err != nil {
			return err
		}
	}

	// Maintain the valid-value count on null state transitions.
	if oldValueNull && c.values[row] != nil {
		c.numValidValues++
	} else if !oldValueNull && c.values[row] == nil {
		c.numValidValues--
	}

	return nil
}

// GetOrAppendEnumCase adds an enum case to the column
// and returns the enumID; an already-known case returns its existing ID.
// Caller should make sure the column is an enum column.
func (c *columnBuilder) GetOrAppendEnumCase(str string) int {
	// A newly appended case receives the next sequential ID.
	newID := len(c.enumDict)
	if index, exist := c.enumDict[str]; exist {
		return index
	}
	if newID == 0 {
		// first byte represent the validity
		c.enumDictLengthInBytes += 1 + len(str)
	} else {
		c.enumDictLengthInBytes += len(metaCom.EnumDelimiter) + len(str)
	}
	c.enumDict[str] = newID
	return newID
}

// AddRow grow the value array by 1.
func (c *columnBuilder) AddRow() {
	c.values = append(c.values, nil)
}

// RemoveRow shrinks the value array by 1.
func (c *columnBuilder) RemoveRow() {
	lastValueIndex := len(c.values) - 1
	lastValueNull := c.values[lastValueIndex] == nil
	c.values = c.values[:lastValueIndex]
	if !lastValueNull {
		// A non-null value was dropped, so the valid-value count shrinks too.
		c.numValidValues--
	}
}

// ResetRows reset the row count to 0.
func (c *columnBuilder) ResetRows() {
	// Keep the backing array; only the logical length and valid count reset.
	c.values = c.values[0:0]
	c.numValidValues = 0
}

// getEnumDictVector serializes the enum dictionary: enum case strings ordered
// by enum ID, joined with metaCom.EnumDelimiter, with a single validity byte
// prepended to the first case. Its length equals enumDictLengthInBytes.
func (c *columnBuilder) getEnumDictVector() []byte {
	enumReverseMap := make([]string, len(c.enumDict))
	for enum, id := range c.enumDict {
		if id == 0 {
			// prepend the validity byte to the first enum case
			enumReverseMap[id] = string([]byte{1}) + enum
		} else {
			enumReverseMap[id] = enum
		}
	}
	return []byte(strings.Join(enumReverseMap, metaCom.EnumDelimiter))
}

// CalculateBufferSize returns the size of the column data in serialized format
// by advancing *offset past the null vector (non-go types with nulls only),
// the enum dict vector, the offset vector (go types only) and the value
// vector, including the alignment padding AppendToBuffer will emit.
func (c *columnBuilder) CalculateBufferSize(offset *int) {
	isGoType := IsGoType(c.dataType)

	switch c.GetMode() {
	case AllValuesDefault:
		// Nothing serialized for an all-default column.
	case HasNullVector:
		if !isGoType {
			// One validity bit per row, rounded up to whole bytes.
			*offset += (len(c.values) + 7) / 8
		}
		fallthrough
	case AllValuesPresent:
		// write enum buffer if exists
		enumDictLength := c.enumDictLengthInBytes
		*offset += enumDictLength
		// if golang memory, align to 4 bytes for offset vector
		if isGoType {
			*offset = utils.AlignOffset(*offset, 4)
			// 1. uint32 for each offset value, and length = numRows + 1
			// 2. last offset value is the end offset of the offset buffer
			*offset += (len(c.values) + 1) * 4
			// Padding size for value vector
			*offset = utils.AlignOffset(*offset, 8)
			// Variable-length values: each non-nil value contributes its own
			// serialized size; nil values contribute nothing.
			for _, v := range c.values {
				if v != nil {
					goVal := v.(GoDataValue)
					*offset += goVal.GetSerBytes()
				}
			}
		} else {
			// Padding size for value vector
			*offset = utils.AlignOffset(*offset, 8)
			// fixed value size
			*offset += (DataTypeBits(c.dataType)*len(c.values) + 7) / 8
		}
	}
}

// AppendToBuffer writes the column data to buffer and advances offset.
// The layout mirrors CalculateBufferSize: optional null vector, enum dict
// vector, optional offset vector (go types), then the value vector.
func (c *columnBuilder) AppendToBuffer(writer *utils.BufferWriter) error {
	writer.AlignBytes(1)
	isGoType := IsGoType(c.dataType)

	switch c.GetMode() {
	case AllValuesDefault:
		return nil
	case HasNullVector:
		// only non goType needs to write null vector
		if !isGoType {
			for row := 0; row < len(c.values); row++ {
				value := c.values[row]
				if err := writer.AppendBool(value != nil); err != nil {
					return utils.StackError(err, "Failed to write null vector at row %d", row)
				}
			}
		}
		fallthrough
	case AllValuesPresent:
		if c.enumDictLengthInBytes > 0 {
			if err := writer.Append(c.getEnumDictVector()); err != nil {
				return utils.StackError(err, "Failed to write enum dict vector")
			}
		}
		var offsetWriter, valueWriter *utils.BufferWriter
		// only goType needs to write offsetVector
		if isGoType {
			// Padding to 4 byte alignment for offset vector
			writer.AlignBytes(4)
			// Fork a copy of the writer positioned at the offset vector, then
			// skip the main writer past it; offsets and values are written in
			// the same pass through two cursors over the same buffer.
			writerForked := *writer
			offsetWriter = &writerForked
			// skip offset bytes
			writer.SkipBytes((len(c.values) + 1) * 4)
		}

		// Padding to 8 byte alignment for value vector
		writer.AlignBytes(8)
		valueWriter = writer
		// local byte offset of current value in value vector
		currentValueOffset := uint32(0)
		// write values starting from current value vector offset
		for row := 0; row < len(c.values); row++ {
			// write current offset if offsetWriter is defined
			if offsetWriter != nil {
				err := offsetWriter.AppendUint32(currentValueOffset)
				if err != nil {
					return utils.StackError(err, "Failed to write offset value at row %d", row)
				}
			}

			value := c.values[row]
			// Handle null value.
			if value == nil {
				// only skip bits when there is no offset vector
				if offsetWriter == nil {
					valueWriter.SkipBits(DataTypeBits(c.dataType))
				}
				continue
			}

			switch c.dataType {
			case Bool:
				if err := valueWriter.AppendBool(value.(bool)); err != nil {
					return utils.StackError(err, "Failed to write bool value at row %d", row)
				}
			case Int8:
				if err := valueWriter.AppendInt8(value.(int8)); err != nil {
					return utils.StackError(err, "Failed to write int8 value at row %d", row)
				}
			case Uint8:
				if err := valueWriter.AppendUint8(value.(uint8)); err != nil {
					return utils.StackError(err, "Failed to write uint8 value at row %d", row)
				}
			case Int16:
				if err := valueWriter.AppendInt16(value.(int16)); err != nil {
					return utils.StackError(err, "Failed to write int16 value at row %d", row)
				}
			case Uint16:
				if err := valueWriter.AppendUint16(value.(uint16)); err != nil {
					return utils.StackError(err, "Failed to write uint16 value at row %d", row)
				}
			case Int32:
				if err := valueWriter.AppendInt32(value.(int32)); err != nil {
					return utils.StackError(err, "Failed to write int32 value at row %d", row)
				}
			case Int64:
				if err := valueWriter.AppendInt64(value.(int64)); err != nil {
					return utils.StackError(err, "Failed to write int64 value at row %d", row)
				}
			case Uint32:
				if err := valueWriter.AppendUint32(value.(uint32)); err != nil {
					return utils.StackError(err, "Failed to write uint32 value at row %d", row)
				}
			case Float32:
				if err := valueWriter.AppendFloat32(value.(float32)); err != nil {
					return utils.StackError(err, "Failed to write float32 value at row %d", row)
				}
			case SmallEnum:
				if err := valueWriter.AppendUint8(value.(uint8)); err != nil {
					return utils.StackError(err, "Failed to write small enum value at row %d", row)
				}
			case BigEnum:
				if err := valueWriter.AppendUint16(value.(uint16)); err != nil {
					return utils.StackError(err, "Failed to write big enum value at row %d", row)
				}
			case UUID:
				// NOTE(review): the second half is written through `writer`
				// while the first uses `valueWriter`; they alias the same
				// BufferWriter (valueWriter = writer above), so this is
				// harmless, but inconsistent — consider unifying.
				err := valueWriter.AppendUint64(value.([2]uint64)[0])
				if err == nil {
					err = writer.AppendUint64(value.([2]uint64)[1])
				}
				if err != nil {
					return utils.StackError(err, "Failed to write uuid value at row %d", row)
				}
			case GeoPoint:
				// NOTE(review): same writer/valueWriter mixing as the UUID
				// case; harmless because the two variables alias.
				err := valueWriter.AppendFloat32(value.([2]float32)[0])
				if err == nil {
					err = writer.AppendFloat32(value.([2]float32)[1])
				}
				if err != nil {
					return utils.StackError(err, "Failed to write geopoint value at row %d", row)
				}
			case GeoShape:
				goVal := value.(GoDataValue)
				dataWriter := utils.NewStreamDataWriter(valueWriter)
				err := goVal.Write(&dataWriter)
				if err != nil {
					return utils.StackError(err, "Failed to write geoshape value at row %d", row)
				}
				// advance current offset
				currentValueOffset += uint32(goVal.GetSerBytes())
			}
		}

		// lastly write the final offset into offsetWriter
		if offsetWriter != nil {
			err := offsetWriter.AppendUint32(currentValueOffset)
			if err != nil {
				return utils.StackError(err, "Failed to write offset value at row %d", len(c.values))
			}
		}
	}
	// Align at byte for bit values.
	writer.AlignBytes(1)
	return nil
}

// GetMode returns the column mode based on the number of valid values.
330 func (c *columnBuilder) GetMode() ColumnMode { 331 if c.numValidValues == 0 { 332 return AllValuesDefault 333 } else if c.numValidValues == len(c.values) { 334 return AllValuesPresent 335 } else { 336 return HasNullVector 337 } 338 } 339 340 // UpsertBatchBuilder is the builder for constructing an UpsertBatch buffer. It allows random value 341 // write at (row, col). 342 type UpsertBatchBuilder struct { 343 NumRows int 344 columns []*columnBuilder 345 } 346 347 // NewUpsertBatchBuilder creates a new builder for constructing an UpersetBatch. 348 func NewUpsertBatchBuilder() *UpsertBatchBuilder { 349 return &UpsertBatchBuilder{} 350 } 351 352 // AddColumn add a new column to the builder. Initially, new columns have all values set to null. 353 func (u *UpsertBatchBuilder) AddColumn(columnID int, dataType DataType) error { 354 if len(u.columns) > math.MaxUint16 { 355 return utils.StackError(nil, "Upsert batch cannot hold more than %d columns", math.MaxUint16) 356 } 357 values := make([]interface{}, u.NumRows) 358 column := &columnBuilder{ 359 columnID: columnID, 360 dataType: dataType, 361 numValidValues: 0, 362 values: values, 363 enumDict: make(map[string]int), 364 } 365 u.columns = append(u.columns, column) 366 return nil 367 } 368 369 // AddColumnWithUpdateMode add a new column to the builder with update mode info. Initially, new columns have all values set to null. 370 func (u *UpsertBatchBuilder) AddColumnWithUpdateMode(columnID int, dataType DataType, updateMode ColumnUpdateMode) error { 371 if updateMode >= MaxColumnUpdateMode { 372 return utils.StackError(nil, "Invalid update mode %d", updateMode) 373 } 374 if err := u.AddColumn(columnID, dataType); err != nil { 375 return err 376 } 377 u.columns[len(u.columns)-1].updateMode = updateMode 378 return nil 379 } 380 381 // AddRow increases the number of rows in the batch by 1. A new row with all nil values is appended 382 // to the row array. 
func (u *UpsertBatchBuilder) AddRow() {
	// Every column grows by one null slot so row counts stay in sync.
	for _, column := range u.columns {
		column.AddRow()
	}
	u.NumRows++
}

// RemoveRow decreases the number of rows in the batch by 1. The last row will be removed. It's a
// no-op if the number of rows is 0.
func (u *UpsertBatchBuilder) RemoveRow() {
	if u.NumRows > 0 {
		for _, column := range u.columns {
			column.RemoveRow()
		}
		u.NumRows--
	}
}

// ResetRows reset the row count to 0.
func (u *UpsertBatchBuilder) ResetRows() {
	for _, column := range u.columns {
		column.ResetRows()
	}
	u.NumRows = 0
}

// SetValue set a value to a given (row, col).
// Returns an error when either index is out of range or the value cannot be
// converted to the column's data type.
func (u *UpsertBatchBuilder) SetValue(row int, col int, value interface{}) error {
	if row >= u.NumRows {
		return utils.StackError(nil, "Row index %d out of range %d", row, u.NumRows)
	}
	if col >= len(u.columns) {
		return utils.StackError(nil, "Col index %d out of range %d", col, len(u.columns))
	}
	return u.columns[col].SetValue(row, value)
}

// ToByteArray produces a serialized UpsertBatch in byte array.
// Layout: version header, fixed headers, per-column headers, then each
// column's data appended in order (with offsets recorded in the headers).
// NOTE(review): value receiver copies the builder struct; harmless here since
// the method only reads through the slice header.
func (u UpsertBatchBuilder) ToByteArray() ([]byte, error) {
	// Create buffer.
	numCols := len(u.columns)
	// initialized size to 4 bytes (version number).
	versionHeaderSize := 4
	// 24 bytes consist of fixed headers:
	// [int32] num_of_rows (4 bytes)
	// [uint16] num_of_columns (2 bytes)
	// <reserve 14 bytes>
	// [uint32] arrival_time (4 bytes)
	fixedHeaderSize := 24
	columnHeaderSize := ColumnHeaderSize(numCols)
	headerSize := versionHeaderSize + fixedHeaderSize + columnHeaderSize
	size := headerSize
	// Let each column account for its own serialized footprint.
	for _, column := range u.columns {
		column.CalculateBufferSize(&size)
	}
	size = utils.AlignOffset(size, 8)
	buffer := make([]byte, size)
	writer := utils.NewBufferWriter(buffer)

	// Write upsert batch version.
	if err := writer.AppendUint32(uint32(V1)); err != nil {
		return nil, utils.StackError(err, "Failed to write version number")
	}
	// Write fixed headers.
	if err := writer.AppendInt32(int32(u.NumRows)); err != nil {
		return nil, utils.StackError(err, "Failed to write number of rows")
	}
	if err := writer.AppendUint16(uint16(len(u.columns))); err != nil {
		return nil, utils.StackError(err, "Failed to write number of columns")
	}
	writer.SkipBytes(14)
	if err := writer.AppendUint32(uint32(utils.Now().Unix())); err != nil {
		return nil, utils.StackError(err, "Failed to write arrival time")
	}
	// The column header region sits between the fixed headers and column data;
	// it is filled in below while column data is appended.
	columnHeader := NewUpsertBatchHeader(buffer[writer.GetOffset():headerSize], numCols)
	// skip to data offset
	writer.SkipBytes(columnHeaderSize)

	// Write per column data their headers.
	for i, column := range u.columns {
		if err := columnHeader.WriteColumnID(column.columnID, i); err != nil {
			return nil, err
		}
		if err := columnHeader.WriteColumnFlag(column.GetMode(), column.updateMode, i); err != nil {
			return nil, err
		}
		if err := columnHeader.WriteColumnType(column.dataType, i); err != nil {
			return nil, err
		}
		// Record where this column's data starts...
		if err := columnHeader.WriteColumnOffset(writer.GetOffset(), i); err != nil {
			return nil, err
		}
		if err := columnHeader.WriteEnumDictLength(column.enumDictLengthInBytes, i); err != nil {
			return nil, err
		}
		if err := column.AppendToBuffer(&writer); err != nil {
			return nil, utils.StackError(err, "Failed to write data for column %d", i)
		}
		// ...and where it ends (the next column's start offset).
		if err := columnHeader.WriteColumnOffset(writer.GetOffset(), i+1); err != nil {
			return nil, err
		}
	}

	return buffer, nil
}

// AdditionUpdate adds the value at newValue into the value at oldValue in
// place, for the supported fixed-width numeric data types. Unsupported types
// are silently left unchanged.
func AdditionUpdate(oldValue, newValue unsafe.Pointer, dataType DataType) {
	switch dataType {
	case Int8:
		*(*int8)(oldValue) = *(*int8)(oldValue) + *(*int8)(newValue)
	case Uint8:
		*(*uint8)(oldValue) = *(*uint8)(oldValue) + *(*uint8)(newValue)
	case Int16:
		*(*int16)(oldValue) = *(*int16)(oldValue) + *(*int16)(newValue)
	case Uint16:
		*(*uint16)(oldValue) = *(*uint16)(oldValue) + *(*uint16)(newValue)
	case Int32:
		*(*int32)(oldValue) = *(*int32)(oldValue) + *(*int32)(newValue)
	case Uint32:
		*(*uint32)(oldValue) = *(*uint32)(oldValue) + *(*uint32)(newValue)
	case Int64:
		*(*int64)(oldValue) = *(*int64)(oldValue) + *(*int64)(newValue)
	case Float32:
		*(*float32)(oldValue) = *(*float32)(oldValue) + *(*float32)(newValue)
	}
}

// MinMaxUpdate update the old value if compareRes == expectedRes
// (i.e. cmpFunc(old, new) has the same sign as expectedRes). Used for both
// UpdateWithMin and UpdateWithMax, differing only in expectedRes.
func MinMaxUpdate(oldValue, newValue unsafe.Pointer, dataType DataType, cmpFunc CompareFunc, expectedRes int) {
	// Overwrite only when the comparison result matches the expected sign;
	// unsupported data types are silently left unchanged.
	if compareRes := cmpFunc(oldValue, newValue); compareRes*expectedRes > 0 {
		switch dataType {
		case Int8:
			*(*int8)(oldValue) = *(*int8)(newValue)
		case Uint8:
			*(*uint8)(oldValue) = *(*uint8)(newValue)
		case Int16:
			*(*int16)(oldValue) = *(*int16)(newValue)
		case Uint16:
			*(*uint16)(oldValue) = *(*uint16)(newValue)
		case Int32:
			*(*int32)(oldValue) = *(*int32)(newValue)
		case Uint32:
			*(*uint32)(oldValue) = *(*uint32)(newValue)
		case Int64:
			*(*int64)(oldValue) = *(*int64)(newValue)
		case Float32:
			*(*float32)(oldValue) = *(*float32)(newValue)
		}
	}
}