github.com/rohankumardubey/aresdb@v0.0.2-0.20190517170215-e54e3ca06b9c/memstore/common/upsert_batch_builder.go (about)

     1  //  Copyright (c) 2017-2018 Uber Technologies, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package common
    16  
    17  import (
    18  	"math"
    19  	"strings"
    20  
    21  	metaCom "github.com/uber/aresdb/metastore/common"
    22  	"github.com/uber/aresdb/utils"
    23  	"unsafe"
    24  )
    25  
// ColumnUpdateMode represents how a column value carried by an UpsertBatch is
// applied on top of the existing value (see the Update* constants).
type ColumnUpdateMode int
    28  
// UpsertBatchVersion represents the version of the serialized upsert batch format.
type UpsertBatchVersion uint32
    31  
const (
	// UpdateOverwriteNotNull (default) will overwrite existing value if new value is NOT null, otherwise just skip
	UpdateOverwriteNotNull ColumnUpdateMode = iota
	// UpdateForceOverwrite will simply overwrite existing value even when new data is null
	UpdateForceOverwrite
	// UpdateWithAddition will add the existing value with new value if new value is not null, existing null value will be treated as 0 in calculation
	UpdateWithAddition
	// UpdateWithMin will save the minimum of existing and new value if new value is not null, existing null value will be treated as MAX_INT in calculation
	UpdateWithMin
	// UpdateWithMax will save the maximum of existing and new value if new value is not null, existing null value will be treated as MIN_INT in calculation
	UpdateWithMax
	// MaxColumnUpdateMode is the current upper limit for column update modes
	MaxColumnUpdateMode
)
    46  
const (
	// V1 is the upsert batch version number; ToByteArray writes it as the
	// first 4 bytes of the serialized batch.
	V1 UpsertBatchVersion = 0xFEED0001
)
    50  
// columnBuilder accumulates the values of a single column while an upsert
// batch is being built.
type columnBuilder struct {
	// columnID is the ID written into the column header for this column.
	columnID int
	// dataType is the data type that values are converted to and serialized as.
	dataType DataType
	// values holds one entry per row; a nil entry means the row is null.
	values []interface{}
	// enumDict maps an enum case string to the enum ID assigned to it.
	enumDict map[string]int
	// enumDictLengthInBytes is final length in bytes for enum dict vector
	// first byte represent validity
	// 1+len(enum)+len(delimiter)+len(enum)+...
	enumDictLengthInBytes int
	// numValidValues is the number of non-nil entries in values.
	numValidValues int
	// updateMode is the update mode written into the column header flag.
	updateMode ColumnUpdateMode
}
    63  
    64  // SetValue write a value into the column at given row.
    65  func (c *columnBuilder) SetValue(row int, value interface{}) error {
    66  	oldValueNull := c.values[row] == nil
    67  
    68  	if value == nil {
    69  		c.values[row] = nil
    70  	} else {
    71  		if IsEnumType(c.dataType) {
    72  			if strVal, ok := value.(string); ok {
    73  				value = c.GetOrAppendEnumCase(strVal)
    74  			}
    75  		}
    76  		var err error
    77  		c.values[row], err = ConvertValueForType(c.dataType, value)
    78  		if err != nil {
    79  			return err
    80  		}
    81  	}
    82  
    83  	if oldValueNull && c.values[row] != nil {
    84  		c.numValidValues++
    85  	} else if !oldValueNull && c.values[row] == nil {
    86  		c.numValidValues--
    87  	}
    88  
    89  	return nil
    90  }
    91  
    92  // GetOrAppendEnumCase add an enum cases to the column
    93  // and returns the enumID
    94  // caller should make sure the column is a enum column
    95  func (c *columnBuilder) GetOrAppendEnumCase(str string) int {
    96  	newID := len(c.enumDict)
    97  	if index, exist := c.enumDict[str]; exist {
    98  		return index
    99  	}
   100  	if newID == 0 {
   101  		// first byte represent the validity
   102  		c.enumDictLengthInBytes += 1 + len(str)
   103  	} else {
   104  		c.enumDictLengthInBytes += len(metaCom.EnumDelimiter) + len(str)
   105  	}
   106  	c.enumDict[str] = newID
   107  	return newID
   108  }
   109  
   110  // AddRow grow the value array by 1.
   111  func (c *columnBuilder) AddRow() {
   112  	c.values = append(c.values, nil)
   113  }
   114  
   115  // AddRow shrink the value array by 1.
   116  func (c *columnBuilder) RemoveRow() {
   117  	lastValueIndex := len(c.values) - 1
   118  	lastValueNull := c.values[lastValueIndex] == nil
   119  	c.values = c.values[:lastValueIndex]
   120  	if !lastValueNull {
   121  		c.numValidValues--
   122  	}
   123  }
   124  
   125  // ResetRows reset the row count to 0.
   126  func (c *columnBuilder) ResetRows() {
   127  	c.values = c.values[0:0]
   128  	c.numValidValues = 0
   129  }
   130  
   131  func (c *columnBuilder) getEnumDictVector() []byte {
   132  	enumReverseMap := make([]string, len(c.enumDict))
   133  	for enum, id := range c.enumDict {
   134  		if id == 0 {
   135  			// prepend the validity byte to the first enum case
   136  			enumReverseMap[id] = string([]byte{1}) + enum
   137  		} else {
   138  			enumReverseMap[id] = enum
   139  		}
   140  	}
   141  	return []byte(strings.Join(enumReverseMap, metaCom.EnumDelimiter))
   142  }
   143  
   144  // Calculated BufferSize returns the size of the column data in serialized format.
   145  func (c *columnBuilder) CalculateBufferSize(offset *int) {
   146  	isGoType := IsGoType(c.dataType)
   147  
   148  	switch c.GetMode() {
   149  	case AllValuesDefault:
   150  	case HasNullVector:
   151  		if !isGoType {
   152  			*offset += (len(c.values) + 7) / 8
   153  		}
   154  		fallthrough
   155  	case AllValuesPresent:
   156  		// write enum buffer if exists
   157  		enumDictLength := c.enumDictLengthInBytes
   158  		*offset += enumDictLength
   159  		// if golang memory, align to 4 bytes for offset vector
   160  		if isGoType {
   161  			*offset = utils.AlignOffset(*offset, 4)
   162  			// 1. uint32 for each offset value, and length = numRows + 1
   163  			// 2. last offset value is the end offset of the offset buffer
   164  			*offset += (len(c.values) + 1) * 4
   165  			// Padding size for value vector
   166  			*offset = utils.AlignOffset(*offset, 8)
   167  			for _, v := range c.values {
   168  				if v != nil {
   169  					goVal := v.(GoDataValue)
   170  					*offset += goVal.GetSerBytes()
   171  				}
   172  			}
   173  		} else {
   174  			// Padding size for value vector
   175  			*offset = utils.AlignOffset(*offset, 8)
   176  			// fixed value size
   177  			*offset += (DataTypeBits(c.dataType)*len(c.values) + 7) / 8
   178  		}
   179  	}
   180  }
   181  
// AppendToBuffer writes the column data to buffer and advances offset.
//
// What gets written depends on the column mode:
//   - AllValuesDefault: nothing is written.
//   - HasNullVector: for non go types a null vector (one bool per row, true
//     when the row has a value) is written first, then everything listed for
//     AllValuesPresent.
//   - AllValuesPresent: the enum dict vector (if any), for go types an offset
//     vector of len(values)+1 uint32 entries, and finally the value vector.
//
// The first failing write is returned wrapped with the row it happened at.
func (c *columnBuilder) AppendToBuffer(writer *utils.BufferWriter) error {
	writer.AlignBytes(1)
	isGoType := IsGoType(c.dataType)

	switch c.GetMode() {
	case AllValuesDefault:
		return nil
	case HasNullVector:
		// only non goType needs to write null vector
		if !isGoType {
			for row := 0; row < len(c.values); row++ {
				value := c.values[row]
				if err := writer.AppendBool(value != nil); err != nil {
					return utils.StackError(err, "Failed to write null vector at row %d", row)
				}
			}
		}
		fallthrough
	case AllValuesPresent:
		if c.enumDictLengthInBytes > 0 {
			if err := writer.Append(c.getEnumDictVector()); err != nil {
				return utils.StackError(err, "Failed to write enum dict vector")
			}
		}
		var offsetWriter, valueWriter *utils.BufferWriter
		// only goType needs to write offsetVector
		if isGoType {
			// Padding to 4 byte alignment for offset vector
			writer.AlignBytes(4)
			// Copy the writer so the offset vector can be filled in through the
			// copy while the original writer moves on to the value vector.
			writerForked := *writer
			offsetWriter = &writerForked
			// skip offset bytes
			writer.SkipBytes((len(c.values) + 1) * 4)
		}

		// Padding to 8 byte alignment for value vector
		writer.AlignBytes(8)
		// valueWriter is the same pointer as writer, so writes through either
		// name land in the same value vector.
		valueWriter = writer
		// local byte offset of current value in value vector
		currentValueOffset := uint32(0)
		// write values starting from current value vector offset
		for row := 0; row < len(c.values); row++ {
			// write current offset if offsetWriter is defined
			if offsetWriter != nil {
				err := offsetWriter.AppendUint32(currentValueOffset)
				if err != nil {
					return utils.StackError(err, "Failed to write offset value at row %d", row)
				}
			}

			value := c.values[row]
			// Handle null value.
			if value == nil {
				// only skip bits when there is no offset vector
				if offsetWriter == nil {
					valueWriter.SkipBits(DataTypeBits(c.dataType))
				}
				continue
			}

			// The type assertions below rely on value already having the Go
			// type that matches c.dataType.
			switch c.dataType {
			case Bool:
				if err := valueWriter.AppendBool(value.(bool)); err != nil {
					return utils.StackError(err, "Failed to write bool value at row %d", row)
				}
			case Int8:
				if err := valueWriter.AppendInt8(value.(int8)); err != nil {
					return utils.StackError(err, "Failed to write int8 value at row %d", row)
				}
			case Uint8:
				if err := valueWriter.AppendUint8(value.(uint8)); err != nil {
					return utils.StackError(err, "Failed to write uint8 value at row %d", row)
				}
			case Int16:
				if err := valueWriter.AppendInt16(value.(int16)); err != nil {
					return utils.StackError(err, "Failed to write int16 value at row %d", row)
				}
			case Uint16:
				if err := valueWriter.AppendUint16(value.(uint16)); err != nil {
					return utils.StackError(err, "Failed to write uint16 value at row %d", row)
				}
			case Int32:
				if err := valueWriter.AppendInt32(value.(int32)); err != nil {
					return utils.StackError(err, "Failed to write int32 value at row %d", row)
				}
			case Int64:
				if err := valueWriter.AppendInt64(value.(int64)); err != nil {
					return utils.StackError(err, "Failed to write int64 value at row %d", row)
				}
			case Uint32:
				if err := valueWriter.AppendUint32(value.(uint32)); err != nil {
					return utils.StackError(err, "Failed to write uint32 value at row %d", row)
				}
			case Float32:
				if err := valueWriter.AppendFloat32(value.(float32)); err != nil {
					return utils.StackError(err, "Failed to write float32 value at row %d", row)
				}
			case SmallEnum:
				if err := valueWriter.AppendUint8(value.(uint8)); err != nil {
					return utils.StackError(err, "Failed to write small enum value at row %d", row)
				}
			case BigEnum:
				if err := valueWriter.AppendUint16(value.(uint16)); err != nil {
					return utils.StackError(err, "Failed to write big enum value at row %d", row)
				}
			case UUID:
				// uuid is stored as two consecutive uint64 values
				err := valueWriter.AppendUint64(value.([2]uint64)[0])
				if err == nil {
					err = writer.AppendUint64(value.([2]uint64)[1])
				}
				if err != nil {
					return utils.StackError(err, "Failed to write uuid value at row %d", row)
				}
			case GeoPoint:
				// geopoint is stored as two consecutive float32 values
				err := valueWriter.AppendFloat32(value.([2]float32)[0])
				if err == nil {
					err = writer.AppendFloat32(value.([2]float32)[1])
				}
				if err != nil {
					return utils.StackError(err, "Failed to write geopoint value at row %d", row)
				}
			case GeoShape:
				goVal := value.(GoDataValue)
				dataWriter := utils.NewStreamDataWriter(valueWriter)
				err := goVal.Write(&dataWriter)
				if err != nil {
					return utils.StackError(err, "Failed to write geoshape value at row %d", row)
				}
				// advance current offset
				currentValueOffset += uint32(goVal.GetSerBytes())
			}
		}

		// lastly write the final offset into offsetWriter
		if offsetWriter != nil {
			err := offsetWriter.AppendUint32(currentValueOffset)
			if err != nil {
				return utils.StackError(err, "Failed to write offset value at row %d", len(c.values))
			}
		}
	}
	// Align at byte for bit values.
	writer.AlignBytes(1)
	return nil
}
   328  
   329  // GetMode get the mode based on number of valid values.
   330  func (c *columnBuilder) GetMode() ColumnMode {
   331  	if c.numValidValues == 0 {
   332  		return AllValuesDefault
   333  	} else if c.numValidValues == len(c.values) {
   334  		return AllValuesPresent
   335  	} else {
   336  		return HasNullVector
   337  	}
   338  }
   339  
// UpsertBatchBuilder is the builder for constructing an UpsertBatch buffer. It allows random value
// write at (row, col).
type UpsertBatchBuilder struct {
	// NumRows is the current number of rows in the batch.
	NumRows int
	// columns holds one builder per added column, in the order they were added.
	columns []*columnBuilder
}
   346  
   347  // NewUpsertBatchBuilder creates a new builder for constructing an UpersetBatch.
   348  func NewUpsertBatchBuilder() *UpsertBatchBuilder {
   349  	return &UpsertBatchBuilder{}
   350  }
   351  
   352  // AddColumn add a new column to the builder. Initially, new columns have all values set to null.
   353  func (u *UpsertBatchBuilder) AddColumn(columnID int, dataType DataType) error {
   354  	if len(u.columns) > math.MaxUint16 {
   355  		return utils.StackError(nil, "Upsert batch cannot hold more than %d columns", math.MaxUint16)
   356  	}
   357  	values := make([]interface{}, u.NumRows)
   358  	column := &columnBuilder{
   359  		columnID:       columnID,
   360  		dataType:       dataType,
   361  		numValidValues: 0,
   362  		values:         values,
   363  		enumDict:       make(map[string]int),
   364  	}
   365  	u.columns = append(u.columns, column)
   366  	return nil
   367  }
   368  
   369  // AddColumnWithUpdateMode add a new column to the builder with update mode info. Initially, new columns have all values set to null.
   370  func (u *UpsertBatchBuilder) AddColumnWithUpdateMode(columnID int, dataType DataType, updateMode ColumnUpdateMode) error {
   371  	if updateMode >= MaxColumnUpdateMode {
   372  		return utils.StackError(nil, "Invalid update mode %d", updateMode)
   373  	}
   374  	if err := u.AddColumn(columnID, dataType); err != nil {
   375  		return err
   376  	}
   377  	u.columns[len(u.columns)-1].updateMode = updateMode
   378  	return nil
   379  }
   380  
   381  // AddRow increases the number of rows in the batch by 1. A new row with all nil values is appended
   382  // to the row array.
   383  func (u *UpsertBatchBuilder) AddRow() {
   384  	for _, column := range u.columns {
   385  		column.AddRow()
   386  	}
   387  	u.NumRows++
   388  }
   389  
   390  // RemoveRow decreases the number of rows in the batch by 1. The last row will be removed. It's a
   391  // no-op if the number of rows is 0.
   392  func (u *UpsertBatchBuilder) RemoveRow() {
   393  	if u.NumRows > 0 {
   394  		for _, column := range u.columns {
   395  			column.RemoveRow()
   396  		}
   397  		u.NumRows--
   398  	}
   399  }
   400  
   401  // ResetRows reset the row count to 0.
   402  func (u *UpsertBatchBuilder) ResetRows() {
   403  	for _, column := range u.columns {
   404  		column.ResetRows()
   405  	}
   406  	u.NumRows = 0
   407  }
   408  
   409  // SetValue set a value to a given (row, col).
   410  func (u *UpsertBatchBuilder) SetValue(row int, col int, value interface{}) error {
   411  	if row >= u.NumRows {
   412  		return utils.StackError(nil, "Row index %d out of range %d", row, u.NumRows)
   413  	}
   414  	if col >= len(u.columns) {
   415  		return utils.StackError(nil, "Col index %d out of range %d", col, len(u.columns))
   416  	}
   417  	return u.columns[col].SetValue(row, value)
   418  }
   419  
   420  // ToByteArray produces a serialized UpsertBatch in byte array.
   421  func (u UpsertBatchBuilder) ToByteArray() ([]byte, error) {
   422  	// Create buffer.
   423  	numCols := len(u.columns)
   424  	// initialized size to 4 bytes (version number).
   425  	versionHeaderSize := 4
   426  	// 24 bytes consist of fixed headers:
   427  	// [int32] num_of_rows (4 bytes)
   428  	// [uint16] num_of_columns (2 bytes)
   429  	// <reserve 14 bytes>
   430  	// [uint32] arrival_time (4 bytes)
   431  	fixedHeaderSize := 24
   432  	columnHeaderSize := ColumnHeaderSize(numCols)
   433  	headerSize := versionHeaderSize + fixedHeaderSize + columnHeaderSize
   434  	size := headerSize
   435  	for _, column := range u.columns {
   436  		column.CalculateBufferSize(&size)
   437  	}
   438  	size = utils.AlignOffset(size, 8)
   439  	buffer := make([]byte, size)
   440  	writer := utils.NewBufferWriter(buffer)
   441  
   442  	// Write upsert batch version.
   443  	if err := writer.AppendUint32(uint32(V1)); err != nil {
   444  		return nil, utils.StackError(err, "Failed to write version number")
   445  	}
   446  	// Write fixed headers.
   447  	if err := writer.AppendInt32(int32(u.NumRows)); err != nil {
   448  		return nil, utils.StackError(err, "Failed to write number of rows")
   449  	}
   450  	if err := writer.AppendUint16(uint16(len(u.columns))); err != nil {
   451  		return nil, utils.StackError(err, "Failed to write number of columns")
   452  	}
   453  	writer.SkipBytes(14)
   454  	if err := writer.AppendUint32(uint32(utils.Now().Unix())); err != nil {
   455  		return nil, utils.StackError(err, "Failed to write arrival time")
   456  	}
   457  	columnHeader := NewUpsertBatchHeader(buffer[writer.GetOffset():headerSize], numCols)
   458  	// skip to data offset
   459  	writer.SkipBytes(columnHeaderSize)
   460  
   461  	// Write per column data their headers.
   462  	for i, column := range u.columns {
   463  		if err := columnHeader.WriteColumnID(column.columnID, i); err != nil {
   464  			return nil, err
   465  		}
   466  		if err := columnHeader.WriteColumnFlag(column.GetMode(), column.updateMode, i); err != nil {
   467  			return nil, err
   468  		}
   469  		if err := columnHeader.WriteColumnType(column.dataType, i); err != nil {
   470  			return nil, err
   471  		}
   472  		if err := columnHeader.WriteColumnOffset(writer.GetOffset(), i); err != nil {
   473  			return nil, err
   474  		}
   475  		if err := columnHeader.WriteEnumDictLength(column.enumDictLengthInBytes, i); err != nil {
   476  			return nil, err
   477  		}
   478  		if err := column.AppendToBuffer(&writer); err != nil {
   479  			return nil, utils.StackError(err, "Failed to write data for column %d", i)
   480  		}
   481  		if err := columnHeader.WriteColumnOffset(writer.GetOffset(), i+1); err != nil {
   482  			return nil, err
   483  		}
   484  	}
   485  
   486  	return buffer, nil
   487  }
   488  
   489  func AdditionUpdate(oldValue, newValue unsafe.Pointer, dataType DataType) {
   490  	switch dataType {
   491  	case Int8:
   492  		*(*int8)(oldValue) = *(*int8)(oldValue) + *(*int8)(newValue)
   493  	case Uint8:
   494  		*(*uint8)(oldValue) = *(*uint8)(oldValue) + *(*uint8)(newValue)
   495  	case Int16:
   496  		*(*int16)(oldValue) = *(*int16)(oldValue) + *(*int16)(newValue)
   497  	case Uint16:
   498  		*(*uint16)(oldValue) = *(*uint16)(oldValue) + *(*uint16)(newValue)
   499  	case Int32:
   500  		*(*int32)(oldValue) = *(*int32)(oldValue) + *(*int32)(newValue)
   501  	case Uint32:
   502  		*(*uint32)(oldValue) = *(*uint32)(oldValue) + *(*uint32)(newValue)
   503  	case Int64:
   504  		*(*int64)(oldValue) = *(*int64)(oldValue) + *(*int64)(newValue)
   505  	case Float32:
   506  		*(*float32)(oldValue) = *(*float32)(oldValue) + *(*float32)(newValue)
   507  	}
   508  }
   509  
   510  // MinMaxUpdate update the old value if compareRes == expectedRes
   511  func MinMaxUpdate(oldValue, newValue unsafe.Pointer, dataType DataType, cmpFunc CompareFunc, expectedRes int) {
   512  	if compareRes := cmpFunc(oldValue, newValue); compareRes*expectedRes > 0 {
   513  		switch dataType {
   514  		case Int8:
   515  			*(*int8)(oldValue) = *(*int8)(newValue)
   516  		case Uint8:
   517  			*(*uint8)(oldValue) = *(*uint8)(newValue)
   518  		case Int16:
   519  			*(*int16)(oldValue) = *(*int16)(newValue)
   520  		case Uint16:
   521  			*(*uint16)(oldValue) = *(*uint16)(newValue)
   522  		case Int32:
   523  			*(*int32)(oldValue) = *(*int32)(newValue)
   524  		case Uint32:
   525  			*(*uint32)(oldValue) = *(*uint32)(newValue)
   526  		case Int64:
   527  			*(*int64)(oldValue) = *(*int64)(newValue)
   528  		case Float32:
   529  			*(*float32)(oldValue) = *(*float32)(newValue)
   530  		}
   531  	}
   532  }