github.com/rohankumardubey/aresdb@v0.0.2-0.20190517170215-e54e3ca06b9c/client/connector.go

//  Copyright (c) 2017-2018 Uber Technologies, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package client

import (
	"bytes"
	"fmt"
	"net/http"
	"sync"
	"time"

	"strconv"
	"strings"
	"unsafe"

	"github.com/uber-go/tally"
	memCom "github.com/uber/aresdb/memstore/common"
	metaCom "github.com/uber/aresdb/metastore/common"
	"github.com/uber/aresdb/utils"
	"go.uber.org/zap"
)

const (
	// default request timeout in seconds
	defaultRequestTimeout = 5
	// default schema refresh interval in seconds
	defaultSchemaRefreshInterval = 600
	dataIngestionHeader          = "application/upsert-data"
	applicationJSONHeader        = "application/json"
)

// Row represents a row of insert data.
type Row []interface{}

// Connector is the connector interface for ares.
type Connector interface {
	// Insert inserts rows into ares and returns the number of rows inserted and an error.
	// updateModes are optional: when omitted, the default update mode is used for all columns;
	// when provided, a mode must be given for every column.
	Insert(tableName string, columnNames []string, rows []Row, updateModes ...memCom.ColumnUpdateMode) (int, error)
}
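
// Illustrative usage sketch (not part of the original source). It assumes a
// Connector built via ConnectorConfig.NewConnector and a hypothetical table
// "trips" with columns "uuid", "city_id" and "fare":
//
//	rows := []client.Row{
//		{"3564aba2-6efc-4c87-9fe7-35a1bbcda887", 1, 25.9},
//		{"82a8b9f1-1f4e-4c6d-9e6a-0c2f3a6a7b51", 1, 12.5},
//	}
//	numInserted, err := connector.Insert("trips", []string{"uuid", "city_id", "fare"}, rows)
//	if err != nil {
//		// handle ingestion failure
//	}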

// UpsertBatchBuilder is the client-side interface for building upsert batches.
type UpsertBatchBuilder interface {
	PrepareUpsertBatch(tableName string, columnNames []string, updateModes []memCom.ColumnUpdateMode, rows []Row) ([]byte, int, error)
}

// enumCasesWrapper is a response/request body which wraps enum cases
type enumCasesWrapper struct {
	EnumCases []string
}

// TableSchema wraps a table's metadata together with a column name to column ID lookup.
type TableSchema struct {
	Table *metaCom.Table
	// maps from column name to columnID for convenience
	ColumnDict map[string]int
}

// enumDict maps from enum value to enumID
type enumDict map[string]int

// UpsertBatchBuilderImpl implements the UpsertBatchBuilder interface.
type UpsertBatchBuilderImpl struct {
	sync.RWMutex

	logger        *zap.SugaredLogger
	metricScope   tally.Scope
	schemaHandler *CachedSchemaHandler
}

// connector is the ares Connector implementation
type connector struct {
	cfg                ConnectorConfig
	httpClient         http.Client
	upsertBatchBuilder UpsertBatchBuilder
	schemaHandler      *CachedSchemaHandler
}

// ConnectorConfig holds the configurations for ares Connector.
type ConnectorConfig struct {
	// Address is in the format of host:port
	Address string `yaml:"address"`
	// Timeout is the request timeout in seconds for http calls;
	// if <= 0, the default is used
	Timeout int `yaml:"timeout"`
	// SchemaRefreshInterval is the interval in seconds at which the connector
	// fetches and refreshes the schema from ares;
	// if <= 0, the default is used
	SchemaRefreshInterval int `yaml:"schemaRefreshInterval"`
}
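
// A minimal YAML sketch of this configuration (key names follow the yaml tags
// above; the address and values below are illustrative assumptions only):
//
//	address: localhost:9374
//	timeout: 5
//	schemaRefreshInterval: 600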

// NewUpsertBatchBuilderImpl creates an UpsertBatchBuilder backed by the given schema handler.
func NewUpsertBatchBuilderImpl(logger *zap.SugaredLogger, scope tally.Scope, schemaHandler *CachedSchemaHandler) UpsertBatchBuilder {
	return &UpsertBatchBuilderImpl{
		logger:        logger,
		metricScope:   scope,
		schemaHandler: schemaHandler,
	}
}

// NewConnector returns a new ares Connector
func (cfg ConnectorConfig) NewConnector(logger *zap.SugaredLogger, metricScope tally.Scope) (Connector, error) {
	if cfg.SchemaRefreshInterval <= 0 {
		cfg.SchemaRefreshInterval = defaultSchemaRefreshInterval
	}

	if cfg.Timeout <= 0 {
		cfg.Timeout = defaultRequestTimeout
	}

	httpClient := http.Client{
		Timeout: time.Duration(cfg.Timeout) * time.Second,
	}

	httpSchemaFetcher := NewHttpSchemaFetcher(httpClient, cfg.Address, metricScope)
	cachedSchemaHandler := NewCachedSchemaHandler(logger, metricScope, httpSchemaFetcher)
	err := cachedSchemaHandler.Start(cfg.SchemaRefreshInterval)
	if err != nil {
		return nil, err
	}

	connector := &connector{
		cfg:        cfg,
		httpClient: httpClient,
		upsertBatchBuilder: &UpsertBatchBuilderImpl{
			logger:        logger,
			metricScope:   metricScope,
			schemaHandler: cachedSchemaHandler,
		},
		schemaHandler: cachedSchemaHandler,
	}
	return connector, nil
}
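
// Illustrative wiring sketch (not part of the original source) showing how a
// caller might construct a Connector; the address is an assumption and the
// no-op logger/scope are placeholders for real instances:
//
//	cfg := client.ConnectorConfig{Address: "localhost:9374"}
//	connector, err := cfg.NewConnector(zap.NewNop().Sugar(), tally.NoopScope)
//	if err != nil {
//		// the schema handler failed to start; the connector is unusable
//	}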

// Insert inserts a batch of rows into ares
func (c *connector) Insert(tableName string, columnNames []string, rows []Row, updateModes ...memCom.ColumnUpdateMode) (int, error) {
	if len(columnNames) == 0 {
		return 0, utils.StackError(nil, "No column names specified")
	}

	// if no update modes are given, use the default mode for all columns
	if len(updateModes) == 0 {
		updateModes = make([]memCom.ColumnUpdateMode, len(columnNames))
	}

	if len(updateModes) != len(columnNames) {
		return 0, utils.StackError(nil, "length of column update modes %d does not match number of columns %d", len(updateModes), len(columnNames))
	}

	if len(rows) == 0 {
		// Do nothing when there are no rows to insert
		return 0, nil
	}

	for _, row := range rows {
		if len(row) != len(columnNames) {
			return 0, utils.StackError(nil,
				"Length of column names should match length of a single row, length of column names: %d, length of row: %d",
				len(columnNames),
				len(row),
			)
		}
	}

	upsertBatchBytes, numRows, err := c.prepareUpsertBatch(tableName, columnNames, updateModes, rows)
	if err != nil {
		return numRows, err
	}

	// TODO: currently always use shard zero for single instance version
	resp, err := c.httpClient.Post(c.dataPath(tableName, 0), dataIngestionHeader, bytes.NewReader(upsertBatchBytes))
	if err != nil {
		return 0, utils.StackError(err, "Failed to post upsert batch, table: %s, shard: %d", tableName, 0)
	}
	defer resp.Body.Close()

	// TODO: handle specific status codes separately, e.g. retry on 5xx
	if resp.StatusCode != http.StatusOK {
		return 0, utils.StackError(nil, "Failed to post upsert batch, table: %s, shard: %d, status code: %d", tableName, 0, resp.StatusCode)
	}

	return numRows, nil
}
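
// Illustrative sketch of per-column update modes (not part of the original
// source; table and column names are assumptions). When modes are supplied
// there must be exactly one per column; the zero value selects the default
// update behavior, while memCom.UpdateForceOverwrite (referenced later in
// this file) forces an overwrite:
//
//	modes := []memCom.ColumnUpdateMode{
//		0,                           // "uuid": default update mode
//		memCom.UpdateForceOverwrite, // "fare": always overwrite
//	}
//	numInserted, err := connector.Insert("trips", []string{"uuid", "fare"}, rows, modes...)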

// computeHLLValue computes the hyperloglog value for the given data value.
func computeHLLValue(dataType memCom.DataType, value interface{}) (uint32, error) {
	var ok bool
	var hashed uint64
	switch dataType {
	case memCom.UUID:
		var v [2]uint64
		v, ok = memCom.ConvertToUUID(value)
		hashed = v[0] ^ v[1]
	case memCom.Uint32:
		var v uint32
		v, ok = memCom.ConvertToUint32(value)
		hashed = utils.Murmur3Sum64(unsafe.Pointer(&v), memCom.DataTypeBytes(dataType), 0)
	case memCom.Int32:
		var v int32
		v, ok = memCom.ConvertToInt32(value)
		hashed = utils.Murmur3Sum64(unsafe.Pointer(&v), memCom.DataTypeBytes(dataType), 0)
	case memCom.Int64:
		var v int64
		v, ok = memCom.ConvertToInt64(value)
		hashed = utils.Murmur3Sum64(unsafe.Pointer(&v), memCom.DataTypeBytes(dataType), 0)
	default:
		return 0, utils.StackError(nil, "invalid type %s for fast hll value", memCom.DataTypeName[dataType])
	}
	if !ok {
		return 0, utils.StackError(nil, "invalid data value %v for data type %s", value, memCom.DataTypeName[dataType])
	}
	return utils.ComputeHLLValue(hashed), nil
}
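
// Brief sketch of the flow above (illustrative only): the raw value is
// murmur3-hashed (or the two UUID halves are XORed) and the 64-bit hash is
// then encoded by utils.ComputeHLLValue into the value stored for the HLL
// column, e.g. for a uint32 column:
//
//	hllValue, err := computeHLLValue(memCom.Uint32, uint32(42))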

// prepareUpsertBatch prepares the upsert batch for ingestion and returns the
// serialized upsert batch, the number of rows in the batch, and an error.
func (c *connector) prepareUpsertBatch(tableName string, columnNames []string, updateModes []memCom.ColumnUpdateMode, rows []Row) ([]byte, int, error) {
	schema, err := c.schemaHandler.FetchSchema(tableName)
	if err != nil {
		return nil, 0, err
	}

	// return error if a primary key column is missing
	if err = checkPrimaryKeys(schema, columnNames); err != nil {
		return nil, 0, err
	}

	// return error if the time column is missing
	if err = checkTimeColumnExistence(schema, columnNames); err != nil {
		return nil, 0, err
	}

	return c.upsertBatchBuilder.PrepareUpsertBatch(tableName, columnNames, updateModes, rows)
}

// checkPrimaryKeys returns an error if any primary key column is missing from columnNames.
func checkPrimaryKeys(schema *TableSchema, columnNames []string) error {
	for _, columnID := range schema.Table.PrimaryKeyColumns {
		pkColumn := schema.Table.Columns[columnID]
		index := utils.IndexOfStr(columnNames, pkColumn.Name)
		if index < 0 {
			return utils.StackError(nil, "Missing primary key column %s", pkColumn.Name)
		}
	}
	return nil
}

// checkTimeColumnExistence returns an error if the time column (column 0) is missing
// for a fact table that does not allow missing event time.
func checkTimeColumnExistence(schema *TableSchema, columnNames []string) error {
	if !schema.Table.IsFactTable || schema.Table.Config.AllowMissingEventTime {
		return nil
	}

	for _, columnName := range columnNames {
		columnID, exist := schema.ColumnDict[columnName]
		if !exist {
			continue
		}

		if columnID == 0 {
			return nil
		}
	}
	return utils.StackError(nil, "Missing time column")
}

// dataPath returns the data ingestion endpoint for the given table and shard.
func (c *connector) dataPath(tableName string, shard int) string {
	return fmt.Sprintf("http://%s/data/%s/%d", c.cfg.Address, tableName, shard)
}

// prepareEnumCases collects the distinct enum values that appear in rows for the
// given column and pre-registers them with the schema handler. Rows whose enum
// value is not a string are recorded in abandonRows and skipped.
func (u *UpsertBatchBuilderImpl) prepareEnumCases(tableName, columnName string, colIndex, columnID int, rows []Row, abandonRows map[int]struct{}, caseInsensitive bool, disableAutoExpand bool) error {
	enumCaseSet := make(map[string]struct{})
	for rowIndex, row := range rows {
		if _, exist := abandonRows[rowIndex]; exist {
			continue
		}
		value := row[colIndex]

		if value == nil {
			continue
		}

		if enumCase, ok := value.(string); ok {
			if caseInsensitive {
				enumCase = strings.ToLower(enumCase)
			}
			enumCaseSet[enumCase] = struct{}{}
		} else {
			u.logger.With(
				"name", "prepareEnumCases",
				"error", "Enum value should be string",
				"table", tableName,
				"columnID", columnID,
				"value", value).Debug("Enum value is not string")
			u.metricScope.Tagged(map[string]string{"table": tableName, "columnID": strconv.Itoa(columnID)}).
				Counter("abandoned_rows").Inc(1)
			abandonRows[rowIndex] = struct{}{}
		}
	}

	if len(enumCaseSet) > 0 {
		enumCases := make([]string, 0, len(enumCaseSet))
		for enumCase := range enumCaseSet {
			enumCases = append(enumCases, enumCase)
		}
		err := u.schemaHandler.PrepareEnumCases(tableName, columnName, enumCases)
		if err != nil {
			return err
		}
	}
	return nil
}

// PrepareUpsertBatch prepares the upsert batch for upsert and returns the
// serialized upsert batch, the number of rows in the batch, and an error.
func (u *UpsertBatchBuilderImpl) PrepareUpsertBatch(tableName string, columnNames []string,
	updateModes []memCom.ColumnUpdateMode, rows []Row) ([]byte, int, error) {
	var err error
	upsertBatchBuilder := memCom.NewUpsertBatchBuilder()

	schema, err := u.schemaHandler.FetchSchema(tableName)
	if err != nil {
		return nil, 0, err
	}

	// use abandonRows to record the indices of rows abandoned due to invalid data
	abandonRows := make(map[int]struct{})

	for colIndex, columnName := range columnNames {
		columnID, exist := schema.ColumnDict[columnName]
		if !exist {
			continue
		}
		column := schema.Table.Columns[columnID]

		// only overwrite is supported in any of the following cases:
		// 1. dimension table (TODO: might support min/max in the future if needed)
		// 2. primary key column
		// 3. archiving sort column
		// 4. overwrite-only data type, i.e. not one of uint8, int8, uint16, int16, uint32, int32, float32
		if (!schema.Table.IsFactTable ||
			utils.IndexOfInt(schema.Table.PrimaryKeyColumns, columnID) >= 0 ||
			utils.IndexOfInt(schema.Table.ArchivingSortColumns, columnID) >= 0 ||
			schema.Table.Columns[columnID].IsOverwriteOnlyDataType()) &&
			updateModes[colIndex] > memCom.UpdateForceOverwrite {
			return nil, 0, utils.StackError(nil, "column %s only supports overwrite", columnName)
		}

		dataType := memCom.DataTypeForColumn(column)
		if err = upsertBatchBuilder.AddColumnWithUpdateMode(columnID, dataType, updateModes[colIndex]); err != nil {
			return nil, 0, err
		}

		if column.IsEnumColumn() {
			if err = u.prepareEnumCases(tableName, columnName, colIndex, columnID, rows, abandonRows, column.CaseInsensitive, column.DisableAutoExpand); err != nil {
				return nil, 0, err
			}
		}
	}

	for rowIndex, row := range rows {
		if _, exist := abandonRows[rowIndex]; exist {
			continue
		}
		upsertBatchBuilder.AddRow()

		upsertBatchColumnIndex := 0
		for inputColIndex, columnName := range columnNames {
			columnID, exist := schema.ColumnDict[columnName]
			if !exist {
				continue
			}
			column := schema.Table.Columns[columnID]

			value := row[inputColIndex]

			// prevent primary key columns from being nil
			if value == nil && utils.IndexOfInt(schema.Table.PrimaryKeyColumns, columnID) >= 0 {
				upsertBatchBuilder.RemoveRow()
				u.logger.With(
					"name", "PrepareUpsertBatch",
					"table", tableName,
					"columnID", columnID,
					"value", value).Error("PrimaryKey column is nil")
				break
			}

			// skip the row if the time column is nil for a fact table that requires event time
			if value == nil && schema.Table.IsFactTable && !schema.Table.Config.AllowMissingEventTime && columnID == 0 {
				upsertBatchBuilder.RemoveRow()
				u.logger.With(
					"name", "PrepareUpsertBatch",
					"table", tableName,
					"columnID", columnID,
					"value", value).Error("Time column is nil")
				break
			}

			if column.IsEnumColumn() {
				value, err = u.schemaHandler.TranslateEnum(tableName, columnID, value, column.CaseInsensitive)
				if err != nil {
					upsertBatchBuilder.RemoveRow()
					u.logger.With(
						"name", "PrepareUpsertBatch",
						"error", err.Error(),
						"table", tableName,
						"columnID", columnID,
						"value", value).Error("Failed to translate enum")
					break
				}

				// If the enum value is not found among the predefined enum cases and no default value is set, set it to nil.
				if value == -1 {
					value = nil
				}
			}

			// Set the value on the most recently added row.
			// For HLL columns, compute the hll value to insert.
			if column.HLLConfig.IsHLLColumn {
				// use the original column data type to compute the hll value
				value, err = computeHLLValue(memCom.DataTypeFromString(column.Type), value)
				if err != nil {
					upsertBatchBuilder.RemoveRow()
					u.logger.With("name", "PrepareUpsertBatch", "error", err.Error(), "table", tableName, "columnID", columnID, "value", value).Error("Failed to compute hll value")
					break
				}
				if err = upsertBatchBuilder.SetValue(upsertBatchBuilder.NumRows-1, upsertBatchColumnIndex, value); err != nil {
					upsertBatchBuilder.RemoveRow()
					u.logger.With("name", "PrepareUpsertBatch", "error", err.Error(), "table", tableName, "columnID", columnID, "value", value).Error("Failed to set value")
					break
				}
			} else {
				// insert the value directly
				if err = upsertBatchBuilder.SetValue(upsertBatchBuilder.NumRows-1, upsertBatchColumnIndex, value); err != nil {
					upsertBatchBuilder.RemoveRow()
					u.logger.With("name", "PrepareUpsertBatch", "error", err.Error(), "table", tableName, "columnID", columnID, "value", value).Error("Failed to set value")
					break
				}
			}
			upsertBatchColumnIndex++
		}
	}

	batchBytes, err := upsertBatchBuilder.ToByteArray()
	return batchBytes, upsertBatchBuilder.NumRows, err
}