github.com/rohankumardubey/aresdb@v0.0.2-0.20190517170215-e54e3ca06b9c/client/connector.go (about) 1 // Copyright (c) 2017-2018 Uber Technologies, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package client 16 17 import ( 18 "bytes" 19 "fmt" 20 "net/http" 21 "sync" 22 "time" 23 24 "strconv" 25 "strings" 26 "unsafe" 27 28 "github.com/uber-go/tally" 29 memCom "github.com/uber/aresdb/memstore/common" 30 metaCom "github.com/uber/aresdb/metastore/common" 31 "github.com/uber/aresdb/utils" 32 "go.uber.org/zap" 33 ) 34 35 const ( 36 // default request time out in seconds 37 defaultRequestTimeout = 5 38 // default schema refresh interval in seconds 39 defaultSchemaRefreshInterval = 600 40 dataIngestionHeader = "application/upsert-data" 41 applicationJSONHeader = "application/json" 42 ) 43 44 // Row represents a row of insert data. 45 type Row []interface{} 46 47 // Connector is the connector interface for ares. 48 type Connector interface { 49 // Insert inserts rows to ares 50 // returns number of rows inserted and error. 51 // updateModes are optional, if ignored for all columns, no need to set 52 // if set, then all columns needs to be set 53 Insert(tableName string, columnNames []string, rows []Row, updateModes ...memCom.ColumnUpdateMode) (int, error) 54 } 55 56 // UpsertBatchBuilder is an interface of upsertBatch on client side 57 type UpsertBatchBuilder interface { 58 PrepareUpsertBatch(tableName string, columnNames []string, updateModes []memCom.ColumnUpdateMode, rows []Row) ([]byte, int, error) 59 } 60 61 // enumCasesWrapper is a response/request body which wraps enum cases 62 type enumCasesWrapper struct { 63 EnumCases []string 64 } 65 66 type TableSchema struct { 67 Table *metaCom.Table 68 // maps from column name to columnID for convenience 69 ColumnDict map[string]int 70 } 71 72 // enumDict maps from enum value to enumID 73 type enumDict map[string]int 74 75 // UpsertBatchBuilderImpl implements interface UpsertBatchBuilder 76 type UpsertBatchBuilderImpl struct { 77 sync.RWMutex 78 79 logger *zap.SugaredLogger 80 metricScope tally.Scope 81 schemaHandler *CachedSchemaHandler 82 } 83 84 // connector is the ares connector implementation 85 type connector struct { 86 cfg ConnectorConfig 87 httpClient http.Client 88 upsertBatchBuilder UpsertBatchBuilder 89 schemaHandler *CachedSchemaHandler 90 } 91 92 // ConnectorConfig holds the configurations for ares Connector. 93 type ConnectorConfig struct { 94 // Address is in the format of host:port 95 Address string `yaml:"address"` 96 // DeviceChoosingTimeout value is the request timeout in seconds for http calls 97 // if <= 0, will use default 98 Timeout int `yaml:"timeout"` 99 // SchemaRefreshInterval is the interval in seconds for the connector to 100 // fetch and refresh schema from ares 101 // if <= 0, will use default 102 SchemaRefreshInterval int `yaml:"schemaRefreshInterval"` 103 } 104 105 func NewUpsertBatchBuilderImpl(logger *zap.SugaredLogger, scope tally.Scope, schemaHandler *CachedSchemaHandler) UpsertBatchBuilder { 106 return &UpsertBatchBuilderImpl{ 107 logger: logger, 108 metricScope: scope, 109 schemaHandler: schemaHandler, 110 } 111 } 112 113 // NewConnector returns a new ares Connector 114 func (cfg ConnectorConfig) NewConnector(logger *zap.SugaredLogger, metricScope tally.Scope) (Connector, error) { 115 if cfg.SchemaRefreshInterval <= 0 { 116 cfg.SchemaRefreshInterval = defaultSchemaRefreshInterval 117 } 118 119 if cfg.Timeout <= 0 { 120 cfg.Timeout = defaultRequestTimeout 121 } 122 123 httpClient := http.Client{ 124 Timeout: time.Duration(cfg.Timeout) * time.Second, 125 } 126 127 httpSchemaFetcher := NewHttpSchemaFetcher(httpClient, cfg.Address, metricScope) 128 cachedSchemaHandler := NewCachedSchemaHandler(logger, metricScope, httpSchemaFetcher) 129 err := cachedSchemaHandler.Start(cfg.SchemaRefreshInterval) 130 if err != nil { 131 return nil, err 132 } 133 134 connector := &connector{ 135 cfg: cfg, 136 httpClient: httpClient, 137 upsertBatchBuilder: &UpsertBatchBuilderImpl{ 138 logger: logger, 139 metricScope: metricScope, 140 schemaHandler: cachedSchemaHandler, 141 }, 142 schemaHandler: cachedSchemaHandler, 143 } 144 return connector, nil 145 } 146 147 // Insert inserts a batch of rows into ares 148 func (c *connector) Insert(tableName string, columnNames []string, rows []Row, updateModes ...memCom.ColumnUpdateMode) (int, error) { 149 if len(columnNames) == 0 { 150 return 0, utils.StackError(nil, "No column names specified") 151 } 152 153 // if no update modes at all, use default 154 if len(updateModes) == 0 { 155 updateModes = make([]memCom.ColumnUpdateMode, len(columnNames)) 156 } 157 158 if len(updateModes) != len(columnNames) { 159 return 0, utils.StackError(nil, "length of column update modes %d does not equal to number of columns %d", len(updateModes), len(columnNames)) 160 } 161 162 if len(rows) == 0 { 163 // Do nothing when there is no row to insert 164 return 0, nil 165 } 166 167 for _, row := range rows { 168 if len(row) != len(columnNames) { 169 return 0, utils.StackError(nil, 170 "Length of column names should match length of a single row, length of column names :%d, length of row: %d", 171 len(columnNames), 172 len(row), 173 ) 174 } 175 } 176 177 upsertBatchBytes, numRows, err := c.prepareUpsertBatch(tableName, columnNames, updateModes, rows) 178 if err != nil { 179 return numRows, err 180 } 181 182 //TODO: currently always use shard zero for single instance version 183 resp, err := c.httpClient.Post(c.dataPath(tableName, 0), dataIngestionHeader, bytes.NewReader(upsertBatchBytes)) 184 if err != nil || resp.StatusCode != http.StatusOK { 185 //TODO: break status code check and error check into two parts for more specific handling like retrying on 5xx 186 return 0, utils.StackError(err, "Failed to post upsert batch, table: %s, shard: %d", tableName, 0) 187 } 188 189 return numRows, nil 190 } 191 192 // computeHLLValue populate hyperloglog value 193 func computeHLLValue(dataType memCom.DataType, value interface{}) (uint32, error) { 194 var ok bool 195 var hashed uint64 196 switch dataType { 197 case memCom.UUID: 198 var v [2]uint64 199 v, ok = memCom.ConvertToUUID(value) 200 hashed = v[0] ^ v[1] 201 case memCom.Uint32: 202 var v uint32 203 v, ok = memCom.ConvertToUint32(value) 204 hashed = utils.Murmur3Sum64(unsafe.Pointer(&v), memCom.DataTypeBytes(dataType), 0) 205 case memCom.Int32: 206 var v int32 207 v, ok = memCom.ConvertToInt32(value) 208 hashed = utils.Murmur3Sum64(unsafe.Pointer(&v), memCom.DataTypeBytes(dataType), 0) 209 case memCom.Int64: 210 var v int64 211 v, ok = memCom.ConvertToInt64(value) 212 hashed = utils.Murmur3Sum64(unsafe.Pointer(&v), memCom.DataTypeBytes(dataType), 0) 213 default: 214 return 0, utils.StackError(nil, "invalid type %s for fast hll value", memCom.DataTypeName[dataType]) 215 } 216 if !ok { 217 return 0, utils.StackError(nil, "invalid data value %v for data type %s", value, memCom.DataTypeName[dataType]) 218 } 219 return utils.ComputeHLLValue(hashed), nil 220 } 221 222 // prepareUpsertBatch prepares the upsert batch for upsert, 223 // returns upsertBatch byte array, number of rows in upsert batch and error. 224 func (c *connector) prepareUpsertBatch(tableName string, columnNames []string, updateModes []memCom.ColumnUpdateMode, rows []Row) ([]byte, int, error) { 225 schema, err := c.schemaHandler.FetchSchema(tableName) 226 if err != nil { 227 return nil, 0, err 228 } 229 230 // return error if primary key is missing 231 if err = checkPrimaryKeys(schema, columnNames); err != nil { 232 return nil, 0, err 233 } 234 235 // return error if time column is missing 236 if err = checkTimeColumnExistence(schema, columnNames); err != nil { 237 return nil, 0, err 238 } 239 240 return c.upsertBatchBuilder.PrepareUpsertBatch(tableName, columnNames, updateModes, rows) 241 } 242 243 // checkPrimaryKeys checks whether primary key is missing 244 func checkPrimaryKeys(schema *TableSchema, columnNames []string) error { 245 for _, columnID := range schema.Table.PrimaryKeyColumns { 246 pkColumn := schema.Table.Columns[columnID] 247 index := utils.IndexOfStr(columnNames, pkColumn.Name) 248 if index < 0 { 249 return utils.StackError(nil, "Missing primary key column") 250 } 251 } 252 return nil 253 } 254 255 // checkTimeColumnExistence checks if time column is missing for fact table 256 func checkTimeColumnExistence(schema *TableSchema, columnNames []string) error { 257 if !schema.Table.IsFactTable || schema.Table.Config.AllowMissingEventTime { 258 return nil 259 } 260 261 for _, columnName := range columnNames { 262 columnID, exist := schema.ColumnDict[columnName] 263 if !exist { 264 continue 265 } 266 267 if columnID == 0 { 268 return nil 269 } 270 } 271 return utils.StackError(nil, "Missing time column") 272 } 273 274 func (c *connector) dataPath(tableName string, shard int) string { 275 return fmt.Sprintf("http://%s/data/%s/%d", c.cfg.Address, tableName, shard) 276 } 277 278 func (u *UpsertBatchBuilderImpl) prepareEnumCases(tableName, columnName string, colIndex, columnID int, rows []Row, abandonRows map[int]struct{}, caseInsensitive bool, disableAutoExpand bool) error { 279 enumCaseSet := make(map[string]struct{}) 280 for rowIndex, row := range rows { 281 if _, exist := abandonRows[rowIndex]; exist { 282 continue 283 } 284 value := row[colIndex] 285 286 if value == nil { 287 continue 288 } 289 290 if enumCase, ok := value.(string); ok { 291 if caseInsensitive { 292 enumCase = strings.ToLower(enumCase) 293 } 294 enumCaseSet[enumCase] = struct{}{} 295 } else { 296 u.logger.With( 297 "name", "prepareEnumCases", 298 "error", "Enum value should be string", 299 "table", tableName, 300 "columnID", columnID, 301 "value", value).Debug("Enum value is not string") 302 u.metricScope.Tagged(map[string]string{"table": tableName, "columnID": strconv.Itoa(columnID)}). 303 Counter("abandoned_rows").Inc(1) 304 abandonRows[rowIndex] = struct{}{} 305 } 306 } 307 308 if len(enumCaseSet) > 0 { 309 enumCases := make([]string, 0, len(enumCaseSet)) 310 for enumCase := range enumCaseSet { 311 enumCases = append(enumCases, enumCase) 312 } 313 err := u.schemaHandler.PrepareEnumCases(tableName, columnName, enumCases) 314 if err != nil { 315 return err 316 } 317 } 318 return nil 319 } 320 321 // PrepareUpsertBatch prepares the upsert batch for upsert, 322 // returns upsertBatch byte array, number of rows in upsert batch and error. 323 func (u *UpsertBatchBuilderImpl) PrepareUpsertBatch(tableName string, columnNames []string, 324 updateModes []memCom.ColumnUpdateMode, rows []Row) ([]byte, int, error) { 325 var err error 326 upsertBatchBuilder := memCom.NewUpsertBatchBuilder() 327 328 schema, err := u.schemaHandler.FetchSchema(tableName) 329 if err != nil { 330 return nil, 0, err 331 } 332 333 // use abandonRows to record abandoned row index due to invalid data 334 abandonRows := make(map[int]struct{}) 335 336 for colIndex, columnName := range columnNames { 337 columnID, exist := schema.ColumnDict[columnName] 338 if !exist { 339 continue 340 } 341 column := schema.Table.Columns[columnID] 342 343 // following conditions only overwrite is supported: 344 // 1. dimension table (TODO: might support min/max in the future if needed) 345 // 2. primary key column 346 // 3. archiving sort column 347 // 4. data type not in uint8, int8, uint16, int16, uint32, int32, float32 348 if (!schema.Table.IsFactTable || 349 utils.IndexOfInt(schema.Table.PrimaryKeyColumns, columnID) >= 0 || 350 utils.IndexOfInt(schema.Table.ArchivingSortColumns, columnID) >= 0 || 351 schema.Table.Columns[columnID].IsOverwriteOnlyDataType()) && 352 updateModes[colIndex] > memCom.UpdateForceOverwrite { 353 return nil, 0, utils.StackError(nil, "column %s only supports overwrite", columnName) 354 } 355 356 dataType := memCom.DataTypeForColumn(column) 357 if err = upsertBatchBuilder.AddColumnWithUpdateMode(columnID, dataType, updateModes[colIndex]); err != nil { 358 return nil, 0, err 359 } 360 361 if column.IsEnumColumn() { 362 if err = u.prepareEnumCases(tableName, columnName, colIndex, columnID, rows, abandonRows, column.CaseInsensitive, column.DisableAutoExpand); err != nil { 363 return nil, 0, err 364 } 365 } 366 } 367 368 for rowIndex, row := range rows { 369 if _, exist := abandonRows[rowIndex]; exist { 370 continue 371 } 372 upsertBatchBuilder.AddRow() 373 374 upsertBatchColumnIndex := 0 375 for inputColIndex, columnName := range columnNames { 376 columnID, exist := schema.ColumnDict[columnName] 377 if !exist { 378 continue 379 } 380 column := schema.Table.Columns[columnID] 381 382 value := row[inputColIndex] 383 384 // prevent primary key being nil 385 if value == nil && utils.IndexOfInt(schema.Table.PrimaryKeyColumns, columnID) >= 0 { 386 upsertBatchBuilder.RemoveRow() 387 u.logger.With( 388 "name", "PrepareUpsertBatch", 389 "table", tableName, 390 "columnID", columnID, 391 "value", value).Error("PrimaryKey column is nil") 392 break 393 } 394 395 // skip rows if time column is nil for fact table 396 if value == nil && schema.Table.IsFactTable && !schema.Table.Config.AllowMissingEventTime && columnID == 0 { 397 upsertBatchBuilder.RemoveRow() 398 u.logger.With( 399 "name", "PrepareUpsertBatch", 400 "table", tableName, 401 "columnID", columnID, 402 "value", value).Error("Time column is nil") 403 break 404 } 405 406 if column.IsEnumColumn() { 407 value, err = u.schemaHandler.TranslateEnum(tableName, columnID, value, column.CaseInsensitive) 408 if err != nil { 409 upsertBatchBuilder.RemoveRow() 410 u.logger.With( 411 "name", "prepareUpsertBatch", 412 "error", err.Error(), 413 "table", tableName, 414 "columnID", columnID, 415 "value", value).Error("Failed to translate enum") 416 break 417 } 418 419 // If enum value is not found from predefined enum cases and default value is not set, we set it to nil. 420 if value == -1 { 421 value = nil 422 } 423 } 424 425 // Set value to the last row. 426 // compute hll value to insert 427 if column.HLLConfig.IsHLLColumn { 428 // here use original column data type to compute hll value 429 value, err = computeHLLValue(memCom.DataTypeFromString(column.Type), value) 430 if err != nil { 431 upsertBatchBuilder.RemoveRow() 432 u.logger.With("name", "PrepareUpsertBatch", "error", err.Error(), "table", tableName, "columnID", columnID, "value", value).Error("Failed to set value") 433 break 434 } 435 if err = upsertBatchBuilder.SetValue(upsertBatchBuilder.NumRows-1, upsertBatchColumnIndex, value); err != nil { 436 upsertBatchBuilder.RemoveRow() 437 u.logger.With("name", "PrepareUpsertBatch", "error", err.Error(), "table", tableName, "columnID", columnID, "value", value).Error("Failed to set value") 438 break 439 } 440 } else { 441 // directly insert value 442 if err = upsertBatchBuilder.SetValue(upsertBatchBuilder.NumRows-1, upsertBatchColumnIndex, value); err != nil { 443 upsertBatchBuilder.RemoveRow() 444 u.logger.With("name", "PrepareUpsertBatch", "error", err.Error(), "table", tableName, "columnID", columnID, "value", value).Error("Failed to set value") 445 break 446 } 447 } 448 upsertBatchColumnIndex++ 449 } 450 } 451 452 batchBytes, err := upsertBatchBuilder.ToByteArray() 453 return batchBytes, upsertBatchBuilder.NumRows, err 454 }