github.com/matrixorigin/matrixone@v1.2.0/pkg/sql/colexec/external/external.go (about) 1 // Copyright 2022 Matrix Origin 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package external 16 17 import ( 18 "archive/tar" 19 "bufio" 20 "bytes" 21 "compress/bzip2" 22 "compress/flate" 23 "compress/gzip" 24 "compress/zlib" 25 "context" 26 "encoding/json" 27 "errors" 28 "fmt" 29 "io" 30 "math" 31 "strconv" 32 "strings" 33 "time" 34 35 "github.com/matrixorigin/matrixone/pkg/catalog" 36 "github.com/matrixorigin/matrixone/pkg/common/moerr" 37 "github.com/matrixorigin/matrixone/pkg/common/morpc" 38 "github.com/matrixorigin/matrixone/pkg/common/mpool" 39 "github.com/matrixorigin/matrixone/pkg/common/util" 40 "github.com/matrixorigin/matrixone/pkg/container/batch" 41 "github.com/matrixorigin/matrixone/pkg/container/bytejson" 42 "github.com/matrixorigin/matrixone/pkg/container/nulls" 43 "github.com/matrixorigin/matrixone/pkg/container/types" 44 "github.com/matrixorigin/matrixone/pkg/container/vector" 45 "github.com/matrixorigin/matrixone/pkg/fileservice" 46 "github.com/matrixorigin/matrixone/pkg/logutil" 47 "github.com/matrixorigin/matrixone/pkg/objectio" 48 "github.com/matrixorigin/matrixone/pkg/pb/plan" 49 "github.com/matrixorigin/matrixone/pkg/sql/colexec" 50 "github.com/matrixorigin/matrixone/pkg/sql/parsers/tree" 51 plan2 "github.com/matrixorigin/matrixone/pkg/sql/plan" 52 "github.com/matrixorigin/matrixone/pkg/sql/util/csvparser" 53 "github.com/matrixorigin/matrixone/pkg/util/errutil" 54 v2 "github.com/matrixorigin/matrixone/pkg/util/metric/v2" 55 "github.com/matrixorigin/matrixone/pkg/util/trace" 56 "github.com/matrixorigin/matrixone/pkg/vm" 57 "github.com/matrixorigin/matrixone/pkg/vm/engine/tae/blockio" 58 "github.com/matrixorigin/matrixone/pkg/vm/engine/tae/options" 59 "github.com/matrixorigin/matrixone/pkg/vm/process" 60 "github.com/pierrec/lz4/v4" 61 ) 62 63 var ( 64 OneBatchMaxRow = int(options.DefaultBlockMaxRows) 65 S3ParallelMaxnum = 10 66 ) 67 68 var ( 69 STATEMENT_ACCOUNT = "account" 70 ) 71 72 const argName = "external" 73 74 func (arg *Argument) String(buf *bytes.Buffer) { 75 buf.WriteString(argName) 76 buf.WriteString(": external output") 77 } 78 79 func (arg *Argument) Prepare(proc *process.Process) error { 80 _, span := trace.Start(proc.Ctx, "ExternalPrepare") 81 defer span.End() 82 param := arg.Es 83 if proc.Lim.MaxMsgSize == 0 { 84 param.maxBatchSize = uint64(morpc.GetMessageSize()) 85 } else { 86 param.maxBatchSize = proc.Lim.MaxMsgSize 87 } 88 param.maxBatchSize = uint64(float64(param.maxBatchSize) * 0.6) 89 if param.Extern == nil { 90 param.Extern = &tree.ExternParam{} 91 if err := json.Unmarshal([]byte(param.CreateSql), param.Extern); err != nil { 92 return err 93 } 94 if err := plan2.InitS3Param(param.Extern); err != nil { 95 return err 96 } 97 param.Extern.FileService = proc.FileService 98 } 99 if !loadFormatIsValid(param.Extern) { 100 return moerr.NewNYI(proc.Ctx, "load format '%s'", param.Extern.Format) 101 } 102 103 if param.Extern.Format != tree.PARQUET { 104 if param.Extern.Format == tree.JSONLINE { 105 if param.Extern.JsonData != tree.OBJECT && param.Extern.JsonData != tree.ARRAY { 106 param.Fileparam.End = true 107 return moerr.NewNotSupported(proc.Ctx, "the jsonline format '%s' is not supported now", param.Extern.JsonData) 108 } 109 } 110 param.IgnoreLineTag = int(param.Extern.Tail.IgnoredLines) 111 param.IgnoreLine = param.IgnoreLineTag 112 param.MoCsvLineArray = make([][]csvparser.Field, OneBatchMaxRow) 113 } 114 115 if len(param.FileList) == 0 && param.Extern.ScanType != tree.INLINE { 116 logutil.Warnf("no such file '%s'", param.Extern.Filepath) 117 param.Fileparam.End = true 118 } 119 param.Fileparam.FileCnt = len(param.FileList) 120 param.Ctx = proc.Ctx 121 param.Zoneparam = &ZonemapFileparam{} 122 name2ColIndex := make(map[string]int32, len(param.Cols)) 123 for i, col := range param.Cols { 124 name2ColIndex[col.Name] = int32(i) 125 } 126 param.tableDef = &plan.TableDef{ 127 Name2ColIndex: name2ColIndex, 128 } 129 param.Filter.columnMap, _, _, _ = plan2.GetColumnsByExpr(param.Filter.FilterExpr, param.tableDef) 130 param.Filter.zonemappable = plan2.ExprIsZonemappable(proc.Ctx, param.Filter.FilterExpr) 131 return nil 132 } 133 134 func (arg *Argument) Call(proc *process.Process) (vm.CallResult, error) { 135 if err, isCancel := vm.CancelCheck(proc); isCancel { 136 return vm.CancelResult, err 137 } 138 139 t := time.Now() 140 ctx, span := trace.Start(proc.Ctx, "ExternalCall") 141 t1 := time.Now() 142 anal := proc.GetAnalyze(arg.GetIdx(), arg.GetParallelIdx(), arg.GetParallelMajor()) 143 anal.Start() 144 defer func() { 145 anal.Stop() 146 anal.AddScanTime(t1) 147 span.End() 148 v2.TxnStatementExternalScanDurationHistogram.Observe(time.Since(t).Seconds()) 149 }() 150 anal.Input(nil, arg.GetIsFirst()) 151 152 var err error 153 result := vm.NewCallResult() 154 param := arg.Es 155 if param.Fileparam.End { 156 result.Status = vm.ExecStop 157 return result, nil 158 } 159 if (param.plh == nil && param.parqh == nil) && param.Extern.ScanType != tree.INLINE { 160 if param.Fileparam.FileIndex >= len(param.FileList) { 161 result.Status = vm.ExecStop 162 return result, nil 163 } 164 param.Fileparam.Filepath = param.FileList[param.Fileparam.FileIndex] 165 param.Fileparam.FileIndex++ 166 } 167 if arg.buf != nil { 168 proc.PutBatch(arg.buf) 169 arg.buf = nil 170 } 171 arg.buf, err = scanFileData(ctx, param, proc) 172 if err != nil { 173 param.Fileparam.End = true 174 return result, err 175 } 176 177 if arg.buf != nil { 178 anal.Output(arg.buf, arg.GetIsLast()) 179 arg.maxAllocSize = max(arg.maxAllocSize, arg.buf.Size()) 180 } 181 result.Batch = arg.buf 182 if result.Batch != nil { 183 result.Batch.ShuffleIDX = param.Idx 184 } 185 return result, nil 186 } 187 188 func containColname(col string) bool { 189 return strings.Contains(col, STATEMENT_ACCOUNT) || strings.Contains(col, catalog.ExternalFilePath) 190 } 191 192 func judgeContainColname(expr *plan.Expr) bool { 193 expr_F, ok := expr.Expr.(*plan.Expr_F) 194 if !ok { 195 return false 196 } 197 if expr_F.F.Func.ObjName == "or" { 198 flag := true 199 for i := 0; i < len(expr_F.F.Args); i++ { 200 flag = flag && judgeContainColname(expr_F.F.Args[i]) 201 } 202 return flag 203 } 204 expr_Col, ok := expr_F.F.Args[0].Expr.(*plan.Expr_Col) 205 if ok && containColname(expr_Col.Col.Name) { 206 return true 207 } 208 for _, arg := range expr_F.F.Args { 209 if judgeContainColname(arg) { 210 return true 211 } 212 } 213 return false 214 } 215 216 func getAccountCol(filepath string) string { 217 pathDir := strings.Split(filepath, "/") 218 if len(pathDir) < 2 { 219 return "" 220 } 221 return pathDir[1] 222 } 223 224 func makeFilepathBatch(node *plan.Node, proc *process.Process, fileList []string) (bat *batch.Batch, err error) { 225 num := len(node.TableDef.Cols) 226 bat = &batch.Batch{ 227 Attrs: make([]string, num), 228 Vecs: make([]*vector.Vector, num), 229 Cnt: 1, 230 } 231 232 var buf bytes.Buffer 233 mp := proc.GetMPool() 234 for i := 0; i < num; i++ { 235 bat.Attrs[i] = node.TableDef.Cols[i].Name 236 if bat.Attrs[i] == STATEMENT_ACCOUNT { 237 typ := types.New(types.T(node.TableDef.Cols[i].Typ.Id), node.TableDef.Cols[i].Typ.Width, node.TableDef.Cols[i].Typ.Scale) 238 bat.Vecs[i], err = proc.AllocVectorOfRows(typ, len(fileList), nil) 239 if err != nil { 240 bat.Clean(mp) 241 return nil, err 242 } 243 244 for j := 0; j < len(fileList); j++ { 245 buf.WriteString(getAccountCol(fileList[j])) 246 bs := buf.Bytes() 247 if err = vector.SetBytesAt(bat.Vecs[i], j, bs, mp); err != nil { 248 bat.Clean(mp) 249 return nil, err 250 } 251 buf.Reset() 252 } 253 } else if bat.Attrs[i] == catalog.ExternalFilePath { 254 typ := types.T_varchar.ToType() 255 bat.Vecs[i], err = proc.AllocVectorOfRows(typ, len(fileList), nil) 256 if err != nil { 257 bat.Clean(mp) 258 return nil, err 259 } 260 261 for j := 0; j < len(fileList); j++ { 262 buf.WriteString(fileList[j]) 263 bs := buf.Bytes() 264 if err = vector.SetBytesAt(bat.Vecs[i], j, bs, mp); err != nil { 265 bat.Clean(mp) 266 return nil, err 267 } 268 buf.Reset() 269 } 270 } 271 } 272 bat.SetRowCount(len(fileList)) 273 return bat, nil 274 } 275 276 func filterByAccountAndFilename(ctx context.Context, node *plan.Node, proc *process.Process, fileList []string, fileSize []int64) ([]string, []int64, error) { 277 _, span := trace.Start(ctx, "filterByAccountAndFilename") 278 defer span.End() 279 filterList := make([]*plan.Expr, 0) 280 filterList2 := make([]*plan.Expr, 0) 281 for i := 0; i < len(node.FilterList); i++ { 282 if judgeContainColname(node.FilterList[i]) { 283 filterList = append(filterList, node.FilterList[i]) 284 } else { 285 filterList2 = append(filterList2, node.FilterList[i]) 286 } 287 } 288 if len(filterList) == 0 { 289 return fileList, fileSize, nil 290 } 291 bat, err := makeFilepathBatch(node, proc, fileList) 292 if err != nil { 293 return nil, nil, err 294 } 295 filter := colexec.RewriteFilterExprList(filterList) 296 297 executor, err := colexec.NewExpressionExecutor(proc, filter) 298 if err != nil { 299 return nil, nil, err 300 } 301 vec, err := executor.Eval(proc, []*batch.Batch{bat}) 302 if err != nil { 303 executor.Free() 304 return nil, nil, err 305 } 306 307 fileListTmp := make([]string, 0) 308 fileSizeTmp := make([]int64, 0) 309 bs := vector.MustFixedCol[bool](vec) 310 for i := 0; i < len(bs); i++ { 311 if bs[i] { 312 fileListTmp = append(fileListTmp, fileList[i]) 313 fileSizeTmp = append(fileSizeTmp, fileSize[i]) 314 } 315 } 316 executor.Free() 317 node.FilterList = filterList2 318 return fileListTmp, fileSizeTmp, nil 319 } 320 321 func FilterFileList(ctx context.Context, node *plan.Node, proc *process.Process, fileList []string, fileSize []int64) ([]string, []int64, error) { 322 return filterByAccountAndFilename(ctx, node, proc, fileList, fileSize) 323 } 324 325 func readFile(param *ExternalParam, proc *process.Process) (io.ReadCloser, error) { 326 if param.Extern.ScanType == tree.INLINE { 327 return io.NopCloser(bytes.NewReader(util.UnsafeStringToBytes(param.Extern.Data))), nil 328 } 329 if param.Extern.Local { 330 return io.NopCloser(proc.LoadLocalReader), nil 331 } 332 fs, readPath, err := plan2.GetForETLWithType(param.Extern, param.Fileparam.Filepath) 333 if err != nil { 334 return nil, err 335 } 336 var r io.ReadCloser 337 vec := fileservice.IOVector{ 338 FilePath: readPath, 339 Entries: []fileservice.IOEntry{ 340 0: { 341 Offset: 0, 342 Size: -1, 343 ReadCloserForRead: &r, 344 }, 345 }, 346 } 347 if 2*param.Idx >= len(param.FileOffsetTotal[param.Fileparam.FileIndex-1].Offset) { 348 return nil, nil 349 } 350 param.FileOffset = param.FileOffsetTotal[param.Fileparam.FileIndex-1].Offset[2*param.Idx : 2*param.Idx+2] 351 if param.Extern.Parallel { 352 vec.Entries[0].Offset = param.FileOffset[0] 353 vec.Entries[0].Size = param.FileOffset[1] - param.FileOffset[0] 354 } 355 if vec.Entries[0].Size == 0 || vec.Entries[0].Offset >= param.FileSize[param.Fileparam.FileIndex-1] { 356 return nil, nil 357 } 358 err = fs.Read(param.Ctx, &vec) 359 if err != nil { 360 return nil, err 361 } 362 return r, nil 363 } 364 365 // TODO : merge below two functions 366 func ReadFileOffsetNoStrict(param *tree.ExternParam, mcpu int, fileSize int64) ([]int64, error) { 367 arr := make([]int64, 0) 368 369 fs, readPath, err := plan2.GetForETLWithType(param, param.Filepath) 370 if err != nil { 371 return nil, err 372 } 373 var r io.ReadCloser 374 vec := fileservice.IOVector{ 375 FilePath: readPath, 376 Entries: []fileservice.IOEntry{ 377 0: { 378 Offset: 0, 379 Size: -1, 380 ReadCloserForRead: &r, 381 }, 382 }, 383 } 384 var tailSize []int64 385 var offset []int64 386 for i := 0; i < mcpu; i++ { 387 vec.Entries[0].Offset = int64(i) * (fileSize / int64(mcpu)) 388 if err = fs.Read(param.Ctx, &vec); err != nil { 389 return nil, err 390 } 391 r2 := bufio.NewReader(r) 392 line, _ := r2.ReadString('\n') 393 tailSize = append(tailSize, int64(len(line))) 394 offset = append(offset, vec.Entries[0].Offset) 395 } 396 397 start := int64(0) 398 for i := 0; i < mcpu; i++ { 399 if i+1 < mcpu { 400 arr = append(arr, start) 401 arr = append(arr, offset[i+1]+tailSize[i+1]) 402 start = offset[i+1] + tailSize[i+1] 403 } else { 404 arr = append(arr, start) 405 arr = append(arr, -1) 406 } 407 } 408 return arr, nil 409 } 410 411 func ReadFileOffsetStrict(param *tree.ExternParam, mcpu int, fileSize int64, visibleCols []*plan.ColDef) ([]int64, error) { 412 arr := make([]int64, 0) 413 414 fs, readPath, err := plan2.GetForETLWithType(param, param.Filepath) 415 if err != nil { 416 return nil, err 417 } 418 var r io.ReadCloser 419 vec := fileservice.IOVector{ 420 FilePath: readPath, 421 Entries: []fileservice.IOEntry{ 422 0: { 423 Offset: 0, 424 Size: -1, 425 ReadCloserForRead: &r, 426 }, 427 }, 428 } 429 430 var offset []int64 431 batchSize := fileSize / int64(mcpu) 432 433 offset = append(offset, 0) 434 435 for i := 1; i < mcpu; i++ { 436 vec.Entries[0].Offset = offset[i-1] + batchSize 437 if vec.Entries[0].Offset >= fileSize { 438 break 439 } 440 if err = fs.Read(param.Ctx, &vec); err != nil { 441 return nil, err 442 } 443 tailSize, err := getTailSize(param, visibleCols, r) 444 if err != nil { 445 break 446 } 447 offset = append(offset, vec.Entries[0].Offset+tailSize) 448 } 449 450 for i := 0; i < len(offset); i++ { 451 if i+1 < len(offset) { 452 arr = append(arr, offset[i]) 453 arr = append(arr, offset[i+1]) 454 } else { 455 arr = append(arr, offset[i]) 456 arr = append(arr, -1) 457 } 458 } 459 return arr, nil 460 } 461 462 func getTailSize(param *tree.ExternParam, cols []*plan.ColDef, r io.ReadCloser) (int64, error) { 463 bufR := bufio.NewReader(r) 464 // ensure the first character is not field quote symbol 465 quoteByte := byte('"') 466 if param.Tail.Fields != nil { 467 if enclosed := param.Tail.Fields.EnclosedBy; enclosed != nil && enclosed.Value != 0 { 468 quoteByte = enclosed.Value 469 } 470 } 471 skipCount := int64(0) 472 for { 473 ch, err := bufR.ReadByte() 474 if err != nil { 475 return 0, err 476 } 477 if ch != quoteByte { 478 err = bufR.UnreadByte() 479 if err != nil { 480 return 0, err 481 } 482 break 483 } 484 skipCount++ 485 } 486 csvReader, err := newReaderWithParam(&ExternalParam{ 487 ExParamConst: ExParamConst{Extern: param}, 488 ExParam: ExParam{reader: io.NopCloser(bufR)}, 489 }, true) 490 if err != nil { 491 return 0, err 492 } 493 var fields []csvparser.Field 494 for { 495 fields, err = csvReader.Read() 496 if err != nil { 497 return 0, err 498 } 499 if len(fields) < len(cols) { 500 continue 501 } 502 if isLegalLine(param, cols, fields) { 503 return csvReader.Pos() + skipCount, nil 504 } 505 } 506 } 507 508 func isLegalLine(param *tree.ExternParam, cols []*plan.ColDef, fields []csvparser.Field) bool { 509 for idx, col := range cols { 510 field := fields[idx] 511 id := types.T(col.Typ.Id) 512 if id != types.T_char && id != types.T_varchar && id != types.T_json && 513 id != types.T_binary && id != types.T_varbinary && id != types.T_blob && id != types.T_text { 514 field.Val = strings.TrimSpace(field.Val) 515 } 516 isNullOrEmpty := field.IsNull || (getNullFlag(param.NullMap, col.Name, field.Val)) 517 if id != types.T_char && id != types.T_varchar && 518 id != types.T_binary && id != types.T_varbinary && id != types.T_json && id != types.T_blob && id != types.T_text { 519 isNullOrEmpty = isNullOrEmpty || len(field.Val) == 0 520 } 521 if isNullOrEmpty { 522 continue 523 } 524 switch id { 525 case types.T_bool: 526 _, err := types.ParseBool(field.Val) 527 if err != nil { 528 return false 529 } 530 case types.T_bit: 531 if len(field.Val) > 8 { 532 return false 533 } 534 width := col.Typ.Width 535 var val uint64 536 for i := 0; i < len(field.Val); i++ { 537 val = (val << 8) | uint64(field.Val[i]) 538 } 539 if val > uint64(1<<width-1) { 540 return false 541 } 542 case types.T_int8: 543 _, err := strconv.ParseInt(field.Val, 10, 8) 544 if err != nil { 545 if errors.Is(err, strconv.ErrRange) { 546 return false 547 } 548 f, err := strconv.ParseFloat(field.Val, 64) 549 if err != nil || f < math.MinInt8 || f > math.MaxInt8 { 550 return false 551 } 552 } 553 case types.T_int16: 554 _, err := strconv.ParseInt(field.Val, 10, 16) 555 if err != nil { 556 if errors.Is(err, strconv.ErrRange) { 557 return false 558 } 559 f, err := strconv.ParseFloat(field.Val, 64) 560 if err != nil || f < math.MinInt16 || f > math.MaxInt16 { 561 return false 562 } 563 } 564 case types.T_int32: 565 _, err := strconv.ParseInt(field.Val, 10, 32) 566 if err != nil { 567 if errors.Is(err, strconv.ErrRange) { 568 return false 569 } 570 f, err := strconv.ParseFloat(field.Val, 64) 571 if err != nil || f < math.MinInt32 || f > math.MaxInt32 { 572 return false 573 } 574 } 575 case types.T_int64: 576 _, err := strconv.ParseInt(field.Val, 10, 64) 577 if err != nil { 578 if errors.Is(err, strconv.ErrRange) { 579 return false 580 } 581 f, err := strconv.ParseFloat(field.Val, 64) 582 if err != nil || f < math.MinInt64 || f > math.MaxInt64 { 583 return false 584 } 585 } 586 case types.T_uint8: 587 _, err := strconv.ParseUint(field.Val, 10, 8) 588 if err != nil { 589 if errors.Is(err, strconv.ErrRange) { 590 return false 591 } 592 f, err := strconv.ParseFloat(field.Val, 64) 593 if err != nil || f < 0 || f > math.MaxUint8 { 594 return false 595 } 596 } 597 case types.T_uint16: 598 _, err := strconv.ParseUint(field.Val, 10, 16) 599 if err != nil { 600 if errors.Is(err, strconv.ErrRange) { 601 return false 602 } 603 f, err := strconv.ParseFloat(field.Val, 64) 604 if err != nil || f < 0 || f > math.MaxUint16 { 605 return false 606 } 607 } 608 case types.T_uint32: 609 _, err := strconv.ParseUint(field.Val, 10, 32) 610 if err != nil { 611 if errors.Is(err, strconv.ErrRange) { 612 return false 613 } 614 f, err := strconv.ParseFloat(field.Val, 64) 615 if err != nil || f < 0 || f > math.MaxUint32 { 616 return false 617 } 618 } 619 case types.T_uint64: 620 _, err := strconv.ParseUint(field.Val, 10, 64) 621 if err != nil { 622 if errors.Is(err, strconv.ErrRange) { 623 return false 624 } 625 f, err := strconv.ParseFloat(field.Val, 64) 626 if err != nil || f < 0 || f > math.MaxUint64 { 627 return false 628 } 629 } 630 case types.T_float32: 631 // origin float32 data type 632 if col.Typ.Scale < 0 || col.Typ.Width == 0 { 633 _, err := strconv.ParseFloat(field.Val, 32) 634 if err != nil { 635 return false 636 } 637 } else { 638 _, err := types.ParseDecimal128(field.Val, col.Typ.Width, col.Typ.Scale) 639 if err != nil { 640 return false 641 } 642 } 643 case types.T_float64: 644 // origin float64 data type 645 if col.Typ.Scale < 0 || col.Typ.Width == 0 { 646 _, err := strconv.ParseFloat(field.Val, 64) 647 if err != nil { 648 return false 649 } 650 } else { 651 _, err := types.ParseDecimal128(field.Val, col.Typ.Width, col.Typ.Scale) 652 if err != nil { 653 return false 654 } 655 656 } 657 case types.T_char, types.T_varchar, types.T_binary, types.T_varbinary, types.T_blob, types.T_text: 658 continue 659 case types.T_array_float32: 660 _, err := types.StringToArrayToBytes[float32](field.Val) 661 if err != nil { 662 return false 663 } 664 case types.T_array_float64: 665 _, err := types.StringToArrayToBytes[float64](field.Val) 666 if err != nil { 667 return false 668 } 669 case types.T_json: 670 if param.Format == tree.CSV { 671 field.Val = fmt.Sprintf("%v", strings.Trim(field.Val, "\"")) 672 byteJson, err := types.ParseStringToByteJson(field.Val) 673 if err != nil { 674 return false 675 } 676 _, err = types.EncodeJson(byteJson) 677 if err != nil { 678 return false 679 } 680 } 681 case types.T_date: 682 _, err := types.ParseDateCast(field.Val) 683 if err != nil { 684 return false 685 } 686 case types.T_time: 687 _, err := types.ParseTime(field.Val, col.Typ.Scale) 688 if err != nil { 689 return false 690 } 691 case types.T_datetime: 692 _, err := types.ParseDatetime(field.Val, col.Typ.Scale) 693 if err != nil { 694 return false 695 } 696 case types.T_enum: 697 _, err := strconv.ParseUint(field.Val, 10, 16) 698 if err == nil { 699 continue 700 } else if errors.Is(err, strconv.ErrSyntax) { 701 _, err := types.ParseEnum(col.Typ.Enumvalues, field.Val) 702 if err != nil { 703 return false 704 } 705 } else { 706 if errors.Is(err, strconv.ErrRange) { 707 return false 708 } 709 f, err := strconv.ParseFloat(field.Val, 64) 710 if err != nil || f < 0 || f > math.MaxUint16 { 711 return false 712 } 713 } 714 case types.T_decimal64: 715 _, err := types.ParseDecimal64(field.Val, col.Typ.Width, col.Typ.Scale) 716 if err != nil { 717 // we tolerate loss of digits. 718 if !moerr.IsMoErrCode(err, moerr.ErrDataTruncated) { 719 return false 720 } 721 } 722 case types.T_decimal128: 723 _, err := types.ParseDecimal128(field.Val, col.Typ.Width, col.Typ.Scale) 724 if err != nil { 725 // we tolerate loss of digits. 726 if !moerr.IsMoErrCode(err, moerr.ErrDataTruncated) { 727 return false 728 } 729 } 730 case types.T_timestamp: 731 t := time.Local 732 _, err := types.ParseTimestamp(t, field.Val, col.Typ.Scale) 733 if err != nil { 734 return false 735 } 736 case types.T_uuid: 737 _, err := types.ParseUuid(field.Val) 738 if err != nil { 739 return false 740 } 741 default: 742 return false 743 } 744 } 745 return true 746 } 747 748 func GetCompressType(param *tree.ExternParam, filepath string) string { 749 if param.CompressType != "" && param.CompressType != tree.AUTO { 750 return param.CompressType 751 } 752 753 filepath = strings.ToLower(filepath) 754 755 switch { 756 case strings.HasSuffix(filepath, ".tar.gz") || strings.HasSuffix(filepath, ".tar.gzip"): 757 return tree.TAR_GZ 758 case strings.HasSuffix(filepath, ".tar.bz2") || strings.HasSuffix(filepath, ".tar.bzip2"): 759 return tree.TAR_BZ2 760 case strings.HasSuffix(filepath, ".gz") || strings.HasSuffix(filepath, ".gzip"): 761 return tree.GZIP 762 case strings.HasSuffix(filepath, ".bz2") || strings.HasSuffix(filepath, ".bzip2"): 763 return tree.BZIP2 764 case strings.HasSuffix(filepath, ".lz4"): 765 return tree.LZ4 766 default: 767 return tree.NOCOMPRESS 768 } 769 } 770 771 func getUnCompressReader(param *tree.ExternParam, filepath string, r io.ReadCloser) (io.ReadCloser, error) { 772 switch strings.ToLower(GetCompressType(param, filepath)) { 773 case tree.NOCOMPRESS: 774 return r, nil 775 case tree.GZIP, tree.GZ: 776 return gzip.NewReader(r) 777 case tree.BZIP2, tree.BZ2: 778 return io.NopCloser(bzip2.NewReader(r)), nil 779 case tree.FLATE: 780 return flate.NewReader(r), nil 781 case tree.ZLIB: 782 return zlib.NewReader(r) 783 case tree.LZ4: 784 return io.NopCloser(lz4.NewReader(r)), nil 785 case tree.LZW: 786 return nil, moerr.NewInternalError(param.Ctx, "the compress type '%s' is not support now", param.CompressType) 787 case tree.TAR_GZ: 788 gzipReader, err := gzip.NewReader(r) 789 if err != nil { 790 return nil, err 791 } 792 return getTarReader(param.Ctx, gzipReader) 793 case tree.TAR_BZ2: 794 return getTarReader(param.Ctx, bzip2.NewReader(r)) 795 default: 796 return nil, moerr.NewInternalError(param.Ctx, "the compress type '%s' is not support now", param.CompressType) 797 } 798 } 799 800 func getTarReader(ctx context.Context, r io.Reader) (io.ReadCloser, error) { 801 tarReader := tar.NewReader(r) 802 // move to first file 803 for { 804 header, err := tarReader.Next() 805 if err == io.EOF { 806 return nil, moerr.NewInternalError(ctx, "failed to decompress the file, no available files found") 807 } 808 if err != nil { 809 return nil, err 810 } 811 if !header.FileInfo().IsDir() && !strings.HasPrefix(header.FileInfo().Name(), ".") { 812 break 813 } 814 } 815 return io.NopCloser(tarReader), nil 816 } 817 818 func makeType(typ *plan.Type, flag bool) types.Type { 819 if flag { 820 return types.New(types.T_varchar, 0, 0) 821 } 822 return types.New(types.T(typ.Id), typ.Width, typ.Scale) 823 } 824 825 func makeBatch(param *ExternalParam, batchSize int, proc *process.Process) (bat *batch.Batch, err error) { 826 bat = batch.New(false, param.Attrs) 827 //alloc space for vector 828 for i := range param.Attrs { 829 typ := makeType(¶m.Cols[i].Typ, param.ParallelLoad) 830 bat.Vecs[i] = proc.GetVector(typ) 831 } 832 if err = bat.PreExtend(proc.GetMPool(), batchSize); err != nil { 833 bat.Clean(proc.GetMPool()) 834 return nil, err 835 } 836 for i := range bat.Vecs { 837 bat.Vecs[i].SetLength(batchSize) 838 } 839 return bat, nil 840 } 841 842 func getRealAttrCnt(attrs []string, cols []*plan.ColDef) int { 843 cnt := 0 844 for i := 0; i < len(attrs); i++ { 845 if catalog.ContainExternalHidenCol(attrs[i]) || cols[i].Hidden { 846 cnt++ 847 } 848 } 849 return len(attrs) - cnt 850 } 851 852 func getBatchData(param *ExternalParam, plh *ParseLineHandler, proc *process.Process) (*batch.Batch, error) { 853 bat, err := makeBatch(param, plh.batchSize, proc) 854 if err != nil { 855 return nil, err 856 } 857 858 unexpectEOF := false 859 for rowIdx := 0; rowIdx < plh.batchSize; rowIdx++ { 860 line := plh.moCsvLineArray[rowIdx] 861 if param.Extern.Format == tree.JSONLINE { 862 line, err = transJson2Lines(proc.Ctx, line[0].Val, param.Attrs, param.Cols, param.Extern.JsonData, param) 863 if err != nil { 864 if errors.Is(err, io.ErrUnexpectedEOF) { 865 logutil.Infof("unexpected EOF, wait for next batch") 866 unexpectEOF = true 867 continue 868 } 869 return nil, err 870 } 871 plh.moCsvLineArray[rowIdx] = line 872 } 873 if param.ClusterTable != nil && param.ClusterTable.GetIsClusterTable() { 874 //the column account_id of the cluster table do need to be filled here 875 if len(line)+1 < getRealAttrCnt(param.Attrs, param.Cols) { 876 return nil, moerr.NewInternalError(proc.Ctx, ColumnCntLargerErrorInfo) 877 } 878 } else { 879 if !param.Extern.SysTable && len(line) < getRealAttrCnt(param.Attrs, param.Cols) { 880 return nil, moerr.NewInternalError(proc.Ctx, ColumnCntLargerErrorInfo) 881 } 882 } 883 err = getOneRowData(bat, line, rowIdx, param, proc.GetMPool()) 884 if err != nil { 885 return nil, err 886 } 887 } 888 889 n := bat.Vecs[0].Length() 890 if unexpectEOF && n > 0 { 891 n-- 892 for i := 0; i < bat.VectorCount(); i++ { 893 vec := bat.GetVector(int32(i)) 894 vec.SetLength(n) 895 } 896 } 897 bat.SetRowCount(n) 898 return bat, nil 899 } 900 901 // getMOCSVReader get file reader from external file 902 func getMOCSVReader(param *ExternalParam, proc *process.Process) (*ParseLineHandler, error) { 903 var err error 904 param.reader, err = readFile(param, proc) 905 if err != nil || param.reader == nil { 906 return nil, err 907 } 908 param.reader, err = getUnCompressReader(param.Extern, param.Fileparam.Filepath, param.reader) 909 if err != nil { 910 return nil, err 911 } 912 913 csvReader, err := newReaderWithParam(param, false) 914 if err != nil { 915 return nil, err 916 } 917 plh := &ParseLineHandler{ 918 csvReader: csvReader, 919 moCsvLineArray: param.MoCsvLineArray, 920 } 921 return plh, nil 922 } 923 924 func scanCsvFile(ctx context.Context, param *ExternalParam, proc *process.Process) (*batch.Batch, error) { 925 var bat *batch.Batch 926 var err error 927 var cnt int 928 _, span := trace.Start(ctx, "scanCsvFile") 929 defer span.End() 930 if param.plh == nil { 931 param.IgnoreLine = param.IgnoreLineTag 932 param.plh, err = getMOCSVReader(param, proc) 933 if err != nil || param.plh == nil { 934 return nil, err 935 } 936 } 937 plh := param.plh 938 finish := false 939 cnt, finish, err = readCountStringLimitSize(plh.csvReader, proc.Ctx, param.maxBatchSize, plh.moCsvLineArray) 940 if err != nil { 941 logutil.Errorf("read external file meet error: %s", err.Error()) 942 return nil, err 943 } 944 945 if finish { 946 err := param.reader.Close() 947 if err != nil { 948 logutil.Errorf("close file failed. err:%v", err) 949 } 950 param.plh = nil 951 param.Fileparam.FileFin++ 952 if param.Fileparam.FileFin >= param.Fileparam.FileCnt { 953 param.Fileparam.End = true 954 } 955 } 956 if param.IgnoreLine != 0 { 957 if !param.Extern.Parallel || param.FileOffset[0] == 0 { 958 if cnt >= param.IgnoreLine { 959 plh.moCsvLineArray = plh.moCsvLineArray[param.IgnoreLine:cnt] 960 cnt -= param.IgnoreLine 961 plh.moCsvLineArray = append(plh.moCsvLineArray, make([]csvparser.Field, param.IgnoreLine)) 962 } else { 963 plh.moCsvLineArray = nil 964 cnt = 0 965 } 966 param.IgnoreLine = 0 967 } 968 } 969 plh.batchSize = cnt 970 bat, err = getBatchData(param, plh, proc) 971 if err != nil { 972 return nil, err 973 } 974 return bat, nil 975 } 976 977 func getBatchFromZonemapFile(ctx context.Context, param *ExternalParam, proc *process.Process, objectReader *blockio.BlockReader) (bat *batch.Batch, err error) { 978 var tmpBat *batch.Batch 979 var vecTmp *vector.Vector 980 var release func() 981 mp := proc.Mp() 982 983 ctx, span := trace.Start(ctx, "getBatchFromZonemapFile") 984 defer func() { 985 span.End() 986 if tmpBat != nil { 987 for i, v := range tmpBat.Vecs { 988 if v == vecTmp { 989 tmpBat.Vecs[i] = nil 990 } 991 } 992 tmpBat.Clean(mp) 993 } 994 if vecTmp != nil { 995 vecTmp.Free(mp) 996 } 997 if release != nil { 998 release() 999 } 1000 if err != nil && bat != nil { 1001 bat.Clean(mp) 1002 } 1003 }() 1004 1005 bat, err = makeBatch(param, 0, proc) 1006 if err != nil { 1007 return nil, err 1008 } 1009 if param.Zoneparam.offset >= len(param.Zoneparam.bs) { 1010 return bat, nil 1011 } 1012 1013 rows := 0 1014 1015 idxs := make([]uint16, len(param.Attrs)) 1016 meta := param.Zoneparam.bs[param.Zoneparam.offset].GetMeta() 1017 colCnt := meta.BlockHeader().ColumnCount() 1018 for i := 0; i < len(param.Attrs); i++ { 1019 idxs[i] = uint16(param.Name2ColIndex[param.Attrs[i]]) 1020 if param.Extern.SysTable && idxs[i] >= colCnt { 1021 idxs[i] = 0 1022 } 1023 } 1024 1025 tmpBat, release, err = objectReader.LoadColumns(ctx, idxs, nil, param.Zoneparam.bs[param.Zoneparam.offset].BlockHeader().BlockID().Sequence(), mp) 1026 if err != nil { 1027 return nil, err 1028 } 1029 filepathBytes := []byte(param.Fileparam.Filepath) 1030 1031 var sels []int32 1032 for i := 0; i < len(param.Attrs); i++ { 1033 if param.Extern.SysTable && uint16(param.Name2ColIndex[param.Attrs[i]]) >= colCnt { 1034 vecTmp, err = proc.AllocVectorOfRows(makeType(¶m.Cols[i].Typ, false), rows, nil) 1035 if err != nil { 1036 return nil, err 1037 } 1038 for j := 0; j < rows; j++ { 1039 nulls.Add(vecTmp.GetNulls(), uint64(j)) 1040 } 1041 } else if catalog.ContainExternalHidenCol(param.Attrs[i]) { 1042 if rows == 0 { 1043 rows = tmpBat.Vecs[i].Length() 1044 } 1045 vecTmp, err = proc.AllocVectorOfRows(makeType(¶m.Cols[i].Typ, false), rows, nil) 1046 if err != nil { 1047 return nil, err 1048 } 1049 for j := 0; j < rows; j++ { 1050 if err = vector.SetBytesAt(vecTmp, j, filepathBytes, mp); err != nil { 1051 return nil, err 1052 } 1053 } 1054 } else { 1055 vecTmp = tmpBat.Vecs[i] 1056 rows = vecTmp.Length() 1057 } 1058 if cap(sels) >= vecTmp.Length() { 1059 sels = sels[:vecTmp.Length()] 1060 } else { 1061 sels = make([]int32, vecTmp.Length()) 1062 1063 for j, k := int32(0), int32(len(sels)); j < k; j++ { 1064 sels[j] = j 1065 } 1066 } 1067 1068 if err = bat.Vecs[i].Union(vecTmp, sels, proc.GetMPool()); err != nil { 1069 return nil, err 1070 } 1071 } 1072 1073 n := bat.Vecs[0].Length() 1074 bat.SetRowCount(n) 1075 return bat, nil 1076 } 1077 1078 func needRead(ctx context.Context, param *ExternalParam, proc *process.Process) bool { 1079 _, span := trace.Start(ctx, "needRead") 1080 defer span.End() 1081 1082 expr := param.Filter.FilterExpr 1083 if expr == nil { 1084 return true 1085 } 1086 if param.Zoneparam.offset >= len(param.Zoneparam.bs) { 1087 return true 1088 } 1089 1090 notReportErrCtx := errutil.ContextWithNoReport(proc.Ctx, true) 1091 1092 meta := param.Zoneparam.bs[param.Zoneparam.offset] 1093 columnMap := param.Filter.columnMap 1094 var ( 1095 zms []objectio.ZoneMap 1096 vecs []*vector.Vector 1097 ) 1098 1099 if isMonoExpr := plan2.ExprIsZonemappable(proc.Ctx, expr); isMonoExpr { 1100 cnt := plan2.AssignAuxIdForExpr(expr, 0) 1101 zms = make([]objectio.ZoneMap, cnt) 1102 vecs = make([]*vector.Vector, cnt) 1103 } 1104 1105 return colexec.EvaluateFilterByZoneMap( 1106 notReportErrCtx, proc, expr, meta, columnMap, zms, vecs) 1107 } 1108 1109 func getZonemapBatch(ctx context.Context, param *ExternalParam, proc *process.Process, objectReader *blockio.BlockReader) (*batch.Batch, error) { 1110 var err error 1111 param.Zoneparam.bs, err = objectReader.LoadAllBlocks(param.Ctx, proc.GetMPool()) 1112 if err != nil { 1113 return nil, err 1114 } 1115 if param.Zoneparam.offset >= len(param.Zoneparam.bs) { 1116 return makeBatch(param, 0, proc) 1117 } 1118 1119 if param.Filter.zonemappable { 1120 for !needRead(ctx, param, proc) { 1121 param.Zoneparam.offset++ 1122 } 1123 } 1124 return getBatchFromZonemapFile(ctx, param, proc, objectReader) 1125 } 1126 1127 func scanZonemapFile(ctx context.Context, param *ExternalParam, proc *process.Process) (*batch.Batch, error) { 1128 var err error 1129 param.Filter.blockReader, err = blockio.NewFileReader(param.Extern.FileService, param.Fileparam.Filepath) 1130 if err != nil { 1131 return nil, err 1132 } 1133 1134 bat, err := getZonemapBatch(ctx, param, proc, param.Filter.blockReader) 1135 if err != nil { 1136 return nil, err 1137 } 1138 1139 if param.Zoneparam.offset >= len(param.Zoneparam.bs) { 1140 param.Filter.blockReader = nil 1141 param.Zoneparam.bs = nil 1142 param.plh = nil 1143 param.Fileparam.FileFin++ 1144 if param.Fileparam.FileFin >= param.Fileparam.FileCnt { 1145 param.Fileparam.End = true 1146 } 1147 param.Zoneparam.offset = 0 1148 } 1149 return bat, nil 1150 } 1151 1152 // scanFileData read batch data from external file 1153 func scanFileData(ctx context.Context, param *ExternalParam, proc *process.Process) (*batch.Batch, error) { 1154 if param.Extern.QueryResult { 1155 return scanZonemapFile(ctx, param, proc) 1156 } 1157 if param.Extern.Format == tree.PARQUET { 1158 return scanParquetFile(ctx, param, proc) 1159 } 1160 return scanCsvFile(ctx, param, proc) 1161 } 1162 1163 func transJson2Lines(ctx context.Context, str string, attrs []string, cols []*plan.ColDef, jsonData string, param *ExternalParam) ([]csvparser.Field, error) { 1164 switch jsonData { 1165 case tree.OBJECT: 1166 return transJsonObject2Lines(ctx, str, attrs, cols, param) 1167 case tree.ARRAY: 1168 return transJsonArray2Lines(ctx, str, attrs, cols, param) 1169 default: 1170 return nil, moerr.NewNotSupported(ctx, "the jsonline format '%s' is not support now", jsonData) 1171 } 1172 } 1173 1174 const JsonNull = "\\N" 1175 1176 func transJsonObject2Lines(ctx context.Context, str string, attrs []string, cols []*plan.ColDef, param *ExternalParam) ([]csvparser.Field, error) { 1177 var ( 1178 err error 1179 res = make([]csvparser.Field, 0, len(attrs)) 1180 ) 1181 if param.prevStr != "" { 1182 str = param.prevStr + str 1183 param.prevStr = "" 1184 } 1185 var jsonMap map[string]interface{} 1186 var decoder = json.NewDecoder(bytes.NewReader([]byte(str))) 1187 decoder.UseNumber() 1188 err = decoder.Decode(&jsonMap) 1189 if err != nil { 1190 logutil.Errorf("json unmarshal err:%v", err) 1191 param.prevStr = str 1192 return nil, err 1193 } 1194 if len(jsonMap) < getRealAttrCnt(attrs, cols) { 1195 return nil, moerr.NewInternalError(ctx, ColumnCntLargerErrorInfo) 1196 } 1197 for idx, attr := range attrs { 1198 if cols[idx].Hidden { 1199 continue 1200 } 1201 if val, ok := jsonMap[attr]; ok { 1202 if val == nil { 1203 res = append(res, csvparser.Field{IsNull: true}) 1204 continue 1205 } 1206 tp := cols[idx].Typ.Id 1207 if tp != int32(types.T_json) { 1208 val = fmt.Sprintf("%v", val) 1209 res = append(res, csvparser.Field{Val: fmt.Sprintf("%v", val), IsNull: val == JsonNull}) 1210 continue 1211 } 1212 var bj bytejson.ByteJson 1213 err = bj.UnmarshalObject(val) 1214 if err != nil { 1215 return nil, err 1216 } 1217 dt, err := bj.Marshal() 1218 if err != nil { 1219 return nil, err 1220 } 1221 res = append(res, csvparser.Field{Val: string(dt)}) 1222 } else { 1223 return nil, moerr.NewInvalidInput(ctx, "the attr %s is not in json", attr) 1224 } 1225 } 1226 return res, nil 1227 } 1228 1229 func transJsonArray2Lines(ctx context.Context, str string, attrs []string, cols []*plan.ColDef, param *ExternalParam) ([]csvparser.Field, error) { 1230 var ( 1231 err error 1232 res = make([]csvparser.Field, 0, len(attrs)) 1233 ) 1234 if param.prevStr != "" { 1235 str = param.prevStr + str 1236 param.prevStr = "" 1237 } 1238 var jsonArray []interface{} 1239 var decoder = json.NewDecoder(bytes.NewReader([]byte(str))) 1240 decoder.UseNumber() 1241 err = decoder.Decode(&jsonArray) 1242 if err != nil { 1243 param.prevStr = str 1244 return nil, err 1245 } 1246 if len(jsonArray) < getRealAttrCnt(attrs, cols) { 1247 return nil, moerr.NewInternalError(ctx, ColumnCntLargerErrorInfo) 1248 } 1249 for idx, val := range jsonArray { 1250 if val == nil { 1251 res = append(res, csvparser.Field{IsNull: true}) 1252 continue 1253 } 1254 if idx >= len(cols) { 1255 return nil, moerr.NewInvalidInput(ctx, str+" , wrong number of colunms") 1256 } 1257 tp := cols[idx].Typ.Id 1258 if tp != int32(types.T_json) { 1259 val = fmt.Sprintf("%v", val) 1260 res = append(res, csvparser.Field{Val: fmt.Sprintf("%v", val), IsNull: val == JsonNull}) 1261 continue 1262 } 1263 var bj bytejson.ByteJson 1264 err = bj.UnmarshalObject(val) 1265 if err != nil { 1266 return nil, err 1267 } 1268 dt, err := bj.Marshal() 1269 if err != nil { 1270 return nil, err 1271 } 1272 res = append(res, csvparser.Field{Val: string(dt)}) 1273 } 1274 return res, nil 1275 } 1276 1277 func getNullFlag(nullMap map[string]([]string), attr, field string) bool { 1278 if nullMap == nil || len(nullMap[attr]) == 0 { 1279 return false 1280 } 1281 field = strings.ToLower(field) 1282 for _, v := range nullMap[attr] { 1283 if v == field { 1284 return true 1285 } 1286 } 1287 return false 1288 } 1289 1290 func getFieldFromLine(line []csvparser.Field, colIdx int, param *ExternalParam) csvparser.Field { 1291 if catalog.ContainExternalHidenCol(param.Attrs[colIdx]) { 1292 return csvparser.Field{Val: param.Fileparam.Filepath} 1293 } 1294 return line[param.Name2ColIndex[param.Attrs[colIdx]]] 1295 } 1296 1297 func getOneRowData(bat *batch.Batch, line []csvparser.Field, rowIdx int, param *ExternalParam, mp *mpool.MPool) error { 1298 var buf bytes.Buffer 1299 for colIdx := range param.Attrs { 1300 vec := bat.Vecs[colIdx] 1301 if param.Cols[colIdx].Hidden { 1302 nulls.Add(vec.GetNulls(), uint64(rowIdx)) 1303 continue 1304 } 1305 field := getFieldFromLine(line, colIdx, param) 1306 id := types.T(param.Cols[colIdx].Typ.Id) 1307 if id != types.T_char && id != types.T_varchar && id != types.T_json && 1308 id != types.T_binary && id != types.T_varbinary && id != types.T_blob && id != types.T_text { 1309 field.Val = strings.TrimSpace(field.Val) 1310 } 1311 isNullOrEmpty := field.IsNull || (getNullFlag(param.Extern.NullMap, param.Attrs[colIdx], field.Val)) 1312 if id != types.T_char && id != types.T_varchar && 1313 id != types.T_binary && id != types.T_varbinary && id != types.T_json && id != types.T_blob && id != types.T_text { 1314 isNullOrEmpty = isNullOrEmpty || len(field.Val) == 0 1315 } 1316 if isNullOrEmpty { 1317 nulls.Add(vec.GetNulls(), uint64(rowIdx)) 1318 continue 1319 } 1320 if param.ParallelLoad { 1321 buf.WriteString(field.Val) 1322 bs := buf.Bytes() 1323 err := vector.SetBytesAt(vec, rowIdx, bs, mp) 1324 if err != nil { 1325 return err 1326 } 1327 buf.Reset() 1328 continue 1329 } 1330 1331 switch id { 1332 case types.T_bool: 1333 b, err := types.ParseBool(field.Val) 1334 if err != nil { 1335 return moerr.NewInternalError(param.Ctx, "the input value '%s' is not bool type for column %d", field.Val, colIdx) 1336 } 1337 if err := vector.SetFixedAt(vec, rowIdx, b); err != nil { 1338 return err 1339 } 1340 case types.T_bit: 1341 if len(field.Val) > 8 { 1342 return moerr.NewInternalError(param.Ctx, "data too long, len(val) = %v", len(field.Val)) 1343 } 1344 1345 width := param.Cols[colIdx].Typ.Width 1346 var val uint64 1347 for i := 0; i < len(field.Val); i++ { 1348 val = (val << 8) | uint64(field.Val[i]) 1349 } 1350 if val > uint64(1<<width-1) { 1351 return moerr.NewInternalError(param.Ctx, "data too long, type width = %d, val = %b", width, val) 1352 } 1353 if err := vector.SetFixedAt(vec, rowIdx, val); err != nil { 1354 return err 1355 } 1356 buf.Reset() 1357 case types.T_int8: 1358 d, err := strconv.ParseInt(field.Val, 10, 8) 1359 if err == nil { 1360 if err := vector.SetFixedAt(vec, rowIdx, int8(d)); err != nil { 1361 return err 1362 } 1363 } else { 1364 if errors.Is(err, strconv.ErrRange) { 1365 logutil.Errorf("parse field[%v] err:%v", field.Val, err) 1366 return moerr.NewInternalError(param.Ctx, "the input value '%v' is not int8 type for column %d", field.Val, colIdx) 1367 } 1368 f, err := strconv.ParseFloat(field.Val, 64) 1369 if err != nil || f < math.MinInt8 || f > math.MaxInt8 { 1370 logutil.Errorf("parse field[%v] err:%v", field.Val, err) 1371 return moerr.NewInternalError(param.Ctx, "the input value '%v' is not int8 type for column %d", field.Val, colIdx) 1372 } 1373 if err := vector.SetFixedAt(vec, rowIdx, int8(f)); err != nil { 1374 return err 1375 } 1376 } 1377 case types.T_int16: 1378 d, err := strconv.ParseInt(field.Val, 10, 16) 1379 if err == nil { 1380 if err := vector.SetFixedAt(vec, rowIdx, int16(d)); err != nil { 1381 return err 1382 } 1383 } else { 1384 if errors.Is(err, strconv.ErrRange) { 1385 logutil.Errorf("parse field[%v] err:%v", field.Val, err) 1386 return moerr.NewInternalError(param.Ctx, "the input value '%v' is not int16 type for column %d", field.Val, colIdx) 1387 } 1388 f, err := strconv.ParseFloat(field.Val, 64) 1389 if err != nil || f < math.MinInt16 || f > math.MaxInt16 { 1390 logutil.Errorf("parse field[%v] err:%v", field.Val, err) 1391 return moerr.NewInternalError(param.Ctx, "the input value '%v' is not int16 type for column %d", field.Val, colIdx) 1392 } 1393 if err := vector.SetFixedAt(vec, rowIdx, int16(f)); err != nil { 1394 return err 1395 } 1396 } 1397 case types.T_int32: 1398 d, err := strconv.ParseInt(field.Val, 10, 32) 1399 if err == nil { 1400 if err := vector.SetFixedAt(vec, rowIdx, int32(d)); err != nil { 1401 return err 1402 } 1403 } else { 1404 if errors.Is(err, strconv.ErrRange) { 1405 logutil.Errorf("parse field[%v] err:%v", field.Val, err) 1406 return moerr.NewInternalError(param.Ctx, "the input value '%v' is not int32 type for column %d", field.Val, colIdx) 1407 } 1408 f, err := strconv.ParseFloat(field.Val, 64) 1409 if err != nil || f < math.MinInt32 || f > math.MaxInt32 { 1410 logutil.Errorf("parse field[%v] err:%v", field.Val, err) 1411 return moerr.NewInternalError(param.Ctx, "the input value '%v' is not int32 type for column %d", field.Val, colIdx) 1412 } 1413 if err := vector.SetFixedAt(vec, rowIdx, int32(f)); err != nil { 1414 return err 1415 } 1416 } 1417 case types.T_int64: 1418 d, err := strconv.ParseInt(field.Val, 10, 64) 1419 if err == nil { 1420 if err := vector.SetFixedAt(vec, rowIdx, d); err != nil { 1421 return err 1422 } 1423 } else { 1424 if errors.Is(err, strconv.ErrRange) { 1425 logutil.Errorf("parse field[%v] err:%v", field.Val, err) 1426 return moerr.NewInternalError(param.Ctx, "the input value '%v' is not int64 type for column %d", field.Val, colIdx) 1427 } 1428 f, err := strconv.ParseFloat(field.Val, 64) 1429 if err != nil || f < math.MinInt64 || f > math.MaxInt64 { 1430 logutil.Errorf("parse field[%v] err:%v", field.Val, err) 1431 return moerr.NewInternalError(param.Ctx, "the input value '%v' is not int64 type for column %d", field.Val, colIdx) 1432 } 1433 if err := vector.SetFixedAt(vec, rowIdx, int64(f)); err != nil { 1434 return err 1435 } 1436 } 1437 case types.T_uint8: 1438 d, err := strconv.ParseUint(field.Val, 10, 8) 1439 if err == nil { 1440 if err := vector.SetFixedAt(vec, rowIdx, uint8(d)); err != nil { 1441 return err 1442 } 1443 } else { 1444 if errors.Is(err, strconv.ErrRange) { 1445 logutil.Errorf("parse field[%v] err:%v", field.Val, err) 1446 return moerr.NewInternalError(param.Ctx, "the input value '%v' is not uint8 type for column %d", field.Val, colIdx) 1447 } 1448 f, err := strconv.ParseFloat(field.Val, 64) 1449 if err != nil || f < 0 || f > math.MaxUint8 { 1450 logutil.Errorf("parse field[%v] err:%v", field.Val, err) 1451 return moerr.NewInternalError(param.Ctx, "the input value '%v' is not uint8 type for column %d", field.Val, colIdx) 1452 } 1453 if err := vector.SetFixedAt(vec, rowIdx, uint8(f)); err != nil { 1454 return err 1455 } 1456 } 1457 case types.T_uint16: 1458 d, err := strconv.ParseUint(field.Val, 10, 16) 1459 if err == nil { 1460 if err := vector.SetFixedAt(vec, rowIdx, uint16(d)); err != nil { 1461 return err 1462 } 1463 } else { 1464 if errors.Is(err, strconv.ErrRange) { 1465 logutil.Errorf("parse field[%v] err:%v", field.Val, err) 1466 return moerr.NewInternalError(param.Ctx, "the input value '%v' is not uint16 type for column %d", field.Val, colIdx) 1467 } 1468 f, err := strconv.ParseFloat(field.Val, 64) 1469 if err != nil || f < 0 || f > math.MaxUint16 { 1470 logutil.Errorf("parse field[%v] err:%v", field.Val, err) 1471 return moerr.NewInternalError(param.Ctx, "the input value '%v' is not uint16 type for column %d", field.Val, colIdx) 1472 } 1473 if err := vector.SetFixedAt(vec, rowIdx, uint16(f)); err != nil { 1474 return err 1475 } 1476 } 1477 case types.T_uint32: 1478 d, err := strconv.ParseUint(field.Val, 10, 32) 1479 if err == nil { 1480 if err := vector.SetFixedAt(vec, rowIdx, uint32(d)); err != nil { 1481 return err 1482 } 1483 } else { 1484 if errors.Is(err, strconv.ErrRange) { 1485 logutil.Errorf("parse field[%v] err:%v", field.Val, err) 1486 return moerr.NewInternalError(param.Ctx, "the input value '%v' is not uint32 type for column %d", field.Val, colIdx) 1487 } 1488 f, err := strconv.ParseFloat(field.Val, 64) 1489 if err != nil || f < 0 || f > math.MaxUint32 { 1490 logutil.Errorf("parse field[%v] err:%v", field.Val, err) 1491 return moerr.NewInternalError(param.Ctx, "the input value '%v' is not uint32 type for column %d", field.Val, colIdx) 1492 } 1493 if err := vector.SetFixedAt(vec, rowIdx, uint32(f)); err != nil { 1494 return err 1495 } 1496 } 1497 case types.T_uint64: 1498 d, err := strconv.ParseUint(field.Val, 10, 64) 1499 if err == nil { 1500 if err := vector.SetFixedAt(vec, rowIdx, d); err != nil { 1501 return err 1502 } 1503 } else { 1504 if errors.Is(err, strconv.ErrRange) { 1505 logutil.Errorf("parse field[%v] err:%v", field.Val, err) 1506 return moerr.NewInternalError(param.Ctx, "the input value '%v' is not uint64 type for column %d", field.Val, colIdx) 1507 } 1508 f, err := strconv.ParseFloat(field.Val, 64) 1509 if err != nil || f < 0 || f > math.MaxUint64 { 1510 logutil.Errorf("parse field[%v] err:%v", field.Val, err) 1511 return moerr.NewInternalError(param.Ctx, "the input value '%v' is not uint64 type for column %d", field.Val, colIdx) 1512 } 1513 if err := vector.SetFixedAt(vec, rowIdx, uint64(f)); err != nil { 1514 return err 1515 } 1516 } 1517 case types.T_float32: 1518 // origin float32 data type 1519 if vec.GetType().Scale < 0 || vec.GetType().Width == 0 { 1520 d, err := strconv.ParseFloat(field.Val, 32) 1521 if err != nil { 1522 logutil.Errorf("parse field[%v] err:%v", field.Val, err) 1523 return moerr.NewInternalError(param.Ctx, "the input value '%v' is not float32 type for column %d", field.Val, colIdx) 1524 } 1525 if err := vector.SetFixedAt(vec, rowIdx, float32(d)); err != nil { 1526 return err 1527 } 1528 } else { 1529 d, err := types.ParseDecimal128(field.Val, vec.GetType().Width, vec.GetType().Scale) 1530 if err != nil { 1531 logutil.Errorf("parse field[%v] err:%v", field.Val, err) 1532 return moerr.NewInternalError(param.Ctx, "the input value '%v' is not float32 type for column %d", field.Val, colIdx) 1533 } 1534 if err := vector.SetFixedAt(vec, rowIdx, float32(types.Decimal128ToFloat64(d, vec.GetType().Scale))); err != nil { 1535 return err 1536 } 1537 } 1538 case types.T_float64: 1539 // origin float64 data type 1540 if vec.GetType().Scale < 0 || vec.GetType().Width == 0 { 1541 d, err := strconv.ParseFloat(field.Val, 64) 1542 if err != nil { 1543 logutil.Errorf("parse field[%v] err:%v", field.Val, err) 1544 return moerr.NewInternalError(param.Ctx, "the input value '%v' is not float64 type for column %d", field.Val, colIdx) 1545 } 1546 if err := vector.SetFixedAt(vec, rowIdx, d); err != nil { 1547 return err 1548 } 1549 } else { 1550 d, err := types.ParseDecimal128(field.Val, vec.GetType().Width, vec.GetType().Scale) 1551 if err != nil { 1552 logutil.Errorf("parse field[%v] err:%v", field.Val, err) 1553 return moerr.NewInternalError(param.Ctx, "the input value '%v' is not float64 type for column %d", field.Val, colIdx) 1554 } 1555 if err := vector.SetFixedAt(vec, rowIdx, types.Decimal128ToFloat64(d, vec.GetType().Scale)); err != nil { 1556 return err 1557 } 1558 } 1559 case types.T_char, types.T_varchar, types.T_binary, types.T_varbinary, types.T_blob, types.T_text: 1560 // XXX Memory accounting? 1561 buf.WriteString(field.Val) 1562 bs := buf.Bytes() 1563 err := vector.SetBytesAt(vec, rowIdx, bs, mp) 1564 if err != nil { 1565 return err 1566 } 1567 buf.Reset() 1568 case types.T_array_float32: 1569 arrBytes, err := types.StringToArrayToBytes[float32](field.Val) 1570 if err != nil { 1571 return err 1572 } 1573 err = vector.SetBytesAt(vec, rowIdx, arrBytes, mp) 1574 if err != nil { 1575 return err 1576 } 1577 buf.Reset() 1578 case types.T_array_float64: 1579 arrBytes, err := types.StringToArrayToBytes[float64](field.Val) 1580 if err != nil { 1581 return err 1582 } 1583 err = vector.SetBytesAt(vec, rowIdx, arrBytes, mp) 1584 if err != nil { 1585 return err 1586 } 1587 buf.Reset() 1588 case types.T_json: 1589 var jsonBytes []byte 1590 if param.Extern.Format != tree.CSV { 1591 jsonBytes = []byte(field.Val) 1592 } else { 1593 field.Val = fmt.Sprintf("%v", strings.Trim(field.Val, "\"")) 1594 byteJson, err := types.ParseStringToByteJson(field.Val) 1595 if err != nil { 1596 logutil.Errorf("parse field[%v] err:%v", field.Val, err) 1597 return moerr.NewInternalError(param.Ctx, "the input value '%v' is not json type for column %d", field.Val, colIdx) 1598 } 1599 jsonBytes, err = types.EncodeJson(byteJson) 1600 if err != nil { 1601 logutil.Errorf("encode json[%v] err:%v", field.Val, err) 1602 return moerr.NewInternalError(param.Ctx, "the input value '%v' is not json type for column %d", field.Val, colIdx) 1603 } 1604 } 1605 1606 err := vector.SetBytesAt(vec, rowIdx, jsonBytes, mp) 1607 if err != nil { 1608 return err 1609 } 1610 case types.T_date: 1611 d, err := types.ParseDateCast(field.Val) 1612 if err != nil { 1613 logutil.Errorf("parse field[%v] err:%v", field.Val, err) 1614 return moerr.NewInternalError(param.Ctx, "the input value '%v' is not Date type for column %d", field.Val, colIdx) 1615 } 1616 if err := vector.SetFixedAt(vec, rowIdx, d); err != nil { 1617 return err 1618 } 1619 case types.T_time: 1620 d, err := types.ParseTime(field.Val, vec.GetType().Scale) 1621 if err != nil { 1622 logutil.Errorf("parse field[%v] err:%v", field.Val, err) 1623 return moerr.NewInternalError(param.Ctx, "the input value '%v' is not Time type for column %d", field.Val, colIdx) 1624 } 1625 if err := vector.SetFixedAt(vec, rowIdx, d); err != nil { 1626 return err 1627 } 1628 case types.T_datetime: 1629 d, err := types.ParseDatetime(field.Val, vec.GetType().Scale) 1630 if err != nil { 1631 logutil.Errorf("parse field[%v] err:%v", field.Val, err) 1632 return moerr.NewInternalError(param.Ctx, "the input value '%v' is not Datetime type for column %d", field.Val, colIdx) 1633 } 1634 if err := vector.SetFixedAt(vec, rowIdx, d); err != nil { 1635 return err 1636 } 1637 case types.T_enum: 1638 d, err := strconv.ParseUint(field.Val, 10, 16) 1639 if err == nil { 1640 if err := vector.SetFixedAt(vec, rowIdx, types.Enum(d)); err != nil { 1641 return err 1642 } 1643 } else if errors.Is(err, strconv.ErrSyntax) { 1644 v, err := types.ParseEnum(param.Cols[colIdx].Typ.Enumvalues, field.Val) 1645 if err != nil { 1646 logutil.Errorf("parse field[%v] err:%v", field.Val, err) 1647 return err 1648 } 1649 if err := vector.SetFixedAt(vec, rowIdx, types.Enum(v)); err != nil { 1650 return err 1651 } 1652 } else { 1653 if errors.Is(err, strconv.ErrRange) { 1654 logutil.Errorf("parse field[%v] err:%v", field.Val, err) 1655 return moerr.NewInternalError(param.Ctx, "the input value '%v' is not uint16 type for column %d", field.Val, colIdx) 1656 } 1657 f, err := strconv.ParseFloat(field.Val, 64) 1658 if err != nil || f < 0 || f > math.MaxUint16 { 1659 logutil.Errorf("parse field[%v] err:%v", field.Val, err) 1660 return moerr.NewInternalError(param.Ctx, "the input value '%v' is not uint16 type for column %d", field.Val, colIdx) 1661 } 1662 if err := vector.SetFixedAt(vec, rowIdx, types.Enum(f)); err != nil { 1663 return err 1664 } 1665 } 1666 case types.T_decimal64: 1667 d, err := types.ParseDecimal64(field.Val, vec.GetType().Width, vec.GetType().Scale) 1668 if err != nil { 1669 // we tolerate loss of digits. 1670 if !moerr.IsMoErrCode(err, moerr.ErrDataTruncated) { 1671 logutil.Errorf("parse field[%v] err:%v", field.Val, err) 1672 return moerr.NewInternalError(param.Ctx, "the input value '%v' is invalid Decimal64 type for column %d", field.Val, colIdx) 1673 } 1674 } 1675 if err := vector.SetFixedAt(vec, rowIdx, d); err != nil { 1676 return err 1677 } 1678 case types.T_decimal128: 1679 d, err := types.ParseDecimal128(field.Val, vec.GetType().Width, vec.GetType().Scale) 1680 if err != nil { 1681 // we tolerate loss of digits. 1682 if !moerr.IsMoErrCode(err, moerr.ErrDataTruncated) { 1683 logutil.Errorf("parse field[%v] err:%v", field.Val, err) 1684 return moerr.NewInternalError(param.Ctx, "the input value '%v' is invalid Decimal128 type for column %d", field.Val, colIdx) 1685 } 1686 } 1687 if err := vector.SetFixedAt(vec, rowIdx, d); err != nil { 1688 return err 1689 } 1690 case types.T_timestamp: 1691 t := time.Local 1692 d, err := types.ParseTimestamp(t, field.Val, vec.GetType().Scale) 1693 if err != nil { 1694 logutil.Errorf("parse field[%v] err:%v", field.Val, err) 1695 return moerr.NewInternalError(param.Ctx, "the input value '%v' is not Timestamp type for column %d", field.Val, colIdx) 1696 } 1697 if err := vector.SetFixedAt(vec, rowIdx, d); err != nil { 1698 return err 1699 } 1700 case types.T_uuid: 1701 d, err := types.ParseUuid(field.Val) 1702 if err != nil { 1703 logutil.Errorf("parse field[%v] err:%v", field.Val, err) 1704 return moerr.NewInternalError(param.Ctx, "the input value '%v' is not uuid type for column %d", field.Val, colIdx) 1705 } 1706 if err := vector.SetFixedAt(vec, rowIdx, d); err != nil { 1707 return err 1708 } 1709 default: 1710 return moerr.NewInternalError(param.Ctx, "the value type %d is not support now", param.Cols[rowIdx].Typ.Id) 1711 } 1712 } 1713 return nil 1714 } 1715 1716 // Read reads len count records from r. 1717 // Each record is a slice of fields. 1718 // A successful call returns err == nil, not err == io.EOF. Because ReadAll is 1719 // defined to read until EOF, it does not treat an end of file as an error to be 1720 // reported. 1721 func readCountStringLimitSize(r *csvparser.CSVParser, ctx context.Context, size uint64, records [][]csvparser.Field) (int, bool, error) { 1722 var curBatchSize uint64 = 0 1723 for i := 0; i < OneBatchMaxRow; i++ { 1724 select { 1725 case <-ctx.Done(): 1726 return i, true, nil 1727 default: 1728 } 1729 record, err := r.Read() 1730 if err != nil { 1731 if err == io.EOF { 1732 return i, true, nil 1733 } 1734 return i, true, err 1735 } 1736 records[i] = record 1737 for j := 0; j < len(record); j++ { 1738 curBatchSize += uint64(len(record[j].Val)) 1739 } 1740 if curBatchSize >= size { 1741 return i + 1, false, nil 1742 } 1743 } 1744 return OneBatchMaxRow, false, nil 1745 } 1746 1747 func loadFormatIsValid(param *tree.ExternParam) bool { 1748 switch param.Format { 1749 case tree.JSONLINE, tree.CSV, tree.PARQUET: 1750 return true 1751 } 1752 return false 1753 }