github.com/matrixorigin/matrixone@v1.2.0/pkg/sql/plan/external.go (about)

     1  // Copyright 2022 Matrix Origin
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package plan
    16  
    17  import (
    18  	"bufio"
    19  	"context"
    20  	"encoding/json"
    21  	"io"
    22  	"strings"
    23  
    24  	"github.com/matrixorigin/matrixone/pkg/catalog"
    25  	"github.com/matrixorigin/matrixone/pkg/container/batch"
    26  	"github.com/matrixorigin/matrixone/pkg/container/types"
    27  	"github.com/matrixorigin/matrixone/pkg/container/vector"
    28  	"github.com/matrixorigin/matrixone/pkg/fileservice"
    29  	"github.com/matrixorigin/matrixone/pkg/pb/plan"
    30  	"github.com/matrixorigin/matrixone/pkg/sql/colexec"
    31  	"github.com/matrixorigin/matrixone/pkg/sql/parsers/tree"
    32  	"github.com/matrixorigin/matrixone/pkg/util/trace"
    33  	"github.com/matrixorigin/matrixone/pkg/vm/engine/tae/options"
    34  	"github.com/matrixorigin/matrixone/pkg/vm/process"
    35  )
    36  
    37  var (
    38  	STATEMENT_ACCOUNT = "account"
    39  )
    40  
// This file duplicates colexec/external/external.go in order to avoid an import cycle.
    42  
    43  func filterFileList(ctx context.Context, node *plan.Node, proc *process.Process, fileList []string, fileSize []int64) ([]string, []int64, error) {
    44  	return filterByAccountAndFilename(ctx, node, proc, fileList, fileSize)
    45  }
    46  
    47  func containColname(col string) bool {
    48  	return strings.Contains(col, STATEMENT_ACCOUNT) || strings.Contains(col, catalog.ExternalFilePath)
    49  }
    50  
    51  func judgeContainColname(expr *plan.Expr) bool {
    52  	expr_F, ok := expr.Expr.(*plan.Expr_F)
    53  	if !ok {
    54  		return false
    55  	}
    56  	if expr_F.F.Func.ObjName == "or" {
    57  		flag := true
    58  		for i := 0; i < len(expr_F.F.Args); i++ {
    59  			flag = flag && judgeContainColname(expr_F.F.Args[i])
    60  		}
    61  		return flag
    62  	}
    63  	expr_Col, ok := expr_F.F.Args[0].Expr.(*plan.Expr_Col)
    64  	if ok && containColname(expr_Col.Col.Name) {
    65  		return true
    66  	}
    67  	for _, arg := range expr_F.F.Args {
    68  		if judgeContainColname(arg) {
    69  			return true
    70  		}
    71  	}
    72  	return false
    73  }
    74  
    75  func filterByAccountAndFilename(ctx context.Context, node *plan.Node, proc *process.Process, fileList []string, fileSize []int64) ([]string, []int64, error) {
    76  	_, span := trace.Start(ctx, "filterByAccountAndFilename")
    77  	defer span.End()
    78  	filterList := make([]*plan.Expr, 0)
    79  	filterList2 := make([]*plan.Expr, 0)
    80  	for i := 0; i < len(node.FilterList); i++ {
    81  		if judgeContainColname(node.FilterList[i]) {
    82  			filterList = append(filterList, node.FilterList[i])
    83  		} else {
    84  			filterList2 = append(filterList2, node.FilterList[i])
    85  		}
    86  	}
    87  	if len(filterList) == 0 {
    88  		return fileList, fileSize, nil
    89  	}
    90  	bat := makeFilepathBatch(node, proc, filterList, fileList)
    91  	filter := colexec.RewriteFilterExprList(filterList)
    92  
    93  	vec, err := colexec.EvalExpressionOnce(proc, filter, []*batch.Batch{bat})
    94  	if err != nil {
    95  		return nil, fileSize, err
    96  	}
    97  
    98  	fileListTmp := make([]string, 0)
    99  	fileSizeTmp := make([]int64, 0)
   100  	bs := vector.MustFixedCol[bool](vec)
   101  	for i := 0; i < len(bs); i++ {
   102  		if bs[i] {
   103  			fileListTmp = append(fileListTmp, fileList[i])
   104  			fileSizeTmp = append(fileSizeTmp, fileSize[i])
   105  		}
   106  	}
   107  	vec.Free(proc.Mp())
   108  	node.FilterList = filterList2
   109  	return fileListTmp, fileSizeTmp, nil
   110  }
   111  
   112  func makeFilepathBatch(node *plan.Node, proc *process.Process, filterList []*plan.Expr, fileList []string) *batch.Batch {
   113  	num := len(node.TableDef.Cols)
   114  	bat := &batch.Batch{
   115  		Attrs: make([]string, num),
   116  		Vecs:  make([]*vector.Vector, num),
   117  		Cnt:   1,
   118  	}
   119  	for i := 0; i < num; i++ {
   120  		bat.Attrs[i] = node.TableDef.Cols[i].Name
   121  		if bat.Attrs[i] == STATEMENT_ACCOUNT {
   122  			typ := types.New(types.T(node.TableDef.Cols[i].Typ.Id), node.TableDef.Cols[i].Typ.Width, node.TableDef.Cols[i].Typ.Scale)
   123  			vec, _ := proc.AllocVectorOfRows(typ, len(fileList), nil)
   124  			//vec.SetOriginal(false)
   125  			for j := 0; j < len(fileList); j++ {
   126  				vector.SetStringAt(vec, j, getAccountCol(fileList[j]), proc.Mp())
   127  			}
   128  			bat.Vecs[i] = vec
   129  		} else if bat.Attrs[i] == catalog.ExternalFilePath {
   130  			typ := types.T_varchar.ToType()
   131  			vec, _ := proc.AllocVectorOfRows(typ, len(fileList), nil)
   132  			//vec.SetOriginal(false)
   133  			for j := 0; j < len(fileList); j++ {
   134  				vector.SetStringAt(vec, j, fileList[j], proc.Mp())
   135  			}
   136  			bat.Vecs[i] = vec
   137  		}
   138  	}
   139  	bat.SetRowCount(len(fileList))
   140  	return bat
   141  }
   142  
   143  func getAccountCol(filepath string) string {
   144  	pathDir := strings.Split(filepath, "/")
   145  	if len(pathDir) < 2 {
   146  		return ""
   147  	}
   148  	return pathDir[1]
   149  }
   150  
   151  func getExternalStats(node *plan.Node, builder *QueryBuilder) *Stats {
   152  	externScan := node.ExternScan
   153  	if externScan != nil && externScan.Type == tree.INLINE {
   154  		totolSize := len(externScan.Data)
   155  		lineSize := float64(0.0)
   156  		if externScan.Format == tree.CSV {
   157  			lineSize = float64(strings.Index(externScan.Data, "\n"))
   158  		}
   159  
   160  		if externScan.Format == tree.JSONLINE {
   161  			lineSize = GetRowSizeFromTableDef(node.GetTableDef(), true) * 0.8
   162  		}
   163  
   164  		if lineSize > 0 {
   165  			cost := float64(totolSize) / lineSize
   166  			return &plan.Stats{
   167  				Outcnt:      cost,
   168  				Cost:        cost,
   169  				Rowsize:     lineSize,
   170  				Selectivity: 1,
   171  				TableCnt:    cost,
   172  				BlockNum:    int32(cost / float64(options.DefaultBlockMaxRows)),
   173  			}
   174  		}
   175  	}
   176  
   177  	param := &tree.ExternParam{}
   178  	err := json.Unmarshal([]byte(node.TableDef.Createsql), param)
   179  	if err != nil || param.Local || param.ScanType == tree.S3 {
   180  		return DefaultHugeStats()
   181  	}
   182  
   183  	if param.ScanType == tree.S3 {
   184  		if err = InitS3Param(param); err != nil {
   185  			return DefaultHugeStats()
   186  		}
   187  	} else {
   188  		if err = InitInfileParam(param); err != nil {
   189  			return DefaultHugeStats()
   190  		}
   191  	}
   192  
   193  	param.FileService = builder.compCtx.GetProcess().FileService
   194  	param.Ctx = builder.compCtx.GetProcess().Ctx
   195  	_, spanReadDir := trace.Start(param.Ctx, "ReCalcNodeStats.ReadDir")
   196  	fileList, fileSize, err := ReadDir(param)
   197  	spanReadDir.End()
   198  	if err != nil {
   199  		return DefaultHugeStats()
   200  	}
   201  	fileList, fileSize, err = filterFileList(param.Ctx, node, builder.compCtx.GetProcess(), fileList, fileSize)
   202  	if err != nil {
   203  		return DefaultHugeStats()
   204  	}
   205  	if param.LoadFile && len(fileList) == 0 {
   206  		// all files filtered, return a default small stats
   207  		return DefaultStats()
   208  	}
   209  	var cost float64
   210  	for i := range fileSize {
   211  		cost += float64(fileSize[i])
   212  	}
   213  
   214  	//read one line
   215  	fs, readPath, err := GetForETLWithType(param, param.Filepath)
   216  	if err != nil {
   217  		return DefaultHugeStats()
   218  	}
   219  	var r io.ReadCloser
   220  	vec := fileservice.IOVector{
   221  		FilePath: readPath,
   222  		Entries: []fileservice.IOEntry{
   223  			0: {
   224  				Offset:            0,
   225  				Size:              -1,
   226  				ReadCloserForRead: &r,
   227  			},
   228  		},
   229  	}
   230  	if err = fs.Read(param.Ctx, &vec); err != nil {
   231  		return DefaultHugeStats()
   232  	}
   233  	r2 := bufio.NewReader(r)
   234  	line, _ := r2.ReadString('\n')
   235  	size := len(line)
   236  	cost = cost / float64(size)
   237  
   238  	return &plan.Stats{
   239  		Outcnt:      cost,
   240  		Cost:        cost,
   241  		Rowsize:     float64(size),
   242  		Selectivity: 1,
   243  		TableCnt:    cost,
   244  		BlockNum:    int32(cost / float64(options.DefaultBlockMaxRows)),
   245  	}
   246  }