github.com/matrixorigin/matrixone@v1.2.0/pkg/sql/plan/external.go (about) 1 // Copyright 2022 Matrix Origin 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package plan 16 17 import ( 18 "bufio" 19 "context" 20 "encoding/json" 21 "io" 22 "strings" 23 24 "github.com/matrixorigin/matrixone/pkg/catalog" 25 "github.com/matrixorigin/matrixone/pkg/container/batch" 26 "github.com/matrixorigin/matrixone/pkg/container/types" 27 "github.com/matrixorigin/matrixone/pkg/container/vector" 28 "github.com/matrixorigin/matrixone/pkg/fileservice" 29 "github.com/matrixorigin/matrixone/pkg/pb/plan" 30 "github.com/matrixorigin/matrixone/pkg/sql/colexec" 31 "github.com/matrixorigin/matrixone/pkg/sql/parsers/tree" 32 "github.com/matrixorigin/matrixone/pkg/util/trace" 33 "github.com/matrixorigin/matrixone/pkg/vm/engine/tae/options" 34 "github.com/matrixorigin/matrixone/pkg/vm/process" 35 ) 36 37 var ( 38 STATEMENT_ACCOUNT = "account" 39 ) 40 41 //this file is duplicate with colexec/external/external.go , to avoid import cycle 42 43 func filterFileList(ctx context.Context, node *plan.Node, proc *process.Process, fileList []string, fileSize []int64) ([]string, []int64, error) { 44 return filterByAccountAndFilename(ctx, node, proc, fileList, fileSize) 45 } 46 47 func containColname(col string) bool { 48 return strings.Contains(col, STATEMENT_ACCOUNT) || strings.Contains(col, catalog.ExternalFilePath) 49 } 50 51 func judgeContainColname(expr *plan.Expr) bool { 52 expr_F, ok := expr.Expr.(*plan.Expr_F) 53 if !ok { 54 return false 55 } 56 if expr_F.F.Func.ObjName == "or" { 57 flag := true 58 for i := 0; i < len(expr_F.F.Args); i++ { 59 flag = flag && judgeContainColname(expr_F.F.Args[i]) 60 } 61 return flag 62 } 63 expr_Col, ok := expr_F.F.Args[0].Expr.(*plan.Expr_Col) 64 if ok && containColname(expr_Col.Col.Name) { 65 return true 66 } 67 for _, arg := range expr_F.F.Args { 68 if judgeContainColname(arg) { 69 return true 70 } 71 } 72 return false 73 } 74 75 func filterByAccountAndFilename(ctx context.Context, node *plan.Node, proc *process.Process, fileList []string, fileSize []int64) ([]string, []int64, error) { 76 _, span := trace.Start(ctx, "filterByAccountAndFilename") 77 defer span.End() 78 filterList := make([]*plan.Expr, 0) 79 filterList2 := make([]*plan.Expr, 0) 80 for i := 0; i < len(node.FilterList); i++ { 81 if judgeContainColname(node.FilterList[i]) { 82 filterList = append(filterList, node.FilterList[i]) 83 } else { 84 filterList2 = append(filterList2, node.FilterList[i]) 85 } 86 } 87 if len(filterList) == 0 { 88 return fileList, fileSize, nil 89 } 90 bat := makeFilepathBatch(node, proc, filterList, fileList) 91 filter := colexec.RewriteFilterExprList(filterList) 92 93 vec, err := colexec.EvalExpressionOnce(proc, filter, []*batch.Batch{bat}) 94 if err != nil { 95 return nil, fileSize, err 96 } 97 98 fileListTmp := make([]string, 0) 99 fileSizeTmp := make([]int64, 0) 100 bs := vector.MustFixedCol[bool](vec) 101 for i := 0; i < len(bs); i++ { 102 if bs[i] { 103 fileListTmp = append(fileListTmp, fileList[i]) 104 fileSizeTmp = append(fileSizeTmp, fileSize[i]) 105 } 106 } 107 vec.Free(proc.Mp()) 108 node.FilterList = filterList2 109 return fileListTmp, fileSizeTmp, nil 110 } 111 112 func makeFilepathBatch(node *plan.Node, proc *process.Process, filterList []*plan.Expr, fileList []string) *batch.Batch { 113 num := len(node.TableDef.Cols) 114 bat := &batch.Batch{ 115 Attrs: make([]string, num), 116 Vecs: make([]*vector.Vector, num), 117 Cnt: 1, 118 } 119 for i := 0; i < num; i++ { 120 bat.Attrs[i] = node.TableDef.Cols[i].Name 121 if bat.Attrs[i] == STATEMENT_ACCOUNT { 122 typ := types.New(types.T(node.TableDef.Cols[i].Typ.Id), node.TableDef.Cols[i].Typ.Width, node.TableDef.Cols[i].Typ.Scale) 123 vec, _ := proc.AllocVectorOfRows(typ, len(fileList), nil) 124 //vec.SetOriginal(false) 125 for j := 0; j < len(fileList); j++ { 126 vector.SetStringAt(vec, j, getAccountCol(fileList[j]), proc.Mp()) 127 } 128 bat.Vecs[i] = vec 129 } else if bat.Attrs[i] == catalog.ExternalFilePath { 130 typ := types.T_varchar.ToType() 131 vec, _ := proc.AllocVectorOfRows(typ, len(fileList), nil) 132 //vec.SetOriginal(false) 133 for j := 0; j < len(fileList); j++ { 134 vector.SetStringAt(vec, j, fileList[j], proc.Mp()) 135 } 136 bat.Vecs[i] = vec 137 } 138 } 139 bat.SetRowCount(len(fileList)) 140 return bat 141 } 142 143 func getAccountCol(filepath string) string { 144 pathDir := strings.Split(filepath, "/") 145 if len(pathDir) < 2 { 146 return "" 147 } 148 return pathDir[1] 149 } 150 151 func getExternalStats(node *plan.Node, builder *QueryBuilder) *Stats { 152 externScan := node.ExternScan 153 if externScan != nil && externScan.Type == tree.INLINE { 154 totolSize := len(externScan.Data) 155 lineSize := float64(0.0) 156 if externScan.Format == tree.CSV { 157 lineSize = float64(strings.Index(externScan.Data, "\n")) 158 } 159 160 if externScan.Format == tree.JSONLINE { 161 lineSize = GetRowSizeFromTableDef(node.GetTableDef(), true) * 0.8 162 } 163 164 if lineSize > 0 { 165 cost := float64(totolSize) / lineSize 166 return &plan.Stats{ 167 Outcnt: cost, 168 Cost: cost, 169 Rowsize: lineSize, 170 Selectivity: 1, 171 TableCnt: cost, 172 BlockNum: int32(cost / float64(options.DefaultBlockMaxRows)), 173 } 174 } 175 } 176 177 param := &tree.ExternParam{} 178 err := json.Unmarshal([]byte(node.TableDef.Createsql), param) 179 if err != nil || param.Local || param.ScanType == tree.S3 { 180 return DefaultHugeStats() 181 } 182 183 if param.ScanType == tree.S3 { 184 if err = InitS3Param(param); err != nil { 185 return DefaultHugeStats() 186 } 187 } else { 188 if err = InitInfileParam(param); err != nil { 189 return DefaultHugeStats() 190 } 191 } 192 193 param.FileService = builder.compCtx.GetProcess().FileService 194 param.Ctx = builder.compCtx.GetProcess().Ctx 195 _, spanReadDir := trace.Start(param.Ctx, "ReCalcNodeStats.ReadDir") 196 fileList, fileSize, err := ReadDir(param) 197 spanReadDir.End() 198 if err != nil { 199 return DefaultHugeStats() 200 } 201 fileList, fileSize, err = filterFileList(param.Ctx, node, builder.compCtx.GetProcess(), fileList, fileSize) 202 if err != nil { 203 return DefaultHugeStats() 204 } 205 if param.LoadFile && len(fileList) == 0 { 206 // all files filtered, return a default small stats 207 return DefaultStats() 208 } 209 var cost float64 210 for i := range fileSize { 211 cost += float64(fileSize[i]) 212 } 213 214 //read one line 215 fs, readPath, err := GetForETLWithType(param, param.Filepath) 216 if err != nil { 217 return DefaultHugeStats() 218 } 219 var r io.ReadCloser 220 vec := fileservice.IOVector{ 221 FilePath: readPath, 222 Entries: []fileservice.IOEntry{ 223 0: { 224 Offset: 0, 225 Size: -1, 226 ReadCloserForRead: &r, 227 }, 228 }, 229 } 230 if err = fs.Read(param.Ctx, &vec); err != nil { 231 return DefaultHugeStats() 232 } 233 r2 := bufio.NewReader(r) 234 line, _ := r2.ReadString('\n') 235 size := len(line) 236 cost = cost / float64(size) 237 238 return &plan.Stats{ 239 Outcnt: cost, 240 Cost: cost, 241 Rowsize: float64(size), 242 Selectivity: 1, 243 TableCnt: cost, 244 BlockNum: int32(cost / float64(options.DefaultBlockMaxRows)), 245 } 246 }