// Source: github.com/matrixorigin/matrixone@v1.2.0/pkg/util/export/etl/tae.go

// Copyright 2022 Matrix Origin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package etl

import (
	"context"
	"fmt"
	"strconv"
	"time"

	"github.com/matrixorigin/matrixone/pkg/common/moerr"
	"github.com/matrixorigin/matrixone/pkg/common/mpool"
	"github.com/matrixorigin/matrixone/pkg/common/util"
	"github.com/matrixorigin/matrixone/pkg/container/batch"
	"github.com/matrixorigin/matrixone/pkg/container/bytejson"
	"github.com/matrixorigin/matrixone/pkg/container/types"
	"github.com/matrixorigin/matrixone/pkg/container/vector"
	"github.com/matrixorigin/matrixone/pkg/fileservice"
	"github.com/matrixorigin/matrixone/pkg/objectio"
	"github.com/matrixorigin/matrixone/pkg/util/export/table"
	"github.com/matrixorigin/matrixone/pkg/vm/engine/tae/blockio"
)

const BatchSize = 8192

var _ table.RowWriter = (*TAEWriter)(nil)

// TAEWriter implements table.RowWriter and writes data to a tae file.
41 // Deprecated 42 type TAEWriter struct { 43 ctx context.Context 44 columnsTypes []types.Type 45 idxs []uint16 46 batchSize int 47 mp *mpool.MPool 48 filename string 49 fs fileservice.FileService 50 //writer objectio.Writer 51 writer *blockio.BlockWriter 52 rows []*table.Row 53 54 flushRows int 55 } 56 57 // NewTAEWriter returns a new instance of TAEWriter 58 // Deprecated 59 func NewTAEWriter(ctx context.Context, tbl *table.Table, mp *mpool.MPool, filePath string, fs fileservice.FileService) *TAEWriter { 60 w := &TAEWriter{ 61 ctx: ctx, 62 batchSize: BatchSize, 63 mp: mp, 64 filename: filePath, 65 fs: fs, 66 rows: make([]*table.Row, 0, BatchSize), 67 } 68 69 w.idxs = make([]uint16, len(tbl.Columns)) 70 for idx, c := range tbl.Columns { 71 w.columnsTypes = append(w.columnsTypes, c.ColType.ToType()) 72 w.idxs[idx] = uint16(idx) 73 } 74 w.writer, _ = blockio.NewBlockWriter(fs, filePath) 75 return w 76 } 77 78 func newBatch(batchSize int, typs []types.Type, pool *mpool.MPool) *batch.Batch { 79 batch := batch.NewWithSize(len(typs)) 80 for i, typ := range typs { 81 switch typ.Oid { 82 case types.T_datetime: 83 typ.Scale = 6 84 } 85 vec := vector.NewVec(typ) 86 if err := vec.PreExtend(batchSize, pool); err != nil { 87 panic(err) 88 } 89 vec.SetLength(batchSize) 90 //vec.SetOriginal(false) 91 batch.Vecs[i] = vec 92 } 93 return batch 94 } 95 96 func (w *TAEWriter) GetContent() string { return "" } 97 98 // WriteStrings implement ETLWriter 99 func (w *TAEWriter) WriteStrings(Line []string) error { 100 var elems = make([]table.ColumnField, len(w.columnsTypes)) 101 for colIdx, typ := range w.columnsTypes { 102 field := Line[colIdx] 103 id := typ.Oid 104 switch id { 105 case types.T_int64: 106 val, err := strconv.ParseInt(field, 10, 64) 107 if err != nil { 108 // fixme: help merge to continue 109 return moerr.NewInternalError(w.ctx, "the input value is not int64 type for column %d: %v, err: %s", colIdx, field, err) 110 } 111 elems[colIdx] = table.Int64Field(val) 112 case 
types.T_uint64: 113 val, err := strconv.ParseUint(field, 10, 64) 114 if err != nil { 115 return moerr.NewInternalError(w.ctx, "the input value is not uint64 type for column %d: %v, err: %s", colIdx, field, err) 116 } 117 elems[colIdx] = table.Uint64Field(val) 118 case types.T_float64: 119 val, err := strconv.ParseFloat(field, 64) 120 if err != nil { 121 return moerr.NewInternalError(w.ctx, "the input value is not float64 type for column %d: %v, err: %s", colIdx, field, err) 122 } 123 elems[colIdx] = table.Float64Field(val) 124 case types.T_char, types.T_varchar, 125 types.T_binary, types.T_varbinary, types.T_blob, types.T_text: 126 //TAEWriter is deprecated. So no need to add T_array here. 127 elems[colIdx] = table.StringField(field) 128 case types.T_json: 129 elems[colIdx] = table.StringField(field) 130 case types.T_datetime: 131 elems[colIdx] = table.StringField(field) 132 default: 133 elems[colIdx] = table.StringField(field) 134 } 135 } 136 row := table.NewRow() 137 row.Columns = elems 138 w.rows = append(w.rows, row) 139 return w.writeRows() 140 } 141 142 // WriteRow implement ETLWriter 143 func (w *TAEWriter) WriteRow(row *table.Row) error { 144 w.rows = append(w.rows, row.Clone()) 145 return w.writeRows() 146 } 147 148 func (w *TAEWriter) writeRows() error { 149 if len(w.rows) >= w.batchSize { 150 if err := w.writeBatch(); err != nil { 151 return err 152 } 153 } 154 return nil 155 } 156 157 func (w *TAEWriter) writeBatch() error { 158 if len(w.rows) == 0 { 159 return nil 160 } 161 batch := newBatch(len(w.rows), w.columnsTypes, w.mp) 162 for rowId, row := range w.rows { 163 err := getOneRowData(w.ctx, batch, row.GetRawColumns(), rowId, w.columnsTypes, w.mp) 164 if err != nil { 165 return err 166 } 167 } 168 _, err := w.writer.WriteBatch(batch) 169 if err != nil { 170 return err 171 } 172 // check if empty 173 w.flushRows += len(w.rows) 174 // clean 175 for idx, row := range w.rows { 176 row.Free() 177 w.rows[idx] = nil 178 } 179 w.rows = w.rows[:0] 180 
batch.Clean(w.mp) 181 return nil 182 } 183 184 func (w *TAEWriter) flush() error { 185 err := w.writeBatch() 186 if err != nil { 187 return err 188 } 189 _, _, err = w.writer.Sync(w.ctx) 190 if err != nil { 191 return err 192 } 193 if w.flushRows == 0 { 194 return moerr.NewEmptyRange(w.ctx, w.filename) 195 } 196 return nil 197 } 198 199 // FlushAndClose implement ETLWriter 200 func (w *TAEWriter) FlushAndClose() (int, error) { 201 return 0, w.flush() 202 } 203 204 func getOneRowData(ctx context.Context, bat *batch.Batch, Line []table.ColumnField, rowIdx int, typs []types.Type, mp *mpool.MPool) error { 205 206 for colIdx, typ := range typs { 207 field := Line[colIdx] 208 id := typ.Oid 209 vec := bat.Vecs[colIdx] 210 switch id { 211 case types.T_int64: 212 cols := vector.MustFixedCol[int64](vec) 213 cols[rowIdx] = field.Integer 214 case types.T_uint64: 215 cols := vector.MustFixedCol[uint64](vec) 216 cols[rowIdx] = uint64(field.Integer) 217 case types.T_float64: 218 cols := vector.MustFixedCol[float64](vec) 219 cols[rowIdx] = field.GetFloat64() 220 case types.T_char, types.T_varchar, 221 types.T_binary, types.T_varbinary, types.T_blob, types.T_text: 222 //TODO: How to handle T_array here? 
223 switch field.Type { 224 case table.TVarchar, table.TText: 225 err := vector.SetStringAt(vec, rowIdx, field.String, mp) 226 if err != nil { 227 return err 228 } 229 case table.TBytes: 230 err := vector.SetBytesAt(vec, rowIdx, field.Bytes, mp) 231 if err != nil { 232 return err 233 } 234 case table.TUuid: 235 dst := field.EncodeUuid() 236 err := vector.SetBytesAt(vec, rowIdx, dst[:], mp) 237 if err != nil { 238 return err 239 } 240 default: 241 return moerr.NewInternalError(ctx, "not Support string type %v", field.Type) 242 } 243 case types.T_json: 244 switch field.Type { 245 case table.TVarchar, table.TText: 246 // convert normal json-string to bytejson-bytes 247 jsonBytes, err := bytejson.ParseJsonByteFromString(field.String) 248 if err != nil { 249 return moerr.NewInternalError(ctx, "the input value is not json type for column %d: %v", colIdx, field) 250 } 251 err = vector.SetBytesAt(vec, rowIdx, jsonBytes, mp) 252 if err != nil { 253 return err 254 } 255 case table.TBytes: 256 val := field.Bytes 257 if len(val) == 0 { 258 val = util.UnsafeStringToBytes(field.String) 259 } 260 err := vector.SetBytesAt(vec, rowIdx, val, mp) 261 if err != nil { 262 return err 263 } 264 } 265 266 case types.T_datetime: 267 cols := vector.MustFixedCol[types.Datetime](vec) 268 switch field.Type { 269 case table.TDatetime: 270 var buf [64]byte 271 dst := field.EncodedDatetime(buf[:0]) 272 d, err := types.ParseDatetime(string(dst), vec.GetType().Scale) 273 if err != nil { 274 return moerr.NewInternalError(ctx, "the input value is not Datetime type for column %d: %v", colIdx, field) 275 } 276 cols[rowIdx] = d 277 case table.TVarchar, table.TText: 278 datetimeStr := field.String 279 if len(datetimeStr) == 0 { 280 cols[rowIdx] = types.Datetime(0) 281 } else { 282 d, err := types.ParseDatetime(datetimeStr, vec.GetType().Scale) 283 if err != nil { 284 return moerr.NewInternalError(ctx, "the input value is not Datetime type for column %d: %v", colIdx, field) 285 } 286 cols[rowIdx] = d 287 
} 288 default: 289 return moerr.NewInternalError(ctx, "not Support datetime type %v", field.Type) 290 } 291 default: 292 return moerr.NewInternalError(ctx, "the value type %s is not support now", *vec.GetType()) 293 } 294 } 295 return nil 296 } 297 298 // TAEReader implements the io.Reader interface for reading a tae file. 299 // Deprecated 300 type TAEReader struct { 301 ctx context.Context 302 filepath string 303 filesize int64 304 fs fileservice.FileService 305 mp *mpool.MPool 306 typs []types.Type 307 idxs []uint16 308 309 blockReader *blockio.BlockReader 310 311 bs []objectio.BlockObject 312 batchs []*batch.Batch 313 batchIdx int 314 rowIdx int 315 316 release func() 317 } 318 319 // NewTaeReader returns a TAEReader. 320 // Deprecated 321 func NewTaeReader(ctx context.Context, tbl *table.Table, filePath string, filesize int64, fs fileservice.FileService, mp *mpool.MPool) (*TAEReader, error) { 322 var err error 323 r := &TAEReader{ 324 ctx: ctx, 325 filepath: filePath, 326 filesize: filesize, 327 fs: fs, 328 mp: mp, 329 } 330 r.idxs = make([]uint16, len(tbl.Columns)) 331 for idx, c := range tbl.Columns { 332 r.typs = append(r.typs, c.ColType.ToType()) 333 r.idxs[idx] = uint16(idx) 334 } 335 r.blockReader, err = blockio.NewFileReaderNoCache(r.fs, r.filepath) 336 if err != nil { 337 return nil, err 338 } 339 return r, nil 340 } 341 342 func (r *TAEReader) ReadAll(ctx context.Context) ([]*batch.Batch, error) { 343 if r.release != nil { 344 panic("can only call once") 345 } 346 ioVec, release, err := r.blockReader.LoadAllColumns(ctx, r.idxs, r.mp) 347 if err != nil { 348 return nil, err 349 } 350 r.release = release 351 r.batchs = append(r.batchs, ioVec...) 
352 return r.batchs, nil 353 } 354 355 func (r *TAEReader) ReadLine() ([]string, error) { 356 var record = make([]string, len(r.idxs)) 357 if r.batchIdx >= len(r.batchs) { 358 return nil, nil 359 } 360 if r.rowIdx >= r.batchs[r.batchIdx].Vecs[0].Length() { 361 r.batchIdx++ 362 r.rowIdx = 0 363 } 364 if r.batchIdx >= len(r.batchs) || r.rowIdx >= r.batchs[r.batchIdx].Vecs[0].Length() { 365 return nil, nil 366 } 367 vecs := r.batchs[r.batchIdx].Vecs 368 for idx, vecIdx := range r.idxs { 369 val, err := ValToString(r.ctx, vecs[vecIdx], r.rowIdx) 370 if err != nil { 371 return nil, err 372 } 373 record[idx] = val 374 } 375 r.rowIdx++ 376 return record, nil 377 } 378 379 func (r *TAEReader) ReadRow(row *table.Row) error { 380 panic("NOT implement") 381 } 382 383 func (r *TAEReader) Close() { 384 for idx := range r.batchs { 385 // do NOT release it in mpool (like r.batchs[idx].Clean(r.mp)). right now, the buffer is new one. 386 r.batchs[idx] = nil 387 } 388 r.batchs = nil 389 if r.release != nil { 390 r.release() 391 } 392 } 393 394 func GetVectorArrayLen(ctx context.Context, vec *vector.Vector) (int, error) { 395 typ := vec.GetType() 396 switch typ.Oid { 397 case types.T_int64: 398 cols := vector.MustFixedCol[int64](vec) 399 return len(cols), nil 400 case types.T_uint64: 401 cols := vector.MustFixedCol[uint64](vec) 402 return len(cols), nil 403 case types.T_float64: 404 cols := vector.MustFixedCol[float64](vec) 405 return len(cols), nil 406 case types.T_char, types.T_varchar, types.T_binary, types.T_varbinary, types.T_blob, types.T_text, 407 types.T_array_float32, types.T_array_float64: 408 cols := vector.MustFixedCol[types.Varlena](vec) 409 return len(cols), nil 410 case types.T_json: 411 cols := vector.MustFixedCol[types.Varlena](vec) 412 return len(cols), nil 413 case types.T_datetime: 414 cols := vector.MustFixedCol[types.Datetime](vec) 415 return len(cols), nil 416 default: 417 return 0, moerr.NewInternalError(ctx, "the value type with oid %d is not support now", 
vec.GetType().Oid) 418 } 419 } 420 421 func ValToString(ctx context.Context, vec *vector.Vector, rowIdx int) (string, error) { 422 typ := vec.GetType() 423 switch typ.Oid { 424 case types.T_int64: 425 cols := vector.MustFixedCol[int64](vec) 426 return fmt.Sprintf("%d", cols[rowIdx]), nil 427 case types.T_uint64: 428 cols := vector.MustFixedCol[uint64](vec) 429 return fmt.Sprintf("%d", cols[rowIdx]), nil 430 case types.T_float64: 431 cols := vector.MustFixedCol[float64](vec) 432 return fmt.Sprintf("%f", cols[rowIdx]), nil 433 case types.T_char, types.T_varchar, 434 types.T_binary, types.T_varbinary, types.T_blob, types.T_text: 435 cols, area := vector.MustVarlenaRawData(vec) 436 return cols[rowIdx].GetString(area), nil 437 case types.T_array_float32: 438 cols, area := vector.MustVarlenaRawData(vec) 439 return types.ArrayToString[float32](types.GetArray[float32](&cols[rowIdx], area)), nil 440 case types.T_array_float64: 441 cols, area := vector.MustVarlenaRawData(vec) 442 return types.ArrayToString[float64](types.GetArray[float64](&cols[rowIdx], area)), nil 443 case types.T_json: 444 cols, area := vector.MustVarlenaRawData(vec) 445 val := cols[rowIdx].GetByteSlice(area) 446 bjson := types.DecodeJson(val) 447 return bjson.String(), nil 448 case types.T_datetime: 449 cols := vector.MustFixedCol[types.Datetime](vec) 450 return table.Time2DatetimeString(cols[rowIdx].ConvertToGoTime(time.Local)), nil 451 default: 452 return "", moerr.NewInternalError(ctx, "the value type with oid %d is not support now", vec.GetType().Oid) 453 } 454 }