github.com/matrixorigin/matrixone@v1.2.0/pkg/util/export/etl/tae.go

     1  // Copyright 2022 Matrix Origin
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package etl
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"strconv"
    21  	"time"
    22  
    23  	"github.com/matrixorigin/matrixone/pkg/common/moerr"
    24  	"github.com/matrixorigin/matrixone/pkg/common/mpool"
    25  	"github.com/matrixorigin/matrixone/pkg/common/util"
    26  	"github.com/matrixorigin/matrixone/pkg/container/batch"
    27  	"github.com/matrixorigin/matrixone/pkg/container/bytejson"
    28  	"github.com/matrixorigin/matrixone/pkg/container/types"
    29  	"github.com/matrixorigin/matrixone/pkg/container/vector"
    30  	"github.com/matrixorigin/matrixone/pkg/fileservice"
    31  	"github.com/matrixorigin/matrixone/pkg/objectio"
    32  	"github.com/matrixorigin/matrixone/pkg/util/export/table"
    33  	"github.com/matrixorigin/matrixone/pkg/vm/engine/tae/blockio"
    34  )
    35  
    36  const BatchSize = 8192
    37  
    38  var _ table.RowWriter = (*TAEWriter)(nil)
    39  
    40  // TAEWriter implements table.RowWriter and writes data to a tae file.
    41  // Deprecated
    42  type TAEWriter struct {
    43  	ctx          context.Context
    44  	columnsTypes []types.Type
    45  	idxs         []uint16
    46  	batchSize    int
    47  	mp           *mpool.MPool
    48  	filename     string
    49  	fs           fileservice.FileService
    50  	//writer       objectio.Writer
    51  	writer *blockio.BlockWriter
    52  	rows   []*table.Row
    53  
    54  	flushRows int
    55  }
    56  
    57  // NewTAEWriter returns a new instance of TAEWriter
    58  // Deprecated
    59  func NewTAEWriter(ctx context.Context, tbl *table.Table, mp *mpool.MPool, filePath string, fs fileservice.FileService) *TAEWriter {
    60  	w := &TAEWriter{
    61  		ctx:       ctx,
    62  		batchSize: BatchSize,
    63  		mp:        mp,
    64  		filename:  filePath,
    65  		fs:        fs,
    66  		rows:      make([]*table.Row, 0, BatchSize),
    67  	}
    68  
    69  	w.idxs = make([]uint16, len(tbl.Columns))
    70  	for idx, c := range tbl.Columns {
    71  		w.columnsTypes = append(w.columnsTypes, c.ColType.ToType())
    72  		w.idxs[idx] = uint16(idx)
    73  	}
    74  	w.writer, _ = blockio.NewBlockWriter(fs, filePath)
    75  	return w
    76  }
    77  
    78  func newBatch(batchSize int, typs []types.Type, pool *mpool.MPool) *batch.Batch {
    79  	bat := batch.NewWithSize(len(typs))
    80  	for i, typ := range typs {
    81  		switch typ.Oid {
    82  		case types.T_datetime:
    83  			typ.Scale = 6
    84  		}
    85  		vec := vector.NewVec(typ)
    86  		if err := vec.PreExtend(batchSize, pool); err != nil {
    87  			panic(err)
    88  		}
    89  		vec.SetLength(batchSize)
    90  		//vec.SetOriginal(false)
    91  		bat.Vecs[i] = vec
    92  	}
    93  	return bat
    94  }
    95  
    96  func (w *TAEWriter) GetContent() string { return "" }
    97  
    98  // WriteStrings implements ETLWriter
    99  func (w *TAEWriter) WriteStrings(Line []string) error {
   100  	var elems = make([]table.ColumnField, len(w.columnsTypes))
   101  	for colIdx, typ := range w.columnsTypes {
   102  		field := Line[colIdx]
   103  		id := typ.Oid
   104  		switch id {
   105  		case types.T_int64:
   106  			val, err := strconv.ParseInt(field, 10, 64)
   107  			if err != nil {
   108  				// FIXME: this error aborts the merge; ideally the bad row would be skipped so the merge can continue
   109  				return moerr.NewInternalError(w.ctx, "the input value is not int64 type for column %d: %v, err: %s", colIdx, field, err)
   110  			}
   111  			elems[colIdx] = table.Int64Field(val)
   112  		case types.T_uint64:
   113  			val, err := strconv.ParseUint(field, 10, 64)
   114  			if err != nil {
   115  				return moerr.NewInternalError(w.ctx, "the input value is not uint64 type for column %d: %v, err: %s", colIdx, field, err)
   116  			}
   117  			elems[colIdx] = table.Uint64Field(val)
   118  		case types.T_float64:
   119  			val, err := strconv.ParseFloat(field, 64)
   120  			if err != nil {
   121  				return moerr.NewInternalError(w.ctx, "the input value is not float64 type for column %d: %v, err: %s", colIdx, field, err)
   122  			}
   123  			elems[colIdx] = table.Float64Field(val)
   124  		case types.T_char, types.T_varchar,
   125  			types.T_binary, types.T_varbinary, types.T_blob, types.T_text:
   126  			// TAEWriter is deprecated, so there is no need to add a T_array case here.
   127  			elems[colIdx] = table.StringField(field)
   128  		case types.T_json:
   129  			elems[colIdx] = table.StringField(field)
   130  		case types.T_datetime:
   131  			elems[colIdx] = table.StringField(field)
   132  		default:
   133  			elems[colIdx] = table.StringField(field)
   134  		}
   135  	}
   136  	row := table.NewRow()
   137  	row.Columns = elems
   138  	w.rows = append(w.rows, row)
   139  	return w.writeRows()
   140  }
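
// Editor's note: WriteStrings expects exactly one pre-formatted string per column, in
// column order. Numeric columns must parse via strconv; json and datetime columns are
// kept as strings here and only converted later in getOneRowData. A hypothetical call,
// with a purely illustrative column layout and values:
//
//	err := w.WriteStrings([]string{"42", "7", "3.14", "hello", `{"k":"v"}`, "2022-01-02 15:04:05.000000"})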
   141  
   142  // WriteRow implements ETLWriter
   143  func (w *TAEWriter) WriteRow(row *table.Row) error {
   144  	w.rows = append(w.rows, row.Clone())
   145  	return w.writeRows()
   146  }
   147  
   148  func (w *TAEWriter) writeRows() error {
   149  	if len(w.rows) >= w.batchSize {
   150  		if err := w.writeBatch(); err != nil {
   151  			return err
   152  		}
   153  	}
   154  	return nil
   155  }
   156  
   157  func (w *TAEWriter) writeBatch() error {
   158  	if len(w.rows) == 0 {
   159  		return nil
   160  	}
   161  	batch := newBatch(len(w.rows), w.columnsTypes, w.mp)
   162  	for rowId, row := range w.rows {
   163  		err := getOneRowData(w.ctx, batch, row.GetRawColumns(), rowId, w.columnsTypes, w.mp)
   164  		if err != nil {
   165  			return err
   166  		}
   167  	}
   168  	_, err := w.writer.WriteBatch(batch)
   169  	if err != nil {
   170  		return err
   171  	}
   172  	// track the number of rows flushed so flush() can detect an empty file
   173  	w.flushRows += len(w.rows)
   174  	// clean
   175  	for idx, row := range w.rows {
   176  		row.Free()
   177  		w.rows[idx] = nil
   178  	}
   179  	w.rows = w.rows[:0]
   180  	batch.Clean(w.mp)
   181  	return nil
   182  }
   183  
   184  func (w *TAEWriter) flush() error {
   185  	err := w.writeBatch()
   186  	if err != nil {
   187  		return err
   188  	}
   189  	_, _, err = w.writer.Sync(w.ctx)
   190  	if err != nil {
   191  		return err
   192  	}
   193  	if w.flushRows == 0 {
   194  		return moerr.NewEmptyRange(w.ctx, w.filename)
   195  	}
   196  	return nil
   197  }
   198  
   199  // FlushAndClose implements ETLWriter
   200  func (w *TAEWriter) FlushAndClose() (int, error) {
   201  	return 0, w.flush()
   202  }
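
// Editor's note: a minimal usage sketch of the (deprecated) TAEWriter, assuming the caller
// already has a *table.Table definition, an mpool and a fileservice.FileService at hand.
// The helper name writeRowsToTAE and the variable names are hypothetical, not part of this
// package's API.
func writeRowsToTAE(ctx context.Context, tbl *table.Table, mp *mpool.MPool, fs fileservice.FileService, path string, rows []*table.Row) error {
	w := NewTAEWriter(ctx, tbl, mp, path, fs)
	for _, r := range rows {
		// WriteRow clones and buffers the row; a batch is flushed automatically
		// once BatchSize rows have accumulated.
		if err := w.WriteRow(r); err != nil {
			return err
		}
	}
	// FlushAndClose writes any remaining buffered rows and syncs the object;
	// it returns moerr.NewEmptyRange if nothing was written at all.
	_, err := w.FlushAndClose()
	return err
}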
   203  
   204  func getOneRowData(ctx context.Context, bat *batch.Batch, Line []table.ColumnField, rowIdx int, typs []types.Type, mp *mpool.MPool) error {
   205  
   206  	for colIdx, typ := range typs {
   207  		field := Line[colIdx]
   208  		id := typ.Oid
   209  		vec := bat.Vecs[colIdx]
   210  		switch id {
   211  		case types.T_int64:
   212  			cols := vector.MustFixedCol[int64](vec)
   213  			cols[rowIdx] = field.Integer
   214  		case types.T_uint64:
   215  			cols := vector.MustFixedCol[uint64](vec)
   216  			cols[rowIdx] = uint64(field.Integer)
   217  		case types.T_float64:
   218  			cols := vector.MustFixedCol[float64](vec)
   219  			cols[rowIdx] = field.GetFloat64()
   220  		case types.T_char, types.T_varchar,
   221  			types.T_binary, types.T_varbinary, types.T_blob, types.T_text:
   222  			//TODO: How to handle T_array here?
   223  			switch field.Type {
   224  			case table.TVarchar, table.TText:
   225  				err := vector.SetStringAt(vec, rowIdx, field.String, mp)
   226  				if err != nil {
   227  					return err
   228  				}
   229  			case table.TBytes:
   230  				err := vector.SetBytesAt(vec, rowIdx, field.Bytes, mp)
   231  				if err != nil {
   232  					return err
   233  				}
   234  			case table.TUuid:
   235  				dst := field.EncodeUuid()
   236  				err := vector.SetBytesAt(vec, rowIdx, dst[:], mp)
   237  				if err != nil {
   238  					return err
   239  				}
   240  			default:
   241  				return moerr.NewInternalError(ctx, "unsupported string type %v", field.Type)
   242  			}
   243  		case types.T_json:
   244  			switch field.Type {
   245  			case table.TVarchar, table.TText:
   246  				// convert normal json-string to bytejson-bytes
   247  				jsonBytes, err := bytejson.ParseJsonByteFromString(field.String)
   248  				if err != nil {
   249  					return moerr.NewInternalError(ctx, "the input value is not json type for column %d: %v", colIdx, field)
   250  				}
   251  				err = vector.SetBytesAt(vec, rowIdx, jsonBytes, mp)
   252  				if err != nil {
   253  					return err
   254  				}
   255  			case table.TBytes:
   256  				val := field.Bytes
   257  				if len(val) == 0 {
   258  					val = util.UnsafeStringToBytes(field.String)
   259  				}
   260  				err := vector.SetBytesAt(vec, rowIdx, val, mp)
   261  				if err != nil {
   262  					return err
   263  				}
   264  			}
   265  
   266  		case types.T_datetime:
   267  			cols := vector.MustFixedCol[types.Datetime](vec)
   268  			switch field.Type {
   269  			case table.TDatetime:
   270  				var buf [64]byte
   271  				dst := field.EncodedDatetime(buf[:0])
   272  				d, err := types.ParseDatetime(string(dst), vec.GetType().Scale)
   273  				if err != nil {
   274  					return moerr.NewInternalError(ctx, "the input value is not Datetime type for column %d: %v", colIdx, field)
   275  				}
   276  				cols[rowIdx] = d
   277  			case table.TVarchar, table.TText:
   278  				datetimeStr := field.String
   279  				if len(datetimeStr) == 0 {
   280  					cols[rowIdx] = types.Datetime(0)
   281  				} else {
   282  					d, err := types.ParseDatetime(datetimeStr, vec.GetType().Scale)
   283  					if err != nil {
   284  						return moerr.NewInternalError(ctx, "the input value is not Datetime type for column %d: %v", colIdx, field)
   285  					}
   286  					cols[rowIdx] = d
   287  				}
   288  			default:
   289  				return moerr.NewInternalError(ctx, "unsupported datetime type %v", field.Type)
   290  			}
   291  		default:
   292  			return moerr.NewInternalError(ctx, "the value type %s is not supported", *vec.GetType())
   293  		}
   294  	}
   295  	return nil
   296  }
   297  
   298  // TAEReader reads rows back from a tae file.
   299  // Deprecated
   300  type TAEReader struct {
   301  	ctx      context.Context
   302  	filepath string
   303  	filesize int64
   304  	fs       fileservice.FileService
   305  	mp       *mpool.MPool
   306  	typs     []types.Type
   307  	idxs     []uint16
   308  
   309  	blockReader *blockio.BlockReader
   310  
   311  	bs       []objectio.BlockObject
   312  	batchs   []*batch.Batch
   313  	batchIdx int
   314  	rowIdx   int
   315  
   316  	release func()
   317  }
   318  
   319  // NewTaeReader returns a TAEReader.
   320  // Deprecated
   321  func NewTaeReader(ctx context.Context, tbl *table.Table, filePath string, filesize int64, fs fileservice.FileService, mp *mpool.MPool) (*TAEReader, error) {
   322  	var err error
   323  	r := &TAEReader{
   324  		ctx:      ctx,
   325  		filepath: filePath,
   326  		filesize: filesize,
   327  		fs:       fs,
   328  		mp:       mp,
   329  	}
   330  	r.idxs = make([]uint16, len(tbl.Columns))
   331  	for idx, c := range tbl.Columns {
   332  		r.typs = append(r.typs, c.ColType.ToType())
   333  		r.idxs[idx] = uint16(idx)
   334  	}
   335  	r.blockReader, err = blockio.NewFileReaderNoCache(r.fs, r.filepath)
   336  	if err != nil {
   337  		return nil, err
   338  	}
   339  	return r, nil
   340  }
   341  
   342  func (r *TAEReader) ReadAll(ctx context.Context) ([]*batch.Batch, error) {
   343  	if r.release != nil {
   344  		panic("ReadAll can only be called once")
   345  	}
   346  	ioVec, release, err := r.blockReader.LoadAllColumns(ctx, r.idxs, r.mp)
   347  	if err != nil {
   348  		return nil, err
   349  	}
   350  	r.release = release
   351  	r.batchs = append(r.batchs, ioVec...)
   352  	return r.batchs, nil
   353  }
   354  
   355  func (r *TAEReader) ReadLine() ([]string, error) {
   356  	var record = make([]string, len(r.idxs))
   357  	if r.batchIdx >= len(r.batchs) {
   358  		return nil, nil
   359  	}
   360  	if r.rowIdx >= r.batchs[r.batchIdx].Vecs[0].Length() {
   361  		r.batchIdx++
   362  		r.rowIdx = 0
   363  	}
   364  	if r.batchIdx >= len(r.batchs) || r.rowIdx >= r.batchs[r.batchIdx].Vecs[0].Length() {
   365  		return nil, nil
   366  	}
   367  	vecs := r.batchs[r.batchIdx].Vecs
   368  	for idx, vecIdx := range r.idxs {
   369  		val, err := ValToString(r.ctx, vecs[vecIdx], r.rowIdx)
   370  		if err != nil {
   371  			return nil, err
   372  		}
   373  		record[idx] = val
   374  	}
   375  	r.rowIdx++
   376  	return record, nil
   377  }
   378  
   379  func (r *TAEReader) ReadRow(row *table.Row) error {
   380  	panic("not implemented")
   381  }
   382  
   383  func (r *TAEReader) Close() {
   384  	for idx := range r.batchs {
   385  		// do NOT release these batches via mpool (e.g. r.batchs[idx].Clean(r.mp)); their buffers are freed by r.release below.
   386  		r.batchs[idx] = nil
   387  	}
   388  	r.batchs = nil
   389  	if r.release != nil {
   390  		r.release()
   391  	}
   392  }
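
// Editor's note: a minimal read-back sketch for the (deprecated) TAEReader, assuming the
// path and size of a previously written tae file are known. The helper name readTAELines
// is hypothetical and not part of this package's API.
func readTAELines(ctx context.Context, tbl *table.Table, path string, size int64, fs fileservice.FileService, mp *mpool.MPool) ([][]string, error) {
	r, err := NewTaeReader(ctx, tbl, path, size, fs, mp)
	if err != nil {
		return nil, err
	}
	defer r.Close()
	// ReadAll loads all requested columns of every block into memory in one shot;
	// ReadLine then walks the loaded batches row by row and returns nil, nil
	// once the data is exhausted.
	if _, err = r.ReadAll(ctx); err != nil {
		return nil, err
	}
	var lines [][]string
	for {
		line, err := r.ReadLine()
		if err != nil {
			return nil, err
		}
		if line == nil {
			break
		}
		lines = append(lines, line)
	}
	return lines, nil
}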
   393  
   394  func GetVectorArrayLen(ctx context.Context, vec *vector.Vector) (int, error) {
   395  	typ := vec.GetType()
   396  	switch typ.Oid {
   397  	case types.T_int64:
   398  		cols := vector.MustFixedCol[int64](vec)
   399  		return len(cols), nil
   400  	case types.T_uint64:
   401  		cols := vector.MustFixedCol[uint64](vec)
   402  		return len(cols), nil
   403  	case types.T_float64:
   404  		cols := vector.MustFixedCol[float64](vec)
   405  		return len(cols), nil
   406  	case types.T_char, types.T_varchar, types.T_binary, types.T_varbinary, types.T_blob, types.T_text,
   407  		types.T_array_float32, types.T_array_float64:
   408  		cols := vector.MustFixedCol[types.Varlena](vec)
   409  		return len(cols), nil
   410  	case types.T_json:
   411  		cols := vector.MustFixedCol[types.Varlena](vec)
   412  		return len(cols), nil
   413  	case types.T_datetime:
   414  		cols := vector.MustFixedCol[types.Datetime](vec)
   415  		return len(cols), nil
   416  	default:
   417  		return 0, moerr.NewInternalError(ctx, "the value type with oid %d is not supported", vec.GetType().Oid)
   418  	}
   419  }
   420  
   421  func ValToString(ctx context.Context, vec *vector.Vector, rowIdx int) (string, error) {
   422  	typ := vec.GetType()
   423  	switch typ.Oid {
   424  	case types.T_int64:
   425  		cols := vector.MustFixedCol[int64](vec)
   426  		return fmt.Sprintf("%d", cols[rowIdx]), nil
   427  	case types.T_uint64:
   428  		cols := vector.MustFixedCol[uint64](vec)
   429  		return fmt.Sprintf("%d", cols[rowIdx]), nil
   430  	case types.T_float64:
   431  		cols := vector.MustFixedCol[float64](vec)
   432  		return fmt.Sprintf("%f", cols[rowIdx]), nil
   433  	case types.T_char, types.T_varchar,
   434  		types.T_binary, types.T_varbinary, types.T_blob, types.T_text:
   435  		cols, area := vector.MustVarlenaRawData(vec)
   436  		return cols[rowIdx].GetString(area), nil
   437  	case types.T_array_float32:
   438  		cols, area := vector.MustVarlenaRawData(vec)
   439  		return types.ArrayToString[float32](types.GetArray[float32](&cols[rowIdx], area)), nil
   440  	case types.T_array_float64:
   441  		cols, area := vector.MustVarlenaRawData(vec)
   442  		return types.ArrayToString[float64](types.GetArray[float64](&cols[rowIdx], area)), nil
   443  	case types.T_json:
   444  		cols, area := vector.MustVarlenaRawData(vec)
   445  		val := cols[rowIdx].GetByteSlice(area)
   446  		bjson := types.DecodeJson(val)
   447  		return bjson.String(), nil
   448  	case types.T_datetime:
   449  		cols := vector.MustFixedCol[types.Datetime](vec)
   450  		return table.Time2DatetimeString(cols[rowIdx].ConvertToGoTime(time.Local)), nil
   451  	default:
   452  		return "", moerr.NewInternalError(ctx, "the value type with oid %d is not supported", vec.GetType().Oid)
   453  	}
   454  }
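
// Editor's note: a hypothetical sketch (not part of this package's API) showing how
// GetVectorArrayLen and ValToString can be combined to dump a single vector loaded by
// TAEReader.ReadAll as strings.
func dumpVector(ctx context.Context, vec *vector.Vector) ([]string, error) {
	n, err := GetVectorArrayLen(ctx, vec)
	if err != nil {
		return nil, err
	}
	out := make([]string, 0, n)
	for i := 0; i < n; i++ {
		// ValToString formats the value at row i according to the vector's type.
		s, err := ValToString(ctx, vec, i)
		if err != nil {
			return nil, err
		}
		out = append(out, s)
	}
	return out, nil
}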