github.com/matrixorigin/matrixone@v0.7.0/pkg/vm/engine/tae/tables/jobs/mergeblocks.go

// Copyright 2021 Matrix Origin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package jobs

import (
	"context"
	"fmt"
	"time"
	"unsafe"

	"github.com/RoaringBitmap/roaring"
	"github.com/matrixorigin/matrixone/pkg/logutil"
	"github.com/matrixorigin/matrixone/pkg/vm/engine/tae/catalog"
	"github.com/matrixorigin/matrixone/pkg/vm/engine/tae/common"
	"github.com/matrixorigin/matrixone/pkg/vm/engine/tae/containers"
	"github.com/matrixorigin/matrixone/pkg/vm/engine/tae/dataio/blockio"
	"github.com/matrixorigin/matrixone/pkg/vm/engine/tae/iface/handle"
	"github.com/matrixorigin/matrixone/pkg/vm/engine/tae/iface/txnif"
	"github.com/matrixorigin/matrixone/pkg/vm/engine/tae/mergesort"
	"github.com/matrixorigin/matrixone/pkg/vm/engine/tae/model"
	"github.com/matrixorigin/matrixone/pkg/vm/engine/tae/tables/txnentries"
	"github.com/matrixorigin/matrixone/pkg/vm/engine/tae/tasks"
	"go.uber.org/zap/zapcore"
)

// CompactSegmentTaskFactory merges the non-appendable blocks of an appendable
// segment into a new non-appendable segment.
var CompactSegmentTaskFactory = func(mergedBlks []*catalog.BlockEntry, scheduler tasks.TaskScheduler) tasks.TxnTaskFactory {
	return func(ctx *tasks.Context, txn txnif.AsyncTxn) (tasks.Task, error) {
		mergedSegs := make([]*catalog.SegmentEntry, 1)
		mergedSegs[0] = mergedBlks[0].GetSegment()
		return NewMergeBlocksTask(ctx, txn, mergedBlks, mergedSegs, nil, scheduler)
	}
}
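
// Both CompactSegmentTaskFactory above and MergeBlocksIntoSegmentTaskFctory
// below return a tasks.TxnTaskFactory, i.e. a closure that builds the actual
// task once a transaction is available. A call site could look roughly like
// the sketch below; blks, scheduler, ctx and txn are placeholders, not names
// defined in this file:
//
//	factory := CompactSegmentTaskFactory(blks, scheduler)
//	task, err := factory(ctx, txn) // the returned tasks.Task is a *mergeBlocksTask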

var MergeBlocksIntoSegmentTaskFctory = func(mergedBlks []*catalog.BlockEntry, toSegEntry *catalog.SegmentEntry, scheduler tasks.TaskScheduler) tasks.TxnTaskFactory {
	if toSegEntry == nil {
		panic(tasks.ErrBadTaskRequestPara)
	}
	return func(ctx *tasks.Context, txn txnif.AsyncTxn) (tasks.Task, error) {
		return NewMergeBlocksTask(ctx, txn, mergedBlks, nil, toSegEntry, scheduler)
	}
}

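// mergeBlocksTask carries everything needed to merge a set of source blocks
// into one target segment: the transaction it runs in, the source and target
// catalog entries, the handles of the blocks being compacted, the delete
// masks collected while reading them, and the common.ID scopes returned by
// Scopes().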
type mergeBlocksTask struct {
	*tasks.BaseTask
	txn         txnif.AsyncTxn
	toSegEntry  *catalog.SegmentEntry
	createdSegs []*catalog.SegmentEntry
	mergedSegs  []*catalog.SegmentEntry
	mergedBlks  []*catalog.BlockEntry
	createdBlks []*catalog.BlockEntry
	compacted   []handle.Block
	rel         handle.Relation
	scheduler   tasks.TaskScheduler
	scopes      []common.ID
	deletes     []*roaring.Bitmap
}

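// NewMergeBlocksTask resolves the relation and the block handles touched by
// the merge through the given transaction, records one scope per source
// block, and wraps everything into a DataCompactionTask.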
func NewMergeBlocksTask(ctx *tasks.Context, txn txnif.AsyncTxn, mergedBlks []*catalog.BlockEntry, mergedSegs []*catalog.SegmentEntry, toSegEntry *catalog.SegmentEntry, scheduler tasks.TaskScheduler) (task *mergeBlocksTask, err error) {
	task = &mergeBlocksTask{
		txn:         txn,
		mergedBlks:  mergedBlks,
		mergedSegs:  mergedSegs,
		createdBlks: make([]*catalog.BlockEntry, 0),
		compacted:   make([]handle.Block, 0),
		scheduler:   scheduler,
		toSegEntry:  toSegEntry,
	}
	dbId := mergedBlks[0].GetSegment().GetTable().GetDB().ID
	database, err := txn.GetDatabaseByID(dbId)
	if err != nil {
		return
	}
	relId := mergedBlks[0].GetSegment().GetTable().ID
	task.rel, err = database.GetRelationByID(relId)
	if err != nil {
		return
	}
	for _, meta := range mergedBlks {
		seg, err := task.rel.GetSegment(meta.GetSegment().GetID())
		if err != nil {
			return nil, err
		}
		blk, err := seg.GetBlock(meta.GetID())
		if err != nil {
			return nil, err
		}
		task.compacted = append(task.compacted, blk)
		task.scopes = append(task.scopes, *meta.AsCommonID())
	}
	task.BaseTask = tasks.NewBaseTask(task, tasks.DataCompactionTask, ctx)
	return
}

func (task *mergeBlocksTask) Scopes() []common.ID { return task.scopes }

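// mergeColumn merges the vectors of one column from all source blocks into
// vectors laid out according to toLayout. When sorting is requested, the
// primary (sort key) column produces sortedIdx and the row mapping, while the
// other columns are shuffled with the already computed sortedIdx. Without a
// sort key the vectors are simply reshaped. The input vectors are closed here.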
func (task *mergeBlocksTask) mergeColumn(
	vecs []containers.Vector,
	sortedIdx *[]uint32,
	isPrimary bool,
	fromLayout,
	toLayout []uint32,
	sort bool) (column []containers.Vector, mapping []uint32) {
	if len(vecs) == 0 {
		return
	}
	if sort {
		if isPrimary {
			column, mapping = mergesort.MergeSortedColumn(vecs, sortedIdx, fromLayout, toLayout)
		} else {
			column = mergesort.ShuffleColumn(vecs, *sortedIdx, fromLayout, toLayout)
		}
	} else {
		column, mapping = task.mergeColumnWithOutSort(vecs, fromLayout, toLayout)
	}
	for _, vec := range vecs {
		vec.Close()
	}
	return
}

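// mergeColumnWithOutSort is the no-sort-key path: rows keep their original
// order, so the mapping is the identity and the vectors are only reshaped to
// match the target block layout.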
func (task *mergeBlocksTask) mergeColumnWithOutSort(column []containers.Vector, fromLayout, toLayout []uint32) (ret []containers.Vector, mapping []uint32) {
	totalLength := uint32(0)
	for _, i := range toLayout {
		totalLength += i
	}
	mapping = make([]uint32, totalLength)
	for i := range mapping {
		mapping[i] = uint32(i)
	}
	ret = mergesort.Reshape(column, fromLayout, toLayout)
	return
}

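// MarshalLogObject lists the merged and created block/segment ids so that the
// structured logs for this task show what was compacted into what.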
func (task *mergeBlocksTask) MarshalLogObject(enc zapcore.ObjectEncoder) (err error) {
	blks := ""
	for _, blk := range task.mergedBlks {
		blks = fmt.Sprintf("%s%d,", blks, blk.GetID())
	}
	enc.AddString("from-blks", blks)
	segs := ""
	for _, seg := range task.mergedSegs {
		segs = fmt.Sprintf("%s%d,", segs, seg.GetID())
	}
	enc.AddString("from-segs", segs)

	toblks := ""
	for _, blk := range task.createdBlks {
		toblks = fmt.Sprintf("%s%d,", toblks, blk.GetID())
	}
	if toblks != "" {
		enc.AddString("to-blks", toblks)
	}

	tosegs := ""
	for _, seg := range task.createdSegs {
		tosegs = fmt.Sprintf("%s%d,", tosegs, seg.GetID())
	}
	if tosegs != "" {
		enc.AddString("to-segs", tosegs)
	}
	return
}

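// Execute runs the merge: it resolves or creates the target segment, reads
// the sort-key column of every source block (recording delete masks and
// skipping blocks that are empty after deletes), merge-sorts that key and
// shuffles the remaining columns into block-sized batches, flushes the
// batches and their column indexes through blockio, soft-deletes the source
// blocks and segments, and logs a MergeBlocksEntry on the transaction.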
func (task *mergeBlocksTask) Execute() (err error) {
	logutil.Info("[Start] Mergeblocks", common.OperationField(task.Name()),
		common.OperandField(task))
	now := time.Now()
	var toSegEntry handle.Segment
	if task.toSegEntry == nil {
		if toSegEntry, err = task.rel.CreateNonAppendableSegment(false); err != nil {
			return err
		}
		task.toSegEntry = toSegEntry.GetMeta().(*catalog.SegmentEntry)
		task.toSegEntry.SetSorted()
		task.createdSegs = append(task.createdSegs, task.toSegEntry)
	} else {
		if toSegEntry, err = task.rel.GetSegment(task.toSegEntry.GetID()); err != nil {
			return
		}
	}

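	// Per-source-block bookkeeping: the sort-key vectors that still hold rows,
	// their row counts, their start offsets in the concatenated key, the ids
	// of the non-empty blocks, the delete masks, and the indexes of blocks
	// that turn out to be empty once deletes are applied.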
	schema := task.mergedBlks[0].GetSchema()
	var view *model.ColumnView
	sortVecs := make([]containers.Vector, 0)
	rows := make([]uint32, 0)
	skipBlks := make([]int, 0)
	length := 0
	fromAddr := make([]uint32, 0, len(task.compacted))
	ids := make([]*common.ID, 0, len(task.compacted))
	task.deletes = make([]*roaring.Bitmap, len(task.compacted))

	// Prepare sort key resources
	// If there's no sort key, use physical address key
	var sortColDef *catalog.ColDef
	if schema.HasSortKey() {
		sortColDef = schema.GetSingleSortKey()
	} else {
		sortColDef = schema.PhyAddrKey
	}
	logutil.Infof("Mergeblocks on sort column %s\n", sortColDef.Name)
	for i, block := range task.compacted {
		if view, err = block.GetColumnDataById(sortColDef.Idx, nil); err != nil {
			return
		}
		defer view.Close()
		task.deletes[i] = view.DeleteMask
		view.ApplyDeletes()
		vec := view.Orphan()
		defer vec.Close()
		if vec.Length() == 0 {
			skipBlks = append(skipBlks, i)
			continue
		}
		sortVecs = append(sortVecs, vec)
		rows = append(rows, uint32(vec.Length()))
		fromAddr = append(fromAddr, uint32(length))
		length += vec.Length()
		ids = append(ids, block.Fingerprint())
	}

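	// Decide how many rows go into each output block: fill whole blocks of
	// schema.BlockMaxRows rows and put the remainder into the last block.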
	to := make([]uint32, 0)
	maxrow := schema.BlockMaxRows
	totalRows := length
	for totalRows > 0 {
		if totalRows > int(maxrow) {
			to = append(to, maxrow)
			totalRows -= int(maxrow)
		} else {
			to = append(to, uint32(totalRows))
			break
		}
	}

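	// sortedIdx is backed by a manually allocated buffer: length*4 bytes are
	// reserved and the byte slice header is reinterpreted as a []uint32 with
	// length entries, avoiding an extra Go allocation for the merge.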
	// merge sort the sort key
	node, err := common.DefaultAllocator.Alloc(length * 4)
	if err != nil {
		panic(err)
	}
	buf := node[:length]
	defer common.DefaultAllocator.Free(node)
	sortedIdx := *(*[]uint32)(unsafe.Pointer(&buf))
	vecs, mapping := task.mergeColumn(sortVecs, &sortedIdx, true, rows, to, schema.HasSortKey())
	// logutil.Infof("mapping is %v", mapping)
	// logutil.Infof("sortedIdx is %v", sortedIdx)
	length = 0
	var blk handle.Block
	toAddr := make([]uint32, 0, len(vecs))
	// Prepare a placeholder non-appendable block and an empty output batch
	// for every block that will be written out.
	batchs := make([]*containers.Batch, 0)
	blockHandles := make([]handle.Block, 0)
	for _, vec := range vecs {
		toAddr = append(toAddr, uint32(length))
		length += vec.Length()
		blk, err = toSegEntry.CreateNonAppendableBlock()
		if err != nil {
			return err
		}
		task.createdBlks = append(task.createdBlks, blk.GetMeta().(*catalog.BlockEntry))
		blockHandles = append(blockHandles, blk)
		batch := containers.NewBatch()
		batchs = append(batchs, batch)
		vec.Close()
	}

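	// Merge the remaining columns. For each user-defined column the source
	// vectors are read again with deletes applied, merged with the same
	// sortedIdx and layout as the sort key, and appended to the per-block
	// output batches under the column name.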
	for _, def := range schema.ColDefs {
		// The physical address column is not copied from the source blocks;
		// it is regenerated for the newly created blocks.
		if def.IsPhyAddr() {
			continue
		}
		vecs = vecs[:0]
		for _, block := range task.compacted {
			if view, err = block.GetColumnDataById(def.Idx, nil); err != nil {
				return
			}
			defer view.Close()
			view.ApplyDeletes()
			vec := view.Orphan()
			if vec.Length() == 0 {
				continue
			}
			defer vec.Close()
			vecs = append(vecs, vec)
		}
		vecs, _ := task.mergeColumn(vecs, &sortedIdx, false, rows, to, schema.HasSortKey())
		for i := range vecs {
			defer vecs[i].Close()
		}
		for i, vec := range vecs {
			batchs[i].AddVector(def.Name, vec)
		}
	}

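	// Flush phase: write every batch as one block of a new object through the
	// blockio writer, build a column index for each non-physical-address
	// column (treating the single sort key as the primary key), then Sync to
	// persist the object.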
	phyAddr := schema.PhyAddrKey
	name := blockio.EncodeObjectName()
	writer := blockio.NewWriter(context.Background(), task.mergedBlks[0].GetBlockData().GetFs(), name)
	pkIdx := -1
	if schema.HasPK() {
		pkIdx = schema.GetSingleSortKeyIdx()
	}
	for _, bat := range batchs {
		block, err := writer.WriteBlock(bat)
		if err != nil {
			return err
		}
		for idx, vec := range bat.Vecs {
			if phyAddr.Idx == idx {
				continue
			}
			isPk := idx == pkIdx
			_, err = BuildColumnIndex(writer.GetWriter(), block, schema.ColDefs[idx], vec, isPk, isPk)
			if err != nil {
				return err
			}
		}
	}
	blocks, err := writer.Sync()
	if err != nil {
		return err
	}
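	// Bind each persisted block back to its catalog placeholder by encoding
	// the block's meta location and storing it on the corresponding handle.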
	var metaLoc string
	for i, block := range blocks {
		metaLoc, err = blockio.EncodeMetaLocWithObject(block.GetExtent(), uint32(batchs[i].Length()), blocks)
		if err != nil {
			return
		}
		if err = blockHandles[i].UpdateMetaLoc(metaLoc); err != nil {
			return
		}
	}
	for _, blk := range task.createdBlks {
		if err = blk.GetBlockData().Init(); err != nil {
			return err
		}
	}

	for _, compacted := range task.compacted {
		seg := compacted.GetSegment()
		if err = seg.SoftDeleteBlock(compacted.Fingerprint().BlockID); err != nil {
			return err
		}
	}
	for _, entry := range task.mergedSegs {
		if err = task.rel.SoftDeleteSegment(entry.GetID()); err != nil {
			return err
		}
	}

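	// Record the merge in the transaction: the MergeBlocksEntry captures the
	// source and created entries, the row mapping, the address offsets and
	// the delete masks collected above.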
	table := task.toSegEntry.GetTable()
	txnEntry := txnentries.NewMergeBlocksEntry(
		task.txn,
		task.rel,
		task.mergedSegs,
		task.createdSegs,
		task.mergedBlks,
		task.createdBlks,
		mapping,
		fromAddr,
		toAddr,
		task.deletes,
		skipBlks,
		task.scheduler)
	if err = task.txn.LogTxnEntry(table.GetDB().ID, table.ID, txnEntry, ids); err != nil {
		return err
	}

	logutil.Info("[Done] Mergeblocks",
		common.AnyField("txn-start-ts", task.txn.GetStartTS().ToString()),
		common.OperationField(task.Name()),
		common.OperandField(task),
		common.DurationField(time.Since(now)))
	return err
}