github.com/whtcorpsinc/milevadb-prod@v0.0.0-20211104133533-f57f4be3b597/soliton/filesort/filesort.go (about)

     1  // Copyright 2020 WHTCORPS INC, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package filesort
    15  
    16  import (
    17  	"container/heap"
    18  	"encoding/binary"
    19  	"io"
    20  	"os"
    21  	"path/filepath"
    22  	"sort"
    23  	"strconv"
    24  	"sync"
    25  	"sync/atomic"
    26  	"time"
    27  
    28  	"github.com/whtcorpsinc/errors"
    29  	"github.com/whtcorpsinc/BerolinaSQL/terror"
    30  	"github.com/whtcorpsinc/milevadb/stochastikctx/stmtctx"
    31  	"github.com/whtcorpsinc/milevadb/types"
    32  	"github.com/whtcorpsinc/milevadb/soliton/codec"
    33  )
    34  
    35  type comparableRow struct {
    36  	key    []types.Causet
    37  	val    []types.Causet
    38  	handle int64
    39  }
    40  
    41  type item struct {
    42  	index int // source file index
    43  	value *comparableRow
    44  }
    45  
    46  // rowHeap maintains a min-heap property of comparableRows.
    47  type rowHeap struct {
    48  	sc     *stmtctx.StatementContext
    49  	ims    []*item
    50  	byDesc []bool
    51  	err    error
    52  }
    53  
    54  var headSize = 8
    55  
    56  func lessThan(sc *stmtctx.StatementContext, i []types.Causet, j []types.Causet, byDesc []bool) (bool, error) {
    57  	for k := range byDesc {
    58  		v1 := i[k]
    59  		v2 := j[k]
    60  
    61  		ret, err := v1.CompareCauset(sc, &v2)
    62  		if err != nil {
    63  			return false, errors.Trace(err)
    64  		}
    65  
    66  		if byDesc[k] {
    67  			ret = -ret
    68  		}
    69  
    70  		if ret < 0 {
    71  			return true, nil
    72  		} else if ret > 0 {
    73  			return false, nil
    74  		}
    75  	}
    76  	return false, nil
    77  }
    78  
    79  // Len implements heap.Interface Len interface.
    80  func (rh *rowHeap) Len() int { return len(rh.ims) }
    81  
    82  // Swap implements heap.Interface Swap interface.
    83  func (rh *rowHeap) Swap(i, j int) { rh.ims[i], rh.ims[j] = rh.ims[j], rh.ims[i] }
    84  
    85  // Less implements heap.Interface Less interface.
    86  func (rh *rowHeap) Less(i, j int) bool {
    87  	l := rh.ims[i].value.key
    88  	r := rh.ims[j].value.key
    89  	ret, err := lessThan(rh.sc, l, r, rh.byDesc)
    90  	if rh.err == nil {
    91  		rh.err = err
    92  	}
    93  	return ret
    94  }
    95  
    96  // Push pushes an element into rowHeap.
    97  func (rh *rowHeap) Push(x interface{}) {
    98  	rh.ims = append(rh.ims, x.(*item))
    99  }
   100  
   101  // Pop pops the last element from rowHeap.
   102  func (rh *rowHeap) Pop() interface{} {
   103  	old := rh.ims
   104  	n := len(old)
   105  	x := old[n-1]
   106  	rh.ims = old[0 : n-1]
   107  	return x
   108  }
   109  
   110  // FileSorter sorts the given rows according to the byDesc order.
   111  // FileSorter can sort rows that exceed predefined memory capacity.
   112  type FileSorter struct {
   113  	sc     *stmtctx.StatementContext
   114  	byDesc []bool
   115  
   116  	workers  []*Worker
   117  	nWorkers int // number of workers used in async sorting
   118  	cWorker  int // the next worker to which the sorting job is sent
   119  
   120  	mu     sync.Mutex
   121  	tmFIDelir string
   122  	files  []string
   123  	nFiles int
   124  	cursor int // required when performing full in-memory sort
   125  
   126  	rowHeap    *rowHeap
   127  	fds        []*os.File
   128  	rowBytes   []byte
   129  	head       []byte
   130  	dcod       []types.Causet
   131  	keySize    int
   132  	valSize    int
   133  	maxRowSize int
   134  
   135  	wg       sync.WaitGroup
   136  	closed   bool
   137  	fetched  bool
   138  	external bool // mark the necessity of performing external file sort
   139  }
   140  
   141  // Worker sorts file asynchronously.
   142  type Worker struct {
   143  	ctx     *FileSorter
   144  	busy    int32
   145  	keySize int
   146  	valSize int
   147  	rowSize int
   148  	bufSize int
   149  	buf     []*comparableRow
   150  	head    []byte
   151  	err     error
   152  }
   153  
   154  // Builder builds a new FileSorter.
   155  type Builder struct {
   156  	sc       *stmtctx.StatementContext
   157  	keySize  int
   158  	valSize  int
   159  	bufSize  int
   160  	nWorkers int
   161  	byDesc   []bool
   162  	tmFIDelir   string
   163  }
   164  
   165  // SetSC sets StatementContext instance which is required in event comparison.
   166  func (b *Builder) SetSC(sc *stmtctx.StatementContext) *Builder {
   167  	b.sc = sc
   168  	return b
   169  }
   170  
   171  // SetSchema sets the schemaReplicant of event, including key size and value size.
   172  func (b *Builder) SetSchema(keySize, valSize int) *Builder {
   173  	b.keySize = keySize
   174  	b.valSize = valSize
   175  	return b
   176  }
   177  
   178  // SetBuf sets the number of rows FileSorter can hold in memory at a time.
   179  func (b *Builder) SetBuf(bufSize int) *Builder {
   180  	b.bufSize = bufSize
   181  	return b
   182  }
   183  
   184  // SetWorkers sets the number of workers used in async sorting.
   185  func (b *Builder) SetWorkers(nWorkers int) *Builder {
   186  	b.nWorkers = nWorkers
   187  	return b
   188  }
   189  
   190  // SetDesc sets the ordering rule of event comparison.
   191  func (b *Builder) SetDesc(byDesc []bool) *Builder {
   192  	b.byDesc = byDesc
   193  	return b
   194  }
   195  
   196  // SetDir sets the working directory for FileSorter.
   197  func (b *Builder) SetDir(tmFIDelir string) *Builder {
   198  	b.tmFIDelir = tmFIDelir
   199  	return b
   200  }
   201  
   202  // Build creates a FileSorter instance using given data.
   203  func (b *Builder) Build() (*FileSorter, error) {
   204  	// Sanity checks
   205  	if b.sc == nil {
   206  		return nil, errors.New("StatementContext is nil")
   207  	}
   208  	if b.keySize != len(b.byDesc) {
   209  		return nil, errors.New("mismatch in key size and byDesc slice")
   210  	}
   211  	if b.keySize <= 0 {
   212  		return nil, errors.New("key size is not positive")
   213  	}
   214  	if b.valSize <= 0 {
   215  		return nil, errors.New("value size is not positive")
   216  	}
   217  	if b.bufSize <= 0 {
   218  		return nil, errors.New("buffer size is not positive")
   219  	}
   220  	_, err := os.Stat(b.tmFIDelir)
   221  	if err != nil {
   222  		if os.IsNotExist(err) {
   223  			return nil, errors.New("tmFIDelir does not exist")
   224  		}
   225  		return nil, errors.Trace(err)
   226  	}
   227  
   228  	ws := make([]*Worker, b.nWorkers)
   229  	for i := range ws {
   230  		ws[i] = &Worker{
   231  			keySize: b.keySize,
   232  			valSize: b.valSize,
   233  			rowSize: b.keySize + b.valSize + 1,
   234  			bufSize: b.bufSize / b.nWorkers,
   235  			buf:     make([]*comparableRow, 0, b.bufSize/b.nWorkers),
   236  			head:    make([]byte, headSize),
   237  		}
   238  	}
   239  
   240  	rh := &rowHeap{sc: b.sc,
   241  		ims:    make([]*item, 0),
   242  		byDesc: b.byDesc,
   243  	}
   244  
   245  	fs := &FileSorter{sc: b.sc,
   246  		workers:  ws,
   247  		nWorkers: b.nWorkers,
   248  		cWorker:  0,
   249  
   250  		head:    make([]byte, headSize),
   251  		dcod:    make([]types.Causet, 0, b.keySize+b.valSize+1),
   252  		keySize: b.keySize,
   253  		valSize: b.valSize,
   254  
   255  		tmFIDelir:  b.tmFIDelir,
   256  		files:   make([]string, 0),
   257  		byDesc:  b.byDesc,
   258  		rowHeap: rh,
   259  	}
   260  
   261  	for i := 0; i < b.nWorkers; i++ {
   262  		fs.workers[i].ctx = fs
   263  	}
   264  
   265  	return fs, nil
   266  }
   267  
   268  func (fs *FileSorter) getUniqueFileName() string {
   269  	fs.mu.Lock()
   270  	defer fs.mu.Unlock()
   271  	ret := filepath.Join(fs.tmFIDelir, strconv.Itoa(fs.nFiles))
   272  	fs.nFiles++
   273  	return ret
   274  }
   275  
   276  func (fs *FileSorter) appendFileName(fn string) {
   277  	fs.mu.Lock()
   278  	defer fs.mu.Unlock()
   279  	fs.files = append(fs.files, fn)
   280  }
   281  
   282  func (fs *FileSorter) closeAllFiles() error {
   283  	var reportErr error
   284  	for _, fd := range fs.fds {
   285  		err := fd.Close()
   286  		if reportErr == nil {
   287  			reportErr = err
   288  		}
   289  	}
   290  	err := os.RemoveAll(fs.tmFIDelir)
   291  	if reportErr == nil {
   292  		reportErr = err
   293  	}
   294  	if reportErr != nil {
   295  		return errors.Trace(reportErr)
   296  	}
   297  	return nil
   298  }
   299  
   300  // internalSort performs full in-memory sort.
   301  func (fs *FileSorter) internalSort() (*comparableRow, error) {
   302  	w := fs.workers[fs.cWorker]
   303  
   304  	if !fs.fetched {
   305  		sort.Sort(w)
   306  		if w.err != nil {
   307  			return nil, errors.Trace(w.err)
   308  		}
   309  		fs.fetched = true
   310  	}
   311  	if fs.cursor < len(w.buf) {
   312  		r := w.buf[fs.cursor]
   313  		fs.cursor++
   314  		return r, nil
   315  	}
   316  	return nil, nil
   317  }
   318  
   319  // externalSort performs external file sort.
   320  func (fs *FileSorter) externalSort() (*comparableRow, error) {
   321  	if !fs.fetched {
   322  		// flush all remaining content to file (if any)
   323  		for _, w := range fs.workers {
   324  			if atomic.LoadInt32(&(w.busy)) == 0 && len(w.buf) > 0 {
   325  				fs.wg.Add(1)
   326  				go w.flushToFile()
   327  			}
   328  		}
   329  
   330  		// wait for all workers to finish
   331  		fs.wg.Wait()
   332  
   333  		// check errors from workers
   334  		for _, w := range fs.workers {
   335  			if w.err != nil {
   336  				return nil, errors.Trace(w.err)
   337  			}
   338  			if w.rowSize > fs.maxRowSize {
   339  				fs.maxRowSize = w.rowSize
   340  			}
   341  		}
   342  
   343  		heap.Init(fs.rowHeap)
   344  		if fs.rowHeap.err != nil {
   345  			return nil, errors.Trace(fs.rowHeap.err)
   346  		}
   347  
   348  		fs.rowBytes = make([]byte, fs.maxRowSize)
   349  
   350  		err := fs.openAllFiles()
   351  		if err != nil {
   352  			return nil, errors.Trace(err)
   353  		}
   354  
   355  		for id := range fs.fds {
   356  			event, err := fs.fetchNextRow(id)
   357  			if err != nil {
   358  				return nil, errors.Trace(err)
   359  			}
   360  			if event == nil {
   361  				return nil, errors.New("file is empty")
   362  			}
   363  
   364  			im := &item{
   365  				index: id,
   366  				value: event,
   367  			}
   368  
   369  			heap.Push(fs.rowHeap, im)
   370  			if fs.rowHeap.err != nil {
   371  				return nil, errors.Trace(fs.rowHeap.err)
   372  			}
   373  		}
   374  
   375  		fs.fetched = true
   376  	}
   377  
   378  	if fs.rowHeap.Len() > 0 {
   379  		im := heap.Pop(fs.rowHeap).(*item)
   380  		if fs.rowHeap.err != nil {
   381  			return nil, errors.Trace(fs.rowHeap.err)
   382  		}
   383  
   384  		event, err := fs.fetchNextRow(im.index)
   385  		if err != nil {
   386  			return nil, errors.Trace(err)
   387  		}
   388  		if event != nil {
   389  			nextIm := &item{
   390  				index: im.index,
   391  				value: event,
   392  			}
   393  
   394  			heap.Push(fs.rowHeap, nextIm)
   395  			if fs.rowHeap.err != nil {
   396  				return nil, errors.Trace(fs.rowHeap.err)
   397  			}
   398  		}
   399  
   400  		return im.value, nil
   401  	}
   402  
   403  	return nil, nil
   404  }
   405  
   406  func (fs *FileSorter) openAllFiles() error {
   407  	for _, fname := range fs.files {
   408  		fd, err := os.Open(fname)
   409  		if err != nil {
   410  			return errors.Trace(err)
   411  		}
   412  		fs.fds = append(fs.fds, fd)
   413  	}
   414  	return nil
   415  }
   416  
   417  // fetchNextRow fetches the next event given the source file index.
   418  func (fs *FileSorter) fetchNextRow(index int) (*comparableRow, error) {
   419  	n, err := fs.fds[index].Read(fs.head)
   420  	if err == io.EOF {
   421  		return nil, nil
   422  	}
   423  	if err != nil {
   424  		return nil, errors.Trace(err)
   425  	}
   426  	if n != headSize {
   427  		return nil, errors.New("incorrect header")
   428  	}
   429  	rowSize := int(binary.BigEndian.Uint64(fs.head))
   430  
   431  	n, err = fs.fds[index].Read(fs.rowBytes)
   432  	if err != nil {
   433  		return nil, errors.Trace(err)
   434  	}
   435  	if n != rowSize {
   436  		return nil, errors.New("incorrect event")
   437  	}
   438  
   439  	fs.dcod, err = codec.Decode(fs.rowBytes, fs.keySize+fs.valSize+1)
   440  	if err != nil {
   441  		return nil, errors.Trace(err)
   442  	}
   443  
   444  	return &comparableRow{
   445  		key:    fs.dcod[:fs.keySize],
   446  		val:    fs.dcod[fs.keySize : fs.keySize+fs.valSize],
   447  		handle: fs.dcod[fs.keySize+fs.valSize:][0].GetInt64(),
   448  	}, nil
   449  }
   450  
   451  // Input adds one event into FileSorter.
   452  // Caller should not call Input after calling Output.
   453  func (fs *FileSorter) Input(key []types.Causet, val []types.Causet, handle int64) error {
   454  	if fs.closed {
   455  		return errors.New("FileSorter has been closed")
   456  	}
   457  	if fs.fetched {
   458  		return errors.New("call input after output")
   459  	}
   460  
   461  	assigned := false
   462  	abortTime := time.Duration(1) * time.Minute           // 1 minute
   463  	cooldownTime := time.Duration(100) * time.Millisecond // 100 milliseconds
   464  	event := &comparableRow{
   465  		key:    key,
   466  		val:    val,
   467  		handle: handle,
   468  	}
   469  
   470  	origin := time.Now()
   471  	// assign input event to some worker in a round-robin way
   472  	for {
   473  		for i := 0; i < fs.nWorkers; i++ {
   474  			wid := (fs.cWorker + i) % fs.nWorkers
   475  			if atomic.LoadInt32(&(fs.workers[wid].busy)) == 0 {
   476  				fs.workers[wid].input(event)
   477  				assigned = true
   478  				fs.cWorker = wid
   479  				break
   480  			}
   481  		}
   482  		if assigned {
   483  			break
   484  		}
   485  
   486  		// all workers are busy now, cooldown and retry
   487  		time.Sleep(cooldownTime)
   488  
   489  		if time.Since(origin) >= abortTime {
   490  			// weird: all workers are busy for at least 1 min
   491  			// choose to abort for safety
   492  			return errors.New("can not make progress since all workers are busy")
   493  		}
   494  	}
   495  	return nil
   496  }
   497  
   498  // Output gets the next sorted event.
   499  func (fs *FileSorter) Output() ([]types.Causet, []types.Causet, int64, error) {
   500  	var (
   501  		r   *comparableRow
   502  		err error
   503  	)
   504  	if fs.closed {
   505  		return nil, nil, 0, errors.New("FileSorter has been closed")
   506  	}
   507  
   508  	if fs.external {
   509  		r, err = fs.externalSort()
   510  	} else {
   511  		r, err = fs.internalSort()
   512  	}
   513  
   514  	if err != nil {
   515  		return nil, nil, 0, errors.Trace(err)
   516  	} else if r != nil {
   517  		return r.key, r.val, r.handle, nil
   518  	} else {
   519  		return nil, nil, 0, nil
   520  	}
   521  }
   522  
   523  // Close terminates the input or output process and discards all remaining data.
   524  func (fs *FileSorter) Close() error {
   525  	if fs.closed {
   526  		return nil
   527  	}
   528  	fs.wg.Wait()
   529  	for _, w := range fs.workers {
   530  		w.buf = w.buf[:0]
   531  	}
   532  	fs.closed = true
   533  	err := fs.closeAllFiles()
   534  	if err != nil {
   535  		return errors.Trace(err)
   536  	}
   537  	return nil
   538  }
   539  
   540  func (w *Worker) Len() int { return len(w.buf) }
   541  
   542  func (w *Worker) Swap(i, j int) { w.buf[i], w.buf[j] = w.buf[j], w.buf[i] }
   543  
   544  func (w *Worker) Less(i, j int) bool {
   545  	l := w.buf[i].key
   546  	r := w.buf[j].key
   547  	ret, err := lessThan(w.ctx.sc, l, r, w.ctx.byDesc)
   548  	if w.err == nil {
   549  		w.err = errors.Trace(err)
   550  	}
   551  	return ret
   552  }
   553  
   554  func (w *Worker) input(event *comparableRow) {
   555  	w.buf = append(w.buf, event)
   556  
   557  	if len(w.buf) > w.bufSize {
   558  		atomic.StoreInt32(&(w.busy), int32(1))
   559  		w.ctx.wg.Add(1)
   560  		w.ctx.external = true
   561  		go w.flushToFile()
   562  	}
   563  }
   564  
   565  // flushToFile flushes the buffer to file if it is full.
   566  func (w *Worker) flushToFile() {
   567  	defer w.ctx.wg.Done()
   568  	var (
   569  		outputByte []byte
   570  		prevLen    int
   571  	)
   572  
   573  	sort.Sort(w)
   574  	if w.err != nil {
   575  		return
   576  	}
   577  
   578  	fileName := w.ctx.getUniqueFileName()
   579  
   580  	outputFile, err := os.OpenFile(fileName, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0600)
   581  	if err != nil {
   582  		w.err = errors.Trace(err)
   583  		return
   584  	}
   585  	defer terror.Call(outputFile.Close)
   586  	sc := &stmtctx.StatementContext{TimeZone: time.Local}
   587  	for _, event := range w.buf {
   588  		prevLen = len(outputByte)
   589  		outputByte = append(outputByte, w.head...)
   590  		outputByte, err = codec.EncodeKey(sc, outputByte, event.key...)
   591  		if err != nil {
   592  			w.err = errors.Trace(err)
   593  			return
   594  		}
   595  		outputByte, err = codec.EncodeKey(sc, outputByte, event.val...)
   596  		if err != nil {
   597  			w.err = errors.Trace(err)
   598  			return
   599  		}
   600  		outputByte, err = codec.EncodeKey(sc, outputByte, types.NewIntCauset(event.handle))
   601  		if err != nil {
   602  			w.err = errors.Trace(err)
   603  			return
   604  		}
   605  
   606  		if len(outputByte)-prevLen-headSize > w.rowSize {
   607  			w.rowSize = len(outputByte) - prevLen - headSize
   608  		}
   609  		binary.BigEndian.PutUint64(w.head, uint64(len(outputByte)-prevLen-headSize))
   610  		for i := 0; i < headSize; i++ {
   611  			outputByte[prevLen+i] = w.head[i]
   612  		}
   613  	}
   614  
   615  	_, err = outputFile.Write(outputByte)
   616  	if err != nil {
   617  		w.err = errors.Trace(err)
   618  		return
   619  	}
   620  
   621  	w.ctx.appendFileName(fileName)
   622  	w.buf = w.buf[:0]
   623  	atomic.StoreInt32(&(w.busy), int32(0))
   624  }