github.com/whtcorpsinc/milevadb-prod@v0.0.0-20211104133533-f57f4be3b597/soliton/filesort/filesort.go (about) 1 // Copyright 2020 WHTCORPS INC, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package filesort 15 16 import ( 17 "container/heap" 18 "encoding/binary" 19 "io" 20 "os" 21 "path/filepath" 22 "sort" 23 "strconv" 24 "sync" 25 "sync/atomic" 26 "time" 27 28 "github.com/whtcorpsinc/errors" 29 "github.com/whtcorpsinc/BerolinaSQL/terror" 30 "github.com/whtcorpsinc/milevadb/stochastikctx/stmtctx" 31 "github.com/whtcorpsinc/milevadb/types" 32 "github.com/whtcorpsinc/milevadb/soliton/codec" 33 ) 34 35 type comparableRow struct { 36 key []types.Causet 37 val []types.Causet 38 handle int64 39 } 40 41 type item struct { 42 index int // source file index 43 value *comparableRow 44 } 45 46 // rowHeap maintains a min-heap property of comparableRows. 47 type rowHeap struct { 48 sc *stmtctx.StatementContext 49 ims []*item 50 byDesc []bool 51 err error 52 } 53 54 var headSize = 8 55 56 func lessThan(sc *stmtctx.StatementContext, i []types.Causet, j []types.Causet, byDesc []bool) (bool, error) { 57 for k := range byDesc { 58 v1 := i[k] 59 v2 := j[k] 60 61 ret, err := v1.CompareCauset(sc, &v2) 62 if err != nil { 63 return false, errors.Trace(err) 64 } 65 66 if byDesc[k] { 67 ret = -ret 68 } 69 70 if ret < 0 { 71 return true, nil 72 } else if ret > 0 { 73 return false, nil 74 } 75 } 76 return false, nil 77 } 78 79 // Len implements heap.Interface Len interface. 80 func (rh *rowHeap) Len() int { return len(rh.ims) } 81 82 // Swap implements heap.Interface Swap interface. 83 func (rh *rowHeap) Swap(i, j int) { rh.ims[i], rh.ims[j] = rh.ims[j], rh.ims[i] } 84 85 // Less implements heap.Interface Less interface. 86 func (rh *rowHeap) Less(i, j int) bool { 87 l := rh.ims[i].value.key 88 r := rh.ims[j].value.key 89 ret, err := lessThan(rh.sc, l, r, rh.byDesc) 90 if rh.err == nil { 91 rh.err = err 92 } 93 return ret 94 } 95 96 // Push pushes an element into rowHeap. 97 func (rh *rowHeap) Push(x interface{}) { 98 rh.ims = append(rh.ims, x.(*item)) 99 } 100 101 // Pop pops the last element from rowHeap. 102 func (rh *rowHeap) Pop() interface{} { 103 old := rh.ims 104 n := len(old) 105 x := old[n-1] 106 rh.ims = old[0 : n-1] 107 return x 108 } 109 110 // FileSorter sorts the given rows according to the byDesc order. 111 // FileSorter can sort rows that exceed predefined memory capacity. 112 type FileSorter struct { 113 sc *stmtctx.StatementContext 114 byDesc []bool 115 116 workers []*Worker 117 nWorkers int // number of workers used in async sorting 118 cWorker int // the next worker to which the sorting job is sent 119 120 mu sync.Mutex 121 tmFIDelir string 122 files []string 123 nFiles int 124 cursor int // required when performing full in-memory sort 125 126 rowHeap *rowHeap 127 fds []*os.File 128 rowBytes []byte 129 head []byte 130 dcod []types.Causet 131 keySize int 132 valSize int 133 maxRowSize int 134 135 wg sync.WaitGroup 136 closed bool 137 fetched bool 138 external bool // mark the necessity of performing external file sort 139 } 140 141 // Worker sorts file asynchronously. 142 type Worker struct { 143 ctx *FileSorter 144 busy int32 145 keySize int 146 valSize int 147 rowSize int 148 bufSize int 149 buf []*comparableRow 150 head []byte 151 err error 152 } 153 154 // Builder builds a new FileSorter. 155 type Builder struct { 156 sc *stmtctx.StatementContext 157 keySize int 158 valSize int 159 bufSize int 160 nWorkers int 161 byDesc []bool 162 tmFIDelir string 163 } 164 165 // SetSC sets StatementContext instance which is required in event comparison. 166 func (b *Builder) SetSC(sc *stmtctx.StatementContext) *Builder { 167 b.sc = sc 168 return b 169 } 170 171 // SetSchema sets the schemaReplicant of event, including key size and value size. 172 func (b *Builder) SetSchema(keySize, valSize int) *Builder { 173 b.keySize = keySize 174 b.valSize = valSize 175 return b 176 } 177 178 // SetBuf sets the number of rows FileSorter can hold in memory at a time. 179 func (b *Builder) SetBuf(bufSize int) *Builder { 180 b.bufSize = bufSize 181 return b 182 } 183 184 // SetWorkers sets the number of workers used in async sorting. 185 func (b *Builder) SetWorkers(nWorkers int) *Builder { 186 b.nWorkers = nWorkers 187 return b 188 } 189 190 // SetDesc sets the ordering rule of event comparison. 191 func (b *Builder) SetDesc(byDesc []bool) *Builder { 192 b.byDesc = byDesc 193 return b 194 } 195 196 // SetDir sets the working directory for FileSorter. 197 func (b *Builder) SetDir(tmFIDelir string) *Builder { 198 b.tmFIDelir = tmFIDelir 199 return b 200 } 201 202 // Build creates a FileSorter instance using given data. 203 func (b *Builder) Build() (*FileSorter, error) { 204 // Sanity checks 205 if b.sc == nil { 206 return nil, errors.New("StatementContext is nil") 207 } 208 if b.keySize != len(b.byDesc) { 209 return nil, errors.New("mismatch in key size and byDesc slice") 210 } 211 if b.keySize <= 0 { 212 return nil, errors.New("key size is not positive") 213 } 214 if b.valSize <= 0 { 215 return nil, errors.New("value size is not positive") 216 } 217 if b.bufSize <= 0 { 218 return nil, errors.New("buffer size is not positive") 219 } 220 _, err := os.Stat(b.tmFIDelir) 221 if err != nil { 222 if os.IsNotExist(err) { 223 return nil, errors.New("tmFIDelir does not exist") 224 } 225 return nil, errors.Trace(err) 226 } 227 228 ws := make([]*Worker, b.nWorkers) 229 for i := range ws { 230 ws[i] = &Worker{ 231 keySize: b.keySize, 232 valSize: b.valSize, 233 rowSize: b.keySize + b.valSize + 1, 234 bufSize: b.bufSize / b.nWorkers, 235 buf: make([]*comparableRow, 0, b.bufSize/b.nWorkers), 236 head: make([]byte, headSize), 237 } 238 } 239 240 rh := &rowHeap{sc: b.sc, 241 ims: make([]*item, 0), 242 byDesc: b.byDesc, 243 } 244 245 fs := &FileSorter{sc: b.sc, 246 workers: ws, 247 nWorkers: b.nWorkers, 248 cWorker: 0, 249 250 head: make([]byte, headSize), 251 dcod: make([]types.Causet, 0, b.keySize+b.valSize+1), 252 keySize: b.keySize, 253 valSize: b.valSize, 254 255 tmFIDelir: b.tmFIDelir, 256 files: make([]string, 0), 257 byDesc: b.byDesc, 258 rowHeap: rh, 259 } 260 261 for i := 0; i < b.nWorkers; i++ { 262 fs.workers[i].ctx = fs 263 } 264 265 return fs, nil 266 } 267 268 func (fs *FileSorter) getUniqueFileName() string { 269 fs.mu.Lock() 270 defer fs.mu.Unlock() 271 ret := filepath.Join(fs.tmFIDelir, strconv.Itoa(fs.nFiles)) 272 fs.nFiles++ 273 return ret 274 } 275 276 func (fs *FileSorter) appendFileName(fn string) { 277 fs.mu.Lock() 278 defer fs.mu.Unlock() 279 fs.files = append(fs.files, fn) 280 } 281 282 func (fs *FileSorter) closeAllFiles() error { 283 var reportErr error 284 for _, fd := range fs.fds { 285 err := fd.Close() 286 if reportErr == nil { 287 reportErr = err 288 } 289 } 290 err := os.RemoveAll(fs.tmFIDelir) 291 if reportErr == nil { 292 reportErr = err 293 } 294 if reportErr != nil { 295 return errors.Trace(reportErr) 296 } 297 return nil 298 } 299 300 // internalSort performs full in-memory sort. 301 func (fs *FileSorter) internalSort() (*comparableRow, error) { 302 w := fs.workers[fs.cWorker] 303 304 if !fs.fetched { 305 sort.Sort(w) 306 if w.err != nil { 307 return nil, errors.Trace(w.err) 308 } 309 fs.fetched = true 310 } 311 if fs.cursor < len(w.buf) { 312 r := w.buf[fs.cursor] 313 fs.cursor++ 314 return r, nil 315 } 316 return nil, nil 317 } 318 319 // externalSort performs external file sort. 320 func (fs *FileSorter) externalSort() (*comparableRow, error) { 321 if !fs.fetched { 322 // flush all remaining content to file (if any) 323 for _, w := range fs.workers { 324 if atomic.LoadInt32(&(w.busy)) == 0 && len(w.buf) > 0 { 325 fs.wg.Add(1) 326 go w.flushToFile() 327 } 328 } 329 330 // wait for all workers to finish 331 fs.wg.Wait() 332 333 // check errors from workers 334 for _, w := range fs.workers { 335 if w.err != nil { 336 return nil, errors.Trace(w.err) 337 } 338 if w.rowSize > fs.maxRowSize { 339 fs.maxRowSize = w.rowSize 340 } 341 } 342 343 heap.Init(fs.rowHeap) 344 if fs.rowHeap.err != nil { 345 return nil, errors.Trace(fs.rowHeap.err) 346 } 347 348 fs.rowBytes = make([]byte, fs.maxRowSize) 349 350 err := fs.openAllFiles() 351 if err != nil { 352 return nil, errors.Trace(err) 353 } 354 355 for id := range fs.fds { 356 event, err := fs.fetchNextRow(id) 357 if err != nil { 358 return nil, errors.Trace(err) 359 } 360 if event == nil { 361 return nil, errors.New("file is empty") 362 } 363 364 im := &item{ 365 index: id, 366 value: event, 367 } 368 369 heap.Push(fs.rowHeap, im) 370 if fs.rowHeap.err != nil { 371 return nil, errors.Trace(fs.rowHeap.err) 372 } 373 } 374 375 fs.fetched = true 376 } 377 378 if fs.rowHeap.Len() > 0 { 379 im := heap.Pop(fs.rowHeap).(*item) 380 if fs.rowHeap.err != nil { 381 return nil, errors.Trace(fs.rowHeap.err) 382 } 383 384 event, err := fs.fetchNextRow(im.index) 385 if err != nil { 386 return nil, errors.Trace(err) 387 } 388 if event != nil { 389 nextIm := &item{ 390 index: im.index, 391 value: event, 392 } 393 394 heap.Push(fs.rowHeap, nextIm) 395 if fs.rowHeap.err != nil { 396 return nil, errors.Trace(fs.rowHeap.err) 397 } 398 } 399 400 return im.value, nil 401 } 402 403 return nil, nil 404 } 405 406 func (fs *FileSorter) openAllFiles() error { 407 for _, fname := range fs.files { 408 fd, err := os.Open(fname) 409 if err != nil { 410 return errors.Trace(err) 411 } 412 fs.fds = append(fs.fds, fd) 413 } 414 return nil 415 } 416 417 // fetchNextRow fetches the next event given the source file index. 418 func (fs *FileSorter) fetchNextRow(index int) (*comparableRow, error) { 419 n, err := fs.fds[index].Read(fs.head) 420 if err == io.EOF { 421 return nil, nil 422 } 423 if err != nil { 424 return nil, errors.Trace(err) 425 } 426 if n != headSize { 427 return nil, errors.New("incorrect header") 428 } 429 rowSize := int(binary.BigEndian.Uint64(fs.head)) 430 431 n, err = fs.fds[index].Read(fs.rowBytes) 432 if err != nil { 433 return nil, errors.Trace(err) 434 } 435 if n != rowSize { 436 return nil, errors.New("incorrect event") 437 } 438 439 fs.dcod, err = codec.Decode(fs.rowBytes, fs.keySize+fs.valSize+1) 440 if err != nil { 441 return nil, errors.Trace(err) 442 } 443 444 return &comparableRow{ 445 key: fs.dcod[:fs.keySize], 446 val: fs.dcod[fs.keySize : fs.keySize+fs.valSize], 447 handle: fs.dcod[fs.keySize+fs.valSize:][0].GetInt64(), 448 }, nil 449 } 450 451 // Input adds one event into FileSorter. 452 // Caller should not call Input after calling Output. 453 func (fs *FileSorter) Input(key []types.Causet, val []types.Causet, handle int64) error { 454 if fs.closed { 455 return errors.New("FileSorter has been closed") 456 } 457 if fs.fetched { 458 return errors.New("call input after output") 459 } 460 461 assigned := false 462 abortTime := time.Duration(1) * time.Minute // 1 minute 463 cooldownTime := time.Duration(100) * time.Millisecond // 100 milliseconds 464 event := &comparableRow{ 465 key: key, 466 val: val, 467 handle: handle, 468 } 469 470 origin := time.Now() 471 // assign input event to some worker in a round-robin way 472 for { 473 for i := 0; i < fs.nWorkers; i++ { 474 wid := (fs.cWorker + i) % fs.nWorkers 475 if atomic.LoadInt32(&(fs.workers[wid].busy)) == 0 { 476 fs.workers[wid].input(event) 477 assigned = true 478 fs.cWorker = wid 479 break 480 } 481 } 482 if assigned { 483 break 484 } 485 486 // all workers are busy now, cooldown and retry 487 time.Sleep(cooldownTime) 488 489 if time.Since(origin) >= abortTime { 490 // weird: all workers are busy for at least 1 min 491 // choose to abort for safety 492 return errors.New("can not make progress since all workers are busy") 493 } 494 } 495 return nil 496 } 497 498 // Output gets the next sorted event. 499 func (fs *FileSorter) Output() ([]types.Causet, []types.Causet, int64, error) { 500 var ( 501 r *comparableRow 502 err error 503 ) 504 if fs.closed { 505 return nil, nil, 0, errors.New("FileSorter has been closed") 506 } 507 508 if fs.external { 509 r, err = fs.externalSort() 510 } else { 511 r, err = fs.internalSort() 512 } 513 514 if err != nil { 515 return nil, nil, 0, errors.Trace(err) 516 } else if r != nil { 517 return r.key, r.val, r.handle, nil 518 } else { 519 return nil, nil, 0, nil 520 } 521 } 522 523 // Close terminates the input or output process and discards all remaining data. 524 func (fs *FileSorter) Close() error { 525 if fs.closed { 526 return nil 527 } 528 fs.wg.Wait() 529 for _, w := range fs.workers { 530 w.buf = w.buf[:0] 531 } 532 fs.closed = true 533 err := fs.closeAllFiles() 534 if err != nil { 535 return errors.Trace(err) 536 } 537 return nil 538 } 539 540 func (w *Worker) Len() int { return len(w.buf) } 541 542 func (w *Worker) Swap(i, j int) { w.buf[i], w.buf[j] = w.buf[j], w.buf[i] } 543 544 func (w *Worker) Less(i, j int) bool { 545 l := w.buf[i].key 546 r := w.buf[j].key 547 ret, err := lessThan(w.ctx.sc, l, r, w.ctx.byDesc) 548 if w.err == nil { 549 w.err = errors.Trace(err) 550 } 551 return ret 552 } 553 554 func (w *Worker) input(event *comparableRow) { 555 w.buf = append(w.buf, event) 556 557 if len(w.buf) > w.bufSize { 558 atomic.StoreInt32(&(w.busy), int32(1)) 559 w.ctx.wg.Add(1) 560 w.ctx.external = true 561 go w.flushToFile() 562 } 563 } 564 565 // flushToFile flushes the buffer to file if it is full. 566 func (w *Worker) flushToFile() { 567 defer w.ctx.wg.Done() 568 var ( 569 outputByte []byte 570 prevLen int 571 ) 572 573 sort.Sort(w) 574 if w.err != nil { 575 return 576 } 577 578 fileName := w.ctx.getUniqueFileName() 579 580 outputFile, err := os.OpenFile(fileName, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0600) 581 if err != nil { 582 w.err = errors.Trace(err) 583 return 584 } 585 defer terror.Call(outputFile.Close) 586 sc := &stmtctx.StatementContext{TimeZone: time.Local} 587 for _, event := range w.buf { 588 prevLen = len(outputByte) 589 outputByte = append(outputByte, w.head...) 590 outputByte, err = codec.EncodeKey(sc, outputByte, event.key...) 591 if err != nil { 592 w.err = errors.Trace(err) 593 return 594 } 595 outputByte, err = codec.EncodeKey(sc, outputByte, event.val...) 596 if err != nil { 597 w.err = errors.Trace(err) 598 return 599 } 600 outputByte, err = codec.EncodeKey(sc, outputByte, types.NewIntCauset(event.handle)) 601 if err != nil { 602 w.err = errors.Trace(err) 603 return 604 } 605 606 if len(outputByte)-prevLen-headSize > w.rowSize { 607 w.rowSize = len(outputByte) - prevLen - headSize 608 } 609 binary.BigEndian.PutUint64(w.head, uint64(len(outputByte)-prevLen-headSize)) 610 for i := 0; i < headSize; i++ { 611 outputByte[prevLen+i] = w.head[i] 612 } 613 } 614 615 _, err = outputFile.Write(outputByte) 616 if err != nil { 617 w.err = errors.Trace(err) 618 return 619 } 620 621 w.ctx.appendFileName(fileName) 622 w.buf = w.buf[:0] 623 atomic.StoreInt32(&(w.busy), int32(0)) 624 }