github.com/pingcap/ticdc@v0.0.0-20220526033649-485a10ef2652/cdc/puller/sorter/backend_pool.go (about)

     1  // Copyright 2020 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package sorter
    15  
    16  import (
    17  	"context"
    18  	"fmt"
    19  	"os"
    20  	"path/filepath"
    21  	"reflect"
    22  	"sync"
    23  	"sync/atomic"
    24  	"time"
    25  	"unsafe"
    26  
    27  	"github.com/mackerelio/go-osstat/memory"
    28  	"github.com/pingcap/errors"
    29  	"github.com/pingcap/failpoint"
    30  	"github.com/pingcap/log"
    31  	"github.com/pingcap/ticdc/pkg/config"
    32  	cerrors "github.com/pingcap/ticdc/pkg/errors"
    33  	"github.com/pingcap/ticdc/pkg/filelock"
    34  	"github.com/pingcap/ticdc/pkg/util"
    35  	"go.uber.org/zap"
    36  )
    37  
const (
	backgroundJobInterval      = time.Second * 15 // cadence of the pool's background metrics/GC goroutine
	sortDirLockFileName        = "ticdc_lock"     // advisory lock file guarding exclusive use of the sort dir
	sortDirDataFileMagicPrefix = "sort"           // file-name prefix marking sorter temporary data files
)

var (
	pool   *backEndPool // this is the singleton instance of backEndPool
	poolMu sync.Mutex   // this mutex is for delayed initialization of `pool` only
)
    48  
// backEndPool manages allocation, reuse and cleanup of sorter backEnds
// (in-memory or on-disk) and owns a background goroutine that publishes
// metrics, samples system memory pressure and garbage-collects cached
// temporary files (see newBackEndPool).
type backEndPool struct {
	// memoryUseEstimate, onDiskDataSize, fileNameCounter and memPressure
	// are accessed atomically. They are declared first in the struct —
	// presumably to guarantee 64-bit alignment for atomics on 32-bit
	// platforms (TODO confirm).
	memoryUseEstimate int64  // estimated size of data held by in-memory backEnds
	onDiskDataSize    int64  // total size of data currently spilled to disk
	fileNameCounter   uint64 // monotonically increasing suffix for temp file names
	memPressure       int32  // last sampled system memory usage, as a percentage in [0, 100]
	// cache holds idle *fileBackEnd values for reuse; slots are only
	// accessed via atomic Swap/CAS while the pool is running.
	cache      [256]unsafe.Pointer
	dir        string // the sort directory
	filePrefix string // "<dir>/sort-<pid>-", prepended to every temp file name

	// to prevent `dir` from being accidentally used by another TiCDC server process.
	fileLock *filelock.FileLock

	// cancelCh needs to be unbuffered to prevent races
	cancelCh chan struct{}
	// cancelRWLock protects cache against races when the backEnd is exiting
	cancelRWLock  sync.RWMutex
	isTerminating bool
}
    67  
// newBackEndPool creates a backEndPool rooted at `dir`. It first acquires
// an advisory lock on the directory (so no other TiCDC process can use
// it), removes stale temporary files from previous runs, and then starts
// a background goroutine that, every backgroundJobInterval, updates
// metrics, samples system memory pressure and garbage-collects cached
// file backEnds. `captureAddr` is only used to label the metrics.
// The goroutine exits when terminate() sends on cancelCh.
func newBackEndPool(dir string, captureAddr string) (*backEndPool, error) {
	ret := &backEndPool{
		memoryUseEstimate: 0,
		fileNameCounter:   0,
		dir:               dir,
		cancelCh:          make(chan struct{}),
		filePrefix:        fmt.Sprintf("%s/%s-%d-", dir, sortDirDataFileMagicPrefix, os.Getpid()),
	}

	// Lock the sort dir first: everything below (including the stale-file
	// cleanup) assumes exclusive ownership of `dir`.
	err := ret.lockSortDir()
	if err != nil {
		log.Warn("failed to lock file prefix",
			zap.String("prefix", ret.filePrefix),
			zap.Error(err))
		return nil, errors.Trace(err)
	}

	err = ret.cleanUpStaleFiles()
	if err != nil {
		log.Warn("Unified Sorter: failed to clean up stale temporary files. Report a bug if you believe this is unexpected", zap.Error(err))
		return nil, errors.Trace(err)
	}

	// Background job: metrics, memory-pressure sampling and temp-file GC.
	go func() {
		ticker := time.NewTicker(backgroundJobInterval)
		defer ticker.Stop()

		metricSorterInMemoryDataSizeGauge := sorterInMemoryDataSizeGauge.WithLabelValues(captureAddr)
		metricSorterOnDiskDataSizeGauge := sorterOnDiskDataSizeGauge.WithLabelValues(captureAddr)
		metricSorterOpenFileCountGauge := sorterOpenFileCountGauge.WithLabelValues(captureAddr)

		for {
			select {
			case <-ret.cancelCh:
				log.Info("Unified Sorter backEnd is being cancelled")
				return
			case <-ticker.C:
			}

			metricSorterInMemoryDataSizeGauge.Set(float64(atomic.LoadInt64(&ret.memoryUseEstimate)))
			metricSorterOnDiskDataSizeGauge.Set(float64(atomic.LoadInt64(&ret.onDiskDataSize)))
			metricSorterOpenFileCountGauge.Set(float64(atomic.LoadInt64(&openFDCount)))

			// update memPressure
			m, err := memory.Get()

			failpoint.Inject("getMemoryPressureFails", func() {
				m = nil
				err = errors.New("injected get memory pressure failure")
			})

			if err != nil {
				failpoint.Inject("sorterDebug", func() {
					log.Panic("unified sorter: getting system memory usage failed", zap.Error(err))
				})

				log.Warn("unified sorter: getting system memory usage failed", zap.Error(err))
				// Reports a 100% memory pressure, so that the backEndPool will allocate fileBackEnds.
				// We default to fileBackEnds because they are unlikely to cause OOMs. If IO errors are
				// encountered, we can fail gracefully.
				atomic.StoreInt32(&ret.memPressure, 100)
			} else {
				// Used*100/Total is in [0, 100], so the int32 conversion is safe.
				memPressure := m.Used * 100 / m.Total
				atomic.StoreInt32(&ret.memPressure, int32(memPressure))
			}

			// garbage collect temporary files in batches
			freedCount := 0
			for i := range ret.cache {
				ptr := &ret.cache[i]
				// Atomically take ownership of the slot so a concurrent
				// alloc() cannot hand the same backEnd out while we free it.
				innerPtr := atomic.SwapPointer(ptr, nil)
				if innerPtr == nil {
					continue
				}
				backEnd := (*fileBackEnd)(innerPtr)
				err := backEnd.free()
				if err != nil {
					log.Warn("Cannot remove temporary file for sorting", zap.String("file", backEnd.fileName), zap.Error(err))
				} else {
					log.Debug("Temporary file removed", zap.String("file", backEnd.fileName))
					freedCount += 1
				}
				// Free at most 16 files per tick to bound the I/O burst.
				if freedCount >= 16 {
					// NOTE(review): resetting freedCount right before the
					// break is a dead store — the variable goes out of scope.
					freedCount = 0
					break
				}
			}
		}
	}()

	return ret, nil
}
   160  
   161  func (p *backEndPool) alloc(ctx context.Context) (backEnd, error) {
   162  	sorterConfig := config.GetGlobalServerConfig().Sorter
   163  	if p.sorterMemoryUsage() < int64(sorterConfig.MaxMemoryConsumption) &&
   164  		p.memoryPressure() < int32(sorterConfig.MaxMemoryPressure) {
   165  
   166  		ret := newMemoryBackEnd()
   167  		return ret, nil
   168  	}
   169  
   170  	p.cancelRWLock.RLock()
   171  	defer p.cancelRWLock.RUnlock()
   172  
   173  	if p.isTerminating {
   174  		return nil, cerrors.ErrUnifiedSorterBackendTerminating.GenWithStackByArgs()
   175  	}
   176  
   177  	for i := range p.cache {
   178  		ptr := &p.cache[i]
   179  		ret := atomic.SwapPointer(ptr, nil)
   180  		if ret != nil {
   181  			return (*fileBackEnd)(ret), nil
   182  		}
   183  	}
   184  
   185  	fname := fmt.Sprintf("%s%d.tmp", p.filePrefix, atomic.AddUint64(&p.fileNameCounter, 1))
   186  	tableID, tableName := util.TableIDFromCtx(ctx)
   187  	log.Debug("Unified Sorter: trying to create file backEnd",
   188  		zap.String("filename", fname),
   189  		zap.Int64("table-id", tableID),
   190  		zap.String("table-name", tableName))
   191  
   192  	if err := util.CheckDataDirSatisfied(); err != nil {
   193  		return nil, errors.Trace(err)
   194  	}
   195  
   196  	ret, err := newFileBackEnd(fname, &msgPackGenSerde{})
   197  	if err != nil {
   198  		return nil, errors.Trace(err)
   199  	}
   200  
   201  	return ret, nil
   202  }
   203  
// dealloc returns a backEnd to the pool. Memory backEnds are simply
// freed and left to the GC; file backEnds are parked in the reuse cache
// when a slot is free, and physically deleted when the cache is full.
// Returns ErrUnifiedSorterBackendTerminating if the pool is shutting down.
func (p *backEndPool) dealloc(backEnd backEnd) error {
	switch b := backEnd.(type) {
	case *memoryBackEnd:
		err := b.free()
		if err != nil {
			log.Warn("error freeing memory backend", zap.Error(err))
		}
		// Let GC do its job
		return nil
	case *fileBackEnd:
		// Debug-only sanity check: deallocating a backEnd that is still
		// borrowed indicates a double-free / use-after-free style bug.
		failpoint.Inject("sorterDebug", func() {
			if atomic.LoadInt32(&b.borrowed) != 0 {
				log.Warn("Deallocating a fileBackEnd in use", zap.String("filename", b.fileName))
				failpoint.Return(nil)
			}
		})

		b.cleanStats()

		// Hold the read lock so terminate() cannot tear the cache down
		// while we insert into it.
		p.cancelRWLock.RLock()
		defer p.cancelRWLock.RUnlock()

		if p.isTerminating {
			return cerrors.ErrUnifiedSorterBackendTerminating.GenWithStackByArgs()
		}

		// Try to park the file backEnd in a free cache slot for reuse.
		for i := range p.cache {
			ptr := &p.cache[i]
			if atomic.CompareAndSwapPointer(ptr, nil, unsafe.Pointer(b)) {
				return nil
			}
		}
		// Cache is full.
		err := b.free()
		if err != nil {
			return errors.Trace(err)
		}

		return nil
	default:
		log.Panic("backEndPool: unexpected backEnd type to be deallocated", zap.Reflect("type", reflect.TypeOf(backEnd)))
	}
	return nil
}
   248  
// terminate shuts the pool down: it stops the background goroutine,
// frees every cached file backEnd, best-effort removes this process's
// remaining temporary files, and finally releases the sort-dir lock.
// It must be called at most once — a second call would send on (and
// then re-close) the already-closed cancelCh.
func (p *backEndPool) terminate() {
	defer func() {
		if p.fileLock == nil {
			// lockSortDir never succeeded, so there is nothing to release.
			return
		}
		err := p.unlockSortDir()
		if err != nil {
			log.Warn("failed to unlock file prefix", zap.String("prefix", p.filePrefix))
		}
	}()

	// cancelCh is unbuffered, so this send returns only after the
	// background goroutine has received the signal.
	p.cancelCh <- struct{}{}
	defer close(p.cancelCh)
	// the background goroutine can be considered terminated here

	log.Debug("Unified Sorter terminating...")
	p.cancelRWLock.Lock()
	defer p.cancelRWLock.Unlock()
	p.isTerminating = true

	log.Debug("Unified Sorter cleaning up before exiting")
	// any new allocs and deallocs will not succeed from this point
	// accessing p.cache without atomics is safe from now

	for i := range p.cache {
		ptr := &p.cache[i]
		backend := (*fileBackEnd)(*ptr)
		if backend == nil {
			continue
		}
		_ = backend.free()
	}

	if p.filePrefix == "" {
		// This should not happen. But to prevent accidents in production, we add this anyway.
		log.Panic("Empty filePrefix, please report a bug")
	}

	// Best-effort removal of leftover temp files created by this process
	// (filePrefix embeds our pid, see newBackEndPool). On a Glob error,
	// `files` is nil and the loop below is simply skipped.
	files, err := filepath.Glob(p.filePrefix + "*")
	if err != nil {
		log.Warn("Unified Sorter clean-up failed", zap.Error(err))
	}
	for _, file := range files {
		log.Debug("Unified Sorter backEnd removing file", zap.String("file", file))
		err = os.RemoveAll(file)
		if err != nil {
			log.Warn("Unified Sorter clean-up failed: failed to remove", zap.String("file-name", file), zap.Error(err))
		}
	}

	log.Debug("Unified Sorter backEnd terminated")
}
   301  
// sorterMemoryUsage returns the pool's current estimate of memory held
// by in-memory sorter backEnds; it is compared against
// MaxMemoryConsumption in alloc (presumably bytes — TODO confirm).
// The "memoryUsageInjectPoint" failpoint can override the value in tests.
func (p *backEndPool) sorterMemoryUsage() int64 {
	failpoint.Inject("memoryUsageInjectPoint", func(val failpoint.Value) {
		failpoint.Return(int64(val.(int)))
	})
	return atomic.LoadInt64(&p.memoryUseEstimate)
}
   308  
// memoryPressure returns the last sampled system memory usage as a
// percentage in [0, 100]; 100 is also reported when sampling failed
// (see the background goroutine in newBackEndPool).
// The "memoryPressureInjectPoint" failpoint can override the value in tests.
func (p *backEndPool) memoryPressure() int32 {
	failpoint.Inject("memoryPressureInjectPoint", func(val failpoint.Value) {
		failpoint.Return(int32(val.(int)))
	})
	return atomic.LoadInt32(&p.memPressure)
}
   315  
   316  func (p *backEndPool) lockSortDir() error {
   317  	lockFileName := fmt.Sprintf("%s/%s", p.dir, sortDirLockFileName)
   318  	fileLock, err := filelock.NewFileLock(lockFileName)
   319  	if err != nil {
   320  		return cerrors.ErrSortDirLockError.Wrap(err).GenWithStackByCause()
   321  	}
   322  
   323  	err = fileLock.Lock()
   324  	if err != nil {
   325  		if cerrors.ErrConflictingFileLocks.Equal(err) {
   326  			log.Warn("TiCDC failed to lock sorter temporary file directory. "+
   327  				"Make sure that another instance of TiCDC, or any other program, is not using the directory. "+
   328  				"If you believe you should not see this error, try deleting the lock file and resume the changefeed. "+
   329  				"Report a bug or contact support if the problem persists.",
   330  				zap.String("lock-file", lockFileName))
   331  			return errors.Trace(err)
   332  		}
   333  		return cerrors.ErrSortDirLockError.Wrap(err).GenWithStackByCause()
   334  	}
   335  
   336  	p.fileLock = fileLock
   337  	return nil
   338  }
   339  
   340  func (p *backEndPool) unlockSortDir() error {
   341  	err := p.fileLock.Unlock()
   342  	if err != nil {
   343  		return cerrors.ErrSortDirLockError.Wrap(err).FastGenWithCause()
   344  	}
   345  	return nil
   346  }
   347  
// cleanUpStaleFiles removes temporary sort files left behind in p.dir by
// previous runs. The glob matches every "sort-*" file regardless of pid —
// presumably safe only because lockSortDir has already guaranteed
// exclusive use of the directory (TODO confirm). Individual removal
// failures are logged but not returned, since they usually indicate
// non-fatal misconfiguration rather than an unusable directory.
func (p *backEndPool) cleanUpStaleFiles() error {
	if p.dir == "" {
		// guard against programmer error. Must be careful when we are deleting user files.
		log.Panic("unexpected sort-dir", zap.String("sort-dir", p.dir))
	}

	files, err := filepath.Glob(filepath.Join(p.dir, fmt.Sprintf("%s-*", sortDirDataFileMagicPrefix)))
	if err != nil {
		return errors.Trace(err)
	}

	for _, toRemoveFilePath := range files {
		log.Debug("Removing stale sorter temporary file", zap.String("file", toRemoveFilePath))
		err := os.Remove(toRemoveFilePath)
		if err != nil {
			// In production, we do not want an error here to interfere with normal operation,
			// because in most situations, failure to remove files only indicates non-fatal misconfigurations
			// such as permission problems, rather than fatal errors.
			// If the directory is truly unusable, other errors would be raised when we try to write to it.
			log.Warn("failed to remove file",
				zap.String("file", toRemoveFilePath),
				zap.Error(err))
			// For fail-fast in integration tests
			failpoint.Inject("sorterDebug", func() {
				log.Panic("panicking", zap.Error(err))
			})
		}
	}

	return nil
}