github.com/pingcap/ticdc@v0.0.0-20220526033649-485a10ef2652/cdc/puller/sorter/backend_pool.go (about) 1 // Copyright 2020 PingCAP, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package sorter 15 16 import ( 17 "context" 18 "fmt" 19 "os" 20 "path/filepath" 21 "reflect" 22 "sync" 23 "sync/atomic" 24 "time" 25 "unsafe" 26 27 "github.com/mackerelio/go-osstat/memory" 28 "github.com/pingcap/errors" 29 "github.com/pingcap/failpoint" 30 "github.com/pingcap/log" 31 "github.com/pingcap/ticdc/pkg/config" 32 cerrors "github.com/pingcap/ticdc/pkg/errors" 33 "github.com/pingcap/ticdc/pkg/filelock" 34 "github.com/pingcap/ticdc/pkg/util" 35 "go.uber.org/zap" 36 ) 37 38 const ( 39 backgroundJobInterval = time.Second * 15 40 sortDirLockFileName = "ticdc_lock" 41 sortDirDataFileMagicPrefix = "sort" 42 ) 43 44 var ( 45 pool *backEndPool // this is the singleton instance of backEndPool 46 poolMu sync.Mutex // this mutex is for delayed initialization of `pool` only 47 ) 48 49 type backEndPool struct { 50 memoryUseEstimate int64 51 onDiskDataSize int64 52 fileNameCounter uint64 53 memPressure int32 54 cache [256]unsafe.Pointer 55 dir string 56 filePrefix string 57 58 // to prevent `dir` from being accidentally used by another TiCDC server process. 59 fileLock *filelock.FileLock 60 61 // cancelCh needs to be unbuffered to prevent races 62 cancelCh chan struct{} 63 // cancelRWLock protects cache against races when the backEnd is exiting 64 cancelRWLock sync.RWMutex 65 isTerminating bool 66 } 67 68 func newBackEndPool(dir string, captureAddr string) (*backEndPool, error) { 69 ret := &backEndPool{ 70 memoryUseEstimate: 0, 71 fileNameCounter: 0, 72 dir: dir, 73 cancelCh: make(chan struct{}), 74 filePrefix: fmt.Sprintf("%s/%s-%d-", dir, sortDirDataFileMagicPrefix, os.Getpid()), 75 } 76 77 err := ret.lockSortDir() 78 if err != nil { 79 log.Warn("failed to lock file prefix", 80 zap.String("prefix", ret.filePrefix), 81 zap.Error(err)) 82 return nil, errors.Trace(err) 83 } 84 85 err = ret.cleanUpStaleFiles() 86 if err != nil { 87 log.Warn("Unified Sorter: failed to clean up stale temporary files. Report a bug if you believe this is unexpected", zap.Error(err)) 88 return nil, errors.Trace(err) 89 } 90 91 go func() { 92 ticker := time.NewTicker(backgroundJobInterval) 93 defer ticker.Stop() 94 95 metricSorterInMemoryDataSizeGauge := sorterInMemoryDataSizeGauge.WithLabelValues(captureAddr) 96 metricSorterOnDiskDataSizeGauge := sorterOnDiskDataSizeGauge.WithLabelValues(captureAddr) 97 metricSorterOpenFileCountGauge := sorterOpenFileCountGauge.WithLabelValues(captureAddr) 98 99 for { 100 select { 101 case <-ret.cancelCh: 102 log.Info("Unified Sorter backEnd is being cancelled") 103 return 104 case <-ticker.C: 105 } 106 107 metricSorterInMemoryDataSizeGauge.Set(float64(atomic.LoadInt64(&ret.memoryUseEstimate))) 108 metricSorterOnDiskDataSizeGauge.Set(float64(atomic.LoadInt64(&ret.onDiskDataSize))) 109 metricSorterOpenFileCountGauge.Set(float64(atomic.LoadInt64(&openFDCount))) 110 111 // update memPressure 112 m, err := memory.Get() 113 114 failpoint.Inject("getMemoryPressureFails", func() { 115 m = nil 116 err = errors.New("injected get memory pressure failure") 117 }) 118 119 if err != nil { 120 failpoint.Inject("sorterDebug", func() { 121 log.Panic("unified sorter: getting system memory usage failed", zap.Error(err)) 122 }) 123 124 log.Warn("unified sorter: getting system memory usage failed", zap.Error(err)) 125 // Reports a 100% memory pressure, so that the backEndPool will allocate fileBackEnds. 126 // We default to fileBackEnds because they are unlikely to cause OOMs. If IO errors are 127 // encountered, we can fail gracefully. 128 atomic.StoreInt32(&ret.memPressure, 100) 129 } else { 130 memPressure := m.Used * 100 / m.Total 131 atomic.StoreInt32(&ret.memPressure, int32(memPressure)) 132 } 133 134 // garbage collect temporary files in batches 135 freedCount := 0 136 for i := range ret.cache { 137 ptr := &ret.cache[i] 138 innerPtr := atomic.SwapPointer(ptr, nil) 139 if innerPtr == nil { 140 continue 141 } 142 backEnd := (*fileBackEnd)(innerPtr) 143 err := backEnd.free() 144 if err != nil { 145 log.Warn("Cannot remove temporary file for sorting", zap.String("file", backEnd.fileName), zap.Error(err)) 146 } else { 147 log.Debug("Temporary file removed", zap.String("file", backEnd.fileName)) 148 freedCount += 1 149 } 150 if freedCount >= 16 { 151 freedCount = 0 152 break 153 } 154 } 155 } 156 }() 157 158 return ret, nil 159 } 160 161 func (p *backEndPool) alloc(ctx context.Context) (backEnd, error) { 162 sorterConfig := config.GetGlobalServerConfig().Sorter 163 if p.sorterMemoryUsage() < int64(sorterConfig.MaxMemoryConsumption) && 164 p.memoryPressure() < int32(sorterConfig.MaxMemoryPressure) { 165 166 ret := newMemoryBackEnd() 167 return ret, nil 168 } 169 170 p.cancelRWLock.RLock() 171 defer p.cancelRWLock.RUnlock() 172 173 if p.isTerminating { 174 return nil, cerrors.ErrUnifiedSorterBackendTerminating.GenWithStackByArgs() 175 } 176 177 for i := range p.cache { 178 ptr := &p.cache[i] 179 ret := atomic.SwapPointer(ptr, nil) 180 if ret != nil { 181 return (*fileBackEnd)(ret), nil 182 } 183 } 184 185 fname := fmt.Sprintf("%s%d.tmp", p.filePrefix, atomic.AddUint64(&p.fileNameCounter, 1)) 186 tableID, tableName := util.TableIDFromCtx(ctx) 187 log.Debug("Unified Sorter: trying to create file backEnd", 188 zap.String("filename", fname), 189 zap.Int64("table-id", tableID), 190 zap.String("table-name", tableName)) 191 192 if err := util.CheckDataDirSatisfied(); err != nil { 193 return nil, errors.Trace(err) 194 } 195 196 ret, err := newFileBackEnd(fname, &msgPackGenSerde{}) 197 if err != nil { 198 return nil, errors.Trace(err) 199 } 200 201 return ret, nil 202 } 203 204 func (p *backEndPool) dealloc(backEnd backEnd) error { 205 switch b := backEnd.(type) { 206 case *memoryBackEnd: 207 err := b.free() 208 if err != nil { 209 log.Warn("error freeing memory backend", zap.Error(err)) 210 } 211 // Let GC do its job 212 return nil 213 case *fileBackEnd: 214 failpoint.Inject("sorterDebug", func() { 215 if atomic.LoadInt32(&b.borrowed) != 0 { 216 log.Warn("Deallocating a fileBackEnd in use", zap.String("filename", b.fileName)) 217 failpoint.Return(nil) 218 } 219 }) 220 221 b.cleanStats() 222 223 p.cancelRWLock.RLock() 224 defer p.cancelRWLock.RUnlock() 225 226 if p.isTerminating { 227 return cerrors.ErrUnifiedSorterBackendTerminating.GenWithStackByArgs() 228 } 229 230 for i := range p.cache { 231 ptr := &p.cache[i] 232 if atomic.CompareAndSwapPointer(ptr, nil, unsafe.Pointer(b)) { 233 return nil 234 } 235 } 236 // Cache is full. 237 err := b.free() 238 if err != nil { 239 return errors.Trace(err) 240 } 241 242 return nil 243 default: 244 log.Panic("backEndPool: unexpected backEnd type to be deallocated", zap.Reflect("type", reflect.TypeOf(backEnd))) 245 } 246 return nil 247 } 248 249 func (p *backEndPool) terminate() { 250 defer func() { 251 if p.fileLock == nil { 252 return 253 } 254 err := p.unlockSortDir() 255 if err != nil { 256 log.Warn("failed to unlock file prefix", zap.String("prefix", p.filePrefix)) 257 } 258 }() 259 260 p.cancelCh <- struct{}{} 261 defer close(p.cancelCh) 262 // the background goroutine can be considered terminated here 263 264 log.Debug("Unified Sorter terminating...") 265 p.cancelRWLock.Lock() 266 defer p.cancelRWLock.Unlock() 267 p.isTerminating = true 268 269 log.Debug("Unified Sorter cleaning up before exiting") 270 // any new allocs and deallocs will not succeed from this point 271 // accessing p.cache without atomics is safe from now 272 273 for i := range p.cache { 274 ptr := &p.cache[i] 275 backend := (*fileBackEnd)(*ptr) 276 if backend == nil { 277 continue 278 } 279 _ = backend.free() 280 } 281 282 if p.filePrefix == "" { 283 // This should not happen. But to prevent accidents in production, we add this anyway. 284 log.Panic("Empty filePrefix, please report a bug") 285 } 286 287 files, err := filepath.Glob(p.filePrefix + "*") 288 if err != nil { 289 log.Warn("Unified Sorter clean-up failed", zap.Error(err)) 290 } 291 for _, file := range files { 292 log.Debug("Unified Sorter backEnd removing file", zap.String("file", file)) 293 err = os.RemoveAll(file) 294 if err != nil { 295 log.Warn("Unified Sorter clean-up failed: failed to remove", zap.String("file-name", file), zap.Error(err)) 296 } 297 } 298 299 log.Debug("Unified Sorter backEnd terminated") 300 } 301 302 func (p *backEndPool) sorterMemoryUsage() int64 { 303 failpoint.Inject("memoryUsageInjectPoint", func(val failpoint.Value) { 304 failpoint.Return(int64(val.(int))) 305 }) 306 return atomic.LoadInt64(&p.memoryUseEstimate) 307 } 308 309 func (p *backEndPool) memoryPressure() int32 { 310 failpoint.Inject("memoryPressureInjectPoint", func(val failpoint.Value) { 311 failpoint.Return(int32(val.(int))) 312 }) 313 return atomic.LoadInt32(&p.memPressure) 314 } 315 316 func (p *backEndPool) lockSortDir() error { 317 lockFileName := fmt.Sprintf("%s/%s", p.dir, sortDirLockFileName) 318 fileLock, err := filelock.NewFileLock(lockFileName) 319 if err != nil { 320 return cerrors.ErrSortDirLockError.Wrap(err).GenWithStackByCause() 321 } 322 323 err = fileLock.Lock() 324 if err != nil { 325 if cerrors.ErrConflictingFileLocks.Equal(err) { 326 log.Warn("TiCDC failed to lock sorter temporary file directory. "+ 327 "Make sure that another instance of TiCDC, or any other program, is not using the directory. "+ 328 "If you believe you should not see this error, try deleting the lock file and resume the changefeed. "+ 329 "Report a bug or contact support if the problem persists.", 330 zap.String("lock-file", lockFileName)) 331 return errors.Trace(err) 332 } 333 return cerrors.ErrSortDirLockError.Wrap(err).GenWithStackByCause() 334 } 335 336 p.fileLock = fileLock 337 return nil 338 } 339 340 func (p *backEndPool) unlockSortDir() error { 341 err := p.fileLock.Unlock() 342 if err != nil { 343 return cerrors.ErrSortDirLockError.Wrap(err).FastGenWithCause() 344 } 345 return nil 346 } 347 348 func (p *backEndPool) cleanUpStaleFiles() error { 349 if p.dir == "" { 350 // guard against programmer error. Must be careful when we are deleting user files. 351 log.Panic("unexpected sort-dir", zap.String("sort-dir", p.dir)) 352 } 353 354 files, err := filepath.Glob(filepath.Join(p.dir, fmt.Sprintf("%s-*", sortDirDataFileMagicPrefix))) 355 if err != nil { 356 return errors.Trace(err) 357 } 358 359 for _, toRemoveFilePath := range files { 360 log.Debug("Removing stale sorter temporary file", zap.String("file", toRemoveFilePath)) 361 err := os.Remove(toRemoveFilePath) 362 if err != nil { 363 // In production, we do not want an error here to interfere with normal operation, 364 // because in most situations, failure to remove files only indicates non-fatal misconfigurations 365 // such as permission problems, rather than fatal errors. 366 // If the directory is truly unusable, other errors would be raised when we try to write to it. 367 log.Warn("failed to remove file", 368 zap.String("file", toRemoveFilePath), 369 zap.Error(err)) 370 // For fail-fast in integration tests 371 failpoint.Inject("sorterDebug", func() { 372 log.Panic("panicking", zap.Error(err)) 373 }) 374 } 375 } 376 377 return nil 378 }