github.com/pingcap/ticdc@v0.0.0-20220526033649-485a10ef2652/cdc/server.go

// Copyright 2020 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package cdc

import (
	"context"
	"fmt"
	"net/http"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"github.com/pingcap/errors"
	"github.com/pingcap/log"
	"github.com/pingcap/ticdc/cdc/capture"
	"github.com/pingcap/ticdc/cdc/kv"
	"github.com/pingcap/ticdc/cdc/puller/sorter"
	"github.com/pingcap/ticdc/pkg/config"
	cerror "github.com/pingcap/ticdc/pkg/errors"
	"github.com/pingcap/ticdc/pkg/httputil"
	"github.com/pingcap/ticdc/pkg/util"
	"github.com/pingcap/ticdc/pkg/version"
	tidbkv "github.com/pingcap/tidb/kv"
	"github.com/prometheus/client_golang/prometheus"
	pd "github.com/tikv/pd/client"
	"go.etcd.io/etcd/clientv3"
	"go.etcd.io/etcd/mvcc"
	"go.etcd.io/etcd/pkg/logutil"
	"go.uber.org/zap"
	"go.uber.org/zap/zapcore"
	"golang.org/x/sync/errgroup"
	"golang.org/x/time/rate"
	"google.golang.org/grpc"
	"google.golang.org/grpc/backoff"
)

const (
	ownerRunInterval = time.Millisecond * 500
	defaultDataDir   = "/tmp/cdc_data"
	// dataDirThreshold is used to warn if the free space of the specified data-dir is lower than it (unit: GB).
	dataDirThreshold = 500
)

// Server is the capture server
type Server struct {
	captureV2 *capture.Capture

	capture      *Capture
	owner        *Owner
	ownerLock    sync.RWMutex
	statusServer *http.Server
	pdClient     pd.Client
	etcdClient   *kv.CDCEtcdClient
	kvStorage    tidbkv.Storage
	pdEndpoints  []string
}

// NewServer creates a Server instance.
func NewServer(pdEndpoints []string) (*Server, error) {
	conf := config.GetGlobalServerConfig()
	log.Info("creating CDC server",
		zap.Strings("pd-addrs", pdEndpoints),
		zap.Stringer("config", conf),
	)

	s := &Server{
		pdEndpoints: pdEndpoints,
	}
	return s, nil
}
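// runServerSketch is an illustrative sketch, not part of the original file.
// It shows how the exported API in this file is expected to be wired together
// by a caller: build the server from a list of PD endpoints, run it until the
// supplied context is cancelled, then close it. The endpoint address is an
// assumption used only for illustration.
func runServerSketch(ctx context.Context) {
	srv, err := NewServer([]string{"http://127.0.0.1:2379"})
	if err != nil {
		log.Error("create cdc server failed", zap.Error(err))
		return
	}
	defer srv.Close()
	// Run blocks until the server exits or the context is cancelled.
	if err := srv.Run(ctx); err != nil && errors.Cause(err) != context.Canceled {
		log.Error("run cdc server failed", zap.Error(err))
	}
}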
// Run runs the server.
func (s *Server) Run(ctx context.Context) error {
	conf := config.GetGlobalServerConfig()

	grpcTLSOption, err := conf.Security.ToGRPCDialOption()
	if err != nil {
		return errors.Trace(err)
	}
	pdClient, err := pd.NewClientWithContext(
		ctx, s.pdEndpoints, conf.Security.PDSecurityOption(),
		pd.WithGRPCDialOptions(
			grpcTLSOption,
			grpc.WithBlock(),
			grpc.WithConnectParams(grpc.ConnectParams{
				Backoff: backoff.Config{
					BaseDelay:  time.Second,
					Multiplier: 1.1,
					Jitter:     0.1,
					MaxDelay:   3 * time.Second,
				},
				MinConnectTimeout: 3 * time.Second,
			}),
		))
	if err != nil {
		return cerror.WrapError(cerror.ErrServerNewPDClient, err)
	}
	s.pdClient = pdClient
	if config.NewReplicaImpl {
		tlsConfig, err := conf.Security.ToTLSConfig()
		if err != nil {
			return errors.Trace(err)
		}
		logConfig := logutil.DefaultZapLoggerConfig
		logConfig.Level = zap.NewAtomicLevelAt(zapcore.ErrorLevel)
		etcdCli, err := clientv3.New(clientv3.Config{
			Endpoints:   s.pdEndpoints,
			TLS:         tlsConfig,
			Context:     ctx,
			LogConfig:   &logConfig,
			DialTimeout: 5 * time.Second,
			DialOptions: []grpc.DialOption{
				grpcTLSOption,
				grpc.WithBlock(),
				grpc.WithConnectParams(grpc.ConnectParams{
					Backoff: backoff.Config{
						BaseDelay:  time.Second,
						Multiplier: 1.1,
						Jitter:     0.1,
						MaxDelay:   3 * time.Second,
					},
					MinConnectTimeout: 3 * time.Second,
				}),
			},
		})
		if err != nil {
			return errors.Annotate(cerror.WrapError(cerror.ErrNewCaptureFailed, err), "new etcd client")
		}
		etcdClient := kv.NewCDCEtcdClient(ctx, etcdCli)
		s.etcdClient = &etcdClient
	}

	if err := s.initDataDir(ctx); err != nil {
		return errors.Trace(err)
	}
	// To not block CDC server startup, we need to warn instead of error
	// when TiKV is incompatible.
	errorTiKVIncompatible := false
	for _, pdEndpoint := range s.pdEndpoints {
		err = version.CheckClusterVersion(ctx, s.pdClient, pdEndpoint, conf.Security, errorTiKVIncompatible)
		if err == nil {
			break
		}
	}
	if err != nil {
		return err
	}
	err = s.startStatusHTTP()
	if err != nil {
		return err
	}

	kv.InitWorkerPool()
	kvStore, err := kv.CreateTiStore(strings.Join(s.pdEndpoints, ","), conf.Security)
	if err != nil {
		return errors.Trace(err)
	}
	defer func() {
		err := kvStore.Close()
		if err != nil {
			log.Warn("kv store close failed", zap.Error(err))
		}
	}()
	s.kvStorage = kvStore
	ctx = util.PutKVStorageInCtx(ctx, kvStore)
	if config.NewReplicaImpl {
		s.captureV2 = capture.NewCapture(s.pdClient, s.kvStorage, s.etcdClient)
		return s.run(ctx)
	}
	// When a capture suicides, restart it.
	for {
		if err := s.run(ctx); cerror.ErrCaptureSuicide.NotEqual(err) {
			return err
		}
		log.Info("server recovered", zap.String("capture-id", s.capture.info.ID))
	}
}
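// grpcDialBackoffSketch is an illustrative sketch, not part of the original
// file; the helper name is hypothetical. It restates the gRPC dial settings
// that Run configures twice above, once for the PD client and once for the
// etcd client: block until the connection is up, and reconnect with a gentle
// exponential backoff capped at three seconds.
func grpcDialBackoffSketch() []grpc.DialOption {
	return []grpc.DialOption{
		grpc.WithBlock(),
		grpc.WithConnectParams(grpc.ConnectParams{
			Backoff: backoff.Config{
				BaseDelay:  time.Second,
				Multiplier: 1.1,
				Jitter:     0.1,
				MaxDelay:   3 * time.Second,
			},
			MinConnectTimeout: 3 * time.Second,
		}),
	}
}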
func (s *Server) setOwner(owner *Owner) {
	s.ownerLock.Lock()
	defer s.ownerLock.Unlock()
	s.owner = owner
}

func (s *Server) campaignOwnerLoop(ctx context.Context) error {
	// In most failure cases, we don't return error directly, just run another
	// campaign loop. We treat campaign loop as a special background routine.

	conf := config.GetGlobalServerConfig()
	rl := rate.NewLimiter(0.05, 2)
	for {
		err := rl.Wait(ctx)
		if err != nil {
			if errors.Cause(err) == context.Canceled {
				return nil
			}
			return errors.Trace(err)
		}

		// Campaign to be the owner; it blocks until this capture becomes the owner.
		if err := s.capture.Campaign(ctx); err != nil {
			switch errors.Cause(err) {
			case context.Canceled:
				return nil
			case mvcc.ErrCompacted:
				continue
			}
			log.Warn("campaign owner failed", zap.Error(err))
			continue
		}
		captureID := s.capture.info.ID
		log.Info("campaign owner successfully", zap.String("capture-id", captureID))
		owner, err := NewOwner(ctx, s.pdClient, s.capture.grpcPool, s.capture.session, conf.GcTTL, time.Duration(conf.OwnerFlushInterval))
		if err != nil {
			log.Warn("create new owner failed", zap.Error(err))
			continue
		}

		s.setOwner(owner)
		if err := owner.Run(ctx, ownerRunInterval); err != nil {
			if errors.Cause(err) == context.Canceled {
				log.Info("owner exited", zap.String("capture-id", captureID))
				select {
				case <-ctx.Done():
					// only exit the campaignOwnerLoop if the parent context is done
					return ctx.Err()
				default:
				}
				log.Info("owner exited", zap.String("capture-id", captureID))
			}
			err2 := s.capture.Resign(ctx)
			if err2 != nil {
				// if resigning the owner failed, return the error to let the capture exit
				return errors.Annotatef(err2, "resign owner failed, capture: %s", captureID)
			}
			log.Warn("run owner failed", zap.Error(err))
		}
		// owner is resigned by API, reset the owner and continue the campaign loop
		s.setOwner(nil)
	}
}
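// campaignRateLimitSketch is an illustrative sketch, not part of the original
// file. It demonstrates the limiter used by campaignOwnerLoop above:
// rate.NewLimiter(0.05, 2) allows an initial burst of two campaign attempts
// and then refills one token roughly every 20 seconds, so a capture that
// keeps failing to campaign cannot spin in a hot loop.
func campaignRateLimitSketch() {
	rl := rate.NewLimiter(0.05, 2)
	for i := 0; i < 4; i++ {
		// The first two calls succeed immediately (the burst); later calls
		// are rejected until a token is refilled (about 20s per token at 0.05/s).
		log.Info("campaign attempt",
			zap.Int("attempt", i),
			zap.Bool("allowed", rl.Allow()))
	}
}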
func (s *Server) etcdHealthChecker(ctx context.Context) error {
	ticker := time.NewTicker(time.Second * 3)
	defer ticker.Stop()
	conf := config.GetGlobalServerConfig()

	httpCli, err := httputil.NewClient(conf.Security)
	if err != nil {
		return err
	}
	defer httpCli.CloseIdleConnections()
	metrics := make(map[string]prometheus.Observer)
	for _, pdEndpoint := range s.pdEndpoints {
		metrics[pdEndpoint] = etcdHealthCheckDuration.WithLabelValues(conf.AdvertiseAddr, pdEndpoint)
	}

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
			for _, pdEndpoint := range s.pdEndpoints {
				start := time.Now()
				ctx, cancel := context.WithTimeout(ctx, time.Second*10)
				req, err := http.NewRequestWithContext(
					ctx, http.MethodGet, fmt.Sprintf("%s/health", pdEndpoint), nil)
				if err != nil {
					log.Warn("etcd health check failed", zap.Error(err))
					cancel()
					continue
				}
				_, err = httpCli.Do(req)
				if err != nil {
					log.Warn("etcd health check error", zap.Error(err))
				} else {
					metrics[pdEndpoint].Observe(float64(time.Since(start)) / float64(time.Second))
				}
				cancel()
			}
		}
	}
}

func (s *Server) run(ctx context.Context) (err error) {
	if !config.NewReplicaImpl {
		kvStorage, err := util.KVStorageFromCtx(ctx)
		if err != nil {
			return errors.Trace(err)
		}
		if s.capture != nil && s.capture.session != nil {
			if err := s.capture.session.Close(); err != nil {
				log.Warn("close old capture session failed", zap.Error(err))
			}
		}
		capture, err := NewCapture(ctx, s.pdEndpoints, s.pdClient, kvStorage)
		if err != nil {
			return err
		}
		s.capture = capture
		s.etcdClient = &capture.etcdClient
		conf := config.GetGlobalServerConfig()
		defer func() {
			timeoutCtx, cancel := context.WithTimeout(context.Background(), time.Duration(conf.CaptureSessionTTL)*time.Second)
			if err := s.etcdClient.DeleteCaptureInfo(timeoutCtx, s.capture.info.ID); err != nil {
				log.Warn("failed to delete capture info when capture exited", zap.Error(err))
			}
			cancel()
		}()
	}
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	wg, cctx := errgroup.WithContext(ctx)
	if config.NewReplicaImpl {
		wg.Go(func() error {
			return s.captureV2.Run(cctx)
		})
	} else {
		wg.Go(func() error {
			return s.campaignOwnerLoop(cctx)
		})

		wg.Go(func() error {
			return s.capture.Run(cctx)
		})
	}
	wg.Go(func() error {
		return s.etcdHealthChecker(cctx)
	})

	wg.Go(func() error {
		return sorter.RunWorkerPool(cctx)
	})

	wg.Go(func() error {
		return kv.RunWorkerPool(cctx)
	})

	return wg.Wait()
}

// Close closes the server.
func (s *Server) Close() {
	if s.capture != nil {
		if !config.NewReplicaImpl {
			s.capture.Cleanup()
		}
		closeCtx, closeCancel := context.WithTimeout(context.Background(), time.Second*2)
		err := s.capture.Close(closeCtx)
		if err != nil {
			log.Error("close capture", zap.Error(err))
		}
		closeCancel()
	}
	if s.captureV2 != nil {
		s.captureV2.AsyncClose()
	}
	if s.statusServer != nil {
		err := s.statusServer.Close()
		if err != nil {
			log.Error("close status server", zap.Error(err))
		}
		s.statusServer = nil
	}
}

func (s *Server) initDataDir(ctx context.Context) error {
	if err := s.setUpDataDir(ctx); err != nil {
		return errors.Trace(err)
	}
	conf := config.GetGlobalServerConfig()
	err := os.MkdirAll(conf.DataDir, 0o755)
	if err != nil {
		return errors.Trace(err)
	}
	diskInfo, err := util.GetDiskInfo(conf.DataDir)
	if err != nil {
		return errors.Trace(err)
	}

	log.Info(fmt.Sprintf("%s is set as data-dir (%dGB available), sort-dir=%s. "+
		"It is recommended that the disk for data-dir at least have %dGB available space",
		conf.DataDir, diskInfo.Avail, conf.Sorter.SortDir, dataDirThreshold))

	return nil
}
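// dataDirSpaceCheckSketch is an illustrative sketch, not part of the original
// file. It shows the kind of check the dataDirThreshold constant documents:
// warn when the filesystem backing data-dir reports less than 500GB available.
// util.GetDiskInfo reports sizes in GB, as used by initDataDir above.
func dataDirSpaceCheckSketch(dataDir string) error {
	diskInfo, err := util.GetDiskInfo(dataDir)
	if err != nil {
		return errors.Trace(err)
	}
	if diskInfo.Avail < dataDirThreshold {
		log.Warn("data-dir may not have enough free space",
			zap.String("data-dir", dataDir),
			zap.Uint64("available-gb", diskInfo.Avail),
			zap.Int("recommended-gb", dataDirThreshold))
	}
	return nil
}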
func (s *Server) setUpDataDir(ctx context.Context) error {
	conf := config.GetGlobalServerConfig()
	if conf.DataDir != "" {
		conf.Sorter.SortDir = filepath.Join(conf.DataDir, config.DefaultSortDir)
		config.StoreGlobalServerConfig(conf)

		return nil
	}

	// s.etcdClient may be nil if NewReplicaImpl is not set to true.
	// todo: remove this after NewReplicaImpl is set to true in a specific branch, and use server.etcdClient instead.
	cli := s.etcdClient
	if cli == nil {
		client, err := clientv3.New(clientv3.Config{
			Endpoints:   s.pdEndpoints,
			Context:     ctx,
			DialTimeout: 5 * time.Second,
		})
		if err != nil {
			return err
		}
		etcdClient := kv.NewCDCEtcdClient(ctx, client)
		cli = &etcdClient
		defer cli.Close()
	}

	// data-dir will be decided by existing changefeeds for backward compatibility
	allStatus, err := cli.GetAllChangeFeedStatus(ctx)
	if err != nil {
		return errors.Trace(err)
	}

	candidates := make([]string, 0, len(allStatus))
	for id := range allStatus {
		info, err := cli.GetChangeFeedInfo(ctx, id)
		if err != nil {
			return errors.Trace(err)
		}
		if info.SortDir != "" {
			candidates = append(candidates, info.SortDir)
		}
	}

	conf.DataDir = defaultDataDir
	best, ok := findBestDataDir(candidates)
	if ok {
		conf.DataDir = best
	}

	conf.Sorter.SortDir = filepath.Join(conf.DataDir, config.DefaultSortDir)
	config.StoreGlobalServerConfig(conf)
	return nil
}

// findBestDataDir tries to find the best data dir by rules.
// At the moment, only the available disk space is considered.
func findBestDataDir(candidates []string) (result string, ok bool) {
	var low uint64 = 0

	checker := func(dir string) (*util.DiskInfo, error) {
		if err := os.MkdirAll(dir, 0o755); err != nil {
			return nil, err
		}
		if err := util.IsDirReadWritable(dir); err != nil {
			return nil, err
		}
		info, err := util.GetDiskInfo(dir)
		if err != nil {
			return nil, err
		}
		return info, err
	}

	for _, dir := range candidates {
		info, err := checker(dir)
		if err != nil {
			log.Warn("check the availability of dir", zap.String("dir", dir), zap.Error(err))
			continue
		}
		if info.Avail > low {
			result = dir
			low = info.Avail
			ok = true
		}
	}

	if !ok && len(candidates) != 0 {
		log.Warn("try to find directory for data-dir failed, use `/tmp/cdc_data` as data-dir", zap.Strings("candidates", candidates))
	}

	return result, ok
}
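// dataDirFallbackSketch is an illustrative sketch, not part of the original
// file. It summarizes how setUpDataDir uses findBestDataDir above: among the
// sort-dirs recorded by existing changefeeds, pick the directory whose
// filesystem reports the most available space, and fall back to the default
// /tmp/cdc_data when no candidate is usable.
func dataDirFallbackSketch(candidates []string) string {
	if best, ok := findBestDataDir(candidates); ok {
		return best
	}
	return defaultDataDir
}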