github.com/matrixorigin/matrixone@v1.2.0/cmd/mo-service/main.go (about)

     1  // Copyright 2022 Matrix Origin
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package main
    16  
    17  import (
    18  	"context"
    19  	"errors"
    20  	"flag"
    21  	"fmt"
    22  	"net/http"
    23  	"os"
    24  	"os/signal"
    25  	"path/filepath"
    26  	struntime "runtime"
    27  	"strings"
    28  	"sync"
    29  	"syscall"
    30  	"time"
    31  	_ "time/tzdata"
    32  
    33  	"github.com/google/uuid"
    34  	"go.uber.org/zap"
    35  
    36  	"github.com/matrixorigin/matrixone/pkg/catalog"
    37  	"github.com/matrixorigin/matrixone/pkg/clusterservice"
    38  	"github.com/matrixorigin/matrixone/pkg/cnservice"
    39  	"github.com/matrixorigin/matrixone/pkg/cnservice/cnclient"
    40  	"github.com/matrixorigin/matrixone/pkg/common/moerr"
    41  	"github.com/matrixorigin/matrixone/pkg/common/runtime"
    42  	"github.com/matrixorigin/matrixone/pkg/common/stopper"
    43  	"github.com/matrixorigin/matrixone/pkg/common/system"
    44  	"github.com/matrixorigin/matrixone/pkg/defines"
    45  	"github.com/matrixorigin/matrixone/pkg/fileservice"
    46  	"github.com/matrixorigin/matrixone/pkg/gossip"
    47  	"github.com/matrixorigin/matrixone/pkg/logservice"
    48  	"github.com/matrixorigin/matrixone/pkg/logutil"
    49  	"github.com/matrixorigin/matrixone/pkg/pb/metadata"
    50  	"github.com/matrixorigin/matrixone/pkg/proxy"
    51  	qclient "github.com/matrixorigin/matrixone/pkg/queryservice/client"
    52  	"github.com/matrixorigin/matrixone/pkg/sql/compile"
    53  	"github.com/matrixorigin/matrixone/pkg/tnservice"
    54  	"github.com/matrixorigin/matrixone/pkg/udf/pythonservice"
    55  	"github.com/matrixorigin/matrixone/pkg/util"
    56  	"github.com/matrixorigin/matrixone/pkg/util/debug/goroutine"
    57  	"github.com/matrixorigin/matrixone/pkg/util/export"
    58  	"github.com/matrixorigin/matrixone/pkg/util/export/table"
    59  	"github.com/matrixorigin/matrixone/pkg/util/metric/mometric"
    60  	"github.com/matrixorigin/matrixone/pkg/util/profile"
    61  	"github.com/matrixorigin/matrixone/pkg/util/trace/impl/motrace"
    62  )
    63  
    64  var (
    65  	configFile   = flag.String("cfg", "", "toml configuration used to start mo-service")
    66  	launchFile   = flag.String("launch", "", "toml configuration used to launch mo cluster")
    67  	versionFlag  = flag.Bool("version", false, "print version information")
    68  	daemon       = flag.Bool("daemon", false, "run mo-service in daemon mode")
    69  	withProxy    = flag.Bool("with-proxy", false, "run mo-service with proxy module started")
    70  	maxProcessor = flag.Int("max-processor", 0, "set max processor for go runtime")
    71  	globalEtlFS  fileservice.FileService
    72  )
    73  
    74  func main() {
    75  	if *maxProcessor > 0 {
    76  		struntime.GOMAXPROCS(*maxProcessor)
    77  	}
    78  
    79  	flag.Parse()
    80  	maybePrintVersion()
    81  	maybeRunInDaemonMode()
    82  
    83  	uuid.EnableRandPool()
    84  
    85  	if *cpuProfilePathFlag != "" {
    86  		stop := startCPUProfile()
    87  		defer stop()
    88  	}
    89  	if *allocsProfilePathFlag != "" {
    90  		defer writeAllocsProfile()
    91  	}
    92  	if *heapProfilePathFlag != "" {
    93  		defer writeHeapProfile()
    94  	}
    95  	if *httpListenAddr != "" {
    96  		go func() {
    97  			http.ListenAndServe(*httpListenAddr, nil)
    98  		}()
    99  	}
   100  
   101  	ctx := context.Background()
   102  	shutdownC := make(chan struct{})
   103  
   104  	stopper := stopper.NewStopper("main", stopper.WithLogger(logutil.GetGlobalLogger()))
   105  	if *launchFile != "" {
   106  		if err := startCluster(ctx, stopper, shutdownC); err != nil {
   107  			panic(err)
   108  		}
   109  	} else if *configFile != "" {
   110  		cfg := NewConfig()
   111  		if err := parseConfigFromFile(*configFile, cfg); err != nil {
   112  			panic(fmt.Sprintf("failed to parse config from %s, error: %s", *configFile, err.Error()))
   113  		}
   114  		if err := startService(ctx, cfg, stopper, shutdownC); err != nil {
   115  			panic(err)
   116  		}
   117  	} else {
   118  		panic(errors.New("no configuration specified"))
   119  	}
   120  
   121  	waitSignalToStop(stopper, shutdownC)
   122  	logutil.GetGlobalLogger().Info("Shutdown complete")
   123  }
   124  
   125  func waitSignalToStop(stopper *stopper.Stopper, shutdownC chan struct{}) {
   126  	sigchan := make(chan os.Signal, 1)
   127  	signal.Notify(sigchan, syscall.SIGTERM, syscall.SIGINT)
   128  
   129  	detail := "Starting shutdown..."
   130  	select {
   131  	case sig := <-sigchan:
   132  		detail += "signal: " + sig.String()
   133  		//dump heap profile before stopping services
   134  		heapName, _ := uuid.NewV7()
   135  		heapProfilePath := catalog.BuildProfilePath("heap", heapName.String())
   136  		cnservice.SaveProfile(heapProfilePath, profile.HEAP, globalEtlFS)
   137  		detail += ". heap profile: " + heapProfilePath
   138  		//dump goroutine before stopping services
   139  		routineName, _ := uuid.NewV7()
   140  		routineProfilePath := catalog.BuildProfilePath("routine", routineName.String())
   141  		cnservice.SaveProfile(routineProfilePath, profile.GOROUTINE, globalEtlFS)
   142  		detail += " routine profile: " + routineProfilePath
   143  	case <-shutdownC:
   144  		// waiting, give a chance let all log stores and tn stores to get
   145  		// shutdown cmd from ha keeper
   146  		time.Sleep(time.Second * 5)
   147  		detail += "ha keeper issues shutdown command"
   148  	}
   149  
   150  	stopAllDynamicCNServices()
   151  
   152  	logutil.GetGlobalLogger().Info(detail)
   153  	stopper.Stop()
   154  	if cnProxy != nil {
   155  		if err := cnProxy.Stop(); err != nil {
   156  			logutil.GetGlobalLogger().Error("shutdown cn proxy failed", zap.Error(err))
   157  		}
   158  	}
   159  }
   160  
   161  func startService(
   162  	ctx context.Context,
   163  	cfg *Config,
   164  	stopper *stopper.Stopper,
   165  	shutdownC chan struct{},
   166  ) error {
   167  	if err := cfg.validate(); err != nil {
   168  		return err
   169  	}
   170  	if err := cfg.resolveGossipSeedAddresses(); err != nil {
   171  		return err
   172  	}
   173  	setupProcessLevelRuntime(cfg, stopper)
   174  
   175  	setupStatusServer(runtime.ProcessLevelRuntime())
   176  
   177  	goroutine.StartLeakCheck(stopper, cfg.Goroutine)
   178  
   179  	st, err := cfg.getServiceType()
   180  	if err != nil {
   181  		return err
   182  	}
   183  
   184  	uuid, err := getNodeUUID(ctx, st, cfg)
   185  	if err != nil {
   186  		return err
   187  	}
   188  
   189  	var gossipNode *gossip.Node
   190  	if st == metadata.ServiceType_CN {
   191  		gossipNode, err = gossip.NewNode(ctx, cfg.CN.UUID)
   192  		if err != nil {
   193  			return err
   194  		}
   195  		for i := range cfg.FileServices {
   196  			cfg.FileServices[i].Cache.KeyRouterFactory = gossipNode.DistKeyCacheGetter()
   197  			cfg.FileServices[i].Cache.QueryClient, err = qclient.NewQueryClient(
   198  				cfg.CN.UUID, cfg.FileServices[i].Cache.RPC,
   199  			)
   200  			if err != nil {
   201  				return err
   202  			}
   203  		}
   204  	}
   205  
   206  	fs, err := cfg.createFileService(ctx, st, uuid)
   207  	if err != nil {
   208  		return err
   209  	}
   210  
   211  	etlFS, err := fileservice.Get[fileservice.FileService](fs, defines.ETLFileServiceName)
   212  	if err != nil {
   213  		return err
   214  	}
   215  	if err = initTraceMetric(ctx, st, cfg, stopper, etlFS, uuid); err != nil {
   216  		return err
   217  	}
   218  
   219  	if globalEtlFS == nil {
   220  		globalEtlFS = etlFS
   221  	}
   222  
   223  	switch st {
   224  	case metadata.ServiceType_CN:
   225  		return startCNService(cfg, stopper, fs, gossipNode)
   226  	case metadata.ServiceType_TN:
   227  		return startTNService(cfg, stopper, fs, shutdownC)
   228  	case metadata.ServiceType_LOG:
   229  		return startLogService(cfg, stopper, fs, shutdownC)
   230  	case metadata.ServiceType_PROXY:
   231  		return startProxyService(cfg, stopper)
   232  	case metadata.ServiceType_PYTHON_UDF:
   233  		return startPythonUdfService(cfg, stopper)
   234  	default:
   235  		panic("unknown service type")
   236  	}
   237  }
   238  
   239  // serviceWG control motrace/mometric quit as last one.
   240  var serviceWG sync.WaitGroup
   241  
   242  func startCNService(
   243  	cfg *Config,
   244  	stopper *stopper.Stopper,
   245  	fileService fileservice.FileService,
   246  	gossipNode *gossip.Node,
   247  ) error {
   248  	// start up system module to do some calculation.
   249  	system.Run(stopper)
   250  
   251  	if err := waitClusterCondition(cfg.HAKeeperClient, waitAnyShardReady); err != nil {
   252  		return err
   253  	}
   254  	serviceWG.Add(1)
   255  	return stopper.RunNamedTask("cn-service", func(ctx context.Context) {
   256  		defer serviceWG.Done()
   257  		cfg.initMetaCache()
   258  		c := cfg.getCNServiceConfig()
   259  		commonConfigKVMap, _ := dumpCommonConfig(*cfg)
   260  		s, err := cnservice.NewService(
   261  			&c,
   262  			ctx,
   263  			fileService,
   264  			gossipNode,
   265  			cnservice.WithLogger(logutil.GetGlobalLogger().Named("cn-service").With(zap.String("uuid", cfg.CN.UUID))),
   266  			cnservice.WithMessageHandle(compile.CnServerMessageHandler),
   267  			cnservice.WithConfigData(commonConfigKVMap),
   268  			cnservice.WithTxnTraceData(filepath.Join(cfg.DataDir, c.Txn.Trace.Dir)),
   269  		)
   270  		if err != nil {
   271  			panic(err)
   272  		}
   273  		if err := s.Start(); err != nil {
   274  			panic(err)
   275  		}
   276  
   277  		<-ctx.Done()
   278  		// Close the cache client which is used in file service.
   279  		for _, fs := range cfg.FileServices {
   280  			if fs.Cache.QueryClient != nil {
   281  				_ = fs.Cache.QueryClient.Close()
   282  			}
   283  		}
   284  		if err := s.Close(); err != nil {
   285  			panic(err)
   286  		}
   287  		if err := cnclient.CloseCNClient(); err != nil {
   288  			panic(err)
   289  		}
   290  	})
   291  }
   292  
   293  func startTNService(
   294  	cfg *Config,
   295  	stopper *stopper.Stopper,
   296  	fileService fileservice.FileService,
   297  	shutdownC chan struct{},
   298  ) error {
   299  	if err := waitClusterCondition(cfg.HAKeeperClient, waitHAKeeperRunning); err != nil {
   300  		return err
   301  	}
   302  	r, err := getRuntime(metadata.ServiceType_TN, cfg, stopper)
   303  	if err != nil {
   304  		return err
   305  	}
   306  	serviceWG.Add(1)
   307  	return stopper.RunNamedTask("tn-service", func(ctx context.Context) {
   308  		defer serviceWG.Done()
   309  		cfg.initMetaCache()
   310  		c := cfg.getTNServiceConfig()
   311  		//notify the tn service it is in the standalone cluster
   312  		c.InStandalone = cfg.IsStandalone
   313  		commonConfigKVMap, _ := dumpCommonConfig(*cfg)
   314  		s, err := tnservice.NewService(
   315  			&c,
   316  			r,
   317  			fileService,
   318  			shutdownC,
   319  			tnservice.WithConfigData(commonConfigKVMap))
   320  		if err != nil {
   321  			panic(err)
   322  		}
   323  		if err := s.Start(); err != nil {
   324  			panic(err)
   325  		}
   326  
   327  		<-ctx.Done()
   328  		if err := s.Close(); err != nil {
   329  			panic(err)
   330  		}
   331  	})
   332  }
   333  
   334  func startLogService(
   335  	cfg *Config,
   336  	stopper *stopper.Stopper,
   337  	fileService fileservice.FileService,
   338  	shutdownC chan struct{},
   339  ) error {
   340  	lscfg := cfg.getLogServiceConfig()
   341  	commonConfigKVMap, _ := dumpCommonConfig(*cfg)
   342  	s, err := logservice.NewService(lscfg, fileService,
   343  		shutdownC,
   344  		logservice.WithRuntime(runtime.ProcessLevelRuntime()),
   345  		logservice.WithConfigData(commonConfigKVMap))
   346  	if err != nil {
   347  		panic(err)
   348  	}
   349  	if err := s.Start(); err != nil {
   350  		panic(err)
   351  	}
   352  	serviceWG.Add(1)
   353  	return stopper.RunNamedTask("log-service", func(ctx context.Context) {
   354  		defer serviceWG.Done()
   355  		if cfg.LogService.BootstrapConfig.BootstrapCluster {
   356  			logutil.Infof("bootstrapping hakeeper...")
   357  			if err := s.BootstrapHAKeeper(ctx, cfg.LogService); err != nil {
   358  				panic(err)
   359  			}
   360  		}
   361  
   362  		<-ctx.Done()
   363  		if err := s.Close(); err != nil {
   364  			panic(err)
   365  		}
   366  	})
   367  }
   368  
   369  // startProxyService starts the proxy service.
   370  func startProxyService(cfg *Config, stopper *stopper.Stopper) error {
   371  	if err := waitClusterCondition(cfg.HAKeeperClient, waitHAKeeperRunning); err != nil {
   372  		return err
   373  	}
   374  	serviceWG.Add(1)
   375  	return stopper.RunNamedTask("proxy-service", func(ctx context.Context) {
   376  		defer serviceWG.Done()
   377  		s, err := proxy.NewServer(
   378  			ctx,
   379  			cfg.getProxyConfig(),
   380  			proxy.WithRuntime(runtime.ProcessLevelRuntime()),
   381  		)
   382  		if err != nil {
   383  			panic(err)
   384  		}
   385  		if err := s.Start(); err != nil {
   386  			panic(err)
   387  		}
   388  		<-ctx.Done()
   389  		if err := s.Close(); err != nil {
   390  			panic(err)
   391  		}
   392  	})
   393  }
   394  
   395  // startPythonUdfService starts the python udf service.
   396  func startPythonUdfService(cfg *Config, stopper *stopper.Stopper) error {
   397  	if err := waitClusterCondition(cfg.HAKeeperClient, waitHAKeeperRunning); err != nil {
   398  		return err
   399  	}
   400  	serviceWG.Add(1)
   401  	return stopper.RunNamedTask("python-udf-service", func(ctx context.Context) {
   402  		defer serviceWG.Done()
   403  		s, err := pythonservice.NewService(cfg.PythonUdfServerConfig)
   404  		if err != nil {
   405  			panic(err)
   406  		}
   407  		if err := s.Start(); err != nil {
   408  			panic(err)
   409  		}
   410  		<-ctx.Done()
   411  		if err := s.Close(); err != nil {
   412  			panic(err)
   413  		}
   414  	})
   415  }
   416  
   417  func getNodeUUID(ctx context.Context, st metadata.ServiceType, cfg *Config) (UUID string, err error) {
   418  	switch st {
   419  	case metadata.ServiceType_CN:
   420  		// validate node_uuid
   421  		var uuidErr error
   422  		var nodeUUID uuid.UUID
   423  		if nodeUUID, uuidErr = uuid.Parse(cfg.CN.UUID); uuidErr != nil {
   424  			nodeUUID, _ = uuid.NewV7()
   425  		}
   426  		if err := util.SetUUIDNodeID(ctx, nodeUUID[:]); err != nil {
   427  			return "", moerr.ConvertPanicError(ctx, err)
   428  		}
   429  		UUID = nodeUUID.String()
   430  	case metadata.ServiceType_TN:
   431  		UUID = cfg.getTNServiceConfig().UUID
   432  	case metadata.ServiceType_LOG:
   433  		UUID = cfg.LogService.UUID
   434  	case metadata.ServiceType_PYTHON_UDF:
   435  		UUID = cfg.PythonUdfServerConfig.UUID
   436  	}
   437  	UUID = strings.ReplaceAll(UUID, " ", "_") // remove space in UUID for filename
   438  	return
   439  }
   440  
   441  func initTraceMetric(ctx context.Context, st metadata.ServiceType, cfg *Config, stopper *stopper.Stopper, fs fileservice.FileService, UUID string) error {
   442  	var writerFactory table.WriterFactory
   443  	var err error
   444  	var initWG sync.WaitGroup
   445  	SV := cfg.getObservabilityConfig()
   446  
   447  	nodeRole := st.String()
   448  	if *launchFile != "" {
   449  		nodeRole = mometric.LaunchMode
   450  	}
   451  
   452  	selector := clusterservice.NewSelector().SelectByLabel(SV.LabelSelector, clusterservice.Contain)
   453  	runtime.ProcessLevelRuntime().SetGlobalVariables(runtime.BackgroundCNSelector, selector)
   454  
   455  	if !SV.DisableTrace || !SV.DisableMetric {
   456  		writerFactory = export.GetWriterFactory(fs, UUID, nodeRole, !SV.DisableSqlWriter)
   457  		initWG.Add(1)
   458  		collector := export.NewMOCollector(ctx, export.WithOBCollectorConfig(&SV.OBCollectorConfig))
   459  		stopper.RunNamedTask("trace", func(ctx context.Context) {
   460  			err, act := motrace.InitWithConfig(ctx,
   461  				&SV,
   462  				motrace.WithNode(UUID, nodeRole),
   463  				motrace.WithBatchProcessor(collector),
   464  				motrace.WithFSWriterFactory(writerFactory),
   465  				motrace.WithSQLExecutor(nil),
   466  			)
   467  			initWG.Done()
   468  			if err != nil {
   469  				panic(err)
   470  			}
   471  			if !act {
   472  				return
   473  			}
   474  			<-ctx.Done()
   475  			logutil.Info("motrace receive shutdown signal, wait other services shutdown complete.")
   476  			serviceWG.Wait()
   477  			logutil.Info("Shutdown service complete.")
   478  			// flush trace/log/error framework
   479  			if err = motrace.Shutdown(ctx); err != nil {
   480  				logutil.Warn("Shutdown trace", logutil.ErrorField(err), logutil.NoReportFiled())
   481  			}
   482  		})
   483  		initWG.Wait()
   484  	}
   485  	if !SV.DisableMetric || SV.EnableMetricToProm {
   486  		stopper.RunNamedTask("metric", func(ctx context.Context) {
   487  			if act := mometric.InitMetric(ctx, nil, &SV, UUID, nodeRole, mometric.WithWriterFactory(writerFactory)); !act {
   488  				return
   489  			}
   490  			<-ctx.Done()
   491  			mometric.StopMetricSync()
   492  		})
   493  	}
   494  	if err = export.InitMerge(ctx, &SV); err != nil {
   495  		return err
   496  	}
   497  	return nil
   498  }
   499  
   500  func maybeRunInDaemonMode() {
   501  	if _, isChild := os.LookupEnv("daemon"); *daemon && !isChild {
   502  		childENV := []string{"daemon=true"}
   503  		pwd, err := os.Getwd()
   504  		if err != nil {
   505  			panic(err)
   506  		}
   507  		cpid, err := syscall.ForkExec(os.Args[0], os.Args, &syscall.ProcAttr{
   508  			Dir: pwd,
   509  			Env: append(os.Environ(), childENV...),
   510  			Sys: &syscall.SysProcAttr{
   511  				Setsid: true,
   512  			},
   513  			Files: []uintptr{0, 1, 2}, // print message to the same pty
   514  		})
   515  		if err != nil {
   516  			panic(err)
   517  		}
   518  		logutil.Infof("mo-service is running in daemon mode, child process is %d", cpid)
   519  		os.Exit(0)
   520  	}
   521  }